commit 08272c7de5240f204532d282a92cfb76926d68ad
Author: erwin coumans <erwin.coumans@gmail.com>
Date:   Mon Mar 11 22:03:27 2013 +0100

    import opencl_course source for a start

diff --git a/build/findDirectX11.lua b/build/findDirectX11.lua
new file mode 100644
index 000000000..68771c4a0
--- /dev/null
+++ b/build/findDirectX11.lua
@@ -0,0 +1,36 @@
+function findDirectX11()
+		-- Returns true when <DXSDK_DIR>Include/D3D11.h can be opened for reading, false otherwise.
+		local dx11path = os.getenv("DXSDK_DIR")
+		if (not dx11path) then return false end
+		local filepath = string.format("%s%s",dx11path,"Include/D3D11.h") -- DXSDK_DIR is used as-is, no separator is added
+		-- local handle (it used to be an accidental global); closed right away, the file is only probed
+		local headerdx11 = io.open(filepath, "r")
+		if (not headerdx11) then return false end
+		headerdx11:close()
+		printf("Found DX11: '%s'", filepath)
+		return true
+	end
+
+function initDirectX11() -- Adds the DirectX SDK define, search paths and link libraries to the current premake scope; always returns true
+	configuration {}
+	-- NOTE(review): "$(DXSDK_DIR)" is passed through literally; presumably the generated project expands it at build time
+	local dx11path = os.getenv("DXSDK_DIR") -- NOTE(review): this value is never read below
+			defines { "ADL_ENABLE_DX11"}
+			includedirs {"$(DXSDK_DIR)/include"}
+	-- the library folder depends on the target platform
+		configuration "x32"
+			libdirs {"$(DXSDK_DIR)/Lib/x86"}
+		configuration "x64"
+			libdirs {"$(DXSDK_DIR)/Lib/x64"}
+		configuration {}
+		links {"d3dcompiler",
+					"dxerr",
+					"dxguid",
+					"d3dx9",
+					"d3d9",
+					"winmm",
+					"comctl32",
+					"d3dx11"
+		}
+		return true
+end
\ No newline at end of file
diff --git a/build/findOpenCL.lua b/build/findOpenCL.lua
new file mode 100644
index 000000000..e2eda667a
--- /dev/null
+++ b/build/findOpenCL.lua
@@ -0,0 +1,151 @@
+
+
+	function findOpenCL_Apple()
+		-- OpenCL counts as available on Mac OS X without probing for an SDK;
+		-- on every other host this reports false.
+		local onMac = os.is("macosx")
+		if onMac then return true end
+		return false
+	end
+
+	
+	function findOpenCL_AMD()
+		-- True when the AMDAPPSDKROOT environment variable is set. Only the
+		-- variable is checked; the directory it names is not inspected.
+		local sdkRoot = os.getenv("AMDAPPSDKROOT")
+		local found = (sdkRoot ~= nil)
+		return found
+	end
+
+	function findOpenCL_NVIDIA()
+		-- True when the CUDA_PATH environment variable is set. Only the
+		-- variable is checked; the directory it names is not inspected.
+		local cudaRoot = os.getenv("CUDA_PATH")
+		local found = (cudaRoot ~= nil)
+		return found
+	end
+
+	function findOpenCL_Intel()
+		-- Windows: the SDK counts as present when INTELOCLSDKROOT is set.
+		-- Linux: it counts as present when /usr/include/CL/opencl.h is readable. Other hosts: false.
+		if os.is("Windows") and os.getenv("INTELOCLSDKROOT") then
+			return true
+		end
+		if os.is("Linux") then
+			local intelsdk = io.open("/usr/include/CL/opencl.h","r")
+			if (intelsdk) then
+				intelsdk:close() -- existence probe only; the handle used to be left open
+				return true
+			end
+		end
+		return false
+	end
+		
+	function initOpenCL_Apple()
+		-- Points the current premake scope at the system OpenCL framework:
+		-- the framework directory is used both as include and as library
+		-- directory, and the framework itself is linked.
+		-- No value is returned.
+		local frameworkDir = "/System/Library/Frameworks/OpenCL.framework"
+		configuration {}
+		includedirs { frameworkDir }
+		libdirs(frameworkDir)
+		links { "OpenCL.framework" }
+	end
+	
+	function initOpenCL_AMD()
+		-- Configures the current premake scope for the AMD APP SDK. Returns true
+		-- when AMDAPPSDKROOT is set and the settings were added, false otherwise
+		-- (the configuration filter is reset in both cases).
+		configuration {}
+		if not os.getenv("AMDAPPSDKROOT") then
+			return false
+		end
+		defines { "ADL_ENABLE_CL", "CL_PLATFORM_AMD" }
+		includedirs { "$(AMDAPPSDKROOT)/include" }
+		configuration "x32"
+			libdirs { "$(AMDAPPSDKROOT)/lib/x86" }
+		configuration "x64"
+			libdirs { "$(AMDAPPSDKROOT)/lib/x86_64" }
+		configuration {}
+		links { "OpenCL" }
+		return true
+	end
+
+
+	function initOpenCL_NVIDIA()
+		-- Configures the current premake scope for the NVIDIA CUDA toolkit's
+		-- OpenCL. Returns true when CUDA_PATH is set and the settings were added,
+		-- false otherwise (the configuration filter is reset in both cases).
+		configuration {}
+		if not os.getenv("CUDA_PATH") then
+			return false
+		end
+		defines { "ADL_ENABLE_CL", "CL_PLATFORM_NVIDIA" }
+		includedirs { "$(CUDA_PATH)/include" }
+		configuration "x32"
+			libdirs { "$(CUDA_PATH)/lib/Win32" }
+		configuration "x64"
+			libdirs { "$(CUDA_PATH)/lib/x64" }
+		configuration {}
+		links { "OpenCL" }
+		return true
+	end
+
+	function initOpenCL_Intel()
+		-- Configures the current premake scope for the Intel OpenCL SDK.
+		-- Windows: requires INTELOCLSDKROOT; adds defines, include/lib dirs and links OpenCL.
+		-- Linux: adds the defines and links OpenCL without extra search paths.
+		-- Returns true when settings were added, false otherwise.
+		configuration {}
+		if os.is("Windows") and os.getenv("INTELOCLSDKROOT") then
+			defines { "ADL_ENABLE_CL" , "CL_PLATFORM_INTEL"}
+			includedirs { "$(INTELOCLSDKROOT)/include" }
+			configuration "x32"
+				libdirs {"$(INTELOCLSDKROOT)/lib/x86"}
+			configuration "x64"
+				libdirs {"$(INTELOCLSDKROOT)/lib/x64"}
+			configuration {}
+			links {"OpenCL"}
+			return true
+		end
+		if os.is("Linux") then
+			defines { "ADL_ENABLE_CL" , "CL_PLATFORM_INTEL"}
+			configuration {}
+			links {"OpenCL"}
+			return true -- used to fall through to `return false` although the settings had been added
+		end
+		return false
+	end
+	
+	function findOpenCL (vendor )
+		-- Dispatches to the vendor-specific probe. vendor is compared
+		-- case-sensitively against "AMD", "NVIDIA", "Intel" and "Apple";
+		-- any other value yields false.
+		if vendor == "AMD" then
+			return findOpenCL_AMD()
+		elseif vendor == "NVIDIA" then
+			return findOpenCL_NVIDIA()
+		elseif vendor == "Intel" then
+			return findOpenCL_Intel()
+		elseif vendor == "Apple" then
+			return findOpenCL_Apple()
+		end
+		return false
+	end
+	
+	function initOpenCL ( vendor )
+		-- Runs the vendor-specific setup and returns its result. Every branch
+		-- returns now (AMD and Intel used to drop the result); an unknown
+		-- vendor still returns nothing.
+		if vendor=="AMD" then
+			return initOpenCL_AMD()
+		elseif vendor=="NVIDIA" then
+			return initOpenCL_NVIDIA()
+		elseif vendor=="Intel" then
+			return initOpenCL_Intel()
+		elseif vendor=="Apple" then
+			return initOpenCL_Apple()
+		end
+	end
+	
diff --git a/build/findOpenGLGlewGlut.lua b/build/findOpenGLGlewGlut.lua
new file mode 100644
index 000000000..f808972da
--- /dev/null
+++ b/build/findOpenGLGlewGlut.lua
@@ -0,0 +1,51 @@
+
+	function initOpenGL() -- Links the OpenGL library that matches the target OS
+		configuration {}
+		configuration {"Windows"}
+			links {"opengl32","glu32"}
+		configuration {"MacOSX"}
+ 			links { "OpenGL.framework"} 
+		configuration {"not Windows", "not MacOSX"} -- every remaining target
+			links {"GL"}
+		configuration{} -- clear the filter so that later settings are not tied to the last configuration above
+	end
+
+	function initGlut() -- Adds GLUT include/lib settings per target OS; reads the global projectRootDir
+		configuration {}
+		configuration {"Windows"}
+			-- Windows takes headers and libraries from rendering/GlutGlewWindows below the project root
+			includedirs {
+				projectRootDir .. "rendering/GlutGlewWindows"
+			}
+			libdirs { projectRootDir .. "rendering/GlutGlewWindows"}
+		configuration {"Windows", "x32"}
+			links {"glut32"}
+		configuration {"Windows", "x64"}
+			links {"glut64"}
+	-- the other targets only name the library/framework to link, no search paths are added
+		configuration {"MacOSX"}
+ 			links { "Glut.framework" } 
+		configuration {"Linux"}
+			links {"glut","GLU"}
+		configuration{}
+	end
+
+	function initGlew() -- Windows: adds rendering/GlutGlewWindows/glew.c to the project with GLEW_STATIC defined; Linux: links GLEW; other hosts: nothing
+		configuration {}
+		if os.is("Windows") then -- NOTE(review): os.is presumably tests the OS premake runs on, unlike the configuration filter on the next line - confirm
+			configuration {"Windows"}
+			defines { "GLEW_STATIC"}
+			includedirs {
+					projectRootDir .. "rendering/GlutGlewWindows"
+			}
+			libdirs {	projectRootDir .. "rendering/GlutGlewWindows"}
+			files { projectRootDir .. "rendering/GlutGlewWindows/glew.c"}
+		end
+		if os.is("Linux") then
+			links{"GLEW"}
+		end
+		configuration{} -- clear the "Windows" filter that the first branch may have left active
+	end
+
+
+
diff --git a/build/premake4.exe b/build/premake4.exe
new file mode 100644
index 000000000..072560edb
Binary files /dev/null and b/build/premake4.exe differ
diff --git a/build/premake4.lua b/build/premake4.lua
new file mode 100644
index 000000000..16a50c9c0
--- /dev/null
+++ b/build/premake4.lua
@@ -0,0 +1,96 @@
+
+  solution "0MySolution"
+
+	-- Multithreaded compiling: /MP is added to the build options for the vs2008/vs2010 actions only
+	if _ACTION == "vs2010" or _ACTION=="vs2008" then
+		buildoptions { "/MP"  }
+	end 
+	
+	act = "" -- global: the premake action name, or "" when _ACTION is not set; used in the target suffixes below
+    
+    if _ACTION then
+        act = _ACTION
+    end
+
+
+	newoption 
+	{
+    		trigger     = "ios",
+    		description = "Enable iOS target (requires xcode4)"
+  	}
+	
+  
+	configurations {"Release", "Debug"}
+	configuration "Release"
+		flags { "Optimize", "EnableSSE","StaticRuntime", "NoMinimalRebuild", "FloatFast"}
+	configuration "Debug"
+		defines {"_DEBUG=1"}
+		flags { "Symbols", "StaticRuntime" , "NoMinimalRebuild", "NoEditAndContinue" ,"FloatFast"}
+		
+	platforms {"x32", "x64"}
+	-- Target suffixes encode action, platform and build type. NOTE(review): the filters overlap (e.g. "x64" and {"x64", "debug"}); presumably the later, more specific entry wins - confirm
+	configuration {"x32"}
+		targetsuffix ("_" .. act)
+	configuration "x64"		
+		targetsuffix ("_" .. act .. "_64" )
+	configuration {"x64", "debug"}
+		targetsuffix ("_" .. act .. "_x64_debug")
+	configuration {"x64", "release"}
+		targetsuffix ("_" .. act .. "_x64_release" )
+	configuration {"x32", "debug"}
+		targetsuffix ("_" .. act .. "_debug" )
+	
+	configuration{}
+
+	postfix="" -- global: appended to the output folder name; becomes "ios" for xcode4 with the ios option
+	-- Xcode only: the ios option selects iPhone build settings, otherwise the standard 32/64-bit desktop architectures are used
+	if _ACTION == "xcode4" then
+		if _OPTIONS["ios"] then
+      			postfix = "ios";
+      			xcodebuildsettings
+      			{
+              		'CODE_SIGN_IDENTITY = "iPhone Developer"',
+              		"SDKROOT = iphoneos",
+              		'ARCHS = "armv7"',
+              		'TARGETED_DEVICE_FAMILY = "1,2"',
+              		'VALID_ARCHS = "armv7"',
+      			}      
+      		else
+      			xcodebuildsettings
+      			{
+              		'ARCHS = "$(ARCHS_STANDARD_32_BIT) $(ARCHS_STANDARD_64_BIT)"',
+              		'VALID_ARCHS = "x86_64 i386"',
+      			}
+    		end
+	end
+
+	
+	flags { "NoRTTI", "NoExceptions"}
+	defines { "_HAS_EXCEPTIONS=0" }
+	targetdir "../bin"
+	location("./" .. act .. postfix) -- generated project files go into a folder named after the action (plus "ios" when set)
+
+	-- global on purpose: the helper scripts loaded below build their paths from projectRootDir
+	projectRootDir = os.getcwd() .. "/../"
+	print("Project root directory: " .. projectRootDir); -- spelling of the message fixed (was "directroy")
+	-- helper functions (findOpenCL/initOpenCL, DirectX 11, OpenGL/GLUT/GLEW)
+	dofile ("findOpenCL.lua")
+	dofile ("findDirectX11.lua")
+	dofile ("findOpenGLGlewGlut.lua")
+	
+	language "C++"
+	
+
+	-- the sample projects are skipped when the ios option is given
+	if not _OPTIONS["ios"] then
+		include "../opencl/vector_add_simplified"
+		include "../opencl/vector_add"
+		include "../opencl/basic_initialize"
+		include "../opencl/parallel_primitives/host"
+		include "../opencl/parallel_primitives/test"
+		include "../opencl/parallel_primitives/benchmark"
+		include "../opencl/lds_bank_conflict"
+		include "../opencl/reduce"
+		
+		
+	end
\ No newline at end of file
diff --git a/build/premake4_linux b/build/premake4_linux
new file mode 100644
index 000000000..53442a801
Binary files /dev/null and b/build/premake4_linux differ
diff --git a/build/premake4_linux64 b/build/premake4_linux64
new file mode 100644
index 000000000..4724db588
Binary files /dev/null and b/build/premake4_linux64 differ
diff --git a/build/premake4_osx b/build/premake4_osx
new file mode 100644
index 000000000..67e25d5b9
Binary files /dev/null and b/build/premake4_osx differ
diff --git a/build/stringify.bat b/build/stringify.bat
new file mode 100644
index 000000000..890f27d9d
--- /dev/null
+++ b/build/stringify.bat
@@ -0,0 +1,13 @@
+
+@echo off
+rem Regenerates the C headers that embed the OpenCL kernel sources as strings.
+rem Each call runs the "stringify" action of stringifyKernel.lua on one .cl file.
+premake4 --file=stringifyKernel.lua --kernelfile="../opencl/vector_add/VectorAddKernels.cl" --headerfile="../opencl/vector_add/VectorAddKernels.h" --stringname="vectorAddCL" stringify
+rem kernels under opencl/parallel_primitives/kernels
+premake4 --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/RadixSort32Kernels.cl" 	--headerfile="../opencl/parallel_primitives/kernels/RadixSort32KernelsCL.h" --stringname="radixSort32KernelsCL" stringify
+premake4 --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/BoundSearchKernels.cl" 	--headerfile="../opencl/parallel_primitives/kernels/BoundSearchKernelsCL.h" --stringname="boundSearchKernelsCL" stringify
+premake4 --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/PrefixScanKernels.cl" 	--headerfile="../opencl/parallel_primitives/kernels/PrefixScanKernelsCL.h" --stringname="prefixScanKernelsCL" stringify
+premake4 --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/FillKernels.cl" 				--headerfile="../opencl/parallel_primitives/kernels/FillKernelsCL.h" --stringname="fillKernelsCL" stringify
+
+rem wait for a key press before the script ends
+pause
\ No newline at end of file
diff --git a/build/stringify.sh b/build/stringify.sh
new file mode 100644
index 000000000..bb93fa405
--- /dev/null
+++ b/build/stringify.sh
@@ -0,0 +1,8 @@
+#!/bin/sh
+stringify_kernel() { ./premake4_osx --file=stringifyKernel.lua --kernelfile="$1" --headerfile="$2" --stringname="$3" stringify; } # $1 = input .cl file, $2 = output header, $3 = name of the generated C string
+stringify_kernel "../opencl/vector_add/VectorAddKernels.cl" "../opencl/vector_add/VectorAddKernels.h" "vectorAddCL"
+stringify_kernel "../opencl/parallel_primitives/kernels/RadixSort32Kernels.cl" "../opencl/parallel_primitives/kernels/RadixSort32KernelsCL.h" "radixSort32KernelsCL"
+stringify_kernel "../opencl/parallel_primitives/kernels/BoundSearchKernels.cl" "../opencl/parallel_primitives/kernels/BoundSearchKernelsCL.h" "boundSearchKernelsCL"
+stringify_kernel "../opencl/parallel_primitives/kernels/PrefixScanKernels.cl" "../opencl/parallel_primitives/kernels/PrefixScanKernelsCL.h" "prefixScanKernelsCL"
+stringify_kernel "../opencl/parallel_primitives/kernels/FillKernels.cl" "../opencl/parallel_primitives/kernels/FillKernelsCL.h" "fillKernelsCL"
+
diff --git a/build/stringifyKernel.lua b/build/stringifyKernel.lua
new file mode 100644
index 000000000..dea0a73fe
--- /dev/null
+++ b/build/stringifyKernel.lua
@@ -0,0 +1,78 @@
+
+
+function stringifyKernel(filenameIn, filenameOut, kernelMethod)
+  -- Writes filenameOut as a C header that defines `static const char* <kernelMethod>`
+  -- as a run of adjacent string literals, one per line of the text file filenameIn.
+  -- Raises an error when either file cannot be opened. Prints the input name and
+  -- its newline count when done.
+  local BUFSIZE = 1024*1024     -- 1MB
+  local f = assert(io.open(filenameIn,"r"))
+  local fw, openErr = io.open(filenameOut,"w")
+  if not fw then f:close(); error(openErr) end
+  fw:write("//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project\n")
+  fw:write("static const char* " .. kernelMethod .. "= \\\n")
+  local lc = 0   -- newline count (the unused char/word counters were dropped)
+  while true do
+    -- up to BUFSIZE bytes, plus the remainder of the line the chunk stopped in
+    local lines, rest = f:read(BUFSIZE, "*line")
+    if not lines then break end
+    -- Complete the chunk BEFORE splitting it. `rest` used to be appended only
+    -- after the split, so for inputs larger than BUFSIZE the tail of the line
+    -- straddling each chunk boundary never reached the header.
+    if rest then lines = lines .. rest .. '\n' end
+
+    local i = 0
+    local startpos = 0
+    local slen = string.len(lines)
+    while true do
+      i = string.find(lines, "\n", i+1)    -- find 'next' newline
+      -- a completed chunk ends with the newline added above: nothing is left to emit
+      if i == nil and rest and startpos > slen then break end
+      local oneline = string.sub(lines, startpos, i or slen)
+      oneline = string.gsub(oneline,"\n","")
+      oneline = '\"' .. oneline .. '\\n\"'
+      -- removes every backslash-backslash-n sequence; presumably meant for source
+      -- lines ending in a continuation backslash, which get joined with the next line
+      oneline = string.gsub(oneline,"\\\\n","")
+      fw:write(oneline .. "\n")
+      if i == nil then break end
+      startpos = i+1
+    end
+
+    -- count newlines in the chunk
+    local _,t = string.gsub(lines, "\n", "\n")
+    lc = lc + t
+  end
+  print(filenameIn .. " (" .. lc .. " lines)")
+
+  f:close()
+  fw:write(";\n")
+  fw:close()
+ end
+ 
+ newoption { -- --kernelfile=<path>: the kernel source that stringifyKernel reads
+    trigger     = "kernelfile",
+    value				=	"kernelpath",
+    description = "full path to the kernel source input file"
+  }
+
+ newoption { -- --headerfile=<path>: the header that stringifyKernel writes
+    trigger     = "headerfile",
+    value				=	"path",
+    description = "full path to the header output file"
+  }
+
+ newoption { -- --stringname=<var>: name of the generated `static const char*` variable
+    trigger     = "stringname",
+    value				=	"var",
+    description = "name of the kernel string variable"
+  }
+  
+ newaction { -- "stringify" action: passes the three options above to stringifyKernel
+   trigger     = "stringify",
+   description = "stringify kernels source code into strings",
+   execute = function ()
+    stringifyKernel( _OPTIONS["kernelfile"] , _OPTIONS["headerfile"], _OPTIONS["stringname"])    
+ 
+   end
+}
\ No newline at end of file
diff --git a/build/vs2010.bat b/build/vs2010.bat
new file mode 100644
index 000000000..584d0baa8
--- /dev/null
+++ b/build/vs2010.bat
@@ -0,0 +1,6 @@
+
+rem premake4 --with-pe  vs2010
+premake4  vs2010
+rem create the cache folder only when it is missing, so that a re-run does not print a mkdir error
+if not exist vs2010\cache mkdir vs2010\cache
+pause
\ No newline at end of file
diff --git a/build/xcode.command b/build/xcode.command
new file mode 100644
index 000000000..52a4a4a7c
--- /dev/null
+++ b/build/xcode.command
@@ -0,0 +1,4 @@
+
+cd "$(dirname "$0")" || exit 1  # run from the script's own folder; quoted so that paths with spaces work, and stop if the folder cannot be entered
+./premake4_osx xcode4  # run the bundled premake binary with the xcode4 action
+
diff --git a/opencl/basic_initialize/btOpenCLInclude.h b/opencl/basic_initialize/btOpenCLInclude.h
new file mode 100644
index 000000000..5f0e78da6
--- /dev/null
+++ b/opencl/basic_initialize/btOpenCLInclude.h
@@ -0,0 +1,44 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef BT_OPENCL_INCLUDE_H
+#define BT_OPENCL_INCLUDE_H
+
+
+#ifdef __APPLE__ // Apple: headers live under OpenCL/ instead of CL/
+#ifdef USE_MINICL // USE_MINICL replaces the vendor headers with the MiniCL ones on every platform
+#include <MiniCL/cl.h>
+#else
+#include <OpenCL/cl.h>
+#include <OpenCL/cl_ext.h> //clLogMessagesToStderrAPPLE
+#endif
+#else // not Apple
+#ifdef USE_MINICL
+#include <MiniCL/cl.h>
+#else
+#include <CL/cl.h>
+#ifdef _WIN32 // the OpenCL/OpenGL interop header is only pulled in on Windows
+#include "CL/cl_gl.h"
+#endif //_WIN32
+#endif
+#endif //__APPLE__
+
+#include <assert.h>
+#include <stdio.h>
+#define oclCHECKERROR(a, b) if((a)!=(b)) { printf("OCL Error : %d\n", (a)); assert((a) == (b)); }
+
+
+#endif //BT_OPENCL_INCLUDE_H
+
diff --git a/opencl/basic_initialize/btOpenCLUtils.cpp b/opencl/basic_initialize/btOpenCLUtils.cpp
new file mode 100644
index 000000000..af29461e7
--- /dev/null
+++ b/opencl/basic_initialize/btOpenCLUtils.cpp
@@ -0,0 +1,903 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006 - 2011 Sony Computer Entertainment Inc.
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+//original author: Roman Ponomarev
+//cleanup by Erwin Coumans
+
+#include <string.h>
+
+#ifdef _WIN32
+#pragma warning (disable:4996)
+#endif
+#include "btOpenCLUtils.h"
+//#include "btOpenCLInclude.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define BT_MAX_CL_DEVICES 16 //who needs 16 devices?
+
+#ifdef _WIN32
+#include <Windows.h>
+#endif
+
+#include <assert.h>
+#define btAssert assert
+
+
+//Preferred platform vendor string, fixed at compile time by whichever CL_PLATFORM_* macro is defined; compared against CL_PLATFORM_VENDOR when a platform is chosen
+static const char* spPlatformVendor =
+#if defined(CL_PLATFORM_MINI_CL)
+"MiniCL, SCEA";
+#elif defined(CL_PLATFORM_AMD)
+"Advanced Micro Devices, Inc.";
+#elif defined(CL_PLATFORM_NVIDIA)
+"NVIDIA Corporation";
+#elif defined(CL_PLATFORM_INTEL)
+"Intel(R) Corporation";
+#else // no vendor macro defined
+"Unknown Vendor";
+#endif
+
+#ifndef CL_PLATFORM_MINI_CL
+#ifdef _WIN32
+#include "CL/cl_gl.h"
+#endif //_WIN32
+#endif
+
+bool gDebugForceLoadingFromSource = false; // debug switch, off by default. NOTE(review): no reader is visible nearby; presumably consulted by the kernel-loading code - confirm
+bool gDebugSkipLoadingBinary = false; // debug switch, off by default. NOTE(review): same remark as for the flag above
+
+void MyFatalBreakAPPLE(const char* errstr, const void* private_info, size_t cb, void* user_data)
+{
+    // Error callback handed to clCreateContext on Apple builds.
+    // Echoes the message, then classifies it: text containing "Warning"
+    // is reported as a warning, anything else as an error followed by
+    // btAssert(0). private_info, cb and user_data are not used.
+    printf("Error: %s\n", errstr);
+
+    // decide between warning and error by looking for the word "Warning"
+    const bool isWarning = (strstr(errstr, "Warning") != NULL);
+
+    if (!isWarning)
+    {
+        printf("error\n");
+        btAssert(0);
+        return;
+    }
+
+    // warnings are only logged
+    printf("warning\n");
+}
+
+
+int btOpenCLUtils_getNumPlatforms(cl_int* pErrNum)
+{
+	// Returns the number of OpenCL platforms reported by clGetPlatformIDs.
+	// pErrNum (optional) receives the status of that call on success as well
+	// as on failure; it used to be written on failure only, which left the
+	// caller's variable untouched (possibly uninitialised) otherwise.
+	cl_platform_id pPlatforms[10] = { 0 };
+	cl_uint numPlatforms = 0;
+	cl_int ciErrNum = clGetPlatformIDs(10, pPlatforms, &numPlatforms);
+	//cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms);
+
+	// report the status unconditionally
+	if(pErrNum != NULL)
+		*pErrNum = ciErrNum;
+
+	return numPlatforms;
+}
+
+const char* btOpenCLUtils_getSdkVendorName() // returns the compile-time vendor string spPlatformVendor; nothing is queried from OpenCL
+{
+	return spPlatformVendor;
+}
+
+cl_platform_id btOpenCLUtils_getPlatform(int platformIndex0, cl_int* pErrNum)
+{
+	// Returns the platform at index platformIndex0 in the order reported by clGetPlatformIDs,
+	// or 0 when the index is out of range, a query fails or the temporary list cannot be allocated.
+	// pErrNum (optional) is written only when an OpenCL call fails (now including the count query).
+	cl_platform_id platform = 0;
+	cl_uint numPlatforms = 0; // stays 0 (nothing to index) if the count query fails
+	cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms);
+	// a negative index wraps to a huge unsigned value and is rejected by the bounds test below
+	unsigned int platformIndex = (unsigned int )platformIndex0;
+	if (ciErrNum == CL_SUCCESS && platformIndex<numPlatforms)
+	{
+		cl_platform_id* platforms = (cl_platform_id*) malloc (sizeof(cl_platform_id)*numPlatforms);
+		if (platforms == NULL)
+			return platform;
+		ciErrNum = clGetPlatformIDs(numPlatforms, platforms, NULL);
+		if (ciErrNum == CL_SUCCESS)
+			platform = platforms[platformIndex];
+		free (platforms); // released on the failure path too (it used to leak there)
+	}
+
+	if(ciErrNum != CL_SUCCESS && pErrNum != NULL)
+		*pErrNum = ciErrNum;
+	return platform;
+}
+
+void btOpenCLUtils::getPlatformInfo(cl_platform_id platform, btOpenCLPlatformInfo* platformInfo) // fills platformInfo with the vendor, name and version strings of `platform`
+{
+	cl_int ciErrNum;
+	ciErrNum = clGetPlatformInfo(	platform,CL_PLATFORM_VENDOR,BT_MAX_STRING_LENGTH,platformInfo->m_platformVendor,NULL);
+	oclCHECKERROR(ciErrNum,CL_SUCCESS); // each query is checked right away; NOTE(review): presumably the print-and-assert macro from btOpenCLInclude.h
+	ciErrNum = clGetPlatformInfo(	platform,CL_PLATFORM_NAME,BT_MAX_STRING_LENGTH,platformInfo->m_platformName,NULL);
+	oclCHECKERROR(ciErrNum,CL_SUCCESS);
+	ciErrNum = clGetPlatformInfo(	platform,CL_PLATFORM_VERSION,BT_MAX_STRING_LENGTH,platformInfo->m_platformVersion,NULL);
+	oclCHECKERROR(ciErrNum,CL_SUCCESS);
+}
+
+void btOpenCLUtils_printPlatformInfo(cl_platform_id platform)
+{
+	// Fetch vendor, name and version of the platform and write them to stdout in a single call.
+	btOpenCLPlatformInfo info;
+	btOpenCLUtils::getPlatformInfo (platform, &info);
+	printf("Platform info:\n"
+		"  CL_PLATFORM_VENDOR: \t\t\t%s\n" "  CL_PLATFORM_NAME: \t\t\t%s\n" "  CL_PLATFORM_VERSION: \t\t\t%s\n",
+		info.m_platformVendor, info.m_platformName, info.m_platformVersion);
+}
+
+
+
+cl_context btOpenCLUtils_createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLContext, void* pGLDC, int preferredDeviceIndex, int preferredPlatformIndex)
+{
+	// Creates an OpenCL context on `platform` for the devices of type `deviceType`.
+	//  - pGLContext set: tries the devices one by one and keeps the first context that can be created
+	//  - else, preferredDeviceIndex in range: a context for that single device
+	//  - else: one context spanning all devices found
+	// Returns 0 when the device query fails, no device exists, or context creation fails.
+	// pErrNum (optional) receives the last OpenCL status on every path, including the early
+	// returns that used to leave it untouched. preferredPlatformIndex is not used here.
+	cl_context retContext = 0;
+	cl_int ciErrNum=0;
+	cl_device_id devices[BT_MAX_CL_DEVICES];
+	const cl_uint num_entries = BT_MAX_CL_DEVICES;
+	cl_uint num_devices = 0;
+	cl_context_properties* cprops;
+	/*
+	* If we could find our platform, use it. Otherwise pass a NULL and get whatever the
+	* implementation thinks we should be using.
+	*/
+	cl_context_properties cps[7] = {0,0,0,0,0,0,0};
+	cps[0] = CL_CONTEXT_PLATFORM;
+	cps[1] = (cl_context_properties)platform;
+#ifdef _WIN32
+	if (pGLContext && pGLDC)
+	{
+		// add the OpenGL context and device context handles to the context properties
+		cps[2] = CL_GL_CONTEXT_KHR;
+		cps[3] = (cl_context_properties)pGLContext;
+		cps[4] = CL_WGL_HDC_KHR;
+		cps[5] = (cl_context_properties)pGLDC;
+	}
+#endif //_WIN32
+	cprops = (NULL == platform) ? NULL : cps;
+	ciErrNum = clGetDeviceIDs(
+		platform,
+		deviceType,
+ 		num_entries,
+ 		devices,
+ 		&num_devices);
+
+	if (ciErrNum<0 || !num_devices)
+	{
+		if (ciErrNum<0)
+			printf("clGetDeviceIDs returned %d\n",ciErrNum);
+		if(pErrNum != NULL)
+			*pErrNum = ciErrNum;
+		return 0;
+	}
+	// clGetDeviceIDs reports how many devices match, which can exceed the capacity of
+	// devices[]; only the first num_entries ids were actually written.
+	if (num_devices > num_entries)
+		num_devices = num_entries;
+
+	if (pGLContext)
+	{
+		//search for the GPU that relates to the OpenCL context
+		unsigned int i;
+		for (i=0;i<num_devices;i++)
+		{
+			retContext = clCreateContext(cprops,1,&devices[i],NULL,NULL,&ciErrNum);
+			if (ciErrNum==CL_SUCCESS)
+				break;
+		}
+	}
+	else if (preferredDeviceIndex>=0 && (unsigned int)preferredDeviceIndex<num_devices)
+	{
+		//create a context of the preferred device index
+		retContext = clCreateContext(cprops,1,&devices[preferredDeviceIndex],NULL,NULL,&ciErrNum);
+	}
+	else
+	{
+		//create a context of all devices
+#if defined (__APPLE__)
+		retContext = clCreateContext(cprops,num_devices,devices,MyFatalBreakAPPLE,NULL,&ciErrNum);
+#else
+		printf("numDevices=%u\n",num_devices);
+		retContext = clCreateContext(cprops,num_devices,devices,NULL,NULL,&ciErrNum);
+#endif
+	}
+	if(pErrNum != NULL)
+		*pErrNum = ciErrNum;
+	return retContext;
+}
+
+cl_context btOpenCLUtils_createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLContext, void* pGLDC , int preferredDeviceIndex, int preferredPlatformIndex, cl_platform_id* retPlatformId)
+{
+	// Walks the available platforms and returns the first context that
+	// btOpenCLUtils_createContextFromPlatform manages to create for deviceType.
+	// Order: a platform whose vendor string equals spPlatformVendor is swapped to the front; the
+	// scan stops once preferredPlatformIndex (when >= 0) is reached, and that platform goes first.
+	// retPlatformId (optional) receives the platform the context was created on.
+	// Returns NULL/0 on failure; pErrNum (optional) receives the failing status.
+	cl_uint numPlatforms = 0;
+	cl_context retContext = 0;
+	unsigned int i;
+	cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms);
+	if(ciErrNum != CL_SUCCESS)
+	{
+		if(pErrNum != NULL) *pErrNum = ciErrNum;
+		return NULL;
+	}
+	if(numPlatforms > 0)
+	{
+		cl_platform_id* platforms = (cl_platform_id*) malloc (sizeof(cl_platform_id)*numPlatforms);
+		if (platforms == NULL)
+		{
+			if(pErrNum != NULL) *pErrNum = CL_OUT_OF_HOST_MEMORY;
+			return NULL;
+		}
+		ciErrNum = clGetPlatformIDs(numPlatforms, platforms, NULL);
+		if(ciErrNum != CL_SUCCESS)
+		{
+			if(pErrNum != NULL)
+				*pErrNum = ciErrNum;
+			free(platforms);
+			return NULL;
+		}
+		for ( i = 0; i < numPlatforms; ++i)
+		{
+			char pbuf[128];
+			ciErrNum = clGetPlatformInfo(	platforms[i],
+				CL_PLATFORM_VENDOR,
+				sizeof(pbuf),
+				pbuf,
+				NULL);
+			if(ciErrNum != CL_SUCCESS)
+			{
+				if(pErrNum != NULL) *pErrNum = ciErrNum;
+				free(platforms); // this early return used to leak the platform list
+				return NULL;
+			}
+			if (preferredPlatformIndex>=0 && i==(unsigned int)preferredPlatformIndex)
+			{
+				cl_platform_id tmpPlatform = platforms[0];
+				platforms[0] = platforms[i];
+				platforms[i] = tmpPlatform;
+				break;
+			} else
+			{
+				if(!strcmp(pbuf, spPlatformVendor))
+				{
+					cl_platform_id tmpPlatform = platforms[0];
+					platforms[0] = platforms[i];
+					platforms[i] = tmpPlatform;
+				}
+			}
+		}
+		for (i = 0; i < numPlatforms; ++i)
+		{
+			cl_platform_id platform = platforms[i];
+			assert(platform);
+			retContext = btOpenCLUtils_createContextFromPlatform(platform,deviceType,pErrNum,pGLContext,pGLDC,preferredDeviceIndex,preferredPlatformIndex);
+			if (retContext)
+			{
+//				printf("OpenCL platform details:\n");
+				btOpenCLPlatformInfo platformInfo;
+				btOpenCLUtils::getPlatformInfo(platform, &platformInfo);
+				if (retPlatformId)
+					*retPlatformId = platform;
+				break;
+			}
+		}
+		free (platforms);
+	}
+	return retContext;
+}
+
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the id of the nth device from the context
+//!
+//! @return the device id, or (cl_device_id)-1 when deviceIndex is out of range
+//!         or the device list cannot be retrieved
+//! @param cxMainContext         OpenCL context
+//! @param deviceIndex           zero-based index of the device of interest
+//////////////////////////////////////////////////////////////////////////////
+cl_device_id btOpenCLUtils_getDevice(cl_context cxMainContext, int deviceIndex)
+{
+	assert(cxMainContext);
+	size_t szParmDataBytes = 0;
+	cl_device_id* cdDevices;
+	cl_device_id device = (cl_device_id)-1;
+	// size (in bytes) of the list of devices associated with the context
+	clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
+
+	// valid indices are 0 .. count-1; the test used '<', so deviceIndex == count
+	// slipped through and read one element past the array
+	if( szParmDataBytes / sizeof(cl_device_id) <= (unsigned int)deviceIndex ) {
+		return device;
+	}
+	cdDevices = (cl_device_id*) malloc(szParmDataBytes);
+	if (cdDevices == NULL)
+		return device;
+	if (clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL) == CL_SUCCESS)
+		device = cdDevices[deviceIndex];
+	free(cdDevices);
+
+	return device;
+}
+
+int btOpenCLUtils_getNumDevices(cl_context cxMainContext)
+{
+	// Number of devices attached to the context; 0 when the size query fails.
+	size_t szParamDataBytes = 0; // used to be read uninitialised when the query failed
+	if (clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, 0, NULL, &szParamDataBytes) != CL_SUCCESS)
+		return 0;
+	return (int)(szParamDataBytes / sizeof(cl_device_id));
+}
+
+
+
+void btOpenCLUtils::getDeviceInfo(cl_device_id device, btOpenCLDeviceInfo* info) // fills *info with one clGetDeviceInfo query per field; the status codes of the queries are not checked
+{
+	// CL_DEVICE_NAME
+	clGetDeviceInfo(device, CL_DEVICE_NAME, BT_MAX_STRING_LENGTH, &info->m_deviceName, NULL);
+
+	// CL_DEVICE_VENDOR
+	clGetDeviceInfo(device, CL_DEVICE_VENDOR, BT_MAX_STRING_LENGTH, &info->m_deviceVendor, NULL);
+
+	// CL_DRIVER_VERSION
+	clGetDeviceInfo(device, CL_DRIVER_VERSION, BT_MAX_STRING_LENGTH, &info->m_driverVersion, NULL);
+
+	// CL_DEVICE_TYPE
+	clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(cl_device_type), &info->m_deviceType, NULL);
+
+	// CL_DEVICE_MAX_COMPUTE_UNITS
+	clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(info->m_computeUnits), &info->m_computeUnits, NULL);
+
+	// CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS
+	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(info->m_workitemDims), &info->m_workitemDims, NULL);
+
+	// CL_DEVICE_MAX_WORK_ITEM_SIZES
+	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(info->m_workItemSize), &info->m_workItemSize, NULL);
+
+	// CL_DEVICE_MAX_WORK_GROUP_SIZE
+	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(info->m_workgroupSize), &info->m_workgroupSize, NULL);
+
+	// CL_DEVICE_MAX_CLOCK_FREQUENCY
+	clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(info->m_clockFrequency), &info->m_clockFrequency, NULL);
+
+	// CL_DEVICE_ADDRESS_BITS
+	clGetDeviceInfo(device, CL_DEVICE_ADDRESS_BITS, sizeof(info->m_addressBits), &info->m_addressBits, NULL);
+
+	// CL_DEVICE_MAX_MEM_ALLOC_SIZE
+	clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(info->m_maxMemAllocSize), &info->m_maxMemAllocSize, NULL);
+
+	// CL_DEVICE_GLOBAL_MEM_SIZE
+	clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(info->m_globalMemSize), &info->m_globalMemSize, NULL);
+
+	// CL_DEVICE_ERROR_CORRECTION_SUPPORT
+	clGetDeviceInfo(device, CL_DEVICE_ERROR_CORRECTION_SUPPORT, sizeof(info->m_errorCorrectionSupport), &info->m_errorCorrectionSupport, NULL);
+
+	// CL_DEVICE_LOCAL_MEM_TYPE
+	clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof(info->m_localMemType), &info->m_localMemType, NULL);
+
+	// CL_DEVICE_LOCAL_MEM_SIZE
+	clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(info->m_localMemSize), &info->m_localMemSize, NULL);
+
+	// CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE
+	clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(info->m_constantBufferSize), &info->m_constantBufferSize, NULL);
+
+	// CL_DEVICE_QUEUE_PROPERTIES
+	clGetDeviceInfo(device, CL_DEVICE_QUEUE_PROPERTIES, sizeof(info->m_queueProperties), &info->m_queueProperties, NULL);
+
+	// CL_DEVICE_IMAGE_SUPPORT
+	clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(info->m_imageSupport), &info->m_imageSupport, NULL);
+
+	// CL_DEVICE_MAX_READ_IMAGE_ARGS
+	clGetDeviceInfo(device, CL_DEVICE_MAX_READ_IMAGE_ARGS, sizeof(info->m_maxReadImageArgs), &info->m_maxReadImageArgs, NULL);
+
+	// CL_DEVICE_MAX_WRITE_IMAGE_ARGS
+	clGetDeviceInfo(device, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, sizeof(info->m_maxWriteImageArgs), &info->m_maxWriteImageArgs, NULL);
+
+	// CL_DEVICE_IMAGE2D_MAX_WIDTH, CL_DEVICE_IMAGE2D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_WIDTH, CL_DEVICE_IMAGE3D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_DEPTH (each queried with sizeof(size_t))
+	clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &info->m_image2dMaxWidth, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &info->m_image2dMaxHeight, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof(size_t), &info->m_image3dMaxWidth, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof(size_t), &info->m_image3dMaxHeight, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof(size_t), &info->m_image3dMaxDepth, NULL);
+
+	// CL_DEVICE_EXTENSIONS: the extension string is only fetched here, it is neither parsed nor logged
+	clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, BT_MAX_STRING_LENGTH, &info->m_deviceExtensions, NULL);
+
+	// CL_DEVICE_PREFERRED_VECTOR_WIDTH_<type> (each queried with sizeof(cl_uint))
+	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, sizeof(cl_uint), &info->m_vecWidthChar, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, sizeof(cl_uint), &info->m_vecWidthShort, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), &info->m_vecWidthInt, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, sizeof(cl_uint), &info->m_vecWidthLong, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, sizeof(cl_uint), &info->m_vecWidthFloat, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof(cl_uint), &info->m_vecWidthDouble, NULL);
+}
+
+
+// Queries the properties of 'device' through btOpenCLUtils::getDeviceInfo and prints them to stdout.
+void btOpenCLUtils_printDeviceInfo(cl_device_id device)
+{
+	btOpenCLDeviceInfo info;
+	memset(&info, 0, sizeof(info));	// getDeviceInfo ignores query failures: start from zeros instead of uninitialized memory
+	btOpenCLUtils::getDeviceInfo(device,&info);
+	printf("Device Info:\n");
+	printf("  CL_DEVICE_NAME: \t\t\t%s\n", info.m_deviceName);
+	printf("  CL_DEVICE_VENDOR: \t\t\t%s\n", info.m_deviceVendor);
+	printf("  CL_DRIVER_VERSION: \t\t\t%s\n", info.m_driverVersion);
+
+	if( info.m_deviceType & CL_DEVICE_TYPE_CPU )
+		printf("  CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_CPU");
+	if( info.m_deviceType & CL_DEVICE_TYPE_GPU )
+		printf("  CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_GPU");
+	if( info.m_deviceType & CL_DEVICE_TYPE_ACCELERATOR )
+		printf("  CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR");
+	if( info.m_deviceType & CL_DEVICE_TYPE_DEFAULT )
+		printf("  CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT");
+
+	printf("  CL_DEVICE_MAX_COMPUTE_UNITS:\t\t%u\n", info.m_computeUnits);
+	printf("  CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:\t%u\n", (unsigned int)info.m_workitemDims);	// size_t fields are cast: %u expects unsigned int
+	printf("  CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", (unsigned int)info.m_workItemSize[0], (unsigned int)info.m_workItemSize[1], (unsigned int)info.m_workItemSize[2]);
+	printf("  CL_DEVICE_MAX_WORK_GROUP_SIZE:\t%u\n", (unsigned int)info.m_workgroupSize);
+	printf("  CL_DEVICE_MAX_CLOCK_FREQUENCY:\t%u MHz\n", info.m_clockFrequency);
+	printf("  CL_DEVICE_ADDRESS_BITS:\t\t%u\n", info.m_addressBits);
+	printf("  CL_DEVICE_MAX_MEM_ALLOC_SIZE:\t\t%u MByte\n", (unsigned int)(info.m_maxMemAllocSize/ (1024 * 1024)));
+	printf("  CL_DEVICE_GLOBAL_MEM_SIZE:\t\t%u MByte\n", (unsigned int)(info.m_globalMemSize/ (1024 * 1024)));
+	printf("  CL_DEVICE_ERROR_CORRECTION_SUPPORT:\t%s\n", info.m_errorCorrectionSupport== CL_TRUE ? "yes" : "no");
+	printf("  CL_DEVICE_LOCAL_MEM_TYPE:\t\t%s\n", info.m_localMemType == 1 ? "local" : "global");
+	printf("  CL_DEVICE_LOCAL_MEM_SIZE:\t\t%u KByte\n", (unsigned int)(info.m_localMemSize / 1024));
+	printf("  CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:\t%u KByte\n", (unsigned int)(info.m_constantBufferSize / 1024));
+	if( info.m_queueProperties  & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE )
+		printf("  CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE");
+	if( info.m_queueProperties & CL_QUEUE_PROFILING_ENABLE )
+		printf("  CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_PROFILING_ENABLE");
+
+	printf("  CL_DEVICE_IMAGE_SUPPORT:\t\t%u\n", info.m_imageSupport);
+
+	printf("  CL_DEVICE_MAX_READ_IMAGE_ARGS:\t%u\n", info.m_maxReadImageArgs);
+	printf("  CL_DEVICE_MAX_WRITE_IMAGE_ARGS:\t%u\n", info.m_maxWriteImageArgs);
+	printf("\n  CL_DEVICE_IMAGE <dim>");
+	printf("\t\t\t2D_MAX_WIDTH\t %u\n", (unsigned int)info.m_image2dMaxWidth);
+	printf("\t\t\t\t\t2D_MAX_HEIGHT\t %u\n", (unsigned int)info.m_image2dMaxHeight);
+	printf("\t\t\t\t\t3D_MAX_WIDTH\t %u\n", (unsigned int)info.m_image3dMaxWidth);
+	printf("\t\t\t\t\t3D_MAX_HEIGHT\t %u\n", (unsigned int)info.m_image3dMaxHeight);
+	printf("\t\t\t\t\t3D_MAX_DEPTH\t %u\n", (unsigned int)info.m_image3dMaxDepth);
+	if (info.m_deviceExtensions[0] != 0)	// test the first character: the array itself is never a null pointer
+		printf("\n  CL_DEVICE_EXTENSIONS:%s\n",info.m_deviceExtensions);
+	else
+		printf("  CL_DEVICE_EXTENSIONS: None\n");
+	printf("  CL_DEVICE_PREFERRED_VECTOR_WIDTH_<t>\t");
+	printf("CHAR %u, SHORT %u, INT %u,LONG %u, FLOAT %u, DOUBLE %u\n\n\n",
+		info.m_vecWidthChar, info.m_vecWidthShort, info.m_vecWidthInt, info.m_vecWidthLong,info.m_vecWidthFloat, info.m_vecWidthDouble);
+}
+
+
+// Returns the part of 'name' that follows the last occurrence of 'pattern' (all of 'name' when it does not occur).
+// The result points into 'name'; nothing is allocated. Used below to cut directory components off a path.
+static const char* strip2(const char* name, const char* pattern)
+{
+	size_t const patlen = strlen(pattern);
+	const char* tail = name;
+	const char* hit;
+	if (patlen == 0)	// strstr matches an empty pattern at every position, so the search would never advance
+		return name;
+	while ((hit = strstr(tail, pattern)) != 0)
+		tail = hit + patlen;	// continue searching just past this occurrence
+	return tail;
+}
+
+// Compiles an OpenCL program for 'device' from 'kernelSourceOrg', or from the file 'clFileNameForCaching' when that is 0 or
+// gDebugForceLoadingFromSource is set. On _WIN32 an up-to-date binary in "cache/" is reused and a fresh build may be written
+// back there. Returns 0 on failure (status in *pErrNum when non-null); on success *pErrNum is set to CL_SUCCESS.
+cl_program btOpenCLUtils_compileCLProgramFromString(cl_context clContext, cl_device_id device, const char* kernelSourceOrg, cl_int* pErrNum, const char* additionalMacrosArg , const char* clFileNameForCaching)
+{
+	const char* additionalMacros = additionalMacrosArg?additionalMacrosArg:"";
+	cl_program m_cpProgram=0;
+	cl_int status;
+#ifdef _WIN32
+	char binaryFileName[BT_MAX_STRING_LENGTH] = "";
+	// binaryFileName stays empty unless the cache lookup below computes it; the write-back step checks for that
+	if (clFileNameForCaching && !(gDebugSkipLoadingBinary||gDebugForceLoadingFromSource) )
+	{
+
+		char deviceName[256];
+		char driverVersion[256];
+		const char* strippedName;
+		int fileUpToDate = 0;
+		int binaryFileValid=0;
+		FILETIME modtimeBinary;
+
+		clGetDeviceInfo(device, CL_DEVICE_NAME, 256, &deviceName, NULL);
+		clGetDeviceInfo(device, CL_DRIVER_VERSION, 256, &driverVersion, NULL);
+
+		// keep only the file name: cut everything up to the last backslash and then up to the last slash
+		strippedName = strip2(clFileNameForCaching,"\\");
+		strippedName = strip2(strippedName,"/");
+
+#ifdef _WIN32
+		sprintf_s(binaryFileName,BT_MAX_STRING_LENGTH,"cache/%s.%s.%s.bin",strippedName, deviceName,driverVersion );
+#else
+		sprintf(binaryFileName,"cache/%s.%s.%s.bin",strippedName, deviceName,driverVersion );
+#endif
+
+
+		//printf("searching for %s\n", binaryFileName);
+
+
+
+
+		CreateDirectory("cache",0);
+		{
+			// step 1: does a cached binary exist, and what is its modification time?
+			HANDLE binaryFileHandle = CreateFile(binaryFileName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
+			if (binaryFileHandle ==INVALID_HANDLE_VALUE)
+			{
+				DWORD errorCode;
+				errorCode = GetLastError();
+				switch (errorCode)
+				{
+				case ERROR_FILE_NOT_FOUND:
+					{
+						printf("\nCached file not found %s\n", binaryFileName);
+						break;
+					}
+				case ERROR_PATH_NOT_FOUND:
+					{
+						printf("\nCached file path not found %s\n", binaryFileName);
+						break;
+					}
+				default:
+					{
+						printf("\nFailed reading cached file with errorCode = %d\n", errorCode);
+					}
+				}
+			} else
+			{
+				if (GetFileTime(binaryFileHandle, NULL, NULL, &modtimeBinary)==0)
+				{
+					DWORD errorCode;
+					errorCode = GetLastError();
+					printf("\nGetFileTime errorCode = %d\n", errorCode);
+				} else
+				{
+					binaryFileValid = 1;
+				}
+				CloseHandle(binaryFileHandle);
+			}
+			// step 2: the cached binary only counts as up to date when the source file is not newer than it
+			if (binaryFileValid)
+			{
+				HANDLE srcFileHandle = CreateFile(clFileNameForCaching,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
+
+				if (srcFileHandle==INVALID_HANDLE_VALUE)
+				{
+					const char* prefix[]={"../","../../","../../../","../../../../"};
+					for (int i=0;(srcFileHandle==INVALID_HANDLE_VALUE) && i<(int)(sizeof(prefix)/sizeof(prefix[0]));i++)	// try every prefix
+					{
+						char relativeFileName[1024];	// NOTE(review): the sprintf below is unbounded; a very long file name would overflow
+						sprintf(relativeFileName,"%s%s",prefix[i],clFileNameForCaching);
+						srcFileHandle = CreateFile(relativeFileName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
+					}
+
+				}
+
+
+				if (srcFileHandle!=INVALID_HANDLE_VALUE)
+				{
+					FILETIME modtimeSrc;
+					if (GetFileTime(srcFileHandle, NULL, NULL, &modtimeSrc)==0)
+					{
+						DWORD errorCode;
+						errorCode = GetLastError();
+						printf("\nGetFileTime errorCode = %d\n", errorCode);
+					}	// modtimeSrc is only valid when GetFileTime succeeded, hence the 'else' below
+					else if (  ( modtimeSrc.dwHighDateTime < modtimeBinary.dwHighDateTime)
+						||(( modtimeSrc.dwHighDateTime == modtimeBinary.dwHighDateTime)&&(modtimeSrc.dwLowDateTime <= modtimeBinary.dwLowDateTime)))
+					{
+						fileUpToDate=1;
+					} else
+					{
+						printf("\nCached binary file out-of-date (%s)\n",binaryFileName);
+					}
+					CloseHandle(srcFileHandle);
+				}
+				else
+				{
+#ifdef _DEBUG
+					DWORD errorCode;
+					errorCode = GetLastError();
+					switch (errorCode)
+					{
+					case ERROR_FILE_NOT_FOUND:
+						{
+							printf("\nSrc file not found %s\n", clFileNameForCaching);
+							break;
+						}
+					case ERROR_PATH_NOT_FOUND:
+						{
+							printf("\nSrc path not found %s\n", clFileNameForCaching);
+							break;
+						}
+					default:
+						{
+							printf("\nnSrc file reading errorCode = %d\n", errorCode);
+						}
+					}
+
+					//we should make sure the src file exists so we can verify the timestamp with binary
+					assert(0);
+					fileUpToDate = false;
+#else
+					//if we cannot find the source, assume it is OK in release builds
+					fileUpToDate = true;
+#endif
+				}
+			}
+
+
+		}
+		// step 3: load the cached binary and build it; on failure fall through to a build from source
+		if( fileUpToDate)
+		{
+#ifdef _WIN32
+			FILE* file;
+			if (fopen_s(&file,binaryFileName, "rb")!=0)
+				file=0;
+#else
+			FILE* file = fopen(binaryFileName, "rb");
+#endif
+
+			if (file)
+			{
+				size_t binarySize=0;
+				char* binary =0;
+
+				fseek( file, 0L, SEEK_END );
+				binarySize = ftell( file );
+				rewind( file );
+				binary = (char*)malloc(sizeof(char)*binarySize);
+				fread( binary, sizeof(char), binarySize, file );
+				fclose( file );
+
+				m_cpProgram = clCreateProgramWithBinary( clContext, 1,&device, &binarySize, (const unsigned char**)&binary, 0, &status );
+				btAssert( status == CL_SUCCESS );
+				status = clBuildProgram( m_cpProgram, 1, &device, additionalMacros, 0, 0 );
+				btAssert( status == CL_SUCCESS );
+				if( status != CL_SUCCESS )
+				{
+					char *build_log;
+					size_t ret_val_size = 0;	// stays 0 when the size query fails, e.g. because no program was created
+					clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
+					build_log = (char*)malloc(sizeof(char)*(ret_val_size+1));
+					clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
+					build_log[ret_val_size] = '\0';
+					printf("%s\n", build_log);
+					free (build_log);
+					btAssert(0);
+					if (m_cpProgram) clReleaseProgram(m_cpProgram);	// do not leak the program that failed to build
+					m_cpProgram = 0;
+				}
+				free (binary);
+			}
+		}
+
+	}
+#endif //_WIN32
+
+	if (!m_cpProgram)
+	{
+
+		cl_int localErrNum;
+		char* compileFlags;
+		int flagsize;
+		char* loadedKernelSource = 0;	// heap copy of the kernel file (if one is read below); owned by this function
+
+
+		const char* kernelSource = kernelSourceOrg;
+
+		if (!kernelSourceOrg || gDebugForceLoadingFromSource)
+		{
+			if (clFileNameForCaching)
+			{
+
+				FILE* file = fopen(clFileNameForCaching, "rb");
+				//in many cases the relative path is a few levels up the directory hierarchy, so try it
+				if (!file)
+				{
+					const char* prefix[]={"../","../../","../../../","../../../../"};
+					for (int i=0;!file && i<(int)(sizeof(prefix)/sizeof(prefix[0]));i++)	// try every prefix
+					{
+						char relativeFileName[1024];	// NOTE(review): the sprintf below is unbounded; a very long file name would overflow
+						sprintf(relativeFileName,"%s%s",prefix[i],clFileNameForCaching);
+						file = fopen(relativeFileName, "rb");
+					}
+				}
+
+				if (file)
+				{
+					char* kernelSrc=0;
+					fseek( file, 0L, SEEK_END );
+					int kernelSize = ftell( file );
+					rewind( file );
+					kernelSrc = (char*)malloc(kernelSize+1);
+					size_t readBytes = fread((void*)kernelSrc,1,kernelSize, file);
+					kernelSrc[readBytes] = 0;	// terminate after the bytes actually read
+					fclose(file);
+					kernelSource = loadedKernelSource = kernelSrc;
+				}
+			}
+		}
+
+		size_t program_length = kernelSource ? strlen(kernelSource) : 0;
+#ifdef MAC //or __APPLE__?
+		char* flags = "-cl-mad-enable -DMAC -DGUID_ARG";
+#else
+		//const char* flags = "-DGUID_ARG= -fno-alias";
+		const char* flags = "-DGUID_ARG= ";
+#endif
+
+		m_cpProgram = clCreateProgramWithSource(clContext, 1, (const char**)&kernelSource, &program_length, &localErrNum);
+		free(loadedKernelSource);	// the source text has been loaded into the program object; free(0) is a no-op
+		if (localErrNum!= CL_SUCCESS)
+		{
+			if (pErrNum)
+				*pErrNum = localErrNum;
+			return 0;
+		}
+
+		// Build the program with 'mad' Optimization option
+
+
+
+        		flagsize = sizeof(char)*(strlen(additionalMacros) + strlen(flags) + 5);
+		compileFlags = (char*) malloc(flagsize);
+#ifdef _WIN32
+		sprintf_s(compileFlags,flagsize, "%s %s", flags, additionalMacros);
+#else
+		sprintf(compileFlags, "%s %s", flags, additionalMacros);
+#endif
+		localErrNum = clBuildProgram(m_cpProgram, 1, &device, compileFlags, NULL, NULL);
+		if (localErrNum!= CL_SUCCESS)
+		{
+			char *build_log;
+			size_t ret_val_size = 0;	// stays 0 when the size query fails
+			clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
+			build_log = (char*) malloc(sizeof(char)*(ret_val_size+1));
+			clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
+
+			// to be careful, terminate with \0
+			// there's no information in the reference whether the string is 0 terminated or not
+			build_log[ret_val_size] = '\0';
+			free(compileFlags);	// this path returns early: release what was allocated above
+			clReleaseProgram(m_cpProgram);
+			printf("Error in clBuildProgram, Line %u in file %s, Log: \n%s\n !!!\n\n", __LINE__, __FILE__, build_log);
+			free (build_log);
+			if (pErrNum)
+				*pErrNum = localErrNum;
+			return 0;
+		}
+
+#ifdef _WIN32
+
+		if( clFileNameForCaching && binaryFileName[0] )	// only when the cache file name was computed above
+		{	//	write to binary
+
+			cl_uint numAssociatedDevices;
+			status = clGetProgramInfo( m_cpProgram, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &numAssociatedDevices, 0 );
+			btAssert( status == CL_SUCCESS );
+			if (numAssociatedDevices==1)
+			{
+
+				size_t binarySize;
+				char* binary ;
+
+				status = clGetProgramInfo( m_cpProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binarySize, 0 );
+				btAssert( status == CL_SUCCESS );
+
+				binary = (char*)malloc(sizeof(char)*binarySize);
+
+				status = clGetProgramInfo( m_cpProgram, CL_PROGRAM_BINARIES, sizeof(char*), &binary, 0 );
+				btAssert( status == CL_SUCCESS );
+
+				{
+					FILE* file=0;
+#ifdef _WIN32
+					if (fopen_s(&file,binaryFileName, "wb")!=0)
+						file=0;
+#else
+					file = fopen(binaryFileName, "wb");
+#endif
+					if (file)
+					{
+						fwrite( binary, sizeof(char), binarySize, file );
+						fclose( file );
+					} else
+					{
+						printf("cannot write file %s\n", binaryFileName);
+					}
+				}
+
+				free (binary);
+			}
+		}
+#endif //_WIN32
+
+		free(compileFlags);
+	}
+	if (pErrNum) *pErrNum = CL_SUCCESS;	// every failure path has returned above; matches btOpenCLUtils_compileCLKernelFromString
+	return m_cpProgram;
+}
+
+
+// Creates kernel 'kernelName' from 'prog', or (when 'prog' is 0) from a temporary program that is
+// compiled from 'kernelSource' with 'additionalMacros' and released again before returning.
+// Returns 0 on failure; the OpenCL status is stored in *pErrNum when pErrNum is non-null.
+cl_kernel btOpenCLUtils_compileCLKernelFromString(cl_context clContext, cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum, cl_program prog, const char* additionalMacros )
+{
+	cl_kernel kernel;
+	cl_int localErrNum;
+	cl_program m_cpProgram = prog;
+
+	printf("compiling kernel %s ",kernelName);
+
+	if (!m_cpProgram)
+	{
+		m_cpProgram = btOpenCLUtils_compileCLProgramFromString(clContext,device,kernelSource,pErrNum, additionalMacros,0);
+	}
+
+	// Create the kernel
+	kernel = clCreateKernel(m_cpProgram, kernelName, &localErrNum);
+	if (localErrNum != CL_SUCCESS)
+	{
+		printf("Error in clCreateKernel, Line %u in file %s, cannot find kernel function %s !!!\n\n", __LINE__, __FILE__, kernelName);
+        assert(0);
+		if (!prog && m_cpProgram)
+			clReleaseProgram(m_cpProgram);	// do not leak the temporary program on the failure path
+		if (pErrNum)
+			*pErrNum = localErrNum;
+		return 0;
+	}
+
+	// a caller-supplied program stays owned by the caller; only the temporary one is released
+	if (!prog && m_cpProgram)
+	{
+		clReleaseProgram(m_cpProgram);
+	}
+	printf("ready. \n");
+	if (pErrNum)
+			*pErrNum = CL_SUCCESS;
+	return kernel;
+}
diff --git a/opencl/basic_initialize/btOpenCLUtils.h b/opencl/basic_initialize/btOpenCLUtils.h
new file mode 100644
index 000000000..a1c7fbd7c
--- /dev/null
+++ b/opencl/basic_initialize/btOpenCLUtils.h
@@ -0,0 +1,179 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006 - 2011 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+//original author: Roman Ponomarev
+//cleanup by Erwin Coumans
+
+#ifndef BT_OPENCL_UTILS_H
+#define BT_OPENCL_UTILS_H
+
+#include "btOpenCLInclude.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+///C API for OpenCL utilities: convenience functions, see below for C++ API
+
+/// CL Context optionally takes a GL context. This is a generic type because we don't really want this code
+/// to have to understand GL types. It is a HGLRC in _WIN32 or a GLXContext otherwise.
+cl_context 	btOpenCLUtils_createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx , void* pGLDC , int preferredDeviceIndex , int preferredPlatformIndex, cl_platform_id* platformId);
+///get the number of devices in the context; btOpenCLUtils_getDevice is indexed with [0..getNumDevices)
+int btOpenCLUtils_getNumDevices(cl_context cxMainContext);
+///get the nr'th device of the context
+cl_device_id btOpenCLUtils_getDevice(cl_context cxMainContext, int nr);
+///print the properties of the device (name, vendor, memory sizes, image limits, extensions, ...) to stdout
+void btOpenCLUtils_printDeviceInfo(cl_device_id device);
+///create kernel 'kernelName' from 'prog', or from 'kernelSource' compiled with 'additionalMacros' when prog is 0; returns 0 on failure
+cl_kernel btOpenCLUtils_compileCLKernelFromString( cl_context clContext,cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum, cl_program prog,const char* additionalMacros);
+
+//optional: compile a whole program; on _WIN32 'srcFileNameForCaching' additionally enables a cache of compiled binaries
+cl_program btOpenCLUtils_compileCLProgramFromString( cl_context clContext,cl_device_id device, const char* kernelSource, cl_int* pErrNum,const char* additionalMacros  , const char* srcFileNameForCaching);
+
+//the following optional APIs provide access using specific platform information
+int btOpenCLUtils_getNumPlatforms(cl_int* pErrNum);
+
+///get the nr'th platform, where nr is in the range [0..getNumPlatforms)
+cl_platform_id btOpenCLUtils_getPlatform(int nr, cl_int* pErrNum);
+///NOTE(review): implementation not visible here; presumably prints the platform's vendor, name and version - confirm
+void btOpenCLUtils_printPlatformInfo(cl_platform_id platform);
+///name of the OpenCL SDK this code was compiled with (main.cpp prints it as such)
+const char* btOpenCLUtils_getSdkVendorName();
+///NOTE(review): presumably like createContextFromType but restricted to the given platform - confirm against the implementation
+cl_context 	btOpenCLUtils_createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx , void* pGLDC ,int preferredDeviceIndex , int preferredPlatformIndex);
+
+#ifdef __cplusplus
+}
+
+#define BT_MAX_STRING_LENGTH 1024
+
+typedef struct	//device properties, filled in by btOpenCLUtils::getDeviceInfo and printed by btOpenCLUtils_printDeviceInfo
+{
+	char m_deviceName[BT_MAX_STRING_LENGTH];	//CL_DEVICE_NAME
+	char m_deviceVendor[BT_MAX_STRING_LENGTH];	//CL_DEVICE_VENDOR
+	char m_driverVersion[BT_MAX_STRING_LENGTH];	//CL_DRIVER_VERSION
+	char m_deviceExtensions[BT_MAX_STRING_LENGTH];	//CL_DEVICE_EXTENSIONS
+	//the comments below name the CL_DEVICE_* label each field is printed under
+	cl_device_type		m_deviceType;	//CL_DEVICE_TYPE, tested as a bit mask (CPU/GPU/ACCELERATOR/DEFAULT)
+	cl_uint 				m_computeUnits;	//CL_DEVICE_MAX_COMPUTE_UNITS
+	size_t 					m_workitemDims;	//CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS; NOTE(review): OpenCL reports this as cl_uint - confirm the query size
+	size_t 					m_workItemSize[3];	//CL_DEVICE_MAX_WORK_ITEM_SIZES
+	size_t 					m_image2dMaxWidth;	//CL_DEVICE_IMAGE2D_MAX_WIDTH
+	size_t 					m_image2dMaxHeight;	//CL_DEVICE_IMAGE2D_MAX_HEIGHT
+	size_t 					m_image3dMaxWidth;	//CL_DEVICE_IMAGE3D_MAX_WIDTH
+	size_t 					m_image3dMaxHeight;	//CL_DEVICE_IMAGE3D_MAX_HEIGHT
+	size_t 					m_image3dMaxDepth;	//CL_DEVICE_IMAGE3D_MAX_DEPTH
+	size_t 					m_workgroupSize;	//CL_DEVICE_MAX_WORK_GROUP_SIZE
+	cl_uint 				m_clockFrequency;	//CL_DEVICE_MAX_CLOCK_FREQUENCY, printed as MHz
+	cl_ulong				m_constantBufferSize;	//CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, printed divided by 1024 as KByte
+	cl_ulong				m_localMemSize;	//CL_DEVICE_LOCAL_MEM_SIZE, printed divided by 1024 as KByte
+	cl_ulong				m_globalMemSize;	//CL_DEVICE_GLOBAL_MEM_SIZE, printed divided by 1024*1024 as MByte
+    cl_bool					m_errorCorrectionSupport;	//CL_DEVICE_ERROR_CORRECTION_SUPPORT
+	cl_device_local_mem_type m_localMemType;	//CL_DEVICE_LOCAL_MEM_TYPE, value 1 is printed as "local", anything else as "global"
+	cl_uint					m_maxReadImageArgs;	//CL_DEVICE_MAX_READ_IMAGE_ARGS
+	cl_uint					m_maxWriteImageArgs;	//CL_DEVICE_MAX_WRITE_IMAGE_ARGS
+
+
+
+	cl_uint 				m_addressBits;	//CL_DEVICE_ADDRESS_BITS
+	cl_ulong				m_maxMemAllocSize;	//CL_DEVICE_MAX_MEM_ALLOC_SIZE, printed divided by 1024*1024 as MByte
+	cl_command_queue_properties m_queueProperties;	//CL_DEVICE_QUEUE_PROPERTIES, tested as a bit mask
+	cl_bool					m_imageSupport;	//CL_DEVICE_IMAGE_SUPPORT
+	cl_uint					m_vecWidthChar;	//CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR
+	cl_uint					m_vecWidthShort;	//CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT
+	cl_uint					m_vecWidthInt;	//CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT
+	cl_uint					m_vecWidthLong;	//CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG
+	cl_uint					m_vecWidthFloat;	//CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT
+	cl_uint					m_vecWidthDouble;	//CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE
+
+} btOpenCLDeviceInfo;
+
+typedef struct	//platform strings, filled in by btOpenCLUtils::getPlatformInfo
+{
+	char m_platformVendor[BT_MAX_STRING_LENGTH];	//printed by main.cpp as CL_PLATFORM_VENDOR
+	char m_platformName[BT_MAX_STRING_LENGTH];	//printed by main.cpp as CL_PLATFORM_NAME
+	char m_platformVersion[BT_MAX_STRING_LENGTH];	//printed by main.cpp as CL_PLATFORM_VERSION
+} btOpenCLPlatformInfo;
+
+
+///C++ API for OpenCL utilities: static convenience functions; the inline ones forward to the btOpenCLUtils_* C functions and add default arguments
+struct btOpenCLUtils
+{
+	/// CL Context optionally takes a GL context. This is a generic type because we don't really want this code
+	/// to have to understand GL types. It is a HGLRC in _WIN32 or a GLXContext otherwise.
+	static inline cl_context 	createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx = 0, void* pGLDC = 0, int preferredDeviceIndex = -1, int preferredPlatformIndex= - 1, cl_platform_id* platformId=0)
+	{
+		return btOpenCLUtils_createContextFromType(deviceType, pErrNum, pGLCtx , pGLDC , preferredDeviceIndex, preferredPlatformIndex, platformId);
+	}
+	///get the number of devices in the context; getDevice is indexed with [0..getNumDevices)
+	static inline int getNumDevices(cl_context cxMainContext)
+	{
+		return btOpenCLUtils_getNumDevices(cxMainContext);
+	}
+	static inline cl_device_id getDevice(cl_context cxMainContext, int nr)
+	{
+		return btOpenCLUtils_getDevice(cxMainContext,nr);
+	}
+	///fill 'info' with the properties of 'device'; only declared here, the definition is out of line
+	static void getDeviceInfo(cl_device_id device, btOpenCLDeviceInfo* info);
+	///print the properties of 'device' to stdout
+	static inline void printDeviceInfo(cl_device_id device)
+	{
+		btOpenCLUtils_printDeviceInfo(device);
+	}
+	///create kernel 'kernelName'; when 'prog' is 0 a temporary program is compiled from 'kernelSource'
+	static inline cl_kernel compileCLKernelFromString( cl_context clContext,cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum=0, cl_program prog=0,const char* additionalMacros = "" )
+	{
+		return btOpenCLUtils_compileCLKernelFromString(clContext,device, kernelSource,  kernelName, pErrNum, prog,additionalMacros);
+	}
+	///compile a whole program; on _WIN32 'srcFileNameForCaching' additionally enables a cache of compiled binaries
+	//optional
+	static inline cl_program compileCLProgramFromString( cl_context clContext,cl_device_id device, const char* kernelSource, cl_int* pErrNum=0,const char* additionalMacros = "" , const char* srcFileNameForCaching=0)
+	{
+		return btOpenCLUtils_compileCLProgramFromString(clContext,device, kernelSource, pErrNum,additionalMacros, srcFileNameForCaching);
+	}
+
+	//the following optional APIs provide access using specific platform information
+	static inline int getNumPlatforms(cl_int* pErrNum=0)
+	{
+		return btOpenCLUtils_getNumPlatforms(pErrNum);
+	}
+	///get the nr'th platform, where nr is in the range [0..getNumPlatforms)
+	static inline cl_platform_id getPlatform(int nr, cl_int* pErrNum=0)
+	{
+		return btOpenCLUtils_getPlatform(nr,pErrNum);
+	}
+	///fill 'platformInfo' (vendor, name and version strings); only declared here, the definition is out of line
+	static void getPlatformInfo(cl_platform_id platform, btOpenCLPlatformInfo* platformInfo);
+
+	static inline void printPlatformInfo(cl_platform_id platform)
+	{
+		btOpenCLUtils_printPlatformInfo(platform);
+	}
+	///name of the OpenCL SDK this code was compiled with (main.cpp prints it as such)
+	static inline const char* getSdkVendorName()
+	{
+		return btOpenCLUtils_getSdkVendorName();
+	}
+	static inline cl_context 	createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx = 0, void* pGLDC = 0,int preferredDeviceIndex = -1, int preferredPlatformIndex= -1)
+	{
+		return btOpenCLUtils_createContextFromPlatform(platform, deviceType, pErrNum, pGLCtx,pGLDC,preferredDeviceIndex, preferredPlatformIndex);
+	}
+};
+
+#endif //__cplusplus
+
+#endif // BT_OPENCL_UTILS_H
diff --git a/opencl/basic_initialize/main.cpp b/opencl/basic_initialize/main.cpp
new file mode 100644
index 000000000..263ba1b30
--- /dev/null
+++ b/opencl/basic_initialize/main.cpp
@@ -0,0 +1,98 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+///original author: Erwin Coumans
+
+#include "btOpenCLUtils.h"
+#include <stdio.h>
+
+cl_context			g_cxMainContext;
+cl_command_queue	g_cqCommandQue;
+
+
+
+// Lists every OpenCL platform together with its devices, then creates a GPU context through
+// createContextFromType and a command queue for each of its devices. Always returns 0.
+int main(int argc, char* argv[])
+{
+	int ciErrNum = 0;
+	cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
+	const char* vendorSDK = btOpenCLUtils::getSdkVendorName();
+
+	printf("This program was compiled using the %s OpenCL SDK\n",vendorSDK);
+	int numPlatforms = btOpenCLUtils::getNumPlatforms();
+	printf("Num Platforms = %d\n", numPlatforms);
+
+	for (int i=0;i<numPlatforms;i++)
+	{
+		cl_platform_id platform = btOpenCLUtils::getPlatform(i);
+		btOpenCLPlatformInfo platformInfo;
+		btOpenCLUtils::getPlatformInfo(platform,&platformInfo);
+		printf("--------------------------------\n");
+		printf("Platform info for platform nr %d:\n",i);
+		printf("  CL_PLATFORM_VENDOR: \t\t\t%s\n",platformInfo.m_platformVendor);
+		printf("  CL_PLATFORM_NAME: \t\t\t%s\n",platformInfo.m_platformName);
+		printf("  CL_PLATFORM_VERSION: \t\t\t%s\n",platformInfo.m_platformVersion);
+
+		cl_context context = btOpenCLUtils::createContextFromPlatform(platform,deviceType,&ciErrNum);
+		if (!context)
+		{
+			//without a context the devices cannot be enumerated or released: report and try the next platform
+			printf("Cannot create a context for platform nr %d, error %d\n", i, ciErrNum);
+			continue;
+		}
+
+		int numDevices = btOpenCLUtils::getNumDevices(context);
+		printf("Num Devices = %d\n", numDevices);
+		for (int j=0;j<numDevices;j++)
+		{
+			cl_device_id dev = btOpenCLUtils::getDevice(context,j);
+			//printDeviceInfo calls getDeviceInfo itself, a separate query is not needed
+			btOpenCLUtils::printDeviceInfo(dev);
+		}
+
+		clReleaseContext(context);
+	}
+
+	///Easier method to initialize OpenCL using createContextFromType for a GPU
+	deviceType = CL_DEVICE_TYPE_GPU;
+
+	void* glCtx=0;
+	void* glDC = 0;
+	printf("Initialize OpenCL using btOpenCLUtils::createContextFromType for CL_DEVICE_TYPE_GPU\n");
+	g_cxMainContext = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, glCtx, glDC);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+	if (g_cxMainContext)
+	{
+		int numDev = btOpenCLUtils::getNumDevices(g_cxMainContext);
+		for (int i=0;i<numDev;i++)
+		{
+			cl_device_id device = btOpenCLUtils::getDevice(g_cxMainContext,i);
+			btOpenCLUtils::printDeviceInfo(device);
+			// create a command-queue
+			g_cqCommandQue = clCreateCommandQueue(g_cxMainContext, device, 0, &ciErrNum);
+			oclCHECKERROR(ciErrNum, CL_SUCCESS);
+			//normally you would create and execute kernels using this command queue
+			if (g_cqCommandQue)	//only release a queue that was actually created
+				clReleaseCommandQueue(g_cqCommandQue);
+		}
+		clReleaseContext(g_cxMainContext);
+	}
+	else {
+		printf("No OpenCL capable GPU found!");
+	}
+	return 0;
+}
\ No newline at end of file
diff --git a/opencl/basic_initialize/premake4.lua b/opencl/basic_initialize/premake4.lua
new file mode 100644
index 000000000..a9a07f1c8
--- /dev/null
+++ b/opencl/basic_initialize/premake4.lua
@@ -0,0 +1,28 @@
+--- Defines the console project "OpenCL_intialize_<vendor>" for one OpenCL SDK.
+-- Nothing is defined when findOpenCL(vendor) returns a false value.
+-- @param vendor SDK name handed to findOpenCL and initOpenCL, e.g. "AMD"
+function createProject(vendor)
+	-- 'local' keeps the flag out of the global table shared by all premake scripts
+	local hasCL = findOpenCL(vendor)
+
+	if (hasCL) then
+		project ("OpenCL_intialize_" .. vendor)
+
+		initOpenCL(vendor)
+
+		language "C++"
+		kind "ConsoleApp"
+		targetdir "../../bin"
+
+		files {
+			"main.cpp",
+			"btOpenCLUtils.cpp",
+			"btOpenCLUtils.h"
+		}
+	end
+end
+	
+createProject("Apple")	-- one project per OpenCL SDK; createProject defines nothing when findOpenCL(vendor) is false
+createProject("AMD")
+createProject("Intel")
+createProject("NVIDIA")
diff --git a/opencl/lds_bank_conflict/lds_kernels.cl b/opencl/lds_bank_conflict/lds_kernels.cl
new file mode 100644
index 000000000..6e3ad78f3
--- /dev/null
+++ b/opencl/lds_bank_conflict/lds_kernels.cl
@@ -0,0 +1,171 @@
+
+#define TILE_DIM  32
+#define BLOCK_ROWS  8
+
+
+/*// simple copy kernel (CUDA)
+// Used as reference case representing best effective bandwidth.
+__global__ void copy(float *odata, const float *idata)
+{
+  int x = blockIdx.x * TILE_DIM + threadIdx.x;
+  int y = blockIdx.y * TILE_DIM + threadIdx.y;
+  int width = gridDim.x * TILE_DIM;
+
+  for (int j = 0; j < TILE_DIM; j+= BLOCK_ROWS)
+	odata[(y+j)*width + x] = idata[(y+j)*width + x];
+}
+*/
+// simple copy kernel (OpenCL): every work-item copies TILE_DIM/BLOCK_ROWS elements of one column of its tile.
+// Like the CUDA reference above it assumes work-groups of TILE_DIM x BLOCK_ROWS work-items.
+__kernel void copyKernel(__global float* odata, __global const float* idata)
+{
+  // the tile origin is group_id * TILE_DIM (CUDA: blockIdx * TILE_DIM); get_num_groups only matches when the grid is TILE_DIM groups wide
+  int x = get_group_id(0) * TILE_DIM + get_local_id(0);
+  int y = get_group_id(1) * TILE_DIM + get_local_id(1);
+  int width = get_num_groups(0) * get_local_size(0);
+  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
+	odata[(y+j)*width + x] = idata[(y+j)*width + x];
+}
+
+/*
+// copy kernel using shared memory (CUDA)
+// Also used as reference case, demonstrating effect of using shared memory.
+__global__ void copySharedMem(float *odata, const float *idata)
+{
+  __shared__ float tile[TILE_DIM * TILE_DIM];
+  
+  int x = blockIdx.x * TILE_DIM + threadIdx.x;
+  int y = blockIdx.y * TILE_DIM + threadIdx.y;
+  int width = gridDim.x * TILE_DIM;
+
+  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
+	 tile[(threadIdx.y+j)*TILE_DIM + threadIdx.x] = idata[(y+j)*width + x];
+
+  __syncthreads();
+
+  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
+	 odata[(y+j)*width + x] = tile[(threadIdx.y+j)*TILE_DIM + threadIdx.x];          
+}
+*/
+
+// copy kernel using shared memory (OpenCL)
+// Also used as reference case, demonstrating effect of using shared memory.
+__kernel void copySharedMemKernel(__global float *odata, __global const float *idata)
+{
+  __local float tile[TILE_DIM * TILE_DIM];
+  // the tile origin is group_id * TILE_DIM (CUDA: blockIdx * TILE_DIM), independent of the number of work-groups
+  int x = get_group_id(0) * TILE_DIM + get_local_id(0);
+  int y = get_group_id(1) * TILE_DIM + get_local_id(1);
+  int width = get_num_groups(0) * get_local_size(0);
+
+  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
+	 tile[(get_local_id(1)+j)*TILE_DIM + get_local_id(0)] = idata[(y+j)*width + x];
+  // the whole tile must be staged in local memory before any work-item reads it back
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
+	 odata[(y+j)*width + x] = tile[(get_local_id(1)+j)*TILE_DIM + get_local_id(0)];
+}
+
+/*
+// naive transpose (CUDA)
+// Simplest transpose; doesn't use shared memory.
+// Global memory reads are coalesced but writes are not.
+__global__ void transposeNaive(float *odata, const float *idata)
+{
+  int x = blockIdx.x * TILE_DIM + threadIdx.x;
+  int y = blockIdx.y * TILE_DIM + threadIdx.y;
+  int width = gridDim.x * TILE_DIM;
+
+  for (int j = 0; j < TILE_DIM; j+= BLOCK_ROWS)
+	odata[x*width + (y+j)] = idata[(y+j)*width + x];
+}
+*/
+
+// naive transpose (OpenCL)
+// Simplest transpose; doesn't use shared memory.
+// Global memory reads are coalesced but writes are not.
+__kernel void transposeNaiveKernel(__global float *odata, __global const float *idata)
+{
+  int x = get_group_id(0) * TILE_DIM + get_local_id(0);
+  int y = get_group_id(1) * TILE_DIM + get_local_id(1);
+  int width = get_num_groups(0) * get_local_size(0);
+  // x and y use the tile origin group_id * TILE_DIM (CUDA: blockIdx * TILE_DIM), independent of the number of work-groups
+  for (int j = 0; j < TILE_DIM; j+= BLOCK_ROWS)
+	odata[x*width + (y+j)] = idata[(y+j)*width + x];
+}
+
+/*
+// coalesced transpose (CUDA)
+// Uses shared memory to achieve coalesing in both reads and writes
+// Tile width == #banks causes shared memory bank conflicts.
+__global__ void transposeCoalesced(float *odata, const float *idata)
+{
+  __shared__ float tile[TILE_DIM][TILE_DIM];
+	
+  int x = blockIdx.x * TILE_DIM + threadIdx.x;
+  int y = blockIdx.y * TILE_DIM + threadIdx.y;
+  int width = gridDim.x * TILE_DIM;
+
+  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
+	 tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];
+
+  __syncthreads();
+
+  x = blockIdx.y * TILE_DIM + threadIdx.x;  // transpose block offset
+  y = blockIdx.x * TILE_DIM + threadIdx.y;
+
+  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
+	 odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];
+}
+*/
+
+// coalesced transpose (OpenCL)
+// Uses shared memory to achieve coalescing in both reads and writes
+// Tile width == #banks causes shared memory bank conflicts.
+__kernel void transposeCoalescedKernel(__global float *odata, __global const float *idata)
+{
+  __local float tile[TILE_DIM][TILE_DIM];
+  // the tile origin is group_id * TILE_DIM, the same convention as the transposed offsets further down
+  int x = get_group_id(0) * TILE_DIM + get_local_id(0);
+  int y = get_group_id(1) * TILE_DIM + get_local_id(1);
+  int width = get_num_groups(0) * get_local_size(0);
+    
+  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
+	 tile[get_local_id(1)+j][get_local_id(0)] = idata[(y+j)*width + x];
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+  // transpose block offset: the group indices swap roles for the write
+  x = get_group_id(1) * TILE_DIM + get_local_id(0);
+  y = get_group_id(0) * TILE_DIM + get_local_id(1);
+  
+  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
+	 odata[(y+j)*width + x] = tile[get_local_id(0)][get_local_id(1) + j];
+}
+
+
+// No bank-conflict transpose (OpenCL)
+// Same as transposeCoalesced except each tile row is padded by one element
+// to avoid shared memory bank conflicts.
+__kernel void transposeNoBankConflictsKernel(__global float *odata, __global const float *idata)
+{
+  __local float tile[TILE_DIM][TILE_DIM+1];
+  // the tile origin is group_id * TILE_DIM, the same convention as the transposed offsets further down
+  int x = get_group_id(0) * TILE_DIM + get_local_id(0);
+  int y = get_group_id(1) * TILE_DIM + get_local_id(1);
+  int width = get_num_groups(0) * get_local_size(0);
+    
+  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
+	 tile[get_local_id(1)+j][get_local_id(0)] = idata[(y+j)*width + x];
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+  // transpose block offset: the group indices swap roles for the write
+  x = get_group_id(1) * TILE_DIM + get_local_id(0);
+  y = get_group_id(0) * TILE_DIM + get_local_id(1);
+  
+  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
+	 odata[(y+j)*width + x] = tile[get_local_id(0)][get_local_id(1) + j];
+}
+
+
+
diff --git a/opencl/lds_bank_conflict/main.cpp b/opencl/lds_bank_conflict/main.cpp
new file mode 100644
index 000000000..07742d268
--- /dev/null
+++ b/opencl/lds_bank_conflict/main.cpp
@@ -0,0 +1,361 @@
+//Adapted from CUDA to OpenCL by Erwin Coumans
+//See http://bitbucket.org/erwincoumans/opencl_course
+
+// Copyright 2012 NVIDIA Corporation
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// 
+//     http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "btOpenCLUtils.h"
+#include "../parallel_primitives/host/btOpenCLArray.h"
+#include "../parallel_primitives/host/btLauncherCL.h"
+#include "../parallel_primitives/host/btQuickprof.h"
+#include "../parallel_primitives/host/btFillCL.h"
+#include "../parallel_primitives/host/CommandLineArgs.h"
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+
+//make sure to update the same #define in the opencl/lds_bank_conflict/lds_kernels.cl
+const int TILE_DIM = 32;
+const int BLOCK_ROWS = 8;
+const int NUM_REPS = 100;
+
+// Compare res against ref (exact match) and, on success, print the effective bandwidth in GB/s.
+// n is the element count; ms is the elapsed time in milliseconds for NUM_REPS kernel launches.
+void postprocess(const float *ref, const float *res, int n, float ms)
+{
+  for (int i = 0; i < n; i++)
+	if (res[i] != ref[i]) {
+	  printf("\nError: at res[%d] got %f but expected %f\n", i, res[i], ref[i]);
+	  printf("%25s\n", "*** FAILED ***");
+	  return;
+	}
+  // Each repetition reads and writes n floats; do the arithmetic in double so that 2*n
+  // cannot overflow int for large inputs.
+  printf("%20.2f\n", 2.0 * n * sizeof(float) * 1e-6 * NUM_REPS / ms );
+}
+
+// Reads the whole file cFilename and returns a newly malloc'ed, NUL-terminated string made of
+// cPreamble followed by the file contents. Returns NULL if the file cannot be opened, sized or
+// fully read, or if the allocation fails. When szFinalLength is non-NULL it receives the string
+// length (terminator excluded). The caller owns the returned buffer and must free() it.
+char* loadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
+{
+	FILE* pFileStream = fopen(cFilename, "rb");
+	if (pFileStream == NULL)
+		return NULL;
+	const size_t szPreambleLength = strlen(cPreamble);
+	// get the length of the source code; ftell reports failure as -1, which must not become a size_t
+	long fileLength = -1;
+	if (fseek(pFileStream, 0, SEEK_END) == 0)
+		fileLength = ftell(pFileStream);
+	if (fileLength < 0 || fseek(pFileStream, 0, SEEK_SET) != 0)
+	{
+		fclose(pFileStream);
+		return NULL;
+	}
+	const size_t szSourceLength = (size_t)fileLength;
+
+	// allocate a buffer for preamble + source + terminator and read the file in
+	char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1);
+	bool ok = (cSourceString != NULL);
+	if (ok)
+	{
+		memcpy(cSourceString, cPreamble, szPreambleLength);
+		ok = (fread(cSourceString + szPreambleLength, 1, szSourceLength, pFileStream) == szSourceLength);
+	}
+	fclose(pFileStream);
+	if (!ok) { free(cSourceString); return NULL; }
+	cSourceString[szSourceLength + szPreambleLength] = '\0';
+	if (szFinalLength != NULL) *szFinalLength = szSourceLength + szPreambleLength;
+	return cSourceString;
+}
+
+int main(int argc, char **argv)
+{
+	printf("Use --deviceId=<id> or --platformId=<id> to override OpenCL device\n");
+	CommandLineArgs args(argc,argv);
+	// Times the copy and transpose kernels on an nx x ny float matrix and prints each one's bandwidth.
+	const int nx = 1024;
+	const int ny = 1024;
+ 
+	const int mem_size = nx*ny*sizeof(float);
+	const int num_elements = nx*ny;
+	btClock clock;
+	double startEvent=0.f;
+	double stopEvent=0.f;
+
+	int localSizeX = TILE_DIM;
+	int localSizeY = BLOCK_ROWS;
+
+	int numThreadsX = (nx/TILE_DIM)*TILE_DIM;
+	int numThreadsY = (ny/TILE_DIM)*BLOCK_ROWS;
+
+	int gridX = numThreadsX / localSizeX;
+	int gridY = numThreadsY / localSizeY;
+
+	int ciErrNum = 0;
+	int preferred_device = -1;
+	int preferred_platform = -1;
+	args.GetCmdLineArgument("deviceId",preferred_device);
+	args.GetCmdLineArgument("platformId",preferred_platform);
+
+
+	cl_platform_id		platformId=0;
+	cl_context			ctx=0;
+	cl_command_queue	queue=0;
+	cl_device_id		device=0;
+	cl_kernel			copyKernel=0;
+	cl_kernel			copySharedMemKernel=0;
+	cl_kernel			transposeNaiveKernel = 0;
+	cl_kernel			transposeCoalescedKernel = 0;
+	cl_kernel			transposeNoBankConflictsKernel= 0;
+	
+
+	ctx = btOpenCLUtils::createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum,0,0,preferred_device,preferred_platform,&platformId);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+	btOpenCLUtils::printPlatformInfo(platformId);
+	device = btOpenCLUtils::getDevice(ctx,0);
+	btOpenCLUtils::printDeviceInfo(device);
+	queue = clCreateCommandQueue(ctx, device, 0, &ciErrNum);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+	const char* cSourceFile = "opencl/lds_bank_conflict/lds_kernels.cl";
+	
+	size_t szKernelLength;
+
+	const char* cSourceCL =0;
+	char relativeFileName[1024];
+
+	{
+		const char* prefix[]={"./","../","../../","../../../","../../../../"};
+		int numPrefixes = sizeof(prefix)/sizeof(char*);
+
+		for (int i=0;!cSourceCL && i<numPrefixes;i++)
+		{
+			
+			sprintf(relativeFileName,"%s%s",prefix[i],cSourceFile);
+			cSourceCL = loadProgSource(relativeFileName, "", &szKernelLength);
+			if (cSourceCL)
+			{
+				printf("Loaded program source: %s\n", relativeFileName); 
+			}
+		}
+	}
+	if (!cSourceCL)
+	{
+		printf("Couldn't find file %s, exiting\n",cSourceFile);
+		exit(0);
+	}
+
+char flags[1024]={0};
+#ifdef CL_PLATFORM_INTEL
+///use this flag to allow for OpenCL kernel debugging on CPU using the Intel OpenCL run-time
+	//sprintf(flags,"-g -s \"%s\"","C:/develop/opencl_course/opencl/lds_bank_conflict/lds_kernels.cl");
+#endif//CL_PLATFORM_INTEL
+
+	
+	copyKernel  = btOpenCLUtils::compileCLKernelFromString(ctx,device,cSourceCL,"copyKernel",&ciErrNum,0,flags);
+	copySharedMemKernel  = btOpenCLUtils::compileCLKernelFromString(ctx,device,cSourceCL,"copySharedMemKernel",&ciErrNum,0,flags);
+	transposeNaiveKernel = btOpenCLUtils::compileCLKernelFromString(ctx,device,cSourceCL,"transposeNaiveKernel",&ciErrNum,0,flags);
+	transposeCoalescedKernel = btOpenCLUtils::compileCLKernelFromString(ctx,device,cSourceCL,"transposeCoalescedKernel",&ciErrNum,0,flags);
+	transposeNoBankConflictsKernel = btOpenCLUtils::compileCLKernelFromString(ctx,device,cSourceCL,"transposeNoBankConflictsKernel",&ciErrNum,0,flags);
+	
+	btFillCL clMemSet(ctx,device,queue);
+
+	printf("\n============================================\n");
+
+	printf("Matrix size: %d %d, Block size: %d %d, Tile size: %d %d\n", 
+		 nx, ny, TILE_DIM, BLOCK_ROWS, TILE_DIM, TILE_DIM);
+
+	float *h_idata = (float*)malloc(mem_size);
+	float *h_cdata = (float*)malloc(mem_size);
+	float *h_tdata = (float*)malloc(mem_size);
+	float *gold    = (float*)malloc(mem_size);
+  
+	btOpenCLArray<float> d_idataCL(ctx,queue);d_idataCL.resize(num_elements);
+	btOpenCLArray<float> d_cdataCL(ctx,queue);d_cdataCL.resize(num_elements);
+	btOpenCLArray<float> d_tdataCL(ctx,queue);d_tdataCL.resize(num_elements);
+  
+
+	// check parameters and calculate execution configuration
+	if (nx % TILE_DIM || ny % TILE_DIM) 
+	{
+		printf("nx and ny must be a multiple of TILE_DIM\n");
+		goto error_exit;
+	}
+
+	if (TILE_DIM % BLOCK_ROWS) 
+	{
+		printf("TILE_DIM must be a multiple of BLOCK_ROWS\n");
+		goto error_exit;
+	}
+	
+  // host
+  for (int j = 0; j < ny; j++)
+	for (int i = 0; i < nx; i++)
+	  h_idata[j*nx + i] = j*nx + i;
+
+  // correct result for error checking
+  for (int j = 0; j < ny; j++)
+	for (int i = 0; i < nx; i++)
+	{
+	  gold[j*nx + i] = h_idata[i*nx + j];
+	}
+  
+  d_idataCL.copyFromHostPointer(h_idata,num_elements);
+
+  // events for timing
+  clock.reset();
+
+  float ms;
+
+  // ------------
+  // time kernels
+  // ------------
+  printf("%25s%25s\n", "Routine", "Bandwidth (GB/s)");
+  
+  // ----
+  // copy 
+  // ----
+  printf("%25s", "copy");
+
+  clMemSet.execute(d_cdataCL,0.f,num_elements);
+  
+  {
+	    // warm up
+		btLauncherCL launcher( queue, copyKernel);
+		launcher.setBuffer( d_cdataCL.getBufferCL());
+		launcher.setBuffer( d_idataCL.getBufferCL());
+		launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
+
+		startEvent = clock.getTimeMicroseconds()/1e3;
+		for (int i = 0; i < NUM_REPS; i++)
+			launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
+		oclCHECKERROR(ciErrNum, CL_SUCCESS);
+		clFinish(queue);
+		stopEvent = clock.getTimeMicroseconds()/1e3;
+	}
+
+	ms = float(stopEvent-startEvent);
+
+	d_cdataCL.copyToHostPointer(h_cdata,num_elements,0);
+	postprocess(h_idata, h_cdata, nx*ny, ms);
+
+  // -------------
+  // copySharedMem 
+  // -------------
+	printf("%25s", "shared memory copy");
+	clMemSet.execute(d_cdataCL,0.f,num_elements);
+
+	{
+		btLauncherCL launcher( queue, copySharedMemKernel);
+		launcher.setBuffer( d_cdataCL.getBufferCL());
+		launcher.setBuffer( d_idataCL.getBufferCL());
+		launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
+
+		startEvent = clock.getTimeMicroseconds()/1e3;
+		for (int i = 0; i < NUM_REPS; i++)
+			launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
+		oclCHECKERROR(ciErrNum, CL_SUCCESS);
+		clFinish(queue);
+		stopEvent = clock.getTimeMicroseconds()/1e3;
+	}
+
+	ms = float(stopEvent-startEvent);
+	d_cdataCL.copyToHostPointer(h_cdata,num_elements,0);
+	postprocess(h_idata, h_cdata, nx * ny, ms);
+
+  // --------------
+  // transposeNaive 
+  // --------------
+	printf("%25s", "naive transpose");
+	clMemSet.execute(d_tdataCL,0.f,num_elements);
+	{
+		// warmup
+		btLauncherCL launcher( queue, transposeNaiveKernel);
+		launcher.setBuffer( d_tdataCL.getBufferCL());
+		launcher.setBuffer( d_idataCL.getBufferCL());
+		launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
+
+		startEvent = clock.getTimeMicroseconds()/1e3;
+		for (int i = 0; i < NUM_REPS; i++)
+			launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
+		oclCHECKERROR(ciErrNum, CL_SUCCESS);
+		clFinish(queue);
+		stopEvent = clock.getTimeMicroseconds()/1e3;
+	}
+	ms = float(stopEvent-startEvent);
+	d_tdataCL.copyToHostPointer(h_tdata,num_elements,0);
+	postprocess(gold, h_tdata, nx * ny, ms);
+
+  // ------------------
+  // transposeCoalesced 
+  // ------------------
+	printf("%25s", "coalesced transpose");
+    clMemSet.execute(d_tdataCL,0.f,num_elements);
+	{
+		btLauncherCL launcher( queue, transposeCoalescedKernel);
+		launcher.setBuffer( d_tdataCL.getBufferCL());
+		launcher.setBuffer( d_idataCL.getBufferCL());
+		launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
+
+		startEvent = clock.getTimeMicroseconds()/1e3;
+		for (int i = 0; i < NUM_REPS; i++)
+			launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
+		oclCHECKERROR(ciErrNum, CL_SUCCESS);
+		clFinish(queue);
+		stopEvent = clock.getTimeMicroseconds()/1e3;
+	}
+
+	ms = float(stopEvent-startEvent);
+	d_tdataCL.copyToHostPointer(h_tdata,num_elements,0);
+	postprocess(gold, h_tdata, nx * ny, ms);
+
+  // ------------------------
+  // transposeNoBankConflicts
+  // ------------------------
+	printf("%25s", "conflict-free transpose");
+	clMemSet.execute(d_tdataCL,0.f,num_elements);
+	{
+		btLauncherCL launcher( queue, transposeNoBankConflictsKernel);
+		launcher.setBuffer( d_tdataCL.getBufferCL());
+		launcher.setBuffer( d_idataCL.getBufferCL());
+		launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
+
+		startEvent = clock.getTimeMicroseconds()/1e3;
+		for (int i = 0; i < NUM_REPS; i++)
+			launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
+		oclCHECKERROR(ciErrNum, CL_SUCCESS);
+		clFinish(queue);
+		stopEvent = clock.getTimeMicroseconds()/1e3;
+	}
+
+	ms = float(stopEvent-startEvent);
+	d_tdataCL.copyToHostPointer(h_tdata,num_elements,0);
+	postprocess(gold, h_tdata, nx * ny, ms);
+
+error_exit:
+  // cleanup: release all five kernels (not just copyKernel), then the queue, context and host buffers
+	clReleaseKernel(copyKernel);
+	clReleaseKernel(copySharedMemKernel);
+	clReleaseKernel(transposeNaiveKernel);
+	clReleaseKernel(transposeCoalescedKernel);
+	clReleaseKernel(transposeNoBankConflictsKernel);
+	clReleaseCommandQueue(queue);
+	clReleaseContext(ctx);
+	free((void*)cSourceCL); free(h_idata); free(h_tdata); free(h_cdata); free(gold);
+	printf("Press <enter>\n");
+	getchar();
+}
diff --git a/opencl/lds_bank_conflict/premake4.lua b/opencl/lds_bank_conflict/premake4.lua
new file mode 100644
index 000000000..7a26da2cc
--- /dev/null
+++ b/opencl/lds_bank_conflict/premake4.lua
@@ -0,0 +1,37 @@
+
+-- Declares the lds_bank_conflict console application for one OpenCL vendor.
+function createProject (vendor)
+
+	-- Nothing to declare when findOpenCL reports no OpenCL for this vendor.
+	if (not findOpenCL(vendor)) then
+		return
+	end
+
+	project ( "OpenCL_lds_bank_conflict_" .. vendor)
+
+	initOpenCL(vendor)
+
+	language "C++"
+
+	kind "ConsoleApp"
+	targetdir "../../bin"
+
+	links {
+		"OpenCL_lib_parallel_primitives_host_" .. vendor
+	}
+
+	includedirs {
+		"../basic_initialize"
+	}
+
+	files {
+		"main.cpp",
+		"../basic_initialize/btOpenCLUtils.cpp",
+		"../basic_initialize/btOpenCLUtils.h"
+	}
+
+end
+
+for _, vendor in ipairs({"AMD", "NVIDIA", "Intel", "Apple"}) do
+	createProject(vendor)
+end
diff --git a/opencl/parallel_primitives/benchmark/premake4.lua b/opencl/parallel_primitives/benchmark/premake4.lua
new file mode 100644
index 000000000..515540c8b
--- /dev/null
+++ b/opencl/parallel_primitives/benchmark/premake4.lua
@@ -0,0 +1,35 @@
+function createProject(vendor)
+	-- 'local' keeps hasCL from leaking into the global environment shared by all premake scripts.
+	local hasCL = findOpenCL(vendor)
+	if (hasCL) then
+
+		project ("OpenCL_radixsort_benchmark_" .. vendor)
+
+		initOpenCL(vendor)
+		
+		language "C++"
+				
+		kind "ConsoleApp"
+		targetdir "../../../bin"
+		includedirs {".."}
+		
+		links {
+			("OpenCL_lib_parallel_primitives_host_" .. vendor)
+		}
+		
+		files {
+			"test_large_problem_sorting.cpp",
+			"../../basic_initialize/btOpenCLUtils.cpp",
+			"../../basic_initialize/btOpenCLUtils.h",
+			"../host/btFillCL.cpp",
+			"../host/btPrefixScanCL.cpp",
+			"../host/btRadixSort32CL.cpp",
+		}
+		
+	end
+end
+
+createProject("AMD")
+createProject("Intel")
+createProject("NVIDIA")
+createProject("Apple")
\ No newline at end of file
diff --git a/opencl/parallel_primitives/benchmark/test_large_problem_sorting.cpp b/opencl/parallel_primitives/benchmark/test_large_problem_sorting.cpp
new file mode 100644
index 000000000..b3629c3f8
--- /dev/null
+++ b/opencl/parallel_primitives/benchmark/test_large_problem_sorting.cpp
@@ -0,0 +1,709 @@
+/******************************************************************************
+ * Copyright 2010 Duane Merrill
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. 
+ * 
+ * 
+ * 
+ * 
+ * AUTHORS' REQUEST: 
+ * 
+ * 		If you use|reference|benchmark this code, please cite our Technical 
+ * 		Report (http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf):
+ * 
+ *		@TechReport{ Merrill:Sorting:2010,
+ *        	author = "Duane Merrill and Andrew Grimshaw",
+ *        	title = "Revisiting Sorting for GPGPU Stream Architectures",
+ *        	year = "2010",
+ *        	institution = "University of Virginia, Department of Computer Science",
+ *        	address = "Charlottesville, VA, USA",
+ *        	number = "CS2010-03"
+ *		}
+ * 
+ * For more information, see our Google Code project site: 
+ * http://code.google.com/p/back40computing/
+ * 
+ * Thanks!
+ ******************************************************************************/
+
+/******************************************************************************
+ * Simple test driver program for *large-problem* radix sorting.
+ *
+ * Useful for demonstrating how to integrate radix sorting into 
+ * your application 
+ ******************************************************************************/
+
+/******************************************************************************
+ * Converted from CUDA to OpenCL/DirectCompute by Erwin Coumans
+ ******************************************************************************/
+#ifdef _WIN32
+#pragma warning (disable:4996)
+#endif
+#include <stdlib.h> 
+#include <stdio.h> 
+#include <string.h> 
+#include <math.h> 
+#include <float.h>
+#include <algorithm>
+#include <string>
+
+
+//#include <iostream>
+#include <sstream>
+/**********************
+*
+*/
+
+#include "../host/btRadixSort32CL.h"
+#include "../../basic_initialize/btOpenCLUtils.h"
+#include "../host/btQuickprof.h"
+
+cl_context g_cxMainContext;
+cl_device_id g_device;
+cl_command_queue g_cqCommandQueue;
+
+/***********************
+*
+*/
+
+bool g_verbose;
+///Preferred OpenCL device/platform. When < 0 then no preference is used. 
+///Note that btOpenCLUtils might still use the preference of using a platform vendor that matches the SDK vendor used to build the application.
+///Preferred device/platform take priority over this platform-vendor match
+int gPreferredDeviceId = -1;
+int gPreferredPlatformId = -1;
+
+
+
+/******************************************************************************
+ * Routines
+ ******************************************************************************/
+
+
+/**
+ * Keys-only sorting.  Uses the GPU to sort the specified vector of elements for the given 
+ * number of iterations, displaying runtime information.
+ *
+ * @param[in] 		num_elements 
+ * 		Size in elements of the vector to sort
+ * @param[in,out] 	h_keys 
+ * 		Vector of keys to sort; overwritten with the keys read back from the device
+ * @param[in] 		iterations  
+ * 		Number of times to invoke the GPU sorting primitive
+ *
+ * Uses the globals g_cxMainContext, g_device and g_cqCommandQueue.
+ */
+template <typename K>
+void TimedSort(
+	unsigned int num_elements, 
+	K *h_keys,
+	unsigned int iterations)
+{
+	printf("Keys only, %u iterations, %u elements\n", iterations, num_elements);
+
+	// Stage the keys as unsigned int, the element type handed to the sorter below.
+	btAlignedObjectArray<unsigned int> hostData;
+	hostData.resize(num_elements);
+	for (unsigned int i=0;i<num_elements;i++)
+	{
+		hostData[i] = h_keys[i];
+	}
+
+	btRadixSort32CL sorter(g_cxMainContext,g_device,g_cqCommandQueue);
+
+	btOpenCLArray<unsigned int> gpuData(g_cxMainContext,g_cqCommandQueue);
+	gpuData.copyFromHost(hostData);
+	//sorter.executeHost(gpuData);
+    sorter.execute(gpuData);
+    
+	btAlignedObjectArray<unsigned int> hostDataSorted;
+	gpuData.copyToHost(hostDataSorted);
+    
+	clFinish(g_cqCommandQueue);
+
+	{
+		//printf("Key-values, %d iterations, %d elements", iterations, num_elements);
+
+		// Create sorting enactor
+
+		// Perform the timed number of sorting iterations
+		double elapsed = 0;
+		float duration = 0;
+		btClock watch;
+
+		//warm-start
+		gpuData.copyFromHost(hostData);
+		clFinish(g_cqCommandQueue);
+		sorter.execute(gpuData);
+
+		watch.reset();
+
+			
+		for (unsigned int i = 0; i < iterations; i++) 
+		{
+
+
+
+			// Move a fresh copy of the problem into device storage
+			gpuData.copyFromHost(hostData);
+			clFinish(g_cqCommandQueue);
+
+			// Start GPU timing record
+			double startMs = watch.getTimeMicroseconds()/1e3;
+			
+			// Call the sorting API routine
+			sorter.execute(gpuData);
+
+
+
+			clFinish(g_cqCommandQueue);
+	
+			double stopMs = watch.getTimeMicroseconds()/1e3;
+
+			duration = stopMs - startMs;
+			
+			// End GPU timing record
+			elapsed += (double) duration;
+			printf("duration = %f\n", duration);
+		}
+
+		// Display timing information (the guards avoid dividing by zero when nothing was timed)
+		double avg_runtime = (iterations > 0) ? elapsed / iterations : 0.0;
+	//	double throughput = ((double) num_elements) / avg_runtime / 1000.0 / 1000.0; 
+	//   printf(", %f GPU ms, %f x10^9 elts/sec\n", 	avg_runtime,	throughput);
+		double throughput = (avg_runtime > 0) ? ((double) num_elements) / avg_runtime / 1000.0 : 0.0; 
+		printf(", %f GPU ms, %f x10^6 elts/sec\n", 	avg_runtime,	throughput);
+
+		gpuData.copyToHost(hostData);
+		for (unsigned int i=0;i<num_elements;i++)
+		{
+			h_keys[i] = hostData[i];
+		}
+	}
+}
+
+/**
+ * Key-value sorting.  Uses the GPU to sort the specified vector of elements for the given 
+ * number of iterations, displaying runtime information.
+ *
+ * @param[in] 		num_elements 
+ * 		Size in elements of the vector to sort
+ * @param[in,out] 	h_keys 
+ * 		Vector of keys to sort; overwritten with the keys read back from the device
+ * @param[in,out] 	h_values  
+ * 		Vector of values to sort; overwritten with the values read back from the device
+ * @param[in] 		iterations  
+ * 		Number of times to invoke the GPU sorting primitive
+ *
+ * Uses the globals g_cxMainContext, g_device and g_cqCommandQueue.
+ */
+template <typename K, typename V>
+void TimedSort(
+	unsigned int num_elements, 
+	K *h_keys,
+	V *h_values, 
+	unsigned int iterations) 
+{
+	
+	printf("Key-values, %u iterations, %u elements\n", iterations, num_elements);
+
+	// Pack each key/value pair into the btSortData record handed to the sorter below.
+	btAlignedObjectArray<btSortData> hostData;
+	hostData.resize(num_elements);
+	for (unsigned int i=0;i<num_elements;i++)
+	{
+		hostData[i].m_key = h_keys[i];
+		hostData[i].m_value = h_values[i];
+	}
+
+	btRadixSort32CL sorter(g_cxMainContext,g_device,g_cqCommandQueue);
+
+	btOpenCLArray<btSortData> gpuData(g_cxMainContext,g_cqCommandQueue);
+	gpuData.copyFromHost(hostData);
+	//sorter.executeHost(gpuData);
+    sorter.execute(gpuData);
+    
+	btAlignedObjectArray<btSortData> hostDataSorted;
+	gpuData.copyToHost(hostDataSorted);
+#if 0
+    for (int i=0;i<num_elements;i++)
+	{
+		printf("hostData[%d].m_key = %d\n",i, hostDataSorted[i].m_key);
+        printf("hostData[%d].m_value = %d\n",i,hostDataSorted[i].m_value);
+	}
+#endif
+    
+clFinish(g_cqCommandQueue);
+
+	{
+		//printf("Key-values, %d iterations, %d elements", iterations, num_elements);
+
+		// Create sorting enactor
+
+		// Perform the timed number of sorting iterations
+		double elapsed = 0;
+		float duration = 0;
+		btClock watch;
+		
+		//warm-start
+		gpuData.copyFromHost(hostData);
+		sorter.execute(gpuData);
+		clFinish(g_cqCommandQueue);
+
+		watch.reset();
+
+			
+		for (unsigned int i = 0; i < iterations; i++) 
+		{
+
+
+
+			// Move a fresh copy of the problem into device storage
+			gpuData.copyFromHost(hostData);
+			clFinish(g_cqCommandQueue);
+
+			// Start GPU timing record
+			double startMs = watch.getTimeMicroseconds()/1e3;
+			
+			// Call the sorting API routine
+			sorter.execute(gpuData);
+			clFinish(g_cqCommandQueue);
+	
+			double stopMs = watch.getTimeMicroseconds()/1e3;
+
+			duration = stopMs - startMs;
+			
+			// End GPU timing record
+			elapsed += (double) duration;
+			printf("duration = %f\n", duration);
+		}
+
+		// Display timing information (the guards avoid dividing by zero when nothing was timed)
+		double avg_runtime = (iterations > 0) ? elapsed / iterations : 0.0;
+	//	double throughput = ((double) num_elements) / avg_runtime / 1000.0 / 1000.0; 
+	//   printf(", %f GPU ms, %f x10^9 elts/sec\n", 	avg_runtime,	throughput);
+		double throughput = (avg_runtime > 0) ? ((double) num_elements) / avg_runtime / 1000.0 : 0.0; 
+		printf(", %f GPU ms, %f x10^6 elts/sec\n", 	avg_runtime,	throughput);
+
+		gpuData.copyToHost(hostData);
+		for (unsigned int i=0;i<num_elements;i++)
+		{
+			h_keys[i] = hostData[i].m_key;
+			h_values[i] = hostData[i].m_value;
+		}
+	}
+}
+
+
+
+/**
+ * Generates random 32-bit keys.
+ * 
+ * We always take the second-order byte from rand() because the higher-order 
+ * bits returned by rand() are commonly considered more uniformly distributed
+ * than the lower-order bits.
+ * 
+ * We can decrease the entropy level of keys by adopting the technique 
+ * of Thearling and Smith in which keys are computed from the bitwise AND of 
+ * multiple random samples: 
+ * 
+ * entropy_reduction	| Effectively-unique bits per key
+ * -----------------------------------------------------
+ * -1					| 0
+ * 0					| 32
+ * 1					| 25.95
+ * 2					| 17.41
+ * 3					| 10.78
+ * 4					| 6.42
+ * ...					| ...
+ * 
+ */
+template <typename K>
+void RandomBits(K &key, int entropy_reduction = 0, int lower_key_bits = sizeof(K) * 8)
+{
+	const unsigned int NUM_UCHARS = (sizeof(K) + sizeof(unsigned char) - 1) / sizeof(unsigned char);
+	unsigned char key_bits[NUM_UCHARS];
+	
+	do {
+		// Build each byte as the AND of (entropy_reduction + 1) samples; -1 leaves it at 0xff.
+		for (unsigned int j = 0; j < NUM_UCHARS; j++) {
+			unsigned char quarterword = 0xff;
+			for (int i = 0; i <= entropy_reduction; i++) {
+				quarterword &= (rand() >> 7);
+			}
+			key_bits[j] = quarterword;
+		}
+		
+		if (lower_key_bits < sizeof(K) * 8) {
+			unsigned long long base = 0;
+			memcpy(&base, key_bits, sizeof(K));
+			// Build the mask in 64 bits: "1 << n" is an int shift, undefined once n reaches the width of int.
+			base &= (1ULL << lower_key_bits) - 1;
+			memcpy(key_bits, &base, sizeof(K));
+		}
+		
+		memcpy(&key, key_bits, sizeof(K));
+	} while (key != key);		// avoids NaNs when generating random floating point numbers 
+}
+
+
+/******************************************************************************
+ * Templated routines for printing keys/values to the console 
+ ******************************************************************************/
+
+template<typename T> // generic fallback: prints val with "%d", so it only suits int-compatible T
+void PrintValue(T val) {
+	printf("%d", val);
+}
+
+template<> // float: printed with "%f"
+void PrintValue<float>(float val) {
+	printf("%f", val);
+}
+
+template<> // double: printed with "%f"
+void PrintValue<double>(double val) {
+	printf("%f", val);
+}
+
+template<> // unsigned char: printed as a number with "%u", not as a character
+void PrintValue<unsigned char>(unsigned char val) {
+	printf("%u", val);
+}
+
+template<> // unsigned short: printed with "%u"
+void PrintValue<unsigned short>(unsigned short val) {
+	printf("%u", val);
+}
+
+template<> // unsigned int: printed with "%u"
+void PrintValue<unsigned int>(unsigned int val) {
+	printf("%u", val);
+}
+
+template<> // long: printed with "%ld"
+void PrintValue<long>(long val) {
+	printf("%ld", val);
+}
+
+template<> // unsigned long: printed with "%lu"
+void PrintValue<unsigned long>(unsigned long val) {
+	printf("%lu", val);
+}
+
+template<> // long long: printed with "%lld"
+void PrintValue<long long>(long long val) {
+	printf("%lld", val);
+}
+
+template<> // unsigned long long: printed with "%llu"
+void PrintValue<unsigned long long>(unsigned long long val) {
+	printf("%llu", val);
+}
+
+
+
+/**
+ * Compares the equivalence of two arrays
+ */
+template <typename T, typename SizeT>
+int CompareResults(T* computed, T* reference, SizeT len, bool verbose = true)
+{
+	printf("\n");
+	// Locate the first element where the two arrays disagree.
+	SizeT bad = 0;
+	while (bad < len && !(computed[bad] != reference[bad])) {
+		++bad;
+	}
+	if (!(bad < len)) {
+		printf("CORRECT\n");
+		return 0;
+	}
+
+	printf("INCORRECT: [%lu]: ", (unsigned long) bad);
+	PrintValue<T>(computed[bad]);
+	printf(" != ");
+	PrintValue<T>(reference[bad]);
+
+	if (verbose) {
+		// Dump up to five elements on either side of the mismatch, first computed, then reference.
+		T* const arrays[2] = { computed, reference };
+		const char* const labels[2] = { "\nresult[...", "\nreference[..." };
+		for (int which = 0; which < 2; which++) {
+			printf("%s", labels[which]);
+			for (size_t j = (bad >= 5) ? bad - 5 : 0; (j < bad + 5) && (j < len); j++) {
+				PrintValue<T>(arrays[which][j]);
+				printf(", ");
+			}
+			printf("...]");
+		}
+	}
+	return 1;
+}
+
+/**
+ * Creates an example sorting problem whose keys is a vector of the specified 
+ * number of K elements, values of V elements, and then dispatches the problem 
+ * to the GPU for the given number of iterations, displaying runtime information.
+ *
+ * @param[in] 		iterations  
+ * 		Number of times to invoke the GPU sorting primitive
+ * @param[in] 		num_elements 
+ * 		Size in elements of the vector to sort
+ * @param[in] 		cfg 
+ * 		Config
+ */
+template<typename K, typename V>
+void TestSort(
+	unsigned int iterations,
+	int num_elements,
+	bool keys_only)
+{
+    // Allocate the sorting problem on the host and fill the keys with random bytes
+
+	K *h_keys = (K*) malloc(num_elements * sizeof(K));
+	K *h_reference_keys = (K*) malloc(num_elements * sizeof(K));
+	V *h_values = keys_only ? NULL : (V*) malloc(num_elements * sizeof(V));
+	const bool allocated = h_keys && h_reference_keys && (keys_only || h_values);
+	if (num_elements < 0 || (num_elements > 0 && !allocated)) {
+		printf("error: could not allocate host memory for %d elements\n", num_elements);
+		free(h_keys); free(h_reference_keys); free(h_values);
+		return;
+	}
+	// Use random bits
+	for (int i = 0; i < num_elements; ++i) {
+		RandomBits<K>(h_keys[i], 0);
+		//h_keys[i] = num_elements-i;
+        //h_keys[i] = 0xffffffffu-i;
+		if (!keys_only)
+			h_values[i] = h_keys[i];//0xffffffffu-i;
+
+		h_reference_keys[i] = h_keys[i];
+	}
+
+    // Run the timing test 
+	if (keys_only) {
+		TimedSort<K>(num_elements, h_keys, iterations);
+	} else {
+		TimedSort<K, V>(num_elements, h_keys, h_values, iterations);
+	}
+
+	// Display sorted key data
+	if (g_verbose) {
+		printf("\n\nKeys:\n");
+		for (int i = 0; i < num_elements; i++) {	
+			PrintValue<K>(h_keys[i]);
+			printf(", ");
+		}
+		printf("\n\n");
+	}	
+	
+    // Verify solution
+	std::sort(h_reference_keys, h_reference_keys + num_elements);	
+	CompareResults<K>(h_keys, h_reference_keys, num_elements, true);
+	printf("\n");
+	fflush(stdout);
+
+	// Free our allocated host memory, including the reference copy (free(NULL) is a no-op)
+	free(h_keys);
+	free(h_reference_keys);
+	free(h_values);
+}
+
+
+
+/**
+ * Displays the commandline usage for this tool
+ */
+void Usage() 
+{	// Keep the options and defaults below in sync with the values parsed and initialised in main().
+	printf("\ntest_large_problem_sorting [--help] [--v] [--i=<num-iterations>] [--n=<num-elements>] [--key-values] [--deviceId=<int>] [--platformId=<int>]\n"); 
+	printf("\n");
+	printf("\t--v\tDisplays sorted results to the console.\n");
+	printf("\n");
+	printf("\t--i\tPerforms the sorting operation <num-iterations> times\n");
+	printf("\t\t\ton the device. Re-copies original input each time. Default = 10\n");
+	printf("\n");
+	printf("\t--n\tThe number of elements to comprise the sample problem\n");
+	printf("\t\t\tDefault = 33554432 (32*1024*1024)\n");
+	printf("\n");
+	printf("\t--key-values\tSpecifies that keys are accommodated by value pairings\n");
+	printf("\n");
+}
+
+
+/******************************************************************************
+ * Command-line parsing
+ ******************************************************************************/
+#include <map>
+#include <algorithm>
+#include <string>
+
+class CommandLineArgs
+{
+protected:
+
+	// Option name (without the leading "--") mapped to its text value ("" for bare flags).
+	std::map<std::string, std::string> pairs;
+
+public:
+
+	// Collects every "--key" or "--key=value" argument; argv[0] and anything that does not
+	// start with "--" are ignored. When a key repeats, the last occurrence wins.
+	CommandLineArgs(int argc, char **argv)
+	{
+		using namespace std;
+
+		for (int i = 1; i < argc; i++)
+		{
+			string arg = argv[i];
+
+			// Check the length first: indexing a non-const string at length() is undefined before C++11.
+			if ((arg.length() < 2) || (arg[0] != '-') || (arg[1] != '-')) {
+				continue;
+			}
+
+			string key, val;
+			string::size_type pos = arg.find('=');
+			if (pos == string::npos) {
+				key = arg.substr(2);
+			} else {
+				key = arg.substr(2, pos - 2);
+				val = arg.substr(pos + 1);
+			}
+			pairs[key] = val;
+		}
+	}
+
+	// Returns true when "--arg_name" was present, with or without a value.
+	bool CheckCmdLineFlag(const char* arg_name)
+	{
+		return pairs.find(arg_name) != pairs.end();
+	}
+
+	// Parses the value of "--arg_name" into val; defined after the class.
+	template <typename T>
+	void GetCmdLineArgument(const char *arg_name, T &val);
+
+	// Number of distinct "--" options that were parsed.
+	int ParsedArgc()
+	{
+		return (int) pairs.size();
+	}
+};
+
+template <typename T>
+void CommandLineArgs::GetCmdLineArgument(const char *arg_name, T &val)
+{
+	// val is left untouched unless the option was given; otherwise its text is parsed with operator>>.
+	std::map<std::string, std::string>::iterator found = pairs.find(arg_name);
+	if (found == pairs.end()) return;
+
+	std::istringstream parser(found->second);
+	parser >> val;
+}
+
+template <>
+void CommandLineArgs::GetCmdLineArgument<char*>(const char* arg_name, char* &val)
+{
+	// Sets val to a malloc'ed copy of the option text, or to NULL when the option is absent.
+	std::map<std::string, std::string>::iterator found = pairs.find(arg_name);
+	if (found == pairs.end()) {
+		val = NULL;
+		return;
+	}
+
+	const std::string text = found->second;
+	const size_t bytes = sizeof(char) * (text.length() + 1);
+	val = (char*) malloc(bytes);
+	strcpy(val, text.c_str());
+}
+
+
+
+
+
+/******************************************************************************
+ * Main
+ ******************************************************************************/
+
+extern bool gDebugSkipLoadingBinary;
+
+int main( int argc, char** argv) 
+{
+	gDebugSkipLoadingBinary = true;
+
+	cl_int ciErrNum;
+	CommandLineArgs args(argc,argv);
+
+	// Answer --help before touching OpenCL, so usage is available without a device and
+	// the early return cannot leak a context or command queue.
+	if (args.CheckCmdLineFlag("help"))
+	{
+		Usage();
+		return 0;
+	}
+
+	args.GetCmdLineArgument("deviceId", gPreferredDeviceId);
+	args.GetCmdLineArgument("platformId", gPreferredPlatformId);
+
+	printf("Initialize OpenCL using btOpenCLUtils_createContextFromType\n");
+	cl_platform_id platformId;
+	g_cxMainContext = btOpenCLUtils_createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+	int numDev = btOpenCLUtils_getNumDevices(g_cxMainContext);
+
+	if (!numDev)
+	{
+		printf("error: no OpenCL devices\n");
+		exit(0);
+	}
+	int devId = 0;
+	g_device = btOpenCLUtils_getDevice(g_cxMainContext,devId);
+	btOpenCLUtils_printDeviceInfo(g_device);
+	// create a command-queue
+	g_cqCommandQueue = clCreateCommandQueue(g_cxMainContext, g_device, 0, &ciErrNum);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+
+
+	//srand(time(NULL));	
+	srand(0);				// presently deterministic
+
+    unsigned int num_elements 					= 32*1024*1024;//4*1024*1024;//4*1024*1024;//257;//8*524288;//2048;//512;//524288;
+    unsigned int iterations  					= 10;
+    bool keys_only = true;
+
+    //
+	// Check command line arguments
+    //
+
+	args.GetCmdLineArgument("i", iterations);
+	args.GetCmdLineArgument("n", num_elements);
+
+	keys_only = !args.CheckCmdLineFlag("key-values");
+	g_verbose = args.CheckCmdLineFlag("v");
+
+
+	TestSort<unsigned int, unsigned int>(
+			iterations,
+			num_elements, 
+			keys_only);
+
+	// Release the OpenCL objects created above before exiting.
+	clReleaseCommandQueue(g_cqCommandQueue);
+	clReleaseContext(g_cxMainContext);
+	return 0;
+
+}
diff --git a/opencl/parallel_primitives/host/CommandLineArgs.h b/opencl/parallel_primitives/host/CommandLineArgs.h
new file mode 100644
index 000000000..b2a43016f
--- /dev/null
+++ b/opencl/parallel_primitives/host/CommandLineArgs.h
@@ -0,0 +1,92 @@
+#ifndef COMMAND_LINE_ARGS_H
+#define COMMAND_LINE_ARGS_H
+
+/******************************************************************************
+ * Command-line parsing
+ ******************************************************************************/
+#include <map>
+#include <algorithm>
+#include <string>
+#include <cstring>
+#include <sstream>
+/// Minimal parser for "--key" and "--key=value" command-line options.
+/// Arguments that do not start with "--" are ignored; when a key is repeated
+/// the last occurrence wins.
+class CommandLineArgs
+{
+protected:
+
+	/// Option name (without the leading "--") mapped to its value text; the
+	/// value is empty for options given without '='.
+	std::map<std::string, std::string> pairs;
+
+public:
+
+	/// Collects every "--key[=value]" argument found in argv[1..argc-1].
+	CommandLineArgs(int argc, char **argv)
+	{
+		for (int index = 1; index < argc; ++index)
+		{
+			const std::string token(argv[index]);
+
+			// Only tokens carrying the "--" prefix are options.
+			if (token.compare(0, 2, "--") != 0)
+			{
+				continue;
+			}
+
+			const std::string::size_type eq = token.find('=');
+			if (eq == std::string::npos)
+			{
+				// Bare flag: everything after "--" is the key.
+				pairs[token.substr(2)] = std::string();
+			}
+			else
+			{
+				// Key is between "--" and the first '=', value is the rest.
+				pairs[token.substr(2, eq - 2)] = token.substr(eq + 1);
+			}
+		}
+	}
+
+	/// Returns true when "--arg_name" (with or without a value) was given.
+	bool CheckCmdLineFlag(const char* arg_name)
+	{
+		return pairs.find(arg_name) != pairs.end();
+	}
+
+	/// Reads the value of "--arg_name" into val; defined below the class.
+	template <typename T>
+	void GetCmdLineArgument(const char *arg_name, T &val);
+
+	/// Number of distinct option names that were parsed.
+	int ParsedArgc()
+	{
+		return pairs.size();
+	}
+};
+
+/// Looks up "--arg_name" and, when it was given, parses its value text into
+/// val with stream extraction (operator>>). val is not touched when the
+/// option is absent.
+template <typename T>
+void CommandLineArgs::GetCmdLineArgument(const char *arg_name, T &val)
+{
+	const std::map<std::string, std::string>::iterator match = pairs.find(arg_name);
+	if (match == pairs.end())
+	{
+		return;
+	}
+
+	std::istringstream parser(match->second);
+	parser >> val;
+}
+
+/// Specialisation for C strings: stores a freshly malloc()'ed, NUL-terminated
+/// copy of the option's value in val. val is set to NULL when the option was
+/// not given or the allocation failed. The caller owns the buffer and releases
+/// it with free().
+///
+/// Declared inline because a fully specialised function template is an
+/// ordinary function: without it every translation unit that includes this
+/// header emits its own definition and the program fails to link.
+template <>
+inline void CommandLineArgs::GetCmdLineArgument<char*>(const char* arg_name, char* &val)
+{
+	using namespace std;
+	map<string, string>::iterator itr = pairs.find(arg_name);
+	if (itr == pairs.end()) {
+		val = NULL;
+		return;
+	}
+
+	const string& s = itr->second;
+	val = (char*) malloc(sizeof(char) * (s.length() + 1));
+	if (val) {
+		// Only copy when the allocation succeeded.
+		std::strcpy(val, s.c_str());
+	}
+}
+
+#endif //COMMAND_LINE_ARGS_H
diff --git a/opencl/parallel_primitives/host/btAlignedAllocator.cpp b/opencl/parallel_primitives/host/btAlignedAllocator.cpp
new file mode 100644
index 000000000..a65296c6a
--- /dev/null
+++ b/opencl/parallel_primitives/host/btAlignedAllocator.cpp
@@ -0,0 +1,181 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "btAlignedAllocator.h"
+
+int gNumAlignedAllocs = 0;
+int gNumAlignedFree = 0;
+int gTotalBytesAlignedAllocs = 0;//detect memory leaks
+
+/// Default unaligned allocation hook: forwards the request to malloc().
+static void *btAllocDefault(size_t size)
+{
+	void *block = malloc(size);
+	return block;
+}
+
+/// Default unaligned release hook: hands the pointer back to free().
+static void btFreeDefault(void *ptr)
+{
+	free(ptr);
+}
+
+static btAllocFunc *sAllocFunc = btAllocDefault;
+static btFreeFunc *sFreeFunc = btFreeDefault;
+
+
+
+#if defined (BT_HAS_ALIGNED_ALLOCATOR)
+#include <malloc.h>
+/// Aligned allocation through _aligned_malloc().
+static void *btAlignedAllocDefault(size_t size, int alignment)
+{
+	void *block = _aligned_malloc(size, (size_t)alignment);
+	return block;
+}
+
+/// Releases a block obtained from the _aligned_malloc() based allocator.
+static void btAlignedFreeDefault(void *ptr)
+{
+	_aligned_free(ptr);
+}
+#elif defined(__CELLOS_LV2__)
+#include <stdlib.h>
+
+/// Aligned allocation through memalign().
+static inline void *btAlignedAllocDefault(size_t size, int alignment)
+{
+	void *block = memalign(alignment, size);
+	return block;
+}
+
+/// Releases a block obtained from the memalign() based allocator.
+static inline void btAlignedFreeDefault(void *ptr)
+{
+	free(ptr);
+}
+#else
+
+/// Portable fallback built on the unaligned allocator (sAllocFunc). The
+/// request is enlarged by one pointer plus (alignment-1) bytes; the address
+/// returned by sAllocFunc is stored in the pointer-sized slot directly below
+/// the aligned result so that btAlignedFreeDefault can recover it.
+/// Returns NULL when sAllocFunc fails.
+static inline void *btAlignedAllocDefault(size_t size, int alignment)
+{
+  char *raw = (char *)sAllocFunc(size + sizeof(void *) + (alignment-1));
+  if (!raw) {
+    return (void *)(raw);
+  }
+
+  void *aligned = btAlignPointer(raw + sizeof(void *),alignment);
+  // Remember the start of the underlying block just below the aligned address.
+  ((void **)(aligned))[-1] = (void *)(raw);
+  return (aligned);
+}
+
+/// Frees a block returned by the fallback btAlignedAllocDefault; NULL is ignored.
+static inline void btAlignedFreeDefault(void *ptr)
+{
+  if (!ptr) {
+    return;
+  }
+
+  // The slot below the aligned pointer holds the address to hand to sFreeFunc.
+  void *raw = ((void **)(ptr))[-1];
+  sFreeFunc(raw);
+}
+#endif
+
+
+static btAlignedAllocFunc *sAlignedAllocFunc = btAlignedAllocDefault;
+static btAlignedFreeFunc *sAlignedFreeFunc = btAlignedFreeDefault;
+
+/// Installs custom aligned allocate/free hooks. A NULL argument selects the
+/// corresponding built-in default instead.
+void btAlignedAllocSetCustomAligned(btAlignedAllocFunc *allocFunc, btAlignedFreeFunc *freeFunc)
+{
+  if (allocFunc) {
+    sAlignedAllocFunc = allocFunc;
+  } else {
+    sAlignedAllocFunc = btAlignedAllocDefault;
+  }
+
+  if (freeFunc) {
+    sAlignedFreeFunc = freeFunc;
+  } else {
+    sAlignedFreeFunc = btAlignedFreeDefault;
+  }
+}
+
+/// Installs custom unaligned allocate/free hooks. A NULL argument selects the
+/// corresponding built-in default (btAllocDefault / btFreeDefault) instead.
+void btAlignedAllocSetCustom(btAllocFunc *allocFunc, btFreeFunc *freeFunc)
+{
+  if (allocFunc) {
+    sAllocFunc = allocFunc;
+  } else {
+    sAllocFunc = btAllocDefault;
+  }
+
+  if (freeFunc) {
+    sFreeFunc = freeFunc;
+  } else {
+    sFreeFunc = btFreeDefault;
+  }
+}
+
+#ifdef BT_DEBUG_MEMORY_ALLOCATIONS
+//this generic allocator provides the total allocated number of bytes
+#include <stdio.h>
+
+/// Debug variant of the aligned allocator: counts allocations, tracks the
+/// total number of requested bytes and logs every call with its call site.
+///
+/// Each block obtained from sAllocFunc is laid out as
+///   [padding][size_t size][void* real][ user data ... ]
+/// where the returned pointer addresses the user data. Both bookkeeping slots
+/// are pointer-sized so they cannot overlap. Storing the size through an int*
+/// two ints below the result (as was done before) overwrites half of the saved
+/// 'real' pointer on targets with 64-bit pointers.
+/// NOTE(review): assumes sizeof(size_t) <= sizeof(void*) - confirm for exotic targets.
+///
+/// Returns NULL when sAllocFunc fails.
+void*   btAlignedAllocInternal  (size_t size, int alignment,int line,char* filename)
+{
+ void *ret;
+ char *real;
+
+ gTotalBytesAlignedAllocs += size;
+ gNumAlignedAllocs++;
+
+ 
+ real = (char *)sAllocFunc(size + 2*sizeof(void *) + (alignment-1));
+ if (real) {
+   ret = (void*) btAlignPointer(real + 2*sizeof(void *), alignment);
+   *((void **)(ret)-1) = (void *)(real);
+   // The size goes in the second pointer-sized slot below the result.
+   *((size_t*)((void **)(ret)-2)) = size;
+
+ } else {
+   ret = (void *)(real);//??
+ }
+
+ // %p is the portable conversion for pointers; %x truncates them on 64-bit targets.
+ printf("allocation#%d at address %p, from %s,line %d, size %d\n",gNumAlignedAllocs,(void*)real, filename,line,(int)size);
+
+ // Write the debug marker only into a block that exists and is large enough.
+ if (ret && size >= sizeof(int)) {
+   int* ptr = (int*)ret;
+   *ptr = 12;
+ }
+ return (ret);
+}
+
+/// Debug variant of the aligned free: logs the call, subtracts the recorded
+/// size from gTotalBytesAlignedAllocs and releases the underlying block through
+/// sFreeFunc. A NULL pointer is reported and otherwise ignored.
+void    btAlignedFreeInternal   (void* ptr,int line,char* filename)
+{
+
+ void* real;
+ gNumAlignedFree++;
+
+ if (ptr) {
+   real = *((void **)(ptr)-1);
+   // Read the size from the slot written by btAlignedAllocInternal.
+   int size = (int)*((size_t*)((void **)(ptr)-2));
+   gTotalBytesAlignedAllocs -= size;
+
+	   printf("free #%d at address %p, from %s,line %d, size %d\n",gNumAlignedFree,real, filename,line,size);
+
+   sFreeFunc(real);
+ } else
+ {
+	 printf("NULL ptr\n");
+ }
+}
+
+#else //BT_DEBUG_MEMORY_ALLOCATIONS
+
+/// Allocates size bytes aligned to alignment through the installed aligned
+/// allocator (sAlignedAllocFunc) and counts the call in gNumAlignedAllocs.
+void*	btAlignedAllocInternal	(size_t size, int alignment)
+{
+	++gNumAlignedAllocs;
+	void* const block = sAlignedAllocFunc(size, alignment);
+	return block;
+}
+
+/// Releases a block through the installed aligned free function and counts the
+/// call in gNumAlignedFree. A NULL pointer is ignored and not counted.
+void	btAlignedFreeInternal	(void* ptr)
+{
+	if (ptr)
+	{
+		++gNumAlignedFree;
+		sAlignedFreeFunc(ptr);
+	}
+}
+
+#endif //BT_DEBUG_MEMORY_ALLOCATIONS
+
diff --git a/opencl/parallel_primitives/host/btAlignedAllocator.h b/opencl/parallel_primitives/host/btAlignedAllocator.h
new file mode 100644
index 000000000..f168f3c66
--- /dev/null
+++ b/opencl/parallel_primitives/host/btAlignedAllocator.h
@@ -0,0 +1,107 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef BT_ALIGNED_ALLOCATOR
+#define BT_ALIGNED_ALLOCATOR
+
+///We will probably replace this with our own aligned memory allocator,
+///i.e. replace _aligned_malloc and _aligned_free with an implementation
+///that is more portable and more predictable.
+
+#include "btScalar.h"
+//#define BT_DEBUG_MEMORY_ALLOCATIONS 1
+#ifdef BT_DEBUG_MEMORY_ALLOCATIONS
+
+#define btAlignedAlloc(a,b) \
+		btAlignedAllocInternal(a,b,__LINE__,__FILE__)
+
+#define btAlignedFree(ptr) \
+		btAlignedFreeInternal(ptr,__LINE__,__FILE__)
+
+void*	btAlignedAllocInternal	(size_t size, int alignment,int line,char* filename);
+
+void	btAlignedFreeInternal	(void* ptr,int line,char* filename);
+
+#else
+	void*	btAlignedAllocInternal	(size_t size, int alignment);
+	void	btAlignedFreeInternal	(void* ptr);
+
+	#define btAlignedAlloc(size,alignment) btAlignedAllocInternal(size,alignment)
+	#define btAlignedFree(ptr) btAlignedFreeInternal(ptr)
+
+#endif
+typedef int	size_type;
+
+typedef void *(btAlignedAllocFunc)(size_t size, int alignment);
+typedef void (btAlignedFreeFunc)(void *memblock);
+typedef void *(btAllocFunc)(size_t size);
+typedef void (btFreeFunc)(void *memblock);
+
+///The developer can let all Bullet memory allocations go through a custom memory allocator, using btAlignedAllocSetCustom
+void btAlignedAllocSetCustom(btAllocFunc *allocFunc, btFreeFunc *freeFunc);
+///If the developer already has a custom aligned allocator, then btAlignedAllocSetCustomAligned can be used. The default aligned allocator pre-allocates extra memory using the non-aligned allocator, and instruments it.
+void btAlignedAllocSetCustomAligned(btAlignedAllocFunc *allocFunc, btAlignedFreeFunc *freeFunc);
+
+
+///The btAlignedAllocator is a portable class for aligned memory allocations.
+///Default implementations for unaligned and aligned allocations can be overridden by a custom allocator using btAlignedAllocSetCustom and btAlignedAllocSetCustomAligned.
+template < typename T , unsigned Alignment >
+class btAlignedAllocator {
+
+	typedef btAlignedAllocator< T , Alignment > self_type;
+
+public:
+
+	typedef T                value_type;
+	typedef T*               pointer;
+	typedef T&               reference;
+	typedef const T*         const_pointer;
+	typedef const T&         const_reference;
+
+	/// The allocator holds no state, so there is nothing to initialise.
+	btAlignedAllocator() {}
+	// (An explicit copy constructor taking self_type is intentionally left out.)
+
+	/// Converting constructor from an allocator for another element type with
+	/// the same alignment.
+	template < typename Other >
+	btAlignedAllocator( const btAlignedAllocator< Other , Alignment > & ) {}
+
+	/// Address of a mutable element.
+	pointer address( reference ref ) const
+	{
+		return &ref;
+	}
+
+	/// Address of a constant element.
+	const_pointer address( const_reference ref ) const
+	{
+		return &ref;
+	}
+
+	/// Allocates raw, Alignment-aligned storage for n elements via btAlignedAlloc.
+	/// The hint is accepted for interface compatibility and ignored.
+	pointer allocate( size_type n , const_pointer * hint = 0 )
+	{
+		(void)hint;
+		void * raw = btAlignedAlloc( sizeof(value_type) * n , Alignment );
+		return reinterpret_cast< pointer >( raw );
+	}
+
+	/// Copy-constructs value in the storage at ptr (placement new).
+	void construct( pointer ptr , const value_type & value )
+	{
+		new (ptr) value_type( value );
+	}
+
+	/// Returns storage obtained from allocate() via btAlignedFree.
+	void deallocate( pointer ptr )
+	{
+		void * raw = reinterpret_cast< void * >( ptr );
+		btAlignedFree( raw );
+	}
+
+	/// Runs the destructor of the element at ptr without releasing its storage.
+	void destroy( pointer ptr )
+	{
+		ptr->~value_type();
+	}
+
+	/// Maps this allocator to the equivalent allocator for element type O.
+	template < typename O > struct rebind {
+		typedef btAlignedAllocator< O , Alignment > other;
+	};
+
+	/// Assignment from any allocator with the same alignment is a no-op.
+	template < typename O >
+	self_type & operator=( const btAlignedAllocator< O , Alignment > & )
+	{
+		return *this;
+	}
+
+	/// All instances compare equal.
+	friend bool operator==( const self_type & , const self_type & )
+	{
+		return true;
+	}
+};
+
+
+
+#endif //BT_ALIGNED_ALLOCATOR
+
diff --git a/opencl/parallel_primitives/host/btAlignedObjectArray.h b/opencl/parallel_primitives/host/btAlignedObjectArray.h
new file mode 100644
index 000000000..24e59ab65
--- /dev/null
+++ b/opencl/parallel_primitives/host/btAlignedObjectArray.h
@@ -0,0 +1,511 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#ifndef BT_OBJECT_ARRAY__
+#define BT_OBJECT_ARRAY__
+
+#include "btScalar.h" // has definitions like SIMD_FORCE_INLINE
+#include "btAlignedAllocator.h"
+
+///If the platform doesn't support placement new, you can disable BT_USE_PLACEMENT_NEW
+///then the btAlignedObjectArray doesn't support objects with virtual methods, and non-trivial constructors/destructors
+///You can enable BT_USE_MEMCPY, then swapping elements in the array will use memcpy instead of operator=
+///see discussion here: http://continuousphysics.com/Bullet/phpBB2/viewtopic.php?t=1231 and
+///http://www.continuousphysics.com/Bullet/phpBB2/viewtopic.php?t=1240
+
+#define BT_USE_PLACEMENT_NEW 1
+//#define BT_USE_MEMCPY 1 //disable, because it is cumbersome to find out for each platform where memcpy is defined. It can be in <memory.h> or <string.h> or otherwise...
+#define BT_ALLOW_ARRAY_COPY_OPERATOR // enabling this can accidently perform deep copies of data if you are not careful
+
+#ifdef BT_USE_MEMCPY
+#include <memory.h>
+#include <string.h>
+#endif //BT_USE_MEMCPY
+
+#ifdef BT_USE_PLACEMENT_NEW
+#include <new> //for placement new
+#endif //BT_USE_PLACEMENT_NEW
+
+
+///The btAlignedObjectArray template class uses a subset of the stl::vector interface for its methods
+///It is developed to replace stl::vector to avoid portability issues, including STL alignment issues to add SIMD/SSE data
+template <typename T> 
+//template <class T> 
+class btAlignedObjectArray
+{
+	///allocator used for the element storage (16-byte alignment)
+	btAlignedAllocator<T , 16>	m_allocator;
+
+	///number of elements currently in the array
+	int					m_size;
+	///number of elements the current storage can hold
+	int					m_capacity;
+	///element storage, 0 when nothing is allocated
+	T*					m_data;
+	//PCK: added this line
+	///false when m_data was supplied through initializeFromBuffer; such storage is never deallocated here
+	bool				m_ownsMemory;
+
+#ifdef BT_ALLOW_ARRAY_COPY_OPERATOR
+public:
+	///Deep-copies other into this array (see copyFromArray). Self-assignment is a no-op.
+	SIMD_FORCE_INLINE btAlignedObjectArray<T>& operator=(const btAlignedObjectArray<T> &other)
+	{
+		copyFromArray(other);
+		return *this;
+	}
+#else//BT_ALLOW_ARRAY_COPY_OPERATOR
+private:
+		SIMD_FORCE_INLINE btAlignedObjectArray<T>& operator=(const btAlignedObjectArray<T> &other);
+#endif//BT_ALLOW_ARRAY_COPY_OPERATOR
+
+protected:
+		///growth policy: double the size, or 1 for an empty array
+		SIMD_FORCE_INLINE	int	allocSize(int size)
+		{
+			return (size ? size*2 : 1);
+		}
+		///copies elements [start,end) of this array to the same indices of dest;
+		///with placement new the destination slots are treated as raw storage
+		SIMD_FORCE_INLINE	void	copy(int start,int end, T* dest) const
+		{
+			int i;
+			for (i=start;i<end;++i)
+#ifdef BT_USE_PLACEMENT_NEW
+				new (&dest[i]) T(m_data[i]);
+#else
+				dest[i] = m_data[i];
+#endif //BT_USE_PLACEMENT_NEW
+		}
+
+		///resets all members to the empty, memory-owning state without releasing anything
+		SIMD_FORCE_INLINE	void	init()
+		{
+			//PCK: added this line
+			m_ownsMemory = true;
+			m_data = 0;
+			m_size = 0;
+			m_capacity = 0;
+		}
+		///runs the destructor of elements [first,last)
+		SIMD_FORCE_INLINE	void	destroy(int first,int last)
+		{
+			int i;
+			for (i=first; i<last;i++)
+			{
+				m_data[i].~T();
+			}
+		}
+
+		///allocates raw storage for size elements, returns 0 when size is 0
+		SIMD_FORCE_INLINE	void* allocate(int size)
+		{
+			if (size)
+				return m_allocator.allocate(size);
+			return 0;
+		}
+
+		///releases the storage if this array owns it and clears m_data
+		SIMD_FORCE_INLINE	void	deallocate()
+		{
+			if(m_data)	{
+				//PCK: enclosed the deallocation in this block
+				if (m_ownsMemory)
+				{
+					m_allocator.deallocate(m_data);
+				}
+				m_data = 0;
+			}
+		}
+
+	
+
+
+	public:
+		
+		btAlignedObjectArray()
+		{
+			init();
+		}
+
+		~btAlignedObjectArray()
+		{
+			clear();
+		}
+
+		///Generally it is best to avoid using the copy constructor of an btAlignedObjectArray, and use a (const) reference to the array instead.
+		btAlignedObjectArray(const btAlignedObjectArray& otherArray)
+		{
+			init();
+
+			int otherSize = otherArray.size();
+			//reserve raw storage and copy-construct straight into it; resizing first
+			//would default-construct elements that copy() then overwrites without destroying them
+			reserve (otherSize);
+			otherArray.copy(0, otherSize, m_data);
+			m_size = otherSize;
+		}
+
+		
+		
+		/// return the number of elements in the array
+		SIMD_FORCE_INLINE	int size() const
+		{	
+			return m_size;
+		}
+		
+		SIMD_FORCE_INLINE const T& at(int n) const
+		{
+			btAssert(n>=0);
+			btAssert(n<size());
+			return m_data[n];
+		}
+
+		SIMD_FORCE_INLINE T& at(int n)
+		{
+			btAssert(n>=0);
+			btAssert(n<size());
+			return m_data[n];
+		}
+
+		SIMD_FORCE_INLINE const T& operator[](int n) const
+		{
+			btAssert(n>=0);
+			btAssert(n<size());
+			return m_data[n];
+		}
+
+		SIMD_FORCE_INLINE T& operator[](int n)
+		{
+			btAssert(n>=0);
+			btAssert(n<size());
+			return m_data[n];
+		}
+		
+
+		///clear the array, deallocated memory. Generally it is better to use array.resize(0), to reduce performance overhead of run-time memory (de)allocations.
+		SIMD_FORCE_INLINE	void	clear()
+		{
+			destroy(0,size());
+			
+			deallocate();
+			
+			init();
+		}
+
+		///removes and destroys the last element; the array must not be empty
+		SIMD_FORCE_INLINE	void	pop_back()
+		{
+			btAssert(m_size>0);
+			m_size--;
+			m_data[m_size].~T();
+		}
+
+
+		///resizeNoInitialize changes the number of elements without constructing new elements or destroying removed ones;
+		///storage is grown when needed and never shrunk.
+		SIMD_FORCE_INLINE	void	resizeNoInitialize(int newsize)
+		{
+			int curSize = size();
+
+			if (newsize < curSize)
+			{
+			} else
+			{
+				if (newsize > size())
+				{
+					reserve(newsize);
+				}
+				//leave this uninitialized
+			}
+			m_size = newsize;
+		}
+	
+		///resize changes the number of elements in the array. If the new size is larger, the new elements will be constructed using the optional second argument.
+		///when the new number of elements is smaller, the destructor will be called, but memory will not be freed, to reduce performance overhead of run-time memory (de)allocations.
+		SIMD_FORCE_INLINE	void	resize(int newsize, const T& fillData=T())
+		{
+			int curSize = size();
+
+			if (newsize < curSize)
+			{
+				for(int i = newsize; i < curSize; i++)
+				{
+					m_data[i].~T();
+				}
+			} else
+			{
+				if (newsize > size())
+				{
+					reserve(newsize);
+				}
+#ifdef BT_USE_PLACEMENT_NEW
+				for (int i=curSize;i<newsize;i++)
+				{
+					new ( &m_data[i]) T(fillData);
+				}
+#endif //BT_USE_PLACEMENT_NEW
+
+			}
+
+			m_size = newsize;
+		}
+		///appends one element slot without constructing it and returns a reference to it
+		SIMD_FORCE_INLINE	T&  expandNonInitializing( )
+		{	
+			int sz = size();
+			if( sz == capacity() )
+			{
+				reserve( allocSize(size()) );
+			}
+			m_size++;
+
+			return m_data[sz];		
+		}
+
+
+		///appends a copy of fillValue and returns a reference to the new element
+		SIMD_FORCE_INLINE	T&  expand( const T& fillValue=T())
+		{	
+			int sz = size();
+			if( sz == capacity() )
+			{
+				reserve( allocSize(size()) );
+			}
+			m_size++;
+#ifdef BT_USE_PLACEMENT_NEW
+			new (&m_data[sz]) T(fillValue); //use the in-place new (not really allocating heap memory)
+#endif
+
+			return m_data[sz];		
+		}
+
+
+		///appends a copy of _Val, growing the storage when it is full
+		SIMD_FORCE_INLINE	void push_back(const T& _Val)
+		{	
+			int sz = size();
+			if( sz == capacity() )
+			{
+				reserve( allocSize(size()) );
+			}
+			
+#ifdef BT_USE_PLACEMENT_NEW
+			new ( &m_data[m_size] ) T(_Val);
+#else
+			m_data[size()] = _Val;			
+#endif //BT_USE_PLACEMENT_NEW
+
+			m_size++;
+		}
+
+	
+		/// return the pre-allocated (reserved) elements, this is at least as large as the total number of elements,see size() and reserve()
+		SIMD_FORCE_INLINE	int capacity() const
+		{	
+			return m_capacity;
+		}
+		
+		///makes sure the storage can hold at least _Count elements; existing elements are
+		///copied into the new storage and the array owns its memory afterwards
+		SIMD_FORCE_INLINE	void reserve(int _Count)
+		{	// determine new minimum length of allocated storage
+			if (capacity() < _Count)
+			{	// not enough room, reallocate
+				T*	s = (T*)allocate(_Count);
+
+				copy(0, size(), s);
+
+				destroy(0,size());
+
+				deallocate();
+				
+				//PCK: added this line
+				m_ownsMemory = true;
+
+				m_data = s;
+				
+				m_capacity = _Count;
+
+			}
+		}
+
+
+		///default comparison functor, orders elements with operator<
+		class less
+		{
+			public:
+
+				bool operator() ( const T& a, const T& b )
+				{
+					return ( a < b );
+				}
+		};
+	
+
+		///recursive quicksort of the index range [lo,hi] (both inclusive), pivoting on the middle element
+		template <typename L>
+		void quickSortInternal(const L& CompareFunc,int lo, int hi)
+		{
+		//  lo is the lower index, hi is the upper index
+		//  of the region of array a that is to be sorted
+			int i=lo, j=hi;
+			T x=m_data[(lo+hi)/2];
+
+			//  partition
+			do
+			{    
+				while (CompareFunc(m_data[i],x)) 
+					i++; 
+				while (CompareFunc(x,m_data[j])) 
+					j--;
+				if (i<=j)
+				{
+					swap(i,j);
+					i++; j--;
+				}
+			} while (i<=j);
+
+			//  recursion
+			if (lo<j) 
+				quickSortInternal( CompareFunc, lo, j);
+			if (i<hi) 
+				quickSortInternal( CompareFunc, i, hi);
+		}
+
+
+		///sorts the whole array with quickSortInternal using CompareFunc as the "less than" predicate
+		template <typename L>
+		void quickSort(const L& CompareFunc)
+		{
+			//don't sort 0 or 1 elements
+			if (size()>1)
+			{
+				quickSortInternal(CompareFunc,0,size()-1);
+			}
+		}
+
+
+		///heap sort from http://www.csse.monash.edu.au/~lloyd/tildeAlgDS/Sort/Heap/
+		///k and n are 1-based positions; pArr itself is indexed 0-based (hence the "- 1")
+		template <typename L>
+		void downHeap(T *pArr, int k, int n, const L& CompareFunc)
+		{
+			/*  PRE: a[k+1..N] is a heap */
+			/* POST:  a[k..N]  is a heap */
+			
+			T temp = pArr[k - 1];
+			/* k has child(s) */
+			while (k <= n/2) 
+			{
+				int child = 2*k;
+				
+				if ((child < n) && CompareFunc(pArr[child - 1] , pArr[child]))
+				{
+					child++;
+				}
+				/* pick larger child */
+				if (CompareFunc(temp , pArr[child - 1]))
+				{
+					/* move child up */
+					pArr[k - 1] = pArr[child - 1];
+					k = child;
+				}
+				else
+				{
+					break;
+				}
+			}
+			pArr[k - 1] = temp;
+		} /*downHeap*/
+
+		///exchanges the elements at index0 and index1
+		void	swap(int index0,int index1)
+		{
+#ifdef BT_USE_MEMCPY
+			char	temp[sizeof(T)];
+			memcpy(temp,&m_data[index0],sizeof(T));
+			memcpy(&m_data[index0],&m_data[index1],sizeof(T));
+			memcpy(&m_data[index1],temp,sizeof(T));
+#else
+			T temp = m_data[index0];
+			m_data[index0] = m_data[index1];
+			m_data[index1] = temp;
+#endif //BT_USE_MEMCPY
+
+		}
+
+	///sorts the whole array in place with heap sort, using CompareFunc as the "less than" predicate
+	template <typename L>
+	void heapSort(const L& CompareFunc)
+	{
+		/* sort a[0..N-1],  N.B. 0 to N-1 */
+		int k;
+		int n = m_size;
+		for (k = n/2; k > 0; k--) 
+		{
+			downHeap(m_data, k, n, CompareFunc);
+		}
+
+		/* a[1..N] is now a heap */
+		while ( n>=1 ) 
+		{
+			swap(0,n-1); /* largest of a[0..n-1] */
+
+
+			n = n - 1;
+			/* restore a[1..i-1] heap */
+			downHeap(m_data, 1, n, CompareFunc);
+		} 
+	}
+
+	///non-recursive binary search, assumes sorted array
+	///returns the index of an element equal to key, or size() when there is none
+	int	findBinarySearch(const T& key) const
+	{
+		int first = 0;
+		int last = size()-1;
+
+		//assume sorted array
+		while (first <= last) {
+			int mid = first + (last - first) / 2;  // compute mid point without overflowing first+last
+			if (key > m_data[mid]) 
+				first = mid + 1;  // repeat search in top half.
+			else if (key < m_data[mid]) 
+				last = mid - 1; // repeat search in bottom half.
+			else
+				return mid;     // found it. return position /////
+		}
+		return size();    // failed to find key
+	}
+
+
+	///returns the index of the first element equal to key, or size() when there is none
+	int	findLinearSearch(const T& key) const
+	{
+		int index=size();
+		int i;
+
+		for (i=0;i<size();i++)
+		{
+			if (m_data[i] == key)
+			{
+				index = i;
+				break;
+			}
+		}
+		return index;
+	}
+
+	///removes the first element equal to key by swapping it with the last element; element order is not preserved
+	void	remove(const T& key)
+	{
+
+		int findIndex = findLinearSearch(key);
+		if (findIndex<size())
+		{
+			swap( findIndex,size()-1);
+			pop_back();
+		}
+	}
+
+	//PCK: whole function
+	///makes the array use an externally owned buffer holding size elements with room for capacity;
+	///the current contents are cleared first and the buffer is not freed by this array
+	void initializeFromBuffer(void *buffer, int size, int capacity)
+	{
+		clear();
+		m_ownsMemory = false;
+		m_data = (T*)buffer;
+		m_size = size;
+		m_capacity = capacity;
+	}
+
+	///replaces the contents of this array with a copy of otherArray
+	void copyFromArray(const btAlignedObjectArray& otherArray)
+	{
+		//copying an array onto itself would re-construct every element on top of itself
+		if (this == &otherArray)
+		{
+			return;
+		}
+
+		int otherSize = otherArray.size();
+
+		//destroy the current elements first, so copy() constructs into raw storage
+		//rather than on top of live objects
+		destroy(0,size());
+		m_size = 0;
+
+		reserve (otherSize);
+		otherArray.copy(0, otherSize, m_data);
+		m_size = otherSize;
+	}
+
+};
+
+#endif //BT_OBJECT_ARRAY__
diff --git a/opencl/parallel_primitives/host/btBoundSearchCL.cpp b/opencl/parallel_primitives/host/btBoundSearchCL.cpp
new file mode 100644
index 000000000..9395e9cc8
--- /dev/null
+++ b/opencl/parallel_primitives/host/btBoundSearchCL.cpp
@@ -0,0 +1,213 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+//Host-code rewritten by Erwin Coumans
+
+#define BOUNDSEARCH_PATH "opencl/parallel_primitives/kernels/BoundSearchKernels.cl"
+#define KERNEL0 "SearchSortDataLowerKernel"
+#define KERNEL1 "SearchSortDataUpperKernel"
+#define KERNEL2 "SubtractKernel"
+
+
+#include "btBoundSearchCL.h"
+#include "../../basic_initialize/btOpenCLUtils.h"
+#include "btLauncherCL.h"
+#include "../kernels/BoundSearchKernelsCL.h"
+
+/// Compiles the bound-search program for the given context/device and creates
+/// the lower-bound and upper-bound kernels. When maxSize is non-zero the
+/// subtract kernel and two scratch buffers of maxSize elements are created as
+/// well; execute() needs them for the COUNT option. A btFillCL helper is always
+/// created.
+btBoundSearchCL::btBoundSearchCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int maxSize)
+	:m_context(ctx),
+	m_device(device),
+	m_queue(queue)
+{
+
+	const char* additionalMacros = "";
+
+	cl_int pErrNum;
+	const char* kernelSource = boundSearchKernelsCL;
+
+	cl_program boundSearchProg = btOpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, BOUNDSEARCH_PATH);
+	btAssert(boundSearchProg);
+
+	m_lowerSortDataKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SearchSortDataLowerKernel", &pErrNum, boundSearchProg,additionalMacros );
+	btAssert(m_lowerSortDataKernel );
+
+	m_upperSortDataKernel= btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SearchSortDataUpperKernel", &pErrNum, boundSearchProg,additionalMacros );
+	btAssert(m_upperSortDataKernel);
+
+	m_subtractKernel = 0;
+
+	if( maxSize )
+	{
+		m_subtractKernel= btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SubtractKernel", &pErrNum, boundSearchProg,additionalMacros );
+		btAssert(m_subtractKernel);
+	}
+
+	// Not used by this class at present; initialised so it never holds garbage.
+	m_constbtOpenCLArray = 0;
+	
+	m_lower = (maxSize == 0)? 0: new btOpenCLArray<unsigned int>(ctx,queue,maxSize );
+	m_upper = (maxSize == 0)? 0: new btOpenCLArray<unsigned int>(ctx,queue, maxSize );
+
+	m_filler = new btFillCL(ctx,device,queue);
+
+	// Each kernel keeps its own reference to the program, so the local
+	// reference can be dropped here instead of being leaked.
+	if( boundSearchProg )
+	{
+		clReleaseProgram(boundSearchProg);
+	}
+}
+
+/// Deletes the scratch buffers and the fill helper and releases the kernels.
+btBoundSearchCL::~btBoundSearchCL()
+{
+	
+	// Deleting a null pointer is fine: m_lower/m_upper are 0 when maxSize was 0.
+	delete m_lower;
+	delete m_upper;
+	delete m_filler;
+			
+	clReleaseKernel(m_lowerSortDataKernel);
+	clReleaseKernel(m_upperSortDataKernel);
+	// The subtract kernel only exists when the object was built with maxSize != 0.
+	if( m_subtractKernel )
+	{
+		clReleaseKernel(m_subtractKernel);
+	}
+
+}
+
+
+/// Launches the bound-search kernels on the device.
+///
+/// BOUND_LOWER / BOUND_UPPER launch the matching kernel over nSrc work items
+/// with arguments (src, dst, nSrc, nDst). COUNT zero-fills the internal
+/// m_lower/m_upper buffers, runs both bound searches into them and then runs the
+/// subtract kernel over nDst work items, writing to dst. COUNT requires the
+/// object to have been constructed with a non-zero maxSize of at least nDst.
+/// src has to be sorted: src[i].m_key <= src[i+1].m_key.
+void btBoundSearchCL::execute(btOpenCLArray<btSortData>& src, int nSrc, btOpenCLArray<unsigned int>& dst, int nDst, Option option )
+{
+	if( option == BOUND_LOWER || option == BOUND_UPPER )
+	{
+		// Both bound searches take the same arguments; only the kernel differs.
+		cl_kernel kernel = (option == BOUND_LOWER)? m_lowerSortDataKernel: m_upperSortDataKernel;
+
+		btBufferInfoCL bInfo[] = { btBufferInfoCL( src.getBufferCL(), true ), btBufferInfoCL( dst.getBufferCL() ) };
+
+		btLauncherCL launcher( m_queue, kernel );
+		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
+		launcher.setConst( nSrc );
+		launcher.setConst( nDst );
+
+		launcher.launch1D( nSrc, 64 );
+	}
+	else if( option == COUNT )
+	{
+		btAssert( m_lower );
+		btAssert( m_upper );
+		// nDst elements are written into each scratch buffer below, so the
+		// buffers must be at least that large.
+		btAssert( m_lower->capacity() >= (int)nDst );
+		btAssert( m_upper->capacity() >= (int)nDst );
+
+		int zero = 0;
+		m_filler->execute( *m_lower, zero, nDst );
+		m_filler->execute( *m_upper, zero, nDst );
+
+		execute( src, nSrc, *m_lower, nDst, BOUND_LOWER );
+		execute( src, nSrc, *m_upper, nDst, BOUND_UPPER );
+
+		{
+			btBufferInfoCL bInfo[] = { btBufferInfoCL( m_upper->getBufferCL(), true ), btBufferInfoCL( m_lower->getBufferCL(), true ), btBufferInfoCL( dst.getBufferCL() ) };
+
+			btLauncherCL  launcher( m_queue, m_subtractKernel );
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
+			launcher.setConst( nSrc );
+			launcher.setConst( nDst );
+
+			launcher.launch1D( nDst, 64 );
+		}
+	}
+	else
+	{
+		btAssert( 0 );
+	}
+
+}
+
+
+/// CPU reference implementation of execute().
+///
+/// BOUND_LOWER writes, for every key that occurs in src, the index of its first
+/// occurrence to dst[key]. BOUND_UPPER writes one past the index of its last
+/// occurrence. COUNT zero-initialises two temporary arrays of nDst elements,
+/// runs both bound searches into them and stores upper[i] - lower[i] in dst[i].
+/// Entries of dst whose index does not occur as a key are not written by the
+/// bound searches. src has to be sorted by m_key in non-decreasing order.
+void btBoundSearchCL::executeHost( btAlignedObjectArray<btSortData>& src, int nSrc, 
+	btAlignedObjectArray<unsigned int>& dst,  int nDst, Option option )
+{
+	// Precondition check: keys must be in non-decreasing order.
+	for(int idx=0; idx<nSrc-1; idx++) 
+		btAssert( src[idx].m_key <= src[idx+1].m_key );
+
+	// Sentinels compared against the first / last element of src.
+	// zeroData is initialised but not read below.
+	btSortData minData,zeroData,maxData;
+	minData.m_key = -1;
+	minData.m_value = -1;
+	zeroData.m_key=0;
+	zeroData.m_value=0;
+	maxData.m_key = nDst;
+	maxData.m_value = nDst;
+
+	switch( option )
+	{
+	case BOUND_LOWER:
+	{
+		for(int cur=0; cur<nSrc; cur++)
+		{
+			btSortData& before = (cur==0)? minData: src[cur-1];
+			btSortData& here = src[cur];
+
+			// A key change between neighbours marks the first occurrence of here's key.
+			if( before.m_key != here.m_key )
+			{
+				int slot = here.m_key;
+				dst[slot] = cur;
+			}
+		}
+		break;
+	}
+	case BOUND_UPPER:
+	{
+		for(int cur=1; cur<nSrc+1; cur++)
+		{
+			btSortData& before = src[cur-1];
+			btSortData& here = (cur==nSrc)? maxData: src[cur];
+
+			// A key change between neighbours marks the end of before's run.
+			if( before.m_key != here.m_key )
+			{
+				int slot = before.m_key;
+				dst[slot] = cur;
+			}
+		}
+		break;
+	}
+	case COUNT:
+	{
+		btAlignedObjectArray<unsigned int> lowerBounds;
+		lowerBounds.resize(nDst );
+		btAlignedObjectArray<unsigned int> upperBounds;
+		upperBounds.resize(nDst );
+
+		for(int slot=0; slot<nDst; slot++) 
+		{ 
+			upperBounds[slot] = 0;
+			lowerBounds[slot] = 0;
+		}
+
+		executeHost( src, nSrc, lowerBounds, nDst, BOUND_LOWER );
+		executeHost( src, nSrc, upperBounds, nDst, BOUND_UPPER );
+
+		for( int slot=0; slot<nDst; slot++) 
+		{ 
+			dst[slot] = upperBounds[slot] - lowerBounds[slot]; 
+		}
+		break;
+	}
+	default:
+		btAssert( 0 );
+		break;
+	}
+}
diff --git a/opencl/parallel_primitives/host/btBoundSearchCL.h b/opencl/parallel_primitives/host/btBoundSearchCL.h
new file mode 100644
index 000000000..161b4edf3
--- /dev/null
+++ b/opencl/parallel_primitives/host/btBoundSearchCL.h
@@ -0,0 +1,67 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+#ifndef BT_BOUNDSEARCH_H
+#define BT_BOUNDSEARCH_H
+
+#pragma once
+
+/*#include <Adl/Adl.h>
+#include <AdlPrimitives/Math/Math.h>
+#include <AdlPrimitives/Sort/SortData.h>
+#include <AdlPrimitives/Fill/Fill.h>
+*/
+
+#include "btOpenCLArray.h"
+#include "btFillCL.h"
+#include "btRadixSort32CL.h" //for btSortData (perhaps move it?)
+/// Searches a key-sorted btSortData array for the lower/upper bound of every
+/// key, or counts the occurrences of every key, either on an OpenCL device
+/// (execute) or on the host (executeHost).
+class btBoundSearchCL
+{
+	public:
+
+		/// Selects what execute()/executeHost() compute.
+		enum Option
+		{
+			BOUND_LOWER,	///< dst[key] = index of the first element with that key
+			BOUND_UPPER,	///< dst[key] = one past the index of the last element with that key
+			COUNT,			///< dst[i] = upper bound minus lower bound for index i
+		};
+
+		// OpenCL handles passed to the constructor; they are stored, not created here.
+		cl_context m_context;
+		cl_device_id m_device;
+		cl_command_queue m_queue;
+
+		
+		// Kernels created by the constructor; m_subtractKernel is 0 when the
+		// object was constructed with size == 0.
+		cl_kernel m_lowerSortDataKernel;
+		cl_kernel m_upperSortDataKernel;
+		cl_kernel m_subtractKernel;
+		
+		// NOTE(review): m_constbtOpenCLArray is not referenced by the constructor,
+		// destructor or execute functions in btBoundSearchCL.cpp - confirm it is still needed.
+		btOpenCLArray<btInt4>* m_constbtOpenCLArray;
+		// Scratch buffers for the COUNT option; 0 when constructed with size == 0.
+		btOpenCLArray<unsigned int>* m_lower;
+		btOpenCLArray<unsigned int>* m_upper;
+		
+		// Helper used to zero-fill m_lower and m_upper before a COUNT run.
+		btFillCL* m_filler;
+		
+		/// size is the element count of the scratch buffers used by COUNT; pass 0
+		/// when only BOUND_LOWER / BOUND_UPPER are needed.
+		btBoundSearchCL(cl_context context, cl_device_id device, cl_command_queue queue, int size);
+
+		virtual ~btBoundSearchCL();
+
+		//	src has to be src[i].m_key <= src[i+1].m_key
+		/// Device version: launches the OpenCL kernel(s) for the given option.
+		void execute( btOpenCLArray<btSortData>& src, int nSrc, btOpenCLArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER );
+
+		/// Host version computing the same quantities on the CPU.
+		void executeHost( btAlignedObjectArray<btSortData>& src, int nSrc, btAlignedObjectArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER);
+};
+
+
+#endif //BT_BOUNDSEARCH_H
diff --git a/opencl/parallel_primitives/host/btBufferInfoCL.h b/opencl/parallel_primitives/host/btBufferInfoCL.h
new file mode 100644
index 000000000..48798e232
--- /dev/null
+++ b/opencl/parallel_primitives/host/btBufferInfoCL.h
@@ -0,0 +1,19 @@
+
+#ifndef BT_BUFFER_INFO_CL_H
+#define BT_BUFFER_INFO_CL_H
+
+#include "btOpenCLArray.h"
+
+
+struct btBufferInfoCL
+{
+	/// OpenCL buffer handle plus a flag recording whether it is meant to be read-only.
+	cl_mem m_clBuffer;
+	bool m_isReadOnly;
+
+	/// Not marked explicit, so a bare cl_mem converts implicitly to an entry with isReadOnly == false.
+	btBufferInfoCL(cl_mem buff, bool isReadOnly = false)
+		: m_clBuffer(buff), m_isReadOnly(isReadOnly) {}
+};
+
+#endif //BT_BUFFER_INFO_CL_H
diff --git a/opencl/parallel_primitives/host/btFillCL.cpp b/opencl/parallel_primitives/host/btFillCL.cpp
new file mode 100644
index 000000000..18a7e2093
--- /dev/null
+++ b/opencl/parallel_primitives/host/btFillCL.cpp
@@ -0,0 +1,126 @@
+#include "btFillCL.h"
+#include "../../basic_initialize/btOpenCLUtils.h"
+#include "btBufferInfoCL.h"
+#include "btLauncherCL.h"
+
+#define FILL_CL_PROGRAM_PATH "opencl/parallel_primitives/kernels/FillKernels.cl"
+
+#include "../kernels/FillKernelsCL.h"
+
+btFillCL::btFillCL(cl_context ctx, cl_device_id device, cl_command_queue queue)
+:m_commandQueue(queue)
+{
+	// Compile the embedded fill-kernel source once, then create one kernel object per element type.
+	const char* kernelSource = fillKernelsCL;
+	cl_int pErrNum;
+	const char* additionalMacros = "";
+	cl_program fillProg = btOpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, FILL_CL_PROGRAM_PATH);
+	btAssert(fillProg);
+
+	m_fillIntKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillIntKernel", &pErrNum, fillProg,additionalMacros );
+	btAssert(m_fillIntKernel);
+
+	m_fillUnsignedIntKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillUnsignedIntKernel", &pErrNum, fillProg,additionalMacros );
+	// Fixed: this used to re-check m_fillIntKernel, so a failed unsigned-int kernel went unnoticed.
+	btAssert(m_fillUnsignedIntKernel);
+
+	m_fillFloatKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillFloatKernel", &pErrNum, fillProg,additionalMacros );
+	btAssert(m_fillFloatKernel);
+
+	m_fillKernelInt2 = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillInt2Kernel", &pErrNum, fillProg,additionalMacros );
+	btAssert(m_fillKernelInt2);
+	// Kernel objects keep their program alive, so drop our own reference here instead of leaking it.
+	clReleaseProgram(fillProg);
+}
+
+btFillCL::~btFillCL()
+{
+	// Release every kernel object held by this instance, in the same order as before.
+	cl_kernel ownedKernels[] = { m_fillKernelInt2, m_fillIntKernel, m_fillUnsignedIntKernel, m_fillFloatKernel };
+	const int numOwned = sizeof(ownedKernels)/sizeof(ownedKernels[0]);
+	for (int k=0; k<numOwned; ++k)
+		clReleaseKernel(ownedKernels[k]);
+}
+
+void btFillCL::execute(btOpenCLArray<float>& src, const float value, int n, int offset)
+{
+	// Launches m_fillFloatKernel over n work items with the arguments (buffer, n, value, offset).
+	btAssert( n>0 );
+
+	cl_mem dstBuffer = src.getBufferCL();
+	btLauncherCL fillLauncher( m_commandQueue, m_fillFloatKernel );
+	// Arguments are bound in call order, so the sequence below must not be rearranged.
+	fillLauncher.setBuffer( dstBuffer );
+	fillLauncher.setConst( n );
+	fillLauncher.setConst( value );
+	fillLauncher.setConst( offset );
+	fillLauncher.launch1D( n );
+}
+
+void btFillCL::execute(btOpenCLArray<int>& src, const int value, int n, int offset)
+{
+	// Launches m_fillIntKernel over n work items with the arguments (buffer, n, value, offset).
+	btAssert( n>0 );
+
+	btLauncherCL fillLauncher( m_commandQueue, m_fillIntKernel );
+	cl_mem dstBuffer = src.getBufferCL();
+	fillLauncher.setBuffer( dstBuffer );
+	// Scalar arguments follow the buffer in the order n, value, offset.
+	fillLauncher.setConst( n );
+	fillLauncher.setConst( value );
+	fillLauncher.setConst( offset );
+	fillLauncher.launch1D( n );
+}
+
+
+void btFillCL::execute(btOpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset)
+{
+	// Launches m_fillUnsignedIntKernel over n work items with the arguments (buffer, n, value, offset).
+	btAssert( n>0 );
+
+	btBufferInfoCL bufferArgs[] = { btBufferInfoCL( src.getBufferCL() ) };
+	const int numBufferArgs = sizeof(bufferArgs)/sizeof(bufferArgs[0]);
+
+	btLauncherCL fillLauncher( m_commandQueue, m_fillUnsignedIntKernel );
+	fillLauncher.setBuffers( bufferArgs, numBufferArgs );
+	// Scalar arguments follow the buffer in the order n, value, offset.
+	fillLauncher.setConst( n );
+	fillLauncher.setConst( value );
+	fillLauncher.setConst( offset );
+	fillLauncher.launch1D( n );
+}
+
+void btFillCL::executeHost(btAlignedObjectArray<btInt2> &src, const btInt2 &value, int n, int offset)
+{
+	// CPU path: assigns value to src[offset] .. src[offset+n-1]; src is indexed directly and never resized here.
+	int filled = 0;
+	while (filled < n)
+		src[offset+(filled++)] = value;
+}
+
+void btFillCL::executeHost(btAlignedObjectArray<int> &src, const int value, int n, int offset)
+{
+	// CPU path: assigns value to src[offset] .. src[offset+n-1]; src is indexed directly and never resized here.
+	int filled = 0;
+	while (filled < n)
+		src[offset+(filled++)] = value;
+}
+
+void btFillCL::execute(btOpenCLArray<btInt2> &src, const btInt2 &value, int n, int offset)
+{
+	// Launches m_fillKernelInt2 over n work items with the arguments (buffer, n, value, offset).
+	btAssert( n>0 );
+
+	btBufferInfoCL bufferArgs[] = { btBufferInfoCL( src.getBufferCL() ) };
+	const int numBufferArgs = sizeof(bufferArgs)/sizeof(bufferArgs[0]);
+
+	btLauncherCL fillLauncher(m_commandQueue, m_fillKernelInt2);
+	fillLauncher.setBuffers( bufferArgs, numBufferArgs );
+
+	// Scalar arguments follow the buffer in the order n, value, offset;
+	// value is handed over by value as a whole btInt2.
+	fillLauncher.setConst(n);
+	fillLauncher.setConst(value);
+	fillLauncher.setConst(offset);
+	fillLauncher.launch1D( n );
+}
diff --git a/opencl/parallel_primitives/host/btFillCL.h b/opencl/parallel_primitives/host/btFillCL.h
new file mode 100644
index 000000000..a9303a73d
--- /dev/null
+++ b/opencl/parallel_primitives/host/btFillCL.h
@@ -0,0 +1,137 @@
+#ifndef BT_FILL_CL_H
+#define BT_FILL_CL_H
+
+#include "btOpenCLArray.h"
+#include "btScalar.h"
+
+ATTRIBUTE_ALIGNED16(struct) btUnsignedInt4
+{
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+	// Four unsigned ints, reachable by name (x,y,z,w) or by index (s[0..3]) through the same union storage.
+	union
+	{
+		struct
+		{
+			unsigned int x,y,z,w;
+		};
+		struct
+		{
+			unsigned int s[4];
+		};
+	};
+};
+
+ATTRIBUTE_ALIGNED16(struct) btInt4
+{
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+	// Four ints, reachable by name (x,y,z,w) or by index (s[0..3]) through the same union storage.
+	union
+	{
+		struct
+		{
+			int x,y,z,w;
+		};
+		struct
+		{
+			int s[4];
+		};
+	};
+};
+
+struct btUnsignedInt2
+{
+	union	// x,y and s[0..1] are two views of the same storage
+	{
+		struct
+		{
+			unsigned int x,y;
+		};
+		struct
+		{
+			unsigned int s[2];
+		};
+	};
+};
+
+struct btInt2
+{
+	union	// x,y and s[0..1] are two views of the same storage
+	{
+		struct
+		{
+			int x,y;
+		};
+		struct
+		{
+			int s[2];
+		};
+	};
+};
+
+SIMD_FORCE_INLINE btInt4 btMakeInt4(int x, int y, int z, int w = 0)
+{
+	btInt4 packed;	// components land in s[0..3] as x,y,z,w; w defaults to 0 so it may be omitted
+	packed.s[3] = w; packed.s[2] = z; packed.s[1] = y; packed.s[0] = x;
+	return packed;
+}
+
+SIMD_FORCE_INLINE btUnsignedInt4 btMakeUnsignedInt4(unsigned int x, unsigned int y, unsigned int z, unsigned int w = 0)
+{
+	btUnsignedInt4 packed;	// components land in s[0..3] as x,y,z,w; w defaults to 0 so it may be omitted
+	packed.s[3] = w; packed.s[2] = z; packed.s[1] = y; packed.s[0] = x;
+	return packed;
+}
+
+class btFillCL
+{
+	// Command queue passed to the constructor.
+	cl_command_queue	m_commandQueue;
+	// One kernel handle per supported element type.
+	cl_kernel			m_fillKernelInt2;
+	cl_kernel			m_fillIntKernel;
+	cl_kernel			m_fillUnsignedIntKernel;
+	cl_kernel			m_fillFloatKernel;
+
+	public:
+		// Parameter block; presumably mirrors a kernel-side struct -- TODO confirm it is still used.
+		struct btConstData
+		{
+			union
+			{
+				btInt4 m_data;
+				btUnsignedInt4 m_UnsignedData;
+			};
+			int m_offset;
+			int m_n;
+			int m_padding[2];
+		};
+
+protected:
+
+public:
+		// Device fills: one execute() overload per element type; offset defaults to 0.
+		btFillCL(cl_context ctx, cl_device_id device, cl_command_queue queue);
+
+		virtual ~btFillCL();
+
+		void execute(btOpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset = 0);
+	
+		void execute(btOpenCLArray<int>& src, const int value, int n, int offset = 0);
+
+		void execute(btOpenCLArray<float>& src, const float value, int n, int offset = 0);
+
+		void execute(btOpenCLArray<btInt2>& src, const btInt2& value, int n, int offset = 0);
+		// Host-array variants operating on btAlignedObjectArray; note that offset has no default here.
+		void executeHost(btAlignedObjectArray<btInt2> &src, const btInt2 &value, int n, int offset);
+
+		void executeHost(btAlignedObjectArray<int> &src, const int value, int n, int offset);
+
+	//	void execute(btOpenCLArray<btInt4>& src, const btInt4& value, int n, int offset = 0);
+
+};
+		
+		
+		
+	
+
+#endif //BT_FILL_CL_H
diff --git a/opencl/parallel_primitives/host/btHashMap.h b/opencl/parallel_primitives/host/btHashMap.h
new file mode 100644
index 000000000..ce07db3ac
--- /dev/null
+++ b/opencl/parallel_primitives/host/btHashMap.h
@@ -0,0 +1,450 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#ifndef BT_HASH_MAP_H
+#define BT_HASH_MAP_H
+
+#include "btAlignedObjectArray.h"
+
+///very basic hashable string implementation, compatible with btHashMap
+struct btHashString
+{
+	const char* m_string;
+	unsigned int	m_hash;
+
+	/// Hash value computed once by the constructor.
+	SIMD_FORCE_INLINE	unsigned int getHash()const
+	{
+		return m_hash;
+	}
+
+	/// Stores the pointer (the characters are not copied) and hashes the string with FNV.
+	btHashString(const char* name)
+		:m_string(name)
+	{
+		/* magic numbers from http://www.isthe.com/chongo/tech/comp/fnv/ */
+		const unsigned int fnvOffsetBasis = 2166136261u;
+		const unsigned int fnvPrime = 16777619u;
+
+		/* per character: xor it in, then multiply by the prime */
+		unsigned int accum = fnvOffsetBasis;
+		for (const char* cursor = m_string; *cursor; ++cursor)
+		{
+			accum = (accum ^ (*cursor)) * fnvPrime;
+		}
+		m_hash = accum;
+	}
+
+	/// Three-way byte-wise comparison of two NUL-terminated strings: returns -1, 0 or 1.
+	int portableStringCompare(const char* src,	const char* dst) const
+	{
+			const unsigned char* lhs = (const unsigned char*)src;
+			const unsigned char* rhs = (const unsigned char*)dst;
+			// advance while the bytes match and the end of dst has not been reached
+			while (*lhs == *rhs && *rhs)
+			{
+					++lhs;
+					++rhs;
+			}
+
+			if (*lhs == *rhs) return 0;
+			return (*lhs < *rhs) ? -1 : 1;
+	}
+
+	/// True when both wrap the same pointer, or strings with identical contents.
+	bool equals(const btHashString& other) const
+	{
+		if (m_string == other.m_string) return true;
+		return portableStringCompare(m_string,other.m_string) == 0;
+	}
+};
+
+const int BT_HASH_NULL=0xffffffff;
+
+
+class btHashInt
+{
+	int	m_uid;	// the wrapped integer key
+public:
+	btHashInt(int uid)	:m_uid(uid)
+	{
+	}
+
+	int	getUid1() const
+	{
+		return m_uid;
+	}
+
+	void	setUid1(int uid)
+	{
+		m_uid = uid;
+	}
+	// Two keys are equal when their integers are equal.
+	bool equals(const btHashInt& other) const
+	{
+		return getUid1() == other.getUid1();
+	}
+	// Bit-mixes the key. Done on unsigned int: signed '<<' can overflow (undefined) and signed '>>' drags the sign bit.
+	SIMD_FORCE_INLINE	unsigned int getHash()const
+	{
+		unsigned int key = m_uid;
+		// Thomas Wang's hash
+		key += ~(key << 15);	key ^=  (key >> 10);	key +=  (key << 3);	key ^=  (key >> 6);	key += ~(key << 11);	key ^=  (key >> 16);
+		return key;
+	}
+};
+
+
+
+class btHashPtr
+{
+	// The pointer and its bit pattern viewed as two ints share the same storage.
+	union
+	{
+		const void*	m_pointer;
+		int	m_hashValues[2];
+	};
+
+public:
+
+	btHashPtr(const void* ptr)
+		:m_pointer(ptr)
+	{
+	}
+
+	const void*	getPointer() const
+	{
+		return m_pointer;
+	}
+
+	bool equals(const btHashPtr& other) const
+	{
+		return getPointer() == other.getPointer();
+	}
+
+	// Bit-mixes the pointer value; only m_hashValues[0] is read when pointers are not 8 bytes wide.
+	SIMD_FORCE_INLINE	unsigned int getHash()const
+	{
+		const bool VOID_IS_8 = ((sizeof(void*)==8));
+		// Folded and mixed as unsigned int: signed addition / '<<' can overflow (undefined) and signed '>>' drags the sign bit.
+		unsigned int key = VOID_IS_8? (unsigned int)m_hashValues[0]+(unsigned int)m_hashValues[1] : (unsigned int)m_hashValues[0];
+	
+		// Thomas Wang's hash
+		key += ~(key << 15);	key ^=  (key >> 10);	key +=  (key << 3);	key ^=  (key >> 6);	key += ~(key << 11);	key ^=  (key >> 16);
+		return key;
+	}
+
+	
+};
+
+
+template <class Value>
+class btHashKeyPtr
+{
+        int     m_uid;
+public:
+
+        btHashKeyPtr(int uid)    :m_uid(uid)
+        {
+        }
+
+        int     getUid1() const
+        {
+                return m_uid;
+        }
+
+        bool equals(const btHashKeyPtr<Value>& other) const
+        {
+                return getUid1() == other.getUid1();
+        }
+
+        // Bit-mixes the key. Done on unsigned int: signed '<<' can overflow (undefined) and signed '>>' drags the sign bit.
+        SIMD_FORCE_INLINE       unsigned int getHash()const
+        {
+                unsigned int key = m_uid;
+                // Thomas Wang's hash
+                key += ~(key << 15);	key ^=  (key >> 10);	key +=  (key << 3);	key ^=  (key >> 6);	key += ~(key << 11);	key ^=  (key >> 16);
+                return key;
+        }
+
+        
+};
+
+
+template <class Value>
+class btHashKey
+{
+	int	m_uid;
+public:
+
+	btHashKey(int uid)	:m_uid(uid)
+	{
+	}
+
+	int	getUid1() const
+	{
+		return m_uid;
+	}
+
+	bool equals(const btHashKey<Value>& other) const
+	{
+		return getUid1() == other.getUid1();
+	}
+	// Bit-mixes the key. Done on unsigned int: signed '<<' can overflow (undefined) and signed '>>' drags the sign bit.
+	SIMD_FORCE_INLINE	unsigned int getHash()const
+	{
+		unsigned int key = m_uid;
+		// Thomas Wang's hash
+		key += ~(key << 15);	key ^=  (key >> 10);	key +=  (key << 3);	key ^=  (key >> 6);	key += ~(key << 11);	key ^=  (key >> 16);
+		return key;
+	}
+};
+
+
+///The btHashMap template class implements a generic and lightweight hashmap.
+///A basic sample of how to use btHashMap is located in Demos\BasicDemo\main.cpp
+template <class Key, class Value>
+class btHashMap
+{
+
+protected:
+	btAlignedObjectArray<int>		m_hashTable;
+	btAlignedObjectArray<int>		m_next;
+	
+	btAlignedObjectArray<Value>		m_valueArray;
+	btAlignedObjectArray<Key>		m_keyArray;
+
+	void	growTables(const Key& /*key*/)
+	{
+		int newCapacity = m_valueArray.capacity();
+		// Rebuild only when the value array's capacity has outgrown the hash table.
+		if (m_hashTable.size() < newCapacity)
+		{
+			//remember the old table size, then grow the bucket-head table and the chain table to the new capacity
+			int curHashtableSize = m_hashTable.size();
+
+			m_hashTable.resize(newCapacity);
+			m_next.resize(newCapacity);
+
+			int i;
+			// Reset every bucket head and every chain link to "empty".
+			for (i= 0; i < newCapacity; ++i)
+			{
+				m_hashTable[i] = BT_HASH_NULL;
+			}
+			for (i = 0; i < newCapacity; ++i)
+			{
+				m_next[i] = BT_HASH_NULL;
+			}
+
+			for(i=0;i<curHashtableSize;i++)
+			{
+				// Re-link entries 0..curHashtableSize-1 at the head of their new bucket; the entry that
+				// triggered the growth is linked afterwards by insert().
+
+				int	hashValue = m_keyArray[i].getHash() & (m_valueArray.capacity()-1);	// New hash value with new mask (mask assumes a power-of-two capacity -- TODO confirm)
+				m_next[i] = m_hashTable[hashValue];
+				m_hashTable[hashValue] = i;
+			}
+
+
+		}
+	}
+
+	public:
+
+	void insert(const Key& key, const Value& value) {
+		int hash = key.getHash() & (m_valueArray.capacity()-1);
+
+		//replace value if the key is already there
+		int index = findIndex(key);
+		if (index != BT_HASH_NULL)
+		{
+			m_valueArray[index]=value;
+			return;
+		}
+		// New key: append to the dense arrays; count is the index the new entry receives.
+		int count = m_valueArray.size();
+		int oldCapacity = m_valueArray.capacity();
+		m_valueArray.push_back(value);
+		m_keyArray.push_back(key);
+		// If push_back changed the capacity, the tables must be rebuilt and the bucket recomputed.
+		int newCapacity = m_valueArray.capacity();
+		if (oldCapacity < newCapacity)
+		{
+			growTables(key);
+			//hash with new capacity
+			hash = key.getHash() & (m_valueArray.capacity()-1);
+		}
+		m_next[count] = m_hashTable[hash];
+		m_hashTable[hash] = count;
+	}
+
+	void remove(const Key& key) {
+		// Unlinks key's entry, then moves the last array element into the freed slot so the arrays stay dense.
+		int hash = key.getHash() & (m_valueArray.capacity()-1);
+
+		int pairIndex = findIndex(key);
+		
+		if (pairIndex ==BT_HASH_NULL)
+		{
+			return;
+		}
+
+		// Remove the pair from the hash table.
+		int index = m_hashTable[hash];
+		btAssert(index != BT_HASH_NULL);
+
+		int previous = BT_HASH_NULL;
+		while (index != pairIndex)
+		{
+			previous = index;
+			index = m_next[index];
+		}
+
+		if (previous != BT_HASH_NULL)
+		{
+			btAssert(m_next[previous] == pairIndex);
+			m_next[previous] = m_next[pairIndex];
+		}
+		else
+		{
+			m_hashTable[hash] = m_next[pairIndex];
+		}
+
+		// We now move the last pair into spot of the
+		// pair being removed. We need to fix the hash
+		// table indices to support the move.
+
+		int lastPairIndex = m_valueArray.size() - 1;
+
+		// If the removed pair is the last pair, we are done.
+		if (lastPairIndex == pairIndex)
+		{
+			m_valueArray.pop_back();
+			m_keyArray.pop_back();
+			return;
+		}
+
+		// Remove the last pair from the hash table.
+		int lastHash = m_keyArray[lastPairIndex].getHash() & (m_valueArray.capacity()-1);
+
+		index = m_hashTable[lastHash];
+		btAssert(index != BT_HASH_NULL);
+
+		previous = BT_HASH_NULL;
+		while (index != lastPairIndex)
+		{
+			previous = index;
+			index = m_next[index];
+		}
+
+		if (previous != BT_HASH_NULL)
+		{
+			btAssert(m_next[previous] == lastPairIndex);
+			m_next[previous] = m_next[lastPairIndex];
+		}
+		else
+		{
+			m_hashTable[lastHash] = m_next[lastPairIndex];
+		}
+
+		// Copy the last pair into the remove pair's spot.
+		m_valueArray[pairIndex] = m_valueArray[lastPairIndex];
+		m_keyArray[pairIndex] = m_keyArray[lastPairIndex];
+
+		// Insert the last pair into the hash table
+		m_next[pairIndex] = m_hashTable[lastHash];
+		m_hashTable[lastHash] = pairIndex;
+
+		m_valueArray.pop_back();
+		m_keyArray.pop_back();
+
+	}
+
+
+	int size() const
+	{
+		return m_valueArray.size();	// number of stored values
+	}
+
+	const Value* getAtIndex(int index) const
+	{
+		// Positional access into the dense value array; negative indices are now rejected by the assert too.
+		btAssert(index >= 0 && index < m_valueArray.size());
+		return &m_valueArray[index];
+	}
+
+	Value* getAtIndex(int index)
+	{
+		// Positional access into the dense value array; negative indices are now rejected by the assert too.
+		btAssert(index >= 0 && index < m_valueArray.size());
+		return &m_valueArray[index];
+	}
+
+	Value* operator[](const Key& key) {
+		return find(key);	// lookup only: returns NULL for a missing key, never inserts
+	}
+
+	const Value*	find(const Key& key) const
+	{
+		// Looks the key up and returns a pointer to its stored value,
+		// or NULL when the key is not present.
+		const int slot = findIndex(key);
+		const bool missing = (slot == BT_HASH_NULL);
+
+		return missing ? NULL : &m_valueArray[slot];
+	}
+
+	Value*	find(const Key& key)
+	{
+		// Looks the key up and returns a pointer to its stored value,
+		// or NULL when the key is not present.
+		const int slot = findIndex(key);
+		const bool missing = (slot == BT_HASH_NULL);
+
+		return missing ? NULL : &m_valueArray[slot];
+	}
+
+
+	int	findIndex(const Key& key) const
+	{
+		// Bucket = hash masked by (capacity-1). A bucket outside the table means the key cannot be present.
+		const unsigned int bucket = key.getHash() & (m_valueArray.capacity()-1);
+		if (bucket >= (unsigned int)m_hashTable.size())
+		{
+			return BT_HASH_NULL;
+		}
+
+		// Walk the bucket's chain until the key matches or the chain ends.
+		for (int slot = m_hashTable[bucket]; slot != BT_HASH_NULL; slot = m_next[slot])
+		{
+			if (key.equals(m_keyArray[slot])) return slot;
+		}
+		return BT_HASH_NULL;
+	}
+
+	void	clear()
+	{
+		// Empty the key/value storage first, then the bucket heads and chain links.
+		m_keyArray.clear();
+		m_valueArray.clear();
+		m_next.clear(); m_hashTable.clear();
+	}
+
+};
+
+#endif //BT_HASH_MAP_H
diff --git a/opencl/parallel_primitives/host/btLauncherCL.h b/opencl/parallel_primitives/host/btLauncherCL.h
new file mode 100644
index 000000000..6b5657426
--- /dev/null
+++ b/opencl/parallel_primitives/host/btLauncherCL.h
@@ -0,0 +1,363 @@
+
+#ifndef BT_LAUNCHER_CL_H
+#define BT_LAUNCHER_CL_H
+
+#include "btBufferInfoCL.h"
+#include "btMinMax.h"
+#include "btOpenCLArray.h"
+#include <stdio.h>
+
+#ifdef _WIN32
+#pragma warning(disable :4996)
+#endif
+#define BT_CL_MAX_ARG_SIZE 16
+struct btKernelArgData
+{
+    int m_isBuffer;	// 1 for a cl_mem argument, 0 for a by-value constant
+    int m_argIndex;	// kernel argument slot at the time the argument was recorded
+    int m_argSizeInBytes;	// buffer size in bytes, or sizeof the constant
+    union
+    {
+        cl_mem m_clBuffer;	// valid when m_isBuffer is set
+        unsigned char m_argData[BT_CL_MAX_ARG_SIZE];	// raw copy of a constant argument
+    };
+    
+};
+
+class btLauncherCL
+{
+
+	cl_command_queue m_commandQueue;
+	cl_kernel m_kernel;
+	int m_idx;
+
+    btAlignedObjectArray<btKernelArgData> m_kernelArguments;
+   
+    
+    int m_serializationSizeInBytes;
+
+	public:
+
+     btAlignedObjectArray<btOpenCLArray<unsigned char>* > m_arrays;
+    
+		// Binds the launcher to a queue and a kernel; arguments are then added in order via setBuffer(s)/setConst.
+		btLauncherCL(cl_command_queue queue, cl_kernel kernel)
+			:m_commandQueue(queue), m_kernel(kernel), m_idx(0),
+			// the serialized form always starts with an int holding the argument count
+			m_serializationSizeInBytes(sizeof(int))
+		{
+		}
+    
+        virtual ~btLauncherCL()
+        {
+            // m_arrays holds wrappers created with new in deserializeArgs(); deleting each one lets its own
+            // destructor release the cl_mem. (Previously only the cl_mem was released and the wrapper leaked.)
+            for (int i=0;i<m_arrays.size();i++)
+                delete m_arrays[i];
+        }
+
+		inline void setBuffer( cl_mem clBuffer)
+		{
+			// Query the buffer's size in bytes so the argument can later be serialized together with its contents.
+			size_t bufferBytes;
+			size_t bytesReturned;
+			cl_int queryStatus = clGetMemObjectInfo( clBuffer,
+													CL_MEM_SIZE,
+													sizeof(size_t),
+													&bufferBytes,
+													&bytesReturned );
+			btAssert( queryStatus == CL_SUCCESS );
+
+			// Record the argument (slot index, kind, handle, size) for serialization and validation.
+			btKernelArgData record;
+			record.m_argIndex = m_idx;
+			record.m_isBuffer = 1;
+			record.m_clBuffer = clBuffer;
+			record.m_argSizeInBytes = bufferBytes;
+			m_kernelArguments.push_back(record);
+
+			// Serialized footprint: one btKernelArgData header followed by the raw buffer bytes.
+			m_serializationSizeInBytes+= sizeof(btKernelArgData);
+			m_serializationSizeInBytes+=bufferBytes;
+
+			// Bind the handle to the next kernel argument slot
+			// and advance the slot counter.
+			cl_int bindStatus = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &clBuffer);
+			btAssert( bindStatus == CL_SUCCESS );
+		}
+
+
+		inline void setBuffers( btBufferInfoCL* buffInfo, int n )
+		{
+			// Registers buffInfo[0..n-1] as consecutive buffer arguments; m_isReadOnly is not consulted here.
+			int entry = 0;
+			while (entry < n)
+			{
+				cl_mem& handle = buffInfo[entry].m_clBuffer;
+
+				// Size of the buffer in bytes, needed to serialize its contents later.
+				size_t bufferBytes;
+				size_t bytesReturned;
+				cl_int queryStatus = clGetMemObjectInfo( handle, CL_MEM_SIZE, sizeof(size_t), &bufferBytes, &bytesReturned );
+				btAssert( queryStatus == CL_SUCCESS );
+
+				btKernelArgData record;
+				record.m_argIndex = m_idx;
+				record.m_isBuffer = 1;
+				record.m_clBuffer = handle;
+				record.m_argSizeInBytes = bufferBytes;
+				m_kernelArguments.push_back(record);
+
+				// Serialized footprint: one btKernelArgData header followed by the raw buffer bytes.
+				m_serializationSizeInBytes+= sizeof(btKernelArgData);
+				m_serializationSizeInBytes+=bufferBytes;
+
+				// Bind the handle to the next kernel argument slot and advance the slot counter.
+				cl_int bindStatus = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &handle);
+				btAssert( bindStatus == CL_SUCCESS );
+				++entry;
+			}
+		}
+    
+    int getSerializationBufferSize() const 
+    {
+        return m_serializationSizeInBytes;	// sizeof(int) plus, per recorded argument, a header and (for buffers) the buffer bytes
+    }
+    
+    inline int deserializeArgs(unsigned char* buf, int bufSize, cl_context ctx)
+    {
+        int index=0;
+        // Layout read here: int argument count, then per argument a btKernelArgData header (+ raw bytes for buffers). bufSize is not checked.
+        int numArguments = *(int*) &buf[index];
+        index+=sizeof(int);
+        
+        for (int i=0;i<numArguments;i++)
+        {
+            btKernelArgData* arg = (btKernelArgData*)&buf[index];
+            // arg points INTO buf, so the m_clBuffer assignment below overwrites the caller's serialized header in place.
+            index+=sizeof(btKernelArgData);
+            if (arg->m_isBuffer)
+            {
+                btOpenCLArray<unsigned char>* clData = new btOpenCLArray<unsigned char>(ctx,m_commandQueue, arg->m_argSizeInBytes);
+                clData->resize(arg->m_argSizeInBytes);
+                // upload the serialized bytes that follow the header into the new device array
+                clData->copyFromHostPointer(&buf[index], arg->m_argSizeInBytes);
+                
+                arg->m_clBuffer = clData->getBufferCL();
+                // keep the wrapper so the destructor can clean it up
+                m_arrays.push_back(clData);
+                
+                cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &arg->m_clBuffer);
+				btAssert( status == CL_SUCCESS );
+                index+=arg->m_argSizeInBytes;
+            } else 
+            {
+                cl_int status = clSetKernelArg( m_kernel, m_idx++, arg->m_argSizeInBytes, &arg->m_argData);
+				btAssert( status == CL_SUCCESS );
+            }
+			m_kernelArguments.push_back(*arg);
+        }
+		m_serializationSizeInBytes = index;
+        return index;
+    }
+
+	inline int validateResults(unsigned char* goldBuffer, int goldBufferCapacity, cl_context ctx)
+    {
+		// Compares the recorded arguments with a serialized "gold" snapshot (int count, then per argument a header plus, for buffers, the raw bytes).
+		// Returns the gold bytes consumed, or -1 count / -2 size / -3 kind / -4 buffer contents / -5 constant bytes mismatch.
+		// goldBufferCapacity and ctx are unused: no bounds checking is done on goldBuffer.
+		int index=0;
+        int numArguments = *(int*) &goldBuffer[index];
+        index+=sizeof(int);
+		if (numArguments != m_kernelArguments.size())
+		{
+			printf("failed validation: expected %d arguments, found %d\n",numArguments, m_kernelArguments.size());
+			return -1;
+		}
+
+        for (int ii=0;ii<numArguments;ii++)
+        {
+            btKernelArgData* argGold = (btKernelArgData*)&goldBuffer[index];
+			const int argSize = m_kernelArguments[ii].m_argSizeInBytes;
+			if (argSize != argGold->m_argSizeInBytes)
+			{
+				printf("failed validation: argument %d sizeInBytes expected: %d, found %d\n",ii, argGold->m_argSizeInBytes, argSize);
+				return -2;
+			}
+			{
+				int expected = argGold->m_isBuffer;
+				int found = m_kernelArguments[ii].m_isBuffer;
+				if (expected != found)
+				{
+					printf("failed validation: argument %d isBuffer expected: %d, found %d\n",ii,expected, found);
+					return -3;
+				}
+			}
+			index+=sizeof(btKernelArgData);
+
+			if (argGold->m_isBuffer)
+            {
+				unsigned char* memBuf= (unsigned char*) malloc(argSize);
+				unsigned char* goldBuf = &goldBuffer[index];
+				// Pre-fill with a known pattern. Fixed: the loop bound used to index m_kernelArguments with the
+				// byte counter j instead of the argument index ii, reading the wrong (or an out-of-range) entry.
+				for (int j=0;j<argSize;j++)
+				{
+					memBuf[j] = 0xaa;
+				}
+
+				cl_int status = 0;
+				status = clEnqueueReadBuffer( m_commandQueue, m_kernelArguments[ii].m_clBuffer, CL_TRUE, 0, argSize,
+                                             memBuf, 0,0,0 );
+                btAssert( status==CL_SUCCESS );
+                clFinish(m_commandQueue);
+
+				int firstMismatch = -1;
+				for (int b=0;b<argSize && firstMismatch<0;b++)
+				{
+					if (goldBuf[b] != memBuf[b])
+						firstMismatch = b;
+				}
+				if (firstMismatch>=0)
+				{
+					printf("failed validation: argument %d OpenCL data at byte position %d expected: %d, found %d\n",
+						ii, firstMismatch, (int)goldBuf[firstMismatch], (int)memBuf[firstMismatch]);
+				}
+				// Fixed: the staging buffer was never freed, on either the success or the failure path.
+				free(memBuf);
+				if (firstMismatch>=0)
+					return -4;
+
+                index+=argGold->m_argSizeInBytes;
+            } else 
+            {
+				//compare content
+				for (int b=0;b<argSize;b++)
+				{
+					int expected = argGold->m_argData[b];
+					int found =m_kernelArguments[ii].m_argData[b];
+					if (expected != found)
+					{
+						printf("failed validation: argument %d const data at byte position %d expected: %d, found %d\n",
+							ii, b, expected, found);
+						return -5;
+					}
+				}
+            }
+        }
+        return index;
+	}
+
+    inline int serializeArguments(unsigned char* destBuffer, int destBufferCapacity)
+    {
+		//pre-fill the whole destination with a known byte pattern (0xec)
+		for (int i=0;i<destBufferCapacity;i++)
+			destBuffer[i] = 0xec;
+
+        assert(destBufferCapacity>=m_serializationSizeInBytes);
+        
+        //todo: use the btSerializer for this to allow for 32/64bit, endianness etc
+        int numArguments = m_kernelArguments.size();
+        int curBufferSize = 0;
+        int* dest = (int*)&destBuffer[curBufferSize];
+        *dest = numArguments;
+        curBufferSize += sizeof(int);
+        
+        // Output layout: int argument count, then per argument a btKernelArgData header,
+        // followed (for buffer arguments only) by the buffer's bytes read back from the device.
+        for (int i=0;i<this->m_kernelArguments.size();i++)
+        {
+            btKernelArgData* arg = (btKernelArgData*) &destBuffer[curBufferSize];
+            *arg = m_kernelArguments[i];
+            curBufferSize+=sizeof(btKernelArgData);
+            if (arg->m_isBuffer==1)
+            {
+                //copy the OpenCL buffer content; the read is enqueued non-blocking, clFinish below waits for it
+                cl_int status = 0;
+                status = clEnqueueReadBuffer( m_commandQueue, arg->m_clBuffer, 0, 0, arg->m_argSizeInBytes,
+                                             &destBuffer[curBufferSize], 0,0,0 );
+                btAssert( status==CL_SUCCESS );
+                clFinish(m_commandQueue);
+                curBufferSize+=arg->m_argSizeInBytes;
+            }
+            
+        }
+        return curBufferSize;
+    }
+	
+	void serializeToFile(const char* fileName, int numWorkItems)
+	{
+		// File layout: the serializeArguments() image followed by numWorkItems as a trailing int.
+		int buffSize = getSerializationBufferSize();
+		unsigned char* buf = new unsigned char[buffSize+sizeof(int)];
+		// Sentinel-fill through buf[buffSize] so an overrun by serializeArguments() can be detected below.
+		for (int i=0;i<buffSize+1;i++)
+			buf[i] = 0xff;
+		serializeArguments(buf,buffSize);
+		assert(buf[buffSize]==0xff);//check for buffer overrun
+
+		int* trailer = (int*)&buf[buffSize];
+		*trailer = numWorkItems;
+
+		// Fixed: fopen() can fail (bad path, permissions); the NULL handle used to be passed to fwrite/fclose.
+		FILE* f = fopen(fileName,"wb");
+		if (f)
+		{
+			fwrite(buf,buffSize+sizeof(int),1,f);
+			fclose(f);
+		} else
+		{
+			printf("serializeToFile: cannot open %s for writing\n", fileName);
+		}
+		delete[] buf;
+	}
+
+
+	template<typename T>
+		inline void setConst( const T& consts )
+		{
+			// By-value kernel argument: keep a byte copy for serialization, then bind it to the next slot.
+			const int argBytes = sizeof(T);
+			btAssert(argBytes<=BT_CL_MAX_ARG_SIZE);	// the recorded copy has to fit in m_argData
+
+			btKernelArgData record;
+			record.m_isBuffer = 0;
+			record.m_argIndex = m_idx;
+			record.m_argSizeInBytes = argBytes;
+			*((T*)record.m_argData) = consts;
+			m_kernelArguments.push_back(record);
+			m_serializationSizeInBytes+=sizeof(btKernelArgData);
+			cl_int bindStatus = clSetKernelArg( m_kernel, m_idx++, argBytes, &consts );
+			btAssert( bindStatus == CL_SUCCESS );
+		}
+
+		inline void launch1D( int numThreads, int localSize = 64)
+		{
+			launch2D( numThreads, 1, localSize, 1 );	// 1D is issued as a 2D launch whose Y extent and Y local size are 1
+		}
+
+		inline void launch2D( int numThreadsX, int numThreadsY, int localSizeX, int localSizeY )
+		{
+			// Work-group size per dimension; the third entry is a placeholder since only 2 dimensions are enqueued.
+			size_t localDims[3] = { (size_t)localSizeX, (size_t)localSizeY, 1 };
+			size_t globalDims[3] = { 1, 1, 1 };
+
+			// Global size = thread count rounded up to a whole number of work-groups, with at least one group.
+			const int requested[2] = { numThreadsX, numThreadsY };
+			for (int axis=0; axis<2; ++axis)
+			{
+				size_t groups = requested[axis]/localDims[axis];
+				if (requested[axis]%localDims[axis]) groups += 1;
+				globalDims[axis] = (groups < 1 ? 1 : groups) * localDims[axis];
+			}
+
+			cl_int status = clEnqueueNDRangeKernel( m_commandQueue,
+				m_kernel, 2, NULL, globalDims, localDims, 0,0,0 );
+			if (status != CL_SUCCESS) printf("Error: OpenCL status = %d\n",status);
+			btAssert( status == CL_SUCCESS );
+		}
+};
+
+
+
+#endif //BT_LAUNCHER_CL_H
diff --git a/opencl/parallel_primitives/host/btMinMax.h b/opencl/parallel_primitives/host/btMinMax.h
new file mode 100644
index 000000000..5b436e9ba
--- /dev/null
+++ b/opencl/parallel_primitives/host/btMinMax.h
@@ -0,0 +1,71 @@
+/*
+Copyright (c) 2003-2006 Gino van den Bergen / Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+
+#ifndef BT_GEN_MINMAX_H
+#define BT_GEN_MINMAX_H
+
+#include "btScalar.h"
+
+template <class T>
+SIMD_FORCE_INLINE const T& btMin(const T& a, const T& b) 
+{
+  if (a < b) { return a; } else { return b; }	// b is returned whenever a is not strictly smaller
+}
+
+template <class T>
+SIMD_FORCE_INLINE const T& btMax(const T& a, const T& b) 
+{
+  if (a > b) { return a; } else { return b; }	// b is returned whenever a is not strictly greater
+}
+
+template <class T>
+SIMD_FORCE_INLINE const T& btClamped(const T& a, const T& lb, const T& ub) 
+{
+	if (a < lb) { return lb; } return (ub < a) ? ub : a;	// the lower bound is tested first
+}
+
+template <class T>
+SIMD_FORCE_INLINE void btSetMin(T& a, const T& b) 
+{
+	// Lowers a to b in place when b is strictly smaller; otherwise a is left untouched.
+	if (!(b < a))
+		return;
+	a = b;
+}
+
+template <class T>
+SIMD_FORCE_INLINE void btSetMax(T& a, const T& b) 
+{
+	// Raises a to b in place when a is strictly smaller; otherwise a is left untouched.
+	if (!(a < b))
+		return;
+	a = b;
+}
+
+template <class T>
+SIMD_FORCE_INLINE void btClamp(T& a, const T& lb, const T& ub) 
+{
+	// In-place clamp of a to [lb, ub]. The lower bound is tested first, so when a is below lb
+	// it becomes lb and the upper bound is not consulted.
+	if (a < lb)
+	{
+		a = lb;
+		return;
+	}
+	if (ub < a) a = ub;
+}
+
+#endif //BT_GEN_MINMAX_H
diff --git a/opencl/parallel_primitives/host/btOpenCLArray.h b/opencl/parallel_primitives/host/btOpenCLArray.h
new file mode 100644
index 000000000..91e88e9ed
--- /dev/null
+++ b/opencl/parallel_primitives/host/btOpenCLArray.h
@@ -0,0 +1,274 @@
+#ifndef BT_OPENCL_ARRAY_H
+#define BT_OPENCL_ARRAY_H
+
+#include "btAlignedObjectArray.h"
+#include "../../basic_initialize/btOpenCLInclude.h"
+
+template <typename T> ///Resizable array of T kept in an OpenCL buffer; element access goes through explicit host<->device copies.
+class btOpenCLArray
+{
+	int	m_size;	//elements currently in use
+	int	m_capacity;	//elements the current buffer has room for
+	cl_mem	m_clBuffer;
+
+	cl_context		 m_clContext;
+	cl_command_queue m_commandQueue;
+
+	bool	m_ownsMemory;	//false after setFromOpenCLBuffer(): an adopted buffer is never released by this class
+
+	bool	m_allowGrowingCapacity;	//when false, reserve() past the capacity asserts and drops the buffer
+
+	void deallocate()
+	{
+		if (m_clBuffer && m_ownsMemory)
+		{
+			clReleaseMemObject(m_clBuffer);
+		}
+		m_clBuffer = 0;
+		m_capacity=0;
+	}
+
+	btOpenCLArray<T>& operator=(const btOpenCLArray<T>& src);	//private: array-to-array assignment is not available to users
+
+	SIMD_FORCE_INLINE	int	allocSize(int size)
+		{
+			return (size ? size*2 : 1);	//growth policy used by push_back: double, or 1 when empty
+		}
+
+public:
+
+	btOpenCLArray(cl_context ctx, cl_command_queue queue, int initialCapacity=0, bool allowGrowingCapacity=true)
+	:m_size(0),  m_capacity(0),m_clBuffer(0),
+	m_clContext(ctx),m_commandQueue(queue),
+	m_ownsMemory(true),m_allowGrowingCapacity(true)
+	{
+		if (initialCapacity)
+		{
+			reserve(initialCapacity);	//growth is still enabled here so the initial allocation always goes through
+		}
+		m_allowGrowingCapacity = allowGrowingCapacity;
+	}
+
+	///this is an error-prone method with no error checking, be careful!
+	void setFromOpenCLBuffer(cl_mem buffer, int sizeInElements)
+	{
+		deallocate();
+		m_ownsMemory = false;
+		m_allowGrowingCapacity = false;
+		m_clBuffer = buffer;
+		m_size = sizeInElements;
+		m_capacity = sizeInElements;
+	}
+	
+// we could enable this assignment, but need to make sure to avoid accidental deep copies
+//	btOpenCLArray<T>& operator=(const btAlignedObjectArray<T>& src) 
+//	{
+//		copyFromArray(src);
+//		return *this;
+//	}
+
+
+	cl_mem	getBufferCL() const
+	{
+		return m_clBuffer;
+	}
+
+	
+	virtual ~btOpenCLArray()
+	{
+		deallocate();
+		m_size=0;
+		m_capacity=0;
+	}
+	
+	SIMD_FORCE_INLINE	void push_back(const T& _Val,bool waitForCompletion=true)
+	{	//with waitForCompletion==false the write is only enqueued, so _Val must stay alive until the queue has run it
+		int sz = size();
+		if( sz == capacity() )
+			reserve( allocSize(size()) );
+		if( sz >= capacity() )
+			return;	//growing failed (growth disabled or allocation error): do not write out of bounds
+		copyFromHostPointer(&_Val, 1, sz, waitForCompletion);
+		m_size++;
+	}
+
+	SIMD_FORCE_INLINE T forcedAt(int n) const
+	{
+		btAssert(n>=0);
+		btAssert(n<capacity());	//bounded by the capacity, not the size: may read elements that were never written
+		T elem;
+		copyToHostPointer(&elem,1,n,true);
+		return elem;
+	}
+
+	SIMD_FORCE_INLINE T at(int n) const
+	{
+		btAssert(n>=0);
+		btAssert(n<size());
+		T elem;
+		copyToHostPointer(&elem,1,n,true);	//blocking single-element read back from the device
+		return elem;
+	}
+
+	SIMD_FORCE_INLINE	void	resize(int newsize, bool copyOldContents=true)
+	{
+		int curSize = size();
+
+		if (newsize < curSize)
+		{
+			//leave the OpenCL memory for now
+		} else
+		{
+			if (newsize > size())
+			{
+				reserve(newsize,copyOldContents);
+			}
+
+			//leave new data uninitialized (init in debug mode?)
+			//for (int i=curSize;i<newsize;i++) ...
+		}
+		//reserve() can fail (growth disabled or allocation error): never report more elements than the buffer holds
+		m_size = (newsize <= capacity()) ? newsize : ((curSize <= capacity()) ? curSize : 0);
+	}
+
+	SIMD_FORCE_INLINE int size() const
+	{
+		return m_size;
+	}
+
+	SIMD_FORCE_INLINE	int capacity() const
+	{	
+		return m_capacity;
+	}
+
+	SIMD_FORCE_INLINE	void reserve(int _Count, bool copyOldContents=true)
+	{	// determine new minimum length of allocated storage
+		if (capacity() < _Count)
+		{	// not enough room, reallocate
+
+			if (m_allowGrowingCapacity)
+			{
+				cl_int ciErrNum;
+				//create a new OpenCL buffer (byte count kept in size_t so large arrays do not overflow an int)
+				size_t memSizeInBytes = sizeof(T)*(size_t)_Count;
+				cl_mem buf = clCreateBuffer(m_clContext, CL_MEM_READ_WRITE, memSizeInBytes, NULL, &ciErrNum);
+				btAssert(ciErrNum==CL_SUCCESS);
+				if (ciErrNum!=CL_SUCCESS || !buf) return;	//allocation failed: keep the old buffer, size and capacity
+//#define BT_ALWAYS_INITIALIZE_OPENCL_BUFFERS
+#ifdef BT_ALWAYS_INITIALIZE_OPENCL_BUFFERS
+				unsigned char* src = (unsigned char*)malloc(memSizeInBytes);
+				for (size_t i=0;i<memSizeInBytes;i++)
+					src[i] = 0xbb;
+				ciErrNum = clEnqueueWriteBuffer( m_commandQueue, buf, CL_TRUE, 0, memSizeInBytes, src, 0,0,0 );
+				btAssert(ciErrNum==CL_SUCCESS);
+				clFinish(m_commandQueue);
+				free(src);
+#endif //BT_ALWAYS_INITIALIZE_OPENCL_BUFFERS
+
+				if (copyOldContents)
+					copyToCL(buf, size());
+
+				//deallocate the old buffer
+				deallocate();
+
+				m_clBuffer = buf;
+			
+				m_capacity = _Count;
+			} else
+			{
+				//fail: assert and
+				btAssert(0);
+				deallocate();
+			}
+		}
+	}
+
+
+	void copyToCL(cl_mem destination, int numElements, int firstElem=0, int dstOffsetInElems=0) const
+	{
+		if (numElements<=0)
+			return;
+
+		btAssert(m_clBuffer);
+		btAssert(destination);
+		
+		//likely some error, destination is same as source
+		btAssert(m_clBuffer != destination);
+
+		btAssert((firstElem+numElements)<=m_size);
+		
+		cl_int status = 0;
+		
+
+		btAssert(numElements>0);
+		btAssert(numElements<=m_size);
+
+		size_t srcOffsetBytes = sizeof(T)*(size_t)firstElem;
+		size_t dstOffsetInBytes = sizeof(T)*(size_t)dstOffsetInElems;
+		//device-to-device copy, enqueued only (no clFinish here)
+		status = clEnqueueCopyBuffer( m_commandQueue, m_clBuffer, destination, 
+			srcOffsetBytes, dstOffsetInBytes, sizeof(T)*numElements, 0, 0, 0 );
+
+		btAssert( status == CL_SUCCESS );
+	}
+
+	void copyFromHost(const btAlignedObjectArray<T>& srcArray, bool waitForCompletion=true)
+	{
+		int newSize = srcArray.size();
+		
+		bool copyOldContents = false;
+		resize (newSize,copyOldContents);
+		if (newSize && size()==newSize)	//skip the upload when resize() could not provide the room
+			copyFromHostPointer(&srcArray[0],newSize,0,waitForCompletion);
+
+	}
+
+	void copyFromHostPointer(const T* src, int numElems, int destFirstElem= 0, bool waitForCompletion=true)
+	{
+		btAssert(numElems+destFirstElem <= capacity());
+		//non-blocking write: when waitForCompletion is false, src must remain valid until the queue has executed it
+		cl_int status = 0;
+		size_t sizeInBytes=sizeof(T)*(size_t)numElems;
+		status = clEnqueueWriteBuffer( m_commandQueue, m_clBuffer, 0, sizeof(T)*destFirstElem, sizeInBytes,
+		src, 0,0,0 );
+		btAssert(status == CL_SUCCESS );
+		if (waitForCompletion)
+			clFinish(m_commandQueue);
+
+	}
+	
+
+	void copyToHost(btAlignedObjectArray<T>& destArray, bool waitForCompletion=true) const
+	{
+		destArray.resize(this->size());
+		if (size())
+			copyToHostPointer(&destArray[0], size(),0,waitForCompletion);
+	}
+
+	void copyToHostPointer(T* destPtr, int numElem, int srcFirstElem=0, bool waitForCompletion=true) const
+	{
+		btAssert(numElem+srcFirstElem <= capacity());
+		//non-blocking read: when waitForCompletion is false, destPtr is only filled once the queue has executed it
+		cl_int status = 0;
+		status = clEnqueueReadBuffer( m_commandQueue, m_clBuffer, 0, sizeof(T)*srcFirstElem, sizeof(T)*numElem,
+		destPtr, 0,0,0 );
+		btAssert( status==CL_SUCCESS );
+
+		if (waitForCompletion)
+			clFinish(m_commandQueue);
+	}
+	
+	void copyFromOpenCLArray(const btOpenCLArray& src)
+	{
+		int newSize = src.size();
+		resize(newSize,false);	//the old contents are overwritten below, no need to carry them into a new buffer
+		if (newSize && size()==newSize)
+		{
+			src.copyToCL(m_clBuffer,size());
+		}
+	}
+
+};
+
+
+#endif //BT_OPENCL_ARRAY_H
diff --git a/opencl/parallel_primitives/host/btPrefixScanCL.cpp b/opencl/parallel_primitives/host/btPrefixScanCL.cpp
new file mode 100644
index 000000000..c584097c5
--- /dev/null
+++ b/opencl/parallel_primitives/host/btPrefixScanCL.cpp
@@ -0,0 +1,126 @@
+#include "btPrefixScanCL.h"
+#include "btFillCL.h"
+#define BT_PREFIXSCAN_PROG_PATH "opencl/parallel_primitives/kernels/PrefixScanKernels.cl"
+
+#include "btLauncherCL.h"
+#include "../../basic_initialize/btOpenCLUtils.h"
+#include "../kernels/PrefixScanKernelsCL.h"
+
+btPrefixScanCL::btPrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size)
+:m_commandQueue(queue)
+{
+	const char* scanKernelSource = prefixScanKernelsCL;
+	cl_int pErrNum;
+	char* additionalMacros=0;
+	//scratch buffer passed to all three kernels in execute(), which resizes it to the input size on every call
+	m_workBuffer = new btOpenCLArray<unsigned int>(ctx,queue,size);
+	cl_program scanProg = btOpenCLUtils::compileCLProgramFromString( ctx, device, scanKernelSource, &pErrNum,additionalMacros, BT_PREFIXSCAN_PROG_PATH);
+	btAssert(scanProg);
+	//NOTE(review): scanProg is never released in this file - confirm whether btOpenCLUtils keeps ownership, otherwise the program object leaks
+	m_localScanKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg,additionalMacros );
+	btAssert(m_localScanKernel );
+	m_blockSumKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg,additionalMacros );
+	btAssert(m_blockSumKernel );
+	m_propagationKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg,additionalMacros );
+	btAssert(m_propagationKernel );
+}
+
+
+btPrefixScanCL::~btPrefixScanCL()
+{
+	delete m_workBuffer;	//allocated with new in the constructor
+	clReleaseKernel(m_localScanKernel);	//the three kernels were created in the constructor
+	clReleaseKernel(m_blockSumKernel);
+	clReleaseKernel(m_propagationKernel);
+}
+
+template<class T>	///Rounds n up to the next power of two (n itself when it already is one); returns 0 for n==0 or on overflow.
+T btNextPowerOf2(T n)
+{
+	n -= 1;
+	for(unsigned int shift=1; shift<sizeof(T)*8; shift<<=1)	//doubling shifts smear the highest set bit into every lower bit
+		n = n | (n>>shift);
+	return n+1;
+}
+
+void btPrefixScanCL::execute(btOpenCLArray<unsigned int>& src, btOpenCLArray<unsigned int>& dst, int n, unsigned int* sum)
+{
+	//Scans the first n elements of src into dst on the device; when sum is given it receives dst[n-1] (blocking read).
+//	btAssert( data->m_option == EXCLUSIVE );
+	const unsigned int numBlocks = (const unsigned int)( (n+BLOCK_SIZE*2-1)/(BLOCK_SIZE*2) );
+
+	dst.resize(src.size());
+	m_workBuffer->resize(src.size());
+	if (n<=0) { if (sum) *sum = 0; return; }	//nothing to scan: avoids a zero-sized kernel launch and a read at element n-1<0
+	btInt4 constBuffer;
+	constBuffer.x = n;
+	constBuffer.y = numBlocks;
+	constBuffer.z = (int)btNextPowerOf2( numBlocks );
+
+	btOpenCLArray<unsigned int>* srcNative = &src;
+	btOpenCLArray<unsigned int>* dstNative = &dst;
+	
+	{
+		btBufferInfoCL bInfo[] = { btBufferInfoCL( dstNative->getBufferCL() ), btBufferInfoCL( srcNative->getBufferCL() ), btBufferInfoCL( m_workBuffer->getBufferCL() ) };
+		//pass 1: one work-group per block of 2*BLOCK_SIZE elements
+		btLauncherCL launcher( m_commandQueue, m_localScanKernel );
+		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
+		launcher.setConst(  constBuffer );
+		launcher.launch1D( numBlocks*BLOCK_SIZE, BLOCK_SIZE );
+	}
+
+	{
+		btBufferInfoCL bInfo[] = { btBufferInfoCL( m_workBuffer->getBufferCL() ) };
+		//pass 2: a single work-group operating on the work buffer only
+		btLauncherCL launcher( m_commandQueue, m_blockSumKernel );
+		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
+		launcher.setConst( constBuffer );
+		launcher.launch1D( BLOCK_SIZE, BLOCK_SIZE );
+	}
+	
+	//pass 3: only needed when there is more than one block
+	if( numBlocks > 1 )
+	{
+		btBufferInfoCL bInfo[] = { btBufferInfoCL( dstNative->getBufferCL() ), btBufferInfoCL( m_workBuffer->getBufferCL() ) };
+		btLauncherCL launcher( m_commandQueue, m_propagationKernel );
+		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
+		launcher.setConst( constBuffer );
+		launcher.launch1D( (numBlocks-1)*BLOCK_SIZE, BLOCK_SIZE );
+	}
+
+
+	if( sum )
+	{
+		clFinish(m_commandQueue);
+		dstNative->copyToHostPointer(sum,1,n-1,true);
+	}
+
+}
+
+
+void btPrefixScanCL::executeHost(btAlignedObjectArray<unsigned int>& src, btAlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum)
+{	//CPU exclusive prefix sum over the first n elements: dst[i] = src[0]+...+src[i-1]; dst must already hold n elements
+	unsigned int s = 0;
+	//if( data->m_option == EXCLUSIVE )
+	{
+		for(int i=0; i<n; i++)
+		{
+			dst[i] = s;
+			s += src[i];
+		}
+	}
+	/*else
+	{
+		for(int i=0; i<n; i++)
+		{
+			s += hSrc[i];
+			hDst[i] = s;
+		}
+	}
+	*/
+	//sum receives the last scanned element dst[n-1] (so src[n-1] is not included); 0 when there is nothing to scan
+	if( sum )
+	{
+		*sum = (n>0) ? dst[n-1] : 0u;
+	}
+}
\ No newline at end of file
diff --git a/opencl/parallel_primitives/host/btPrefixScanCL.h b/opencl/parallel_primitives/host/btPrefixScanCL.h
new file mode 100644
index 000000000..a7dbf4f5e
--- /dev/null
+++ b/opencl/parallel_primitives/host/btPrefixScanCL.h
@@ -0,0 +1,37 @@
+
+#ifndef BT_PREFIX_SCAN_CL_H
+#define BT_PREFIX_SCAN_CL_H
+
+#include "btOpenCLArray.h"
+#include "btBufferInfoCL.h"
+#include "btAlignedObjectArray.h"
+
+class btPrefixScanCL
+{
+	enum
+	{
+		BLOCK_SIZE = 128	//work-group size of every kernel launch in execute(); one block covers 2*BLOCK_SIZE elements
+	};
+
+//	Option m_option;
+
+	cl_command_queue	m_commandQueue;
+
+	cl_kernel m_localScanKernel;
+	cl_kernel m_blockSumKernel;
+	cl_kernel m_propagationKernel;
+
+	btOpenCLArray<unsigned int>* m_workBuffer;	//allocated in the constructor, deleted in the destructor
+	//NOTE(review): copying is not disabled; a copy would release the kernels and delete m_workBuffer a second time
+
+	public:
+		
+	btPrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue,int size=0);
+
+	virtual ~btPrefixScanCL();
+	//execute() runs on the device through the command queue; executeHost() is the plain CPU loop over host arrays
+	void execute(btOpenCLArray<unsigned int>& src, btOpenCLArray<unsigned int>& dst, int n, unsigned int* sum = 0);
+	void executeHost(btAlignedObjectArray<unsigned int>& src, btAlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum);
+};
+
+#endif //BT_PREFIX_SCAN_CL_H
diff --git a/opencl/parallel_primitives/host/btQuickprof.cpp b/opencl/parallel_primitives/host/btQuickprof.cpp
new file mode 100644
index 000000000..544aee89d
--- /dev/null
+++ b/opencl/parallel_primitives/host/btQuickprof.cpp
@@ -0,0 +1,566 @@
+/*
+
+***************************************************************************************************
+**
+** profile.cpp
+**
+** Real-Time Hierarchical Profiling for Game Programming Gems 3
+**
+** by Greg Hjelstrom & Byon Garrabrant
+**
+***************************************************************************************************/
+
+// Credits: The Clock class was inspired by the Timer classes in 
+// Ogre (www.ogre3d.org).
+
+#include "btQuickprof.h"
+
+#ifndef BT_NO_PROFILE
+
+
+static btClock gProfileClock;
+
+
+#ifdef __CELLOS_LV2__
+#include <sys/sys_time.h>
+#include <sys/time_util.h>
+#include <stdio.h>
+#endif
+
+#if defined (SUNOS) || defined (__SUNOS__) 
+#include <stdio.h> 
+#endif
+
+#if defined(WIN32) || defined(_WIN32)
+
+#define BT_USE_WINDOWS_TIMERS
+#define WIN32_LEAN_AND_MEAN
+#define NOWINRES
+#define NOMCX
+#define NOIME 
+
+#ifdef _XBOX
+	#include <Xtl.h>
+#else //_XBOX
+	#include <windows.h>
+#endif //_XBOX
+
+#include <time.h>
+
+
+#else //_WIN32
+#include <sys/time.h>
+#endif //_WIN32
+
+#define mymin(a,b) ((a) < (b) ? (a) : (b))	//smaller of the two values; note that each argument may be evaluated twice
+
+struct btClockData
+{
+	//platform specific timer state; exactly one of the three layouts below is compiled in
+#ifdef BT_USE_WINDOWS_TIMERS
+	LARGE_INTEGER mClockFrequency;	//filled by QueryPerformanceFrequency in the btClock constructor
+	DWORD mStartTick;	//GetTickCount() value taken in reset()
+	LONGLONG mPrevElapsedTime;	//elapsed counter value stored by the previous getTime* call
+	LARGE_INTEGER mStartTime;	//QueryPerformanceCounter value taken in reset()
+#else
+#ifdef __CELLOS_LV2__
+	uint64_t	mStartTime;
+#else
+	struct timeval mStartTime;
+#endif //__CELLOS_LV2__
+#endif //BT_USE_WINDOWS_TIMERS
+
+};
+
+///The btClock is a portable basic clock that reports elapsed time in milliseconds or microseconds, use for profiling.
+btClock::btClock()
+{
+	m_data = new btClockData;
+#ifdef BT_USE_WINDOWS_TIMERS
+	QueryPerformanceFrequency(&m_data->mClockFrequency);	//queried once; the getTime* functions divide by it
+#endif
+	reset();	//the reference time starts at construction
+}
+
+btClock::~btClock()
+{
+	delete m_data;	//every btClock owns its own btClockData
+}
+
+btClock::btClock(const btClock& other)
+{
+	//deep copy: the new clock gets its own btClockData holding the same timing state as 'other'
+	m_data = new btClockData(*other.m_data);
+}
+
+btClock& btClock::operator=(const btClock& other)
+{
+	*m_data = *other.m_data;	//copies the timing state into the btClockData this clock already owns (no reallocation)
+	return *this;
+}
+
+
+	/// Resets the initial reference time.
+void btClock::reset()
+{
+#ifdef BT_USE_WINDOWS_TIMERS
+	QueryPerformanceCounter(&m_data->mStartTime);
+	m_data->mStartTick = GetTickCount();	//second, coarser reference used by the getTime* functions to detect counter leaps
+	m_data->mPrevElapsedTime = 0;
+#else
+#ifdef __CELLOS_LV2__
+
+	typedef uint64_t  ClockSize;
+	ClockSize newTime;
+	//__asm __volatile__( "mftb %0" : "=r" (newTime) : : "memory");
+	SYS_TIMEBASE_GET( newTime );
+	m_data->mStartTime = newTime;
+#else
+	gettimeofday(&m_data->mStartTime, 0);
+#endif
+#endif
+}
+
+/// Returns the time in ms since the last call to reset or since 
+/// the btClock was created.
+unsigned long int btClock::getTimeMilliseconds()
+{
+#ifdef BT_USE_WINDOWS_TIMERS
+	LARGE_INTEGER currentTime;
+	QueryPerformanceCounter(&currentTime);
+	LONGLONG elapsedTime = currentTime.QuadPart - 
+		m_data->mStartTime.QuadPart;
+		// Compute the number of millisecond ticks elapsed.
+	unsigned long msecTicks = (unsigned long)(1000 * elapsedTime / 
+		m_data->mClockFrequency.QuadPart);
+		// Check for unexpected leaps in the Win32 performance counter.  
+	// (This is caused by unexpected data across the PCI to ISA 
+		// bridge, aka south bridge.  See Microsoft KB274323.)
+		unsigned long elapsedTicks = GetTickCount() - m_data->mStartTick;
+		signed long msecOff = (signed long)(msecTicks - elapsedTicks);
+		if (msecOff < -100 || msecOff > 100)	// the two time sources disagree by more than 100 ms
+		{
+			// Adjust the starting time forwards.
+			LONGLONG msecAdjustment = mymin(msecOff * 
+				m_data->mClockFrequency.QuadPart / 1000, elapsedTime - 
+				m_data->mPrevElapsedTime);
+			m_data->mStartTime.QuadPart += msecAdjustment;	// note: this getter modifies the stored start time
+			elapsedTime -= msecAdjustment;
+
+			// Recompute the number of millisecond ticks elapsed.
+			msecTicks = (unsigned long)(1000 * elapsedTime / 
+				m_data->mClockFrequency.QuadPart);
+		}
+
+		// Store the current elapsed time for adjustments next time.
+		m_data->mPrevElapsedTime = elapsedTime;
+
+		return msecTicks;
+#else
+
+#ifdef __CELLOS_LV2__
+		uint64_t freq=sys_time_get_timebase_frequency();
+		double dFreq=((double) freq) / 1000.0;	// timebase ticks per millisecond
+		typedef uint64_t  ClockSize;
+		ClockSize newTime;
+		SYS_TIMEBASE_GET( newTime );
+		//__asm __volatile__( "mftb %0" : "=r" (newTime) : : "memory");
+
+		return (unsigned long int)((double(newTime-m_data->mStartTime)) / dFreq);
+#else
+
+		struct timeval currentTime;
+		gettimeofday(&currentTime, 0);
+		return (currentTime.tv_sec - m_data->mStartTime.tv_sec) * 1000 + 
+			(currentTime.tv_usec - m_data->mStartTime.tv_usec) / 1000;
+#endif //__CELLOS_LV2__
+#endif
+}
+
+	/// Returns the time in us since the last call to reset or since 
+	/// the Clock was created.
+unsigned long int btClock::getTimeMicroseconds()
+{
+#ifdef BT_USE_WINDOWS_TIMERS
+		LARGE_INTEGER currentTime;
+		QueryPerformanceCounter(&currentTime);
+		LONGLONG elapsedTime = currentTime.QuadPart - 
+			m_data->mStartTime.QuadPart;
+
+		// Compute the number of millisecond ticks elapsed.
+		unsigned long msecTicks = (unsigned long)(1000 * elapsedTime / 
+			m_data->mClockFrequency.QuadPart);
+
+		// Check for unexpected leaps in the Win32 performance counter.  
+		// (This is caused by unexpected data across the PCI to ISA 
+		// bridge, aka south bridge.  See Microsoft KB274323.)
+		unsigned long elapsedTicks = GetTickCount() - m_data->mStartTick;
+		signed long msecOff = (signed long)(msecTicks - elapsedTicks);
+		if (msecOff < -100 || msecOff > 100)	// the two time sources disagree by more than 100 ms
+		{
+			// Adjust the starting time forwards.
+			LONGLONG msecAdjustment = mymin(msecOff * 
+				m_data->mClockFrequency.QuadPart / 1000, elapsedTime - 
+				m_data->mPrevElapsedTime);
+			m_data->mStartTime.QuadPart += msecAdjustment;	// note: this getter modifies the stored start time
+			elapsedTime -= msecAdjustment;
+		}
+
+		// Store the current elapsed time for adjustments next time.
+		m_data->mPrevElapsedTime = elapsedTime;
+
+		// Convert to microseconds.
+		unsigned long usecTicks = (unsigned long)(1000000 * elapsedTime / 
+			m_data->mClockFrequency.QuadPart);
+		// NOTE(review): the result is narrowed to unsigned long; where that type is 32 bits it wraps after roughly 71 minutes
+		return usecTicks;
+#else
+
+#ifdef __CELLOS_LV2__
+		uint64_t freq=sys_time_get_timebase_frequency();
+		double dFreq=((double) freq)/ 1000000.0;	// timebase ticks per microsecond
+		typedef uint64_t  ClockSize;
+		ClockSize newTime;
+		//__asm __volatile__( "mftb %0" : "=r" (newTime) : : "memory");
+		SYS_TIMEBASE_GET( newTime );
+
+		return (unsigned long int)((double(newTime-m_data->mStartTime)) / dFreq);
+#else
+
+		struct timeval currentTime;
+		gettimeofday(&currentTime, 0);
+		return (currentTime.tv_sec - m_data->mStartTime.tv_sec) * 1000000 + 
+			(currentTime.tv_usec - m_data->mStartTime.tv_usec);
+#endif//__CELLOS_LV2__
+#endif 
+}
+
+
+
+
+
+inline void Profile_Get_Ticks(unsigned long int * ticks)
+{
+	*ticks = gProfileClock.getTimeMicroseconds();	// profiler ticks are microseconds on the file-local gProfileClock
+}
+
+inline float Profile_Get_Tick_Rate(void)
+{
+//	return 1000000.f;
+	return 1000.f;	// divisor applied to tick differences: microsecond ticks / 1000 gives milliseconds
+
+}
+
+
+
+/***************************************************************************************************
+**
+** CProfileNode
+**
+***************************************************************************************************/
+
+/***********************************************************************************************
+ * INPUT:                                                                                      *
+ * name - pointer to a static string which is the name of this profile node                    *
+ * parent - parent pointer                                                                     *
+ *                                                                                             *
+ * WARNINGS:                                                                                   *
+ * The name is assumed to be a static pointer, only the pointer is stored and compared for     *
+ * efficiency reasons.                                                                         *
+ *=============================================================================================*/
+CProfileNode::CProfileNode( const char * name, CProfileNode * parent ) :
+	Name( name ),
+	TotalCalls( 0 ),
+	TotalTime( 0 ),
+	StartTime( 0 ),
+	RecursionCounter( 0 ),
+	Parent( parent ),
+	Child( NULL ),
+	Sibling( NULL ),
+	m_userPtr(0)
+{
+	Reset();	// Child and Sibling are NULL here, so this only re-zeroes TotalCalls and TotalTime
+}
+
+
+void	CProfileNode::CleanupMemory()
+{
+	delete ( Child);	// the destructor deletes Child and Sibling in turn, so this frees the whole subtree
+	Child = NULL;
+	delete ( Sibling);
+	Sibling = NULL;	// pointers are cleared so the node can be reused or destroyed safely afterwards
+}
+
+CProfileNode::~CProfileNode( void )
+{
+	delete ( Child);	// recursive: each deleted node deletes its own Child and Sibling
+	delete ( Sibling);
+}
+
+
+/***********************************************************************************************
+ * INPUT:                                                                                      *
+ * name - static string pointer to the name of the node we are searching for                   *
+ *                                                                                             *
+ * WARNINGS:                                                                                   *
+ * All profile names are assumed to be static strings so this function uses pointer compares   *
+ * to find the named node.                                                                     *
+ *=============================================================================================*/
+CProfileNode * CProfileNode::Get_Sub_Node( const char * name )
+{
+	// Look through the existing children first.
+	// Names are compared by pointer, not by content.
+	for ( CProfileNode * candidate = Child; candidate != NULL; candidate = candidate->Sibling ) {
+		if ( candidate->Name == name ) {
+			return candidate;
+		}
+	}
+
+	// Not present yet: create the node with this node as parent
+	// and link it in as the new first child.
+
+	CProfileNode * const created = new CProfileNode( name, this );
+	created->Sibling = Child;
+	Child = created;
+	return created;
+}
+
+
+void	CProfileNode::Reset( void )
+{
+	// Clear the statistics accumulated on this node
+	TotalTime = 0.0f;
+	TotalCalls = 0;
+	// then do the same for the rest of the tree: first child first, next sibling second
+	if ( Child != NULL ) {
+		Child->Reset();
+	}
+	if ( Sibling != NULL ) {
+		Sibling->Reset();
+	}
+}
+
+
+void	CProfileNode::Call( void )
+{
+	++TotalCalls;
+	const bool outermostCall = ( RecursionCounter == 0 );
+	++RecursionCounter;
+	if ( outermostCall ) Profile_Get_Ticks(&StartTime);	// nested (recursive) entries keep the start time of the outermost one
+}
+
+
+bool	CProfileNode::Return( void )
+{
+	--RecursionCounter;
+	const bool unwound = ( RecursionCounter == 0 );	// true once the outermost Call() has been matched
+	if ( unwound && TotalCalls != 0 ) {
+		unsigned long int now;
+		Profile_Get_Ticks(&now);
+		TotalTime += (float)(now - StartTime) / Profile_Get_Tick_Rate();
+	}
+	return unwound;
+}
+
+
+/***************************************************************************************************
+**
+** CProfileIterator
+**
+***************************************************************************************************/
+CProfileIterator::CProfileIterator( CProfileNode * start )
+{
+	CurrentParent = start;	// start is dereferenced on the next line and must not be NULL
+	CurrentChild = CurrentParent->Get_Child();
+}
+
+
+void	CProfileIterator::First(void)
+{
+	CurrentChild = CurrentParent->Get_Child();	// rewind to the first child of the current parent (NULL when it has none)
+}
+
+
+void	CProfileIterator::Next(void)
+{
+	CurrentChild = CurrentChild->Get_Sibling();	// dereferences CurrentChild: only call while Is_Done() is false
+}
+
+
+bool	CProfileIterator::Is_Done(void)
+{
+	return CurrentChild == NULL;	// true when the parent has no children or the sibling list has been exhausted
+}
+
+
+void	CProfileIterator::Enter_Child( int index )
+{
+	// Walk the sibling list until 'index' steps have been taken or the list ends
+	CProfileNode * target = CurrentParent->Get_Child();
+	for ( ; (target != NULL) && (index != 0); index-- ) {
+		target = target->Get_Sibling();
+	}
+	CurrentChild = target;	// stays NULL (and the parent unchanged) when there is no child at that index
+	if ( target != NULL ) {
+		CurrentParent = target;
+		CurrentChild = CurrentParent->Get_Child();
+	}
+}
+
+
+void	CProfileIterator::Enter_Parent( void )
+{
+	// Step one level up; a node without a parent (the root) stays where it is.
+	CProfileNode * const up = CurrentParent->Get_Parent();
+	if ( up != NULL ) CurrentParent = up;
+	CurrentChild = CurrentParent->Get_Child();
+}
+
+
+/***************************************************************************************************
+**
+** CProfileManager
+**
+***************************************************************************************************/
+
+CProfileNode	CProfileManager::Root( "Root", NULL );	// root of the profile tree; it has no parent
+CProfileNode *	CProfileManager::CurrentNode = &CProfileManager::Root;	// node of the innermost profile currently running
+int				CProfileManager::FrameCounter = 0;	// frames counted since the last Reset()
+unsigned long int			CProfileManager::ResetTime = 0;	// Profile_Get_Ticks value captured by Reset()
+
+
+/***********************************************************************************************
+ * CProfileManager::Start_Profile -- Begin a named profile                                    *
+ *                                                                                             *
+ * Steps one level deeper into the tree, if a child already exists with the specified name     *
+ * then it accumulates the profiling; otherwise a new child node is added to the profile tree. *
+ *                                                                                             *
+ * INPUT:                                                                                      *
+ * name - name of this profiling record                                                        *
+ *                                                                                             *
+ * WARNINGS:                                                                                   *
+ * The string used is assumed to be a static string; pointer compares are used throughout      *
+ * the profiling code for efficiency.                                                          *
+ *=============================================================================================*/
+void	CProfileManager::Start_Profile( const char * name )
+{
+	// Same name pointer as the active node means a recursive entry: stay on that node
+	const bool recursiveEntry = ( CurrentNode->Get_Name() == name );
+	if ( !recursiveEntry )
+		CurrentNode = CurrentNode->Get_Sub_Node( name );
+	CurrentNode->Call();
+}
+
+
+/***********************************************************************************************
+ * CProfileManager::Stop_Profile -- Stop timing and record the results.                       *
+ *=============================================================================================*/
+void	CProfileManager::Stop_Profile( void )
+{
+	// Return() reports true only once the outermost call of this node has finished;
+	// while a recursive function is still unwinding we remain on the same node
+	const bool leaveNode = CurrentNode->Return();
+	if ( leaveNode )
+		CurrentNode = CurrentNode->Get_Parent();
+}
+
+
+/***********************************************************************************************
+ * CProfileManager::Reset -- Reset the contents of the profiling system                       *
+ *                                                                                             *
+ *    This resets everything except for the tree structure.  All of the timing data is reset.  *
+ *=============================================================================================*/
+void	CProfileManager::Reset( void )
+{ 
+	gProfileClock.reset();	// restart the shared clock first so the tick values taken below are relative to this reset
+	Root.Reset();
+    Root.Call();	// counts one call on the root and, when it is not already active, records its start time
+	FrameCounter = 0;
+	Profile_Get_Ticks(&ResetTime);
+}
+
+
+/***********************************************************************************************
+ * CProfileManager::Increment_Frame_Counter -- Increment the frame counter                    *
+ *=============================================================================================*/
+void CProfileManager::Increment_Frame_Counter( void )
+{
+	++FrameCounter;	// frames since the last Reset(), which sets the counter back to zero
+}
+
+
+/***********************************************************************************************
+ * CProfileManager::Get_Time_Since_Reset -- returns the elapsed time since last reset         *
+ *=============================================================================================*/
+float CProfileManager::Get_Time_Since_Reset( void )
+{
+	unsigned long int now;
+	Profile_Get_Ticks(&now);
+	const unsigned long int elapsedTicks = now - ResetTime;	// ticks since Reset() captured ResetTime
+	return (float)elapsedTicks / Profile_Get_Tick_Rate();
+}
+
+#include <stdio.h>
+
+void	CProfileManager::dumpRecursive(CProfileIterator* profileIterator, int spacing)
+{
+	// Prints one tree level (the iterator's current parent and its direct children) to stdout, then recurses into each child.
+	profileIterator->First();
+	if (profileIterator->Is_Done())
+		return;	// no children: nothing is printed for this node
+
+	const float parent_time = profileIterator->Is_Root() ? CProfileManager::Get_Time_Since_Reset() : profileIterator->Get_Current_Parent_Total_Time();
+	const int frames_since_reset = CProfileManager::Get_Frame_Count_Since_Reset();
+	float accumulated_time = 0;
+	int dot;
+	for (dot=0;dot<spacing;dot++)	printf(".");
+	printf("----------------------------------\n");
+	for (dot=0;dot<spacing;dot++)	printf(".");
+	printf("Profiling: %s (total running time: %.3f ms) ---\n",	profileIterator->Get_Current_Parent_Name(), parent_time );
+
+	// One line per child: its share of the parent's time, its time per frame and its call count.
+	// The printed index is the child's position in the sibling list.
+	int numChildren = 0;
+	for (; !profileIterator->Is_Done(); profileIterator->Next())
+	{
+		const int childIndex = numChildren++;
+		const float current_total_time = profileIterator->Get_Current_Total_Time();
+		accumulated_time += current_total_time;
+		const float fraction = parent_time > SIMD_EPSILON ? (current_total_time / parent_time) * 100 : 0.f;
+		for (dot=0;dot<spacing;dot++)	printf(".");
+		printf("%d -- %s (%.2f %%) :: %.3f ms / frame (%d calls)\n",childIndex, profileIterator->Get_Current_Name(), fraction,(current_total_time / (double)frames_since_reset),profileIterator->Get_Current_Total_Calls());
+	}
+
+	// The children together should never have used more time than their parent.
+	if (parent_time < accumulated_time)
+	{
+		printf("what's wrong\n");
+	}
+	for (dot=0;dot<spacing;dot++)	printf(".");
+	printf("%s (%.3f %%) :: %.3f ms\n", "Unaccounted:",parent_time > SIMD_EPSILON ? ((parent_time - accumulated_time) / parent_time) * 100 : 0.f, parent_time - accumulated_time);
+
+	// Enter_Child/Enter_Parent move the shared iterator down and back up
+	// around each recursive call, which prints the next level with three more dots of indentation.
+	for (int child=0;child<numChildren;child++)
+	{
+		profileIterator->Enter_Child(child);
+		dumpRecursive(profileIterator,spacing+3);
+		profileIterator->Enter_Parent();
+	}
+}
+
+
+
+void	CProfileManager::dumpAll()
+{
+	// Dump the complete profile tree, starting at the root with no indentation.
+	CProfileIterator* const rootIterator = CProfileManager::Get_Iterator();
+
+	dumpRecursive(rootIterator,0);
+	// Get_Iterator() allocated the iterator with new; Release_Iterator() deletes it.
+	CProfileManager::Release_Iterator(rootIterator);
+}
+
+
+
+
+#endif //BT_NO_PROFILE
diff --git a/opencl/parallel_primitives/host/btQuickprof.h b/opencl/parallel_primitives/host/btQuickprof.h
new file mode 100644
index 000000000..93f3f4a60
--- /dev/null
+++ b/opencl/parallel_primitives/host/btQuickprof.h
@@ -0,0 +1,203 @@
+
+/***************************************************************************************************
+**
+** Real-Time Hierarchical Profiling for Game Programming Gems 3
+**
+** by Greg Hjelstrom & Byon Garrabrant
+**
+***************************************************************************************************/
+
+// Credits: The Clock class was inspired by the Timer classes in 
+// Ogre (www.ogre3d.org).
+
+
+
+#ifndef BT_QUICK_PROF_H
+#define BT_QUICK_PROF_H
+
+//To disable built-in profiling, please comment out next line
+//#define BT_NO_PROFILE 1
+#ifndef BT_NO_PROFILE
+#include <stdio.h>//@todo remove this, backwards compatibility
+#include "btScalar.h"
+#include "btAlignedAllocator.h"
+#include <new>
+
+
+
+
+
+#define USE_BT_CLOCK 1
+
+#ifdef USE_BT_CLOCK
+
+///The btClock is a portable basic clock that reports elapsed time in milliseconds or microseconds, use for profiling.
+class btClock
+{
+public:
+	btClock();
+
+	btClock(const btClock& other);	// deep copy of the timing state
+	btClock& operator=(const btClock& other);
+
+	~btClock();
+
+	/// Resets the initial reference time.
+	void reset();
+
+	/// Returns the time in ms since the last call to reset or since 
+	/// the btClock was created.
+	unsigned long int getTimeMilliseconds();
+
+	/// Returns the time in us since the last call to reset or since 
+	/// the Clock was created.
+	unsigned long int getTimeMicroseconds();
+private:
+	struct btClockData* m_data;	// platform specific state, defined in btQuickprof.cpp
+};
+
+#endif //USE_BT_CLOCK
+
+
+
+
+///A node in the Profile Hierarchy Tree
+class	CProfileNode {
+
+public:
+	CProfileNode( const char * name, CProfileNode * parent );
+	~CProfileNode( void );
+
+	CProfileNode * Get_Sub_Node( const char * name );	// finds the child with this name pointer, creating it when missing
+
+	CProfileNode * Get_Parent( void )		{ return Parent; }
+	CProfileNode * Get_Sibling( void )		{ return Sibling; }
+	CProfileNode * Get_Child( void )			{ return Child; }
+
+	void				CleanupMemory();
+	void				Reset( void );
+	void				Call( void );
+	bool				Return( void );
+
+	const char *	Get_Name( void )				{ return Name; }
+	int				Get_Total_Calls( void )		{ return TotalCalls; }
+	float				Get_Total_Time( void )		{ return TotalTime; }
+	void*			GetUserPointer() const {return m_userPtr;}
+	void			SetUserPointer(void* ptr) { m_userPtr = ptr;}
+protected:
+
+	const char *	Name;	// stored and compared by pointer only, never copied
+	int				TotalCalls;
+	float				TotalTime;	// accumulated in Return() as tick difference / Profile_Get_Tick_Rate()
+	unsigned long int			StartTime;	// tick value taken by the outermost Call()
+	int				RecursionCounter;	// nesting depth of Call() without a matching Return()
+
+	CProfileNode *	Parent;
+	CProfileNode *	Child;	// first child; further children are reached through its Sibling chain
+	CProfileNode *	Sibling;
+	void*	m_userPtr;
+};
+
+///An iterator to navigate through the tree
+class CProfileIterator
+{
+public:
+	// Access all the children of the current parent
+	void				First(void);
+	void				Next(void);
+	bool				Is_Done(void);
+	bool                Is_Root(void) { return (CurrentParent->Get_Parent() == 0); }
+
+	void				Enter_Child( int index );		// Make the given child the new parent
+	void				Enter_Largest_Child( void );	// Make the largest child the new parent (declared only: btQuickprof.cpp has no definition)
+	void				Enter_Parent( void );			// Make the current parent's parent the new parent
+
+	// Access the current child (these dereference CurrentChild, so only use them while Is_Done() is false)
+	const char *	Get_Current_Name( void )			{ return CurrentChild->Get_Name(); }
+	int				Get_Current_Total_Calls( void )	{ return CurrentChild->Get_Total_Calls(); }
+	float				Get_Current_Total_Time( void )	{ return CurrentChild->Get_Total_Time(); }
+
+	void*	Get_Current_UserPointer( void )			{ return CurrentChild->GetUserPointer(); }
+	void	Set_Current_UserPointer(void* ptr) {CurrentChild->SetUserPointer(ptr);}
+	// Access the current parent
+	const char *	Get_Current_Parent_Name( void )			{ return CurrentParent->Get_Name(); }
+	int				Get_Current_Parent_Total_Calls( void )	{ return CurrentParent->Get_Total_Calls(); }
+	float				Get_Current_Parent_Total_Time( void )	{ return CurrentParent->Get_Total_Time(); }
+
+	
+
+protected:
+
+	CProfileNode *	CurrentParent;
+	CProfileNode *	CurrentChild;
+	
+	// the constructor is protected: iterators are obtained from CProfileManager::Get_Iterator()
+	CProfileIterator( CProfileNode * start );
+	friend	class		CProfileManager;
+};
+
+
+///The Manager for the Profile system
+class	CProfileManager {
+public:
+	static	void						Start_Profile( const char * name );
+	static	void						Stop_Profile( void );
+
+	static	void						CleanupMemory(void)
+	{
+		Root.CleanupMemory();	// frees every node below the root; the static Root itself remains
+	}
+
+	static	void						Reset( void );
+	static	void						Increment_Frame_Counter( void );
+	static	int						Get_Frame_Count_Since_Reset( void )		{ return FrameCounter; }
+	static	float						Get_Time_Since_Reset( void );
+
+	static	CProfileIterator *	Get_Iterator( void )	
+	{ 
+		// heap allocated: the caller must hand the iterator back through Release_Iterator()
+		return new CProfileIterator( &Root ); 
+	}
+	static	void						Release_Iterator( CProfileIterator * iterator ) { delete ( iterator); }
+
+	static void	dumpRecursive(CProfileIterator* profileIterator, int spacing);
+
+	static void	dumpAll();
+
+private:
+	static	CProfileNode			Root;
+	static	CProfileNode *			CurrentNode;
+	static	int						FrameCounter;
+	static	unsigned long int					ResetTime;
+};
+
+
+///ProfileSampleClass is a simple way to profile a function's scope
+///Use the BT_PROFILE macro at the start of scope to time
+class	CProfileSample {
+public:
+	CProfileSample( const char * name )
+	{ 
+		CProfileManager::Start_Profile( name ); 	// the profile starts when the sample object is constructed
+	}
+
+	~CProfileSample( void )					
+	{ 
+		CProfileManager::Stop_Profile(); 	// and stops when it goes out of scope
+	}
+};
+
+
+#define	BT_PROFILE( name )			CProfileSample __profile( name )
+
+#else
+
+#define	BT_PROFILE( name )
+
+#endif //#ifndef BT_NO_PROFILE
+
+
+
+#endif //BT_QUICK_PROF_H
+
+
diff --git a/opencl/parallel_primitives/host/btRadixSort32CL.cpp b/opencl/parallel_primitives/host/btRadixSort32CL.cpp
new file mode 100644
index 000000000..6d007fef2
--- /dev/null
+++ b/opencl/parallel_primitives/host/btRadixSort32CL.cpp
@@ -0,0 +1,712 @@
+
+#include "btRadixSort32CL.h"
+#include "btLauncherCL.h"
+#include "../../basic_initialize/btOpenCLUtils.h"
+#include "btPrefixScanCL.h"
+#include "btFillCL.h"
+
+#define RADIXSORT32_PATH "opencl/parallel_primitives/kernels/RadixSort32Kernels.cl"
+
+#include "../kernels/RadixSort32KernelsCL.h"
+
+///Creates the radix sort for one OpenCL context/device/queue.
+///Allocates the scratch arrays (pre-sized to initialCapacity elements when it is positive),
+///creates the prefix-scan and fill helpers and builds all kernels from the embedded
+///radixSort32KernelsCL source string.
+///@param ctx              OpenCL context the buffers and kernels are created in
+///@param device           device the kernels are compiled for; its type selects the scatter kernels
+///@param queue            command queue used for every kernel launch of this object
+///@param initialCapacity  optional element count used to pre-size the scratch arrays
+btRadixSort32CL::btRadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity)
+:m_commandQueue(queue)
+{
+	btOpenCLDeviceInfo info;
+	btOpenCLUtils::getDeviceInfo(device,&info);
+	//CPU devices get the serial scatter kernels below
+	m_deviceCPU = (info.m_deviceType & CL_DEVICE_TYPE_CPU)!=0;
+
+	m_workBuffer1 = new btOpenCLArray<unsigned int>(ctx,queue);
+	m_workBuffer2 = new btOpenCLArray<unsigned int>(ctx,queue);
+	m_workBuffer3 = new btOpenCLArray<btSortData>(ctx,queue);
+	m_workBuffer3a = new btOpenCLArray<unsigned int>(ctx,queue);
+	m_workBuffer4 = new btOpenCLArray<btSortData>(ctx,queue);
+	m_workBuffer4a = new btOpenCLArray<unsigned int>(ctx,queue);
+
+
+	if (initialCapacity>0)
+	{
+		m_workBuffer1->resize(initialCapacity);
+		m_workBuffer3->resize(initialCapacity);
+		m_workBuffer3a->resize(initialCapacity);
+		m_workBuffer4->resize(initialCapacity);
+		m_workBuffer4a->resize(initialCapacity);
+	}
+
+	m_scan = new btPrefixScanCL(ctx,device,queue);
+	m_fill = new btFillCL(ctx,device,queue);
+	
+	const char* additionalMacros = "";
+
+	cl_int pErrNum = 0;
+	const char* kernelSource = radixSort32KernelsCL;
+	
+	//compile the program once and create every kernel from it
+	cl_program sortProg = btOpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, RADIXSORT32_PATH);
+	btAssert(sortProg);
+
+	m_streamCountSortDataKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "StreamCountSortDataKernel", &pErrNum, sortProg,additionalMacros );
+	btAssert(m_streamCountSortDataKernel );
+
+	m_streamCountKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "StreamCountKernel", &pErrNum, sortProg,additionalMacros );
+	btAssert(m_streamCountKernel);
+
+	if (m_deviceCPU)
+	{
+		m_sortAndScatterSortDataKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterSortDataKernelSerial", &pErrNum, sortProg,additionalMacros );
+		btAssert(m_sortAndScatterSortDataKernel);
+		m_sortAndScatterKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterKernelSerial", &pErrNum, sortProg,additionalMacros );
+		btAssert(m_sortAndScatterKernel);
+	} else
+	{
+		m_sortAndScatterSortDataKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterSortDataKernel", &pErrNum, sortProg,additionalMacros );
+		btAssert(m_sortAndScatterSortDataKernel);
+		m_sortAndScatterKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterKernel", &pErrNum, sortProg,additionalMacros );
+		btAssert(m_sortAndScatterKernel);
+	}
+		
+	m_prefixScanKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "PrefixScanKernel", &pErrNum, sortProg,additionalMacros );
+	btAssert(m_prefixScanKernel);
+
+	//The program reference obtained above was never released. A program object stays alive
+	//as long as kernels created from it exist, so it can be released here without affecting them.
+	//NOTE(review): assumes compileCLKernelFromString does not take ownership of sortProg -- confirm
+	if (sortProg)
+	{
+		clReleaseProgram(sortProg);
+	}
+}
+
+///Frees the helpers and scratch arrays allocated by the constructor, then releases the five kernels.
+btRadixSort32CL::~btRadixSort32CL()
+{
+	//helper objects
+	delete m_scan;
+	delete m_fill;
+
+	//scratch arrays
+	delete m_workBuffer1;
+	delete m_workBuffer2;
+	delete m_workBuffer3;
+	delete m_workBuffer3a;
+	delete m_workBuffer4;
+	delete m_workBuffer4a;
+
+	//kernels, released in the same order as before
+	cl_kernel ownedKernels[] =
+	{
+		m_streamCountSortDataKernel,
+		m_streamCountKernel,
+		m_sortAndScatterSortDataKernel,
+		m_sortAndScatterKernel,
+		m_prefixScanKernel
+	};
+	const int numOwnedKernels = (int)(sizeof(ownedKernels)/sizeof(ownedKernels[0]));
+	for (int k=0; k<numOwnedKernels; k++)
+	{
+		clReleaseKernel(ownedKernels[k]);
+	}
+}
+
+///Host (CPU) reference sort: orders inout in place by m_key with a least-significant-digit
+///radix sort that consumes 8 bits per pass, for startBit = 0, 8, ... while startBit < sortBits.
+///Each pass scans the source front to back, so elements with the same digit keep their order.
+///Digits are taken from the bit pattern of the key ((m_key >> startBit) & 255).
+///@param inout     array to sort; receives the sorted result
+///@param sortBits  number of low key bits to sort on (default 32)
+void btRadixSort32CL::executeHost(btAlignedObjectArray<btSortData>& inout, int sortBits /* = 32 */)
+{
+	const int n = inout.size();
+	if (n==0)
+	{
+		//nothing to sort; also avoids taking the address of element 0 of an empty array
+		return;
+	}
+
+	const int BITS_PER_PASS = 8;
+	const int NUM_TABLES = (1<<BITS_PER_PASS);
+
+	int tables[NUM_TABLES];		//histogram first, start offset of every digit after the scan
+	int counter[NUM_TABLES];	//elements already placed per digit in the current pass
+
+	//src and dst swap roles after every pass
+	btSortData* src = &inout[0];
+	btAlignedObjectArray<btSortData> workbuffer;
+	workbuffer.resize(n);
+	btSortData* dst = &workbuffer[0];
+
+	int count=0;
+	for(int startBit=0; startBit<sortBits; startBit+=BITS_PER_PASS)
+	{
+		for(int i=0; i<NUM_TABLES; i++)
+		{
+			tables[i] = 0;
+		}
+
+		//	histogram of the current digit
+		for(int i=0; i<n; i++)
+		{
+			int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES-1);
+			tables[tableIdx]++;
+		}
+//#define TEST
+#ifdef TEST
+		printf("histogram size=%d\n",NUM_TABLES);
+		for (int i=0;i<NUM_TABLES;i++)
+		{
+			if (tables[i]!=0)
+			{
+				printf("tables[%d]=%d]\n",i,tables[i]);
+			}
+
+		}
+#endif //TEST
+		//	prefix scan (exclusive): tables[i] becomes the first output slot of digit i
+		int sum = 0;
+		for(int i=0; i<NUM_TABLES; i++)
+		{
+			int iData = tables[i];
+			tables[i] = sum;
+			sum += iData;
+			counter[i] = 0;
+		}
+
+		//	distribute
+		for(int i=0; i<n; i++)
+		{
+			int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES-1);
+			
+			dst[tables[tableIdx] + counter[tableIdx]] = src[i];
+			counter[tableIdx] ++;
+		}
+
+		btSwap( src, dst );
+		count++;
+	}
+
+	if (count&1)
+	{
+		//After an odd number of passes the sorted data is in workbuffer (src points at it
+		//because of the last swap). Copy it back, otherwise the caller keeps the input of
+		//the last pass and the result is destroyed together with workbuffer.
+		for (int i=0; i<n; i++)
+		{
+			inout[i] = src[i];
+		}
+	}
+}
+
+///Sorts an OpenCL array with the host implementation: copies it to a host array,
+///runs executeHost on that copy and copies the result back into keyValuesInOut.
+void btRadixSort32CL::executeHost(btOpenCLArray<btSortData>& keyValuesInOut, int sortBits /* = 32 */)
+{
+	btAlignedObjectArray<btSortData> hostCopy;
+
+	//device -> host
+	keyValuesInOut.copyToHost(hostCopy);
+
+	executeHost(hostCopy,sortBits);
+
+	//host -> device
+	keyValuesInOut.copyFromHost(hostCopy);
+}
+
+///Overload taking separate key and value arrays.
+///The body contains no sorting code: none of the arguments is read or written.
+void btRadixSort32CL::execute(btOpenCLArray<unsigned int>& keysIn, btOpenCLArray<unsigned int>& keysOut, btOpenCLArray<unsigned int>& valuesIn, 
+								btOpenCLArray<unsigned int>& valuesOut, int n, int sortBits)
+{
+	//the casts only mark the parameters as unused
+	(void)keysIn;
+	(void)keysOut;
+	(void)valuesIn;
+	(void)valuesOut;
+	(void)n;
+	(void)sortBits;
+}
+
+//#define DEBUG_RADIXSORT
+//#define DEBUG_RADIXSORT2
+
+
+///Sorts keyValuesInOut in place by m_key on the OpenCL device, 4 bits per pass, for
+///ib = 0, 4, ... while ib < sortBits (sortBits has to be a multiple of 4).
+///Every pass (1) counts the digits per work group (stream count kernel), (2) prefix-scans
+///that histogram and (3) sorts locally and scatters into the other buffer; source and
+///destination then swap roles.
+///If the element count is not a multiple of DATA_ALIGNMENT the data is first copied into
+///m_workBuffer4 and padded with 0xffffffff key/value pairs; after sorting m_workBuffer4 is
+///cut back to the original size and copied into keyValuesInOut.
+///@param keyValuesInOut  device array to sort; receives the sorted result
+///@param sortBits        number of low key bits to sort on (default 32)
+void btRadixSort32CL::execute(btOpenCLArray<btSortData>& keyValuesInOut, int sortBits /* = 32 */)
+{
+	
+	int originalSize = keyValuesInOut.size();
+	int workingSize = originalSize;
+	
+			
+	int dataAlignment = DATA_ALIGNMENT;
+
+#ifdef DEBUG_RADIXSORT2
+    btAlignedObjectArray<btSortData>   test2;
+    keyValuesInOut.copyToHost(test2);
+    printf("numElem = %d\n",test2.size());
+    for (int i=0;i<test2.size();i++)
+    {
+        printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
+        printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
+    }
+#endif //DEBUG_RADIXSORT2
+    
+	btOpenCLArray<btSortData>* src = 0;
+
+	if (workingSize%dataAlignment)
+	{
+		//round the size up to the next multiple of dataAlignment and sort a padded copy
+		workingSize += dataAlignment-(workingSize%dataAlignment);
+		m_workBuffer4->copyFromOpenCLArray(keyValuesInOut);
+		m_workBuffer4->resize(workingSize);
+		btSortData fillValue;
+		fillValue.m_key = 0xffffffff;
+		fillValue.m_value = 0xffffffff;
+
+#define USE_BTFILL
+#ifdef USE_BTFILL
+		//fill the padding elements [originalSize, workingSize) on the device
+		m_fill->execute((btOpenCLArray<btInt2>&)*m_workBuffer4,(btInt2&)fillValue,workingSize-originalSize,originalSize);
+#else
+		//fill the remaining bits (very slow way, todo: fill on GPU/OpenCL side)
+		
+		for (int i=originalSize; i<workingSize;i++)
+		{
+			m_workBuffer4->copyFromHostPointer(&fillValue,1,i);
+		}
+#endif//USE_BTFILL
+
+		src = m_workBuffer4;
+	} else
+	{
+		//already aligned: sort the caller's array directly; size 0 marks m_workBuffer4 as unused
+		src = &keyValuesInOut;
+		m_workBuffer4->resize(0);
+	}
+		
+	btAssert( workingSize%DATA_ALIGNMENT == 0 );
+	//one histogram entry per bucket and work group
+	int minCap = NUM_BUCKET*NUM_WGS;
+
+
+	int n = workingSize;
+
+	m_workBuffer1->resize(minCap);
+	m_workBuffer3->resize(workingSize);
+	
+
+//	ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
+	btAssert( BITS_PER_PASS == 4 );
+	btAssert( WG_SIZE == 64 );
+	btAssert( (sortBits&0x3) == 0 );
+
+	
+	
+	btOpenCLArray<btSortData>* dst = m_workBuffer3;
+
+	btOpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
+	btOpenCLArray<unsigned int>* destHisto = m_workBuffer2;
+
+
+	int nWGs = NUM_WGS;
+	btConstData cdata;
+
+	{
+        int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256
+     	int nBlocks = (n+blockSize-1)/(blockSize);
+		cdata.m_n = n;
+		cdata.m_nWGs = NUM_WGS;
+		cdata.m_startBit = 0;
+		cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs;
+		if( nBlocks < NUM_WGS )
+		{
+			//fewer blocks than work groups: one block per group, launch only nBlocks groups for the scatter
+			cdata.m_nBlocksPerWG = 1;
+			nWGs = nBlocks;
+		}
+	}
+
+	int count=0;
+	for(int ib=0; ib<sortBits; ib+=4)
+	{
+#ifdef DEBUG_RADIXSORT2
+        keyValuesInOut.copyToHost(test2);
+        printf("numElem = %d\n",test2.size());
+        for (int i=0;i<test2.size();i++)
+        {
+            if (test2[i].m_key != test2[i].m_value)
+            {
+                printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
+                printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
+            }
+        }
+#endif //DEBUG_RADIXSORT2
+        
+		cdata.m_startBit = ib;
+		
+		if (src->size())
+		{//	count the digits of this pass into srcHisto
+			btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( srcHisto->getBufferCL() ) };
+			btLauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel);
+
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
+			launcher.setConst(  cdata );
+			
+			int num = NUM_WGS*WG_SIZE;
+			launcher.launch1D( num, WG_SIZE );
+		}
+
+        
+        
+#ifdef DEBUG_RADIXSORT
+		btAlignedObjectArray<unsigned int> testHist;
+		srcHisto->copyToHost(testHist);
+		printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
+		for (int i=0;i<testHist.size();i++)
+		{
+			if (testHist[i]!=0)
+				printf("testHist[%d]=%d\n",i,testHist[i]);
+		}
+#endif //DEBUG_RADIXSORT
+	
+	
+
+//fast prefix scan is not working properly on Mac OSX yet
+#ifdef _WIN32
+	bool fastScan=!m_deviceCPU;//only use fast scan on GPU
+#else
+	bool fastScan=false;
+#endif
+
+		if (fastScan)
+		{//	prefix scan group histogram
+			btBufferInfoCL bInfo[] = { btBufferInfoCL( srcHisto->getBufferCL() ) };
+			btLauncherCL launcher( m_commandQueue, m_prefixScanKernel );
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
+			launcher.setConst(  cdata );
+			launcher.launch1D( 128, 128 );
+			//this kernel only gets srcHisto, so the scatter below reads the offsets from it
+			destHisto = srcHisto;
+		}else
+		{
+			//unsigned int sum; //for debugging
+			//scan the whole histogram: minCap == NUM_BUCKET*NUM_WGS entries (was a hard-coded 1920)
+            m_scan->execute(*srcHisto,*destHisto,minCap,0);//,&sum);
+		}
+
+
+#ifdef DEBUG_RADIXSORT
+		destHisto->copyToHost(testHist);
+		printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
+		for (int i=0;i<testHist.size();i++)
+		{
+			if (testHist[i]!=0)
+				printf("testHist[%d]=%d\n",i,testHist[i]);
+		}
+        
+        for (int i=0;i<testHist.size();i+=NUM_WGS)
+		{
+				printf("testHist[%d]=%d\n",i/NUM_WGS,testHist[i]);
+		}
+
+#endif //DEBUG_RADIXSORT
+
+#define USE_GPU
+#ifdef USE_GPU
+        
+		if (src->size())
+		{//	local sort and distribute
+			btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( destHisto->getBufferCL(), true ), btBufferInfoCL( dst->getBufferCL() )};
+			btLauncherCL launcher( m_commandQueue, m_sortAndScatterSortDataKernel );
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
+			launcher.setConst(  cdata );
+			launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
+            
+		}
+#else
+        //host-side scatter used for debugging; only compiled when USE_GPU above is removed
+        //(it also needs testHist, which is declared under DEBUG_RADIXSORT)
+        {
+#define NUM_TABLES 16
+//#define SEQUENTIAL
+#ifdef SEQUENTIAL
+            int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+            int tables[NUM_TABLES];
+            int startBit = ib;
+            
+            destHisto->copyToHost(testHist);
+            btAlignedObjectArray<btSortData> srcHost;
+            btAlignedObjectArray<btSortData> dstHost;
+            dstHost.resize(src->size());
+            
+            src->copyToHost(srcHost);
+            
+            for (int i=0;i<NUM_TABLES;i++)
+            {
+                tables[i] = testHist[i*NUM_WGS];
+            }
+            
+            //	distribute
+            for(int i=0; i<n; i++)
+            {
+                int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
+                
+                dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
+                counter2[tableIdx] ++;
+            }
+            
+            
+#else
+          
+            int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+            
+            int tables[NUM_TABLES];
+             btAlignedObjectArray<btSortData> dstHostOK;
+            dstHostOK.resize(src->size());
+
+            destHisto->copyToHost(testHist);
+            btAlignedObjectArray<btSortData> srcHost;
+            src->copyToHost(srcHost);
+        
+            int blockSize = 256;
+            int nBlocksPerWG = cdata.m_nBlocksPerWG;
+            int startBit = ib;
+
+            {
+                for (int i=0;i<NUM_TABLES;i++)
+                {
+                    tables[i] = testHist[i*NUM_WGS];
+                }
+                
+                //	distribute
+                for(int i=0; i<n; i++)
+                {
+                    int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
+                    
+                    dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
+                    counter2[tableIdx] ++;
+                }
+
+            
+            }
+            
+            
+            btAlignedObjectArray<btSortData> dstHost;
+            dstHost.resize(src->size());
+            
+            
+            int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+            
+            
+            
+            for (int wgIdx=0;wgIdx<NUM_WGS;wgIdx++)
+            {
+              int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+
+              int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;
+                
+              for(int iblock=0; iblock<btMin(cdata.m_nBlocksPerWG, nBlocks); iblock++)
+              {
+                for (int lIdx = 0;lIdx < 64;lIdx++)
+                {
+                    int addr = iblock*blockSize + blockSize*cdata.m_nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;
+                    
+                    //	MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD
+                    //	Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops
+                    //	AMD: AtomInc performs better while NV prefers ++
+                    for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)
+                    {
+                        if( addr+j < n )
+                        {
+                          //  printf ("addr+j=%d\n", addr+j);
+                            
+                            int i = addr+j;
+                            
+                            int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
+                            
+                            int destIndex = testHist[tableIdx*NUM_WGS+wgIdx] + counter[tableIdx];
+                            
+                            btSortData ok = dstHostOK[destIndex];
+                                                    
+                            if (ok.m_key != srcHost[i].m_key)
+                            {
+                                printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key,srcHost[i].m_key );
+                                printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value,srcHost[i].m_value );
+                            }
+                            if (ok.m_value != srcHost[i].m_value)
+                            {
+                                
+                               printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value,srcHost[i].m_value );
+                                printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key,srcHost[i].m_key );
+
+                            }
+                   
+                            dstHost[destIndex] = srcHost[i];
+                            counter[tableIdx] ++;
+                            
+                        }
+                    }
+                }
+              }
+            }
+            
+         
+#endif //SEQUENTIAL
+            
+            dst->copyFromHost(dstHost);
+        }
+#endif//USE_GPU
+        
+        
+        
+#ifdef DEBUG_RADIXSORT
+		destHisto->copyToHost(testHist);
+		printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
+		for (int i=0;i<testHist.size();i++)
+		{
+			if (testHist[i]!=0)
+				printf("testHist[%d]=%d\n",i,testHist[i]);
+		}
+#endif //DEBUG_RADIXSORT
+		//the buffer just written becomes the input of the next pass
+		btSwap(src, dst );
+		btSwap(srcHisto,destHisto);
+
+#ifdef DEBUG_RADIXSORT2
+        keyValuesInOut.copyToHost(test2);
+        printf("numElem = %d\n",test2.size());
+        for (int i=0;i<test2.size();i++)
+        {
+            if (test2[i].m_key != test2[i].m_value)
+            {
+                printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
+                printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
+            }
+        }
+#endif //DEBUG_RADIXSORT2
+        
+        count++;
+                
+        
+	}
+	
+   
+    
+	if (count&1)
+	{
+		//Odd number of passes (sortBits = 4, 12, 20 or 28): the last scatter wrote into the
+		//scratch buffer, which src points at after the final swap, while dst points at the
+		//buffer the data started in (keyValuesInOut, or m_workBuffer4 when padding was used).
+		//Copy the result there so the code below and the caller see the sorted data.
+		dst->copyFromOpenCLArray(*src);
+	}
+
+	if (m_workBuffer4->size())
+	{
+		//padded path: drop the padding elements and hand the result back
+		m_workBuffer4->resize(originalSize);
+		keyValuesInOut.copyFromOpenCLArray(*m_workBuffer4);
+	}
+
+
+//test2 is declared under DEBUG_RADIXSORT2, so this dump has to use the same guard
+#ifdef DEBUG_RADIXSORT2
+    keyValuesInOut.copyToHost(test2);
+   
+    printf("numElem = %d\n",test2.size());
+    for (int i=0;i<test2.size();i++)
+    {
+        printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
+        printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
+    }
+#endif //DEBUG_RADIXSORT2
+	
+}
+
+
+
+
+
+
+///Keys-only variant: sorts keysInOut in place on the OpenCL device, 4 bits per pass, for
+///ib = 0, 4, ... while ib < sortBits (sortBits has to be a multiple of 4).
+///Every pass counts the digits per work group, prefix-scans that histogram and scatters the
+///keys into the other buffer; source and destination then swap roles.
+///If the key count is not a multiple of DATA_ALIGNMENT the keys are first copied into
+///m_workBuffer4a and padded with 0xffffffff; after sorting m_workBuffer4a is cut back to the
+///original size and copied into keysInOut.
+///@param keysInOut  device array of keys to sort; receives the sorted result
+///@param sortBits   number of low key bits to sort on (default 32)
+void btRadixSort32CL::execute(btOpenCLArray<unsigned int>& keysInOut, int sortBits /* = 32 */)
+{
+	int originalSize = keysInOut.size();
+	int workingSize = originalSize;
+	
+			
+	int dataAlignment = DATA_ALIGNMENT;
+
+	btOpenCLArray<unsigned int>* src = 0;
+
+	if (workingSize%dataAlignment)
+	{
+		//round the size up to the next multiple of dataAlignment and sort a padded copy
+		workingSize += dataAlignment-(workingSize%dataAlignment);
+		m_workBuffer4a->copyFromOpenCLArray(keysInOut);
+		m_workBuffer4a->resize(workingSize);
+		unsigned int fillValue = 0xffffffff;
+		
+		//fill the padding elements [originalSize, workingSize) on the device
+		m_fill->execute(*m_workBuffer4a,fillValue,workingSize-originalSize,originalSize);
+
+		src = m_workBuffer4a;
+	} else
+	{
+		//already aligned: sort the caller's array directly; size 0 marks m_workBuffer4a as unused
+		src = &keysInOut;
+		m_workBuffer4a->resize(0);
+	}
+	
+	
+
+	btAssert( workingSize%DATA_ALIGNMENT == 0 );
+	//one histogram entry per bucket and work group
+	int minCap = NUM_BUCKET*NUM_WGS;
+
+
+	int n = workingSize;
+
+	
+	m_workBuffer1->resize(minCap);
+	//only the keys-only scratch buffer is needed here; the key/value scratch m_workBuffer3
+	//is not touched by this overload, so it is no longer resized
+	m_workBuffer3a->resize(workingSize);
+
+//	ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
+	btAssert( BITS_PER_PASS == 4 );
+	btAssert( WG_SIZE == 64 );
+	btAssert( (sortBits&0x3) == 0 );
+
+	
+	
+	btOpenCLArray<unsigned int>* dst = m_workBuffer3a;
+
+	btOpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
+	btOpenCLArray<unsigned int>* destHisto = m_workBuffer2;
+
+
+	int nWGs = NUM_WGS;
+	btConstData cdata;
+
+	{
+        int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256
+     	int nBlocks = (n+blockSize-1)/(blockSize);
+		cdata.m_n = n;
+		cdata.m_nWGs = NUM_WGS;
+		cdata.m_startBit = 0;
+		cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs;
+		if( nBlocks < NUM_WGS )
+		{
+			//fewer blocks than work groups: one block per group, launch only nBlocks groups for the scatter
+			cdata.m_nBlocksPerWG = 1;
+			nWGs = nBlocks;
+		}
+	}
+
+	int count=0;
+	for(int ib=0; ib<sortBits; ib+=4)
+	{
+		cdata.m_startBit = ib;
+		
+		if (src->size())
+		{//	count the digits of this pass into srcHisto
+			btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( srcHisto->getBufferCL() ) };
+			btLauncherCL launcher(m_commandQueue, m_streamCountKernel);
+
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
+			launcher.setConst(  cdata );
+			
+			int num = NUM_WGS*WG_SIZE;
+			launcher.launch1D( num, WG_SIZE );
+		}
+
+        
+
+//fast prefix scan is not working properly on Mac OSX yet
+#ifdef _WIN32
+	bool fastScan=!m_deviceCPU;
+	
+#else
+	bool fastScan=false;
+#endif
+
+		if (fastScan)
+		{//	prefix scan group histogram
+			btBufferInfoCL bInfo[] = { btBufferInfoCL( srcHisto->getBufferCL() ) };
+			btLauncherCL launcher( m_commandQueue, m_prefixScanKernel );
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
+			launcher.setConst(  cdata );
+			launcher.launch1D( 128, 128 );
+			//this kernel only gets srcHisto, so the scatter below reads the offsets from it
+			destHisto = srcHisto;
+		}else
+		{
+			//unsigned int sum; //for debugging
+			//scan the whole histogram: minCap == NUM_BUCKET*NUM_WGS entries (was a hard-coded 1920)
+            m_scan->execute(*srcHisto,*destHisto,minCap,0);//,&sum);
+		}
+
+		if (src->size())
+		{//	local sort and distribute
+			btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( destHisto->getBufferCL(), true ), btBufferInfoCL( dst->getBufferCL() )};
+			btLauncherCL launcher( m_commandQueue, m_sortAndScatterKernel );
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
+			launcher.setConst(  cdata );
+			launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
+            
+		}
+        
+		//the buffer just written becomes the input of the next pass
+		btSwap(src, dst );
+		btSwap(srcHisto,destHisto);
+
+        count++;
+	}
+    
+	if (count&1)
+	{
+		//Odd number of passes (sortBits = 4, 12, 20 or 28): the last scatter wrote into the
+		//scratch buffer, which src points at after the final swap, while dst points at the
+		//buffer the keys started in (keysInOut, or m_workBuffer4a when padding was used).
+		//Copy the result there so the code below and the caller see the sorted keys.
+		dst->copyFromOpenCLArray(*src);
+	}
+
+	if (m_workBuffer4a->size())
+	{
+		//padded path: drop the padding elements and hand the result back
+		m_workBuffer4a->resize(originalSize);
+		keysInOut.copyFromOpenCLArray(*m_workBuffer4a);
+	}
+	
+}
+
+
+
+
+
+
+
diff --git a/opencl/parallel_primitives/host/btRadixSort32CL.h b/opencl/parallel_primitives/host/btRadixSort32CL.h
new file mode 100644
index 000000000..4570303c6
--- /dev/null
+++ b/opencl/parallel_primitives/host/btRadixSort32CL.h
@@ -0,0 +1,85 @@
+
+#ifndef BT_RADIXSORT32_H
+#define BT_RADIXSORT32_H
+
+#include "btOpenCLArray.h"
+
+///Key/value pair handled by btRadixSort32CL: elements are ordered by m_key and m_value is carried along.
+///The layout (exactly two ints, key first) must not change: btRadixSort32CL::execute
+///reinterprets arrays of btSortData as arrays of btInt2 when it fills the padding elements.
+struct btSortData
+{
+	int m_key;		//sort key; the sort routines read its bits with shift and mask
+	int m_value;	//payload that is moved together with the key
+};
+#include "btBufferInfoCL.h"
+
+///32 bit radix sort for OpenCL arrays of keys (unsigned int) or key/value pairs (btSortData),
+///with a host (CPU) reference implementation in executeHost.
+///The object owns its scratch arrays, its helper objects and its kernels and frees them in
+///the destructor; it is therefore not copyable.
+class  btRadixSort32CL
+{
+
+		//histogram buffers: m_workBuffer1 receives the digit counts, m_workBuffer2 the scanned offsets
+		btOpenCLArray<unsigned int>* m_workBuffer1;
+		btOpenCLArray<unsigned int>* m_workBuffer2;
+		
+		//key/value sort: m_workBuffer3 is the scatter destination, m_workBuffer4 the padded copy of the input
+		btOpenCLArray<btSortData>*	m_workBuffer3;
+		btOpenCLArray<btSortData>*	m_workBuffer4;
+
+		//keys-only sort: same roles as m_workBuffer3 / m_workBuffer4
+		btOpenCLArray<unsigned int>* m_workBuffer3a;
+		btOpenCLArray<unsigned int>* m_workBuffer4a;
+
+		//queue every kernel is launched on
+		cl_command_queue	m_commandQueue;
+
+		cl_kernel m_streamCountSortDataKernel;
+		cl_kernel m_streamCountKernel;
+
+		cl_kernel m_prefixScanKernel;
+		cl_kernel m_sortAndScatterSortDataKernel;
+		cl_kernel m_sortAndScatterKernel;
+
+
+		//true when the device type contains CL_DEVICE_TYPE_CPU; selects the serial scatter kernels
+		bool	m_deviceCPU;
+
+		class btPrefixScanCL* m_scan;
+		class btFillCL*	m_fill;
+
+public:
+	///Constant data passed to every kernel launch
+	struct btConstData
+		{
+			int m_n;			//number of elements to sort (after padding)
+			int m_nWGs;			//number of work groups
+			int m_startBit;		//first key bit of the current pass
+			int m_nBlocksPerWG;	//blocks handled by one work group
+		};
+	enum
+		{
+			DATA_ALIGNMENT = 256,	//inputs are padded to a multiple of this element count
+			WG_SIZE = 64,			//work group size used for the kernel launches
+            BLOCK_SIZE = 256,
+			ELEMENTS_PER_WORK_ITEM = (BLOCK_SIZE/WG_SIZE),
+			BITS_PER_PASS = 4,
+			NUM_BUCKET=(1<<BITS_PER_PASS),
+			//	if you change this, change nPerWI in kernel as well
+			NUM_WGS = 20*6,	//	cypress
+//			NUM_WGS = 24*6,	//	cayman
+//			NUM_WGS = 32*4,	//	nv
+		};
+
+
+private:
+		//Not copyable: a compiler-generated copy would share the raw pointers and kernels and
+		//both objects would free them in their destructors. Declared, intentionally never defined.
+		btRadixSort32CL(const btRadixSort32CL& other);
+		btRadixSort32CL& operator=(const btRadixSort32CL& other);
+
+public:
+
+		///initialCapacity optionally pre-sizes the scratch arrays (in elements)
+		btRadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity =0);
+
+		virtual ~btRadixSort32CL();
+
+		///separate key and value arrays; the implementation in btRadixSort32CL.cpp has an empty body
+		void execute(btOpenCLArray<unsigned int>& keysIn, btOpenCLArray<unsigned int>& keysOut, btOpenCLArray<unsigned int>& valuesIn, 
+								btOpenCLArray<unsigned int>& valuesOut, int n, int sortBits = 32);
+
+		///keys only
+		void execute(btOpenCLArray<unsigned int>& keysInOut, int sortBits  = 32 );
+
+		///key/value pairs, sorted in place on the OpenCL device
+		void execute(btOpenCLArray<btSortData>& keyValuesInOut, int sortBits  = 32 );
+		///key/value pairs, sorted on the host; the OpenCL array version copies to the host and back
+		void executeHost(btOpenCLArray<btSortData>& keyValuesInOut, int sortBits = 32);
+		void executeHost(btAlignedObjectArray<btSortData>& keyValuesInOut, int sortBits = 32);
+
+};
+#endif //BT_RADIXSORT32_H
+
diff --git a/opencl/parallel_primitives/host/btScalar.h b/opencl/parallel_primitives/host/btScalar.h
new file mode 100644
index 000000000..3a94054e9
--- /dev/null
+++ b/opencl/parallel_primitives/host/btScalar.h
@@ -0,0 +1,660 @@
+/*
+Copyright (c) 2003-2009 Erwin Coumans  http://bullet.googlecode.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+
+#ifndef BT_SCALAR_H
+#define BT_SCALAR_H
+
+#ifdef BT_MANAGED_CODE
+//Aligned data types not supported in managed code
+#pragma unmanaged
+#endif
+
+
+#include <math.h>
+#include <stdlib.h>//size_t for MSVC 6.0
+#include <float.h>
+
+/* SVN $Revision$ on $Date$ from http://bullet.googlecode.com*/
+#define BT_BULLET_VERSION 281
+
+///Returns the value of the BT_BULLET_VERSION macro
+inline int	btGetVersion()
+{
+	const int version = BT_BULLET_VERSION;
+	return version;
+}
+
+#if defined(DEBUG) || defined (_DEBUG)
+#define BT_DEBUG
+#endif
+
+
+#ifdef _WIN32
+
+		#if defined(__MINGW32__) || defined(__CYGWIN__) || (defined (_MSC_VER) && _MSC_VER < 1300)
+
+			#define SIMD_FORCE_INLINE inline
+			#define ATTRIBUTE_ALIGNED16(a) a
+			#define ATTRIBUTE_ALIGNED64(a) a
+			#define ATTRIBUTE_ALIGNED128(a) a
+		#else
+			//#define BT_HAS_ALIGNED_ALLOCATOR
+			#pragma warning(disable : 4324) // disable padding warning
+//			#pragma warning(disable:4530) // Disable the exception disable but used in MSCV Stl warning.
+//			#pragma warning(disable:4996) //Turn off warnings about deprecated C routines
+//			#pragma warning(disable:4786) // Disable the "debug name too long" warning
+
+			#define SIMD_FORCE_INLINE __forceinline
+			#define ATTRIBUTE_ALIGNED16(a) __declspec(align(16)) a
+			#define ATTRIBUTE_ALIGNED64(a) __declspec(align(64)) a
+			#define ATTRIBUTE_ALIGNED128(a) __declspec (align(128)) a
+		#ifdef _XBOX
+			#define BT_USE_VMX128
+
+			#include <ppcintrinsics.h>
+ 			#define BT_HAVE_NATIVE_FSEL
+ 			#define btFsel(a,b,c) __fsel((a),(b),(c))
+		#else
+
+#if (defined (_WIN32) && (_MSC_VER) && _MSC_VER >= 1400) && (!defined (BT_USE_DOUBLE_PRECISION))
+			#define BT_USE_SSE
+			#ifdef BT_USE_SSE
+			//BT_USE_SSE_IN_API is disabled under Windows by default, because 
+			//it makes it harder to integrate Bullet into your application under Windows 
+			//(structures embedding Bullet structs/classes need to be 16-byte aligned)
+			//with relatively little performance gain
+			//If you are not embedding Bullet data in your classes, or you make sure that you align those classes on 16-byte boundaries,
+			//you can manually enable this line or set it in the build system for a bit of performance gain (a few percent, dependent on usage)
+			//#define BT_USE_SSE_IN_API
+			#endif //BT_USE_SSE
+			#include <emmintrin.h>
+#endif
+
+		#endif//_XBOX
+
+		#endif //__MINGW32__
+
+#ifdef BT_DEBUG
+	#ifdef _MSC_VER
+		#include <stdio.h>
+		#define btAssert(x) { if(!(x)){printf("Assert "__FILE__ ":%u ("#x")\n", __LINE__);__debugbreak();	}}
+	#else//_MSC_VER
+		#include <assert.h>
+		#define btAssert assert
+	#endif//_MSC_VER
+#else
+		#define btAssert(x)
+#endif
+		//btFullAssert is optional, slows down a lot
+		#define btFullAssert(x)
+
+		#define btLikely(_c)  _c
+		#define btUnlikely(_c) _c
+
+#else
+	
+#if defined	(__CELLOS_LV2__)
+		#define SIMD_FORCE_INLINE inline __attribute__((always_inline))
+		#define ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16)))
+		#define ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64)))
+		#define ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128)))
+		#ifndef assert
+		#include <assert.h>
+		#endif
+#ifdef BT_DEBUG
+#ifdef __SPU__
+#include <spu_printf.h>
+#define printf spu_printf
+	#define btAssert(x) {if(!(x)){printf("Assert "__FILE__ ":%u ("#x")\n", __LINE__);spu_hcmpeq(0,0);}}
+#else
+	#define btAssert assert
+#endif
+	
+#else
+		#define btAssert(x)
+#endif
+		//btFullAssert is optional, slows down a lot
+		#define btFullAssert(x)
+
+		#define btLikely(_c)  _c
+		#define btUnlikely(_c) _c
+
+#else
+
+#ifdef USE_LIBSPE2
+
+		#define SIMD_FORCE_INLINE __inline
+		#define ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16)))
+		#define ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64)))
+		#define ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128)))
+		#ifndef assert
+		#include <assert.h>
+		#endif
+#ifdef BT_DEBUG
+		#define btAssert assert
+#else
+		#define btAssert(x)
+#endif
+		//btFullAssert is optional, slows down a lot
+		#define btFullAssert(x)
+
+
+		#define btLikely(_c)   __builtin_expect((_c), 1)
+		#define btUnlikely(_c) __builtin_expect((_c), 0)
+		
+
+#else
+	//non-windows systems
+
+#if (defined (__APPLE__) && (!defined (BT_USE_DOUBLE_PRECISION)))
+    #if defined (__i386__) || defined (__x86_64__)
+        #define BT_USE_SSE
+		//BT_USE_SSE_IN_API is enabled on Mac OSX by default, because memory is automatically aligned on 16-byte boundaries
+		//if apps run into issues, we will disable the next line
+		#define BT_USE_SSE_IN_API
+        #ifdef BT_USE_SSE
+            // include appropriate SSE level
+            #if defined (__SSE4_1__)
+                #include <smmintrin.h>
+            #elif defined (__SSSE3__)
+                #include <tmmintrin.h>
+            #elif defined (__SSE3__)
+                #include <pmmintrin.h>
+            #else
+                #include <emmintrin.h>
+            #endif
+        #endif //BT_USE_SSE
+    #elif defined( __armv7__ )
+        #ifdef __clang__
+            #define BT_USE_NEON 1
+
+            #if defined BT_USE_NEON && defined (__clang__)
+                #include <arm_neon.h>
+            #endif//BT_USE_NEON
+       #endif //__clang__
+    #endif//__arm__
+
+	#define SIMD_FORCE_INLINE inline __attribute__ ((always_inline))
+///@todo: check out alignment methods for other platforms/compilers
+	#define ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16)))
+	#define ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64)))
+	#define ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128)))
+	#ifndef assert
+	#include <assert.h>
+	#endif
+
+	#if defined(DEBUG) || defined (_DEBUG)
+	 #if defined (__i386__) || defined (__x86_64__)
+	#include <stdio.h>
+	 #define btAssert(x)\
+	{\
+	if(!(x))\
+	{\
+		printf("Assert %s in line %d, file %s\n",#x, __LINE__, __FILE__);\
+		asm volatile ("int3");\
+	}\
+	}
+	#else//defined (__i386__) || defined (__x86_64__)
+		#define btAssert assert
+	#endif//defined (__i386__) || defined (__x86_64__)
+	#else//defined(DEBUG) || defined (_DEBUG)
+		#define btAssert(x)
+	#endif//defined(DEBUG) || defined (_DEBUG)
+
+	//btFullAssert is optional, slows down a lot
+	#define btFullAssert(x)
+	#define btLikely(_c)  _c
+	#define btUnlikely(_c) _c
+
+#else
+
+		#define SIMD_FORCE_INLINE inline
+		///@todo: check out alignment methods for other platforms/compilers
+		///#define ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16)))
+		///#define ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64)))
+		///#define ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128)))
+		#define ATTRIBUTE_ALIGNED16(a) a
+		#define ATTRIBUTE_ALIGNED64(a) a
+		#define ATTRIBUTE_ALIGNED128(a) a
+		#ifndef assert
+		#include <assert.h>
+		#endif
+
+#if defined(DEBUG) || defined (_DEBUG)
+		#define btAssert assert
+#else
+		#define btAssert(x)
+#endif
+
+		//btFullAssert is optional, slows down a lot
+		#define btFullAssert(x)
+		#define btLikely(_c)  _c
+		#define btUnlikely(_c) _c
+#endif //__APPLE__ 
+
+#endif // LIBSPE2
+
+#endif	//__CELLOS_LV2__
+#endif
+
+
+///The btScalar type abstracts floating point numbers, to easily switch between double and single floating point precision.
+#if defined(BT_USE_DOUBLE_PRECISION)
+typedef double btScalar;
+//this number could be bigger in double precision
+#define BT_LARGE_FLOAT 1e30
+#else
+typedef float btScalar;
+//keep BT_LARGE_FLOAT*BT_LARGE_FLOAT < FLT_MAX
+#define BT_LARGE_FLOAT 1e18f
+#endif
+
+#ifdef BT_USE_SSE
+typedef __m128 btSimdFloat4;
+#endif//BT_USE_SSE
+
+#if defined BT_USE_SSE_IN_API && defined (BT_USE_SSE)
+#ifdef _WIN32
+
+#ifndef BT_NAN
+static int btNanMask = 0x7F800001;
+#define BT_NAN (*(float*)&btNanMask)
+#endif
+
+#ifndef BT_INFINITY
+static  int btInfinityMask = 0x7F800000;
+#define BT_INFINITY (*(float*)&btInfinityMask)
+#endif
+
+///Component-wise sum of two SSE float vectors (_mm_add_ps)
+inline __m128 operator + (const __m128 A, const __m128 B)
+{
+    const __m128 sum = _mm_add_ps(A, B);
+    return sum;
+}
+
+inline __m128 operator - (const __m128 A, const __m128 B)
+{
+    return _mm_sub_ps(A, B);
+}
+
+inline __m128 operator * (const __m128 A, const __m128 B)
+{
+    return _mm_mul_ps(A, B);
+}
+
+#define btCastfTo128i(a) (_mm_castps_si128(a))
+#define btCastfTo128d(a) (_mm_castps_pd(a))
+#define btCastiTo128f(a) (_mm_castsi128_ps(a))
+#define btCastdTo128f(a) (_mm_castpd_ps(a))
+#define btCastdTo128i(a) (_mm_castpd_si128(a))
+#define btAssign128(r0,r1,r2,r3) _mm_setr_ps(r0,r1,r2,r3)
+
+#else//_WIN32
+
+#define btCastfTo128i(a) ((__m128i)(a))
+#define btCastfTo128d(a) ((__m128d)(a))
+#define btCastiTo128f(a)  ((__m128) (a))
+#define btCastdTo128f(a) ((__m128) (a))
+#define btCastdTo128i(a) ((__m128i)(a))
+#define btAssign128(r0,r1,r2,r3) (__m128){r0,r1,r2,r3}
+#define BT_INFINITY INFINITY
+#define BT_NAN NAN
+#endif//_WIN32
+#endif //BT_USE_SSE_IN_API
+
+//NEON counterpart of the SSE definitions: btSimdFloat4 is float32x4_t, and BT_INFINITY,
+//BT_NAN and btAssign128 are provided under the same names.
+//NOTE(review): BT_INFINITY and BT_NAN are defined here without an #ifndef guard - confirm
+//this block is never active together with the SSE-in-API block.
+#ifdef BT_USE_NEON
+#include <arm_neon.h>
+
+typedef float32x4_t btSimdFloat4;
+#define BT_INFINITY INFINITY
+#define BT_NAN NAN
+#define btAssign128(r0,r1,r2,r3) (float32x4_t){r0,r1,r2,r3}
+#endif
+
+
+
+
+
+///Place BT_DECLARE_ALIGNED_ALLOCATOR() inside a class body to give that class its own
+///operator new/delete (scalar and array forms). Allocation goes through btAlignedAlloc with
+///an alignment argument of 16 and is released with btAlignedFree; the placement forms
+///return the supplied pointer unchanged and their matching deletes do nothing.
+///The final macro line deliberately has no trailing backslash, so the macro can never
+///swallow the line that follows it.
+#define BT_DECLARE_ALIGNED_ALLOCATOR() \
+   SIMD_FORCE_INLINE void* operator new(size_t sizeInBytes)   { return btAlignedAlloc(sizeInBytes,16); }   \
+   SIMD_FORCE_INLINE void  operator delete(void* ptr)         { btAlignedFree(ptr); }   \
+   SIMD_FORCE_INLINE void* operator new(size_t, void* ptr)   { return ptr; }   \
+   SIMD_FORCE_INLINE void  operator delete(void*, void*)      { }   \
+   SIMD_FORCE_INLINE void* operator new[](size_t sizeInBytes)   { return btAlignedAlloc(sizeInBytes,16); }   \
+   SIMD_FORCE_INLINE void  operator delete[](void* ptr)         { btAlignedFree(ptr); }   \
+   SIMD_FORCE_INLINE void* operator new[](size_t, void* ptr)   { return ptr; }   \
+   SIMD_FORCE_INLINE void  operator delete[](void*, void*)      { }
+
+
+
+#if defined(BT_USE_DOUBLE_PRECISION) || defined(BT_FORCE_DOUBLE_FUNCTIONS)
+		
+///Double-precision math wrappers: each one forwards to the <math.h> function of the same name.
+SIMD_FORCE_INLINE btScalar btSqrt(btScalar x)
+{
+	return sqrt(x);
+}
+SIMD_FORCE_INLINE btScalar btFabs(btScalar x)
+{
+	return fabs(x);
+}
+SIMD_FORCE_INLINE btScalar btCos(btScalar x)
+{
+	return cos(x);
+}
+SIMD_FORCE_INLINE btScalar btSin(btScalar x)
+{
+	return sin(x);
+}
+SIMD_FORCE_INLINE btScalar btTan(btScalar x)
+{
+	return tan(x);
+}
+///Arc cosine; the argument is clamped to [-1, 1], the domain of acos(), before the call.
+SIMD_FORCE_INLINE btScalar btAcos(btScalar x)
+{
+	const btScalar clamped = (x < btScalar(-1)) ? btScalar(-1) : ((x > btScalar(1)) ? btScalar(1) : x);
+	return acos(clamped);
+}
+///Arc sine; the argument is clamped to [-1, 1], the domain of asin(), before the call.
+SIMD_FORCE_INLINE btScalar btAsin(btScalar x)
+{
+	const btScalar clamped = (x < btScalar(-1)) ? btScalar(-1) : ((x > btScalar(1)) ? btScalar(1) : x);
+	return asin(clamped);
+}
+SIMD_FORCE_INLINE btScalar btAtan(btScalar x)
+{
+	return atan(x);
+}
+///Arguments are forwarded positionally: the first parameter (named x here) is atan2's first argument.
+SIMD_FORCE_INLINE btScalar btAtan2(btScalar x, btScalar y)
+{
+	return atan2(x, y);
+}
+SIMD_FORCE_INLINE btScalar btExp(btScalar x)
+{
+	return exp(x);
+}
+SIMD_FORCE_INLINE btScalar btLog(btScalar x)
+{
+	return log(x);
+}
+SIMD_FORCE_INLINE btScalar btPow(btScalar x,btScalar y)
+{
+	return pow(x,y);
+}
+SIMD_FORCE_INLINE btScalar btFmod(btScalar x,btScalar y)
+{
+	return fmod(x,y);
+}
+
+#else
+		
+///Single-precision square root.
+///Forwards to sqrtf() unless USE_APPROXIMATION is defined. In that case an initial guess for
+///1/sqrt(y) is produced by integer arithmetic on the upper 32 bits of a double holding y,
+///refined with five iterations of x = 1.5*x - x*x*x*(y/2), and the result is x*y.
+SIMD_FORCE_INLINE btScalar btSqrt(btScalar y) 
+{ 
+#ifdef USE_APPROXIMATION
+    double x, z, tempf;
+    // Address the double as two 32-bit words. This used to be `unsigned long*`, which is
+    // 8 bytes wide on LP64 platforms, so `+ 1` pointed past the end of tempf there.
+    // `+ 1` selects the word holding sign/exponent only on little-endian machines.
+    unsigned int *tfptr = ((unsigned int *)&tempf) + 1;
+
+	tempf = y;
+	*tfptr = (0xbfcdd90a - *tfptr)>>1; /* estimate of 1/sqrt(y) */
+	x =  tempf;
+	z =  y*btScalar(0.5);
+	x = (btScalar(1.5)*x)-(x*x)*(x*z);         /* iteration formula     */
+	x = (btScalar(1.5)*x)-(x*x)*(x*z);
+	x = (btScalar(1.5)*x)-(x*x)*(x*z);
+	x = (btScalar(1.5)*x)-(x*x)*(x*z);
+	x = (btScalar(1.5)*x)-(x*x)*(x*z);
+	return x*y;
+#else
+	return sqrtf(y); 
+#endif
+}
+///Single-precision wrappers: each one forwards to the float variant of the <math.h> function.
+SIMD_FORCE_INLINE btScalar btFabs(btScalar x)
+{
+	return fabsf(x);
+}
+SIMD_FORCE_INLINE btScalar btCos(btScalar x)
+{
+	return cosf(x);
+}
+SIMD_FORCE_INLINE btScalar btSin(btScalar x)
+{
+	return sinf(x);
+}
+SIMD_FORCE_INLINE btScalar btTan(btScalar x)
+{
+	return tanf(x);
+}
+///Arc cosine; the argument is clamped to [-1, 1], the domain of acosf(), before the call.
+SIMD_FORCE_INLINE btScalar btAcos(btScalar x)
+{
+	const btScalar lowest = btScalar(-1);
+	const btScalar highest = btScalar(1);
+	const btScalar clamped = (x < lowest) ? lowest : ((x > highest) ? highest : x);
+	return acosf(clamped);
+}
+///Arc sine; the argument is clamped to [-1, 1], the domain of asinf(), before the call.
+SIMD_FORCE_INLINE btScalar btAsin(btScalar x)
+{
+	const btScalar lowest = btScalar(-1);
+	const btScalar highest = btScalar(1);
+	const btScalar clamped = (x < lowest) ? lowest : ((x > highest) ? highest : x);
+	return asinf(clamped);
+}
+SIMD_FORCE_INLINE btScalar btAtan(btScalar x)
+{
+	return atanf(x);
+}
+///Arguments are forwarded positionally: the first parameter (named x here) is atan2f's first argument.
+SIMD_FORCE_INLINE btScalar btAtan2(btScalar x, btScalar y)
+{
+	return atan2f(x, y);
+}
+SIMD_FORCE_INLINE btScalar btExp(btScalar x)
+{
+	return expf(x);
+}
+SIMD_FORCE_INLINE btScalar btLog(btScalar x)
+{
+	return logf(x);
+}
+SIMD_FORCE_INLINE btScalar btPow(btScalar x,btScalar y)
+{
+	return powf(x,y);
+}
+SIMD_FORCE_INLINE btScalar btFmod(btScalar x,btScalar y)
+{
+	return fmodf(x,y);
+}
+	
+#endif
+
+//Angle constants as btScalar. SIMD_2_PI is 2*pi; the others are derived from it.
+#define SIMD_2_PI         btScalar(6.283185307179586232)
+#define SIMD_PI           (SIMD_2_PI * btScalar(0.5))
+#define SIMD_HALF_PI      (SIMD_2_PI * btScalar(0.25))
+//Multiply degrees by SIMD_RADS_PER_DEG to get radians, radians by SIMD_DEGS_PER_RAD to get degrees.
+#define SIMD_RADS_PER_DEG (SIMD_2_PI / btScalar(360.0))
+#define SIMD_DEGS_PER_RAD  (btScalar(360.0) / SIMD_2_PI)
+//Square root of one half (0.7071...).
+#define SIMDSQRT12 btScalar(0.7071067811865475244008443621048490)
+
+#define btRecipSqrt(x) ((btScalar)(btScalar(1.0)/btSqrt(btScalar(x))))		/* reciprocal square root */
+
+
+//SIMD_EPSILON is the machine epsilon of btScalar's underlying type.
+//SIMD_INFINITY is that type's largest finite value (DBL_MAX / FLT_MAX), not an IEEE infinity.
+#ifdef BT_USE_DOUBLE_PRECISION
+#define SIMD_EPSILON      DBL_EPSILON
+#define SIMD_INFINITY     DBL_MAX
+#else
+#define SIMD_EPSILON      FLT_EPSILON
+#define SIMD_INFINITY     FLT_MAX
+#endif
+
+///Approximate atan2(y, x) built from a ratio of sums and differences of x and |y|,
+///avoiding a call into the math library. The result carries the sign of y.
+///For x == 0 and y == 0 it returns 0 instead of evaluating 0/0 (which produced NaN).
+SIMD_FORCE_INLINE btScalar btAtan2Fast(btScalar y, btScalar x) 
+{
+	btScalar coeff_1 = SIMD_PI / 4.0f;
+	btScalar coeff_2 = 3.0f * coeff_1;
+	btScalar abs_y = btFabs(y);
+	btScalar angle;
+	if (x >= 0.0f) {
+		btScalar denom = x + abs_y;
+		// Both terms are non-negative here, so the sum is zero only when x and y are both zero.
+		if (denom == 0.0f) {
+			return btScalar(0.0);
+		}
+		btScalar r = (x - abs_y) / denom;
+		angle = coeff_1 - coeff_1 * r;
+	} else {
+		// x < 0 makes abs_y - x strictly positive, so no guard is needed on this side.
+		btScalar r = (x + abs_y) / (abs_y - x);
+		angle = coeff_2 - coeff_1 * r;
+	}
+	return (y < 0.0f) ? -angle : angle;
+}
+
+///True when the magnitude of x is strictly below SIMD_EPSILON.
+SIMD_FORCE_INLINE bool      btFuzzyZero(btScalar x)
+{
+	const btScalar magnitude = btFabs(x);
+	return magnitude < SIMD_EPSILON;
+}
+
+///True when a lies within [-eps, eps]. The two comparisons are kept in their original
+///form so that a NaN argument still yields false.
+SIMD_FORCE_INLINE bool	btEqual(btScalar a, btScalar eps)
+{
+	const bool notAbove = (a <= eps);
+	const bool below = (a < -eps);
+	return notAbove && !below;
+}
+///True when a is NOT less than or equal to eps, i.e. a > eps (a NaN argument also yields true).
+SIMD_FORCE_INLINE bool	btGreaterEqual (btScalar a, btScalar eps)
+{
+	const bool notAbove = (a <= eps);
+	return !notAbove;
+}
+
+
+///Returns 1 when x is strictly less than zero, 0 otherwise.
+SIMD_FORCE_INLINE int       btIsNegative(btScalar x) {
+    if (x < btScalar(0.0))
+        return 1;
+    return 0;
+}
+
+///Converts an angle in degrees to radians.
+SIMD_FORCE_INLINE btScalar btRadians(btScalar x)
+{
+	return x * SIMD_RADS_PER_DEG;
+}
+///Converts an angle in radians to degrees.
+SIMD_FORCE_INLINE btScalar btDegrees(btScalar x)
+{
+	return x * SIMD_DEGS_PER_RAD;
+}
+
+//Declares a handle type `name` as a pointer to a dedicated struct name__ whose single
+//member is never used. Each handle name therefore gets its own distinct pointer type.
+#define BT_DECLARE_HANDLE(name) typedef struct name##__ { int unused; } *name
+
+//Floating-point select: yields b when a >= 0 and c otherwise (including when a is NaN).
+//The function is only compiled when btFsel has not already been defined as a macro.
+#ifndef btFsel
+SIMD_FORCE_INLINE btScalar btFsel(btScalar a, btScalar b, btScalar c)
+{
+	if (a >= 0)
+	{
+		return b;
+	}
+	return c;
+}
+#endif
+//Same selection with the result cast to btScalar.
+#define btFsels(a,b,c) (btScalar)btFsel(a,b,c)
+
+
+///Run-time endianness probe: stores the integer 1 and inspects the byte at its lowest address.
+SIMD_FORCE_INLINE bool btMachineIsLittleEndian()
+{
+   long int probe = 1;
+   const char *firstByte = (const char *) &probe;
+   // On a little-endian machine the least significant byte comes first in memory.
+   return firstByte[0] == 1;
+}
+
+
+
+///btSelect picks one of two values without branching, which makes performance much better for consoles like Playstation 3 and XBox 360
+///Thanks Phil Knight. See also http://www.cellperformance.com/articles/2006/04/more_techniques_for_eliminatin_1.html
+SIMD_FORCE_INLINE unsigned btSelect(unsigned condition, unsigned valueIfConditionNonZero, unsigned valueIfConditionZero) 
+{
+    // A nonzero value or'ed with its negation has the sign bit set; zero or'ed with its
+    // negation (still zero) does not. The arithmetic shift right by 31 spreads that sign
+    // bit over the whole word, giving an all-ones mask for nonzero and all-zeros for zero.
+    const int signedCondition = (int)condition;
+    const unsigned maskIfNonZero = (unsigned)((signedCondition | -signedCondition) >> 31);
+    const unsigned maskIfZero = ~maskIfNonZero;
+    const unsigned fromNonZero = valueIfConditionNonZero & maskIfNonZero;
+    const unsigned fromZero = valueIfConditionZero & maskIfZero;
+    return (fromNonZero | fromZero);
+}
+///Signed-int overload of the branch-free select; uses the same sign-bit mask technique.
+SIMD_FORCE_INLINE int btSelect(unsigned condition, int valueIfConditionNonZero, int valueIfConditionZero)
+{
+    const int signedCondition = (int)condition;
+    // All ones when condition is nonzero, all zeros when it is zero.
+    const unsigned maskIfNonZero = (unsigned)((signedCondition | -signedCondition) >> 31);
+    const unsigned maskIfZero = ~maskIfNonZero;
+    const unsigned fromNonZero = valueIfConditionNonZero & maskIfNonZero;
+    const unsigned fromZero = valueIfConditionZero & maskIfZero;
+    return static_cast<int>(fromNonZero | fromZero);
+}
+///Float overload of btSelect. With BT_HAVE_NATIVE_FSEL it goes through btFsel, otherwise
+///it is a plain comparison of condition against zero.
+SIMD_FORCE_INLINE float btSelect(unsigned condition, float valueIfConditionNonZero, float valueIfConditionZero)
+{
+#ifdef BT_HAVE_NATIVE_FSEL
+    // condition - 1 is >= 0 exactly when condition is at least 1, i.e. nonzero.
+    const btScalar selector = (btScalar)condition - btScalar(1.0f);
+    return (float)btFsel(selector, valueIfConditionNonZero, valueIfConditionZero);
+#else
+    if (condition == 0)
+        return valueIfConditionZero;
+    return valueIfConditionNonZero;
+#endif
+}
+
+///Exchanges the values of a and b through one temporary copy.
+template<typename T> SIMD_FORCE_INLINE void btSwap(T& a, T& b)
+{
+	T previousA = a;	// keep a's value before it is overwritten
+	a = b;
+	b = previousA;
+}
+
+
+//PCK: endian swapping functions
+///Reverses the byte order of a 32-bit value.
+SIMD_FORCE_INLINE unsigned btSwapEndian(unsigned val)
+{
+	const unsigned fromByte3 = (val & 0xff000000) >> 24;
+	const unsigned fromByte2 = (val & 0x00ff0000) >> 8;
+	const unsigned fromByte1 = (val & 0x0000ff00) << 8;
+	const unsigned fromByte0 = (val & 0x000000ff) << 24;
+	return (fromByte3 | fromByte2 | fromByte1 | fromByte0);
+}
+
+///Exchanges the two bytes of a 16-bit value.
+SIMD_FORCE_INLINE unsigned short btSwapEndian(unsigned short val)
+{
+	const int highToLow = (val & 0xff00) >> 8;
+	const int lowToHigh = (val & 0x00ff) << 8;
+	return static_cast<unsigned short>(highToLow | lowToHigh);
+}
+
+///Signed overload: the bits are treated as unsigned and the result is returned as unsigned.
+SIMD_FORCE_INLINE unsigned btSwapEndian(int val)
+{
+	const unsigned bits = (unsigned)val;
+	return btSwapEndian(bits);
+}
+
+///Signed 16-bit overload: the bits are treated as unsigned short and returned as unsigned short.
+SIMD_FORCE_INLINE unsigned short btSwapEndian(short val)
+{
+	const unsigned short bits = (unsigned short) val;
+	return btSwapEndian(bits);
+}
+
+///btSwapEndianFloat uses char pointers to swap the endianness.
+///btSwapEndianFloat/btSwapEndianDouble do NOT return a float, because the machine might 'correct' invalid floating point values.
+///Not all values of sign/exponent/mantissa are valid floating point numbers according to IEEE 754.
+///When a floating point unit is faced with an invalid value, it may actually change the value, or worse, throw an exception.
+///In most systems, running user mode code, you wouldn't get an exception, but instead the hardware/os/runtime will 'fix' the number for you.
+///So instead of returning a float/double, the swapped bytes are returned as an integer (float) or written to a byte buffer (double).
+SIMD_FORCE_INLINE unsigned int  btSwapEndianFloat(float d)
+{
+    unsigned int swapped = 0;
+    unsigned char *out = (unsigned char *)&swapped;
+    unsigned char *in = (unsigned char *)&d;
+
+    // Byte i of the result is byte 3-i of the input.
+    for (int i = 0; i < 4; ++i)
+    {
+        out[i] = in[3 - i];
+    }
+    return swapped;
+}
+
+// Inverse of btSwapEndianFloat: reverses the four bytes of `a` and returns them as a float.
+SIMD_FORCE_INLINE float btUnswapEndianFloat(unsigned int a) 
+{
+    float restored = 0.0f;
+    unsigned char *in = (unsigned char *)&a;
+    unsigned char *out = (unsigned char *)&restored;
+
+    // Byte i of the result is byte 3-i of the input.
+    for (int i = 0; i < 4; ++i)
+    {
+        out[i] = in[3 - i];
+    }
+
+    return restored;
+}
+
+
+// Writes the eight bytes of `d` to dst in reverse order. dst must have room for 8 bytes.
+SIMD_FORCE_INLINE void  btSwapEndianDouble(double d, unsigned char* dst)
+{
+    unsigned char *in = (unsigned char *)&d;
+
+    // Byte i of the output is byte 7-i of the input.
+    for (int i = 0; i < 8; ++i)
+    {
+        dst[i] = in[7 - i];
+    }
+}
+
+// Inverse of btSwapEndianDouble: reads eight bytes from src in reverse order into a double.
+SIMD_FORCE_INLINE double btUnswapEndianDouble(const unsigned char *src) 
+{
+    double restored = 0.0;
+    unsigned char *out = (unsigned char *)&restored;
+
+    // Byte i of the result is byte 7-i of the input.
+    for (int i = 0; i < 8; ++i)
+    {
+        out[i] = src[7 - i];
+    }
+
+	return restored;
+}
+
+// returns normalized value in range [-SIMD_PI, SIMD_PI]
+SIMD_FORCE_INLINE btScalar btNormalizeAngle(btScalar angleInRadians) 
+{
+	// btFmod leaves a remainder of magnitude below 2*pi; adding or subtracting one
+	// full turn then folds it into [-pi, pi].
+	const btScalar wrapped = btFmod(angleInRadians, SIMD_2_PI);
+	if (wrapped < -SIMD_PI)
+		return wrapped + SIMD_2_PI;
+	if (wrapped > SIMD_PI)
+		return wrapped - SIMD_2_PI;
+	return wrapped;
+}
+
+///Rudimentary class to provide type info: it stores an integer type tag supplied at construction.
+struct btTypedObject
+{
+	btTypedObject(int objectType)
+	{
+		m_objectType = objectType;
+	}
+	int	m_objectType;
+	///Returns the tag passed to the constructor (or later written to m_objectType).
+	inline int getObjectType() const
+	{
+		return m_objectType;
+	}
+};
+
+
+  
+///Rounds a pointer up to the next multiple of `alignment` (returns it unchanged when it is
+///already aligned). The mask arithmetic is only valid when alignment is a power of two.
+template <typename T>T* btAlignPointer(T* unalignedPtr, size_t alignment)
+{
+	// Overlay the pointer with an integer so its bits can be rounded arithmetically.
+	union
+	{
+		T* ptr;
+		size_t integer;
+	} bits;
+
+	bits.ptr = unalignedPtr;
+
+	// Add alignment-1, then clear the low bits.
+	const size_t lowBits = alignment - 1;
+	bits.integer = (bits.integer + lowBits) & ~lowBits;
+	return bits.ptr;
+}
+
+#endif //BT_SCALAR_H
diff --git a/opencl/parallel_primitives/host/premake4.lua b/opencl/parallel_primitives/host/premake4.lua
new file mode 100644
index 000000000..9aaa4692e
--- /dev/null
+++ b/opencl/parallel_primitives/host/premake4.lua
@@ -0,0 +1,26 @@
+-- Declares the static library project "OpenCL_lib_parallel_primitives_host_<vendor>".
+-- Nothing is declared unless findOpenCL(vendor) returns a truthy value.
+-- vendor: string that is appended to the project name and passed to findOpenCL/initOpenCL.
+function createProject(vendor)
+	-- Keep the probe result local: a bare assignment would create (or overwrite) a global
+	-- named hasCL that every other script loaded in the same premake run can see.
+	local hasCL = findOpenCL(vendor)
+	if not hasCL then
+		return
+	end
+
+	project ("OpenCL_lib_parallel_primitives_host_" .. vendor)
+
+	initOpenCL(vendor)
+
+	kind "StaticLib"
+	targetdir "../../../lib"
+	includedirs {
+		".",
+	}
+	files {
+		"**.cpp",
+		"**.h"
+	}
+end
+
+-- Try every vendor in turn, in the same order as before: AMD, Intel, NVIDIA, Apple.
+for _, vendor in ipairs({ "AMD", "Intel", "NVIDIA", "Apple" }) do
+	createProject(vendor)
+end
\ No newline at end of file
diff --git a/opencl/parallel_primitives/kernels/BoundSearchKernels.cl b/opencl/parallel_primitives/kernels/BoundSearchKernels.cl
new file mode 100644
index 000000000..f3b4a1e8a
--- /dev/null
+++ b/opencl/parallel_primitives/kernels/BoundSearchKernels.cl
@@ -0,0 +1,106 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+// Shorthand for the 32-bit unsigned type used for keys, values and indices below.
+typedef unsigned int u32;
+// Short names for the work-item queries in dimension 0 and for a local-memory barrier.
+// Only GET_GLOBAL_IDX is used by the kernels in this file.
+#define GET_GROUP_IDX get_group_id(0)
+#define GET_LOCAL_IDX get_local_id(0)
+#define GET_GLOBAL_IDX get_global_id(0)
+#define GET_GROUP_SIZE get_local_size(0)
+#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
+
+// One key/value pair. The search kernels below read only m_key.
+typedef struct
+{
+	u32 m_key; 
+	u32 m_value;
+}SortData;
+
+
+
+// Source/destination counts padded to four u32s. Not referenced by the kernels in this file,
+// which take nSrc and nDst as separate scalar arguments.
+typedef struct
+{
+	u32 m_nSrc;
+	u32 m_nDst;
+	u32 m_padding[2];
+} ConstBuffer;
+
+
+
+// For every position gIdx in src whose key differs from the key at gIdx-1, writes gIdx to
+// dst[key]. Position 0 has no predecessor and is compared against the sentinel key (u32)(-1),
+// so it is recorded unless its own key equals that sentinel.
+// NOTE(review): presumably src is sorted by key, making dst[key] the first index of each
+// run of equal keys - confirm against the host code.
+// NOTE(review): dst is indexed by key without a range check; assumes every key indexes
+// inside dst - confirm.
+// nDst is not used by this kernel; it stays in the argument list so the host-side argument
+// layout is unchanged. (The previous `gIdx==nSrc` end-sentinel that read it could never
+// be selected inside the `gIdx < nSrc` guard.)
+__attribute__((reqd_work_group_size(64,1,1)))
+__kernel
+void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, 
+					unsigned int nSrc, unsigned int nDst)
+{
+	// Unsigned, so the comparison with nSrc does not mix signed and unsigned operands.
+	u32 gIdx = GET_GLOBAL_IDX;
+
+	if( gIdx < nSrc )
+	{
+		u32 prevKey = (gIdx==0)? (u32)(-1): src[gIdx-1].m_key;
+		u32 currKey = src[gIdx].m_key;
+
+		if( prevKey != currKey )
+		{
+			dst[currKey] = gIdx;
+		}
+	}
+}
+
+
+// For every position gIdx in 1..nSrc whose predecessor key (src[gIdx-1]) differs from the
+// key at gIdx, writes gIdx to dst[predecessor key]. Position nSrc is past the end of src
+// and is compared against the sentinel key nDst instead.
+// NOTE(review): presumably src is sorted by key, making dst[key] one past the last index
+// of each run of equal keys - confirm against the host code.
+// NOTE(review): dst is indexed by key without a range check; assumes every key indexes
+// inside dst - confirm.
+__attribute__((reqd_work_group_size(64,1,1)))
+__kernel
+void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, 
+					unsigned int nSrc, unsigned int nDst)
+{
+	// Unsigned, so the comparison with nSrc+1 does not mix signed and unsigned operands.
+	u32 gIdx = GET_GLOBAL_IDX+1;
+
+	if( gIdx < nSrc+1 )
+	{
+		u32 prevKey = src[gIdx-1].m_key;
+		u32 currKey = (gIdx==nSrc)? nDst: src[gIdx].m_key;
+
+		if( prevKey != currKey )
+		{
+			dst[prevKey] = gIdx;
+		}
+	}
+}
+
+// Element-wise C[i] = A[i] - B[i] for i in [0, nDst). nSrc is not used.
+__attribute__((reqd_work_group_size(64,1,1)))
+__kernel
+void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, 
+					unsigned int nSrc, unsigned int nDst)
+{
+	int gIdx = GET_GLOBAL_IDX;
+
+	// Work-items beyond the last element have nothing to do.
+	if( gIdx >= nDst )
+	{
+		return;
+	}
+
+	u32 minuend = A[gIdx];
+	u32 subtrahend = B[gIdx];
+	C[gIdx] = minuend - subtrahend;
+}
+
diff --git a/opencl/parallel_primitives/kernels/BoundSearchKernelsCL.h b/opencl/parallel_primitives/kernels/BoundSearchKernelsCL.h
new file mode 100644
index 000000000..bf802e9fe
--- /dev/null
+++ b/opencl/parallel_primitives/kernels/BoundSearchKernelsCL.h
@@ -0,0 +1,110 @@
+//This file is autogenerated from BoundSearchKernels.cl using stringify.bat (premake --stringify) in the build folder of this project. Do not edit it by hand; change the .cl file and regenerate.
+static const char* boundSearchKernelsCL= \
+"/*\n"
+"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
+"\n"
+"This software is provided 'as-is', without any express or implied warranty.\n"
+"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+"Permission is granted to anyone to use this software for any purpose, \n"
+"including commercial applications, and to alter it and redistribute it freely, \n"
+"subject to the following restrictions:\n"
+"\n"
+"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+"3. This notice may not be removed or altered from any source distribution.\n"
+"*/\n"
+"//Originally written by Takahiro Harada\n"
+"\n"
+"\n"
+"typedef unsigned int u32;\n"
+"#define GET_GROUP_IDX get_group_id(0)\n"
+"#define GET_LOCAL_IDX get_local_id(0)\n"
+"#define GET_GLOBAL_IDX get_global_id(0)\n"
+"#define GET_GROUP_SIZE get_local_size(0)\n"
+"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
+"\n"
+"typedef struct\n"
+"{\n"
+"	u32 m_key; \n"
+"	u32 m_value;\n"
+"}SortData;\n"
+"\n"
+"\n"
+"\n"
+"typedef struct\n"
+"{\n"
+"	u32 m_nSrc;\n"
+"	u32 m_nDst;\n"
+"	u32 m_padding[2];\n"
+"} ConstBuffer;\n"
+"\n"
+"\n"
+"\n"
+"__attribute__((reqd_work_group_size(64,1,1)))\n"
+"__kernel\n"
+"void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, \n"
+"					unsigned int nSrc, unsigned int nDst)\n"
+"{\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"\n"
+"	if( gIdx < nSrc )\n"
+"	{\n"
+"		SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1);\n"
+"		SortData end; end.m_key = nDst; end.m_value = nDst;\n"
+"\n"
+"		SortData iData = (gIdx==0)? first: src[gIdx-1];\n"
+"		SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
+"\n"
+"		if( iData.m_key != jData.m_key )\n"
+"		{\n"
+"//			for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)\n"
+"			u32 k = jData.m_key;\n"
+"			{\n"
+"				dst[k] = gIdx;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"}\n"
+"\n"
+"\n"
+"__attribute__((reqd_work_group_size(64,1,1)))\n"
+"__kernel\n"
+"void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, \n"
+"					unsigned int nSrc, unsigned int nDst)\n"
+"{\n"
+"	int gIdx = GET_GLOBAL_IDX+1;\n"
+"\n"
+"	if( gIdx < nSrc+1 )\n"
+"	{\n"
+"		SortData first; first.m_key = 0; first.m_value = 0;\n"
+"		SortData end; end.m_key = nDst; end.m_value = nDst;\n"
+"\n"
+"		SortData iData = src[gIdx-1];\n"
+"		SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
+"\n"
+"		if( iData.m_key != jData.m_key )\n"
+"		{\n"
+"			u32 k = iData.m_key;\n"
+"			{\n"
+"				dst[k] = gIdx;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"}\n"
+"\n"
+"__attribute__((reqd_work_group_size(64,1,1)))\n"
+"__kernel\n"
+"void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, \n"
+"					unsigned int nSrc, unsigned int nDst)\n"
+"{\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"	\n"
+"\n"
+"	if( gIdx < nDst )\n"
+"	{\n"
+"		C[gIdx] = A[gIdx] - B[gIdx];\n"
+"	}\n"
+"}\n"
+"\n"
+"\n"
+;
diff --git a/opencl/parallel_primitives/kernels/CopyKernels.cl b/opencl/parallel_primitives/kernels/CopyKernels.cl
new file mode 100644
index 000000000..2eee5752e
--- /dev/null
+++ b/opencl/parallel_primitives/kernels/CopyKernels.cl
@@ -0,0 +1,128 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+// Extensions requested for this program: AMD printf and 32-bit local-memory atomics.
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+
+typedef unsigned int u32;
+// Short names for work-item queries in dimension 0, local-memory barrier/fence and atomic
+// increment. The copy kernels in this file use only GET_GLOBAL_IDX.
+#define GET_GROUP_IDX get_group_id(0)
+#define GET_LOCAL_IDX get_local_id(0)
+#define GET_GLOBAL_IDX get_global_id(0)
+#define GET_GROUP_SIZE get_local_size(0)
+#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
+#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
+#define AtomInc(x) atom_inc(&(x))
+#define AtomInc1(x, out) out = atom_inc(&(x))
+
+// Each make_* macro expands to a vector-type cast, so make_int2(a, b) becomes (int2)(a, b).
+#define make_uint4 (uint4)
+#define make_uint2 (uint2)
+#define make_int2 (int2)
+
+// Kernel parameter block, passed by value. m_n is the element count each kernel compares
+// its indices against; m_padding fills the struct out to four ints.
+typedef struct
+{
+	int m_n;
+	int m_padding[3];
+} ConstBuffer;
+
+
+
+// Copies one float4 per work-item: dst[gIdx] = src[gIdx] for gIdx < cb.m_n.
+__kernel
+__attribute__((reqd_work_group_size(64,1,1)))
+void Copy1F4Kernel(__global float4* dst, __global float4* src, 
+					ConstBuffer cb)
+{
+	const int gIdx = GET_GLOBAL_IDX;
+
+	// Work-items beyond the last element have nothing to copy.
+	if( gIdx >= cb.m_n )
+	{
+		return;
+	}
+
+	dst[ gIdx ] = src[ gIdx ];
+}
+
+// Each work-item copies up to two consecutive float4 elements (indices 2*gIdx and 2*gIdx+1)
+// from src to dst. Every element index is tested against cb.m_n on its own, so no work-item
+// reads or writes at or beyond index cb.m_n, whether cb.m_n is odd or even.
+__kernel
+__attribute__((reqd_work_group_size(64,1,1)))
+void Copy2F4Kernel(__global float4* dst, __global float4* src, 
+					ConstBuffer cb)
+{
+	int gIdx = GET_GLOBAL_IDX;
+
+	int idx0 = gIdx*2+0;
+	int idx1 = gIdx*2+1;
+
+	if( idx0 < cb.m_n )
+	{
+		dst[ idx0 ] = src[ idx0 ];
+	}
+	if( idx1 < cb.m_n )
+	{
+		dst[ idx1 ] = src[ idx1 ];
+	}
+}
+
+// Each work-item copies up to four consecutive float4 elements (indices 4*gIdx .. 4*gIdx+3)
+// from src to dst. Every element index is tested against cb.m_n on its own, so no work-item
+// reads or writes at or beyond index cb.m_n, whatever the remainder of cb.m_n modulo 4 is.
+__kernel
+__attribute__((reqd_work_group_size(64,1,1)))
+void Copy4F4Kernel(__global float4* dst, __global float4* src, 
+					ConstBuffer cb)
+{
+	int gIdx = GET_GLOBAL_IDX;
+
+	for(int i=0; i<4; i++)
+	{
+		int idx = gIdx*4+i;
+
+		if( idx < cb.m_n )
+		{
+			dst[ idx ] = src[ idx ];
+		}
+	}
+}
+
+// Copies one float per work-item: dstF1[gIdx] = srcF1[gIdx] for gIdx < cb.m_n.
+__kernel
+__attribute__((reqd_work_group_size(64,1,1)))
+void CopyF1Kernel(__global float* dstF1, __global float* srcF1, 
+					ConstBuffer cb)
+{
+	const int gIdx = GET_GLOBAL_IDX;
+
+	// Work-items beyond the last element have nothing to copy.
+	if( gIdx >= cb.m_n )
+	{
+		return;
+	}
+
+	dstF1[ gIdx ] = srcF1[ gIdx ];
+}
+
+// Copies one float2 per work-item: dstF2[gIdx] = srcF2[gIdx] for gIdx < cb.m_n.
+__kernel
+__attribute__((reqd_work_group_size(64,1,1)))
+void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, 
+					ConstBuffer cb)
+{
+	const int gIdx = GET_GLOBAL_IDX;
+
+	// Work-items beyond the last element have nothing to copy.
+	if( gIdx >= cb.m_n )
+	{
+		return;
+	}
+
+	dstF2[ gIdx ] = srcF2[ gIdx ];
+}
+
diff --git a/opencl/parallel_primitives/kernels/CopyKernelsCL.h b/opencl/parallel_primitives/kernels/CopyKernelsCL.h
new file mode 100644
index 000000000..e5670e3cd
--- /dev/null
+++ b/opencl/parallel_primitives/kernels/CopyKernelsCL.h
@@ -0,0 +1,132 @@
+//This file is generated from CopyKernels.cl using stringify.bat (premake --stringify) in the build folder of this project.
+//The bounds checks of Copy2F4Kernel and Copy4F4Kernel in the string below were corrected so that no element
+//at or beyond index m_n is read or written; CopyKernels.cl must carry the same checks before this file is regenerated.
+static const char* copyKernelsCL= \
+"/*\n"
+"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
+"\n"
+"This software is provided 'as-is', without any express or implied warranty.\n"
+"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+"Permission is granted to anyone to use this software for any purpose, \n"
+"including commercial applications, and to alter it and redistribute it freely, \n"
+"subject to the following restrictions:\n"
+"\n"
+"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+"3. This notice may not be removed or altered from any source distribution.\n"
+"*/\n"
+"//Originally written by Takahiro Harada\n"
+"\n"
+"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
+"\n"
+"typedef unsigned int u32;\n"
+"#define GET_GROUP_IDX get_group_id(0)\n"
+"#define GET_LOCAL_IDX get_local_id(0)\n"
+"#define GET_GLOBAL_IDX get_global_id(0)\n"
+"#define GET_GROUP_SIZE get_local_size(0)\n"
+"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
+"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
+"#define AtomInc(x) atom_inc(&(x))\n"
+"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
+"\n"
+"#define make_uint4 (uint4)\n"
+"#define make_uint2 (uint2)\n"
+"#define make_int2 (int2)\n"
+"\n"
+"typedef struct\n"
+"{\n"
+"	int m_n;\n"
+"	int m_padding[3];\n"
+"} ConstBuffer;\n"
+"\n"
+"\n"
+"\n"
+"__kernel\n"
+"__attribute__((reqd_work_group_size(64,1,1)))\n"
+"void Copy1F4Kernel(__global float4* dst, __global float4* src, \n"
+"					ConstBuffer cb)\n"
+"{\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"\n"
+"	if( gIdx < cb.m_n )\n"
+"	{\n"
+"		float4 a0 = src[gIdx];\n"
+"\n"
+"		dst[ gIdx ] = a0;\n"
+"	}\n"
+"}\n"
+"\n"
+"// Each work-item copies up to two consecutive float4 elements (indices 2*gIdx and 2*gIdx+1)\n"
+"// from src to dst. Every element index is tested against cb.m_n on its own, so no work-item\n"
+"// reads or writes at or beyond index cb.m_n, whether cb.m_n is odd or even.\n"
+"__kernel\n"
+"__attribute__((reqd_work_group_size(64,1,1)))\n"
+"void Copy2F4Kernel(__global float4* dst, __global float4* src, \n"
+"					ConstBuffer cb)\n"
+"{\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"\n"
+"	int idx0 = gIdx*2+0;\n"
+"	int idx1 = gIdx*2+1;\n"
+"\n"
+"	if( idx0 < cb.m_n )\n"
+"	{\n"
+"		dst[ idx0 ] = src[ idx0 ];\n"
+"	}\n"
+"	if( idx1 < cb.m_n )\n"
+"	{\n"
+"		dst[ idx1 ] = src[ idx1 ];\n"
+"	}\n"
+"}\n"
+"\n"
+"// Each work-item copies up to four consecutive float4 elements (indices 4*gIdx .. 4*gIdx+3)\n"
+"// from src to dst. Every element index is tested against cb.m_n on its own, so no work-item\n"
+"// reads or writes at or beyond index cb.m_n, whatever the remainder of cb.m_n modulo 4 is.\n"
+"__kernel\n"
+"__attribute__((reqd_work_group_size(64,1,1)))\n"
+"void Copy4F4Kernel(__global float4* dst, __global float4* src, \n"
+"					ConstBuffer cb)\n"
+"{\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"\n"
+"	for(int i=0; i<4; i++)\n"
+"	{\n"
+"		int idx = gIdx*4+i;\n"
+"\n"
+"		if( idx < cb.m_n )\n"
+"		{\n"
+"			dst[ idx ] = src[ idx ];\n"
+"		}\n"
+"	}\n"
+"}\n"
+"\n"
+"__kernel\n"
+"__attribute__((reqd_work_group_size(64,1,1)))\n"
+"void CopyF1Kernel(__global float* dstF1, __global float* srcF1, \n"
+"					ConstBuffer cb)\n"
+"{\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"\n"
+"	if( gIdx < cb.m_n )\n"
+"	{\n"
+"		float a0 = srcF1[gIdx];\n"
+"\n"
+"		dstF1[ gIdx ] = a0;\n"
+"	}\n"
+"}\n"
+"\n"
+"__kernel\n"
+"__attribute__((reqd_work_group_size(64,1,1)))\n"
+"void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, \n"
+"					ConstBuffer cb)\n"
+"{\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"\n"
+"	if( gIdx < cb.m_n )\n"
+"	{\n"
+"		float2 a0 = srcF2[gIdx];\n"
+"\n"
+"		dstF2[ gIdx ] = a0;\n"
+"	}\n"
+"}\n"
+"\n"
+"\n"
+;
diff --git a/opencl/parallel_primitives/kernels/FillKernels.cl b/opencl/parallel_primitives/kernels/FillKernels.cl
new file mode 100644
index 000000000..71c31075d
--- /dev/null
+++ b/opencl/parallel_primitives/kernels/FillKernels.cl
@@ -0,0 +1,107 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+// Extensions requested for this program: AMD printf and 32-bit local-memory atomics.
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+
+typedef unsigned int u32;
+// Short names for work-item queries in dimension 0, local-memory barrier/fence and atomic
+// increment. The fill kernels in this file use only GET_GLOBAL_IDX.
+#define GET_GROUP_IDX get_group_id(0)
+#define GET_LOCAL_IDX get_local_id(0)
+#define GET_GLOBAL_IDX get_global_id(0)
+#define GET_GROUP_SIZE get_local_size(0)
+#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
+#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
+#define AtomInc(x) atom_inc(&(x))
+#define AtomInc1(x, out) out = atom_inc(&(x))
+
+// Each make_* macro expands to a vector-type cast, so make_int2(a, b) becomes (int2)(a, b).
+#define make_uint4 (uint4)
+#define make_uint2 (uint2)
+#define make_int2 (int2)
+
+// Parameter block holding a fill value (viewable as int4, uint4 or float through the union),
+// an offset and an element count. Not referenced by the kernels in this file, which take
+// count, value and offset as separate arguments.
+typedef struct
+{
+	union
+	{
+		int4 m_data;
+		uint4 m_unsignedData;
+		float	m_floatData;
+	};
+	int m_offset;
+	int m_n;
+	int m_padding[2];
+} ConstBuffer;
+
+
+// Writes `value` to dstInt[offset + i] for i in [0, num_elements), one element per work-item.
+__kernel
+__attribute__((reqd_work_group_size(64,1,1)))
+void FillIntKernel(__global int* dstInt, 			int num_elements, int value, const int offset)
+{
+	const int gIdx = GET_GLOBAL_IDX;
+
+	// Work-items beyond the requested count have nothing to write.
+	if( gIdx >= num_elements )
+	{
+		return;
+	}
+
+	dstInt[ offset+gIdx ] = value;
+}
+
+// Writes `value` to dstFloat[offset + i] for i in [0, num_elements), one element per work-item.
+__kernel
+__attribute__((reqd_work_group_size(64,1,1)))
+void FillFloatKernel(__global float* dstFloat, 	int num_elements, float value, const int offset)
+{
+	const int gIdx = GET_GLOBAL_IDX;
+
+	// Work-items beyond the requested count have nothing to write.
+	if( gIdx >= num_elements )
+	{
+		return;
+	}
+
+	dstFloat[ offset+gIdx ] = value;
+}
+
+// Writes `value` to dstInt[offset + i] for i in [0, num), one element per work-item.
+__kernel
+__attribute__((reqd_work_group_size(64,1,1)))
+void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset)
+{
+	const int gIdx = GET_GLOBAL_IDX;
+
+	// Work-items beyond the requested count have nothing to write.
+	if( gIdx >= num )
+	{
+		return;
+	}
+
+	dstInt[ offset+gIdx ] = value;
+}
+
+// Writes the int2 (value.x, value.y) to dstInt2[offset + i] for i in [0, num), one element per work-item.
+__kernel
+__attribute__((reqd_work_group_size(64,1,1)))
+void FillInt2Kernel(__global int2* dstInt2, 	const int num, const int2 value, const int offset)
+{
+	const int gIdx = GET_GLOBAL_IDX;
+
+	// Work-items beyond the requested count have nothing to write.
+	if( gIdx >= num )
+	{
+		return;
+	}
+
+	dstInt2[ gIdx + offset] = make_int2( value.x, value.y );
+}
+
+// Writes `value` to dstInt4[offset + i] for i in [0, num), one element per work-item.
+__kernel
+__attribute__((reqd_work_group_size(64,1,1)))
+void FillInt4Kernel(__global int4* dstInt4, 		const int num, const int4 value, const int offset)
+{
+	const int gIdx = GET_GLOBAL_IDX;
+
+	// Work-items beyond the requested count have nothing to write.
+	if( gIdx >= num )
+	{
+		return;
+	}
+
+	dstInt4[ offset+gIdx ] = value;
+}
+
diff --git a/opencl/parallel_primitives/kernels/FillKernelsCL.h b/opencl/parallel_primitives/kernels/FillKernelsCL.h
new file mode 100644
index 000000000..24eac7b11
--- /dev/null
+++ b/opencl/parallel_primitives/kernels/FillKernelsCL.h
@@ -0,0 +1,111 @@
+// This file is autogenerated by stringify.bat (premake --stringify) in this project's build folder; regenerate it instead of editing it by hand.
+static const char* fillKernelsCL= \
+"/*\n"
+"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
+"\n"
+"This software is provided 'as-is', without any express or implied warranty.\n"
+"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+"Permission is granted to anyone to use this software for any purpose, \n"
+"including commercial applications, and to alter it and redistribute it freely, \n"
+"subject to the following restrictions:\n"
+"\n"
+"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+"3. This notice may not be removed or altered from any source distribution.\n"
+"*/\n"
+"//Originally written by Takahiro Harada\n"
+"\n"
+"\n"
+"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
+"\n"
+"typedef unsigned int u32;\n"
+"#define GET_GROUP_IDX get_group_id(0)\n"
+"#define GET_LOCAL_IDX get_local_id(0)\n"
+"#define GET_GLOBAL_IDX get_global_id(0)\n"
+"#define GET_GROUP_SIZE get_local_size(0)\n"
+"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
+"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
+"#define AtomInc(x) atom_inc(&(x))\n"
+"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
+"\n"
+"#define make_uint4 (uint4)\n"
+"#define make_uint2 (uint2)\n"
+"#define make_int2 (int2)\n"
+"\n"
+"typedef struct\n"
+"{\n"
+"	union\n"
+"	{\n"
+"		int4 m_data;\n"
+"		uint4 m_unsignedData;\n"
+"		float	m_floatData;\n"
+"	};\n"
+"	int m_offset;\n"
+"	int m_n;\n"
+"	int m_padding[2];\n"
+"} ConstBuffer;\n"
+"\n"
+"\n"
+"__kernel\n"
+"__attribute__((reqd_work_group_size(64,1,1)))\n"
+"void FillIntKernel(__global int* dstInt, 			int num_elements, int value, const int offset)\n"
+"{\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"\n"
+"	if( gIdx < num_elements )\n"
+"	{\n"
+"		dstInt[ offset+gIdx ] = value;\n"
+"	}\n"
+"}\n"
+"\n"
+"__kernel\n"
+"__attribute__((reqd_work_group_size(64,1,1)))\n"
+"void FillFloatKernel(__global float* dstFloat, 	int num_elements, float value, const int offset)\n"
+"{\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"\n"
+"	if( gIdx < num_elements )\n"
+"	{\n"
+"		dstFloat[ offset+gIdx ] = value;\n"
+"	}\n"
+"}\n"
+"\n"
+"__kernel\n"
+"__attribute__((reqd_work_group_size(64,1,1)))\n"
+"void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset)\n"
+"{\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"\n"
+"	if( gIdx < num )\n"
+"	{\n"
+"		dstInt[ offset+gIdx ] = value;\n"
+"	}\n"
+"}\n"
+"\n"
+"__kernel\n"
+"__attribute__((reqd_work_group_size(64,1,1)))\n"
+"void FillInt2Kernel(__global int2* dstInt2, 	const int num, const int2 value, const int offset)\n"
+"{\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"\n"
+"	if( gIdx < num )\n"
+"	{\n"
+"		dstInt2[ gIdx + offset] = make_int2( value.x, value.y );\n"
+"	}\n"
+"}\n"
+"\n"
+"__kernel\n"
+"__attribute__((reqd_work_group_size(64,1,1)))\n"
+"void FillInt4Kernel(__global int4* dstInt4, 		const int num, const int4 value, const int offset)\n"
+"{\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"\n"
+"	if( gIdx < num )\n"
+"	{\n"
+"		dstInt4[ offset+gIdx ] = value;\n"
+"	}\n"
+"}\n"
+"\n"
+"\n"
+;
diff --git a/opencl/parallel_primitives/kernels/PrefixScanKernels.cl b/opencl/parallel_primitives/kernels/PrefixScanKernels.cl
new file mode 100644
index 000000000..963cc1e48
--- /dev/null
+++ b/opencl/parallel_primitives/kernels/PrefixScanKernels.cl
@@ -0,0 +1,154 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+typedef unsigned int u32;
+#define GET_GROUP_IDX get_group_id(0)
+#define GET_LOCAL_IDX get_local_id(0)
+#define GET_GLOBAL_IDX get_global_id(0)
+#define GET_GROUP_SIZE get_local_size(0)
+#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
+
+// takahiro end
+#define WG_SIZE 128 
+#define m_numElems x
+#define m_numBlocks y
+#define m_numScanBlocks z
+
+/*typedef struct
+{
+	uint m_numElems;
+	uint m_numBlocks;
+	uint m_numScanBlocks;
+	uint m_padding[1];
+} ConstBuffer;
+*/
+
+// Exclusive prefix sum over data[0..n-1] in local memory, performed in place by the whole work-group.
+// Returns the total of all n inputs on work-item 0 and 0 on every other work-item (previously indeterminate there).
+// NOTE(review): the halving/doubling sweeps look like they assume n is a power of two - confirm with the callers.
+u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize)
+{
+	u32 blocksum = 0;	// initialised so work-items other than 0 never return an uninitialised value
+	int offset = 1;
+	// Up-sweep: pairwise partial sums; the pair stride (offset) doubles each pass.
+	for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)
+	{
+		GROUP_LDS_BARRIER;
+		for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)
+		{
+			int ai = offset*(2*iIdx+1)-1;
+			int bi = offset*(2*iIdx+2)-1;
+			data[bi] += data[ai];
+		}
+	}
+	GROUP_LDS_BARRIER;
+	if( lIdx == 0 )
+	{	// the last slot holds the grand total: save it, then clear it to seed the down-sweep
+		blocksum = data[ n-1 ];
+		data[ n-1 ] = 0;
+	}
+	GROUP_LDS_BARRIER;
+	// Down-sweep: swap-and-add back down the tree, halving the stride each pass.
+	offset >>= 1;
+	for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )
+	{
+		GROUP_LDS_BARRIER;
+		for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )
+		{
+			int ai = offset*(2*iIdx+1)-1;
+			int bi = offset*(2*iIdx+2)-1;
+			u32 temp = data[ai];
+			data[ai] = data[bi];
+			data[bi] += temp;
+		}
+	}
+	GROUP_LDS_BARRIER;
+	return blocksum;
+}
+
+// Per-block exclusive scan: each work-group scans 2*WG_SIZE consecutive elements of src into dst
+// and stores the block's total in sumBuffer[group]. Elements at or beyond cb.m_numElems read as 0.
+__attribute__((reqd_work_group_size(WG_SIZE,1,1))) __kernel
+void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer, uint4 cb)
+{
+	__local u32 ldsData[WG_SIZE*2];
+
+	const int lIdx = GET_LOCAL_IDX;
+	const int gEven = 2*(int)GET_GLOBAL_IDX;	// each work-item owns the pair (gEven, gOdd)
+	const int gOdd = gEven + 1;
+	const int lEven = 2*lIdx;
+	const int lOdd = lEven + 1;
+	// Stage the pair in local memory, zero-padding past the end of the input.
+	ldsData[lEven] = ( gEven < cb.m_numElems )? src[gEven]: 0;
+	ldsData[lOdd]  = ( gOdd < cb.m_numElems )? src[gOdd]: 0;
+
+	const u32 blockTotal = ScanExclusive(ldsData, WG_SIZE*2, lIdx, GET_GROUP_SIZE);
+	if( lIdx == 0 )
+		sumBuffer[GET_GROUP_IDX] = blockTotal;
+
+	// Copy back only the elements that exist in the input.
+	if( gEven < cb.m_numElems )
+		dst[gEven] = ldsData[lEven];
+	if( gOdd < cb.m_numElems )
+		dst[gOdd] = ldsData[lOdd];
+}
+
+// Adds blockSum[b] to every element of block b in dst, where b = group id + 1 (block 0 is left untouched).
+// A block spans 2*WG_SIZE elements; the final block is clipped to cb.m_numElems.
+__attribute__((reqd_work_group_size(WG_SIZE,1,1))) __kernel
+void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, uint4 cb)
+{
+	const u32 elemsPerBlock = WG_SIZE*2;
+	const int block = GET_GROUP_IDX+1;
+	const u32 carryIn = blockSum[block];
+
+	// Half-open element range [first, last) visited by this work-item, stepping by the group size.
+	const int first = block*elemsPerBlock + (int)GET_LOCAL_IDX;
+	const int last = min((block+1)*(elemsPerBlock), cb.m_numElems);
+	for(int i=first; i<last; i+=GET_GROUP_SIZE)
+	{
+		dst[i] += carryIn;
+	}
+}
+
+// Exclusive scan of the cb.m_numBlocks per-block totals held in dst, done in place, with the grand
+// total stored at dst[cb.m_numBlocks]. Slots from m_numBlocks up to m_numScanBlocks are zero-padded.
+// NOTE(review): nothing here checks cb.m_numScanBlocks against the 2048-entry local buffer - confirm the host does.
+__attribute__((reqd_work_group_size(WG_SIZE,1,1))) __kernel
+void TopLevelScanKernel(__global u32* dst, uint4 cb)
+{
+	__local u32 ldsData[2048];
+	const int lIdx = GET_LOCAL_IDX;
+	const int stride = GET_GROUP_SIZE;
+
+	// Load the block totals, padding the tail with zeros.
+	for(int slot=lIdx; slot<cb.m_numScanBlocks; slot+=stride )
+		ldsData[slot] = (slot<cb.m_numBlocks)? dst[slot]:0;
+	GROUP_LDS_BARRIER;
+
+	const u32 grandTotal = ScanExclusive(ldsData, cb.m_numScanBlocks, lIdx, stride);
+
+	// Write the scanned totals back over the originals.
+	for(int slot=lIdx; slot<cb.m_numBlocks; slot+=stride )
+		dst[slot] = ldsData[slot];
+
+	// Only the first work-item publishes the grand total.
+	const int gIdx = GET_GLOBAL_IDX;
+	if( gIdx == 0 )
+	{
+		dst[cb.m_numBlocks] = grandTotal;
+	}
+}
diff --git a/opencl/parallel_primitives/kernels/PrefixScanKernelsCL.h b/opencl/parallel_primitives/kernels/PrefixScanKernelsCL.h
new file mode 100644
index 000000000..762ee2738
--- /dev/null
+++ b/opencl/parallel_primitives/kernels/PrefixScanKernelsCL.h
@@ -0,0 +1,158 @@
+// This file is autogenerated by stringify.bat (premake --stringify) in this project's build folder; regenerate it instead of editing it by hand.
+static const char* prefixScanKernelsCL= \
+"/*\n"
+"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
+"\n"
+"This software is provided 'as-is', without any express or implied warranty.\n"
+"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+"Permission is granted to anyone to use this software for any purpose, \n"
+"including commercial applications, and to alter it and redistribute it freely, \n"
+"subject to the following restrictions:\n"
+"\n"
+"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+"3. This notice may not be removed or altered from any source distribution.\n"
+"*/\n"
+"//Originally written by Takahiro Harada\n"
+"\n"
+"\n"
+"typedef unsigned int u32;\n"
+"#define GET_GROUP_IDX get_group_id(0)\n"
+"#define GET_LOCAL_IDX get_local_id(0)\n"
+"#define GET_GLOBAL_IDX get_global_id(0)\n"
+"#define GET_GROUP_SIZE get_local_size(0)\n"
+"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
+"\n"
+"// takahiro end\n"
+"#define WG_SIZE 128 \n"
+"#define m_numElems x\n"
+"#define m_numBlocks y\n"
+"#define m_numScanBlocks z\n"
+"\n"
+"/*typedef struct\n"
+"{\n"
+"	uint m_numElems;\n"
+"	uint m_numBlocks;\n"
+"	uint m_numScanBlocks;\n"
+"	uint m_padding[1];\n"
+"} ConstBuffer;\n"
+"*/\n"
+"\n"
+"u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize)\n"
+"{\n"
+"	u32 blocksum;\n"
+"    int offset = 1;\n"
+"    for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
+"    {\n"
+"        GROUP_LDS_BARRIER;\n"
+"        for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
+"        {\n"
+"            int ai = offset*(2*iIdx+1)-1;\n"
+"            int bi = offset*(2*iIdx+2)-1;\n"
+"            data[bi] += data[ai];\n"
+"        }\n"
+"	}\n"
+"\n"
+"    GROUP_LDS_BARRIER;\n"
+"\n"
+"    if( lIdx == 0 )\n"
+"	{\n"
+"		blocksum = data[ n-1 ];\n"
+"        data[ n-1 ] = 0;\n"
+"	}\n"
+"\n"
+"	GROUP_LDS_BARRIER;\n"
+"\n"
+"	offset >>= 1;\n"
+"    for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
+"    {\n"
+"        GROUP_LDS_BARRIER;\n"
+"        for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
+"        {\n"
+"            int ai = offset*(2*iIdx+1)-1;\n"
+"            int bi = offset*(2*iIdx+2)-1;\n"
+"            u32 temp = data[ai];\n"
+"            data[ai] = data[bi];\n"
+"            data[bi] += temp;\n"
+"        }\n"
+"	}\n"
+"	GROUP_LDS_BARRIER;\n"
+"\n"
+"	return blocksum;\n"
+"}\n"
+"\n"
+"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
+"__kernel\n"
+"void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer,\n"
+"		uint4 cb)\n"
+"{\n"
+"	__local u32 ldsData[WG_SIZE*2];\n"
+"\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"	int lIdx = GET_LOCAL_IDX;\n"
+"\n"
+"	ldsData[2*lIdx]     = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n"
+"	ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n"
+"\n"
+"	u32 sum = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
+"\n"
+"	if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;\n"
+"\n"
+"	if( (2*gIdx) < cb.m_numElems )\n"
+"    {\n"
+"        dst[2*gIdx]     = ldsData[2*lIdx];\n"
+"	}\n"
+"	if( (2*gIdx + 1) < cb.m_numElems )\n"
+"	{\n"
+"        dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
+"    }\n"
+"}\n"
+"\n"
+"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
+"__kernel\n"
+"void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, uint4 cb)\n"
+"{\n"
+"	const u32 blockSize = WG_SIZE*2;\n"
+"\n"
+"	int myIdx = GET_GROUP_IDX+1;\n"
+"	int lIdx = GET_LOCAL_IDX;\n"
+"\n"
+"	u32 iBlockSum = blockSum[myIdx];\n"
+"\n"
+"	int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n"
+"	for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
+"	{\n"
+"		dst[i] += iBlockSum;\n"
+"	}\n"
+"}\n"
+"\n"
+"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
+"__kernel\n"
+"void TopLevelScanKernel(__global u32* dst, uint4 cb)\n"
+"{\n"
+"	__local u32 ldsData[2048];\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"	int lIdx = GET_LOCAL_IDX;\n"
+"	int lSize = GET_GROUP_SIZE;\n"
+"\n"
+"	for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n"
+"	{\n"
+"		ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n"
+"	}\n"
+"\n"
+"	GROUP_LDS_BARRIER;\n"
+"\n"
+"	u32 sum = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
+"\n"
+"	for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n"
+"	{\n"
+"		dst[i] = ldsData[i];\n"
+"	}\n"
+"\n"
+"	if( gIdx == 0 )\n"
+"	{\n"
+"		dst[cb.m_numBlocks] = sum;\n"
+"	}\n"
+"}\n"
+"\n"
+;
diff --git a/opencl/parallel_primitives/kernels/RadixSort32Kernels.cl b/opencl/parallel_primitives/kernels/RadixSort32Kernels.cl
new file mode 100644
index 000000000..ceb1d95e5
--- /dev/null
+++ b/opencl/parallel_primitives/kernels/RadixSort32Kernels.cl
@@ -0,0 +1,1071 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Author Takahiro Harada
+
+
+//#pragma OPENCL EXTENSION cl_amd_printf : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+
+typedef unsigned int u32;
+#define GET_GROUP_IDX get_group_id(0)
+#define GET_LOCAL_IDX get_local_id(0)
+#define GET_GLOBAL_IDX get_global_id(0)
+#define GET_GROUP_SIZE get_local_size(0)
+#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
+#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
+#define AtomInc(x) atom_inc(&(x))
+#define AtomInc1(x, out) out = atom_inc(&(x))
+#define AtomAdd(x, value) atom_add(&(x), value)
+
+#define SELECT_UINT4( b, a, condition ) select( b,a,condition )
+
+
+#define make_uint4 (uint4)
+#define make_uint2 (uint2)
+#define make_int2 (int2)
+
+#define WG_SIZE 64
+#define ELEMENTS_PER_WORK_ITEM (256/WG_SIZE)
+#define BITS_PER_PASS 4
+#define NUM_BUCKET (1<<BITS_PER_PASS)
+typedef uchar u8;
+
+//	This is not a VLIW optimization; it only reduces the number of writes.
+#define USE_2LEVEL_REDUCE 1
+
+//#define CHECK_BOUNDARY 1
+
+//#define NV_GPU 1
+
+
+//	Cypress
+#define nPerWI 16
+//	Cayman
+//#define nPerWI 20
+
+#define m_n x
+#define m_nWGs y
+#define m_startBit z
+#define m_nBlocksPerWG w
+
+/*
+typedef struct
+{
+	int m_n;
+	int m_nWGs;
+	int m_startBit;
+	int m_nBlocksPerWG;
+} ConstBuffer;
+*/
+
+typedef struct	// key/value record; StreamCountSortDataKernel takes a __global array of these
+{
+	unsigned int m_key;	// StreamCountSortDataKernel extracts the 4-bit bucket digit from this field
+	unsigned int m_value;	// not read by StreamCountSortDataKernel
+} SortDataCL;
+
+
+// In-place exclusive prefix sum of the four components of data[0] (x, y, z, w order).
+// Returns the sum of all four original components.
+uint prefixScanVectorEx( uint4* data )
+{
+	const uint4 src = data[0];
+
+	// Each output lane receives the sum of the lanes before it.
+	uint4 scanned;
+	scanned.x = 0;
+	scanned.y = src.x;
+	scanned.z = src.x + src.y;
+	scanned.w = src.x + src.y + src.z;
+	data[0] = scanned;
+
+	// w's exclusive sum plus w itself is the grand total.
+	return scanned.w + src.w;
+}
+// Work-group exclusive prefix sum of one u32 per work-item: returns the sum of pData over lower-numbered work-items and stores the group total in *totalSum.
+u32 localPrefixSum( u32 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory, int wgSize /*64 or 128*/ )
+{
+	{	//	Set data
+		sorterSharedMemory[lIdx] = 0;	// lower half stays zero so reads below the first value contribute nothing
+		sorterSharedMemory[lIdx+wgSize] = pData;	// the values themselves live in [wgSize, 2*wgSize)
+	}
+	// Every work-item's value must be in local memory before the sums start.
+	GROUP_LDS_BARRIER;
+	// NOTE(review): the steps below are separated only by mem_fence, not a barrier, so they look like they rely on the participating work-items running in lockstep - confirm for the target device.
+	{	//	Prefix sum
+		int idx = 2*lIdx + (wgSize+1);	// each participating work-item updates slots idx and idx-1
+#if defined(USE_2LEVEL_REDUCE)
+		if( lIdx < 64 )
+		{
+			u32 u0, u1, u2;
+			u0 = sorterSharedMemory[idx-3];
+			u1 = sorterSharedMemory[idx-2];
+			u2 = sorterSharedMemory[idx-1];
+			AtomAdd( sorterSharedMemory[idx], u0+u1+u2 );	// fold in the three slots directly below idx
+			GROUP_MEM_FENCE;
+			// Same pattern at stride 4: fold in the slots 4, 8 and 12 below idx.
+			u0 = sorterSharedMemory[idx-12];
+			u1 = sorterSharedMemory[idx-8];
+			u2 = sorterSharedMemory[idx-4];
+			AtomAdd( sorterSharedMemory[idx], u0+u1+u2 );			
+			GROUP_MEM_FENCE;
+			// Same pattern at stride 16: fold in the slots 16, 32 and 48 below idx.
+			u0 = sorterSharedMemory[idx-48];
+			u1 = sorterSharedMemory[idx-32];
+			u2 = sorterSharedMemory[idx-16];
+			AtomAdd( sorterSharedMemory[idx], u0+u1+u2 );			
+			GROUP_MEM_FENCE;
+			if( wgSize > 64 )
+			{
+				sorterSharedMemory[idx] += sorterSharedMemory[idx-64];	// one extra step for groups larger than 64
+				GROUP_MEM_FENCE;
+			}
+			// Finally update the in-between slot idx-1 from its lower neighbour.
+			sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];
+			GROUP_MEM_FENCE;
+		}
+#else
+		if( lIdx < 64 )
+		{
+			sorterSharedMemory[idx] += sorterSharedMemory[idx-1];	// doubling strides: 1, 2, 4, 8, 16, 32 (and 64 for larger groups)
+			GROUP_MEM_FENCE;
+			sorterSharedMemory[idx] += sorterSharedMemory[idx-2];			
+			GROUP_MEM_FENCE;
+			sorterSharedMemory[idx] += sorterSharedMemory[idx-4];
+			GROUP_MEM_FENCE;
+			sorterSharedMemory[idx] += sorterSharedMemory[idx-8];
+			GROUP_MEM_FENCE;
+			sorterSharedMemory[idx] += sorterSharedMemory[idx-16];
+			GROUP_MEM_FENCE;
+			sorterSharedMemory[idx] += sorterSharedMemory[idx-32];
+			GROUP_MEM_FENCE;
+			if( wgSize > 64 )
+			{
+				sorterSharedMemory[idx] += sorterSharedMemory[idx-64];
+				GROUP_MEM_FENCE;
+			}
+			// Finally update the in-between slot idx-1 from its lower neighbour.
+			sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];
+			GROUP_MEM_FENCE;
+		}
+#endif
+	}
+	// NOTE(review): with wgSize == 64, work-items 32..63 compute idx >= 2*wgSize, so they touch slots above the range read back below; the caller's buffer must extend that far - confirm.
+	GROUP_LDS_BARRIER;
+
+	*totalSum = sorterSharedMemory[wgSize*2-1];	// the last value slot is read back as the group total
+	u32 addValue = sorterSharedMemory[lIdx+wgSize-1];	// the slot just below this work-item's own value
+	return addValue;
+}
+
+// Exclusive scan of 4 values per work-item across the group (128 work-items); *totalSum receives the group total.
+uint4 localPrefixSum128V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )
+{
+	const u32 laneTotal = prefixScanVectorEx( &pData );	// scans the four lanes in place
+	const u32 base = localPrefixSum( laneTotal, lIdx, totalSum, sorterSharedMemory, 128 );
+	return pData + (uint4)( base, base, base, base );	// offset by the sum of all lower work-items
+}
+
+
+// Exclusive scan of 4 values per work-item across the group (64 work-items); *totalSum receives the group total.
+uint4 localPrefixSum64V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )
+{
+	const u32 laneTotal = prefixScanVectorEx( &pData );	// scans the four lanes in place
+	const u32 base = localPrefixSum( laneTotal, lIdx, totalSum, sorterSharedMemory, 64 );
+	return pData + (uint4)( base, base, base, base );	// offset by the sum of all lower work-items
+}
+// Returns byte number keyIdx of key, counting from the least significant byte.
+u32 unpack4Key( u32 key, int keyIdx ){ const int shift = keyIdx*8; return (key>>shift) & 0xff; }
+// Exclusive scan over the four packed byte counters of v: while the running sums fit in 8 bits, byte k of the result is the sum of bytes 0..k-1 of v.
+u32 bit8Scan(u32 v)
+{
+	return v * 0x01010100u;	// equals (v<<8) + (v<<16) + (v<<24) in 32-bit wrap-around arithmetic
+}
+
+//===
+
+
+
+
+#define MY_HISTOGRAM(idx) localHistogramMat[(idx)*WG_SIZE+lIdx]
+
+
+// Counts, per work-group, how many keys fall into each of the NUM_BUCKET 4-bit digit buckets selected by cb.m_startBit.
+// histogramOut is bucket-major: the count for (bucket b, work-group g) lands at histogramOut[b*cb.m_nWGs + g].
+// NOTE(review): only whole blocks of ELEMENTS_PER_WORK_ITEM*WG_SIZE keys are visited (n/blockSize truncates) - presumably the host pads n; confirm.
+__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
+void StreamCountKernel( __global u32* gSrc, __global u32* histogramOut, int4 cb )
+{
+	//	NUM_BUCKET rows; MY_HISTOGRAM gives every work-item its own column.
+	__local u32 localHistogramMat[NUM_BUCKET*WG_SIZE];
+
+	u32 lIdx = GET_LOCAL_IDX;
+	u32 wgIdx = GET_GROUP_IDX;
+	const int n = cb.m_n;
+	const int nWGs = cb.m_nWGs;
+	const int startBit = cb.m_startBit;
+	const int nBlocksPerWG = cb.m_nBlocksPerWG;
+
+	for(int bucket=0; bucket<NUM_BUCKET; bucket++)
+	{
+		MY_HISTOGRAM(bucket) = 0;
+	}
+	GROUP_LDS_BARRIER;
+
+	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;
+	//	Whole blocks from this group's first block to the end of the input, capped at the per-group quota.
+	const int nBlocksLeft = (n)/blockSize - nBlocksPerWG*wgIdx;
+	const int nBlocksToDo = min(nBlocksPerWG, nBlocksLeft);
+	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;
+
+	for(int iblock=0; iblock<nBlocksToDo; iblock++, addr+=blockSize)
+	{
+		//	MY_HISTOGRAM( key )++ is much more expensive than an atomic add on AMD: it needs a read and a write, while the atomic can just add.
+		//	Keeping the counters in registers did not perform well either; addressing them by key seems to need a lot of ALU ops.
+		//	AMD: AtomInc performs better, while NV prefers ++.
+		for(int e=0; e<ELEMENTS_PER_WORK_ITEM; e++)
+		{
+#if defined(CHECK_BOUNDARY)
+			if( addr+e < n )
+#endif
+			{
+				const u32 localKey = (gSrc[addr+e]>>startBit) & 0xf;
+#if defined(NV_GPU)
+				MY_HISTOGRAM( localKey )++;
+#else
+				AtomInc( MY_HISTOGRAM( localKey ) );
+#endif
+			}
+		}
+	}
+	GROUP_LDS_BARRIER;
+
+	//	The first NUM_BUCKET work-items each reduce one bucket row across all columns.
+	if( lIdx < NUM_BUCKET )
+	{
+		u32 bucketTotal = 0;
+		//	The start column is rotated by lIdx; the set of columns summed is unchanged.
+		for(int i=0; i<GET_GROUP_SIZE; i++)
+		{
+			bucketTotal += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE];
+		}
+		histogramOut[lIdx*nWGs+wgIdx] = bucketTotal;
+	}
+}
+
+// Same counting pass as StreamCountKernel, but the keys are read from the m_key field of SortDataCL records.
+// histogramOut is bucket-major: the count for (bucket b, work-group g) lands at histogramOut[b*cb.m_nWGs + g].
+// NOTE(review): only whole blocks of ELEMENTS_PER_WORK_ITEM*WG_SIZE records are visited (n/blockSize truncates) - presumably the host pads n; confirm.
+__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
+void StreamCountSortDataKernel( __global SortDataCL* gSrc, __global u32* histogramOut, int4  cb )
+{
+	//	NUM_BUCKET rows; MY_HISTOGRAM gives every work-item its own column.
+	__local u32 localHistogramMat[NUM_BUCKET*WG_SIZE];
+
+	u32 lIdx = GET_LOCAL_IDX;
+	u32 wgIdx = GET_GROUP_IDX;
+	const int n = cb.m_n;
+	const int nWGs = cb.m_nWGs;
+	const int startBit = cb.m_startBit;
+	const int nBlocksPerWG = cb.m_nBlocksPerWG;
+
+	for(int bucket=0; bucket<NUM_BUCKET; bucket++)
+	{
+		MY_HISTOGRAM(bucket) = 0;
+	}
+	GROUP_LDS_BARRIER;
+
+	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;
+	//	Whole blocks from this group's first block to the end of the input, capped at the per-group quota.
+	const int nBlocksLeft = (n)/blockSize - nBlocksPerWG*wgIdx;
+	const int nBlocksToDo = min(nBlocksPerWG, nBlocksLeft);
+	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;
+
+	for(int iblock=0; iblock<nBlocksToDo; iblock++, addr+=blockSize)
+	{
+		//	MY_HISTOGRAM( key )++ is much more expensive than an atomic add on AMD: it needs a read and a write, while the atomic can just add.
+		//	Keeping the counters in registers did not perform well either; addressing them by key seems to need a lot of ALU ops.
+		//	AMD: AtomInc performs better, while NV prefers ++.
+		for(int e=0; e<ELEMENTS_PER_WORK_ITEM; e++)
+		{
+#if defined(CHECK_BOUNDARY)
+			if( addr+e < n )
+#endif
+			{
+				const u32 localKey = (gSrc[addr+e].m_key>>startBit) & 0xf;
+#if defined(NV_GPU)
+				MY_HISTOGRAM( localKey )++;
+#else
+				AtomInc( MY_HISTOGRAM( localKey ) );
+#endif
+			}
+		}
+	}
+	GROUP_LDS_BARRIER;
+
+	//	The first NUM_BUCKET work-items each reduce one bucket row across all columns.
+	if( lIdx < NUM_BUCKET )
+	{
+		u32 bucketTotal = 0;
+		//	The start column is rotated by lIdx; the set of columns summed is unchanged.
+		for(int i=0; i<GET_GROUP_SIZE; i++)
+		{
+			bucketTotal += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE];
+		}
+		histogramOut[lIdx*nWGs+wgIdx] = bucketTotal;
+	}
+}
+
+#define nPerLane (nPerWI/4)
+// In-place exclusive prefix sum over the first NUM_BUCKET*cb.m_nWGs entries of wHistogram1; addressing uses only the local id, so it is presumably launched as a single work-group - confirm.
+//	Capacity: requires NUM_BUCKET*nWGs < 128*nPerWI, because each of the 128 work-items handles nPerWI consecutive entries.
+__kernel
+__attribute__((reqd_work_group_size(128,1,1)))
+void PrefixScanKernel( __global u32* wHistogram1, int4  cb )
+{
+	__local u32 ldsTopScanData[128*2];	// scratch for localPrefixSum128V
+
+	u32 lIdx = GET_LOCAL_IDX;
+	u32 wgIdx = GET_GROUP_IDX;	// not used below
+	const int nWGs = cb.m_nWGs;
+	// Load this work-item's nPerWI consecutive entries; entries past the end of the histogram read as 0.
+	u32 data[nPerWI];
+	for(int i=0; i<nPerWI; i++)
+	{
+		data[i] = 0;
+		if( (nPerWI*lIdx+i) < NUM_BUCKET*nWGs )
+			data[i] = wHistogram1[nPerWI*lIdx+i];
+	}
+	// Split the nPerWI entries into four lanes of nPerLane consecutive entries and total each lane.
+	uint4 myData = make_uint4(0,0,0,0);
+
+	for(int i=0; i<nPerLane; i++)
+	{
+		myData.x += data[nPerLane*0+i];
+		myData.y += data[nPerLane*1+i];
+		myData.z += data[nPerLane*2+i];
+		myData.w += data[nPerLane*3+i];
+	}
+	// Scan the lane totals across the whole group; each component of `scanned` is the sum of everything before that lane.
+	uint totalSum;	// filled in by localPrefixSum128V but not read afterwards
+	uint4 scanned = localPrefixSum128V( myData, lIdx, &totalSum, ldsTopScanData );
+	// Exclusive scan inside each lane, written out four times by hand (see the original author's remark on the next line).
+//	for(int j=0; j<4; j++) //	somehow it introduces a lot of branches
+	{	int j = 0;
+		u32 sum = 0;
+		for(int i=0; i<nPerLane; i++)
+		{
+			u32 tmp = data[nPerLane*j+i];
+			data[nPerLane*j+i] = sum;
+			sum += tmp;
+		}
+	}
+	{	int j = 1;
+		u32 sum = 0;
+		for(int i=0; i<nPerLane; i++)
+		{
+			u32 tmp = data[nPerLane*j+i];
+			data[nPerLane*j+i] = sum;
+			sum += tmp;
+		}
+	}
+	{	int j = 2;
+		u32 sum = 0;
+		for(int i=0; i<nPerLane; i++)
+		{
+			u32 tmp = data[nPerLane*j+i];
+			data[nPerLane*j+i] = sum;
+			sum += tmp;
+		}
+	}
+	{	int j = 3;
+		u32 sum = 0;
+		for(int i=0; i<nPerLane; i++)
+		{
+			u32 tmp = data[nPerLane*j+i];
+			data[nPerLane*j+i] = sum;
+			sum += tmp;
+		}
+	}
+	// Add each lane's group-wide starting offset to its locally scanned entries.
+	for(int i=0; i<nPerLane; i++)
+	{
+		data[nPerLane*0+i] += scanned.x;
+		data[nPerLane*1+i] += scanned.y;
+		data[nPerLane*2+i] += scanned.z;
+		data[nPerLane*3+i] += scanned.w;
+	}
+	// Write back only the entries that exist in the histogram.
+	for(int i=0; i<nPerWI; i++)
+	{
+		int index = nPerWI*lIdx+i;
+		if (index < NUM_BUCKET*nWGs)
+			wHistogram1[nPerWI*lIdx+i] = data[i];
+	}
+}
+// Reorders the group's keys (4 per work-item, held in sortData) by the BITS_PER_PASS bits starting at startBit, one bit per pass; uses localPrefixSum64V, i.e. a 64 work-item group.
+//	4 scans, 4 exchanges (one scan and one exchange through ldsSortData per bit)
+void sort4Bits(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData)
+{
+	for(int bitIdx=0; bitIdx<BITS_PER_PASS; bitIdx++)
+	{
+		u32 mask = (1<<bitIdx);
+		uint4 cmpResult = make_uint4( (sortData[0]>>startBit) & mask, (sortData[1]>>startBit) & mask, (sortData[2]>>startBit) & mask, (sortData[3]>>startBit) & mask );	// non-zero where the current bit is set
+		uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) );	// 1 for keys whose bit is clear, 0 otherwise
+		u32 total;	// receives the number of keys in the group whose bit is clear
+		prefixSum = localPrefixSum64V( prefixSum, lIdx, &total, ldsSortData );	// now: count of bit-clear keys that precede each key
+		{
+			uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);	// current positions of this work-item's four keys
+			uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total );	// bit-set keys go after all bit-clear keys, keeping their relative order
+			dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) );	// bit-clear keys go to their rank among the bit-clear keys
+			// ldsSortData was just used as scan scratch; wait until every work-item is done with it before scattering.
+			GROUP_LDS_BARRIER;
+
+			ldsSortData[dstAddr.x] = sortData[0];
+			ldsSortData[dstAddr.y] = sortData[1];
+			ldsSortData[dstAddr.z] = sortData[2];
+			ldsSortData[dstAddr.w] = sortData[3];
+			// All keys must be scattered before any work-item reads its four slots back.
+			GROUP_LDS_BARRIER;
+
+			sortData[0] = ldsSortData[localAddr.x];
+			sortData[1] = ldsSortData[localAddr.y];
+			sortData[2] = ldsSortData[localAddr.z];
+			sortData[3] = ldsSortData[localAddr.w];
+			// Reads must finish before the next pass overwrites ldsSortData.
+			GROUP_LDS_BARRIER;
+		}
+	}
+}
+// Reorders the group's keys (4 per work-item, held in sortData) by the BITS_PER_PASS bits starting at startBit, two bits per pass, counting the four possible 2-bit digits in the four bytes of one u32.
+//	2 scans, 2 exchanges (one scan and one exchange through ldsSortData per 2-bit digit)
+void sort4Bits1(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData)
+{
+	for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2)
+	{
+		uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, 
+			(sortData[1]>>(startBit+ibit)) & 0x3, 
+			(sortData[2]>>(startBit+ibit)) & 0x3, 
+			(sortData[3]>>(startBit+ibit)) & 0x3);	// the current 2-bit digit of each of the four keys
+		// Byte d of key4 = how many of this work-item's four keys have digit d.
+		u32 key4;
+		u32 sKeyPacked[4] = { 0, 0, 0, 0 };
+		{
+			sKeyPacked[0] |= 1<<(8*b.x);
+			sKeyPacked[1] |= 1<<(8*b.y);
+			sKeyPacked[2] |= 1<<(8*b.z);
+			sKeyPacked[3] |= 1<<(8*b.w);
+
+			key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];
+		}
+		// NOTE(review): all four counters are scanned as one u32, so each running count has to fit in 8 bits - confirm this holds for WG_SIZE*4 keys.
+		u32 rankPacked;	// per digit: count over lower-numbered work-items
+		u32 sumPacked;	// per digit: count over the whole group
+		{
+			rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );
+		}
+
+		GROUP_LDS_BARRIER;
+
+		u32 newOffset[4] = { 0,0,0,0 };
+		{
+			u32 sumScanned = bit8Scan( sumPacked );	// per digit: where that digit's run starts in the reordered block
+
+			u32 scannedKeys[4];
+			scannedKeys[0] = 1<<(8*b.x);
+			scannedKeys[1] = 1<<(8*b.y);
+			scannedKeys[2] = 1<<(8*b.z);
+			scannedKeys[3] = 1<<(8*b.w);
+			{	//	4 scans at once
+				u32 sum4 = 0;
+				for(int ie=0; ie<4; ie++)
+				{
+					u32 tmp = scannedKeys[ie];
+					scannedKeys[ie] = sum4;	// per digit: count among this work-item's earlier keys
+					sum4 += tmp;
+				}
+			}
+			// Destination = digit run start + same-digit keys in lower work-items + same-digit keys earlier in this work-item; only the byte for the key's own digit is extracted.
+			{
+				u32 sumPlusRank = sumScanned + rankPacked;
+				{	u32 ie = b.x;
+					scannedKeys[0] += sumPlusRank;
+					newOffset[0] = unpack4Key( scannedKeys[0], ie );
+				}
+				{	u32 ie = b.y;
+					scannedKeys[1] += sumPlusRank;
+					newOffset[1] = unpack4Key( scannedKeys[1], ie );
+				}
+				{	u32 ie = b.z;
+					scannedKeys[2] += sumPlusRank;
+					newOffset[2] = unpack4Key( scannedKeys[2], ie );
+				}
+				{	u32 ie = b.w;
+					scannedKeys[3] += sumPlusRank;
+					newOffset[3] = unpack4Key( scannedKeys[3], ie );
+				}
+			}
+		}
+
+		// ldsSortData was used as scan scratch above and is reused for the exchange below.
+		GROUP_LDS_BARRIER;
+
+		{
+			ldsSortData[newOffset[0]] = sortData[0];
+			ldsSortData[newOffset[1]] = sortData[1];
+			ldsSortData[newOffset[2]] = sortData[2];
+			ldsSortData[newOffset[3]] = sortData[3];
+			// All keys must be scattered before any work-item reads its four slots back.
+			GROUP_LDS_BARRIER;
+
+			u32 dstAddr = 4*lIdx;	// each work-item reads back four consecutive slots
+			sortData[0] = ldsSortData[dstAddr+0];
+			sortData[1] = ldsSortData[dstAddr+1];
+			sortData[2] = ldsSortData[dstAddr+2];
+			sortData[3] = ldsSortData[dstAddr+3];
+			// Reads must finish before the next pass overwrites ldsSortData.
+			GROUP_LDS_BARRIER;
+		}
+	}
+}
+
+#define SET_HISTOGRAM(setIdx, key) ldsSortData[(setIdx)*NUM_BUCKET+key]
+// For each block of ELEMENTS_PER_WORK_ITEM*WG_SIZE keys assigned to this work-group: reorder the block locally by the 4-bit digit at cb.m_startBit, then write every key to gDst at (per-bucket base from rHistogram) + (its rank inside its bucket).
+__kernel
+__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
+void SortAndScatterKernel( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4  cb )
+{
+	__local u32 ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];	// scratch: scan/exchange buffer for sort4Bits1, then per-set histograms
+	__local u32 localHistogramToCarry[NUM_BUCKET];	// per bucket: next free position in gDst for this work-group
+	__local u32 localHistogram[NUM_BUCKET*2];	// lower half kept zero, upper half holds the per-block bucket counts / their scan
+
+	u32 gIdx = GET_GLOBAL_IDX;	// not used below
+	u32 lIdx = GET_LOCAL_IDX;
+	u32 wgIdx = GET_GROUP_IDX;
+	u32 wgSize = GET_GROUP_SIZE;	// not used below
+
+	const int n = cb.m_n;
+	const int nWGs = cb.m_nWGs;
+	const int startBit = cb.m_startBit;
+	const int nBlocksPerWG = cb.m_nBlocksPerWG;
+	// NOTE(review): rHistogram is read with the same bucket-major layout the StreamCount kernels write; presumably it has been prefix-scanned in between - confirm against the host code.
+	if( lIdx < (NUM_BUCKET) )
+	{
+		localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];
+	}
+
+	GROUP_LDS_BARRIER;
+
+	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;
+	// Whole blocks from this group's first block to the end of the input (n/blockSize truncates).
+	int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;
+	// First key of this work-item in the group's first block.
+	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;
+
+	for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)
+	{
+		u32 myHistogram = 0;	// for lIdx < NUM_BUCKET: this block's count for bucket lIdx
+		// The loop body below is the single statement selected by the preprocessor.
+		u32 sortData[ELEMENTS_PER_WORK_ITEM];
+		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
+#if defined(CHECK_BOUNDARY)
+			sortData[i] = ( addr+i < n )? gSrc[ addr+i ] : 0xffffffff;
+#else
+			sortData[i] = gSrc[ addr+i ];
+#endif
+		// Reorder the block by the current 4-bit digit; afterwards sortData holds slots 4*lIdx..4*lIdx+3 of the reordered block.
+		sort4Bits1(sortData, startBit, lIdx, ldsSortData);
+
+		u32 keys[ELEMENTS_PER_WORK_ITEM];
+		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
+			keys[i] = (sortData[i]>>startBit) & 0xf;
+
+		{	//	create histogram
+			u32 setIdx = lIdx/16;	// groups of 16 work-items share one set of NUM_BUCKET counters
+			if( lIdx < NUM_BUCKET )
+			{
+				localHistogram[lIdx] = 0;
+			}
+			ldsSortData[lIdx] = 0;	// clears one counter per work-item, i.e. WG_SIZE counters in total
+			GROUP_LDS_BARRIER;
+			// The loop body is again the single statement selected by the preprocessor.
+			for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
+#if defined(CHECK_BOUNDARY)
+				if( addr+i < n )
+#endif
+
+#if defined(NV_GPU)
+				SET_HISTOGRAM( setIdx, keys[i] )++;
+#else
+				AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) );
+#endif
+			
+			GROUP_LDS_BARRIER;
+			
+			uint hIdx = NUM_BUCKET+lIdx;	// this work-item's slot in the upper half of localHistogram
+			if( lIdx < NUM_BUCKET )
+			{
+				u32 sum = 0;
+				for(int i=0; i<WG_SIZE/16; i++)
+				{
+					sum += SET_HISTOGRAM( i, lIdx );	// add bucket lIdx across all sets
+				}
+				myHistogram = sum;
+				localHistogram[hIdx] = sum;
+			}
+			GROUP_LDS_BARRIER;
+			// Exclusive scan over the NUM_BUCKET counts. NOTE(review): the steps are separated only by mem_fence, so this appears to rely on the first NUM_BUCKET work-items running in lockstep - confirm.
+#if defined(USE_2LEVEL_REDUCE)
+			if( lIdx < NUM_BUCKET )
+			{
+				localHistogram[hIdx] = localHistogram[hIdx-1];	// shift up by one slot so the scan comes out exclusive
+				GROUP_MEM_FENCE;
+
+				u32 u0, u1, u2;
+				u0 = localHistogram[hIdx-3];
+				u1 = localHistogram[hIdx-2];
+				u2 = localHistogram[hIdx-1];
+				AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );	// fold in the three slots directly below
+				GROUP_MEM_FENCE;
+				u0 = localHistogram[hIdx-12];
+				u1 = localHistogram[hIdx-8];
+				u2 = localHistogram[hIdx-4];
+				AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );	// same at stride 4
+				GROUP_MEM_FENCE;
+			}
+#else
+			if( lIdx < NUM_BUCKET )
+			{
+				localHistogram[hIdx] = localHistogram[hIdx-1];	// shift up by one slot so the scan comes out exclusive
+				GROUP_MEM_FENCE;
+				localHistogram[hIdx] += localHistogram[hIdx-1];
+				GROUP_MEM_FENCE;
+				localHistogram[hIdx] += localHistogram[hIdx-2];
+				GROUP_MEM_FENCE;
+				localHistogram[hIdx] += localHistogram[hIdx-4];
+				GROUP_MEM_FENCE;
+				localHistogram[hIdx] += localHistogram[hIdx-8];
+				GROUP_MEM_FENCE;
+			}
+#endif
+			GROUP_LDS_BARRIER;
+		}
+		// Scatter: position in the reordered block minus the bucket's start gives the rank inside the bucket.
+		{
+			for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)
+			{
+				int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie;
+				int binIdx = keys[ie];
+				int groupOffset = localHistogramToCarry[binIdx];
+				int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];
+#if defined(CHECK_BOUNDARY)
+				if( addr+ie < n )
+#endif
+				gDst[ groupOffset + myIdx ] = sortData[ie];
+			}
+		}
+		// Every work-item must have read the bucket bases before they are advanced for the next block.
+		GROUP_LDS_BARRIER;
+
+		if( lIdx < NUM_BUCKET )
+		{
+			localHistogramToCarry[lIdx] += myHistogram;
+		}
+		GROUP_LDS_BARRIER;
+	}
+}
+
+//	2 scan, 2 exchange: reorders the work-group's key/value pairs (4 per work-item, in sortData/sortVal) by BITS_PER_PASS key bits from startBit, one 2-bit digit per round; ldsSortData/ldsSortVal are local scratch buffers and lIdx is used as the work-item's slot index (it reloads slots 4*lIdx..4*lIdx+3).
+void sort4Bits1KeyValue(u32 sortData[4], int sortVal[4], int startBit, int lIdx, __local u32* ldsSortData, __local int *ldsSortVal)
+{
+	for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2)
+	{
+		uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, 
+			(sortData[1]>>(startBit+ibit)) & 0x3, 
+			(sortData[2]>>(startBit+ibit)) & 0x3, 
+			(sortData[3]>>(startBit+ibit)) & 0x3);	//	b = current 2-bit digit of each of the 4 keys
+
+		u32 key4;	//	this work-item's digit counts, one 8-bit lane per digit value (0..3)
+		u32 sKeyPacked[4] = { 0, 0, 0, 0 };
+		{
+			sKeyPacked[0] |= 1<<(8*b.x);	//	a single 1 in the byte lane selected by the digit
+			sKeyPacked[1] |= 1<<(8*b.y);
+			sKeyPacked[2] |= 1<<(8*b.z);
+			sKeyPacked[3] |= 1<<(8*b.w);
+
+			key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];
+		}
+
+		u32 rankPacked;	//	from localPrefixSum: presumably the packed counts summed over all lower-indexed work-items -- confirm
+		u32 sumPacked;	//	from localPrefixSum: presumably the packed counts summed over the whole work-group -- confirm
+		{
+			rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );	//	ldsSortData doubles as the scan's scratch memory
+		}
+
+		GROUP_LDS_BARRIER;
+
+		u32 newOffset[4] = { 0,0,0,0 };	//	slot of each element in ldsSortData/ldsSortVal after this round
+		{
+			u32 sumScanned = bit8Scan( sumPacked );	//	lane d = combined total of all digits smaller than d
+
+			u32 scannedKeys[4];
+			scannedKeys[0] = 1<<(8*b.x);
+			scannedKeys[1] = 1<<(8*b.y);
+			scannedKeys[2] = 1<<(8*b.z);
+			scannedKeys[3] = 1<<(8*b.w);
+			{	//	4 scans at once: exclusive scan over the work-item's own 4 elements, every lane in parallel
+				u32 sum4 = 0;
+				for(int ie=0; ie<4; ie++)
+				{
+					u32 tmp = scannedKeys[ie];
+					scannedKeys[ie] = sum4;
+					sum4 += tmp;
+				}
+			}
+
+			{
+				u32 sumPlusRank = sumScanned + rankPacked;	//	per lane: sumScanned plus the rank returned by localPrefixSum
+				{	u32 ie = b.x;
+					scannedKeys[0] += sumPlusRank;
+					newOffset[0] = unpack4Key( scannedKeys[0], ie );	//	keep only the lane of this element's own digit
+				}
+				{	u32 ie = b.y;
+					scannedKeys[1] += sumPlusRank;
+					newOffset[1] = unpack4Key( scannedKeys[1], ie );
+				}
+				{	u32 ie = b.z;
+					scannedKeys[2] += sumPlusRank;
+					newOffset[2] = unpack4Key( scannedKeys[2], ie );
+				}
+				{	u32 ie = b.w;
+					scannedKeys[3] += sumPlusRank;
+					newOffset[3] = unpack4Key( scannedKeys[3], ie );
+				}
+			}
+		}
+
+		//	NOTE(review): offsets are unpacked from 8-bit lanes, so slots are limited to 0..255 -- presumably assumes WG_SIZE*4 <= 256, confirm
+		GROUP_LDS_BARRIER;
+
+		{	//	exchange: scatter keys and values to their new slots, then reload the 4 consecutive slots starting at 4*lIdx
+			ldsSortData[newOffset[0]] = sortData[0];
+			ldsSortData[newOffset[1]] = sortData[1];
+			ldsSortData[newOffset[2]] = sortData[2];
+			ldsSortData[newOffset[3]] = sortData[3];
+
+			ldsSortVal[newOffset[0]] = sortVal[0];
+			ldsSortVal[newOffset[1]] = sortVal[1];
+			ldsSortVal[newOffset[2]] = sortVal[2];
+			ldsSortVal[newOffset[3]] = sortVal[3];
+
+			GROUP_LDS_BARRIER;
+
+			u32 dstAddr = 4*lIdx;
+			sortData[0] = ldsSortData[dstAddr+0];
+			sortData[1] = ldsSortData[dstAddr+1];
+			sortData[2] = ldsSortData[dstAddr+2];
+			sortData[3] = ldsSortData[dstAddr+3];
+
+			sortVal[0] = ldsSortVal[dstAddr+0];
+			sortVal[1] = ldsSortVal[dstAddr+1];
+			sortVal[2] = ldsSortVal[dstAddr+2];
+			sortVal[3] = ldsSortVal[dstAddr+3];
+
+			GROUP_LDS_BARRIER;	//	presumably so that all reloads finish before the buffers are written again -- confirm
+		}
+	}
+}
+
+
+
+
+//	Scatter pass of the radix sort for key/value records (SortDataCL).
+//	Each work-group walks its blocks of ELEMENTS_PER_WORK_ITEM*WG_SIZE records: a block is sorted in local memory by the
+//	digit at cb.m_startBit, its digits are histogrammed, and every record is stored to gDst at (bucket offset + rank in bucket).
+//	gSrc: input records. rHistogram: bucket start offsets, read as rHistogram[digit*nWGs + wgIdx]. gDst: output records.
+//	cb: (m_n, m_nWGs, m_startBit, m_nBlocksPerWG) = (record count, work-group count, first bit of the digit, blocks per work-group).
+__kernel
+__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
+void SortAndScatterSortDataKernel( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb)
+{
+	//	keys are kept as u32, the type sort4Bits1KeyValue takes for its key array and its key scratch buffer
+	__local u32 ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];
+	__local int ldsSortVal[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];
+	__local u32 localHistogramToCarry[NUM_BUCKET];	//	next free destination offset of every bucket
+	__local u32 localHistogram[NUM_BUCKET*2];	//	upper half: scanned block histogram; lower half: zeros read by the scan
+
+	u32 gIdx = GET_GLOBAL_IDX;
+	u32 lIdx = GET_LOCAL_IDX;
+	u32 wgIdx = GET_GROUP_IDX;
+	u32 wgSize = GET_GROUP_SIZE;
+
+	const int n = cb.m_n;
+	const int nWGs = cb.m_nWGs;
+	const int startBit = cb.m_startBit;
+	const int nBlocksPerWG = cb.m_nBlocksPerWG;
+
+	if( lIdx < (NUM_BUCKET) )
+	{
+		localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];
+	}
+
+	GROUP_LDS_BARRIER;
+
+
+	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;
+
+	//	only whole blocks are counted; the blocks of lower-indexed work-groups are subtracted
+	int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;
+
+	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;
+
+	for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)
+	{
+
+		u32 myHistogram = 0;
+
+		u32 sortData[ELEMENTS_PER_WORK_ITEM];
+		int sortVal[ELEMENTS_PER_WORK_ITEM];
+
+		//	load this work-item's records; with CHECK_BOUNDARY, slots past n are padded with 0xffffffff
+		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
+#if defined(CHECK_BOUNDARY)
+		{
+			sortData[i] = ( addr+i < n )? gSrc[ addr+i ].m_key : 0xffffffff;
+			sortVal[i] = ( addr+i < n )? gSrc[ addr+i ].m_value : 0xffffffff;
+		}
+#else
+		{
+			sortData[i] = gSrc[ addr+i ].m_key;
+			sortVal[i] = gSrc[ addr+i ].m_value;
+		}
+#endif
+
+		//	reorder the block's records by the digit, through local memory
+		sort4Bits1KeyValue(sortData, sortVal, startBit, lIdx, ldsSortData, ldsSortVal);
+
+		u32 keys[ELEMENTS_PER_WORK_ITEM];
+		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
+			keys[i] = (sortData[i]>>startBit) & 0xf;
+
+		{	//	create histogram
+			u32 setIdx = lIdx/16;	//	each run of 16 work-items shares one sub-histogram kept in ldsSortData
+			if( lIdx < NUM_BUCKET )
+			{
+				localHistogram[lIdx] = 0;
+			}
+			ldsSortData[lIdx] = 0;
+			GROUP_LDS_BARRIER;
+
+			for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
+#if defined(CHECK_BOUNDARY)
+				if( addr+i < n )
+#endif
+
+#if defined(NV_GPU)
+				SET_HISTOGRAM( setIdx, keys[i] )++;
+#else
+				AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) );
+#endif
+
+			GROUP_LDS_BARRIER;
+
+			uint hIdx = NUM_BUCKET+lIdx;
+			if( lIdx < NUM_BUCKET )
+			{
+				u32 sum = 0;
+				for(int i=0; i<WG_SIZE/16; i++)
+				{
+					sum += SET_HISTOGRAM( i, lIdx );
+				}
+				myHistogram = sum;
+				localHistogram[hIdx] = sum;
+			}
+			GROUP_LDS_BARRIER;
+
+			//	exclusive scan of the block histogram, in place in the upper half of localHistogram
+			//	NOTE(review): the steps are separated by fences only, not barriers; presumably relies on the NUM_BUCKET lanes running in lockstep
+#if defined(USE_2LEVEL_REDUCE)
+			if( lIdx < NUM_BUCKET )
+			{
+				localHistogram[hIdx] = localHistogram[hIdx-1];
+				GROUP_MEM_FENCE;
+
+				u32 u0, u1, u2;
+				u0 = localHistogram[hIdx-3];
+				u1 = localHistogram[hIdx-2];
+				u2 = localHistogram[hIdx-1];
+				AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );
+				GROUP_MEM_FENCE;
+				u0 = localHistogram[hIdx-12];
+				u1 = localHistogram[hIdx-8];
+				u2 = localHistogram[hIdx-4];
+				AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );
+				GROUP_MEM_FENCE;
+			}
+#else
+			if( lIdx < NUM_BUCKET )
+			{
+				localHistogram[hIdx] = localHistogram[hIdx-1];
+				GROUP_MEM_FENCE;
+				localHistogram[hIdx] += localHistogram[hIdx-1];
+				GROUP_MEM_FENCE;
+				localHistogram[hIdx] += localHistogram[hIdx-2];
+				GROUP_MEM_FENCE;
+				localHistogram[hIdx] += localHistogram[hIdx-4];
+				GROUP_MEM_FENCE;
+				localHistogram[hIdx] += localHistogram[hIdx-8];
+				GROUP_MEM_FENCE;
+			}
+#endif
+			GROUP_LDS_BARRIER;
+		}
+
+		{
+			for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)
+			{
+				int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie;
+				int binIdx = keys[ie];
+				int groupOffset = localHistogramToCarry[binIdx];
+				int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];	//	rank of the record inside its bucket
+#if defined(CHECK_BOUNDARY)
+				//	skip the padding slots past the end of the input
+				if( addr+ie < n )
+#endif
+				{
+					//	store every in-range record (the CHECK_BOUNDARY path used to store only records whose key equalled their value)
+					if ((groupOffset + myIdx)<n)
+					{
+						gDst[ groupOffset + myIdx ].m_key = sortData[ie];
+						gDst[ groupOffset + myIdx ].m_value = sortVal[ie];
+					}
+				}
+			}
+		}
+
+		GROUP_LDS_BARRIER;
+
+		//	advance the bucket offsets by this block's counts before the next block is scattered
+		if( lIdx < NUM_BUCKET )
+		{
+			localHistogramToCarry[lIdx] += myHistogram;
+		}
+		GROUP_LDS_BARRIER;
+	}
+}
+
+
+
+
+
+
+
+//	Single-threaded variant of the key/value scatter pass: only work-item 0 of each work-group does any work.
+//	It visits the group's records in address order and appends each one to the bucket chosen by the digit at cb.m_startBit;
+//	a bucket starts at rHistogram[digit*nWGs + wgIdx] in gDst.
+__kernel
+__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
+void SortAndScatterSortDataKernelSerial( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb)
+{
+	if( GET_LOCAL_IDX > 0 )
+		return;
+
+	const u32 groupIdx = GET_GROUP_IDX;
+	const int numRecords = cb.m_n;
+	const int numGroups = cb.m_nWGs;
+	const int firstBit = cb.m_startBit;
+	const int blocksPerGroup = cb.m_nBlocksPerWG;
+
+	//	records already appended to each bucket by this work-group
+	int written[NUM_BUCKET];
+	for( int bucket=0; bucket<NUM_BUCKET; bucket++ )
+	{
+		written[bucket] = 0;
+	}
+
+	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;
+	const int groupBase = blockSize*blocksPerGroup*groupIdx;
+
+	//	only whole blocks are counted; the blocks of lower-indexed work-groups are subtracted
+	const int blocksLeft = numRecords/blockSize - blocksPerGroup*groupIdx;
+	const int blocksToDo = min( blocksPerGroup, blocksLeft );
+
+	for( int iblock=0; iblock<blocksToDo; iblock++ )
+	{
+		const int blockBase = iblock*blockSize + groupBase;
+
+		for( int k=0; k<blockSize; k++ )
+		{
+			const int src = blockBase + k;
+			if( src >= numRecords )
+				continue;
+
+			//	0xf keeps the 4-bit digit that selects the bucket
+			const int digit = (gSrc[src].m_key>>firstBit) & 0xf;
+			gDst[rHistogram[digit*numGroups+groupIdx] + written[digit]] = gSrc[src];
+			written[digit]++;
+		}
+	}
+}
+
+
+//	Single-threaded variant of the keys-only scatter pass: only work-item 0 of each work-group does any work.
+//	It visits the group's keys in address order and appends each one to the bucket chosen by the digit at cb.m_startBit;
+//	a bucket starts at rHistogram[digit*nWGs + wgIdx] in gDst.
+__kernel
+__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
+void SortAndScatterKernelSerial( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4  cb )
+{
+	if( GET_LOCAL_IDX > 0 )
+		return;
+
+	const u32 groupIdx = GET_GROUP_IDX;
+	const int numKeys = cb.m_n;
+	const int numGroups = cb.m_nWGs;
+	const int firstBit = cb.m_startBit;
+	const int blocksPerGroup = cb.m_nBlocksPerWG;
+
+	//	keys already appended to each bucket by this work-group
+	int written[NUM_BUCKET];
+	for( int bucket=0; bucket<NUM_BUCKET; bucket++ )
+	{
+		written[bucket] = 0;
+	}
+
+	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;
+	const int groupBase = blockSize*blocksPerGroup*groupIdx;
+
+	//	only whole blocks are counted; the blocks of lower-indexed work-groups are subtracted
+	const int blocksLeft = numKeys/blockSize - blocksPerGroup*groupIdx;
+	const int blocksToDo = min( blocksPerGroup, blocksLeft );
+
+	for( int iblock=0; iblock<blocksToDo; iblock++ )
+	{
+		const int blockBase = iblock*blockSize + groupBase;
+
+		for( int k=0; k<blockSize; k++ )
+		{
+			const int src = blockBase + k;
+			if( src >= numKeys )
+				continue;
+
+			//	0xf keeps the 4-bit digit that selects the bucket
+			const int digit = (gSrc[src]>>firstBit) & 0xf;
+			gDst[rHistogram[digit*numGroups+groupIdx] + written[digit]] = gSrc[src];
+			written[digit]++;
+		}
+	}
+}
\ No newline at end of file
diff --git a/opencl/parallel_primitives/kernels/RadixSort32KernelsCL.h b/opencl/parallel_primitives/kernels/RadixSort32KernelsCL.h
new file mode 100644
index 000000000..464829c3b
--- /dev/null
+++ b/opencl/parallel_primitives/kernels/RadixSort32KernelsCL.h
@@ -0,0 +1,1074 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+static const char* radixSort32KernelsCL= \
+"/*\n"
+"Bullet Continuous Collision Detection and Physics Library\n"
+"Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org\n"
+"\n"
+"This software is provided 'as-is', without any express or implied warranty.\n"
+"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+"Permission is granted to anyone to use this software for any purpose, \n"
+"including commercial applications, and to alter it and redistribute it freely, \n"
+"subject to the following restrictions:\n"
+"\n"
+"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+"3. This notice may not be removed or altered from any source distribution.\n"
+"*/\n"
+"//Author Takahiro Harada\n"
+"\n"
+"\n"
+"//#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+"\n"
+"typedef unsigned int u32;\n"
+"#define GET_GROUP_IDX get_group_id(0)\n"
+"#define GET_LOCAL_IDX get_local_id(0)\n"
+"#define GET_GLOBAL_IDX get_global_id(0)\n"
+"#define GET_GROUP_SIZE get_local_size(0)\n"
+"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
+"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
+"#define AtomInc(x) atom_inc(&(x))\n"
+"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
+"#define AtomAdd(x, value) atom_add(&(x), value)\n"
+"\n"
+"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
+"\n"
+"\n"
+"#define make_uint4 (uint4)\n"
+"#define make_uint2 (uint2)\n"
+"#define make_int2 (int2)\n"
+"\n"
+"#define WG_SIZE 64\n"
+"#define ELEMENTS_PER_WORK_ITEM (256/WG_SIZE)\n"
+"#define BITS_PER_PASS 4\n"
+"#define NUM_BUCKET (1<<BITS_PER_PASS)\n"
+"typedef uchar u8;\n"
+"\n"
+"//	this isn't optimization for VLIW. But just reducing writes. \n"
+"#define USE_2LEVEL_REDUCE 1\n"
+"\n"
+"//#define CHECK_BOUNDARY 1\n"
+"\n"
+"//#define NV_GPU 1\n"
+"\n"
+"\n"
+"//	Cypress\n"
+"#define nPerWI 16\n"
+"//	Cayman\n"
+"//#define nPerWI 20\n"
+"\n"
+"#define m_n x\n"
+"#define m_nWGs y\n"
+"#define m_startBit z\n"
+"#define m_nBlocksPerWG w\n"
+"\n"
+"/*\n"
+"typedef struct\n"
+"{\n"
+"	int m_n;\n"
+"	int m_nWGs;\n"
+"	int m_startBit;\n"
+"	int m_nBlocksPerWG;\n"
+"} ConstBuffer;\n"
+"*/\n"
+"\n"
+"typedef struct\n"
+"{\n"
+"	unsigned int m_key;\n"
+"	unsigned int m_value;\n"
+"} SortDataCL;\n"
+"\n"
+"\n"
+"uint prefixScanVectorEx( uint4* data )\n"
+"{\n"
+"	u32 sum = 0;\n"
+"	u32 tmp = data[0].x;\n"
+"	data[0].x = sum;\n"
+"	sum += tmp;\n"
+"	tmp = data[0].y;\n"
+"	data[0].y = sum;\n"
+"	sum += tmp;\n"
+"	tmp = data[0].z;\n"
+"	data[0].z = sum;\n"
+"	sum += tmp;\n"
+"	tmp = data[0].w;\n"
+"	data[0].w = sum;\n"
+"	sum += tmp;\n"
+"	return sum;\n"
+"}\n"
+"\n"
+"u32 localPrefixSum( u32 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory, int wgSize /*64 or 128*/ )\n"
+"{\n"
+"	{	//	Set data\n"
+"		sorterSharedMemory[lIdx] = 0;\n"
+"		sorterSharedMemory[lIdx+wgSize] = pData;\n"
+"	}\n"
+"\n"
+"	GROUP_LDS_BARRIER;\n"
+"\n"
+"	{	//	Prefix sum\n"
+"		int idx = 2*lIdx + (wgSize+1);\n"
+"#if defined(USE_2LEVEL_REDUCE)\n"
+"		if( lIdx < 64 )\n"
+"		{\n"
+"			u32 u0, u1, u2;\n"
+"			u0 = sorterSharedMemory[idx-3];\n"
+"			u1 = sorterSharedMemory[idx-2];\n"
+"			u2 = sorterSharedMemory[idx-1];\n"
+"			AtomAdd( sorterSharedMemory[idx], u0+u1+u2 );			\n"
+"			GROUP_MEM_FENCE;\n"
+"\n"
+"			u0 = sorterSharedMemory[idx-12];\n"
+"			u1 = sorterSharedMemory[idx-8];\n"
+"			u2 = sorterSharedMemory[idx-4];\n"
+"			AtomAdd( sorterSharedMemory[idx], u0+u1+u2 );			\n"
+"			GROUP_MEM_FENCE;\n"
+"\n"
+"			u0 = sorterSharedMemory[idx-48];\n"
+"			u1 = sorterSharedMemory[idx-32];\n"
+"			u2 = sorterSharedMemory[idx-16];\n"
+"			AtomAdd( sorterSharedMemory[idx], u0+u1+u2 );			\n"
+"			GROUP_MEM_FENCE;\n"
+"			if( wgSize > 64 )\n"
+"			{\n"
+"				sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n"
+"				GROUP_MEM_FENCE;\n"
+"			}\n"
+"\n"
+"			sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n"
+"			GROUP_MEM_FENCE;\n"
+"		}\n"
+"#else\n"
+"		if( lIdx < 64 )\n"
+"		{\n"
+"			sorterSharedMemory[idx] += sorterSharedMemory[idx-1];\n"
+"			GROUP_MEM_FENCE;\n"
+"			sorterSharedMemory[idx] += sorterSharedMemory[idx-2];			\n"
+"			GROUP_MEM_FENCE;\n"
+"			sorterSharedMemory[idx] += sorterSharedMemory[idx-4];\n"
+"			GROUP_MEM_FENCE;\n"
+"			sorterSharedMemory[idx] += sorterSharedMemory[idx-8];\n"
+"			GROUP_MEM_FENCE;\n"
+"			sorterSharedMemory[idx] += sorterSharedMemory[idx-16];\n"
+"			GROUP_MEM_FENCE;\n"
+"			sorterSharedMemory[idx] += sorterSharedMemory[idx-32];\n"
+"			GROUP_MEM_FENCE;\n"
+"			if( wgSize > 64 )\n"
+"			{\n"
+"				sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n"
+"				GROUP_MEM_FENCE;\n"
+"			}\n"
+"\n"
+"			sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n"
+"			GROUP_MEM_FENCE;\n"
+"		}\n"
+"#endif\n"
+"	}\n"
+"\n"
+"	GROUP_LDS_BARRIER;\n"
+"\n"
+"	*totalSum = sorterSharedMemory[wgSize*2-1];\n"
+"	u32 addValue = sorterSharedMemory[lIdx+wgSize-1];\n"
+"	return addValue;\n"
+"}\n"
+"\n"
+"//__attribute__((reqd_work_group_size(128,1,1)))\n"
+"uint4 localPrefixSum128V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )\n"
+"{\n"
+"	u32 s4 = prefixScanVectorEx( &pData );\n"
+"	u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 128 );\n"
+"	return pData + make_uint4( rank, rank, rank, rank );\n"
+"}\n"
+"\n"
+"\n"
+"//__attribute__((reqd_work_group_size(64,1,1)))\n"
+"uint4 localPrefixSum64V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )\n"
+"{\n"
+"	u32 s4 = prefixScanVectorEx( &pData );\n"
+"	u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 64 );\n"
+"	return pData + make_uint4( rank, rank, rank, rank );\n"
+"}\n"
+"\n"
+"u32 unpack4Key( u32 key, int keyIdx ){ return (key>>(keyIdx*8)) & 0xff;}\n"
+"\n"
+"u32 bit8Scan(u32 v)\n"
+"{\n"
+"	return (v<<8) + (v<<16) + (v<<24);\n"
+"}\n"
+"\n"
+"//===\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"#define MY_HISTOGRAM(idx) localHistogramMat[(idx)*WG_SIZE+lIdx]\n"
+"\n"
+"\n"
+"__kernel\n"
+"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
+"void StreamCountKernel( __global u32* gSrc, __global u32* histogramOut, int4 cb )\n"
+"{\n"
+"	__local u32 localHistogramMat[NUM_BUCKET*WG_SIZE];\n"
+"\n"
+"	u32 gIdx = GET_GLOBAL_IDX;\n"
+"	u32 lIdx = GET_LOCAL_IDX;\n"
+"	u32 wgIdx = GET_GROUP_IDX;\n"
+"	u32 wgSize = GET_GROUP_SIZE;\n"
+"	const int startBit = cb.m_startBit;\n"
+"	const int n = cb.m_n;\n"
+"	const int nWGs = cb.m_nWGs;\n"
+"	const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
+"\n"
+"	for(int i=0; i<NUM_BUCKET; i++)\n"
+"	{\n"
+"		MY_HISTOGRAM(i) = 0;\n"
+"	}\n"
+"\n"
+"	GROUP_LDS_BARRIER;\n"
+"\n"
+"	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
+"	u32 localKey;\n"
+"\n"
+"	int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n"
+"\n"
+"	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
+"\n"
+"	for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n"
+"	{\n"
+"		//	MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD\n"
+"		//	Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops\n"
+"		//	AMD: AtomInc performs better while NV prefers ++\n"
+"		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
+"		{\n"
+"#if defined(CHECK_BOUNDARY)\n"
+"			if( addr+i < n )\n"
+"#endif\n"
+"			{\n"
+"				localKey = (gSrc[addr+i]>>startBit) & 0xf;\n"
+"#if defined(NV_GPU)\n"
+"				MY_HISTOGRAM( localKey )++;\n"
+"#else\n"
+"				AtomInc( MY_HISTOGRAM( localKey ) );\n"
+"#endif\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"\n"
+"	GROUP_LDS_BARRIER;\n"
+"	\n"
+"	if( lIdx < NUM_BUCKET )\n"
+"	{\n"
+"		u32 sum = 0;\n"
+"		for(int i=0; i<GET_GROUP_SIZE; i++)\n"
+"		{\n"
+"			sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE];\n"
+"		}\n"
+"		histogramOut[lIdx*nWGs+wgIdx] = sum;\n"
+"	}\n"
+"}\n"
+"\n"
+"__kernel\n"
+"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
+"void StreamCountSortDataKernel( __global SortDataCL* gSrc, __global u32* histogramOut, int4  cb )\n"
+"{\n"
+"	__local u32 localHistogramMat[NUM_BUCKET*WG_SIZE];\n"
+"\n"
+"	u32 gIdx = GET_GLOBAL_IDX;\n"
+"	u32 lIdx = GET_LOCAL_IDX;\n"
+"	u32 wgIdx = GET_GROUP_IDX;\n"
+"	u32 wgSize = GET_GROUP_SIZE;\n"
+"	const int startBit = cb.m_startBit;\n"
+"	const int n = cb.m_n;\n"
+"	const int nWGs = cb.m_nWGs;\n"
+"	const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
+"\n"
+"	for(int i=0; i<NUM_BUCKET; i++)\n"
+"	{\n"
+"		MY_HISTOGRAM(i) = 0;\n"
+"	}\n"
+"\n"
+"	GROUP_LDS_BARRIER;\n"
+"\n"
+"	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
+"	u32 localKey;\n"
+"\n"
+"	int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n"
+"\n"
+"	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
+"\n"
+"	for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n"
+"	{\n"
+"		//	MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD\n"
+"		//	Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops\n"
+"		//	AMD: AtomInc performs better while NV prefers ++\n"
+"		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
+"		{\n"
+"#if defined(CHECK_BOUNDARY)\n"
+"			if( addr+i < n )\n"
+"#endif\n"
+"			{\n"
+"				localKey = (gSrc[addr+i].m_key>>startBit) & 0xf;\n"
+"#if defined(NV_GPU)\n"
+"				MY_HISTOGRAM( localKey )++;\n"
+"#else\n"
+"				AtomInc( MY_HISTOGRAM( localKey ) );\n"
+"#endif\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"\n"
+"	GROUP_LDS_BARRIER;\n"
+"	\n"
+"	if( lIdx < NUM_BUCKET )\n"
+"	{\n"
+"		u32 sum = 0;\n"
+"		for(int i=0; i<GET_GROUP_SIZE; i++)\n"
+"		{\n"
+"			sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE];\n"
+"		}\n"
+"		histogramOut[lIdx*nWGs+wgIdx] = sum;\n"
+"	}\n"
+"}\n"
+"\n"
+"#define nPerLane (nPerWI/4)\n"
+"\n"
+"//	NUM_BUCKET*nWGs < 128*nPerWI\n"
+"__kernel\n"
+"__attribute__((reqd_work_group_size(128,1,1)))\n"
+"void PrefixScanKernel( __global u32* wHistogram1, int4  cb )\n"
+"{\n"
+"	__local u32 ldsTopScanData[128*2];\n"
+"\n"
+"	u32 lIdx = GET_LOCAL_IDX;\n"
+"	u32 wgIdx = GET_GROUP_IDX;\n"
+"	const int nWGs = cb.m_nWGs;\n"
+"\n"
+"	u32 data[nPerWI];\n"
+"	for(int i=0; i<nPerWI; i++)\n"
+"	{\n"
+"		data[i] = 0;\n"
+"		if( (nPerWI*lIdx+i) < NUM_BUCKET*nWGs )\n"
+"			data[i] = wHistogram1[nPerWI*lIdx+i];\n"
+"	}\n"
+"\n"
+"	uint4 myData = make_uint4(0,0,0,0);\n"
+"\n"
+"	for(int i=0; i<nPerLane; i++)\n"
+"	{\n"
+"		myData.x += data[nPerLane*0+i];\n"
+"		myData.y += data[nPerLane*1+i];\n"
+"		myData.z += data[nPerLane*2+i];\n"
+"		myData.w += data[nPerLane*3+i];\n"
+"	}\n"
+"\n"
+"	uint totalSum;\n"
+"	uint4 scanned = localPrefixSum128V( myData, lIdx, &totalSum, ldsTopScanData );\n"
+"\n"
+"//	for(int j=0; j<4; j++) //	somehow it introduces a lot of branches\n"
+"	{	int j = 0;\n"
+"		u32 sum = 0;\n"
+"		for(int i=0; i<nPerLane; i++)\n"
+"		{\n"
+"			u32 tmp = data[nPerLane*j+i];\n"
+"			data[nPerLane*j+i] = sum;\n"
+"			sum += tmp;\n"
+"		}\n"
+"	}\n"
+"	{	int j = 1;\n"
+"		u32 sum = 0;\n"
+"		for(int i=0; i<nPerLane; i++)\n"
+"		{\n"
+"			u32 tmp = data[nPerLane*j+i];\n"
+"			data[nPerLane*j+i] = sum;\n"
+"			sum += tmp;\n"
+"		}\n"
+"	}\n"
+"	{	int j = 2;\n"
+"		u32 sum = 0;\n"
+"		for(int i=0; i<nPerLane; i++)\n"
+"		{\n"
+"			u32 tmp = data[nPerLane*j+i];\n"
+"			data[nPerLane*j+i] = sum;\n"
+"			sum += tmp;\n"
+"		}\n"
+"	}\n"
+"	{	int j = 3;\n"
+"		u32 sum = 0;\n"
+"		for(int i=0; i<nPerLane; i++)\n"
+"		{\n"
+"			u32 tmp = data[nPerLane*j+i];\n"
+"			data[nPerLane*j+i] = sum;\n"
+"			sum += tmp;\n"
+"		}\n"
+"	}\n"
+"\n"
+"	for(int i=0; i<nPerLane; i++)\n"
+"	{\n"
+"		data[nPerLane*0+i] += scanned.x;\n"
+"		data[nPerLane*1+i] += scanned.y;\n"
+"		data[nPerLane*2+i] += scanned.z;\n"
+"		data[nPerLane*3+i] += scanned.w;\n"
+"	}\n"
+"\n"
+"	for(int i=0; i<nPerWI; i++)\n"
+"	{\n"
+"		int index = nPerWI*lIdx+i;\n"
+"		if (index < NUM_BUCKET*nWGs)\n"
+"			wHistogram1[nPerWI*lIdx+i] = data[i];\n"
+"	}\n"
+"}\n"
+"\n"
+"//	4 scan, 4 exchange\n"
+"void sort4Bits(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData)\n"
+"{\n"
+"	for(int bitIdx=0; bitIdx<BITS_PER_PASS; bitIdx++)\n"
+"	{\n"
+"		u32 mask = (1<<bitIdx);\n"
+"		uint4 cmpResult = make_uint4( (sortData[0]>>startBit) & mask, (sortData[1]>>startBit) & mask, (sortData[2]>>startBit) & mask, (sortData[3]>>startBit) & mask );\n"
+"		uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) );\n"
+"		u32 total;\n"
+"		prefixSum = localPrefixSum64V( prefixSum, lIdx, &total, ldsSortData );\n"
+"		{\n"
+"			uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);\n"
+"			uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total );\n"
+"			dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) );\n"
+"\n"
+"			GROUP_LDS_BARRIER;\n"
+"\n"
+"			ldsSortData[dstAddr.x] = sortData[0];\n"
+"			ldsSortData[dstAddr.y] = sortData[1];\n"
+"			ldsSortData[dstAddr.z] = sortData[2];\n"
+"			ldsSortData[dstAddr.w] = sortData[3];\n"
+"\n"
+"			GROUP_LDS_BARRIER;\n"
+"\n"
+"			sortData[0] = ldsSortData[localAddr.x];\n"
+"			sortData[1] = ldsSortData[localAddr.y];\n"
+"			sortData[2] = ldsSortData[localAddr.z];\n"
+"			sortData[3] = ldsSortData[localAddr.w];\n"
+"\n"
+"			GROUP_LDS_BARRIER;\n"
+"		}\n"
+"	}\n"
+"}\n"
+"\n"
+"//	2 scan, 2 exchange\n"
+"void sort4Bits1(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData)\n"
+"{\n"
+"	for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2)\n"
+"	{\n"
+"		uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, \n"
+"			(sortData[1]>>(startBit+ibit)) & 0x3, \n"
+"			(sortData[2]>>(startBit+ibit)) & 0x3, \n"
+"			(sortData[3]>>(startBit+ibit)) & 0x3);\n"
+"\n"
+"		u32 key4;\n"
+"		u32 sKeyPacked[4] = { 0, 0, 0, 0 };\n"
+"		{\n"
+"			sKeyPacked[0] |= 1<<(8*b.x);\n"
+"			sKeyPacked[1] |= 1<<(8*b.y);\n"
+"			sKeyPacked[2] |= 1<<(8*b.z);\n"
+"			sKeyPacked[3] |= 1<<(8*b.w);\n"
+"\n"
+"			key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];\n"
+"		}\n"
+"\n"
+"		u32 rankPacked;\n"
+"		u32 sumPacked;\n"
+"		{\n"
+"			rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );\n"
+"		}\n"
+"\n"
+"		GROUP_LDS_BARRIER;\n"
+"\n"
+"		u32 newOffset[4] = { 0,0,0,0 };\n"
+"		{\n"
+"			u32 sumScanned = bit8Scan( sumPacked );\n"
+"\n"
+"			u32 scannedKeys[4];\n"
+"			scannedKeys[0] = 1<<(8*b.x);\n"
+"			scannedKeys[1] = 1<<(8*b.y);\n"
+"			scannedKeys[2] = 1<<(8*b.z);\n"
+"			scannedKeys[3] = 1<<(8*b.w);\n"
+"			{	//	4 scans at once\n"
+"				u32 sum4 = 0;\n"
+"				for(int ie=0; ie<4; ie++)\n"
+"				{\n"
+"					u32 tmp = scannedKeys[ie];\n"
+"					scannedKeys[ie] = sum4;\n"
+"					sum4 += tmp;\n"
+"				}\n"
+"			}\n"
+"\n"
+"			{\n"
+"				u32 sumPlusRank = sumScanned + rankPacked;\n"
+"				{	u32 ie = b.x;\n"
+"					scannedKeys[0] += sumPlusRank;\n"
+"					newOffset[0] = unpack4Key( scannedKeys[0], ie );\n"
+"				}\n"
+"				{	u32 ie = b.y;\n"
+"					scannedKeys[1] += sumPlusRank;\n"
+"					newOffset[1] = unpack4Key( scannedKeys[1], ie );\n"
+"				}\n"
+"				{	u32 ie = b.z;\n"
+"					scannedKeys[2] += sumPlusRank;\n"
+"					newOffset[2] = unpack4Key( scannedKeys[2], ie );\n"
+"				}\n"
+"				{	u32 ie = b.w;\n"
+"					scannedKeys[3] += sumPlusRank;\n"
+"					newOffset[3] = unpack4Key( scannedKeys[3], ie );\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"\n"
+"\n"
+"		GROUP_LDS_BARRIER;\n"
+"\n"
+"		{\n"
+"			ldsSortData[newOffset[0]] = sortData[0];\n"
+"			ldsSortData[newOffset[1]] = sortData[1];\n"
+"			ldsSortData[newOffset[2]] = sortData[2];\n"
+"			ldsSortData[newOffset[3]] = sortData[3];\n"
+"\n"
+"			GROUP_LDS_BARRIER;\n"
+"\n"
+"			u32 dstAddr = 4*lIdx;\n"
+"			sortData[0] = ldsSortData[dstAddr+0];\n"
+"			sortData[1] = ldsSortData[dstAddr+1];\n"
+"			sortData[2] = ldsSortData[dstAddr+2];\n"
+"			sortData[3] = ldsSortData[dstAddr+3];\n"
+"\n"
+"			GROUP_LDS_BARRIER;\n"
+"		}\n"
+"	}\n"
+"}\n"
+"\n"
+"#define SET_HISTOGRAM(setIdx, key) ldsSortData[(setIdx)*NUM_BUCKET+key]\n"
+"\n"
+"__kernel\n"
+"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
+"void SortAndScatterKernel( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4  cb )\n"
+"{\n"
+"	__local u32 ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n"
+"	__local u32 localHistogramToCarry[NUM_BUCKET];\n"
+"	__local u32 localHistogram[NUM_BUCKET*2];\n"
+"\n"
+"	u32 gIdx = GET_GLOBAL_IDX;\n"
+"	u32 lIdx = GET_LOCAL_IDX;\n"
+"	u32 wgIdx = GET_GROUP_IDX;\n"
+"	u32 wgSize = GET_GROUP_SIZE;\n"
+"\n"
+"	const int n = cb.m_n;\n"
+"	const int nWGs = cb.m_nWGs;\n"
+"	const int startBit = cb.m_startBit;\n"
+"	const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
+"\n"
+"	if( lIdx < (NUM_BUCKET) )\n"
+"	{\n"
+"		localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];\n"
+"	}\n"
+"\n"
+"	GROUP_LDS_BARRIER;\n"
+"\n"
+"	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
+"\n"
+"	int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;\n"
+"\n"
+"	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
+"\n"
+"	for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n"
+"	{\n"
+"		u32 myHistogram = 0;\n"
+"\n"
+"		u32 sortData[ELEMENTS_PER_WORK_ITEM];\n"
+"		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
+"#if defined(CHECK_BOUNDARY)\n"
+"			sortData[i] = ( addr+i < n )? gSrc[ addr+i ] : 0xffffffff;\n"
+"#else\n"
+"			sortData[i] = gSrc[ addr+i ];\n"
+"#endif\n"
+"\n"
+"		sort4Bits1(sortData, startBit, lIdx, ldsSortData);\n"
+"\n"
+"		u32 keys[ELEMENTS_PER_WORK_ITEM];\n"
+"		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
+"			keys[i] = (sortData[i]>>startBit) & 0xf;\n"
+"\n"
+"		{	//	create histogram\n"
+"			u32 setIdx = lIdx/16;\n"
+"			if( lIdx < NUM_BUCKET )\n"
+"			{\n"
+"				localHistogram[lIdx] = 0;\n"
+"			}\n"
+"			ldsSortData[lIdx] = 0;\n"
+"			GROUP_LDS_BARRIER;\n"
+"\n"
+"			for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
+"#if defined(CHECK_BOUNDARY)\n"
+"				if( addr+i < n )\n"
+"#endif\n"
+"\n"
+"#if defined(NV_GPU)\n"
+"				SET_HISTOGRAM( setIdx, keys[i] )++;\n"
+"#else\n"
+"				AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) );\n"
+"#endif\n"
+"			\n"
+"			GROUP_LDS_BARRIER;\n"
+"			\n"
+"			uint hIdx = NUM_BUCKET+lIdx;\n"
+"			if( lIdx < NUM_BUCKET )\n"
+"			{\n"
+"				u32 sum = 0;\n"
+"				for(int i=0; i<WG_SIZE/16; i++)\n"
+"				{\n"
+"					sum += SET_HISTOGRAM( i, lIdx );\n"
+"				}\n"
+"				myHistogram = sum;\n"
+"				localHistogram[hIdx] = sum;\n"
+"			}\n"
+"			GROUP_LDS_BARRIER;\n"
+"\n"
+"#if defined(USE_2LEVEL_REDUCE)\n"
+"			if( lIdx < NUM_BUCKET )\n"
+"			{\n"
+"				localHistogram[hIdx] = localHistogram[hIdx-1];\n"
+"				GROUP_MEM_FENCE;\n"
+"\n"
+"				u32 u0, u1, u2;\n"
+"				u0 = localHistogram[hIdx-3];\n"
+"				u1 = localHistogram[hIdx-2];\n"
+"				u2 = localHistogram[hIdx-1];\n"
+"				AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n"
+"				GROUP_MEM_FENCE;\n"
+"				u0 = localHistogram[hIdx-12];\n"
+"				u1 = localHistogram[hIdx-8];\n"
+"				u2 = localHistogram[hIdx-4];\n"
+"				AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n"
+"				GROUP_MEM_FENCE;\n"
+"			}\n"
+"#else\n"
+"			if( lIdx < NUM_BUCKET )\n"
+"			{\n"
+"				localHistogram[hIdx] = localHistogram[hIdx-1];\n"
+"				GROUP_MEM_FENCE;\n"
+"				localHistogram[hIdx] += localHistogram[hIdx-1];\n"
+"				GROUP_MEM_FENCE;\n"
+"				localHistogram[hIdx] += localHistogram[hIdx-2];\n"
+"				GROUP_MEM_FENCE;\n"
+"				localHistogram[hIdx] += localHistogram[hIdx-4];\n"
+"				GROUP_MEM_FENCE;\n"
+"				localHistogram[hIdx] += localHistogram[hIdx-8];\n"
+"				GROUP_MEM_FENCE;\n"
+"			}\n"
+"#endif\n"
+"			GROUP_LDS_BARRIER;\n"
+"		}\n"
+"\n"
+"		{\n"
+"			for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n"
+"			{\n"
+"				int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie;\n"
+"				int binIdx = keys[ie];\n"
+"				int groupOffset = localHistogramToCarry[binIdx];\n"
+"				int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];\n"
+"#if defined(CHECK_BOUNDARY)\n"
+"				if( addr+ie < n )\n"
+"#endif\n"
+"				gDst[ groupOffset + myIdx ] = sortData[ie];\n"
+"			}\n"
+"		}\n"
+"\n"
+"		GROUP_LDS_BARRIER;\n"
+"\n"
+"		if( lIdx < NUM_BUCKET )\n"
+"		{\n"
+"			localHistogramToCarry[lIdx] += myHistogram;\n"
+"		}\n"
+"		GROUP_LDS_BARRIER;\n"
+"	}\n"
+"}\n"
+"\n"
+"//	2 scan, 2 exchange\n"
+"void sort4Bits1KeyValue(u32 sortData[4], int sortVal[4], int startBit, int lIdx, __local u32* ldsSortData, __local int *ldsSortVal)\n"
+"{\n"
+"	for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2)\n"
+"	{\n"
+"		uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, \n"
+"			(sortData[1]>>(startBit+ibit)) & 0x3, \n"
+"			(sortData[2]>>(startBit+ibit)) & 0x3, \n"
+"			(sortData[3]>>(startBit+ibit)) & 0x3);\n"
+"\n"
+"		u32 key4;\n"
+"		u32 sKeyPacked[4] = { 0, 0, 0, 0 };\n"
+"		{\n"
+"			sKeyPacked[0] |= 1<<(8*b.x);\n"
+"			sKeyPacked[1] |= 1<<(8*b.y);\n"
+"			sKeyPacked[2] |= 1<<(8*b.z);\n"
+"			sKeyPacked[3] |= 1<<(8*b.w);\n"
+"\n"
+"			key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];\n"
+"		}\n"
+"\n"
+"		u32 rankPacked;\n"
+"		u32 sumPacked;\n"
+"		{\n"
+"			rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );\n"
+"		}\n"
+"\n"
+"		GROUP_LDS_BARRIER;\n"
+"\n"
+"		u32 newOffset[4] = { 0,0,0,0 };\n"
+"		{\n"
+"			u32 sumScanned = bit8Scan( sumPacked );\n"
+"\n"
+"			u32 scannedKeys[4];\n"
+"			scannedKeys[0] = 1<<(8*b.x);\n"
+"			scannedKeys[1] = 1<<(8*b.y);\n"
+"			scannedKeys[2] = 1<<(8*b.z);\n"
+"			scannedKeys[3] = 1<<(8*b.w);\n"
+"			{	//	4 scans at once\n"
+"				u32 sum4 = 0;\n"
+"				for(int ie=0; ie<4; ie++)\n"
+"				{\n"
+"					u32 tmp = scannedKeys[ie];\n"
+"					scannedKeys[ie] = sum4;\n"
+"					sum4 += tmp;\n"
+"				}\n"
+"			}\n"
+"\n"
+"			{\n"
+"				u32 sumPlusRank = sumScanned + rankPacked;\n"
+"				{	u32 ie = b.x;\n"
+"					scannedKeys[0] += sumPlusRank;\n"
+"					newOffset[0] = unpack4Key( scannedKeys[0], ie );\n"
+"				}\n"
+"				{	u32 ie = b.y;\n"
+"					scannedKeys[1] += sumPlusRank;\n"
+"					newOffset[1] = unpack4Key( scannedKeys[1], ie );\n"
+"				}\n"
+"				{	u32 ie = b.z;\n"
+"					scannedKeys[2] += sumPlusRank;\n"
+"					newOffset[2] = unpack4Key( scannedKeys[2], ie );\n"
+"				}\n"
+"				{	u32 ie = b.w;\n"
+"					scannedKeys[3] += sumPlusRank;\n"
+"					newOffset[3] = unpack4Key( scannedKeys[3], ie );\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"\n"
+"\n"
+"		GROUP_LDS_BARRIER;\n"
+"\n"
+"		{\n"
+"			ldsSortData[newOffset[0]] = sortData[0];\n"
+"			ldsSortData[newOffset[1]] = sortData[1];\n"
+"			ldsSortData[newOffset[2]] = sortData[2];\n"
+"			ldsSortData[newOffset[3]] = sortData[3];\n"
+"\n"
+"			ldsSortVal[newOffset[0]] = sortVal[0];\n"
+"			ldsSortVal[newOffset[1]] = sortVal[1];\n"
+"			ldsSortVal[newOffset[2]] = sortVal[2];\n"
+"			ldsSortVal[newOffset[3]] = sortVal[3];\n"
+"\n"
+"			GROUP_LDS_BARRIER;\n"
+"\n"
+"			u32 dstAddr = 4*lIdx;\n"
+"			sortData[0] = ldsSortData[dstAddr+0];\n"
+"			sortData[1] = ldsSortData[dstAddr+1];\n"
+"			sortData[2] = ldsSortData[dstAddr+2];\n"
+"			sortData[3] = ldsSortData[dstAddr+3];\n"
+"\n"
+"			sortVal[0] = ldsSortVal[dstAddr+0];\n"
+"			sortVal[1] = ldsSortVal[dstAddr+1];\n"
+"			sortVal[2] = ldsSortVal[dstAddr+2];\n"
+"			sortVal[3] = ldsSortVal[dstAddr+3];\n"
+"\n"
+"			GROUP_LDS_BARRIER;\n"
+"		}\n"
+"	}\n"
+"}\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"__kernel\n"
+"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
+"void SortAndScatterSortDataKernel( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb)\n"
+"{\n"
+"	__local int ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n"
+"	__local int ldsSortVal[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n"
+"	__local u32 localHistogramToCarry[NUM_BUCKET];\n"
+"	__local u32 localHistogram[NUM_BUCKET*2];\n"
+"\n"
+"	u32 gIdx = GET_GLOBAL_IDX;\n"
+"	u32 lIdx = GET_LOCAL_IDX;\n"
+"	u32 wgIdx = GET_GROUP_IDX;\n"
+"	u32 wgSize = GET_GROUP_SIZE;\n"
+"\n"
+"	const int n = cb.m_n;\n"
+"	const int nWGs = cb.m_nWGs;\n"
+"	const int startBit = cb.m_startBit;\n"
+"	const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
+"\n"
+"	if( lIdx < (NUM_BUCKET) )\n"
+"	{\n"
+"		localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];\n"
+"	}\n"
+"\n"
+"	GROUP_LDS_BARRIER;\n"
+"    \n"
+"\n"
+"	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
+"\n"
+"	int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;\n"
+"\n"
+"	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
+"\n"
+"	for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n"
+"	{\n"
+"\n"
+"		u32 myHistogram = 0;\n"
+"\n"
+"		int sortData[ELEMENTS_PER_WORK_ITEM];\n"
+"		int sortVal[ELEMENTS_PER_WORK_ITEM];\n"
+"\n"
+"		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
+"#if defined(CHECK_BOUNDARY)\n"
+"		{\n"
+"			sortData[i] = ( addr+i < n )? gSrc[ addr+i ].m_key : 0xffffffff;\n"
+"			sortVal[i] = ( addr+i < n )? gSrc[ addr+i ].m_value : 0xffffffff;\n"
+"		}\n"
+"#else\n"
+"		{\n"
+"			sortData[i] = gSrc[ addr+i ].m_key;\n"
+"			sortVal[i] = gSrc[ addr+i ].m_value;\n"
+"		}\n"
+"#endif\n"
+"\n"
+"		sort4Bits1KeyValue(sortData, sortVal, startBit, lIdx, ldsSortData, ldsSortVal);\n"
+"\n"
+"		u32 keys[ELEMENTS_PER_WORK_ITEM];\n"
+"		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
+"			keys[i] = (sortData[i]>>startBit) & 0xf;\n"
+"\n"
+"		{	//	create histogram\n"
+"			u32 setIdx = lIdx/16;\n"
+"			if( lIdx < NUM_BUCKET )\n"
+"			{\n"
+"				localHistogram[lIdx] = 0;\n"
+"			}\n"
+"			ldsSortData[lIdx] = 0;\n"
+"			GROUP_LDS_BARRIER;\n"
+"\n"
+"			for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
+"#if defined(CHECK_BOUNDARY)\n"
+"				if( addr+i < n )\n"
+"#endif\n"
+"\n"
+"#if defined(NV_GPU)\n"
+"				SET_HISTOGRAM( setIdx, keys[i] )++;\n"
+"#else\n"
+"				AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) );\n"
+"#endif\n"
+"			\n"
+"			GROUP_LDS_BARRIER;\n"
+"			\n"
+"			uint hIdx = NUM_BUCKET+lIdx;\n"
+"			if( lIdx < NUM_BUCKET )\n"
+"			{\n"
+"				u32 sum = 0;\n"
+"				for(int i=0; i<WG_SIZE/16; i++)\n"
+"				{\n"
+"					sum += SET_HISTOGRAM( i, lIdx );\n"
+"				}\n"
+"				myHistogram = sum;\n"
+"				localHistogram[hIdx] = sum;\n"
+"			}\n"
+"			GROUP_LDS_BARRIER;\n"
+"\n"
+"#if defined(USE_2LEVEL_REDUCE)\n"
+"			if( lIdx < NUM_BUCKET )\n"
+"			{\n"
+"				localHistogram[hIdx] = localHistogram[hIdx-1];\n"
+"				GROUP_MEM_FENCE;\n"
+"\n"
+"				u32 u0, u1, u2;\n"
+"				u0 = localHistogram[hIdx-3];\n"
+"				u1 = localHistogram[hIdx-2];\n"
+"				u2 = localHistogram[hIdx-1];\n"
+"				AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n"
+"				GROUP_MEM_FENCE;\n"
+"				u0 = localHistogram[hIdx-12];\n"
+"				u1 = localHistogram[hIdx-8];\n"
+"				u2 = localHistogram[hIdx-4];\n"
+"				AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n"
+"				GROUP_MEM_FENCE;\n"
+"			}\n"
+"#else\n"
+"			if( lIdx < NUM_BUCKET )\n"
+"			{\n"
+"				localHistogram[hIdx] = localHistogram[hIdx-1];\n"
+"				GROUP_MEM_FENCE;\n"
+"				localHistogram[hIdx] += localHistogram[hIdx-1];\n"
+"				GROUP_MEM_FENCE;\n"
+"				localHistogram[hIdx] += localHistogram[hIdx-2];\n"
+"				GROUP_MEM_FENCE;\n"
+"				localHistogram[hIdx] += localHistogram[hIdx-4];\n"
+"				GROUP_MEM_FENCE;\n"
+"				localHistogram[hIdx] += localHistogram[hIdx-8];\n"
+"				GROUP_MEM_FENCE;\n"
+"			}\n"
+"#endif\n"
+"			GROUP_LDS_BARRIER;\n"
+"		}\n"
+"\n"
+"    	{\n"
+"			for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n"
+"			{\n"
+"				int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie;\n"
+"				int binIdx = keys[ie];\n"
+"				int groupOffset = localHistogramToCarry[binIdx];\n"
+"				int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];\n"
+"#if defined(CHECK_BOUNDARY)\n"
+"				if( addr+ie < n )\n"
+"				{\n"
+"                    if ((groupOffset + myIdx)<n)\n"
+"                    {\n"
+"                        if (sortData[ie]==sortVal[ie])\n"
+"                        {\n"
+"                            \n"
+"                            SortDataCL tmp;\n"
+"                            tmp.m_key = sortData[ie];\n"
+"                            tmp.m_value = sortVal[ie];\n"
+"                            if (tmp.m_key == tmp.m_value)\n"
+"                                gDst[groupOffset + myIdx ] = tmp;\n"
+"                        }\n"
+"                        \n"
+"                    }\n"
+"				}\n"
+"#else\n"
+"                if ((groupOffset + myIdx)<n)\n"
+"                {\n"
+"                    gDst[ groupOffset + myIdx ].m_key = sortData[ie];\n"
+"                    gDst[ groupOffset + myIdx ].m_value = sortVal[ie];\n"
+"                }\n"
+"#endif\n"
+"			}\n"
+"		}\n"
+"\n"
+"		GROUP_LDS_BARRIER;\n"
+"\n"
+"		if( lIdx < NUM_BUCKET )\n"
+"		{\n"
+"			localHistogramToCarry[lIdx] += myHistogram;\n"
+"		}\n"
+"		GROUP_LDS_BARRIER;\n"
+"	}\n"
+"}\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"__kernel\n"
+"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
+"void SortAndScatterSortDataKernelSerial( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb)\n"
+"{\n"
+"    \n"
+"	u32 gIdx = GET_GLOBAL_IDX;\n"
+"	u32 realLocalIdx = GET_LOCAL_IDX;\n"
+"	u32 wgIdx = GET_GROUP_IDX;\n"
+"	u32 wgSize = GET_GROUP_SIZE;\n"
+"	const int startBit = cb.m_startBit;\n"
+"	const int n = cb.m_n;\n"
+"	const int nWGs = cb.m_nWGs;\n"
+"	const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
+"\n"
+"    int counter[NUM_BUCKET];\n"
+"    \n"
+"    if (realLocalIdx>0)\n"
+"        return;\n"
+"    \n"
+"    for (int c=0;c<NUM_BUCKET;c++)\n"
+"        counter[c]=0;\n"
+"\n"
+"    const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
+"	\n"
+"	int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n"
+"\n"
+"   for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++)\n"
+"  {\n"
+"     for (int lIdx=0;lIdx<WG_SIZE;lIdx++)\n"
+" 	{\n"
+"        int addr2 = iblock*blockSize + blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
+"        \n"
+"		for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)\n"
+"		{\n"
+"            int i = addr2+j;\n"
+"			if( i < n )\n"
+"			{\n"
+"                int tableIdx;\n"
+"				tableIdx = (gSrc[i].m_key>>startBit) & 0xf;//0xf = NUM_TABLES-1\n"
+"                gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i];\n"
+"                counter[tableIdx] ++;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"  }\n"
+"    \n"
+"}\n"
+"\n"
+"\n"
+"__kernel\n"
+"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
+"void SortAndScatterKernelSerial( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4  cb )\n"
+"{\n"
+"    \n"
+"	u32 gIdx = GET_GLOBAL_IDX;\n"
+"	u32 realLocalIdx = GET_LOCAL_IDX;\n"
+"	u32 wgIdx = GET_GROUP_IDX;\n"
+"	u32 wgSize = GET_GROUP_SIZE;\n"
+"	const int startBit = cb.m_startBit;\n"
+"	const int n = cb.m_n;\n"
+"	const int nWGs = cb.m_nWGs;\n"
+"	const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
+"\n"
+"    int counter[NUM_BUCKET];\n"
+"    \n"
+"    if (realLocalIdx>0)\n"
+"        return;\n"
+"    \n"
+"    for (int c=0;c<NUM_BUCKET;c++)\n"
+"        counter[c]=0;\n"
+"\n"
+"    const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
+"	\n"
+"	int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n"
+"\n"
+"   for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++)\n"
+"  {\n"
+"     for (int lIdx=0;lIdx<WG_SIZE;lIdx++)\n"
+" 	{\n"
+"        int addr2 = iblock*blockSize + blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
+"        \n"
+"		for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)\n"
+"		{\n"
+"            int i = addr2+j;\n"
+"			if( i < n )\n"
+"			{\n"
+"                int tableIdx;\n"
+"				tableIdx = (gSrc[i]>>startBit) & 0xf;//0xf = NUM_TABLES-1\n"
+"                gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i];\n"
+"                counter[tableIdx] ++;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"  }\n"
+"    \n"
+"}\n"
+;
diff --git a/opencl/parallel_primitives/test/main.cpp b/opencl/parallel_primitives/test/main.cpp
new file mode 100644
index 000000000..d659410f1
--- /dev/null
+++ b/opencl/parallel_primitives/test/main.cpp
@@ -0,0 +1,379 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#include <stdio.h>
+#include "../basic_initialize/btOpenCLUtils.h"
+#include "../host/btFillCL.h"
+#include "../host/btBoundSearchCL.h"
+#include "../host/btRadixSort32CL.h"
+#include "../host/btPrefixScanCL.h"
+#include "../host/CommandLineArgs.h"
+
+#include "../host/btMinMax.h"
+int g_nPassed = 0;
+int g_nFailed = 0;
+bool g_testFailed = 0;
+
+#define TEST_INIT g_testFailed = 0;
+#define TEST_ASSERT(x) if( !(x) ){g_testFailed = 1;}
+#define TEST_REPORT(testName) printf("[%s] %s\n",(g_testFailed)?"X":"O", testName); if(g_testFailed) g_nFailed++; else g_nPassed++;
+#define NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment))
+
+cl_context g_context=0;
+cl_device_id g_device=0;
+cl_command_queue g_queue =0;
+const char* g_deviceName = 0;
+
+//Creates the global OpenCL context (g_context) for the preferred device/platform and, when the context
+//exposes at least one device, fills in g_device, g_queue and g_deviceName for the tests in this file.
+void initCL(int preferredDeviceIndex, int preferredPlatformIndex)
+{
+	int ciErrNum = 0;
+	//bound search and radix sort only work on GPU right now (assume 32 or 64 width workgroup without barriers)
+	cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
+
+	g_context = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+	int numDev = btOpenCLUtils::getNumDevices(g_context);
+	if (numDev>0)
+	{
+		//static: g_deviceName keeps pointing at info.m_deviceName after this function has returned
+		static btOpenCLDeviceInfo info;
+		g_device= btOpenCLUtils::getDevice(g_context,0);
+		g_queue = clCreateCommandQueue(g_context, g_device, 0, &ciErrNum);
+		oclCHECKERROR(ciErrNum, CL_SUCCESS);
+		btOpenCLUtils::printDeviceInfo(g_device);
+		btOpenCLUtils::getDeviceInfo(g_device,&info);
+		g_deviceName = info.m_deviceName;
+	}
+}
+
+void exitCL()	//releases the command queue and the context created by initCL
+{
+	if (g_queue) { clReleaseCommandQueue(g_queue); g_queue = 0; }	//skip handles that initCL never created
+	if (g_context) { clReleaseContext(g_context); g_context = 0; }	//zeroing keeps a repeated call harmless
+}
+
+
+//Fills a device int buffer through btFillCL for several sizes and compares it with btFillCL::executeHost.
+inline void fillIntTest()
+{
+	TEST_INIT;
+
+	btFillCL* filler = new btFillCL(g_context,g_device,g_queue);
+	int maxSize = 1024*256;
+	btOpenCLArray<int> deviceInts(g_context,g_queue,maxSize);
+	deviceInts.resize(maxSize);
+
+//NUM_TESTS stays defined for the test functions that follow in this file
+#define NUM_TESTS 7
+
+	int sizeStep = maxSize/NUM_TESTS;
+	int pass = 0;
+	while (pass<NUM_TESTS)
+	{
+		//element count grows by sizeStep per pass, starting at 11 and capped at maxSize
+		int count = btMin( 11+sizeStep*pass, maxSize );
+		int fillValue = 2;
+		int firstElement = 0;
+
+		filler->execute(deviceInts,fillValue,count,firstElement);
+
+		btAlignedObjectArray<int> expected;
+		expected.resize(count);
+		filler->executeHost(expected,fillValue,count,firstElement);
+
+		btAlignedObjectArray<int> actual;
+		deviceInts.copyToHost(actual);
+
+		for(int k=0; k<count; ++k)
+		{
+			bool sameValue = ( actual[k] == expected[k] );
+			TEST_ASSERT( sameValue );
+		}
+		++pass;
+	}
+
+	delete filler;
+	TEST_REPORT( "fillIntTest" );
+}
+
+
+//Seeds the C library generator behind rand().
+__inline void seedRandom(int seed)
+{
+	srand( (unsigned int)seed );
+}
+
+//Returns minV + f*(maxV-minV) converted to T, where f = (rand()%10000)/10000 lies in [0,1).
+template<typename T>
+__inline T getRandom(const T& minV, const T& maxV)
+{
+	const T span = maxV - minV;
+	const float fraction = (rand()%10000)/10000.f;
+	return (T)(minV + fraction*span);
+}
+
+struct btSortDataCompare	//"less than" for btSortData: ordered by m_key, ties broken by m_value
+{
+	inline bool operator()(const btSortData& first, const btSortData& second) const
+	{
+		return (first.m_key != second.m_key) ? (first.m_key < second.m_key) : (first.m_value < second.m_value);
+	}
+};
+
+
+//Compares btBoundSearchCL::execute (device) with executeHost for several input sizes and checks the returned bounds against the sorted input.
+void boundSearchTest( )
+{
+	TEST_INIT;
+
+	int maxSize = 1024*256;
+	int bucketSize = 256;
+	//device buffers: the input plus the upper and lower bound results
+	btOpenCLArray<btSortData> srcCL(g_context,g_queue,maxSize);
+	btOpenCLArray<unsigned int> upperCL(g_context,g_queue,maxSize);
+	btOpenCLArray<unsigned int> lowerCL(g_context,g_queue,maxSize);
+	//host buffers: the *Compare arrays receive the executeHost() results
+	btAlignedObjectArray<btSortData> srcHost;
+	btAlignedObjectArray<unsigned int> upperHost;
+	btAlignedObjectArray<unsigned int> lowerHost;
+	btAlignedObjectArray<unsigned int> upperHostCompare;
+	btAlignedObjectArray<unsigned int> lowerHostCompare;
+
+	btBoundSearchCL* search = new btBoundSearchCL(g_context,g_device,g_queue, maxSize);
+
+	int dx = maxSize/NUM_TESTS;
+	for(int iter=0; iter<NUM_TESTS; iter++)
+	{
+		//input size grows by dx per iteration, starting at 128 and capped at maxSize
+		int size = btMin( 128+dx*iter, maxSize );
+
+		upperHost.resize(bucketSize);
+		lowerHost.resize(bucketSize);
+		upperHostCompare.resize(bucketSize);
+		lowerHostCompare.resize(bucketSize);
+
+		srcHost.resize(size);
+		//keys come from getRandom(0,bucketSize); the value records the index before sorting
+		for(int i=0; i<size; i++) 
+		{
+			btSortData v;
+//			v.m_key = i<2? 0 : 5;
+			v.m_key = getRandom(0,bucketSize);
+
+			v.m_value = i;
+			srcHost.at(i) = v;
+		}
+		//sort on the host (by key, then value) before uploading to the device
+		srcHost.quickSort(btSortDataCompare());
+		srcCL.copyFromHost(srcHost);
+
+		{
+			//preset every bound to -1 (stored as unsigned) on the host and upload the presets
+			for(int i=0; i<bucketSize; i++) 
+			{
+				lowerHost[i] = -1;
+				lowerHostCompare[i] = -1;
+				upperHost[i] = -1;
+				upperHostCompare[i] = -1;
+			}
+			upperCL.copyFromHost(upperHost);
+			lowerCL.copyFromHost(lowerHost);
+		}
+		//bound search on the device
+		search->execute(srcCL,size,upperCL,bucketSize,btBoundSearchCL::BOUND_UPPER);
+		search->execute(srcCL,size,lowerCL,bucketSize,btBoundSearchCL::BOUND_LOWER);
+		//the same searches through the host implementation
+		search->executeHost(srcHost,size,upperHostCompare,bucketSize,btBoundSearchCL::BOUND_UPPER);
+		search->executeHost(srcHost,size,lowerHostCompare,bucketSize,btBoundSearchCL::BOUND_LOWER);
+		//device results must match the host results entry by entry
+		lowerCL.copyToHost(lowerHost);
+		upperCL.copyToHost(upperHost);
+		for(int i=0; i<bucketSize; i++)
+		{
+			TEST_ASSERT(upperHostCompare[i] == upperHost[i]);
+			TEST_ASSERT(lowerHostCompare[i] == lowerHost[i]);
+		}
+		/*
+		for(int i=1; i<bucketSize; i++)
+		{
+			int lhi_1 = lowerHost[i-1];
+			int lhi = lowerHost[i];
+
+			for(int j=lhi_1; j<lhi; j++)
+			//for(int j=lowerHost[i-1]; j<lowerHost[i]; j++)
+			{
+				TEST_ASSERT( srcHost[j].m_key < i );
+			}
+		}
+
+		for(int i=0; i<bucketSize; i++)
+		{
+			int jMin = (i==0)?0:upperHost[i-1];
+			for(int j=jMin; j<upperHost[i]; j++)
+			{
+				TEST_ASSERT( srcHost[j].m_key <= i );
+			}
+		}
+		*/
+		//every element in [lowerHost[i], upperHost[i]) must carry the key i
+		//NOTE(review): bounds still at the -1 preset presumably give lhi == uhi == -1 and an empty loop - confirm that absent keys are left untouched by the search
+		for(int i=0; i<bucketSize; i++)
+		{
+			int lhi = lowerHost[i];
+			int uhi = upperHost[i];
+
+			for(int j=lhi; j<uhi; j++)
+			{
+				if ( srcHost[j].m_key != i )
+				{
+					printf("error %d != %d\n",srcHost[j].m_key,i);
+				}
+				TEST_ASSERT( srcHost[j].m_key == i );
+			}
+		}
+
+	}
+
+	delete search;
+
+	TEST_REPORT( "boundSearchTest" );
+}
+
+
+//Scans an all-ones array with btPrefixScanCL on the device and on the host, then compares the totals and every output element.
+void prefixScanTest()
+{
+	TEST_INIT;
+
+	int maxSize = 1024*256;
+
+	//hostInOut carries the input first and is reused afterwards for the device output
+	btAlignedObjectArray<unsigned int> hostInOut;
+	btAlignedObjectArray<unsigned int> hostScanned;
+	btOpenCLArray<unsigned int> deviceIn(g_context,g_queue,maxSize);
+	btOpenCLArray<unsigned int> deviceScanned(g_context,g_queue,maxSize);
+
+	btPrefixScanCL* scanner = new btPrefixScanCL(g_context,g_device,g_queue,maxSize);
+
+	int sizeStep = maxSize/NUM_TESTS;
+	int pass = 0;
+	while( pass<NUM_TESTS )
+	{
+		int count = btMin( 128+sizeStep*pass, maxSize );
+		hostInOut.resize(count);
+		hostScanned.resize(count);
+		for(int k=0; k<count; ++k)
+		{
+			hostInOut[k] = 1;
+		}
+		deviceIn.copyFromHost( hostInOut);
+
+		unsigned int totalHost, totalDevice;
+		scanner->executeHost(hostInOut, hostScanned, count, &totalHost );
+		scanner->execute( deviceIn, deviceScanned, count, &totalDevice );
+		deviceScanned.copyToHost(hostInOut);
+
+		TEST_ASSERT( totalHost == totalDevice );
+		for(int k=0; k<count; ++k)
+		{
+			TEST_ASSERT( hostScanned[k] == hostInOut[k] );
+		}
+		++pass;
+	}
+	delete scanner;
+	TEST_REPORT( "scanTest" );
+}
+
+
+//Sorts random key/value pairs with btRadixSort32CL on the device and on the host and compares both results.
+//Returns g_testFailed, i.e. true when at least one comparison failed.
+bool radixSortTest()
+{
+	TEST_INIT;
+
+	int maxSize = 1024*256;
+
+	btAlignedObjectArray<btSortData> hostSorted;
+	hostSorted.resize(maxSize);
+	btAlignedObjectArray<btSortData> deviceResult;
+	deviceResult.resize(maxSize );
+	btOpenCLArray<btSortData> deviceData(g_context,g_queue,maxSize);
+
+	btRadixSort32CL* sorter = new btRadixSort32CL(g_context,g_device,g_queue,maxSize);
+
+	int sizeStep = maxSize/NUM_TESTS;
+	int pass = 0;
+	while( pass<NUM_TESTS )
+	{
+		int count = btMin( 128+sizeStep*pass, maxSize-512 );
+		count = NEXTMULTIPLEOF( count, 512 );//not necessary
+		hostSorted.resize(count);
+
+		for(int k=0; k<count; ++k)
+		{
+			btSortData entry;
+			entry.m_key = getRandom(0,0xff);
+			entry.m_value = k;
+			hostSorted[k] = entry;
+		}
+		deviceData.copyFromHost( hostSorted);
+
+		sorter->executeHost( hostSorted);
+		sorter->execute(deviceData);
+		deviceData.copyToHost(deviceResult);
+
+		for(int k=0; k<count; ++k)
+		{
+			bool sameValue = ( hostSorted[k].m_value == deviceResult[k].m_value );
+			bool sameKey = ( hostSorted[k].m_key == deviceResult[k].m_key );
+			TEST_ASSERT( sameValue && sameKey );
+		}
+		++pass;
+	}
+
+	delete sorter;
+	TEST_REPORT( "radixSort" );
+	return g_testFailed;
+}
+
+
+//Runs all primitive tests on the device selected by the optional "deviceId"/"platformId" command line arguments.
+//Returns 0 when every test passed and 1 when a test failed or no OpenCL context/queue could be created.
+int main(int argc, char** argv)
+{
+	int preferredDeviceIndex = -1;
+	int preferredPlatformIndex = -1;
+	CommandLineArgs args(argc, argv);
+	args.GetCmdLineArgument("deviceId", preferredDeviceIndex);
+	args.GetCmdLineArgument("platformId", preferredPlatformIndex);
+	initCL(preferredDeviceIndex,preferredPlatformIndex);
+	if (!g_context || !g_queue)	//g_queue stays 0 when initCL found no device; the tests cannot run without it
+	{
+		printf("No OpenCL context or command queue available, skipping tests\n");
+		return 1;
+	}
+	fillIntTest();
+	boundSearchTest();
+	prefixScanTest();
+	radixSortTest();
+	exitCL();
+	printf("%d tests passed, %d tests failed\n",g_nPassed, g_nFailed);
+	printf("End, press <enter>\n");
+	getchar();
+	return (g_nFailed>0) ? 1 : 0;	//let scripts detect failures from the exit status
+}
+
diff --git a/opencl/parallel_primitives/test/premake4.lua b/opencl/parallel_primitives/test/premake4.lua
new file mode 100644
index 000000000..119087926
--- /dev/null
+++ b/opencl/parallel_primitives/test/premake4.lua
@@ -0,0 +1,41 @@
+-- Declares the console test project "OpenCL_primitives_test_<vendor>" when findOpenCL(vendor) returns a true value.
+function createProject(vendor)
+	-- local: a bare assignment would create (or overwrite) a global named hasCL
+	local hasCL = findOpenCL(vendor)
+	if (hasCL) then
+
+		project ("OpenCL_primitives_test_" .. vendor)
+
+		initOpenCL(vendor)
+
+		language "C++"
+
+		kind "ConsoleApp"
+		targetdir "../../../bin"
+		includedirs {".",".."}
+
+		files {
+			"main.cpp",
+			"../../basic_initialize/btOpenCLInclude.h",
+			"../../basic_initialize/btOpenCLUtils.cpp",
+			"../../basic_initialize/btOpenCLUtils.h",
+			"../host/btFillCL.cpp",
+			"../host/btFillCL.h",
+			"../host/btBoundSearchCL.cpp",
+			"../host/btBoundSearchCL.h",
+			"../host/btPrefixScanCL.cpp",
+			"../host/btPrefixScanCL.h",
+			"../host/btRadixSort32CL.cpp",
+			"../host/btRadixSort32CL.h",
+			"../host/btAlignedAllocator.cpp",
+			"../host/btAlignedAllocator.h",
+			"../host/btAlignedObjectArray.h",
+		}
+
+	end
+end
+
+createProject("AMD")
+createProject("Intel")
+createProject("NVIDIA")
+createProject("Apple")
\ No newline at end of file
diff --git a/opencl/reduce/main.cpp b/opencl/reduce/main.cpp
new file mode 100644
index 000000000..f925f6855
--- /dev/null
+++ b/opencl/reduce/main.cpp
@@ -0,0 +1,116 @@
+///original author: Erwin Coumans
+#include "btOpenCLUtils.h"
+#include "../parallel_primitives/host/btOpenCLArray.h"
+#include "../parallel_primitives/host/btLauncherCL.h"
+#include <stdio.h>
+
+
+#define MSTRINGIFY(A) #A
+//ReduceGlobal: in-place tree reduction; each work-group sums its slice of d_in and writes the total to d_out[group id].
+//d_in is overwritten. The halving stride assumes a power-of-two work-group size (other sizes would skip elements).
+const char* kernelString= MSTRINGIFY(
+__kernel void ReduceGlobal(__global int* d_in, __global int* d_out, int numElements)
+{
+	int myId = get_global_id(0);
+	int tid = get_local_id(0);
+
+
+	int ls = get_local_size(0);
+	for (unsigned int s=ls/2;s>0;s>>=1)
+	{
+		//the partner element must exist too, so a partially filled last work-group never reads past numElements
+		if ((tid<s) && ((myId+(int)s)<numElements))
+		{
+			d_in[myId] += d_in[myId+s];
+		}
+		barrier(CLK_GLOBAL_MEM_FENCE);
+	}
+	if (tid==0)
+	{
+		if (myId<numElements)
+		{
+			d_out[get_group_id(0)]=d_in[myId];
+		}
+	}
+}
+);
+
+//Sums 1024*1024 ones on the device with two ReduceGlobal launches and compares the total with a host-side sum.
+int main(int argc, char* argv[])
+{
+	int ciErrNum = 0;
+	int preferred_device = -1;
+	int preferred_platform = -1;
+	cl_platform_id		platformId = 0;
+	cl_context			ctx;
+	cl_command_queue	queue;
+	cl_device_id		device;
+	cl_kernel			addKernel;
+	ctx = btOpenCLUtils::createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum,0,0,preferred_device,preferred_platform,&platformId);
+	if (!ctx) {
+		printf("No OpenCL capable GPU found!");
+		return 0;
+	}
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+	//print the platform only after the context check, so platformId is not used when context creation failed
+	btOpenCLUtils::printPlatformInfo(platformId);
+	device = btOpenCLUtils::getDevice(ctx,0);
+	queue = clCreateCommandQueue(ctx, device, 0, &ciErrNum);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);	//check before the next call overwrites ciErrNum
+	addKernel = btOpenCLUtils::compileCLKernelFromString(ctx,device,kernelString,"ReduceGlobal",&ciErrNum);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+	int numElements = 1024*1024;
+	int workGroupSize = 1024;
+	int numGroups = numElements/workGroupSize;	//one partial sum per work-group of the first pass
+	btOpenCLArray<int> a(ctx,queue);
+	btOpenCLArray<int> b(ctx,queue);
+	btAlignedObjectArray<int> hostA;
+	btAlignedObjectArray<int> hostB;
+
+	for (int i=0;i<numElements;i++)
+	{
+		hostA.push_back(1);
+		hostB.push_back(0);
+	}
+	a.copyFromHost(hostA);
+	b.copyFromHost(hostB);
+	int hostSum= 0;
+	for (int i=0;i<numElements;i++)
+	{
+		hostSum += hostA.at(i);
+	}
+	b.resize(numElements);
+
+	{	//pass 1: every work-group reduces its slice of a into one entry of b
+		btLauncherCL launcher( queue, addKernel);
+		launcher.setBuffer( a.getBufferCL());
+		launcher.setBuffer( b.getBufferCL());
+		launcher.setConst(  numElements );
+		launcher.launch1D( numElements,workGroupSize);
+	}
+	clFinish(queue);
+	{	//pass 2: a single work-group reduces the partial sums in b; the total lands in a[0]
+		btLauncherCL launcher( queue, addKernel);
+		launcher.setBuffer( b.getBufferCL());
+		launcher.setBuffer( a.getBufferCL());
+		launcher.setConst(  numGroups );
+		launcher.launch1D( numGroups,workGroupSize);
+	}
+	clFinish(queue);
+	printf("hostSum = %d\n", hostSum);
+	int clSum = a.at(0);
+	printf("clSum = %d\n", clSum );
+	if (hostSum != clSum)
+	{
+		printf("Incorrect result\n");
+	} else
+	{
+		printf("Correct result\n");
+	}
+	clReleaseKernel(addKernel);	//release the kernel together with the queue and the context
+	clReleaseCommandQueue(queue);
+	clReleaseContext(ctx);
+	printf("press key\n");
+	getchar();
+	return 0;
+}
\ No newline at end of file
diff --git a/opencl/reduce/premake4.lua b/opencl/reduce/premake4.lua
new file mode 100644
index 000000000..6212b51f2
--- /dev/null
+++ b/opencl/reduce/premake4.lua
@@ -0,0 +1,37 @@
+
+-- Declares the console project "OpenCL_reduce_<vendor>" when findOpenCL(vendor) returns a true value.
+function createProject (vendor)
+	if not findOpenCL(vendor) then
+		return
+	end
+
+	local projectName = "OpenCL_reduce_" .. vendor
+	local hostLibrary = "OpenCL_lib_parallel_primitives_host_" .. vendor
+
+	project (projectName)
+	initOpenCL(vendor)
+
+	language "C++"
+	kind "ConsoleApp"
+	targetdir "../../bin"
+
+	-- the linked library name carries the same vendor suffix as this project
+	links {
+		hostLibrary
+	}
+
+	includedirs {
+		"../basic_initialize"
+	}
+	files {
+		"main.cpp",
+		"../basic_initialize/btOpenCLUtils.cpp",
+		"../basic_initialize/btOpenCLUtils.h"
+	}
+end
+
+-- Register one project per OpenCL vendor.
+local vendors = { "AMD", "NVIDIA", "Intel", "Apple" }
+for _, vendor in ipairs(vendors) do
+	createProject(vendor)
+end
diff --git a/opencl/vector_add/VectorAddKernels.cl b/opencl/vector_add/VectorAddKernels.cl
new file mode 100644
index 000000000..2ff17826a
--- /dev/null
+++ b/opencl/vector_add/VectorAddKernels.cl
@@ -0,0 +1,16 @@
+
+
+__kernel void VectorAdd(__global const float8* a, __global const float8* b, __global float8* c, int numElements)	// c[i] = a[i] + b[i], one float8 element per work-item
+{
+    // get oct-float index into global data array
+    int iGID = get_global_id(0);
+	if (iGID>=numElements)	// numElements is compared with the float8 index, so it counts float8 elements, not floats
+		return;
+	// load both float8 operands for this work-item
+	float8 aGID = a[iGID];
+	float8 bGID = b[iGID];
+	// component-wise sum of the eight lanes
+	float8 result = aGID + bGID;
+    // write back out to GMEM
+    c[iGID] = result;
+}
diff --git a/opencl/vector_add/VectorAddKernels.h b/opencl/vector_add/VectorAddKernels.h
new file mode 100644
index 000000000..55c238aae
--- /dev/null
+++ b/opencl/vector_add/VectorAddKernels.h
@@ -0,0 +1,20 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project; vectorAddCL holds the OpenCL source of the VectorAdd kernel as one C string (NOTE(review): presumably generated from VectorAddKernels.cl - regenerate rather than edit by hand)
+static const char* vectorAddCL= \
+"\n"
+"\n"
+"__kernel void VectorAdd(__global const float8* a, __global const float8* b, __global float8* c, int numElements)\n"
+"{\n"
+"    // get oct-float index into global data array\n"
+"    int iGID = get_global_id(0);\n"
+"	if (iGID>=numElements)\n"
+"		return;\n"
+"\n"
+"	float8 aGID = a[iGID];\n"
+"	float8 bGID = b[iGID];\n"
+"\n"
+"	float8 result = aGID + bGID;\n"
+"    // write back out to GMEM\n"
+"    c[iGID] = result;\n"
+"}\n"
+"\n"
+;
diff --git a/opencl/vector_add/main.cpp b/opencl/vector_add/main.cpp
new file mode 100644
index 000000000..aa4132d98
--- /dev/null
+++ b/opencl/vector_add/main.cpp
@@ -0,0 +1,408 @@
+
+///VectorAdd sample, from the NVidia JumpStart Guide
+///http://developer.download.nvidia.com/OpenCL/NVIDIA_OpenCL_JumpStart_Guide.pdf
+
+///The OpenCL header is included directly: <OpenCL/OpenCL.h> on Apple, <CL/cl.h> everywhere else
+///Apart from this include file, all other code should compile and work on any OpenCL-compliant implementation
+
+
+#define LOAD_FROM_FILE
+
+#ifdef __APPLE__
+	#include <OpenCL/OpenCL.h>
+#else
+	#include <CL/cl.h>
+#endif //__APPLE__
+#ifdef _WIN32
+#pragma warning (disable:4996)
+#endif
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define GRID3DOCL_CHECKERROR(a, b) if((a)!=(b)) { printf("3D GRID OCL Error : %d\n", (a)); btAssert((a) == (b)); }
+size_t wgSize;
+
+#include "VectorAddKernels.h"
+
+#ifdef CL_PLATFORM_INTEL
+	const char* preferredPlatform = "Intel(R) Corporation";
+#elif defined CL_PLATFORM_AMD
+	const char* preferredPlatform = "Advanced Micro Devices, Inc.";
+#elif defined CL_PLATFORM_NVIDIA
+	const char* preferredPlatform = "NVIDIA Corporation";
+#else
+	const char* preferredPlatform = "Unknown";
+#endif
+
+
+
+// Returns a malloc'ed, NUL-terminated string holding cPreamble followed by the contents of
+// cFilename (the caller must free() it); *szFinalLength, if non-NULL, receives its length.
+// Returns NULL when the file cannot be opened, measured, buffered or read completely.
+char* loadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
+{
+    FILE* pFileStream = fopen(cFilename, "rb");
+    if (pFileStream == NULL)
+        return NULL;
+    const size_t szPreambleLength = strlen(cPreamble);
+    char* cSourceString = NULL;
+    // get the length of the source code; ftell reports -1 on failure
+    long lSourceLength = -1;
+    if (fseek(pFileStream, 0, SEEK_END) == 0)
+        lSourceLength = ftell(pFileStream);
+    if (lSourceLength >= 0 && fseek(pFileStream, 0, SEEK_SET) == 0)
+    {
+        const size_t szSourceLength = (size_t)lSourceLength;
+        cSourceString = (char*)malloc(szPreambleLength + szSourceLength + 1);
+        // a short read would leave uninitialised bytes in the result, so it counts as failure
+        if (cSourceString && fread(cSourceString + szPreambleLength, 1, szSourceLength, pFileStream) != szSourceLength)
+        {
+            free(cSourceString);
+            cSourceString = NULL;
+        }
+        if (cSourceString)
+        {
+            memcpy(cSourceString, cPreamble, szPreambleLength);
+            cSourceString[szPreambleLength + szSourceLength] = '\0';
+            if (szFinalLength != NULL)
+                *szFinalLength = szPreambleLength + szSourceLength;
+        }
+    }
+    fclose(pFileStream);
+    return cSourceString;
+}
+
+size_t workitem_size[3];
+
+// Prints the name, type flags, compute-unit count and per-dimension work-item limits of
+// `device` to stdout. The limits are stored in the file-level workitem_size array.
+void printDevInfo(cl_device_id device)
+{
+    // start empty so a failed query prints nothing instead of uninitialised memory
+    char device_string[1024] = "";
+    clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_string), device_string, NULL);
+    printf(  " Device %s:\n", device_string);
+
+    // CL_DEVICE_TYPE is tested bit by bit, so more than one of these lines may be printed
+    cl_device_type type = 0;
+    clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(type), &type, NULL);
+    if( type & CL_DEVICE_TYPE_CPU )
+        printf(" CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_CPU");
+    if( type & CL_DEVICE_TYPE_GPU )
+        printf(  " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_GPU");
+    if( type & CL_DEVICE_TYPE_ACCELERATOR )
+        printf(  " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR");
+    if( type & CL_DEVICE_TYPE_DEFAULT )
+        printf(  " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT");
+
+    // CL_DEVICE_MAX_COMPUTE_UNITS (a cl_uint, hence %u)
+    cl_uint compute_units = 0;
+    clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
+    printf(  " CL_DEVICE_MAX_COMPUTE_UNITS:\t%u\n", compute_units);
+
+    // CL_DEVICE_MAX_WORK_ITEM_SIZES: size_t values, cast because %u expects unsigned int
+    clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), workitem_size, NULL);
+    printf(  " CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", (unsigned int)workitem_size[0], (unsigned int)workitem_size[1], (unsigned int)workitem_size[2]);
+}
+
+
+
+
+// Main function: adds two float arrays of iTestN elements on the first device of the
+// preferred OpenCL platform, reads the sum back and validates it against a host-side sum.
+// Steps: host buffers -> platform/context/queue -> device buffers -> program/kernel ->
+// launch -> read back -> release -> validate.
+// Failures during OpenCL setup or launch print a message and call exit(0).
+// *********************************************************************
+int main(int argc, char **argv)
+{
+	void *srcA, *srcB, *dst;        // Host buffers for OpenCL test
+	cl_context cxGPUContext;        // OpenCL context
+	cl_command_queue cqCommandQue;  // OpenCL command que
+	cl_device_id* cdDevices;        // OpenCL device list
+	cl_program cpProgram;           // OpenCL program
+	cl_kernel ckKernel;             // OpenCL kernel
+	cl_mem cmMemObjs[3];            // OpenCL memory buffer objects:  3 for device
+	size_t szGlobalWorkSize[1];     // 1D var for Total # of work items
+	size_t szParmDataBytes = 0;     // Byte size of context information
+	cl_int ciErr1, ciErr2;          // Error code var
+
+	int iTestN = 100000 * 8;        // Size of Vectors to process
+
+	// the kernel adds float8 values, so each work item covers 8 consecutive floats
+	int actualGlobalSize = iTestN / 8;
+	szGlobalWorkSize[0] = actualGlobalSize;
+
+	// Allocate and initialize host arrays
+	srcA = (void *)malloc (sizeof(cl_float) * iTestN);
+	srcB = (void *)malloc (sizeof(cl_float) * iTestN);
+	dst = (void *)malloc (sizeof(cl_float) * iTestN);
+	if (!srcA || !srcB || !dst)
+	{
+		printf("Out of host memory, exiting\n");
+		exit(0);
+	}
+
+	int i;
+
+	// Initialize arrays with some values; dst starts at -1, which no expected sum equals
+	for (i=0;i<iTestN;i++)
+	{
+		((cl_float*)srcA)[i] = cl_float(i);
+		((cl_float*)srcB)[i] = 2;
+		((cl_float*)dst)[i]=-1;
+	}
+
+	// Pick the platform whose vendor string equals preferredPlatform. platform is overwritten
+	// on every iteration, so when nothing matches the last enumerated platform is used.
+	cl_uint numPlatforms = 0;       // stays 0 if the query below fails
+	cl_platform_id platform = NULL;
+	clGetPlatformIDs(0, NULL, &numPlatforms);
+
+	if (0 < numPlatforms)
+	{
+		cl_platform_id* platforms = new cl_platform_id[numPlatforms];
+		clGetPlatformIDs(numPlatforms, platforms, NULL);
+
+		for (unsigned p = 0; p < numPlatforms; ++p)
+		{
+			char pbuf[100] = "";    // stays empty if the vendor query fails
+			clGetPlatformInfo(platforms[p],
+							   CL_PLATFORM_VENDOR,
+							   sizeof(pbuf),
+							   pbuf,
+							   NULL);
+
+			platform = platforms[p];
+			if (!strcmp(pbuf, preferredPlatform))
+			{
+				printf("Found platform %s\n", preferredPlatform);
+				break;
+			}
+		}
+		delete[] platforms;
+	}
+
+	// zero-terminated property list that ties the context to the chosen platform
+	cl_context_properties cps[3] =
+	{
+		CL_CONTEXT_PLATFORM,
+		(cl_context_properties)platform,
+		0
+	};
+
+	// Create OpenCL context
+	cxGPUContext = clCreateContextFromType(cps, CL_DEVICE_TYPE_ALL, NULL, NULL, &ciErr1); //could also be CL_DEVICE_TYPE_GPU
+	if (ciErr1 != CL_SUCCESS)
+	{
+		printf("Error in clCreateContextFromType, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
+		exit(0);
+	}
+
+	// Query all devices available to the context: first the byte size, then the list itself
+	ciErr1 = clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
+	cdDevices = (szParmDataBytes >= sizeof(cl_device_id)) ? (cl_device_id*)malloc(szParmDataBytes) : NULL;
+	if (ciErr1 == CL_SUCCESS && cdDevices)
+	{
+		ciErr1 = clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
+	}
+	if (ciErr1 != CL_SUCCESS || !cdDevices)
+	{
+		// cdDevices[0] is used unconditionally below, so a missing device list is fatal
+		printf("Error in clGetContextInfo, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
+		exit(0);
+	}
+	printDevInfo(cdDevices[0]);
+
+	// Create a command queue for first device the context reported
+	cqCommandQue = clCreateCommandQueue(cxGPUContext, cdDevices[0], 0, &ciErr2);
+	ciErr1 |= ciErr2;
+
+	// Allocate the OpenCL source and result buffer memory objects on the device GMEM;
+	// CL_MEM_COPY_HOST_PTR makes the two input buffers start out as copies of srcA and srcB
+	const size_t szBufferBytes = sizeof(cl_float8) * szGlobalWorkSize[0];
+	cmMemObjs[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, szBufferBytes, srcA, &ciErr2);
+	ciErr1 |= ciErr2;
+	cmMemObjs[1] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, szBufferBytes, srcB, &ciErr2);
+	ciErr1 |= ciErr2;
+	cmMemObjs[2] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, szBufferBytes, NULL, &ciErr2);
+	ciErr1 |= ciErr2;
+
+	// ciErr1 is overwritten by clCreateProgramWithSource below, so report accumulated errors now
+	if (ciErr1 != CL_SUCCESS)
+	{
+		printf("Error creating command queue or buffers, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
+		exit(0);
+	}
+
+	// Read the OpenCL kernel in from source file
+	const char* cSourceFile = "opencl/vector_add/VectorAddKernels.cl";
+
+	// LOAD_FROM_FILE (defined at the top of this file) selects the .cl file over the vectorAddCL string
+#ifdef LOAD_FROM_FILE
+	size_t szKernelLength;
+
+	char* cLoadedSource = 0;        // malloc'ed by loadProgSource, freed after the build
+	char relativeFileName[1024];
+
+	{
+		// try the file relative to one, two, three and four directory levels up
+		const char* prefix[]={"../","../../","../../../","../../../../"};
+		int numPrefixes = sizeof(prefix)/sizeof(char*);
+
+		for (int k=0;!cLoadedSource && k<numPrefixes;k++)
+		{
+			sprintf(relativeFileName,"%s%s",prefix[k],cSourceFile);
+			cLoadedSource = loadProgSource(relativeFileName, "", &szKernelLength);
+			if (cLoadedSource)
+			{
+				printf("Loaded program source: %s\n", relativeFileName); 
+			}
+		}
+	}
+
+	if (!cLoadedSource)
+	{
+		printf("Couldn't find file %s, exiting\n",cSourceFile);
+		exit(0);
+	}
+	const char* cSourceCL = cLoadedSource;
+#else
+	const char* cSourceCL = vectorAddCL;
+	size_t szKernelLength = strlen(cSourceCL);
+#endif //LOAD_FROM_FILE
+
+	// Create the program
+	cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1);
+	printf("clCreateProgramWithSource...\n"); 
+	if (ciErr1 != CL_SUCCESS)
+	{
+		printf("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
+		exit(0);
+	}
+
+	// Build the program with 'mad' Optimization option
+#ifdef MAC
+	const char* flags = "-cl-mad-enable -DMAC ";
+#else
+	char flags[1024]={0};
+#ifdef CL_PLATFORM_INTEL
+	// NOTE(review): absolute path from a developer machine - confirm it is still wanted
+	sprintf(flags,"-g -s \"%s\"","C:/develop/experiments/opencl/vector_add/VectorAddKernels.cl");
+#endif//CL_PLATFORM_INTEL
+
+#endif//MAC
+	ciErr1 = clBuildProgram(cpProgram, 0, NULL, flags, NULL, NULL);
+	printf("clBuildProgram...\n"); 
+	if (ciErr1 != CL_SUCCESS)
+	{
+		printf("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
+		exit(0);
+	}
+
+#ifdef LOAD_FROM_FILE
+	free(cLoadedSource);            // the host copy of the source is not used after the build
+	cLoadedSource = 0;
+#endif //LOAD_FROM_FILE
+
+	// Create the kernel
+	ckKernel = clCreateKernel(cpProgram, "VectorAdd", &ciErr1);
+	printf("clCreateKernel (VectorAdd)...\n"); 
+	if (ciErr1 != CL_SUCCESS)
+	{
+		printf("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
+		exit(0);
+	}
+
+	// Ask how many work items a work group may hold for this kernel on this device
+	ciErr1 = clGetKernelWorkGroupInfo(ckKernel, cdDevices[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL);
+	if (ciErr1 != CL_SUCCESS)
+	{
+		printf("cannot get workgroup size\n");
+		exit(0);
+	}
+
+	// Set the Argument values
+	ciErr1 |= clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmMemObjs[0]);
+	ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmMemObjs[1]);
+	ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmMemObjs[2]);
+	ciErr1 |= clSetKernelArg(ckKernel, 3, sizeof(int), (void*)&actualGlobalSize);
+
+	// nothing has run yet, so the prompt announces the launch instead of "quit"
+	printf("Press ENTER to launch the kernel\n");
+	getchar();
+
+	// wgSize was filled in by clGetKernelWorkGroupInfo above
+	if (wgSize == 0)
+	{ // let OpenCL library calculate workgroup size
+		size_t globalThreads[] = {szGlobalWorkSize[0]};
+		ciErr1 |= clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, globalThreads, NULL, 0, NULL, NULL);
+	}
+	else
+	{
+		// round the global size up to a whole number of work groups; the kernel receives
+		// actualGlobalSize as numElements so it can ignore the surplus work items
+		size_t numGroups = (szGlobalWorkSize[0] + wgSize - 1) / wgSize;
+		size_t globalThreads[] = {numGroups * wgSize};
+		size_t localThreads[] = {wgSize};
+		ciErr1 |= clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, globalThreads, localThreads, 0, NULL, NULL);
+	}
+
+	// covers clSetKernelArg and clEnqueueNDRangeKernel, both of which accumulate into ciErr1
+	if (ciErr1 != CL_SUCCESS)
+	{
+		printf("cannot clEnqueueNDRangeKernel\n");
+		exit(0);
+	}
+
+	// wait for the kernel to complete before reading the result
+	clFinish(cqCommandQue);
+	// Read back results (blocking read); on failure an error is printed and validation still runs
+	ciErr1 = clEnqueueReadBuffer(cqCommandQue, cmMemObjs[2], CL_TRUE, 0, szBufferBytes, dst, 0, NULL, NULL);
+	if (ciErr1 != CL_SUCCESS)
+	{
+		printf("Error in clEnqueueReadBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
+	}
+
+	// Release kernel, program, and memory objects
+	// NOTE:  Most properly this should be done at any of the exit points above, but it is omitted elsewhere for clarity.
+	for (i = 0; i < 3; i++)
+	{
+		clReleaseMemObject(cmMemObjs[i]);
+	}
+	free(cdDevices);
+	clReleaseKernel(ckKernel);
+	clReleaseProgram(cpProgram);
+	clReleaseCommandQueue(cqCommandQue);
+	clReleaseContext(cxGPUContext);
+
+	// count the elements that differ from the sum computed on the host; the comparison is
+	// exact because every input and every sum here is an integer below 2^24
+	int iErrorCount = 0;
+	for (i = 0; i < iTestN; i++)
+	{
+		if (((float*)dst)[i] != ((float*)srcA)[i]+((float*)srcB)[i])
+			iErrorCount++;
+	}
+
+	if (iErrorCount)
+	{
+		printf("Validation FAILED\n");
+	} else
+	{
+		printf("Validation SUCCESSFULL\n");
+	}
+
+	// Free host memory and return success
+	free(srcA);
+	free(srcB);
+	free (dst);
+
+	printf("Press ENTER to quit\n");
+	getchar();
+	return 0;
+}
+
+
diff --git a/opencl/vector_add/premake4.lua b/opencl/vector_add/premake4.lua
new file mode 100644
index 000000000..ccaabd73b
--- /dev/null
+++ b/opencl/vector_add/premake4.lua
@@ -0,0 +1,28 @@
+-- Declares the premake project "OpenCL_VectorAdd_<vendor>" (C++ console application,
+-- target directory ../../bin) when findOpenCL(vendor) returns a true value; otherwise nothing.
+function createProject(vendor)
+	-- local: a bare assignment would create/overwrite the global "hasCL" on every call
+	local hasCL = findOpenCL(vendor)
+
+	if (hasCL) then
+		project ("OpenCL_VectorAdd_" .. vendor)
+
+		-- findOpenCL and initOpenCL are not defined in this script; they must already be in scope
+		initOpenCL(vendor)
+
+		language "C++"
+		kind "ConsoleApp"
+		targetdir "../../bin"
+
+		files {
+			"main.cpp",
+			"../basic_initialize/btOpenCLUtils.cpp",
+			"../basic_initialize/btOpenCLUtils.h"
+		}
+	end
+end
+	
+createProject("AMD")	-- one call per vendor; createProject declares nothing when findOpenCL(vendor) is false
+createProject("Intel")
+createProject("NVIDIA")
+createProject("Apple")
diff --git a/opencl/vector_add_simplified/main.cpp b/opencl/vector_add_simplified/main.cpp
new file mode 100644
index 000000000..d911ec2c2
--- /dev/null
+++ b/opencl/vector_add_simplified/main.cpp
@@ -0,0 +1,69 @@
+///original author: Erwin Coumans
+#include "btOpenCLUtils.h"
+#include "../parallel_primitives/host/btOpenCLArray.h"
+#include "../parallel_primitives/host/btLauncherCL.h"
+#include <stdio.h>
+
+// kernelString: OpenCL C source of the VectorAdd kernel (c[i] = a[i] + b[i] for i < numElements), turned into a string literal by MSTRINGIFY
+#define MSTRINGIFY(A) #A
+const char* kernelString= MSTRINGIFY(
+__kernel void VectorAdd(__global const float* a, __global const float* b, __global float* c, int numElements)
+{
+  int iGID = get_global_id(0);
+	if (iGID>=numElements)
+		return;
+	float aGID = a[iGID];
+	float bGID = b[iGID];
+	float result = aGID + bGID;
+    c[iGID] = result;
+}
+);
+
+// Adds two 32-element float arrays on an OpenCL GPU device and prints every element of the sum.
+int main(int argc, char* argv[])
+{
+	int ciErrNum = 0;
+	int preferred_device = -1;
+	int preferred_platform = -1;
+	cl_platform_id platformId = 0;	// initialised so it is never read while indeterminate
+	cl_context ctx = btOpenCLUtils::createContextFromType(CL_DEVICE_TYPE_GPU, &ciErrNum,0,0,preferred_device,preferred_platform,&platformId);
+	if (!ctx) {
+		printf("No OpenCL capable GPU found!");
+		return 0;
+	}
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+	btOpenCLUtils::printPlatformInfo(platformId);
+	cl_device_id device = btOpenCLUtils::getDevice(ctx,0);
+	cl_command_queue queue = clCreateCommandQueue(ctx, device, 0, &ciErrNum);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);	// &ciErrNum is handed to the kernel compile next
+	cl_kernel addKernel = btOpenCLUtils::compileCLKernelFromString(ctx,device,kernelString,"VectorAdd",&ciErrNum);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+	{
+		// inner scope: the arrays and the launcher are destroyed before queue/ctx are released
+		int numElements = 32;
+		btOpenCLArray<float> a(ctx,queue);
+		btOpenCLArray<float> b(ctx,queue);
+		btOpenCLArray<float> c(ctx,queue);
+		for (int i=0;i<numElements;i++)
+		{
+			a.push_back(float(i));
+			b.push_back(float(i));
+		}
+		c.resize(numElements);
+		btLauncherCL launcher( queue, addKernel);
+		// the calls follow the kernel's parameter order: a, b, c, numElements
+		launcher.setBuffer( a.getBufferCL());
+		launcher.setBuffer( b.getBufferCL());
+		launcher.setBuffer( c.getBufferCL());
+		launcher.setConst(  numElements );
+		launcher.launch1D( numElements);
+		for (int i=0;i<numElements;i++)
+		{
+			float v = c.at(i);
+			printf("c[%d]=%f\n",i,v);
+		}
+	}
+	clReleaseKernel(addKernel);
+	clReleaseCommandQueue(queue);
+	clReleaseContext(ctx);
+	return 0;
+}
\ No newline at end of file
diff --git a/opencl/vector_add_simplified/premake4.lua b/opencl/vector_add_simplified/premake4.lua
new file mode 100644
index 000000000..c459c16dc
--- /dev/null
+++ b/opencl/vector_add_simplified/premake4.lua
@@ -0,0 +1,37 @@
+
+-- Declares the premake project "OpenCL_vector_add_simplified_<vendor>" (a C++ console
+-- application with target directory ../../bin) when findOpenCL(vendor) returns a true
+-- value; otherwise it declares nothing.
+function createProject (vendor)
+	-- guard clause: without OpenCL for this vendor there is no project to describe
+	if not findOpenCL(vendor) then
+		return
+	end
+
+	local projectName = "OpenCL_vector_add_simplified_" .. vendor
+	local hostLibrary = "OpenCL_lib_parallel_primitives_host_" .. vendor
+	local utilsDir = "../basic_initialize"
+
+	project (projectName)
+	initOpenCL(vendor)
+
+	language ("C++")
+	kind ("ConsoleApp")
+	targetdir ("../../bin")
+
+	-- links against the library whose name carries the same vendor suffix
+	links { hostLibrary }
+	includedirs { utilsDir }
+
+	-- btOpenCLUtils is listed as a source of this project, taken from the sibling directory
+	files {
+		"main.cpp",
+		utilsDir .. "/btOpenCLUtils.cpp",
+		utilsDir .. "/btOpenCLUtils.h",
+	}
+end
+
+createProject("AMD")	-- one call per vendor; createProject declares nothing when findOpenCL(vendor) is false
+createProject("NVIDIA")
+createProject("Intel")
+createProject("Apple")