commit 08272c7de5240f204532d282a92cfb76926d68ad Author: erwin coumans Date: Mon Mar 11 22:03:27 2013 +0100 import opencl_course source for a start diff --git a/build/findDirectX11.lua b/build/findDirectX11.lua new file mode 100644 index 000000000..68771c4a0 --- /dev/null +++ b/build/findDirectX11.lua @@ -0,0 +1,36 @@ +function findDirectX11() + local dx11path = os.getenv("DXSDK_DIR") + if (dx11path) then + local filepath = string.format("%s%s",dx11path,"Include/D3D11.h") + headerdx11 = io.open(filepath, "r") + if (headerdx11) then + printf("Found DX11: '%s'", filepath) + return true + end + end + return false + end + +function initDirectX11() + configuration {} + + local dx11path = os.getenv("DXSDK_DIR") + defines { "ADL_ENABLE_DX11"} + includedirs {"$(DXSDK_DIR)/include"} + + configuration "x32" + libdirs {"$(DXSDK_DIR)/Lib/x86"} + configuration "x64" + libdirs {"$(DXSDK_DIR)/Lib/x64"} + configuration {} + links {"d3dcompiler", + "dxerr", + "dxguid", + "d3dx9", + "d3d9", + "winmm", + "comctl32", + "d3dx11" + } + return true +end \ No newline at end of file diff --git a/build/findOpenCL.lua b/build/findOpenCL.lua new file mode 100644 index 000000000..e2eda667a --- /dev/null +++ b/build/findOpenCL.lua @@ -0,0 +1,151 @@ + + + function findOpenCL_Apple() + if os.is("macosx") then + return true + else + return false + end + end + + + function findOpenCL_AMD() + local amdopenclpath = os.getenv("AMDAPPSDKROOT") + if (amdopenclpath) then + return true + end + return false + end + + function findOpenCL_NVIDIA() + local nvidiaopenclpath = os.getenv("CUDA_PATH") + if (nvidiaopenclpath) then + return true + end + return false + end + + function findOpenCL_Intel() + if os.is("Windows") then + local intelopenclpath = os.getenv("INTELOCLSDKROOT") + if (intelopenclpath) then + return true + end + end + if os.is("Linux") then + local intelsdk = io.open("/usr/include/CL/opencl.h","r") + if (intelsdk) then + return true; + end + end + return false + end + + function initOpenCL_Apple() + configuration{} + includedirs { + "/System/Library/Frameworks/OpenCL.framework" + } + libdirs "/System/Library/Frameworks/OpenCL.framework" + links + { + "OpenCL.framework" + } + end + + function initOpenCL_AMD() + configuration {} + local amdopenclpath = os.getenv("AMDAPPSDKROOT") + if (amdopenclpath) then + defines { "ADL_ENABLE_CL" , "CL_PLATFORM_AMD"} + includedirs { + "$(AMDAPPSDKROOT)/include" + } + configuration "x32" + libdirs {"$(AMDAPPSDKROOT)/lib/x86"} + configuration "x64" + libdirs {"$(AMDAPPSDKROOT)/lib/x86_64"} + configuration {} + links {"OpenCL"} + return true + end + return false + end + + + function initOpenCL_NVIDIA() + configuration {} + local nvidiaopenclpath = os.getenv("CUDA_PATH") + if (nvidiaopenclpath) then + defines { "ADL_ENABLE_CL" , "CL_PLATFORM_NVIDIA"} + includedirs { + "$(CUDA_PATH)/include" + } + configuration "x32" + libdirs {"$(CUDA_PATH)/lib/Win32"} + configuration "x64" + libdirs {"$(CUDA_PATH)/lib/x64"} + configuration {} + links {"OpenCL"} + return true + end + return false + end + + function initOpenCL_Intel() + configuration {} + if os.is("Windows") then + local intelopenclpath = os.getenv("INTELOCLSDKROOT") + if (intelopenclpath) then + defines { "ADL_ENABLE_CL" , "CL_PLATFORM_INTEL"} + includedirs { + "$(INTELOCLSDKROOT)/include" + } + configuration "x32" + libdirs {"$(INTELOCLSDKROOT)/lib/x86"} + configuration "x64" + libdirs {"$(INTELOCLSDKROOT)/lib/x64"} + configuration {} + links {"OpenCL"} + return true + end + end + if os.is("Linux") then + defines { "ADL_ENABLE_CL" , "CL_PLATFORM_INTEL"} + configuration {} + links {"OpenCL"} + end + return false + end + + function findOpenCL (vendor ) + if vendor=="AMD" then + return findOpenCL_AMD() + end + if vendor=="NVIDIA" then + return findOpenCL_NVIDIA() + end + if vendor=="Intel" then + return findOpenCL_Intel() + end + if vendor=="Apple" then + return findOpenCL_Apple() + end + return false + end + + function initOpenCL ( vendor ) + if vendor=="AMD" then + initOpenCL_AMD() + end + if vendor=="NVIDIA" then + return initOpenCL_NVIDIA() + end + if vendor=="Intel" then + initOpenCL_Intel() + end + if vendor=="Apple" then + return initOpenCL_Apple() + end + end + diff --git a/build/findOpenGLGlewGlut.lua b/build/findOpenGLGlewGlut.lua new file mode 100644 index 000000000..f808972da --- /dev/null +++ b/build/findOpenGLGlewGlut.lua @@ -0,0 +1,51 @@ + + function initOpenGL() + configuration {} + configuration {"Windows"} + links {"opengl32","glu32"} + configuration {"MacOSX"} + links { "OpenGL.framework"} + configuration {"not Windows", "not MacOSX"} + links {"GL"} + configuration{} + end + + function initGlut() + configuration {} + configuration {"Windows"} + + includedirs { + projectRootDir .. "rendering/GlutGlewWindows" + } + libdirs { projectRootDir .. "rendering/GlutGlewWindows"} + configuration {"Windows", "x32"} + links {"glut32"} + configuration {"Windows", "x64"} + links {"glut64"} + + configuration {"MacOSX"} + links { "Glut.framework" } + configuration {"Linux"} + links {"glut","GLU"} + configuration{} + end + + function initGlew() + configuration {} + if os.is("Windows") then + configuration {"Windows"} + defines { "GLEW_STATIC"} + includedirs { + projectRootDir .. "rendering/GlutGlewWindows" + } + libdirs { projectRootDir .. "rendering/GlutGlewWindows"} + files { projectRootDir .. "rendering/GlutGlewWindows/glew.c"} + end + if os.is("Linux") then + links{"GLEW"} + end + configuration{} + end + + + diff --git a/build/premake4.exe b/build/premake4.exe new file mode 100644 index 000000000..072560edb Binary files /dev/null and b/build/premake4.exe differ diff --git a/build/premake4.lua b/build/premake4.lua new file mode 100644 index 000000000..16a50c9c0 --- /dev/null +++ b/build/premake4.lua @@ -0,0 +1,96 @@ + + solution "0MySolution" + + -- Multithreaded compiling + if _ACTION == "vs2010" or _ACTION=="vs2008" then + buildoptions { "/MP" } + end + + act = "" + + if _ACTION then + act = _ACTION + end + + + newoption + { + trigger = "ios", + description = "Enable iOS target (requires xcode4)" + } + + + configurations {"Release", "Debug"} + configuration "Release" + flags { "Optimize", "EnableSSE","StaticRuntime", "NoMinimalRebuild", "FloatFast"} + configuration "Debug" + defines {"_DEBUG=1"} + flags { "Symbols", "StaticRuntime" , "NoMinimalRebuild", "NoEditAndContinue" ,"FloatFast"} + + platforms {"x32", "x64"} + + configuration {"x32"} + targetsuffix ("_" .. act) + configuration "x64" + targetsuffix ("_" .. act .. "_64" ) + configuration {"x64", "debug"} + targetsuffix ("_" .. act .. "_x64_debug") + configuration {"x64", "release"} + targetsuffix ("_" .. act .. "_x64_release" ) + configuration {"x32", "debug"} + targetsuffix ("_" .. act .. "_debug" ) + + configuration{} + + postfix="" + + if _ACTION == "xcode4" then + if _OPTIONS["ios"] then + postfix = "ios"; + xcodebuildsettings + { + 'CODE_SIGN_IDENTITY = "iPhone Developer"', + "SDKROOT = iphoneos", + 'ARCHS = "armv7"', + 'TARGETED_DEVICE_FAMILY = "1,2"', + 'VALID_ARCHS = "armv7"', + } + else + xcodebuildsettings + { + 'ARCHS = "$(ARCHS_STANDARD_32_BIT) $(ARCHS_STANDARD_64_BIT)"', + 'VALID_ARCHS = "x86_64 i386"', + } + end + end + + + flags { "NoRTTI", "NoExceptions"} + defines { "_HAS_EXCEPTIONS=0" } + targetdir "../bin" + location("./" .. act .. postfix) + + + projectRootDir = os.getcwd() .. "/../" + print("Project root directroy: " .. projectRootDir); + + dofile ("findOpenCL.lua") + dofile ("findDirectX11.lua") + dofile ("findOpenGLGlewGlut.lua") + + language "C++" + + + + if not _OPTIONS["ios"] then + include "../opencl/vector_add_simplified" + include "../opencl/vector_add" + include "../opencl/basic_initialize" + include "../opencl/parallel_primitives/host" + include "../opencl/parallel_primitives/test" + include "../opencl/parallel_primitives/benchmark" + include "../opencl/lds_bank_conflict" + include "../opencl/reduce" + + + end \ No newline at end of file diff --git a/build/premake4_linux b/build/premake4_linux new file mode 100644 index 000000000..53442a801 Binary files /dev/null and b/build/premake4_linux differ diff --git a/build/premake4_linux64 b/build/premake4_linux64 new file mode 100644 index 000000000..4724db588 Binary files /dev/null and b/build/premake4_linux64 differ diff --git a/build/premake4_osx b/build/premake4_osx new file mode 100644 index 000000000..67e25d5b9 Binary files /dev/null and b/build/premake4_osx differ diff --git a/build/stringify.bat b/build/stringify.bat new file mode 100644 index 000000000..890f27d9d --- /dev/null +++ b/build/stringify.bat @@ -0,0 +1,13 @@ + +@echo off + + +premake4 --file=stringifyKernel.lua --kernelfile="../opencl/vector_add/VectorAddKernels.cl" --headerfile="../opencl/vector_add/VectorAddKernels.h" --stringname="vectorAddCL" stringify + +premake4 --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/RadixSort32Kernels.cl" --headerfile="../opencl/parallel_primitives/kernels/RadixSort32KernelsCL.h" --stringname="radixSort32KernelsCL" stringify +premake4 --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/BoundSearchKernels.cl" --headerfile="../opencl/parallel_primitives/kernels/BoundSearchKernelsCL.h" --stringname="boundSearchKernelsCL" stringify +premake4 --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/PrefixScanKernels.cl" --headerfile="../opencl/parallel_primitives/kernels/PrefixScanKernelsCL.h" --stringname="prefixScanKernelsCL" stringify +premake4 --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/FillKernels.cl" --headerfile="../opencl/parallel_primitives/kernels/FillKernelsCL.h" --stringname="fillKernelsCL" stringify + + +pause \ No newline at end of file diff --git a/build/stringify.sh b/build/stringify.sh new file mode 100644 index 000000000..bb93fa405 --- /dev/null +++ b/build/stringify.sh @@ -0,0 +1,8 @@ +#!/bin/sh + +./premake4_osx --file=stringifyKernel.lua --kernelfile="../opencl/vector_add/VectorAddKernels.cl" --headerfile="../opencl/vector_add/VectorAddKernels.h" --stringname="vectorAddCL" stringify +./premake4_osx --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/RadixSort32Kernels.cl" --headerfile="../opencl/parallel_primitives/kernels/RadixSort32KernelsCL.h" --stringname="radixSort32KernelsCL" stringify +./premake4_osx --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/BoundSearchKernels.cl" --headerfile="../opencl/parallel_primitives/kernels/BoundSearchKernelsCL.h" --stringname="boundSearchKernelsCL" stringify +./premake4_osx --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/PrefixScanKernels.cl" --headerfile="../opencl/parallel_primitives/kernels/PrefixScanKernelsCL.h" --stringname="prefixScanKernelsCL" stringify +./premake4_osx --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/FillKernels.cl" --headerfile="../opencl/parallel_primitives/kernels/FillKernelsCL.h" --stringname="fillKernelsCL" stringify + diff --git a/build/stringifyKernel.lua b/build/stringifyKernel.lua new file mode 100644 index 000000000..dea0a73fe --- /dev/null +++ b/build/stringifyKernel.lua @@ -0,0 +1,78 @@ + + +function stringifyKernel(filenameIn, filenameOut, kernelMethod) + local BUFSIZE = 1024*1024 -- 1MB + local f = io.open(filenameIn,"r"); + local fw = io.open(filenameOut,"w"); + fw:write("//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project\n") + fw:write("static const char* " .. kernelMethod .. "= \\\n") + local cc, lc, wc = 0, 0, 0 -- char, line, and word counts + while true do + local lines, rest = f:read(BUFSIZE, "*line") + if not lines then break end + + local i = 0 + local startpos = 0 + local slen = string.len(lines) + local endpos = 0 + while true do + i = string.find(lines, "\n", i+1) -- find 'next' newline + if i == nil then + endpos = slen + else + endpos = i + end + oneline = string.sub(lines,startpos,endpos) + oneline = string.gsub(oneline,"\n","") + oneline = '\"' .. oneline .. '\\n\"' + oneline = string.gsub(oneline,"\\\\n","") + oneline = oneline .. "\n" + --print(oneline) + fw:write(oneline) + if i == nil then break end + startpos = i+1 + end + + if rest then lines = lines .. rest .. '\n' end + cc = cc + string.len(lines) + -- count words in the chunk + local _,t = string.gsub(lines, "%S+", "") + wc = wc + t + -- count newlines in the chunk + _,t = string.gsub(lines, "\n", "\n") + lc = lc + t + end + --print("stringified " .. filenameIn .. " into " .. filenameOut .. " processed " .. lc .. " lines") + print(filenameIn .. " (" .. lc .. " lines)") + + f:close() + fw:write(";\n") + fw:close() + end + + newoption { + trigger = "kernelfile", + value = "kernelpath", + description = "full path to the kernel source input file" + } + + newoption { + trigger = "headerfile", + value = "path", + description = "full path to the header output file" + } + + newoption { + trigger = "stringname", + value = "var", + description = "name of the kernel string variable" + } + + newaction { + trigger = "stringify", + description = "stringify kernels source code into strings", + execute = function () + stringifyKernel( _OPTIONS["kernelfile"] , _OPTIONS["headerfile"], _OPTIONS["stringname"]) + + end +} \ No newline at end of file diff --git a/build/vs2010.bat b/build/vs2010.bat new file mode 100644 index 000000000..584d0baa8 --- /dev/null +++ b/build/vs2010.bat @@ -0,0 +1,6 @@ + +rem premake4 --with-pe vs2010 +premake4 vs2010 + +mkdir vs2010\cache +pause \ No newline at end of file diff --git a/build/xcode.command b/build/xcode.command new file mode 100644 index 000000000..52a4a4a7c --- /dev/null +++ b/build/xcode.command @@ -0,0 +1,4 @@ + +cd `dirname $0` +./premake4_osx xcode4 + diff --git a/opencl/basic_initialize/btOpenCLInclude.h b/opencl/basic_initialize/btOpenCLInclude.h new file mode 100644 index 000000000..5f0e78da6 --- /dev/null +++ b/opencl/basic_initialize/btOpenCLInclude.h @@ -0,0 +1,44 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +#ifndef BT_OPENCL_INCLUDE_H +#define BT_OPENCL_INCLUDE_H + + +#ifdef __APPLE__ +#ifdef USE_MINICL +#include +#else +#include +#include //clLogMessagesToStderrAPPLE +#endif +#else +#ifdef USE_MINICL +#include +#else +#include +#ifdef _WIN32 +#include "CL/cl_gl.h" +#endif //_WIN32 +#endif +#endif //__APPLE__ + +#include +#include +#define oclCHECKERROR(a, b) if((a)!=(b)) { printf("OCL Error : %d\n", (a)); assert((a) == (b)); } + + +#endif //BT_OPENCL_INCLUDE_H + diff --git a/opencl/basic_initialize/btOpenCLUtils.cpp b/opencl/basic_initialize/btOpenCLUtils.cpp new file mode 100644 index 000000000..af29461e7 --- /dev/null +++ b/opencl/basic_initialize/btOpenCLUtils.cpp @@ -0,0 +1,903 @@ +/* +Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org +Copyright (C) 2006 - 2011 Sony Computer Entertainment Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +//original author: Roman Ponomarev +//cleanup by Erwin Coumans + +#include + +#ifdef _WIN32 +#pragma warning (disable:4996) +#endif +#include "btOpenCLUtils.h" +//#include "btOpenCLInclude.h" + +#include +#include + +#define BT_MAX_CL_DEVICES 16 //who needs 16 devices? + +#ifdef _WIN32 +#include +#endif + +#include +#define btAssert assert + + +//Set the preferred platform vendor using the OpenCL SDK +static const char* spPlatformVendor = +#if defined(CL_PLATFORM_MINI_CL) +"MiniCL, SCEA"; +#elif defined(CL_PLATFORM_AMD) +"Advanced Micro Devices, Inc."; +#elif defined(CL_PLATFORM_NVIDIA) +"NVIDIA Corporation"; +#elif defined(CL_PLATFORM_INTEL) +"Intel(R) Corporation"; +#else +"Unknown Vendor"; +#endif + +#ifndef CL_PLATFORM_MINI_CL +#ifdef _WIN32 +#include "CL/cl_gl.h" +#endif //_WIN32 +#endif + +bool gDebugForceLoadingFromSource = false; +bool gDebugSkipLoadingBinary = false; + +void MyFatalBreakAPPLE( const char * errstr , + const void * private_info , + size_t cb , + void * user_data ) +{ + printf("Error: %s\n", errstr); + + const char* patloc = strstr(errstr, "Warning"); + //find out if it is a warning or error, exit if error + + if (patloc) + { + printf("warning\n"); + } else + { + printf("error\n"); + btAssert(0); + } + + +} + + +int btOpenCLUtils_getNumPlatforms(cl_int* pErrNum) +{ + + cl_platform_id pPlatforms[10] = { 0 }; + + cl_uint numPlatforms = 0; + cl_int ciErrNum = clGetPlatformIDs(10, pPlatforms, &numPlatforms); + //cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms); + + if(ciErrNum != CL_SUCCESS) + { + if(pErrNum != NULL) + *pErrNum = ciErrNum; + } + return numPlatforms; + +} + +const char* btOpenCLUtils_getSdkVendorName() +{ + return spPlatformVendor; +} + +cl_platform_id btOpenCLUtils_getPlatform(int platformIndex0, cl_int* pErrNum) +{ + cl_platform_id platform = 0; + unsigned int platformIndex = (unsigned int )platformIndex0; + cl_uint numPlatforms; + cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms); + + if (platformIndex>=0 && platformIndexm_platformVendor,NULL); + oclCHECKERROR(ciErrNum,CL_SUCCESS); + ciErrNum = clGetPlatformInfo( platform,CL_PLATFORM_NAME,BT_MAX_STRING_LENGTH,platformInfo->m_platformName,NULL); + oclCHECKERROR(ciErrNum,CL_SUCCESS); + ciErrNum = clGetPlatformInfo( platform,CL_PLATFORM_VERSION,BT_MAX_STRING_LENGTH,platformInfo->m_platformVersion,NULL); + oclCHECKERROR(ciErrNum,CL_SUCCESS); +} + +void btOpenCLUtils_printPlatformInfo(cl_platform_id platform) +{ + btOpenCLPlatformInfo platformInfo; + btOpenCLUtils::getPlatformInfo (platform, &platformInfo); + printf("Platform info:\n"); + printf(" CL_PLATFORM_VENDOR: \t\t\t%s\n",platformInfo.m_platformVendor); + printf(" CL_PLATFORM_NAME: \t\t\t%s\n",platformInfo.m_platformName); + printf(" CL_PLATFORM_VERSION: \t\t\t%s\n",platformInfo.m_platformVersion); +} + + + +cl_context btOpenCLUtils_createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLContext, void* pGLDC, int preferredDeviceIndex, int preferredPlatformIndex) +{ + cl_context retContext = 0; + cl_int ciErrNum=0; + cl_uint num_entries; + cl_device_id devices[BT_MAX_CL_DEVICES]; + cl_uint num_devices; + cl_context_properties* cprops; + + /* + * If we could find our platform, use it. Otherwise pass a NULL and get whatever the + * implementation thinks we should be using. + */ + cl_context_properties cps[7] = {0,0,0,0,0,0,0}; + cps[0] = CL_CONTEXT_PLATFORM; + cps[1] = (cl_context_properties)platform; +#ifdef _WIN32 + if (pGLContext && pGLDC) + { + cps[2] = CL_GL_CONTEXT_KHR; + cps[3] = (cl_context_properties)pGLContext; + cps[4] = CL_WGL_HDC_KHR; + cps[5] = (cl_context_properties)pGLDC; + } +#endif //_WIN32 + num_entries = BT_MAX_CL_DEVICES; + + + num_devices=-1; + + ciErrNum = clGetDeviceIDs( + platform, + deviceType, + num_entries, + devices, + &num_devices); + + if (ciErrNum<0) + { + printf("clGetDeviceIDs returned %d\n",ciErrNum); + return 0; + } + cprops = (NULL == platform) ? NULL : cps; + + if (!num_devices) + return 0; + + if (pGLContext) + { + //search for the GPU that relates to the OpenCL context + unsigned int i; + for (i=0;i=0 && (unsigned int)preferredDeviceIndex 0) + { + cl_platform_id* platforms = (cl_platform_id*) malloc (sizeof(cl_platform_id)*numPlatforms); + ciErrNum = clGetPlatformIDs(numPlatforms, platforms, NULL); + if(ciErrNum != CL_SUCCESS) + { + if(pErrNum != NULL) + *pErrNum = ciErrNum; + free(platforms); + return NULL; + } + + + + for ( i = 0; i < numPlatforms; ++i) + { + char pbuf[128]; + ciErrNum = clGetPlatformInfo( platforms[i], + CL_PLATFORM_VENDOR, + sizeof(pbuf), + pbuf, + NULL); + if(ciErrNum != CL_SUCCESS) + { + if(pErrNum != NULL) *pErrNum = ciErrNum; + return NULL; + } + + if (preferredPlatformIndex>=0 && i==preferredPlatformIndex) + { + cl_platform_id tmpPlatform = platforms[0]; + platforms[0] = platforms[i]; + platforms[i] = tmpPlatform; + break; + } else + { + if(!strcmp(pbuf, spPlatformVendor)) + { + cl_platform_id tmpPlatform = platforms[0]; + platforms[0] = platforms[i]; + platforms[i] = tmpPlatform; + } + } + } + + for (i = 0; i < numPlatforms; ++i) + { + cl_platform_id platform = platforms[i]; + assert(platform); + + retContext = btOpenCLUtils_createContextFromPlatform(platform,deviceType,pErrNum,pGLContext,pGLDC,preferredDeviceIndex,preferredPlatformIndex); + + if (retContext) + { +// printf("OpenCL platform details:\n"); + btOpenCLPlatformInfo platformInfo; + + btOpenCLUtils::getPlatformInfo(platform, &platformInfo); + + if (retPlatformId) + *retPlatformId = platform; + + break; + } + } + + free (platforms); + } + return retContext; +} + + +////////////////////////////////////////////////////////////////////////////// +//! Gets the id of the nth device from the context +//! +//! @return the id or -1 when out of range +//! @param cxMainContext OpenCL context +//! @param device_idx index of the device of interest +////////////////////////////////////////////////////////////////////////////// +cl_device_id btOpenCLUtils_getDevice(cl_context cxMainContext, int deviceIndex) +{ + assert(cxMainContext); + + size_t szParmDataBytes; + cl_device_id* cdDevices; + cl_device_id device ; + + // get the list of devices associated with context + clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes); + + if( szParmDataBytes / sizeof(cl_device_id) < (unsigned int)deviceIndex ) { + return (cl_device_id)-1; + } + + cdDevices = (cl_device_id*) malloc(szParmDataBytes); + + clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL); + + device = cdDevices[deviceIndex]; + free(cdDevices); + + return device; +} + +int btOpenCLUtils_getNumDevices(cl_context cxMainContext) +{ + size_t szParamDataBytes; + int device_count; + clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, 0, NULL, &szParamDataBytes); + device_count = (int) szParamDataBytes/ sizeof(cl_device_id); + return device_count; +} + + + +void btOpenCLUtils::getDeviceInfo(cl_device_id device, btOpenCLDeviceInfo* info) +{ + // CL_DEVICE_NAME + clGetDeviceInfo(device, CL_DEVICE_NAME, BT_MAX_STRING_LENGTH, &info->m_deviceName, NULL); + + // CL_DEVICE_VENDOR + clGetDeviceInfo(device, CL_DEVICE_VENDOR, BT_MAX_STRING_LENGTH, &info->m_deviceVendor, NULL); + + // CL_DRIVER_VERSION + clGetDeviceInfo(device, CL_DRIVER_VERSION, BT_MAX_STRING_LENGTH, &info->m_driverVersion, NULL); + + // CL_DEVICE_INFO + clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(cl_device_type), &info->m_deviceType, NULL); + + // CL_DEVICE_MAX_COMPUTE_UNITS + clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(info->m_computeUnits), &info->m_computeUnits, NULL); + + // CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS + clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(info->m_workitemDims), &info->m_workitemDims, NULL); + + // CL_DEVICE_MAX_WORK_ITEM_SIZES + clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(info->m_workItemSize), &info->m_workItemSize, NULL); + + // CL_DEVICE_MAX_WORK_GROUP_SIZE + clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(info->m_workgroupSize), &info->m_workgroupSize, NULL); + + // CL_DEVICE_MAX_CLOCK_FREQUENCY + clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(info->m_clockFrequency), &info->m_clockFrequency, NULL); + + // CL_DEVICE_ADDRESS_BITS + clGetDeviceInfo(device, CL_DEVICE_ADDRESS_BITS, sizeof(info->m_addressBits), &info->m_addressBits, NULL); + + // CL_DEVICE_MAX_MEM_ALLOC_SIZE + clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(info->m_maxMemAllocSize), &info->m_maxMemAllocSize, NULL); + + // CL_DEVICE_GLOBAL_MEM_SIZE + clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(info->m_globalMemSize), &info->m_globalMemSize, NULL); + + // CL_DEVICE_ERROR_CORRECTION_SUPPORT + clGetDeviceInfo(device, CL_DEVICE_ERROR_CORRECTION_SUPPORT, sizeof(info->m_errorCorrectionSupport), &info->m_errorCorrectionSupport, NULL); + + // CL_DEVICE_LOCAL_MEM_TYPE + clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof(info->m_localMemType), &info->m_localMemType, NULL); + + // CL_DEVICE_LOCAL_MEM_SIZE + clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(info->m_localMemSize), &info->m_localMemSize, NULL); + + // CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE + clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(info->m_constantBufferSize), &info->m_constantBufferSize, NULL); + + // CL_DEVICE_QUEUE_PROPERTIES + clGetDeviceInfo(device, CL_DEVICE_QUEUE_PROPERTIES, sizeof(info->m_queueProperties), &info->m_queueProperties, NULL); + + // CL_DEVICE_IMAGE_SUPPORT + clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(info->m_imageSupport), &info->m_imageSupport, NULL); + + // CL_DEVICE_MAX_READ_IMAGE_ARGS + clGetDeviceInfo(device, CL_DEVICE_MAX_READ_IMAGE_ARGS, sizeof(info->m_maxReadImageArgs), &info->m_maxReadImageArgs, NULL); + + // CL_DEVICE_MAX_WRITE_IMAGE_ARGS + clGetDeviceInfo(device, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, sizeof(info->m_maxWriteImageArgs), &info->m_maxWriteImageArgs, NULL); + + // CL_DEVICE_IMAGE2D_MAX_WIDTH, CL_DEVICE_IMAGE2D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_WIDTH, CL_DEVICE_IMAGE3D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_DEPTH + clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &info->m_image2dMaxWidth, NULL); + clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &info->m_image2dMaxHeight, NULL); + clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof(size_t), &info->m_image3dMaxWidth, NULL); + clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof(size_t), &info->m_image3dMaxHeight, NULL); + clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof(size_t), &info->m_image3dMaxDepth, NULL); + + // CL_DEVICE_EXTENSIONS: get device extensions, and if any then parse & log the string onto separate lines + clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, BT_MAX_STRING_LENGTH, &info->m_deviceExtensions, NULL); + + // CL_DEVICE_PREFERRED_VECTOR_WIDTH_ + clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, sizeof(cl_uint), &info->m_vecWidthChar, NULL); + clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, sizeof(cl_uint), &info->m_vecWidthShort, NULL); + clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), &info->m_vecWidthInt, NULL); + clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, sizeof(cl_uint), &info->m_vecWidthLong, NULL); + clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, sizeof(cl_uint), &info->m_vecWidthFloat, NULL); + clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof(cl_uint), &info->m_vecWidthDouble, NULL); +} + + +void btOpenCLUtils_printDeviceInfo(cl_device_id device) +{ + btOpenCLDeviceInfo info; + btOpenCLUtils::getDeviceInfo(device,&info); + printf("Device Info:\n"); + printf(" CL_DEVICE_NAME: \t\t\t%s\n", info.m_deviceName); + printf(" CL_DEVICE_VENDOR: \t\t\t%s\n", info.m_deviceVendor); + printf(" CL_DRIVER_VERSION: \t\t\t%s\n", info.m_driverVersion); + + if( info.m_deviceType & CL_DEVICE_TYPE_CPU ) + printf(" CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_CPU"); + if( info.m_deviceType & CL_DEVICE_TYPE_GPU ) + printf(" CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_GPU"); + if( info.m_deviceType & CL_DEVICE_TYPE_ACCELERATOR ) + printf(" CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR"); + if( info.m_deviceType & CL_DEVICE_TYPE_DEFAULT ) + printf(" CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT"); + + printf(" CL_DEVICE_MAX_COMPUTE_UNITS:\t\t%u\n", info.m_computeUnits); + printf(" CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:\t%u\n", info.m_workitemDims); + printf(" CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", info.m_workItemSize[0], info.m_workItemSize[1], info.m_workItemSize[2]); + printf(" CL_DEVICE_MAX_WORK_GROUP_SIZE:\t%u\n", info.m_workgroupSize); + printf(" CL_DEVICE_MAX_CLOCK_FREQUENCY:\t%u MHz\n", info.m_clockFrequency); + printf(" CL_DEVICE_ADDRESS_BITS:\t\t%u\n", info.m_addressBits); + printf(" CL_DEVICE_MAX_MEM_ALLOC_SIZE:\t\t%u MByte\n", (unsigned int)(info.m_maxMemAllocSize/ (1024 * 1024))); + printf(" CL_DEVICE_GLOBAL_MEM_SIZE:\t\t%u MByte\n", (unsigned int)(info.m_globalMemSize/ (1024 * 1024))); + printf(" CL_DEVICE_ERROR_CORRECTION_SUPPORT:\t%s\n", info.m_errorCorrectionSupport== CL_TRUE ? "yes" : "no"); + printf(" CL_DEVICE_LOCAL_MEM_TYPE:\t\t%s\n", info.m_localMemType == 1 ? "local" : "global"); + printf(" CL_DEVICE_LOCAL_MEM_SIZE:\t\t%u KByte\n", (unsigned int)(info.m_localMemSize / 1024)); + printf(" CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:\t%u KByte\n", (unsigned int)(info.m_constantBufferSize / 1024)); + if( info.m_queueProperties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE ) + printf(" CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE"); + if( info.m_queueProperties & CL_QUEUE_PROFILING_ENABLE ) + printf(" CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_PROFILING_ENABLE"); + + printf(" CL_DEVICE_IMAGE_SUPPORT:\t\t%u\n", info.m_imageSupport); + + printf(" CL_DEVICE_MAX_READ_IMAGE_ARGS:\t%u\n", info.m_maxReadImageArgs); + printf(" CL_DEVICE_MAX_WRITE_IMAGE_ARGS:\t%u\n", info.m_maxWriteImageArgs); + printf("\n CL_DEVICE_IMAGE "); + printf("\t\t\t2D_MAX_WIDTH\t %u\n", info.m_image2dMaxWidth); + printf("\t\t\t\t\t2D_MAX_HEIGHT\t %u\n", info.m_image2dMaxHeight); + printf("\t\t\t\t\t3D_MAX_WIDTH\t %u\n", info.m_image3dMaxWidth); + printf("\t\t\t\t\t3D_MAX_HEIGHT\t %u\n", info.m_image3dMaxHeight); + printf("\t\t\t\t\t3D_MAX_DEPTH\t %u\n", info.m_image3dMaxDepth); + if (info.m_deviceExtensions != 0) + printf("\n CL_DEVICE_EXTENSIONS:%s\n",info.m_deviceExtensions); + else + printf(" CL_DEVICE_EXTENSIONS: None\n"); + printf(" CL_DEVICE_PREFERRED_VECTOR_WIDTH_\t"); + printf("CHAR %u, SHORT %u, INT %u,LONG %u, FLOAT %u, DOUBLE %u\n\n\n", + info.m_vecWidthChar, info.m_vecWidthShort, info.m_vecWidthInt, info.m_vecWidthLong,info.m_vecWidthFloat, info.m_vecWidthDouble); + + +} + + +static const char* strip2(const char* name, const char* pattern) +{ + size_t const patlen = strlen(pattern); + size_t patcnt = 0; + const char * oriptr; + const char * patloc; + // find how many times the pattern occurs in the original string + for (oriptr = name; patloc = strstr(oriptr, pattern); oriptr = patloc + patlen) + { + patcnt++; + } + return oriptr; +} + +cl_program btOpenCLUtils_compileCLProgramFromString(cl_context clContext, cl_device_id device, const char* kernelSourceOrg, cl_int* pErrNum, const char* additionalMacrosArg , const char* clFileNameForCaching) +{ + const char* additionalMacros = additionalMacrosArg?additionalMacrosArg:""; + + cl_program m_cpProgram=0; + cl_int status; + +#ifdef _WIN32 + char binaryFileName[BT_MAX_STRING_LENGTH]; + char* bla=0; + + if (clFileNameForCaching && !(gDebugSkipLoadingBinary||gDebugForceLoadingFromSource) ) + { + + char deviceName[256]; + char driverVersion[256]; + const char* strippedName; + int fileUpToDate = 0; + int binaryFileValid=0; + FILETIME modtimeBinary; + + clGetDeviceInfo(device, CL_DEVICE_NAME, 256, &deviceName, NULL); + clGetDeviceInfo(device, CL_DRIVER_VERSION, 256, &driverVersion, NULL); + + + strippedName = strip2(clFileNameForCaching,"\\"); + strippedName = strip2(strippedName,"/"); + +#ifdef _WIN32 + sprintf_s(binaryFileName,BT_MAX_STRING_LENGTH,"cache/%s.%s.%s.bin",strippedName, deviceName,driverVersion ); +#else + sprintf(binaryFileName,"cache/%s.%s.%s.bin",strippedName, deviceName,driverVersion ); +#endif + + + //printf("searching for %s\n", binaryFileName); + + + + + CreateDirectory("cache",0); + { + + HANDLE binaryFileHandle = CreateFile(binaryFileName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0); + if (binaryFileHandle ==INVALID_HANDLE_VALUE) + { + DWORD errorCode; + errorCode = GetLastError(); + switch (errorCode) + { + case ERROR_FILE_NOT_FOUND: + { + printf("\nCached file not found %s\n", binaryFileName); + break; + } + case ERROR_PATH_NOT_FOUND: + { + printf("\nCached file path not found %s\n", binaryFileName); + break; + } + default: + { + printf("\nFailed reading cached file with errorCode = %d\n", errorCode); + } + } + } else + { + if (GetFileTime(binaryFileHandle, NULL, NULL, &modtimeBinary)==0) + { + DWORD errorCode; + errorCode = GetLastError(); + printf("\nGetFileTime errorCode = %d\n", errorCode); + } else + { + binaryFileValid = 1; + } + CloseHandle(binaryFileHandle); + } + + if (binaryFileValid) + { + HANDLE srcFileHandle = CreateFile(clFileNameForCaching,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0); + + if (srcFileHandle==INVALID_HANDLE_VALUE) + { + const char* prefix[]={"../","../../","../../../","../../../../"}; + for (int i=0;(srcFileHandle==INVALID_HANDLE_VALUE) && i<3;i++) + { + char relativeFileName[1024]; + sprintf(relativeFileName,"%s%s",prefix[i],clFileNameForCaching); + srcFileHandle = CreateFile(relativeFileName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0); + } + + } + + + if (srcFileHandle!=INVALID_HANDLE_VALUE) + { + FILETIME modtimeSrc; + if (GetFileTime(srcFileHandle, NULL, NULL, &modtimeSrc)==0) + { + DWORD errorCode; + errorCode = GetLastError(); + printf("\nGetFileTime errorCode = %d\n", errorCode); + } + if ( ( modtimeSrc.dwHighDateTime < modtimeBinary.dwHighDateTime) + ||(( modtimeSrc.dwHighDateTime == modtimeBinary.dwHighDateTime)&&(modtimeSrc.dwLowDateTime <= modtimeBinary.dwLowDateTime))) + { + fileUpToDate=1; + } else + { + printf("\nCached binary file out-of-date (%s)\n",binaryFileName); + } + CloseHandle(srcFileHandle); + } + else + { +#ifdef _DEBUG + DWORD errorCode; + errorCode = GetLastError(); + switch (errorCode) + { + case ERROR_FILE_NOT_FOUND: + { + printf("\nSrc file not found %s\n", clFileNameForCaching); + break; + } + case ERROR_PATH_NOT_FOUND: + { + printf("\nSrc path not found %s\n", clFileNameForCaching); + break; + } + default: + { + printf("\nnSrc file reading errorCode = %d\n", errorCode); + } + } + + //we should make sure the src file exists so we can verify the timestamp with binary + assert(0); + fileUpToDate = false; +#else + //if we cannot find the source, assume it is OK in release builds + fileUpToDate = true; +#endif + } + } + + + } + + if( fileUpToDate) + { +#ifdef _WIN32 + FILE* file; + if (fopen_s(&file,binaryFileName, "rb")!=0) + file=0; +#else + FILE* file = fopen(binaryFileName, "rb"); +#endif + + if (file) + { + size_t binarySize=0; + char* binary =0; + + fseek( file, 0L, SEEK_END ); + binarySize = ftell( file ); + rewind( file ); + binary = (char*)malloc(sizeof(char)*binarySize); + fread( binary, sizeof(char), binarySize, file ); + fclose( file ); + + m_cpProgram = clCreateProgramWithBinary( clContext, 1,&device, &binarySize, (const unsigned char**)&binary, 0, &status ); + btAssert( status == CL_SUCCESS ); + status = clBuildProgram( m_cpProgram, 1, &device, additionalMacros, 0, 0 ); + btAssert( status == CL_SUCCESS ); + + if( status != CL_SUCCESS ) + { + char *build_log; + size_t ret_val_size; + clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); + build_log = (char*)malloc(sizeof(char)*(ret_val_size+1)); + clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL); + build_log[ret_val_size] = '\0'; + printf("%s\n", build_log); + free (build_log); + btAssert(0); + m_cpProgram = 0; + } + free (binary); + } + } + + } +#endif //_WIN32 + + if (!m_cpProgram) + { + + cl_int localErrNum; + char* compileFlags; + int flagsize; + + + + const char* kernelSource = kernelSourceOrg; + + if (!kernelSourceOrg || gDebugForceLoadingFromSource) + { + if (clFileNameForCaching) + { + + FILE* file = fopen(clFileNameForCaching, "rb"); + //in many cases the relative path is a few levels up the directory hierarchy, so try it + if (!file) + { + const char* prefix[]={"../","../../","../../../","../../../../"}; + for (int i=0;!file && i<3;i++) + { + char relativeFileName[1024]; + sprintf(relativeFileName,"%s%s",prefix[i],clFileNameForCaching); + file = fopen(relativeFileName, "rb"); + } + } + + if (file) + { + char* kernelSrc=0; + fseek( file, 0L, SEEK_END ); + int kernelSize = ftell( file ); + rewind( file ); + kernelSrc = (char*)malloc(kernelSize+1); + int readBytes = fread((void*)kernelSrc,1,kernelSize, file); + kernelSrc[kernelSize] = 0; + fclose(file); + kernelSource = kernelSrc; + } + } + } + + size_t program_length = kernelSource ? strlen(kernelSource) : 0; +#ifdef MAC //or __APPLE__? + char* flags = "-cl-mad-enable -DMAC -DGUID_ARG"; +#else + //const char* flags = "-DGUID_ARG= -fno-alias"; + const char* flags = "-DGUID_ARG= "; +#endif + + + m_cpProgram = clCreateProgramWithSource(clContext, 1, (const char**)&kernelSource, &program_length, &localErrNum); + if (localErrNum!= CL_SUCCESS) + { + if (pErrNum) + *pErrNum = localErrNum; + return 0; + } + + // Build the program with 'mad' Optimization option + + + + flagsize = sizeof(char)*(strlen(additionalMacros) + strlen(flags) + 5); + compileFlags = (char*) malloc(flagsize); +#ifdef _WIN32 + sprintf_s(compileFlags,flagsize, "%s %s", flags, additionalMacros); +#else + sprintf(compileFlags, "%s %s", flags, additionalMacros); +#endif + localErrNum = clBuildProgram(m_cpProgram, 1, &device, compileFlags, NULL, NULL); + if (localErrNum!= CL_SUCCESS) + { + char *build_log; + size_t ret_val_size; + clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); + build_log = (char*) malloc(sizeof(char)*(ret_val_size+1)); + clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL); + + // to be carefully, terminate with \0 + // there's no information in the reference whether the string is 0 terminated or not + build_log[ret_val_size] = '\0'; + + + printf("Error in clBuildProgram, Line %u in file %s, Log: \n%s\n !!!\n\n", __LINE__, __FILE__, build_log); + free (build_log); + if (pErrNum) + *pErrNum = localErrNum; + return 0; + } + +#ifdef _WIN32 + + if( clFileNameForCaching ) + { // write to binary + + cl_uint numAssociatedDevices; + status = clGetProgramInfo( m_cpProgram, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &numAssociatedDevices, 0 ); + btAssert( status == CL_SUCCESS ); + if (numAssociatedDevices==1) + { + + size_t binarySize; + char* binary ; + + status = clGetProgramInfo( m_cpProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binarySize, 0 ); + btAssert( status == CL_SUCCESS ); + + binary = (char*)malloc(sizeof(char)*binarySize); + + status = clGetProgramInfo( m_cpProgram, CL_PROGRAM_BINARIES, sizeof(char*), &binary, 0 ); + btAssert( status == CL_SUCCESS ); + + { + FILE* file=0; +#ifdef _WIN32 + if (fopen_s(&file,binaryFileName, "wb")!=0) + file=0; +#else + file = fopen(binaryFileName, "wb"); +#endif + if (file) + { + fwrite( binary, sizeof(char), binarySize, file ); + fclose( file ); + } else + { + printf("cannot write file %s\n", binaryFileName); + } + } + + free (binary); + } + } +#endif //_WIN32 + + free(compileFlags); + + } + return m_cpProgram; +} + + +cl_kernel btOpenCLUtils_compileCLKernelFromString(cl_context clContext, cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum, cl_program prog, const char* additionalMacros ) +{ + + cl_kernel kernel; + cl_int localErrNum; + + cl_program m_cpProgram = prog; + + printf("compiling kernel %s ",kernelName); + + if (!m_cpProgram) + { + m_cpProgram = btOpenCLUtils_compileCLProgramFromString(clContext,device,kernelSource,pErrNum, additionalMacros,0); + } + + + // Create the kernel + kernel = clCreateKernel(m_cpProgram, kernelName, &localErrNum); + if (localErrNum != CL_SUCCESS) + { + printf("Error in clCreateKernel, Line %u in file %s, cannot find kernel function %s !!!\n\n", __LINE__, __FILE__, kernelName); + assert(0); + if (pErrNum) + *pErrNum = localErrNum; + return 0; + } + + if (!prog && m_cpProgram) + { + clReleaseProgram(m_cpProgram); + } + printf("ready. \n"); + + + if (pErrNum) + *pErrNum = CL_SUCCESS; + return kernel; + +} diff --git a/opencl/basic_initialize/btOpenCLUtils.h b/opencl/basic_initialize/btOpenCLUtils.h new file mode 100644 index 000000000..a1c7fbd7c --- /dev/null +++ b/opencl/basic_initialize/btOpenCLUtils.h @@ -0,0 +1,179 @@ +/* +Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org +Copyright (C) 2006 - 2011 Sony Computer Entertainment Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +//original author: Roman Ponomarev +//cleanup by Erwin Coumans + +#ifndef BT_OPENCL_UTILS_H +#define BT_OPENCL_UTILS_H + +#include "btOpenCLInclude.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +///C API for OpenCL utilities: convenience functions, see below for C++ API + +/// CL Context optionally takes a GL context. This is a generic type because we don't really want this code +/// to have to understand GL types. It is a HGLRC in _WIN32 or a GLXContext otherwise. +cl_context btOpenCLUtils_createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx , void* pGLDC , int preferredDeviceIndex , int preferredPlatformIndex, cl_platform_id* platformId); + +int btOpenCLUtils_getNumDevices(cl_context cxMainContext); + +cl_device_id btOpenCLUtils_getDevice(cl_context cxMainContext, int nr); + +void btOpenCLUtils_printDeviceInfo(cl_device_id device); + +cl_kernel btOpenCLUtils_compileCLKernelFromString( cl_context clContext,cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum, cl_program prog,const char* additionalMacros); + +//optional +cl_program btOpenCLUtils_compileCLProgramFromString( cl_context clContext,cl_device_id device, const char* kernelSource, cl_int* pErrNum,const char* additionalMacros , const char* srcFileNameForCaching); + +//the following optional APIs provide access using specific platform information +int btOpenCLUtils_getNumPlatforms(cl_int* pErrNum); + +///get the nr'th platform, where nr is in the range [0..getNumPlatforms) +cl_platform_id btOpenCLUtils_getPlatform(int nr, cl_int* pErrNum); + +void btOpenCLUtils_printPlatformInfo(cl_platform_id platform); + +const char* btOpenCLUtils_getSdkVendorName(); + +cl_context btOpenCLUtils_createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx , void* pGLDC ,int preferredDeviceIndex , int preferredPlatformIndex); + +#ifdef __cplusplus +} + +#define BT_MAX_STRING_LENGTH 1024 + +typedef struct +{ + char m_deviceName[BT_MAX_STRING_LENGTH]; + char m_deviceVendor[BT_MAX_STRING_LENGTH]; + char m_driverVersion[BT_MAX_STRING_LENGTH]; + char m_deviceExtensions[BT_MAX_STRING_LENGTH]; + + cl_device_type m_deviceType; + cl_uint m_computeUnits; + size_t m_workitemDims; + size_t m_workItemSize[3]; + size_t m_image2dMaxWidth; + size_t m_image2dMaxHeight; + size_t m_image3dMaxWidth; + size_t m_image3dMaxHeight; + size_t m_image3dMaxDepth; + size_t m_workgroupSize; + cl_uint m_clockFrequency; + cl_ulong m_constantBufferSize; + cl_ulong m_localMemSize; + cl_ulong m_globalMemSize; + cl_bool m_errorCorrectionSupport; + cl_device_local_mem_type m_localMemType; + cl_uint m_maxReadImageArgs; + cl_uint m_maxWriteImageArgs; + + + + cl_uint m_addressBits; + cl_ulong m_maxMemAllocSize; + cl_command_queue_properties m_queueProperties; + cl_bool m_imageSupport; + cl_uint m_vecWidthChar; + cl_uint m_vecWidthShort; + cl_uint m_vecWidthInt; + cl_uint m_vecWidthLong; + cl_uint m_vecWidthFloat; + cl_uint m_vecWidthDouble; + +} btOpenCLDeviceInfo; + +typedef struct +{ + char m_platformVendor[BT_MAX_STRING_LENGTH]; + char m_platformName[BT_MAX_STRING_LENGTH]; + char m_platformVersion[BT_MAX_STRING_LENGTH]; +} btOpenCLPlatformInfo; + + +///C++ API for OpenCL utilities: convenience functions +struct btOpenCLUtils +{ + /// CL Context optionally takes a GL context. This is a generic type because we don't really want this code + /// to have to understand GL types. It is a HGLRC in _WIN32 or a GLXContext otherwise. + static inline cl_context createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx = 0, void* pGLDC = 0, int preferredDeviceIndex = -1, int preferredPlatformIndex= - 1, cl_platform_id* platformId=0) + { + return btOpenCLUtils_createContextFromType(deviceType, pErrNum, pGLCtx , pGLDC , preferredDeviceIndex, preferredPlatformIndex, platformId); + } + + static inline int getNumDevices(cl_context cxMainContext) + { + return btOpenCLUtils_getNumDevices(cxMainContext); + } + static inline cl_device_id getDevice(cl_context cxMainContext, int nr) + { + return btOpenCLUtils_getDevice(cxMainContext,nr); + } + + static void getDeviceInfo(cl_device_id device, btOpenCLDeviceInfo* info); + + static inline void printDeviceInfo(cl_device_id device) + { + btOpenCLUtils_printDeviceInfo(device); + } + + static inline cl_kernel compileCLKernelFromString( cl_context clContext,cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum=0, cl_program prog=0,const char* additionalMacros = "" ) + { + return btOpenCLUtils_compileCLKernelFromString(clContext,device, kernelSource, kernelName, pErrNum, prog,additionalMacros); + } + + //optional + static inline cl_program compileCLProgramFromString( cl_context clContext,cl_device_id device, const char* kernelSource, cl_int* pErrNum=0,const char* additionalMacros = "" , const char* srcFileNameForCaching=0) + { + return btOpenCLUtils_compileCLProgramFromString(clContext,device, kernelSource, pErrNum,additionalMacros, srcFileNameForCaching); + } + + //the following optional APIs provide access using specific platform information + static inline int getNumPlatforms(cl_int* pErrNum=0) + { + return btOpenCLUtils_getNumPlatforms(pErrNum); + } + ///get the nr'th platform, where nr is in the range [0..getNumPlatforms) + static inline cl_platform_id getPlatform(int nr, cl_int* pErrNum=0) + { + return btOpenCLUtils_getPlatform(nr,pErrNum); + } + + static void getPlatformInfo(cl_platform_id platform, btOpenCLPlatformInfo* platformInfo); + + static inline void printPlatformInfo(cl_platform_id platform) + { + btOpenCLUtils_printPlatformInfo(platform); + } + + static inline const char* getSdkVendorName() + { + return btOpenCLUtils_getSdkVendorName(); + } + static inline cl_context createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx = 0, void* pGLDC = 0,int preferredDeviceIndex = -1, int preferredPlatformIndex= -1) + { + return btOpenCLUtils_createContextFromPlatform(platform, deviceType, pErrNum, pGLCtx,pGLDC,preferredDeviceIndex, preferredPlatformIndex); + } +}; + +#endif //__cplusplus + +#endif // BT_OPENCL_UTILS_H diff --git a/opencl/basic_initialize/main.cpp b/opencl/basic_initialize/main.cpp new file mode 100644 index 000000000..263ba1b30 --- /dev/null +++ b/opencl/basic_initialize/main.cpp @@ -0,0 +1,98 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +///original author: Erwin Coumans + +#include "btOpenCLUtils.h" +#include + +cl_context g_cxMainContext; +cl_command_queue g_cqCommandQue; + + + +int main(int argc, char* argv[]) +{ + int ciErrNum = 0; + + cl_device_type deviceType = CL_DEVICE_TYPE_ALL; + const char* vendorSDK = btOpenCLUtils::getSdkVendorName(); + + printf("This program was compiled using the %s OpenCL SDK\n",vendorSDK); + int numPlatforms = btOpenCLUtils::getNumPlatforms(); + printf("Num Platforms = %d\n", numPlatforms); + + for (int i=0;i +#include +#include + +//make sure to update the same #define in the opencl/lds_bank_conflict/lds_kernels.cl +const int TILE_DIM = 32; +const int BLOCK_ROWS = 8; +const int NUM_REPS = 100; + +// Check errors and print GB/s +void postprocess(const float *ref, const float *res, int n, float ms) +{ + bool passed = true; + for (int i = 0; i < n; i++) + if (res[i] != ref[i]) { + printf("\nError: at res[%d] got %f but expected %f\n", i, res[i], ref[i]); + printf("%25s\n", "*** FAILED ***"); + passed = false; + break; + } + if (passed) + printf("%20.2f\n", 2 * n * sizeof(float) * 1e-6 * NUM_REPS / ms ); +} + +char* loadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength) +{ + // locals + FILE* pFileStream = NULL; + size_t szSourceLength; + + // open the OpenCL source code file + pFileStream = fopen(cFilename, "rb"); + if(pFileStream == 0) + { + return NULL; + } + + size_t szPreambleLength = strlen(cPreamble); + + // get the length of the source code + fseek(pFileStream, 0, SEEK_END); + szSourceLength = ftell(pFileStream); + fseek(pFileStream, 0, SEEK_SET); + + // allocate a buffer for the source code string and read it in + char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1); + memcpy(cSourceString, cPreamble, szPreambleLength); + fread((cSourceString) + szPreambleLength, szSourceLength, 1, pFileStream); + + // close the file and return the total length of the combined (preamble + source) string + fclose(pFileStream); + if(szFinalLength != 0) + { + *szFinalLength = szSourceLength + szPreambleLength; + } + cSourceString[szSourceLength + szPreambleLength] = '\0'; + + return cSourceString; +} + +int main(int argc, char **argv) +{ + printf("Use --deviceId= or --platformId= to override OpenCL device\n"); + CommandLineArgs args(argc,argv); + + const int nx = 1024; + const int ny = 1024; + + const int mem_size = nx*ny*sizeof(float); + const int num_elements = nx*ny; + btClock clock; + double startEvent=0.f; + double stopEvent=0.f; + + int localSizeX = TILE_DIM; + int localSizeY = BLOCK_ROWS; + + int numThreadsX = (nx/TILE_DIM)*TILE_DIM; + int numThreadsY = (ny/TILE_DIM)*BLOCK_ROWS; + + int gridX = numThreadsX / localSizeX; + int gridY = numThreadsY / localSizeY; + + int ciErrNum = 0; + int preferred_device = -1; + int preferred_platform = -1; + args.GetCmdLineArgument("deviceId",preferred_device); + args.GetCmdLineArgument("platformId",preferred_platform); + + + cl_platform_id platformId=0; + cl_context ctx=0; + cl_command_queue queue=0; + cl_device_id device=0; + cl_kernel copyKernel=0; + cl_kernel copySharedMemKernel=0; + cl_kernel transposeNaiveKernel = 0; + cl_kernel transposeCoalescedKernel = 0; + cl_kernel transposeNoBankConflictsKernel= 0; + + + ctx = btOpenCLUtils::createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum,0,0,preferred_device,preferred_platform,&platformId); + btOpenCLUtils::printPlatformInfo(platformId); + oclCHECKERROR(ciErrNum, CL_SUCCESS); + device = btOpenCLUtils::getDevice(ctx,0); + btOpenCLUtils::printDeviceInfo(device); + queue = clCreateCommandQueue(ctx, device, 0, &ciErrNum); + + const char* cSourceFile = "opencl/lds_bank_conflict/lds_kernels.cl"; + + size_t szKernelLength; + + const char* cSourceCL =0; + char relativeFileName[1024]; + + { + const char* prefix[]={"./","../","../../","../../../","../../../../"}; + int numPrefixes = sizeof(prefix)/sizeof(char*); + + for (int i=0;!cSourceCL && i d_idataCL(ctx,queue);d_idataCL.resize(num_elements); + btOpenCLArray d_cdataCL(ctx,queue);d_cdataCL.resize(num_elements); + btOpenCLArray d_tdataCL(ctx,queue);d_tdataCL.resize(num_elements); + + + // check parameters and calculate execution configuration + if (nx % TILE_DIM || ny % TILE_DIM) + { + printf("nx and ny must be a multiple of TILE_DIM\n"); + goto error_exit; + } + + if (TILE_DIM % BLOCK_ROWS) + { + printf("TILE_DIM must be a multiple of BLOCK_ROWS\n"); + goto error_exit; + } + + // host + for (int j = 0; j < ny; j++) + for (int i = 0; i < nx; i++) + h_idata[j*nx + i] = j*nx + i; + + // correct result for error checking + for (int j = 0; j < ny; j++) + for (int i = 0; i < nx; i++) + { + gold[j*nx + i] = h_idata[i*nx + j]; + } + + d_idataCL.copyFromHostPointer(h_idata,num_elements); + + // events for timing + clock.reset(); + + float ms; + + // ------------ + // time kernels + // ------------ + printf("%25s%25s\n", "Routine", "Bandwidth (GB/s)"); + + // ---- + // copy + // ---- + printf("%25s", "copy"); + + clMemSet.execute(d_cdataCL,0.f,num_elements); + + { + // warm up + btLauncherCL launcher( queue, copyKernel); + launcher.setBuffer( d_cdataCL.getBufferCL()); + launcher.setBuffer( d_idataCL.getBufferCL()); + launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY ); + + startEvent = clock.getTimeMicroseconds()/1e3; + for (int i = 0; i < NUM_REPS; i++) + launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY ); + oclCHECKERROR(ciErrNum, CL_SUCCESS); + clFinish(queue); + stopEvent = clock.getTimeMicroseconds()/1e3; + } + + ms = float(stopEvent-startEvent); + + d_cdataCL.copyToHostPointer(h_cdata,num_elements,0); + postprocess(h_idata, h_cdata, nx*ny, ms); + + // ------------- + // copySharedMem + // ------------- + printf("%25s", "shared memory copy"); + clMemSet.execute(d_cdataCL,0.f,num_elements); + + { + btLauncherCL launcher( queue, copySharedMemKernel); + launcher.setBuffer( d_cdataCL.getBufferCL()); + launcher.setBuffer( d_idataCL.getBufferCL()); + launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY ); + + startEvent = clock.getTimeMicroseconds()/1e3; + for (int i = 0; i < NUM_REPS; i++) + launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY ); + oclCHECKERROR(ciErrNum, CL_SUCCESS); + clFinish(queue); + stopEvent = clock.getTimeMicroseconds()/1e3; + } + + ms = float(stopEvent-startEvent); + d_cdataCL.copyToHostPointer(h_cdata,num_elements,0); + postprocess(h_idata, h_cdata, nx * ny, ms); + + // -------------- + // transposeNaive + // -------------- + printf("%25s", "naive transpose"); + clMemSet.execute(d_tdataCL,0.f,num_elements); + { + // warmup + btLauncherCL launcher( queue, transposeNaiveKernel); + launcher.setBuffer( d_tdataCL.getBufferCL()); + launcher.setBuffer( d_idataCL.getBufferCL()); + launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY ); + + startEvent = clock.getTimeMicroseconds()/1e3; + for (int i = 0; i < NUM_REPS; i++) + launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY ); + oclCHECKERROR(ciErrNum, CL_SUCCESS); + clFinish(queue); + stopEvent = clock.getTimeMicroseconds()/1e3; + } + ms = float(stopEvent-startEvent); + d_tdataCL.copyToHostPointer(h_tdata,num_elements,0); + postprocess(gold, h_tdata, nx * ny, ms); + + // ------------------ + // transposeCoalesced + // ------------------ + printf("%25s", "coalesced transpose"); + clMemSet.execute(d_tdataCL,0.f,num_elements); + { + btLauncherCL launcher( queue, transposeCoalescedKernel); + launcher.setBuffer( d_tdataCL.getBufferCL()); + launcher.setBuffer( d_idataCL.getBufferCL()); + launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY ); + + startEvent = clock.getTimeMicroseconds()/1e3; + for (int i = 0; i < NUM_REPS; i++) + launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY ); + oclCHECKERROR(ciErrNum, CL_SUCCESS); + clFinish(queue); + stopEvent = clock.getTimeMicroseconds()/1e3; + } + + ms = float(stopEvent-startEvent); + d_tdataCL.copyToHostPointer(h_tdata,num_elements,0); + postprocess(gold, h_tdata, nx * ny, ms); + + // ------------------------ + // transposeNoBankConflicts + // ------------------------ + printf("%25s", "conflict-free transpose"); + clMemSet.execute(d_tdataCL,0.f,num_elements); + { + btLauncherCL launcher( queue, transposeNoBankConflictsKernel); + launcher.setBuffer( d_tdataCL.getBufferCL()); + launcher.setBuffer( d_idataCL.getBufferCL()); + launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY ); + + startEvent = clock.getTimeMicroseconds()/1e3; + for (int i = 0; i < NUM_REPS; i++) + launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY ); + oclCHECKERROR(ciErrNum, CL_SUCCESS); + clFinish(queue); + stopEvent = clock.getTimeMicroseconds()/1e3; + } + + ms = float(stopEvent-startEvent); + d_tdataCL.copyToHostPointer(h_tdata,num_elements,0); + postprocess(gold, h_tdata, nx * ny, ms); + +error_exit: + // cleanup + clReleaseKernel(copyKernel); + clReleaseCommandQueue(queue); + clReleaseContext(ctx); + + free(h_idata); + free(h_tdata); + free(h_cdata); + free(gold); + printf("Press \n"); + getchar(); +} diff --git a/opencl/lds_bank_conflict/premake4.lua b/opencl/lds_bank_conflict/premake4.lua new file mode 100644 index 000000000..7a26da2cc --- /dev/null +++ b/opencl/lds_bank_conflict/premake4.lua @@ -0,0 +1,37 @@ + +function createProject (vendor) + + local hasCL = findOpenCL(vendor) + + if (hasCL) then + + project ( "OpenCL_lds_bank_conflict_" .. vendor) + + initOpenCL(vendor) + + language "C++" + + kind "ConsoleApp" + targetdir "../../bin" + + links { + "OpenCL_lib_parallel_primitives_host_" .. vendor + } + + includedirs { + "../basic_initialize" + } + + files { + "main.cpp", + "../basic_initialize/btOpenCLUtils.cpp", + "../basic_initialize/btOpenCLUtils.h" + } + end + +end + +createProject("AMD") +createProject("NVIDIA") +createProject("Intel") +createProject("Apple") diff --git a/opencl/parallel_primitives/benchmark/premake4.lua b/opencl/parallel_primitives/benchmark/premake4.lua new file mode 100644 index 000000000..515540c8b --- /dev/null +++ b/opencl/parallel_primitives/benchmark/premake4.lua @@ -0,0 +1,35 @@ +function createProject(vendor) + hasCL = findOpenCL(vendor) + + if (hasCL) then + + project ("OpenCL_radixsort_benchmark_" .. vendor) + + initOpenCL(vendor) + + language "C++" + + kind "ConsoleApp" + targetdir "../../../bin" + includedirs {".."} + + links { + ("OpenCL_lib_parallel_primitives_host_" .. vendor) + } + + files { + "test_large_problem_sorting.cpp", + "../../basic_initialize/btOpenCLUtils.cpp", + "../../basic_initialize/btOpenCLUtils.h", + "../host/btFillCL.cpp", + "../host/btPrefixScanCL.cpp", + "../host/btRadixSort32CL.cpp", + } + + end +end + +createProject("AMD") +createProject("Intel") +createProject("NVIDIA") +createProject("Apple") \ No newline at end of file diff --git a/opencl/parallel_primitives/benchmark/test_large_problem_sorting.cpp b/opencl/parallel_primitives/benchmark/test_large_problem_sorting.cpp new file mode 100644 index 000000000..b3629c3f8 --- /dev/null +++ b/opencl/parallel_primitives/benchmark/test_large_problem_sorting.cpp @@ -0,0 +1,709 @@ +/****************************************************************************** + * Copyright 2010 Duane Merrill + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * + * + * + * AUTHORS' REQUEST: + * + * If you use|reference|benchmark this code, please cite our Technical + * Report (http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf): + * + * @TechReport{ Merrill:Sorting:2010, + * author = "Duane Merrill and Andrew Grimshaw", + * title = "Revisiting Sorting for GPGPU Stream Architectures", + * year = "2010", + * institution = "University of Virginia, Department of Computer Science", + * address = "Charlottesville, VA, USA", + * number = "CS2010-03" + * } + * + * For more information, see our Google Code project site: + * http://code.google.com/p/back40computing/ + * + * Thanks! + ******************************************************************************/ + +/****************************************************************************** + * Simple test driver program for *large-problem* radix sorting. + * + * Useful for demonstrating how to integrate radix sorting into + * your application + ******************************************************************************/ + +/****************************************************************************** + * Converted from CUDA to OpenCL/DirectCompute by Erwin Coumans + ******************************************************************************/ +#ifdef _WIN32 +#pragma warning (disable:4996) +#endif +#include +#include +#include +#include +#include +#include +#include + + +//#include +#include +/********************** +* +*/ + +#include "../host/btRadixSort32CL.h" +#include "../../basic_initialize/btOpenCLUtils.h" +#include "../host/btQuickprof.h" + +cl_context g_cxMainContext; +cl_device_id g_device; +cl_command_queue g_cqCommandQueue; + +/*********************** +* +*/ + +bool g_verbose; +///Preferred OpenCL device/platform. When < 0 then no preference is used. +///Note that btOpenCLUtils might still use the preference of using a platform vendor that matches the SDK vendor used to build the application. +///Preferred device/platform take priority over this platform-vendor match +int gPreferredDeviceId = -1; +int gPreferredPlatformId = -1; + + + +/****************************************************************************** + * Routines + ******************************************************************************/ + + +/** + * Keys-only sorting. Uses the GPU to sort the specified vector of elements for the given + * number of iterations, displaying runtime information. + * + * @param[in] num_elements + * Size in elements of the vector to sort + * @param[in] h_keys + * Vector of keys to sort + * @param[in] iterations + * Number of times to invoke the GPU sorting primitive + * @param[in] cfg + * Config + */ +template +void TimedSort( + unsigned int num_elements, + K *h_keys, + unsigned int iterations) +{ + printf("Keys only, %d iterations, %d elements\n", iterations, num_elements); + + int max_elements = num_elements; + btAlignedObjectArray hostData; + hostData.resize(num_elements); + for (int i=0;i gpuData(g_cxMainContext,g_cqCommandQueue); + gpuData.copyFromHost(hostData); + //sorter.executeHost(gpuData); + sorter.execute(gpuData); + + btAlignedObjectArray hostDataSorted; + gpuData.copyToHost(hostDataSorted); + + clFinish(g_cqCommandQueue); + + { + //printf("Key-values, %d iterations, %d elements", iterations, num_elements); + + // Create sorting enactor + + // Perform the timed number of sorting iterations + double elapsed = 0; + float duration = 0; + btClock watch; + + //warm-start + gpuData.copyFromHost(hostData); + clFinish(g_cqCommandQueue); + sorter.execute(gpuData); + + watch.reset(); + + + for (int i = 0; i < iterations; i++) + { + + + + // Move a fresh copy of the problem into device storage + gpuData.copyFromHost(hostData); + clFinish(g_cqCommandQueue); + + // Start GPU timing record + double startMs = watch.getTimeMicroseconds()/1e3; + + // Call the sorting API routine + sorter.execute(gpuData); + + + + clFinish(g_cqCommandQueue); + + double stopMs = watch.getTimeMicroseconds()/1e3; + + duration = stopMs - startMs; + + // End GPU timing record + elapsed += (double) duration; + printf("duration = %f\n", duration); + } + + // Display timing information + double avg_runtime = elapsed / iterations; + // double throughput = ((double) num_elements) / avg_runtime / 1000.0 / 1000.0; + // printf(", %f GPU ms, %f x10^9 elts/sec\n", avg_runtime, throughput); + double throughput = ((double) num_elements) / avg_runtime / 1000.0 ; + printf(", %f GPU ms, %f x10^6 elts/sec\n", avg_runtime, throughput); + + gpuData.copyToHost(hostData); + for (int i=0;i +void TimedSort( + unsigned int num_elements, + K *h_keys, + V *h_values, + unsigned int iterations) +{ + + printf("Key-values, %d iterations, %d elements\n", iterations, num_elements); + + int max_elements = num_elements; + btAlignedObjectArray hostData; + hostData.resize(num_elements); + for (int i=0;i gpuData(g_cxMainContext,g_cqCommandQueue); + gpuData.copyFromHost(hostData); + //sorter.executeHost(gpuData); + sorter.execute(gpuData); + + btAlignedObjectArray hostDataSorted; + gpuData.copyToHost(hostDataSorted); +#if 0 + for (int i=0;i +void RandomBits(K &key, int entropy_reduction = 0, int lower_key_bits = sizeof(K) * 8) +{ + const unsigned int NUM_UCHARS = (sizeof(K) + sizeof(unsigned char) - 1) / sizeof(unsigned char); + unsigned char key_bits[NUM_UCHARS]; + + do { + + for (int j = 0; j < NUM_UCHARS; j++) { + unsigned char quarterword = 0xff; + for (int i = 0; i <= entropy_reduction; i++) { + quarterword &= (rand() >> 7); + } + key_bits[j] = quarterword; + } + + if (lower_key_bits < sizeof(K) * 8) { + unsigned long long base = 0; + memcpy(&base, key_bits, sizeof(K)); + base &= (1 << lower_key_bits) - 1; + memcpy(key_bits, &base, sizeof(K)); + } + + memcpy(&key, key_bits, sizeof(K)); + + } while (key != key); // avoids NaNs when generating random floating point numbers +} + + +/****************************************************************************** + * Templated routines for printing keys/values to the console + ******************************************************************************/ + +template +void PrintValue(T val) { + printf("%d", val); +} + +template<> +void PrintValue(float val) { + printf("%f", val); +} + +template<> +void PrintValue(double val) { + printf("%f", val); +} + +template<> +void PrintValue(unsigned char val) { + printf("%u", val); +} + +template<> +void PrintValue(unsigned short val) { + printf("%u", val); +} + +template<> +void PrintValue(unsigned int val) { + printf("%u", val); +} + +template<> +void PrintValue(long val) { + printf("%ld", val); +} + +template<> +void PrintValue(unsigned long val) { + printf("%lu", val); +} + +template<> +void PrintValue(long long val) { + printf("%lld", val); +} + +template<> +void PrintValue(unsigned long long val) { + printf("%llu", val); +} + + + +/** + * Compares the equivalence of two arrays + */ +template +int CompareResults(T* computed, T* reference, SizeT len, bool verbose = true) +{ + printf("\n"); + for (SizeT i = 0; i < len; i++) { + + if (computed[i] != reference[i]) { + printf("INCORRECT: [%lu]: ", (unsigned long) i); + PrintValue(computed[i]); + printf(" != "); + PrintValue(reference[i]); + + if (verbose) { + printf("\nresult[..."); + for (size_t j = (i >= 5) ? i - 5 : 0; (j < i + 5) && (j < len); j++) { + PrintValue(computed[j]); + printf(", "); + } + printf("...]"); + printf("\nreference[..."); + for (size_t j = (i >= 5) ? i - 5 : 0; (j < i + 5) && (j < len); j++) { + PrintValue(reference[j]); + printf(", "); + } + printf("...]"); + } + + return 1; + } + } + + printf("CORRECT\n"); + return 0; +} + +/** + * Creates an example sorting problem whose keys is a vector of the specified + * number of K elements, values of V elements, and then dispatches the problem + * to the GPU for the given number of iterations, displaying runtime information. + * + * @param[in] iterations + * Number of times to invoke the GPU sorting primitive + * @param[in] num_elements + * Size in elements of the vector to sort + * @param[in] cfg + * Config + */ +template +void TestSort( + unsigned int iterations, + int num_elements, + bool keys_only) +{ + // Allocate the sorting problem on the host and fill the keys with random bytes + + K *h_keys = NULL; + K *h_reference_keys = NULL; + V *h_values = NULL; + h_keys = (K*) malloc(num_elements * sizeof(K)); + h_reference_keys = (K*) malloc(num_elements * sizeof(K)); + if (!keys_only) h_values = (V*) malloc(num_elements * sizeof(V)); + + + // Use random bits + for (unsigned int i = 0; i < num_elements; ++i) { + RandomBits(h_keys[i], 0); + //h_keys[i] = num_elements-i; + //h_keys[i] = 0xffffffffu-i; + if (!keys_only) + h_values[i] = h_keys[i];//0xffffffffu-i; + + h_reference_keys[i] = h_keys[i]; + } + + // Run the timing test + if (keys_only) { + TimedSort(num_elements, h_keys, iterations); + } else { + TimedSort(num_elements, h_keys, h_values, iterations); + } + +// cudaThreadSynchronize(); + + // Display sorted key data + if (g_verbose) { + printf("\n\nKeys:\n"); + for (int i = 0; i < num_elements; i++) { + PrintValue(h_keys[i]); + printf(", "); + } + printf("\n\n"); + } + + // Verify solution + std::sort(h_reference_keys, h_reference_keys + num_elements); + CompareResults(h_keys, h_reference_keys, num_elements, true); + printf("\n"); + fflush(stdout); + + // Free our allocated host memory + if (h_keys != NULL) free(h_keys); + if (h_values != NULL) free(h_values); +} + + + +/** + * Displays the commandline usage for this tool + */ +void Usage() +{ + printf("\ntest_large_problem_sorting [--device=] [--v] [--i=] [--n=] [--key-values] [--deviceId=] [--platformId=]\n"); + printf("\n"); + printf("\t--v\tDisplays sorted results to the console.\n"); + printf("\n"); + printf("\t--i\tPerforms the sorting operation times\n"); + printf("\t\t\ton the device. Re-copies original input each time. Default = 1\n"); + printf("\n"); + printf("\t--n\tThe number of elements to comprise the sample problem\n"); + printf("\t\t\tDefault = 512\n"); + printf("\n"); + printf("\t--key-values\tSpecifies that keys are accommodated by value pairings\n"); + printf("\n"); +} + + +/****************************************************************************** + * Command-line parsing + ******************************************************************************/ +#include +#include +#include + +class CommandLineArgs +{ +protected: + + std::map pairs; + +public: + + // Constructor + CommandLineArgs(int argc, char **argv) + { + using namespace std; + + for (int i = 1; i < argc; i++) + { + string arg = argv[i]; + + if ((arg[0] != '-') || (arg[1] != '-')) { + continue; + } + + string::size_type pos; + string key, val; + if ((pos = arg.find( '=')) == string::npos) { + key = string(arg, 2, arg.length() - 2); + val = ""; + } else { + key = string(arg, 2, pos - 2); + val = string(arg, pos + 1, arg.length() - 1); + } + pairs[key] = val; + } + } + + bool CheckCmdLineFlag(const char* arg_name) + { + using namespace std; + map::iterator itr; + if ((itr = pairs.find(arg_name)) != pairs.end()) { + return true; + } + return false; + } + + template + void GetCmdLineArgument(const char *arg_name, T &val); + + int ParsedArgc() + { + return pairs.size(); + } +}; + +template +void CommandLineArgs::GetCmdLineArgument(const char *arg_name, T &val) +{ + using namespace std; + map::iterator itr; + if ((itr = pairs.find(arg_name)) != pairs.end()) { + istringstream strstream(itr->second); + strstream >> val; + } +} + +template <> +void CommandLineArgs::GetCmdLineArgument(const char* arg_name, char* &val) +{ + using namespace std; + map::iterator itr; + if ((itr = pairs.find(arg_name)) != pairs.end()) { + + string s = itr->second; + val = (char*) malloc(sizeof(char) * (s.length() + 1)); + strcpy(val, s.c_str()); + + } else { + val = NULL; + } +} + + + + + +/****************************************************************************** + * Main + ******************************************************************************/ + +extern bool gDebugSkipLoadingBinary; + +int main( int argc, char** argv) +{ + gDebugSkipLoadingBinary = true; + + cl_int ciErrNum; + CommandLineArgs args(argc,argv); + + args.GetCmdLineArgument("deviceId", gPreferredDeviceId); + args.GetCmdLineArgument("platformId", gPreferredPlatformId); + + printf("Initialize OpenCL using btOpenCLUtils_createContextFromType\n"); + cl_platform_id platformId; + g_cxMainContext = btOpenCLUtils_createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId); + oclCHECKERROR(ciErrNum, CL_SUCCESS); + + int numDev = btOpenCLUtils_getNumDevices(g_cxMainContext); + + if (!numDev) + { + printf("error: no OpenCL devices\n"); + exit(0); + } + int result; + int devId = 0; + g_device = btOpenCLUtils_getDevice(g_cxMainContext,devId); + btOpenCLUtils_printDeviceInfo(g_device); + // create a command-queue + g_cqCommandQueue = clCreateCommandQueue(g_cxMainContext, g_device, 0, &ciErrNum); + oclCHECKERROR(ciErrNum, CL_SUCCESS); + + + + //srand(time(NULL)); + srand(0); // presently deterministic + + unsigned int num_elements = 32*1024*1024;//4*1024*1024;//4*1024*1024;//257;//8*524288;//2048;//512;//524288; + unsigned int iterations = 10; + bool keys_only = true; + + // + // Check command line arguments + // + + + + if (args.CheckCmdLineFlag("help")) + { + Usage(); + return 0; + } + + args.GetCmdLineArgument("i", iterations); + args.GetCmdLineArgument("n", num_elements); + + + + keys_only = !args.CheckCmdLineFlag("key-values"); + g_verbose = args.CheckCmdLineFlag("v"); + + + + TestSort( + iterations, + num_elements, + keys_only); + + +} diff --git a/opencl/parallel_primitives/host/CommandLineArgs.h b/opencl/parallel_primitives/host/CommandLineArgs.h new file mode 100644 index 000000000..b2a43016f --- /dev/null +++ b/opencl/parallel_primitives/host/CommandLineArgs.h @@ -0,0 +1,92 @@ +#ifndef COMMAND_LINE_ARGS_H +#define COMMAND_LINE_ARGS_H + +/****************************************************************************** + * Command-line parsing + ******************************************************************************/ +#include +#include +#include +#include +#include +class CommandLineArgs +{ +protected: + + std::map pairs; + +public: + + // Constructor + CommandLineArgs(int argc, char **argv) + { + using namespace std; + + for (int i = 1; i < argc; i++) + { + string arg = argv[i]; + + if ((arg[0] != '-') || (arg[1] != '-')) { + continue; + } + + string::size_type pos; + string key, val; + if ((pos = arg.find( '=')) == string::npos) { + key = string(arg, 2, arg.length() - 2); + val = ""; + } else { + key = string(arg, 2, pos - 2); + val = string(arg, pos + 1, arg.length() - 1); + } + pairs[key] = val; + } + } + + bool CheckCmdLineFlag(const char* arg_name) + { + using namespace std; + map::iterator itr; + if ((itr = pairs.find(arg_name)) != pairs.end()) { + return true; + } + return false; + } + + template + void GetCmdLineArgument(const char *arg_name, T &val); + + int ParsedArgc() + { + return pairs.size(); + } +}; + +template +void CommandLineArgs::GetCmdLineArgument(const char *arg_name, T &val) +{ + using namespace std; + map::iterator itr; + if ((itr = pairs.find(arg_name)) != pairs.end()) { + istringstream strstream(itr->second); + strstream >> val; + } +} + +template <> +void CommandLineArgs::GetCmdLineArgument(const char* arg_name, char* &val) +{ + using namespace std; + map::iterator itr; + if ((itr = pairs.find(arg_name)) != pairs.end()) { + + string s = itr->second; + val = (char*) malloc(sizeof(char) * (s.length() + 1)); + std::strcpy(val, s.c_str()); + + } else { + val = NULL; + } +} + +#endif //COMMAND_LINE_ARGS_H diff --git a/opencl/parallel_primitives/host/btAlignedAllocator.cpp b/opencl/parallel_primitives/host/btAlignedAllocator.cpp new file mode 100644 index 000000000..a65296c6a --- /dev/null +++ b/opencl/parallel_primitives/host/btAlignedAllocator.cpp @@ -0,0 +1,181 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/ + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +#include "btAlignedAllocator.h" + +int gNumAlignedAllocs = 0; +int gNumAlignedFree = 0; +int gTotalBytesAlignedAllocs = 0;//detect memory leaks + +static void *btAllocDefault(size_t size) +{ + return malloc(size); +} + +static void btFreeDefault(void *ptr) +{ + free(ptr); +} + +static btAllocFunc *sAllocFunc = btAllocDefault; +static btFreeFunc *sFreeFunc = btFreeDefault; + + + +#if defined (BT_HAS_ALIGNED_ALLOCATOR) +#include +static void *btAlignedAllocDefault(size_t size, int alignment) +{ + return _aligned_malloc(size, (size_t)alignment); +} + +static void btAlignedFreeDefault(void *ptr) +{ + _aligned_free(ptr); +} +#elif defined(__CELLOS_LV2__) +#include + +static inline void *btAlignedAllocDefault(size_t size, int alignment) +{ + return memalign(alignment, size); +} + +static inline void btAlignedFreeDefault(void *ptr) +{ + free(ptr); +} +#else + + + + + +static inline void *btAlignedAllocDefault(size_t size, int alignment) +{ + void *ret; + char *real; + real = (char *)sAllocFunc(size + sizeof(void *) + (alignment-1)); + if (real) { + ret = btAlignPointer(real + sizeof(void *),alignment); + *((void **)(ret)-1) = (void *)(real); + } else { + ret = (void *)(real); + } + return (ret); +} + +static inline void btAlignedFreeDefault(void *ptr) +{ + void* real; + + if (ptr) { + real = *((void **)(ptr)-1); + sFreeFunc(real); + } +} +#endif + + +static btAlignedAllocFunc *sAlignedAllocFunc = btAlignedAllocDefault; +static btAlignedFreeFunc *sAlignedFreeFunc = btAlignedFreeDefault; + +void btAlignedAllocSetCustomAligned(btAlignedAllocFunc *allocFunc, btAlignedFreeFunc *freeFunc) +{ + sAlignedAllocFunc = allocFunc ? allocFunc : btAlignedAllocDefault; + sAlignedFreeFunc = freeFunc ? freeFunc : btAlignedFreeDefault; +} + +void btAlignedAllocSetCustom(btAllocFunc *allocFunc, btFreeFunc *freeFunc) +{ + sAllocFunc = allocFunc ? allocFunc : btAllocDefault; + sFreeFunc = freeFunc ? freeFunc : btFreeDefault; +} + +#ifdef BT_DEBUG_MEMORY_ALLOCATIONS +//this generic allocator provides the total allocated number of bytes +#include + +void* btAlignedAllocInternal (size_t size, int alignment,int line,char* filename) +{ + void *ret; + char *real; + + gTotalBytesAlignedAllocs += size; + gNumAlignedAllocs++; + + + real = (char *)sAllocFunc(size + 2*sizeof(void *) + (alignment-1)); + if (real) { + ret = (void*) btAlignPointer(real + 2*sizeof(void *), alignment); + *((void **)(ret)-1) = (void *)(real); + *((int*)(ret)-2) = size; + + } else { + ret = (void *)(real);//?? + } + + printf("allocation#%d at address %x, from %s,line %d, size %d\n",gNumAlignedAllocs,real, filename,line,size); + + int* ptr = (int*)ret; + *ptr = 12; + return (ret); +} + +void btAlignedFreeInternal (void* ptr,int line,char* filename) +{ + + void* real; + gNumAlignedFree++; + + if (ptr) { + real = *((void **)(ptr)-1); + int size = *((int*)(ptr)-2); + gTotalBytesAlignedAllocs -= size; + + printf("free #%d at address %x, from %s,line %d, size %d\n",gNumAlignedFree,real, filename,line,size); + + sFreeFunc(real); + } else + { + printf("NULL ptr\n"); + } +} + +#else //BT_DEBUG_MEMORY_ALLOCATIONS + +void* btAlignedAllocInternal (size_t size, int alignment) +{ + gNumAlignedAllocs++; + void* ptr; + ptr = sAlignedAllocFunc(size, alignment); +// printf("btAlignedAllocInternal %d, %x\n",size,ptr); + return ptr; +} + +void btAlignedFreeInternal (void* ptr) +{ + if (!ptr) + { + return; + } + + gNumAlignedFree++; +// printf("btAlignedFreeInternal %x\n",ptr); + sAlignedFreeFunc(ptr); +} + +#endif //BT_DEBUG_MEMORY_ALLOCATIONS + diff --git a/opencl/parallel_primitives/host/btAlignedAllocator.h b/opencl/parallel_primitives/host/btAlignedAllocator.h new file mode 100644 index 000000000..f168f3c66 --- /dev/null +++ b/opencl/parallel_primitives/host/btAlignedAllocator.h @@ -0,0 +1,107 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/ + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +#ifndef BT_ALIGNED_ALLOCATOR +#define BT_ALIGNED_ALLOCATOR + +///we probably replace this with our own aligned memory allocator +///so we replace _aligned_malloc and _aligned_free with our own +///that is better portable and more predictable + +#include "btScalar.h" +//#define BT_DEBUG_MEMORY_ALLOCATIONS 1 +#ifdef BT_DEBUG_MEMORY_ALLOCATIONS + +#define btAlignedAlloc(a,b) \ + btAlignedAllocInternal(a,b,__LINE__,__FILE__) + +#define btAlignedFree(ptr) \ + btAlignedFreeInternal(ptr,__LINE__,__FILE__) + +void* btAlignedAllocInternal (size_t size, int alignment,int line,char* filename); + +void btAlignedFreeInternal (void* ptr,int line,char* filename); + +#else + void* btAlignedAllocInternal (size_t size, int alignment); + void btAlignedFreeInternal (void* ptr); + + #define btAlignedAlloc(size,alignment) btAlignedAllocInternal(size,alignment) + #define btAlignedFree(ptr) btAlignedFreeInternal(ptr) + +#endif +typedef int size_type; + +typedef void *(btAlignedAllocFunc)(size_t size, int alignment); +typedef void (btAlignedFreeFunc)(void *memblock); +typedef void *(btAllocFunc)(size_t size); +typedef void (btFreeFunc)(void *memblock); + +///The developer can let all Bullet memory allocations go through a custom memory allocator, using btAlignedAllocSetCustom +void btAlignedAllocSetCustom(btAllocFunc *allocFunc, btFreeFunc *freeFunc); +///If the developer has already an custom aligned allocator, then btAlignedAllocSetCustomAligned can be used. The default aligned allocator pre-allocates extra memory using the non-aligned allocator, and instruments it. +void btAlignedAllocSetCustomAligned(btAlignedAllocFunc *allocFunc, btAlignedFreeFunc *freeFunc); + + +///The btAlignedAllocator is a portable class for aligned memory allocations. +///Default implementations for unaligned and aligned allocations can be overridden by a custom allocator using btAlignedAllocSetCustom and btAlignedAllocSetCustomAligned. +template < typename T , unsigned Alignment > +class btAlignedAllocator { + + typedef btAlignedAllocator< T , Alignment > self_type; + +public: + + //just going down a list: + btAlignedAllocator() {} + /* + btAlignedAllocator( const self_type & ) {} + */ + + template < typename Other > + btAlignedAllocator( const btAlignedAllocator< Other , Alignment > & ) {} + + typedef const T* const_pointer; + typedef const T& const_reference; + typedef T* pointer; + typedef T& reference; + typedef T value_type; + + pointer address ( reference ref ) const { return &ref; } + const_pointer address ( const_reference ref ) const { return &ref; } + pointer allocate ( size_type n , const_pointer * hint = 0 ) { + (void)hint; + return reinterpret_cast< pointer >(btAlignedAlloc( sizeof(value_type) * n , Alignment )); + } + void construct ( pointer ptr , const value_type & value ) { new (ptr) value_type( value ); } + void deallocate( pointer ptr ) { + btAlignedFree( reinterpret_cast< void * >( ptr ) ); + } + void destroy ( pointer ptr ) { ptr->~value_type(); } + + + template < typename O > struct rebind { + typedef btAlignedAllocator< O , Alignment > other; + }; + template < typename O > + self_type & operator=( const btAlignedAllocator< O , Alignment > & ) { return *this; } + + friend bool operator==( const self_type & , const self_type & ) { return true; } +}; + + + +#endif //BT_ALIGNED_ALLOCATOR + diff --git a/opencl/parallel_primitives/host/btAlignedObjectArray.h b/opencl/parallel_primitives/host/btAlignedObjectArray.h new file mode 100644 index 000000000..24e59ab65 --- /dev/null +++ b/opencl/parallel_primitives/host/btAlignedObjectArray.h @@ -0,0 +1,511 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/ + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + + +#ifndef BT_OBJECT_ARRAY__ +#define BT_OBJECT_ARRAY__ + +#include "btScalar.h" // has definitions like SIMD_FORCE_INLINE +#include "btAlignedAllocator.h" + +///If the platform doesn't support placement new, you can disable BT_USE_PLACEMENT_NEW +///then the btAlignedObjectArray doesn't support objects with virtual methods, and non-trivial constructors/destructors +///You can enable BT_USE_MEMCPY, then swapping elements in the array will use memcpy instead of operator= +///see discussion here: http://continuousphysics.com/Bullet/phpBB2/viewtopic.php?t=1231 and +///http://www.continuousphysics.com/Bullet/phpBB2/viewtopic.php?t=1240 + +#define BT_USE_PLACEMENT_NEW 1 +//#define BT_USE_MEMCPY 1 //disable, because it is cumbersome to find out for each platform where memcpy is defined. It can be in or or otherwise... +#define BT_ALLOW_ARRAY_COPY_OPERATOR // enabling this can accidently perform deep copies of data if you are not careful + +#ifdef BT_USE_MEMCPY +#include +#include +#endif //BT_USE_MEMCPY + +#ifdef BT_USE_PLACEMENT_NEW +#include //for placement new +#endif //BT_USE_PLACEMENT_NEW + + +///The btAlignedObjectArray template class uses a subset of the stl::vector interface for its methods +///It is developed to replace stl::vector to avoid portability issues, including STL alignment issues to add SIMD/SSE data +template +//template +class btAlignedObjectArray +{ + btAlignedAllocator m_allocator; + + int m_size; + int m_capacity; + T* m_data; + //PCK: added this line + bool m_ownsMemory; + +#ifdef BT_ALLOW_ARRAY_COPY_OPERATOR +public: + SIMD_FORCE_INLINE btAlignedObjectArray& operator=(const btAlignedObjectArray &other) + { + copyFromArray(other); + return *this; + } +#else//BT_ALLOW_ARRAY_COPY_OPERATOR +private: + SIMD_FORCE_INLINE btAlignedObjectArray& operator=(const btAlignedObjectArray &other); +#endif//BT_ALLOW_ARRAY_COPY_OPERATOR + +protected: + SIMD_FORCE_INLINE int allocSize(int size) + { + return (size ? size*2 : 1); + } + SIMD_FORCE_INLINE void copy(int start,int end, T* dest) const + { + int i; + for (i=start;i=0); + btAssert(n=0); + btAssert(n=0); + btAssert(n=0); + btAssert(n0); + m_size--; + m_data[m_size].~T(); + } + + + ///resize changes the number of elements in the array. If the new size is larger, the new elements will be constructed using the optional second argument. + ///when the new number of elements is smaller, the destructor will be called, but memory will not be freed, to reduce performance overhead of run-time memory (de)allocations. + SIMD_FORCE_INLINE void resizeNoInitialize(int newsize) + { + int curSize = size(); + + if (newsize < curSize) + { + } else + { + if (newsize > size()) + { + reserve(newsize); + } + //leave this uninitialized + } + m_size = newsize; + } + + SIMD_FORCE_INLINE void resize(int newsize, const T& fillData=T()) + { + int curSize = size(); + + if (newsize < curSize) + { + for(int i = newsize; i < curSize; i++) + { + m_data[i].~T(); + } + } else + { + if (newsize > size()) + { + reserve(newsize); + } +#ifdef BT_USE_PLACEMENT_NEW + for (int i=curSize;i + void quickSortInternal(const L& CompareFunc,int lo, int hi) + { + // lo is the lower index, hi is the upper index + // of the region of array a that is to be sorted + int i=lo, j=hi; + T x=m_data[(lo+hi)/2]; + + // partition + do + { + while (CompareFunc(m_data[i],x)) + i++; + while (CompareFunc(x,m_data[j])) + j--; + if (i<=j) + { + swap(i,j); + i++; j--; + } + } while (i<=j); + + // recursion + if (lo + void quickSort(const L& CompareFunc) + { + //don't sort 0 or 1 elements + if (size()>1) + { + quickSortInternal(CompareFunc,0,size()-1); + } + } + + + ///heap sort from http://www.csse.monash.edu.au/~lloyd/tildeAlgDS/Sort/Heap/ + template + void downHeap(T *pArr, int k, int n, const L& CompareFunc) + { + /* PRE: a[k+1..N] is a heap */ + /* POST: a[k..N] is a heap */ + + T temp = pArr[k - 1]; + /* k has child(s) */ + while (k <= n/2) + { + int child = 2*k; + + if ((child < n) && CompareFunc(pArr[child - 1] , pArr[child])) + { + child++; + } + /* pick larger child */ + if (CompareFunc(temp , pArr[child - 1])) + { + /* move child up */ + pArr[k - 1] = pArr[child - 1]; + k = child; + } + else + { + break; + } + } + pArr[k - 1] = temp; + } /*downHeap*/ + + void swap(int index0,int index1) + { +#ifdef BT_USE_MEMCPY + char temp[sizeof(T)]; + memcpy(temp,&m_data[index0],sizeof(T)); + memcpy(&m_data[index0],&m_data[index1],sizeof(T)); + memcpy(&m_data[index1],temp,sizeof(T)); +#else + T temp = m_data[index0]; + m_data[index0] = m_data[index1]; + m_data[index1] = temp; +#endif //BT_USE_PLACEMENT_NEW + + } + + template + void heapSort(const L& CompareFunc) + { + /* sort a[0..N-1], N.B. 0 to N-1 */ + int k; + int n = m_size; + for (k = n/2; k > 0; k--) + { + downHeap(m_data, k, n, CompareFunc); + } + + /* a[1..N] is now a heap */ + while ( n>=1 ) + { + swap(0,n-1); /* largest of a[0..n-1] */ + + + n = n - 1; + /* restore a[1..i-1] heap */ + downHeap(m_data, 1, n, CompareFunc); + } + } + + ///non-recursive binary search, assumes sorted array + int findBinarySearch(const T& key) const + { + int first = 0; + int last = size()-1; + + //assume sorted array + while (first <= last) { + int mid = (first + last) / 2; // compute mid point. + if (key > m_data[mid]) + first = mid + 1; // repeat search in top half. + else if (key < m_data[mid]) + last = mid - 1; // repeat search in bottom half. + else + return mid; // found it. return position ///// + } + return size(); // failed to find key + } + + + int findLinearSearch(const T& key) const + { + int index=size(); + int i; + + for (i=0;i( device, 1, BufferBase::BUFFER_CONST ); + + m_lower = (maxSize == 0)? 0: new btOpenCLArray(ctx,queue,maxSize ); + m_upper = (maxSize == 0)? 0: new btOpenCLArray(ctx,queue, maxSize ); + + m_filler = new btFillCL(ctx,device,queue); +} + +btBoundSearchCL::~btBoundSearchCL() +{ + + delete m_lower; + delete m_upper; + delete m_filler; + + clReleaseKernel(m_lowerSortDataKernel); + clReleaseKernel(m_upperSortDataKernel); + clReleaseKernel(m_subtractKernel); + + +} + + +void btBoundSearchCL::execute(btOpenCLArray& src, int nSrc, btOpenCLArray& dst, int nDst, Option option ) +{ + btInt4 constBuffer; + constBuffer.x = nSrc; + constBuffer.y = nDst; + + if( option == BOUND_LOWER ) + { + btBufferInfoCL bInfo[] = { btBufferInfoCL( src.getBufferCL(), true ), btBufferInfoCL( dst.getBufferCL()) }; + + btLauncherCL launcher( m_queue, m_lowerSortDataKernel ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); + launcher.setConst( nSrc ); + launcher.setConst( nDst ); + + launcher.launch1D( nSrc, 64 ); + } + else if( option == BOUND_UPPER ) + { + btBufferInfoCL bInfo[] = { btBufferInfoCL( src.getBufferCL(), true ), btBufferInfoCL( dst.getBufferCL() ) }; + + btLauncherCL launcher(m_queue, m_upperSortDataKernel ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); + launcher.setConst( nSrc ); + launcher.setConst( nDst ); + + launcher.launch1D( nSrc, 64 ); + } + else if( option == COUNT ) + { + btAssert( m_lower ); + btAssert( m_upper ); + btAssert( m_lower->capacity() <= (int)nDst ); + btAssert( m_upper->capacity() <= (int)nDst ); + + int zero = 0; + m_filler->execute( *m_lower, zero, nDst ); + m_filler->execute( *m_upper, zero, nDst ); + + execute( src, nSrc, *m_lower, nDst, BOUND_LOWER ); + execute( src, nSrc, *m_upper, nDst, BOUND_UPPER ); + + { + btBufferInfoCL bInfo[] = { btBufferInfoCL( m_upper->getBufferCL(), true ), btBufferInfoCL( m_lower->getBufferCL(), true ), btBufferInfoCL( dst.getBufferCL() ) }; + + btLauncherCL launcher( m_queue, m_subtractKernel ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); + launcher.setConst( nSrc ); + launcher.setConst( nDst ); + + launcher.launch1D( nDst, 64 ); + } + } + else + { + btAssert( 0 ); + } + +} + + +void btBoundSearchCL::executeHost( btAlignedObjectArray& src, int nSrc, + btAlignedObjectArray& dst, int nDst, Option option ) +{ + + + for(int i=0; i lower; + lower.resize(nDst ); + btAlignedObjectArray upper; + upper.resize(nDst ); + + for(int i=0; i +#include +#include +#include +*/ + +#include "btOpenCLArray.h" +#include "btFillCL.h" +#include "btRadixSort32CL.h" //for btSortData (perhaps move it?) +class btBoundSearchCL +{ + public: + + enum Option + { + BOUND_LOWER, + BOUND_UPPER, + COUNT, + }; + + cl_context m_context; + cl_device_id m_device; + cl_command_queue m_queue; + + + cl_kernel m_lowerSortDataKernel; + cl_kernel m_upperSortDataKernel; + cl_kernel m_subtractKernel; + + btOpenCLArray* m_constbtOpenCLArray; + btOpenCLArray* m_lower; + btOpenCLArray* m_upper; + + btFillCL* m_filler; + + btBoundSearchCL(cl_context context, cl_device_id device, cl_command_queue queue, int size); + + virtual ~btBoundSearchCL(); + + // src has to be src[i].m_key <= src[i+1].m_key + void execute( btOpenCLArray& src, int nSrc, btOpenCLArray& dst, int nDst, Option option = BOUND_LOWER ); + + void executeHost( btAlignedObjectArray& src, int nSrc, btAlignedObjectArray& dst, int nDst, Option option = BOUND_LOWER); +}; + + +#endif //BT_BOUNDSEARCH_H diff --git a/opencl/parallel_primitives/host/btBufferInfoCL.h b/opencl/parallel_primitives/host/btBufferInfoCL.h new file mode 100644 index 000000000..48798e232 --- /dev/null +++ b/opencl/parallel_primitives/host/btBufferInfoCL.h @@ -0,0 +1,19 @@ + +#ifndef BT_BUFFER_INFO_CL_H +#define BT_BUFFER_INFO_CL_H + +#include "btOpenCLArray.h" + + +struct btBufferInfoCL +{ + //btBufferInfoCL(){} + +// template + btBufferInfoCL(cl_mem buff, bool isReadOnly = false): m_clBuffer(buff), m_isReadOnly(isReadOnly){} + + cl_mem m_clBuffer; + bool m_isReadOnly; +}; + +#endif //BT_BUFFER_INFO_CL_H diff --git a/opencl/parallel_primitives/host/btFillCL.cpp b/opencl/parallel_primitives/host/btFillCL.cpp new file mode 100644 index 000000000..18a7e2093 --- /dev/null +++ b/opencl/parallel_primitives/host/btFillCL.cpp @@ -0,0 +1,126 @@ +#include "btFillCL.h" +#include "../../basic_initialize/btOpenCLUtils.h" +#include "btBufferInfoCL.h" +#include "btLauncherCL.h" + +#define FILL_CL_PROGRAM_PATH "opencl/parallel_primitives/kernels/FillKernels.cl" + +#include "../kernels/FillKernelsCL.h" + +btFillCL::btFillCL(cl_context ctx, cl_device_id device, cl_command_queue queue) +:m_commandQueue(queue) +{ + const char* kernelSource = fillKernelsCL; + cl_int pErrNum; + const char* additionalMacros = ""; + + cl_program fillProg = btOpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, FILL_CL_PROGRAM_PATH); + btAssert(fillProg); + + m_fillIntKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillIntKernel", &pErrNum, fillProg,additionalMacros ); + btAssert(m_fillIntKernel); + + m_fillUnsignedIntKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillUnsignedIntKernel", &pErrNum, fillProg,additionalMacros ); + btAssert(m_fillIntKernel); + + m_fillFloatKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillFloatKernel", &pErrNum, fillProg,additionalMacros ); + btAssert(m_fillFloatKernel); + + + + m_fillKernelInt2 = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillInt2Kernel", &pErrNum, fillProg,additionalMacros ); + btAssert(m_fillKernelInt2); + +} + +btFillCL::~btFillCL() +{ + clReleaseKernel(m_fillKernelInt2); + clReleaseKernel(m_fillIntKernel); + clReleaseKernel(m_fillUnsignedIntKernel); + clReleaseKernel(m_fillFloatKernel); + +} + +void btFillCL::execute(btOpenCLArray& src, const float value, int n, int offset) +{ + btAssert( n>0 ); + + { + btLauncherCL launcher( m_commandQueue, m_fillFloatKernel ); + launcher.setBuffer( src.getBufferCL()); + launcher.setConst( n ); + launcher.setConst( value ); + launcher.setConst( offset); + + launcher.launch1D( n ); + } +} + +void btFillCL::execute(btOpenCLArray& src, const int value, int n, int offset) +{ + btAssert( n>0 ); + + + { + btLauncherCL launcher( m_commandQueue, m_fillIntKernel ); + launcher.setBuffer(src.getBufferCL()); + launcher.setConst( n); + launcher.setConst( value); + launcher.setConst( offset); + launcher.launch1D( n ); + } +} + + +void btFillCL::execute(btOpenCLArray& src, const unsigned int value, int n, int offset) +{ + btAssert( n>0 ); + + { + btBufferInfoCL bInfo[] = { btBufferInfoCL( src.getBufferCL() ) }; + + btLauncherCL launcher( m_commandQueue, m_fillUnsignedIntKernel ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); + launcher.setConst( n ); + launcher.setConst(value); + launcher.setConst(offset); + + launcher.launch1D( n ); + } +} + +void btFillCL::executeHost(btAlignedObjectArray &src, const btInt2 &value, int n, int offset) +{ + for (int i=0;i &src, const int value, int n, int offset) +{ + for (int i=0;i &src, const btInt2 &value, int n, int offset) +{ + btAssert( n>0 ); + + + { + btBufferInfoCL bInfo[] = { btBufferInfoCL( src.getBufferCL() ) }; + + btLauncherCL launcher(m_commandQueue, m_fillKernelInt2); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); + launcher.setConst(n); + launcher.setConst(value); + launcher.setConst(offset); + + //( constBuffer ); + launcher.launch1D( n ); + } +} diff --git a/opencl/parallel_primitives/host/btFillCL.h b/opencl/parallel_primitives/host/btFillCL.h new file mode 100644 index 000000000..a9303a73d --- /dev/null +++ b/opencl/parallel_primitives/host/btFillCL.h @@ -0,0 +1,137 @@ +#ifndef BT_FILL_CL_H +#define BT_FILL_CL_H + +#include "btOpenCLArray.h" +#include "btScalar.h" + +ATTRIBUTE_ALIGNED16(struct) btUnsignedInt4 +{ + BT_DECLARE_ALIGNED_ALLOCATOR(); + + union + { + struct + { + unsigned int x,y,z,w; + }; + struct + { + unsigned int s[4]; + }; + }; +}; + +ATTRIBUTE_ALIGNED16(struct) btInt4 +{ + BT_DECLARE_ALIGNED_ALLOCATOR(); + + union + { + struct + { + int x,y,z,w; + }; + struct + { + int s[4]; + }; + }; +}; + +struct btUnsignedInt2 +{ + union + { + struct + { + unsigned int x,y; + }; + struct + { + unsigned int s[2]; + }; + }; +}; + +struct btInt2 +{ + union + { + struct + { + int x,y; + }; + struct + { + int s[2]; + }; + }; +}; + +SIMD_FORCE_INLINE btInt4 btMakeInt4(int x, int y, int z, int w = 0) +{ + btInt4 v; + v.s[0] = x; v.s[1] = y; v.s[2] = z; v.s[3] = w; + return v; +} + +SIMD_FORCE_INLINE btUnsignedInt4 btMakeUnsignedInt4(unsigned int x, unsigned int y, unsigned int z, unsigned int w = 0) +{ + btUnsignedInt4 v; + v.s[0] = x; v.s[1] = y; v.s[2] = z; v.s[3] = w; + return v; +} + +class btFillCL +{ + + cl_command_queue m_commandQueue; + + cl_kernel m_fillKernelInt2; + cl_kernel m_fillIntKernel; + cl_kernel m_fillUnsignedIntKernel; + cl_kernel m_fillFloatKernel; + + public: + + struct btConstData + { + union + { + btInt4 m_data; + btUnsignedInt4 m_UnsignedData; + }; + int m_offset; + int m_n; + int m_padding[2]; + }; + +protected: + +public: + + btFillCL(cl_context ctx, cl_device_id device, cl_command_queue queue); + + virtual ~btFillCL(); + + void execute(btOpenCLArray& src, const unsigned int value, int n, int offset = 0); + + void execute(btOpenCLArray& src, const int value, int n, int offset = 0); + + void execute(btOpenCLArray& src, const float value, int n, int offset = 0); + + void execute(btOpenCLArray& src, const btInt2& value, int n, int offset = 0); + + void executeHost(btAlignedObjectArray &src, const btInt2 &value, int n, int offset); + + void executeHost(btAlignedObjectArray &src, const int value, int n, int offset); + + // void execute(btOpenCLArray& src, const btInt4& value, int n, int offset = 0); + +}; + + + + + +#endif //BT_FILL_CL_H diff --git a/opencl/parallel_primitives/host/btHashMap.h b/opencl/parallel_primitives/host/btHashMap.h new file mode 100644 index 000000000..ce07db3ac --- /dev/null +++ b/opencl/parallel_primitives/host/btHashMap.h @@ -0,0 +1,450 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + + +#ifndef BT_HASH_MAP_H +#define BT_HASH_MAP_H + +#include "btAlignedObjectArray.h" + +///very basic hashable string implementation, compatible with btHashMap +struct btHashString +{ + const char* m_string; + unsigned int m_hash; + + SIMD_FORCE_INLINE unsigned int getHash()const + { + return m_hash; + } + + btHashString(const char* name) + :m_string(name) + { + /* magic numbers from http://www.isthe.com/chongo/tech/comp/fnv/ */ + static const unsigned int InitialFNV = 2166136261u; + static const unsigned int FNVMultiple = 16777619u; + + /* Fowler / Noll / Vo (FNV) Hash */ + unsigned int hash = InitialFNV; + + for(int i = 0; m_string[i]; i++) + { + hash = hash ^ (m_string[i]); /* xor the low 8 bits */ + hash = hash * FNVMultiple; /* multiply by the magic number */ + } + m_hash = hash; + } + + int portableStringCompare(const char* src, const char* dst) const + { + int ret = 0 ; + + while( ! (ret = *(unsigned char *)src - *(unsigned char *)dst) && *dst) + ++src, ++dst; + + if ( ret < 0 ) + ret = -1 ; + else if ( ret > 0 ) + ret = 1 ; + + return( ret ); + } + + bool equals(const btHashString& other) const + { + return (m_string == other.m_string) || + (0==portableStringCompare(m_string,other.m_string)); + + } + +}; + +const int BT_HASH_NULL=0xffffffff; + + +class btHashInt +{ + int m_uid; +public: + btHashInt(int uid) :m_uid(uid) + { + } + + int getUid1() const + { + return m_uid; + } + + void setUid1(int uid) + { + m_uid = uid; + } + + bool equals(const btHashInt& other) const + { + return getUid1() == other.getUid1(); + } + //to our success + SIMD_FORCE_INLINE unsigned int getHash()const + { + int key = m_uid; + // Thomas Wang's hash + key += ~(key << 15); key ^= (key >> 10); key += (key << 3); key ^= (key >> 6); key += ~(key << 11); key ^= (key >> 16); + return key; + } +}; + + + +class btHashPtr +{ + + union + { + const void* m_pointer; + int m_hashValues[2]; + }; + +public: + + btHashPtr(const void* ptr) + :m_pointer(ptr) + { + } + + const void* getPointer() const + { + return m_pointer; + } + + bool equals(const btHashPtr& other) const + { + return getPointer() == other.getPointer(); + } + + //to our success + SIMD_FORCE_INLINE unsigned int getHash()const + { + const bool VOID_IS_8 = ((sizeof(void*)==8)); + + int key = VOID_IS_8? m_hashValues[0]+m_hashValues[1] : m_hashValues[0]; + + // Thomas Wang's hash + key += ~(key << 15); key ^= (key >> 10); key += (key << 3); key ^= (key >> 6); key += ~(key << 11); key ^= (key >> 16); + return key; + } + + +}; + + +template +class btHashKeyPtr +{ + int m_uid; +public: + + btHashKeyPtr(int uid) :m_uid(uid) + { + } + + int getUid1() const + { + return m_uid; + } + + bool equals(const btHashKeyPtr& other) const + { + return getUid1() == other.getUid1(); + } + + //to our success + SIMD_FORCE_INLINE unsigned int getHash()const + { + int key = m_uid; + // Thomas Wang's hash + key += ~(key << 15); key ^= (key >> 10); key += (key << 3); key ^= (key >> 6); key += ~(key << 11); key ^= (key >> 16); + return key; + } + + +}; + + +template +class btHashKey +{ + int m_uid; +public: + + btHashKey(int uid) :m_uid(uid) + { + } + + int getUid1() const + { + return m_uid; + } + + bool equals(const btHashKey& other) const + { + return getUid1() == other.getUid1(); + } + //to our success + SIMD_FORCE_INLINE unsigned int getHash()const + { + int key = m_uid; + // Thomas Wang's hash + key += ~(key << 15); key ^= (key >> 10); key += (key << 3); key ^= (key >> 6); key += ~(key << 11); key ^= (key >> 16); + return key; + } +}; + + +///The btHashMap template class implements a generic and lightweight hashmap. +///A basic sample of how to use btHashMap is located in Demos\BasicDemo\main.cpp +template +class btHashMap +{ + +protected: + btAlignedObjectArray m_hashTable; + btAlignedObjectArray m_next; + + btAlignedObjectArray m_valueArray; + btAlignedObjectArray m_keyArray; + + void growTables(const Key& /*key*/) + { + int newCapacity = m_valueArray.capacity(); + + if (m_hashTable.size() < newCapacity) + { + //grow hashtable and next table + int curHashtableSize = m_hashTable.size(); + + m_hashTable.resize(newCapacity); + m_next.resize(newCapacity); + + int i; + + for (i= 0; i < newCapacity; ++i) + { + m_hashTable[i] = BT_HASH_NULL; + } + for (i = 0; i < newCapacity; ++i) + { + m_next[i] = BT_HASH_NULL; + } + + for(i=0;i= (unsigned int)m_hashTable.size()) + { + return BT_HASH_NULL; + } + + int index = m_hashTable[hash]; + while ((index != BT_HASH_NULL) && key.equals(m_keyArray[index]) == false) + { + index = m_next[index]; + } + return index; + } + + void clear() + { + m_hashTable.clear(); + m_next.clear(); + m_valueArray.clear(); + m_keyArray.clear(); + } + +}; + +#endif //BT_HASH_MAP_H diff --git a/opencl/parallel_primitives/host/btLauncherCL.h b/opencl/parallel_primitives/host/btLauncherCL.h new file mode 100644 index 000000000..6b5657426 --- /dev/null +++ b/opencl/parallel_primitives/host/btLauncherCL.h @@ -0,0 +1,363 @@ + +#ifndef BT_LAUNCHER_CL_H +#define BT_LAUNCHER_CL_H + +#include "btBufferInfoCL.h" +#include "btMinMax.h" +#include "btOpenCLArray.h" +#include + +#ifdef _WIN32 +#pragma warning(disable :4996) +#endif +#define BT_CL_MAX_ARG_SIZE 16 +struct btKernelArgData +{ + int m_isBuffer; + int m_argIndex; + int m_argSizeInBytes; + union + { + cl_mem m_clBuffer; + unsigned char m_argData[BT_CL_MAX_ARG_SIZE]; + }; + +}; + +class btLauncherCL +{ + + cl_command_queue m_commandQueue; + cl_kernel m_kernel; + int m_idx; + + btAlignedObjectArray m_kernelArguments; + + + int m_serializationSizeInBytes; + + public: + + btAlignedObjectArray* > m_arrays; + + btLauncherCL(cl_command_queue queue, cl_kernel kernel) + :m_commandQueue(queue), + m_kernel(kernel), + m_idx(0) + { + m_serializationSizeInBytes = sizeof(int); + } + + virtual ~btLauncherCL() + { + for (int i=0;igetBufferCL()); + } + } + + inline void setBuffer( cl_mem clBuffer) + { + + btKernelArgData kernelArg; + kernelArg.m_argIndex = m_idx; + kernelArg.m_isBuffer = 1; + kernelArg.m_clBuffer = clBuffer; + + cl_mem_info param_name = CL_MEM_SIZE; + size_t param_value; + size_t sizeInBytes = sizeof(size_t); + size_t actualSizeInBytes; + cl_int err; + err = clGetMemObjectInfo ( kernelArg.m_clBuffer, + param_name, + sizeInBytes, + ¶m_value, + &actualSizeInBytes); + + btAssert( err == CL_SUCCESS ); + kernelArg.m_argSizeInBytes = param_value; + + m_kernelArguments.push_back(kernelArg); + m_serializationSizeInBytes+= sizeof(btKernelArgData); + m_serializationSizeInBytes+=param_value; + + cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &clBuffer); + btAssert( status == CL_SUCCESS ); + } + + + inline void setBuffers( btBufferInfoCL* buffInfo, int n ) + { + for(int i=0; im_isBuffer) + { + btOpenCLArray* clData = new btOpenCLArray(ctx,m_commandQueue, arg->m_argSizeInBytes); + clData->resize(arg->m_argSizeInBytes); + + clData->copyFromHostPointer(&buf[index], arg->m_argSizeInBytes); + + arg->m_clBuffer = clData->getBufferCL(); + + m_arrays.push_back(clData); + + cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &arg->m_clBuffer); + btAssert( status == CL_SUCCESS ); + index+=arg->m_argSizeInBytes; + } else + { + cl_int status = clSetKernelArg( m_kernel, m_idx++, arg->m_argSizeInBytes, &arg->m_argData); + btAssert( status == CL_SUCCESS ); + } + m_kernelArguments.push_back(*arg); + } + m_serializationSizeInBytes = index; + return index; + } + + inline int validateResults(unsigned char* goldBuffer, int goldBufferCapacity, cl_context ctx) + { + int index=0; + + int numArguments = *(int*) &goldBuffer[index]; + index+=sizeof(int); + + if (numArguments != m_kernelArguments.size()) + { + printf("failed validation: expected %d arguments, found %d\n",numArguments, m_kernelArguments.size()); + return -1; + } + + for (int ii=0;iim_argSizeInBytes) + { + printf("failed validation: argument %d sizeInBytes expected: %d, found %d\n",ii, argGold->m_argSizeInBytes, m_kernelArguments[ii].m_argSizeInBytes); + return -2; + } + + { + int expected = argGold->m_isBuffer; + int found = m_kernelArguments[ii].m_isBuffer; + + if (expected != found) + { + printf("failed validation: argument %d isBuffer expected: %d, found %d\n",ii,expected, found); + return -3; + } + } + index+=sizeof(btKernelArgData); + + if (argGold->m_isBuffer) + { + + unsigned char* memBuf= (unsigned char*) malloc(m_kernelArguments[ii].m_argSizeInBytes); + unsigned char* goldBuf = &goldBuffer[index]; + for (int j=0;jm_argSizeInBytes; + } else + { + + //compare content + for (int b=0;bm_argData[b]; + int found =m_kernelArguments[ii].m_argData[b]; + if (expected != found) + { + printf("failed validation: argument %d const data at byte position %d expected: %d, found %d\n", + ii, b, expected, found); + return -5; + } + } + + } + } + return index; + + } + + inline int serializeArguments(unsigned char* destBuffer, int destBufferCapacity) + { + //initialize to known values + for (int i=0;i=m_serializationSizeInBytes); + + //todo: use the btSerializer for this to allow for 32/64bit, endianness etc + int numArguments = m_kernelArguments.size(); + int curBufferSize = 0; + int* dest = (int*)&destBuffer[curBufferSize]; + *dest = numArguments; + curBufferSize += sizeof(int); + + + + for (int i=0;im_kernelArguments.size();i++) + { + btKernelArgData* arg = (btKernelArgData*) &destBuffer[curBufferSize]; + *arg = m_kernelArguments[i]; + curBufferSize+=sizeof(btKernelArgData); + if (arg->m_isBuffer==1) + { + //copy the OpenCL buffer content + cl_int status = 0; + status = clEnqueueReadBuffer( m_commandQueue, arg->m_clBuffer, 0, 0, arg->m_argSizeInBytes, + &destBuffer[curBufferSize], 0,0,0 ); + btAssert( status==CL_SUCCESS ); + clFinish(m_commandQueue); + curBufferSize+=arg->m_argSizeInBytes; + } + + } + return curBufferSize; + } + + void serializeToFile(const char* fileName, int numWorkItems) + { + int num = numWorkItems; + int buffSize = getSerializationBufferSize(); + unsigned char* buf = new unsigned char[buffSize+sizeof(int)]; + for (int i=0;i + inline void setConst( const T& consts ) + { + int sz=sizeof(T); + btAssert(sz<=BT_CL_MAX_ARG_SIZE); + btKernelArgData kernelArg; + kernelArg.m_argIndex = m_idx; + kernelArg.m_isBuffer = 0; + T* destArg = (T*)kernelArg.m_argData; + *destArg = consts; + kernelArg.m_argSizeInBytes = sizeof(T); + m_kernelArguments.push_back(kernelArg); + m_serializationSizeInBytes+=sizeof(btKernelArgData); + + cl_int status = clSetKernelArg( m_kernel, m_idx++, sz, &consts ); + btAssert( status == CL_SUCCESS ); + } + + inline void launch1D( int numThreads, int localSize = 64) + { + launch2D( numThreads, 1, localSize, 1 ); + } + + inline void launch2D( int numThreadsX, int numThreadsY, int localSizeX, int localSizeY ) + { + size_t gRange[3] = {1,1,1}; + size_t lRange[3] = {1,1,1}; + lRange[0] = localSizeX; + lRange[1] = localSizeY; + gRange[0] = btMax((size_t)1, (numThreadsX/lRange[0])+(!(numThreadsX%lRange[0])?0:1)); + gRange[0] *= lRange[0]; + gRange[1] = btMax((size_t)1, (numThreadsY/lRange[1])+(!(numThreadsY%lRange[1])?0:1)); + gRange[1] *= lRange[1]; + + cl_int status = clEnqueueNDRangeKernel( m_commandQueue, + m_kernel, 2, NULL, gRange, lRange, 0,0,0 ); + if (status != CL_SUCCESS) + { + printf("Error: OpenCL status = %d\n",status); + } + btAssert( status == CL_SUCCESS ); + + } +}; + + + +#endif //BT_LAUNCHER_CL_H diff --git a/opencl/parallel_primitives/host/btMinMax.h b/opencl/parallel_primitives/host/btMinMax.h new file mode 100644 index 000000000..5b436e9ba --- /dev/null +++ b/opencl/parallel_primitives/host/btMinMax.h @@ -0,0 +1,71 @@ +/* +Copyright (c) 2003-2006 Gino van den Bergen / Erwin Coumans http://continuousphysics.com/Bullet/ + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + + + +#ifndef BT_GEN_MINMAX_H +#define BT_GEN_MINMAX_H + +#include "btScalar.h" + +template +SIMD_FORCE_INLINE const T& btMin(const T& a, const T& b) +{ + return a < b ? a : b ; +} + +template +SIMD_FORCE_INLINE const T& btMax(const T& a, const T& b) +{ + return a > b ? a : b; +} + +template +SIMD_FORCE_INLINE const T& btClamped(const T& a, const T& lb, const T& ub) +{ + return a < lb ? lb : (ub < a ? ub : a); +} + +template +SIMD_FORCE_INLINE void btSetMin(T& a, const T& b) +{ + if (b < a) + { + a = b; + } +} + +template +SIMD_FORCE_INLINE void btSetMax(T& a, const T& b) +{ + if (a < b) + { + a = b; + } +} + +template +SIMD_FORCE_INLINE void btClamp(T& a, const T& lb, const T& ub) +{ + if (a < lb) + { + a = lb; + } + else if (ub < a) + { + a = ub; + } +} + +#endif //BT_GEN_MINMAX_H diff --git a/opencl/parallel_primitives/host/btOpenCLArray.h b/opencl/parallel_primitives/host/btOpenCLArray.h new file mode 100644 index 000000000..91e88e9ed --- /dev/null +++ b/opencl/parallel_primitives/host/btOpenCLArray.h @@ -0,0 +1,274 @@ +#ifndef BT_OPENCL_ARRAY_H +#define BT_OPENCL_ARRAY_H + +#include "btAlignedObjectArray.h" +#include "../../basic_initialize/btOpenCLInclude.h" + +template +class btOpenCLArray +{ + int m_size; + int m_capacity; + cl_mem m_clBuffer; + + cl_context m_clContext; + cl_command_queue m_commandQueue; + + bool m_ownsMemory; + + bool m_allowGrowingCapacity; + + void deallocate() + { + if (m_clBuffer && m_ownsMemory) + { + clReleaseMemObject(m_clBuffer); + } + m_clBuffer = 0; + m_capacity=0; + } + + btOpenCLArray& operator=(const btOpenCLArray& src); + + SIMD_FORCE_INLINE int allocSize(int size) + { + return (size ? size*2 : 1); + } + +public: + + btOpenCLArray(cl_context ctx, cl_command_queue queue, int initialCapacity=0, bool allowGrowingCapacity=true) + :m_size(0), m_capacity(0),m_clBuffer(0), + m_clContext(ctx),m_commandQueue(queue), + m_ownsMemory(true),m_allowGrowingCapacity(true) + { + if (initialCapacity) + { + reserve(initialCapacity); + } + m_allowGrowingCapacity = allowGrowingCapacity; + } + + ///this is an error-prone method with no error checking, be careful! + void setFromOpenCLBuffer(cl_mem buffer, int sizeInElements) + { + deallocate(); + m_ownsMemory = false; + m_allowGrowingCapacity = false; + m_clBuffer = buffer; + m_size = sizeInElements; + m_capacity = sizeInElements; + } + +// we could enable this assignment, but need to make sure to avoid accidental deep copies +// btOpenCLArray& operator=(const btAlignedObjectArray& src) +// { +// copyFromArray(src); +// return *this; +// } + + + cl_mem getBufferCL() const + { + return m_clBuffer; + } + + + virtual ~btOpenCLArray() + { + deallocate(); + m_size=0; + m_capacity=0; + } + + SIMD_FORCE_INLINE void push_back(const T& _Val,bool waitForCompletion=true) + { + int sz = size(); + if( sz == capacity() ) + { + reserve( allocSize(size()) ); + } + copyFromHostPointer(&_Val, 1, sz, waitForCompletion); + m_size++; + } + + SIMD_FORCE_INLINE T forcedAt(int n) const + { + btAssert(n>=0); + btAssert(n=0); + btAssert(n size()) + { + reserve(newsize,copyOldContents); + } + + //leave new data uninitialized (init in debug mode?) + //for (int i=curSize;i0); + btAssert(numElements<=m_size); + + int srcOffsetBytes = sizeof(T)*firstElem; + int dstOffsetInBytes = sizeof(T)*dstOffsetInElems; + + status = clEnqueueCopyBuffer( m_commandQueue, m_clBuffer, destination, + srcOffsetBytes, dstOffsetInBytes, sizeof(T)*numElements, 0, 0, 0 ); + + btAssert( status == CL_SUCCESS ); + } + + void copyFromHost(const btAlignedObjectArray& srcArray, bool waitForCompletion=true) + { + int newSize = srcArray.size(); + + bool copyOldContents = false; + resize (newSize,copyOldContents); + if (newSize) + copyFromHostPointer(&srcArray[0],newSize,0,waitForCompletion); + + } + + void copyFromHostPointer(const T* src, int numElems, int destFirstElem= 0, bool waitForCompletion=true) + { + btAssert(numElems+destFirstElem <= capacity()); + + cl_int status = 0; + int sizeInBytes=sizeof(T)*numElems; + status = clEnqueueWriteBuffer( m_commandQueue, m_clBuffer, 0, sizeof(T)*destFirstElem, sizeInBytes, + src, 0,0,0 ); + btAssert(status == CL_SUCCESS ); + if (waitForCompletion) + clFinish(m_commandQueue); + + } + + + void copyToHost(btAlignedObjectArray& destArray, bool waitForCompletion=true) const + { + destArray.resize(this->size()); + if (size()) + copyToHostPointer(&destArray[0], size(),0,waitForCompletion); + } + + void copyToHostPointer(T* destPtr, int numElem, int srcFirstElem=0, bool waitForCompletion=true) const + { + btAssert(numElem+srcFirstElem <= capacity()); + + cl_int status = 0; + status = clEnqueueReadBuffer( m_commandQueue, m_clBuffer, 0, sizeof(T)*srcFirstElem, sizeof(T)*numElem, + destPtr, 0,0,0 ); + btAssert( status==CL_SUCCESS ); + + if (waitForCompletion) + clFinish(m_commandQueue); + } + + void copyFromOpenCLArray(const btOpenCLArray& src) + { + int newSize = src.size(); + resize(newSize); + if (size()) + { + src.copyToCL(m_clBuffer,size()); + } + } + +}; + + +#endif //BT_OPENCL_ARRAY_H diff --git a/opencl/parallel_primitives/host/btPrefixScanCL.cpp b/opencl/parallel_primitives/host/btPrefixScanCL.cpp new file mode 100644 index 000000000..c584097c5 --- /dev/null +++ b/opencl/parallel_primitives/host/btPrefixScanCL.cpp @@ -0,0 +1,126 @@ +#include "btPrefixScanCL.h" +#include "btFillCL.h" +#define BT_PREFIXSCAN_PROG_PATH "opencl/parallel_primitives/kernels/PrefixScanKernels.cl" + +#include "btLauncherCL.h" +#include "../../basic_initialize/btOpenCLUtils.h" +#include "../kernels/PrefixScanKernelsCL.h" + +btPrefixScanCL::btPrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size) +:m_commandQueue(queue) +{ + const char* scanKernelSource = prefixScanKernelsCL; + cl_int pErrNum; + char* additionalMacros=0; + + m_workBuffer = new btOpenCLArray(ctx,queue,size); + cl_program scanProg = btOpenCLUtils::compileCLProgramFromString( ctx, device, scanKernelSource, &pErrNum,additionalMacros, BT_PREFIXSCAN_PROG_PATH); + btAssert(scanProg); + + m_localScanKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg,additionalMacros ); + btAssert(m_localScanKernel ); + m_blockSumKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg,additionalMacros ); + btAssert(m_blockSumKernel ); + m_propagationKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg,additionalMacros ); + btAssert(m_propagationKernel ); +} + + +btPrefixScanCL::~btPrefixScanCL() +{ + delete m_workBuffer; + clReleaseKernel(m_localScanKernel); + clReleaseKernel(m_blockSumKernel); + clReleaseKernel(m_propagationKernel); +} + +template +T btNextPowerOf2(T n) +{ + n -= 1; + for(int i=0; i>i); + return n+1; +} + +void btPrefixScanCL::execute(btOpenCLArray& src, btOpenCLArray& dst, int n, unsigned int* sum) +{ + +// btAssert( data->m_option == EXCLUSIVE ); + const unsigned int numBlocks = (const unsigned int)( (n+BLOCK_SIZE*2-1)/(BLOCK_SIZE*2) ); + + dst.resize(src.size()); + m_workBuffer->resize(src.size()); + + btInt4 constBuffer; + constBuffer.x = n; + constBuffer.y = numBlocks; + constBuffer.z = (int)btNextPowerOf2( numBlocks ); + + btOpenCLArray* srcNative = &src; + btOpenCLArray* dstNative = &dst; + + { + btBufferInfoCL bInfo[] = { btBufferInfoCL( dstNative->getBufferCL() ), btBufferInfoCL( srcNative->getBufferCL() ), btBufferInfoCL( m_workBuffer->getBufferCL() ) }; + + btLauncherCL launcher( m_commandQueue, m_localScanKernel ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); + launcher.setConst( constBuffer ); + launcher.launch1D( numBlocks*BLOCK_SIZE, BLOCK_SIZE ); + } + + { + btBufferInfoCL bInfo[] = { btBufferInfoCL( m_workBuffer->getBufferCL() ) }; + + btLauncherCL launcher( m_commandQueue, m_blockSumKernel ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); + launcher.setConst( constBuffer ); + launcher.launch1D( BLOCK_SIZE, BLOCK_SIZE ); + } + + + if( numBlocks > 1 ) + { + btBufferInfoCL bInfo[] = { btBufferInfoCL( dstNative->getBufferCL() ), btBufferInfoCL( m_workBuffer->getBufferCL() ) }; + btLauncherCL launcher( m_commandQueue, m_propagationKernel ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); + launcher.setConst( constBuffer ); + launcher.launch1D( (numBlocks-1)*BLOCK_SIZE, BLOCK_SIZE ); + } + + + if( sum ) + { + clFinish(m_commandQueue); + dstNative->copyToHostPointer(sum,1,n-1,true); + } + +} + + +void btPrefixScanCL::executeHost(btAlignedObjectArray& src, btAlignedObjectArray& dst, int n, unsigned int* sum) +{ + unsigned int s = 0; + //if( data->m_option == EXCLUSIVE ) + { + for(int i=0; i* m_workBuffer; + + + public: + + btPrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue,int size=0); + + virtual ~btPrefixScanCL(); + + void execute(btOpenCLArray& src, btOpenCLArray& dst, int n, unsigned int* sum = 0); + void executeHost(btAlignedObjectArray& src, btAlignedObjectArray& dst, int n, unsigned int* sum); +}; + +#endif //BT_PREFIX_SCAN_CL_H diff --git a/opencl/parallel_primitives/host/btQuickprof.cpp b/opencl/parallel_primitives/host/btQuickprof.cpp new file mode 100644 index 000000000..544aee89d --- /dev/null +++ b/opencl/parallel_primitives/host/btQuickprof.cpp @@ -0,0 +1,566 @@ +/* + +*************************************************************************************************** +** +** profile.cpp +** +** Real-Time Hierarchical Profiling for Game Programming Gems 3 +** +** by Greg Hjelstrom & Byon Garrabrant +** +***************************************************************************************************/ + +// Credits: The Clock class was inspired by the Timer classes in +// Ogre (www.ogre3d.org). + +#include "btQuickprof.h" + +#ifndef BT_NO_PROFILE + + +static btClock gProfileClock; + + +#ifdef __CELLOS_LV2__ +#include +#include +#include +#endif + +#if defined (SUNOS) || defined (__SUNOS__) +#include +#endif + +#if defined(WIN32) || defined(_WIN32) + +#define BT_USE_WINDOWS_TIMERS +#define WIN32_LEAN_AND_MEAN +#define NOWINRES +#define NOMCX +#define NOIME + +#ifdef _XBOX + #include +#else //_XBOX + #include +#endif //_XBOX + +#include + + +#else //_WIN32 +#include +#endif //_WIN32 + +#define mymin(a,b) (a > b ? a : b) + +struct btClockData +{ + +#ifdef BT_USE_WINDOWS_TIMERS + LARGE_INTEGER mClockFrequency; + DWORD mStartTick; + LONGLONG mPrevElapsedTime; + LARGE_INTEGER mStartTime; +#else +#ifdef __CELLOS_LV2__ + uint64_t mStartTime; +#else + struct timeval mStartTime; +#endif +#endif //__CELLOS_LV2__ + +}; + +///The btClock is a portable basic clock that measures accurate time in seconds, use for profiling. +btClock::btClock() +{ + m_data = new btClockData; +#ifdef BT_USE_WINDOWS_TIMERS + QueryPerformanceFrequency(&m_data->mClockFrequency); +#endif + reset(); +} + +btClock::~btClock() +{ + delete m_data; +} + +btClock::btClock(const btClock& other) +{ + m_data = new btClockData; + *m_data = *other.m_data; +} + +btClock& btClock::operator=(const btClock& other) +{ + *m_data = *other.m_data; + return *this; +} + + + /// Resets the initial reference time. +void btClock::reset() +{ +#ifdef BT_USE_WINDOWS_TIMERS + QueryPerformanceCounter(&m_data->mStartTime); + m_data->mStartTick = GetTickCount(); + m_data->mPrevElapsedTime = 0; +#else +#ifdef __CELLOS_LV2__ + + typedef uint64_t ClockSize; + ClockSize newTime; + //__asm __volatile__( "mftb %0" : "=r" (newTime) : : "memory"); + SYS_TIMEBASE_GET( newTime ); + m_data->mStartTime = newTime; +#else + gettimeofday(&m_data->mStartTime, 0); +#endif +#endif +} + +/// Returns the time in ms since the last call to reset or since +/// the btClock was created. +unsigned long int btClock::getTimeMilliseconds() +{ +#ifdef BT_USE_WINDOWS_TIMERS + LARGE_INTEGER currentTime; + QueryPerformanceCounter(¤tTime); + LONGLONG elapsedTime = currentTime.QuadPart - + m_data->mStartTime.QuadPart; + // Compute the number of millisecond ticks elapsed. + unsigned long msecTicks = (unsigned long)(1000 * elapsedTime / + m_data->mClockFrequency.QuadPart); + // Check for unexpected leaps in the Win32 performance counter. + // (This is caused by unexpected data across the PCI to ISA + // bridge, aka south bridge. See Microsoft KB274323.) + unsigned long elapsedTicks = GetTickCount() - m_data->mStartTick; + signed long msecOff = (signed long)(msecTicks - elapsedTicks); + if (msecOff < -100 || msecOff > 100) + { + // Adjust the starting time forwards. + LONGLONG msecAdjustment = mymin(msecOff * + m_data->mClockFrequency.QuadPart / 1000, elapsedTime - + m_data->mPrevElapsedTime); + m_data->mStartTime.QuadPart += msecAdjustment; + elapsedTime -= msecAdjustment; + + // Recompute the number of millisecond ticks elapsed. + msecTicks = (unsigned long)(1000 * elapsedTime / + m_data->mClockFrequency.QuadPart); + } + + // Store the current elapsed time for adjustments next time. + m_data->mPrevElapsedTime = elapsedTime; + + return msecTicks; +#else + +#ifdef __CELLOS_LV2__ + uint64_t freq=sys_time_get_timebase_frequency(); + double dFreq=((double) freq) / 1000.0; + typedef uint64_t ClockSize; + ClockSize newTime; + SYS_TIMEBASE_GET( newTime ); + //__asm __volatile__( "mftb %0" : "=r" (newTime) : : "memory"); + + return (unsigned long int)((double(newTime-m_data->mStartTime)) / dFreq); +#else + + struct timeval currentTime; + gettimeofday(¤tTime, 0); + return (currentTime.tv_sec - m_data->mStartTime.tv_sec) * 1000 + + (currentTime.tv_usec - m_data->mStartTime.tv_usec) / 1000; +#endif //__CELLOS_LV2__ +#endif +} + + /// Returns the time in us since the last call to reset or since + /// the Clock was created. +unsigned long int btClock::getTimeMicroseconds() +{ +#ifdef BT_USE_WINDOWS_TIMERS + LARGE_INTEGER currentTime; + QueryPerformanceCounter(¤tTime); + LONGLONG elapsedTime = currentTime.QuadPart - + m_data->mStartTime.QuadPart; + + // Compute the number of millisecond ticks elapsed. + unsigned long msecTicks = (unsigned long)(1000 * elapsedTime / + m_data->mClockFrequency.QuadPart); + + // Check for unexpected leaps in the Win32 performance counter. + // (This is caused by unexpected data across the PCI to ISA + // bridge, aka south bridge. See Microsoft KB274323.) + unsigned long elapsedTicks = GetTickCount() - m_data->mStartTick; + signed long msecOff = (signed long)(msecTicks - elapsedTicks); + if (msecOff < -100 || msecOff > 100) + { + // Adjust the starting time forwards. + LONGLONG msecAdjustment = mymin(msecOff * + m_data->mClockFrequency.QuadPart / 1000, elapsedTime - + m_data->mPrevElapsedTime); + m_data->mStartTime.QuadPart += msecAdjustment; + elapsedTime -= msecAdjustment; + } + + // Store the current elapsed time for adjustments next time. + m_data->mPrevElapsedTime = elapsedTime; + + // Convert to microseconds. + unsigned long usecTicks = (unsigned long)(1000000 * elapsedTime / + m_data->mClockFrequency.QuadPart); + + return usecTicks; +#else + +#ifdef __CELLOS_LV2__ + uint64_t freq=sys_time_get_timebase_frequency(); + double dFreq=((double) freq)/ 1000000.0; + typedef uint64_t ClockSize; + ClockSize newTime; + //__asm __volatile__( "mftb %0" : "=r" (newTime) : : "memory"); + SYS_TIMEBASE_GET( newTime ); + + return (unsigned long int)((double(newTime-m_data->mStartTime)) / dFreq); +#else + + struct timeval currentTime; + gettimeofday(¤tTime, 0); + return (currentTime.tv_sec - m_data->mStartTime.tv_sec) * 1000000 + + (currentTime.tv_usec - m_data->mStartTime.tv_usec); +#endif//__CELLOS_LV2__ +#endif +} + + + + + +inline void Profile_Get_Ticks(unsigned long int * ticks) +{ + *ticks = gProfileClock.getTimeMicroseconds(); +} + +inline float Profile_Get_Tick_Rate(void) +{ +// return 1000000.f; + return 1000.f; + +} + + + +/*************************************************************************************************** +** +** CProfileNode +** +***************************************************************************************************/ + +/*********************************************************************************************** + * INPUT: * + * name - pointer to a static string which is the name of this profile node * + * parent - parent pointer * + * * + * WARNINGS: * + * The name is assumed to be a static pointer, only the pointer is stored and compared for * + * efficiency reasons. * + *=============================================================================================*/ +CProfileNode::CProfileNode( const char * name, CProfileNode * parent ) : + Name( name ), + TotalCalls( 0 ), + TotalTime( 0 ), + StartTime( 0 ), + RecursionCounter( 0 ), + Parent( parent ), + Child( NULL ), + Sibling( NULL ), + m_userPtr(0) +{ + Reset(); +} + + +void CProfileNode::CleanupMemory() +{ + delete ( Child); + Child = NULL; + delete ( Sibling); + Sibling = NULL; +} + +CProfileNode::~CProfileNode( void ) +{ + delete ( Child); + delete ( Sibling); +} + + +/*********************************************************************************************** + * INPUT: * + * name - static string pointer to the name of the node we are searching for * + * * + * WARNINGS: * + * All profile names are assumed to be static strings so this function uses pointer compares * + * to find the named node. * + *=============================================================================================*/ +CProfileNode * CProfileNode::Get_Sub_Node( const char * name ) +{ + // Try to find this sub node + CProfileNode * child = Child; + while ( child ) { + if ( child->Name == name ) { + return child; + } + child = child->Sibling; + } + + // We didn't find it, so add it + + CProfileNode * node = new CProfileNode( name, this ); + node->Sibling = Child; + Child = node; + return node; +} + + +void CProfileNode::Reset( void ) +{ + TotalCalls = 0; + TotalTime = 0.0f; + + + if ( Child ) { + Child->Reset(); + } + if ( Sibling ) { + Sibling->Reset(); + } +} + + +void CProfileNode::Call( void ) +{ + TotalCalls++; + if (RecursionCounter++ == 0) { + Profile_Get_Ticks(&StartTime); + } +} + + +bool CProfileNode::Return( void ) +{ + if ( --RecursionCounter == 0 && TotalCalls != 0 ) { + unsigned long int time; + Profile_Get_Ticks(&time); + time-=StartTime; + TotalTime += (float)time / Profile_Get_Tick_Rate(); + } + return ( RecursionCounter == 0 ); +} + + +/*************************************************************************************************** +** +** CProfileIterator +** +***************************************************************************************************/ +CProfileIterator::CProfileIterator( CProfileNode * start ) +{ + CurrentParent = start; + CurrentChild = CurrentParent->Get_Child(); +} + + +void CProfileIterator::First(void) +{ + CurrentChild = CurrentParent->Get_Child(); +} + + +void CProfileIterator::Next(void) +{ + CurrentChild = CurrentChild->Get_Sibling(); +} + + +bool CProfileIterator::Is_Done(void) +{ + return CurrentChild == NULL; +} + + +void CProfileIterator::Enter_Child( int index ) +{ + CurrentChild = CurrentParent->Get_Child(); + while ( (CurrentChild != NULL) && (index != 0) ) { + index--; + CurrentChild = CurrentChild->Get_Sibling(); + } + + if ( CurrentChild != NULL ) { + CurrentParent = CurrentChild; + CurrentChild = CurrentParent->Get_Child(); + } +} + + +void CProfileIterator::Enter_Parent( void ) +{ + if ( CurrentParent->Get_Parent() != NULL ) { + CurrentParent = CurrentParent->Get_Parent(); + } + CurrentChild = CurrentParent->Get_Child(); +} + + +/*************************************************************************************************** +** +** CProfileManager +** +***************************************************************************************************/ + +CProfileNode CProfileManager::Root( "Root", NULL ); +CProfileNode * CProfileManager::CurrentNode = &CProfileManager::Root; +int CProfileManager::FrameCounter = 0; +unsigned long int CProfileManager::ResetTime = 0; + + +/*********************************************************************************************** + * CProfileManager::Start_Profile -- Begin a named profile * + * * + * Steps one level deeper into the tree, if a child already exists with the specified name * + * then it accumulates the profiling; otherwise a new child node is added to the profile tree. * + * * + * INPUT: * + * name - name of this profiling record * + * * + * WARNINGS: * + * The string used is assumed to be a static string; pointer compares are used throughout * + * the profiling code for efficiency. * + *=============================================================================================*/ +void CProfileManager::Start_Profile( const char * name ) +{ + if (name != CurrentNode->Get_Name()) { + CurrentNode = CurrentNode->Get_Sub_Node( name ); + } + + CurrentNode->Call(); +} + + +/*********************************************************************************************** + * CProfileManager::Stop_Profile -- Stop timing and record the results. * + *=============================================================================================*/ +void CProfileManager::Stop_Profile( void ) +{ + // Return will indicate whether we should back up to our parent (we may + // be profiling a recursive function) + if (CurrentNode->Return()) { + CurrentNode = CurrentNode->Get_Parent(); + } +} + + +/*********************************************************************************************** + * CProfileManager::Reset -- Reset the contents of the profiling system * + * * + * This resets everything except for the tree structure. All of the timing data is reset. * + *=============================================================================================*/ +void CProfileManager::Reset( void ) +{ + gProfileClock.reset(); + Root.Reset(); + Root.Call(); + FrameCounter = 0; + Profile_Get_Ticks(&ResetTime); +} + + +/*********************************************************************************************** + * CProfileManager::Increment_Frame_Counter -- Increment the frame counter * + *=============================================================================================*/ +void CProfileManager::Increment_Frame_Counter( void ) +{ + FrameCounter++; +} + + +/*********************************************************************************************** + * CProfileManager::Get_Time_Since_Reset -- returns the elapsed time since last reset * + *=============================================================================================*/ +float CProfileManager::Get_Time_Since_Reset( void ) +{ + unsigned long int time; + Profile_Get_Ticks(&time); + time -= ResetTime; + return (float)time / Profile_Get_Tick_Rate(); +} + +#include + +void CProfileManager::dumpRecursive(CProfileIterator* profileIterator, int spacing) +{ + profileIterator->First(); + if (profileIterator->Is_Done()) + return; + + float accumulated_time=0,parent_time = profileIterator->Is_Root() ? CProfileManager::Get_Time_Since_Reset() : profileIterator->Get_Current_Parent_Total_Time(); + int i; + int frames_since_reset = CProfileManager::Get_Frame_Count_Since_Reset(); + for (i=0;iGet_Current_Parent_Name(), parent_time ); + float totalTime = 0.f; + + + int numChildren = 0; + + for (i = 0; !profileIterator->Is_Done(); i++,profileIterator->Next()) + { + numChildren++; + float current_total_time = profileIterator->Get_Current_Total_Time(); + accumulated_time += current_total_time; + float fraction = parent_time > SIMD_EPSILON ? (current_total_time / parent_time) * 100 : 0.f; + { + int i; for (i=0;iGet_Current_Name(), fraction,(current_total_time / (double)frames_since_reset),profileIterator->Get_Current_Total_Calls()); + totalTime += current_total_time; + //recurse into children + } + + if (parent_time < accumulated_time) + { + printf("what's wrong\n"); + } + for (i=0;i SIMD_EPSILON ? ((parent_time - accumulated_time) / parent_time) * 100 : 0.f, parent_time - accumulated_time); + + for (i=0;iEnter_Child(i); + dumpRecursive(profileIterator,spacing+3); + profileIterator->Enter_Parent(); + } +} + + + +void CProfileManager::dumpAll() +{ + CProfileIterator* profileIterator = 0; + profileIterator = CProfileManager::Get_Iterator(); + + dumpRecursive(profileIterator,0); + + CProfileManager::Release_Iterator(profileIterator); +} + + + + +#endif //BT_NO_PROFILE diff --git a/opencl/parallel_primitives/host/btQuickprof.h b/opencl/parallel_primitives/host/btQuickprof.h new file mode 100644 index 000000000..93f3f4a60 --- /dev/null +++ b/opencl/parallel_primitives/host/btQuickprof.h @@ -0,0 +1,203 @@ + +/*************************************************************************************************** +** +** Real-Time Hierarchical Profiling for Game Programming Gems 3 +** +** by Greg Hjelstrom & Byon Garrabrant +** +***************************************************************************************************/ + +// Credits: The Clock class was inspired by the Timer classes in +// Ogre (www.ogre3d.org). + + + +#ifndef BT_QUICK_PROF_H +#define BT_QUICK_PROF_H + +//To disable built-in profiling, please comment out next line +//#define BT_NO_PROFILE 1 +#ifndef BT_NO_PROFILE +#include //@todo remove this, backwards compatibility +#include "btScalar.h" +#include "btAlignedAllocator.h" +#include + + + + + +#define USE_BT_CLOCK 1 + +#ifdef USE_BT_CLOCK + +///The btClock is a portable basic clock that measures accurate time in seconds, use for profiling. +class btClock +{ +public: + btClock(); + + btClock(const btClock& other); + btClock& operator=(const btClock& other); + + ~btClock(); + + /// Resets the initial reference time. + void reset(); + + /// Returns the time in ms since the last call to reset or since + /// the btClock was created. + unsigned long int getTimeMilliseconds(); + + /// Returns the time in us since the last call to reset or since + /// the Clock was created. + unsigned long int getTimeMicroseconds(); +private: + struct btClockData* m_data; +}; + +#endif //USE_BT_CLOCK + + + + +///A node in the Profile Hierarchy Tree +class CProfileNode { + +public: + CProfileNode( const char * name, CProfileNode * parent ); + ~CProfileNode( void ); + + CProfileNode * Get_Sub_Node( const char * name ); + + CProfileNode * Get_Parent( void ) { return Parent; } + CProfileNode * Get_Sibling( void ) { return Sibling; } + CProfileNode * Get_Child( void ) { return Child; } + + void CleanupMemory(); + void Reset( void ); + void Call( void ); + bool Return( void ); + + const char * Get_Name( void ) { return Name; } + int Get_Total_Calls( void ) { return TotalCalls; } + float Get_Total_Time( void ) { return TotalTime; } + void* GetUserPointer() const {return m_userPtr;} + void SetUserPointer(void* ptr) { m_userPtr = ptr;} +protected: + + const char * Name; + int TotalCalls; + float TotalTime; + unsigned long int StartTime; + int RecursionCounter; + + CProfileNode * Parent; + CProfileNode * Child; + CProfileNode * Sibling; + void* m_userPtr; +}; + +///An iterator to navigate through the tree +class CProfileIterator +{ +public: + // Access all the children of the current parent + void First(void); + void Next(void); + bool Is_Done(void); + bool Is_Root(void) { return (CurrentParent->Get_Parent() == 0); } + + void Enter_Child( int index ); // Make the given child the new parent + void Enter_Largest_Child( void ); // Make the largest child the new parent + void Enter_Parent( void ); // Make the current parent's parent the new parent + + // Access the current child + const char * Get_Current_Name( void ) { return CurrentChild->Get_Name(); } + int Get_Current_Total_Calls( void ) { return CurrentChild->Get_Total_Calls(); } + float Get_Current_Total_Time( void ) { return CurrentChild->Get_Total_Time(); } + + void* Get_Current_UserPointer( void ) { return CurrentChild->GetUserPointer(); } + void Set_Current_UserPointer(void* ptr) {CurrentChild->SetUserPointer(ptr);} + // Access the current parent + const char * Get_Current_Parent_Name( void ) { return CurrentParent->Get_Name(); } + int Get_Current_Parent_Total_Calls( void ) { return CurrentParent->Get_Total_Calls(); } + float Get_Current_Parent_Total_Time( void ) { return CurrentParent->Get_Total_Time(); } + + + +protected: + + CProfileNode * CurrentParent; + CProfileNode * CurrentChild; + + + CProfileIterator( CProfileNode * start ); + friend class CProfileManager; +}; + + +///The Manager for the Profile system +class CProfileManager { +public: + static void Start_Profile( const char * name ); + static void Stop_Profile( void ); + + static void CleanupMemory(void) + { + Root.CleanupMemory(); + } + + static void Reset( void ); + static void Increment_Frame_Counter( void ); + static int Get_Frame_Count_Since_Reset( void ) { return FrameCounter; } + static float Get_Time_Since_Reset( void ); + + static CProfileIterator * Get_Iterator( void ) + { + + return new CProfileIterator( &Root ); + } + static void Release_Iterator( CProfileIterator * iterator ) { delete ( iterator); } + + static void dumpRecursive(CProfileIterator* profileIterator, int spacing); + + static void dumpAll(); + +private: + static CProfileNode Root; + static CProfileNode * CurrentNode; + static int FrameCounter; + static unsigned long int ResetTime; +}; + + +///ProfileSampleClass is a simple way to profile a function's scope +///Use the BT_PROFILE macro at the start of scope to time +class CProfileSample { +public: + CProfileSample( const char * name ) + { + CProfileManager::Start_Profile( name ); + } + + ~CProfileSample( void ) + { + CProfileManager::Stop_Profile(); + } +}; + + +#define BT_PROFILE( name ) CProfileSample __profile( name ) + +#else + +#define BT_PROFILE( name ) + +#endif //#ifndef BT_NO_PROFILE + + + +#endif //BT_QUICK_PROF_H + + diff --git a/opencl/parallel_primitives/host/btRadixSort32CL.cpp b/opencl/parallel_primitives/host/btRadixSort32CL.cpp new file mode 100644 index 000000000..6d007fef2 --- /dev/null +++ b/opencl/parallel_primitives/host/btRadixSort32CL.cpp @@ -0,0 +1,712 @@ + +#include "btRadixSort32CL.h" +#include "btLauncherCL.h" +#include "../../basic_initialize/btOpenCLUtils.h" +#include "btPrefixScanCL.h" +#include "btFillCL.h" + +#define RADIXSORT32_PATH "opencl/parallel_primitives/kernels/RadixSort32Kernels.cl" + +#include "../kernels/RadixSort32KernelsCL.h" + +btRadixSort32CL::btRadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity) +:m_commandQueue(queue) +{ + btOpenCLDeviceInfo info; + btOpenCLUtils::getDeviceInfo(device,&info); + m_deviceCPU = (info.m_deviceType & CL_DEVICE_TYPE_CPU)!=0; + + m_workBuffer1 = new btOpenCLArray(ctx,queue); + m_workBuffer2 = new btOpenCLArray(ctx,queue); + m_workBuffer3 = new btOpenCLArray(ctx,queue); + m_workBuffer3a = new btOpenCLArray(ctx,queue); + m_workBuffer4 = new btOpenCLArray(ctx,queue); + m_workBuffer4a = new btOpenCLArray(ctx,queue); + + + if (initialCapacity>0) + { + m_workBuffer1->resize(initialCapacity); + m_workBuffer3->resize(initialCapacity); + m_workBuffer3a->resize(initialCapacity); + m_workBuffer4->resize(initialCapacity); + m_workBuffer4a->resize(initialCapacity); + } + + m_scan = new btPrefixScanCL(ctx,device,queue); + m_fill = new btFillCL(ctx,device,queue); + + const char* additionalMacros = ""; + const char* srcFileNameForCaching=""; + + cl_int pErrNum; + const char* kernelSource = radixSort32KernelsCL; + + cl_program sortProg = btOpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, RADIXSORT32_PATH); + btAssert(sortProg); + + m_streamCountSortDataKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "StreamCountSortDataKernel", &pErrNum, sortProg,additionalMacros ); + btAssert(m_streamCountSortDataKernel ); + + + + m_streamCountKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "StreamCountKernel", &pErrNum, sortProg,additionalMacros ); + btAssert(m_streamCountKernel); + + + + if (m_deviceCPU) + { + + m_sortAndScatterSortDataKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterSortDataKernelSerial", &pErrNum, sortProg,additionalMacros ); + btAssert(m_sortAndScatterSortDataKernel); + m_sortAndScatterKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterKernelSerial", &pErrNum, sortProg,additionalMacros ); + btAssert(m_sortAndScatterKernel); + } else + { + m_sortAndScatterSortDataKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterSortDataKernel", &pErrNum, sortProg,additionalMacros ); + btAssert(m_sortAndScatterSortDataKernel); + m_sortAndScatterKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterKernel", &pErrNum, sortProg,additionalMacros ); + btAssert(m_sortAndScatterKernel); + } + + m_prefixScanKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "PrefixScanKernel", &pErrNum, sortProg,additionalMacros ); + btAssert(m_prefixScanKernel); + +} + +btRadixSort32CL::~btRadixSort32CL() +{ + delete m_scan; + delete m_fill; + delete m_workBuffer1; + delete m_workBuffer2; + delete m_workBuffer3; + delete m_workBuffer3a; + delete m_workBuffer4; + delete m_workBuffer4a; + + clReleaseKernel(m_streamCountSortDataKernel); + clReleaseKernel(m_streamCountKernel); + clReleaseKernel(m_sortAndScatterSortDataKernel); + clReleaseKernel(m_sortAndScatterKernel); + clReleaseKernel(m_prefixScanKernel); +} + +void btRadixSort32CL::executeHost(btAlignedObjectArray& inout, int sortBits /* = 32 */) +{ + int n = inout.size(); + const int BITS_PER_PASS = 8; + const int NUM_TABLES = (1< workbuffer; + workbuffer.resize(inout.size()); + btSortData* dst = &workbuffer[0]; + + int count=0; + for(int startBit=0; startBit> startBit) & (NUM_TABLES-1); + tables[tableIdx]++; + } +//#define TEST +#ifdef TEST + printf("histogram size=%d\n",NUM_TABLES); + for (int i=0;i> startBit) & (NUM_TABLES-1); + + dst[tables[tableIdx] + counter[tableIdx]] = src[i]; + counter[tableIdx] ++; + } + + btSwap( src, dst ); + count++; + } + + if (count&1) + { + btAssert(0);//need to copy + + } +} + +void btRadixSort32CL::executeHost(btOpenCLArray& keyValuesInOut, int sortBits /* = 32 */) +{ + + btAlignedObjectArray inout; + keyValuesInOut.copyToHost(inout); + + executeHost(inout,sortBits); + + keyValuesInOut.copyFromHost(inout); +} + +void btRadixSort32CL::execute(btOpenCLArray& keysIn, btOpenCLArray& keysOut, btOpenCLArray& valuesIn, + btOpenCLArray& valuesOut, int n, int sortBits) +{ + +} + +//#define DEBUG_RADIXSORT +//#define DEBUG_RADIXSORT2 + + +void btRadixSort32CL::execute(btOpenCLArray& keyValuesInOut, int sortBits /* = 32 */) +{ + + int originalSize = keyValuesInOut.size(); + int workingSize = originalSize; + + + int dataAlignment = DATA_ALIGNMENT; + +#ifdef DEBUG_RADIXSORT2 + btAlignedObjectArray test2; + keyValuesInOut.copyToHost(test2); + printf("numElem = %d\n",test2.size()); + for (int i=0;i* src = 0; + + if (workingSize%dataAlignment) + { + workingSize += dataAlignment-(workingSize%dataAlignment); + m_workBuffer4->copyFromOpenCLArray(keyValuesInOut); + m_workBuffer4->resize(workingSize); + btSortData fillValue; + fillValue.m_key = 0xffffffff; + fillValue.m_value = 0xffffffff; + +#define USE_BTFILL +#ifdef USE_BTFILL + m_fill->execute((btOpenCLArray&)*m_workBuffer4,(btInt2&)fillValue,workingSize-originalSize,originalSize); +#else + //fill the remaining bits (very slow way, todo: fill on GPU/OpenCL side) + + for (int i=originalSize; icopyFromHostPointer(&fillValue,1,i); + } +#endif//USE_BTFILL + + src = m_workBuffer4; + } else + { + src = &keyValuesInOut; + m_workBuffer4->resize(0); + } + + btAssert( workingSize%DATA_ALIGNMENT == 0 ); + int minCap = NUM_BUCKET*NUM_WGS; + + + int n = workingSize; + + m_workBuffer1->resize(minCap); + m_workBuffer3->resize(workingSize); + + +// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 ); + btAssert( BITS_PER_PASS == 4 ); + btAssert( WG_SIZE == 64 ); + btAssert( (sortBits&0x3) == 0 ); + + + + btOpenCLArray* dst = m_workBuffer3; + + btOpenCLArray* srcHisto = m_workBuffer1; + btOpenCLArray* destHisto = m_workBuffer2; + + + int nWGs = NUM_WGS; + btConstData cdata; + + { + int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256 + int nBlocks = (n+blockSize-1)/(blockSize); + cdata.m_n = n; + cdata.m_nWGs = NUM_WGS; + cdata.m_startBit = 0; + cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs; + if( nBlocks < NUM_WGS ) + { + cdata.m_nBlocksPerWG = 1; + nWGs = nBlocks; + } + } + + int count=0; + for(int ib=0; ibsize()) + { + btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( srcHisto->getBufferCL() ) }; + btLauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel); + + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); + launcher.setConst( cdata ); + + int num = NUM_WGS*WG_SIZE; + launcher.launch1D( num, WG_SIZE ); + } + + + +#ifdef DEBUG_RADIXSORT + btAlignedObjectArray testHist; + srcHisto->copyToHost(testHist); + printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size()); + for (int i=0;igetBufferCL() ) }; + btLauncherCL launcher( m_commandQueue, m_prefixScanKernel ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); + launcher.setConst( cdata ); + launcher.launch1D( 128, 128 ); + destHisto = srcHisto; + }else + { + //unsigned int sum; //for debugging + m_scan->execute(*srcHisto,*destHisto,1920,0);//,&sum); + } + + +#ifdef DEBUG_RADIXSORT + destHisto->copyToHost(testHist); + printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size()); + for (int i=0;isize()) + {// local sort and distribute + btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( destHisto->getBufferCL(), true ), btBufferInfoCL( dst->getBufferCL() )}; + btLauncherCL launcher( m_commandQueue, m_sortAndScatterSortDataKernel ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); + launcher.setConst( cdata ); + launcher.launch1D( nWGs*WG_SIZE, WG_SIZE ); + + } +#else + { +#define NUM_TABLES 16 +//#define SEQUENTIAL +#ifdef SEQUENTIAL + int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + int tables[NUM_TABLES]; + int startBit = ib; + + destHisto->copyToHost(testHist); + btAlignedObjectArray srcHost; + btAlignedObjectArray dstHost; + dstHost.resize(src->size()); + + src->copyToHost(srcHost); + + for (int i=0;i> startBit) & (NUM_TABLES-1); + + dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i]; + counter2[tableIdx] ++; + } + + +#else + + int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + + int tables[NUM_TABLES]; + btAlignedObjectArray dstHostOK; + dstHostOK.resize(src->size()); + + destHisto->copyToHost(testHist); + btAlignedObjectArray srcHost; + src->copyToHost(srcHost); + + int blockSize = 256; + int nBlocksPerWG = cdata.m_nBlocksPerWG; + int startBit = ib; + + { + for (int i=0;i> startBit) & (NUM_TABLES-1); + + dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i]; + counter2[tableIdx] ++; + } + + + } + + + btAlignedObjectArray dstHost; + dstHost.resize(src->size()); + + + int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + + + + for (int wgIdx=0;wgIdx> startBit) & (NUM_TABLES-1); + + int destIndex = testHist[tableIdx*NUM_WGS+wgIdx] + counter[tableIdx]; + + btSortData ok = dstHostOK[destIndex]; + + if (ok.m_key != srcHost[i].m_key) + { + printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key,srcHost[i].m_key ); + printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value,srcHost[i].m_value ); + } + if (ok.m_value != srcHost[i].m_value) + { + + printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value,srcHost[i].m_value ); + printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key,srcHost[i].m_key ); + + } + + dstHost[destIndex] = srcHost[i]; + counter[tableIdx] ++; + + } + } + } + } + } + + +#endif //SEQUENTIAL + + dst->copyFromHost(dstHost); + } +#endif//USE_GPU + + + +#ifdef DEBUG_RADIXSORT + destHisto->copyToHost(testHist); + printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size()); + for (int i=0;isize()) + { + m_workBuffer4->resize(originalSize); + keyValuesInOut.copyFromOpenCLArray(*m_workBuffer4); + } + + +#ifdef DEBUG_RADIXSORT + keyValuesInOut.copyToHost(test2); + + printf("numElem = %d\n",test2.size()); + for (int i=0;i& keysInOut, int sortBits /* = 32 */) +{ + int originalSize = keysInOut.size(); + int workingSize = originalSize; + + + int dataAlignment = DATA_ALIGNMENT; + + btOpenCLArray* src = 0; + + if (workingSize%dataAlignment) + { + workingSize += dataAlignment-(workingSize%dataAlignment); + m_workBuffer4a->copyFromOpenCLArray(keysInOut); + m_workBuffer4a->resize(workingSize); + unsigned int fillValue = 0xffffffff; + + m_fill->execute(*m_workBuffer4a,fillValue,workingSize-originalSize,originalSize); + + src = m_workBuffer4a; + } else + { + src = &keysInOut; + m_workBuffer4a->resize(0); + } + + + + btAssert( workingSize%DATA_ALIGNMENT == 0 ); + int minCap = NUM_BUCKET*NUM_WGS; + + + int n = workingSize; + + + m_workBuffer1->resize(minCap); + m_workBuffer3->resize(workingSize); + m_workBuffer3a->resize(workingSize); + +// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 ); + btAssert( BITS_PER_PASS == 4 ); + btAssert( WG_SIZE == 64 ); + btAssert( (sortBits&0x3) == 0 ); + + + + btOpenCLArray* dst = m_workBuffer3a; + + btOpenCLArray* srcHisto = m_workBuffer1; + btOpenCLArray* destHisto = m_workBuffer2; + + + int nWGs = NUM_WGS; + btConstData cdata; + + { + int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256 + int nBlocks = (n+blockSize-1)/(blockSize); + cdata.m_n = n; + cdata.m_nWGs = NUM_WGS; + cdata.m_startBit = 0; + cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs; + if( nBlocks < NUM_WGS ) + { + cdata.m_nBlocksPerWG = 1; + nWGs = nBlocks; + } + } + + int count=0; + for(int ib=0; ibsize()) + { + btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( srcHisto->getBufferCL() ) }; + btLauncherCL launcher(m_commandQueue, m_streamCountKernel); + + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); + launcher.setConst( cdata ); + + int num = NUM_WGS*WG_SIZE; + launcher.launch1D( num, WG_SIZE ); + } + + + +//fast prefix scan is not working properly on Mac OSX yet +#ifdef _WIN32 + bool fastScan=!m_deviceCPU; + +#else + bool fastScan=false; +#endif + + if (fastScan) + {// prefix scan group histogram + btBufferInfoCL bInfo[] = { btBufferInfoCL( srcHisto->getBufferCL() ) }; + btLauncherCL launcher( m_commandQueue, m_prefixScanKernel ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); + launcher.setConst( cdata ); + launcher.launch1D( 128, 128 ); + destHisto = srcHisto; + }else + { + //unsigned int sum; //for debugging + m_scan->execute(*srcHisto,*destHisto,1920,0);//,&sum); + } + + if (src->size()) + {// local sort and distribute + btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( destHisto->getBufferCL(), true ), btBufferInfoCL( dst->getBufferCL() )}; + btLauncherCL launcher( m_commandQueue, m_sortAndScatterKernel ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); + launcher.setConst( cdata ); + launcher.launch1D( nWGs*WG_SIZE, WG_SIZE ); + + } + + btSwap(src, dst ); + btSwap(srcHisto,destHisto); + + count++; + } + + if (count&1) + { + btAssert(0);//need to copy from workbuffer to keyValuesInOut + } + + if (m_workBuffer4a->size()) + { + m_workBuffer4a->resize(originalSize); + keysInOut.copyFromOpenCLArray(*m_workBuffer4a); + } + +} + + + + + + + diff --git a/opencl/parallel_primitives/host/btRadixSort32CL.h b/opencl/parallel_primitives/host/btRadixSort32CL.h new file mode 100644 index 000000000..4570303c6 --- /dev/null +++ b/opencl/parallel_primitives/host/btRadixSort32CL.h @@ -0,0 +1,85 @@ + +#ifndef BT_RADIXSORT32_H +#define BT_RADIXSORT32_H + +#include "btOpenCLArray.h" + +struct btSortData +{ + int m_key; + int m_value; +}; +#include "btBufferInfoCL.h" + +class btRadixSort32CL +{ + + btOpenCLArray* m_workBuffer1; + btOpenCLArray* m_workBuffer2; + + btOpenCLArray* m_workBuffer3; + btOpenCLArray* m_workBuffer4; + + btOpenCLArray* m_workBuffer3a; + btOpenCLArray* m_workBuffer4a; + + cl_command_queue m_commandQueue; + + cl_kernel m_streamCountSortDataKernel; + cl_kernel m_streamCountKernel; + + cl_kernel m_prefixScanKernel; + cl_kernel m_sortAndScatterSortDataKernel; + cl_kernel m_sortAndScatterKernel; + + + bool m_deviceCPU; + + class btPrefixScanCL* m_scan; + class btFillCL* m_fill; + +public: + struct btConstData + { + int m_n; + int m_nWGs; + int m_startBit; + int m_nBlocksPerWG; + }; + enum + { + DATA_ALIGNMENT = 256, + WG_SIZE = 64, + BLOCK_SIZE = 256, + ELEMENTS_PER_WORK_ITEM = (BLOCK_SIZE/WG_SIZE), + BITS_PER_PASS = 4, + NUM_BUCKET=(1<& keysIn, btOpenCLArray& keysOut, btOpenCLArray& valuesIn, + btOpenCLArray& valuesOut, int n, int sortBits = 32); + + ///keys only + void execute(btOpenCLArray& keysInOut, int sortBits = 32 ); + + void execute(btOpenCLArray& keyValuesInOut, int sortBits = 32 ); + void executeHost(btOpenCLArray& keyValuesInOut, int sortBits = 32); + void executeHost(btAlignedObjectArray& keyValuesInOut, int sortBits = 32); + +}; +#endif //BT_RADIXSORT32_H + diff --git a/opencl/parallel_primitives/host/btScalar.h b/opencl/parallel_primitives/host/btScalar.h new file mode 100644 index 000000000..3a94054e9 --- /dev/null +++ b/opencl/parallel_primitives/host/btScalar.h @@ -0,0 +1,660 @@ +/* +Copyright (c) 2003-2009 Erwin Coumans http://bullet.googlecode.com + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + + + +#ifndef BT_SCALAR_H +#define BT_SCALAR_H + +#ifdef BT_MANAGED_CODE +//Aligned data types not supported in managed code +#pragma unmanaged +#endif + + +#include +#include //size_t for MSVC 6.0 +#include + +/* SVN $Revision$ on $Date$ from http://bullet.googlecode.com*/ +#define BT_BULLET_VERSION 281 + +inline int btGetVersion() +{ + return BT_BULLET_VERSION; +} + +#if defined(DEBUG) || defined (_DEBUG) +#define BT_DEBUG +#endif + + +#ifdef _WIN32 + + #if defined(__MINGW32__) || defined(__CYGWIN__) || (defined (_MSC_VER) && _MSC_VER < 1300) + + #define SIMD_FORCE_INLINE inline + #define ATTRIBUTE_ALIGNED16(a) a + #define ATTRIBUTE_ALIGNED64(a) a + #define ATTRIBUTE_ALIGNED128(a) a + #else + //#define BT_HAS_ALIGNED_ALLOCATOR + #pragma warning(disable : 4324) // disable padding warning +// #pragma warning(disable:4530) // Disable the exception disable but used in MSCV Stl warning. +// #pragma warning(disable:4996) //Turn off warnings about deprecated C routines +// #pragma warning(disable:4786) // Disable the "debug name too long" warning + + #define SIMD_FORCE_INLINE __forceinline + #define ATTRIBUTE_ALIGNED16(a) __declspec(align(16)) a + #define ATTRIBUTE_ALIGNED64(a) __declspec(align(64)) a + #define ATTRIBUTE_ALIGNED128(a) __declspec (align(128)) a + #ifdef _XBOX + #define BT_USE_VMX128 + + #include + #define BT_HAVE_NATIVE_FSEL + #define btFsel(a,b,c) __fsel((a),(b),(c)) + #else + +#if (defined (_WIN32) && (_MSC_VER) && _MSC_VER >= 1400) && (!defined (BT_USE_DOUBLE_PRECISION)) + #define BT_USE_SSE + #ifdef BT_USE_SSE + //BT_USE_SSE_IN_API is disabled under Windows by default, because + //it makes it harder to integrate Bullet into your application under Windows + //(structured embedding Bullet structs/classes need to be 16-byte aligned) + //with relatively little performance gain + //If you are not embedded Bullet data in your classes, or make sure that you align those classes on 16-byte boundaries + //you can manually enable this line or set it in the build system for a bit of performance gain (a few percent, dependent on usage) + //#define BT_USE_SSE_IN_API + #endif //BT_USE_SSE + #include +#endif + + #endif//_XBOX + + #endif //__MINGW32__ + +#ifdef BT_DEBUG + #ifdef _MSC_VER + #include + #define btAssert(x) { if(!(x)){printf("Assert "__FILE__ ":%u ("#x")\n", __LINE__);__debugbreak(); }} + #else//_MSC_VER + #include + #define btAssert assert + #endif//_MSC_VER +#else + #define btAssert(x) +#endif + //btFullAssert is optional, slows down a lot + #define btFullAssert(x) + + #define btLikely(_c) _c + #define btUnlikely(_c) _c + +#else + +#if defined (__CELLOS_LV2__) + #define SIMD_FORCE_INLINE inline __attribute__((always_inline)) + #define ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16))) + #define ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64))) + #define ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128))) + #ifndef assert + #include + #endif +#ifdef BT_DEBUG +#ifdef __SPU__ +#include +#define printf spu_printf + #define btAssert(x) {if(!(x)){printf("Assert "__FILE__ ":%u ("#x")\n", __LINE__);spu_hcmpeq(0,0);}} +#else + #define btAssert assert +#endif + +#else + #define btAssert(x) +#endif + //btFullAssert is optional, slows down a lot + #define btFullAssert(x) + + #define btLikely(_c) _c + #define btUnlikely(_c) _c + +#else + +#ifdef USE_LIBSPE2 + + #define SIMD_FORCE_INLINE __inline + #define ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16))) + #define ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64))) + #define ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128))) + #ifndef assert + #include + #endif +#ifdef BT_DEBUG + #define btAssert assert +#else + #define btAssert(x) +#endif + //btFullAssert is optional, slows down a lot + #define btFullAssert(x) + + + #define btLikely(_c) __builtin_expect((_c), 1) + #define btUnlikely(_c) __builtin_expect((_c), 0) + + +#else + //non-windows systems + +#if (defined (__APPLE__) && (!defined (BT_USE_DOUBLE_PRECISION))) + #if defined (__i386__) || defined (__x86_64__) + #define BT_USE_SSE + //BT_USE_SSE_IN_API is enabled on Mac OSX by default, because memory is automatically aligned on 16-byte boundaries + //if apps run into issues, we will disable the next line + #define BT_USE_SSE_IN_API + #ifdef BT_USE_SSE + // include appropriate SSE level + #if defined (__SSE4_1__) + #include + #elif defined (__SSSE3__) + #include + #elif defined (__SSE3__) + #include + #else + #include + #endif + #endif //BT_USE_SSE + #elif defined( __armv7__ ) + #ifdef __clang__ + #define BT_USE_NEON 1 + + #if defined BT_USE_NEON && defined (__clang__) + #include + #endif//BT_USE_NEON + #endif //__clang__ + #endif//__arm__ + + #define SIMD_FORCE_INLINE inline __attribute__ ((always_inline)) +///@todo: check out alignment methods for other platforms/compilers + #define ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16))) + #define ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64))) + #define ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128))) + #ifndef assert + #include + #endif + + #if defined(DEBUG) || defined (_DEBUG) + #if defined (__i386__) || defined (__x86_64__) + #include + #define btAssert(x)\ + {\ + if(!(x))\ + {\ + printf("Assert %s in line %d, file %s\n",#x, __LINE__, __FILE__);\ + asm volatile ("int3");\ + }\ + } + #else//defined (__i386__) || defined (__x86_64__) + #define btAssert assert + #endif//defined (__i386__) || defined (__x86_64__) + #else//defined(DEBUG) || defined (_DEBUG) + #define btAssert(x) + #endif//defined(DEBUG) || defined (_DEBUG) + + //btFullAssert is optional, slows down a lot + #define btFullAssert(x) + #define btLikely(_c) _c + #define btUnlikely(_c) _c + +#else + + #define SIMD_FORCE_INLINE inline + ///@todo: check out alignment methods for other platforms/compilers + ///#define ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16))) + ///#define ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64))) + ///#define ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128))) + #define ATTRIBUTE_ALIGNED16(a) a + #define ATTRIBUTE_ALIGNED64(a) a + #define ATTRIBUTE_ALIGNED128(a) a + #ifndef assert + #include + #endif + +#if defined(DEBUG) || defined (_DEBUG) + #define btAssert assert +#else + #define btAssert(x) +#endif + + //btFullAssert is optional, slows down a lot + #define btFullAssert(x) + #define btLikely(_c) _c + #define btUnlikely(_c) _c +#endif //__APPLE__ + +#endif // LIBSPE2 + +#endif //__CELLOS_LV2__ +#endif + + +///The btScalar type abstracts floating point numbers, to easily switch between double and single floating point precision. +#if defined(BT_USE_DOUBLE_PRECISION) +typedef double btScalar; +//this number could be bigger in double precision +#define BT_LARGE_FLOAT 1e30 +#else +typedef float btScalar; +//keep BT_LARGE_FLOAT*BT_LARGE_FLOAT < FLT_MAX +#define BT_LARGE_FLOAT 1e18f +#endif + +#ifdef BT_USE_SSE +typedef __m128 btSimdFloat4; +#endif//BT_USE_SSE + +#if defined BT_USE_SSE_IN_API && defined (BT_USE_SSE) +#ifdef _WIN32 + +#ifndef BT_NAN +static int btNanMask = 0x7F800001; +#define BT_NAN (*(float*)&btNanMask) +#endif + +#ifndef BT_INFINITY +static int btInfinityMask = 0x7F800000; +#define BT_INFINITY (*(float*)&btInfinityMask) +#endif + +inline __m128 operator + (const __m128 A, const __m128 B) +{ + return _mm_add_ps(A, B); +} + +inline __m128 operator - (const __m128 A, const __m128 B) +{ + return _mm_sub_ps(A, B); +} + +inline __m128 operator * (const __m128 A, const __m128 B) +{ + return _mm_mul_ps(A, B); +} + +#define btCastfTo128i(a) (_mm_castps_si128(a)) +#define btCastfTo128d(a) (_mm_castps_pd(a)) +#define btCastiTo128f(a) (_mm_castsi128_ps(a)) +#define btCastdTo128f(a) (_mm_castpd_ps(a)) +#define btCastdTo128i(a) (_mm_castpd_si128(a)) +#define btAssign128(r0,r1,r2,r3) _mm_setr_ps(r0,r1,r2,r3) + +#else//_WIN32 + +#define btCastfTo128i(a) ((__m128i)(a)) +#define btCastfTo128d(a) ((__m128d)(a)) +#define btCastiTo128f(a) ((__m128) (a)) +#define btCastdTo128f(a) ((__m128) (a)) +#define btCastdTo128i(a) ((__m128i)(a)) +#define btAssign128(r0,r1,r2,r3) (__m128){r0,r1,r2,r3} +#define BT_INFINITY INFINITY +#define BT_NAN NAN +#endif//_WIN32 +#endif //BT_USE_SSE_IN_API + +#ifdef BT_USE_NEON +#include + +typedef float32x4_t btSimdFloat4; +#define BT_INFINITY INFINITY +#define BT_NAN NAN +#define btAssign128(r0,r1,r2,r3) (float32x4_t){r0,r1,r2,r3} +#endif + + + + + +#define BT_DECLARE_ALIGNED_ALLOCATOR() \ + SIMD_FORCE_INLINE void* operator new(size_t sizeInBytes) { return btAlignedAlloc(sizeInBytes,16); } \ + SIMD_FORCE_INLINE void operator delete(void* ptr) { btAlignedFree(ptr); } \ + SIMD_FORCE_INLINE void* operator new(size_t, void* ptr) { return ptr; } \ + SIMD_FORCE_INLINE void operator delete(void*, void*) { } \ + SIMD_FORCE_INLINE void* operator new[](size_t sizeInBytes) { return btAlignedAlloc(sizeInBytes,16); } \ + SIMD_FORCE_INLINE void operator delete[](void* ptr) { btAlignedFree(ptr); } \ + SIMD_FORCE_INLINE void* operator new[](size_t, void* ptr) { return ptr; } \ + SIMD_FORCE_INLINE void operator delete[](void*, void*) { } \ + + + +#if defined(BT_USE_DOUBLE_PRECISION) || defined(BT_FORCE_DOUBLE_FUNCTIONS) + +SIMD_FORCE_INLINE btScalar btSqrt(btScalar x) { return sqrt(x); } +SIMD_FORCE_INLINE btScalar btFabs(btScalar x) { return fabs(x); } +SIMD_FORCE_INLINE btScalar btCos(btScalar x) { return cos(x); } +SIMD_FORCE_INLINE btScalar btSin(btScalar x) { return sin(x); } +SIMD_FORCE_INLINE btScalar btTan(btScalar x) { return tan(x); } +SIMD_FORCE_INLINE btScalar btAcos(btScalar x) { if (xbtScalar(1)) x=btScalar(1); return acos(x); } +SIMD_FORCE_INLINE btScalar btAsin(btScalar x) { if (xbtScalar(1)) x=btScalar(1); return asin(x); } +SIMD_FORCE_INLINE btScalar btAtan(btScalar x) { return atan(x); } +SIMD_FORCE_INLINE btScalar btAtan2(btScalar x, btScalar y) { return atan2(x, y); } +SIMD_FORCE_INLINE btScalar btExp(btScalar x) { return exp(x); } +SIMD_FORCE_INLINE btScalar btLog(btScalar x) { return log(x); } +SIMD_FORCE_INLINE btScalar btPow(btScalar x,btScalar y) { return pow(x,y); } +SIMD_FORCE_INLINE btScalar btFmod(btScalar x,btScalar y) { return fmod(x,y); } + +#else + +SIMD_FORCE_INLINE btScalar btSqrt(btScalar y) +{ +#ifdef USE_APPROXIMATION + double x, z, tempf; + unsigned long *tfptr = ((unsigned long *)&tempf) + 1; + + tempf = y; + *tfptr = (0xbfcdd90a - *tfptr)>>1; /* estimate of 1/sqrt(y) */ + x = tempf; + z = y*btScalar(0.5); + x = (btScalar(1.5)*x)-(x*x)*(x*z); /* iteration formula */ + x = (btScalar(1.5)*x)-(x*x)*(x*z); + x = (btScalar(1.5)*x)-(x*x)*(x*z); + x = (btScalar(1.5)*x)-(x*x)*(x*z); + x = (btScalar(1.5)*x)-(x*x)*(x*z); + return x*y; +#else + return sqrtf(y); +#endif +} +SIMD_FORCE_INLINE btScalar btFabs(btScalar x) { return fabsf(x); } +SIMD_FORCE_INLINE btScalar btCos(btScalar x) { return cosf(x); } +SIMD_FORCE_INLINE btScalar btSin(btScalar x) { return sinf(x); } +SIMD_FORCE_INLINE btScalar btTan(btScalar x) { return tanf(x); } +SIMD_FORCE_INLINE btScalar btAcos(btScalar x) { + if (xbtScalar(1)) + x=btScalar(1); + return acosf(x); +} +SIMD_FORCE_INLINE btScalar btAsin(btScalar x) { + if (xbtScalar(1)) + x=btScalar(1); + return asinf(x); +} +SIMD_FORCE_INLINE btScalar btAtan(btScalar x) { return atanf(x); } +SIMD_FORCE_INLINE btScalar btAtan2(btScalar x, btScalar y) { return atan2f(x, y); } +SIMD_FORCE_INLINE btScalar btExp(btScalar x) { return expf(x); } +SIMD_FORCE_INLINE btScalar btLog(btScalar x) { return logf(x); } +SIMD_FORCE_INLINE btScalar btPow(btScalar x,btScalar y) { return powf(x,y); } +SIMD_FORCE_INLINE btScalar btFmod(btScalar x,btScalar y) { return fmodf(x,y); } + +#endif + +#define SIMD_2_PI btScalar(6.283185307179586232) +#define SIMD_PI (SIMD_2_PI * btScalar(0.5)) +#define SIMD_HALF_PI (SIMD_2_PI * btScalar(0.25)) +#define SIMD_RADS_PER_DEG (SIMD_2_PI / btScalar(360.0)) +#define SIMD_DEGS_PER_RAD (btScalar(360.0) / SIMD_2_PI) +#define SIMDSQRT12 btScalar(0.7071067811865475244008443621048490) + +#define btRecipSqrt(x) ((btScalar)(btScalar(1.0)/btSqrt(btScalar(x)))) /* reciprocal square root */ + + +#ifdef BT_USE_DOUBLE_PRECISION +#define SIMD_EPSILON DBL_EPSILON +#define SIMD_INFINITY DBL_MAX +#else +#define SIMD_EPSILON FLT_EPSILON +#define SIMD_INFINITY FLT_MAX +#endif + +SIMD_FORCE_INLINE btScalar btAtan2Fast(btScalar y, btScalar x) +{ + btScalar coeff_1 = SIMD_PI / 4.0f; + btScalar coeff_2 = 3.0f * coeff_1; + btScalar abs_y = btFabs(y); + btScalar angle; + if (x >= 0.0f) { + btScalar r = (x - abs_y) / (x + abs_y); + angle = coeff_1 - coeff_1 * r; + } else { + btScalar r = (x + abs_y) / (abs_y - x); + angle = coeff_2 - coeff_1 * r; + } + return (y < 0.0f) ? -angle : angle; +} + +SIMD_FORCE_INLINE bool btFuzzyZero(btScalar x) { return btFabs(x) < SIMD_EPSILON; } + +SIMD_FORCE_INLINE bool btEqual(btScalar a, btScalar eps) { + return (((a) <= eps) && !((a) < -eps)); +} +SIMD_FORCE_INLINE bool btGreaterEqual (btScalar a, btScalar eps) { + return (!((a) <= eps)); +} + + +SIMD_FORCE_INLINE int btIsNegative(btScalar x) { + return x < btScalar(0.0) ? 1 : 0; +} + +SIMD_FORCE_INLINE btScalar btRadians(btScalar x) { return x * SIMD_RADS_PER_DEG; } +SIMD_FORCE_INLINE btScalar btDegrees(btScalar x) { return x * SIMD_DEGS_PER_RAD; } + +#define BT_DECLARE_HANDLE(name) typedef struct name##__ { int unused; } *name + +#ifndef btFsel +SIMD_FORCE_INLINE btScalar btFsel(btScalar a, btScalar b, btScalar c) +{ + return a >= 0 ? b : c; +} +#endif +#define btFsels(a,b,c) (btScalar)btFsel(a,b,c) + + +SIMD_FORCE_INLINE bool btMachineIsLittleEndian() +{ + long int i = 1; + const char *p = (const char *) &i; + if (p[0] == 1) // Lowest address contains the least significant byte + return true; + else + return false; +} + + + +///btSelect avoids branches, which makes performance much better for consoles like Playstation 3 and XBox 360 +///Thanks Phil Knight. See also http://www.cellperformance.com/articles/2006/04/more_techniques_for_eliminatin_1.html +SIMD_FORCE_INLINE unsigned btSelect(unsigned condition, unsigned valueIfConditionNonZero, unsigned valueIfConditionZero) +{ + // Set testNz to 0xFFFFFFFF if condition is nonzero, 0x00000000 if condition is zero + // Rely on positive value or'ed with its negative having sign bit on + // and zero value or'ed with its negative (which is still zero) having sign bit off + // Use arithmetic shift right, shifting the sign bit through all 32 bits + unsigned testNz = (unsigned)(((int)condition | -(int)condition) >> 31); + unsigned testEqz = ~testNz; + return ((valueIfConditionNonZero & testNz) | (valueIfConditionZero & testEqz)); +} +SIMD_FORCE_INLINE int btSelect(unsigned condition, int valueIfConditionNonZero, int valueIfConditionZero) +{ + unsigned testNz = (unsigned)(((int)condition | -(int)condition) >> 31); + unsigned testEqz = ~testNz; + return static_cast((valueIfConditionNonZero & testNz) | (valueIfConditionZero & testEqz)); +} +SIMD_FORCE_INLINE float btSelect(unsigned condition, float valueIfConditionNonZero, float valueIfConditionZero) +{ +#ifdef BT_HAVE_NATIVE_FSEL + return (float)btFsel((btScalar)condition - btScalar(1.0f), valueIfConditionNonZero, valueIfConditionZero); +#else + return (condition != 0) ? valueIfConditionNonZero : valueIfConditionZero; +#endif +} + +template SIMD_FORCE_INLINE void btSwap(T& a, T& b) +{ + T tmp = a; + a = b; + b = tmp; +} + + +//PCK: endian swapping functions +SIMD_FORCE_INLINE unsigned btSwapEndian(unsigned val) +{ + return (((val & 0xff000000) >> 24) | ((val & 0x00ff0000) >> 8) | ((val & 0x0000ff00) << 8) | ((val & 0x000000ff) << 24)); +} + +SIMD_FORCE_INLINE unsigned short btSwapEndian(unsigned short val) +{ + return static_cast(((val & 0xff00) >> 8) | ((val & 0x00ff) << 8)); +} + +SIMD_FORCE_INLINE unsigned btSwapEndian(int val) +{ + return btSwapEndian((unsigned)val); +} + +SIMD_FORCE_INLINE unsigned short btSwapEndian(short val) +{ + return btSwapEndian((unsigned short) val); +} + +///btSwapFloat uses using char pointers to swap the endianness +////btSwapFloat/btSwapDouble will NOT return a float, because the machine might 'correct' invalid floating point values +///Not all values of sign/exponent/mantissa are valid floating point numbers according to IEEE 754. +///When a floating point unit is faced with an invalid value, it may actually change the value, or worse, throw an exception. +///In most systems, running user mode code, you wouldn't get an exception, but instead the hardware/os/runtime will 'fix' the number for you. +///so instead of returning a float/double, we return integer/long long integer +SIMD_FORCE_INLINE unsigned int btSwapEndianFloat(float d) +{ + unsigned int a = 0; + unsigned char *dst = (unsigned char *)&a; + unsigned char *src = (unsigned char *)&d; + + dst[0] = src[3]; + dst[1] = src[2]; + dst[2] = src[1]; + dst[3] = src[0]; + return a; +} + +// unswap using char pointers +SIMD_FORCE_INLINE float btUnswapEndianFloat(unsigned int a) +{ + float d = 0.0f; + unsigned char *src = (unsigned char *)&a; + unsigned char *dst = (unsigned char *)&d; + + dst[0] = src[3]; + dst[1] = src[2]; + dst[2] = src[1]; + dst[3] = src[0]; + + return d; +} + + +// swap using char pointers +SIMD_FORCE_INLINE void btSwapEndianDouble(double d, unsigned char* dst) +{ + unsigned char *src = (unsigned char *)&d; + + dst[0] = src[7]; + dst[1] = src[6]; + dst[2] = src[5]; + dst[3] = src[4]; + dst[4] = src[3]; + dst[5] = src[2]; + dst[6] = src[1]; + dst[7] = src[0]; + +} + +// unswap using char pointers +SIMD_FORCE_INLINE double btUnswapEndianDouble(const unsigned char *src) +{ + double d = 0.0; + unsigned char *dst = (unsigned char *)&d; + + dst[0] = src[7]; + dst[1] = src[6]; + dst[2] = src[5]; + dst[3] = src[4]; + dst[4] = src[3]; + dst[5] = src[2]; + dst[6] = src[1]; + dst[7] = src[0]; + + return d; +} + +// returns normalized value in range [-SIMD_PI, SIMD_PI] +SIMD_FORCE_INLINE btScalar btNormalizeAngle(btScalar angleInRadians) +{ + angleInRadians = btFmod(angleInRadians, SIMD_2_PI); + if(angleInRadians < -SIMD_PI) + { + return angleInRadians + SIMD_2_PI; + } + else if(angleInRadians > SIMD_PI) + { + return angleInRadians - SIMD_2_PI; + } + else + { + return angleInRadians; + } +} + +///rudimentary class to provide type info +struct btTypedObject +{ + btTypedObject(int objectType) + :m_objectType(objectType) + { + } + int m_objectType; + inline int getObjectType() const + { + return m_objectType; + } +}; + + + +///align a pointer to the provided alignment, upwards +template T* btAlignPointer(T* unalignedPtr, size_t alignment) +{ + + struct btConvertPointerSizeT + { + union + { + T* ptr; + size_t integer; + }; + }; + btConvertPointerSizeT converter; + + + const size_t bit_mask = ~(alignment - 1); + converter.ptr = unalignedPtr; + converter.integer += alignment-1; + converter.integer &= bit_mask; + return converter.ptr; +} + +#endif //BT_SCALAR_H diff --git a/opencl/parallel_primitives/host/premake4.lua b/opencl/parallel_primitives/host/premake4.lua new file mode 100644 index 000000000..9aaa4692e --- /dev/null +++ b/opencl/parallel_primitives/host/premake4.lua @@ -0,0 +1,26 @@ +function createProject(vendor) + hasCL = findOpenCL(vendor) + + if (hasCL) then + + project ("OpenCL_lib_parallel_primitives_host_" .. vendor) + + initOpenCL(vendor) + + kind "StaticLib" + targetdir "../../../lib" + includedirs { + ".", + } + files { + "**.cpp", + "**.h" + } + + end +end + +createProject("AMD") +createProject("Intel") +createProject("NVIDIA") +createProject("Apple") \ No newline at end of file diff --git a/opencl/parallel_primitives/kernels/BoundSearchKernels.cl b/opencl/parallel_primitives/kernels/BoundSearchKernels.cl new file mode 100644 index 000000000..f3b4a1e8a --- /dev/null +++ b/opencl/parallel_primitives/kernels/BoundSearchKernels.cl @@ -0,0 +1,106 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + + +typedef unsigned int u32; +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) + +typedef struct +{ + u32 m_key; + u32 m_value; +}SortData; + + + +typedef struct +{ + u32 m_nSrc; + u32 m_nDst; + u32 m_padding[2]; +} ConstBuffer; + + + +__attribute__((reqd_work_group_size(64,1,1))) +__kernel +void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, + unsigned int nSrc, unsigned int nDst) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < nSrc ) + { + SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1); + SortData end; end.m_key = nDst; end.m_value = nDst; + + SortData iData = (gIdx==0)? first: src[gIdx-1]; + SortData jData = (gIdx==nSrc)? end: src[gIdx]; + + if( iData.m_key != jData.m_key ) + { +// for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++) + u32 k = jData.m_key; + { + dst[k] = gIdx; + } + } + } +} + + +__attribute__((reqd_work_group_size(64,1,1))) +__kernel +void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, + unsigned int nSrc, unsigned int nDst) +{ + int gIdx = GET_GLOBAL_IDX+1; + + if( gIdx < nSrc+1 ) + { + SortData first; first.m_key = 0; first.m_value = 0; + SortData end; end.m_key = nDst; end.m_value = nDst; + + SortData iData = src[gIdx-1]; + SortData jData = (gIdx==nSrc)? end: src[gIdx]; + + if( iData.m_key != jData.m_key ) + { + u32 k = iData.m_key; + { + dst[k] = gIdx; + } + } + } +} + +__attribute__((reqd_work_group_size(64,1,1))) +__kernel +void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, + unsigned int nSrc, unsigned int nDst) +{ + int gIdx = GET_GLOBAL_IDX; + + + if( gIdx < nDst ) + { + C[gIdx] = A[gIdx] - B[gIdx]; + } +} + diff --git a/opencl/parallel_primitives/kernels/BoundSearchKernelsCL.h b/opencl/parallel_primitives/kernels/BoundSearchKernelsCL.h new file mode 100644 index 000000000..bf802e9fe --- /dev/null +++ b/opencl/parallel_primitives/kernels/BoundSearchKernelsCL.h @@ -0,0 +1,110 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* boundSearchKernelsCL= \ +"/*\n" +"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" +"\n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Takahiro Harada\n" +"\n" +"\n" +"typedef unsigned int u32;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"\n" +"typedef struct\n" +"{\n" +" u32 m_key; \n" +" u32 m_value;\n" +"}SortData;\n" +"\n" +"\n" +"\n" +"typedef struct\n" +"{\n" +" u32 m_nSrc;\n" +" u32 m_nDst;\n" +" u32 m_padding[2];\n" +"} ConstBuffer;\n" +"\n" +"\n" +"\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"__kernel\n" +"void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, \n" +" unsigned int nSrc, unsigned int nDst)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +"\n" +" if( gIdx < nSrc )\n" +" {\n" +" SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1);\n" +" SortData end; end.m_key = nDst; end.m_value = nDst;\n" +"\n" +" SortData iData = (gIdx==0)? first: src[gIdx-1];\n" +" SortData jData = (gIdx==nSrc)? end: src[gIdx];\n" +"\n" +" if( iData.m_key != jData.m_key )\n" +" {\n" +"// for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)\n" +" u32 k = jData.m_key;\n" +" {\n" +" dst[k] = gIdx;\n" +" }\n" +" }\n" +" }\n" +"}\n" +"\n" +"\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"__kernel\n" +"void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, \n" +" unsigned int nSrc, unsigned int nDst)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX+1;\n" +"\n" +" if( gIdx < nSrc+1 )\n" +" {\n" +" SortData first; first.m_key = 0; first.m_value = 0;\n" +" SortData end; end.m_key = nDst; end.m_value = nDst;\n" +"\n" +" SortData iData = src[gIdx-1];\n" +" SortData jData = (gIdx==nSrc)? end: src[gIdx];\n" +"\n" +" if( iData.m_key != jData.m_key )\n" +" {\n" +" u32 k = iData.m_key;\n" +" {\n" +" dst[k] = gIdx;\n" +" }\n" +" }\n" +" }\n" +"}\n" +"\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"__kernel\n" +"void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, \n" +" unsigned int nSrc, unsigned int nDst)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" \n" +"\n" +" if( gIdx < nDst )\n" +" {\n" +" C[gIdx] = A[gIdx] - B[gIdx];\n" +" }\n" +"}\n" +"\n" +"\n" +; diff --git a/opencl/parallel_primitives/kernels/CopyKernels.cl b/opencl/parallel_primitives/kernels/CopyKernels.cl new file mode 100644 index 000000000..2eee5752e --- /dev/null +++ b/opencl/parallel_primitives/kernels/CopyKernels.cl @@ -0,0 +1,128 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + +#pragma OPENCL EXTENSION cl_amd_printf : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable + +typedef unsigned int u32; +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) +#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE) +#define AtomInc(x) atom_inc(&(x)) +#define AtomInc1(x, out) out = atom_inc(&(x)) + +#define make_uint4 (uint4) +#define make_uint2 (uint2) +#define make_int2 (int2) + +typedef struct +{ + int m_n; + int m_padding[3]; +} ConstBuffer; + + + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void Copy1F4Kernel(__global float4* dst, __global float4* src, + ConstBuffer cb) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < cb.m_n ) + { + float4 a0 = src[gIdx]; + + dst[ gIdx ] = a0; + } +} + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void Copy2F4Kernel(__global float4* dst, __global float4* src, + ConstBuffer cb) +{ + int gIdx = GET_GLOBAL_IDX; + + if( 2*gIdx <= cb.m_n ) + { + float4 a0 = src[gIdx*2+0]; + float4 a1 = src[gIdx*2+1]; + + dst[ gIdx*2+0 ] = a0; + dst[ gIdx*2+1 ] = a1; + } +} + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void Copy4F4Kernel(__global float4* dst, __global float4* src, + ConstBuffer cb) +{ + int gIdx = GET_GLOBAL_IDX; + + if( 4*gIdx <= cb.m_n ) + { + int idx0 = gIdx*4+0; + int idx1 = gIdx*4+1; + int idx2 = gIdx*4+2; + int idx3 = gIdx*4+3; + + float4 a0 = src[idx0]; + float4 a1 = src[idx1]; + float4 a2 = src[idx2]; + float4 a3 = src[idx3]; + + dst[ idx0 ] = a0; + dst[ idx1 ] = a1; + dst[ idx2 ] = a2; + dst[ idx3 ] = a3; + } +} + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void CopyF1Kernel(__global float* dstF1, __global float* srcF1, + ConstBuffer cb) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < cb.m_n ) + { + float a0 = srcF1[gIdx]; + + dstF1[ gIdx ] = a0; + } +} + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, + ConstBuffer cb) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < cb.m_n ) + { + float2 a0 = srcF2[gIdx]; + + dstF2[ gIdx ] = a0; + } +} + diff --git a/opencl/parallel_primitives/kernels/CopyKernelsCL.h b/opencl/parallel_primitives/kernels/CopyKernelsCL.h new file mode 100644 index 000000000..e5670e3cd --- /dev/null +++ b/opencl/parallel_primitives/kernels/CopyKernelsCL.h @@ -0,0 +1,132 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* copyKernelsCL= \ +"/*\n" +"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" +"\n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Takahiro Harada\n" +"\n" +"#pragma OPENCL EXTENSION cl_amd_printf : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" +"\n" +"typedef unsigned int u32;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" +"#define AtomInc(x) atom_inc(&(x))\n" +"#define AtomInc1(x, out) out = atom_inc(&(x))\n" +"\n" +"#define make_uint4 (uint4)\n" +"#define make_uint2 (uint2)\n" +"#define make_int2 (int2)\n" +"\n" +"typedef struct\n" +"{\n" +" int m_n;\n" +" int m_padding[3];\n" +"} ConstBuffer;\n" +"\n" +"\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void Copy1F4Kernel(__global float4* dst, __global float4* src, \n" +" ConstBuffer cb)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +"\n" +" if( gIdx < cb.m_n )\n" +" {\n" +" float4 a0 = src[gIdx];\n" +"\n" +" dst[ gIdx ] = a0;\n" +" }\n" +"}\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void Copy2F4Kernel(__global float4* dst, __global float4* src, \n" +" ConstBuffer cb)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +"\n" +" if( 2*gIdx <= cb.m_n )\n" +" {\n" +" float4 a0 = src[gIdx*2+0];\n" +" float4 a1 = src[gIdx*2+1];\n" +"\n" +" dst[ gIdx*2+0 ] = a0;\n" +" dst[ gIdx*2+1 ] = a1;\n" +" }\n" +"}\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void Copy4F4Kernel(__global float4* dst, __global float4* src, \n" +" ConstBuffer cb)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +"\n" +" if( 4*gIdx <= cb.m_n )\n" +" {\n" +" int idx0 = gIdx*4+0;\n" +" int idx1 = gIdx*4+1;\n" +" int idx2 = gIdx*4+2;\n" +" int idx3 = gIdx*4+3;\n" +"\n" +" float4 a0 = src[idx0];\n" +" float4 a1 = src[idx1];\n" +" float4 a2 = src[idx2];\n" +" float4 a3 = src[idx3];\n" +"\n" +" dst[ idx0 ] = a0;\n" +" dst[ idx1 ] = a1;\n" +" dst[ idx2 ] = a2;\n" +" dst[ idx3 ] = a3;\n" +" }\n" +"}\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void CopyF1Kernel(__global float* dstF1, __global float* srcF1, \n" +" ConstBuffer cb)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +"\n" +" if( gIdx < cb.m_n )\n" +" {\n" +" float a0 = srcF1[gIdx];\n" +"\n" +" dstF1[ gIdx ] = a0;\n" +" }\n" +"}\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, \n" +" ConstBuffer cb)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +"\n" +" if( gIdx < cb.m_n )\n" +" {\n" +" float2 a0 = srcF2[gIdx];\n" +"\n" +" dstF2[ gIdx ] = a0;\n" +" }\n" +"}\n" +"\n" +"\n" +; diff --git a/opencl/parallel_primitives/kernels/FillKernels.cl b/opencl/parallel_primitives/kernels/FillKernels.cl new file mode 100644 index 000000000..71c31075d --- /dev/null +++ b/opencl/parallel_primitives/kernels/FillKernels.cl @@ -0,0 +1,107 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + + +#pragma OPENCL EXTENSION cl_amd_printf : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable + +typedef unsigned int u32; +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) +#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE) +#define AtomInc(x) atom_inc(&(x)) +#define AtomInc1(x, out) out = atom_inc(&(x)) + +#define make_uint4 (uint4) +#define make_uint2 (uint2) +#define make_int2 (int2) + +typedef struct +{ + union + { + int4 m_data; + uint4 m_unsignedData; + float m_floatData; + }; + int m_offset; + int m_n; + int m_padding[2]; +} ConstBuffer; + + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void FillIntKernel(__global int* dstInt, int num_elements, int value, const int offset) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < num_elements ) + { + dstInt[ offset+gIdx ] = value; + } +} + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void FillFloatKernel(__global float* dstFloat, int num_elements, float value, const int offset) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < num_elements ) + { + dstFloat[ offset+gIdx ] = value; + } +} + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < num ) + { + dstInt[ offset+gIdx ] = value; + } +} + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void FillInt2Kernel(__global int2* dstInt2, const int num, const int2 value, const int offset) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < num ) + { + dstInt2[ gIdx + offset] = make_int2( value.x, value.y ); + } +} + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void FillInt4Kernel(__global int4* dstInt4, const int num, const int4 value, const int offset) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < num ) + { + dstInt4[ offset+gIdx ] = value; + } +} + diff --git a/opencl/parallel_primitives/kernels/FillKernelsCL.h b/opencl/parallel_primitives/kernels/FillKernelsCL.h new file mode 100644 index 000000000..24eac7b11 --- /dev/null +++ b/opencl/parallel_primitives/kernels/FillKernelsCL.h @@ -0,0 +1,111 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* fillKernelsCL= \ +"/*\n" +"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" +"\n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Takahiro Harada\n" +"\n" +"\n" +"#pragma OPENCL EXTENSION cl_amd_printf : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" +"\n" +"typedef unsigned int u32;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" +"#define AtomInc(x) atom_inc(&(x))\n" +"#define AtomInc1(x, out) out = atom_inc(&(x))\n" +"\n" +"#define make_uint4 (uint4)\n" +"#define make_uint2 (uint2)\n" +"#define make_int2 (int2)\n" +"\n" +"typedef struct\n" +"{\n" +" union\n" +" {\n" +" int4 m_data;\n" +" uint4 m_unsignedData;\n" +" float m_floatData;\n" +" };\n" +" int m_offset;\n" +" int m_n;\n" +" int m_padding[2];\n" +"} ConstBuffer;\n" +"\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void FillIntKernel(__global int* dstInt, int num_elements, int value, const int offset)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +"\n" +" if( gIdx < num_elements )\n" +" {\n" +" dstInt[ offset+gIdx ] = value;\n" +" }\n" +"}\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void FillFloatKernel(__global float* dstFloat, int num_elements, float value, const int offset)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +"\n" +" if( gIdx < num_elements )\n" +" {\n" +" dstFloat[ offset+gIdx ] = value;\n" +" }\n" +"}\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +"\n" +" if( gIdx < num )\n" +" {\n" +" dstInt[ offset+gIdx ] = value;\n" +" }\n" +"}\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void FillInt2Kernel(__global int2* dstInt2, const int num, const int2 value, const int offset)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +"\n" +" if( gIdx < num )\n" +" {\n" +" dstInt2[ gIdx + offset] = make_int2( value.x, value.y );\n" +" }\n" +"}\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void FillInt4Kernel(__global int4* dstInt4, const int num, const int4 value, const int offset)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +"\n" +" if( gIdx < num )\n" +" {\n" +" dstInt4[ offset+gIdx ] = value;\n" +" }\n" +"}\n" +"\n" +"\n" +; diff --git a/opencl/parallel_primitives/kernels/PrefixScanKernels.cl b/opencl/parallel_primitives/kernels/PrefixScanKernels.cl new file mode 100644 index 000000000..963cc1e48 --- /dev/null +++ b/opencl/parallel_primitives/kernels/PrefixScanKernels.cl @@ -0,0 +1,154 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + + +typedef unsigned int u32; +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) + +// takahiro end +#define WG_SIZE 128 +#define m_numElems x +#define m_numBlocks y +#define m_numScanBlocks z + +/*typedef struct +{ + uint m_numElems; + uint m_numBlocks; + uint m_numScanBlocks; + uint m_padding[1]; +} ConstBuffer; +*/ + +u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize) +{ + u32 blocksum; + int offset = 1; + for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1) + { + GROUP_LDS_BARRIER; + for(int iIdx=lIdx; iIdx>= 1; + for(int nActive=1; nActive>=1 ) + { + GROUP_LDS_BARRIER; + for( int iIdx = lIdx; iIdx>1; nActive>0; nActive>>=1, offset<<=1)\n" +" {\n" +" GROUP_LDS_BARRIER;\n" +" for(int iIdx=lIdx; iIdx>= 1;\n" +" for(int nActive=1; nActive>=1 )\n" +" {\n" +" GROUP_LDS_BARRIER;\n" +" for( int iIdx = lIdx; iIdx 64 ) + { + sorterSharedMemory[idx] += sorterSharedMemory[idx-64]; + GROUP_MEM_FENCE; + } + + sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2]; + GROUP_MEM_FENCE; + } +#else + if( lIdx < 64 ) + { + sorterSharedMemory[idx] += sorterSharedMemory[idx-1]; + GROUP_MEM_FENCE; + sorterSharedMemory[idx] += sorterSharedMemory[idx-2]; + GROUP_MEM_FENCE; + sorterSharedMemory[idx] += sorterSharedMemory[idx-4]; + GROUP_MEM_FENCE; + sorterSharedMemory[idx] += sorterSharedMemory[idx-8]; + GROUP_MEM_FENCE; + sorterSharedMemory[idx] += sorterSharedMemory[idx-16]; + GROUP_MEM_FENCE; + sorterSharedMemory[idx] += sorterSharedMemory[idx-32]; + GROUP_MEM_FENCE; + if( wgSize > 64 ) + { + sorterSharedMemory[idx] += sorterSharedMemory[idx-64]; + GROUP_MEM_FENCE; + } + + sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2]; + GROUP_MEM_FENCE; + } +#endif + } + + GROUP_LDS_BARRIER; + + *totalSum = sorterSharedMemory[wgSize*2-1]; + u32 addValue = sorterSharedMemory[lIdx+wgSize-1]; + return addValue; +} + +//__attribute__((reqd_work_group_size(128,1,1))) +uint4 localPrefixSum128V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory ) +{ + u32 s4 = prefixScanVectorEx( &pData ); + u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 128 ); + return pData + make_uint4( rank, rank, rank, rank ); +} + + +//__attribute__((reqd_work_group_size(64,1,1))) +uint4 localPrefixSum64V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory ) +{ + u32 s4 = prefixScanVectorEx( &pData ); + u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 64 ); + return pData + make_uint4( rank, rank, rank, rank ); +} + +u32 unpack4Key( u32 key, int keyIdx ){ return (key>>(keyIdx*8)) & 0xff;} + +u32 bit8Scan(u32 v) +{ + return (v<<8) + (v<<16) + (v<<24); +} + +//=== + + + + +#define MY_HISTOGRAM(idx) localHistogramMat[(idx)*WG_SIZE+lIdx] + + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void StreamCountKernel( __global u32* gSrc, __global u32* histogramOut, int4 cb ) +{ + __local u32 localHistogramMat[NUM_BUCKET*WG_SIZE]; + + u32 gIdx = GET_GLOBAL_IDX; + u32 lIdx = GET_LOCAL_IDX; + u32 wgIdx = GET_GROUP_IDX; + u32 wgSize = GET_GROUP_SIZE; + const int startBit = cb.m_startBit; + const int n = cb.m_n; + const int nWGs = cb.m_nWGs; + const int nBlocksPerWG = cb.m_nBlocksPerWG; + + for(int i=0; i>startBit) & 0xf; +#if defined(NV_GPU) + MY_HISTOGRAM( localKey )++; +#else + AtomInc( MY_HISTOGRAM( localKey ) ); +#endif + } + } + } + + GROUP_LDS_BARRIER; + + if( lIdx < NUM_BUCKET ) + { + u32 sum = 0; + for(int i=0; i>startBit) & 0xf; +#if defined(NV_GPU) + MY_HISTOGRAM( localKey )++; +#else + AtomInc( MY_HISTOGRAM( localKey ) ); +#endif + } + } + } + + GROUP_LDS_BARRIER; + + if( lIdx < NUM_BUCKET ) + { + u32 sum = 0; + for(int i=0; i>startBit) & mask, (sortData[1]>>startBit) & mask, (sortData[2]>>startBit) & mask, (sortData[3]>>startBit) & mask ); + uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) ); + u32 total; + prefixSum = localPrefixSum64V( prefixSum, lIdx, &total, ldsSortData ); + { + uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3); + uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total ); + dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) ); + + GROUP_LDS_BARRIER; + + ldsSortData[dstAddr.x] = sortData[0]; + ldsSortData[dstAddr.y] = sortData[1]; + ldsSortData[dstAddr.z] = sortData[2]; + ldsSortData[dstAddr.w] = sortData[3]; + + GROUP_LDS_BARRIER; + + sortData[0] = ldsSortData[localAddr.x]; + sortData[1] = ldsSortData[localAddr.y]; + sortData[2] = ldsSortData[localAddr.z]; + sortData[3] = ldsSortData[localAddr.w]; + + GROUP_LDS_BARRIER; + } + } +} + +// 2 scan, 2 exchange +void sort4Bits1(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData) +{ + for(uint ibit=0; ibit>(startBit+ibit)) & 0x3, + (sortData[1]>>(startBit+ibit)) & 0x3, + (sortData[2]>>(startBit+ibit)) & 0x3, + (sortData[3]>>(startBit+ibit)) & 0x3); + + u32 key4; + u32 sKeyPacked[4] = { 0, 0, 0, 0 }; + { + sKeyPacked[0] |= 1<<(8*b.x); + sKeyPacked[1] |= 1<<(8*b.y); + sKeyPacked[2] |= 1<<(8*b.z); + sKeyPacked[3] |= 1<<(8*b.w); + + key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3]; + } + + u32 rankPacked; + u32 sumPacked; + { + rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE ); + } + + GROUP_LDS_BARRIER; + + u32 newOffset[4] = { 0,0,0,0 }; + { + u32 sumScanned = bit8Scan( sumPacked ); + + u32 scannedKeys[4]; + scannedKeys[0] = 1<<(8*b.x); + scannedKeys[1] = 1<<(8*b.y); + scannedKeys[2] = 1<<(8*b.z); + scannedKeys[3] = 1<<(8*b.w); + { // 4 scans at once + u32 sum4 = 0; + for(int ie=0; ie<4; ie++) + { + u32 tmp = scannedKeys[ie]; + scannedKeys[ie] = sum4; + sum4 += tmp; + } + } + + { + u32 sumPlusRank = sumScanned + rankPacked; + { u32 ie = b.x; + scannedKeys[0] += sumPlusRank; + newOffset[0] = unpack4Key( scannedKeys[0], ie ); + } + { u32 ie = b.y; + scannedKeys[1] += sumPlusRank; + newOffset[1] = unpack4Key( scannedKeys[1], ie ); + } + { u32 ie = b.z; + scannedKeys[2] += sumPlusRank; + newOffset[2] = unpack4Key( scannedKeys[2], ie ); + } + { u32 ie = b.w; + scannedKeys[3] += sumPlusRank; + newOffset[3] = unpack4Key( scannedKeys[3], ie ); + } + } + } + + + GROUP_LDS_BARRIER; + + { + ldsSortData[newOffset[0]] = sortData[0]; + ldsSortData[newOffset[1]] = sortData[1]; + ldsSortData[newOffset[2]] = sortData[2]; + ldsSortData[newOffset[3]] = sortData[3]; + + GROUP_LDS_BARRIER; + + u32 dstAddr = 4*lIdx; + sortData[0] = ldsSortData[dstAddr+0]; + sortData[1] = ldsSortData[dstAddr+1]; + sortData[2] = ldsSortData[dstAddr+2]; + sortData[3] = ldsSortData[dstAddr+3]; + + GROUP_LDS_BARRIER; + } + } +} + +#define SET_HISTOGRAM(setIdx, key) ldsSortData[(setIdx)*NUM_BUCKET+key] + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void SortAndScatterKernel( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4 cb ) +{ + __local u32 ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16]; + __local u32 localHistogramToCarry[NUM_BUCKET]; + __local u32 localHistogram[NUM_BUCKET*2]; + + u32 gIdx = GET_GLOBAL_IDX; + u32 lIdx = GET_LOCAL_IDX; + u32 wgIdx = GET_GROUP_IDX; + u32 wgSize = GET_GROUP_SIZE; + + const int n = cb.m_n; + const int nWGs = cb.m_nWGs; + const int startBit = cb.m_startBit; + const int nBlocksPerWG = cb.m_nBlocksPerWG; + + if( lIdx < (NUM_BUCKET) ) + { + localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx]; + } + + GROUP_LDS_BARRIER; + + const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE; + + int nBlocks = n/blockSize - nBlocksPerWG*wgIdx; + + int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; + + for(int iblock=0; iblock>startBit) & 0xf; + + { // create histogram + u32 setIdx = lIdx/16; + if( lIdx < NUM_BUCKET ) + { + localHistogram[lIdx] = 0; + } + ldsSortData[lIdx] = 0; + GROUP_LDS_BARRIER; + + for(int i=0; i>(startBit+ibit)) & 0x3, + (sortData[1]>>(startBit+ibit)) & 0x3, + (sortData[2]>>(startBit+ibit)) & 0x3, + (sortData[3]>>(startBit+ibit)) & 0x3); + + u32 key4; + u32 sKeyPacked[4] = { 0, 0, 0, 0 }; + { + sKeyPacked[0] |= 1<<(8*b.x); + sKeyPacked[1] |= 1<<(8*b.y); + sKeyPacked[2] |= 1<<(8*b.z); + sKeyPacked[3] |= 1<<(8*b.w); + + key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3]; + } + + u32 rankPacked; + u32 sumPacked; + { + rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE ); + } + + GROUP_LDS_BARRIER; + + u32 newOffset[4] = { 0,0,0,0 }; + { + u32 sumScanned = bit8Scan( sumPacked ); + + u32 scannedKeys[4]; + scannedKeys[0] = 1<<(8*b.x); + scannedKeys[1] = 1<<(8*b.y); + scannedKeys[2] = 1<<(8*b.z); + scannedKeys[3] = 1<<(8*b.w); + { // 4 scans at once + u32 sum4 = 0; + for(int ie=0; ie<4; ie++) + { + u32 tmp = scannedKeys[ie]; + scannedKeys[ie] = sum4; + sum4 += tmp; + } + } + + { + u32 sumPlusRank = sumScanned + rankPacked; + { u32 ie = b.x; + scannedKeys[0] += sumPlusRank; + newOffset[0] = unpack4Key( scannedKeys[0], ie ); + } + { u32 ie = b.y; + scannedKeys[1] += sumPlusRank; + newOffset[1] = unpack4Key( scannedKeys[1], ie ); + } + { u32 ie = b.z; + scannedKeys[2] += sumPlusRank; + newOffset[2] = unpack4Key( scannedKeys[2], ie ); + } + { u32 ie = b.w; + scannedKeys[3] += sumPlusRank; + newOffset[3] = unpack4Key( scannedKeys[3], ie ); + } + } + } + + + GROUP_LDS_BARRIER; + + { + ldsSortData[newOffset[0]] = sortData[0]; + ldsSortData[newOffset[1]] = sortData[1]; + ldsSortData[newOffset[2]] = sortData[2]; + ldsSortData[newOffset[3]] = sortData[3]; + + ldsSortVal[newOffset[0]] = sortVal[0]; + ldsSortVal[newOffset[1]] = sortVal[1]; + ldsSortVal[newOffset[2]] = sortVal[2]; + ldsSortVal[newOffset[3]] = sortVal[3]; + + GROUP_LDS_BARRIER; + + u32 dstAddr = 4*lIdx; + sortData[0] = ldsSortData[dstAddr+0]; + sortData[1] = ldsSortData[dstAddr+1]; + sortData[2] = ldsSortData[dstAddr+2]; + sortData[3] = ldsSortData[dstAddr+3]; + + sortVal[0] = ldsSortVal[dstAddr+0]; + sortVal[1] = ldsSortVal[dstAddr+1]; + sortVal[2] = ldsSortVal[dstAddr+2]; + sortVal[3] = ldsSortVal[dstAddr+3]; + + GROUP_LDS_BARRIER; + } + } +} + + + + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void SortAndScatterSortDataKernel( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb) +{ + __local int ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16]; + __local int ldsSortVal[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16]; + __local u32 localHistogramToCarry[NUM_BUCKET]; + __local u32 localHistogram[NUM_BUCKET*2]; + + u32 gIdx = GET_GLOBAL_IDX; + u32 lIdx = GET_LOCAL_IDX; + u32 wgIdx = GET_GROUP_IDX; + u32 wgSize = GET_GROUP_SIZE; + + const int n = cb.m_n; + const int nWGs = cb.m_nWGs; + const int startBit = cb.m_startBit; + const int nBlocksPerWG = cb.m_nBlocksPerWG; + + if( lIdx < (NUM_BUCKET) ) + { + localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx]; + } + + GROUP_LDS_BARRIER; + + + const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE; + + int nBlocks = n/blockSize - nBlocksPerWG*wgIdx; + + int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; + + for(int iblock=0; iblock>startBit) & 0xf; + + { // create histogram + u32 setIdx = lIdx/16; + if( lIdx < NUM_BUCKET ) + { + localHistogram[lIdx] = 0; + } + ldsSortData[lIdx] = 0; + GROUP_LDS_BARRIER; + + for(int i=0; i0) + return; + + for (int c=0;c>startBit) & 0xf;//0xf = NUM_TABLES-1 + gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i]; + counter[tableIdx] ++; + } + } + } + } + +} + + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void SortAndScatterKernelSerial( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4 cb ) +{ + + u32 gIdx = GET_GLOBAL_IDX; + u32 realLocalIdx = GET_LOCAL_IDX; + u32 wgIdx = GET_GROUP_IDX; + u32 wgSize = GET_GROUP_SIZE; + const int startBit = cb.m_startBit; + const int n = cb.m_n; + const int nWGs = cb.m_nWGs; + const int nBlocksPerWG = cb.m_nBlocksPerWG; + + int counter[NUM_BUCKET]; + + if (realLocalIdx>0) + return; + + for (int c=0;c>startBit) & 0xf;//0xf = NUM_TABLES-1 + gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i]; + counter[tableIdx] ++; + } + } + } + } + +} \ No newline at end of file diff --git a/opencl/parallel_primitives/kernels/RadixSort32KernelsCL.h b/opencl/parallel_primitives/kernels/RadixSort32KernelsCL.h new file mode 100644 index 000000000..464829c3b --- /dev/null +++ b/opencl/parallel_primitives/kernels/RadixSort32KernelsCL.h @@ -0,0 +1,1074 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* radixSort32KernelsCL= \ +"/*\n" +"Bullet Continuous Collision Detection and Physics Library\n" +"Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org\n" +"\n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Author Takahiro Harada\n" +"\n" +"\n" +"//#pragma OPENCL EXTENSION cl_amd_printf : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" +"\n" +"typedef unsigned int u32;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" +"#define AtomInc(x) atom_inc(&(x))\n" +"#define AtomInc1(x, out) out = atom_inc(&(x))\n" +"#define AtomAdd(x, value) atom_add(&(x), value)\n" +"\n" +"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" +"\n" +"\n" +"#define make_uint4 (uint4)\n" +"#define make_uint2 (uint2)\n" +"#define make_int2 (int2)\n" +"\n" +"#define WG_SIZE 64\n" +"#define ELEMENTS_PER_WORK_ITEM (256/WG_SIZE)\n" +"#define BITS_PER_PASS 4\n" +"#define NUM_BUCKET (1< 64 )\n" +" {\n" +" sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n" +" GROUP_MEM_FENCE;\n" +" }\n" +"\n" +" sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n" +" GROUP_MEM_FENCE;\n" +" }\n" +"#else\n" +" if( lIdx < 64 )\n" +" {\n" +" sorterSharedMemory[idx] += sorterSharedMemory[idx-1];\n" +" GROUP_MEM_FENCE;\n" +" sorterSharedMemory[idx] += sorterSharedMemory[idx-2]; \n" +" GROUP_MEM_FENCE;\n" +" sorterSharedMemory[idx] += sorterSharedMemory[idx-4];\n" +" GROUP_MEM_FENCE;\n" +" sorterSharedMemory[idx] += sorterSharedMemory[idx-8];\n" +" GROUP_MEM_FENCE;\n" +" sorterSharedMemory[idx] += sorterSharedMemory[idx-16];\n" +" GROUP_MEM_FENCE;\n" +" sorterSharedMemory[idx] += sorterSharedMemory[idx-32];\n" +" GROUP_MEM_FENCE;\n" +" if( wgSize > 64 )\n" +" {\n" +" sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n" +" GROUP_MEM_FENCE;\n" +" }\n" +"\n" +" sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n" +" GROUP_MEM_FENCE;\n" +" }\n" +"#endif\n" +" }\n" +"\n" +" GROUP_LDS_BARRIER;\n" +"\n" +" *totalSum = sorterSharedMemory[wgSize*2-1];\n" +" u32 addValue = sorterSharedMemory[lIdx+wgSize-1];\n" +" return addValue;\n" +"}\n" +"\n" +"//__attribute__((reqd_work_group_size(128,1,1)))\n" +"uint4 localPrefixSum128V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )\n" +"{\n" +" u32 s4 = prefixScanVectorEx( &pData );\n" +" u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 128 );\n" +" return pData + make_uint4( rank, rank, rank, rank );\n" +"}\n" +"\n" +"\n" +"//__attribute__((reqd_work_group_size(64,1,1)))\n" +"uint4 localPrefixSum64V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )\n" +"{\n" +" u32 s4 = prefixScanVectorEx( &pData );\n" +" u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 64 );\n" +" return pData + make_uint4( rank, rank, rank, rank );\n" +"}\n" +"\n" +"u32 unpack4Key( u32 key, int keyIdx ){ return (key>>(keyIdx*8)) & 0xff;}\n" +"\n" +"u32 bit8Scan(u32 v)\n" +"{\n" +" return (v<<8) + (v<<16) + (v<<24);\n" +"}\n" +"\n" +"//===\n" +"\n" +"\n" +"\n" +"\n" +"#define MY_HISTOGRAM(idx) localHistogramMat[(idx)*WG_SIZE+lIdx]\n" +"\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void StreamCountKernel( __global u32* gSrc, __global u32* histogramOut, int4 cb )\n" +"{\n" +" __local u32 localHistogramMat[NUM_BUCKET*WG_SIZE];\n" +"\n" +" u32 gIdx = GET_GLOBAL_IDX;\n" +" u32 lIdx = GET_LOCAL_IDX;\n" +" u32 wgIdx = GET_GROUP_IDX;\n" +" u32 wgSize = GET_GROUP_SIZE;\n" +" const int startBit = cb.m_startBit;\n" +" const int n = cb.m_n;\n" +" const int nWGs = cb.m_nWGs;\n" +" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" +"\n" +" for(int i=0; i>startBit) & 0xf;\n" +"#if defined(NV_GPU)\n" +" MY_HISTOGRAM( localKey )++;\n" +"#else\n" +" AtomInc( MY_HISTOGRAM( localKey ) );\n" +"#endif\n" +" }\n" +" }\n" +" }\n" +"\n" +" GROUP_LDS_BARRIER;\n" +" \n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" u32 sum = 0;\n" +" for(int i=0; i>startBit) & 0xf;\n" +"#if defined(NV_GPU)\n" +" MY_HISTOGRAM( localKey )++;\n" +"#else\n" +" AtomInc( MY_HISTOGRAM( localKey ) );\n" +"#endif\n" +" }\n" +" }\n" +" }\n" +"\n" +" GROUP_LDS_BARRIER;\n" +" \n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" u32 sum = 0;\n" +" for(int i=0; i>startBit) & mask, (sortData[1]>>startBit) & mask, (sortData[2]>>startBit) & mask, (sortData[3]>>startBit) & mask );\n" +" uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) );\n" +" u32 total;\n" +" prefixSum = localPrefixSum64V( prefixSum, lIdx, &total, ldsSortData );\n" +" {\n" +" uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);\n" +" uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total );\n" +" dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) );\n" +"\n" +" GROUP_LDS_BARRIER;\n" +"\n" +" ldsSortData[dstAddr.x] = sortData[0];\n" +" ldsSortData[dstAddr.y] = sortData[1];\n" +" ldsSortData[dstAddr.z] = sortData[2];\n" +" ldsSortData[dstAddr.w] = sortData[3];\n" +"\n" +" GROUP_LDS_BARRIER;\n" +"\n" +" sortData[0] = ldsSortData[localAddr.x];\n" +" sortData[1] = ldsSortData[localAddr.y];\n" +" sortData[2] = ldsSortData[localAddr.z];\n" +" sortData[3] = ldsSortData[localAddr.w];\n" +"\n" +" GROUP_LDS_BARRIER;\n" +" }\n" +" }\n" +"}\n" +"\n" +"// 2 scan, 2 exchange\n" +"void sort4Bits1(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData)\n" +"{\n" +" for(uint ibit=0; ibit>(startBit+ibit)) & 0x3, \n" +" (sortData[1]>>(startBit+ibit)) & 0x3, \n" +" (sortData[2]>>(startBit+ibit)) & 0x3, \n" +" (sortData[3]>>(startBit+ibit)) & 0x3);\n" +"\n" +" u32 key4;\n" +" u32 sKeyPacked[4] = { 0, 0, 0, 0 };\n" +" {\n" +" sKeyPacked[0] |= 1<<(8*b.x);\n" +" sKeyPacked[1] |= 1<<(8*b.y);\n" +" sKeyPacked[2] |= 1<<(8*b.z);\n" +" sKeyPacked[3] |= 1<<(8*b.w);\n" +"\n" +" key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];\n" +" }\n" +"\n" +" u32 rankPacked;\n" +" u32 sumPacked;\n" +" {\n" +" rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );\n" +" }\n" +"\n" +" GROUP_LDS_BARRIER;\n" +"\n" +" u32 newOffset[4] = { 0,0,0,0 };\n" +" {\n" +" u32 sumScanned = bit8Scan( sumPacked );\n" +"\n" +" u32 scannedKeys[4];\n" +" scannedKeys[0] = 1<<(8*b.x);\n" +" scannedKeys[1] = 1<<(8*b.y);\n" +" scannedKeys[2] = 1<<(8*b.z);\n" +" scannedKeys[3] = 1<<(8*b.w);\n" +" { // 4 scans at once\n" +" u32 sum4 = 0;\n" +" for(int ie=0; ie<4; ie++)\n" +" {\n" +" u32 tmp = scannedKeys[ie];\n" +" scannedKeys[ie] = sum4;\n" +" sum4 += tmp;\n" +" }\n" +" }\n" +"\n" +" {\n" +" u32 sumPlusRank = sumScanned + rankPacked;\n" +" { u32 ie = b.x;\n" +" scannedKeys[0] += sumPlusRank;\n" +" newOffset[0] = unpack4Key( scannedKeys[0], ie );\n" +" }\n" +" { u32 ie = b.y;\n" +" scannedKeys[1] += sumPlusRank;\n" +" newOffset[1] = unpack4Key( scannedKeys[1], ie );\n" +" }\n" +" { u32 ie = b.z;\n" +" scannedKeys[2] += sumPlusRank;\n" +" newOffset[2] = unpack4Key( scannedKeys[2], ie );\n" +" }\n" +" { u32 ie = b.w;\n" +" scannedKeys[3] += sumPlusRank;\n" +" newOffset[3] = unpack4Key( scannedKeys[3], ie );\n" +" }\n" +" }\n" +" }\n" +"\n" +"\n" +" GROUP_LDS_BARRIER;\n" +"\n" +" {\n" +" ldsSortData[newOffset[0]] = sortData[0];\n" +" ldsSortData[newOffset[1]] = sortData[1];\n" +" ldsSortData[newOffset[2]] = sortData[2];\n" +" ldsSortData[newOffset[3]] = sortData[3];\n" +"\n" +" GROUP_LDS_BARRIER;\n" +"\n" +" u32 dstAddr = 4*lIdx;\n" +" sortData[0] = ldsSortData[dstAddr+0];\n" +" sortData[1] = ldsSortData[dstAddr+1];\n" +" sortData[2] = ldsSortData[dstAddr+2];\n" +" sortData[3] = ldsSortData[dstAddr+3];\n" +"\n" +" GROUP_LDS_BARRIER;\n" +" }\n" +" }\n" +"}\n" +"\n" +"#define SET_HISTOGRAM(setIdx, key) ldsSortData[(setIdx)*NUM_BUCKET+key]\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void SortAndScatterKernel( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4 cb )\n" +"{\n" +" __local u32 ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n" +" __local u32 localHistogramToCarry[NUM_BUCKET];\n" +" __local u32 localHistogram[NUM_BUCKET*2];\n" +"\n" +" u32 gIdx = GET_GLOBAL_IDX;\n" +" u32 lIdx = GET_LOCAL_IDX;\n" +" u32 wgIdx = GET_GROUP_IDX;\n" +" u32 wgSize = GET_GROUP_SIZE;\n" +"\n" +" const int n = cb.m_n;\n" +" const int nWGs = cb.m_nWGs;\n" +" const int startBit = cb.m_startBit;\n" +" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" +"\n" +" if( lIdx < (NUM_BUCKET) )\n" +" {\n" +" localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];\n" +" }\n" +"\n" +" GROUP_LDS_BARRIER;\n" +"\n" +" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" +"\n" +" int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;\n" +"\n" +" int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" +"\n" +" for(int iblock=0; iblock>startBit) & 0xf;\n" +"\n" +" { // create histogram\n" +" u32 setIdx = lIdx/16;\n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" localHistogram[lIdx] = 0;\n" +" }\n" +" ldsSortData[lIdx] = 0;\n" +" GROUP_LDS_BARRIER;\n" +"\n" +" for(int i=0; i>(startBit+ibit)) & 0x3, \n" +" (sortData[1]>>(startBit+ibit)) & 0x3, \n" +" (sortData[2]>>(startBit+ibit)) & 0x3, \n" +" (sortData[3]>>(startBit+ibit)) & 0x3);\n" +"\n" +" u32 key4;\n" +" u32 sKeyPacked[4] = { 0, 0, 0, 0 };\n" +" {\n" +" sKeyPacked[0] |= 1<<(8*b.x);\n" +" sKeyPacked[1] |= 1<<(8*b.y);\n" +" sKeyPacked[2] |= 1<<(8*b.z);\n" +" sKeyPacked[3] |= 1<<(8*b.w);\n" +"\n" +" key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];\n" +" }\n" +"\n" +" u32 rankPacked;\n" +" u32 sumPacked;\n" +" {\n" +" rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );\n" +" }\n" +"\n" +" GROUP_LDS_BARRIER;\n" +"\n" +" u32 newOffset[4] = { 0,0,0,0 };\n" +" {\n" +" u32 sumScanned = bit8Scan( sumPacked );\n" +"\n" +" u32 scannedKeys[4];\n" +" scannedKeys[0] = 1<<(8*b.x);\n" +" scannedKeys[1] = 1<<(8*b.y);\n" +" scannedKeys[2] = 1<<(8*b.z);\n" +" scannedKeys[3] = 1<<(8*b.w);\n" +" { // 4 scans at once\n" +" u32 sum4 = 0;\n" +" for(int ie=0; ie<4; ie++)\n" +" {\n" +" u32 tmp = scannedKeys[ie];\n" +" scannedKeys[ie] = sum4;\n" +" sum4 += tmp;\n" +" }\n" +" }\n" +"\n" +" {\n" +" u32 sumPlusRank = sumScanned + rankPacked;\n" +" { u32 ie = b.x;\n" +" scannedKeys[0] += sumPlusRank;\n" +" newOffset[0] = unpack4Key( scannedKeys[0], ie );\n" +" }\n" +" { u32 ie = b.y;\n" +" scannedKeys[1] += sumPlusRank;\n" +" newOffset[1] = unpack4Key( scannedKeys[1], ie );\n" +" }\n" +" { u32 ie = b.z;\n" +" scannedKeys[2] += sumPlusRank;\n" +" newOffset[2] = unpack4Key( scannedKeys[2], ie );\n" +" }\n" +" { u32 ie = b.w;\n" +" scannedKeys[3] += sumPlusRank;\n" +" newOffset[3] = unpack4Key( scannedKeys[3], ie );\n" +" }\n" +" }\n" +" }\n" +"\n" +"\n" +" GROUP_LDS_BARRIER;\n" +"\n" +" {\n" +" ldsSortData[newOffset[0]] = sortData[0];\n" +" ldsSortData[newOffset[1]] = sortData[1];\n" +" ldsSortData[newOffset[2]] = sortData[2];\n" +" ldsSortData[newOffset[3]] = sortData[3];\n" +"\n" +" ldsSortVal[newOffset[0]] = sortVal[0];\n" +" ldsSortVal[newOffset[1]] = sortVal[1];\n" +" ldsSortVal[newOffset[2]] = sortVal[2];\n" +" ldsSortVal[newOffset[3]] = sortVal[3];\n" +"\n" +" GROUP_LDS_BARRIER;\n" +"\n" +" u32 dstAddr = 4*lIdx;\n" +" sortData[0] = ldsSortData[dstAddr+0];\n" +" sortData[1] = ldsSortData[dstAddr+1];\n" +" sortData[2] = ldsSortData[dstAddr+2];\n" +" sortData[3] = ldsSortData[dstAddr+3];\n" +"\n" +" sortVal[0] = ldsSortVal[dstAddr+0];\n" +" sortVal[1] = ldsSortVal[dstAddr+1];\n" +" sortVal[2] = ldsSortVal[dstAddr+2];\n" +" sortVal[3] = ldsSortVal[dstAddr+3];\n" +"\n" +" GROUP_LDS_BARRIER;\n" +" }\n" +" }\n" +"}\n" +"\n" +"\n" +"\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void SortAndScatterSortDataKernel( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb)\n" +"{\n" +" __local int ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n" +" __local int ldsSortVal[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n" +" __local u32 localHistogramToCarry[NUM_BUCKET];\n" +" __local u32 localHistogram[NUM_BUCKET*2];\n" +"\n" +" u32 gIdx = GET_GLOBAL_IDX;\n" +" u32 lIdx = GET_LOCAL_IDX;\n" +" u32 wgIdx = GET_GROUP_IDX;\n" +" u32 wgSize = GET_GROUP_SIZE;\n" +"\n" +" const int n = cb.m_n;\n" +" const int nWGs = cb.m_nWGs;\n" +" const int startBit = cb.m_startBit;\n" +" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" +"\n" +" if( lIdx < (NUM_BUCKET) )\n" +" {\n" +" localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];\n" +" }\n" +"\n" +" GROUP_LDS_BARRIER;\n" +" \n" +"\n" +" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" +"\n" +" int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;\n" +"\n" +" int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" +"\n" +" for(int iblock=0; iblock>startBit) & 0xf;\n" +"\n" +" { // create histogram\n" +" u32 setIdx = lIdx/16;\n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" localHistogram[lIdx] = 0;\n" +" }\n" +" ldsSortData[lIdx] = 0;\n" +" GROUP_LDS_BARRIER;\n" +"\n" +" for(int i=0; i0)\n" +" return;\n" +" \n" +" for (int c=0;c>startBit) & 0xf;//0xf = NUM_TABLES-1\n" +" gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i];\n" +" counter[tableIdx] ++;\n" +" }\n" +" }\n" +" }\n" +" }\n" +" \n" +"}\n" +"\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void SortAndScatterKernelSerial( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4 cb )\n" +"{\n" +" \n" +" u32 gIdx = GET_GLOBAL_IDX;\n" +" u32 realLocalIdx = GET_LOCAL_IDX;\n" +" u32 wgIdx = GET_GROUP_IDX;\n" +" u32 wgSize = GET_GROUP_SIZE;\n" +" const int startBit = cb.m_startBit;\n" +" const int n = cb.m_n;\n" +" const int nWGs = cb.m_nWGs;\n" +" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" +"\n" +" int counter[NUM_BUCKET];\n" +" \n" +" if (realLocalIdx>0)\n" +" return;\n" +" \n" +" for (int c=0;c>startBit) & 0xf;//0xf = NUM_TABLES-1\n" +" gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i];\n" +" counter[tableIdx] ++;\n" +" }\n" +" }\n" +" }\n" +" }\n" +" \n" +"}\n" +; diff --git a/opencl/parallel_primitives/test/main.cpp b/opencl/parallel_primitives/test/main.cpp new file mode 100644 index 000000000..d659410f1 --- /dev/null +++ b/opencl/parallel_primitives/test/main.cpp @@ -0,0 +1,379 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + + +#include +#include "../basic_initialize/btOpenCLUtils.h" +#include "../host/btFillCL.h" +#include "../host/btBoundSearchCL.h" +#include "../host/btRadixSort32CL.h" +#include "../host/btPrefixScanCL.h" +#include "../host/CommandLineArgs.h" + +#include "../host/btMinMax.h" +int g_nPassed = 0; +int g_nFailed = 0; +bool g_testFailed = 0; + +#define TEST_INIT g_testFailed = 0; +#define TEST_ASSERT(x) if( !(x) ){g_testFailed = 1;} +#define TEST_REPORT(testName) printf("[%s] %s\n",(g_testFailed)?"X":"O", testName); if(g_testFailed) g_nFailed++; else g_nPassed++; +#define NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment)) + +cl_context g_context=0; +cl_device_id g_device=0; +cl_command_queue g_queue =0; +const char* g_deviceName = 0; + +void initCL(int preferredDeviceIndex, int preferredPlatformIndex) +{ + void* glCtx=0; + void* glDC = 0; + int ciErrNum = 0; + //bound search and radix sort only work on GPU right now (assume 32 or 64 width workgroup without barriers) + + cl_device_type deviceType = CL_DEVICE_TYPE_ALL; + + g_context = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex); + oclCHECKERROR(ciErrNum, CL_SUCCESS); + int numDev = btOpenCLUtils::getNumDevices(g_context); + if (numDev>0) + { + btOpenCLDeviceInfo info; + g_device= btOpenCLUtils::getDevice(g_context,0); + g_queue = clCreateCommandQueue(g_context, g_device, 0, &ciErrNum); + oclCHECKERROR(ciErrNum, CL_SUCCESS); + btOpenCLUtils::printDeviceInfo(g_device); + btOpenCLUtils::getDeviceInfo(g_device,&info); + g_deviceName = info.m_deviceName; + } +} + +void exitCL() +{ + clReleaseCommandQueue(g_queue); + clReleaseContext(g_context); +} + + +inline void fillIntTest() +{ + TEST_INIT; + + btFillCL* fillCL = new btFillCL(g_context,g_device,g_queue); + int maxSize=1024*256; + btOpenCLArray intBuffer(g_context,g_queue,maxSize); + intBuffer.resize(maxSize); + +#define NUM_TESTS 7 + + int dx = maxSize/NUM_TESTS; + for (int iter=0;iterexecute(intBuffer,value,size,offset); + + btAlignedObjectArray hostBuf2; + hostBuf2.resize(size); + fillCL->executeHost(hostBuf2,value,size,offset); + + btAlignedObjectArray hostBuf; + intBuffer.copyToHost(hostBuf); + + for(int i=0; i +__inline +T getRandom(const T& minV, const T& maxV) +{ + float r = (rand()%10000)/10000.f; + T range = maxV - minV; + return (T)(minV + r*range); +} + +struct btSortDataCompare +{ + inline bool operator()(const btSortData& first, const btSortData& second) const + { + return (first.m_key < second.m_key) || (first.m_key==second.m_key && first.m_value < second.m_value); + } +}; + + +void boundSearchTest( ) +{ + TEST_INIT; + + int maxSize = 1024*256; + int bucketSize = 256; + + btOpenCLArray srcCL(g_context,g_queue,maxSize); + btOpenCLArray upperCL(g_context,g_queue,maxSize); + btOpenCLArray lowerCL(g_context,g_queue,maxSize); + + btAlignedObjectArray srcHost; + btAlignedObjectArray upperHost; + btAlignedObjectArray lowerHost; + btAlignedObjectArray upperHostCompare; + btAlignedObjectArray lowerHostCompare; + + btBoundSearchCL* search = new btBoundSearchCL(g_context,g_device,g_queue, maxSize); + + + int dx = maxSize/NUM_TESTS; + for(int iter=0; iterexecute(srcCL,size,upperCL,bucketSize,btBoundSearchCL::BOUND_UPPER); + search->execute(srcCL,size,lowerCL,bucketSize,btBoundSearchCL::BOUND_LOWER); + + search->executeHost(srcHost,size,upperHostCompare,bucketSize,btBoundSearchCL::BOUND_UPPER); + search->executeHost(srcHost,size,lowerHostCompare,bucketSize,btBoundSearchCL::BOUND_LOWER); + + lowerCL.copyToHost(lowerHost); + upperCL.copyToHost(upperHost); + for(int i=0; i buf0Host; + btAlignedObjectArray buf1Host; + + btOpenCLArray buf2CL(g_context,g_queue,maxSize); + btOpenCLArray buf3CL(g_context,g_queue,maxSize); + + + btPrefixScanCL* scan = new btPrefixScanCL(g_context,g_device,g_queue,maxSize); + + int dx = maxSize/NUM_TESTS; + for(int iter=0; iterexecuteHost(buf0Host, buf1Host, size, &sumHost ); + scan->execute( buf2CL, buf3CL, size, &sumGPU ); + + buf3CL.copyToHost(buf0Host); + + TEST_ASSERT( sumHost == sumGPU ); + for(int i=0; i buf0Host; + buf0Host.resize(maxSize); + btAlignedObjectArray buf1Host; + buf1Host.resize(maxSize ); + btOpenCLArray buf2CL(g_context,g_queue,maxSize); + + btRadixSort32CL* sort = new btRadixSort32CL(g_context,g_device,g_queue,maxSize); + + int dx = maxSize/NUM_TESTS; + for(int iter=0; iterexecuteHost( buf0Host); + sort->execute(buf2CL); + + buf2CL.copyToHost(buf1Host); + + for(int i=0; i\n"); + getchar(); +} + diff --git a/opencl/parallel_primitives/test/premake4.lua b/opencl/parallel_primitives/test/premake4.lua new file mode 100644 index 000000000..119087926 --- /dev/null +++ b/opencl/parallel_primitives/test/premake4.lua @@ -0,0 +1,41 @@ +function createProject(vendor) + hasCL = findOpenCL(vendor) + + if (hasCL) then + + project ("OpenCL_primitives_test_" .. vendor) + + initOpenCL(vendor) + + language "C++" + + kind "ConsoleApp" + targetdir "../../../bin" + includedirs {".",".."} + + + files { + "main.cpp", + "../../basic_initialize/btOpenCLInclude.h", + "../../basic_initialize/btOpenCLUtils.cpp", + "../../basic_initialize/btOpenCLUtils.h", + "../host/btFillCL.cpp", + "../host/btFillCL.h", + "../host/btBoundSearchCL.cpp", + "../host/btBoundSearchCL.h", + "../host/btPrefixScanCL.cpp", + "../host/btPrefixScanCL.h", + "../host/btRadixSort32CL.cpp", + "../host/btRadixSort32CL.h", + "../host/btAlignedAllocator.cpp", + "../host/btAlignedAllocator.h", + "../host/btAlignedObjectArray.h", + } + + end +end + +createProject("AMD") +createProject("Intel") +createProject("NVIDIA") +createProject("Apple") \ No newline at end of file diff --git a/opencl/reduce/main.cpp b/opencl/reduce/main.cpp new file mode 100644 index 000000000..f925f6855 --- /dev/null +++ b/opencl/reduce/main.cpp @@ -0,0 +1,116 @@ +///original author: Erwin Coumans +#include "btOpenCLUtils.h" +#include "../parallel_primitives/host/btOpenCLArray.h" +#include "../parallel_primitives/host/btLauncherCL.h" +#include + + +#define MSTRINGIFY(A) #A +const char* kernelString= MSTRINGIFY( +__kernel void ReduceGlobal(__global int* d_in, __global int* d_out, int numElements) +{ + int myId = get_global_id(0); + int tid = get_local_id(0); + + + int ls = get_local_size(0); + for (unsigned int s=ls/2;s>0;s>>=1) + { + if (myId a(ctx,queue); + btOpenCLArray b(ctx,queue); + btAlignedObjectArray hostA; + btAlignedObjectArray hostB; + + for (int i=0;i=numElements) + return; + + float8 aGID = a[iGID]; + float8 bGID = b[iGID]; + + float8 result = aGID + bGID; + // write back out to GMEM + c[iGID] = result; +} diff --git a/opencl/vector_add/VectorAddKernels.h b/opencl/vector_add/VectorAddKernels.h new file mode 100644 index 000000000..55c238aae --- /dev/null +++ b/opencl/vector_add/VectorAddKernels.h @@ -0,0 +1,20 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* vectorAddCL= \ +"\n" +"\n" +"__kernel void VectorAdd(__global const float8* a, __global const float8* b, __global float8* c, int numElements)\n" +"{\n" +" // get oct-float index into global data array\n" +" int iGID = get_global_id(0);\n" +" if (iGID>=numElements)\n" +" return;\n" +"\n" +" float8 aGID = a[iGID];\n" +" float8 bGID = b[iGID];\n" +"\n" +" float8 result = aGID + bGID;\n" +" // write back out to GMEM\n" +" c[iGID] = result;\n" +"}\n" +"\n" +; diff --git a/opencl/vector_add/main.cpp b/opencl/vector_add/main.cpp new file mode 100644 index 000000000..aa4132d98 --- /dev/null +++ b/opencl/vector_add/main.cpp @@ -0,0 +1,408 @@ + +///VectorAdd sample, from the NVidia JumpStart Guide +///http://developer.download.nvidia.com/OpenCL/NVIDIA_OpenCL_JumpStart_Guide.pdf + +///Instead of #include we include +///Apart from this include file, all other code should compile and work on OpenCL compliant implementation + + +#define LOAD_FROM_FILE + +#ifdef __APPLE__ + #include +#else + #include +#endif //__APPLE__ +#ifdef _WIN32 +#pragma warning (disable:4996) +#endif +#include +#include +#include +#include + +#define GRID3DOCL_CHECKERROR(a, b) if((a)!=(b)) { printf("3D GRID OCL Error : %d\n", (a)); btAssert((a) == (b)); } +size_t wgSize; + +#include "VectorAddKernels.h" + +#ifdef CL_PLATFORM_INTEL + const char* preferredPlatform = "Intel(R) Corporation"; +#elif defined CL_PLATFORM_AMD + const char* preferredPlatform = "Advanced Micro Devices, Inc."; +#elif defined CL_PLATFORM_NVIDIA + const char* preferredPlatform = "NVIDIA Corporation"; +#else + const char* preferredPlatform = "Unknown"; +#endif + + + +char* loadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength) +{ + // locals + FILE* pFileStream = NULL; + size_t szSourceLength; + + // open the OpenCL source code file + pFileStream = fopen(cFilename, "rb"); + if(pFileStream == 0) + { + return NULL; + } + + size_t szPreambleLength = strlen(cPreamble); + + // get the length of the source code + fseek(pFileStream, 0, SEEK_END); + szSourceLength = ftell(pFileStream); + fseek(pFileStream, 0, SEEK_SET); + + // allocate a buffer for the source code string and read it in + char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1); + memcpy(cSourceString, cPreamble, szPreambleLength); + fread((cSourceString) + szPreambleLength, szSourceLength, 1, pFileStream); + + // close the file and return the total length of the combined (preamble + source) string + fclose(pFileStream); + if(szFinalLength != 0) + { + *szFinalLength = szSourceLength + szPreambleLength; + } + cSourceString[szSourceLength + szPreambleLength] = '\0'; + + return cSourceString; +} + +size_t workitem_size[3]; + +void printDevInfo(cl_device_id device) +{ + char device_string[1024]; + + clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL); + printf( " Device %s:\n", device_string); + + // CL_DEVICE_INFO + cl_device_type type; + clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(type), &type, NULL); + if( type & CL_DEVICE_TYPE_CPU ) + printf(" CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_CPU"); + if( type & CL_DEVICE_TYPE_GPU ) + printf( " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_GPU"); + if( type & CL_DEVICE_TYPE_ACCELERATOR ) + printf( " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR"); + if( type & CL_DEVICE_TYPE_DEFAULT ) + printf( " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT"); + + // CL_DEVICE_MAX_COMPUTE_UNITS + cl_uint compute_units; + clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL); + printf( " CL_DEVICE_MAX_COMPUTE_UNITS:\t%d\n", compute_units); + + // CL_DEVICE_MAX_WORK_GROUP_SIZE + + clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), &workitem_size, NULL); + printf( " CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", workitem_size[0], workitem_size[1], workitem_size[2]); + +} + + + + +// Main function +// ********************************************************************* +int main(int argc, char **argv) +{ + void *srcA, *srcB, *dst; // Host buffers for OpenCL test + cl_context cxGPUContext; // OpenCL context + cl_command_queue cqCommandQue; // OpenCL command que + cl_device_id* cdDevices; // OpenCL device list + cl_program cpProgram; // OpenCL program + cl_kernel ckKernel; // OpenCL kernel + cl_mem cmMemObjs[3]; // OpenCL memory buffer objects: 3 for device + size_t szGlobalWorkSize[1]; // 1D var for Total # of work items + size_t szLocalWorkSize[1]; // 1D var for # of work items in the work group + size_t szParmDataBytes; // Byte size of context information + cl_int ciErr1, ciErr2; // Error code var + + + int iTestN = 100000 * 8; // Size of Vectors to process + + int actualGlobalSize = iTestN / 8; + + + // set Global and Local work size dimensions + szGlobalWorkSize[0] = iTestN >> 3; // do 8 computations per work item + szLocalWorkSize[0]= iTestN>>3; + + + // Allocate and initialize host arrays + srcA = (void *)malloc (sizeof(cl_float) * iTestN); + srcB = (void *)malloc (sizeof(cl_float) * iTestN); + dst = (void *)malloc (sizeof(cl_float) * iTestN); + + int i; + + // Initialize arrays with some values + for (i=0;i processing outside of the buffer + //make sure to check kernel + } + + size_t globalThreads[] = {num_t * workgroupSize}; + size_t localThreads[] = {workgroupSize}; + + + localWorkSize[0] = workgroupSize; + globalWorkSize[0] = num_t * workgroupSize; + localWorkSize[1] = 1; + globalWorkSize[1] = 1; + + // Copy input data from host to GPU and launch kernel + ciErr1 |= clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, globalThreads, localThreads, 0, NULL, NULL); + + } + + if (ciErrNum != CL_SUCCESS) + { + printf("cannot clEnqueueNDRangeKernel\n"); + exit(0); + } + + clFinish(cqCommandQue); + // Read back results and check accumulated errors + ciErr1 |= clEnqueueReadBuffer(cqCommandQue, cmMemObjs[2], CL_TRUE, 0, sizeof(cl_float8) * szGlobalWorkSize[0], dst, 0, NULL, NULL); + + // Release kernel, program, and memory objects + // NOTE: Most properly this should be done at any of the exit points above, but it is omitted elsewhere for clarity. + free(cdDevices); + clReleaseKernel(ckKernel); + clReleaseProgram(cpProgram); + clReleaseCommandQueue(cqCommandQue); + clReleaseContext(cxGPUContext); + + + // print the results + int iErrorCount = 0; + for (i = 0; i < iTestN; i++) + { + if (((float*)dst)[i] != ((float*)srcA)[i]+((float*)srcB)[i]) + iErrorCount++; + } + + if (iErrorCount) + { + printf("Validation FAILED\n"); + } else + { + printf("Validation SUCCESSFULL\n"); + } + // Free host memory, close log and return success + for (i = 0; i < 3; i++) + { + clReleaseMemObject(cmMemObjs[i]); + } + + free(srcA); + free(srcB); + free (dst); + printf("Press ENTER to quit\n"); + getchar(); +} + + diff --git a/opencl/vector_add/premake4.lua b/opencl/vector_add/premake4.lua new file mode 100644 index 000000000..ccaabd73b --- /dev/null +++ b/opencl/vector_add/premake4.lua @@ -0,0 +1,28 @@ +function createProject(vendor) + + hasCL = findOpenCL(vendor) + + if (hasCL) then + + project ("OpenCL_VectorAdd_" .. vendor) + + initOpenCL(vendor) + + language "C++" + + kind "ConsoleApp" + targetdir "../../bin" + + files { + "main.cpp", + "../basic_initialize/btOpenCLUtils.cpp", + "../basic_initialize/btOpenCLUtils.h" + } + + end +end + +createProject("AMD") +createProject("Intel") +createProject("NVIDIA") +createProject("Apple") diff --git a/opencl/vector_add_simplified/main.cpp b/opencl/vector_add_simplified/main.cpp new file mode 100644 index 000000000..d911ec2c2 --- /dev/null +++ b/opencl/vector_add_simplified/main.cpp @@ -0,0 +1,69 @@ +///original author: Erwin Coumans +#include "btOpenCLUtils.h" +#include "../parallel_primitives/host/btOpenCLArray.h" +#include "../parallel_primitives/host/btLauncherCL.h" +#include + + +#define MSTRINGIFY(A) #A +const char* kernelString= MSTRINGIFY( +__kernel void VectorAdd(__global const float* a, __global const float* b, __global float* c, int numElements) +{ + int iGID = get_global_id(0); + if (iGID>=numElements) + return; + float aGID = a[iGID]; + float bGID = b[iGID]; + float result = aGID + bGID; + c[iGID] = result; +} +); + +int main(int argc, char* argv[]) +{ + int ciErrNum = 0; + int preferred_device = -1; + int preferred_platform = -1; + cl_platform_id platformId; + cl_context ctx; + cl_command_queue queue; + cl_device_id device; + cl_kernel addKernel; + ctx = btOpenCLUtils::createContextFromType(CL_DEVICE_TYPE_GPU, &ciErrNum,0,0,preferred_device,preferred_platform,&platformId); + btOpenCLUtils::printPlatformInfo(platformId); + oclCHECKERROR(ciErrNum, CL_SUCCESS); + if (!ctx) { + printf("No OpenCL capable GPU found!"); + return 0; + } + + device = btOpenCLUtils::getDevice(ctx,0); + queue = clCreateCommandQueue(ctx, device, 0, &ciErrNum); + addKernel = btOpenCLUtils::compileCLKernelFromString(ctx,device,kernelString,"VectorAdd",&ciErrNum); + oclCHECKERROR(ciErrNum, CL_SUCCESS); + int numElements = 32; + btOpenCLArray a(ctx,queue); + btOpenCLArray b(ctx,queue); + btOpenCLArray c(ctx,queue); + for (int i=0;i