import opencl_course source for a start

This commit is contained in:
erwin coumans
2013-03-11 22:03:27 +01:00
commit 08272c7de5
64 changed files with 12336 additions and 0 deletions

36
build/findDirectX11.lua Normal file
View File

@@ -0,0 +1,36 @@
--- Detect whether the DirectX 11 SDK is installed.
-- Looks for the D3D11.h header below the directory named by the DXSDK_DIR
-- environment variable and reports the path when it is found.
-- @treturn boolean true when the header could be opened, false otherwise
function findDirectX11()
    local dx11path = os.getenv("DXSDK_DIR")
    if not dx11path then
        return false
    end

    local filepath = string.format("%s%s", dx11path, "Include/D3D11.h")
    -- The handle is only an existence probe: keep it local and close it
    -- (it used to leak into a global and was never closed).
    local header = io.open(filepath, "r")
    if not header then
        return false
    end
    header:close()

    printf("Found DX11: '%s'", filepath)
    return true
end
--- Add the DirectX 11 SDK settings to the current premake scope.
-- Defines ADL_ENABLE_DX11, adds the SDK include directory, selects the x86 or
-- x64 library directory per platform and links the DirectX libraries.
-- Paths use the $(DXSDK_DIR) macro, so the environment variable is not read
-- here (the former unused os.getenv lookup was removed).
-- @treturn boolean always true
function initDirectX11()
    configuration {}
    defines { "ADL_ENABLE_DX11" }
    includedirs { "$(DXSDK_DIR)/include" }

    configuration "x32"
        libdirs { "$(DXSDK_DIR)/Lib/x86" }
    configuration "x64"
        libdirs { "$(DXSDK_DIR)/Lib/x64" }

    -- back to the unfiltered scope before adding the libraries
    configuration {}
    links {
        "d3dcompiler",
        "dxerr",
        "dxguid",
        "d3dx9",
        "d3d9",
        "winmm",
        "comctl32",
        "d3dx11"
    }
    return true
end

151
build/findOpenCL.lua Normal file
View File

@@ -0,0 +1,151 @@
--- Report whether the Apple OpenCL framework can be used.
-- @treturn boolean true when premake reports the host as "macosx"
function findOpenCL_Apple()
    if not os.is("macosx") then
        return false
    end
    return true
end
--- Report whether the AMD APP SDK is installed.
-- @treturn boolean true when the AMDAPPSDKROOT environment variable is set
function findOpenCL_AMD()
    return os.getenv("AMDAPPSDKROOT") ~= nil
end
--- Report whether the NVIDIA CUDA toolkit (which ships OpenCL) is installed.
-- @treturn boolean true when the CUDA_PATH environment variable is set
function findOpenCL_NVIDIA()
    return os.getenv("CUDA_PATH") ~= nil
end
--- Report whether an Intel OpenCL SDK is available.
-- On Windows the INTELOCLSDKROOT environment variable must be set; on Linux
-- the header /usr/include/CL/opencl.h must be readable.
-- @treturn boolean true when the SDK was detected
function findOpenCL_Intel()
    if os.is("Windows") then
        if os.getenv("INTELOCLSDKROOT") then
            return true
        end
    end
    if os.is("Linux") then
        local header = io.open("/usr/include/CL/opencl.h", "r")
        if header then
            -- the handle is only an existence probe; do not leave it open
            header:close()
            return true
        end
    end
    return false
end
--- Add the Apple OpenCL framework to the current premake scope.
-- Resets the configuration filter, then registers the framework directory as
-- include and library directory and links OpenCL.framework.
-- Returns nothing.
function initOpenCL_Apple()
    local frameworkDir = "/System/Library/Frameworks/OpenCL.framework"

    configuration({})
    includedirs({ frameworkDir })
    libdirs(frameworkDir)
    links({ "OpenCL.framework" })
end
--- Add the AMD APP SDK OpenCL settings to the current premake scope.
-- Nothing beyond the filter reset happens when AMDAPPSDKROOT is not set.
-- @treturn boolean true when the SDK settings were added, false otherwise
function initOpenCL_AMD()
    configuration({})
    if not os.getenv("AMDAPPSDKROOT") then
        return false
    end

    defines({ "ADL_ENABLE_CL", "CL_PLATFORM_AMD" })
    includedirs({ "$(AMDAPPSDKROOT)/include" })

    -- the library directory depends on the target platform
    configuration("x32")
    libdirs({ "$(AMDAPPSDKROOT)/lib/x86" })
    configuration("x64")
    libdirs({ "$(AMDAPPSDKROOT)/lib/x86_64" })

    configuration({})
    links({ "OpenCL" })
    return true
end
--- Add the NVIDIA (CUDA toolkit) OpenCL settings to the current premake scope.
-- Nothing beyond the filter reset happens when CUDA_PATH is not set.
-- @treturn boolean true when the SDK settings were added, false otherwise
function initOpenCL_NVIDIA()
    configuration({})
    if not os.getenv("CUDA_PATH") then
        return false
    end

    defines({ "ADL_ENABLE_CL", "CL_PLATFORM_NVIDIA" })
    includedirs({ "$(CUDA_PATH)/include" })

    -- the library directory depends on the target platform
    configuration("x32")
    libdirs({ "$(CUDA_PATH)/lib/Win32" })
    configuration("x64")
    libdirs({ "$(CUDA_PATH)/lib/x64" })

    configuration({})
    links({ "OpenCL" })
    return true
end
--- Add the Intel OpenCL SDK settings to the current premake scope.
-- On Windows the SDK is located through INTELOCLSDKROOT; on Linux only the
-- defines and the OpenCL library are added.
-- @treturn boolean true when settings were added, false otherwise
function initOpenCL_Intel()
    configuration {}
    if os.is("Windows") then
        local intelopenclpath = os.getenv("INTELOCLSDKROOT")
        if intelopenclpath then
            defines { "ADL_ENABLE_CL", "CL_PLATFORM_INTEL" }
            includedirs { "$(INTELOCLSDKROOT)/include" }
            configuration "x32"
                libdirs { "$(INTELOCLSDKROOT)/lib/x86" }
            configuration "x64"
                libdirs { "$(INTELOCLSDKROOT)/lib/x64" }
            configuration {}
            links { "OpenCL" }
            return true
        end
    end
    if os.is("Linux") then
        defines { "ADL_ENABLE_CL", "CL_PLATFORM_INTEL" }
        configuration {}
        links { "OpenCL" }
        -- the Linux path used to fall through and report failure even though
        -- the project had just been configured
        return true
    end
    return false
end
--- Detect the OpenCL SDK of the given vendor.
-- @tparam string vendor one of "AMD", "NVIDIA", "Intel" or "Apple"
-- @return the result of the vendor specific finder, or false for any other
-- vendor value
function findOpenCL(vendor)
    if vendor == "AMD" then
        return findOpenCL_AMD()
    elseif vendor == "NVIDIA" then
        return findOpenCL_NVIDIA()
    elseif vendor == "Intel" then
        return findOpenCL_Intel()
    elseif vendor == "Apple" then
        return findOpenCL_Apple()
    else
        return false
    end
end
--- Add the OpenCL settings of the given vendor to the current premake scope.
-- Every branch now forwards the result of the vendor specific initializer;
-- previously the AMD and Intel results were dropped, so callers always saw
-- nil for those vendors.
-- @tparam string vendor one of "AMD", "NVIDIA", "Intel" or "Apple"
-- @return the result of the vendor specific initializer, or false for any
-- other vendor value
function initOpenCL(vendor)
    if vendor == "AMD" then
        return initOpenCL_AMD()
    end
    if vendor == "NVIDIA" then
        return initOpenCL_NVIDIA()
    end
    if vendor == "Intel" then
        return initOpenCL_Intel()
    end
    if vendor == "Apple" then
        return initOpenCL_Apple()
    end
    return false
end

View File

@@ -0,0 +1,51 @@
--- Link the OpenGL libraries that match the target system.
-- Windows links opengl32 and glu32, Mac OS X links the OpenGL framework and
-- every other system links GL. The configuration filter is reset on exit.
function initOpenGL()
    configuration({})

    configuration({ "Windows" })
    links({ "opengl32", "glu32" })

    configuration({ "MacOSX" })
    links({ "OpenGL.framework" })

    configuration({ "not Windows", "not MacOSX" })
    links({ "GL" })

    configuration({})
end
--- Add the GLUT settings that match the target system.
-- Windows uses the GLUT copy under rendering/GlutGlewWindows (relative to the
-- global projectRootDir) and links glut32 or glut64 depending on the platform;
-- Mac OS X links the Glut framework; Linux links glut and GLU.
-- The configuration filter is reset on exit.
function initGlut()
    configuration({})

    configuration({ "Windows" })
    local glutDir = projectRootDir .. "rendering/GlutGlewWindows"
    includedirs({ glutDir })
    libdirs({ glutDir })

    configuration({ "Windows", "x32" })
    links({ "glut32" })

    configuration({ "Windows", "x64" })
    links({ "glut64" })

    configuration({ "MacOSX" })
    links({ "Glut.framework" })

    configuration({ "Linux" })
    links({ "glut", "GLU" })

    configuration({})
end
--- Add the GLEW settings that match the host system.
-- On a Windows host GLEW is compiled into the project from
-- rendering/GlutGlewWindows/glew.c (relative to the global projectRootDir)
-- with GLEW_STATIC defined; on a Linux host the GLEW library is linked.
-- The configuration filter is reset on exit.
function initGlew()
    configuration({})

    if os.is("Windows") then
        configuration({ "Windows" })
        defines({ "GLEW_STATIC" })
        local glewDir = projectRootDir .. "rendering/GlutGlewWindows"
        includedirs({ glewDir })
        libdirs({ glewDir })
        files({ glewDir .. "/glew.c" })
    end

    if os.is("Linux") then
        links({ "GLEW" })
    end

    configuration({})
end

BIN
build/premake4.exe Normal file

Binary file not shown.

96
build/premake4.lua Normal file
View File

@@ -0,0 +1,96 @@
-- Premake4 solution script for the OpenCL course samples.
-- The statements below are order dependent: each configuration call changes
-- the filter that the following settings apply to.
solution "0MySolution"
-- Multithreaded compiling
if _ACTION == "vs2010" or _ACTION=="vs2008" then
buildoptions { "/MP" }
end
-- 'act' is the premake action name, or "" when premake runs without an action;
-- it is used below in the target suffixes and the output location.
act = ""
if _ACTION then
act = _ACTION
end
newoption
{
trigger = "ios",
description = "Enable iOS target (requires xcode4)"
}
configurations {"Release", "Debug"}
configuration "Release"
flags { "Optimize", "EnableSSE","StaticRuntime", "NoMinimalRebuild", "FloatFast"}
configuration "Debug"
defines {"_DEBUG=1"}
flags { "Symbols", "StaticRuntime" , "NoMinimalRebuild", "NoEditAndContinue" ,"FloatFast"}
platforms {"x32", "x64"}
-- Target suffixes are declared per platform first and then per
-- platform/configuration pair.
configuration {"x32"}
targetsuffix ("_" .. act)
configuration "x64"
targetsuffix ("_" .. act .. "_64" )
configuration {"x64", "debug"}
targetsuffix ("_" .. act .. "_x64_debug")
configuration {"x64", "release"}
targetsuffix ("_" .. act .. "_x64_release" )
configuration {"x32", "debug"}
targetsuffix ("_" .. act .. "_debug" )
configuration{}
-- 'postfix' is appended to the generated project folder name ("ios" for iOS builds).
postfix=""
if _ACTION == "xcode4" then
if _OPTIONS["ios"] then
postfix = "ios";
xcodebuildsettings
{
'CODE_SIGN_IDENTITY = "iPhone Developer"',
"SDKROOT = iphoneos",
'ARCHS = "armv7"',
'TARGETED_DEVICE_FAMILY = "1,2"',
'VALID_ARCHS = "armv7"',
}
else
xcodebuildsettings
{
'ARCHS = "$(ARCHS_STANDARD_32_BIT) $(ARCHS_STANDARD_64_BIT)"',
'VALID_ARCHS = "x86_64 i386"',
}
end
end
flags { "NoRTTI", "NoExceptions"}
defines { "_HAS_EXCEPTIONS=0" }
targetdir "../bin"
-- Project files are written to a folder named after the action (plus postfix).
location("./" .. act .. postfix)
-- projectRootDir is a global on purpose: it is assigned here and read as a
-- global by the OpenGL/GLUT/GLEW helper functions.
projectRootDir = os.getcwd() .. "/../"
print("Project root directroy: " .. projectRootDir);
-- Load the SDK detection/initialisation helpers before the projects are included.
dofile ("findOpenCL.lua")
dofile ("findDirectX11.lua")
dofile ("findOpenGLGlewGlut.lua")
language "C++"
-- The sample projects are only included when the ios option is not given.
if not _OPTIONS["ios"] then
include "../opencl/vector_add_simplified"
include "../opencl/vector_add"
include "../opencl/basic_initialize"
include "../opencl/parallel_primitives/host"
include "../opencl/parallel_primitives/test"
include "../opencl/parallel_primitives/benchmark"
include "../opencl/lds_bank_conflict"
include "../opencl/reduce"
end

BIN
build/premake4_linux Normal file

Binary file not shown.

BIN
build/premake4_linux64 Normal file

Binary file not shown.

BIN
build/premake4_osx Normal file

Binary file not shown.

13
build/stringify.bat Normal file
View File

@@ -0,0 +1,13 @@
@echo off
rem Regenerates the C header versions of the OpenCL kernel sources.
rem Each line runs the "stringify" action of stringifyKernel.lua with
rem   --kernelfile  the .cl file to read
rem   --headerfile  the .h file to write
rem   --stringname  the name of the generated string variable
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/vector_add/VectorAddKernels.cl" --headerfile="../opencl/vector_add/VectorAddKernels.h" --stringname="vectorAddCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/RadixSort32Kernels.cl" --headerfile="../opencl/parallel_primitives/kernels/RadixSort32KernelsCL.h" --stringname="radixSort32KernelsCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/BoundSearchKernels.cl" --headerfile="../opencl/parallel_primitives/kernels/BoundSearchKernelsCL.h" --stringname="boundSearchKernelsCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/PrefixScanKernels.cl" --headerfile="../opencl/parallel_primitives/kernels/PrefixScanKernelsCL.h" --stringname="prefixScanKernelsCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/FillKernels.cl" --headerfile="../opencl/parallel_primitives/kernels/FillKernelsCL.h" --stringname="fillKernelsCL" stringify
rem Keep the console window open so the output can be read.
pause

8
build/stringify.sh Normal file
View File

@@ -0,0 +1,8 @@
#!/bin/sh
# Regenerates the C header versions of the OpenCL kernel sources.
# Each line runs the "stringify" action of stringifyKernel.lua with
#   --kernelfile  the .cl file to read
#   --headerfile  the .h file to write
#   --stringname  the name of the generated string variable
# The premake binary and all paths are relative to the current directory.
./premake4_osx --file=stringifyKernel.lua --kernelfile="../opencl/vector_add/VectorAddKernels.cl" --headerfile="../opencl/vector_add/VectorAddKernels.h" --stringname="vectorAddCL" stringify
./premake4_osx --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/RadixSort32Kernels.cl" --headerfile="../opencl/parallel_primitives/kernels/RadixSort32KernelsCL.h" --stringname="radixSort32KernelsCL" stringify
./premake4_osx --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/BoundSearchKernels.cl" --headerfile="../opencl/parallel_primitives/kernels/BoundSearchKernelsCL.h" --stringname="boundSearchKernelsCL" stringify
./premake4_osx --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/PrefixScanKernels.cl" --headerfile="../opencl/parallel_primitives/kernels/PrefixScanKernelsCL.h" --stringname="prefixScanKernelsCL" stringify
./premake4_osx --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/kernels/FillKernels.cl" --headerfile="../opencl/parallel_primitives/kernels/FillKernelsCL.h" --stringname="fillKernelsCL" stringify

78
build/stringifyKernel.lua Normal file
View File

@@ -0,0 +1,78 @@
--- Convert a kernel source file into a C header holding it as a string.
-- The header declares `static const char* <kernelMethod>` and contains one
-- quoted literal per source line, each ending in an escaped "\n". A source
-- line that ends in a backslash is emitted without that backslash and without
-- the escaped newline, so the following line continues it inside the string.
--
-- The input is read in one piece: the former chunked reader appended the tail
-- of a split line only after the chunk had been written, which truncated lines
-- that crossed a chunk boundary. The input is also opened before the output is
-- created, so a missing input no longer truncates an existing header.
--
-- @tparam string filenameIn path of the kernel source to read
-- @tparam string filenameOut path of the header to write
-- @tparam string kernelMethod name of the generated string variable
-- Raises an error when either file cannot be opened.
function stringifyKernel(filenameIn, filenameOut, kernelMethod)
    local fin = assert(io.open(filenameIn, "r"))
    local source = fin:read("*a")
    fin:close()

    local fout = assert(io.open(filenameOut, "w"))
    fout:write("//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project\n")
    fout:write("static const char* " .. kernelMethod .. "= \\\n")

    local lineCount = 0
    if source ~= "" then
        local startpos = 1
        while true do
            local newline = string.find(source, "\n", startpos, true)
            local text = string.sub(source, startpos, (newline or (#source + 1)) - 1)
            local quoted = '"' .. text .. '\\n"'
            -- a trailing backslash in the source shows up here as a literal
            -- "\\n" sequence; dropping it joins the continued line
            quoted = string.gsub(quoted, "\\\\n", "")
            fout:write(quoted, "\n")
            if not newline then break end
            lineCount = lineCount + 1
            startpos = newline + 1
        end
    end

    print(filenameIn .. " (" .. lineCount .. " lines)")
    fout:write(";\n")
    fout:close()
end
-- Command line options consumed by the "stringify" action below.
newoption({
    trigger = "kernelfile",
    value = "kernelpath",
    description = "full path to the kernel source input file",
})

newoption({
    trigger = "headerfile",
    value = "path",
    description = "full path to the header output file",
})

newoption({
    trigger = "stringname",
    value = "var",
    description = "name of the kernel string variable",
})

-- The action forwards the three options to stringifyKernel when it is executed.
newaction({
    trigger = "stringify",
    description = "stringify kernels source code into strings",
    execute = function()
        local kernelFile = _OPTIONS["kernelfile"]
        local headerFile = _OPTIONS["headerfile"]
        local stringName = _OPTIONS["stringname"]
        stringifyKernel(kernelFile, headerFile, stringName)
    end,
})

6
build/vs2010.bat Normal file
View File

@@ -0,0 +1,6 @@
rem Generates the Visual Studio 2010 project files with premake4.
rem premake4 --with-pe vs2010
premake4 vs2010
rem Create a "cache" folder inside the generated vs2010 folder.
mkdir vs2010\cache
rem Keep the console window open so the output can be read.
pause

4
build/xcode.command Normal file
View File

@@ -0,0 +1,4 @@
# Generates the Xcode 4 project files with the bundled premake binary.
# Change to the folder that contains this script first so the relative
# premake path resolves. The quoting keeps this working when the path
# contains spaces, and a failed cd stops the script instead of running
# premake from the wrong folder.
cd "$(dirname "$0")" || exit 1
./premake4_osx xcode4

View File

@@ -0,0 +1,44 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef BT_OPENCL_INCLUDE_H
#define BT_OPENCL_INCLUDE_H

//Pick the OpenCL headers: MiniCL when USE_MINICL is defined, otherwise the
//Apple framework headers on __APPLE__ and the CL/ headers everywhere else.
#ifdef __APPLE__
#ifdef USE_MINICL
#include <MiniCL/cl.h>
#else
#include <OpenCL/cl.h>
#include <OpenCL/cl_ext.h> //clLogMessagesToStderrAPPLE
#endif
#else
#ifdef USE_MINICL
#include <MiniCL/cl.h>
#else
#include <CL/cl.h>
#ifdef _WIN32
#include "CL/cl_gl.h"
#endif //_WIN32
#endif
#endif //__APPLE__

#include <assert.h>
#include <stdio.h>

//Prints the value of 'a' and asserts when 'a' differs from the expected value 'b'.
//Written as if/else (rather than a bare 'if') so that an 'else' following the
//macro at a call site can never bind to the macro's own 'if'. The form still
//expands correctly with or without a trailing semicolon at existing call sites.
//Note that 'a' is evaluated more than once.
#define oclCHECKERROR(a, b) if((a)==(b)) {} else { printf("OCL Error : %d\n", (a)); assert((a) == (b)); }

#endif //BT_OPENCL_INCLUDE_H

View File

@@ -0,0 +1,903 @@
/*
Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
Copyright (C) 2006 - 2011 Sony Computer Entertainment Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//original author: Roman Ponomarev
//cleanup by Erwin Coumans
#include <string.h>
#ifdef _WIN32
#pragma warning (disable:4996)
#endif
#include "btOpenCLUtils.h"
//#include "btOpenCLInclude.h"
#include <stdio.h>
#include <stdlib.h>
#define BT_MAX_CL_DEVICES 16 //who needs 16 devices?
#ifdef _WIN32
#include <Windows.h>
#endif
#include <assert.h>
#define btAssert assert
//Preferred platform vendor string, selected at compile time from the
//CL_PLATFORM_* define that matches the OpenCL SDK in use.
#if defined(CL_PLATFORM_MINI_CL)
static const char* spPlatformVendor = "MiniCL, SCEA";
#elif defined(CL_PLATFORM_AMD)
static const char* spPlatformVendor = "Advanced Micro Devices, Inc.";
#elif defined(CL_PLATFORM_NVIDIA)
static const char* spPlatformVendor = "NVIDIA Corporation";
#elif defined(CL_PLATFORM_INTEL)
static const char* spPlatformVendor = "Intel(R) Corporation";
#else
static const char* spPlatformVendor = "Unknown Vendor";
#endif
#ifndef CL_PLATFORM_MINI_CL
#ifdef _WIN32
#include "CL/cl_gl.h"
#endif //_WIN32
#endif
//Debug switches read by btOpenCLUtils_compileCLProgramFromString: when either
//one is true, the lookup of a cached program binary is skipped.
bool gDebugForceLoadingFromSource = false;
bool gDebugSkipLoadingBinary = false;
//Context notification callback (passed to clCreateContext on Apple builds).
//Prints the message; a message containing "Warning" is only reported, any
//other message is treated as an error and asserts.
void MyFatalBreakAPPLE( const char * errstr ,
						const void * private_info ,
						size_t cb ,
						void * user_data )
{
	printf("Error: %s\n", errstr);

	//find out if it is a warning or error, assert if error
	if (strstr(errstr, "Warning") == NULL)
	{
		printf("error\n");
		btAssert(0);
		return;
	}

	printf("warning\n");
}
//Returns the number of OpenCL platforms reported by clGetPlatformIDs.
//@param pErrNum optional; when not NULL it always receives the status of the
//       query (it used to be left unwritten on success, so callers that did
//       not pre-initialise it read an indeterminate value)
//@return the reported platform count (0 if the query did not set it)
int btOpenCLUtils_getNumPlatforms(cl_int* pErrNum)
{
	cl_platform_id pPlatforms[10] = { 0 };
	cl_uint numPlatforms = 0;

	cl_int ciErrNum = clGetPlatformIDs(10, pPlatforms, &numPlatforms);
	//cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms);

	if(pErrNum != NULL)
		*pErrNum = ciErrNum;

	return numPlatforms;
}
//Returns the vendor string of the OpenCL SDK this file was compiled for
//(the compile-time selected spPlatformVendor).
const char* btOpenCLUtils_getSdkVendorName()
{
	const char* vendorName = spPlatformVendor;
	return vendorName;
}
//Returns the platform at the given index of the clGetPlatformIDs list.
//@param platformIndex0 zero-based platform index
//@param pErrNum optional; when not NULL it receives the status of the last
//       OpenCL query (or CL_OUT_OF_HOST_MEMORY when the list allocation fails)
//@return the platform id, or 0 when the index is out of range or a query failed
cl_platform_id btOpenCLUtils_getPlatform(int platformIndex0, cl_int* pErrNum)
{
	cl_platform_id platform = 0;
	cl_uint numPlatforms = 0;

	cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms);

	//the count is only meaningful when the first query succeeded
	if (ciErrNum == CL_SUCCESS && platformIndex0 >= 0 && (cl_uint)platformIndex0 < numPlatforms)
	{
		cl_platform_id* platforms = (cl_platform_id*) malloc (sizeof(cl_platform_id)*numPlatforms);
		if (platforms == NULL)
		{
			ciErrNum = CL_OUT_OF_HOST_MEMORY;
		} else
		{
			ciErrNum = clGetPlatformIDs(numPlatforms, platforms, NULL);
			if (ciErrNum == CL_SUCCESS)
				platform = platforms[platformIndex0];
			//released on every path; the error path used to leak the list
			free (platforms);
		}
	}

	if(pErrNum != NULL)
		*pErrNum = ciErrNum;

	return platform;
}
//Fills platformInfo with the vendor, name and version strings of the platform.
//Each query is checked with oclCHECKERROR against CL_SUCCESS.
void btOpenCLUtils::getPlatformInfo(cl_platform_id platform, btOpenCLPlatformInfo* platformInfo)
{
	//the three string queries have the same shape, so run them from a table
	const cl_platform_info queries[3] = { CL_PLATFORM_VENDOR, CL_PLATFORM_NAME, CL_PLATFORM_VERSION };
	void* const results[3] = { platformInfo->m_platformVendor, platformInfo->m_platformName, platformInfo->m_platformVersion };

	for (int q = 0; q < 3; q++)
	{
		cl_int ciErrNum = clGetPlatformInfo( platform, queries[q], BT_MAX_STRING_LENGTH, results[q], NULL);
		oclCHECKERROR(ciErrNum,CL_SUCCESS);
	}
}
//Queries the vendor, name and version of the platform and prints them to stdout.
void btOpenCLUtils_printPlatformInfo(cl_platform_id platform)
{
	btOpenCLPlatformInfo info;
	btOpenCLUtils::getPlatformInfo (platform, &info);

	printf("Platform info:\n");
	printf(" CL_PLATFORM_VENDOR: \t\t\t%s\n", info.m_platformVendor);
	printf(" CL_PLATFORM_NAME: \t\t\t%s\n", info.m_platformName);
	printf(" CL_PLATFORM_VERSION: \t\t\t%s\n", info.m_platformVersion);
}
//Creates an OpenCL context on the given platform.
//@param platform platform to use; when NULL no context properties are passed
//@param deviceType device type filter handed to clGetDeviceIDs
//@param pErrNum optional; when not NULL it receives the resulting status on
//       every path (the early failure returns used to leave it unwritten)
//@param pGLContext, pGLDC optional OpenGL context / device context; on _WIN32,
//       when both are given, they are added as sharing properties. When
//       pGLContext is given, the devices are tried one by one until a context
//       can be created.
//@param preferredDeviceIndex when within range, only that device is used;
//       otherwise the context spans all devices found
//@param preferredPlatformIndex not used by this function
//@return the context, or 0 on failure
cl_context btOpenCLUtils_createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLContext, void* pGLDC, int preferredDeviceIndex, int preferredPlatformIndex)
{
	cl_context retContext = 0;
	cl_int ciErrNum=0;
	cl_uint num_entries;
	cl_device_id devices[BT_MAX_CL_DEVICES];
	cl_uint num_devices = 0;
	cl_context_properties* cprops;

	/*
	 * If we could find our platform, use it. Otherwise pass a NULL and get whatever the
	 * implementation thinks we should be using.
	 */
	cl_context_properties cps[7] = {0,0,0,0,0,0,0};
	cps[0] = CL_CONTEXT_PLATFORM;
	cps[1] = (cl_context_properties)platform;
#ifdef _WIN32
	if (pGLContext && pGLDC)
	{
		cps[2] = CL_GL_CONTEXT_KHR;
		cps[3] = (cl_context_properties)pGLContext;
		cps[4] = CL_WGL_HDC_KHR;
		cps[5] = (cl_context_properties)pGLDC;
	}
#endif //_WIN32

	num_entries = BT_MAX_CL_DEVICES;
	ciErrNum = clGetDeviceIDs(
		platform,
		deviceType,
		num_entries,
		devices,
		&num_devices);

	if (ciErrNum<0)
	{
		printf("clGetDeviceIDs returned %d\n",ciErrNum);
		if (pErrNum != NULL)
			*pErrNum = ciErrNum;
		return 0;
	}

	cprops = (NULL == platform) ? NULL : cps;

	if (!num_devices)
	{
		if (pErrNum != NULL)
			*pErrNum = CL_DEVICE_NOT_FOUND;
		return 0;
	}

	//clGetDeviceIDs reports how many devices match, which can be more than the
	//number of ids written; never use more entries than the array holds
	if (num_devices > num_entries)
		num_devices = num_entries;

	if (pGLContext)
	{
		//search for the GPU that relates to the OpenCL context
		unsigned int i;
		for (i=0;i<num_devices;i++)
		{
			retContext = clCreateContext(cprops,1,&devices[i],NULL,NULL,&ciErrNum);
			if (ciErrNum==CL_SUCCESS)
				break;
		}
	}
	else
	{
		if (preferredDeviceIndex>=0 && (unsigned int)preferredDeviceIndex<num_devices)
		{
			//create a context of the preferred device index
			retContext = clCreateContext(cprops,1,&devices[preferredDeviceIndex],NULL,NULL,&ciErrNum);
		} else
		{
			//create a context of all devices
#if defined (__APPLE__)
			retContext = clCreateContext(cprops,num_devices,devices,MyFatalBreakAPPLE,NULL,&ciErrNum);
#else
			printf("numDevices=%d\n",num_devices);
			retContext = clCreateContext(cprops,num_devices,devices,NULL,NULL,&ciErrNum);
#endif
		}
	}

	if(pErrNum != NULL)
	{
		*pErrNum = ciErrNum;
	}

	return retContext;
}
//Creates an OpenCL context for the given device type on the first platform
//that yields one.
//The platform list is reordered first: the platform at preferredPlatformIndex
//(when >= 0 and reached) or, failing that, a platform whose vendor string
//equals spPlatformVendor is moved to the front. The platforms are then tried
//in order.
//@param pErrNum optional; receives the status of a failing query, or the
//       status reported by the last context creation attempt
//@param retPlatformId optional; receives the platform the context was created on
//@return the context, or 0/NULL on failure
cl_context btOpenCLUtils_createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLContext, void* pGLDC , int preferredDeviceIndex, int preferredPlatformIndex, cl_platform_id* retPlatformId)
{
	cl_uint numPlatforms = 0;
	cl_context retContext = 0;
	unsigned int i;
	cl_platform_id* platforms;

	cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms);
	if(ciErrNum != CL_SUCCESS)
	{
		if(pErrNum != NULL) *pErrNum = ciErrNum;
		return NULL;
	}

	if (numPlatforms == 0)
		return retContext;

	platforms = (cl_platform_id*) malloc (sizeof(cl_platform_id)*numPlatforms);
	if (platforms == NULL)
	{
		if(pErrNum != NULL) *pErrNum = CL_OUT_OF_HOST_MEMORY;
		return NULL;
	}

	ciErrNum = clGetPlatformIDs(numPlatforms, platforms, NULL);
	if(ciErrNum != CL_SUCCESS)
	{
		if(pErrNum != NULL)
			*pErrNum = ciErrNum;
		free(platforms);
		return NULL;
	}

	for ( i = 0; i < numPlatforms; ++i)
	{
		char pbuf[128];
		ciErrNum = clGetPlatformInfo( platforms[i],
			CL_PLATFORM_VENDOR,
			sizeof(pbuf),
			pbuf,
			NULL);
		if(ciErrNum != CL_SUCCESS)
		{
			if(pErrNum != NULL) *pErrNum = ciErrNum;
			//this return used to leak the platform list
			free(platforms);
			return NULL;
		}

		if (preferredPlatformIndex>=0 && i==(unsigned int)preferredPlatformIndex)
		{
			cl_platform_id tmpPlatform = platforms[0];
			platforms[0] = platforms[i];
			platforms[i] = tmpPlatform;
			break;
		} else
		{
			if(!strcmp(pbuf, spPlatformVendor))
			{
				cl_platform_id tmpPlatform = platforms[0];
				platforms[0] = platforms[i];
				platforms[i] = tmpPlatform;
			}
		}
	}

	for (i = 0; i < numPlatforms; ++i)
	{
		cl_platform_id platform = platforms[i];
		assert(platform);

		retContext = btOpenCLUtils_createContextFromPlatform(platform,deviceType,pErrNum,pGLContext,pGLDC,preferredDeviceIndex,preferredPlatformIndex);

		if (retContext)
		{
			// printf("OpenCL platform details:\n");
			btOpenCLPlatformInfo platformInfo;

			btOpenCLUtils::getPlatformInfo(platform, &platformInfo);

			if (retPlatformId)
				*retPlatformId = platform;

			break;
		}
	}

	free (platforms);
	return retContext;
}
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the nth device from the context
//!
//! @return the id or -1 when the index is out of range or the device list
//!         could not be obtained
//! @param cxMainContext OpenCL context
//! @param deviceIndex index of the device of interest
//////////////////////////////////////////////////////////////////////////////
cl_device_id btOpenCLUtils_getDevice(cl_context cxMainContext, int deviceIndex)
{
	assert(cxMainContext);

	size_t szParmDataBytes = 0;
	cl_device_id* cdDevices;
	cl_device_id device ;

	// get the size of the list of devices associated with context
	if (clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes) != CL_SUCCESS)
	{
		return (cl_device_id)-1;
	}

	// valid indices are 0 .. count-1; the former '<' test let deviceIndex == count
	// through and read one element past the end of the list
	if( deviceIndex < 0 || (size_t)deviceIndex >= szParmDataBytes / sizeof(cl_device_id) ) {
		return (cl_device_id)-1;
	}

	cdDevices = (cl_device_id*) malloc(szParmDataBytes);
	if (cdDevices == NULL)
	{
		return (cl_device_id)-1;
	}

	if (clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL) != CL_SUCCESS)
	{
		free(cdDevices);
		return (cl_device_id)-1;
	}

	device = cdDevices[deviceIndex];
	free(cdDevices);

	return device;
}
//Returns the number of devices associated with the context, or 0 when the
//device list size could not be queried (the byte count used to be read
//uninitialised in that case).
int btOpenCLUtils_getNumDevices(cl_context cxMainContext)
{
	size_t szParamDataBytes = 0;
	int device_count;

	if (clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, 0, NULL, &szParamDataBytes) != CL_SUCCESS)
	{
		return 0;
	}

	device_count = (int) (szParamDataBytes / sizeof(cl_device_id));
	return device_count;
}
//Fills *info by issuing one clGetDeviceInfo query per member.
//The return codes of the individual queries are not checked, so a member whose
//query fails keeps whatever value it had before the call.
void btOpenCLUtils::getDeviceInfo(cl_device_id device, btOpenCLDeviceInfo* info)
{
// CL_DEVICE_NAME
clGetDeviceInfo(device, CL_DEVICE_NAME, BT_MAX_STRING_LENGTH, &info->m_deviceName, NULL);
// CL_DEVICE_VENDOR
clGetDeviceInfo(device, CL_DEVICE_VENDOR, BT_MAX_STRING_LENGTH, &info->m_deviceVendor, NULL);
// CL_DRIVER_VERSION
clGetDeviceInfo(device, CL_DRIVER_VERSION, BT_MAX_STRING_LENGTH, &info->m_driverVersion, NULL);
// CL_DEVICE_TYPE
clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(cl_device_type), &info->m_deviceType, NULL);
// CL_DEVICE_MAX_COMPUTE_UNITS
clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(info->m_computeUnits), &info->m_computeUnits, NULL);
// CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(info->m_workitemDims), &info->m_workitemDims, NULL);
// CL_DEVICE_MAX_WORK_ITEM_SIZES
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(info->m_workItemSize), &info->m_workItemSize, NULL);
// CL_DEVICE_MAX_WORK_GROUP_SIZE
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(info->m_workgroupSize), &info->m_workgroupSize, NULL);
// CL_DEVICE_MAX_CLOCK_FREQUENCY
clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(info->m_clockFrequency), &info->m_clockFrequency, NULL);
// CL_DEVICE_ADDRESS_BITS
clGetDeviceInfo(device, CL_DEVICE_ADDRESS_BITS, sizeof(info->m_addressBits), &info->m_addressBits, NULL);
// CL_DEVICE_MAX_MEM_ALLOC_SIZE
clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(info->m_maxMemAllocSize), &info->m_maxMemAllocSize, NULL);
// CL_DEVICE_GLOBAL_MEM_SIZE
clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(info->m_globalMemSize), &info->m_globalMemSize, NULL);
// CL_DEVICE_ERROR_CORRECTION_SUPPORT
clGetDeviceInfo(device, CL_DEVICE_ERROR_CORRECTION_SUPPORT, sizeof(info->m_errorCorrectionSupport), &info->m_errorCorrectionSupport, NULL);
// CL_DEVICE_LOCAL_MEM_TYPE
clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof(info->m_localMemType), &info->m_localMemType, NULL);
// CL_DEVICE_LOCAL_MEM_SIZE
clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(info->m_localMemSize), &info->m_localMemSize, NULL);
// CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE
clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(info->m_constantBufferSize), &info->m_constantBufferSize, NULL);
// CL_DEVICE_QUEUE_PROPERTIES
clGetDeviceInfo(device, CL_DEVICE_QUEUE_PROPERTIES, sizeof(info->m_queueProperties), &info->m_queueProperties, NULL);
// CL_DEVICE_IMAGE_SUPPORT
clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(info->m_imageSupport), &info->m_imageSupport, NULL);
// CL_DEVICE_MAX_READ_IMAGE_ARGS
clGetDeviceInfo(device, CL_DEVICE_MAX_READ_IMAGE_ARGS, sizeof(info->m_maxReadImageArgs), &info->m_maxReadImageArgs, NULL);
// CL_DEVICE_MAX_WRITE_IMAGE_ARGS
clGetDeviceInfo(device, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, sizeof(info->m_maxWriteImageArgs), &info->m_maxWriteImageArgs, NULL);
// CL_DEVICE_IMAGE2D_MAX_WIDTH, CL_DEVICE_IMAGE2D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_WIDTH, CL_DEVICE_IMAGE3D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_DEPTH
// (these five queries pass sizeof(size_t) rather than the size of the member)
clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &info->m_image2dMaxWidth, NULL);
clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &info->m_image2dMaxHeight, NULL);
clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof(size_t), &info->m_image3dMaxWidth, NULL);
clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof(size_t), &info->m_image3dMaxHeight, NULL);
clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof(size_t), &info->m_image3dMaxDepth, NULL);
// CL_DEVICE_EXTENSIONS: the extension string is stored as-is; it is not parsed here
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, BT_MAX_STRING_LENGTH, &info->m_deviceExtensions, NULL);
// CL_DEVICE_PREFERRED_VECTOR_WIDTH_<type>
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, sizeof(cl_uint), &info->m_vecWidthChar, NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, sizeof(cl_uint), &info->m_vecWidthShort, NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), &info->m_vecWidthInt, NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, sizeof(cl_uint), &info->m_vecWidthLong, NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, sizeof(cl_uint), &info->m_vecWidthFloat, NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof(cl_uint), &info->m_vecWidthDouble, NULL);
}
//Queries the device with btOpenCLUtils::getDeviceInfo and prints every member
//to stdout, one property per line.
//NOTE(review): most numeric members are printed with %u. The image dimension
//members are queried with sizeof(size_t) in getDeviceInfo, so their type may be
//wider than unsigned int - confirm the member types against the format strings.
void btOpenCLUtils_printDeviceInfo(cl_device_id device)
{
btOpenCLDeviceInfo info;
btOpenCLUtils::getDeviceInfo(device,&info);
printf("Device Info:\n");
printf(" CL_DEVICE_NAME: \t\t\t%s\n", info.m_deviceName);
printf(" CL_DEVICE_VENDOR: \t\t\t%s\n", info.m_deviceVendor);
printf(" CL_DRIVER_VERSION: \t\t\t%s\n", info.m_driverVersion);
//the device type is a bit field, so more than one of these lines can be printed
if( info.m_deviceType & CL_DEVICE_TYPE_CPU )
printf(" CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_CPU");
if( info.m_deviceType & CL_DEVICE_TYPE_GPU )
printf(" CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_GPU");
if( info.m_deviceType & CL_DEVICE_TYPE_ACCELERATOR )
printf(" CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR");
if( info.m_deviceType & CL_DEVICE_TYPE_DEFAULT )
printf(" CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT");
printf(" CL_DEVICE_MAX_COMPUTE_UNITS:\t\t%u\n", info.m_computeUnits);
printf(" CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:\t%u\n", info.m_workitemDims);
printf(" CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", info.m_workItemSize[0], info.m_workItemSize[1], info.m_workItemSize[2]);
printf(" CL_DEVICE_MAX_WORK_GROUP_SIZE:\t%u\n", info.m_workgroupSize);
printf(" CL_DEVICE_MAX_CLOCK_FREQUENCY:\t%u MHz\n", info.m_clockFrequency);
printf(" CL_DEVICE_ADDRESS_BITS:\t\t%u\n", info.m_addressBits);
//memory sizes are converted from bytes to MByte / KByte before printing
printf(" CL_DEVICE_MAX_MEM_ALLOC_SIZE:\t\t%u MByte\n", (unsigned int)(info.m_maxMemAllocSize/ (1024 * 1024)));
printf(" CL_DEVICE_GLOBAL_MEM_SIZE:\t\t%u MByte\n", (unsigned int)(info.m_globalMemSize/ (1024 * 1024)));
printf(" CL_DEVICE_ERROR_CORRECTION_SUPPORT:\t%s\n", info.m_errorCorrectionSupport== CL_TRUE ? "yes" : "no");
//a local memory type of 1 is reported as "local", every other value as "global"
printf(" CL_DEVICE_LOCAL_MEM_TYPE:\t\t%s\n", info.m_localMemType == 1 ? "local" : "global");
printf(" CL_DEVICE_LOCAL_MEM_SIZE:\t\t%u KByte\n", (unsigned int)(info.m_localMemSize / 1024));
printf(" CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:\t%u KByte\n", (unsigned int)(info.m_constantBufferSize / 1024));
if( info.m_queueProperties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE )
printf(" CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE");
if( info.m_queueProperties & CL_QUEUE_PROFILING_ENABLE )
printf(" CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_PROFILING_ENABLE");
printf(" CL_DEVICE_IMAGE_SUPPORT:\t\t%u\n", info.m_imageSupport);
printf(" CL_DEVICE_MAX_READ_IMAGE_ARGS:\t%u\n", info.m_maxReadImageArgs);
printf(" CL_DEVICE_MAX_WRITE_IMAGE_ARGS:\t%u\n", info.m_maxWriteImageArgs);
printf("\n CL_DEVICE_IMAGE <dim>");
printf("\t\t\t2D_MAX_WIDTH\t %u\n", info.m_image2dMaxWidth);
printf("\t\t\t\t\t2D_MAX_HEIGHT\t %u\n", info.m_image2dMaxHeight);
printf("\t\t\t\t\t3D_MAX_WIDTH\t %u\n", info.m_image3dMaxWidth);
printf("\t\t\t\t\t3D_MAX_HEIGHT\t %u\n", info.m_image3dMaxHeight);
printf("\t\t\t\t\t3D_MAX_DEPTH\t %u\n", info.m_image3dMaxDepth);
//NOTE(review): if m_deviceExtensions is an array member this test is always
//true and the "None" branch is unreachable - TODO confirm the member type
if (info.m_deviceExtensions != 0)
printf("\n CL_DEVICE_EXTENSIONS:%s\n",info.m_deviceExtensions);
else
printf(" CL_DEVICE_EXTENSIONS: None\n");
printf(" CL_DEVICE_PREFERRED_VECTOR_WIDTH_<t>\t");
printf("CHAR %u, SHORT %u, INT %u,LONG %u, FLOAT %u, DOUBLE %u\n\n\n",
info.m_vecWidthChar, info.m_vecWidthShort, info.m_vecWidthInt, info.m_vecWidthLong,info.m_vecWidthFloat, info.m_vecWidthDouble);
}
//Returns a pointer to the part of 'name' that follows the last occurrence of
//'pattern', or 'name' itself when the pattern does not occur. Used to strip
//leading directories from a file name (pattern "/" or "\\").
//An empty pattern returns 'name' unchanged; it used to loop forever because
//strstr matches the empty string without advancing.
static const char* strip2(const char* name, const char* pattern)
{
	size_t const patlen = strlen(pattern);
	const char* oriptr = name;
	const char* patloc;

	if (patlen == 0)
		return name;

	//skip past each occurrence in turn; oriptr ends up behind the last one
	while ((patloc = strstr(oriptr, pattern)) != NULL)
	{
		oriptr = patloc + patlen;
	}

	return oriptr;
}
cl_program btOpenCLUtils_compileCLProgramFromString(cl_context clContext, cl_device_id device, const char* kernelSourceOrg, cl_int* pErrNum, const char* additionalMacrosArg , const char* clFileNameForCaching)
{
const char* additionalMacros = additionalMacrosArg?additionalMacrosArg:"";
cl_program m_cpProgram=0;
cl_int status;
#ifdef _WIN32
char binaryFileName[BT_MAX_STRING_LENGTH];
char* bla=0;
if (clFileNameForCaching && !(gDebugSkipLoadingBinary||gDebugForceLoadingFromSource) )
{
char deviceName[256];
char driverVersion[256];
const char* strippedName;
int fileUpToDate = 0;
int binaryFileValid=0;
FILETIME modtimeBinary;
clGetDeviceInfo(device, CL_DEVICE_NAME, 256, &deviceName, NULL);
clGetDeviceInfo(device, CL_DRIVER_VERSION, 256, &driverVersion, NULL);
strippedName = strip2(clFileNameForCaching,"\\");
strippedName = strip2(strippedName,"/");
#ifdef _WIN32
sprintf_s(binaryFileName,BT_MAX_STRING_LENGTH,"cache/%s.%s.%s.bin",strippedName, deviceName,driverVersion );
#else
sprintf(binaryFileName,"cache/%s.%s.%s.bin",strippedName, deviceName,driverVersion );
#endif
//printf("searching for %s\n", binaryFileName);
CreateDirectory("cache",0);
{
HANDLE binaryFileHandle = CreateFile(binaryFileName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
if (binaryFileHandle ==INVALID_HANDLE_VALUE)
{
DWORD errorCode;
errorCode = GetLastError();
switch (errorCode)
{
case ERROR_FILE_NOT_FOUND:
{
printf("\nCached file not found %s\n", binaryFileName);
break;
}
case ERROR_PATH_NOT_FOUND:
{
printf("\nCached file path not found %s\n", binaryFileName);
break;
}
default:
{
printf("\nFailed reading cached file with errorCode = %d\n", errorCode);
}
}
} else
{
if (GetFileTime(binaryFileHandle, NULL, NULL, &modtimeBinary)==0)
{
DWORD errorCode;
errorCode = GetLastError();
printf("\nGetFileTime errorCode = %d\n", errorCode);
} else
{
binaryFileValid = 1;
}
CloseHandle(binaryFileHandle);
}
if (binaryFileValid)
{
HANDLE srcFileHandle = CreateFile(clFileNameForCaching,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
if (srcFileHandle==INVALID_HANDLE_VALUE)
{
const char* prefix[]={"../","../../","../../../","../../../../"};
for (int i=0;(srcFileHandle==INVALID_HANDLE_VALUE) && i<3;i++)
{
char relativeFileName[1024];
sprintf(relativeFileName,"%s%s",prefix[i],clFileNameForCaching);
srcFileHandle = CreateFile(relativeFileName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
}
}
if (srcFileHandle!=INVALID_HANDLE_VALUE)
{
FILETIME modtimeSrc;
if (GetFileTime(srcFileHandle, NULL, NULL, &modtimeSrc)==0)
{
DWORD errorCode;
errorCode = GetLastError();
printf("\nGetFileTime errorCode = %d\n", errorCode);
}
if ( ( modtimeSrc.dwHighDateTime < modtimeBinary.dwHighDateTime)
||(( modtimeSrc.dwHighDateTime == modtimeBinary.dwHighDateTime)&&(modtimeSrc.dwLowDateTime <= modtimeBinary.dwLowDateTime)))
{
fileUpToDate=1;
} else
{
printf("\nCached binary file out-of-date (%s)\n",binaryFileName);
}
CloseHandle(srcFileHandle);
}
else
{
#ifdef _DEBUG
DWORD errorCode;
errorCode = GetLastError();
switch (errorCode)
{
case ERROR_FILE_NOT_FOUND:
{
printf("\nSrc file not found %s\n", clFileNameForCaching);
break;
}
case ERROR_PATH_NOT_FOUND:
{
printf("\nSrc path not found %s\n", clFileNameForCaching);
break;
}
default:
{
printf("\nnSrc file reading errorCode = %d\n", errorCode);
}
}
//we should make sure the src file exists so we can verify the timestamp with binary
assert(0);
fileUpToDate = false;
#else
//if we cannot find the source, assume it is OK in release builds
fileUpToDate = true;
#endif
}
}
}
if( fileUpToDate)
{
#ifdef _WIN32
FILE* file;
if (fopen_s(&file,binaryFileName, "rb")!=0)
file=0;
#else
FILE* file = fopen(binaryFileName, "rb");
#endif
if (file)
{
size_t binarySize=0;
char* binary =0;
fseek( file, 0L, SEEK_END );
binarySize = ftell( file );
rewind( file );
binary = (char*)malloc(sizeof(char)*binarySize);
fread( binary, sizeof(char), binarySize, file );
fclose( file );
m_cpProgram = clCreateProgramWithBinary( clContext, 1,&device, &binarySize, (const unsigned char**)&binary, 0, &status );
btAssert( status == CL_SUCCESS );
status = clBuildProgram( m_cpProgram, 1, &device, additionalMacros, 0, 0 );
btAssert( status == CL_SUCCESS );
if( status != CL_SUCCESS )
{
char *build_log;
size_t ret_val_size;
clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
build_log = (char*)malloc(sizeof(char)*(ret_val_size+1));
clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
build_log[ret_val_size] = '\0';
printf("%s\n", build_log);
free (build_log);
btAssert(0);
m_cpProgram = 0;
}
free (binary);
}
}
}
#endif //_WIN32
if (!m_cpProgram)
{
cl_int localErrNum;
char* compileFlags;
int flagsize;
const char* kernelSource = kernelSourceOrg;
if (!kernelSourceOrg || gDebugForceLoadingFromSource)
{
if (clFileNameForCaching)
{
FILE* file = fopen(clFileNameForCaching, "rb");
//in many cases the relative path is a few levels up the directory hierarchy, so try it
if (!file)
{
const char* prefix[]={"../","../../","../../../","../../../../"};
for (int i=0;!file && i<3;i++)
{
char relativeFileName[1024];
sprintf(relativeFileName,"%s%s",prefix[i],clFileNameForCaching);
file = fopen(relativeFileName, "rb");
}
}
if (file)
{
char* kernelSrc=0;
fseek( file, 0L, SEEK_END );
int kernelSize = ftell( file );
rewind( file );
kernelSrc = (char*)malloc(kernelSize+1);
int readBytes = fread((void*)kernelSrc,1,kernelSize, file);
kernelSrc[kernelSize] = 0;
fclose(file);
kernelSource = kernelSrc;
}
}
}
size_t program_length = kernelSource ? strlen(kernelSource) : 0;
#ifdef MAC //or __APPLE__?
char* flags = "-cl-mad-enable -DMAC -DGUID_ARG";
#else
//const char* flags = "-DGUID_ARG= -fno-alias";
const char* flags = "-DGUID_ARG= ";
#endif
m_cpProgram = clCreateProgramWithSource(clContext, 1, (const char**)&kernelSource, &program_length, &localErrNum);
if (localErrNum!= CL_SUCCESS)
{
if (pErrNum)
*pErrNum = localErrNum;
return 0;
}
// Build the program with 'mad' Optimization option
flagsize = sizeof(char)*(strlen(additionalMacros) + strlen(flags) + 5);
compileFlags = (char*) malloc(flagsize);
#ifdef _WIN32
sprintf_s(compileFlags,flagsize, "%s %s", flags, additionalMacros);
#else
sprintf(compileFlags, "%s %s", flags, additionalMacros);
#endif
localErrNum = clBuildProgram(m_cpProgram, 1, &device, compileFlags, NULL, NULL);
if (localErrNum!= CL_SUCCESS)
{
char *build_log;
size_t ret_val_size;
clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
build_log = (char*) malloc(sizeof(char)*(ret_val_size+1));
clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
// to be carefully, terminate with \0
// there's no information in the reference whether the string is 0 terminated or not
build_log[ret_val_size] = '\0';
printf("Error in clBuildProgram, Line %u in file %s, Log: \n%s\n !!!\n\n", __LINE__, __FILE__, build_log);
free (build_log);
if (pErrNum)
*pErrNum = localErrNum;
return 0;
}
#ifdef _WIN32
if( clFileNameForCaching )
{ // write to binary
cl_uint numAssociatedDevices;
status = clGetProgramInfo( m_cpProgram, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &numAssociatedDevices, 0 );
btAssert( status == CL_SUCCESS );
if (numAssociatedDevices==1)
{
size_t binarySize;
char* binary ;
status = clGetProgramInfo( m_cpProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binarySize, 0 );
btAssert( status == CL_SUCCESS );
binary = (char*)malloc(sizeof(char)*binarySize);
status = clGetProgramInfo( m_cpProgram, CL_PROGRAM_BINARIES, sizeof(char*), &binary, 0 );
btAssert( status == CL_SUCCESS );
{
FILE* file=0;
#ifdef _WIN32
if (fopen_s(&file,binaryFileName, "wb")!=0)
file=0;
#else
file = fopen(binaryFileName, "wb");
#endif
if (file)
{
fwrite( binary, sizeof(char), binarySize, file );
fclose( file );
} else
{
printf("cannot write file %s\n", binaryFileName);
}
}
free (binary);
}
}
#endif //_WIN32
free(compileFlags);
}
return m_cpProgram;
}
///Create the kernel named kernelName.
///
///When prog is 0 a temporary program is compiled from kernelSource (with additionalMacros as
///extra build options) and released again before returning; otherwise the kernel is created
///from prog, which stays owned by the caller.
///pErrNum is optional. It receives CL_SUCCESS on success, the clCreateKernel error when the
///kernel cannot be created, or the error reported by the compile step when compilation fails.
///Returns the kernel, or 0 on failure.
cl_kernel btOpenCLUtils_compileCLKernelFromString(cl_context clContext, cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum, cl_program prog, const char* additionalMacros )
{
	cl_kernel kernel;
	cl_int localErrNum;
	cl_program m_cpProgram = prog;

	printf("compiling kernel %s ",kernelName);

	if (!m_cpProgram)
	{
		m_cpProgram = btOpenCLUtils_compileCLProgramFromString(clContext,device,kernelSource,pErrNum, additionalMacros,0);
		if (!m_cpProgram)
		{
			//The compile step already stored its error code in pErrNum; creating a kernel from a
			//null program would only replace it with a misleading "cannot find kernel" error.
			assert(0);
			return 0;
		}
	}

	// Create the kernel
	kernel = clCreateKernel(m_cpProgram, kernelName, &localErrNum);
	if (localErrNum != CL_SUCCESS)
	{
		printf("Error in clCreateKernel, Line %u in file %s, cannot find kernel function %s !!!\n\n", __LINE__, __FILE__, kernelName);
		//do not leak the temporary program on the error path
		if (!prog)
		{
			clReleaseProgram(m_cpProgram);
		}
		assert(0);
		if (pErrNum)
			*pErrNum = localErrNum;
		return 0;
	}

	//the program was only needed to create the kernel; a caller supplied program is left alone
	if (!prog)
	{
		clReleaseProgram(m_cpProgram);
	}
	printf("ready. \n");

	if (pErrNum)
		*pErrNum = CL_SUCCESS;
	return kernel;
}

View File

@@ -0,0 +1,179 @@
/*
Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
Copyright (C) 2006 - 2011 Sony Computer Entertainment Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//original author: Roman Ponomarev
//cleanup by Erwin Coumans
#ifndef BT_OPENCL_UTILS_H
#define BT_OPENCL_UTILS_H
#include "btOpenCLInclude.h"
#ifdef __cplusplus
extern "C" {
#endif
///C API for OpenCL utilities: convenience functions, see below for C++ API
///Functions that take a cl_int* pErrNum report an OpenCL error code through it.
/// CL Context optionally takes a GL context. This is a generic type because we don't really want this code
/// to have to understand GL types. It is a HGLRC in _WIN32 or a GLXContext otherwise.
/// NOTE(review): preferredDeviceIndex/preferredPlatformIndex default to -1 in the C++ wrappers, presumably meaning "no preference" - confirm in the implementation.
cl_context btOpenCLUtils_createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx , void* pGLDC , int preferredDeviceIndex , int preferredPlatformIndex, cl_platform_id* platformId);
///number of devices in the given context
int btOpenCLUtils_getNumDevices(cl_context cxMainContext);
///get the nr'th device of the context (presumably nr is in the range [0..getNumDevices) - confirm in the implementation)
cl_device_id btOpenCLUtils_getDevice(cl_context cxMainContext, int nr);
///print information about the device
void btOpenCLUtils_printDeviceInfo(cl_device_id device);
///create the kernel called kernelName; when prog is 0 the program is first compiled from kernelSource, otherwise prog is used
cl_kernel btOpenCLUtils_compileCLKernelFromString( cl_context clContext,cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum, cl_program prog,const char* additionalMacros);
//optional
///build a program from kernelSource with additionalMacros appended to the build options.
///srcFileNameForCaching is optional: it names the .cl file, which is read when kernelSource is 0 and,
///on _WIN32, is used to cache the compiled binary under cache/
cl_program btOpenCLUtils_compileCLProgramFromString( cl_context clContext,cl_device_id device, const char* kernelSource, cl_int* pErrNum,const char* additionalMacros , const char* srcFileNameForCaching);
//the following optional APIs provide access using specific platform information
int btOpenCLUtils_getNumPlatforms(cl_int* pErrNum);
///get the nr'th platform, where nr is in the range [0..getNumPlatforms)
cl_platform_id btOpenCLUtils_getPlatform(int nr, cl_int* pErrNum);
///print information about the platform
void btOpenCLUtils_printPlatformInfo(cl_platform_id platform);
///name of the vendor of the OpenCL SDK this code was compiled with
const char* btOpenCLUtils_getSdkVendorName();
///same as btOpenCLUtils_createContextFromType, but for an explicitly chosen platform
cl_context btOpenCLUtils_createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx , void* pGLDC ,int preferredDeviceIndex , int preferredPlatformIndex);
#ifdef __cplusplus
}
///size in bytes of the fixed string buffers used by the info structs below
#define BT_MAX_STRING_LENGTH 1024
///Plain data record describing one OpenCL device; it is the output argument of btOpenCLUtils::getDeviceInfo.
///NOTE(review): the field names suggest that each member mirrors one CL_DEVICE_* query of clGetDeviceInfo - confirm in getDeviceInfo.
typedef struct
{
//string properties, stored in fixed size buffers
char m_deviceName[BT_MAX_STRING_LENGTH];
char m_deviceVendor[BT_MAX_STRING_LENGTH];
char m_driverVersion[BT_MAX_STRING_LENGTH];
char m_deviceExtensions[BT_MAX_STRING_LENGTH];
cl_device_type m_deviceType;
cl_uint m_computeUnits;
size_t m_workitemDims;
//one entry per work-item dimension (3 entries)
size_t m_workItemSize[3];
size_t m_image2dMaxWidth;
size_t m_image2dMaxHeight;
size_t m_image3dMaxWidth;
size_t m_image3dMaxHeight;
size_t m_image3dMaxDepth;
size_t m_workgroupSize;
cl_uint m_clockFrequency;
cl_ulong m_constantBufferSize;
cl_ulong m_localMemSize;
cl_ulong m_globalMemSize;
cl_bool m_errorCorrectionSupport;
cl_device_local_mem_type m_localMemType;
cl_uint m_maxReadImageArgs;
cl_uint m_maxWriteImageArgs;
cl_uint m_addressBits;
cl_ulong m_maxMemAllocSize;
cl_command_queue_properties m_queueProperties;
cl_bool m_imageSupport;
//vector widths, one per scalar type
cl_uint m_vecWidthChar;
cl_uint m_vecWidthShort;
cl_uint m_vecWidthInt;
cl_uint m_vecWidthLong;
cl_uint m_vecWidthFloat;
cl_uint m_vecWidthDouble;
} btOpenCLDeviceInfo;
///Plain data record with the descriptive strings of one OpenCL platform;
///it is the output argument of btOpenCLUtils::getPlatformInfo.
typedef struct
{
//each string is stored in a fixed buffer of BT_MAX_STRING_LENGTH bytes
char m_platformVendor[BT_MAX_STRING_LENGTH];
char m_platformName[BT_MAX_STRING_LENGTH];
char m_platformVersion[BT_MAX_STRING_LENGTH];
} btOpenCLPlatformInfo;
///C++ API for OpenCL utilities: convenience functions
///Every inline member forwards its arguments unchanged to the btOpenCLUtils_* C function of the
///same name and only adds default arguments. getDeviceInfo and getPlatformInfo are declared here
///and defined elsewhere.
struct btOpenCLUtils
{
	///Create a context for the given device type.
	/// CL Context optionally takes a GL context. This is a generic type because we don't really want this code
	/// to have to understand GL types. It is a HGLRC in _WIN32 or a GLXContext otherwise.
	static inline cl_context createContextFromType(
		cl_device_type deviceType,
		cl_int* pErrNum,
		void* pGLCtx = 0,
		void* pGLDC = 0,
		int preferredDeviceIndex = -1,
		int preferredPlatformIndex = -1,
		cl_platform_id* platformId = 0)
	{
		return btOpenCLUtils_createContextFromType(deviceType, pErrNum, pGLCtx, pGLDC, preferredDeviceIndex, preferredPlatformIndex, platformId);
	}

	///number of devices in the context
	static inline int getNumDevices(cl_context cxMainContext)
	{
		return btOpenCLUtils_getNumDevices(cxMainContext);
	}

	///get the nr'th device of the context
	static inline cl_device_id getDevice(cl_context cxMainContext, int nr)
	{
		return btOpenCLUtils_getDevice(cxMainContext, nr);
	}

	///fill 'info' with the properties of 'device'
	static void getDeviceInfo(cl_device_id device, btOpenCLDeviceInfo* info);

	///print the properties of 'device'
	static inline void printDeviceInfo(cl_device_id device)
	{
		btOpenCLUtils_printDeviceInfo(device);
	}

	///create the kernel called kernelName, compiling kernelSource first when no program is passed in
	static inline cl_kernel compileCLKernelFromString(
		cl_context clContext,
		cl_device_id device,
		const char* kernelSource,
		const char* kernelName,
		cl_int* pErrNum = 0,
		cl_program prog = 0,
		const char* additionalMacros = "")
	{
		return btOpenCLUtils_compileCLKernelFromString(clContext, device, kernelSource, kernelName, pErrNum, prog, additionalMacros);
	}

	//optional
	///build a whole program; srcFileNameForCaching optionally names the .cl file the source comes from
	static inline cl_program compileCLProgramFromString(
		cl_context clContext,
		cl_device_id device,
		const char* kernelSource,
		cl_int* pErrNum = 0,
		const char* additionalMacros = "",
		const char* srcFileNameForCaching = 0)
	{
		return btOpenCLUtils_compileCLProgramFromString(clContext, device, kernelSource, pErrNum, additionalMacros, srcFileNameForCaching);
	}

	//the following optional APIs provide access using specific platform information
	static inline int getNumPlatforms(cl_int* pErrNum = 0)
	{
		return btOpenCLUtils_getNumPlatforms(pErrNum);
	}

	///get the nr'th platform, where nr is in the range [0..getNumPlatforms)
	static inline cl_platform_id getPlatform(int nr, cl_int* pErrNum = 0)
	{
		return btOpenCLUtils_getPlatform(nr, pErrNum);
	}

	///fill 'platformInfo' with the descriptive strings of 'platform'
	static void getPlatformInfo(cl_platform_id platform, btOpenCLPlatformInfo* platformInfo);

	///print the descriptive strings of 'platform'
	static inline void printPlatformInfo(cl_platform_id platform)
	{
		btOpenCLUtils_printPlatformInfo(platform);
	}

	///name of the vendor of the OpenCL SDK this code was compiled with
	static inline const char* getSdkVendorName()
	{
		return btOpenCLUtils_getSdkVendorName();
	}

	///same as createContextFromType, but for an explicitly chosen platform
	static inline cl_context createContextFromPlatform(
		cl_platform_id platform,
		cl_device_type deviceType,
		cl_int* pErrNum,
		void* pGLCtx = 0,
		void* pGLDC = 0,
		int preferredDeviceIndex = -1,
		int preferredPlatformIndex = -1)
	{
		return btOpenCLUtils_createContextFromPlatform(platform, deviceType, pErrNum, pGLCtx, pGLDC, preferredDeviceIndex, preferredPlatformIndex);
	}
};
#endif //__cplusplus
#endif // BT_OPENCL_UTILS_H

View File

@@ -0,0 +1,98 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
///original author: Erwin Coumans
#include "btOpenCLUtils.h"
#include <stdio.h>
///context that main() creates with btOpenCLUtils::createContextFromType and releases before returning
cl_context g_cxMainContext;
///command queue that main() creates and releases again for each device of g_cxMainContext
cl_command_queue g_cqCommandQue;
///Print the properties of every OpenCL platform and device, then show the shorter
///createContextFromType route by creating a GPU context and one command queue per device.
///Always returns 0.
int main(int argc, char* argv[])
{
	int ciErrNum = 0;

	cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
	const char* vendorSDK = btOpenCLUtils::getSdkVendorName();

	printf("This program was compiled using the %s OpenCL SDK\n",vendorSDK);
	int numPlatforms = btOpenCLUtils::getNumPlatforms();
	printf("Num Platforms = %d\n", numPlatforms);

	for (int i=0;i<numPlatforms;i++)
	{
		cl_platform_id platform = btOpenCLUtils::getPlatform(i);
		btOpenCLPlatformInfo platformInfo;
		btOpenCLUtils::getPlatformInfo(platform,&platformInfo);
		printf("--------------------------------\n");
		printf("Platform info for platform nr %d:\n",i);
		printf("  CL_PLATFORM_VENDOR: \t\t\t%s\n",platformInfo.m_platformVendor);
		printf("  CL_PLATFORM_NAME: \t\t\t%s\n",platformInfo.m_platformName);
		printf("  CL_PLATFORM_VERSION: \t\t\t%s\n",platformInfo.m_platformVersion);

		cl_context context = btOpenCLUtils::createContextFromPlatform(platform,deviceType,&ciErrNum);
		//a platform without a usable context has no devices to list; do not query or release a null context
		if (!context)
		{
			printf("Cannot create an OpenCL context for platform nr %d (error %d)\n",i,ciErrNum);
			continue;
		}

		int numDevices = btOpenCLUtils::getNumDevices(context);
		printf("Num Devices = %d\n", numDevices);
		for (int j=0;j<numDevices;j++)
		{
			cl_device_id dev = btOpenCLUtils::getDevice(context,j);
			btOpenCLDeviceInfo devInfo;
			btOpenCLUtils::getDeviceInfo(dev,&devInfo);
			btOpenCLUtils::printDeviceInfo(dev);
		}

		clReleaseContext(context);
	}

	///Easier method to initialize OpenCL using createContextFromType for a GPU
	deviceType = CL_DEVICE_TYPE_GPU;

	void* glCtx=0;
	void* glDC = 0;
	printf("Initialize OpenCL using btOpenCLUtils::createContextFromType for CL_DEVICE_TYPE_GPU\n");
	g_cxMainContext = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, glCtx, glDC);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);

	if (g_cxMainContext)
	{
		int numDev = btOpenCLUtils::getNumDevices(g_cxMainContext);

		for (int i=0;i<numDev;i++)
		{
			cl_device_id device;
			device = btOpenCLUtils::getDevice(g_cxMainContext,i);
			btOpenCLDeviceInfo clInfo;
			btOpenCLUtils::getDeviceInfo(device,&clInfo);
			btOpenCLUtils::printDeviceInfo(device);
			// create a command-queue
			g_cqCommandQue = clCreateCommandQueue(g_cxMainContext, device, 0, &ciErrNum);
			oclCHECKERROR(ciErrNum, CL_SUCCESS);
			//normally you would create and execute kernels using this command queue

			clReleaseCommandQueue(g_cqCommandQue);
		}

		clReleaseContext(g_cxMainContext);
	}
	else {
		printf("No OpenCL capable GPU found!");
	}
	return 0;
}

View File

@@ -0,0 +1,28 @@
--- Define the "OpenCL_intialize_<vendor>" console application.
-- Nothing is emitted when findOpenCL(vendor) reports that the vendor's OpenCL SDK is not available.
-- @param vendor SDK vendor name, used for the SDK lookup and as project name suffix
function createProject(vendor)
	-- keep the probe result local: a bare assignment would create a global 'hasCL'
	-- shared with every other build script
	local hasCL = findOpenCL(vendor)

	if (hasCL) then
		project ("OpenCL_intialize_" .. vendor)

		-- adds the vendor specific defines, include dirs and libraries
		initOpenCL(vendor)

		language "C++"
		kind "ConsoleApp"
		targetdir "../../bin"

		files {
			"main.cpp",
			"btOpenCLUtils.cpp",
			"btOpenCLUtils.h"
		}
	end
end
-- Emit one project per OpenCL SDK vendor, in this fixed order; createProject
-- itself skips vendors whose SDK is not found.
for _, vendor in ipairs({ "Apple", "AMD", "Intel", "NVIDIA" }) do
	createProject(vendor)
end

View File

@@ -0,0 +1,171 @@
// Edge length of the square tile handled by one work-group; also the size of the __local tile arrays below.
#define TILE_DIM 32
// Step of the per-work-item row loops (for j = 0; j < TILE_DIM; j += BLOCK_ROWS) in the kernels below.
#define BLOCK_ROWS 8
/*// simple copy kernel (CUDA)
// Used as reference case representing best effective bandwidth.
__global__ void copy(float *odata, const float *idata)
{
int x = blockIdx.x * TILE_DIM + threadIdx.x;
int y = blockIdx.y * TILE_DIM + threadIdx.y;
int width = gridDim.x * TILE_DIM;
for (int j = 0; j < TILE_DIM; j+= BLOCK_ROWS)
odata[(y+j)*width + x] = idata[(y+j)*width + x];
}
*/
// simple copy kernel (OpenCL)
// Used as reference case representing best effective bandwidth.
// Every work-group copies one TILE_DIM x TILE_DIM tile; each work-item copies the rows
// y, y+get_local_size(1), ... of that tile. Expects a local size of TILE_DIM in dimension 0.
__kernel void copyKernel(__global float* odata, __global const float* idata)
{
	// The tile origin is group index * TILE_DIM (blockIdx * TILE_DIM in the CUDA reference).
	// Multiplying by get_num_groups() only gives the same result while the grid happens to be
	// TILE_DIM groups wide/high.
	int x = get_group_id(0) * TILE_DIM + get_local_id(0);
	int y = get_group_id(1) * TILE_DIM + get_local_id(1);
	int width = get_num_groups(0) * get_local_size(0);

	// walk the TILE_DIM rows of the tile, not get_num_groups(1) rows
	for (int j = 0; j < TILE_DIM; j += get_local_size(1))
	{
		odata[(y+j)*width + x] = idata[(y+j)*width + x];
	}
}
/*
// copy kernel using shared memory (CUDA)
// Also used as reference case, demonstrating effect of using shared memory.
__global__ void copySharedMem(float *odata, const float *idata)
{
__shared__ float tile[TILE_DIM * TILE_DIM];
int x = blockIdx.x * TILE_DIM + threadIdx.x;
int y = blockIdx.y * TILE_DIM + threadIdx.y;
int width = gridDim.x * TILE_DIM;
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
tile[(threadIdx.y+j)*TILE_DIM + threadIdx.x] = idata[(y+j)*width + x];
__syncthreads();
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
odata[(y+j)*width + x] = tile[(threadIdx.y+j)*TILE_DIM + threadIdx.x];
}
*/
// copy kernel using shared memory (OpenCL)
// Also used as reference case, demonstrating effect of using shared memory.
// Every work-group stages one TILE_DIM x TILE_DIM tile in __local memory and writes it back
// unchanged. Expects a local size of TILE_DIM x BLOCK_ROWS.
__kernel void copySharedMemKernel(__global float *odata, __global const float *idata)
{
	__local float tile[TILE_DIM * TILE_DIM];

	// The tile origin is group index * TILE_DIM (blockIdx * TILE_DIM in the CUDA reference).
	// Multiplying by get_num_groups() only gives the same result while the grid happens to be
	// TILE_DIM groups wide/high.
	int x = get_group_id(0) * TILE_DIM + get_local_id(0);
	int y = get_group_id(1) * TILE_DIM + get_local_id(1);
	int width = get_num_groups(0) * get_local_size(0);

	for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
		tile[(get_local_id(1)+j)*TILE_DIM + get_local_id(0)] = idata[(y+j)*width + x];

	barrier(CLK_LOCAL_MEM_FENCE);

	for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
		odata[(y+j)*width + x] = tile[(get_local_id(1)+j)*TILE_DIM + get_local_id(0)];
}
/*
// naive transpose (CUDA)
// Simplest transpose; doesn't use shared memory.
// Global memory reads are coalesced but writes are not.
__global__ void transposeNaive(float *odata, const float *idata)
{
int x = blockIdx.x * TILE_DIM + threadIdx.x;
int y = blockIdx.y * TILE_DIM + threadIdx.y;
int width = gridDim.x * TILE_DIM;
for (int j = 0; j < TILE_DIM; j+= BLOCK_ROWS)
odata[x*width + (y+j)] = idata[(y+j)*width + x];
}
*/
// naive transpose (OpenCL)
// Simplest transpose; doesn't use shared memory.
// Global memory reads are coalesced but writes are not.
// Expects a local size of TILE_DIM x BLOCK_ROWS.
__kernel void transposeNaiveKernel(__global float *odata, __global const float *idata)
{
	// The tile origin is group index * TILE_DIM (blockIdx * TILE_DIM in the CUDA reference).
	// Multiplying by get_num_groups() only gives the same result while the grid happens to be
	// TILE_DIM groups wide/high.
	int x = get_group_id(0) * TILE_DIM + get_local_id(0);
	int y = get_group_id(1) * TILE_DIM + get_local_id(1);
	int width = get_num_groups(0) * get_local_size(0);

	// element (row y+j, column x) of the input goes to (row x, column y+j) of the output
	for (int j = 0; j < TILE_DIM; j+= BLOCK_ROWS)
		odata[x*width + (y+j)] = idata[(y+j)*width + x];
}
/*
// coalesced transpose (CUDA)
// Uses shared memory to achieve coalescing in both reads and writes
// Tile width == #banks causes shared memory bank conflicts.
__global__ void transposeCoalesced(float *odata, const float *idata)
{
__shared__ float tile[TILE_DIM][TILE_DIM];
int x = blockIdx.x * TILE_DIM + threadIdx.x;
int y = blockIdx.y * TILE_DIM + threadIdx.y;
int width = gridDim.x * TILE_DIM;
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];
__syncthreads();
x = blockIdx.y * TILE_DIM + threadIdx.x; // transpose block offset
y = blockIdx.x * TILE_DIM + threadIdx.y;
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];
}
*/
// coalesced transpose (OpenCL)
// Uses shared memory to achieve coalescing in both reads and writes
// Tile width == #banks causes shared memory bank conflicts.
// Expects a local size of TILE_DIM x BLOCK_ROWS.
__kernel void transposeCoalescedKernel(__global float *odata, __global const float *idata)
{
	__local float tile[TILE_DIM][TILE_DIM];

	// The tile origin is group index * TILE_DIM, exactly as for the output tile further down.
	// Multiplying by get_num_groups() only gives the same result while the grid happens to be
	// TILE_DIM groups wide/high.
	int x = get_group_id(0) * TILE_DIM + get_local_id(0);
	int y = get_group_id(1) * TILE_DIM + get_local_id(1);
	int width = get_num_groups(0) * get_local_size(0);

	for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
		tile[get_local_id(1)+j][get_local_id(0)] = idata[(y+j)*width + x];

	barrier(CLK_LOCAL_MEM_FENCE);

	x = get_group_id(1) * TILE_DIM + get_local_id(0);  // transpose block offset
	y = get_group_id(0) * TILE_DIM + get_local_id(1);

	for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
		odata[(y+j)*width + x] = tile[get_local_id(0)][get_local_id(1) + j];
}
// No bank-conflict transpose (OpenCL)
// Same as transposeCoalesced except that each row of the tile is padded by one element
// (TILE_DIM+1 columns) to avoid shared memory bank conflicts.
// Expects a local size of TILE_DIM x BLOCK_ROWS.
__kernel void transposeNoBankConflictsKernel(__global float *odata, __global const float *idata)
{
	__local float tile[TILE_DIM][TILE_DIM+1];

	// The tile origin is group index * TILE_DIM, exactly as for the output tile further down.
	// Multiplying by get_num_groups() only gives the same result while the grid happens to be
	// TILE_DIM groups wide/high.
	int x = get_group_id(0) * TILE_DIM + get_local_id(0);
	int y = get_group_id(1) * TILE_DIM + get_local_id(1);
	int width = get_num_groups(0) * get_local_size(0);

	for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
		tile[get_local_id(1)+j][get_local_id(0)] = idata[(y+j)*width + x];

	barrier(CLK_LOCAL_MEM_FENCE);

	x = get_group_id(1) * TILE_DIM + get_local_id(0);  // transpose block offset
	y = get_group_id(0) * TILE_DIM + get_local_id(1);

	for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
		odata[(y+j)*width + x] = tile[get_local_id(0)][get_local_id(1) + j];
}

View File

@@ -0,0 +1,361 @@
//Adapted from CUDA to OpenCL by Erwin Coumans
//See http://bitbucket.org/erwincoumans/opencl_course
// Copyright 2012 NVIDIA Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "btOpenCLUtils.h"
#include "../parallel_primitives/host/btOpenCLArray.h"
#include "../parallel_primitives/host/btLauncherCL.h"
#include "../parallel_primitives/host/btQuickprof.h"
#include "../parallel_primitives/host/btFillCL.h"
#include "../parallel_primitives/host/CommandLineArgs.h"
#include <string.h>
#include <stdio.h>
#include <assert.h>
//TILE_DIM and BLOCK_ROWS must have the same values as the #defines of the same name in
//opencl/lds_bank_conflict/lds_kernels.cl: update both files together.
//main() uses TILE_DIM x BLOCK_ROWS as the local work size of every launch.
const int TILE_DIM = 32;
const int BLOCK_ROWS = 8;
//number of timed launches per kernel (one extra launch is done before the timer starts)
const int NUM_REPS = 100;
// Compare the result 'res' against the reference 'ref' (n floats each).
// The first mismatch is reported together with a FAILED line; if all n values match,
// the bandwidth is printed instead: 2*n*sizeof(float) bytes (one read and one write per
// element) for each of the NUM_REPS launches, with 'ms' being the elapsed time in milliseconds.
void postprocess(const float *ref, const float *res, int n, float ms)
{
	for (int idx = 0; idx < n; ++idx)
	{
		if (res[idx] == ref[idx])
			continue;

		// only the first difference is reported
		printf("\nError: at res[%d] got %f but expected %f\n", idx, res[idx], ref[idx]);
		printf("%25s\n", "*** FAILED ***");
		return;
	}

	printf("%20.2f\n", 2 * n * sizeof(float) * 1e-6 * NUM_REPS / ms );
}
// Load a whole file into a newly allocated, zero-terminated string that starts with cPreamble.
//  cFilename      file to read
//  cPreamble      text put in front of the file contents; 0 is treated as an empty preamble
//  szFinalLength  optional; receives the length of preamble + file contents (without the terminator)
// Returns 0 if the file cannot be opened, measured or read completely, or if memory runs out.
// The caller owns the returned string and releases it with free().
char* loadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
{
	// locals
	FILE* pFileStream = NULL;
	long fileLength;
	size_t szSourceLength;
	size_t szPreambleLength;
	char* cSourceString;

	if (cFilename == NULL)
	{
		return NULL;
	}

	// open the OpenCL source code file
	pFileStream = fopen(cFilename, "rb");
	if(pFileStream == 0)
	{
		return NULL;
	}

	szPreambleLength = cPreamble ? strlen(cPreamble) : 0;

	// get the length of the source code; ftell reports -1 on failure
	if (fseek(pFileStream, 0, SEEK_END) != 0
		|| (fileLength = ftell(pFileStream)) < 0
		|| fseek(pFileStream, 0, SEEK_SET) != 0)
	{
		fclose(pFileStream);
		return NULL;
	}
	szSourceLength = (size_t)fileLength;

	// allocate a buffer for the source code string and read it in
	cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1);
	if (cSourceString == NULL)
	{
		fclose(pFileStream);
		return NULL;
	}
	if (szPreambleLength)
	{
		memcpy(cSourceString, cPreamble, szPreambleLength);
	}
	// a short read would leave uninitialised bytes in the returned source, so treat it as an error
	if (szSourceLength && fread(cSourceString + szPreambleLength, 1, szSourceLength, pFileStream) != szSourceLength)
	{
		fclose(pFileStream);
		free(cSourceString);
		return NULL;
	}

	// close the file and return the total length of the combined (preamble + source) string
	fclose(pFileStream);
	if(szFinalLength != 0)
	{
		*szFinalLength = szSourceLength + szPreambleLength;
	}
	cSourceString[szSourceLength + szPreambleLength] = '\0';

	return cSourceString;
}
// Benchmark two copy kernels and three transpose kernels from lds_kernels.cl on a 1024x1024
// float matrix. Each kernel is launched once before the timer starts and then NUM_REPS times;
// the result is compared with the expected data and the bandwidth is printed by postprocess().
// --deviceId=<id> and --platformId=<id> select the OpenCL device.
int main(int argc, char **argv)
{
	printf("Use --deviceId=<id> or --platformId=<id> to override OpenCL device\n");
	CommandLineArgs args(argc,argv);
	const int nx = 1024;
	const int ny = 1024;

	const int mem_size = nx*ny*sizeof(float);
	const int num_elements = nx*ny;

	btClock clock;
	double startEvent=0.f;
	double stopEvent=0.f;

	// one work-group of TILE_DIM x BLOCK_ROWS work-items per TILE_DIM x TILE_DIM tile
	int localSizeX = TILE_DIM;
	int localSizeY = BLOCK_ROWS;

	int numThreadsX = (nx/TILE_DIM)*TILE_DIM;
	int numThreadsY = (ny/TILE_DIM)*BLOCK_ROWS;

	int gridX = numThreadsX / localSizeX;
	int gridY = numThreadsY / localSizeY;

	int ciErrNum = 0;
	int preferred_device = -1;
	int preferred_platform = -1;

	args.GetCmdLineArgument("deviceId",preferred_device);
	args.GetCmdLineArgument("platformId",preferred_platform);

	cl_platform_id platformId=0;
	cl_context ctx=0;
	cl_command_queue queue=0;
	cl_device_id device=0;
	cl_kernel copyKernel=0;
	cl_kernel copySharedMemKernel=0;
	cl_kernel transposeNaiveKernel = 0;
	cl_kernel transposeCoalescedKernel = 0;
	cl_kernel transposeNoBankConflictsKernel= 0;

	ctx = btOpenCLUtils::createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum,0,0,preferred_device,preferred_platform,&platformId);
	btOpenCLUtils::printPlatformInfo(platformId);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
	device = btOpenCLUtils::getDevice(ctx,0);
	btOpenCLUtils::printDeviceInfo(device);
	queue = clCreateCommandQueue(ctx, device, 0, &ciErrNum);

	const char* cSourceFile = "opencl/lds_bank_conflict/lds_kernels.cl";

	size_t szKernelLength;

	const char* cSourceCL =0;
	char relativeFileName[1024];

	{
		// the working directory may be a few levels below the source root, so walk upwards
		const char* prefix[]={"./","../","../../","../../../","../../../../"};
		int numPrefixes = sizeof(prefix)/sizeof(char*);

		for (int i=0;!cSourceCL && i<numPrefixes;i++)
		{
			sprintf(relativeFileName,"%s%s",prefix[i],cSourceFile);
			cSourceCL = loadProgSource(relativeFileName, "", &szKernelLength);
			if (cSourceCL)
			{
				printf("Loaded program source: %s\n", relativeFileName);
			}
		}
	}

	if (!cSourceCL)
	{
		printf("Couldn't find file %s, exiting\n",cSourceFile);
		exit(0);
	}

	char flags[1024]={0};
#ifdef CL_PLATFORM_INTEL
	///use this flag to allow for OpenCL kernel debugging on CPU using the Intel OpenCL run-time
	//sprintf(flags,"-g -s \"%s\"","C:/develop/opencl_course/opencl/lds_bank_conflict/lds_kernels.cl");
#endif//CL_PLATFORM_INTEL

	copyKernel = btOpenCLUtils::compileCLKernelFromString(ctx,device,cSourceCL,"copyKernel",&ciErrNum,0,flags);
	copySharedMemKernel = btOpenCLUtils::compileCLKernelFromString(ctx,device,cSourceCL,"copySharedMemKernel",&ciErrNum,0,flags);
	transposeNaiveKernel = btOpenCLUtils::compileCLKernelFromString(ctx,device,cSourceCL,"transposeNaiveKernel",&ciErrNum,0,flags);
	transposeCoalescedKernel = btOpenCLUtils::compileCLKernelFromString(ctx,device,cSourceCL,"transposeCoalescedKernel",&ciErrNum,0,flags);
	transposeNoBankConflictsKernel = btOpenCLUtils::compileCLKernelFromString(ctx,device,cSourceCL,"transposeNoBankConflictsKernel",&ciErrNum,0,flags);

	btFillCL clMemSet(ctx,device,queue);

	printf("\n============================================\n");
	printf("Matrix size: %d %d, Block size: %d %d, Tile size: %d %d\n",
		nx, ny, TILE_DIM, BLOCK_ROWS, TILE_DIM, TILE_DIM);

	float *h_idata = (float*)malloc(mem_size);
	float *h_cdata = (float*)malloc(mem_size);
	float *h_tdata = (float*)malloc(mem_size);
	float *gold    = (float*)malloc(mem_size);

	btOpenCLArray<float> d_idataCL(ctx,queue);d_idataCL.resize(num_elements);
	btOpenCLArray<float> d_cdataCL(ctx,queue);d_cdataCL.resize(num_elements);
	btOpenCLArray<float> d_tdataCL(ctx,queue);d_tdataCL.resize(num_elements);

	// the host buffers are filled and compared below, so all of them are required
	if (!h_idata || !h_cdata || !h_tdata || !gold)
	{
		printf("cannot allocate host memory\n");
		goto error_exit;
	}

	// check parameters and calculate execution configuration
	if (nx % TILE_DIM || ny % TILE_DIM)
	{
		printf("nx and ny must be a multiple of TILE_DIM\n");
		goto error_exit;
	}

	if (TILE_DIM % BLOCK_ROWS)
	{
		printf("TILE_DIM must be a multiple of BLOCK_ROWS\n");
		goto error_exit;
	}

	// host
	for (int j = 0; j < ny; j++)
		for (int i = 0; i < nx; i++)
			h_idata[j*nx + i] = j*nx + i;

	// correct result for error checking
	for (int j = 0; j < ny; j++)
		for (int i = 0; i < nx; i++)
		{
			gold[j*nx + i] = h_idata[i*nx + j];
		}

	d_idataCL.copyFromHostPointer(h_idata,num_elements);

	// events for timing
	clock.reset();

	float ms;

	// ------------
	// time kernels
	// ------------
	printf("%25s%25s\n", "Routine", "Bandwidth (GB/s)");

	// ----
	// copy
	// ----
	printf("%25s", "copy");

	clMemSet.execute(d_cdataCL,0.f,num_elements);
	{
		// warm up
		btLauncherCL launcher( queue, copyKernel);
		launcher.setBuffer( d_cdataCL.getBufferCL());
		launcher.setBuffer( d_idataCL.getBufferCL());
		launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );

		startEvent = clock.getTimeMicroseconds()/1e3;
		for (int i = 0; i < NUM_REPS; i++)
			launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
		oclCHECKERROR(ciErrNum, CL_SUCCESS);
		// wait for the device, otherwise only the enqueue time would be measured
		clFinish(queue);
		stopEvent = clock.getTimeMicroseconds()/1e3;
	}

	ms = float(stopEvent-startEvent);

	d_cdataCL.copyToHostPointer(h_cdata,num_elements,0);
	postprocess(h_idata, h_cdata, nx*ny, ms);

	// -------------
	// copySharedMem
	// -------------
	printf("%25s", "shared memory copy");
	clMemSet.execute(d_cdataCL,0.f,num_elements);
	{
		// warm up
		btLauncherCL launcher( queue, copySharedMemKernel);
		launcher.setBuffer( d_cdataCL.getBufferCL());
		launcher.setBuffer( d_idataCL.getBufferCL());
		launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );

		startEvent = clock.getTimeMicroseconds()/1e3;
		for (int i = 0; i < NUM_REPS; i++)
			launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
		oclCHECKERROR(ciErrNum, CL_SUCCESS);
		clFinish(queue);
		stopEvent = clock.getTimeMicroseconds()/1e3;
	}

	ms = float(stopEvent-startEvent);
	d_cdataCL.copyToHostPointer(h_cdata,num_elements,0);
	postprocess(h_idata, h_cdata, nx * ny, ms);

	// --------------
	// transposeNaive
	// --------------
	printf("%25s", "naive transpose");
	clMemSet.execute(d_tdataCL,0.f,num_elements);
	{
		// warmup
		btLauncherCL launcher( queue, transposeNaiveKernel);
		launcher.setBuffer( d_tdataCL.getBufferCL());
		launcher.setBuffer( d_idataCL.getBufferCL());
		launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );

		startEvent = clock.getTimeMicroseconds()/1e3;
		for (int i = 0; i < NUM_REPS; i++)
			launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
		oclCHECKERROR(ciErrNum, CL_SUCCESS);
		clFinish(queue);
		stopEvent = clock.getTimeMicroseconds()/1e3;
	}

	ms = float(stopEvent-startEvent);
	d_tdataCL.copyToHostPointer(h_tdata,num_elements,0);
	postprocess(gold, h_tdata, nx * ny, ms);

	// ------------------
	// transposeCoalesced
	// ------------------
	printf("%25s", "coalesced transpose");
	clMemSet.execute(d_tdataCL,0.f,num_elements);
	{
		// warm up
		btLauncherCL launcher( queue, transposeCoalescedKernel);
		launcher.setBuffer( d_tdataCL.getBufferCL());
		launcher.setBuffer( d_idataCL.getBufferCL());
		launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );

		startEvent = clock.getTimeMicroseconds()/1e3;
		for (int i = 0; i < NUM_REPS; i++)
			launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
		oclCHECKERROR(ciErrNum, CL_SUCCESS);
		clFinish(queue);
		stopEvent = clock.getTimeMicroseconds()/1e3;
	}

	ms = float(stopEvent-startEvent);
	d_tdataCL.copyToHostPointer(h_tdata,num_elements,0);
	postprocess(gold, h_tdata, nx * ny, ms);

	// ------------------------
	// transposeNoBankConflicts
	// ------------------------
	printf("%25s", "conflict-free transpose");
	clMemSet.execute(d_tdataCL,0.f,num_elements);
	{
		// warm up
		btLauncherCL launcher( queue, transposeNoBankConflictsKernel);
		launcher.setBuffer( d_tdataCL.getBufferCL());
		launcher.setBuffer( d_idataCL.getBufferCL());
		launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );

		startEvent = clock.getTimeMicroseconds()/1e3;
		for (int i = 0; i < NUM_REPS; i++)
			launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
		oclCHECKERROR(ciErrNum, CL_SUCCESS);
		clFinish(queue);
		stopEvent = clock.getTimeMicroseconds()/1e3;
	}

	ms = float(stopEvent-startEvent);
	d_tdataCL.copyToHostPointer(h_tdata,num_elements,0);
	postprocess(gold, h_tdata, nx * ny, ms);

error_exit:
	// cleanup: release every kernel that was created, not only the first one
	clReleaseKernel(copyKernel);
	clReleaseKernel(copySharedMemKernel);
	clReleaseKernel(transposeNaiveKernel);
	clReleaseKernel(transposeCoalescedKernel);
	clReleaseKernel(transposeNoBankConflictsKernel);
	clReleaseCommandQueue(queue);
	clReleaseContext(ctx);
	// the kernel source was allocated by loadProgSource with malloc
	free((void*)cSourceCL);
	free(h_idata);
	free(h_tdata);
	free(h_cdata);
	free(gold);
	printf("Press <enter>\n");
	getchar();
}

View File

@@ -0,0 +1,37 @@
--- Declare the OpenCL_lds_bank_conflict_<vendor> console application.
-- Nothing is declared when findOpenCL(vendor) returns a falsy value.
-- @param vendor string appended to the project name and to the name of the
--        linked OpenCL_lib_parallel_primitives_host_<vendor> library
function createProject (vendor)
	-- Guard clause: no project for this vendor unless findOpenCL says so
	if not findOpenCL(vendor) then
		return
	end

	project("OpenCL_lds_bank_conflict_" .. vendor)
	initOpenCL(vendor)

	language("C++")
	kind("ConsoleApp")
	targetdir("../../bin")

	local hostLibrary = "OpenCL_lib_parallel_primitives_host_" .. vendor
	links({ hostLibrary })

	includedirs({ "../basic_initialize" })

	files({
		"main.cpp",
		"../basic_initialize/btOpenCLUtils.cpp",
		"../basic_initialize/btOpenCLUtils.h",
	})
end
-- Attempt one project per vendor, in the same order as before:
-- AMD, NVIDIA, Intel, Apple. createProject itself decides whether to emit one.
for _, vendor in ipairs({ "AMD", "NVIDIA", "Intel", "Apple" }) do
	createProject(vendor)
end

View File

@@ -0,0 +1,35 @@
--- Declare the OpenCL_radixsort_benchmark_<vendor> console application.
-- Nothing is declared when findOpenCL(vendor) returns a falsy value.
-- @param vendor string appended to the project name and to the name of the
--        linked OpenCL_lib_parallel_primitives_host_<vendor> library
function createProject(vendor)
	-- 'local' keeps the probe result out of the global environment, where it
	-- would otherwise be visible to (and overwritten by) every other script.
	local hasCL = findOpenCL(vendor)

	if (hasCL) then

		project ("OpenCL_radixsort_benchmark_" .. vendor)

		initOpenCL(vendor)

		language "C++"

		kind "ConsoleApp"
		targetdir "../../../bin"
		includedirs {".."}

		links {
			("OpenCL_lib_parallel_primitives_host_" .. vendor)
		}

		files {
			"test_large_problem_sorting.cpp",
			"../../basic_initialize/btOpenCLUtils.cpp",
			"../../basic_initialize/btOpenCLUtils.h",
			"../host/btFillCL.cpp",
			"../host/btPrefixScanCL.cpp",
			"../host/btRadixSort32CL.cpp",
		}

	end
end
-- Attempt one project per vendor, in the same order as before:
-- AMD, Intel, NVIDIA, Apple. createProject itself decides whether to emit one.
for _, vendor in ipairs({ "AMD", "Intel", "NVIDIA", "Apple" }) do
	createProject(vendor)
end

View File

@@ -0,0 +1,709 @@
/******************************************************************************
* Copyright 2010 Duane Merrill
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*
*
*
* AUTHORS' REQUEST:
*
* If you use|reference|benchmark this code, please cite our Technical
* Report (http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf):
*
* @TechReport{ Merrill:Sorting:2010,
* author = "Duane Merrill and Andrew Grimshaw",
* title = "Revisiting Sorting for GPGPU Stream Architectures",
* year = "2010",
* institution = "University of Virginia, Department of Computer Science",
* address = "Charlottesville, VA, USA",
* number = "CS2010-03"
* }
*
* For more information, see our Google Code project site:
* http://code.google.com/p/back40computing/
*
* Thanks!
******************************************************************************/
/******************************************************************************
* Simple test driver program for *large-problem* radix sorting.
*
* Useful for demonstrating how to integrate radix sorting into
* your application
******************************************************************************/
/******************************************************************************
* Converted from CUDA to OpenCL/DirectCompute by Erwin Coumans
******************************************************************************/
#ifdef _WIN32
#pragma warning (disable:4996)
#endif
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <float.h>
#include <algorithm>
#include <string>
//#include <iostream>
#include <sstream>
/**********************
*
*/
#include "../host/btRadixSort32CL.h"
#include "../../basic_initialize/btOpenCLUtils.h"
#include "../host/btQuickprof.h"
/// OpenCL context shared by the whole benchmark; assigned in main() and read by TimedSort.
cl_context g_cxMainContext;
/// Device used for sorting; main() assigns the device at index 0 of g_cxMainContext.
cl_device_id g_device;
/// Command queue created in main() on g_device; all sorts and clFinish calls use it.
cl_command_queue g_cqCommandQueue;
/***********************
*
*/
/// Set from the --v command-line flag; when true TestSort prints every sorted key.
bool g_verbose;
///Preferred OpenCL device/platform. When < 0 then no preference is used.
///Note that btOpenCLUtils might still use the preference of using a platform vendor that matches the SDK vendor used to build the application.
///Preferred device/platform take priority over this platform-vendor match
/// Overridden in main() by --deviceId=<int>.
int gPreferredDeviceId = -1;
/// Overridden in main() by --platformId=<int>.
int gPreferredPlatformId = -1;
/******************************************************************************
* Routines
******************************************************************************/
/**
 * Keys-only sorting. Uses the GPU to sort the specified vector of elements for the given
 * number of iterations, displaying runtime information.
 *
 * The keys are converted to unsigned int, sorted once up front and once more as a
 * warm-up, and then sorted `iterations` times, each time from a fresh copy of the
 * input. Only sorter.execute() (bracketed by clFinish) is timed; the host-to-device
 * copy is excluded from the measurement.
 *
 * @param[in] num_elements
 * Size in elements of the vector to sort
 * @param[in,out] h_keys
 * Vector of keys to sort; overwritten with the device contents read back after the last run
 * @param[in] iterations
 * Number of times to invoke the GPU sorting primitive
 */
template <typename K>
void TimedSort(
	unsigned int num_elements,
	K *h_keys,
	unsigned int iterations)
{
	// %u: both arguments are unsigned
	printf("Keys only, %u iterations, %u elements\n", iterations, num_elements);

	// Single signed count so the index loops below compare like with like
	const int count = (int) num_elements;

	btAlignedObjectArray<unsigned int> hostData;
	hostData.resize(count);
	for (int i = 0; i < count; i++)
	{
		hostData[i] = h_keys[i];
	}

	btRadixSort32CL sorter(g_cxMainContext,g_device,g_cqCommandQueue);
	btOpenCLArray<unsigned int> gpuData(g_cxMainContext,g_cqCommandQueue);

	// First, untimed run; its result is read back but not inspected
	gpuData.copyFromHost(hostData);
	//sorter.executeHost(gpuData);
	sorter.execute(gpuData);
	btAlignedObjectArray<unsigned int> hostDataSorted;
	gpuData.copyToHost(hostDataSorted);
	clFinish(g_cqCommandQueue);

	{
		// Perform the timed number of sorting iterations
		double elapsed = 0;
		btClock watch;

		// warm-start; drain the queue afterwards so the clock is reset with no
		// work pending (matches the key-value overload)
		gpuData.copyFromHost(hostData);
		clFinish(g_cqCommandQueue);
		sorter.execute(gpuData);
		clFinish(g_cqCommandQueue);
		watch.reset();

		for (unsigned int i = 0; i < iterations; i++)
		{
			// Move a fresh copy of the problem into device storage
			gpuData.copyFromHost(hostData);
			clFinish(g_cqCommandQueue);

			// Start GPU timing record
			double startMs = watch.getTimeMicroseconds()/1e3;

			// Call the sorting API routine
			sorter.execute(gpuData);
			clFinish(g_cqCommandQueue);

			// End GPU timing record
			double stopMs = watch.getTimeMicroseconds()/1e3;
			double duration = stopMs - startMs;
			elapsed += duration;
			printf("duration = %f\n", duration);
		}

		// Display timing information; guard the divisions so that
		// iterations == 0 reports zeros instead of NaN
		double avg_runtime = (iterations > 0) ? elapsed / iterations : 0.0;
		double throughput = (avg_runtime > 0.0) ? ((double) num_elements) / avg_runtime / 1000.0 : 0.0;
		printf(", %f GPU ms, %f x10^6 elts/sec\n", avg_runtime, throughput);

		// Hand the device result of the last run back to the caller
		gpuData.copyToHost(hostData);
		for (int i = 0; i < count; i++)
		{
			h_keys[i] = hostData[i];
		}
	}
}
/**
 * Key-value sorting. Uses the GPU to sort the specified vector of elements for the given
 * number of iterations, displaying runtime information.
 *
 * Keys and values are packed into btSortData pairs, sorted once up front and once
 * more as a warm-up, and then sorted `iterations` times, each time from a fresh copy
 * of the input. Only sorter.execute() (bracketed by clFinish) is timed; the
 * host-to-device copy is excluded from the measurement.
 *
 * @param[in] num_elements
 * Size in elements of the vector to sort
 * @param[in,out] h_keys
 * Vector of keys to sort; overwritten with the device contents read back after the last run
 * @param[in,out] h_values
 * Vector of values to sort; overwritten alongside h_keys
 * @param[in] iterations
 * Number of times to invoke the GPU sorting primitive
 */
template <typename K, typename V>
void TimedSort(
	unsigned int num_elements,
	K *h_keys,
	V *h_values,
	unsigned int iterations)
{
	// %u: both arguments are unsigned
	printf("Key-values, %u iterations, %u elements\n", iterations, num_elements);

	// Single signed count so the index loops below compare like with like
	const int count = (int) num_elements;

	btAlignedObjectArray<btSortData> hostData;
	hostData.resize(count);
	for (int i = 0; i < count; i++)
	{
		hostData[i].m_key = h_keys[i];
		hostData[i].m_value = h_values[i];
	}

	btRadixSort32CL sorter(g_cxMainContext,g_device,g_cqCommandQueue);
	btOpenCLArray<btSortData> gpuData(g_cxMainContext,g_cqCommandQueue);

	// First, untimed run; its result is read back and only printed when the
	// debug block below is enabled
	gpuData.copyFromHost(hostData);
	//sorter.executeHost(gpuData);
	sorter.execute(gpuData);
	btAlignedObjectArray<btSortData> hostDataSorted;
	gpuData.copyToHost(hostDataSorted);
#if 0
	for (int i = 0; i < count; i++)
	{
		printf("hostData[%d].m_key = %d\n",i, hostDataSorted[i].m_key);
		printf("hostData[%d].m_value = %d\n",i,hostDataSorted[i].m_value);
	}
#endif
	clFinish(g_cqCommandQueue);

	{
		// Perform the timed number of sorting iterations
		double elapsed = 0;
		btClock watch;

		// warm-start
		gpuData.copyFromHost(hostData);
		sorter.execute(gpuData);
		clFinish(g_cqCommandQueue);
		watch.reset();

		for (unsigned int i = 0; i < iterations; i++)
		{
			// Move a fresh copy of the problem into device storage
			gpuData.copyFromHost(hostData);
			clFinish(g_cqCommandQueue);

			// Start GPU timing record
			double startMs = watch.getTimeMicroseconds()/1e3;

			// Call the sorting API routine
			sorter.execute(gpuData);
			clFinish(g_cqCommandQueue);

			// End GPU timing record
			double stopMs = watch.getTimeMicroseconds()/1e3;
			double duration = stopMs - startMs;
			elapsed += duration;
			printf("duration = %f\n", duration);
		}

		// Display timing information; guard the divisions so that
		// iterations == 0 reports zeros instead of NaN
		double avg_runtime = (iterations > 0) ? elapsed / iterations : 0.0;
		double throughput = (avg_runtime > 0.0) ? ((double) num_elements) / avg_runtime / 1000.0 : 0.0;
		printf(", %f GPU ms, %f x10^6 elts/sec\n", avg_runtime, throughput);

		// Hand the device result of the last run back to the caller
		gpuData.copyToHost(hostData);
		for (int i = 0; i < count; i++)
		{
			h_keys[i] = hostData[i].m_key;
			h_values[i] = hostData[i].m_value;
		}
	}
}
/**
* Generates random 32-bit keys.
*
* We always take the second-order byte from rand() because the higher-order
* bits returned by rand() are commonly considered more uniformly distributed
* than the lower-order bits.
*
* We can decrease the entropy level of keys by adopting the technique
* of Thearling and Smith in which keys are computed from the bitwise AND of
* multiple random samples:
*
* entropy_reduction | Effectively-unique bits per key
* -----------------------------------------------------
* -1 | 0
* 0 | 32
* 1 | 25.95
* 2 | 17.41
* 3 | 10.78
* 4 | 6.42
* ... | ...
*
*/
/**
 * Fills `key` with random bits, one byte at a time.
 *
 * @param[out] key               receives the generated bit pattern
 * @param[in]  entropy_reduction each byte is the AND of (entropy_reduction + 1)
 *                               rand() samples; 0 means a single sample
 * @param[in]  lower_key_bits    when in [0, 8*sizeof(K)), only this many low bits of the
 *                               key are kept and the rest are cleared; otherwise no masking
 */
template <typename K>
void RandomBits(K &key, int entropy_reduction = 0, int lower_key_bits = sizeof(K) * 8)
{
	const unsigned int NUM_UCHARS = (sizeof(K) + sizeof(unsigned char) - 1) / sizeof(unsigned char);
	unsigned char key_bits[NUM_UCHARS];

	do {
		for (unsigned int j = 0; j < NUM_UCHARS; j++) {
			unsigned char quarterword = 0xff;
			for (int i = 0; i <= entropy_reduction; i++) {
				// drop the 7 low bits of rand(); the unsigned char keeps the next 8
				quarterword &= (rand() >> 7);
			}
			key_bits[j] = quarterword;
		}

		// Negative values never masked before (they compared as huge unsigned
		// numbers); keep that behaviour but make it explicit.
		if (lower_key_bits >= 0 && (size_t) lower_key_bits < sizeof(K) * 8) {
			unsigned long long base = 0;
			memcpy(&base, key_bits, sizeof(K));
			// 64-bit one: an int-typed "1 << n" is wrong/undefined for n >= 31
			base &= (1ull << lower_key_bits) - 1;
			memcpy(key_bits, &base, sizeof(K));
		}

		memcpy(&key, key_bits, sizeof(K));

	} while (key != key); // avoids NaNs when generating random floating point numbers
}
/******************************************************************************
* Templated routines for printing keys/values to the console
******************************************************************************/
/// Generic fallback: prints the value with "%d".
template<typename T>
void PrintValue(T val)
{
	printf("%d", val);
}

/// float is printed with "%f".
template<>
void PrintValue<float>(float val)
{
	printf("%f", val);
}

/// double is printed with "%f".
template<>
void PrintValue<double>(double val)
{
	printf("%f", val);
}

/// unsigned char is printed as an unsigned decimal number, not as a character.
template<>
void PrintValue<unsigned char>(unsigned char val)
{
	const unsigned int widened = val;
	printf("%u", widened);
}

/// unsigned short is printed as an unsigned decimal number.
template<>
void PrintValue<unsigned short>(unsigned short val)
{
	const unsigned int widened = val;
	printf("%u", widened);
}

/// unsigned int is printed with "%u".
template<>
void PrintValue<unsigned int>(unsigned int val)
{
	printf("%u", val);
}

/// long is printed with "%ld".
template<>
void PrintValue<long>(long val)
{
	printf("%ld", val);
}

/// unsigned long is printed with "%lu".
template<>
void PrintValue<unsigned long>(unsigned long val)
{
	printf("%lu", val);
}

/// long long is printed with "%lld".
template<>
void PrintValue<long long>(long long val)
{
	printf("%lld", val);
}

/// unsigned long long is printed with "%llu".
template<>
void PrintValue<unsigned long long>(unsigned long long val)
{
	printf("%llu", val);
}
/**
 * Compares two arrays element by element.
 *
 * Prints "CORRECT" and returns 0 when every element matches. On the first mismatch it
 * prints the index and both values, optionally followed by a window of up to five
 * elements on either side from each array, and returns 1.
 *
 * @param computed   array under test
 * @param reference  expected contents
 * @param len        number of elements to compare
 * @param verbose    when true, print the surrounding window on a mismatch
 */
template <typename T, typename SizeT>
int CompareResults(T* computed, T* reference, SizeT len, bool verbose = true)
{
	printf("\n");

	// Advance to the first differing element (or to len when there is none)
	SizeT i = 0;
	while ((i < len) && !(computed[i] != reference[i])) {
		i++;
	}

	if (!(i < len)) {
		printf("CORRECT\n");
		return 0;
	}

	printf("INCORRECT: [%lu]: ", (unsigned long) i);
	PrintValue<T>(computed[i]);
	printf(" != ");
	PrintValue<T>(reference[i]);

	if (!verbose) {
		return 1;
	}

	printf("\nresult[...");
	for (size_t j = (i >= 5) ? i - 5 : 0; (j < i + 5) && (j < len); j++) {
		PrintValue<T>(computed[j]);
		printf(", ");
	}
	printf("...]");

	printf("\nreference[...");
	for (size_t j = (i >= 5) ? i - 5 : 0; (j < i + 5) && (j < len); j++) {
		PrintValue<T>(reference[j]);
		printf(", ");
	}
	printf("...]");

	return 1;
}
/**
 * Creates an example sorting problem whose keys is a vector of the specified
 * number of K elements, values of V elements, and then dispatches the problem
 * to the GPU for the given number of iterations, displaying runtime information.
 * The sorted keys are then checked against std::sort applied to a copy of the input.
 *
 * @param[in] iterations
 * Number of times to invoke the GPU sorting primitive
 * @param[in] num_elements
 * Size in elements of the vector to sort
 * @param[in] keys_only
 * When true only keys are sorted; otherwise each key is paired with a value
 * (initialised to the key itself)
 */
template<typename K, typename V>
void TestSort(
	unsigned int iterations,
	int num_elements,
	bool keys_only)
{
	// Allocate the sorting problem on the host and fill the keys with random bytes
	K *h_keys = (K*) malloc(num_elements * sizeof(K));
	K *h_reference_keys = (K*) malloc(num_elements * sizeof(K));
	V *h_values = NULL;
	if (!keys_only) h_values = (V*) malloc(num_elements * sizeof(V));

	// malloc(0) may legitimately return NULL, so only a non-empty problem can fail here
	if (num_elements > 0 && (!h_keys || !h_reference_keys || (!keys_only && !h_values))) {
		printf("error: failed to allocate host memory for %d elements\n", num_elements);
		free(h_keys);
		free(h_reference_keys);
		free(h_values);
		return;
	}

	// Use random bits
	for (int i = 0; i < num_elements; ++i) {
		RandomBits<K>(h_keys[i], 0);
		//h_keys[i] = num_elements-i;
		//h_keys[i] = 0xffffffffu-i;
		if (!keys_only)
			h_values[i] = h_keys[i];//0xffffffffu-i;
		h_reference_keys[i] = h_keys[i];
	}

	// Run the timing test
	if (keys_only) {
		TimedSort<K>(num_elements, h_keys, iterations);
	} else {
		TimedSort<K, V>(num_elements, h_keys, h_values, iterations);
	}

	// Display sorted key data
	if (g_verbose) {
		printf("\n\nKeys:\n");
		for (int i = 0; i < num_elements; i++) {
			PrintValue<K>(h_keys[i]);
			printf(", ");
		}
		printf("\n\n");
	}

	// Verify solution
	std::sort(h_reference_keys, h_reference_keys + num_elements);
	CompareResults<K>(h_keys, h_reference_keys, num_elements, true);
	printf("\n");
	fflush(stdout);

	// Free our allocated host memory (free(NULL) is a no-op).
	// h_reference_keys was previously leaked.
	free(h_keys);
	free(h_reference_keys);
	free(h_values);
}
/**
 * Displays the commandline usage for this tool.
 *
 * The options and defaults listed here mirror what main() parses: --help, --v, --i,
 * --n, --key-values, --deviceId and --platformId.
 */
void Usage()
{
	printf("\ntest_large_problem_sorting [--help] [--v] [--i=<num-iterations>] [--n=<num-elements>] [--key-values] [--deviceId=<int>] [--platformId=<int>]\n");
	printf("\n");
	printf("\t--help\tDisplays this message and exits.\n");
	printf("\n");
	printf("\t--v\tDisplays sorted results to the console.\n");
	printf("\n");
	printf("\t--i\tPerforms the sorting operation <num-iterations> times\n");
	printf("\t\t\ton the device. Re-copies original input each time. Default = 10\n");
	printf("\n");
	printf("\t--n\tThe number of elements to comprise the sample problem\n");
	printf("\t\t\tDefault = 33554432 (32*1024*1024)\n");
	printf("\n");
	printf("\t--key-values\tSpecifies that keys are accompanied by value pairings\n");
	printf("\n");
	printf("\t--deviceId\tPreferred OpenCL device index. Default = -1 (no preference)\n");
	printf("\n");
	printf("\t--platformId\tPreferred OpenCL platform index. Default = -1 (no preference)\n");
	printf("\n");
}
/******************************************************************************
* Command-line parsing
******************************************************************************/
#include <map>
#include <algorithm>
#include <string>
/**
 * Minimal parser for "--key" and "--key=value" command-line arguments.
 *
 * Every argument that starts with "--" is stored in a key -> value map (the value is
 * empty for bare flags, and everything after the first '=' otherwise). Arguments that
 * do not start with "--" are ignored. A key given twice keeps its last value.
 */
class CommandLineArgs
{
protected:
	std::map<std::string, std::string> pairs;

public:
	/// Parses argv[1..argc-1]; argv[0] (the program name) is skipped.
	CommandLineArgs(int argc, char **argv)
	{
		using namespace std;

		for (int i = 1; i < argc; i++)
		{
			string arg = argv[i];

			// Length check first: "" and "-" must not be indexed at [0]/[1]
			if ((arg.length() < 2) || (arg[0] != '-') || (arg[1] != '-')) {
				continue;
			}

			string key, val;
			const string::size_type pos = arg.find('=');
			if (pos == string::npos) {
				key = arg.substr(2);
				val = "";
			} else {
				key = arg.substr(2, pos - 2);
				val = arg.substr(pos + 1);
			}
			pairs[key] = val;
		}
	}

	/// Returns true when "--arg_name" (with or without a value) was given.
	bool CheckCmdLineFlag(const char* arg_name) const
	{
		return pairs.find(arg_name) != pairs.end();
	}

	/// Parses the value of "--arg_name=value" into val; val is left untouched when the
	/// argument is absent (the char* specialization sets it to NULL instead).
	template <typename T>
	void GetCmdLineArgument(const char *arg_name, T &val);

	/// Number of distinct "--" arguments that were parsed.
	int ParsedArgc() const
	{
		return static_cast<int>(pairs.size());
	}
};
/// Looks up "--arg_name" and, when present, stream-extracts its stored text into val.
/// When the argument was not given, val is not touched.
template <typename T>
void CommandLineArgs::GetCmdLineArgument(const char *arg_name, T &val)
{
	std::map<std::string, std::string>::iterator found = pairs.find(arg_name);
	if (found == pairs.end()) {
		return;
	}

	std::istringstream parser(found->second);
	parser >> val;
}
/// char* specialization: returns a malloc'ed, NUL-terminated copy of the argument's value.
/// val is set to NULL when the argument is absent or the copy cannot be allocated;
/// otherwise the caller owns the buffer and must release it with free().
template <>
void CommandLineArgs::GetCmdLineArgument<char*>(const char* arg_name, char* &val)
{
	using namespace std;

	val = NULL;

	map<string, string>::iterator itr = pairs.find(arg_name);
	if (itr == pairs.end()) {
		return;
	}

	const string &s = itr->second;
	char *copy = (char*) malloc(sizeof(char) * (s.length() + 1));
	if (copy == NULL) {
		// never strcpy into a failed allocation
		return;
	}
	strcpy(copy, s.c_str());
	val = copy;
}
/******************************************************************************
* Main
******************************************************************************/
extern bool gDebugSkipLoadingBinary;
int main( int argc, char** argv)
{
gDebugSkipLoadingBinary = true;
cl_int ciErrNum;
CommandLineArgs args(argc,argv);
args.GetCmdLineArgument("deviceId", gPreferredDeviceId);
args.GetCmdLineArgument("platformId", gPreferredPlatformId);
printf("Initialize OpenCL using btOpenCLUtils_createContextFromType\n");
cl_platform_id platformId;
g_cxMainContext = btOpenCLUtils_createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
int numDev = btOpenCLUtils_getNumDevices(g_cxMainContext);
if (!numDev)
{
printf("error: no OpenCL devices\n");
exit(0);
}
int result;
int devId = 0;
g_device = btOpenCLUtils_getDevice(g_cxMainContext,devId);
btOpenCLUtils_printDeviceInfo(g_device);
// create a command-queue
g_cqCommandQueue = clCreateCommandQueue(g_cxMainContext, g_device, 0, &ciErrNum);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
//srand(time(NULL));
srand(0); // presently deterministic
unsigned int num_elements = 32*1024*1024;//4*1024*1024;//4*1024*1024;//257;//8*524288;//2048;//512;//524288;
unsigned int iterations = 10;
bool keys_only = true;
//
// Check command line arguments
//
if (args.CheckCmdLineFlag("help"))
{
Usage();
return 0;
}
args.GetCmdLineArgument("i", iterations);
args.GetCmdLineArgument("n", num_elements);
keys_only = !args.CheckCmdLineFlag("key-values");
g_verbose = args.CheckCmdLineFlag("v");
TestSort<unsigned int, unsigned int>(
iterations,
num_elements,
keys_only);
}

View File

@@ -0,0 +1,92 @@
#ifndef COMMAND_LINE_ARGS_H
#define COMMAND_LINE_ARGS_H
/******************************************************************************
* Command-line parsing
******************************************************************************/
#include <map>
#include <algorithm>
#include <string>
#include <cstdlib>
#include <cstring>
#include <sstream>
/**
 * Minimal parser for "--key" and "--key=value" command-line arguments.
 *
 * Every argument that starts with "--" is stored in a key -> value map (the value is
 * empty for bare flags, and everything after the first '=' otherwise). Arguments that
 * do not start with "--" are ignored. A key given twice keeps its last value.
 */
class CommandLineArgs
{
protected:
	std::map<std::string, std::string> pairs;

public:
	/// Parses argv[1..argc-1]; argv[0] (the program name) is skipped.
	CommandLineArgs(int argc, char **argv)
	{
		using namespace std;

		for (int i = 1; i < argc; i++)
		{
			string arg = argv[i];

			// Length check first: "" and "-" must not be indexed at [0]/[1]
			if ((arg.length() < 2) || (arg[0] != '-') || (arg[1] != '-')) {
				continue;
			}

			string key, val;
			const string::size_type pos = arg.find('=');
			if (pos == string::npos) {
				key = arg.substr(2);
				val = "";
			} else {
				key = arg.substr(2, pos - 2);
				val = arg.substr(pos + 1);
			}
			pairs[key] = val;
		}
	}

	/// Returns true when "--arg_name" (with or without a value) was given.
	bool CheckCmdLineFlag(const char* arg_name) const
	{
		return pairs.find(arg_name) != pairs.end();
	}

	/// Parses the value of "--arg_name=value" into val; val is left untouched when the
	/// argument is absent (the char* specialization sets it to NULL instead).
	template <typename T>
	void GetCmdLineArgument(const char *arg_name, T &val);

	/// Number of distinct "--" arguments that were parsed.
	int ParsedArgc() const
	{
		return static_cast<int>(pairs.size());
	}
};
/// Looks up "--arg_name" and, when present, stream-extracts its stored text into val.
/// When the argument was not given, val is not touched.
template <typename T>
void CommandLineArgs::GetCmdLineArgument(const char *arg_name, T &val)
{
	std::map<std::string, std::string>::iterator found = pairs.find(arg_name);
	if (found == pairs.end()) {
		return;
	}

	std::istringstream parser(found->second);
	parser >> val;
}
/// char* specialization: returns a malloc'ed, NUL-terminated copy of the argument's value.
/// val is set to NULL when the argument is absent or the copy cannot be allocated;
/// otherwise the caller owns the buffer and must release it with free().
///
/// 'inline' is required: a full specialization is an ordinary function, so without it
/// every translation unit that includes this header would emit its own definition and
/// the link would fail with a multiple-definition error.
template <>
inline void CommandLineArgs::GetCmdLineArgument<char*>(const char* arg_name, char* &val)
{
	using namespace std;

	val = NULL;

	map<string, string>::iterator itr = pairs.find(arg_name);
	if (itr == pairs.end()) {
		return;
	}

	const string &s = itr->second;
	char *copy = (char*) std::malloc(sizeof(char) * (s.length() + 1));
	if (copy == NULL) {
		// never strcpy into a failed allocation
		return;
	}
	std::strcpy(copy, s.c_str());
	val = copy;
}
#endif //COMMAND_LINE_ARGS_H

View File

@@ -0,0 +1,181 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include "btAlignedAllocator.h"
// Incremented on every btAlignedAllocInternal call.
int gNumAlignedAllocs = 0;
// Incremented by btAlignedFreeInternal (the non-debug build skips NULL pointers, the debug build counts them too).
int gNumAlignedFree = 0;
// Only updated in BT_DEBUG_MEMORY_ALLOCATIONS builds: bytes requested minus bytes freed.
int gTotalBytesAlignedAllocs = 0;//detect memory leaks
/// Default unaligned allocation hook: forwards to malloc.
static void *btAllocDefault(size_t size)
{
	void *block = malloc(size);
	return block;
}

/// Default unaligned release hook: forwards to free.
static void btFreeDefault(void *ptr)
{
	free(ptr);
}
// Currently installed unaligned allocation/free hooks; replaced through btAlignedAllocSetCustom.
static btAllocFunc *sAllocFunc = btAllocDefault;
static btFreeFunc *sFreeFunc = btFreeDefault;
#if defined (BT_HAS_ALIGNED_ALLOCATOR)
#include <malloc.h>
/// BT_HAS_ALIGNED_ALLOCATOR variant: forwards to _aligned_malloc.
static void *btAlignedAllocDefault(size_t size, int alignment)
{
	const size_t boundary = (size_t)alignment;
	return _aligned_malloc(size, boundary);
}

/// BT_HAS_ALIGNED_ALLOCATOR variant: forwards to _aligned_free.
static void btAlignedFreeDefault(void *ptr)
{
	_aligned_free(ptr);
}
#elif defined(__CELLOS_LV2__)
#include <stdlib.h>
/// __CELLOS_LV2__ variant: forwards to memalign.
static inline void *btAlignedAllocDefault(size_t size, int alignment)
{
	void *block = memalign(alignment, size);
	return block;
}

/// __CELLOS_LV2__ variant: blocks are released with plain free.
static inline void btAlignedFreeDefault(void *ptr)
{
	free(ptr);
}
#else
/// Portable fallback: over-allocates through sAllocFunc, aligns inside the block and
/// stores the original pointer in the pointer-sized slot just below the returned address.
/// Returns NULL when sAllocFunc does.
static inline void *btAlignedAllocDefault(size_t size, int alignment)
{
	// room for the bookkeeping pointer plus worst-case alignment padding
	char *const raw = (char *)sAllocFunc(size + sizeof(void *) + (alignment-1));
	if (!raw) {
		return (void *)(raw);
	}

	void *const aligned = btAlignPointer(raw + sizeof(void *),alignment);
	// remember where the block really starts so the matching free can find it
	((void **)(aligned))[-1] = (void *)(raw);
	return aligned;
}
/// Portable fallback: reads the original pointer stored just below ptr and releases it
/// through sFreeFunc. NULL is ignored.
static inline void btAlignedFreeDefault(void *ptr)
{
	if (!ptr) {
		return;
	}

	void *const raw = ((void **)(ptr))[-1];
	sFreeFunc(raw);
}
#endif
// Currently installed aligned allocation/free hooks; replaced through btAlignedAllocSetCustomAligned.
static btAlignedAllocFunc *sAlignedAllocFunc = btAlignedAllocDefault;
static btAlignedFreeFunc *sAlignedFreeFunc = btAlignedFreeDefault;
/// Installs custom aligned allocation hooks. A NULL argument restores the
/// corresponding default (btAlignedAllocDefault / btAlignedFreeDefault).
void btAlignedAllocSetCustomAligned(btAlignedAllocFunc *allocFunc, btAlignedFreeFunc *freeFunc)
{
	if (allocFunc) {
		sAlignedAllocFunc = allocFunc;
	} else {
		sAlignedAllocFunc = btAlignedAllocDefault;
	}

	if (freeFunc) {
		sAlignedFreeFunc = freeFunc;
	} else {
		sAlignedFreeFunc = btAlignedFreeDefault;
	}
}
/// Installs custom unaligned allocation hooks. A NULL argument restores the
/// corresponding default (btAllocDefault / btFreeDefault).
void btAlignedAllocSetCustom(btAllocFunc *allocFunc, btFreeFunc *freeFunc)
{
	if (allocFunc) {
		sAllocFunc = allocFunc;
	} else {
		sAllocFunc = btAllocDefault;
	}

	if (freeFunc) {
		sFreeFunc = freeFunc;
	} else {
		sFreeFunc = btFreeDefault;
	}
}
#ifdef BT_DEBUG_MEMORY_ALLOCATIONS
//this generic allocator provides the total allocated number of bytes
#include <stdio.h>
/// Debug allocator: over-allocates through sAllocFunc, aligns inside the block and keeps
/// two pointer-sized header slots directly below the returned address:
///   slot [-1] : the pointer returned by sAllocFunc
///   slot [-2] : the requested size, stored as an int in the slot's first bytes
/// The size used to be written at (int*)ret-2, which on platforms with 8-byte pointers
/// overlaps slot [-1] and corrupts the stored pointer.
/// Logs every allocation with the caller's file and line. Returns NULL on failure.
void* btAlignedAllocInternal (size_t size, int alignment,int line,char* filename)
{
	void *ret;
	char *real;

	gTotalBytesAlignedAllocs += size;
	gNumAlignedAllocs++;

	// two header slots plus worst-case alignment padding
	real = (char *)sAllocFunc(size + 2*sizeof(void *) + (alignment-1));
	if (real) {
		ret = (void*) btAlignPointer(real + 2*sizeof(void *), alignment);
		void **header = (void **)(ret);
		header[-1] = (void *)(real);
		*((int *)(header - 2)) = (int)size;
	} else {
		ret = (void *)(real);//??
	}

	// %p for the address: %x expects an unsigned int and truncates 64-bit pointers
	printf("allocation#%d at address %p, from %s,line %d, size %d\n",gNumAlignedAllocs,(void *)real, filename,line,(int)size);

	// Marker written into the first int of the block (its purpose is not evident from
	// this file). Skipped when the allocation failed or the block is smaller than an int.
	if (ret && size >= sizeof(int)) {
		int* ptr = (int*)ret;
		*ptr = 12;
	}
	return (ret);
}

/// Debug counterpart of btAlignedAllocInternal above: reads the header slots written
/// there, subtracts the recorded size from gTotalBytesAlignedAllocs, logs the free and
/// releases the original block through sFreeFunc. A NULL ptr is only logged.
void btAlignedFreeInternal (void* ptr,int line,char* filename)
{
	gNumAlignedFree++;

	if (ptr) {
		void **header = (void **)(ptr);
		void *real = header[-1];
		int size = *((int *)(header - 2));
		gTotalBytesAlignedAllocs -= size;

		printf("free #%d at address %p, from %s,line %d, size %d\n",gNumAlignedFree,real, filename,line,size);

		sFreeFunc(real);
	} else
	{
		printf("NULL ptr\n");
	}
}
#else //BT_DEBUG_MEMORY_ALLOCATIONS
/// Release-build allocator entry point: counts the request in gNumAlignedAllocs and
/// returns whatever the installed aligned allocation hook returns.
void* btAlignedAllocInternal (size_t size, int alignment)
{
	++gNumAlignedAllocs;
	void* const block = sAlignedAllocFunc(size, alignment);
	return block;
}
/// Release-build free entry point: NULL is ignored (and not counted); anything else is
/// counted in gNumAlignedFree and handed to the installed aligned free hook.
void btAlignedFreeInternal (void* ptr)
{
	if (ptr)
	{
		++gNumAlignedFree;
		sAlignedFreeFunc(ptr);
	}
}
#endif //BT_DEBUG_MEMORY_ALLOCATIONS

View File

@@ -0,0 +1,107 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef BT_ALIGNED_ALLOCATOR
#define BT_ALIGNED_ALLOCATOR
///we probably replace this with our own aligned memory allocator
///so we replace _aligned_malloc and _aligned_free with our own
///that is better portable and more predictable
#include "btScalar.h"
//#define BT_DEBUG_MEMORY_ALLOCATIONS 1
// btAlignedAlloc(size, alignment) / btAlignedFree(ptr) are the macros the rest of the
// code allocates through; both expand to the *Internal functions declared here.
#ifdef BT_DEBUG_MEMORY_ALLOCATIONS
// Debug build: the macros also pass the call site's __LINE__ and __FILE__.
#define btAlignedAlloc(a,b) \
btAlignedAllocInternal(a,b,__LINE__,__FILE__)
#define btAlignedFree(ptr) \
btAlignedFreeInternal(ptr,__LINE__,__FILE__)
void* btAlignedAllocInternal (size_t size, int alignment,int line,char* filename);
void btAlignedFreeInternal (void* ptr,int line,char* filename);
#else
// Regular build: plain forwarding, no call-site information.
void* btAlignedAllocInternal (size_t size, int alignment);
void btAlignedFreeInternal (void* ptr);
#define btAlignedAlloc(size,alignment) btAlignedAllocInternal(size,alignment)
#define btAlignedFree(ptr) btAlignedFreeInternal(ptr)
#endif
// Element-count type taken by btAlignedAllocator::allocate (note: a signed int, declared at global scope).
typedef int size_type;
// Signatures of the replaceable aligned allocation/free hooks (see btAlignedAllocSetCustomAligned).
typedef void *(btAlignedAllocFunc)(size_t size, int alignment);
typedef void (btAlignedFreeFunc)(void *memblock);
// Signatures of the replaceable unaligned allocation/free hooks (see btAlignedAllocSetCustom).
typedef void *(btAllocFunc)(size_t size);
typedef void (btFreeFunc)(void *memblock);
///The developer can let all Bullet memory allocations go through a custom memory allocator, using btAlignedAllocSetCustom
void btAlignedAllocSetCustom(btAllocFunc *allocFunc, btFreeFunc *freeFunc);
///If the developer already has a custom aligned allocator, then btAlignedAllocSetCustomAligned can be used. The default aligned allocator pre-allocates extra memory using the non-aligned allocator, and instruments it.
void btAlignedAllocSetCustomAligned(btAlignedAllocFunc *allocFunc, btAlignedFreeFunc *freeFunc);
///The btAlignedAllocator is a portable class for aligned memory allocations.
///Default implementations for unaligned and aligned allocations can be overridden by a custom allocator using btAlignedAllocSetCustom and btAlignedAllocSetCustomAligned.
/// Stateless allocator for T whose memory comes from btAlignedAlloc with the given
/// Alignment and goes back through btAlignedFree. All instances compare equal.
template < typename T , unsigned Alignment >
class btAlignedAllocator {

	typedef btAlignedAllocator< T , Alignment > self_type;

public:

	// ---- nested types ----
	typedef T           value_type;
	typedef T*          pointer;
	typedef const T*    const_pointer;
	typedef T&          reference;
	typedef const T&    const_reference;

	/// Same allocator (same Alignment) for a different element type.
	template < typename O > struct rebind {
		typedef btAlignedAllocator< O , Alignment > other;
	};

	// ---- construction / assignment (nothing to copy: there is no state) ----
	btAlignedAllocator() {}
	/*
	btAlignedAllocator( const self_type & ) {}
	*/

	template < typename Other >
	btAlignedAllocator( const btAlignedAllocator< Other , Alignment > & ) {}

	template < typename O >
	self_type & operator=( const btAlignedAllocator< O , Alignment > & ) { return *this; }

	friend bool operator==( const self_type & , const self_type & ) { return true; }

	// ---- address-of helpers ----
	pointer       address ( reference ref ) const       { return &ref; }
	const_pointer address ( const_reference ref ) const { return &ref; }

	// ---- raw storage ----
	/// Allocates uninitialised storage for n objects; hint is ignored.
	pointer allocate ( size_type n , const_pointer * hint = 0 ) {
		(void)hint;
		void * const raw = btAlignedAlloc( sizeof(value_type) * n , Alignment );
		return reinterpret_cast< pointer >( raw );
	}

	/// Returns storage obtained from allocate().
	void deallocate( pointer ptr ) {
		void * const raw = reinterpret_cast< void * >( ptr );
		btAlignedFree( raw );
	}

	// ---- object lifetime ----
	/// Copy-constructs value in place at ptr.
	void construct ( pointer ptr , const value_type & value ) { new (ptr) value_type( value ); }

	/// Runs the destructor of the object at ptr; the storage is not released.
	void destroy ( pointer ptr ) { ptr->~value_type(); }
};
#endif //BT_ALIGNED_ALLOCATOR

View File

@@ -0,0 +1,511 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef BT_OBJECT_ARRAY__
#define BT_OBJECT_ARRAY__
#include "btScalar.h" // has definitions like SIMD_FORCE_INLINE
#include "btAlignedAllocator.h"
///If the platform doesn't support placement new, you can disable BT_USE_PLACEMENT_NEW
///then the btAlignedObjectArray doesn't support objects with virtual methods, and non-trivial constructors/destructors
///You can enable BT_USE_MEMCPY, then swapping elements in the array will use memcpy instead of operator=
///see discussion here: http://continuousphysics.com/Bullet/phpBB2/viewtopic.php?t=1231 and
///http://www.continuousphysics.com/Bullet/phpBB2/viewtopic.php?t=1240
#define BT_USE_PLACEMENT_NEW 1
//#define BT_USE_MEMCPY 1 //disable, because it is cumbersome to find out for each platform where memcpy is defined. It can be in <memory.h> or <string.h> or otherwise...
#define BT_ALLOW_ARRAY_COPY_OPERATOR // enabling this can accidently perform deep copies of data if you are not careful
#ifdef BT_USE_MEMCPY
#include <memory.h>
#include <string.h>
#endif //BT_USE_MEMCPY
#ifdef BT_USE_PLACEMENT_NEW
#include <new> //for placement new
#endif //BT_USE_PLACEMENT_NEW
///The btAlignedObjectArray template class uses a subset of the stl::vector interface for its methods
///It is developed to replace stl::vector to avoid portability issues, including STL alignment issues to add SIMD/SSE data
template <typename T>
//template <class T>
class btAlignedObjectArray
{
btAlignedAllocator<T , 16> m_allocator;
int m_size;
int m_capacity;
T* m_data;
//PCK: added this line
bool m_ownsMemory;
#ifdef BT_ALLOW_ARRAY_COPY_OPERATOR
public:
SIMD_FORCE_INLINE btAlignedObjectArray<T>& operator=(const btAlignedObjectArray<T> &other)
{
copyFromArray(other);
return *this;
}
#else//BT_ALLOW_ARRAY_COPY_OPERATOR
private:
SIMD_FORCE_INLINE btAlignedObjectArray<T>& operator=(const btAlignedObjectArray<T> &other);
#endif//BT_ALLOW_ARRAY_COPY_OPERATOR
protected:
SIMD_FORCE_INLINE int allocSize(int size)
{
return (size ? size*2 : 1);
}
SIMD_FORCE_INLINE void copy(int start,int end, T* dest) const
{
int i;
for (i=start;i<end;++i)
#ifdef BT_USE_PLACEMENT_NEW
new (&dest[i]) T(m_data[i]);
#else
dest[i] = m_data[i];
#endif //BT_USE_PLACEMENT_NEW
}
SIMD_FORCE_INLINE void init()
{
//PCK: added this line
m_ownsMemory = true;
m_data = 0;
m_size = 0;
m_capacity = 0;
}
SIMD_FORCE_INLINE void destroy(int first,int last)
{
int i;
for (i=first; i<last;i++)
{
m_data[i].~T();
}
}
SIMD_FORCE_INLINE void* allocate(int size)
{
if (size)
return m_allocator.allocate(size);
return 0;
}
SIMD_FORCE_INLINE void deallocate()
{
if(m_data) {
//PCK: enclosed the deallocation in this block
if (m_ownsMemory)
{
m_allocator.deallocate(m_data);
}
m_data = 0;
}
}
public:
btAlignedObjectArray()
{
init();
}
~btAlignedObjectArray()
{
clear();
}
///Generally it is best to avoid using the copy constructor of an btAlignedObjectArray, and use a (const) reference to the array instead.
btAlignedObjectArray(const btAlignedObjectArray& otherArray)
{
init();
int otherSize = otherArray.size();
resize (otherSize);
otherArray.copy(0, otherSize, m_data);
}
/// return the number of elements in the array
SIMD_FORCE_INLINE int size() const
{
return m_size;
}
SIMD_FORCE_INLINE const T& at(int n) const
{
btAssert(n>=0);
btAssert(n<size());
return m_data[n];
}
SIMD_FORCE_INLINE T& at(int n)
{
btAssert(n>=0);
btAssert(n<size());
return m_data[n];
}
SIMD_FORCE_INLINE const T& operator[](int n) const
{
btAssert(n>=0);
btAssert(n<size());
return m_data[n];
}
SIMD_FORCE_INLINE T& operator[](int n)
{
btAssert(n>=0);
btAssert(n<size());
return m_data[n];
}
///clear the array, deallocated memory. Generally it is better to use array.resize(0), to reduce performance overhead of run-time memory (de)allocations.
SIMD_FORCE_INLINE void clear()
{
destroy(0,size());
deallocate();
init();
}
SIMD_FORCE_INLINE void pop_back()
{
btAssert(m_size>0);
m_size--;
m_data[m_size].~T();
}
///resize changes the number of elements in the array. If the new size is larger, the new elements will be constructed using the optional second argument.
///when the new number of elements is smaller, the destructor will be called, but memory will not be freed, to reduce performance overhead of run-time memory (de)allocations.
SIMD_FORCE_INLINE void resizeNoInitialize(int newsize)
{
int curSize = size();
if (newsize < curSize)
{
} else
{
if (newsize > size())
{
reserve(newsize);
}
//leave this uninitialized
}
m_size = newsize;
}
SIMD_FORCE_INLINE void resize(int newsize, const T& fillData=T())
{
int curSize = size();
if (newsize < curSize)
{
for(int i = newsize; i < curSize; i++)
{
m_data[i].~T();
}
} else
{
if (newsize > size())
{
reserve(newsize);
}
#ifdef BT_USE_PLACEMENT_NEW
for (int i=curSize;i<newsize;i++)
{
new ( &m_data[i]) T(fillData);
}
#endif //BT_USE_PLACEMENT_NEW
}
m_size = newsize;
}
SIMD_FORCE_INLINE T& expandNonInitializing( )
{
int sz = size();
if( sz == capacity() )
{
reserve( allocSize(size()) );
}
m_size++;
return m_data[sz];
}
SIMD_FORCE_INLINE T& expand( const T& fillValue=T())
{
int sz = size();
if( sz == capacity() )
{
reserve( allocSize(size()) );
}
m_size++;
#ifdef BT_USE_PLACEMENT_NEW
new (&m_data[sz]) T(fillValue); //use the in-place new (not really allocating heap memory)
#endif
return m_data[sz];
}
SIMD_FORCE_INLINE void push_back(const T& _Val)
{
int sz = size();
if( sz == capacity() )
{
reserve( allocSize(size()) );
}
#ifdef BT_USE_PLACEMENT_NEW
new ( &m_data[m_size] ) T(_Val);
#else
m_data[size()] = _Val;
#endif //BT_USE_PLACEMENT_NEW
m_size++;
}
/// return the pre-allocated (reserved) elements, this is at least as large as the total number of elements,see size() and reserve()
SIMD_FORCE_INLINE int capacity() const
{
return m_capacity;
}
SIMD_FORCE_INLINE void reserve(int _Count)
{ // determine new minimum length of allocated storage
if (capacity() < _Count)
{ // not enough room, reallocate
T* s = (T*)allocate(_Count);
copy(0, size(), s);
destroy(0,size());
deallocate();
//PCK: added this line
m_ownsMemory = true;
m_data = s;
m_capacity = _Count;
}
}
class less
{
public:
bool operator() ( const T& a, const T& b )
{
return ( a < b );
}
};
template <typename L>
void quickSortInternal(const L& CompareFunc,int lo, int hi)
{
// lo is the lower index, hi is the upper index
// of the region of array a that is to be sorted
int i=lo, j=hi;
T x=m_data[(lo+hi)/2];
// partition
do
{
while (CompareFunc(m_data[i],x))
i++;
while (CompareFunc(x,m_data[j]))
j--;
if (i<=j)
{
swap(i,j);
i++; j--;
}
} while (i<=j);
// recursion
if (lo<j)
quickSortInternal( CompareFunc, lo, j);
if (i<hi)
quickSortInternal( CompareFunc, i, hi);
}
template <typename L>
void quickSort(const L& CompareFunc)
{
//don't sort 0 or 1 elements
if (size()>1)
{
quickSortInternal(CompareFunc,0,size()-1);
}
}
///heap sort from http://www.csse.monash.edu.au/~lloyd/tildeAlgDS/Sort/Heap/
template <typename L>
void downHeap(T *pArr, int k, int n, const L& CompareFunc)
{
/* PRE: a[k+1..N] is a heap */
/* POST: a[k..N] is a heap */
T temp = pArr[k - 1];
/* k has child(s) */
while (k <= n/2)
{
int child = 2*k;
if ((child < n) && CompareFunc(pArr[child - 1] , pArr[child]))
{
child++;
}
/* pick larger child */
if (CompareFunc(temp , pArr[child - 1]))
{
/* move child up */
pArr[k - 1] = pArr[child - 1];
k = child;
}
else
{
break;
}
}
pArr[k - 1] = temp;
} /*downHeap*/
void swap(int index0,int index1)
{
#ifdef BT_USE_MEMCPY
char temp[sizeof(T)];
memcpy(temp,&m_data[index0],sizeof(T));
memcpy(&m_data[index0],&m_data[index1],sizeof(T));
memcpy(&m_data[index1],temp,sizeof(T));
#else
T temp = m_data[index0];
m_data[index0] = m_data[index1];
m_data[index1] = temp;
#endif //BT_USE_PLACEMENT_NEW
}
template <typename L>
void heapSort(const L& CompareFunc)
{
/* sort a[0..N-1], N.B. 0 to N-1 */
int k;
int n = m_size;
for (k = n/2; k > 0; k--)
{
downHeap(m_data, k, n, CompareFunc);
}
/* a[1..N] is now a heap */
while ( n>=1 )
{
swap(0,n-1); /* largest of a[0..n-1] */
n = n - 1;
/* restore a[1..i-1] heap */
downHeap(m_data, 1, n, CompareFunc);
}
}
///non-recursive binary search, assumes sorted array
int findBinarySearch(const T& key) const
{
int first = 0;
int last = size()-1;
//assume sorted array
while (first <= last) {
int mid = (first + last) / 2; // compute mid point.
if (key > m_data[mid])
first = mid + 1; // repeat search in top half.
else if (key < m_data[mid])
last = mid - 1; // repeat search in bottom half.
else
return mid; // found it. return position /////
}
return size(); // failed to find key
}
int findLinearSearch(const T& key) const
{
int index=size();
int i;
for (i=0;i<size();i++)
{
if (m_data[i] == key)
{
index = i;
break;
}
}
return index;
}
void remove(const T& key)
{
int findIndex = findLinearSearch(key);
if (findIndex<size())
{
swap( findIndex,size()-1);
pop_back();
}
}
//PCK: whole function
void initializeFromBuffer(void *buffer, int size, int capacity)
{
clear();
m_ownsMemory = false;
m_data = (T*)buffer;
m_size = size;
m_capacity = capacity;
}
void copyFromArray(const btAlignedObjectArray& otherArray)
{
int otherSize = otherArray.size();
resize (otherSize);
otherArray.copy(0, otherSize, m_data);
}
};
#endif //BT_OBJECT_ARRAY__

View File

@@ -0,0 +1,213 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
//Host-code rewritten by Erwin Coumans
#define BOUNDSEARCH_PATH "opencl/parallel_primitives/kernels/BoundSearchKernels.cl"
#define KERNEL0 "SearchSortDataLowerKernel"
#define KERNEL1 "SearchSortDataUpperKernel"
#define KERNEL2 "SubtractKernel"
#include "btBoundSearchCL.h"
#include "../../basic_initialize/btOpenCLUtils.h"
#include "btLauncherCL.h"
#include "../kernels/BoundSearchKernelsCL.h"
/// Builds the bound-search OpenCL program from the embedded kernel source and creates its kernels.
/// @param ctx      OpenCL context used for compilation and buffer allocation
/// @param device   device the program is compiled for
/// @param queue    command queue used by execute() and by the scratch buffers
/// @param maxSize  element count of the scratch buffers used by the COUNT option;
///                 0 skips the scratch buffers and the subtract kernel, leaving only BOUND_LOWER/BOUND_UPPER usable
btBoundSearchCL::btBoundSearchCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int maxSize)
	:m_context(ctx),
	m_device(device),
	m_queue(queue)
{
	const char* additionalMacros = "";
	const char* kernelSource = boundSearchKernelsCL;
	cl_int pErrNum;

	// nothing in this file allocates this member; give it a defined value instead of leaving it indeterminate
	m_constbtOpenCLArray = 0;

	cl_program boundSearchProg = btOpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, BOUNDSEARCH_PATH);
	btAssert(boundSearchProg);

	m_lowerSortDataKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SearchSortDataLowerKernel", &pErrNum, boundSearchProg,additionalMacros );
	btAssert(m_lowerSortDataKernel );

	m_upperSortDataKernel= btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SearchSortDataUpperKernel", &pErrNum, boundSearchProg,additionalMacros );
	btAssert(m_upperSortDataKernel);

	// the subtract kernel and the scratch buffers are only needed by the COUNT option
	m_subtractKernel = 0;

	if( maxSize )
	{
		m_subtractKernel= btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SubtractKernel", &pErrNum, boundSearchProg,additionalMacros );
		btAssert(m_subtractKernel);
	}

	//m_constBuffer = new btOpenCLArray<btInt4>( device, 1, BufferBase::BUFFER_CONST );

	m_lower = (maxSize == 0)? 0: new btOpenCLArray<unsigned int>(ctx,queue,maxSize );
	m_upper = (maxSize == 0)? 0: new btOpenCLArray<unsigned int>(ctx,queue, maxSize );

	m_filler = new btFillCL(ctx,device,queue);
}
/// Frees the scratch buffers and the fill helper and releases the kernels created by the constructor.
btBoundSearchCL::~btBoundSearchCL()
{
	// m_lower / m_upper are 0 when the object was built with maxSize == 0; delete on 0 is a no-op
	delete m_lower;
	delete m_upper;
	delete m_filler;

	clReleaseKernel(m_lowerSortDataKernel);
	clReleaseKernel(m_upperSortDataKernel);

	// the subtract kernel only exists when the constructor was given a non-zero maxSize;
	// never hand a null handle to the OpenCL runtime
	if (m_subtractKernel)
	{
		clReleaseKernel(m_subtractKernel);
	}
}
/// Runs the bound search on the device.
/// @param src     key/value pairs; per the class header they must be sorted by ascending m_key
/// @param nSrc    number of entries of src to process
/// @param dst     output buffer; both bound kernels are launched over nSrc work items and write into it
/// @param nDst    number of output entries
/// @param option  BOUND_LOWER / BOUND_UPPER launch the matching search kernel;
///                COUNT zero-fills the two scratch buffers, runs both searches into them and
///                launches the subtract kernel over nDst items to combine them into dst
void btBoundSearchCL::execute(btOpenCLArray<btSortData>& src, int nSrc, btOpenCLArray<unsigned int>& dst, int nDst, Option option )
{
	if( option == BOUND_LOWER || option == BOUND_UPPER )
	{
		// both searches take the same arguments, only the kernel differs
		cl_kernel searchKernel = (option == BOUND_LOWER) ? m_lowerSortDataKernel : m_upperSortDataKernel;

		btBufferInfoCL bInfo[] = { btBufferInfoCL( src.getBufferCL(), true ), btBufferInfoCL( dst.getBufferCL() ) };

		btLauncherCL launcher( m_queue, searchKernel );
		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
		launcher.setConst( nSrc );
		launcher.setConst( nDst );

		launcher.launch1D( nSrc, 64 );
	}
	else if( option == COUNT )
	{
		// the scratch buffers only exist when the object was constructed with a non-zero size
		btAssert( m_lower );
		btAssert( m_upper );
		// nDst entries of each scratch buffer are filled and written below, so they must be able to hold that many
		btAssert( m_lower->capacity() >= (int)nDst );
		btAssert( m_upper->capacity() >= (int)nDst );

		int zero = 0;
		m_filler->execute( *m_lower, zero, nDst );
		m_filler->execute( *m_upper, zero, nDst );

		execute( src, nSrc, *m_lower, nDst, BOUND_LOWER );
		execute( src, nSrc, *m_upper, nDst, BOUND_UPPER );

		{
			btBufferInfoCL bInfo[] = { btBufferInfoCL( m_upper->getBufferCL(), true ), btBufferInfoCL( m_lower->getBufferCL(), true ), btBufferInfoCL( dst.getBufferCL() ) };

			btLauncherCL  launcher( m_queue, m_subtractKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
			launcher.setConst( nSrc );
			launcher.setConst( nDst );

			launcher.launch1D( nDst, 64 );
		}
	}
	else
	{
		// unknown option
		btAssert( 0 );
	}
}
/// CPU implementation of the bound search.
/// @param src     key/value pairs sorted by ascending m_key (checked with btAssert)
/// @param nSrc    number of entries of src to process
/// @param dst     output array indexed by key
/// @param nDst    number of output entries
/// @param option  BOUND_LOWER: dst[key] = index of the first entry of src carrying that key
///                BOUND_UPPER: dst[key] = index one past the last entry of src carrying that key
///                COUNT:       dst[i]   = upper bound minus lower bound, computed into zeroed temporaries
/// In the two bound modes, entries of dst whose key does not occur in src are left untouched.
void btBoundSearchCL::executeHost( btAlignedObjectArray<btSortData>& src, int nSrc,
			btAlignedObjectArray<unsigned int>& dst, int nDst, Option option )
{
	// precondition: keys are in non-decreasing order
	for(int i=0; i<nSrc-1; i++)
		btAssert( src[i].m_key <= src[i+1].m_key );

	if( option == BOUND_LOWER )
	{
		// sentinel standing in for the (non-existent) entry before src[0]
		btSortData minData;
		minData.m_key = -1;
		minData.m_value = -1;

		for(int i=0; i<nSrc; i++)
		{
			btSortData& iData = (i==0)? minData: src[i-1];
			btSortData& jData = src[i];

			// a change of key between neighbours means entry i starts a new run
			if( iData.m_key != jData.m_key )
			{
				int k = jData.m_key;
				dst[k] = i;
			}
		}
	}
	else if( option == BOUND_UPPER )
	{
		// sentinel standing in for the (non-existent) entry after the last one
		btSortData maxData;
		maxData.m_key = nDst;
		maxData.m_value = nDst;

		for(int i=1; i<nSrc+1; i++)
		{
			btSortData& iData = src[i-1];
			btSortData& jData = (i==nSrc)? maxData: src[i];

			// a change of key between neighbours means entry i-1 ends a run
			if( iData.m_key != jData.m_key )
			{
				int k = iData.m_key;
				dst[k] = i;
			}
		}
	}
	else if( option == COUNT )
	{
		btAlignedObjectArray<unsigned int> lower;
		lower.resize(nDst );
		btAlignedObjectArray<unsigned int> upper;
		upper.resize(nDst );

		// keys that never occur must yield a count of 0
		for(int i=0; i<nDst; i++)
		{
			lower[i] = upper[i] = 0;
		}

		executeHost( src, nSrc, lower, nDst, BOUND_LOWER );
		executeHost( src, nSrc, upper, nDst, BOUND_UPPER );

		for( int i=0; i<nDst; i++)
		{
			dst[i] = upper[i] - lower[i];
		}
	}
	else
	{
		// unknown option
		btAssert( 0 );
	}
}

View File

@@ -0,0 +1,67 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#ifndef BT_BOUNDSEARCH_H
#define BT_BOUNDSEARCH_H
#pragma once
/*#include <Adl/Adl.h>
#include <AdlPrimitives/Math/Math.h>
#include <AdlPrimitives/Sort/SortData.h>
#include <AdlPrimitives/Fill/Fill.h>
*/
#include "btOpenCLArray.h"
#include "btFillCL.h"
#include "btRadixSort32CL.h" //for btSortData (perhaps move it?)
/// Finds, for a key-sorted array of btSortData, where the run of each key begins and ends,
/// or how many entries each key has. A device version (execute) and a CPU version (executeHost) are provided.
class btBoundSearchCL
{
public:
/// Selects what execute()/executeHost() write into dst.
enum Option
{
// dst[key] = index of the first source entry with that key
BOUND_LOWER,
// dst[key] = index one past the last source entry with that key
BOUND_UPPER,
// dst[i] = upper bound minus lower bound
COUNT,
};
// OpenCL handles passed to the constructor; stored as given
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
// kernels created by the constructor and released by the destructor
cl_kernel m_lowerSortDataKernel;
cl_kernel m_upperSortDataKernel;
// created only when the constructor is given a non-zero size, otherwise 0
cl_kernel m_subtractKernel;
// NOTE(review): not assigned or read by the implementation file in this commit — confirm whether it is still needed
btOpenCLArray<btInt4>* m_constbtOpenCLArray;
// scratch buffers for the COUNT option; allocated with `size` elements, or 0 when size is 0
btOpenCLArray<unsigned int>* m_lower;
btOpenCLArray<unsigned int>* m_upper;
// helper used to zero the scratch buffers; owned (deleted in the destructor)
btFillCL* m_filler;
/// @param size  element count of the COUNT scratch buffers; 0 disables the COUNT option
btBoundSearchCL(cl_context context, cl_device_id device, cl_command_queue queue, int size);
virtual ~btBoundSearchCL();
// src has to be sorted by key: src[i].m_key <= src[i+1].m_key
/// Device version: processes the first nSrc entries of src and writes into dst (nDst entries).
void execute( btOpenCLArray<btSortData>& src, int nSrc, btOpenCLArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER );
/// CPU version operating on host arrays, with the same parameters as execute().
void executeHost( btAlignedObjectArray<btSortData>& src, int nSrc, btAlignedObjectArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER);
};
#endif //BT_BOUNDSEARCH_H

View File

@@ -0,0 +1,19 @@
#ifndef BT_BUFFER_INFO_CL_H
#define BT_BUFFER_INFO_CL_H
#include "btOpenCLArray.h"
/// Pairs an OpenCL memory object with a read-only flag so that several buffers
/// can be handed to a launcher as one array.
struct btBufferInfoCL
{
	/// the OpenCL buffer handle
	cl_mem m_clBuffer;
	/// flag supplied by the creator; false unless stated otherwise
	bool m_isReadOnly;

	/// @param buff        buffer handle to wrap
	/// @param isReadOnly  value stored in m_isReadOnly
	btBufferInfoCL(cl_mem buff, bool isReadOnly = false)
	{
		m_clBuffer = buff;
		m_isReadOnly = isReadOnly;
	}
};
#endif //BT_BUFFER_INFO_CL_H

View File

@@ -0,0 +1,126 @@
#include "btFillCL.h"
#include "../../basic_initialize/btOpenCLUtils.h"
#include "btBufferInfoCL.h"
#include "btLauncherCL.h"
#define FILL_CL_PROGRAM_PATH "opencl/parallel_primitives/kernels/FillKernels.cl"
#include "../kernels/FillKernelsCL.h"
/// Compiles the embedded fill program and creates one kernel per supported element type.
/// @param ctx     OpenCL context used for compilation
/// @param device  device the program is compiled for
/// @param queue   command queue on which the execute() overloads launch their kernels
btFillCL::btFillCL(cl_context ctx, cl_device_id device, cl_command_queue queue)
	:m_commandQueue(queue)
{
	const char* kernelSource = fillKernelsCL;
	cl_int pErrNum;
	const char* additionalMacros = "";

	cl_program fillProg = btOpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, FILL_CL_PROGRAM_PATH);
	btAssert(fillProg);

	m_fillIntKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillIntKernel", &pErrNum, fillProg,additionalMacros );
	btAssert(m_fillIntKernel);

	m_fillUnsignedIntKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillUnsignedIntKernel", &pErrNum, fillProg,additionalMacros );
	// check the kernel that was just created, not the int kernel again
	btAssert(m_fillUnsignedIntKernel);

	m_fillFloatKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillFloatKernel", &pErrNum, fillProg,additionalMacros );
	btAssert(m_fillFloatKernel);

	m_fillKernelInt2 = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillInt2Kernel", &pErrNum, fillProg,additionalMacros );
	btAssert(m_fillKernelInt2);
}
/// Releases the four fill kernels created by the constructor.
btFillCL::~btFillCL()
{
	cl_kernel ownedKernels[] = { m_fillKernelInt2, m_fillIntKernel, m_fillUnsignedIntKernel, m_fillFloatKernel };
	const int numKernels = sizeof(ownedKernels)/sizeof(ownedKernels[0]);

	for (int k = 0; k < numKernels; ++k)
	{
		clReleaseKernel(ownedKernels[k]);
	}
}
/// Launches the float fill kernel on src.
/// @param src     device buffer to write to
/// @param value   value passed to the kernel
/// @param n       element count passed to the kernel and used as launch size; must be > 0
/// @param offset  offset passed to the kernel
void btFillCL::execute(btOpenCLArray<float>& src, const float value, int n, int offset)
{
	btAssert( n>0 );

	btLauncherCL fillLauncher( m_commandQueue, m_fillFloatKernel );
	fillLauncher.setBuffer( src.getBufferCL() );

	// kernel argument order after the buffer: count, value, offset
	fillLauncher.setConst( n );
	fillLauncher.setConst( value );
	fillLauncher.setConst( offset );

	fillLauncher.launch1D( n );
}
/// Launches the int fill kernel on src.
/// @param src     device buffer to write to
/// @param value   value passed to the kernel
/// @param n       element count passed to the kernel and used as launch size; must be > 0
/// @param offset  offset passed to the kernel
void btFillCL::execute(btOpenCLArray<int>& src, const int value, int n, int offset)
{
	btAssert( n>0 );

	btLauncherCL fillLauncher( m_commandQueue, m_fillIntKernel );
	fillLauncher.setBuffer( src.getBufferCL() );

	// kernel argument order after the buffer: count, value, offset
	fillLauncher.setConst( n );
	fillLauncher.setConst( value );
	fillLauncher.setConst( offset );

	fillLauncher.launch1D( n );
}
/// Launches the unsigned-int fill kernel on src.
/// @param src     device buffer to write to
/// @param value   value passed to the kernel
/// @param n       element count passed to the kernel and used as launch size; must be > 0
/// @param offset  offset passed to the kernel
void btFillCL::execute(btOpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset)
{
	btAssert( n>0 );

	btLauncherCL fillLauncher( m_commandQueue, m_fillUnsignedIntKernel );

	// single buffer argument: bound directly instead of through a one-element btBufferInfoCL array
	fillLauncher.setBuffer( src.getBufferCL() );

	// kernel argument order after the buffer: count, value, offset
	fillLauncher.setConst( n );
	fillLauncher.setConst( value );
	fillLauncher.setConst( offset );

	fillLauncher.launch1D( n );
}
/// CPU fill: assigns value to the n elements src[offset] .. src[offset+n-1].
void btFillCL::executeHost(btAlignedObjectArray<btInt2> &src, const btInt2 &value, int n, int offset)
{
	int written = 0;
	while (written < n)
	{
		src[offset + written] = value;
		++written;
	}
}
/// CPU fill: assigns value to the n elements src[offset] .. src[offset+n-1].
void btFillCL::executeHost(btAlignedObjectArray<int> &src, const int value, int n, int offset)
{
	int written = 0;
	while (written < n)
	{
		src[offset + written] = value;
		++written;
	}
}
/// Launches the int2 fill kernel on src.
/// @param src     device buffer to write to
/// @param value   two-component value passed to the kernel
/// @param n       element count passed to the kernel and used as launch size; must be > 0
/// @param offset  offset passed to the kernel
void btFillCL::execute(btOpenCLArray<btInt2> &src, const btInt2 &value, int n, int offset)
{
	btAssert( n>0 );

	btLauncherCL fillLauncher(m_commandQueue, m_fillKernelInt2);

	// single buffer argument: bound directly instead of through a one-element btBufferInfoCL array
	fillLauncher.setBuffer( src.getBufferCL() );

	// kernel argument order after the buffer: count, value, offset
	fillLauncher.setConst(n);
	fillLauncher.setConst(value);
	fillLauncher.setConst(offset);

	fillLauncher.launch1D( n );
}

View File

@@ -0,0 +1,137 @@
#ifndef BT_FILL_CL_H
#define BT_FILL_CL_H
#include "btOpenCLArray.h"
#include "btScalar.h"
// Four unsigned ints that can be addressed either by name (x,y,z,w) or by index (s[0..3]);
// the anonymous union makes both views share the same storage.
// NOTE(review): ATTRIBUTE_ALIGNED16 / BT_DECLARE_ALIGNED_ALLOCATOR are defined outside this file;
// presumably they request 16-byte alignment and matching new/delete — confirm in btScalar.h.
ATTRIBUTE_ALIGNED16(struct) btUnsignedInt4
{
BT_DECLARE_ALIGNED_ALLOCATOR();
union
{
// named view
struct
{
unsigned int x,y,z,w;
};
// indexed view of the same four values
struct
{
unsigned int s[4];
};
};
};
// Four ints that can be addressed either by name (x,y,z,w) or by index (s[0..3]);
// the anonymous union makes both views share the same storage.
// NOTE(review): ATTRIBUTE_ALIGNED16 / BT_DECLARE_ALIGNED_ALLOCATOR are defined outside this file;
// presumably they request 16-byte alignment and matching new/delete — confirm in btScalar.h.
ATTRIBUTE_ALIGNED16(struct) btInt4
{
BT_DECLARE_ALIGNED_ALLOCATOR();
union
{
// named view
struct
{
int x,y,z,w;
};
// indexed view of the same four values
struct
{
int s[4];
};
};
};
// Two unsigned ints that can be addressed either by name (x,y) or by index (s[0..1]);
// the anonymous union makes both views share the same storage.
struct btUnsignedInt2
{
union
{
// named view
struct
{
unsigned int x,y;
};
// indexed view of the same two values
struct
{
unsigned int s[2];
};
};
};
// Two ints that can be addressed either by name (x,y) or by index (s[0..1]);
// the anonymous union makes both views share the same storage.
struct btInt2
{
union
{
// named view
struct
{
int x,y;
};
// indexed view of the same two values
struct
{
int s[2];
};
};
};
/// Builds a btInt4 from its components; w defaults to 0.
SIMD_FORCE_INLINE btInt4 btMakeInt4(int x, int y, int z, int w = 0)
{
	btInt4 result;
	// the named members alias s[0..3] through the union inside btInt4
	result.x = x;
	result.y = y;
	result.z = z;
	result.w = w;
	return result;
}
/// Builds a btUnsignedInt4 from its components; w defaults to 0.
SIMD_FORCE_INLINE btUnsignedInt4 btMakeUnsignedInt4(unsigned int x, unsigned int y, unsigned int z, unsigned int w = 0)
{
	btUnsignedInt4 result;
	// the named members alias s[0..3] through the union inside btUnsignedInt4
	result.x = x;
	result.y = y;
	result.z = z;
	result.w = w;
	return result;
}
/// Fills a range of an array with a constant value, either on the device (execute overloads,
/// one OpenCL kernel per element type) or on the CPU (executeHost overloads).
class btFillCL
{
// queue on which the execute() overloads launch their kernels
cl_command_queue m_commandQueue;
// one kernel per supported element type; created in the constructor, released in the destructor
cl_kernel m_fillKernelInt2;
cl_kernel m_fillIntKernel;
cl_kernel m_fillUnsignedIntKernel;
cl_kernel m_fillFloatKernel;
public:
// NOTE(review): not referenced by the implementation file in this commit — presumably a leftover
// constant-buffer layout from an earlier version; confirm before relying on it
struct btConstData
{
union
{
btInt4 m_data;
btUnsignedInt4 m_UnsignedData;
};
int m_offset;
int m_n;
int m_padding[2];
};
protected:
public:
/// Compiles the fill program for the given context/device; queue is used for all later launches.
btFillCL(cl_context ctx, cl_device_id device, cl_command_queue queue);
virtual ~btFillCL();
/// Device fill; the implementations assert n > 0 and launch a kernel over n items with value and offset as arguments.
void execute(btOpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset = 0);
void execute(btOpenCLArray<int>& src, const int value, int n, int offset = 0);
void execute(btOpenCLArray<float>& src, const float value, int n, int offset = 0);
void execute(btOpenCLArray<btInt2>& src, const btInt2& value, int n, int offset = 0);
/// CPU fill: assigns value to src[offset] .. src[offset+n-1]. Unlike execute(), offset has no default here.
void executeHost(btAlignedObjectArray<btInt2> &src, const btInt2 &value, int n, int offset);
void executeHost(btAlignedObjectArray<int> &src, const int value, int n, int offset);
// void execute(btOpenCLArray<btInt4>& src, const btInt4& value, int n, int offset = 0);
};
#endif //BT_FILL_CL_H

View File

@@ -0,0 +1,450 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef BT_HASH_MAP_H
#define BT_HASH_MAP_H
#include "btAlignedObjectArray.h"
///very basic hashable string implementation, compatible with btHashMap
struct btHashString
{
	/// the key string; only the pointer is stored, the characters are not copied
	const char* m_string;
	/// hash of m_string, computed once in the constructor
	unsigned int	m_hash;

	/// Returns the hash computed at construction time.
	SIMD_FORCE_INLINE	unsigned int getHash()const
	{
		return m_hash;
	}

	/// Stores name and hashes its characters up to (not including) the terminating zero.
	/// name must be a valid zero-terminated string.
	btHashString(const char* name)
		:m_string(name)
	{
		/* magic numbers from http://www.isthe.com/chongo/tech/comp/fnv/ */
		static const unsigned int  InitialFNV = 2166136261u;
		static const unsigned int FNVMultiple = 16777619u;

		/* Fowler / Noll / Vo (FNV) Hash */
		unsigned int hash = InitialFNV;

		for(int i = 0; m_string[i]; i++)
		{
			// go through unsigned char so that bytes >= 0x80 are not sign-extended on
			// platforms where plain char is signed; only the low 8 bits may be mixed in
			hash = hash ^ (unsigned char)(m_string[i]);       /* xor  the low 8 bits */
			hash = hash * FNVMultiple;  /* multiply by the magic number */
		}
		m_hash = hash;
	}

	/// Byte-wise string comparison without relying on <string.h>.
	/// @return -1, 0 or 1 when src orders before, equal to, or after dst
	int portableStringCompare(const char* src,	const char* dst) const
	{
		int ret = 0 ;

		// advance while the bytes are equal and the end of dst has not been reached
		while( ! (ret = *(const unsigned char *)src - *(const unsigned char *)dst) && *dst)
			++src, ++dst;

		if ( ret < 0 )
			ret = -1 ;
		else if ( ret > 0 )
			ret = 1 ;

		return( ret );
	}

	/// Two keys are equal when they point at the same string or hold the same characters.
	bool equals(const btHashString& other) const
	{
		return (m_string == other.m_string) ||
			(0==portableStringCompare(m_string,other.m_string));
	}

};
// Sentinel stored in the hash/next tables and returned by btHashMap::findIndex to mean "no entry".
const int BT_HASH_NULL=0xffffffff;
/// Integer key usable with btHashMap.
class btHashInt
{
	/// the wrapped key value
	int	m_uid;
public:
	btHashInt(int uid)	:m_uid(uid)
	{
	}

	int	getUid1() const
	{
		return m_uid;
	}

	void	setUid1(int uid)
	{
		m_uid = uid;
	}

	/// Keys are equal when their integer values are equal.
	bool equals(const btHashInt& other) const
	{
		return getUid1() == other.getUid1();
	}

	//to our success
	/// Scrambles the key with Thomas Wang's integer hash.
	SIMD_FORCE_INLINE	unsigned int getHash()const
	{
		// mix in unsigned arithmetic: shifting/overflowing a signed int is undefined,
		// and right-shifting a negative value is implementation-defined
		unsigned int key = (unsigned int)m_uid;
		// Thomas Wang's hash
		key += ~(key << 15);
		key ^=  (key >> 10);
		key +=  (key << 3);
		key ^=  (key >> 6);
		key += ~(key << 11);
		key ^=  (key >> 16);
		return key;
	}
};
/// Pointer key usable with btHashMap.
class btHashPtr
{
	union
	{
		/// the wrapped pointer
		const void*	m_pointer;
		/// the same bytes viewed as ints; the second entry is only read when pointers are 8 bytes wide
		int	m_hashValues[2];
	};

public:

	btHashPtr(const void* ptr)
		:m_pointer(ptr)
	{
	}

	const void*	getPointer() const
	{
		return m_pointer;
	}

	/// Keys are equal when they wrap the same address.
	bool equals(const btHashPtr& other) const
	{
		return getPointer() == other.getPointer();
	}

	//to our success
	/// Folds the pointer bytes into 32 bits and scrambles them with Thomas Wang's integer hash.
	SIMD_FORCE_INLINE	unsigned int getHash()const
	{
		const bool VOID_IS_8 = ((sizeof(void*)==8));

		// fold and mix in unsigned arithmetic: signed overflow and shifts of negative
		// values are undefined / implementation-defined
		unsigned int key = VOID_IS_8
			? (unsigned int)m_hashValues[0] + (unsigned int)m_hashValues[1]
			: (unsigned int)m_hashValues[0];

		// Thomas Wang's hash
		key += ~(key << 15);
		key ^=  (key >> 10);
		key +=  (key << 3);
		key ^=  (key >> 6);
		key += ~(key << 11);
		key ^=  (key >> 16);
		return key;
	}
};
/// Integer key usable with btHashMap; the Value template parameter only tags the key type.
template <class Value>
class btHashKeyPtr
{
	/// the wrapped key value
	int	m_uid;
public:

	btHashKeyPtr(int uid)	:m_uid(uid)
	{
	}

	int	getUid1() const
	{
		return m_uid;
	}

	/// Keys are equal when their integer values are equal.
	bool equals(const btHashKeyPtr<Value>& other) const
	{
		return getUid1() == other.getUid1();
	}

	//to our success
	/// Scrambles the key with Thomas Wang's integer hash.
	SIMD_FORCE_INLINE	unsigned int getHash()const
	{
		// mix in unsigned arithmetic: shifting/overflowing a signed int is undefined,
		// and right-shifting a negative value is implementation-defined
		unsigned int key = (unsigned int)m_uid;
		// Thomas Wang's hash
		key += ~(key << 15);
		key ^=  (key >> 10);
		key +=  (key << 3);
		key ^=  (key >> 6);
		key += ~(key << 11);
		key ^=  (key >> 16);
		return key;
	}
};
/// Integer key usable with btHashMap; the Value template parameter only tags the key type.
template <class Value>
class btHashKey
{
	/// the wrapped key value
	int	m_uid;
public:

	btHashKey(int uid)	:m_uid(uid)
	{
	}

	int	getUid1() const
	{
		return m_uid;
	}

	/// Keys are equal when their integer values are equal.
	bool equals(const btHashKey<Value>& other) const
	{
		return getUid1() == other.getUid1();
	}

	//to our success
	/// Scrambles the key with Thomas Wang's integer hash.
	SIMD_FORCE_INLINE	unsigned int getHash()const
	{
		// mix in unsigned arithmetic: shifting/overflowing a signed int is undefined,
		// and right-shifting a negative value is implementation-defined
		unsigned int key = (unsigned int)m_uid;
		// Thomas Wang's hash
		key += ~(key << 15);
		key ^=  (key >> 10);
		key +=  (key << 3);
		key ^=  (key >> 6);
		key += ~(key << 11);
		key ^=  (key >> 16);
		return key;
	}
};
///The btHashMap template class implements a generic and lightweight hashmap.
///A basic sample of how to use btHashMap is located in Demos\BasicDemo\main.cpp
// Chained hash map stored in four parallel arrays. Keys and values live densely in
// m_keyArray / m_valueArray (entry i of one belongs to entry i of the other).
// A key's bucket is getHash() & (m_valueArray.capacity()-1); the mask only reaches every
// bucket when that capacity is a power of two, which is what push_back growth
// (doubling, starting at 1) produces.
// Key must provide getHash() and equals(); Value must be copyable.
// NOTE(review): copying a btHashMap copies the arrays, and btAlignedObjectArray's copy sets
// capacity to the element count — the bucket mask of the copy may then no longer match
// the table layout; confirm before copying maps.
template <class Key, class Value>
class btHashMap
{
protected:
// per bucket: index of the first entry in that bucket's chain, or BT_HASH_NULL
btAlignedObjectArray<int> m_hashTable;
// per entry: index of the next entry in the same bucket chain, or BT_HASH_NULL
btAlignedObjectArray<int> m_next;
// values and keys, stored densely in insertion order (remove() moves the last entry into the freed slot)
btAlignedObjectArray<Value> m_valueArray;
btAlignedObjectArray<Key> m_keyArray;
// Called after the value array has grown: resizes both tables to the new capacity,
// clears them and re-links the first curHashtableSize entries under the new mask.
// The key argument is unused.
void growTables(const Key& /*key*/)
{
int newCapacity = m_valueArray.capacity();
if (m_hashTable.size() < newCapacity)
{
//grow hashtable and next table
int curHashtableSize = m_hashTable.size();
m_hashTable.resize(newCapacity);
m_next.resize(newCapacity);
int i;
for (i= 0; i < newCapacity; ++i)
{
m_hashTable[i] = BT_HASH_NULL;
}
for (i = 0; i < newCapacity; ++i)
{
m_next[i] = BT_HASH_NULL;
}
// re-insert the entries that existed before the growth at the head of their new bucket chains
for(i=0;i<curHashtableSize;i++)
{
//const Value& value = m_valueArray[i];
//const Key& key = m_keyArray[i];
int hashValue = m_keyArray[i].getHash() & (m_valueArray.capacity()-1); // New hash value with new mask
m_next[i] = m_hashTable[hashValue];
m_hashTable[hashValue] = i;
}
}
}
public:
// Adds key -> value, or overwrites the value when the key is already present.
void insert(const Key& key, const Value& value) {
int hash = key.getHash() & (m_valueArray.capacity()-1);
//replace value if the key is already there
int index = findIndex(key);
if (index != BT_HASH_NULL)
{
m_valueArray[index]=value;
return;
}
// the new entry is appended, so its index equals the old element count
int count = m_valueArray.size();
int oldCapacity = m_valueArray.capacity();
m_valueArray.push_back(value);
m_keyArray.push_back(key);
int newCapacity = m_valueArray.capacity();
if (oldCapacity < newCapacity)
{
growTables(key);
//hash with new capacity
hash = key.getHash() & (m_valueArray.capacity()-1);
}
// link the new entry at the head of its bucket chain
m_next[count] = m_hashTable[hash];
m_hashTable[hash] = count;
}
// Removes key if present. The last entry is moved into the freed slot so the arrays stay
// dense; indices previously obtained for the last entry are therefore invalidated.
void remove(const Key& key) {
int hash = key.getHash() & (m_valueArray.capacity()-1);
int pairIndex = findIndex(key);
if (pairIndex ==BT_HASH_NULL)
{
return;
}
// Remove the pair from the hash table.
int index = m_hashTable[hash];
btAssert(index != BT_HASH_NULL);
// walk the chain to find the predecessor of the entry being removed
int previous = BT_HASH_NULL;
while (index != pairIndex)
{
previous = index;
index = m_next[index];
}
if (previous != BT_HASH_NULL)
{
btAssert(m_next[previous] == pairIndex);
m_next[previous] = m_next[pairIndex];
}
else
{
// the entry was the head of its chain
m_hashTable[hash] = m_next[pairIndex];
}
// We now move the last pair into spot of the
// pair being removed. We need to fix the hash
// table indices to support the move.
int lastPairIndex = m_valueArray.size() - 1;
// If the removed pair is the last pair, we are done.
if (lastPairIndex == pairIndex)
{
m_valueArray.pop_back();
m_keyArray.pop_back();
return;
}
// Remove the last pair from the hash table.
int lastHash = m_keyArray[lastPairIndex].getHash() & (m_valueArray.capacity()-1);
index = m_hashTable[lastHash];
btAssert(index != BT_HASH_NULL);
previous = BT_HASH_NULL;
while (index != lastPairIndex)
{
previous = index;
index = m_next[index];
}
if (previous != BT_HASH_NULL)
{
btAssert(m_next[previous] == lastPairIndex);
m_next[previous] = m_next[lastPairIndex];
}
else
{
m_hashTable[lastHash] = m_next[lastPairIndex];
}
// Copy the last pair into the remove pair's spot.
m_valueArray[pairIndex] = m_valueArray[lastPairIndex];
m_keyArray[pairIndex] = m_keyArray[lastPairIndex];
// Insert the last pair into the hash table
m_next[pairIndex] = m_hashTable[lastHash];
m_hashTable[lastHash] = pairIndex;
m_valueArray.pop_back();
m_keyArray.pop_back();
}
// Number of key/value pairs stored.
int size() const
{
return m_valueArray.size();
}
// Direct access to the value stored at a dense index (0 .. size()-1); only the upper bound is asserted here.
const Value* getAtIndex(int index) const
{
btAssert(index < m_valueArray.size());
return &m_valueArray[index];
}
Value* getAtIndex(int index)
{
btAssert(index < m_valueArray.size());
return &m_valueArray[index];
}
// Same as find(): returns a pointer to the value, or NULL when the key is absent. Never inserts.
Value* operator[](const Key& key) {
return find(key);
}
// Returns a pointer to the value stored for key, or NULL when the key is absent.
const Value* find(const Key& key) const
{
int index = findIndex(key);
if (index == BT_HASH_NULL)
{
return NULL;
}
return &m_valueArray[index];
}
Value* find(const Key& key)
{
int index = findIndex(key);
if (index == BT_HASH_NULL)
{
return NULL;
}
return &m_valueArray[index];
}
// Returns the dense index of key, or BT_HASH_NULL when the key is absent.
int findIndex(const Key& key) const
{
unsigned int hash = key.getHash() & (m_valueArray.capacity()-1);
// also covers the empty map: capacity 0 gives an all-ones mask and the table has size 0
if (hash >= (unsigned int)m_hashTable.size())
{
return BT_HASH_NULL;
}
// follow the bucket chain until the key matches or the chain ends
int index = m_hashTable[hash];
while ((index != BT_HASH_NULL) && key.equals(m_keyArray[index]) == false)
{
index = m_next[index];
}
return index;
}
// Removes all entries and releases the storage of all four arrays.
void clear()
{
m_hashTable.clear();
m_next.clear();
m_valueArray.clear();
m_keyArray.clear();
}
};
#endif //BT_HASH_MAP_H

View File

@@ -0,0 +1,363 @@
#ifndef BT_LAUNCHER_CL_H
#define BT_LAUNCHER_CL_H
#include "btBufferInfoCL.h"
#include "btMinMax.h"
#include "btOpenCLArray.h"
#include <stdio.h>
#ifdef _WIN32
#pragma warning(disable :4996)
#endif
#define BT_CL_MAX_ARG_SIZE 16
// Record of one kernel argument as kept (and serialized) by btLauncherCL.
struct btKernelArgData
{
// non-zero when the argument is an OpenCL buffer (m_clBuffer valid), zero when it is inline data (m_argData valid)
int m_isBuffer;
// position of the argument in the kernel's argument list
int m_argIndex;
// for buffers: byte size of the buffer; for inline data: size passed to clSetKernelArg
int m_argSizeInBytes;
union
{
cl_mem m_clBuffer;
// inline argument bytes; at most BT_CL_MAX_ARG_SIZE
unsigned char m_argData[BT_CL_MAX_ARG_SIZE];
};
};
class btLauncherCL
{
cl_command_queue m_commandQueue;
cl_kernel m_kernel;
int m_idx;
btAlignedObjectArray<btKernelArgData> m_kernelArguments;
int m_serializationSizeInBytes;
public:
btAlignedObjectArray<btOpenCLArray<unsigned char>* > m_arrays;
/// Prepares a launcher for kernel on queue; arguments are then added in order starting at index 0.
/// The serialization size starts at sizeof(int), the room taken by the argument count
/// that deserializeArgs reads first.
btLauncherCL(cl_command_queue queue, cl_kernel kernel)
	:m_commandQueue(queue),
	m_kernel(kernel),
	m_idx(0),
	m_serializationSizeInBytes(sizeof(int))
{
}
/// Releases the OpenCL buffers of the arrays collected in m_arrays.
/// NOTE(review): the btOpenCLArray objects themselves (allocated with new in deserializeArgs)
/// are not deleted here — confirm whether they are freed elsewhere or leak.
virtual ~btLauncherCL()
{
	const int numArrays = m_arrays.size();
	for (int a = 0; a < numArrays; ++a)
	{
		clReleaseMemObject(m_arrays[a]->getBufferCL());
	}
}
/// Binds clBuffer as the next kernel argument and records it for serialization.
inline void setBuffer( cl_mem clBuffer)
{
	// ask the runtime how many bytes the buffer holds; that size is stored with the argument record
	size_t bufferBytes;
	size_t bytesReturned;
	cl_int err = clGetMemObjectInfo ( clBuffer, CL_MEM_SIZE, sizeof(size_t), &bufferBytes, &bytesReturned);
	btAssert( err == CL_SUCCESS );

	btKernelArgData kernelArg;
	kernelArg.m_argIndex = m_idx;
	kernelArg.m_isBuffer = 1;
	kernelArg.m_clBuffer = clBuffer;
	kernelArg.m_argSizeInBytes = bufferBytes;
	m_kernelArguments.push_back(kernelArg);

	// a serialized buffer argument takes one record plus the buffer contents
	m_serializationSizeInBytes+= sizeof(btKernelArgData);
	m_serializationSizeInBytes+=bufferBytes;

	cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &clBuffer);
	btAssert( status == CL_SUCCESS );
}
/// Binds the n buffers of buffInfo, in order, as the next n kernel arguments.
/// Only m_clBuffer of each entry is used; the read-only flag is ignored here.
inline void setBuffers( btBufferInfoCL* buffInfo, int n )
{
	for (int b = 0; b < n; ++b)
	{
		// per-buffer work (size query, bookkeeping, clSetKernelArg) is exactly what setBuffer does
		setBuffer( buffInfo[b].m_clBuffer );
	}
}
int getSerializationBufferSize() const
{
	// Running byte total maintained by the constructor, setBuffer(), setConst()
	// and deserializeArgs().
	const int totalBytes = m_serializationSizeInBytes;
	return totalBytes;
}
inline int deserializeArgs(unsigned char* buf, int bufSize, cl_context ctx)
{
int index=0;
int numArguments = *(int*) &buf[index];
index+=sizeof(int);
for (int i=0;i<numArguments;i++)
{
btKernelArgData* arg = (btKernelArgData*)&buf[index];
index+=sizeof(btKernelArgData);
if (arg->m_isBuffer)
{
btOpenCLArray<unsigned char>* clData = new btOpenCLArray<unsigned char>(ctx,m_commandQueue, arg->m_argSizeInBytes);
clData->resize(arg->m_argSizeInBytes);
clData->copyFromHostPointer(&buf[index], arg->m_argSizeInBytes);
arg->m_clBuffer = clData->getBufferCL();
m_arrays.push_back(clData);
cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &arg->m_clBuffer);
btAssert( status == CL_SUCCESS );
index+=arg->m_argSizeInBytes;
} else
{
cl_int status = clSetKernelArg( m_kernel, m_idx++, arg->m_argSizeInBytes, &arg->m_argData);
btAssert( status == CL_SUCCESS );
}
m_kernelArguments.push_back(*arg);
}
m_serializationSizeInBytes = index;
return index;
}
inline int validateResults(unsigned char* goldBuffer, int goldBufferCapacity, cl_context ctx)
{
	// Compares the current kernel arguments against a serialized "gold" buffer
	// (same layout as produced by serializeArguments).
	// Returns the number of gold bytes consumed on success, or a negative code:
	//  -1 argument count differs, -2 argument size differs, -3 buffer/const kind differs,
	//  -4 device buffer content differs, -5 constant data differs.
	// NOTE: goldBufferCapacity and ctx are not used.
	int index=0;
	int numArguments = *(int*) &goldBuffer[index];
	index+=sizeof(int);
	if (numArguments != m_kernelArguments.size())
	{
		printf("failed validation: expected %d arguments, found %d\n",numArguments, m_kernelArguments.size());
		return -1;
	}
	for (int ii=0;ii<numArguments;ii++)
	{
		btKernelArgData* argGold = (btKernelArgData*)&goldBuffer[index];
		if (m_kernelArguments[ii].m_argSizeInBytes != argGold->m_argSizeInBytes)
		{
			printf("failed validation: argument %d sizeInBytes expected: %d, found %d\n",ii, argGold->m_argSizeInBytes, m_kernelArguments[ii].m_argSizeInBytes);
			return -2;
		}
		{
			int expected = argGold->m_isBuffer;
			int found = m_kernelArguments[ii].m_isBuffer;
			if (expected != found)
			{
				printf("failed validation: argument %d isBuffer expected: %d, found %d\n",ii,expected, found);
				return -3;
			}
		}
		index+=sizeof(btKernelArgData);
		if (argGold->m_isBuffer)
		{
			unsigned char* memBuf= (unsigned char*) malloc(m_kernelArguments[ii].m_argSizeInBytes);
			unsigned char* goldBuf = &goldBuffer[index];
			// Pre-fill with a known pattern. The bound must use argument ii
			// (the previous code indexed m_kernelArguments with the byte counter j).
			for (int j=0;j<m_kernelArguments[ii].m_argSizeInBytes;j++)
			{
				memBuf[j] = 0xaa;
			}
			cl_int status = 0;
			// Blocking read of the device buffer into host memory.
			status = clEnqueueReadBuffer( m_commandQueue, m_kernelArguments[ii].m_clBuffer, CL_TRUE, 0, m_kernelArguments[ii].m_argSizeInBytes,
				memBuf, 0,0,0 );
			btAssert( status==CL_SUCCESS );
			clFinish(m_commandQueue);
			for (int b=0;b<m_kernelArguments[ii].m_argSizeInBytes;b++)
			{
				int expected = goldBuf[b];
				int found = memBuf[b];
				if (expected != found)
				{
					printf("failed validation: argument %d OpenCL data at byte position %d expected: %d, found %d\n",
						ii, b, expected, found);
					free(memBuf); // do not leak the staging buffer on the failure path
					return -4;
				}
			}
			free(memBuf); // staging buffer is only needed for this argument
			index+=argGold->m_argSizeInBytes;
		} else
		{
			//compare content
			for (int b=0;b<m_kernelArguments[ii].m_argSizeInBytes;b++)
			{
				int expected = argGold->m_argData[b];
				int found =m_kernelArguments[ii].m_argData[b];
				if (expected != found)
				{
					printf("failed validation: argument %d const data at byte position %d expected: %d, found %d\n",
						ii, b, expected, found);
					return -5;
				}
			}
		}
	}
	return index;
}
inline int serializeArguments(unsigned char* destBuffer, int destBufferCapacity)
{
//initialize to known values
for (int i=0;i<destBufferCapacity;i++)
destBuffer[i] = 0xec;
assert(destBufferCapacity>=m_serializationSizeInBytes);
//todo: use the btSerializer for this to allow for 32/64bit, endianness etc
int numArguments = m_kernelArguments.size();
int curBufferSize = 0;
int* dest = (int*)&destBuffer[curBufferSize];
*dest = numArguments;
curBufferSize += sizeof(int);
for (int i=0;i<this->m_kernelArguments.size();i++)
{
btKernelArgData* arg = (btKernelArgData*) &destBuffer[curBufferSize];
*arg = m_kernelArguments[i];
curBufferSize+=sizeof(btKernelArgData);
if (arg->m_isBuffer==1)
{
//copy the OpenCL buffer content
cl_int status = 0;
status = clEnqueueReadBuffer( m_commandQueue, arg->m_clBuffer, 0, 0, arg->m_argSizeInBytes,
&destBuffer[curBufferSize], 0,0,0 );
btAssert( status==CL_SUCCESS );
clFinish(m_commandQueue);
curBufferSize+=arg->m_argSizeInBytes;
}
}
return curBufferSize;
}
void serializeToFile(const char* fileName, int numWorkItems)
{
	// File layout: the serialized kernel arguments (getSerializationBufferSize() bytes)
	// followed by one int holding numWorkItems.
	int num = numWorkItems;
	int buffSize = getSerializationBufferSize();
	unsigned char* buf = new unsigned char[buffSize+sizeof(int)];
	// Fill the argument area plus one guard byte with 0xff; the guard byte at
	// buf[buffSize] is checked after serialization to detect an overrun.
	for (int i=0;i<buffSize+1;i++)
	{
		buf[i] = 0xff;
	}
	serializeArguments(buf,buffSize);
	assert(buf[buffSize]==0xff);//check for buffer overrun
	int* ptr = (int*)&buf[buffSize];
	*ptr = num;
	FILE* f = fopen(fileName,"wb");
	if (f)
	{
		fwrite(buf,buffSize+sizeof(int),1,f);
		fclose(f);
	} else
	{
		// fopen returns NULL on failure; writing to or closing a NULL stream is invalid
		printf("Error: cannot open file %s for writing\n", fileName);
	}
	delete[] buf;
}
template<typename T>
inline void setConst( const T& consts )
{
int sz=sizeof(T);
btAssert(sz<=BT_CL_MAX_ARG_SIZE);
btKernelArgData kernelArg;
kernelArg.m_argIndex = m_idx;
kernelArg.m_isBuffer = 0;
T* destArg = (T*)kernelArg.m_argData;
*destArg = consts;
kernelArg.m_argSizeInBytes = sizeof(T);
m_kernelArguments.push_back(kernelArg);
m_serializationSizeInBytes+=sizeof(btKernelArgData);
cl_int status = clSetKernelArg( m_kernel, m_idx++, sz, &consts );
btAssert( status == CL_SUCCESS );
}
inline void launch1D( int numThreads, int localSize = 64)
{
	// A 1D launch is a 2D launch with a single row and a local size of 1 in Y.
	const int numThreadsY = 1;
	const int localSizeY = 1;
	launch2D( numThreads, numThreadsY, localSize, localSizeY );
}
inline void launch2D( int numThreadsX, int numThreadsY, int localSizeX, int localSizeY )
{
	// Global sizes are the requested thread counts rounded up to a whole number of
	// work-groups (at least one group per axis); the third dimension stays at 1.
	size_t lRange[3] = {1,1,1};
	size_t gRange[3] = {1,1,1};
	lRange[0] = localSizeX;
	lRange[1] = localSizeY;

	const int requested[2] = { numThreadsX, numThreadsY };
	for (int axis=0; axis<2; axis++)
	{
		size_t numGroups = requested[axis]/lRange[axis];
		if (requested[axis]%lRange[axis])
		{
			numGroups += 1;	// partial group needed for the remainder
		}
		gRange[axis] = btMax((size_t)1, numGroups);
		gRange[axis] *= lRange[axis];
	}

	cl_int status = clEnqueueNDRangeKernel( m_commandQueue,
		m_kernel, 2, NULL, gRange, lRange, 0,0,0 );
	if (status != CL_SUCCESS)
	{
		printf("Error: OpenCL status = %d\n",status);
	}
	btAssert( status == CL_SUCCESS );
}
};
#endif //BT_LAUNCHER_CL_H

View File

@@ -0,0 +1,71 @@
/*
Copyright (c) 2003-2006 Gino van den Bergen / Erwin Coumans http://continuousphysics.com/Bullet/
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef BT_GEN_MINMAX_H
#define BT_GEN_MINMAX_H
#include "btScalar.h"
/// Returns a reference to the smaller of a and b (b when they compare equal).
template <class T>
SIMD_FORCE_INLINE const T& btMin(const T& a, const T& b)
{
	if (a < b)
	{
		return a;
	}
	return b;
}
/// Returns a reference to the larger of a and b (b when they compare equal).
template <class T>
SIMD_FORCE_INLINE const T& btMax(const T& a, const T& b)
{
	if (a > b)
	{
		return a;
	}
	return b;
}
/// Returns a reference to lb when a < lb, to ub when ub < a, otherwise to a.
template <class T>
SIMD_FORCE_INLINE const T& btClamped(const T& a, const T& lb, const T& ub)
{
	if (a < lb)
	{
		return lb;
	}
	if (ub < a)
	{
		return ub;
	}
	return a;
}
/// Lowers a to b when b is smaller; otherwise leaves a untouched.
template <class T>
SIMD_FORCE_INLINE void btSetMin(T& a, const T& b)
{
	if (!(b < a))
	{
		return;
	}
	a = b;
}
/// Raises a to b when b is larger; otherwise leaves a untouched.
template <class T>
SIMD_FORCE_INLINE void btSetMax(T& a, const T& b)
{
	if (!(a < b))
	{
		return;
	}
	a = b;
}
/// Clamps a in place: a becomes lb when a < lb, else ub when ub < a.
template <class T>
SIMD_FORCE_INLINE void btClamp(T& a, const T& lb, const T& ub)
{
	if (a < lb)
	{
		a = lb;
		return;
	}
	if (ub < a)
	{
		a = ub;
	}
}
#endif //BT_GEN_MINMAX_H

View File

@@ -0,0 +1,274 @@
#ifndef BT_OPENCL_ARRAY_H
#define BT_OPENCL_ARRAY_H
#include "btAlignedObjectArray.h"
#include "../../basic_initialize/btOpenCLInclude.h"
/// Resizable array of T whose storage is an OpenCL buffer (cl_mem).
/// Element data is moved between host and device through the command queue
/// passed to the constructor.
/// NOTE(review): assignment is disabled below but the implicit copy constructor is
/// not; copying an owning array would make two destructors release the same cl_mem.
/// Confirm no caller copies by value.
template <typename T>
class btOpenCLArray
{
	int m_size;						// number of elements in use
	int m_capacity;					// number of elements the current buffer can hold
	cl_mem m_clBuffer;				// device storage (0 when nothing is allocated)
	cl_context m_clContext;
	cl_command_queue m_commandQueue;
	bool m_ownsMemory;				// when true, deallocate() releases m_clBuffer
	bool m_allowGrowingCapacity;	// when false, reserve() beyond capacity asserts

	/// Releases the buffer when it is owned, then clears the handle and capacity.
	/// m_size is left unchanged.
	void deallocate()
	{
		if (m_clBuffer && m_ownsMemory)
		{
			clReleaseMemObject(m_clBuffer);
		}
		m_clBuffer = 0;
		m_capacity=0;
	}

	/// Declared private: assignment of one array to another is disabled.
	btOpenCLArray<T>& operator=(const btOpenCLArray<T>& src);

	/// Growth policy used by push_back: double the size, or 1 when empty.
	SIMD_FORCE_INLINE int allocSize(int size)
	{
		return (size ? size*2 : 1);
	}

public:

	/// Creates an empty array; when initialCapacity is non-zero a buffer of that many
	/// elements is reserved up front. allowGrowingCapacity takes effect after that
	/// initial reservation.
	btOpenCLArray(cl_context ctx, cl_command_queue queue, int initialCapacity=0, bool allowGrowingCapacity=true)
	:m_size(0), m_capacity(0),m_clBuffer(0),
	m_clContext(ctx),m_commandQueue(queue),
	m_ownsMemory(true),m_allowGrowingCapacity(true)
	{
		if (initialCapacity)
		{
			reserve(initialCapacity);
		}
		m_allowGrowingCapacity = allowGrowingCapacity;
	}

	///this is an error-prone method with no error checking, be careful!
	///Wraps an existing buffer: the array stops owning memory, can no longer grow,
	///and takes sizeInElements as both size and capacity.
	void setFromOpenCLBuffer(cl_mem buffer, int sizeInElements)
	{
		deallocate();
		m_ownsMemory = false;
		m_allowGrowingCapacity = false;
		m_clBuffer = buffer;
		m_size = sizeInElements;
		m_capacity = sizeInElements;
	}

	// we could enable this assignment, but need to make sure to avoid accidental deep copies
	//	btOpenCLArray<T>& operator=(const btAlignedObjectArray<T>& src)
	//	{
	//		copyFromArray(src);
	//		return *this;
	//	}

	/// Returns the underlying cl_mem handle (0 when nothing is allocated).
	cl_mem getBufferCL() const
	{
		return m_clBuffer;
	}

	virtual ~btOpenCLArray()
	{
		deallocate();
		m_size=0;
		m_capacity=0;
	}

	/// Appends one element, growing the buffer with allocSize() when it is full.
	SIMD_FORCE_INLINE void push_back(const T& _Val,bool waitForCompletion=true)
	{
		int sz = size();
		if( sz == capacity() )
		{
			reserve( allocSize(size()) );
		}
		copyFromHostPointer(&_Val, 1, sz, waitForCompletion);
		m_size++;
	}

	/// Reads element n back to the host; n is checked against capacity(), not size().
	SIMD_FORCE_INLINE T forcedAt(int n) const
	{
		btAssert(n>=0);
		btAssert(n<capacity());
		T elem;
		copyToHostPointer(&elem,1,n,true);
		return elem;
	}

	/// Reads element n back to the host; n is checked against size().
	SIMD_FORCE_INLINE T at(int n) const
	{
		btAssert(n>=0);
		btAssert(n<size());
		T elem;
		copyToHostPointer(&elem,1,n,true);
		return elem;
	}

	/// Sets the element count. Growing reserves storage (optionally preserving the
	/// old contents); shrinking keeps the existing buffer.
	SIMD_FORCE_INLINE void resize(int newsize, bool copyOldContents=true)
	{
		int curSize = size();
		if (newsize < curSize)
		{
			//leave the OpenCL memory for now
		} else
		{
			if (newsize > size())
			{
				reserve(newsize,copyOldContents);
			}
			//leave new data uninitialized (init in debug mode?)
			//for (int i=curSize;i<newsize;i++) ...
		}
		m_size = newsize;
	}

	SIMD_FORCE_INLINE int size() const
	{
		return m_size;
	}

	SIMD_FORCE_INLINE int capacity() const
	{
		return m_capacity;
	}

	/// Ensures room for at least _Count elements by creating a new buffer and, when
	/// copyOldContents is set, copying the first size() elements into it.
	SIMD_FORCE_INLINE void reserve(int _Count, bool copyOldContents=true)
	{	// determine new minimum length of allocated storage
		if (capacity() < _Count)
		{	// not enough room, reallocate
			if (m_allowGrowingCapacity)
			{
				cl_int ciErrNum;
				//create a new OpenCL buffer
				//byte count kept in size_t: an int would overflow for large buffers
				size_t memSizeInBytes = sizeof(T)*_Count;
				cl_mem buf = clCreateBuffer(m_clContext, CL_MEM_READ_WRITE, memSizeInBytes, NULL, &ciErrNum);
				btAssert(ciErrNum==CL_SUCCESS);
//#define BT_ALWAYS_INITIALIZE_OPENCL_BUFFERS
#ifdef BT_ALWAYS_INITIALIZE_OPENCL_BUFFERS
				unsigned char* src = (unsigned char*)malloc(memSizeInBytes);
				for (size_t i=0;i<memSizeInBytes;i++)
					src[i] = 0xbb;
				ciErrNum = clEnqueueWriteBuffer( m_commandQueue, buf, CL_TRUE, 0, memSizeInBytes, src, 0,0,0 );
				btAssert(ciErrNum==CL_SUCCESS);
				clFinish(m_commandQueue);
				free(src);
#endif //BT_ALWAYS_INITIALIZE_OPENCL_BUFFERS
				if (copyOldContents)
					copyToCL(buf, size());
				//deallocate the old buffer
				deallocate();
				m_clBuffer = buf;
				m_capacity = _Count;
			} else
			{
				//fail: growing is not allowed for this array; assert and drop the buffer
				btAssert(0);
				deallocate();
			}
		}
	}

	/// Device-to-device copy of numElements elements, starting at firstElem in this
	/// array, into destination at element offset dstOffsetInElems. No-op when
	/// numElements <= 0. The copy is enqueued; this method does not wait for it.
	void copyToCL(cl_mem destination, int numElements, int firstElem=0, int dstOffsetInElems=0) const
	{
		if (numElements<=0)
			return;
		btAssert(m_clBuffer);
		btAssert(destination);
		//likely some error, destination is same as source
		btAssert(m_clBuffer != destination);
		btAssert((firstElem+numElements)<=m_size);
		cl_int status = 0;
		btAssert(numElements>0);
		btAssert(numElements<=m_size);
		//offsets kept in size_t (the type clEnqueueCopyBuffer takes) to avoid int overflow
		size_t srcOffsetBytes = sizeof(T)*firstElem;
		size_t dstOffsetInBytes = sizeof(T)*dstOffsetInElems;
		status = clEnqueueCopyBuffer( m_commandQueue, m_clBuffer, destination,
			srcOffsetBytes, dstOffsetInBytes, sizeof(T)*numElements, 0, 0, 0 );
		btAssert( status == CL_SUCCESS );
	}

	/// Replaces the contents with those of a host array (old contents are not preserved).
	void copyFromHost(const btAlignedObjectArray<T>& srcArray, bool waitForCompletion=true)
	{
		int newSize = srcArray.size();
		bool copyOldContents = false;
		resize (newSize,copyOldContents);
		if (newSize)
			copyFromHostPointer(&srcArray[0],newSize,0,waitForCompletion);
	}

	/// Enqueues a non-blocking write of numElems host elements to the device, starting
	/// at element destFirstElem; calls clFinish when waitForCompletion is set.
	/// When waitForCompletion is false the caller must keep src valid until the
	/// queue has executed the write.
	void copyFromHostPointer(const T* src, int numElems, int destFirstElem= 0, bool waitForCompletion=true)
	{
		btAssert(numElems+destFirstElem <= capacity());
		cl_int status = 0;
		size_t sizeInBytes=sizeof(T)*numElems;
		status = clEnqueueWriteBuffer( m_commandQueue, m_clBuffer, 0, sizeof(T)*destFirstElem, sizeInBytes,
			src, 0,0,0 );
		btAssert(status == CL_SUCCESS );
		if (waitForCompletion)
			clFinish(m_commandQueue);
	}

	/// Resizes destArray to size() and reads all elements back into it.
	void copyToHost(btAlignedObjectArray<T>& destArray, bool waitForCompletion=true) const
	{
		destArray.resize(this->size());
		if (size())
			copyToHostPointer(&destArray[0], size(),0,waitForCompletion);
	}

	/// Enqueues a non-blocking read of numElem elements, starting at srcFirstElem,
	/// into destPtr; calls clFinish when waitForCompletion is set.
	void copyToHostPointer(T* destPtr, int numElem, int srcFirstElem=0, bool waitForCompletion=true) const
	{
		btAssert(numElem+srcFirstElem <= capacity());
		cl_int status = 0;
		status = clEnqueueReadBuffer( m_commandQueue, m_clBuffer, 0, sizeof(T)*srcFirstElem, sizeof(T)*numElem,
			destPtr, 0,0,0 );
		btAssert( status==CL_SUCCESS );
		if (waitForCompletion)
			clFinish(m_commandQueue);
	}

	/// Makes this array a device-side copy of src.
	void copyFromOpenCLArray(const btOpenCLArray& src)
	{
		int newSize = src.size();
		//every element in [0,newSize) is overwritten below, so the old contents
		//do not need to be carried over when the buffer is reallocated
		resize(newSize, false);
		if (size())
		{
			src.copyToCL(m_clBuffer,size());
		}
	}
};
#endif //BT_OPENCL_ARRAY_H

View File

@@ -0,0 +1,126 @@
#include "btPrefixScanCL.h"
#include "btFillCL.h"
#define BT_PREFIXSCAN_PROG_PATH "opencl/parallel_primitives/kernels/PrefixScanKernels.cl"
#include "btLauncherCL.h"
#include "../../basic_initialize/btOpenCLUtils.h"
#include "../kernels/PrefixScanKernelsCL.h"
btPrefixScanCL::btPrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size)
	: m_commandQueue(queue)
{
	// Scratch buffer used by execute(), pre-sized to 'size' elements.
	m_workBuffer = new btOpenCLArray<unsigned int>(ctx,queue,size);

	// Build the program once from the embedded source, then create the three kernels from it.
	const char* kernelSource = prefixScanKernelsCL;
	char* extraMacros = 0;
	cl_int errNum;
	cl_program program = btOpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &errNum,extraMacros, BT_PREFIXSCAN_PROG_PATH);
	btAssert(program);
	// NOTE(review): 'program' is not released in this function - confirm whether
	// btOpenCLUtils keeps ownership of it.

	m_localScanKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "LocalScanKernel", &errNum, program,extraMacros );
	btAssert(m_localScanKernel );

	m_blockSumKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "TopLevelScanKernel", &errNum, program,extraMacros );
	btAssert(m_blockSumKernel );

	m_propagationKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "AddOffsetKernel", &errNum, program,extraMacros );
	btAssert(m_propagationKernel );
}
btPrefixScanCL::~btPrefixScanCL()
{
	// Free the scratch buffer first, then the three kernels created in the constructor.
	delete m_workBuffer;

	cl_kernel ownedKernels[3] = { m_localScanKernel, m_blockSumKernel, m_propagationKernel };
	for (int k=0; k<3; k++)
	{
		clReleaseKernel(ownedKernels[k]);
	}
}
/// Rounds n up to a power of two (values that already are a power of two are returned
/// unchanged) by OR-ing n-1 with itself shifted right by every bit position and adding 1.
template<class T>
T btNextPowerOf2(T n)
{
	n -= 1;
	int shift = 0;
	while (shift < sizeof(T)*8)
	{
		n |= (n>>shift);
		++shift;
	}
	return n+1;
}
void btPrefixScanCL::execute(btOpenCLArray<unsigned int>& src, btOpenCLArray<unsigned int>& dst, int n, unsigned int* sum)
{
	// Three kernel launches: per-block local scan, scan of the block sums held in
	// m_workBuffer, then (only when there is more than one block) adding the block
	// offsets back into dst. When 'sum' is non-null, dst[n-1] is read back into it.
	//	btAssert( data->m_option == EXCLUSIVE );

	// each block covers BLOCK_SIZE*2 elements
	const unsigned int numBlocks = (const unsigned int)( (n+BLOCK_SIZE*2-1)/(BLOCK_SIZE*2) );

	dst.resize(src.size());
	m_workBuffer->resize(src.size());

	// kernel constants: x = element count, y = block count, z = block count rounded up to a power of two
	btInt4 constBuffer;
	constBuffer.x = n;
	constBuffer.y = numBlocks;
	constBuffer.z = (int)btNextPowerOf2( numBlocks );

	{
		// local scan: (dst, src, work)
		btBufferInfoCL bInfo[] = { btBufferInfoCL( dst.getBufferCL() ), btBufferInfoCL( src.getBufferCL() ), btBufferInfoCL( m_workBuffer->getBufferCL() ) };
		btLauncherCL launcher( m_commandQueue, m_localScanKernel );
		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
		launcher.setConst( constBuffer );
		launcher.launch1D( numBlocks*BLOCK_SIZE, BLOCK_SIZE );
	}

	{
		// top-level scan over the work buffer, a single work-group
		btBufferInfoCL bInfo[] = { btBufferInfoCL( m_workBuffer->getBufferCL() ) };
		btLauncherCL launcher( m_commandQueue, m_blockSumKernel );
		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
		launcher.setConst( constBuffer );
		launcher.launch1D( BLOCK_SIZE, BLOCK_SIZE );
	}

	if( numBlocks > 1 )
	{
		// add offsets: (dst, work)
		btBufferInfoCL bInfo[] = { btBufferInfoCL( dst.getBufferCL() ), btBufferInfoCL( m_workBuffer->getBufferCL() ) };
		btLauncherCL launcher( m_commandQueue, m_propagationKernel );
		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
		launcher.setConst( constBuffer );
		launcher.launch1D( (numBlocks-1)*BLOCK_SIZE, BLOCK_SIZE );
	}

	if( sum )
	{
		clFinish(m_commandQueue);
		dst.copyToHostPointer(sum,1,n-1,true);
	}
}
void btPrefixScanCL::executeHost(btAlignedObjectArray<unsigned int>& src, btAlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum)
{
	// Host reference implementation of the exclusive prefix sum:
	// dst[i] = src[0] + ... + src[i-1] for i in [0,n).
	// When 'sum' is non-null it receives dst[n-1], or 0 when n <= 0
	// (previously dst[n-1] was read even for n == 0, i.e. dst[-1]).
	unsigned int s = 0;
	//if( data->m_option == EXCLUSIVE )
	{
		for(int i=0; i<n; i++)
		{
			dst[i] = s;
			s += src[i];
		}
	}
	/*else
	{
		for(int i=0; i<n; i++)
		{
			s += hSrc[i];
			hDst[i] = s;
		}
	}
	*/
	if( sum )
	{
		*sum = (n > 0) ? dst[n-1] : 0;
	}
}

View File

@@ -0,0 +1,37 @@
#ifndef BT_PREFIX_SCAN_CL_H
#define BT_PREFIX_SCAN_CL_H
#include "btOpenCLArray.h"
#include "btBufferInfoCL.h"
#include "btAlignedObjectArray.h"
/// Prefix-sum (scan) over unsigned int arrays, with an OpenCL path (execute) and a
/// host reference path (executeHost).
class btPrefixScanCL
{
enum
{
// work-group size used for the kernel launches; each block covers BLOCK_SIZE*2 elements
BLOCK_SIZE = 128
};
// Option m_option;
cl_command_queue m_commandQueue; // queue all kernels and read-backs are enqueued on
cl_kernel m_localScanKernel; // "LocalScanKernel"
cl_kernel m_blockSumKernel; // "TopLevelScanKernel"
cl_kernel m_propagationKernel; // "AddOffsetKernel"
btOpenCLArray<unsigned int>* m_workBuffer; // scratch buffer, owned (deleted in the destructor)
public:
/// size is the initial capacity (in elements) of the internal work buffer.
btPrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue,int size=0);
virtual ~btPrefixScanCL();
/// Scans the first n elements of src into dst on the device; when sum is non-null
/// it receives dst[n-1].
void execute(btOpenCLArray<unsigned int>& src, btOpenCLArray<unsigned int>& dst, int n, unsigned int* sum = 0);
/// Host-side equivalent of execute().
void executeHost(btAlignedObjectArray<unsigned int>& src, btAlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum);
};
#endif //BT_PREFIX_SCAN_CL_H

View File

@@ -0,0 +1,566 @@
/*
***************************************************************************************************
**
** profile.cpp
**
** Real-Time Hierarchical Profiling for Game Programming Gems 3
**
** by Greg Hjelstrom & Byon Garrabrant
**
***************************************************************************************************/
// Credits: The Clock class was inspired by the Timer classes in
// Ogre (www.ogre3d.org).
#include "btQuickprof.h"
#ifndef BT_NO_PROFILE
static btClock gProfileClock;
#ifdef __CELLOS_LV2__
#include <sys/sys_time.h>
#include <sys/time_util.h>
#include <stdio.h>
#endif
#if defined (SUNOS) || defined (__SUNOS__)
#include <stdio.h>
#endif
#if defined(WIN32) || defined(_WIN32)
#define BT_USE_WINDOWS_TIMERS
#define WIN32_LEAN_AND_MEAN
#define NOWINRES
#define NOMCX
#define NOIME
#ifdef _XBOX
#include <Xtl.h>
#else //_XBOX
#include <windows.h>
#endif //_XBOX
#include <time.h>
#else //_WIN32
#include <sys/time.h>
#endif //_WIN32
// Smaller of a and b. The previous definition used '>' and therefore returned the
// larger value; arguments are parenthesized so expression arguments expand safely.
#define mymin(a,b) ((a) < (b) ? (a) : (b))
// Platform-specific state behind btClock (held through btClock::m_data).
struct btClockData
{
#ifdef BT_USE_WINDOWS_TIMERS
LARGE_INTEGER mClockFrequency; // filled by QueryPerformanceFrequency in btClock's constructor
DWORD mStartTick; // GetTickCount() value captured by reset()
LONGLONG mPrevElapsedTime; // elapsed counter ticks stored by the previous getTime* call
LARGE_INTEGER mStartTime; // QueryPerformanceCounter value captured by reset(); shifted by the leap correction
#else
#ifdef __CELLOS_LV2__
uint64_t mStartTime; // timebase value captured by reset()
#else
struct timeval mStartTime; // gettimeofday() value captured by reset()
#endif //__CELLOS_LV2__
#endif //BT_USE_WINDOWS_TIMERS
};
///The btClock is a portable basic clock that measures elapsed time (reported in milliseconds or microseconds), use for profiling.
btClock::btClock()
	: m_data(new btClockData)
{
#ifdef BT_USE_WINDOWS_TIMERS
	// the counter frequency is queried once and reused by the getTime* methods
	QueryPerformanceFrequency(&m_data->mClockFrequency);
#endif
	// start measuring from construction time
	reset();
}
btClock::~btClock()
{
	// m_data is allocated with new in the constructors
	btClockData* const ownedData = m_data;
	delete ownedData;
}
btClock::btClock(const btClock& other)
	// deep copy: each clock owns its own btClockData
	: m_data(new btClockData(*other.m_data))
{
}
btClock& btClock::operator=(const btClock& other)
{
	// copy the state by value; this clock keeps its own m_data allocation
	const btClockData& source = *other.m_data;
	*m_data = source;
	return *this;
}
/// Resets the initial reference time.
void btClock::reset()
{
#ifdef BT_USE_WINDOWS_TIMERS
	// capture the performance counter and the coarse tick count, and clear the
	// elapsed-time memory used by the leap correction in the getTime* methods
	QueryPerformanceCounter(&m_data->mStartTime);
	m_data->mStartTick = GetTickCount();
	m_data->mPrevElapsedTime = 0;
#else
#ifdef __CELLOS_LV2__
	uint64_t timebaseNow;
	//__asm __volatile__( "mftb %0" : "=r" (timebaseNow) : : "memory");
	SYS_TIMEBASE_GET( timebaseNow );
	m_data->mStartTime = timebaseNow;
#else
	gettimeofday(&m_data->mStartTime, 0);
#endif //__CELLOS_LV2__
#endif //BT_USE_WINDOWS_TIMERS
}
/// Returns the time in ms since the last call to reset or since
/// the btClock was created.
unsigned long int btClock::getTimeMilliseconds()
{
#ifdef BT_USE_WINDOWS_TIMERS
// Windows: performance-counter ticks since reset(), cross-checked against GetTickCount().
LARGE_INTEGER currentTime;
QueryPerformanceCounter(&currentTime);
LONGLONG elapsedTime = currentTime.QuadPart -
m_data->mStartTime.QuadPart;
// Compute the number of millisecond ticks elapsed.
unsigned long msecTicks = (unsigned long)(1000 * elapsedTime /
m_data->mClockFrequency.QuadPart);
// Check for unexpected leaps in the Win32 performance counter.
// (This is caused by unexpected data across the PCI to ISA
// bridge, aka south bridge. See Microsoft KB274323.)
unsigned long elapsedTicks = GetTickCount() - m_data->mStartTick;
signed long msecOff = (signed long)(msecTicks - elapsedTicks);
// Correct only when the two time sources disagree by more than 100 ms.
if (msecOff < -100 || msecOff > 100)
{
// Adjust the starting time forwards.
// The adjustment (in counter ticks) is the operand selected by the mymin macro:
// the disagreement converted to ticks, or the ticks elapsed since the previous call.
LONGLONG msecAdjustment = mymin(msecOff *
m_data->mClockFrequency.QuadPart / 1000, elapsedTime -
m_data->mPrevElapsedTime);
m_data->mStartTime.QuadPart += msecAdjustment;
elapsedTime -= msecAdjustment;
// Recompute the number of millisecond ticks elapsed.
msecTicks = (unsigned long)(1000 * elapsedTime /
m_data->mClockFrequency.QuadPart);
}
// Store the current elapsed time for adjustments next time.
m_data->mPrevElapsedTime = elapsedTime;
return msecTicks;
#else
#ifdef __CELLOS_LV2__
// Cell: timebase ticks since reset() divided by ticks-per-millisecond.
uint64_t freq=sys_time_get_timebase_frequency();
double dFreq=((double) freq) / 1000.0;
typedef uint64_t ClockSize;
ClockSize newTime;
SYS_TIMEBASE_GET( newTime );
//__asm __volatile__( "mftb %0" : "=r" (newTime) : : "memory");
return (unsigned long int)((double(newTime-m_data->mStartTime)) / dFreq);
#else
// Other platforms: gettimeofday difference; seconds scaled to ms plus the
// microsecond difference divided by 1000 (integer division).
struct timeval currentTime;
gettimeofday(&currentTime, 0);
return (currentTime.tv_sec - m_data->mStartTime.tv_sec) * 1000 +
(currentTime.tv_usec - m_data->mStartTime.tv_usec) / 1000;
#endif //__CELLOS_LV2__
#endif //BT_USE_WINDOWS_TIMERS
}
/// Returns the time in us since the last call to reset or since
/// the Clock was created.
unsigned long int btClock::getTimeMicroseconds()
{
#ifdef BT_USE_WINDOWS_TIMERS
// Windows: performance-counter ticks since reset(), cross-checked against GetTickCount().
LARGE_INTEGER currentTime;
QueryPerformanceCounter(&currentTime);
LONGLONG elapsedTime = currentTime.QuadPart -
m_data->mStartTime.QuadPart;
// Compute the number of millisecond ticks elapsed.
// (milliseconds are used only for the comparison with GetTickCount below)
unsigned long msecTicks = (unsigned long)(1000 * elapsedTime /
m_data->mClockFrequency.QuadPart);
// Check for unexpected leaps in the Win32 performance counter.
// (This is caused by unexpected data across the PCI to ISA
// bridge, aka south bridge. See Microsoft KB274323.)
unsigned long elapsedTicks = GetTickCount() - m_data->mStartTick;
signed long msecOff = (signed long)(msecTicks - elapsedTicks);
// Correct only when the two time sources disagree by more than 100 ms.
if (msecOff < -100 || msecOff > 100)
{
// Adjust the starting time forwards.
// The adjustment (in counter ticks) is the operand selected by the mymin macro:
// the disagreement converted to ticks, or the ticks elapsed since the previous call.
LONGLONG msecAdjustment = mymin(msecOff *
m_data->mClockFrequency.QuadPart / 1000, elapsedTime -
m_data->mPrevElapsedTime);
m_data->mStartTime.QuadPart += msecAdjustment;
elapsedTime -= msecAdjustment;
}
// Store the current elapsed time for adjustments next time.
m_data->mPrevElapsedTime = elapsedTime;
// Convert to microseconds.
unsigned long usecTicks = (unsigned long)(1000000 * elapsedTime /
m_data->mClockFrequency.QuadPart);
return usecTicks;
#else
#ifdef __CELLOS_LV2__
// Cell: timebase ticks since reset() divided by ticks-per-microsecond.
uint64_t freq=sys_time_get_timebase_frequency();
double dFreq=((double) freq)/ 1000000.0;
typedef uint64_t ClockSize;
ClockSize newTime;
//__asm __volatile__( "mftb %0" : "=r" (newTime) : : "memory");
SYS_TIMEBASE_GET( newTime );
return (unsigned long int)((double(newTime-m_data->mStartTime)) / dFreq);
#else
// Other platforms: gettimeofday difference expressed in microseconds.
struct timeval currentTime;
gettimeofday(&currentTime, 0);
return (currentTime.tv_sec - m_data->mStartTime.tv_sec) * 1000000 +
(currentTime.tv_usec - m_data->mStartTime.tv_usec);
#endif//__CELLOS_LV2__
#endif //BT_USE_WINDOWS_TIMERS
}
inline void Profile_Get_Ticks(unsigned long int * ticks)
{
	// profiler ticks are microseconds read from the file-level profile clock
	const unsigned long int now = gProfileClock.getTimeMicroseconds();
	*ticks = now;
}
inline float Profile_Get_Tick_Rate(void)
{
	// Profile_Get_Ticks reports microseconds; dividing tick differences by 1000
	// yields milliseconds (1000000.f would yield seconds).
	const float ticksPerReportedUnit = 1000.f;
	return ticksPerReportedUnit;
}
/***************************************************************************************************
**
** CProfileNode
**
***************************************************************************************************/
/***********************************************************************************************
* INPUT: *
* name - pointer to a static string which is the name of this profile node *
* parent - parent pointer *
* *
* WARNINGS: *
* The name is assumed to be a static pointer, only the pointer is stored and compared for *
* efficiency reasons. *
*=============================================================================================*/
CProfileNode::CProfileNode( const char * name, CProfileNode * parent )
	: Name( name )
	, TotalCalls( 0 )
	, TotalTime( 0 )
	, StartTime( 0 )
	, RecursionCounter( 0 )
	, Parent( parent )
	, Child( NULL )
	, Sibling( NULL )
	, m_userPtr(0)
{
	// a new node starts without children or siblings, so this only clears its own counters
	Reset();
}
void CProfileNode::CleanupMemory()
{
	// Deleting a node deletes its own Child and Sibling in turn (see the destructor),
	// so this frees everything reachable from this node; the pointers are cleared afterwards.
	delete Child;
	Child = NULL;

	delete Sibling;
	Sibling = NULL;
}
CProfileNode::~CProfileNode( void )
{
	// recursively destroys the first child and the next sibling (delete on NULL is a no-op)
	delete Child;
	delete Sibling;
}
/***********************************************************************************************
* INPUT: *
* name - static string pointer to the name of the node we are searching for *
* *
* WARNINGS: *
* All profile names are assumed to be static strings so this function uses pointer compares *
* to find the named node. *
*=============================================================================================*/
CProfileNode * CProfileNode::Get_Sub_Node( const char * name )
{
	// Look for an existing child with this name (pointer comparison, not strcmp)
	for ( CProfileNode * candidate = Child; candidate != NULL; candidate = candidate->Sibling ) {
		if ( candidate->Name == name ) {
			return candidate;
		}
	}

	// Not found: create it and link it in as the new first child
	CProfileNode * created = new CProfileNode( name, this );
	created->Sibling = Child;
	Child = created;
	return created;
}
void CProfileNode::Reset( void )
{
TotalCalls = 0;
TotalTime = 0.0f;
if ( Child ) {
Child->Reset();
}
if ( Sibling ) {
Sibling->Reset();
}
}
void CProfileNode::Call( void )
{
	++TotalCalls;

	// only the outermost (non-recursive) entry records the start time
	const bool outermostEntry = ( RecursionCounter == 0 );
	++RecursionCounter;
	if ( outermostEntry ) {
		Profile_Get_Ticks(&StartTime);
	}
}
bool CProfileNode::Return( void )
{
	// Returns true when the outermost call of this node has finished.
	--RecursionCounter;
	if ( RecursionCounter != 0 ) {
		return false;
	}

	if ( TotalCalls != 0 ) {
		// accumulate the time spent since the matching outermost Call()
		unsigned long int now;
		Profile_Get_Ticks(&now);
		now -= StartTime;
		TotalTime += (float)now / Profile_Get_Tick_Rate();
	}
	return true;
}
/***************************************************************************************************
**
** CProfileIterator
**
***************************************************************************************************/
CProfileIterator::CProfileIterator( CProfileNode * start )
{
	// iterate over the children of 'start', beginning at its first child
	CurrentParent = start;
	CurrentChild = start->Get_Child();
}
void CProfileIterator::First(void)
{
CurrentChild = CurrentParent->Get_Child();
}
void CProfileIterator::Next(void)
{
CurrentChild = CurrentChild->Get_Sibling();
}
bool CProfileIterator::Is_Done(void)
{
	// done once there is no current child left to visit
	const bool noChildLeft = ( CurrentChild == NULL );
	return noChildLeft;
}
void CProfileIterator::Enter_Child( int index )
{
CurrentChild = CurrentParent->Get_Child();
while ( (CurrentChild != NULL) && (index != 0) ) {
index--;
CurrentChild = CurrentChild->Get_Sibling();
}
if ( CurrentChild != NULL ) {
CurrentParent = CurrentChild;
CurrentChild = CurrentParent->Get_Child();
}
}
void CProfileIterator::Enter_Parent( void )
{
if ( CurrentParent->Get_Parent() != NULL ) {
CurrentParent = CurrentParent->Get_Parent();
}
CurrentChild = CurrentParent->Get_Child();
}
/***************************************************************************************************
**
** CProfileManager
**
***************************************************************************************************/
// Root of the profile tree; it has no parent.
CProfileNode CProfileManager::Root( "Root", NULL );
// Node currently being profiled; starts at the root.
CProfileNode * CProfileManager::CurrentNode = &CProfileManager::Root;
// Incremented by Increment_Frame_Counter(), cleared by Reset().
int CProfileManager::FrameCounter = 0;
// Tick value captured by the last Reset().
unsigned long int CProfileManager::ResetTime = 0;
/***********************************************************************************************
* CProfileManager::Start_Profile -- Begin a named profile *
* *
* Steps one level deeper into the tree, if a child already exists with the specified name *
* then it accumulates the profiling; otherwise a new child node is added to the profile tree. *
* *
* INPUT: *
* name - name of this profiling record *
* *
* WARNINGS: *
* The string used is assumed to be a static string; pointer compares are used throughout *
* the profiling code for efficiency. *
*=============================================================================================*/
void CProfileManager::Start_Profile( const char * name )
{
	// Pointer comparison: re-entering the node that is already current (recursion)
	// keeps the same node; anything else steps into (or creates) the named child.
	const bool alreadyCurrent = ( name == CurrentNode->Get_Name() );
	if ( !alreadyCurrent ) {
		CurrentNode = CurrentNode->Get_Sub_Node( name );
	}
	CurrentNode->Call();
}
/***********************************************************************************************
* CProfileManager::Stop_Profile -- Stop timing and record the results. *
*=============================================================================================*/
void CProfileManager::Stop_Profile( void )
{
	// Return() reports whether the outermost call of the node has ended; only then
	// do we move back up to the parent (the node may be profiling a recursive function)
	const bool outermostFinished = CurrentNode->Return();
	if ( outermostFinished ) {
		CurrentNode = CurrentNode->Get_Parent();
	}
}
/***********************************************************************************************
* CProfileManager::Reset -- Reset the contents of the profiling system *
* *
* This resets everything except for the tree structure. All of the timing data is reset. *
*=============================================================================================*/
void CProfileManager::Reset( void )
{
	FrameCounter = 0;

	// restart the clock, clear all node statistics, and open the root node again
	gProfileClock.reset();
	Root.Reset();
	Root.Call();

	// remember when this reset happened (used by Get_Time_Since_Reset)
	Profile_Get_Ticks(&ResetTime);
}
/***********************************************************************************************
* CProfileManager::Increment_Frame_Counter -- Increment the frame counter *
*=============================================================================================*/
void CProfileManager::Increment_Frame_Counter( void )
{
	// one more frame since the last Reset()
	FrameCounter += 1;
}
/***********************************************************************************************
* CProfileManager::Get_Time_Since_Reset -- returns the elapsed time since last reset *
*=============================================================================================*/
float CProfileManager::Get_Time_Since_Reset( void )
{
	// ticks elapsed since Reset(), converted with the profiler tick rate
	unsigned long int now;
	Profile_Get_Ticks(&now);
	const unsigned long int elapsedTicks = now - ResetTime;
	return (float)elapsedTicks / Profile_Get_Tick_Rate();
}
#include <stdio.h>
void CProfileManager::dumpRecursive(CProfileIterator* profileIterator, int spacing)
{
	// Prints one level of the profile tree (each line prefixed by 'spacing' dots),
	// then recurses into every child with three more dots of indentation.
	profileIterator->First();
	if (profileIterator->Is_Done())
		return;

	float parent_time;
	if (profileIterator->Is_Root())
		parent_time = CProfileManager::Get_Time_Since_Reset();
	else
		parent_time = profileIterator->Get_Current_Parent_Total_Time();
	float accumulated_time = 0;
	const int frames_since_reset = CProfileManager::Get_Frame_Count_Since_Reset();
	int dot;

	// header
	for (dot=0;dot<spacing;dot++) printf(".");
	printf("----------------------------------\n");
	for (dot=0;dot<spacing;dot++) printf(".");
	printf("Profiling: %s (total running time: %.3f ms) ---\n",	profileIterator->Get_Current_Parent_Name(), parent_time );

	// one line per child
	int numChildren = 0;
	while (!profileIterator->Is_Done())
	{
		const int childIndex = numChildren;
		numChildren++;
		float current_total_time = profileIterator->Get_Current_Total_Time();
		accumulated_time += current_total_time;
		float fraction = parent_time > SIMD_EPSILON ? (current_total_time / parent_time) * 100 : 0.f;
		for (dot=0;dot<spacing;dot++) printf(".");
		printf("%d -- %s (%.2f %%) :: %.3f ms / frame (%d calls)\n",childIndex, profileIterator->Get_Current_Name(), fraction,(current_total_time / (double)frames_since_reset),profileIterator->Get_Current_Total_Calls());
		profileIterator->Next();
	}

	// children should never account for more time than their parent
	if (parent_time < accumulated_time)
	{
		printf("what's wrong\n");
	}
	for (dot=0;dot<spacing;dot++) printf(".");
	printf("%s (%.3f %%) :: %.3f ms\n", "Unaccounted:",parent_time > SIMD_EPSILON ? ((parent_time - accumulated_time) / parent_time) * 100 : 0.f, parent_time - accumulated_time);

	//recurse into children
	for (int child=0;child<numChildren;child++)
	{
		profileIterator->Enter_Child(child);
		dumpRecursive(profileIterator,spacing+3);
		profileIterator->Enter_Parent();
	}
}
void CProfileManager::dumpAll()
{
CProfileIterator* profileIterator = 0;
profileIterator = CProfileManager::Get_Iterator();
dumpRecursive(profileIterator,0);
CProfileManager::Release_Iterator(profileIterator);
}
#endif //BT_NO_PROFILE

View File

@@ -0,0 +1,203 @@
/***************************************************************************************************
**
** Real-Time Hierarchical Profiling for Game Programming Gems 3
**
** by Greg Hjelstrom & Byon Garrabrant
**
***************************************************************************************************/
// Credits: The Clock class was inspired by the Timer classes in
// Ogre (www.ogre3d.org).
#ifndef BT_QUICK_PROF_H
#define BT_QUICK_PROF_H
//To disable built-in profiling, please uncomment the next line
//#define BT_NO_PROFILE 1
#ifndef BT_NO_PROFILE
#include <stdio.h>//@todo remove this, backwards compatibility
#include "btScalar.h"
#include "btAlignedAllocator.h"
#include <new>
#define USE_BT_CLOCK 1
#ifdef USE_BT_CLOCK
///The btClock is a portable basic clock for profiling. It reports the time elapsed since
///construction or since the last reset(), in milliseconds or in microseconds.
class btClock
{
public:
btClock();
btClock(const btClock& other);
btClock& operator=(const btClock& other);
~btClock();
/// Resets the initial reference time.
void reset();
/// Returns the time in ms since the last call to reset or since
/// the btClock was created.
unsigned long int getTimeMilliseconds();
/// Returns the time in us since the last call to reset or since
/// the Clock was created.
unsigned long int getTimeMicroseconds();
private:
// Opaque implementation data; btClockData is only forward-declared in this header.
struct btClockData* m_data;
};
#endif //USE_BT_CLOCK
///A node in the Profile Hierarchy Tree
///Each node stores a name, call/time totals and links to its parent, first child and next sibling.
class CProfileNode {
public:
CProfileNode( const char * name, CProfileNode * parent );
~CProfileNode( void );
// Looks up a sub node by name (implemented elsewhere; presumably creates it when missing -- confirm in the .cpp).
CProfileNode * Get_Sub_Node( const char * name );
// Tree navigation accessors; each simply returns the stored link.
CProfileNode * Get_Parent( void ) { return Parent; }
CProfileNode * Get_Sibling( void ) { return Sibling; }
CProfileNode * Get_Child( void ) { return Child; }
void CleanupMemory();
void Reset( void );
// Call/Return are implemented elsewhere; they presumably bracket one timed invocation of this node -- confirm in the .cpp.
void Call( void );
bool Return( void );
// Read-only accessors for the recorded statistics.
const char * Get_Name( void ) { return Name; }
int Get_Total_Calls( void ) { return TotalCalls; }
float Get_Total_Time( void ) { return TotalTime; }
// Arbitrary pointer the user can attach to this node.
void* GetUserPointer() const {return m_userPtr;}
void SetUserPointer(void* ptr) { m_userPtr = ptr;}
protected:
// The name pointer is stored as passed in; this header does not show a copy being made.
const char * Name;
int TotalCalls;
float TotalTime;
unsigned long int StartTime;
int RecursionCounter;
CProfileNode * Parent;
CProfileNode * Child;
CProfileNode * Sibling;
void* m_userPtr;
};
///An iterator to navigate through the tree
///It tracks a current parent node and one current child of that parent.
///Instances are created through CProfileManager (the constructor is protected and CProfileManager is a friend).
class CProfileIterator
{
public:
// Access all the children of the current parent
void First(void);
void Next(void);
bool Is_Done(void);
// The root is the node that has no parent.
bool Is_Root(void) { return (CurrentParent->Get_Parent() == 0); }
void Enter_Child( int index ); // Make the given child the new parent
void Enter_Largest_Child( void ); // Make the largest child the new parent
void Enter_Parent( void ); // Make the current parent's parent the new parent
// Access the current child
// These dereference CurrentChild without a null check.
const char * Get_Current_Name( void ) { return CurrentChild->Get_Name(); }
int Get_Current_Total_Calls( void ) { return CurrentChild->Get_Total_Calls(); }
float Get_Current_Total_Time( void ) { return CurrentChild->Get_Total_Time(); }
void* Get_Current_UserPointer( void ) { return CurrentChild->GetUserPointer(); }
void Set_Current_UserPointer(void* ptr) {CurrentChild->SetUserPointer(ptr);}
// Access the current parent
const char * Get_Current_Parent_Name( void ) { return CurrentParent->Get_Name(); }
int Get_Current_Parent_Total_Calls( void ) { return CurrentParent->Get_Total_Calls(); }
float Get_Current_Parent_Total_Time( void ) { return CurrentParent->Get_Total_Time(); }
protected:
CProfileNode * CurrentParent;
CProfileNode * CurrentChild;
// Only CProfileManager (a friend) can construct iterators.
CProfileIterator( CProfileNode * start );
friend class CProfileManager;
};
///The Manager for the Profile system
///All members are static: there is a single profile tree rooted at Root.
class CProfileManager {
public:
// Start/stop timing of a named sample (implemented elsewhere).
static void Start_Profile( const char * name );
static void Stop_Profile( void );
// Forwards to the root node's CleanupMemory().
static void CleanupMemory(void)
{
Root.CleanupMemory();
}
static void Reset( void );
static void Increment_Frame_Counter( void );
static int Get_Frame_Count_Since_Reset( void ) { return FrameCounter; }
static float Get_Time_Since_Reset( void );
// Allocates a new iterator positioned at the root; the caller must hand it back to Release_Iterator.
static CProfileIterator * Get_Iterator( void )
{
return new CProfileIterator( &Root );
}
// Deletes an iterator obtained from Get_Iterator.
static void Release_Iterator( CProfileIterator * iterator ) { delete ( iterator); }
// Print the profile tree to stdout (one level recursively / the whole tree).
static void dumpRecursive(CProfileIterator* profileIterator, int spacing);
static void dumpAll();
private:
static CProfileNode Root;
static CProfileNode * CurrentNode;
static int FrameCounter;
static unsigned long int ResetTime;
};
///ProfileSampleClass is a simple way to profile a function's scope
///Use the BT_PROFILE macro at the start of scope to time
///The constructor starts the named profile and the destructor stops it, so the sample
///covers the lifetime of the object.
class CProfileSample {
public:
// Starts profiling under the given name.
CProfileSample( const char * name )
{
CProfileManager::Start_Profile( name );
}
// Stops the most recently started profile when the object goes out of scope.
~CProfileSample( void )
{
CProfileManager::Stop_Profile();
}
};
#define BT_PROFILE( name ) CProfileSample __profile( name )
#else
#define BT_PROFILE( name )
#endif //#ifndef BT_NO_PROFILE
#endif //BT_QUICK_PROF_H

View File

@@ -0,0 +1,712 @@
#include "btRadixSort32CL.h"
#include "btLauncherCL.h"
#include "../../basic_initialize/btOpenCLUtils.h"
#include "btPrefixScanCL.h"
#include "btFillCL.h"
#define RADIXSORT32_PATH "opencl/parallel_primitives/kernels/RadixSort32Kernels.cl"
#include "../kernels/RadixSort32KernelsCL.h"
/// Creates the radix sorter for the given OpenCL context/device/queue.
/// Allocates the work buffers (optionally pre-sized), creates the prefix-scan and fill
/// helpers, and compiles the sort kernels. On CPU devices the serial variants of the
/// sort-and-scatter kernels are used.
/// @param ctx OpenCL context used for all buffers and kernels
/// @param device device the kernels are compiled for
/// @param queue command queue used for every launch
/// @param initialCapacity when > 0, work buffers are pre-sized to this many elements
btRadixSort32CL::btRadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity)
:m_commandQueue(queue)
{
	btOpenCLDeviceInfo info;
	btOpenCLUtils::getDeviceInfo(device,&info);
	m_deviceCPU = (info.m_deviceType & CL_DEVICE_TYPE_CPU)!=0;

	m_workBuffer1 = new btOpenCLArray<unsigned int>(ctx,queue);
	m_workBuffer2 = new btOpenCLArray<unsigned int>(ctx,queue);
	m_workBuffer3 = new btOpenCLArray<btSortData>(ctx,queue);
	m_workBuffer3a = new btOpenCLArray<unsigned int>(ctx,queue);
	m_workBuffer4 = new btOpenCLArray<btSortData>(ctx,queue);
	m_workBuffer4a = new btOpenCLArray<unsigned int>(ctx,queue);

	if (initialCapacity>0)
	{
		m_workBuffer1->resize(initialCapacity);
		m_workBuffer3->resize(initialCapacity);
		m_workBuffer3a->resize(initialCapacity);
		m_workBuffer4->resize(initialCapacity);
		m_workBuffer4a->resize(initialCapacity);
	}

	m_scan = new btPrefixScanCL(ctx,device,queue);
	m_fill = new btFillCL(ctx,device,queue);

	const char* additionalMacros = "";
	cl_int pErrNum = 0;
	const char* kernelSource = radixSort32KernelsCL;

	cl_program sortProg = btOpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, RADIXSORT32_PATH);
	btAssert(sortProg);

	m_streamCountSortDataKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "StreamCountSortDataKernel", &pErrNum, sortProg,additionalMacros );
	btAssert(m_streamCountSortDataKernel );

	m_streamCountKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "StreamCountKernel", &pErrNum, sortProg,additionalMacros );
	btAssert(m_streamCountKernel);

	if (m_deviceCPU)
	{
		// CPU devices use the serial sort-and-scatter kernels.
		m_sortAndScatterSortDataKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterSortDataKernelSerial", &pErrNum, sortProg,additionalMacros );
		btAssert(m_sortAndScatterSortDataKernel);
		m_sortAndScatterKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterKernelSerial", &pErrNum, sortProg,additionalMacros );
		btAssert(m_sortAndScatterKernel);
	} else
	{
		m_sortAndScatterSortDataKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterSortDataKernel", &pErrNum, sortProg,additionalMacros );
		btAssert(m_sortAndScatterSortDataKernel);
		m_sortAndScatterKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterKernel", &pErrNum, sortProg,additionalMacros );
		btAssert(m_sortAndScatterKernel);
	}

	m_prefixScanKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "PrefixScanKernel", &pErrNum, sortProg,additionalMacros );
	btAssert(m_prefixScanKernel);

	// Drop our reference to the program: per the OpenCL specification the program object
	// stays alive until every kernel created from it has been released, so the kernels
	// above remain valid and the program is no longer leaked.
	if (sortProg)
	{
		clReleaseProgram(sortProg);
	}
}
/// Frees the helper objects and work buffers, then releases the five sort kernels.
btRadixSort32CL::~btRadixSort32CL()
{
	// helper primitives
	delete m_scan;
	delete m_fill;

	// device work buffers
	delete m_workBuffer1;
	delete m_workBuffer2;
	delete m_workBuffer3;
	delete m_workBuffer3a;
	delete m_workBuffer4;
	delete m_workBuffer4a;

	// kernels, released in the same order as before
	cl_kernel ownedKernels[] =
	{
		m_streamCountSortDataKernel,
		m_streamCountKernel,
		m_sortAndScatterSortDataKernel,
		m_sortAndScatterKernel,
		m_prefixScanKernel
	};
	const int kernelCount = (int)(sizeof(ownedKernels)/sizeof(ownedKernels[0]));
	for (int k = 0; k < kernelCount; k++)
	{
		clReleaseKernel(ownedKernels[k]);
	}
}
/// CPU reference radix sort (LSD, 8 bits per pass) of key/value pairs, ordered by m_key.
/// The sort is stable and the result is always written back into 'inout'.
/// @param inout array to sort in place; an empty array is left untouched
/// @param sortBits number of low key bits to sort on; one pass is run per started group of 8 bits
void btRadixSort32CL::executeHost(btAlignedObjectArray<btSortData>& inout, int sortBits /* = 32 */)
{
	int n = inout.size();
	if (n <= 0)
	{
		// Nothing to sort; also avoids taking &inout[0] on an empty array.
		return;
	}

	const int BITS_PER_PASS = 8;
	const int NUM_TABLES = (1<<BITS_PER_PASS);

	int tables[NUM_TABLES];
	int counter[NUM_TABLES];

	btSortData* src = &inout[0];
	btAlignedObjectArray<btSortData> workbuffer;
	workbuffer.resize(inout.size());
	btSortData* dst = &workbuffer[0];

	int count=0;
	for(int startBit=0; startBit<sortBits; startBit+=BITS_PER_PASS)
	{
		// histogram of the current 8-bit digit
		for(int i=0; i<NUM_TABLES; i++)
		{
			tables[i] = 0;
		}

		for(int i=0; i<n; i++)
		{
			// Shift as unsigned so that negative keys do not rely on the
			// implementation-defined right shift of a signed value.
			int tableIdx = (int)(((unsigned int)src[i].m_key >> startBit) & (NUM_TABLES-1));
			tables[tableIdx]++;
		}
//#define TEST
#ifdef TEST
		printf("histogram size=%d\n",NUM_TABLES);
		for (int i=0;i<NUM_TABLES;i++)
		{
			if (tables[i]!=0)
			{
				printf("tables[%d]=%d]\n",i,tables[i]);
			}
		}
#endif //TEST
		// prefix scan
		int sum = 0;
		for(int i=0; i<NUM_TABLES; i++)
		{
			int iData = tables[i];
			tables[i] = sum;
			sum += iData;
			counter[i] = 0;
		}

		// distribute
		for(int i=0; i<n; i++)
		{
			int tableIdx = (int)(((unsigned int)src[i].m_key >> startBit) & (NUM_TABLES-1));

			dst[tables[tableIdx] + counter[tableIdx]] = src[i];
			counter[tableIdx] ++;
		}

		btSwap( src, dst );
		count++;
	}

	if (count&1)
	{
		// After an odd number of passes the sorted data lives in the scratch buffer
		// (src points into workbuffer after the last swap); copy it back to the caller.
		for(int i=0; i<n; i++)
		{
			inout[i] = src[i];
		}
	}
}
/// Sorts a device array on the host: downloads it, runs the CPU radix sort, and uploads the result.
void btRadixSort32CL::executeHost(btOpenCLArray<btSortData>& keyValuesInOut, int sortBits /* = 32 */)
{
	btAlignedObjectArray<btSortData> hostCopy;

	// device -> host
	keyValuesInOut.copyToHost(hostCopy);

	// sort with the host implementation
	this->executeHost(hostCopy, sortBits);

	// host -> device
	keyValuesInOut.copyFromHost(hostCopy);
}
/// Separate keys/values in/out variant of the sort.
/// NOTE(review): the body is empty, so calling this overload does nothing and leaves
/// keysOut/valuesOut untouched -- it appears to be an unimplemented placeholder.
void btRadixSort32CL::execute(btOpenCLArray<unsigned int>& keysIn, btOpenCLArray<unsigned int>& keysOut, btOpenCLArray<unsigned int>& valuesIn,
btOpenCLArray<unsigned int>& valuesOut, int n, int sortBits)
{
}
//#define DEBUG_RADIXSORT
//#define DEBUG_RADIXSORT2
/// Sorts key/value pairs on the OpenCL device, 4 bits per pass, for the low 'sortBits' bits.
/// Each pass runs three steps: per-work-group digit histogram (stream count), prefix scan of
/// the histogram, and sort-and-scatter into the destination buffer. Source and destination
/// buffers are swapped after every pass.
/// sortBits must be a multiple of 4 (asserted below).
void btRadixSort32CL::execute(btOpenCLArray<btSortData>& keyValuesInOut, int sortBits /* = 32 */)
{
int originalSize = keyValuesInOut.size();
int workingSize = originalSize;
int dataAlignment = DATA_ALIGNMENT;
#ifdef DEBUG_RADIXSORT2
btAlignedObjectArray<btSortData> test2;
keyValuesInOut.copyToHost(test2);
printf("numElem = %d\n",test2.size());
for (int i=0;i<test2.size();i++)
{
printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
}
#endif //DEBUG_RADIXSORT2
btOpenCLArray<btSortData>* src = 0;
// When the element count is not a multiple of DATA_ALIGNMENT, sort a padded copy in
// m_workBuffer4 whose tail is filled with 0xffffffff key/value pairs; otherwise sort the
// caller's buffer directly. A non-zero m_workBuffer4 size later signals "padded copy in use".
if (workingSize%dataAlignment)
{
workingSize += dataAlignment-(workingSize%dataAlignment);
m_workBuffer4->copyFromOpenCLArray(keyValuesInOut);
m_workBuffer4->resize(workingSize);
btSortData fillValue;
fillValue.m_key = 0xffffffff;
fillValue.m_value = 0xffffffff;
#define USE_BTFILL
#ifdef USE_BTFILL
// The btSortData buffer and fill value are reinterpreted as btInt2 for the fill helper.
m_fill->execute((btOpenCLArray<btInt2>&)*m_workBuffer4,(btInt2&)fillValue,workingSize-originalSize,originalSize);
#else
//fill the remaining bits (very slow way, todo: fill on GPU/OpenCL side)
for (int i=originalSize; i<workingSize;i++)
{
m_workBuffer4->copyFromHostPointer(&fillValue,1,i);
}
#endif//USE_BTFILL
src = m_workBuffer4;
} else
{
src = &keyValuesInOut;
m_workBuffer4->resize(0);
}
btAssert( workingSize%DATA_ALIGNMENT == 0 );
// Histogram storage: one counter per (bucket, work-group) pair.
int minCap = NUM_BUCKET*NUM_WGS;
int n = workingSize;
m_workBuffer1->resize(minCap);
m_workBuffer3->resize(workingSize);
// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
btAssert( BITS_PER_PASS == 4 );
btAssert( WG_SIZE == 64 );
btAssert( (sortBits&0x3) == 0 );
btOpenCLArray<btSortData>* dst = m_workBuffer3;
btOpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
btOpenCLArray<unsigned int>* destHisto = m_workBuffer2;
int nWGs = NUM_WGS;
btConstData cdata;
{
int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256
int nBlocks = (n+blockSize-1)/(blockSize);
cdata.m_n = n;
cdata.m_nWGs = NUM_WGS;
cdata.m_startBit = 0;
cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs;
// Fewer blocks than work-groups: one block per work-group, and only nBlocks groups
// are launched for the scatter step.
if( nBlocks < NUM_WGS )
{
cdata.m_nBlocksPerWG = 1;
nWGs = nBlocks;
}
}
int count=0;
// One pass per 4-bit digit, starting at the least significant bits.
for(int ib=0; ib<sortBits; ib+=4)
{
#ifdef DEBUG_RADIXSORT2
keyValuesInOut.copyToHost(test2);
printf("numElem = %d\n",test2.size());
for (int i=0;i<test2.size();i++)
{
if (test2[i].m_key != test2[i].m_value)
{
printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
}
}
#endif //DEBUG_RADIXSORT2
cdata.m_startBit = ib;
// Step 1: per-work-group histogram of the current digit.
if (src->size())
{
btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( srcHisto->getBufferCL() ) };
btLauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel);
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
launcher.setConst( cdata );
int num = NUM_WGS*WG_SIZE;
launcher.launch1D( num, WG_SIZE );
}
#ifdef DEBUG_RADIXSORT
btAlignedObjectArray<unsigned int> testHist;
srcHisto->copyToHost(testHist);
printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
for (int i=0;i<testHist.size();i++)
{
if (testHist[i]!=0)
printf("testHist[%d]=%d\n",i,testHist[i]);
}
#endif //DEBUG_RADIXSORT
// Step 2: prefix scan of the histogram.
//fast prefix scan is not working properly on Mac OSX yet
#ifdef _WIN32
bool fastScan=!m_deviceCPU;//only use fast scan on GPU
#else
bool fastScan=false;
#endif
if (fastScan)
{// prefix scan group histogram
btBufferInfoCL bInfo[] = { btBufferInfoCL( srcHisto->getBufferCL() ) };
btLauncherCL launcher( m_commandQueue, m_prefixScanKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
launcher.setConst( cdata );
launcher.launch1D( 128, 128 );
// The fast scan works in place, so the scanned histogram is srcHisto itself.
destHisto = srcHisto;
}else
{
//unsigned int sum; //for debugging
// 1920 equals NUM_BUCKET*NUM_WGS (16*120) with the enum values declared for this class,
// i.e. the same value as minCap above.
m_scan->execute(*srcHisto,*destHisto,1920,0);//,&sum);
}
#ifdef DEBUG_RADIXSORT
destHisto->copyToHost(testHist);
printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
for (int i=0;i<testHist.size();i++)
{
if (testHist[i]!=0)
printf("testHist[%d]=%d\n",i,testHist[i]);
}
for (int i=0;i<testHist.size();i+=NUM_WGS)
{
printf("testHist[%d]=%d\n",i/NUM_WGS,testHist[i]);
}
#endif //DEBUG_RADIXSORT
#define USE_GPU
#ifdef USE_GPU
// Step 3: scatter the elements into dst using the scanned histogram.
if (src->size())
{// local sort and distribute
btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( destHisto->getBufferCL(), true ), btBufferInfoCL( dst->getBufferCL() )};
btLauncherCL launcher( m_commandQueue, m_sortAndScatterSortDataKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
launcher.setConst( cdata );
launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
}
#else
// Host-side reference implementation of the scatter step (disabled while USE_GPU is defined).
// NOTE(review): this path uses testHist, which is only declared when DEBUG_RADIXSORT is
// defined, so it does not compile on its own.
{
#define NUM_TABLES 16
//#define SEQUENTIAL
#ifdef SEQUENTIAL
int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
int tables[NUM_TABLES];
int startBit = ib;
destHisto->copyToHost(testHist);
btAlignedObjectArray<btSortData> srcHost;
btAlignedObjectArray<btSortData> dstHost;
dstHost.resize(src->size());
src->copyToHost(srcHost);
for (int i=0;i<NUM_TABLES;i++)
{
tables[i] = testHist[i*NUM_WGS];
}
// distribute
for(int i=0; i<n; i++)
{
int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
counter2[tableIdx] ++;
}
#else
int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
int tables[NUM_TABLES];
btAlignedObjectArray<btSortData> dstHostOK;
dstHostOK.resize(src->size());
destHisto->copyToHost(testHist);
btAlignedObjectArray<btSortData> srcHost;
src->copyToHost(srcHost);
int blockSize = 256;
int nBlocksPerWG = cdata.m_nBlocksPerWG;
int startBit = ib;
// First build the expected result (dstHostOK) with a plain sequential scatter ...
{
for (int i=0;i<NUM_TABLES;i++)
{
tables[i] = testHist[i*NUM_WGS];
}
// distribute
for(int i=0; i<n; i++)
{
int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
counter2[tableIdx] ++;
}
}
// ... then emulate the per-work-group scatter and print every mismatch against it.
btAlignedObjectArray<btSortData> dstHost;
dstHost.resize(src->size());
int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
for (int wgIdx=0;wgIdx<NUM_WGS;wgIdx++)
{
int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;
for(int iblock=0; iblock<btMin(cdata.m_nBlocksPerWG, nBlocks); iblock++)
{
for (int lIdx = 0;lIdx < 64;lIdx++)
{
int addr = iblock*blockSize + blockSize*cdata.m_nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;
// MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD
// Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops
// AMD: AtomInc performs better while NV prefers ++
for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)
{
if( addr+j < n )
{
// printf ("addr+j=%d\n", addr+j);
int i = addr+j;
int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
int destIndex = testHist[tableIdx*NUM_WGS+wgIdx] + counter[tableIdx];
btSortData ok = dstHostOK[destIndex];
if (ok.m_key != srcHost[i].m_key)
{
printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key,srcHost[i].m_key );
printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value,srcHost[i].m_value );
}
if (ok.m_value != srcHost[i].m_value)
{
printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value,srcHost[i].m_value );
printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key,srcHost[i].m_key );
}
dstHost[destIndex] = srcHost[i];
counter[tableIdx] ++;
}
}
}
}
}
#endif //SEQUENTIAL
dst->copyFromHost(dstHost);
}
#endif//USE_GPU
#ifdef DEBUG_RADIXSORT
destHisto->copyToHost(testHist);
printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
for (int i=0;i<testHist.size();i++)
{
if (testHist[i]!=0)
printf("testHist[%d]=%d\n",i,testHist[i]);
}
#endif //DEBUG_RADIXSORT
// The output of this pass becomes the input of the next one.
btSwap(src, dst );
btSwap(srcHisto,destHisto);
#ifdef DEBUG_RADIXSORT2
keyValuesInOut.copyToHost(test2);
printf("numElem = %d\n",test2.size());
for (int i=0;i<test2.size();i++)
{
if (test2[i].m_key != test2[i].m_value)
{
printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
}
}
#endif //DEBUG_RADIXSORT2
count++;
}
// NOTE(review): after an odd number of passes (sortBits = 4, 12, 20 or 28) the sorted data
// is left in m_workBuffer3 and is never copied back; only debug builds notice via the assert.
if (count&1)
{
btAssert(0);//need to copy from workbuffer to keyValuesInOut
}
// Padded copy was used: drop the padding and copy the result back to the caller's buffer.
if (m_workBuffer4->size())
{
m_workBuffer4->resize(originalSize);
keyValuesInOut.copyFromOpenCLArray(*m_workBuffer4);
}
// NOTE(review): test2 is declared under DEBUG_RADIXSORT2 only, so this block needs both
// debug macros defined in order to compile.
#ifdef DEBUG_RADIXSORT
keyValuesInOut.copyToHost(test2);
printf("numElem = %d\n",test2.size());
for (int i=0;i<test2.size();i++)
{
printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
}
#endif
}
/// Sorts 32-bit keys on the OpenCL device, 4 bits per pass, for the low 'sortBits' bits.
/// Each pass computes a per-work-group digit histogram, prefix-scans it and scatters the
/// keys into a scratch buffer; source and scratch are swapped after every pass.
/// The sorted keys are always written back into 'keysInOut', also for an odd pass count.
/// @param keysInOut device array sorted in place; an empty array is left untouched
/// @param sortBits number of low key bits to sort on; must be a multiple of 4
void btRadixSort32CL::execute(btOpenCLArray<unsigned int>& keysInOut, int sortBits /* = 32 */)
{
	int originalSize = keysInOut.size();
	if (originalSize <= 0)
	{
		// Nothing to sort.
		return;
	}

	int workingSize = originalSize;
	int dataAlignment = DATA_ALIGNMENT;

	btOpenCLArray<unsigned int>* src = 0;

	// When the element count is not a multiple of DATA_ALIGNMENT, sort a padded copy in
	// m_workBuffer4a whose tail is filled with 0xffffffff; otherwise sort the caller's
	// buffer directly. A non-zero m_workBuffer4a size marks "padded copy in use".
	if (workingSize%dataAlignment)
	{
		workingSize += dataAlignment-(workingSize%dataAlignment);
		m_workBuffer4a->copyFromOpenCLArray(keysInOut);
		m_workBuffer4a->resize(workingSize);
		unsigned int fillValue = 0xffffffff;

		m_fill->execute(*m_workBuffer4a,fillValue,workingSize-originalSize,originalSize);

		src = m_workBuffer4a;
	} else
	{
		src = &keysInOut;
		m_workBuffer4a->resize(0);
	}

	btAssert( workingSize%DATA_ALIGNMENT == 0 );

	// Histogram storage: one counter per (bucket, work-group) pair.
	int minCap = NUM_BUCKET*NUM_WGS;

	int n = workingSize;

	m_workBuffer1->resize(minCap);
	// Only the key scratch buffer is needed here; the key/value buffer is not touched.
	m_workBuffer3a->resize(workingSize);

	// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
	btAssert( BITS_PER_PASS == 4 );
	btAssert( WG_SIZE == 64 );
	btAssert( (sortBits&0x3) == 0 );

	btOpenCLArray<unsigned int>* dst = m_workBuffer3a;

	btOpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
	btOpenCLArray<unsigned int>* destHisto = m_workBuffer2;

	int nWGs = NUM_WGS;
	btConstData cdata;

	{
		int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256
		int nBlocks = (n+blockSize-1)/(blockSize);
		cdata.m_n = n;
		cdata.m_nWGs = NUM_WGS;
		cdata.m_startBit = 0;
		cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs;
		// Fewer blocks than work-groups: one block per group, launch only nBlocks groups.
		if( nBlocks < NUM_WGS )
		{
			cdata.m_nBlocksPerWG = 1;
			nWGs = nBlocks;
		}
	}

	//fast prefix scan is not working properly on Mac OSX yet
#ifdef _WIN32
	const bool fastScan=!m_deviceCPU;
#else
	const bool fastScan=false;
#endif

	int count=0;
	// One pass per 4-bit digit, starting at the least significant bits.
	for(int ib=0; ib<sortBits; ib+=4)
	{
		cdata.m_startBit = ib;

		// Step 1: per-work-group histogram of the current digit.
		if (src->size())
		{
			btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( srcHisto->getBufferCL() ) };
			btLauncherCL launcher(m_commandQueue, m_streamCountKernel);

			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
			launcher.setConst( cdata );

			int num = NUM_WGS*WG_SIZE;
			launcher.launch1D( num, WG_SIZE );
		}

		// Step 2: prefix scan of the histogram.
		if (fastScan)
		{// prefix scan group histogram
			btBufferInfoCL bInfo[] = { btBufferInfoCL( srcHisto->getBufferCL() ) };
			btLauncherCL launcher( m_commandQueue, m_prefixScanKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
			launcher.setConst( cdata );
			launcher.launch1D( 128, 128 );
			// The fast scan works in place, so the scanned histogram is srcHisto itself.
			destHisto = srcHisto;
		}else
		{
			//unsigned int sum; //for debugging
			// Scan every histogram entry (NUM_BUCKET*NUM_WGS, previously hard-coded as 1920).
			m_scan->execute(*srcHisto,*destHisto,minCap,0);//,&sum);
		}

		// Step 3: scatter the keys into dst using the scanned histogram.
		if (src->size())
		{// local sort and distribute
			btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( destHisto->getBufferCL(), true ), btBufferInfoCL( dst->getBufferCL() )};
			btLauncherCL launcher( m_commandQueue, m_sortAndScatterKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
			launcher.setConst( cdata );
			launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
		}

		// The output of this pass becomes the input of the next one.
		btSwap(src, dst );
		btSwap(srcHisto,destHisto);

		count++;
	}

	if (count&1)
	{
		// After an odd number of passes the sorted keys live in the scratch buffer
		// (src points at m_workBuffer3a after the last swap). Move them back into the
		// buffer the sort started from so the write-back below sees the final result.
		if (m_workBuffer4a->size())
		{
			m_workBuffer4a->copyFromOpenCLArray(*src);
		} else
		{
			keysInOut.copyFromOpenCLArray(*src);
		}
	}

	// Padded copy was used: drop the padding and copy the result back to the caller's buffer.
	if (m_workBuffer4a->size())
	{
		m_workBuffer4a->resize(originalSize);
		keysInOut.copyFromOpenCLArray(*m_workBuffer4a);
	}
}

View File

@@ -0,0 +1,85 @@
#ifndef BT_RADIXSORT32_H
#define BT_RADIXSORT32_H
#include "btOpenCLArray.h"
/// Key/value pair handled by btRadixSort32CL; elements are ordered by m_key and m_value travels with it.
/// Both fields are declared as signed int.
struct btSortData
{
int m_key;
int m_value;
};
#include "btBufferInfoCL.h"
/// Radix sort for 32-bit keys (optionally with a 32-bit value per key) using OpenCL kernels,
/// with a host fallback (executeHost).
class btRadixSort32CL
{
// Work buffers: 1 and 2 hold the histograms, 3/3a are the scratch destinations and
// 4/4a hold padded copies of the input (btSortData and key-only variants respectively).
btOpenCLArray<unsigned int>* m_workBuffer1;
btOpenCLArray<unsigned int>* m_workBuffer2;
btOpenCLArray<btSortData>* m_workBuffer3;
btOpenCLArray<btSortData>* m_workBuffer4;
btOpenCLArray<unsigned int>* m_workBuffer3a;
btOpenCLArray<unsigned int>* m_workBuffer4a;
cl_command_queue m_commandQueue;
// Kernels: the "SortData" variants operate on key/value pairs, the others on keys only.
cl_kernel m_streamCountSortDataKernel;
cl_kernel m_streamCountKernel;
cl_kernel m_prefixScanKernel;
cl_kernel m_sortAndScatterSortDataKernel;
cl_kernel m_sortAndScatterKernel;
// True when the OpenCL device is a CPU device.
bool m_deviceCPU;
class btPrefixScanCL* m_scan;
class btFillCL* m_fill;
public:
// Constant data handed to the kernels for every launch.
struct btConstData
{
int m_n;
int m_nWGs;
int m_startBit;
int m_nBlocksPerWG;
};
enum
{
DATA_ALIGNMENT = 256,
WG_SIZE = 64,
BLOCK_SIZE = 256,
ELEMENTS_PER_WORK_ITEM = (BLOCK_SIZE/WG_SIZE),
BITS_PER_PASS = 4,
NUM_BUCKET=(1<<BITS_PER_PASS),
// if you change this, change nPerWI in kernel as well
NUM_WGS = 20*6, // cypress
// NUM_WGS = 24*6, // cayman
// NUM_WGS = 32*4, // nv
};
private:
public:
btRadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity =0);
virtual ~btRadixSort32CL();
// Separate keys/values in/out overload.
void execute(btOpenCLArray<unsigned int>& keysIn, btOpenCLArray<unsigned int>& keysOut, btOpenCLArray<unsigned int>& valuesIn,
btOpenCLArray<unsigned int>& valuesOut, int n, int sortBits = 32);
///keys only
void execute(btOpenCLArray<unsigned int>& keysInOut, int sortBits = 32 );
// Key/value pairs, sorted in place on the device.
void execute(btOpenCLArray<btSortData>& keyValuesInOut, int sortBits = 32 );
// Host (CPU) implementations, for device arrays and for host arrays.
void executeHost(btOpenCLArray<btSortData>& keyValuesInOut, int sortBits = 32);
void executeHost(btAlignedObjectArray<btSortData>& keyValuesInOut, int sortBits = 32);
};
#endif //BT_RADIXSORT32_H

View File

@@ -0,0 +1,660 @@
/*
Copyright (c) 2003-2009 Erwin Coumans http://bullet.googlecode.com
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef BT_SCALAR_H
#define BT_SCALAR_H
#ifdef BT_MANAGED_CODE
//Aligned data types not supported in managed code
#pragma unmanaged
#endif
#include <math.h>
#include <stdlib.h>//size_t for MSVC 6.0
#include <float.h>
/* SVN $Revision$ on $Date$ from http://bullet.googlecode.com*/
#define BT_BULLET_VERSION 281
/// Returns the library version number, i.e. the value of BT_BULLET_VERSION.
inline int btGetVersion()
{
	const int version = BT_BULLET_VERSION;
	return version;
}
// Platform configuration: selects inlining/alignment attributes, SIMD support and the
// btAssert / btFullAssert / btLikely / btUnlikely macros for each supported toolchain.
#if defined(DEBUG) || defined (_DEBUG)
#define BT_DEBUG
#endif
#ifdef _WIN32
#if defined(__MINGW32__) || defined(__CYGWIN__) || (defined (_MSC_VER) && _MSC_VER < 1300)
#define SIMD_FORCE_INLINE inline
#define ATTRIBUTE_ALIGNED16(a) a
#define ATTRIBUTE_ALIGNED64(a) a
#define ATTRIBUTE_ALIGNED128(a) a
#else
//#define BT_HAS_ALIGNED_ALLOCATOR
#pragma warning(disable : 4324) // disable padding warning
// #pragma warning(disable:4530) // Disable the exception disable but used in MSCV Stl warning.
// #pragma warning(disable:4996) //Turn off warnings about deprecated C routines
// #pragma warning(disable:4786) // Disable the "debug name too long" warning
#define SIMD_FORCE_INLINE __forceinline
#define ATTRIBUTE_ALIGNED16(a) __declspec(align(16)) a
#define ATTRIBUTE_ALIGNED64(a) __declspec(align(64)) a
#define ATTRIBUTE_ALIGNED128(a) __declspec (align(128)) a
#ifdef _XBOX
#define BT_USE_VMX128
#include <ppcintrinsics.h>
#define BT_HAVE_NATIVE_FSEL
#define btFsel(a,b,c) __fsel((a),(b),(c))
#else
#if (defined (_WIN32) && (_MSC_VER) && _MSC_VER >= 1400) && (!defined (BT_USE_DOUBLE_PRECISION))
#define BT_USE_SSE
#ifdef BT_USE_SSE
//BT_USE_SSE_IN_API is disabled under Windows by default, because
//it makes it harder to integrate Bullet into your application under Windows
//(structured embedding Bullet structs/classes need to be 16-byte aligned)
//with relatively little performance gain
//If you are not embedded Bullet data in your classes, or make sure that you align those classes on 16-byte boundaries
//you can manually enable this line or set it in the build system for a bit of performance gain (a few percent, dependent on usage)
//#define BT_USE_SSE_IN_API
#endif //BT_USE_SSE
#include <emmintrin.h>
#endif
#endif//_XBOX
#endif //__MINGW32__
#ifdef BT_DEBUG
#ifdef _MSC_VER
#include <stdio.h>
// The asserted expression is passed as a %s argument instead of being pasted into the
// format string, so a '%' inside the expression cannot be misread as a conversion.
// The space before __FILE__ is required since C++11 (otherwise it parses as a literal suffix).
#define btAssert(x) { if(!(x)){printf("Assert " __FILE__ ":%d (%s)\n", __LINE__, #x);__debugbreak(); }}
#else//_MSC_VER
#include <assert.h>
#define btAssert assert
#endif//_MSC_VER
#else
#define btAssert(x)
#endif
//btFullAssert is optional, slows down a lot
#define btFullAssert(x)
#define btLikely(_c) _c
#define btUnlikely(_c) _c
#else
#if defined (__CELLOS_LV2__)
#define SIMD_FORCE_INLINE inline __attribute__((always_inline))
#define ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16)))
#define ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64)))
#define ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128)))
#ifndef assert
#include <assert.h>
#endif
#ifdef BT_DEBUG
#ifdef __SPU__
#include <spu_printf.h>
#define printf spu_printf
// Same format-string hardening as the MSVC variant above.
#define btAssert(x) {if(!(x)){printf("Assert " __FILE__ ":%d (%s)\n", __LINE__, #x);spu_hcmpeq(0,0);}}
#else
#define btAssert assert
#endif
#else
#define btAssert(x)
#endif
//btFullAssert is optional, slows down a lot
#define btFullAssert(x)
#define btLikely(_c) _c
#define btUnlikely(_c) _c
#else
#ifdef USE_LIBSPE2
#define SIMD_FORCE_INLINE __inline
#define ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16)))
#define ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64)))
#define ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128)))
#ifndef assert
#include <assert.h>
#endif
#ifdef BT_DEBUG
#define btAssert assert
#else
#define btAssert(x)
#endif
//btFullAssert is optional, slows down a lot
#define btFullAssert(x)
#define btLikely(_c) __builtin_expect((_c), 1)
#define btUnlikely(_c) __builtin_expect((_c), 0)
#else
//non-windows systems
#if (defined (__APPLE__) && (!defined (BT_USE_DOUBLE_PRECISION)))
#if defined (__i386__) || defined (__x86_64__)
#define BT_USE_SSE
//BT_USE_SSE_IN_API is enabled on Mac OSX by default, because memory is automatically aligned on 16-byte boundaries
//if apps run into issues, we will disable the next line
#define BT_USE_SSE_IN_API
#ifdef BT_USE_SSE
// include appropriate SSE level
#if defined (__SSE4_1__)
#include <smmintrin.h>
#elif defined (__SSSE3__)
#include <tmmintrin.h>
#elif defined (__SSE3__)
#include <pmmintrin.h>
#else
#include <emmintrin.h>
#endif
#endif //BT_USE_SSE
#elif defined( __armv7__ )
#ifdef __clang__
#define BT_USE_NEON 1
#if defined BT_USE_NEON && defined (__clang__)
#include <arm_neon.h>
#endif//BT_USE_NEON
#endif //__clang__
#endif//__i386__ || __x86_64__ / __armv7__
#define SIMD_FORCE_INLINE inline __attribute__ ((always_inline))
///@todo: check out alignment methods for other platforms/compilers
#define ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16)))
#define ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64)))
#define ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128)))
#ifndef assert
#include <assert.h>
#endif
#if defined(DEBUG) || defined (_DEBUG)
#if defined (__i386__) || defined (__x86_64__)
#include <stdio.h>
#define btAssert(x)\
{\
if(!(x))\
{\
printf("Assert %s in line %d, file %s\n",#x, __LINE__, __FILE__);\
asm volatile ("int3");\
}\
}
#else//defined (__i386__) || defined (__x86_64__)
#define btAssert assert
#endif//defined (__i386__) || defined (__x86_64__)
#else//defined(DEBUG) || defined (_DEBUG)
#define btAssert(x)
#endif//defined(DEBUG) || defined (_DEBUG)
//btFullAssert is optional, slows down a lot
#define btFullAssert(x)
#define btLikely(_c) _c
#define btUnlikely(_c) _c
#else
#define SIMD_FORCE_INLINE inline
///@todo: check out alignment methods for other platforms/compilers
///#define ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16)))
///#define ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64)))
///#define ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128)))
#define ATTRIBUTE_ALIGNED16(a) a
#define ATTRIBUTE_ALIGNED64(a) a
#define ATTRIBUTE_ALIGNED128(a) a
#ifndef assert
#include <assert.h>
#endif
#if defined(DEBUG) || defined (_DEBUG)
#define btAssert assert
#else
#define btAssert(x)
#endif
//btFullAssert is optional, slows down a lot
#define btFullAssert(x)
#define btLikely(_c) _c
#define btUnlikely(_c) _c
#endif //__APPLE__
#endif // LIBSPE2
#endif //__CELLOS_LV2__
#endif //_WIN32
///The btScalar type abstracts floating point numbers, to easily switch between double and single floating point precision.
///Define BT_USE_DOUBLE_PRECISION to make btScalar a double; otherwise it is a float.
#if defined(BT_USE_DOUBLE_PRECISION)
typedef double btScalar;
//this number could be bigger in double precision
#define BT_LARGE_FLOAT 1e30
#else
typedef float btScalar;
//keep BT_LARGE_FLOAT*BT_LARGE_FLOAT < FLT_MAX
#define BT_LARGE_FLOAT 1e18f
#endif
//With SSE enabled, btSimdFloat4 is the native 128-bit vector type.
#ifdef BT_USE_SSE
typedef __m128 btSimdFloat4;
#endif//BT_USE_SSE
//SSE helpers that are only available when SSE is also used in the public API:
//NaN/infinity constants, arithmetic operators on __m128 (Windows only) and
//bit-preserving casts between the 128-bit vector types.
#if defined BT_USE_SSE_IN_API && defined (BT_USE_SSE)
#ifdef _WIN32
#ifndef BT_NAN
//0x7F800001 has all exponent bits set and a non-zero mantissa; it is read back as a float NaN.
static int btNanMask = 0x7F800001;
#define BT_NAN (*(float*)&btNanMask)
#endif
#ifndef BT_INFINITY
//0x7F800000 is the bit pattern of float +infinity.
static int btInfinityMask = 0x7F800000;
#define BT_INFINITY (*(float*)&btInfinityMask)
#endif
//Component-wise +, - and * for __m128, mapped to the packed single-precision intrinsics.
inline __m128 operator + (const __m128 A, const __m128 B)
{
return _mm_add_ps(A, B);
}
inline __m128 operator - (const __m128 A, const __m128 B)
{
return _mm_sub_ps(A, B);
}
inline __m128 operator * (const __m128 A, const __m128 B)
{
return _mm_mul_ps(A, B);
}
//Casts implemented with the _mm_cast* intrinsics.
#define btCastfTo128i(a) (_mm_castps_si128(a))
#define btCastfTo128d(a) (_mm_castps_pd(a))
#define btCastiTo128f(a) (_mm_castsi128_ps(a))
#define btCastdTo128f(a) (_mm_castpd_ps(a))
#define btCastdTo128i(a) (_mm_castpd_si128(a))
#define btAssign128(r0,r1,r2,r3) _mm_setr_ps(r0,r1,r2,r3)
#else//_WIN32
//Non-Windows compilers: plain C-style casts between the vector types and a compound literal for assignment.
#define btCastfTo128i(a) ((__m128i)(a))
#define btCastfTo128d(a) ((__m128d)(a))
#define btCastiTo128f(a) ((__m128) (a))
#define btCastdTo128f(a) ((__m128) (a))
#define btCastdTo128i(a) ((__m128i)(a))
#define btAssign128(r0,r1,r2,r3) (__m128){r0,r1,r2,r3}
#define BT_INFINITY INFINITY
#define BT_NAN NAN
#endif//_WIN32
#endif //BT_USE_SSE_IN_API
#ifdef BT_USE_NEON
#include <arm_neon.h>
typedef float32x4_t btSimdFloat4;
#define BT_INFINITY INFINITY
#define BT_NAN NAN
#define btAssign128(r0,r1,r2,r3) (float32x4_t){r0,r1,r2,r3}
#endif
///BT_DECLARE_ALIGNED_ALLOCATOR() is meant to be placed inside a class body. It gives the class
///operator new / new[] that allocate through btAlignedAlloc with 16-byte alignment, operator
///delete / delete[] that release through btAlignedFree, and placement forms that return the
///supplied pointer (placement delete does nothing).
///The final line of the macro deliberately has NO trailing line-continuation backslash: with one,
///whatever line follows the macro (here a preprocessor #if) is spliced into its replacement list.
#define BT_DECLARE_ALIGNED_ALLOCATOR() \
   SIMD_FORCE_INLINE void* operator new(size_t sizeInBytes)   { return btAlignedAlloc(sizeInBytes,16); }   \
   SIMD_FORCE_INLINE void  operator delete(void* ptr)         { btAlignedFree(ptr); }   \
   SIMD_FORCE_INLINE void* operator new(size_t, void* ptr)   { return ptr; }   \
   SIMD_FORCE_INLINE void  operator delete(void*, void*)      { }   \
   SIMD_FORCE_INLINE void* operator new[](size_t sizeInBytes)   { return btAlignedAlloc(sizeInBytes,16); }   \
   SIMD_FORCE_INLINE void  operator delete[](void* ptr)         { btAlignedFree(ptr); }   \
   SIMD_FORCE_INLINE void* operator new[](size_t, void* ptr)   { return ptr; }   \
   SIMD_FORCE_INLINE void  operator delete[](void*, void*)      { }
#if defined(BT_USE_DOUBLE_PRECISION) || defined(BT_FORCE_DOUBLE_FUNCTIONS)
///Double precision math wrappers: each forwards to the <math.h> function of the same meaning.
SIMD_FORCE_INLINE btScalar btSqrt(btScalar x)
{
	return sqrt(x);
}
SIMD_FORCE_INLINE btScalar btFabs(btScalar x)
{
	return fabs(x);
}
SIMD_FORCE_INLINE btScalar btCos(btScalar x)
{
	return cos(x);
}
SIMD_FORCE_INLINE btScalar btSin(btScalar x)
{
	return sin(x);
}
SIMD_FORCE_INLINE btScalar btTan(btScalar x)
{
	return tan(x);
}
///btAcos clamps its argument into [-1,1] before calling acos, so slightly out-of-range inputs do not leave the domain.
SIMD_FORCE_INLINE btScalar btAcos(btScalar x)
{
	if (x < btScalar(-1))
		x = btScalar(-1);
	else if (x > btScalar(1))
		x = btScalar(1);
	return acos(x);
}
///btAsin clamps its argument into [-1,1] before calling asin.
SIMD_FORCE_INLINE btScalar btAsin(btScalar x)
{
	if (x < btScalar(-1))
		x = btScalar(-1);
	else if (x > btScalar(1))
		x = btScalar(1);
	return asin(x);
}
SIMD_FORCE_INLINE btScalar btAtan(btScalar x)
{
	return atan(x);
}
///btAtan2 passes its arguments to atan2 in the order given: the first parameter (named x here) is atan2's first argument.
SIMD_FORCE_INLINE btScalar btAtan2(btScalar x, btScalar y)
{
	return atan2(x, y);
}
SIMD_FORCE_INLINE btScalar btExp(btScalar x)
{
	return exp(x);
}
SIMD_FORCE_INLINE btScalar btLog(btScalar x)
{
	return log(x);
}
SIMD_FORCE_INLINE btScalar btPow(btScalar x,btScalar y)
{
	return pow(x,y);
}
SIMD_FORCE_INLINE btScalar btFmod(btScalar x,btScalar y)
{
	return fmod(x,y);
}
#else
///btSqrt returns the square root of y.
///By default it forwards to sqrtf. When USE_APPROXIMATION is defined it instead builds an estimate
///of 1/sqrt(y) by integer arithmetic on one 32-bit half of a double, refines it with five
///iterations of x = 1.5*x - x*x*(x*z) (z = y/2), and returns x*y.
SIMD_FORCE_INLINE btScalar btSqrt(btScalar y)
{
#ifdef USE_APPROXIMATION
	double x, z, tempf;
	// Address the second 32-bit word of the double.
	// A 32-bit type is required: 'unsigned long' is 64 bits wide on LP64 platforms, where
	// '+ 1' would point past the end of tempf. On platforms with a 32-bit long this is unchanged.
	// NOTE(review): presumably the second word is the high (sign/exponent) half, i.e. a
	// little-endian layout is assumed - confirm before enabling on other targets.
	unsigned int *tfptr = ((unsigned int *)&tempf) + 1;
	tempf = y;
	*tfptr = (0xbfcdd90a - *tfptr)>>1; /* estimate of 1/sqrt(y) */
	x = tempf;
	z = y*btScalar(0.5);
	x = (btScalar(1.5)*x)-(x*x)*(x*z); /* iteration formula */
	x = (btScalar(1.5)*x)-(x*x)*(x*z);
	x = (btScalar(1.5)*x)-(x*x)*(x*z);
	x = (btScalar(1.5)*x)-(x*x)*(x*z);
	x = (btScalar(1.5)*x)-(x*x)*(x*z);
	return x*y;
#else
	return sqrtf(y);
#endif
}
///Single precision math wrappers: each forwards to the float variant of the <math.h> function.
SIMD_FORCE_INLINE btScalar btFabs(btScalar x)
{
	return fabsf(x);
}
SIMD_FORCE_INLINE btScalar btCos(btScalar x)
{
	return cosf(x);
}
SIMD_FORCE_INLINE btScalar btSin(btScalar x)
{
	return sinf(x);
}
SIMD_FORCE_INLINE btScalar btTan(btScalar x)
{
	return tanf(x);
}
///btAcos clamps its argument into [-1,1] before calling acosf.
SIMD_FORCE_INLINE btScalar btAcos(btScalar x)
{
	const btScalar lowest = btScalar(-1);
	const btScalar highest = btScalar(1);
	const btScalar clamped = (x < lowest) ? lowest : ((x > highest) ? highest : x);
	return acosf(clamped);
}
///btAsin clamps its argument into [-1,1] before calling asinf.
SIMD_FORCE_INLINE btScalar btAsin(btScalar x)
{
	const btScalar lowest = btScalar(-1);
	const btScalar highest = btScalar(1);
	const btScalar clamped = (x < lowest) ? lowest : ((x > highest) ? highest : x);
	return asinf(clamped);
}
SIMD_FORCE_INLINE btScalar btAtan(btScalar x)
{
	return atanf(x);
}
///btAtan2 passes its arguments to atan2f in the order given: the first parameter (named x here) is atan2f's first argument.
SIMD_FORCE_INLINE btScalar btAtan2(btScalar x, btScalar y)
{
	return atan2f(x, y);
}
SIMD_FORCE_INLINE btScalar btExp(btScalar x)
{
	return expf(x);
}
SIMD_FORCE_INLINE btScalar btLog(btScalar x)
{
	return logf(x);
}
SIMD_FORCE_INLINE btScalar btPow(btScalar x,btScalar y)
{
	return powf(x,y);
}
SIMD_FORCE_INLINE btScalar btFmod(btScalar x,btScalar y)
{
	return fmodf(x,y);
}
#endif
///Angle constants, all derived from SIMD_2_PI (2*pi) and expressed as btScalar.
#define SIMD_2_PI btScalar(6.283185307179586232)
#define SIMD_PI (SIMD_2_PI * btScalar(0.5))
#define SIMD_HALF_PI (SIMD_2_PI * btScalar(0.25))
///Multiply degrees by SIMD_RADS_PER_DEG to get radians, and radians by SIMD_DEGS_PER_RAD to get degrees.
#define SIMD_RADS_PER_DEG (SIMD_2_PI / btScalar(360.0))
#define SIMD_DEGS_PER_RAD (btScalar(360.0) / SIMD_2_PI)
///SIMDSQRT12 is sqrt(1/2).
#define SIMDSQRT12 btScalar(0.7071067811865475244008443621048490)
#define btRecipSqrt(x) ((btScalar)(btScalar(1.0)/btSqrt(btScalar(x)))) /* reciprocal square root */
///SIMD_EPSILON is the machine epsilon of the active precision.
///SIMD_INFINITY is the largest FINITE value of the active precision (DBL_MAX / FLT_MAX), not an IEEE infinity.
#ifdef BT_USE_DOUBLE_PRECISION
#define SIMD_EPSILON DBL_EPSILON
#define SIMD_INFINITY DBL_MAX
#else
#define SIMD_EPSILON FLT_EPSILON
#define SIMD_INFINITY FLT_MAX
#endif
///btAtan2Fast is a cheap approximation of atan2(y, x) built from one division and a linear blend.
///For x >= 0 the angle is pi/4 - pi/4 * (x-|y|)/(x+|y|); for x < 0 it is 3pi/4 - pi/4 * (x+|y|)/(|y|-x).
///The result takes the sign of y. No special handling is done for x == 0 and y == 0 (the ratio is then 0/0).
SIMD_FORCE_INLINE btScalar btAtan2Fast(btScalar y, btScalar x)
{
	const btScalar quarterPi = SIMD_PI / 4.0f;
	const btScalar threeQuarterPi = 3.0f * quarterPi;
	const btScalar magnitudeY = btFabs(y);
	btScalar unsignedAngle;
	if (x >= 0.0f)
	{
		const btScalar ratio = (x - magnitudeY) / (x + magnitudeY);
		unsignedAngle = quarterPi - quarterPi * ratio;
	}
	else
	{
		const btScalar ratio = (x + magnitudeY) / (magnitudeY - x);
		unsignedAngle = threeQuarterPi - quarterPi * ratio;
	}
	if (y < 0.0f)
		return -unsignedAngle;
	return unsignedAngle;
}
///btFuzzyZero returns true when the magnitude of x is strictly below SIMD_EPSILON.
SIMD_FORCE_INLINE bool btFuzzyZero(btScalar x) { return btFabs(x) < SIMD_EPSILON; }
///btEqual returns true when a <= eps and a is not below -eps, i.e. a is within eps of zero.
///Because of the way the comparisons are written, a NaN input yields false.
SIMD_FORCE_INLINE bool btEqual(btScalar a, btScalar eps) {
return (((a) <= eps) && !((a) < -eps));
}
///btGreaterEqual returns true when a is NOT <= eps. Despite the name this is a strict
///"greater than eps" test for ordinary values; a NaN input yields true.
SIMD_FORCE_INLINE bool btGreaterEqual (btScalar a, btScalar eps) {
return (!((a) <= eps));
}
///btIsNegative returns 1 when x is strictly below zero and 0 otherwise.
SIMD_FORCE_INLINE int btIsNegative(btScalar x) {
return x < btScalar(0.0) ? 1 : 0;
}
///btRadians converts an angle in degrees to radians; btDegrees converts radians to degrees.
SIMD_FORCE_INLINE btScalar btRadians(btScalar x) { return x * SIMD_RADS_PER_DEG; }
SIMD_FORCE_INLINE btScalar btDegrees(btScalar x) { return x * SIMD_DEGS_PER_RAD; }
///BT_DECLARE_HANDLE(name) declares 'name' as a pointer to a dummy struct called name__,
///which gives every handle its own distinct pointer type.
#define BT_DECLARE_HANDLE(name) typedef struct name##__ { int unused; } *name
#ifndef btFsel
///btFsel is a floating point select: it yields b when a >= 0 and c otherwise.
///It is only defined here when no btFsel macro has been provided already.
SIMD_FORCE_INLINE btScalar btFsel(btScalar a, btScalar b, btScalar c)
{
	if (a >= 0)
		return b;
	return c;
}
#endif
///btFsels is btFsel with the result cast to btScalar.
#define btFsels(a,b,c) (btScalar)btFsel(a,b,c)
///btMachineIsLittleEndian stores the value 1 in a long and inspects the byte at its lowest address:
///if that byte is 1, the least significant byte comes first and the machine is little endian.
SIMD_FORCE_INLINE bool btMachineIsLittleEndian()
{
	long int probe = 1;
	const char* lowestByte = (const char*) &probe;
	return lowestByte[0] == 1;
}
///btSelect avoids branches, which makes performance much better for consoles like Playstation 3 and XBox 360
///Thanks Phil Knight. See also http://www.cellperformance.com/articles/2006/04/more_techniques_for_eliminatin_1.html
///Returns valueIfConditionNonZero when condition != 0 and valueIfConditionZero when condition == 0.
SIMD_FORCE_INLINE unsigned btSelect(unsigned condition, unsigned valueIfConditionNonZero, unsigned valueIfConditionZero)
{
// Set testNz to 0xFFFFFFFF if condition is nonzero, 0x00000000 if condition is zero
// Rely on positive value or'ed with its negative having sign bit on
// and zero value or'ed with its negative (which is still zero) having sign bit off
// Use arithmetic shift right, shifting the sign bit through all 32 bits
// NOTE(review): this assumes a 32-bit int and that >> on a negative int is an arithmetic shift - confirm for new compilers.
unsigned testNz = (unsigned)(((int)condition | -(int)condition) >> 31);
unsigned testEqz = ~testNz;
// Exactly one of the two masks is all ones, so the OR passes through exactly one of the two values.
return ((valueIfConditionNonZero & testNz) | (valueIfConditionZero & testEqz));
}
///Signed overload of btSelect: same mask construction, with the selected bits cast back to int.
SIMD_FORCE_INLINE int btSelect(unsigned condition, int valueIfConditionNonZero, int valueIfConditionZero)
{
unsigned testNz = (unsigned)(((int)condition | -(int)condition) >> 31);
unsigned testEqz = ~testNz;
return static_cast<int>((valueIfConditionNonZero & testNz) | (valueIfConditionZero & testEqz));
}
///Float overload of btSelect. With BT_HAVE_NATIVE_FSEL it goes through btFsel on (condition - 1),
///which is >= 0 exactly when condition >= 1; otherwise it is a plain conditional expression.
SIMD_FORCE_INLINE float btSelect(unsigned condition, float valueIfConditionNonZero, float valueIfConditionZero)
{
#ifdef BT_HAVE_NATIVE_FSEL
return (float)btFsel((btScalar)condition - btScalar(1.0f), valueIfConditionNonZero, valueIfConditionZero);
#else
return (condition != 0) ? valueIfConditionNonZero : valueIfConditionZero;
#endif
}
///btSwap exchanges the values of a and b through one temporary copy
///(T needs to be copy-constructible and assignable).
template<typename T> SIMD_FORCE_INLINE void btSwap(T& a, T& b)
{
	T previousA = a;
	a = b;
	b = previousA;
}
//PCK: endian swapping functions
///Reverses the byte order of a 32-bit unsigned value.
SIMD_FORCE_INLINE unsigned btSwapEndian(unsigned val)
{
	const unsigned topToBottom = (val & 0xff000000) >> 24;
	const unsigned upperToLower = (val & 0x00ff0000) >> 8;
	const unsigned lowerToUpper = (val & 0x0000ff00) << 8;
	const unsigned bottomToTop = (val & 0x000000ff) << 24;
	return (topToBottom | upperToLower | lowerToUpper | bottomToTop);
}
///Exchanges the two bytes of a 16-bit unsigned value.
SIMD_FORCE_INLINE unsigned short btSwapEndian(unsigned short val)
{
	const int highToLow = (val & 0xff00) >> 8;
	const int lowToHigh = (val & 0x00ff) << 8;
	return static_cast<unsigned short>(highToLow | lowToHigh);
}
///Signed overload: the value is converted to unsigned and swapped; the result stays unsigned.
SIMD_FORCE_INLINE unsigned btSwapEndian(int val)
{
	const unsigned asUnsigned = (unsigned)val;
	return btSwapEndian(asUnsigned);
}
///Signed 16-bit overload: the value is converted to unsigned short and swapped; the result stays unsigned.
SIMD_FORCE_INLINE unsigned short btSwapEndian(short val)
{
	const unsigned short asUnsigned = (unsigned short) val;
	return btSwapEndian(asUnsigned);
}
///btSwapEndianFloat reverses the byte order of a float, working through unsigned char pointers.
///The swapped bytes are returned as an unsigned int and NOT as a float, because the machine might 'correct' invalid floating point values:
///not all values of sign/exponent/mantissa are valid floating point numbers according to IEEE 754.
///When a floating point unit is faced with an invalid value, it may actually change the value, or worse, throw an exception.
///In most systems, running user mode code, you wouldn't get an exception, but instead the hardware/os/runtime will 'fix' the number for you.
///So instead of returning a float/double, the swap functions hand back integer storage.
SIMD_FORCE_INLINE unsigned int btSwapEndianFloat(float d)
{
	unsigned int swapped = 0;
	unsigned char *out = (unsigned char *)&swapped;
	unsigned char *in = (unsigned char *)&d;
	// byte i of the result is byte (3 - i) of the input
	for (int i = 0; i < 4; ++i)
	{
		out[i] = in[3 - i];
	}
	return swapped;
}
///btUnswapEndianFloat is the inverse of btSwapEndianFloat: it reverses the four bytes of the
///integer again (through unsigned char pointers) and returns them as a float.
SIMD_FORCE_INLINE float btUnswapEndianFloat(unsigned int a)
{
	float restored = 0.0f;
	unsigned char *in = (unsigned char *)&a;
	unsigned char *out = (unsigned char *)&restored;
	// byte i of the result is byte (3 - i) of the input
	for (int i = 0; i < 4; ++i)
	{
		out[i] = in[3 - i];
	}
	return restored;
}
///btSwapEndianDouble writes the eight bytes of d to dst in reversed order, using unsigned char pointers.
///dst must point at storage for at least eight bytes.
SIMD_FORCE_INLINE void btSwapEndianDouble(double d, unsigned char* dst)
{
	unsigned char *in = (unsigned char *)&d;
	// byte i of the output is byte (7 - i) of the input
	for (int i = 0; i < 8; ++i)
	{
		dst[i] = in[7 - i];
	}
}
///btUnswapEndianDouble is the inverse of btSwapEndianDouble: it reads eight bytes from src,
///reverses their order (through unsigned char pointers) and returns them as a double.
SIMD_FORCE_INLINE double btUnswapEndianDouble(const unsigned char *src)
{
	double restored = 0.0;
	unsigned char *out = (unsigned char *)&restored;
	// byte i of the result is byte (7 - i) of the input
	for (int i = 0; i < 8; ++i)
	{
		out[i] = src[7 - i];
	}
	return restored;
}
///btNormalizeAngle returns the normalized value of an angle in the range [-SIMD_PI, SIMD_PI].
///The angle is first reduced with btFmod by a full turn, then shifted by one turn if it still lies outside the range.
SIMD_FORCE_INLINE btScalar btNormalizeAngle(btScalar angleInRadians)
{
	angleInRadians = btFmod(angleInRadians, SIMD_2_PI);
	if (angleInRadians < -SIMD_PI)
		return angleInRadians + SIMD_2_PI;
	if (angleInRadians > SIMD_PI)
		return angleInRadians - SIMD_2_PI;
	return angleInRadians;
}
///Rudimentary class to provide type info: it stores one integer type tag, set at construction.
struct btTypedObject
{
	///the stored type tag
	int m_objectType;

	btTypedObject(int objectType) : m_objectType(objectType) {}

	///returns the type tag given to the constructor
	inline int getObjectType() const { return m_objectType; }
};
///Align a pointer to the provided alignment, upwards: the result is the first address at or after
///unalignedPtr that is a multiple of alignment. The mask arithmetic requires alignment to be a power of two.
template <typename T>T* btAlignPointer(T* unalignedPtr, size_t alignment)
{
	// view the same storage either as the pointer or as an integer address
	union
	{
		T* ptr;
		size_t integer;
	} address;

	address.ptr = unalignedPtr;
	// round up: add (alignment-1), then clear the low bits
	address.integer = (address.integer + (alignment - 1)) & ~(alignment - 1);
	return address.ptr;
}
#endif //BT_SCALAR_H

View File

@@ -0,0 +1,26 @@
-- Declares the host-side parallel primitives static library project for one OpenCL vendor.
-- Nothing is declared when findOpenCL(vendor) returns a falsy value.
-- @param vendor  vendor name; it is passed to findOpenCL/initOpenCL and appended to the project name
function createProject(vendor)
	-- keep the probe result local so it neither leaks into nor overwrites the global environment
	local hasCL = findOpenCL(vendor)
	if not hasCL then
		return
	end

	project ("OpenCL_lib_parallel_primitives_host_" .. vendor)

	initOpenCL(vendor)

	kind "StaticLib"
	targetdir "../../../lib"
	includedirs {
		".",
	}
	files {
		"**.cpp",
		"**.h"
	}
end
-- Try to declare one project per OpenCL vendor; createProject only declares a project
-- for a vendor when findOpenCL succeeds for it.
createProject("AMD")
createProject("Intel")
createProject("NVIDIA")
createProject("Apple")

View File

@@ -0,0 +1,106 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
typedef unsigned int u32;
// Shorthands for the work-item built-ins along dimension 0.
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
// Work-group barrier with a local-memory fence.
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
// One key/value pair. The kernels in this file only read m_key.
typedef struct
{
u32 m_key;
u32 m_value;
}SortData;
// Source/destination sizes padded to 16 bytes. The kernels in this file take nSrc/nDst as
// plain arguments and do not reference this struct.
typedef struct
{
u32 m_nSrc;
u32 m_nDst;
u32 m_padding[2];
} ConstBuffer;
// For every position gIdx in src whose key differs from the key of the preceding element,
// stores gIdx at dst[key]. Position 0 is compared against the key (u32)(-1).
// One work-item handles one element of src; nDst is not needed by this kernel.
// NOTE(review): dst is indexed directly by the key, so keys are presumably expected to be < nDst - confirm with the caller.
__attribute__((reqd_work_group_size(64,1,1)))
__kernel
void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, 
					unsigned int nSrc, unsigned int nDst)
{
	int gIdx = GET_GLOBAL_IDX;

	if( !(gIdx < nSrc) )
		return;

	// key of the element before gIdx; (u32)(-1) stands in for "before the first element"
	u32 previousKey = (u32)(-1);
	if( gIdx != 0 )
		previousKey = src[gIdx-1].m_key;

	// gIdx < nSrc holds here, so the current element always comes from src
	u32 currentKey = src[gIdx].m_key;

	if( previousKey != currentKey )
	{
		dst[currentKey] = gIdx;
	}
}
// For every position gIdx in 1..nSrc where the key of element gIdx-1 differs from the key of
// element gIdx, stores gIdx at dst[key of element gIdx-1]. Position nSrc (one past the last
// element) is compared against the key nDst.
// NOTE(review): dst is indexed directly by the key, so keys are presumably expected to be < nDst - confirm with the caller.
__attribute__((reqd_work_group_size(64,1,1)))
__kernel
void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, 
					unsigned int nSrc, unsigned int nDst)
{
	int gIdx = GET_GLOBAL_IDX+1;

	if( !(gIdx < nSrc+1) )
		return;

	u32 previousKey = src[gIdx-1].m_key;

	// nDst stands in for the key "after the last element"
	u32 currentKey = nDst;
	if( !(gIdx==nSrc) )
		currentKey = src[gIdx].m_key;

	if( previousKey != currentKey )
	{
		dst[previousKey] = gIdx;
	}
}
// Element-wise C[i] = A[i] - B[i] for the first nDst elements; nSrc is not used.
__attribute__((reqd_work_group_size(64,1,1)))
__kernel
void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, 
					unsigned int nSrc, unsigned int nDst)
{
	int gIdx = GET_GLOBAL_IDX;

	if( !(gIdx < nDst) )
		return;

	u32 lhs = A[gIdx];
	u32 rhs = B[gIdx];
	C[gIdx] = lhs - rhs;
}

View File

@@ -0,0 +1,110 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* boundSearchKernelsCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"\n"
"\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"\n"
"typedef struct\n"
"{\n"
" u32 m_key; \n"
" u32 m_value;\n"
"}SortData;\n"
"\n"
"\n"
"\n"
"typedef struct\n"
"{\n"
" u32 m_nSrc;\n"
" u32 m_nDst;\n"
" u32 m_padding[2];\n"
"} ConstBuffer;\n"
"\n"
"\n"
"\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"__kernel\n"
"void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, \n"
" unsigned int nSrc, unsigned int nDst)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < nSrc )\n"
" {\n"
" SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1);\n"
" SortData end; end.m_key = nDst; end.m_value = nDst;\n"
"\n"
" SortData iData = (gIdx==0)? first: src[gIdx-1];\n"
" SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
"\n"
" if( iData.m_key != jData.m_key )\n"
" {\n"
"// for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)\n"
" u32 k = jData.m_key;\n"
" {\n"
" dst[k] = gIdx;\n"
" }\n"
" }\n"
" }\n"
"}\n"
"\n"
"\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"__kernel\n"
"void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, \n"
" unsigned int nSrc, unsigned int nDst)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX+1;\n"
"\n"
" if( gIdx < nSrc+1 )\n"
" {\n"
" SortData first; first.m_key = 0; first.m_value = 0;\n"
" SortData end; end.m_key = nDst; end.m_value = nDst;\n"
"\n"
" SortData iData = src[gIdx-1];\n"
" SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
"\n"
" if( iData.m_key != jData.m_key )\n"
" {\n"
" u32 k = iData.m_key;\n"
" {\n"
" dst[k] = gIdx;\n"
" }\n"
" }\n"
" }\n"
"}\n"
"\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"__kernel\n"
"void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, \n"
" unsigned int nSrc, unsigned int nDst)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" \n"
"\n"
" if( gIdx < nDst )\n"
" {\n"
" C[gIdx] = A[gIdx] - B[gIdx];\n"
" }\n"
"}\n"
"\n"
"\n"
;

View File

@@ -0,0 +1,128 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
// Extensions requested for this program: AMD's printf and 32-bit atomics on local memory.
#pragma OPENCL EXTENSION cl_amd_printf : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
typedef unsigned int u32;
// Shorthands for the work-item built-ins along dimension 0.
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
// Work-group barrier / memory fence on local memory.
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
// Atomic increment; AtomInc1 additionally stores the value returned by atom_inc in 'out'.
#define AtomInc(x) atom_inc(&(x))
#define AtomInc1(x, out) out = atom_inc(&(x))
// make_xxx(...) expands to an OpenCL vector literal of the named type.
#define make_uint4 (uint4)
#define make_uint2 (uint2)
#define make_int2 (int2)
// Kernel parameter block: m_n is the upper bound the copy kernels compare indices against;
// the padding brings the struct to 16 bytes.
typedef struct
{
int m_n;
int m_padding[3];
} ConstBuffer;
// Copies float4 elements from src to dst, one element per work-item, for indices below cb.m_n.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void Copy1F4Kernel(__global float4* dst, __global float4* src, 
					ConstBuffer cb)
{
	int gIdx = GET_GLOBAL_IDX;

	if( !(gIdx < cb.m_n) )
		return;

	dst[ gIdx ] = src[gIdx];
}
// Copies float4 elements from src to dst, two consecutive elements per work-item.
// Only indices below cb.m_n are touched: the former guard (2*gIdx <= cb.m_n) let the work-item
// with 2*gIdx == cb.m_n read and write elements cb.m_n and cb.m_n+1, and let an odd cb.m_n
// spill one element past the end.
// NOTE(review): this treats cb.m_n as the number of float4 elements in src/dst, in line with the
// (gIdx < cb.m_n) guard of Copy1F4Kernel - confirm against the host-side launcher.
// If a stringified copy of this kernel is built into the host code, regenerate it after changing this file.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void Copy2F4Kernel(__global float4* dst, __global float4* src, 
					ConstBuffer cb)
{
	int gIdx = GET_GLOBAL_IDX;
	int idx0 = gIdx*2+0;
	int idx1 = gIdx*2+1;

	if( idx1 < cb.m_n )
	{
		// both elements are in range: load both, then store both
		float4 a0 = src[idx0];
		float4 a1 = src[idx1];

		dst[ idx0 ] = a0;
		dst[ idx1 ] = a1;
	}
	else if( idx0 < cb.m_n )
	{
		// odd element count: only the first element of the pair exists
		dst[ idx0 ] = src[idx0];
	}
}
// Copies float4 elements from src to dst, four consecutive elements per work-item.
// Only indices below cb.m_n are touched: the former guard (4*gIdx <= cb.m_n) let the work-item
// with 4*gIdx == cb.m_n read and write elements cb.m_n .. cb.m_n+3, and let an element count
// that is not a multiple of four spill past the end.
// NOTE(review): this treats cb.m_n as the number of float4 elements in src/dst, in line with the
// (gIdx < cb.m_n) guard of Copy1F4Kernel - confirm against the host-side launcher.
// If a stringified copy of this kernel is built into the host code, regenerate it after changing this file.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void Copy4F4Kernel(__global float4* dst, __global float4* src, 
					ConstBuffer cb)
{
	int gIdx = GET_GLOBAL_IDX;
	int idx0 = gIdx*4+0;
	int idx1 = gIdx*4+1;
	int idx2 = gIdx*4+2;
	int idx3 = gIdx*4+3;

	if( idx3 < cb.m_n )
	{
		// all four elements are in range: load all, then store all
		float4 a0 = src[idx0];
		float4 a1 = src[idx1];
		float4 a2 = src[idx2];
		float4 a3 = src[idx3];

		dst[ idx0 ] = a0;
		dst[ idx1 ] = a1;
		dst[ idx2 ] = a2;
		dst[ idx3 ] = a3;
	}
	else
	{
		// partial (or empty) tail: copy whatever part of this group of four is still in range
		for(int i=idx0; i<cb.m_n; i++)
		{
			dst[ i ] = src[i];
		}
	}
}
// Copies float elements from srcF1 to dstF1, one element per work-item, for indices below cb.m_n.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void CopyF1Kernel(__global float* dstF1, __global float* srcF1, 
					ConstBuffer cb)
{
	int gIdx = GET_GLOBAL_IDX;

	if( !(gIdx < cb.m_n) )
		return;

	dstF1[ gIdx ] = srcF1[gIdx];
}
// Copies float2 elements from srcF2 to dstF2, one element per work-item, for indices below cb.m_n.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, 
					ConstBuffer cb)
{
	int gIdx = GET_GLOBAL_IDX;

	if( !(gIdx < cb.m_n) )
		return;

	dstF2[ gIdx ] = srcF2[gIdx];
}

View File

@@ -0,0 +1,132 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
//copyKernelsCL holds the OpenCL source of the copy kernels as one C string.
//Copy2F4Kernel and Copy4F4Kernel guard every element against cb.m_n: the previous guards
//(2*gIdx <= cb.m_n and 4*gIdx <= cb.m_n) allowed reads and writes at and beyond index cb.m_n.
//NOTE(review): this treats cb.m_n as the number of float4 elements, in line with Copy1F4Kernel - confirm against the host-side launcher.
//Keep the .cl file this string is generated from in sync, otherwise the next stringify run brings the old guards back.
static const char* copyKernelsCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"\n"
"#define make_uint4 (uint4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"\n"
"typedef struct\n"
"{\n"
" int m_n;\n"
" int m_padding[3];\n"
"} ConstBuffer;\n"
"\n"
"\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy1F4Kernel(__global float4* dst, __global float4* src, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < cb.m_n )\n"
" {\n"
" float4 a0 = src[gIdx];\n"
"\n"
" dst[ gIdx ] = a0;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy2F4Kernel(__global float4* dst, __global float4* src, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" int idx0 = gIdx*2+0;\n"
" int idx1 = gIdx*2+1;\n"
"\n"
" if( idx1 < cb.m_n )\n"
" {\n"
" float4 a0 = src[idx0];\n"
" float4 a1 = src[idx1];\n"
"\n"
" dst[ idx0 ] = a0;\n"
" dst[ idx1 ] = a1;\n"
" }\n"
" else if( idx0 < cb.m_n )\n"
" {\n"
" dst[ idx0 ] = src[idx0];\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy4F4Kernel(__global float4* dst, __global float4* src, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" int idx0 = gIdx*4+0;\n"
" int idx1 = gIdx*4+1;\n"
" int idx2 = gIdx*4+2;\n"
" int idx3 = gIdx*4+3;\n"
"\n"
" if( idx3 < cb.m_n )\n"
" {\n"
" float4 a0 = src[idx0];\n"
" float4 a1 = src[idx1];\n"
" float4 a2 = src[idx2];\n"
" float4 a3 = src[idx3];\n"
"\n"
" dst[ idx0 ] = a0;\n"
" dst[ idx1 ] = a1;\n"
" dst[ idx2 ] = a2;\n"
" dst[ idx3 ] = a3;\n"
" }\n"
" else\n"
" {\n"
" for(int i=idx0; i<cb.m_n; i++)\n"
" {\n"
" dst[ i ] = src[i];\n"
" }\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void CopyF1Kernel(__global float* dstF1, __global float* srcF1, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < cb.m_n )\n"
" {\n"
" float a0 = srcF1[gIdx];\n"
"\n"
" dstF1[ gIdx ] = a0;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < cb.m_n )\n"
" {\n"
" float2 a0 = srcF2[gIdx];\n"
"\n"
" dstF2[ gIdx ] = a0;\n"
" }\n"
"}\n"
"\n"
"\n"
;

View File

@@ -0,0 +1,107 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
// Extensions requested for this program: AMD's printf and 32-bit atomics on local memory.
#pragma OPENCL EXTENSION cl_amd_printf : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
typedef unsigned int u32;
// Shorthands for the work-item built-ins along dimension 0.
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
// Work-group barrier / memory fence on local memory.
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
// Atomic increment; AtomInc1 additionally stores the value returned by atom_inc in 'out'.
#define AtomInc(x) atom_inc(&(x))
#define AtomInc1(x, out) out = atom_inc(&(x))
// make_xxx(...) expands to an OpenCL vector literal of the named type.
#define make_uint4 (uint4)
#define make_uint2 (uint2)
#define make_int2 (int2)
// Parameter block holding a fill value (viewable as int4, uint4 or float), an offset and a count.
// The fill kernels in this file take their parameters as separate arguments and do not reference it.
typedef struct
{
union
{
int4 m_data;
uint4 m_unsignedData;
float m_floatData;
};
int m_offset;
int m_n;
int m_padding[2];
} ConstBuffer;
// Writes 'value' to dstInt[offset .. offset+num_elements-1], one element per work-item.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void FillIntKernel(__global int* dstInt, int num_elements, int value, const int offset)
{
	int gIdx = GET_GLOBAL_IDX;

	if( !(gIdx < num_elements) )
		return;

	dstInt[ offset+gIdx ] = value;
}
// Writes 'value' to dstFloat[offset .. offset+num_elements-1], one element per work-item.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void FillFloatKernel(__global float* dstFloat, int num_elements, float value, const int offset)
{
	int gIdx = GET_GLOBAL_IDX;

	if( !(gIdx < num_elements) )
		return;

	dstFloat[ offset+gIdx ] = value;
}
// Writes 'value' to dstInt[offset .. offset+num-1], one element per work-item.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset)
{
	int gIdx = GET_GLOBAL_IDX;

	if( !(gIdx < num) )
		return;

	dstInt[ offset+gIdx ] = value;
}
// Writes the int2 (value.x, value.y) to dstInt2[offset .. offset+num-1], one element per work-item.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void FillInt2Kernel(__global int2* dstInt2, const int num, const int2 value, const int offset)
{
	int gIdx = GET_GLOBAL_IDX;

	if( !(gIdx < num) )
		return;

	dstInt2[ gIdx + offset] = make_int2( value.x, value.y );
}
// Writes 'value' to dstInt4[offset .. offset+num-1], one element per work-item.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void FillInt4Kernel(__global int4* dstInt4, const int num, const int4 value, const int offset)
{
	int gIdx = GET_GLOBAL_IDX;

	if( !(gIdx < num) )
		return;

	dstInt4[ offset+gIdx ] = value;
}

View File

@@ -0,0 +1,111 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* fillKernelsCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"\n"
"\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"\n"
"#define make_uint4 (uint4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"\n"
"typedef struct\n"
"{\n"
" union\n"
" {\n"
" int4 m_data;\n"
" uint4 m_unsignedData;\n"
" float m_floatData;\n"
" };\n"
" int m_offset;\n"
" int m_n;\n"
" int m_padding[2];\n"
"} ConstBuffer;\n"
"\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillIntKernel(__global int* dstInt, int num_elements, int value, const int offset)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < num_elements )\n"
" {\n"
" dstInt[ offset+gIdx ] = value;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillFloatKernel(__global float* dstFloat, int num_elements, float value, const int offset)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < num_elements )\n"
" {\n"
" dstFloat[ offset+gIdx ] = value;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < num )\n"
" {\n"
" dstInt[ offset+gIdx ] = value;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillInt2Kernel(__global int2* dstInt2, const int num, const int2 value, const int offset)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < num )\n"
" {\n"
" dstInt2[ gIdx + offset] = make_int2( value.x, value.y );\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillInt4Kernel(__global int4* dstInt4, const int num, const int4 value, const int offset)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < num )\n"
" {\n"
" dstInt4[ offset+gIdx ] = value;\n"
" }\n"
"}\n"
"\n"
"\n"
;

View File

@@ -0,0 +1,154 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
typedef unsigned int u32;
// Shorthands for the work-item built-ins along dimension 0.
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
// Work-group barrier with a local-memory fence.
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
// takahiro end
// Work-group size required by every kernel in this file.
#define WG_SIZE 128
// The kernels receive their parameters as a uint4; these macros give its components the
// names of the ConstBuffer fields shown (commented out) below: x, y and z respectively.
#define m_numElems x
#define m_numBlocks y
#define m_numScanBlocks z
/*typedef struct
{
uint m_numElems;
uint m_numBlocks;
uint m_numScanBlocks;
uint m_padding[1];
} ConstBuffer;
*/
// In-place exclusive prefix sum over data[0..n-1] in local memory, executed cooperatively by
// the whole work-group (every work-item must call it, because it contains barriers).
//   data  : local-memory array that is scanned in place
//   n     : number of elements to scan - assumes n is a power of two, TODO confirm with callers
//   lIdx  : local index of the calling work-item
//   lSize : number of work-items in the group (stride of the per-item loops)
// Returns the sum of all n input elements on the work-item with lIdx == 0, and 0 on every
// other work-item (previously those work-items returned an uninitialised value).
u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize)
{
	// only work-item 0 assigns the total below; initialise so the others return a defined value
	u32 blocksum = 0;
	int offset = 1;
	// up-sweep: build partial sums in place; afterwards data[n-1] holds the total
	for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)
	{
		GROUP_LDS_BARRIER;
		for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)
		{
			int ai = offset*(2*iIdx+1)-1;
			int bi = offset*(2*iIdx+2)-1;
			data[bi] += data[ai];
		}
	}

	GROUP_LDS_BARRIER;

	// remember the total and clear the last element, which seeds the exclusive scan
	if( lIdx == 0 )
	{
		blocksum = data[ n-1 ];
		data[ n-1 ] = 0;
	}

	GROUP_LDS_BARRIER;

	offset >>= 1;
	// down-sweep: push the partial sums back down so every element ends up with the sum of
	// all elements before it
	for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )
	{
		GROUP_LDS_BARRIER;
		for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )
		{
			int ai = offset*(2*iIdx+1)-1;
			int bi = offset*(2*iIdx+2)-1;
			u32 temp = data[ai];
			data[ai] = data[bi];
			data[bi] += temp;
		}
	}
	// make the finished scan visible to every work-item before returning
	GROUP_LDS_BARRIER;

	return blocksum;
}
// Scans src in blocks of WG_SIZE*2 elements, one block per work-group:
// dst receives the exclusive prefix sum within each block, and sumBuffer[group index]
// receives the total of that block.
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
__kernel
void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer,
uint4 cb)
{
__local u32 ldsData[WG_SIZE*2];
int gIdx = GET_GLOBAL_IDX;
int lIdx = GET_LOCAL_IDX;
// every work-item loads two neighbouring elements; positions at or beyond m_numElems are filled with 0
ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;
ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;
// ScanExclusive starts with a barrier, so the loads above are complete before any element is combined
u32 sum = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);
// the block total is taken from work-item 0 only
if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;
// write the scanned values back, again skipping positions at or beyond m_numElems
if( (2*gIdx) < cb.m_numElems )
{
dst[2*gIdx] = ldsData[2*lIdx];
}
if( (2*gIdx + 1) < cb.m_numElems )
{
dst[2*gIdx + 1] = ldsData[2*lIdx + 1];
}
}
/*
Adds a per-block offset to an already block-scanned array.
Work-group g handles block g+1 (block 0 is never touched): it adds blockSum[g+1]
to every element of dst in that block, stopping at cb.m_numElems.
*/
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
__kernel
void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, uint4 cb)
{
	const u32 elemsPerBlock = WG_SIZE*2;

	const int blockIdx = GET_GROUP_IDX+1;
	const int localId = GET_LOCAL_IDX;

	const u32 blockOffset = blockSum[blockIdx];

	// One past the last element of this block, clamped to the array length.
	const int blockEnd = min((blockIdx+1)*(elemsPerBlock), cb.m_numElems);

	int i = blockIdx*elemsPerBlock+localId;
	while( i<blockEnd )
	{
		dst[i] += blockOffset;
		i+=GET_GROUP_SIZE;
	}
}
/*
Exclusive scan over the first cb.m_numBlocks entries of dst, done in local memory.
The local array is filled up to cb.m_numScanBlocks entries (zeros past m_numBlocks),
scanned, and copied back; the total is written to dst[cb.m_numBlocks] by global id 0.
NOTE(review): the local array holds 2048 entries, so this presumably relies on the
host keeping m_numScanBlocks <= 2048 - confirm at the call site.
*/
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
__kernel
void TopLevelScanKernel(__global u32* dst, uint4 cb)
{
	__local u32 ldsData[2048];

	const int globalId = GET_GLOBAL_IDX;
	const int localId = GET_LOCAL_IDX;
	const int stride = GET_GROUP_SIZE;

	// Stage the input in local memory, zero-padding up to the scan length.
	int i = localId;
	while( i<cb.m_numScanBlocks )
	{
		if( i<cb.m_numBlocks )
		{
			ldsData[i] = dst[i];
		}
		else
		{
			ldsData[i] = 0;
		}
		i += stride;
	}

	GROUP_LDS_BARRIER;

	const u32 total = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);

	// Write the scanned values back over the input.
	i = localId;
	while( i<cb.m_numBlocks )
	{
		dst[i] = ldsData[i];
		i += stride;
	}

	if( globalId == 0 )
	{
		dst[cb.m_numBlocks] = total;
	}
}

View File

@@ -0,0 +1,158 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
// prefixScanKernelsCL holds the text of the prefix-scan OpenCL kernels (ScanExclusive, LocalScanKernel,
// AddOffsetKernel, TopLevelScanKernel) as one C string literal, one source line per quoted line.
// NOTE(review): because this file is generated, hand edits here would presumably be lost on the next
// stringify run - change the kernel source and regenerate instead.
static const char* prefixScanKernelsCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"\n"
"\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"\n"
"// takahiro end\n"
"#define WG_SIZE 128 \n"
"#define m_numElems x\n"
"#define m_numBlocks y\n"
"#define m_numScanBlocks z\n"
"\n"
"/*typedef struct\n"
"{\n"
" uint m_numElems;\n"
" uint m_numBlocks;\n"
" uint m_numScanBlocks;\n"
" uint m_padding[1];\n"
"} ConstBuffer;\n"
"*/\n"
"\n"
"u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize)\n"
"{\n"
" u32 blocksum;\n"
" int offset = 1;\n"
" for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
" for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
" {\n"
" int ai = offset*(2*iIdx+1)-1;\n"
" int bi = offset*(2*iIdx+2)-1;\n"
" data[bi] += data[ai];\n"
" }\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" if( lIdx == 0 )\n"
" {\n"
" blocksum = data[ n-1 ];\n"
" data[ n-1 ] = 0;\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" offset >>= 1;\n"
" for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
" for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
" {\n"
" int ai = offset*(2*iIdx+1)-1;\n"
" int bi = offset*(2*iIdx+2)-1;\n"
" u32 temp = data[ai];\n"
" data[ai] = data[bi];\n"
" data[bi] += temp;\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" return blocksum;\n"
"}\n"
"\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer,\n"
" uint4 cb)\n"
"{\n"
" __local u32 ldsData[WG_SIZE*2];\n"
"\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
"\n"
" ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n"
" ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n"
"\n"
" u32 sum = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
"\n"
" if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;\n"
"\n"
" if( (2*gIdx) < cb.m_numElems )\n"
" {\n"
" dst[2*gIdx] = ldsData[2*lIdx];\n"
" }\n"
" if( (2*gIdx + 1) < cb.m_numElems )\n"
" {\n"
" dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
" }\n"
"}\n"
"\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, uint4 cb)\n"
"{\n"
" const u32 blockSize = WG_SIZE*2;\n"
"\n"
" int myIdx = GET_GROUP_IDX+1;\n"
" int lIdx = GET_LOCAL_IDX;\n"
"\n"
" u32 iBlockSum = blockSum[myIdx];\n"
"\n"
" int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n"
" for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
" {\n"
" dst[i] += iBlockSum;\n"
" }\n"
"}\n"
"\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void TopLevelScanKernel(__global u32* dst, uint4 cb)\n"
"{\n"
" __local u32 ldsData[2048];\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int lSize = GET_GROUP_SIZE;\n"
"\n"
" for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n"
" {\n"
" ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" u32 sum = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
"\n"
" for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n"
" {\n"
" dst[i] = ldsData[i];\n"
" }\n"
"\n"
" if( gIdx == 0 )\n"
" {\n"
" dst[cb.m_numBlocks] = sum;\n"
" }\n"
"}\n"
"\n"
;

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,379 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include <stdio.h>
#include "../basic_initialize/btOpenCLUtils.h"
#include "../host/btFillCL.h"
#include "../host/btBoundSearchCL.h"
#include "../host/btRadixSort32CL.h"
#include "../host/btPrefixScanCL.h"
#include "../host/CommandLineArgs.h"
#include "../host/btMinMax.h"
// Running totals over all tests; printed by main().
int g_nPassed = 0;
int g_nFailed = 0;
// Failure flag of the test currently running: cleared by TEST_INIT, set by TEST_ASSERT.
bool g_testFailed = false;

// The statement-like macros are wrapped in do { ... } while(0) so that each one expands to
// exactly one statement. That keeps them correct under an unbraced if/else/for:
// the bare-if form of TEST_ASSERT captured a following 'else', and the multi-statement
// TEST_REPORT only had its printf guarded by a preceding 'if'.

// Starts a test: clears the failure flag.
#define TEST_INIT do { g_testFailed = false; } while(0)
// Marks the current test as failed when x is false. The test keeps running.
#define TEST_ASSERT(x) do { if( !(x) ) { g_testFailed = true; } } while(0)
// Prints "[O] name" or "[X] name" for the current test and updates the pass/fail totals.
#define TEST_REPORT(testName) do { printf("[%s] %s\n",(g_testFailed)?"X":"O", testName); if(g_testFailed) g_nFailed++; else g_nPassed++; } while(0)
// Rounds num up to the nearest multiple of alignment (num itself when it already is one).
#define NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment))
// OpenCL handles shared by all tests: set up by initCL(), released by exitCL().
// They stay 0 until initCL() succeeds.
cl_context g_context=0;
cl_device_id g_device=0;
cl_command_queue g_queue =0;
// Device name taken from btOpenCLDeviceInfo::m_deviceName in initCL(); 0 until then.
const char* g_deviceName = 0;
void initCL(int preferredDeviceIndex, int preferredPlatformIndex)
{
void* glCtx=0;
void* glDC = 0;
int ciErrNum = 0;
//bound search and radix sort only work on GPU right now (assume 32 or 64 width workgroup without barriers)
cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
g_context = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
int numDev = btOpenCLUtils::getNumDevices(g_context);
if (numDev>0)
{
btOpenCLDeviceInfo info;
g_device= btOpenCLUtils::getDevice(g_context,0);
g_queue = clCreateCommandQueue(g_context, g_device, 0, &ciErrNum);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
btOpenCLUtils::printDeviceInfo(g_device);
btOpenCLUtils::getDeviceInfo(g_device,&info);
g_deviceName = info.m_deviceName;
}
}
void exitCL()
{
clReleaseCommandQueue(g_queue);
clReleaseContext(g_context);
}
// Number of different buffer sizes each test iterates over.
// Defined at file scope because the tests that follow fillIntTest() use it too.
#define NUM_TESTS 7

// Checks btFillCL: fills the first 'size' ints of an OpenCL buffer with a constant via
// execute(), fills a host array with the same arguments via executeHost(), and compares
// the two element by element for NUM_TESTS growing sizes.
inline void fillIntTest()
{
	TEST_INIT;

	btFillCL* fillCL = new btFillCL(g_context,g_device,g_queue);
	int maxSize=1024*256;
	btOpenCLArray<int> intBuffer(g_context,g_queue,maxSize);
	intBuffer.resize(maxSize);

	int dx = maxSize/NUM_TESTS;
	for (int iter=0;iter<NUM_TESTS;iter++)
	{
		int size = btMin( 11+dx*iter, maxSize );

		int value = 2;
		int offset=0;
		fillCL->execute(intBuffer,value,size,offset);

		// Host reference result.
		btAlignedObjectArray<int> hostBuf2;
		hostBuf2.resize(size);
		fillCL->executeHost(hostBuf2,value,size,offset);

		// Device result; only the first 'size' elements are compared.
		btAlignedObjectArray<int> hostBuf;
		intBuffer.copyToHost(hostBuf);

		for(int i=0; i<size; i++)
		{
			TEST_ASSERT( hostBuf[i] == hostBuf2[i] );
		}
	}

	delete fillCL;

	TEST_REPORT( "fillIntTest" );
}
// Seeds the C library generator that getRandom() draws from.
__inline
void seedRandom(int seed)
{
	const unsigned int unsignedSeed = static_cast<unsigned int>(seed);
	srand( unsignedSeed );
}
// Returns a pseudo-random value in [minV, maxV): minV plus a fraction of the range,
// where the fraction is (rand() % 10000) / 10000 and so has 10000 distinct steps.
// The result is converted back to T, which truncates for integer types.
template<typename T>
__inline
T getRandom(const T& minV, const T& maxV)
{
	const float fraction = static_cast<float>(rand()%10000)/10000.f;
	const T span = maxV - minV;
	return (T)(minV + fraction*span);
}
// Strict "less than" ordering for btSortData: by m_key first, ties broken by m_value.
struct btSortDataCompare
{
	inline bool operator()(const btSortData& first, const btSortData& second) const
	{
		if (first.m_key < second.m_key)
		{
			return true;
		}
		if (first.m_key == second.m_key)
		{
			return first.m_value < second.m_value;
		}
		return false;
	}
};
// Checks btBoundSearchCL against its host implementation for NUM_TESTS growing input sizes.
// Each iteration builds 'size' key/value pairs with random keys in [0, bucketSize), sorts them,
// runs the upper- and lower-bound search through execute() (OpenCL arrays) and executeHost()
// (host arrays), and asserts that
//  - both paths produce the same bounds for every bucket, and
//  - every element between lower[i] and upper[i] really has key i.
// NUM_TESTS is the macro defined earlier in this file.
void boundSearchTest( )
{
TEST_INIT;
int maxSize = 1024*256;
int bucketSize = 256;
btOpenCLArray<btSortData> srcCL(g_context,g_queue,maxSize);
btOpenCLArray<unsigned int> upperCL(g_context,g_queue,maxSize);
btOpenCLArray<unsigned int> lowerCL(g_context,g_queue,maxSize);
btAlignedObjectArray<btSortData> srcHost;
// upperHost/lowerHost receive the results read back from the OpenCL arrays;
// the *Compare arrays receive the host reference results.
btAlignedObjectArray<unsigned int> upperHost;
btAlignedObjectArray<unsigned int> lowerHost;
btAlignedObjectArray<unsigned int> upperHostCompare;
btAlignedObjectArray<unsigned int> lowerHostCompare;
btBoundSearchCL* search = new btBoundSearchCL(g_context,g_device,g_queue, maxSize);
int dx = maxSize/NUM_TESTS;
for(int iter=0; iter<NUM_TESTS; iter++)
{
int size = btMin( 128+dx*iter, maxSize );
upperHost.resize(bucketSize);
lowerHost.resize(bucketSize);
upperHostCompare.resize(bucketSize);
lowerHostCompare.resize(bucketSize);
srcHost.resize(size);
// Random key per element; the value records the element's original position.
for(int i=0; i<size; i++)
{
btSortData v;
// v.m_key = i<2? 0 : 5;
v.m_key = getRandom(0,bucketSize);
v.m_value = i;
srcHost.at(i) = v;
}
// The searches below are given key-sorted input.
srcHost.quickSort(btSortDataCompare());
srcCL.copyFromHost(srcHost);
{
// Preset every bound to -1 (the largest unsigned value once stored in these arrays)
// on both the host and the OpenCL side before searching.
for(int i=0; i<bucketSize; i++)
{
lowerHost[i] = -1;
lowerHostCompare[i] = -1;
upperHost[i] = -1;
upperHostCompare[i] = -1;
}
upperCL.copyFromHost(upperHost);
lowerCL.copyFromHost(lowerHost);
}
search->execute(srcCL,size,upperCL,bucketSize,btBoundSearchCL::BOUND_UPPER);
search->execute(srcCL,size,lowerCL,bucketSize,btBoundSearchCL::BOUND_LOWER);
search->executeHost(srcHost,size,upperHostCompare,bucketSize,btBoundSearchCL::BOUND_UPPER);
search->executeHost(srcHost,size,lowerHostCompare,bucketSize,btBoundSearchCL::BOUND_LOWER);
lowerCL.copyToHost(lowerHost);
upperCL.copyToHost(upperHost);
// The OpenCL results must equal the host reference for every bucket.
for(int i=0; i<bucketSize; i++)
{
TEST_ASSERT(upperHostCompare[i] == upperHost[i]);
TEST_ASSERT(lowerHostCompare[i] == lowerHost[i]);
}
/*
for(int i=1; i<bucketSize; i++)
{
int lhi_1 = lowerHost[i-1];
int lhi = lowerHost[i];
for(int j=lhi_1; j<lhi; j++)
//for(int j=lowerHost[i-1]; j<lowerHost[i]; j++)
{
TEST_ASSERT( srcHost[j].m_key < i );
}
}
for(int i=0; i<bucketSize; i++)
{
int jMin = (i==0)?0:upperHost[i-1];
for(int j=jMin; j<upperHost[i]; j++)
{
TEST_ASSERT( srcHost[j].m_key <= i );
}
}
*/
// Every element in [lower[i], upper[i]) must carry key i.
// NOTE(review): the bounds are used as indices without a range check, so a wrong
// result from the search could index outside srcHost here.
for(int i=0; i<bucketSize; i++)
{
int lhi = lowerHost[i];
int uhi = upperHost[i];
for(int j=lhi; j<uhi; j++)
{
if ( srcHost[j].m_key != i )
{
printf("error %d != %d\n",srcHost[j].m_key,i);
}
TEST_ASSERT( srcHost[j].m_key == i );
}
}
}
delete search;
TEST_REPORT( "boundSearchTest" );
}
void prefixScanTest()
{
TEST_INIT;
int maxSize = 1024*256;
btAlignedObjectArray<unsigned int> buf0Host;
btAlignedObjectArray<unsigned int> buf1Host;
btOpenCLArray<unsigned int> buf2CL(g_context,g_queue,maxSize);
btOpenCLArray<unsigned int> buf3CL(g_context,g_queue,maxSize);
btPrefixScanCL* scan = new btPrefixScanCL(g_context,g_device,g_queue,maxSize);
int dx = maxSize/NUM_TESTS;
for(int iter=0; iter<NUM_TESTS; iter++)
{
int size = btMin( 128+dx*iter, maxSize );
buf0Host.resize(size);
buf1Host.resize(size);
for(int i=0; i<size; i++)
buf0Host[i] = 1;
buf2CL.copyFromHost( buf0Host);
unsigned int sumHost, sumGPU;
scan->executeHost(buf0Host, buf1Host, size, &sumHost );
scan->execute( buf2CL, buf3CL, size, &sumGPU );
buf3CL.copyToHost(buf0Host);
TEST_ASSERT( sumHost == sumGPU );
for(int i=0; i<size; i++)
TEST_ASSERT( buf1Host[i] == buf0Host[i] );
}
delete scan;
TEST_REPORT( "scanTest" );
}
bool radixSortTest()
{
TEST_INIT;
int maxSize = 1024*256;
btAlignedObjectArray<btSortData> buf0Host;
buf0Host.resize(maxSize);
btAlignedObjectArray<btSortData> buf1Host;
buf1Host.resize(maxSize );
btOpenCLArray<btSortData> buf2CL(g_context,g_queue,maxSize);
btRadixSort32CL* sort = new btRadixSort32CL(g_context,g_device,g_queue,maxSize);
int dx = maxSize/NUM_TESTS;
for(int iter=0; iter<NUM_TESTS; iter++)
{
int size = btMin( 128+dx*iter, maxSize-512 );
size = NEXTMULTIPLEOF( size, 512 );//not necessary
buf0Host.resize(size);
for(int i=0; i<size; i++)
{
btSortData v;
v.m_key = getRandom(0,0xff);
v.m_value = i;
buf0Host[i] = v;
}
buf2CL.copyFromHost( buf0Host);
sort->executeHost( buf0Host);
sort->execute(buf2CL);
buf2CL.copyToHost(buf1Host);
for(int i=0; i<size; i++)
{
TEST_ASSERT( buf0Host[i].m_value == buf1Host[i].m_value && buf0Host[i].m_key == buf1Host[i].m_key );
}
}
delete sort;
TEST_REPORT( "radixSort" );
return g_testFailed;
}
int main(int argc, char** argv)
{
int preferredDeviceIndex = -1;
int preferredPlatformIndex = -1;
CommandLineArgs args(argc, argv);
args.GetCmdLineArgument("deviceId", preferredDeviceIndex);
args.GetCmdLineArgument("platformId", preferredPlatformIndex);
initCL(preferredDeviceIndex,preferredPlatformIndex);
fillIntTest();
boundSearchTest();
prefixScanTest();
radixSortTest();
exitCL();
printf("%d tests passed, %d tests failed\n",g_nPassed, g_nFailed);
printf("End, press <enter>\n");
getchar();
}

View File

@@ -0,0 +1,41 @@
-- Declares the console application "OpenCL_primitives_test_<vendor>" when findOpenCL
-- reports an OpenCL SDK for that vendor; otherwise does nothing.
-- vendor: vendor name string, passed on to findOpenCL / initOpenCL and appended to
-- the project name.
function createProject(vendor)
	-- 'local' keeps the result out of the global table shared by all build scripts,
	-- matching the sibling scripts that already declare it local.
	local hasCL = findOpenCL(vendor)
	if (hasCL) then
		project ("OpenCL_primitives_test_" .. vendor)
		initOpenCL(vendor)
		language "C++"
		kind "ConsoleApp"
		targetdir "../../../bin"
		includedirs {".",".."}
		files {
			"main.cpp",
			"../../basic_initialize/btOpenCLInclude.h",
			"../../basic_initialize/btOpenCLUtils.cpp",
			"../../basic_initialize/btOpenCLUtils.h",
			"../host/btFillCL.cpp",
			"../host/btFillCL.h",
			"../host/btBoundSearchCL.cpp",
			"../host/btBoundSearchCL.h",
			"../host/btPrefixScanCL.cpp",
			"../host/btPrefixScanCL.h",
			"../host/btRadixSort32CL.cpp",
			"../host/btRadixSort32CL.h",
			"../host/btAlignedAllocator.cpp",
			"../host/btAlignedAllocator.h",
			"../host/btAlignedObjectArray.h",
		}
	end
end

-- One project per supported vendor; vendors without an SDK are skipped by createProject.
createProject("AMD")
createProject("Intel")
createProject("NVIDIA")
createProject("Apple")

116
opencl/reduce/main.cpp Normal file
View File

@@ -0,0 +1,116 @@
///original author: Erwin Coumans
#include "btOpenCLUtils.h"
#include "../parallel_primitives/host/btOpenCLArray.h"
#include "../parallel_primitives/host/btLauncherCL.h"
#include <stdio.h>
// Turns its argument tokens into a string literal, so the kernel can be written inline.
#define MSTRINGIFY(A) #A

// OpenCL source of the ReduceGlobal kernel.
// Each work-group sums its slice of d_in in place (tree reduction in global memory,
// halving the stride every step) and the work-item with local id 0 writes the group's
// sum to d_out[group id]. Note that d_in is modified.
// A partner element is only added when its index is inside the array, so a last,
// partially filled work-group no longer reads past numElements.
// NOTE(review): the halving loop looks like it needs a power-of-two work-group size to
// visit every element - confirm for any new launch configuration.
// No comments are placed inside MSTRINGIFY on purpose: the text there is the kernel source.
const char* kernelString= MSTRINGIFY(
__kernel void ReduceGlobal(__global int* d_in, __global int* d_out, int numElements)
{
	int myId = get_global_id(0);
	int tid = get_local_id(0);
	int ls = get_local_size(0);
	for (int s=ls/2;s>0;s>>=1)
	{
		if (tid<s && (myId+s)<numElements)
		{
			d_in[myId] += d_in[myId+s];
		}
		barrier(CLK_GLOBAL_MEM_FENCE);
	}
	if (tid==0)
	{
		if (myId<numElements)
		{
			d_out[get_group_id(0)]=d_in[myId];
		}
	}
}
);
int main(int argc, char* argv[])
{
int ciErrNum = 0;
int preferred_device = -1;
int preferred_platform = -1;
cl_platform_id platformId;
cl_context ctx;
cl_command_queue queue;
cl_device_id device;
cl_kernel addKernel;
ctx = btOpenCLUtils::createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum,0,0,preferred_device,preferred_platform,&platformId);
btOpenCLUtils::printPlatformInfo(platformId);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
if (!ctx) {
printf("No OpenCL capable GPU found!");
return 0;
}
device = btOpenCLUtils::getDevice(ctx,0);
queue = clCreateCommandQueue(ctx, device, 0, &ciErrNum);
addKernel = btOpenCLUtils::compileCLKernelFromString(ctx,device,kernelString,"ReduceGlobal",&ciErrNum);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
int numElements = 1024*1024;
btOpenCLArray<int> a(ctx,queue);
btOpenCLArray<int> b(ctx,queue);
btAlignedObjectArray<int> hostA;
btAlignedObjectArray<int> hostB;
for (int i=0;i<numElements;i++)
{
hostA.push_back(1);
hostB.push_back(0.f);
}
a.copyFromHost(hostA);
b.copyFromHost(hostB);
int hostSum= 0;
for (int i=0;i<numElements;i++)
{
hostSum += hostA.at(i);
}
b.resize(numElements);
{
btLauncherCL launcher( queue, addKernel);
launcher.setBuffer( a.getBufferCL());
launcher.setBuffer( b.getBufferCL());
launcher.setConst( numElements );
launcher.launch1D( numElements,1024);
}
clFinish(queue);
{
btLauncherCL launcher( queue, addKernel);
launcher.setBuffer( b.getBufferCL());
launcher.setBuffer( a.getBufferCL());
launcher.setConst( 1024 );
launcher.launch1D( 1024,1024);
}
clFinish(queue);
printf("hostSum = %d\n", hostSum);
int clSum = a.at(0);
printf("clSum = %d\n", clSum );
if (hostSum != clSum)
{
printf("Incorrect result\n");
} else
{
printf("Correct result\n");
}
clReleaseCommandQueue(queue);
clReleaseContext(ctx);
printf("press key\n");
getchar();
return 0;
}

View File

@@ -0,0 +1,37 @@
-- Declares the console application "OpenCL_reduce_<vendor>" when findOpenCL reports an
-- OpenCL SDK for that vendor; does nothing otherwise.
-- vendor: vendor name string, passed on to findOpenCL / initOpenCL and appended to the
-- project name and to the name of the linked host library.
function createProject (vendor)
	local sdkFound = findOpenCL(vendor)
	if not sdkFound then
		return
	end

	project ( "OpenCL_reduce_" .. vendor)
	initOpenCL(vendor)
	language "C++"
	kind "ConsoleApp"
	targetdir "../../bin"
	links { "OpenCL_lib_parallel_primitives_host_" .. vendor }
	includedirs { "../basic_initialize" }
	files {
		"main.cpp",
		"../basic_initialize/btOpenCLUtils.cpp",
		"../basic_initialize/btOpenCLUtils.h"
	}
end

-- One project per vendor, in the same order as before.
for _, vendorName in ipairs({ "AMD", "NVIDIA", "Intel", "Apple" }) do
	createProject(vendorName)
end

View File

@@ -0,0 +1,16 @@
// Element-wise addition of two float8 arrays: c[i] = a[i] + b[i].
// numElements counts float8 elements; work-items whose global id is at or beyond it
// do nothing, so the global size may be rounded up past the array length.
__kernel void VectorAdd(__global const float8* a, __global const float8* b, __global float8* c, int numElements)
{
	// index of this work-item's float8 in the global arrays
	const int idx = get_global_id(0);
	if (idx < numElements)
	{
		// read both operands, add, and store the result in global memory
		c[idx] = a[idx] + b[idx];
	}
}

View File

@@ -0,0 +1,20 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
// vectorAddCL holds the text of the VectorAdd OpenCL kernel as one C string literal,
// one source line per quoted line.
// NOTE(review): because this file is generated, hand edits here would presumably be lost on the
// next stringify run - change the kernel source and regenerate instead.
static const char* vectorAddCL= \
"\n"
"\n"
"__kernel void VectorAdd(__global const float8* a, __global const float8* b, __global float8* c, int numElements)\n"
"{\n"
" // get oct-float index into global data array\n"
" int iGID = get_global_id(0);\n"
" if (iGID>=numElements)\n"
" return;\n"
"\n"
" float8 aGID = a[iGID];\n"
" float8 bGID = b[iGID];\n"
"\n"
" float8 result = aGID + bGID;\n"
" // write back out to GMEM\n"
" c[iGID] = result;\n"
"}\n"
"\n"
;

408
opencl/vector_add/main.cpp Normal file
View File

@@ -0,0 +1,408 @@
///VectorAdd sample, from the NVidia JumpStart Guide
///http://developer.download.nvidia.com/OpenCL/NVIDIA_OpenCL_JumpStart_Guide.pdf
///This version includes the platform's native OpenCL header (<OpenCL/OpenCL.h> on Apple, <CL/cl.h> elsewhere), not <MiniCL/cl.h>
///Apart from this include file, all other code should compile and work on OpenCL compliant implementation
#define LOAD_FROM_FILE
#ifdef __APPLE__
#include <OpenCL/OpenCL.h>
#else
#include <CL/cl.h>
#endif //__APPLE__
#ifdef _WIN32
#pragma warning (disable:4996)
#endif
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
// Prints the error code and asserts when a != b.
// NOTE(review): this expands to btAssert, which none of the headers included above appears to declare;
// the macro is not used in the visible part of this file, so that only matters once it is used.
#define GRID3DOCL_CHECKERROR(a, b) if((a)!=(b)) { printf("3D GRID OCL Error : %d\n", (a)); btAssert((a) == (b)); }
// Work-group size reported for the VectorAdd kernel (filled in by main via clGetKernelWorkGroupInfo).
size_t wgSize;
// Provides vectorAddCL, the kernel source as a string (used when LOAD_FROM_FILE is not defined).
#include "VectorAddKernels.h"
// Vendor string of the platform main() looks for, selected by the CL_PLATFORM_* define
// the build passes in. main() compares it with CL_PLATFORM_VENDOR; when no platform
// matches, the last enumerated platform is used.
#ifdef CL_PLATFORM_INTEL
const char* preferredPlatform = "Intel(R) Corporation";
#elif defined CL_PLATFORM_AMD
const char* preferredPlatform = "Advanced Micro Devices, Inc.";
#elif defined CL_PLATFORM_NVIDIA
const char* preferredPlatform = "NVIDIA Corporation";
#else
const char* preferredPlatform = "Unknown";
#endif
// Reads a whole file into a newly allocated, NUL-terminated string, prefixed by cPreamble.
//
// cFilename      path of the file to read (opened in binary mode).
// cPreamble      text copied in front of the file contents; NULL is treated as "".
// szFinalLength  optional out-parameter: receives the length of preamble + file contents
//                (without the terminating NUL). Only written on success. May be NULL.
//
// Returns the combined string, which the caller must release with free(), or NULL when
// the file cannot be opened, measured or read completely, or when memory runs out.
char* loadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
{
	if (cFilename == NULL)
	{
		return NULL;
	}
	if (cPreamble == NULL)
	{
		cPreamble = "";
	}

	// open the OpenCL source code file
	FILE* pFileStream = fopen(cFilename, "rb");
	if (pFileStream == NULL)
	{
		return NULL;
	}

	const size_t szPreambleLength = strlen(cPreamble);

	// get the length of the source code; ftell reports failure as -1, which must not
	// be turned into a huge size_t
	long lFileSize = -1;
	if (fseek(pFileStream, 0, SEEK_END) == 0)
	{
		lFileSize = ftell(pFileStream);
	}
	if (lFileSize < 0 || fseek(pFileStream, 0, SEEK_SET) != 0)
	{
		fclose(pFileStream);
		return NULL;
	}
	const size_t szSourceLength = (size_t)lFileSize;

	// allocate a buffer for the source code string and read it in
	char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1);
	if (cSourceString == NULL)
	{
		fclose(pFileStream);
		return NULL;
	}
	memcpy(cSourceString, cPreamble, szPreambleLength);
	const size_t szRead = fread(cSourceString + szPreambleLength, 1, szSourceLength, pFileStream);

	// close the file before deciding whether the read succeeded
	fclose(pFileStream);
	if (szRead != szSourceLength)
	{
		free(cSourceString);
		return NULL;
	}

	// report the total length of the combined (preamble + source) string
	if(szFinalLength != 0)
	{
		*szFinalLength = szSourceLength + szPreambleLength;
	}
	cSourceString[szSourceLength + szPreambleLength] = '\0';

	return cSourceString;
}
// Maximum work-item sizes per dimension of the device last passed to printDevInfo().
size_t workitem_size[3];

// Prints name, type, compute-unit count and maximum work-item sizes of an OpenCL device.
// Also stores the work-item sizes in the global workitem_size.
// The clGetDeviceInfo results are not checked; a failed query prints an empty name or
// leaves the previous values in place.
void printDevInfo(cl_device_id device)
{
	// Zero-initialised so a failed name query prints an empty string, not garbage.
	char device_string[1024] = {0};

	clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_string), device_string, NULL);
	printf( " Device %s:\n", device_string);

	// CL_DEVICE_TYPE (a bit field, so more than one line may be printed)
	cl_device_type type = 0;
	clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(type), &type, NULL);
	if( type & CL_DEVICE_TYPE_CPU )
		printf(" CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_CPU");
	if( type & CL_DEVICE_TYPE_GPU )
		printf( " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_GPU");
	if( type & CL_DEVICE_TYPE_ACCELERATOR )
		printf( " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR");
	if( type & CL_DEVICE_TYPE_DEFAULT )
		printf( " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT");

	// CL_DEVICE_MAX_COMPUTE_UNITS (cl_uint, hence %u)
	cl_uint compute_units = 0;
	clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
	printf( " CL_DEVICE_MAX_COMPUTE_UNITS:\t%u\n", compute_units);

	// CL_DEVICE_MAX_WORK_ITEM_SIZES
	// The values are size_t; cast them to match %u, since passing size_t to %u is
	// undefined where size_t is wider than unsigned int.
	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), workitem_size, NULL);
	printf( " CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", (unsigned int)workitem_size[0], (unsigned int)workitem_size[1], (unsigned int)workitem_size[2]);
}
// Main function
// *********************************************************************
int main(int argc, char **argv)
{
void *srcA, *srcB, *dst; // Host buffers for OpenCL test
cl_context cxGPUContext; // OpenCL context
cl_command_queue cqCommandQue; // OpenCL command que
cl_device_id* cdDevices; // OpenCL device list
cl_program cpProgram; // OpenCL program
cl_kernel ckKernel; // OpenCL kernel
cl_mem cmMemObjs[3]; // OpenCL memory buffer objects: 3 for device
size_t szGlobalWorkSize[1]; // 1D var for Total # of work items
size_t szLocalWorkSize[1]; // 1D var for # of work items in the work group
size_t szParmDataBytes; // Byte size of context information
cl_int ciErr1, ciErr2; // Error code var
int iTestN = 100000 * 8; // Size of Vectors to process
int actualGlobalSize = iTestN / 8;
// set Global and Local work size dimensions
szGlobalWorkSize[0] = iTestN >> 3; // do 8 computations per work item
szLocalWorkSize[0]= iTestN>>3;
// Allocate and initialize host arrays
srcA = (void *)malloc (sizeof(cl_float) * iTestN);
srcB = (void *)malloc (sizeof(cl_float) * iTestN);
dst = (void *)malloc (sizeof(cl_float) * iTestN);
int i;
// Initialize arrays with some values
for (i=0;i<iTestN;i++)
{
((cl_float*)srcA)[i] = cl_float(i);
((cl_float*)srcB)[i] = 2;
((cl_float*)dst)[i]=-1;
}
cl_uint numPlatforms;
cl_platform_id platform = NULL;
cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms);
if (0 < numPlatforms)
{
cl_platform_id* platforms = new cl_platform_id[numPlatforms];
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
for (unsigned i = 0; i < numPlatforms; ++i)
{
char pbuf[100];
status = clGetPlatformInfo(platforms[i],
CL_PLATFORM_VENDOR,
sizeof(pbuf),
pbuf,
NULL);
platform = platforms[i];
if (!strcmp(pbuf, preferredPlatform))
{
printf("Found platform %s\n", preferredPlatform);
break;
}
}
delete[] platforms;
}
cl_context_properties cps[3] =
{
CL_CONTEXT_PLATFORM,
(cl_context_properties)platform,
0
};
// Create OpenCL context & context
cxGPUContext = clCreateContextFromType(cps, CL_DEVICE_TYPE_ALL, NULL, NULL, &ciErr1); //could also be CL_DEVICE_TYPE_GPU
// Query all devices available to the context
ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
cdDevices = (cl_device_id*)malloc(szParmDataBytes);
ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
if (cdDevices)
{
printDevInfo(cdDevices[0]);
}
// Create a command queue for first device the context reported
cqCommandQue = clCreateCommandQueue(cxGPUContext, cdDevices[0], 0, &ciErr2);
ciErr1 |= ciErr2;
// Allocate the OpenCL source and result buffer memory objects on the device GMEM
cmMemObjs[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * szGlobalWorkSize[0], srcA, &ciErr2);
ciErr1 |= ciErr2;
cmMemObjs[1] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * szGlobalWorkSize[0], srcB, &ciErr2);
ciErr1 |= ciErr2;
cmMemObjs[2] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float8) * szGlobalWorkSize[0], NULL, &ciErr2);
ciErr1 |= ciErr2;
///create kernels from binary
int numDevices = 1;
::size_t* lengths = (::size_t*) malloc(numDevices * sizeof(::size_t));
const unsigned char** images = (const unsigned char**) malloc(numDevices * sizeof(const void*));
for (i = 0; i < numDevices; ++i) {
images[i] = 0;
lengths[i] = 0;
}
// Read the OpenCL kernel in from source file
const char* cSourceFile = "opencl/vector_add/VectorAddKernels.cl";
const char* cPathAndName = cSourceFile;
#ifdef LOAD_FROM_FILE
size_t szKernelLength;
const char* cSourceCL =0;
char relativeFileName[1024];
{
const char* prefix[]={"../","../../","../../../","../../../../"};
int numPrefixes = sizeof(prefix)/sizeof(char*);
for (int i=0;!cSourceCL && i<numPrefixes;i++)
{
sprintf(relativeFileName,"%s%s",prefix[i],cSourceFile);
cSourceCL = loadProgSource(relativeFileName, "", &szKernelLength);
if (cSourceCL)
{
printf("Loaded program source: %s\n", relativeFileName);
}
}
}
if (!cSourceCL)
{
printf("Couldn't find file %s, exiting\n",cSourceFile);
exit(0);
}
#else
const char* cSourceCL = vectorAddCL;
size_t szKernelLength = strlen(cSourceCL);
#endif //LOAD_FROM_FILE
// Create the program
cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1);
printf("clCreateProgramWithSource...\n");
if (ciErr1 != CL_SUCCESS)
{
printf("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
exit(0);
}
// Build the program with 'mad' Optimization option
#ifdef MAC
char* flags = "-cl-mad-enable -DMAC ";
#else
char flags[1024]={0};
#ifdef CL_PLATFORM_INTEL
sprintf(flags,"-g -s \"%s\"","C:/develop/experiments/opencl/vector_add/VectorAddKernels.cl");
#endif//CL_PLATFORM_INTEL
#endif//MAC
ciErr1 = clBuildProgram(cpProgram, 0, NULL, flags, NULL, NULL);
printf("clBuildProgram...\n");
if (ciErr1 != CL_SUCCESS)
{
printf("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
exit(0);
}
// Create the kernel
ckKernel = clCreateKernel(cpProgram, "VectorAdd", &ciErr1);
printf("clCreateKernel (VectorAdd)...\n");
if (ciErr1 != CL_SUCCESS)
{
printf("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
exit(0);
}
cl_int ciErrNum;
ciErrNum = clGetKernelWorkGroupInfo(ckKernel, cdDevices[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL);
if (ciErrNum != CL_SUCCESS)
{
printf("cannot get workgroup size\n");
exit(0);
}
// Set the Argument values
ciErr1 |= clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmMemObjs[0]);
ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmMemObjs[1]);
ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmMemObjs[2]);
ciErr1 |= clSetKernelArg(ckKernel, 3, sizeof(int), (void*)&actualGlobalSize);
printf("Press ENTER to quit\n");
getchar();
int workgroupSize = wgSize;
if(workgroupSize <= 0)
{ // let OpenCL library calculate workgroup size
size_t globalWorkSize[2];
globalWorkSize[0] = actualGlobalSize;
globalWorkSize[1] = 1;
// Copy input data from host to GPU and launch kernel
ciErr1 |= clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, globalWorkSize, NULL, 0,0,0 );
}
else
{
size_t localWorkSize[2], globalWorkSize[2];
//workgroupSize = btMin(workgroupSize, actualGlobalSize);
int num_t = actualGlobalSize / workgroupSize;
int num_g = num_t * workgroupSize;
if(num_g < actualGlobalSize)
{
num_t++;
//this can cause problems -> processing outside of the buffer
//make sure to check kernel
}
size_t globalThreads[] = {num_t * workgroupSize};
size_t localThreads[] = {workgroupSize};
localWorkSize[0] = workgroupSize;
globalWorkSize[0] = num_t * workgroupSize;
localWorkSize[1] = 1;
globalWorkSize[1] = 1;
// Copy input data from host to GPU and launch kernel
ciErr1 |= clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, globalThreads, localThreads, 0, NULL, NULL);
}
if (ciErrNum != CL_SUCCESS)
{
printf("cannot clEnqueueNDRangeKernel\n");
exit(0);
}
clFinish(cqCommandQue);
// Read back results and check accumulated errors
ciErr1 |= clEnqueueReadBuffer(cqCommandQue, cmMemObjs[2], CL_TRUE, 0, sizeof(cl_float8) * szGlobalWorkSize[0], dst, 0, NULL, NULL);
// Release kernel, program, and memory objects
// NOTE: Most properly this should be done at any of the exit points above, but it is omitted elsewhere for clarity.
free(cdDevices);
clReleaseKernel(ckKernel);
clReleaseProgram(cpProgram);
clReleaseCommandQueue(cqCommandQue);
clReleaseContext(cxGPUContext);
// print the results
int iErrorCount = 0;
for (i = 0; i < iTestN; i++)
{
if (((float*)dst)[i] != ((float*)srcA)[i]+((float*)srcB)[i])
iErrorCount++;
}
if (iErrorCount)
{
printf("Validation FAILED\n");
} else
{
printf("Validation SUCCESSFULL\n");
}
// Free host memory, close log and return success
for (i = 0; i < 3; i++)
{
clReleaseMemObject(cmMemObjs[i]);
}
free(srcA);
free(srcB);
free (dst);
printf("Press ENTER to quit\n");
getchar();
}

View File

@@ -0,0 +1,28 @@
-- Declares the console application "OpenCL_VectorAdd_<vendor>" when findOpenCL reports
-- an OpenCL SDK for that vendor; otherwise does nothing.
-- vendor: vendor name string, passed on to findOpenCL / initOpenCL and appended to
-- the project name.
function createProject(vendor)
	-- 'local' keeps the result out of the global table shared by all build scripts,
	-- matching the sibling scripts that already declare it local.
	local hasCL = findOpenCL(vendor)
	if (hasCL) then
		project ("OpenCL_VectorAdd_" .. vendor)
		initOpenCL(vendor)
		language "C++"
		kind "ConsoleApp"
		targetdir "../../bin"
		files {
			"main.cpp",
			"../basic_initialize/btOpenCLUtils.cpp",
			"../basic_initialize/btOpenCLUtils.h"
		}
	end
end

-- One project per supported vendor; vendors without an SDK are skipped by createProject.
createProject("AMD")
createProject("Intel")
createProject("NVIDIA")
createProject("Apple")

View File

@@ -0,0 +1,69 @@
///original author: Erwin Coumans
#include "btOpenCLUtils.h"
#include "../parallel_primitives/host/btOpenCLArray.h"
#include "../parallel_primitives/host/btLauncherCL.h"
#include <stdio.h>
// Turns its argument tokens into a string literal, so the kernel can be written inline.
#define MSTRINGIFY(A) #A
// OpenCL source of the VectorAdd kernel: each work-item whose global id is below
// numElements writes c[id] = a[id] + b[id]; work-items at or beyond numElements do nothing.
const char* kernelString= MSTRINGIFY(
__kernel void VectorAdd(__global const float* a, __global const float* b, __global float* c, int numElements)
{
int iGID = get_global_id(0);
if (iGID>=numElements)
return;
float aGID = a[iGID];
float bGID = b[iGID];
float result = aGID + bGID;
c[iGID] = result;
}
);
int main(int argc, char* argv[])
{
int ciErrNum = 0;
int preferred_device = -1;
int preferred_platform = -1;
cl_platform_id platformId;
cl_context ctx;
cl_command_queue queue;
cl_device_id device;
cl_kernel addKernel;
ctx = btOpenCLUtils::createContextFromType(CL_DEVICE_TYPE_GPU, &ciErrNum,0,0,preferred_device,preferred_platform,&platformId);
btOpenCLUtils::printPlatformInfo(platformId);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
if (!ctx) {
printf("No OpenCL capable GPU found!");
return 0;
}
device = btOpenCLUtils::getDevice(ctx,0);
queue = clCreateCommandQueue(ctx, device, 0, &ciErrNum);
addKernel = btOpenCLUtils::compileCLKernelFromString(ctx,device,kernelString,"VectorAdd",&ciErrNum);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
int numElements = 32;
btOpenCLArray<float> a(ctx,queue);
btOpenCLArray<float> b(ctx,queue);
btOpenCLArray<float> c(ctx,queue);
for (int i=0;i<numElements;i++)
{
a.push_back(float(i));
b.push_back(float(i));
}
c.resize(numElements);
btLauncherCL launcher( queue, addKernel);
launcher.setBuffer( a.getBufferCL());
launcher.setBuffer( b.getBufferCL());
launcher.setBuffer( c.getBufferCL());
launcher.setConst( numElements );
launcher.launch1D( numElements);
for (int i=0;i<numElements;i++)
{
float v = c.at(i);
printf("c[%d]=%f\n",i,v);
}
clReleaseCommandQueue(queue);
clReleaseContext(ctx);
return 0;
}

View File

@@ -0,0 +1,37 @@
-- Declares the console application "OpenCL_vector_add_simplified_<vendor>" when
-- findOpenCL reports an OpenCL SDK for that vendor; does nothing otherwise.
-- vendor: vendor name string, passed on to findOpenCL / initOpenCL and appended to the
-- project name and to the name of the linked host library.
function createProject (vendor)
	local sdkFound = findOpenCL(vendor)
	if not sdkFound then
		return
	end

	project ( "OpenCL_vector_add_simplified_" .. vendor)
	initOpenCL(vendor)
	language "C++"
	kind "ConsoleApp"
	targetdir "../../bin"
	links { "OpenCL_lib_parallel_primitives_host_" .. vendor }
	includedirs { "../basic_initialize" }
	files {
		"main.cpp",
		"../basic_initialize/btOpenCLUtils.cpp",
		"../basic_initialize/btOpenCLUtils.h"
	}
end

-- One project per vendor, in the same order as before.
for _, vendorName in ipairs({ "AMD", "NVIDIA", "Intel", "Apple" }) do
	createProject(vendorName)
end