import opencl_course source for a start
This commit is contained in:
44
opencl/basic_initialize/btOpenCLInclude.h
Normal file
44
opencl/basic_initialize/btOpenCLInclude.h
Normal file
@@ -0,0 +1,44 @@
|
||||
/*
|
||||
Bullet Continuous Collision Detection and Physics Library
|
||||
Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
#ifndef BT_OPENCL_INCLUDE_H
|
||||
#define BT_OPENCL_INCLUDE_H
|
||||
|
||||
|
||||
#ifdef __APPLE__
|
||||
#ifdef USE_MINICL
|
||||
#include <MiniCL/cl.h>
|
||||
#else
|
||||
#include <OpenCL/cl.h>
|
||||
#include <OpenCL/cl_ext.h> //clLogMessagesToStderrAPPLE
|
||||
#endif
|
||||
#else
|
||||
#ifdef USE_MINICL
|
||||
#include <MiniCL/cl.h>
|
||||
#else
|
||||
#include <CL/cl.h>
|
||||
#ifdef _WIN32
|
||||
#include "CL/cl_gl.h"
|
||||
#endif //_WIN32
|
||||
#endif
|
||||
#endif //__APPLE__
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
/*
 * oclCHECKERROR(a, b): verify that the OpenCL status code `a` equals the expected code `b`.
 * On mismatch the actual code is printed and assert() fires (a no-op when NDEBUG is defined).
 * Each argument is evaluated exactly once, and the do/while(0) wrapper makes the macro behave
 * like a single statement, so it is safe in an unbraced if/else.
 */
#define oclCHECKERROR(a, b) \
	do { \
		const int oclActual_ = (int)(a); \
		const int oclExpected_ = (int)(b); \
		if (oclActual_ != oclExpected_) { \
			printf("OCL Error : %d\n", oclActual_); \
			assert(oclActual_ == oclExpected_); \
		} \
	} while (0)
|
||||
|
||||
|
||||
#endif //BT_OPENCL_INCLUDE_H
|
||||
|
||||
903
opencl/basic_initialize/btOpenCLUtils.cpp
Normal file
903
opencl/basic_initialize/btOpenCLUtils.cpp
Normal file
@@ -0,0 +1,903 @@
|
||||
/*
|
||||
Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
|
||||
Copyright (C) 2006 - 2011 Sony Computer Entertainment Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
//original author: Roman Ponomarev
|
||||
//cleanup by Erwin Coumans
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#ifdef _WIN32
|
||||
#pragma warning (disable:4996)
|
||||
#endif
|
||||
#include "btOpenCLUtils.h"
|
||||
//#include "btOpenCLInclude.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// Capacity of the fixed-size cl_device_id array that receives the result of clGetDeviceIDs
// when a context is created from a platform.
#define BT_MAX_CL_DEVICES 16 //who needs 16 devices?
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <Windows.h>
|
||||
#endif
|
||||
|
||||
#include <assert.h>
|
||||
// btAssert is a plain alias for the standard assert() in this file.
#define btAssert assert
|
||||
|
||||
|
||||
//Set the preferred platform vendor using the OpenCL SDK
|
||||
// Vendor string of the OpenCL SDK this file was built against, chosen by the
// CL_PLATFORM_* build define; "Unknown Vendor" when none of them is set.
#if defined(CL_PLATFORM_MINI_CL)
#define BT_CL_SDK_VENDOR_NAME "MiniCL, SCEA"
#elif defined(CL_PLATFORM_AMD)
#define BT_CL_SDK_VENDOR_NAME "Advanced Micro Devices, Inc."
#elif defined(CL_PLATFORM_NVIDIA)
#define BT_CL_SDK_VENDOR_NAME "NVIDIA Corporation"
#elif defined(CL_PLATFORM_INTEL)
#define BT_CL_SDK_VENDOR_NAME "Intel(R) Corporation"
#else
#define BT_CL_SDK_VENDOR_NAME "Unknown Vendor"
#endif

static const char* spPlatformVendor = BT_CL_SDK_VENDOR_NAME;

#undef BT_CL_SDK_VENDOR_NAME
|
||||
|
||||
#ifndef CL_PLATFORM_MINI_CL
|
||||
#ifdef _WIN32
|
||||
#include "CL/cl_gl.h"
|
||||
#endif //_WIN32
|
||||
#endif
|
||||
|
||||
// Debug switch: when true, program compilation bypasses the cached-binary lookup and reads the
// kernel source from the caching file name (if one is given) even when a source string was supplied.
bool gDebugForceLoadingFromSource = false;
|
||||
// Debug switch: when true, program compilation does not try to load a cached program binary.
bool gDebugSkipLoadingBinary = false;
|
||||
|
||||
void MyFatalBreakAPPLE( const char * errstr ,
|
||||
const void * private_info ,
|
||||
size_t cb ,
|
||||
void * user_data )
|
||||
{
|
||||
printf("Error: %s\n", errstr);
|
||||
|
||||
const char* patloc = strstr(errstr, "Warning");
|
||||
//find out if it is a warning or error, exit if error
|
||||
|
||||
if (patloc)
|
||||
{
|
||||
printf("warning\n");
|
||||
} else
|
||||
{
|
||||
printf("error\n");
|
||||
btAssert(0);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
// Returns the platform count reported by clGetPlatformIDs.
// *pErrNum (when pErrNum is non-NULL) is written only if the query fails.
int btOpenCLUtils_getNumPlatforms(cl_int* pErrNum)
{
	cl_platform_id platformIds[10] = { 0 };
	cl_uint platformCount = 0;

	const cl_int queryStatus = clGetPlatformIDs(10, platformIds, &platformCount);

	if (queryStatus != CL_SUCCESS && pErrNum != NULL)
	{
		*pErrNum = queryStatus;
	}

	return platformCount;
}
|
||||
|
||||
const char* btOpenCLUtils_getSdkVendorName()
|
||||
{
|
||||
return spPlatformVendor;
|
||||
}
|
||||
|
||||
// Returns the platform id at position platformIndex0 in the list reported by clGetPlatformIDs.
//
// @param platformIndex0  zero-based platform index
// @param pErrNum         optional; receives the OpenCL error code when a query or the
//                        temporary allocation fails (left untouched otherwise)
// @return the platform id, or 0 when the index is out of range or a failure occurred
cl_platform_id btOpenCLUtils_getPlatform(int platformIndex0, cl_int* pErrNum)
{
	cl_platform_id platform = 0;
	cl_uint numPlatforms = 0;

	cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms);
	if (ciErrNum != CL_SUCCESS)
	{
		if (pErrNum != NULL)
			*pErrNum = ciErrNum;
		return platform;
	}

	if (platformIndex0 < 0 || (cl_uint)platformIndex0 >= numPlatforms)
		return platform;

	cl_platform_id* platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id)*numPlatforms);
	if (platforms == NULL)
	{
		if (pErrNum != NULL)
			*pErrNum = CL_OUT_OF_HOST_MEMORY;
		return platform;
	}

	ciErrNum = clGetPlatformIDs(numPlatforms, platforms, NULL);
	if (ciErrNum == CL_SUCCESS)
	{
		platform = platforms[platformIndex0];
	}
	else if (pErrNum != NULL)
	{
		*pErrNum = ciErrNum;
	}

	// released on every path; the error path used to return without freeing
	free(platforms);

	return platform;
}
|
||||
|
||||
// Fills platformInfo with the vendor, name and version strings of the given platform.
// Each query is checked with oclCHECKERROR against CL_SUCCESS.
void btOpenCLUtils::getPlatformInfo(cl_platform_id platform, btOpenCLPlatformInfo* platformInfo)
{
	struct PlatformStringQuery
	{
		cl_platform_info	m_param;
		void*				m_destination;
	};

	const PlatformStringQuery queries[3] =
	{
		{ CL_PLATFORM_VENDOR,	platformInfo->m_platformVendor },
		{ CL_PLATFORM_NAME,		platformInfo->m_platformName },
		{ CL_PLATFORM_VERSION,	platformInfo->m_platformVersion }
	};

	for (int q = 0; q < 3; ++q)
	{
		cl_int ciErrNum = clGetPlatformInfo(platform, queries[q].m_param, BT_MAX_STRING_LENGTH, queries[q].m_destination, NULL);
		oclCHECKERROR(ciErrNum,CL_SUCCESS);
	}
}
|
||||
|
||||
// Queries the vendor, name and version of the platform and prints them to stdout.
void btOpenCLUtils_printPlatformInfo(cl_platform_id platform)
{
	btOpenCLPlatformInfo details;
	btOpenCLUtils::getPlatformInfo(platform, &details);

	const char* vendor = details.m_platformVendor;
	const char* name = details.m_platformName;
	const char* version = details.m_platformVersion;

	printf("Platform info:\n");
	printf(" CL_PLATFORM_VENDOR: \t\t\t%s\n", vendor);
	printf(" CL_PLATFORM_NAME: \t\t\t%s\n", name);
	printf(" CL_PLATFORM_VERSION: \t\t\t%s\n", version);
}
|
||||
|
||||
|
||||
|
||||
// Creates an OpenCL context on the given platform.
//
// @param platform               platform to use; when NULL no context properties are passed
// @param deviceType             device type filter handed to clGetDeviceIDs
// @param pErrNum                optional; receives the status of clGetDeviceIDs on failure,
//                               otherwise the status of the last clCreateContext call
// @param pGLContext, pGLDC      optional OpenGL context / device context. On _WIN32, when both
//                               are set they are added as CL_GL_CONTEXT_KHR / CL_WGL_HDC_KHR
//                               properties. When pGLContext is set, devices are tried one by
//                               one until a context can be created.
// @param preferredDeviceIndex   when within range (and no GL context is given) a context is
//                               created for that single device; otherwise for all devices
// @param preferredPlatformIndex not used by this function
// @return the context, or 0 when no device was found or creation failed
cl_context btOpenCLUtils_createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLContext, void* pGLDC, int preferredDeviceIndex, int preferredPlatformIndex)
{
	cl_context retContext = 0;
	cl_int ciErrNum = CL_SUCCESS;
	cl_device_id devices[BT_MAX_CL_DEVICES];
	cl_uint num_devices = 0;

	/*
	 * If we could find our platform, use it. Otherwise pass a NULL and get whatever the
	 * implementation thinks we should be using.
	 */
	cl_context_properties cps[7] = {0,0,0,0,0,0,0};
	cps[0] = CL_CONTEXT_PLATFORM;
	cps[1] = (cl_context_properties)platform;
#ifdef _WIN32
	if (pGLContext && pGLDC)
	{
		cps[2] = CL_GL_CONTEXT_KHR;
		cps[3] = (cl_context_properties)pGLContext;
		cps[4] = CL_WGL_HDC_KHR;
		cps[5] = (cl_context_properties)pGLDC;
	}
#endif //_WIN32
	cl_context_properties* cprops = (NULL == platform) ? NULL : cps;

	ciErrNum = clGetDeviceIDs(platform, deviceType, BT_MAX_CL_DEVICES, devices, &num_devices);
	if (ciErrNum<0)
	{
		printf("clGetDeviceIDs returned %d\n",ciErrNum);
		// let the caller see why no context could be created
		if (pErrNum != NULL)
			*pErrNum = ciErrNum;
		return 0;
	}

	// clGetDeviceIDs reports the number of matching devices, which can be larger than
	// the number of entries it was allowed to write; never index past the array.
	if (num_devices > BT_MAX_CL_DEVICES)
		num_devices = BT_MAX_CL_DEVICES;

	if (!num_devices)
		return 0;

	if (pGLContext)
	{
		//search for the GPU that relates to the OpenCL context
		for (cl_uint i=0;i<num_devices;i++)
		{
			retContext = clCreateContext(cprops,1,&devices[i],NULL,NULL,&ciErrNum);
			if (ciErrNum==CL_SUCCESS)
				break;
		}
	}
	else if (preferredDeviceIndex>=0 && (cl_uint)preferredDeviceIndex<num_devices)
	{
		//create a context of the preferred device index
		retContext = clCreateContext(cprops,1,&devices[preferredDeviceIndex],NULL,NULL,&ciErrNum);
	}
	else
	{
		//create a context of all devices
#if defined (__APPLE__)
		retContext = clCreateContext(cprops,num_devices,devices,MyFatalBreakAPPLE,NULL,&ciErrNum);
#else
		printf("numDevices=%d\n",(int)num_devices);
		retContext = clCreateContext(cprops,num_devices,devices,NULL,NULL,&ciErrNum);
#endif
	}

	if (pErrNum != NULL)
	{
		*pErrNum = ciErrNum;
	}

	return retContext;
}
|
||||
|
||||
// Creates an OpenCL context for the requested device type on the first platform that can
// provide one.
//
// Platform order: the platform at preferredPlatformIndex (when >= 0 and found) is moved to the
// front and the search stops; otherwise a platform whose vendor string equals spPlatformVendor
// is moved to the front. Platforms are then tried in that order.
//
// @param pErrNum        optional; receives the error code of a failed platform query, or
//                       whatever btOpenCLUtils_createContextFromPlatform reports
// @param retPlatformId  optional; receives the platform the returned context was created on
// @return the context, or 0 when none could be created
cl_context btOpenCLUtils_createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLContext, void* pGLDC , int preferredDeviceIndex, int preferredPlatformIndex, cl_platform_id* retPlatformId)
{
	cl_uint numPlatforms = 0;
	cl_context retContext = 0;
	cl_uint i;

	cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms);
	if(ciErrNum != CL_SUCCESS)
	{
		if(pErrNum != NULL) *pErrNum = ciErrNum;
		return NULL;
	}
	if (numPlatforms == 0)
		return retContext;

	cl_platform_id* platforms = (cl_platform_id*) malloc (sizeof(cl_platform_id)*numPlatforms);
	if (platforms == NULL)
	{
		if(pErrNum != NULL) *pErrNum = CL_OUT_OF_HOST_MEMORY;
		return NULL;
	}

	ciErrNum = clGetPlatformIDs(numPlatforms, platforms, NULL);
	if(ciErrNum != CL_SUCCESS)
	{
		if(pErrNum != NULL) *pErrNum = ciErrNum;
		free(platforms);
		return NULL;
	}

	// move the preferred platform to the front of the list
	for (i = 0; i < numPlatforms; ++i)
	{
		char pbuf[128];
		ciErrNum = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL);
		if(ciErrNum != CL_SUCCESS)
		{
			if(pErrNum != NULL) *pErrNum = ciErrNum;
			free(platforms);	// this path used to leak the platform list
			return NULL;
		}

		const bool isPreferredIndex = (preferredPlatformIndex>=0) && (i==(cl_uint)preferredPlatformIndex);
		if (isPreferredIndex || !strcmp(pbuf, spPlatformVendor))
		{
			cl_platform_id tmpPlatform = platforms[0];
			platforms[0] = platforms[i];
			platforms[i] = tmpPlatform;
		}
		if (isPreferredIndex)
			break;	// an explicit index wins over the vendor match
	}

	for (i = 0; i < numPlatforms; ++i)
	{
		cl_platform_id platform = platforms[i];
		assert(platform);

		retContext = btOpenCLUtils_createContextFromPlatform(platform,deviceType,pErrNum,pGLContext,pGLDC,preferredDeviceIndex,preferredPlatformIndex);

		if (retContext)
		{
			// the queried strings are not used further; the call is kept because it
			// validates the platform queries through oclCHECKERROR
			btOpenCLPlatformInfo platformInfo;
			btOpenCLUtils::getPlatformInfo(platform, &platformInfo);

			if (retPlatformId)
				*retPlatformId = platform;

			break;
		}
	}

	free (platforms);
	return retContext;
}
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of the nth device from the context
|
||||
//!
|
||||
//! @return the id or -1 when out of range
|
||||
//! @param cxMainContext OpenCL context
|
||||
//! @param device_idx index of the device of interest
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
cl_device_id btOpenCLUtils_getDevice(cl_context cxMainContext, int deviceIndex)
|
||||
{
|
||||
assert(cxMainContext);
|
||||
|
||||
size_t szParmDataBytes;
|
||||
cl_device_id* cdDevices;
|
||||
cl_device_id device ;
|
||||
|
||||
// get the list of devices associated with context
|
||||
clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
|
||||
|
||||
if( szParmDataBytes / sizeof(cl_device_id) < (unsigned int)deviceIndex ) {
|
||||
return (cl_device_id)-1;
|
||||
}
|
||||
|
||||
cdDevices = (cl_device_id*) malloc(szParmDataBytes);
|
||||
|
||||
clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
|
||||
|
||||
device = cdDevices[deviceIndex];
|
||||
free(cdDevices);
|
||||
|
||||
return device;
|
||||
}
|
||||
|
||||
// Returns the number of devices associated with the context, computed from the byte size
// of its CL_CONTEXT_DEVICES list. Returns 0 when the query fails.
int btOpenCLUtils_getNumDevices(cl_context cxMainContext)
{
	size_t szParamDataBytes = 0;

	// without this check a failed query left the size uninitialized
	if (clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, 0, NULL, &szParamDataBytes) != CL_SUCCESS)
		return 0;

	return (int)(szParamDataBytes / sizeof(cl_device_id));
}
|
||||
|
||||
|
||||
|
||||
void btOpenCLUtils::getDeviceInfo(cl_device_id device, btOpenCLDeviceInfo* info)
|
||||
{
|
||||
// CL_DEVICE_NAME
|
||||
clGetDeviceInfo(device, CL_DEVICE_NAME, BT_MAX_STRING_LENGTH, &info->m_deviceName, NULL);
|
||||
|
||||
// CL_DEVICE_VENDOR
|
||||
clGetDeviceInfo(device, CL_DEVICE_VENDOR, BT_MAX_STRING_LENGTH, &info->m_deviceVendor, NULL);
|
||||
|
||||
// CL_DRIVER_VERSION
|
||||
clGetDeviceInfo(device, CL_DRIVER_VERSION, BT_MAX_STRING_LENGTH, &info->m_driverVersion, NULL);
|
||||
|
||||
// CL_DEVICE_INFO
|
||||
clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(cl_device_type), &info->m_deviceType, NULL);
|
||||
|
||||
// CL_DEVICE_MAX_COMPUTE_UNITS
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(info->m_computeUnits), &info->m_computeUnits, NULL);
|
||||
|
||||
// CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(info->m_workitemDims), &info->m_workitemDims, NULL);
|
||||
|
||||
// CL_DEVICE_MAX_WORK_ITEM_SIZES
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(info->m_workItemSize), &info->m_workItemSize, NULL);
|
||||
|
||||
// CL_DEVICE_MAX_WORK_GROUP_SIZE
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(info->m_workgroupSize), &info->m_workgroupSize, NULL);
|
||||
|
||||
// CL_DEVICE_MAX_CLOCK_FREQUENCY
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(info->m_clockFrequency), &info->m_clockFrequency, NULL);
|
||||
|
||||
// CL_DEVICE_ADDRESS_BITS
|
||||
clGetDeviceInfo(device, CL_DEVICE_ADDRESS_BITS, sizeof(info->m_addressBits), &info->m_addressBits, NULL);
|
||||
|
||||
// CL_DEVICE_MAX_MEM_ALLOC_SIZE
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(info->m_maxMemAllocSize), &info->m_maxMemAllocSize, NULL);
|
||||
|
||||
// CL_DEVICE_GLOBAL_MEM_SIZE
|
||||
clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(info->m_globalMemSize), &info->m_globalMemSize, NULL);
|
||||
|
||||
// CL_DEVICE_ERROR_CORRECTION_SUPPORT
|
||||
clGetDeviceInfo(device, CL_DEVICE_ERROR_CORRECTION_SUPPORT, sizeof(info->m_errorCorrectionSupport), &info->m_errorCorrectionSupport, NULL);
|
||||
|
||||
// CL_DEVICE_LOCAL_MEM_TYPE
|
||||
clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof(info->m_localMemType), &info->m_localMemType, NULL);
|
||||
|
||||
// CL_DEVICE_LOCAL_MEM_SIZE
|
||||
clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(info->m_localMemSize), &info->m_localMemSize, NULL);
|
||||
|
||||
// CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(info->m_constantBufferSize), &info->m_constantBufferSize, NULL);
|
||||
|
||||
// CL_DEVICE_QUEUE_PROPERTIES
|
||||
clGetDeviceInfo(device, CL_DEVICE_QUEUE_PROPERTIES, sizeof(info->m_queueProperties), &info->m_queueProperties, NULL);
|
||||
|
||||
// CL_DEVICE_IMAGE_SUPPORT
|
||||
clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(info->m_imageSupport), &info->m_imageSupport, NULL);
|
||||
|
||||
// CL_DEVICE_MAX_READ_IMAGE_ARGS
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_READ_IMAGE_ARGS, sizeof(info->m_maxReadImageArgs), &info->m_maxReadImageArgs, NULL);
|
||||
|
||||
// CL_DEVICE_MAX_WRITE_IMAGE_ARGS
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, sizeof(info->m_maxWriteImageArgs), &info->m_maxWriteImageArgs, NULL);
|
||||
|
||||
// CL_DEVICE_IMAGE2D_MAX_WIDTH, CL_DEVICE_IMAGE2D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_WIDTH, CL_DEVICE_IMAGE3D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_DEPTH
|
||||
clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &info->m_image2dMaxWidth, NULL);
|
||||
clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &info->m_image2dMaxHeight, NULL);
|
||||
clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof(size_t), &info->m_image3dMaxWidth, NULL);
|
||||
clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof(size_t), &info->m_image3dMaxHeight, NULL);
|
||||
clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof(size_t), &info->m_image3dMaxDepth, NULL);
|
||||
|
||||
// CL_DEVICE_EXTENSIONS: get device extensions, and if any then parse & log the string onto separate lines
|
||||
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, BT_MAX_STRING_LENGTH, &info->m_deviceExtensions, NULL);
|
||||
|
||||
// CL_DEVICE_PREFERRED_VECTOR_WIDTH_<type>
|
||||
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, sizeof(cl_uint), &info->m_vecWidthChar, NULL);
|
||||
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, sizeof(cl_uint), &info->m_vecWidthShort, NULL);
|
||||
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), &info->m_vecWidthInt, NULL);
|
||||
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, sizeof(cl_uint), &info->m_vecWidthLong, NULL);
|
||||
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, sizeof(cl_uint), &info->m_vecWidthFloat, NULL);
|
||||
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof(cl_uint), &info->m_vecWidthDouble, NULL);
|
||||
}
|
||||
|
||||
|
||||
// Queries the device properties through btOpenCLUtils::getDeviceInfo and prints them to stdout.
//
// Size-like values (work-item sizes, work-group size, image limits) are cast to unsigned int
// before being printed with %u, the same way the memory sizes already were, so the format
// specifier matches the argument regardless of the width of the stored type.
void btOpenCLUtils_printDeviceInfo(cl_device_id device)
{
	btOpenCLDeviceInfo info;
	btOpenCLUtils::getDeviceInfo(device,&info);

	printf("Device Info:\n");
	printf(" CL_DEVICE_NAME: \t\t\t%s\n", info.m_deviceName);
	printf(" CL_DEVICE_VENDOR: \t\t\t%s\n", info.m_deviceVendor);
	printf(" CL_DRIVER_VERSION: \t\t\t%s\n", info.m_driverVersion);

	// the device type is a bit field, so more than one line may be printed
	if( info.m_deviceType & CL_DEVICE_TYPE_CPU )
		printf(" CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_CPU");
	if( info.m_deviceType & CL_DEVICE_TYPE_GPU )
		printf(" CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_GPU");
	if( info.m_deviceType & CL_DEVICE_TYPE_ACCELERATOR )
		printf(" CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR");
	if( info.m_deviceType & CL_DEVICE_TYPE_DEFAULT )
		printf(" CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT");

	printf(" CL_DEVICE_MAX_COMPUTE_UNITS:\t\t%u\n", (unsigned int)info.m_computeUnits);
	printf(" CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:\t%u\n", (unsigned int)info.m_workitemDims);
	printf(" CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", (unsigned int)info.m_workItemSize[0], (unsigned int)info.m_workItemSize[1], (unsigned int)info.m_workItemSize[2]);
	printf(" CL_DEVICE_MAX_WORK_GROUP_SIZE:\t%u\n", (unsigned int)info.m_workgroupSize);
	printf(" CL_DEVICE_MAX_CLOCK_FREQUENCY:\t%u MHz\n", (unsigned int)info.m_clockFrequency);
	printf(" CL_DEVICE_ADDRESS_BITS:\t\t%u\n", (unsigned int)info.m_addressBits);
	printf(" CL_DEVICE_MAX_MEM_ALLOC_SIZE:\t\t%u MByte\n", (unsigned int)(info.m_maxMemAllocSize/ (1024 * 1024)));
	printf(" CL_DEVICE_GLOBAL_MEM_SIZE:\t\t%u MByte\n", (unsigned int)(info.m_globalMemSize/ (1024 * 1024)));
	printf(" CL_DEVICE_ERROR_CORRECTION_SUPPORT:\t%s\n", info.m_errorCorrectionSupport== CL_TRUE ? "yes" : "no");
	printf(" CL_DEVICE_LOCAL_MEM_TYPE:\t\t%s\n", info.m_localMemType == 1 ? "local" : "global");
	printf(" CL_DEVICE_LOCAL_MEM_SIZE:\t\t%u KByte\n", (unsigned int)(info.m_localMemSize / 1024));
	printf(" CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:\t%u KByte\n", (unsigned int)(info.m_constantBufferSize / 1024));

	if( info.m_queueProperties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE )
		printf(" CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE");
	if( info.m_queueProperties & CL_QUEUE_PROFILING_ENABLE )
		printf(" CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_PROFILING_ENABLE");

	printf(" CL_DEVICE_IMAGE_SUPPORT:\t\t%u\n", (unsigned int)info.m_imageSupport);

	printf(" CL_DEVICE_MAX_READ_IMAGE_ARGS:\t%u\n", (unsigned int)info.m_maxReadImageArgs);
	printf(" CL_DEVICE_MAX_WRITE_IMAGE_ARGS:\t%u\n", (unsigned int)info.m_maxWriteImageArgs);
	printf("\n CL_DEVICE_IMAGE <dim>");
	printf("\t\t\t2D_MAX_WIDTH\t %u\n", (unsigned int)info.m_image2dMaxWidth);
	printf("\t\t\t\t\t2D_MAX_HEIGHT\t %u\n", (unsigned int)info.m_image2dMaxHeight);
	printf("\t\t\t\t\t3D_MAX_WIDTH\t %u\n", (unsigned int)info.m_image3dMaxWidth);
	printf("\t\t\t\t\t3D_MAX_HEIGHT\t %u\n", (unsigned int)info.m_image3dMaxHeight);
	printf("\t\t\t\t\t3D_MAX_DEPTH\t %u\n", (unsigned int)info.m_image3dMaxDepth);

	// an empty extension string also counts as "no extensions"
	if (info.m_deviceExtensions != 0 && info.m_deviceExtensions[0] != '\0')
		printf("\n CL_DEVICE_EXTENSIONS:%s\n",info.m_deviceExtensions);
	else
		printf(" CL_DEVICE_EXTENSIONS: None\n");

	printf(" CL_DEVICE_PREFERRED_VECTOR_WIDTH_<t>\t");
	printf("CHAR %u, SHORT %u, INT %u,LONG %u, FLOAT %u, DOUBLE %u\n\n\n",
		(unsigned int)info.m_vecWidthChar, (unsigned int)info.m_vecWidthShort, (unsigned int)info.m_vecWidthInt,
		(unsigned int)info.m_vecWidthLong, (unsigned int)info.m_vecWidthFloat, (unsigned int)info.m_vecWidthDouble);
}
|
||||
|
||||
|
||||
// Returns a pointer into `name` just past the last occurrence of `pattern`,
// or `name` itself when the pattern does not occur.
// Used to drop the directory part of a path, e.g. strip2("dir/file.cl", "/") yields "file.cl".
// An empty pattern matches nothing and returns `name` (it used to loop forever).
static const char* strip2(const char* name, const char* pattern)
{
	const size_t patlen = strlen(pattern);
	const char* remainder = name;
	const char* match;

	if (patlen == 0)
		return name;

	// hop over every occurrence; remainder ends up behind the last one
	while ((match = strstr(remainder, pattern)) != NULL)
	{
		remainder = match + patlen;
	}
	return remainder;
}
|
||||
|
||||
cl_program btOpenCLUtils_compileCLProgramFromString(cl_context clContext, cl_device_id device, const char* kernelSourceOrg, cl_int* pErrNum, const char* additionalMacrosArg , const char* clFileNameForCaching)
|
||||
{
|
||||
const char* additionalMacros = additionalMacrosArg?additionalMacrosArg:"";
|
||||
|
||||
cl_program m_cpProgram=0;
|
||||
cl_int status;
|
||||
|
||||
#ifdef _WIN32
|
||||
char binaryFileName[BT_MAX_STRING_LENGTH];
|
||||
char* bla=0;
|
||||
|
||||
if (clFileNameForCaching && !(gDebugSkipLoadingBinary||gDebugForceLoadingFromSource) )
|
||||
{
|
||||
|
||||
char deviceName[256];
|
||||
char driverVersion[256];
|
||||
const char* strippedName;
|
||||
int fileUpToDate = 0;
|
||||
int binaryFileValid=0;
|
||||
FILETIME modtimeBinary;
|
||||
|
||||
clGetDeviceInfo(device, CL_DEVICE_NAME, 256, &deviceName, NULL);
|
||||
clGetDeviceInfo(device, CL_DRIVER_VERSION, 256, &driverVersion, NULL);
|
||||
|
||||
|
||||
strippedName = strip2(clFileNameForCaching,"\\");
|
||||
strippedName = strip2(strippedName,"/");
|
||||
|
||||
#ifdef _WIN32
|
||||
sprintf_s(binaryFileName,BT_MAX_STRING_LENGTH,"cache/%s.%s.%s.bin",strippedName, deviceName,driverVersion );
|
||||
#else
|
||||
sprintf(binaryFileName,"cache/%s.%s.%s.bin",strippedName, deviceName,driverVersion );
|
||||
#endif
|
||||
|
||||
|
||||
//printf("searching for %s\n", binaryFileName);
|
||||
|
||||
|
||||
|
||||
|
||||
CreateDirectory("cache",0);
|
||||
{
|
||||
|
||||
HANDLE binaryFileHandle = CreateFile(binaryFileName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
|
||||
if (binaryFileHandle ==INVALID_HANDLE_VALUE)
|
||||
{
|
||||
DWORD errorCode;
|
||||
errorCode = GetLastError();
|
||||
switch (errorCode)
|
||||
{
|
||||
case ERROR_FILE_NOT_FOUND:
|
||||
{
|
||||
printf("\nCached file not found %s\n", binaryFileName);
|
||||
break;
|
||||
}
|
||||
case ERROR_PATH_NOT_FOUND:
|
||||
{
|
||||
printf("\nCached file path not found %s\n", binaryFileName);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
{
|
||||
printf("\nFailed reading cached file with errorCode = %d\n", errorCode);
|
||||
}
|
||||
}
|
||||
} else
|
||||
{
|
||||
if (GetFileTime(binaryFileHandle, NULL, NULL, &modtimeBinary)==0)
|
||||
{
|
||||
DWORD errorCode;
|
||||
errorCode = GetLastError();
|
||||
printf("\nGetFileTime errorCode = %d\n", errorCode);
|
||||
} else
|
||||
{
|
||||
binaryFileValid = 1;
|
||||
}
|
||||
CloseHandle(binaryFileHandle);
|
||||
}
|
||||
|
||||
if (binaryFileValid)
|
||||
{
|
||||
HANDLE srcFileHandle = CreateFile(clFileNameForCaching,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
|
||||
|
||||
if (srcFileHandle==INVALID_HANDLE_VALUE)
|
||||
{
|
||||
const char* prefix[]={"../","../../","../../../","../../../../"};
|
||||
for (int i=0;(srcFileHandle==INVALID_HANDLE_VALUE) && i<3;i++)
|
||||
{
|
||||
char relativeFileName[1024];
|
||||
sprintf(relativeFileName,"%s%s",prefix[i],clFileNameForCaching);
|
||||
srcFileHandle = CreateFile(relativeFileName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
if (srcFileHandle!=INVALID_HANDLE_VALUE)
|
||||
{
|
||||
FILETIME modtimeSrc;
|
||||
if (GetFileTime(srcFileHandle, NULL, NULL, &modtimeSrc)==0)
|
||||
{
|
||||
DWORD errorCode;
|
||||
errorCode = GetLastError();
|
||||
printf("\nGetFileTime errorCode = %d\n", errorCode);
|
||||
}
|
||||
if ( ( modtimeSrc.dwHighDateTime < modtimeBinary.dwHighDateTime)
|
||||
||(( modtimeSrc.dwHighDateTime == modtimeBinary.dwHighDateTime)&&(modtimeSrc.dwLowDateTime <= modtimeBinary.dwLowDateTime)))
|
||||
{
|
||||
fileUpToDate=1;
|
||||
} else
|
||||
{
|
||||
printf("\nCached binary file out-of-date (%s)\n",binaryFileName);
|
||||
}
|
||||
CloseHandle(srcFileHandle);
|
||||
}
|
||||
else
|
||||
{
|
||||
#ifdef _DEBUG
|
||||
DWORD errorCode;
|
||||
errorCode = GetLastError();
|
||||
switch (errorCode)
|
||||
{
|
||||
case ERROR_FILE_NOT_FOUND:
|
||||
{
|
||||
printf("\nSrc file not found %s\n", clFileNameForCaching);
|
||||
break;
|
||||
}
|
||||
case ERROR_PATH_NOT_FOUND:
|
||||
{
|
||||
printf("\nSrc path not found %s\n", clFileNameForCaching);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
{
|
||||
printf("\nnSrc file reading errorCode = %d\n", errorCode);
|
||||
}
|
||||
}
|
||||
|
||||
//we should make sure the src file exists so we can verify the timestamp with binary
|
||||
assert(0);
|
||||
fileUpToDate = false;
|
||||
#else
|
||||
//if we cannot find the source, assume it is OK in release builds
|
||||
fileUpToDate = true;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
if( fileUpToDate)
|
||||
{
|
||||
#ifdef _WIN32
|
||||
FILE* file;
|
||||
if (fopen_s(&file,binaryFileName, "rb")!=0)
|
||||
file=0;
|
||||
#else
|
||||
FILE* file = fopen(binaryFileName, "rb");
|
||||
#endif
|
||||
|
||||
if (file)
|
||||
{
|
||||
size_t binarySize=0;
|
||||
char* binary =0;
|
||||
|
||||
fseek( file, 0L, SEEK_END );
|
||||
binarySize = ftell( file );
|
||||
rewind( file );
|
||||
binary = (char*)malloc(sizeof(char)*binarySize);
|
||||
fread( binary, sizeof(char), binarySize, file );
|
||||
fclose( file );
|
||||
|
||||
m_cpProgram = clCreateProgramWithBinary( clContext, 1,&device, &binarySize, (const unsigned char**)&binary, 0, &status );
|
||||
btAssert( status == CL_SUCCESS );
|
||||
status = clBuildProgram( m_cpProgram, 1, &device, additionalMacros, 0, 0 );
|
||||
btAssert( status == CL_SUCCESS );
|
||||
|
||||
if( status != CL_SUCCESS )
|
||||
{
|
||||
char *build_log;
|
||||
size_t ret_val_size;
|
||||
clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
|
||||
build_log = (char*)malloc(sizeof(char)*(ret_val_size+1));
|
||||
clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
|
||||
build_log[ret_val_size] = '\0';
|
||||
printf("%s\n", build_log);
|
||||
free (build_log);
|
||||
btAssert(0);
|
||||
m_cpProgram = 0;
|
||||
}
|
||||
free (binary);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
#endif //_WIN32
|
||||
|
||||
if (!m_cpProgram)
|
||||
{
|
||||
|
||||
cl_int localErrNum;
|
||||
char* compileFlags;
|
||||
int flagsize;
|
||||
|
||||
|
||||
|
||||
const char* kernelSource = kernelSourceOrg;
|
||||
|
||||
if (!kernelSourceOrg || gDebugForceLoadingFromSource)
|
||||
{
|
||||
if (clFileNameForCaching)
|
||||
{
|
||||
|
||||
FILE* file = fopen(clFileNameForCaching, "rb");
|
||||
//in many cases the relative path is a few levels up the directory hierarchy, so try it
|
||||
if (!file)
|
||||
{
|
||||
const char* prefix[]={"../","../../","../../../","../../../../"};
|
||||
for (int i=0;!file && i<3;i++)
|
||||
{
|
||||
char relativeFileName[1024];
|
||||
sprintf(relativeFileName,"%s%s",prefix[i],clFileNameForCaching);
|
||||
file = fopen(relativeFileName, "rb");
|
||||
}
|
||||
}
|
||||
|
||||
if (file)
|
||||
{
|
||||
char* kernelSrc=0;
|
||||
fseek( file, 0L, SEEK_END );
|
||||
int kernelSize = ftell( file );
|
||||
rewind( file );
|
||||
kernelSrc = (char*)malloc(kernelSize+1);
|
||||
int readBytes = fread((void*)kernelSrc,1,kernelSize, file);
|
||||
kernelSrc[kernelSize] = 0;
|
||||
fclose(file);
|
||||
kernelSource = kernelSrc;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
size_t program_length = kernelSource ? strlen(kernelSource) : 0;
|
||||
#ifdef MAC //or __APPLE__?
|
||||
char* flags = "-cl-mad-enable -DMAC -DGUID_ARG";
|
||||
#else
|
||||
//const char* flags = "-DGUID_ARG= -fno-alias";
|
||||
const char* flags = "-DGUID_ARG= ";
|
||||
#endif
|
||||
|
||||
|
||||
m_cpProgram = clCreateProgramWithSource(clContext, 1, (const char**)&kernelSource, &program_length, &localErrNum);
|
||||
if (localErrNum!= CL_SUCCESS)
|
||||
{
|
||||
if (pErrNum)
|
||||
*pErrNum = localErrNum;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Build the program with 'mad' Optimization option
|
||||
|
||||
|
||||
|
||||
flagsize = sizeof(char)*(strlen(additionalMacros) + strlen(flags) + 5);
|
||||
compileFlags = (char*) malloc(flagsize);
|
||||
#ifdef _WIN32
|
||||
sprintf_s(compileFlags,flagsize, "%s %s", flags, additionalMacros);
|
||||
#else
|
||||
sprintf(compileFlags, "%s %s", flags, additionalMacros);
|
||||
#endif
|
||||
localErrNum = clBuildProgram(m_cpProgram, 1, &device, compileFlags, NULL, NULL);
|
||||
if (localErrNum!= CL_SUCCESS)
|
||||
{
|
||||
char *build_log;
|
||||
size_t ret_val_size;
|
||||
clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
|
||||
build_log = (char*) malloc(sizeof(char)*(ret_val_size+1));
|
||||
clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
|
||||
|
||||
// to be carefully, terminate with \0
|
||||
// there's no information in the reference whether the string is 0 terminated or not
|
||||
build_log[ret_val_size] = '\0';
|
||||
|
||||
|
||||
printf("Error in clBuildProgram, Line %u in file %s, Log: \n%s\n !!!\n\n", __LINE__, __FILE__, build_log);
|
||||
free (build_log);
|
||||
if (pErrNum)
|
||||
*pErrNum = localErrNum;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
|
||||
if( clFileNameForCaching )
|
||||
{ // write to binary
|
||||
|
||||
cl_uint numAssociatedDevices;
|
||||
status = clGetProgramInfo( m_cpProgram, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &numAssociatedDevices, 0 );
|
||||
btAssert( status == CL_SUCCESS );
|
||||
if (numAssociatedDevices==1)
|
||||
{
|
||||
|
||||
size_t binarySize;
|
||||
char* binary ;
|
||||
|
||||
status = clGetProgramInfo( m_cpProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binarySize, 0 );
|
||||
btAssert( status == CL_SUCCESS );
|
||||
|
||||
binary = (char*)malloc(sizeof(char)*binarySize);
|
||||
|
||||
status = clGetProgramInfo( m_cpProgram, CL_PROGRAM_BINARIES, sizeof(char*), &binary, 0 );
|
||||
btAssert( status == CL_SUCCESS );
|
||||
|
||||
{
|
||||
FILE* file=0;
|
||||
#ifdef _WIN32
|
||||
if (fopen_s(&file,binaryFileName, "wb")!=0)
|
||||
file=0;
|
||||
#else
|
||||
file = fopen(binaryFileName, "wb");
|
||||
#endif
|
||||
if (file)
|
||||
{
|
||||
fwrite( binary, sizeof(char), binarySize, file );
|
||||
fclose( file );
|
||||
} else
|
||||
{
|
||||
printf("cannot write file %s\n", binaryFileName);
|
||||
}
|
||||
}
|
||||
|
||||
free (binary);
|
||||
}
|
||||
}
|
||||
#endif //_WIN32
|
||||
|
||||
free(compileFlags);
|
||||
|
||||
}
|
||||
return m_cpProgram;
|
||||
}
|
||||
|
||||
|
||||
/// Create the kernel called kernelName.
/// When prog is 0 the kernel source is first compiled into a temporary program
/// (using additionalMacros as extra build options); that temporary program is
/// released again before returning, on success as well as on failure.
/// When prog is non-zero it is used as-is and stays owned by the caller.
/// Returns 0 on failure. If pErrNum is non-null it receives the OpenCL error code
/// (CL_SUCCESS on success).
cl_kernel btOpenCLUtils_compileCLKernelFromString(cl_context clContext, cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum, cl_program prog, const char* additionalMacros )
{
	cl_kernel kernel = 0;
	cl_int localErrNum = CL_SUCCESS;
	// a program compiled in here belongs to this function and must not outlive it
	const int ownsProgram = (prog == 0);
	cl_program m_cpProgram = prog;

	printf("compiling kernel %s ",kernelName);

	if (!m_cpProgram)
	{
		cl_int buildErrNum = CL_SUCCESS;
		// the program compiler calls strlen() on the macro string, so never hand it a null pointer
		const char* macros = additionalMacros ? additionalMacros : "";

		m_cpProgram = btOpenCLUtils_compileCLProgramFromString(clContext,device,kernelSource,&buildErrNum, macros,0);
		if (!m_cpProgram)
		{
			// Report the build failure itself instead of passing a null program to
			// clCreateKernel, which would replace it by a misleading "cannot find kernel" error.
			if (pErrNum)
				*pErrNum = (buildErrNum != CL_SUCCESS) ? buildErrNum : CL_INVALID_PROGRAM;
			assert(0);
			return 0;
		}
	}

	// Create the kernel
	kernel = clCreateKernel(m_cpProgram, kernelName, &localErrNum);
	if (localErrNum != CL_SUCCESS)
	{
		printf("Error in clCreateKernel, Line %u in file %s, cannot find kernel function %s !!!\n\n", __LINE__, __FILE__, kernelName);
		// do not leak the temporary program on the error path
		if (ownsProgram)
			clReleaseProgram(m_cpProgram);
		if (pErrNum)
			*pErrNum = localErrNum;
		assert(0);
		return 0;
	}

	if (ownsProgram)
	{
		clReleaseProgram(m_cpProgram);
	}
	printf("ready. \n");

	if (pErrNum)
		*pErrNum = CL_SUCCESS;
	return kernel;
}
|
||||
179
opencl/basic_initialize/btOpenCLUtils.h
Normal file
179
opencl/basic_initialize/btOpenCLUtils.h
Normal file
@@ -0,0 +1,179 @@
|
||||
/*
|
||||
Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
|
||||
Copyright (C) 2006 - 2011 Sony Computer Entertainment Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
//original author: Roman Ponomarev
|
||||
//cleanup by Erwin Coumans
|
||||
|
||||
#ifndef BT_OPENCL_UTILS_H
|
||||
#define BT_OPENCL_UTILS_H
|
||||
|
||||
#include "btOpenCLInclude.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
|
||||
///C API for OpenCL utilities: convenience functions, see below for C++ API
|
||||
|
||||
/// CL Context optionally takes a GL context. This is a generic type because we don't really want this code
|
||||
/// to have to understand GL types. It is a HGLRC in _WIN32 or a GLXContext otherwise.
|
||||
cl_context btOpenCLUtils_createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx , void* pGLDC , int preferredDeviceIndex , int preferredPlatformIndex, cl_platform_id* platformId);
|
||||
|
||||
int btOpenCLUtils_getNumDevices(cl_context cxMainContext);
|
||||
|
||||
cl_device_id btOpenCLUtils_getDevice(cl_context cxMainContext, int nr);
|
||||
|
||||
void btOpenCLUtils_printDeviceInfo(cl_device_id device);
|
||||
|
||||
cl_kernel btOpenCLUtils_compileCLKernelFromString( cl_context clContext,cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum, cl_program prog,const char* additionalMacros);
|
||||
|
||||
//optional
|
||||
cl_program btOpenCLUtils_compileCLProgramFromString( cl_context clContext,cl_device_id device, const char* kernelSource, cl_int* pErrNum,const char* additionalMacros , const char* srcFileNameForCaching);
|
||||
|
||||
//the following optional APIs provide access using specific platform information
|
||||
int btOpenCLUtils_getNumPlatforms(cl_int* pErrNum);
|
||||
|
||||
///get the nr'th platform, where nr is in the range [0..getNumPlatforms)
|
||||
cl_platform_id btOpenCLUtils_getPlatform(int nr, cl_int* pErrNum);
|
||||
|
||||
void btOpenCLUtils_printPlatformInfo(cl_platform_id platform);
|
||||
|
||||
const char* btOpenCLUtils_getSdkVendorName();
|
||||
|
||||
cl_context btOpenCLUtils_createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx , void* pGLDC ,int preferredDeviceIndex , int preferredPlatformIndex);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
||||
#define BT_MAX_STRING_LENGTH 1024
|
||||
|
||||
typedef struct
|
||||
{
|
||||
char m_deviceName[BT_MAX_STRING_LENGTH];
|
||||
char m_deviceVendor[BT_MAX_STRING_LENGTH];
|
||||
char m_driverVersion[BT_MAX_STRING_LENGTH];
|
||||
char m_deviceExtensions[BT_MAX_STRING_LENGTH];
|
||||
|
||||
cl_device_type m_deviceType;
|
||||
cl_uint m_computeUnits;
|
||||
size_t m_workitemDims;
|
||||
size_t m_workItemSize[3];
|
||||
size_t m_image2dMaxWidth;
|
||||
size_t m_image2dMaxHeight;
|
||||
size_t m_image3dMaxWidth;
|
||||
size_t m_image3dMaxHeight;
|
||||
size_t m_image3dMaxDepth;
|
||||
size_t m_workgroupSize;
|
||||
cl_uint m_clockFrequency;
|
||||
cl_ulong m_constantBufferSize;
|
||||
cl_ulong m_localMemSize;
|
||||
cl_ulong m_globalMemSize;
|
||||
cl_bool m_errorCorrectionSupport;
|
||||
cl_device_local_mem_type m_localMemType;
|
||||
cl_uint m_maxReadImageArgs;
|
||||
cl_uint m_maxWriteImageArgs;
|
||||
|
||||
|
||||
|
||||
cl_uint m_addressBits;
|
||||
cl_ulong m_maxMemAllocSize;
|
||||
cl_command_queue_properties m_queueProperties;
|
||||
cl_bool m_imageSupport;
|
||||
cl_uint m_vecWidthChar;
|
||||
cl_uint m_vecWidthShort;
|
||||
cl_uint m_vecWidthInt;
|
||||
cl_uint m_vecWidthLong;
|
||||
cl_uint m_vecWidthFloat;
|
||||
cl_uint m_vecWidthDouble;
|
||||
|
||||
} btOpenCLDeviceInfo;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
char m_platformVendor[BT_MAX_STRING_LENGTH];
|
||||
char m_platformName[BT_MAX_STRING_LENGTH];
|
||||
char m_platformVersion[BT_MAX_STRING_LENGTH];
|
||||
} btOpenCLPlatformInfo;
|
||||
|
||||
|
||||
///C++ API for OpenCL utilities: convenience functions
|
||||
struct btOpenCLUtils
|
||||
{
|
||||
/// CL Context optionally takes a GL context. This is a generic type because we don't really want this code
|
||||
/// to have to understand GL types. It is a HGLRC in _WIN32 or a GLXContext otherwise.
|
||||
static inline cl_context createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx = 0, void* pGLDC = 0, int preferredDeviceIndex = -1, int preferredPlatformIndex= - 1, cl_platform_id* platformId=0)
|
||||
{
|
||||
return btOpenCLUtils_createContextFromType(deviceType, pErrNum, pGLCtx , pGLDC , preferredDeviceIndex, preferredPlatformIndex, platformId);
|
||||
}
|
||||
|
||||
static inline int getNumDevices(cl_context cxMainContext)
|
||||
{
|
||||
return btOpenCLUtils_getNumDevices(cxMainContext);
|
||||
}
|
||||
static inline cl_device_id getDevice(cl_context cxMainContext, int nr)
|
||||
{
|
||||
return btOpenCLUtils_getDevice(cxMainContext,nr);
|
||||
}
|
||||
|
||||
static void getDeviceInfo(cl_device_id device, btOpenCLDeviceInfo* info);
|
||||
|
||||
static inline void printDeviceInfo(cl_device_id device)
|
||||
{
|
||||
btOpenCLUtils_printDeviceInfo(device);
|
||||
}
|
||||
|
||||
static inline cl_kernel compileCLKernelFromString( cl_context clContext,cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum=0, cl_program prog=0,const char* additionalMacros = "" )
|
||||
{
|
||||
return btOpenCLUtils_compileCLKernelFromString(clContext,device, kernelSource, kernelName, pErrNum, prog,additionalMacros);
|
||||
}
|
||||
|
||||
//optional
|
||||
static inline cl_program compileCLProgramFromString( cl_context clContext,cl_device_id device, const char* kernelSource, cl_int* pErrNum=0,const char* additionalMacros = "" , const char* srcFileNameForCaching=0)
|
||||
{
|
||||
return btOpenCLUtils_compileCLProgramFromString(clContext,device, kernelSource, pErrNum,additionalMacros, srcFileNameForCaching);
|
||||
}
|
||||
|
||||
//the following optional APIs provide access using specific platform information
|
||||
static inline int getNumPlatforms(cl_int* pErrNum=0)
|
||||
{
|
||||
return btOpenCLUtils_getNumPlatforms(pErrNum);
|
||||
}
|
||||
///get the nr'th platform, where nr is in the range [0..getNumPlatforms)
|
||||
static inline cl_platform_id getPlatform(int nr, cl_int* pErrNum=0)
|
||||
{
|
||||
return btOpenCLUtils_getPlatform(nr,pErrNum);
|
||||
}
|
||||
|
||||
static void getPlatformInfo(cl_platform_id platform, btOpenCLPlatformInfo* platformInfo);
|
||||
|
||||
static inline void printPlatformInfo(cl_platform_id platform)
|
||||
{
|
||||
btOpenCLUtils_printPlatformInfo(platform);
|
||||
}
|
||||
|
||||
static inline const char* getSdkVendorName()
|
||||
{
|
||||
return btOpenCLUtils_getSdkVendorName();
|
||||
}
|
||||
static inline cl_context createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx = 0, void* pGLDC = 0,int preferredDeviceIndex = -1, int preferredPlatformIndex= -1)
|
||||
{
|
||||
return btOpenCLUtils_createContextFromPlatform(platform, deviceType, pErrNum, pGLCtx,pGLDC,preferredDeviceIndex, preferredPlatformIndex);
|
||||
}
|
||||
};
|
||||
|
||||
#endif //__cplusplus
|
||||
|
||||
#endif // BT_OPENCL_UTILS_H
|
||||
98
opencl/basic_initialize/main.cpp
Normal file
98
opencl/basic_initialize/main.cpp
Normal file
@@ -0,0 +1,98 @@
|
||||
/*
|
||||
Bullet Continuous Collision Detection and Physics Library
|
||||
Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
///original author: Erwin Coumans
|
||||
|
||||
#include "btOpenCLUtils.h"
|
||||
#include <stdio.h>
|
||||
|
||||
cl_context g_cxMainContext;
|
||||
cl_command_queue g_cqCommandQue;
|
||||
|
||||
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
int ciErrNum = 0;
|
||||
|
||||
cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
|
||||
const char* vendorSDK = btOpenCLUtils::getSdkVendorName();
|
||||
|
||||
printf("This program was compiled using the %s OpenCL SDK\n",vendorSDK);
|
||||
int numPlatforms = btOpenCLUtils::getNumPlatforms();
|
||||
printf("Num Platforms = %d\n", numPlatforms);
|
||||
|
||||
for (int i=0;i<numPlatforms;i++)
|
||||
{
|
||||
cl_platform_id platform = btOpenCLUtils::getPlatform(i);
|
||||
btOpenCLPlatformInfo platformInfo;
|
||||
btOpenCLUtils::getPlatformInfo(platform,&platformInfo);
|
||||
printf("--------------------------------\n");
|
||||
printf("Platform info for platform nr %d:\n",i);
|
||||
printf(" CL_PLATFORM_VENDOR: \t\t\t%s\n",platformInfo.m_platformVendor);
|
||||
printf(" CL_PLATFORM_NAME: \t\t\t%s\n",platformInfo.m_platformName);
|
||||
printf(" CL_PLATFORM_VERSION: \t\t\t%s\n",platformInfo.m_platformVersion);
|
||||
|
||||
cl_context context = btOpenCLUtils::createContextFromPlatform(platform,deviceType,&ciErrNum);
|
||||
|
||||
int numDevices = btOpenCLUtils::getNumDevices(context);
|
||||
printf("Num Devices = %d\n", numDevices);
|
||||
for (int j=0;j<numDevices;j++)
|
||||
{
|
||||
cl_device_id dev = btOpenCLUtils::getDevice(context,j);
|
||||
btOpenCLDeviceInfo devInfo;
|
||||
btOpenCLUtils::getDeviceInfo(dev,&devInfo);
|
||||
btOpenCLUtils::printDeviceInfo(dev);
|
||||
}
|
||||
|
||||
clReleaseContext(context);
|
||||
}
|
||||
|
||||
///Easier method to initialize OpenCL using createContextFromType for a GPU
|
||||
deviceType = CL_DEVICE_TYPE_GPU;
|
||||
|
||||
void* glCtx=0;
|
||||
void* glDC = 0;
|
||||
printf("Initialize OpenCL using btOpenCLUtils::createContextFromType for CL_DEVICE_TYPE_GPU\n");
|
||||
g_cxMainContext = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, glCtx, glDC);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
|
||||
if (g_cxMainContext)
|
||||
{
|
||||
int numDev = btOpenCLUtils::getNumDevices(g_cxMainContext);
|
||||
|
||||
for (int i=0;i<numDev;i++)
|
||||
{
|
||||
cl_device_id device;
|
||||
device = btOpenCLUtils::getDevice(g_cxMainContext,i);
|
||||
btOpenCLDeviceInfo clInfo;
|
||||
btOpenCLUtils::getDeviceInfo(device,&clInfo);
|
||||
btOpenCLUtils::printDeviceInfo(device);
|
||||
// create a command-queue
|
||||
g_cqCommandQue = clCreateCommandQueue(g_cxMainContext, device, 0, &ciErrNum);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
//normally you would create and execute kernels using this command queue
|
||||
|
||||
clReleaseCommandQueue(g_cqCommandQue);
|
||||
}
|
||||
|
||||
clReleaseContext(g_cxMainContext);
|
||||
|
||||
}
|
||||
else {
|
||||
printf("No OpenCL capable GPU found!");
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
28
opencl/basic_initialize/premake4.lua
Normal file
28
opencl/basic_initialize/premake4.lua
Normal file
@@ -0,0 +1,28 @@
|
||||
-- Declares the console project "OpenCL_intialize_<vendor>" when findOpenCL reports
-- that the OpenCL SDK of that vendor is available; does nothing otherwise.
function createProject(vendor)

	hasCL = findOpenCL(vendor)

	if (not hasCL) then
		return
	end

	project ("OpenCL_intialize_" .. vendor)

	initOpenCL(vendor)

	language "C++"

	kind "ConsoleApp"
	targetdir "../../bin"

	files {
		"main.cpp",
		"btOpenCLUtils.cpp",
		"btOpenCLUtils.h"
	}

end

-- one project per supported SDK vendor, in this order
for _, vendor in ipairs({ "Apple", "AMD", "Intel", "NVIDIA" }) do
	createProject(vendor)
end
|
||||
171
opencl/lds_bank_conflict/lds_kernels.cl
Normal file
171
opencl/lds_bank_conflict/lds_kernels.cl
Normal file
@@ -0,0 +1,171 @@
|
||||
|
||||
#define TILE_DIM 32
|
||||
#define BLOCK_ROWS 8
|
||||
|
||||
|
||||
/*// simple copy kernel (CUDA)
|
||||
// Used as reference case representing best effective bandwidth.
|
||||
__global__ void copy(float *odata, const float *idata)
|
||||
{
|
||||
int x = blockIdx.x * TILE_DIM + threadIdx.x;
|
||||
int y = blockIdx.y * TILE_DIM + threadIdx.y;
|
||||
int width = gridDim.x * TILE_DIM;
|
||||
|
||||
for (int j = 0; j < TILE_DIM; j+= BLOCK_ROWS)
|
||||
odata[(y+j)*width + x] = idata[(y+j)*width + x];
|
||||
}
|
||||
*/
|
||||
// simple copy kernel (OpenCL)
// Used as reference case representing best effective bandwidth.
// Every work-group copies one TILE_DIM x TILE_DIM tile; each work-item copies
// TILE_DIM/BLOCK_ROWS elements of it, BLOCK_ROWS rows apart.
// Assumes a local size of TILE_DIM x BLOCK_ROWS.
__kernel void copyKernel(__global float* odata, __global const float* idata)
{
	// the tile origin is group id * TILE_DIM (blockIdx * TILE_DIM in the CUDA original);
	// scaling by get_num_groups() is only correct when the grid happens to be TILE_DIM groups wide
	int x = get_group_id(0) * TILE_DIM + get_local_id(0);
	int y = get_group_id(1) * TILE_DIM + get_local_id(1);
	int width = get_num_groups(0) * get_local_size(0);

	for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
	{
		odata[(y+j)*width + x] = idata[(y+j)*width + x];
	}
}
|
||||
|
||||
/*
|
||||
// copy kernel using shared memory (CUDA)
|
||||
// Also used as reference case, demonstrating effect of using shared memory.
|
||||
__global__ void copySharedMem(float *odata, const float *idata)
|
||||
{
|
||||
__shared__ float tile[TILE_DIM * TILE_DIM];
|
||||
|
||||
int x = blockIdx.x * TILE_DIM + threadIdx.x;
|
||||
int y = blockIdx.y * TILE_DIM + threadIdx.y;
|
||||
int width = gridDim.x * TILE_DIM;
|
||||
|
||||
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
|
||||
tile[(threadIdx.y+j)*TILE_DIM + threadIdx.x] = idata[(y+j)*width + x];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
|
||||
odata[(y+j)*width + x] = tile[(threadIdx.y+j)*TILE_DIM + threadIdx.x];
|
||||
}
|
||||
*/
|
||||
|
||||
// copy kernel using shared memory (OpenCL)
// Also used as reference case, demonstrating effect of using shared memory.
// Each work-group stages one TILE_DIM x TILE_DIM tile in local memory and writes
// it back to the same position. Assumes a local size of TILE_DIM x BLOCK_ROWS.
__kernel void copySharedMemKernel(__global float *odata, __global const float *idata)
{
	__local float tile[TILE_DIM * TILE_DIM];

	// the tile origin is group id * TILE_DIM (blockIdx * TILE_DIM in the CUDA original);
	// scaling by get_num_groups() is only correct when the grid happens to be TILE_DIM groups wide
	int x = get_group_id(0) * TILE_DIM + get_local_id(0);
	int y = get_group_id(1) * TILE_DIM + get_local_id(1);
	int width = get_num_groups(0) * get_local_size(0);

	for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
		tile[(get_local_id(1)+j)*TILE_DIM + get_local_id(0)] = idata[(y+j)*width + x];

	// the whole tile must be in local memory before any work-item reads it back
	barrier(CLK_LOCAL_MEM_FENCE);

	for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
		odata[(y+j)*width + x] = tile[(get_local_id(1)+j)*TILE_DIM + get_local_id(0)];
}
|
||||
|
||||
/*
|
||||
// naive transpose (CUDA)
|
||||
// Simplest transpose; doesn't use shared memory.
|
||||
// Global memory reads are coalesced but writes are not.
|
||||
__global__ void transposeNaive(float *odata, const float *idata)
|
||||
{
|
||||
int x = blockIdx.x * TILE_DIM + threadIdx.x;
|
||||
int y = blockIdx.y * TILE_DIM + threadIdx.y;
|
||||
int width = gridDim.x * TILE_DIM;
|
||||
|
||||
for (int j = 0; j < TILE_DIM; j+= BLOCK_ROWS)
|
||||
odata[x*width + (y+j)] = idata[(y+j)*width + x];
|
||||
}
|
||||
*/
|
||||
|
||||
// naive transpose (OpenCL)
// Simplest transpose; doesn't use shared memory.
// Global memory reads are coalesced but writes are not.
// Assumes a local size of TILE_DIM x BLOCK_ROWS.
__kernel void transposeNaiveKernel(__global float *odata, __global const float *idata)
{
	// the tile origin is group id * TILE_DIM (blockIdx * TILE_DIM in the CUDA original);
	// scaling by get_num_groups() is only correct when the grid happens to be TILE_DIM groups wide
	int x = get_group_id(0) * TILE_DIM + get_local_id(0);
	int y = get_group_id(1) * TILE_DIM + get_local_id(1);
	int width = get_num_groups(0) * get_local_size(0);

	// element (row y+j, column x) of the input becomes (row x, column y+j) of the output
	for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
		odata[x*width + (y+j)] = idata[(y+j)*width + x];
}
|
||||
|
||||
/*
|
||||
// coalesced transpose (CUDA)
|
||||
// Uses shared memory to achieve coalescing in both reads and writes
|
||||
// Tile width == #banks causes shared memory bank conflicts.
|
||||
__global__ void transposeCoalesced(float *odata, const float *idata)
|
||||
{
|
||||
__shared__ float tile[TILE_DIM][TILE_DIM];
|
||||
|
||||
int x = blockIdx.x * TILE_DIM + threadIdx.x;
|
||||
int y = blockIdx.y * TILE_DIM + threadIdx.y;
|
||||
int width = gridDim.x * TILE_DIM;
|
||||
|
||||
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
|
||||
tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
x = blockIdx.y * TILE_DIM + threadIdx.x; // transpose block offset
|
||||
y = blockIdx.x * TILE_DIM + threadIdx.y;
|
||||
|
||||
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
|
||||
odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];
|
||||
}
|
||||
*/
|
||||
|
||||
// coalesced transpose (OpenCL)
// Uses shared memory to achieve coalescing in both reads and writes
// Tile width == #banks causes shared memory bank conflicts.
// Assumes a local size of TILE_DIM x BLOCK_ROWS.
__kernel void transposeCoalescedKernel(__global float *odata, __global const float *idata)
{
	__local float tile[TILE_DIM][TILE_DIM];

	// the tile origin is group id * TILE_DIM (blockIdx * TILE_DIM in the CUDA original),
	// matching the transposed offsets computed below;
	// scaling by get_num_groups() is only correct when the grid happens to be TILE_DIM groups wide
	int x = get_group_id(0) * TILE_DIM + get_local_id(0);
	int y = get_group_id(1) * TILE_DIM + get_local_id(1);
	int width = get_num_groups(0) * get_local_size(0);

	for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
		tile[get_local_id(1)+j][get_local_id(0)] = idata[(y+j)*width + x];

	// the whole tile must be loaded before it is read back transposed
	barrier(CLK_LOCAL_MEM_FENCE);

	x = get_group_id(1) * TILE_DIM + get_local_id(0);  // transpose block offset
	y = get_group_id(0) * TILE_DIM + get_local_id(1);

	for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
		odata[(y+j)*width + x] = tile[get_local_id(0)][get_local_id(1) + j];
}
|
||||
|
||||
|
||||
// No bank-conflict transpose (OpenCL)
// Same as transposeCoalesced except that every row of the tile is padded by one
// element (second dimension is TILE_DIM+1) to avoid shared memory bank conflicts.
// Assumes a local size of TILE_DIM x BLOCK_ROWS.
__kernel void transposeNoBankConflictsKernel(__global float *odata, __global const float *idata)
{
	__local float tile[TILE_DIM][TILE_DIM+1];

	// the tile origin is group id * TILE_DIM, matching the transposed offsets computed below;
	// scaling by get_num_groups() is only correct when the grid happens to be TILE_DIM groups wide
	int x = get_group_id(0) * TILE_DIM + get_local_id(0);
	int y = get_group_id(1) * TILE_DIM + get_local_id(1);
	int width = get_num_groups(0) * get_local_size(0);

	for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
		tile[get_local_id(1)+j][get_local_id(0)] = idata[(y+j)*width + x];

	// the whole tile must be loaded before it is read back transposed
	barrier(CLK_LOCAL_MEM_FENCE);

	x = get_group_id(1) * TILE_DIM + get_local_id(0);  // transpose block offset
	y = get_group_id(0) * TILE_DIM + get_local_id(1);

	for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
		odata[(y+j)*width + x] = tile[get_local_id(0)][get_local_id(1) + j];
}
|
||||
|
||||
|
||||
|
||||
361
opencl/lds_bank_conflict/main.cpp
Normal file
361
opencl/lds_bank_conflict/main.cpp
Normal file
@@ -0,0 +1,361 @@
|
||||
//Adapted from CUDA to OpenCL by Erwin Coumans
|
||||
//See http://bitbucket.org/erwincoumans/opencl_course
|
||||
|
||||
// Copyright 2012 NVIDIA Corporation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#include "btOpenCLUtils.h"
|
||||
#include "../parallel_primitives/host/btOpenCLArray.h"
|
||||
#include "../parallel_primitives/host/btLauncherCL.h"
|
||||
#include "../parallel_primitives/host/btQuickprof.h"
|
||||
#include "../parallel_primitives/host/btFillCL.h"
|
||||
#include "../parallel_primitives/host/CommandLineArgs.h"
|
||||
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <assert.h>
|
||||
|
||||
//make sure to update the same #define in the opencl/lds_bank_conflict/lds_kernels.cl
|
||||
const int TILE_DIM = 32;
|
||||
const int BLOCK_ROWS = 8;
|
||||
const int NUM_REPS = 100;
|
||||
|
||||
// Check errors and print GB/s
|
||||
void postprocess(const float *ref, const float *res, int n, float ms)
|
||||
{
|
||||
bool passed = true;
|
||||
for (int i = 0; i < n; i++)
|
||||
if (res[i] != ref[i]) {
|
||||
printf("\nError: at res[%d] got %f but expected %f\n", i, res[i], ref[i]);
|
||||
printf("%25s\n", "*** FAILED ***");
|
||||
passed = false;
|
||||
break;
|
||||
}
|
||||
if (passed)
|
||||
printf("%20.2f\n", 2 * n * sizeof(float) * 1e-6 * NUM_REPS / ms );
|
||||
}
|
||||
|
||||
// Read the file cFilename and return it as one NUL-terminated string with cPreamble
// placed in front of the file contents.
//   cFilename     : path of the OpenCL source file, opened in binary mode
//   cPreamble     : text to prepend; NULL is treated as an empty string
//   szFinalLength : optional; on success receives the length of preamble + source
//                   (without the terminator)
// Returns a malloc'ed buffer the caller must free(), or NULL when the file cannot
// be opened, measured or read completely, or when memory is exhausted.
char* loadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
{
	// open the OpenCL source code file
	FILE* pFileStream = fopen(cFilename, "rb");
	if (pFileStream == NULL)
	{
		return NULL;
	}

	if (cPreamble == NULL)
	{
		cPreamble = "";
	}
	size_t szPreambleLength = strlen(cPreamble);

	char* cSourceString = NULL;

	// get the length of the source code; ftell reports failure as -1, which must
	// not be turned into a huge size_t
	long lSourceLength = -1;
	if (fseek(pFileStream, 0, SEEK_END) == 0)
	{
		lSourceLength = ftell(pFileStream);
	}

	if (lSourceLength >= 0 && fseek(pFileStream, 0, SEEK_SET) == 0)
	{
		size_t szSourceLength = (size_t)lSourceLength;

		// allocate a buffer for the source code string and read it in
		cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1);
		if (cSourceString != NULL)
		{
			memcpy(cSourceString, cPreamble, szPreambleLength);

			// counting bytes (size 1) lets a short read be detected, also for empty files
			if (fread(cSourceString + szPreambleLength, 1, szSourceLength, pFileStream) == szSourceLength)
			{
				cSourceString[szSourceLength + szPreambleLength] = '\0';
				if (szFinalLength != 0)
				{
					*szFinalLength = szSourceLength + szPreambleLength;
				}
			}
			else
			{
				free(cSourceString);
				cSourceString = NULL;
			}
		}
	}

	// close the file on every path
	fclose(pFileStream);

	return cSourceString;
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
printf("Use --deviceId=<id> or --platformId=<id> to override OpenCL device\n");
|
||||
CommandLineArgs args(argc,argv);
|
||||
|
||||
const int nx = 1024;
|
||||
const int ny = 1024;
|
||||
|
||||
const int mem_size = nx*ny*sizeof(float);
|
||||
const int num_elements = nx*ny;
|
||||
btClock clock;
|
||||
double startEvent=0.f;
|
||||
double stopEvent=0.f;
|
||||
|
||||
int localSizeX = TILE_DIM;
|
||||
int localSizeY = BLOCK_ROWS;
|
||||
|
||||
int numThreadsX = (nx/TILE_DIM)*TILE_DIM;
|
||||
int numThreadsY = (ny/TILE_DIM)*BLOCK_ROWS;
|
||||
|
||||
int gridX = numThreadsX / localSizeX;
|
||||
int gridY = numThreadsY / localSizeY;
|
||||
|
||||
int ciErrNum = 0;
|
||||
int preferred_device = -1;
|
||||
int preferred_platform = -1;
|
||||
args.GetCmdLineArgument("deviceId",preferred_device);
|
||||
args.GetCmdLineArgument("platformId",preferred_platform);
|
||||
|
||||
|
||||
cl_platform_id platformId=0;
|
||||
cl_context ctx=0;
|
||||
cl_command_queue queue=0;
|
||||
cl_device_id device=0;
|
||||
cl_kernel copyKernel=0;
|
||||
cl_kernel copySharedMemKernel=0;
|
||||
cl_kernel transposeNaiveKernel = 0;
|
||||
cl_kernel transposeCoalescedKernel = 0;
|
||||
cl_kernel transposeNoBankConflictsKernel= 0;
|
||||
|
||||
|
||||
ctx = btOpenCLUtils::createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum,0,0,preferred_device,preferred_platform,&platformId);
|
||||
btOpenCLUtils::printPlatformInfo(platformId);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
device = btOpenCLUtils::getDevice(ctx,0);
|
||||
btOpenCLUtils::printDeviceInfo(device);
|
||||
queue = clCreateCommandQueue(ctx, device, 0, &ciErrNum);
|
||||
|
||||
const char* cSourceFile = "opencl/lds_bank_conflict/lds_kernels.cl";
|
||||
|
||||
size_t szKernelLength;
|
||||
|
||||
const char* cSourceCL =0;
|
||||
char relativeFileName[1024];
|
||||
|
||||
{
|
||||
const char* prefix[]={"./","../","../../","../../../","../../../../"};
|
||||
int numPrefixes = sizeof(prefix)/sizeof(char*);
|
||||
|
||||
for (int i=0;!cSourceCL && i<numPrefixes;i++)
|
||||
{
|
||||
|
||||
sprintf(relativeFileName,"%s%s",prefix[i],cSourceFile);
|
||||
cSourceCL = loadProgSource(relativeFileName, "", &szKernelLength);
|
||||
if (cSourceCL)
|
||||
{
|
||||
printf("Loaded program source: %s\n", relativeFileName);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!cSourceCL)
|
||||
{
|
||||
printf("Couldn't find file %s, exiting\n",cSourceFile);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
char flags[1024]={0};
|
||||
#ifdef CL_PLATFORM_INTEL
|
||||
///use this flag to allow for OpenCL kernel debugging on CPU using the Intel OpenCL run-time
|
||||
//sprintf(flags,"-g -s \"%s\"","C:/develop/opencl_course/opencl/lds_bank_conflict/lds_kernels.cl");
|
||||
#endif//CL_PLATFORM_INTEL
|
||||
|
||||
|
||||
copyKernel = btOpenCLUtils::compileCLKernelFromString(ctx,device,cSourceCL,"copyKernel",&ciErrNum,0,flags);
|
||||
copySharedMemKernel = btOpenCLUtils::compileCLKernelFromString(ctx,device,cSourceCL,"copySharedMemKernel",&ciErrNum,0,flags);
|
||||
transposeNaiveKernel = btOpenCLUtils::compileCLKernelFromString(ctx,device,cSourceCL,"transposeNaiveKernel",&ciErrNum,0,flags);
|
||||
transposeCoalescedKernel = btOpenCLUtils::compileCLKernelFromString(ctx,device,cSourceCL,"transposeCoalescedKernel",&ciErrNum,0,flags);
|
||||
transposeNoBankConflictsKernel = btOpenCLUtils::compileCLKernelFromString(ctx,device,cSourceCL,"transposeNoBankConflictsKernel",&ciErrNum,0,flags);
|
||||
|
||||
btFillCL clMemSet(ctx,device,queue);
|
||||
|
||||
printf("\n============================================\n");
|
||||
|
||||
printf("Matrix size: %d %d, Block size: %d %d, Tile size: %d %d\n",
|
||||
nx, ny, TILE_DIM, BLOCK_ROWS, TILE_DIM, TILE_DIM);
|
||||
|
||||
float *h_idata = (float*)malloc(mem_size);
|
||||
float *h_cdata = (float*)malloc(mem_size);
|
||||
float *h_tdata = (float*)malloc(mem_size);
|
||||
float *gold = (float*)malloc(mem_size);
|
||||
|
||||
btOpenCLArray<float> d_idataCL(ctx,queue);d_idataCL.resize(num_elements);
|
||||
btOpenCLArray<float> d_cdataCL(ctx,queue);d_cdataCL.resize(num_elements);
|
||||
btOpenCLArray<float> d_tdataCL(ctx,queue);d_tdataCL.resize(num_elements);
|
||||
|
||||
|
||||
// check parameters and calculate execution configuration
|
||||
if (nx % TILE_DIM || ny % TILE_DIM)
|
||||
{
|
||||
printf("nx and ny must be a multiple of TILE_DIM\n");
|
||||
goto error_exit;
|
||||
}
|
||||
|
||||
if (TILE_DIM % BLOCK_ROWS)
|
||||
{
|
||||
printf("TILE_DIM must be a multiple of BLOCK_ROWS\n");
|
||||
goto error_exit;
|
||||
}
|
||||
|
||||
// host
|
||||
for (int j = 0; j < ny; j++)
|
||||
for (int i = 0; i < nx; i++)
|
||||
h_idata[j*nx + i] = j*nx + i;
|
||||
|
||||
// correct result for error checking
|
||||
for (int j = 0; j < ny; j++)
|
||||
for (int i = 0; i < nx; i++)
|
||||
{
|
||||
gold[j*nx + i] = h_idata[i*nx + j];
|
||||
}
|
||||
|
||||
d_idataCL.copyFromHostPointer(h_idata,num_elements);
|
||||
|
||||
// events for timing
|
||||
clock.reset();
|
||||
|
||||
float ms;
|
||||
|
||||
// ------------
|
||||
// time kernels
|
||||
// ------------
|
||||
printf("%25s%25s\n", "Routine", "Bandwidth (GB/s)");
|
||||
|
||||
// ----
|
||||
// copy
|
||||
// ----
|
||||
printf("%25s", "copy");
|
||||
|
||||
clMemSet.execute(d_cdataCL,0.f,num_elements);
|
||||
|
||||
{
|
||||
// warm up
|
||||
btLauncherCL launcher( queue, copyKernel);
|
||||
launcher.setBuffer( d_cdataCL.getBufferCL());
|
||||
launcher.setBuffer( d_idataCL.getBufferCL());
|
||||
launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
|
||||
|
||||
startEvent = clock.getTimeMicroseconds()/1e3;
|
||||
for (int i = 0; i < NUM_REPS; i++)
|
||||
launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
clFinish(queue);
|
||||
stopEvent = clock.getTimeMicroseconds()/1e3;
|
||||
}
|
||||
|
||||
ms = float(stopEvent-startEvent);
|
||||
|
||||
d_cdataCL.copyToHostPointer(h_cdata,num_elements,0);
|
||||
postprocess(h_idata, h_cdata, nx*ny, ms);
|
||||
|
||||
// -------------
|
||||
// copySharedMem
|
||||
// -------------
|
||||
printf("%25s", "shared memory copy");
|
||||
clMemSet.execute(d_cdataCL,0.f,num_elements);
|
||||
|
||||
{
|
||||
btLauncherCL launcher( queue, copySharedMemKernel);
|
||||
launcher.setBuffer( d_cdataCL.getBufferCL());
|
||||
launcher.setBuffer( d_idataCL.getBufferCL());
|
||||
launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
|
||||
|
||||
startEvent = clock.getTimeMicroseconds()/1e3;
|
||||
for (int i = 0; i < NUM_REPS; i++)
|
||||
launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
clFinish(queue);
|
||||
stopEvent = clock.getTimeMicroseconds()/1e3;
|
||||
}
|
||||
|
||||
ms = float(stopEvent-startEvent);
|
||||
d_cdataCL.copyToHostPointer(h_cdata,num_elements,0);
|
||||
postprocess(h_idata, h_cdata, nx * ny, ms);
|
||||
|
||||
// --------------
|
||||
// transposeNaive
|
||||
// --------------
|
||||
printf("%25s", "naive transpose");
|
||||
clMemSet.execute(d_tdataCL,0.f,num_elements);
|
||||
{
|
||||
// warmup
|
||||
btLauncherCL launcher( queue, transposeNaiveKernel);
|
||||
launcher.setBuffer( d_tdataCL.getBufferCL());
|
||||
launcher.setBuffer( d_idataCL.getBufferCL());
|
||||
launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
|
||||
|
||||
startEvent = clock.getTimeMicroseconds()/1e3;
|
||||
for (int i = 0; i < NUM_REPS; i++)
|
||||
launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
clFinish(queue);
|
||||
stopEvent = clock.getTimeMicroseconds()/1e3;
|
||||
}
|
||||
ms = float(stopEvent-startEvent);
|
||||
d_tdataCL.copyToHostPointer(h_tdata,num_elements,0);
|
||||
postprocess(gold, h_tdata, nx * ny, ms);
|
||||
|
||||
// ------------------
|
||||
// transposeCoalesced
|
||||
// ------------------
|
||||
printf("%25s", "coalesced transpose");
|
||||
clMemSet.execute(d_tdataCL,0.f,num_elements);
|
||||
{
|
||||
btLauncherCL launcher( queue, transposeCoalescedKernel);
|
||||
launcher.setBuffer( d_tdataCL.getBufferCL());
|
||||
launcher.setBuffer( d_idataCL.getBufferCL());
|
||||
launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
|
||||
|
||||
startEvent = clock.getTimeMicroseconds()/1e3;
|
||||
for (int i = 0; i < NUM_REPS; i++)
|
||||
launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
clFinish(queue);
|
||||
stopEvent = clock.getTimeMicroseconds()/1e3;
|
||||
}
|
||||
|
||||
ms = float(stopEvent-startEvent);
|
||||
d_tdataCL.copyToHostPointer(h_tdata,num_elements,0);
|
||||
postprocess(gold, h_tdata, nx * ny, ms);
|
||||
|
||||
// ------------------------
|
||||
// transposeNoBankConflicts
|
||||
// ------------------------
|
||||
printf("%25s", "conflict-free transpose");
|
||||
clMemSet.execute(d_tdataCL,0.f,num_elements);
|
||||
{
|
||||
btLauncherCL launcher( queue, transposeNoBankConflictsKernel);
|
||||
launcher.setBuffer( d_tdataCL.getBufferCL());
|
||||
launcher.setBuffer( d_idataCL.getBufferCL());
|
||||
launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
|
||||
|
||||
startEvent = clock.getTimeMicroseconds()/1e3;
|
||||
for (int i = 0; i < NUM_REPS; i++)
|
||||
launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY );
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
clFinish(queue);
|
||||
stopEvent = clock.getTimeMicroseconds()/1e3;
|
||||
}
|
||||
|
||||
ms = float(stopEvent-startEvent);
|
||||
d_tdataCL.copyToHostPointer(h_tdata,num_elements,0);
|
||||
postprocess(gold, h_tdata, nx * ny, ms);
|
||||
|
||||
error_exit:
|
||||
// cleanup
|
||||
clReleaseKernel(copyKernel);
|
||||
clReleaseCommandQueue(queue);
|
||||
clReleaseContext(ctx);
|
||||
|
||||
free(h_idata);
|
||||
free(h_tdata);
|
||||
free(h_cdata);
|
||||
free(gold);
|
||||
printf("Press <enter>\n");
|
||||
getchar();
|
||||
}
|
||||
37
opencl/lds_bank_conflict/premake4.lua
Normal file
37
opencl/lds_bank_conflict/premake4.lua
Normal file
@@ -0,0 +1,37 @@
|
||||
|
||||
-- Declares the "OpenCL_lds_bank_conflict_<vendor>" console application,
-- but only when findOpenCL reports an OpenCL SDK for that vendor.
function createProject (vendor)

	if not findOpenCL(vendor) then
		return
	end

	project ( "OpenCL_lds_bank_conflict_" .. vendor)

	initOpenCL(vendor)

	language "C++"

	kind "ConsoleApp"
	targetdir "../../bin"

	links {
		"OpenCL_lib_parallel_primitives_host_" .. vendor
	}

	includedirs {
		"../basic_initialize"
	}

	files {
		"main.cpp",
		"../basic_initialize/btOpenCLUtils.cpp",
		"../basic_initialize/btOpenCLUtils.h"
	}

end

-- One project per supported vendor, in the original declaration order.
for _, vendor in ipairs({ "AMD", "NVIDIA", "Intel", "Apple" }) do
	createProject(vendor)
end
|
||||
35
opencl/parallel_primitives/benchmark/premake4.lua
Normal file
35
opencl/parallel_primitives/benchmark/premake4.lua
Normal file
@@ -0,0 +1,35 @@
|
||||
-- Declares the "OpenCL_radixsort_benchmark_<vendor>" console application,
-- but only when findOpenCL reports an OpenCL SDK for that vendor.
function createProject(vendor)
	-- Kept local: a plain assignment would create a global shared by every
	-- premake script that runs in the same Lua state.
	local hasCL = findOpenCL(vendor)

	if (hasCL) then

		project ("OpenCL_radixsort_benchmark_" .. vendor)

		initOpenCL(vendor)

		language "C++"

		kind "ConsoleApp"
		targetdir "../../../bin"
		includedirs {".."}

		links {
			("OpenCL_lib_parallel_primitives_host_" .. vendor)
		}

		files {
			"test_large_problem_sorting.cpp",
			"../../basic_initialize/btOpenCLUtils.cpp",
			"../../basic_initialize/btOpenCLUtils.h",
			"../host/btFillCL.cpp",
			"../host/btPrefixScanCL.cpp",
			"../host/btRadixSort32CL.cpp",
		}

	end
end

createProject("AMD")
createProject("Intel")
createProject("NVIDIA")
createProject("Apple")
|
||||
@@ -0,0 +1,709 @@
|
||||
/******************************************************************************
|
||||
* Copyright 2010 Duane Merrill
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
* AUTHORS' REQUEST:
|
||||
*
|
||||
* If you use|reference|benchmark this code, please cite our Technical
|
||||
* Report (http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf):
|
||||
*
|
||||
* @TechReport{ Merrill:Sorting:2010,
|
||||
* author = "Duane Merrill and Andrew Grimshaw",
|
||||
* title = "Revisiting Sorting for GPGPU Stream Architectures",
|
||||
* year = "2010",
|
||||
* institution = "University of Virginia, Department of Computer Science",
|
||||
* address = "Charlottesville, VA, USA",
|
||||
* number = "CS2010-03"
|
||||
* }
|
||||
*
|
||||
* For more information, see our Google Code project site:
|
||||
* http://code.google.com/p/back40computing/
|
||||
*
|
||||
* Thanks!
|
||||
******************************************************************************/
|
||||
|
||||
/******************************************************************************
|
||||
* Simple test driver program for *large-problem* radix sorting.
|
||||
*
|
||||
* Useful for demonstrating how to integrate radix sorting into
|
||||
* your application
|
||||
******************************************************************************/
|
||||
|
||||
/******************************************************************************
|
||||
* Converted from CUDA to OpenCL/DirectCompute by Erwin Coumans
|
||||
******************************************************************************/
|
||||
#ifdef _WIN32
|
||||
#pragma warning (disable:4996)
|
||||
#endif
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <float.h>
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
|
||||
|
||||
//#include <iostream>
|
||||
#include <sstream>
|
||||
/**********************
|
||||
*
|
||||
*/
|
||||
|
||||
#include "../host/btRadixSort32CL.h"
|
||||
#include "../../basic_initialize/btOpenCLUtils.h"
|
||||
#include "../host/btQuickprof.h"
|
||||
|
||||
cl_context g_cxMainContext;
|
||||
cl_device_id g_device;
|
||||
cl_command_queue g_cqCommandQueue;
|
||||
|
||||
/***********************
|
||||
*
|
||||
*/
|
||||
|
||||
bool g_verbose;
|
||||
///Preferred OpenCL device/platform. When < 0 then no preference is used.
|
||||
///Note that btOpenCLUtils might still use the preference of using a platform vendor that matches the SDK vendor used to build the application.
|
||||
///Preferred device/platform take priority over this platform-vendor match
|
||||
int gPreferredDeviceId = -1;
|
||||
int gPreferredPlatformId = -1;
|
||||
|
||||
|
||||
|
||||
/******************************************************************************
|
||||
* Routines
|
||||
******************************************************************************/
|
||||
|
||||
|
||||
/**
|
||||
* Keys-only sorting. Uses the GPU to sort the specified vector of elements for the given
|
||||
* number of iterations, displaying runtime information.
|
||||
*
|
||||
* @param[in] num_elements
|
||||
* Size in elements of the vector to sort
|
||||
* @param[in] h_keys
|
||||
* Vector of keys to sort
|
||||
* @param[in] iterations
|
||||
* Number of times to invoke the GPU sorting primitive
|
||||
* @param[in] cfg
|
||||
* Config
|
||||
*/
|
||||
template <typename K>
|
||||
void TimedSort(
|
||||
unsigned int num_elements,
|
||||
K *h_keys,
|
||||
unsigned int iterations)
|
||||
{
|
||||
printf("Keys only, %d iterations, %d elements\n", iterations, num_elements);
|
||||
|
||||
int max_elements = num_elements;
|
||||
btAlignedObjectArray<unsigned int> hostData;
|
||||
hostData.resize(num_elements);
|
||||
for (int i=0;i<num_elements;i++)
|
||||
{
|
||||
hostData[i] = h_keys[i];
|
||||
}
|
||||
|
||||
btRadixSort32CL sorter(g_cxMainContext,g_device,g_cqCommandQueue);
|
||||
|
||||
btOpenCLArray<unsigned int> gpuData(g_cxMainContext,g_cqCommandQueue);
|
||||
gpuData.copyFromHost(hostData);
|
||||
//sorter.executeHost(gpuData);
|
||||
sorter.execute(gpuData);
|
||||
|
||||
btAlignedObjectArray<unsigned int> hostDataSorted;
|
||||
gpuData.copyToHost(hostDataSorted);
|
||||
|
||||
clFinish(g_cqCommandQueue);
|
||||
|
||||
{
|
||||
//printf("Key-values, %d iterations, %d elements", iterations, num_elements);
|
||||
|
||||
// Create sorting enactor
|
||||
|
||||
// Perform the timed number of sorting iterations
|
||||
double elapsed = 0;
|
||||
float duration = 0;
|
||||
btClock watch;
|
||||
|
||||
//warm-start
|
||||
gpuData.copyFromHost(hostData);
|
||||
clFinish(g_cqCommandQueue);
|
||||
sorter.execute(gpuData);
|
||||
|
||||
watch.reset();
|
||||
|
||||
|
||||
for (int i = 0; i < iterations; i++)
|
||||
{
|
||||
|
||||
|
||||
|
||||
// Move a fresh copy of the problem into device storage
|
||||
gpuData.copyFromHost(hostData);
|
||||
clFinish(g_cqCommandQueue);
|
||||
|
||||
// Start GPU timing record
|
||||
double startMs = watch.getTimeMicroseconds()/1e3;
|
||||
|
||||
// Call the sorting API routine
|
||||
sorter.execute(gpuData);
|
||||
|
||||
|
||||
|
||||
clFinish(g_cqCommandQueue);
|
||||
|
||||
double stopMs = watch.getTimeMicroseconds()/1e3;
|
||||
|
||||
duration = stopMs - startMs;
|
||||
|
||||
// End GPU timing record
|
||||
elapsed += (double) duration;
|
||||
printf("duration = %f\n", duration);
|
||||
}
|
||||
|
||||
// Display timing information
|
||||
double avg_runtime = elapsed / iterations;
|
||||
// double throughput = ((double) num_elements) / avg_runtime / 1000.0 / 1000.0;
|
||||
// printf(", %f GPU ms, %f x10^9 elts/sec\n", avg_runtime, throughput);
|
||||
double throughput = ((double) num_elements) / avg_runtime / 1000.0 ;
|
||||
printf(", %f GPU ms, %f x10^6 elts/sec\n", avg_runtime, throughput);
|
||||
|
||||
gpuData.copyToHost(hostData);
|
||||
for (int i=0;i<num_elements;i++)
|
||||
{
|
||||
h_keys[i] = hostData[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Key-value sorting. Uses the GPU to sort the specified vector of elements for the given
|
||||
* number of iterations, displaying runtime information.
|
||||
*
|
||||
* @param[in] num_elements
|
||||
* Size in elements of the vector to sort
|
||||
* @param[in] h_keys
|
||||
* Vector of keys to sort
|
||||
* @param[in,out] h_values
|
||||
* Vector of values to sort
|
||||
* @param[in] iterations
|
||||
* Number of times to invoke the GPU sorting primitive
|
||||
* @param[in] cfg
|
||||
* Config
|
||||
*/
|
||||
template <typename K, typename V>
|
||||
void TimedSort(
|
||||
unsigned int num_elements,
|
||||
K *h_keys,
|
||||
V *h_values,
|
||||
unsigned int iterations)
|
||||
{
|
||||
|
||||
printf("Key-values, %d iterations, %d elements\n", iterations, num_elements);
|
||||
|
||||
int max_elements = num_elements;
|
||||
btAlignedObjectArray<btSortData> hostData;
|
||||
hostData.resize(num_elements);
|
||||
for (int i=0;i<num_elements;i++)
|
||||
{
|
||||
hostData[i].m_key = h_keys[i];
|
||||
hostData[i].m_value = h_values[i];
|
||||
}
|
||||
|
||||
btRadixSort32CL sorter(g_cxMainContext,g_device,g_cqCommandQueue);
|
||||
|
||||
btOpenCLArray<btSortData> gpuData(g_cxMainContext,g_cqCommandQueue);
|
||||
gpuData.copyFromHost(hostData);
|
||||
//sorter.executeHost(gpuData);
|
||||
sorter.execute(gpuData);
|
||||
|
||||
btAlignedObjectArray<btSortData> hostDataSorted;
|
||||
gpuData.copyToHost(hostDataSorted);
|
||||
#if 0
|
||||
for (int i=0;i<num_elements;i++)
|
||||
{
|
||||
printf("hostData[%d].m_key = %d\n",i, hostDataSorted[i].m_key);
|
||||
printf("hostData[%d].m_value = %d\n",i,hostDataSorted[i].m_value);
|
||||
}
|
||||
#endif
|
||||
|
||||
clFinish(g_cqCommandQueue);
|
||||
|
||||
{
|
||||
//printf("Key-values, %d iterations, %d elements", iterations, num_elements);
|
||||
|
||||
// Create sorting enactor
|
||||
|
||||
// Perform the timed number of sorting iterations
|
||||
double elapsed = 0;
|
||||
float duration = 0;
|
||||
btClock watch;
|
||||
|
||||
//warm-start
|
||||
gpuData.copyFromHost(hostData);
|
||||
sorter.execute(gpuData);
|
||||
clFinish(g_cqCommandQueue);
|
||||
|
||||
watch.reset();
|
||||
|
||||
|
||||
for (int i = 0; i < iterations; i++)
|
||||
{
|
||||
|
||||
|
||||
|
||||
// Move a fresh copy of the problem into device storage
|
||||
gpuData.copyFromHost(hostData);
|
||||
clFinish(g_cqCommandQueue);
|
||||
|
||||
// Start GPU timing record
|
||||
double startMs = watch.getTimeMicroseconds()/1e3;
|
||||
|
||||
// Call the sorting API routine
|
||||
sorter.execute(gpuData);
|
||||
clFinish(g_cqCommandQueue);
|
||||
|
||||
double stopMs = watch.getTimeMicroseconds()/1e3;
|
||||
|
||||
duration = stopMs - startMs;
|
||||
|
||||
// End GPU timing record
|
||||
elapsed += (double) duration;
|
||||
printf("duration = %f\n", duration);
|
||||
}
|
||||
|
||||
// Display timing information
|
||||
double avg_runtime = elapsed / iterations;
|
||||
// double throughput = ((double) num_elements) / avg_runtime / 1000.0 / 1000.0;
|
||||
// printf(", %f GPU ms, %f x10^9 elts/sec\n", avg_runtime, throughput);
|
||||
double throughput = ((double) num_elements) / avg_runtime / 1000.0 ;
|
||||
printf(", %f GPU ms, %f x10^6 elts/sec\n", avg_runtime, throughput);
|
||||
|
||||
gpuData.copyToHost(hostData);
|
||||
for (int i=0;i<num_elements;i++)
|
||||
{
|
||||
h_keys[i] = hostData[i].m_key;
|
||||
h_values[i] = hostData[i].m_value;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
 * Generates random keys of type K, one byte at a time.
 *
 * Each byte is taken from rand() >> 7, i.e. the lowest seven bits of every
 * sample are discarded, because the higher-order bits returned by rand() are
 * commonly considered more uniformly distributed than the lower-order bits.
 *
 * We can decrease the entropy level of keys by adopting the technique
 * of Thearling and Smith in which keys are computed from the bitwise AND of
 * multiple random samples:
 *
 * entropy_reduction	| Effectively-unique bits per key
 * -----------------------------------------------------
 * -1					| 0
 * 0					| 32
 * 1					| 25.95
 * 2					| 17.41
 * 3					| 10.78
 * 4					| 6.42
 * ...					| ...
 *
 * @param[out] key
 * 		Receives the generated key.
 * @param[in] entropy_reduction
 * 		Each byte is the AND of (entropy_reduction + 1) samples.
 * @param[in] lower_key_bits
 * 		When in [0, 8 * sizeof(K)), only that many low bits of the key (as
 * 		seen through an unsigned long long copy of its bytes) are kept.
 * 		Other values leave the key unmasked.  Masking requires
 * 		sizeof(K) <= sizeof(unsigned long long).
 */
template <typename K>
void RandomBits(K &key, int entropy_reduction = 0, int lower_key_bits = sizeof(K) * 8)
{
	const unsigned int NUM_UCHARS = (sizeof(K) + sizeof(unsigned char) - 1) / sizeof(unsigned char);
	unsigned char key_bits[NUM_UCHARS];

	// Negative bit counts never triggered masking before (they compared as
	// huge unsigned values); keep that, but say so explicitly.
	const bool mask_requested =
		(lower_key_bits >= 0) && ((size_t) lower_key_bits < sizeof(K) * 8);

	do {

		for (unsigned int j = 0; j < NUM_UCHARS; j++) {
			unsigned char quarterword = 0xff;
			for (int i = 0; i <= entropy_reduction; i++) {
				quarterword &= (rand() >> 7);
			}
			key_bits[j] = quarterword;
		}

		if (mask_requested) {
			unsigned long long base = 0;
			memcpy(&base, key_bits, sizeof(K));
			// Shift in 64 bits: an int shift is undefined for 31 bits or
			// more and produced the wrong mask for 64-bit keys.
			base &= (1ull << lower_key_bits) - 1ull;
			memcpy(key_bits, &base, sizeof(K));
		}

		memcpy(&key, key_bits, sizeof(K));

	} while (key != key);		// avoids NaNs when generating random floating point numbers
}
|
||||
|
||||
|
||||
/******************************************************************************
|
||||
* Templated routines for printing keys/values to the console
|
||||
******************************************************************************/
|
||||
|
||||
// Generic fallback: prints the value with the "%d" conversion.
template<typename T>
void PrintValue(T val)
{
	printf("%d", val);
}

// Floating-point types use "%f".
template<>
void PrintValue<float>(float val)
{
	printf("%f", val);
}

template<>
void PrintValue<double>(double val)
{
	printf("%f", val);
}

// Unsigned types up to unsigned int use "%u".
template<>
void PrintValue<unsigned char>(unsigned char val)
{
	printf("%u", val);
}

template<>
void PrintValue<unsigned short>(unsigned short val)
{
	printf("%u", val);
}

template<>
void PrintValue<unsigned int>(unsigned int val)
{
	printf("%u", val);
}

// long and long long variants carry the matching length modifier.
template<>
void PrintValue<long>(long val)
{
	printf("%ld", val);
}

template<>
void PrintValue<unsigned long>(unsigned long val)
{
	printf("%lu", val);
}

template<>
void PrintValue<long long>(long long val)
{
	printf("%lld", val);
}

template<>
void PrintValue<unsigned long long>(unsigned long long val)
{
	printf("%llu", val);
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Compares the equivalence of two arrays
|
||||
*/
|
||||
template <typename T, typename SizeT>
|
||||
int CompareResults(T* computed, T* reference, SizeT len, bool verbose = true)
|
||||
{
|
||||
printf("\n");
|
||||
for (SizeT i = 0; i < len; i++) {
|
||||
|
||||
if (computed[i] != reference[i]) {
|
||||
printf("INCORRECT: [%lu]: ", (unsigned long) i);
|
||||
PrintValue<T>(computed[i]);
|
||||
printf(" != ");
|
||||
PrintValue<T>(reference[i]);
|
||||
|
||||
if (verbose) {
|
||||
printf("\nresult[...");
|
||||
for (size_t j = (i >= 5) ? i - 5 : 0; (j < i + 5) && (j < len); j++) {
|
||||
PrintValue<T>(computed[j]);
|
||||
printf(", ");
|
||||
}
|
||||
printf("...]");
|
||||
printf("\nreference[...");
|
||||
for (size_t j = (i >= 5) ? i - 5 : 0; (j < i + 5) && (j < len); j++) {
|
||||
PrintValue<T>(reference[j]);
|
||||
printf(", ");
|
||||
}
|
||||
printf("...]");
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
printf("CORRECT\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates an example sorting problem whose keys is a vector of the specified
|
||||
* number of K elements, values of V elements, and then dispatches the problem
|
||||
* to the GPU for the given number of iterations, displaying runtime information.
|
||||
*
|
||||
* @param[in] iterations
|
||||
* Number of times to invoke the GPU sorting primitive
|
||||
* @param[in] num_elements
|
||||
* Size in elements of the vector to sort
|
||||
* @param[in] cfg
|
||||
* Config
|
||||
*/
|
||||
template<typename K, typename V>
|
||||
void TestSort(
|
||||
unsigned int iterations,
|
||||
int num_elements,
|
||||
bool keys_only)
|
||||
{
|
||||
// Allocate the sorting problem on the host and fill the keys with random bytes
|
||||
|
||||
K *h_keys = NULL;
|
||||
K *h_reference_keys = NULL;
|
||||
V *h_values = NULL;
|
||||
h_keys = (K*) malloc(num_elements * sizeof(K));
|
||||
h_reference_keys = (K*) malloc(num_elements * sizeof(K));
|
||||
if (!keys_only) h_values = (V*) malloc(num_elements * sizeof(V));
|
||||
|
||||
|
||||
// Use random bits
|
||||
for (unsigned int i = 0; i < num_elements; ++i) {
|
||||
RandomBits<K>(h_keys[i], 0);
|
||||
//h_keys[i] = num_elements-i;
|
||||
//h_keys[i] = 0xffffffffu-i;
|
||||
if (!keys_only)
|
||||
h_values[i] = h_keys[i];//0xffffffffu-i;
|
||||
|
||||
h_reference_keys[i] = h_keys[i];
|
||||
}
|
||||
|
||||
// Run the timing test
|
||||
if (keys_only) {
|
||||
TimedSort<K>(num_elements, h_keys, iterations);
|
||||
} else {
|
||||
TimedSort<K, V>(num_elements, h_keys, h_values, iterations);
|
||||
}
|
||||
|
||||
// cudaThreadSynchronize();
|
||||
|
||||
// Display sorted key data
|
||||
if (g_verbose) {
|
||||
printf("\n\nKeys:\n");
|
||||
for (int i = 0; i < num_elements; i++) {
|
||||
PrintValue<K>(h_keys[i]);
|
||||
printf(", ");
|
||||
}
|
||||
printf("\n\n");
|
||||
}
|
||||
|
||||
// Verify solution
|
||||
std::sort(h_reference_keys, h_reference_keys + num_elements);
|
||||
CompareResults<K>(h_keys, h_reference_keys, num_elements, true);
|
||||
printf("\n");
|
||||
fflush(stdout);
|
||||
|
||||
// Free our allocated host memory
|
||||
if (h_keys != NULL) free(h_keys);
|
||||
if (h_values != NULL) free(h_values);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
 * Displays the commandline usage for this tool.
 *
 * Lists only the options main() actually reads, with the defaults main() uses
 * when an option is absent (10 iterations, 32*1024*1024 elements).
 */
void Usage()
{
	printf("\ntest_large_problem_sorting [--help] [--v] [--i=<num-iterations>] [--n=<num-elements>] [--key-values] [--deviceId=<int>] [--platformId=<int>]\n");
	printf("\n");
	printf("\t--help\tDisplays this message.\n");
	printf("\n");
	printf("\t--v\tDisplays sorted results to the console.\n");
	printf("\n");
	printf("\t--i\tPerforms the sorting operation <num-iterations> times\n");
	printf("\t\t\ton the device. Re-copies original input each time. Default = 10\n");
	printf("\n");
	printf("\t--n\tThe number of elements to comprise the sample problem\n");
	printf("\t\t\tDefault = 33554432 (32*1024*1024)\n");
	printf("\n");
	printf("\t--key-values\tSpecifies that keys are accompanied by value pairings\n");
	printf("\n");
	printf("\t--deviceId\tPreferred OpenCL device. No preference when < 0 (default)\n");
	printf("\n");
	printf("\t--platformId\tPreferred OpenCL platform. No preference when < 0 (default)\n");
	printf("\n");
}
|
||||
|
||||
|
||||
/******************************************************************************
|
||||
* Command-line parsing
|
||||
******************************************************************************/
|
||||
#include <map>
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
|
||||
// Minimal "--key[=value]" command-line parser.
//
// Every argument that starts with "--" is stored in a map keyed by the text
// between "--" and the first '=' (or the end of the argument); the value is
// whatever follows that '=' or the empty string.  All other arguments are
// ignored.  A later occurrence of a key overwrites an earlier one.
class CommandLineArgs
{
protected:

	std::map<std::string, std::string> pairs;

public:

	// Parses argv[1] .. argv[argc-1]; argv[0] is skipped.
	CommandLineArgs(int argc, char **argv)
	{
		for (int i = 1; i < argc; i++)
		{
			const std::string arg = argv[i];

			// compare() is bounds-safe for arguments shorter than two
			// characters, unlike indexing arg[1] directly.
			if (arg.compare(0, 2, "--") != 0) {
				continue;
			}

			const std::string::size_type pos = arg.find('=');
			if (pos == std::string::npos) {
				pairs[arg.substr(2)] = "";
			} else {
				pairs[arg.substr(2, pos - 2)] = arg.substr(pos + 1);
			}
		}
	}

	// Returns true when "--arg_name" (with or without a value) was supplied.
	bool CheckCmdLineFlag(const char* arg_name)
	{
		return pairs.find(arg_name) != pairs.end();
	}

	// Parses the value of "--arg_name" into val; val is left untouched when
	// the option is absent (the char* specialization sets it to NULL instead).
	template <typename T>
	void GetCmdLineArgument(const char *arg_name, T &val);

	// Number of distinct "--" options that were recognised.
	int ParsedArgc()
	{
		return static_cast<int>(pairs.size());
	}
};
|
||||
|
||||
template <typename T>
|
||||
void CommandLineArgs::GetCmdLineArgument(const char *arg_name, T &val)
|
||||
{
|
||||
using namespace std;
|
||||
map<string, string>::iterator itr;
|
||||
if ((itr = pairs.find(arg_name)) != pairs.end()) {
|
||||
istringstream strstream(itr->second);
|
||||
strstream >> val;
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
void CommandLineArgs::GetCmdLineArgument<char*>(const char* arg_name, char* &val)
|
||||
{
|
||||
using namespace std;
|
||||
map<string, string>::iterator itr;
|
||||
if ((itr = pairs.find(arg_name)) != pairs.end()) {
|
||||
|
||||
string s = itr->second;
|
||||
val = (char*) malloc(sizeof(char) * (s.length() + 1));
|
||||
strcpy(val, s.c_str());
|
||||
|
||||
} else {
|
||||
val = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/******************************************************************************
|
||||
* Main
|
||||
******************************************************************************/
|
||||
|
||||
extern bool gDebugSkipLoadingBinary;
|
||||
|
||||
int main( int argc, char** argv)
|
||||
{
|
||||
gDebugSkipLoadingBinary = true;
|
||||
|
||||
cl_int ciErrNum;
|
||||
CommandLineArgs args(argc,argv);
|
||||
|
||||
args.GetCmdLineArgument("deviceId", gPreferredDeviceId);
|
||||
args.GetCmdLineArgument("platformId", gPreferredPlatformId);
|
||||
|
||||
printf("Initialize OpenCL using btOpenCLUtils_createContextFromType\n");
|
||||
cl_platform_id platformId;
|
||||
g_cxMainContext = btOpenCLUtils_createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
|
||||
int numDev = btOpenCLUtils_getNumDevices(g_cxMainContext);
|
||||
|
||||
if (!numDev)
|
||||
{
|
||||
printf("error: no OpenCL devices\n");
|
||||
exit(0);
|
||||
}
|
||||
int result;
|
||||
int devId = 0;
|
||||
g_device = btOpenCLUtils_getDevice(g_cxMainContext,devId);
|
||||
btOpenCLUtils_printDeviceInfo(g_device);
|
||||
// create a command-queue
|
||||
g_cqCommandQueue = clCreateCommandQueue(g_cxMainContext, g_device, 0, &ciErrNum);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
|
||||
|
||||
|
||||
//srand(time(NULL));
|
||||
srand(0); // presently deterministic
|
||||
|
||||
unsigned int num_elements = 32*1024*1024;//4*1024*1024;//4*1024*1024;//257;//8*524288;//2048;//512;//524288;
|
||||
unsigned int iterations = 10;
|
||||
bool keys_only = true;
|
||||
|
||||
//
|
||||
// Check command line arguments
|
||||
//
|
||||
|
||||
|
||||
|
||||
if (args.CheckCmdLineFlag("help"))
|
||||
{
|
||||
Usage();
|
||||
return 0;
|
||||
}
|
||||
|
||||
args.GetCmdLineArgument("i", iterations);
|
||||
args.GetCmdLineArgument("n", num_elements);
|
||||
|
||||
|
||||
|
||||
keys_only = !args.CheckCmdLineFlag("key-values");
|
||||
g_verbose = args.CheckCmdLineFlag("v");
|
||||
|
||||
|
||||
|
||||
TestSort<unsigned int, unsigned int>(
|
||||
iterations,
|
||||
num_elements,
|
||||
keys_only);
|
||||
|
||||
|
||||
}
|
||||
92
opencl/parallel_primitives/host/CommandLineArgs.h
Normal file
92
opencl/parallel_primitives/host/CommandLineArgs.h
Normal file
@@ -0,0 +1,92 @@
|
||||
#ifndef COMMAND_LINE_ARGS_H
|
||||
#define COMMAND_LINE_ARGS_H
|
||||
|
||||
/******************************************************************************
|
||||
* Command-line parsing
|
||||
******************************************************************************/
|
||||
#include <map>
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
#include <cstring>
|
||||
#include <sstream>
|
||||
// Minimal "--key[=value]" command-line parser.
//
// Every argument that starts with "--" is stored in a map keyed by the text
// between "--" and the first '=' (or the end of the argument); the value is
// whatever follows that '=' or the empty string.  All other arguments are
// ignored.  A later occurrence of a key overwrites an earlier one.
class CommandLineArgs
{
protected:

	std::map<std::string, std::string> pairs;

public:

	// Parses argv[1] .. argv[argc-1]; argv[0] is skipped.
	CommandLineArgs(int argc, char **argv)
	{
		for (int i = 1; i < argc; i++)
		{
			const std::string arg = argv[i];

			// compare() is bounds-safe for arguments shorter than two
			// characters, unlike indexing arg[1] directly.
			if (arg.compare(0, 2, "--") != 0) {
				continue;
			}

			const std::string::size_type pos = arg.find('=');
			if (pos == std::string::npos) {
				pairs[arg.substr(2)] = "";
			} else {
				pairs[arg.substr(2, pos - 2)] = arg.substr(pos + 1);
			}
		}
	}

	// Returns true when "--arg_name" (with or without a value) was supplied.
	bool CheckCmdLineFlag(const char* arg_name)
	{
		return pairs.find(arg_name) != pairs.end();
	}

	// Parses the value of "--arg_name" into val; val is left untouched when
	// the option is absent (the char* specialization sets it to NULL instead).
	template <typename T>
	void GetCmdLineArgument(const char *arg_name, T &val);

	// Number of distinct "--" options that were recognised.
	int ParsedArgc()
	{
		return static_cast<int>(pairs.size());
	}
};
|
||||
|
||||
template <typename T>
|
||||
void CommandLineArgs::GetCmdLineArgument(const char *arg_name, T &val)
|
||||
{
|
||||
using namespace std;
|
||||
map<string, string>::iterator itr;
|
||||
if ((itr = pairs.find(arg_name)) != pairs.end()) {
|
||||
istringstream strstream(itr->second);
|
||||
strstream >> val;
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
void CommandLineArgs::GetCmdLineArgument<char*>(const char* arg_name, char* &val)
|
||||
{
|
||||
using namespace std;
|
||||
map<string, string>::iterator itr;
|
||||
if ((itr = pairs.find(arg_name)) != pairs.end()) {
|
||||
|
||||
string s = itr->second;
|
||||
val = (char*) malloc(sizeof(char) * (s.length() + 1));
|
||||
std::strcpy(val, s.c_str());
|
||||
|
||||
} else {
|
||||
val = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
#endif //COMMAND_LINE_ARGS_H
|
||||
181
opencl/parallel_primitives/host/btAlignedAllocator.cpp
Normal file
181
opencl/parallel_primitives/host/btAlignedAllocator.cpp
Normal file
@@ -0,0 +1,181 @@
|
||||
/*
|
||||
Bullet Continuous Collision Detection and Physics Library
|
||||
Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
#include "btAlignedAllocator.h"
|
||||
|
||||
int gNumAlignedAllocs = 0;
|
||||
int gNumAlignedFree = 0;
|
||||
int gTotalBytesAlignedAllocs = 0;//detect memory leaks
|
||||
|
||||
// Default unaligned allocation hook: forwards the request to malloc.
static void *btAllocDefault(size_t size)
{
	void *block = malloc(size);
	return block;
}
|
||||
|
||||
// Default unaligned release hook: forwards the pointer to free.
static void btFreeDefault(void *ptr)
{
	void *const block = ptr;
	free(block);
}
|
||||
|
||||
static btAllocFunc *sAllocFunc = btAllocDefault;
|
||||
static btFreeFunc *sFreeFunc = btFreeDefault;
|
||||
|
||||
|
||||
|
||||
#if defined (BT_HAS_ALIGNED_ALLOCATOR)
|
||||
#include <malloc.h>
|
||||
static void *btAlignedAllocDefault(size_t size, int alignment)
|
||||
{
|
||||
return _aligned_malloc(size, (size_t)alignment);
|
||||
}
|
||||
|
||||
static void btAlignedFreeDefault(void *ptr)
|
||||
{
|
||||
_aligned_free(ptr);
|
||||
}
|
||||
#elif defined(__CELLOS_LV2__)
|
||||
#include <stdlib.h>
|
||||
|
||||
static inline void *btAlignedAllocDefault(size_t size, int alignment)
|
||||
{
|
||||
return memalign(alignment, size);
|
||||
}
|
||||
|
||||
static inline void btAlignedFreeDefault(void *ptr)
|
||||
{
|
||||
free(ptr);
|
||||
}
|
||||
#else
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
static inline void *btAlignedAllocDefault(size_t size, int alignment)
|
||||
{
|
||||
void *ret;
|
||||
char *real;
|
||||
real = (char *)sAllocFunc(size + sizeof(void *) + (alignment-1));
|
||||
if (real) {
|
||||
ret = btAlignPointer(real + sizeof(void *),alignment);
|
||||
*((void **)(ret)-1) = (void *)(real);
|
||||
} else {
|
||||
ret = (void *)(real);
|
||||
}
|
||||
return (ret);
|
||||
}
|
||||
|
||||
static inline void btAlignedFreeDefault(void *ptr)
|
||||
{
|
||||
void* real;
|
||||
|
||||
if (ptr) {
|
||||
real = *((void **)(ptr)-1);
|
||||
sFreeFunc(real);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
static btAlignedAllocFunc *sAlignedAllocFunc = btAlignedAllocDefault;
|
||||
static btAlignedFreeFunc *sAlignedFreeFunc = btAlignedFreeDefault;
|
||||
|
||||
/// Installs custom aligned allocate/free hooks.
/// Passing 0 for either argument restores the built-in default for that hook.
void	btAlignedAllocSetCustomAligned(btAlignedAllocFunc *allocFunc, btAlignedFreeFunc *freeFunc)
{
	if (allocFunc)
	{
		sAlignedAllocFunc = allocFunc;
	}
	else
	{
		sAlignedAllocFunc = btAlignedAllocDefault;
	}

	if (freeFunc)
	{
		sAlignedFreeFunc = freeFunc;
	}
	else
	{
		sAlignedFreeFunc = btAlignedFreeDefault;
	}
}
|
||||
|
||||
/// Installs custom unaligned allocate/free hooks.
/// Passing 0 for either argument restores the malloc/free based default for that hook.
void	btAlignedAllocSetCustom(btAllocFunc *allocFunc, btFreeFunc *freeFunc)
{
	if (allocFunc)
	{
		sAllocFunc = allocFunc;
	}
	else
	{
		sAllocFunc = btAllocDefault;
	}

	if (freeFunc)
	{
		sFreeFunc = freeFunc;
	}
	else
	{
		sFreeFunc = btFreeDefault;
	}
}
|
||||
|
||||
#ifdef BT_DEBUG_MEMORY_ALLOCATIONS
|
||||
//this generic allocator provides the total allocated number of bytes
|
||||
#include <stdio.h>
|
||||
|
||||
void* btAlignedAllocInternal (size_t size, int alignment,int line,char* filename)
|
||||
{
|
||||
void *ret;
|
||||
char *real;
|
||||
|
||||
gTotalBytesAlignedAllocs += size;
|
||||
gNumAlignedAllocs++;
|
||||
|
||||
|
||||
real = (char *)sAllocFunc(size + 2*sizeof(void *) + (alignment-1));
|
||||
if (real) {
|
||||
ret = (void*) btAlignPointer(real + 2*sizeof(void *), alignment);
|
||||
*((void **)(ret)-1) = (void *)(real);
|
||||
*((int*)(ret)-2) = size;
|
||||
|
||||
} else {
|
||||
ret = (void *)(real);//??
|
||||
}
|
||||
|
||||
printf("allocation#%d at address %x, from %s,line %d, size %d\n",gNumAlignedAllocs,real, filename,line,size);
|
||||
|
||||
int* ptr = (int*)ret;
|
||||
*ptr = 12;
|
||||
return (ret);
|
||||
}
|
||||
|
||||
void btAlignedFreeInternal (void* ptr,int line,char* filename)
|
||||
{
|
||||
|
||||
void* real;
|
||||
gNumAlignedFree++;
|
||||
|
||||
if (ptr) {
|
||||
real = *((void **)(ptr)-1);
|
||||
int size = *((int*)(ptr)-2);
|
||||
gTotalBytesAlignedAllocs -= size;
|
||||
|
||||
printf("free #%d at address %x, from %s,line %d, size %d\n",gNumAlignedFree,real, filename,line,size);
|
||||
|
||||
sFreeFunc(real);
|
||||
} else
|
||||
{
|
||||
printf("NULL ptr\n");
|
||||
}
|
||||
}
|
||||
|
||||
#else //BT_DEBUG_MEMORY_ALLOCATIONS
|
||||
|
||||
void* btAlignedAllocInternal (size_t size, int alignment)
|
||||
{
|
||||
gNumAlignedAllocs++;
|
||||
void* ptr;
|
||||
ptr = sAlignedAllocFunc(size, alignment);
|
||||
// printf("btAlignedAllocInternal %d, %x\n",size,ptr);
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void btAlignedFreeInternal (void* ptr)
|
||||
{
|
||||
if (!ptr)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
gNumAlignedFree++;
|
||||
// printf("btAlignedFreeInternal %x\n",ptr);
|
||||
sAlignedFreeFunc(ptr);
|
||||
}
|
||||
|
||||
#endif //BT_DEBUG_MEMORY_ALLOCATIONS
|
||||
|
||||
107
opencl/parallel_primitives/host/btAlignedAllocator.h
Normal file
107
opencl/parallel_primitives/host/btAlignedAllocator.h
Normal file
@@ -0,0 +1,107 @@
|
||||
/*
|
||||
Bullet Continuous Collision Detection and Physics Library
|
||||
Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
#ifndef BT_ALIGNED_ALLOCATOR
|
||||
#define BT_ALIGNED_ALLOCATOR
|
||||
|
||||
///we probably replace this with our own aligned memory allocator
|
||||
///so we replace _aligned_malloc and _aligned_free with our own
|
||||
///that is better portable and more predictable
|
||||
|
||||
#include "btScalar.h"
|
||||
//#define BT_DEBUG_MEMORY_ALLOCATIONS 1
|
||||
#ifdef BT_DEBUG_MEMORY_ALLOCATIONS
|
||||
|
||||
#define btAlignedAlloc(a,b) \
|
||||
btAlignedAllocInternal(a,b,__LINE__,__FILE__)
|
||||
|
||||
#define btAlignedFree(ptr) \
|
||||
btAlignedFreeInternal(ptr,__LINE__,__FILE__)
|
||||
|
||||
void* btAlignedAllocInternal (size_t size, int alignment,int line,char* filename);
|
||||
|
||||
void btAlignedFreeInternal (void* ptr,int line,char* filename);
|
||||
|
||||
#else
|
||||
void* btAlignedAllocInternal (size_t size, int alignment);
|
||||
void btAlignedFreeInternal (void* ptr);
|
||||
|
||||
#define btAlignedAlloc(size,alignment) btAlignedAllocInternal(size,alignment)
|
||||
#define btAlignedFree(ptr) btAlignedFreeInternal(ptr)
|
||||
|
||||
#endif
|
||||
typedef int size_type;
|
||||
|
||||
typedef void *(btAlignedAllocFunc)(size_t size, int alignment);
|
||||
typedef void (btAlignedFreeFunc)(void *memblock);
|
||||
typedef void *(btAllocFunc)(size_t size);
|
||||
typedef void (btFreeFunc)(void *memblock);
|
||||
|
||||
///The developer can let all Bullet memory allocations go through a custom memory allocator, using btAlignedAllocSetCustom
|
||||
void btAlignedAllocSetCustom(btAllocFunc *allocFunc, btFreeFunc *freeFunc);
|
||||
///If the developer already has a custom aligned allocator, then btAlignedAllocSetCustomAligned can be used. The default aligned allocator pre-allocates extra memory using the non-aligned allocator, and instruments it.
|
||||
void btAlignedAllocSetCustomAligned(btAlignedAllocFunc *allocFunc, btAlignedFreeFunc *freeFunc);
|
||||
|
||||
|
||||
///The btAlignedAllocator is a portable class for aligned memory allocations.
|
||||
///Default implementations for unaligned and aligned allocations can be overridden by a custom allocator using btAlignedAllocSetCustom and btAlignedAllocSetCustomAligned.
|
||||
template < typename T , unsigned Alignment >
|
||||
class btAlignedAllocator {
|
||||
|
||||
typedef btAlignedAllocator< T , Alignment > self_type;
|
||||
|
||||
public:
|
||||
|
||||
//just going down a list:
|
||||
btAlignedAllocator() {}
|
||||
/*
|
||||
btAlignedAllocator( const self_type & ) {}
|
||||
*/
|
||||
|
||||
template < typename Other >
|
||||
btAlignedAllocator( const btAlignedAllocator< Other , Alignment > & ) {}
|
||||
|
||||
typedef const T* const_pointer;
|
||||
typedef const T& const_reference;
|
||||
typedef T* pointer;
|
||||
typedef T& reference;
|
||||
typedef T value_type;
|
||||
|
||||
pointer address ( reference ref ) const { return &ref; }
|
||||
const_pointer address ( const_reference ref ) const { return &ref; }
|
||||
pointer allocate ( size_type n , const_pointer * hint = 0 ) {
|
||||
(void)hint;
|
||||
return reinterpret_cast< pointer >(btAlignedAlloc( sizeof(value_type) * n , Alignment ));
|
||||
}
|
||||
void construct ( pointer ptr , const value_type & value ) { new (ptr) value_type( value ); }
|
||||
void deallocate( pointer ptr ) {
|
||||
btAlignedFree( reinterpret_cast< void * >( ptr ) );
|
||||
}
|
||||
void destroy ( pointer ptr ) { ptr->~value_type(); }
|
||||
|
||||
|
||||
template < typename O > struct rebind {
|
||||
typedef btAlignedAllocator< O , Alignment > other;
|
||||
};
|
||||
template < typename O >
|
||||
self_type & operator=( const btAlignedAllocator< O , Alignment > & ) { return *this; }
|
||||
|
||||
friend bool operator==( const self_type & , const self_type & ) { return true; }
|
||||
};
|
||||
|
||||
|
||||
|
||||
#endif //BT_ALIGNED_ALLOCATOR
|
||||
|
||||
511
opencl/parallel_primitives/host/btAlignedObjectArray.h
Normal file
511
opencl/parallel_primitives/host/btAlignedObjectArray.h
Normal file
@@ -0,0 +1,511 @@
|
||||
/*
|
||||
Bullet Continuous Collision Detection and Physics Library
|
||||
Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef BT_OBJECT_ARRAY__
|
||||
#define BT_OBJECT_ARRAY__
|
||||
|
||||
#include "btScalar.h" // has definitions like SIMD_FORCE_INLINE
|
||||
#include "btAlignedAllocator.h"
|
||||
|
||||
///If the platform doesn't support placement new, you can disable BT_USE_PLACEMENT_NEW
|
||||
///then the btAlignedObjectArray doesn't support objects with virtual methods, and non-trivial constructors/destructors
|
||||
///You can enable BT_USE_MEMCPY, then swapping elements in the array will use memcpy instead of operator=
|
||||
///see discussion here: http://continuousphysics.com/Bullet/phpBB2/viewtopic.php?t=1231 and
|
||||
///http://www.continuousphysics.com/Bullet/phpBB2/viewtopic.php?t=1240
|
||||
|
||||
#define BT_USE_PLACEMENT_NEW 1
|
||||
//#define BT_USE_MEMCPY 1 //disable, because it is cumbersome to find out for each platform where memcpy is defined. It can be in <memory.h> or <string.h> or otherwise...
|
||||
#define BT_ALLOW_ARRAY_COPY_OPERATOR // enabling this can accidently perform deep copies of data if you are not careful
|
||||
|
||||
#ifdef BT_USE_MEMCPY
|
||||
#include <memory.h>
|
||||
#include <string.h>
|
||||
#endif //BT_USE_MEMCPY
|
||||
|
||||
#ifdef BT_USE_PLACEMENT_NEW
|
||||
#include <new> //for placement new
|
||||
#endif //BT_USE_PLACEMENT_NEW
|
||||
|
||||
|
||||
///The btAlignedObjectArray template class uses a subset of the stl::vector interface for its methods
|
||||
///It is developed to replace stl::vector to avoid portability issues, including STL alignment issues to add SIMD/SSE data
|
||||
template <typename T>
|
||||
//template <class T>
|
||||
class btAlignedObjectArray
|
||||
{
|
||||
btAlignedAllocator<T , 16> m_allocator;
|
||||
|
||||
int m_size;
|
||||
int m_capacity;
|
||||
T* m_data;
|
||||
//PCK: added this line
|
||||
bool m_ownsMemory;
|
||||
|
||||
#ifdef BT_ALLOW_ARRAY_COPY_OPERATOR
|
||||
public:
|
||||
SIMD_FORCE_INLINE btAlignedObjectArray<T>& operator=(const btAlignedObjectArray<T> &other)
|
||||
{
|
||||
copyFromArray(other);
|
||||
return *this;
|
||||
}
|
||||
#else//BT_ALLOW_ARRAY_COPY_OPERATOR
|
||||
private:
|
||||
SIMD_FORCE_INLINE btAlignedObjectArray<T>& operator=(const btAlignedObjectArray<T> &other);
|
||||
#endif//BT_ALLOW_ARRAY_COPY_OPERATOR
|
||||
|
||||
protected:
|
||||
SIMD_FORCE_INLINE int allocSize(int size)
|
||||
{
|
||||
return (size ? size*2 : 1);
|
||||
}
|
||||
SIMD_FORCE_INLINE void copy(int start,int end, T* dest) const
|
||||
{
|
||||
int i;
|
||||
for (i=start;i<end;++i)
|
||||
#ifdef BT_USE_PLACEMENT_NEW
|
||||
new (&dest[i]) T(m_data[i]);
|
||||
#else
|
||||
dest[i] = m_data[i];
|
||||
#endif //BT_USE_PLACEMENT_NEW
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE void init()
|
||||
{
|
||||
//PCK: added this line
|
||||
m_ownsMemory = true;
|
||||
m_data = 0;
|
||||
m_size = 0;
|
||||
m_capacity = 0;
|
||||
}
|
||||
SIMD_FORCE_INLINE void destroy(int first,int last)
|
||||
{
|
||||
int i;
|
||||
for (i=first; i<last;i++)
|
||||
{
|
||||
m_data[i].~T();
|
||||
}
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE void* allocate(int size)
|
||||
{
|
||||
if (size)
|
||||
return m_allocator.allocate(size);
|
||||
return 0;
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE void deallocate()
|
||||
{
|
||||
if(m_data) {
|
||||
//PCK: enclosed the deallocation in this block
|
||||
if (m_ownsMemory)
|
||||
{
|
||||
m_allocator.deallocate(m_data);
|
||||
}
|
||||
m_data = 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
public:
|
||||
|
||||
btAlignedObjectArray()
|
||||
{
|
||||
init();
|
||||
}
|
||||
|
||||
~btAlignedObjectArray()
|
||||
{
|
||||
clear();
|
||||
}
|
||||
|
||||
///Generally it is best to avoid using the copy constructor of an btAlignedObjectArray, and use a (const) reference to the array instead.
|
||||
btAlignedObjectArray(const btAlignedObjectArray& otherArray)
|
||||
{
|
||||
init();
|
||||
|
||||
int otherSize = otherArray.size();
|
||||
resize (otherSize);
|
||||
otherArray.copy(0, otherSize, m_data);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// return the number of elements in the array
|
||||
SIMD_FORCE_INLINE int size() const
|
||||
{
|
||||
return m_size;
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE const T& at(int n) const
|
||||
{
|
||||
btAssert(n>=0);
|
||||
btAssert(n<size());
|
||||
return m_data[n];
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE T& at(int n)
|
||||
{
|
||||
btAssert(n>=0);
|
||||
btAssert(n<size());
|
||||
return m_data[n];
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE const T& operator[](int n) const
|
||||
{
|
||||
btAssert(n>=0);
|
||||
btAssert(n<size());
|
||||
return m_data[n];
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE T& operator[](int n)
|
||||
{
|
||||
btAssert(n>=0);
|
||||
btAssert(n<size());
|
||||
return m_data[n];
|
||||
}
|
||||
|
||||
|
||||
///clear the array and deallocate its memory. Generally it is better to use array.resize(0), to reduce the performance overhead of run-time memory (de)allocations.
|
||||
SIMD_FORCE_INLINE void clear()
|
||||
{
|
||||
destroy(0,size());
|
||||
|
||||
deallocate();
|
||||
|
||||
init();
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE void pop_back()
|
||||
{
|
||||
btAssert(m_size>0);
|
||||
m_size--;
|
||||
m_data[m_size].~T();
|
||||
}
|
||||
|
||||
|
||||
///resize changes the number of elements in the array. If the new size is larger, the new elements will be constructed using the optional second argument.
|
||||
///when the new number of elements is smaller, the destructor will be called, but memory will not be freed, to reduce performance overhead of run-time memory (de)allocations.
|
||||
SIMD_FORCE_INLINE	void	resizeNoInitialize(int newsize)
{
	// Only growth needs storage. Shrinking keeps the buffer and, unlike
	// resize(), runs no destructors.
	if (newsize > size())
	{
		reserve(newsize);
	}

	// Elements in [old size, newsize) are deliberately left unconstructed.
	m_size = newsize;
}
|
||||
|
||||
SIMD_FORCE_INLINE void resize(int newsize, const T& fillData=T())
|
||||
{
|
||||
int curSize = size();
|
||||
|
||||
if (newsize < curSize)
|
||||
{
|
||||
for(int i = newsize; i < curSize; i++)
|
||||
{
|
||||
m_data[i].~T();
|
||||
}
|
||||
} else
|
||||
{
|
||||
if (newsize > size())
|
||||
{
|
||||
reserve(newsize);
|
||||
}
|
||||
#ifdef BT_USE_PLACEMENT_NEW
|
||||
for (int i=curSize;i<newsize;i++)
|
||||
{
|
||||
new ( &m_data[i]) T(fillData);
|
||||
}
|
||||
#endif //BT_USE_PLACEMENT_NEW
|
||||
|
||||
}
|
||||
|
||||
m_size = newsize;
|
||||
}
|
||||
SIMD_FORCE_INLINE T& expandNonInitializing( )
|
||||
{
|
||||
int sz = size();
|
||||
if( sz == capacity() )
|
||||
{
|
||||
reserve( allocSize(size()) );
|
||||
}
|
||||
m_size++;
|
||||
|
||||
return m_data[sz];
|
||||
}
|
||||
|
||||
|
||||
SIMD_FORCE_INLINE T& expand( const T& fillValue=T())
|
||||
{
|
||||
int sz = size();
|
||||
if( sz == capacity() )
|
||||
{
|
||||
reserve( allocSize(size()) );
|
||||
}
|
||||
m_size++;
|
||||
#ifdef BT_USE_PLACEMENT_NEW
|
||||
new (&m_data[sz]) T(fillValue); //use the in-place new (not really allocating heap memory)
|
||||
#endif
|
||||
|
||||
return m_data[sz];
|
||||
}
|
||||
|
||||
|
||||
SIMD_FORCE_INLINE void push_back(const T& _Val)
|
||||
{
|
||||
int sz = size();
|
||||
if( sz == capacity() )
|
||||
{
|
||||
reserve( allocSize(size()) );
|
||||
}
|
||||
|
||||
#ifdef BT_USE_PLACEMENT_NEW
|
||||
new ( &m_data[m_size] ) T(_Val);
|
||||
#else
|
||||
m_data[size()] = _Val;
|
||||
#endif //BT_USE_PLACEMENT_NEW
|
||||
|
||||
m_size++;
|
||||
}
|
||||
|
||||
|
||||
/// return the pre-allocated (reserved) elements, this is at least as large as the total number of elements,see size() and reserve()
|
||||
SIMD_FORCE_INLINE int capacity() const
|
||||
{
|
||||
return m_capacity;
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE void reserve(int _Count)
|
||||
{ // determine new minimum length of allocated storage
|
||||
if (capacity() < _Count)
|
||||
{ // not enough room, reallocate
|
||||
T* s = (T*)allocate(_Count);
|
||||
|
||||
copy(0, size(), s);
|
||||
|
||||
destroy(0,size());
|
||||
|
||||
deallocate();
|
||||
|
||||
//PCK: added this line
|
||||
m_ownsMemory = true;
|
||||
|
||||
m_data = s;
|
||||
|
||||
m_capacity = _Count;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
class less
|
||||
{
|
||||
public:
|
||||
|
||||
bool operator() ( const T& a, const T& b )
|
||||
{
|
||||
return ( a < b );
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template <typename L>
|
||||
void quickSortInternal(const L& CompareFunc,int lo, int hi)
|
||||
{
|
||||
// lo is the lower index, hi is the upper index
|
||||
// of the region of array a that is to be sorted
|
||||
int i=lo, j=hi;
|
||||
T x=m_data[(lo+hi)/2];
|
||||
|
||||
// partition
|
||||
do
|
||||
{
|
||||
while (CompareFunc(m_data[i],x))
|
||||
i++;
|
||||
while (CompareFunc(x,m_data[j]))
|
||||
j--;
|
||||
if (i<=j)
|
||||
{
|
||||
swap(i,j);
|
||||
i++; j--;
|
||||
}
|
||||
} while (i<=j);
|
||||
|
||||
// recursion
|
||||
if (lo<j)
|
||||
quickSortInternal( CompareFunc, lo, j);
|
||||
if (i<hi)
|
||||
quickSortInternal( CompareFunc, i, hi);
|
||||
}
|
||||
|
||||
|
||||
template <typename L>
|
||||
void quickSort(const L& CompareFunc)
|
||||
{
|
||||
//don't sort 0 or 1 elements
|
||||
if (size()>1)
|
||||
{
|
||||
quickSortInternal(CompareFunc,0,size()-1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
///heap sort from http://www.csse.monash.edu.au/~lloyd/tildeAlgDS/Sort/Heap/
|
||||
template <typename L>
|
||||
void downHeap(T *pArr, int k, int n, const L& CompareFunc)
|
||||
{
|
||||
/* PRE: a[k+1..N] is a heap */
|
||||
/* POST: a[k..N] is a heap */
|
||||
|
||||
T temp = pArr[k - 1];
|
||||
/* k has child(s) */
|
||||
while (k <= n/2)
|
||||
{
|
||||
int child = 2*k;
|
||||
|
||||
if ((child < n) && CompareFunc(pArr[child - 1] , pArr[child]))
|
||||
{
|
||||
child++;
|
||||
}
|
||||
/* pick larger child */
|
||||
if (CompareFunc(temp , pArr[child - 1]))
|
||||
{
|
||||
/* move child up */
|
||||
pArr[k - 1] = pArr[child - 1];
|
||||
k = child;
|
||||
}
|
||||
else
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
pArr[k - 1] = temp;
|
||||
} /*downHeap*/
|
||||
|
||||
void swap(int index0,int index1)
|
||||
{
|
||||
#ifdef BT_USE_MEMCPY
|
||||
char temp[sizeof(T)];
|
||||
memcpy(temp,&m_data[index0],sizeof(T));
|
||||
memcpy(&m_data[index0],&m_data[index1],sizeof(T));
|
||||
memcpy(&m_data[index1],temp,sizeof(T));
|
||||
#else
|
||||
T temp = m_data[index0];
|
||||
m_data[index0] = m_data[index1];
|
||||
m_data[index1] = temp;
|
||||
#endif //BT_USE_PLACEMENT_NEW
|
||||
|
||||
}
|
||||
|
||||
template <typename L>
|
||||
void heapSort(const L& CompareFunc)
|
||||
{
|
||||
/* sort a[0..N-1], N.B. 0 to N-1 */
|
||||
int k;
|
||||
int n = m_size;
|
||||
for (k = n/2; k > 0; k--)
|
||||
{
|
||||
downHeap(m_data, k, n, CompareFunc);
|
||||
}
|
||||
|
||||
/* a[1..N] is now a heap */
|
||||
while ( n>=1 )
|
||||
{
|
||||
swap(0,n-1); /* largest of a[0..n-1] */
|
||||
|
||||
|
||||
n = n - 1;
|
||||
/* restore a[1..i-1] heap */
|
||||
downHeap(m_data, 1, n, CompareFunc);
|
||||
}
|
||||
}
|
||||
|
||||
///non-recursive binary search, assumes sorted array
|
||||
int findBinarySearch(const T& key) const
|
||||
{
|
||||
int first = 0;
|
||||
int last = size()-1;
|
||||
|
||||
//assume sorted array
|
||||
while (first <= last) {
|
||||
int mid = (first + last) / 2; // compute mid point.
|
||||
if (key > m_data[mid])
|
||||
first = mid + 1; // repeat search in top half.
|
||||
else if (key < m_data[mid])
|
||||
last = mid - 1; // repeat search in bottom half.
|
||||
else
|
||||
return mid; // found it. return position /////
|
||||
}
|
||||
return size(); // failed to find key
|
||||
}
|
||||
|
||||
|
||||
int findLinearSearch(const T& key) const
|
||||
{
|
||||
int index=size();
|
||||
int i;
|
||||
|
||||
for (i=0;i<size();i++)
|
||||
{
|
||||
if (m_data[i] == key)
|
||||
{
|
||||
index = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return index;
|
||||
}
|
||||
|
||||
void remove(const T& key)
|
||||
{
|
||||
|
||||
int findIndex = findLinearSearch(key);
|
||||
if (findIndex<size())
|
||||
{
|
||||
swap( findIndex,size()-1);
|
||||
pop_back();
|
||||
}
|
||||
}
|
||||
|
||||
//PCK: whole function
|
||||
void initializeFromBuffer(void *buffer, int size, int capacity)
|
||||
{
|
||||
clear();
|
||||
m_ownsMemory = false;
|
||||
m_data = (T*)buffer;
|
||||
m_size = size;
|
||||
m_capacity = capacity;
|
||||
}
|
||||
|
||||
void copyFromArray(const btAlignedObjectArray& otherArray)
|
||||
{
|
||||
int otherSize = otherArray.size();
|
||||
resize (otherSize);
|
||||
otherArray.copy(0, otherSize, m_data);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
#endif //BT_OBJECT_ARRAY__
|
||||
213
opencl/parallel_primitives/host/btBoundSearchCL.cpp
Normal file
213
opencl/parallel_primitives/host/btBoundSearchCL.cpp
Normal file
@@ -0,0 +1,213 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
//Host-code rewritten by Erwin Coumans
|
||||
|
||||
#define BOUNDSEARCH_PATH "opencl/parallel_primitives/kernels/BoundSearchKernels.cl"
|
||||
#define KERNEL0 "SearchSortDataLowerKernel"
|
||||
#define KERNEL1 "SearchSortDataUpperKernel"
|
||||
#define KERNEL2 "SubtractKernel"
|
||||
|
||||
|
||||
#include "btBoundSearchCL.h"
|
||||
#include "../../basic_initialize/btOpenCLUtils.h"
|
||||
#include "btLauncherCL.h"
|
||||
#include "../kernels/BoundSearchKernelsCL.h"
|
||||
|
||||
/// Compiles the bound-search program, creates the lower/upper bound kernels
/// and the fill helper. When maxSize is non-zero it additionally creates the
/// subtract kernel and two scratch buffers of maxSize elements, which
/// execute() uses for the COUNT option.
btBoundSearchCL::btBoundSearchCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int maxSize)
	:m_context(ctx),
	m_device(device),
	m_queue(queue),
	m_constbtOpenCLArray(0)	// never allocated in this file; keep it from holding an indeterminate value
{
	const char* additionalMacros = "";
	const char* kernelSource = boundSearchKernelsCL;
	cl_int pErrNum;

	// NOTE(review): boundSearchProg is not released in this file -- confirm whether
	// btOpenCLUtils takes ownership or whether a clReleaseProgram is missing.
	cl_program boundSearchProg = btOpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, BOUNDSEARCH_PATH);
	btAssert(boundSearchProg);

	m_lowerSortDataKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SearchSortDataLowerKernel", &pErrNum, boundSearchProg,additionalMacros );
	btAssert(m_lowerSortDataKernel );

	m_upperSortDataKernel= btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SearchSortDataUpperKernel", &pErrNum, boundSearchProg,additionalMacros );
	btAssert(m_upperSortDataKernel);

	// The subtract kernel and the scratch buffers are only needed for COUNT.
	m_subtractKernel = 0;
	if( maxSize )
	{
		m_subtractKernel= btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SubtractKernel", &pErrNum, boundSearchProg,additionalMacros );
		btAssert(m_subtractKernel);
	}

	m_lower = (maxSize == 0)? 0: new btOpenCLArray<unsigned int>(ctx,queue,maxSize );
	m_upper = (maxSize == 0)? 0: new btOpenCLArray<unsigned int>(ctx,queue, maxSize );

	m_filler = new btFillCL(ctx,device,queue);
}
|
||||
|
||||
/// Frees the scratch buffers and the fill helper, then releases the kernels
/// created by the constructor.
btBoundSearchCL::~btBoundSearchCL()
{
	delete m_lower;
	delete m_upper;
	delete m_filler;

	// m_subtractKernel stays 0 when the object was constructed with
	// maxSize == 0; a null handle is not a valid argument for clReleaseKernel.
	if (m_lowerSortDataKernel)
	{
		clReleaseKernel(m_lowerSortDataKernel);
	}
	if (m_upperSortDataKernel)
	{
		clReleaseKernel(m_upperSortDataKernel);
	}
	if (m_subtractKernel)
	{
		clReleaseKernel(m_subtractKernel);
	}
}
|
||||
|
||||
|
||||
/// Launches the bound-search kernels on the command queue.
///  - BOUND_LOWER / BOUND_UPPER: runs the matching kernel over nSrc work items,
///    reading src and writing dst.
///  - COUNT: zero-fills the two scratch buffers, computes both bounds into them
///    and then runs the subtract kernel over nDst work items into dst. This
///    needs the object to have been constructed with maxSize >= nDst.
/// src has to satisfy src[i].m_key <= src[i+1].m_key.
void btBoundSearchCL::execute(btOpenCLArray<btSortData>& src, int nSrc, btOpenCLArray<unsigned int>& dst, int nDst, Option option )
{
	if( option == BOUND_LOWER || option == BOUND_UPPER )
	{
		// Both bound kernels take the same arguments; only the kernel differs.
		cl_kernel boundKernel = (option == BOUND_LOWER)? m_lowerSortDataKernel: m_upperSortDataKernel;

		btBufferInfoCL bInfo[] = { btBufferInfoCL( src.getBufferCL(), true ), btBufferInfoCL( dst.getBufferCL()) };

		btLauncherCL launcher( m_queue, boundKernel );
		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
		launcher.setConst( nSrc );
		launcher.setConst( nDst );

		launcher.launch1D( nSrc, 64 );
	}
	else if( option == COUNT )
	{
		btAssert( m_lower );
		btAssert( m_upper );
		// nDst elements are filled and written below, so the scratch buffers
		// must be at least that large (the previous check was inverted).
		btAssert( m_lower->capacity() >= (int)nDst );
		btAssert( m_upper->capacity() >= (int)nDst );

		int zero = 0;
		m_filler->execute( *m_lower, zero, nDst );
		m_filler->execute( *m_upper, zero, nDst );

		execute( src, nSrc, *m_lower, nDst, BOUND_LOWER );
		execute( src, nSrc, *m_upper, nDst, BOUND_UPPER );

		{
			btBufferInfoCL bInfo[] = { btBufferInfoCL( m_upper->getBufferCL(), true ), btBufferInfoCL( m_lower->getBufferCL(), true ), btBufferInfoCL( dst.getBufferCL() ) };

			btLauncherCL launcher( m_queue, m_subtractKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
			launcher.setConst( nSrc );
			launcher.setConst( nDst );

			launcher.launch1D( nDst, 64 );
		}
	}
	else
	{
		btAssert( 0 );
	}
}
|
||||
|
||||
|
||||
/// Host (CPU) implementation of the bound search.
/// src[0..nSrc) must be sorted by m_key; keys are used as indices into dst.
///  - BOUND_LOWER: dst[key] = index of the first element carrying that key
///  - BOUND_UPPER: dst[key] = one past the index of the last element carrying that key
///  - COUNT:       dst[key] = upper bound - lower bound, for every key in [0, nDst)
/// For the two bound options, entries of dst whose key does not occur in src are left untouched.
void btBoundSearchCL::executeHost( btAlignedObjectArray<btSortData>& src, int nSrc, 
								  btAlignedObjectArray<unsigned int>& dst, int nDst, Option option )
{
	// precondition: keys are in ascending order
	for(int i=0; i<nSrc-1; i++)
	{
		btAssert( src[i].m_key <= src[i+1].m_key );
	}

	// sentinels standing in for the elements before the first and after the last entry
	btSortData minData,maxData;
	minData.m_key = -1;
	minData.m_value = -1;
	maxData.m_key = nDst;
	maxData.m_value = nDst;

	if( option == BOUND_LOWER )
	{
		for(int i=0; i<nSrc; i++)
		{
			btSortData& iData = (i==0)? minData: src[i-1];
			btSortData& jData = src[i];

			// a key change between i-1 and i means i is the first element of the new key
			if( iData.m_key != jData.m_key )
			{
				int k = jData.m_key;
				dst[k] = i;
			}
		}
	}
	else if( option == BOUND_UPPER )
	{
		for(int i=1; i<nSrc+1; i++)
		{
			btSortData& iData = src[i-1];
			btSortData& jData = (i==nSrc)? maxData: src[i];

			// a key change between i-1 and i means i is one past the last element of the old key
			if( iData.m_key != jData.m_key )
			{
				int k = iData.m_key;
				dst[k] = i;
			}
		}
	}
	else if( option == COUNT )
	{
		btAlignedObjectArray<unsigned int> lower;
		lower.resize(nDst );
		btAlignedObjectArray<unsigned int> upper;
		upper.resize(nDst );

		// keys that never occur must end up with a count of 0
		for(int i=0; i<nDst; i++)
		{
			lower[i] = upper[i] = 0;
		}

		executeHost( src, nSrc, lower, nDst, BOUND_LOWER );
		executeHost( src, nSrc, upper, nDst, BOUND_UPPER );

		for( int i=0; i<nDst; i++)
		{
			dst[i] = upper[i] - lower[i];
		}
	}
	else
	{
		btAssert( 0 );
	}
}
|
||||
67
opencl/parallel_primitives/host/btBoundSearchCL.h
Normal file
67
opencl/parallel_primitives/host/btBoundSearchCL.h
Normal file
@@ -0,0 +1,67 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
#ifndef BT_BOUNDSEARCH_H
|
||||
#define BT_BOUNDSEARCH_H
|
||||
|
||||
#pragma once
|
||||
|
||||
/*#include <Adl/Adl.h>
|
||||
#include <AdlPrimitives/Math/Math.h>
|
||||
#include <AdlPrimitives/Sort/SortData.h>
|
||||
#include <AdlPrimitives/Fill/Fill.h>
|
||||
*/
|
||||
|
||||
#include "btOpenCLArray.h"
|
||||
#include "btFillCL.h"
|
||||
#include "btRadixSort32CL.h" //for btSortData (perhaps move it?)
|
||||
class btBoundSearchCL
|
||||
{
|
||||
public:
|
||||
|
||||
enum Option
|
||||
{
|
||||
BOUND_LOWER,
|
||||
BOUND_UPPER,
|
||||
COUNT,
|
||||
};
|
||||
|
||||
cl_context m_context;
|
||||
cl_device_id m_device;
|
||||
cl_command_queue m_queue;
|
||||
|
||||
|
||||
cl_kernel m_lowerSortDataKernel;
|
||||
cl_kernel m_upperSortDataKernel;
|
||||
cl_kernel m_subtractKernel;
|
||||
|
||||
btOpenCLArray<btInt4>* m_constbtOpenCLArray;
|
||||
btOpenCLArray<unsigned int>* m_lower;
|
||||
btOpenCLArray<unsigned int>* m_upper;
|
||||
|
||||
btFillCL* m_filler;
|
||||
|
||||
btBoundSearchCL(cl_context context, cl_device_id device, cl_command_queue queue, int size);
|
||||
|
||||
virtual ~btBoundSearchCL();
|
||||
|
||||
// src has to be src[i].m_key <= src[i+1].m_key
|
||||
void execute( btOpenCLArray<btSortData>& src, int nSrc, btOpenCLArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER );
|
||||
|
||||
void executeHost( btAlignedObjectArray<btSortData>& src, int nSrc, btAlignedObjectArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER);
|
||||
};
|
||||
|
||||
|
||||
#endif //BT_BOUNDSEARCH_H
|
||||
19
opencl/parallel_primitives/host/btBufferInfoCL.h
Normal file
19
opencl/parallel_primitives/host/btBufferInfoCL.h
Normal file
@@ -0,0 +1,19 @@
|
||||
|
||||
#ifndef BT_BUFFER_INFO_CL_H
|
||||
#define BT_BUFFER_INFO_CL_H
|
||||
|
||||
#include "btOpenCLArray.h"
|
||||
|
||||
|
||||
struct btBufferInfoCL
|
||||
{
|
||||
//btBufferInfoCL(){}
|
||||
|
||||
// template<typename T>
|
||||
btBufferInfoCL(cl_mem buff, bool isReadOnly = false): m_clBuffer(buff), m_isReadOnly(isReadOnly){}
|
||||
|
||||
cl_mem m_clBuffer;
|
||||
bool m_isReadOnly;
|
||||
};
|
||||
|
||||
#endif //BT_BUFFER_INFO_CL_H
|
||||
126
opencl/parallel_primitives/host/btFillCL.cpp
Normal file
126
opencl/parallel_primitives/host/btFillCL.cpp
Normal file
@@ -0,0 +1,126 @@
|
||||
#include "btFillCL.h"
|
||||
#include "../../basic_initialize/btOpenCLUtils.h"
|
||||
#include "btBufferInfoCL.h"
|
||||
#include "btLauncherCL.h"
|
||||
|
||||
#define FILL_CL_PROGRAM_PATH "opencl/parallel_primitives/kernels/FillKernels.cl"
|
||||
|
||||
#include "../kernels/FillKernelsCL.h"
|
||||
|
||||
/// Compiles the fill program from the embedded kernel source and creates the
/// four fill kernels (int, unsigned int, float, int2).
///
/// @param ctx     OpenCL context used for compilation
/// @param device  device the program is built for
/// @param queue   command queue stored for later kernel launches
btFillCL::btFillCL(cl_context ctx, cl_device_id device, cl_command_queue queue)
:m_commandQueue(queue)
{
	const char* kernelSource = fillKernelsCL;
	cl_int pErrNum = 0;
	const char* additionalMacros = "";

	// NOTE(review): fillProg is never released in this constructor; confirm
	// whether the btOpenCLUtils helpers take ownership of it.
	cl_program fillProg = btOpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, FILL_CL_PROGRAM_PATH);
	btAssert(fillProg);

	m_fillIntKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillIntKernel", &pErrNum, fillProg,additionalMacros );
	btAssert(m_fillIntKernel);

	m_fillUnsignedIntKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillUnsignedIntKernel", &pErrNum, fillProg,additionalMacros );
	// Check the kernel that was just created (previously this re-checked m_fillIntKernel).
	btAssert(m_fillUnsignedIntKernel);

	m_fillFloatKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillFloatKernel", &pErrNum, fillProg,additionalMacros );
	btAssert(m_fillFloatKernel);

	m_fillKernelInt2 = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillInt2Kernel", &pErrNum, fillProg,additionalMacros );
	btAssert(m_fillKernelInt2);
}
|
||||
|
||||
/// Releases the four kernels created by the constructor.
btFillCL::~btFillCL()
{
	// Same release order as before: int2, int, unsigned int, float.
	cl_kernel ownedKernels[] =
	{
		m_fillKernelInt2,
		m_fillIntKernel,
		m_fillUnsignedIntKernel,
		m_fillFloatKernel
	};
	const int numKernels = sizeof(ownedKernels)/sizeof(ownedKernels[0]);

	for (int k = 0; k < numKernels; ++k)
	{
		clReleaseKernel(ownedKernels[k]);
	}
}
|
||||
|
||||
/// Launches the float fill kernel on @p src with kernel arguments
/// (buffer, n, value, offset) and one work item per element.
/// Asserts n > 0 in builds where btAssert is active.
void btFillCL::execute(btOpenCLArray<float>& src, const float value, int n, int offset)
{
	btAssert( n>0 );

	btLauncherCL fillLauncher( m_commandQueue, m_fillFloatKernel );

	// Arguments are bound in the order they are set.
	fillLauncher.setBuffer( src.getBufferCL() );
	fillLauncher.setConst( n );
	fillLauncher.setConst( value );
	fillLauncher.setConst( offset );

	fillLauncher.launch1D( n );
}
|
||||
|
||||
/// Launches the int fill kernel on @p src with kernel arguments
/// (buffer, n, value, offset) and one work item per element.
/// Asserts n > 0 in builds where btAssert is active.
void btFillCL::execute(btOpenCLArray<int>& src, const int value, int n, int offset)
{
	btAssert( n>0 );

	btLauncherCL fillLauncher( m_commandQueue, m_fillIntKernel );

	// Arguments are bound in the order they are set.
	fillLauncher.setBuffer( src.getBufferCL() );
	fillLauncher.setConst( n );
	fillLauncher.setConst( value );
	fillLauncher.setConst( offset );

	fillLauncher.launch1D( n );
}
|
||||
|
||||
|
||||
/// Launches the unsigned-int fill kernel on @p src with kernel arguments
/// (buffer, n, value, offset) and one work item per element.
/// Asserts n > 0 in builds where btAssert is active.
void btFillCL::execute(btOpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset)
{
	btAssert( n>0 );

	// Single-entry descriptor table for the destination buffer.
	btBufferInfoCL buffers[] = { btBufferInfoCL( src.getBufferCL() ) };
	const int numBuffers = sizeof(buffers)/sizeof(btBufferInfoCL);

	btLauncherCL fillLauncher( m_commandQueue, m_fillUnsignedIntKernel );
	fillLauncher.setBuffers( buffers, numBuffers );
	fillLauncher.setConst( n );
	fillLauncher.setConst( value );
	fillLauncher.setConst( offset );

	fillLauncher.launch1D( n );
}
|
||||
|
||||
/// Host-side fill: writes @p value into src[offset] .. src[offset+n-1].
/// Does nothing when n <= 0. The caller must ensure the range is valid.
void btFillCL::executeHost(btAlignedObjectArray<btInt2> &src, const btInt2 &value, int n, int offset)
{
	for (int written = 0; written < n; ++written)
	{
		const int dstIndex = written + offset;
		src[dstIndex] = value;
	}
}
|
||||
|
||||
/// Host-side fill: writes @p value into src[offset] .. src[offset+n-1].
/// Does nothing when n <= 0. The caller must ensure the range is valid.
void btFillCL::executeHost(btAlignedObjectArray<int> &src, const int value, int n, int offset)
{
	for (int written = 0; written < n; ++written)
	{
		const int dstIndex = written + offset;
		src[dstIndex] = value;
	}
}
|
||||
|
||||
/// Launches the int2 fill kernel on @p src with kernel arguments
/// (buffer, n, value, offset) and one work item per element.
/// Asserts n > 0 in builds where btAssert is active.
void btFillCL::execute(btOpenCLArray<btInt2> &src, const btInt2 &value, int n, int offset)
{
	btAssert( n>0 );

	// Single-entry descriptor table for the destination buffer.
	btBufferInfoCL buffers[] = { btBufferInfoCL( src.getBufferCL() ) };
	const int numBuffers = sizeof(buffers)/sizeof(btBufferInfoCL);

	btLauncherCL fillLauncher( m_commandQueue, m_fillKernelInt2 );
	fillLauncher.setBuffers( buffers, numBuffers );
	fillLauncher.setConst( n );
	fillLauncher.setConst( value );
	fillLauncher.setConst( offset );

	fillLauncher.launch1D( n );
}
|
||||
137
opencl/parallel_primitives/host/btFillCL.h
Normal file
137
opencl/parallel_primitives/host/btFillCL.h
Normal file
@@ -0,0 +1,137 @@
|
||||
#ifndef BT_FILL_CL_H
|
||||
#define BT_FILL_CL_H
|
||||
|
||||
#include "btOpenCLArray.h"
|
||||
#include "btScalar.h"
|
||||
|
||||
/// Four unsigned ints, declared through ATTRIBUTE_ALIGNED16, addressable
/// either by name (x, y, z, w) or by index (s[0..3]); both views share storage.
ATTRIBUTE_ALIGNED16(struct) btUnsignedInt4
{
	BT_DECLARE_ALIGNED_ALLOCATOR();

	union
	{
		// Named-component view.
		struct
		{
			unsigned int x;
			unsigned int y;
			unsigned int z;
			unsigned int w;
		};
		// Indexed view of the same four components.
		struct
		{
			unsigned int s[4];
		};
	};
};
|
||||
|
||||
/// Four ints, declared through ATTRIBUTE_ALIGNED16, addressable either by
/// name (x, y, z, w) or by index (s[0..3]); both views share storage.
ATTRIBUTE_ALIGNED16(struct) btInt4
{
	BT_DECLARE_ALIGNED_ALLOCATOR();

	union
	{
		// Named-component view.
		struct
		{
			int x;
			int y;
			int z;
			int w;
		};
		// Indexed view of the same four components.
		struct
		{
			int s[4];
		};
	};
};
|
||||
|
||||
/// Two unsigned ints addressable either by name (x, y) or by index (s[0..1]);
/// both views share storage.
struct btUnsignedInt2
{
	union
	{
		// Named-component view.
		struct
		{
			unsigned int x;
			unsigned int y;
		};
		// Indexed view of the same two components.
		struct
		{
			unsigned int s[2];
		};
	};
};
|
||||
|
||||
/// Two ints addressable either by name (x, y) or by index (s[0..1]);
/// both views share storage.
struct btInt2
{
	union
	{
		// Named-component view.
		struct
		{
			int x;
			int y;
		};
		// Indexed view of the same two components.
		struct
		{
			int s[2];
		};
	};
};
|
||||
|
||||
/// Builds a btInt4 from its components; w defaults to 0.
SIMD_FORCE_INLINE btInt4 btMakeInt4(int x, int y, int z, int w = 0)
{
	btInt4 result;
	// x/y/z/w alias s[0..3] through the union, so this fills the same storage.
	result.x = x;
	result.y = y;
	result.z = z;
	result.w = w;
	return result;
}
|
||||
|
||||
/// Builds a btUnsignedInt4 from its components; w defaults to 0.
SIMD_FORCE_INLINE btUnsignedInt4 btMakeUnsignedInt4(unsigned int x, unsigned int y, unsigned int z, unsigned int w = 0)
{
	btUnsignedInt4 result;
	// x/y/z/w alias s[0..3] through the union, so this fills the same storage.
	result.x = x;
	result.y = y;
	result.z = z;
	result.w = w;
	return result;
}
|
||||
|
||||
class btFillCL
|
||||
{
|
||||
|
||||
cl_command_queue m_commandQueue;
|
||||
|
||||
cl_kernel m_fillKernelInt2;
|
||||
cl_kernel m_fillIntKernel;
|
||||
cl_kernel m_fillUnsignedIntKernel;
|
||||
cl_kernel m_fillFloatKernel;
|
||||
|
||||
public:
|
||||
|
||||
struct btConstData
|
||||
{
|
||||
union
|
||||
{
|
||||
btInt4 m_data;
|
||||
btUnsignedInt4 m_UnsignedData;
|
||||
};
|
||||
int m_offset;
|
||||
int m_n;
|
||||
int m_padding[2];
|
||||
};
|
||||
|
||||
protected:
|
||||
|
||||
public:
|
||||
|
||||
btFillCL(cl_context ctx, cl_device_id device, cl_command_queue queue);
|
||||
|
||||
virtual ~btFillCL();
|
||||
|
||||
void execute(btOpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset = 0);
|
||||
|
||||
void execute(btOpenCLArray<int>& src, const int value, int n, int offset = 0);
|
||||
|
||||
void execute(btOpenCLArray<float>& src, const float value, int n, int offset = 0);
|
||||
|
||||
void execute(btOpenCLArray<btInt2>& src, const btInt2& value, int n, int offset = 0);
|
||||
|
||||
void executeHost(btAlignedObjectArray<btInt2> &src, const btInt2 &value, int n, int offset);
|
||||
|
||||
void executeHost(btAlignedObjectArray<int> &src, const int value, int n, int offset);
|
||||
|
||||
// void execute(btOpenCLArray<btInt4>& src, const btInt4& value, int n, int offset = 0);
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#endif //BT_FILL_CL_H
|
||||
450
opencl/parallel_primitives/host/btHashMap.h
Normal file
450
opencl/parallel_primitives/host/btHashMap.h
Normal file
@@ -0,0 +1,450 @@
|
||||
/*
|
||||
Bullet Continuous Collision Detection and Physics Library
|
||||
Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef BT_HASH_MAP_H
|
||||
#define BT_HASH_MAP_H
|
||||
|
||||
#include "btAlignedObjectArray.h"
|
||||
|
||||
///very basic hashable string implementation, compatible with btHashMap
|
||||
struct btHashString
|
||||
{
|
||||
const char* m_string;
|
||||
unsigned int m_hash;
|
||||
|
||||
SIMD_FORCE_INLINE unsigned int getHash()const
|
||||
{
|
||||
return m_hash;
|
||||
}
|
||||
|
||||
btHashString(const char* name)
|
||||
:m_string(name)
|
||||
{
|
||||
/* magic numbers from http://www.isthe.com/chongo/tech/comp/fnv/ */
|
||||
static const unsigned int InitialFNV = 2166136261u;
|
||||
static const unsigned int FNVMultiple = 16777619u;
|
||||
|
||||
/* Fowler / Noll / Vo (FNV) Hash */
|
||||
unsigned int hash = InitialFNV;
|
||||
|
||||
for(int i = 0; m_string[i]; i++)
|
||||
{
|
||||
hash = hash ^ (m_string[i]); /* xor the low 8 bits */
|
||||
hash = hash * FNVMultiple; /* multiply by the magic number */
|
||||
}
|
||||
m_hash = hash;
|
||||
}
|
||||
|
||||
int portableStringCompare(const char* src, const char* dst) const
|
||||
{
|
||||
int ret = 0 ;
|
||||
|
||||
while( ! (ret = *(unsigned char *)src - *(unsigned char *)dst) && *dst)
|
||||
++src, ++dst;
|
||||
|
||||
if ( ret < 0 )
|
||||
ret = -1 ;
|
||||
else if ( ret > 0 )
|
||||
ret = 1 ;
|
||||
|
||||
return( ret );
|
||||
}
|
||||
|
||||
bool equals(const btHashString& other) const
|
||||
{
|
||||
return (m_string == other.m_string) ||
|
||||
(0==portableStringCompare(m_string,other.m_string));
|
||||
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
const int BT_HASH_NULL=0xffffffff;
|
||||
|
||||
|
||||
class btHashInt
|
||||
{
|
||||
int m_uid;
|
||||
public:
|
||||
btHashInt(int uid) :m_uid(uid)
|
||||
{
|
||||
}
|
||||
|
||||
int getUid1() const
|
||||
{
|
||||
return m_uid;
|
||||
}
|
||||
|
||||
void setUid1(int uid)
|
||||
{
|
||||
m_uid = uid;
|
||||
}
|
||||
|
||||
bool equals(const btHashInt& other) const
|
||||
{
|
||||
return getUid1() == other.getUid1();
|
||||
}
|
||||
//to our success
|
||||
SIMD_FORCE_INLINE unsigned int getHash()const
|
||||
{
|
||||
int key = m_uid;
|
||||
// Thomas Wang's hash
|
||||
key += ~(key << 15); key ^= (key >> 10); key += (key << 3); key ^= (key >> 6); key += ~(key << 11); key ^= (key >> 16);
|
||||
return key;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
class btHashPtr
|
||||
{
|
||||
|
||||
union
|
||||
{
|
||||
const void* m_pointer;
|
||||
int m_hashValues[2];
|
||||
};
|
||||
|
||||
public:
|
||||
|
||||
btHashPtr(const void* ptr)
|
||||
:m_pointer(ptr)
|
||||
{
|
||||
}
|
||||
|
||||
const void* getPointer() const
|
||||
{
|
||||
return m_pointer;
|
||||
}
|
||||
|
||||
bool equals(const btHashPtr& other) const
|
||||
{
|
||||
return getPointer() == other.getPointer();
|
||||
}
|
||||
|
||||
//to our success
|
||||
SIMD_FORCE_INLINE unsigned int getHash()const
|
||||
{
|
||||
const bool VOID_IS_8 = ((sizeof(void*)==8));
|
||||
|
||||
int key = VOID_IS_8? m_hashValues[0]+m_hashValues[1] : m_hashValues[0];
|
||||
|
||||
// Thomas Wang's hash
|
||||
key += ~(key << 15); key ^= (key >> 10); key += (key << 3); key ^= (key >> 6); key += ~(key << 11); key ^= (key >> 16);
|
||||
return key;
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
|
||||
|
||||
template <class Value>
|
||||
class btHashKeyPtr
|
||||
{
|
||||
int m_uid;
|
||||
public:
|
||||
|
||||
btHashKeyPtr(int uid) :m_uid(uid)
|
||||
{
|
||||
}
|
||||
|
||||
int getUid1() const
|
||||
{
|
||||
return m_uid;
|
||||
}
|
||||
|
||||
bool equals(const btHashKeyPtr<Value>& other) const
|
||||
{
|
||||
return getUid1() == other.getUid1();
|
||||
}
|
||||
|
||||
//to our success
|
||||
SIMD_FORCE_INLINE unsigned int getHash()const
|
||||
{
|
||||
int key = m_uid;
|
||||
// Thomas Wang's hash
|
||||
key += ~(key << 15); key ^= (key >> 10); key += (key << 3); key ^= (key >> 6); key += ~(key << 11); key ^= (key >> 16);
|
||||
return key;
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
|
||||
|
||||
template <class Value>
|
||||
class btHashKey
|
||||
{
|
||||
int m_uid;
|
||||
public:
|
||||
|
||||
btHashKey(int uid) :m_uid(uid)
|
||||
{
|
||||
}
|
||||
|
||||
int getUid1() const
|
||||
{
|
||||
return m_uid;
|
||||
}
|
||||
|
||||
bool equals(const btHashKey<Value>& other) const
|
||||
{
|
||||
return getUid1() == other.getUid1();
|
||||
}
|
||||
//to our success
|
||||
SIMD_FORCE_INLINE unsigned int getHash()const
|
||||
{
|
||||
int key = m_uid;
|
||||
// Thomas Wang's hash
|
||||
key += ~(key << 15); key ^= (key >> 10); key += (key << 3); key ^= (key >> 6); key += ~(key << 11); key ^= (key >> 16);
|
||||
return key;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
///The btHashMap template class implements a generic and lightweight hashmap.
|
||||
///A basic sample of how to use btHashMap is located in Demos\BasicDemo\main.cpp
|
||||
template <class Key, class Value>
|
||||
class btHashMap
|
||||
{
|
||||
|
||||
protected:
|
||||
btAlignedObjectArray<int> m_hashTable;
|
||||
btAlignedObjectArray<int> m_next;
|
||||
|
||||
btAlignedObjectArray<Value> m_valueArray;
|
||||
btAlignedObjectArray<Key> m_keyArray;
|
||||
|
||||
void growTables(const Key& /*key*/)
|
||||
{
|
||||
int newCapacity = m_valueArray.capacity();
|
||||
|
||||
if (m_hashTable.size() < newCapacity)
|
||||
{
|
||||
//grow hashtable and next table
|
||||
int curHashtableSize = m_hashTable.size();
|
||||
|
||||
m_hashTable.resize(newCapacity);
|
||||
m_next.resize(newCapacity);
|
||||
|
||||
int i;
|
||||
|
||||
for (i= 0; i < newCapacity; ++i)
|
||||
{
|
||||
m_hashTable[i] = BT_HASH_NULL;
|
||||
}
|
||||
for (i = 0; i < newCapacity; ++i)
|
||||
{
|
||||
m_next[i] = BT_HASH_NULL;
|
||||
}
|
||||
|
||||
for(i=0;i<curHashtableSize;i++)
|
||||
{
|
||||
//const Value& value = m_valueArray[i];
|
||||
//const Key& key = m_keyArray[i];
|
||||
|
||||
int hashValue = m_keyArray[i].getHash() & (m_valueArray.capacity()-1); // New hash value with new mask
|
||||
m_next[i] = m_hashTable[hashValue];
|
||||
m_hashTable[hashValue] = i;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
void insert(const Key& key, const Value& value) {
|
||||
int hash = key.getHash() & (m_valueArray.capacity()-1);
|
||||
|
||||
//replace value if the key is already there
|
||||
int index = findIndex(key);
|
||||
if (index != BT_HASH_NULL)
|
||||
{
|
||||
m_valueArray[index]=value;
|
||||
return;
|
||||
}
|
||||
|
||||
int count = m_valueArray.size();
|
||||
int oldCapacity = m_valueArray.capacity();
|
||||
m_valueArray.push_back(value);
|
||||
m_keyArray.push_back(key);
|
||||
|
||||
int newCapacity = m_valueArray.capacity();
|
||||
if (oldCapacity < newCapacity)
|
||||
{
|
||||
growTables(key);
|
||||
//hash with new capacity
|
||||
hash = key.getHash() & (m_valueArray.capacity()-1);
|
||||
}
|
||||
m_next[count] = m_hashTable[hash];
|
||||
m_hashTable[hash] = count;
|
||||
}
|
||||
|
||||
void remove(const Key& key) {
|
||||
|
||||
int hash = key.getHash() & (m_valueArray.capacity()-1);
|
||||
|
||||
int pairIndex = findIndex(key);
|
||||
|
||||
if (pairIndex ==BT_HASH_NULL)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// Remove the pair from the hash table.
|
||||
int index = m_hashTable[hash];
|
||||
btAssert(index != BT_HASH_NULL);
|
||||
|
||||
int previous = BT_HASH_NULL;
|
||||
while (index != pairIndex)
|
||||
{
|
||||
previous = index;
|
||||
index = m_next[index];
|
||||
}
|
||||
|
||||
if (previous != BT_HASH_NULL)
|
||||
{
|
||||
btAssert(m_next[previous] == pairIndex);
|
||||
m_next[previous] = m_next[pairIndex];
|
||||
}
|
||||
else
|
||||
{
|
||||
m_hashTable[hash] = m_next[pairIndex];
|
||||
}
|
||||
|
||||
// We now move the last pair into spot of the
|
||||
// pair being removed. We need to fix the hash
|
||||
// table indices to support the move.
|
||||
|
||||
int lastPairIndex = m_valueArray.size() - 1;
|
||||
|
||||
// If the removed pair is the last pair, we are done.
|
||||
if (lastPairIndex == pairIndex)
|
||||
{
|
||||
m_valueArray.pop_back();
|
||||
m_keyArray.pop_back();
|
||||
return;
|
||||
}
|
||||
|
||||
// Remove the last pair from the hash table.
|
||||
int lastHash = m_keyArray[lastPairIndex].getHash() & (m_valueArray.capacity()-1);
|
||||
|
||||
index = m_hashTable[lastHash];
|
||||
btAssert(index != BT_HASH_NULL);
|
||||
|
||||
previous = BT_HASH_NULL;
|
||||
while (index != lastPairIndex)
|
||||
{
|
||||
previous = index;
|
||||
index = m_next[index];
|
||||
}
|
||||
|
||||
if (previous != BT_HASH_NULL)
|
||||
{
|
||||
btAssert(m_next[previous] == lastPairIndex);
|
||||
m_next[previous] = m_next[lastPairIndex];
|
||||
}
|
||||
else
|
||||
{
|
||||
m_hashTable[lastHash] = m_next[lastPairIndex];
|
||||
}
|
||||
|
||||
// Copy the last pair into the remove pair's spot.
|
||||
m_valueArray[pairIndex] = m_valueArray[lastPairIndex];
|
||||
m_keyArray[pairIndex] = m_keyArray[lastPairIndex];
|
||||
|
||||
// Insert the last pair into the hash table
|
||||
m_next[pairIndex] = m_hashTable[lastHash];
|
||||
m_hashTable[lastHash] = pairIndex;
|
||||
|
||||
m_valueArray.pop_back();
|
||||
m_keyArray.pop_back();
|
||||
|
||||
}
|
||||
|
||||
|
||||
int size() const
|
||||
{
|
||||
return m_valueArray.size();
|
||||
}
|
||||
|
||||
const Value* getAtIndex(int index) const
|
||||
{
|
||||
btAssert(index < m_valueArray.size());
|
||||
|
||||
return &m_valueArray[index];
|
||||
}
|
||||
|
||||
Value* getAtIndex(int index)
|
||||
{
|
||||
btAssert(index < m_valueArray.size());
|
||||
|
||||
return &m_valueArray[index];
|
||||
}
|
||||
|
||||
Value* operator[](const Key& key) {
|
||||
return find(key);
|
||||
}
|
||||
|
||||
const Value* find(const Key& key) const
|
||||
{
|
||||
int index = findIndex(key);
|
||||
if (index == BT_HASH_NULL)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
return &m_valueArray[index];
|
||||
}
|
||||
|
||||
Value* find(const Key& key)
|
||||
{
|
||||
int index = findIndex(key);
|
||||
if (index == BT_HASH_NULL)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
return &m_valueArray[index];
|
||||
}
|
||||
|
||||
|
||||
int findIndex(const Key& key) const
|
||||
{
|
||||
unsigned int hash = key.getHash() & (m_valueArray.capacity()-1);
|
||||
|
||||
if (hash >= (unsigned int)m_hashTable.size())
|
||||
{
|
||||
return BT_HASH_NULL;
|
||||
}
|
||||
|
||||
int index = m_hashTable[hash];
|
||||
while ((index != BT_HASH_NULL) && key.equals(m_keyArray[index]) == false)
|
||||
{
|
||||
index = m_next[index];
|
||||
}
|
||||
return index;
|
||||
}
|
||||
|
||||
void clear()
|
||||
{
|
||||
m_hashTable.clear();
|
||||
m_next.clear();
|
||||
m_valueArray.clear();
|
||||
m_keyArray.clear();
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
#endif //BT_HASH_MAP_H
|
||||
363
opencl/parallel_primitives/host/btLauncherCL.h
Normal file
363
opencl/parallel_primitives/host/btLauncherCL.h
Normal file
@@ -0,0 +1,363 @@
|
||||
|
||||
#ifndef BT_LAUNCHER_CL_H
|
||||
#define BT_LAUNCHER_CL_H
|
||||
|
||||
#include "btBufferInfoCL.h"
|
||||
#include "btMinMax.h"
|
||||
#include "btOpenCLArray.h"
|
||||
#include <stdio.h>
|
||||
|
||||
#ifdef _WIN32
|
||||
#pragma warning(disable :4996)
|
||||
#endif
|
||||
#define BT_CL_MAX_ARG_SIZE 16
|
||||
struct btKernelArgData
|
||||
{
|
||||
int m_isBuffer;
|
||||
int m_argIndex;
|
||||
int m_argSizeInBytes;
|
||||
union
|
||||
{
|
||||
cl_mem m_clBuffer;
|
||||
unsigned char m_argData[BT_CL_MAX_ARG_SIZE];
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
class btLauncherCL
|
||||
{
|
||||
|
||||
cl_command_queue m_commandQueue;
|
||||
cl_kernel m_kernel;
|
||||
int m_idx;
|
||||
|
||||
btAlignedObjectArray<btKernelArgData> m_kernelArguments;
|
||||
|
||||
|
||||
int m_serializationSizeInBytes;
|
||||
|
||||
public:
|
||||
|
||||
btAlignedObjectArray<btOpenCLArray<unsigned char>* > m_arrays;
|
||||
|
||||
/// Binds the launcher to @p queue and @p kernel; no arguments are set yet.
/// The serialization size starts at sizeof(int), the size of the
/// argument-count header written by serializeArguments().
btLauncherCL(cl_command_queue queue, cl_kernel kernel)
	: m_commandQueue(queue),
	  m_kernel(kernel),
	  m_idx(0),
	  m_serializationSizeInBytes(sizeof(int))
{
}
|
||||
|
||||
/// Releases the OpenCL buffers of the arrays collected in m_arrays
/// (populated by deserializeArgs()).
/// NOTE(review): the btOpenCLArray wrapper objects allocated with `new` in
/// deserializeArgs() are never deleted here; confirm whether that leak is
/// intentional before switching to `delete` (the wrapper may release the
/// buffer itself).
virtual ~btLauncherCL()
{
	const int numArrays = m_arrays.size();
	for (int arrayIndex = 0; arrayIndex < numArrays; ++arrayIndex)
	{
		btOpenCLArray<unsigned char>* deviceArray = m_arrays[arrayIndex];
		clReleaseMemObject(deviceArray->getBufferCL());
	}
}
|
||||
|
||||
/// Binds @p clBuffer as the next kernel argument and records it (together
/// with its size in bytes, queried from OpenCL) for later serialization.
///
/// @param clBuffer  device buffer to bind; must be a valid memory object
inline void setBuffer( cl_mem clBuffer)
{
	btKernelArgData kernelArg;
	kernelArg.m_argIndex = m_idx;
	kernelArg.m_isBuffer = 1;
	kernelArg.m_clBuffer = clBuffer;

	// Ask OpenCL how large the buffer is. Initialised to 0 so that a failed
	// query in a release build (btAssert compiled out) does not propagate an
	// indeterminate size.
	size_t bufferSizeInBytes = 0;
	size_t actualSizeInBytes = 0;
	cl_int err = clGetMemObjectInfo(	kernelArg.m_clBuffer,
										CL_MEM_SIZE,
										sizeof(size_t),
										&bufferSizeInBytes,
										&actualSizeInBytes);

	btAssert( err == CL_SUCCESS );
	kernelArg.m_argSizeInBytes = (int)bufferSizeInBytes;

	m_kernelArguments.push_back(kernelArg);
	// Serialized form: the argument record followed by the buffer contents.
	m_serializationSizeInBytes += sizeof(btKernelArgData);
	m_serializationSizeInBytes += (int)bufferSizeInBytes;

	cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &clBuffer);
	btAssert( status == CL_SUCCESS );
}
|
||||
|
||||
|
||||
/// Binds @p n buffers as consecutive kernel arguments, in array order, and
/// records each one (with its size in bytes) for later serialization.
///
/// @param buffInfo  array of at least @p n buffer descriptors
/// @param n         number of descriptors to bind
inline void setBuffers( btBufferInfoCL* buffInfo, int n )
{
	for(int i=0; i<n; i++)
	{
		btKernelArgData kernelArg;
		kernelArg.m_argIndex = m_idx;
		kernelArg.m_isBuffer = 1;
		kernelArg.m_clBuffer = buffInfo[i].m_clBuffer;

		// Ask OpenCL how large the buffer is. Initialised to 0 so that a
		// failed query in a release build (btAssert compiled out) does not
		// propagate an indeterminate size.
		size_t bufferSizeInBytes = 0;
		size_t actualSizeInBytes = 0;
		cl_int err = clGetMemObjectInfo(	kernelArg.m_clBuffer,
											CL_MEM_SIZE,
											sizeof(size_t),
											&bufferSizeInBytes,
											&actualSizeInBytes);

		btAssert( err == CL_SUCCESS );
		kernelArg.m_argSizeInBytes = (int)bufferSizeInBytes;

		m_kernelArguments.push_back(kernelArg);
		// Serialized form: the argument record followed by the buffer contents.
		m_serializationSizeInBytes += sizeof(btKernelArgData);
		m_serializationSizeInBytes += (int)bufferSizeInBytes;

		cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &buffInfo[i].m_clBuffer);
		btAssert( status == CL_SUCCESS );
	}
}
|
||||
|
||||
int getSerializationBufferSize() const
|
||||
{
|
||||
return m_serializationSizeInBytes;
|
||||
}
|
||||
|
||||
inline int deserializeArgs(unsigned char* buf, int bufSize, cl_context ctx)
|
||||
{
|
||||
int index=0;
|
||||
|
||||
int numArguments = *(int*) &buf[index];
|
||||
index+=sizeof(int);
|
||||
|
||||
for (int i=0;i<numArguments;i++)
|
||||
{
|
||||
btKernelArgData* arg = (btKernelArgData*)&buf[index];
|
||||
|
||||
index+=sizeof(btKernelArgData);
|
||||
if (arg->m_isBuffer)
|
||||
{
|
||||
btOpenCLArray<unsigned char>* clData = new btOpenCLArray<unsigned char>(ctx,m_commandQueue, arg->m_argSizeInBytes);
|
||||
clData->resize(arg->m_argSizeInBytes);
|
||||
|
||||
clData->copyFromHostPointer(&buf[index], arg->m_argSizeInBytes);
|
||||
|
||||
arg->m_clBuffer = clData->getBufferCL();
|
||||
|
||||
m_arrays.push_back(clData);
|
||||
|
||||
cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &arg->m_clBuffer);
|
||||
btAssert( status == CL_SUCCESS );
|
||||
index+=arg->m_argSizeInBytes;
|
||||
} else
|
||||
{
|
||||
cl_int status = clSetKernelArg( m_kernel, m_idx++, arg->m_argSizeInBytes, &arg->m_argData);
|
||||
btAssert( status == CL_SUCCESS );
|
||||
}
|
||||
m_kernelArguments.push_back(*arg);
|
||||
}
|
||||
m_serializationSizeInBytes = index;
|
||||
return index;
|
||||
}
|
||||
|
||||
inline int validateResults(unsigned char* goldBuffer, int goldBufferCapacity, cl_context ctx)
|
||||
{
|
||||
int index=0;
|
||||
|
||||
int numArguments = *(int*) &goldBuffer[index];
|
||||
index+=sizeof(int);
|
||||
|
||||
if (numArguments != m_kernelArguments.size())
|
||||
{
|
||||
printf("failed validation: expected %d arguments, found %d\n",numArguments, m_kernelArguments.size());
|
||||
return -1;
|
||||
}
|
||||
|
||||
for (int ii=0;ii<numArguments;ii++)
|
||||
{
|
||||
btKernelArgData* argGold = (btKernelArgData*)&goldBuffer[index];
|
||||
|
||||
if (m_kernelArguments[ii].m_argSizeInBytes != argGold->m_argSizeInBytes)
|
||||
{
|
||||
printf("failed validation: argument %d sizeInBytes expected: %d, found %d\n",ii, argGold->m_argSizeInBytes, m_kernelArguments[ii].m_argSizeInBytes);
|
||||
return -2;
|
||||
}
|
||||
|
||||
{
|
||||
int expected = argGold->m_isBuffer;
|
||||
int found = m_kernelArguments[ii].m_isBuffer;
|
||||
|
||||
if (expected != found)
|
||||
{
|
||||
printf("failed validation: argument %d isBuffer expected: %d, found %d\n",ii,expected, found);
|
||||
return -3;
|
||||
}
|
||||
}
|
||||
index+=sizeof(btKernelArgData);
|
||||
|
||||
if (argGold->m_isBuffer)
|
||||
{
|
||||
|
||||
unsigned char* memBuf= (unsigned char*) malloc(m_kernelArguments[ii].m_argSizeInBytes);
|
||||
unsigned char* goldBuf = &goldBuffer[index];
|
||||
for (int j=0;j<m_kernelArguments[j].m_argSizeInBytes;j++)
|
||||
{
|
||||
memBuf[j] = 0xaa;
|
||||
}
|
||||
|
||||
cl_int status = 0;
|
||||
status = clEnqueueReadBuffer( m_commandQueue, m_kernelArguments[ii].m_clBuffer, CL_TRUE, 0, m_kernelArguments[ii].m_argSizeInBytes,
|
||||
memBuf, 0,0,0 );
|
||||
btAssert( status==CL_SUCCESS );
|
||||
clFinish(m_commandQueue);
|
||||
|
||||
for (int b=0;b<m_kernelArguments[ii].m_argSizeInBytes;b++)
|
||||
{
|
||||
int expected = goldBuf[b];
|
||||
int found = memBuf[b];
|
||||
if (expected != found)
|
||||
{
|
||||
printf("failed validation: argument %d OpenCL data at byte position %d expected: %d, found %d\n",
|
||||
ii, b, expected, found);
|
||||
return -4;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
index+=argGold->m_argSizeInBytes;
|
||||
} else
|
||||
{
|
||||
|
||||
//compare content
|
||||
for (int b=0;b<m_kernelArguments[ii].m_argSizeInBytes;b++)
|
||||
{
|
||||
int expected = argGold->m_argData[b];
|
||||
int found =m_kernelArguments[ii].m_argData[b];
|
||||
if (expected != found)
|
||||
{
|
||||
printf("failed validation: argument %d const data at byte position %d expected: %d, found %d\n",
|
||||
ii, b, expected, found);
|
||||
return -5;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
return index;
|
||||
|
||||
}
|
||||
|
||||
inline int serializeArguments(unsigned char* destBuffer, int destBufferCapacity)
|
||||
{
|
||||
//initialize to known values
|
||||
for (int i=0;i<destBufferCapacity;i++)
|
||||
destBuffer[i] = 0xec;
|
||||
|
||||
assert(destBufferCapacity>=m_serializationSizeInBytes);
|
||||
|
||||
//todo: use the btSerializer for this to allow for 32/64bit, endianness etc
|
||||
int numArguments = m_kernelArguments.size();
|
||||
int curBufferSize = 0;
|
||||
int* dest = (int*)&destBuffer[curBufferSize];
|
||||
*dest = numArguments;
|
||||
curBufferSize += sizeof(int);
|
||||
|
||||
|
||||
|
||||
for (int i=0;i<this->m_kernelArguments.size();i++)
|
||||
{
|
||||
btKernelArgData* arg = (btKernelArgData*) &destBuffer[curBufferSize];
|
||||
*arg = m_kernelArguments[i];
|
||||
curBufferSize+=sizeof(btKernelArgData);
|
||||
if (arg->m_isBuffer==1)
|
||||
{
|
||||
//copy the OpenCL buffer content
|
||||
cl_int status = 0;
|
||||
status = clEnqueueReadBuffer( m_commandQueue, arg->m_clBuffer, 0, 0, arg->m_argSizeInBytes,
|
||||
&destBuffer[curBufferSize], 0,0,0 );
|
||||
btAssert( status==CL_SUCCESS );
|
||||
clFinish(m_commandQueue);
|
||||
curBufferSize+=arg->m_argSizeInBytes;
|
||||
}
|
||||
|
||||
}
|
||||
return curBufferSize;
|
||||
}
|
||||
|
||||
void serializeToFile(const char* fileName, int numWorkItems)
|
||||
{
|
||||
int num = numWorkItems;
|
||||
int buffSize = getSerializationBufferSize();
|
||||
unsigned char* buf = new unsigned char[buffSize+sizeof(int)];
|
||||
for (int i=0;i<buffSize+1;i++)
|
||||
{
|
||||
unsigned char* ptr = (unsigned char*)&buf[i];
|
||||
*ptr = 0xff;
|
||||
}
|
||||
int actualWrite = serializeArguments(buf,buffSize);
|
||||
|
||||
unsigned char* cptr = (unsigned char*)&buf[buffSize];
|
||||
// printf("buf[buffSize] = %d\n",*cptr);
|
||||
|
||||
assert(buf[buffSize]==0xff);//check for buffer overrun
|
||||
int* ptr = (int*)&buf[buffSize];
|
||||
|
||||
*ptr = num;
|
||||
|
||||
FILE* f = fopen(fileName,"wb");
|
||||
fwrite(buf,buffSize+sizeof(int),1,f);
|
||||
fclose(f);
|
||||
|
||||
delete[] buf;
|
||||
}
|
||||
|
||||
|
||||
template<typename T>
|
||||
inline void setConst( const T& consts )
|
||||
{
|
||||
int sz=sizeof(T);
|
||||
btAssert(sz<=BT_CL_MAX_ARG_SIZE);
|
||||
btKernelArgData kernelArg;
|
||||
kernelArg.m_argIndex = m_idx;
|
||||
kernelArg.m_isBuffer = 0;
|
||||
T* destArg = (T*)kernelArg.m_argData;
|
||||
*destArg = consts;
|
||||
kernelArg.m_argSizeInBytes = sizeof(T);
|
||||
m_kernelArguments.push_back(kernelArg);
|
||||
m_serializationSizeInBytes+=sizeof(btKernelArgData);
|
||||
|
||||
cl_int status = clSetKernelArg( m_kernel, m_idx++, sz, &consts );
|
||||
btAssert( status == CL_SUCCESS );
|
||||
}
|
||||
|
||||
inline void launch1D( int numThreads, int localSize = 64)
|
||||
{
|
||||
launch2D( numThreads, 1, localSize, 1 );
|
||||
}
|
||||
|
||||
inline void launch2D( int numThreadsX, int numThreadsY, int localSizeX, int localSizeY )
|
||||
{
|
||||
size_t gRange[3] = {1,1,1};
|
||||
size_t lRange[3] = {1,1,1};
|
||||
lRange[0] = localSizeX;
|
||||
lRange[1] = localSizeY;
|
||||
gRange[0] = btMax((size_t)1, (numThreadsX/lRange[0])+(!(numThreadsX%lRange[0])?0:1));
|
||||
gRange[0] *= lRange[0];
|
||||
gRange[1] = btMax((size_t)1, (numThreadsY/lRange[1])+(!(numThreadsY%lRange[1])?0:1));
|
||||
gRange[1] *= lRange[1];
|
||||
|
||||
cl_int status = clEnqueueNDRangeKernel( m_commandQueue,
|
||||
m_kernel, 2, NULL, gRange, lRange, 0,0,0 );
|
||||
if (status != CL_SUCCESS)
|
||||
{
|
||||
printf("Error: OpenCL status = %d\n",status);
|
||||
}
|
||||
btAssert( status == CL_SUCCESS );
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
#endif //BT_LAUNCHER_CL_H
|
||||
71
opencl/parallel_primitives/host/btMinMax.h
Normal file
71
opencl/parallel_primitives/host/btMinMax.h
Normal file
@@ -0,0 +1,71 @@
|
||||
/*
|
||||
Copyright (c) 2003-2006 Gino van den Bergen / Erwin Coumans http://continuousphysics.com/Bullet/
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
|
||||
|
||||
#ifndef BT_GEN_MINMAX_H
|
||||
#define BT_GEN_MINMAX_H
|
||||
|
||||
#include "btScalar.h"
|
||||
|
||||
template <class T>
|
||||
SIMD_FORCE_INLINE const T& btMin(const T& a, const T& b)
|
||||
{
|
||||
return a < b ? a : b ;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
SIMD_FORCE_INLINE const T& btMax(const T& a, const T& b)
|
||||
{
|
||||
return a > b ? a : b;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
SIMD_FORCE_INLINE const T& btClamped(const T& a, const T& lb, const T& ub)
|
||||
{
|
||||
return a < lb ? lb : (ub < a ? ub : a);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
SIMD_FORCE_INLINE void btSetMin(T& a, const T& b)
|
||||
{
|
||||
if (b < a)
|
||||
{
|
||||
a = b;
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
SIMD_FORCE_INLINE void btSetMax(T& a, const T& b)
|
||||
{
|
||||
if (a < b)
|
||||
{
|
||||
a = b;
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
SIMD_FORCE_INLINE void btClamp(T& a, const T& lb, const T& ub)
|
||||
{
|
||||
if (a < lb)
|
||||
{
|
||||
a = lb;
|
||||
}
|
||||
else if (ub < a)
|
||||
{
|
||||
a = ub;
|
||||
}
|
||||
}
|
||||
|
||||
#endif //BT_GEN_MINMAX_H
|
||||
274
opencl/parallel_primitives/host/btOpenCLArray.h
Normal file
274
opencl/parallel_primitives/host/btOpenCLArray.h
Normal file
@@ -0,0 +1,274 @@
|
||||
#ifndef BT_OPENCL_ARRAY_H
|
||||
#define BT_OPENCL_ARRAY_H
|
||||
|
||||
#include "btAlignedObjectArray.h"
|
||||
#include "../../basic_initialize/btOpenCLInclude.h"
|
||||
|
||||
template <typename T>
|
||||
class btOpenCLArray
|
||||
{
|
||||
int m_size;
|
||||
int m_capacity;
|
||||
cl_mem m_clBuffer;
|
||||
|
||||
cl_context m_clContext;
|
||||
cl_command_queue m_commandQueue;
|
||||
|
||||
bool m_ownsMemory;
|
||||
|
||||
bool m_allowGrowingCapacity;
|
||||
|
||||
void deallocate()
|
||||
{
|
||||
if (m_clBuffer && m_ownsMemory)
|
||||
{
|
||||
clReleaseMemObject(m_clBuffer);
|
||||
}
|
||||
m_clBuffer = 0;
|
||||
m_capacity=0;
|
||||
}
|
||||
|
||||
btOpenCLArray<T>& operator=(const btOpenCLArray<T>& src);
|
||||
|
||||
SIMD_FORCE_INLINE int allocSize(int size)
|
||||
{
|
||||
return (size ? size*2 : 1);
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
btOpenCLArray(cl_context ctx, cl_command_queue queue, int initialCapacity=0, bool allowGrowingCapacity=true)
|
||||
:m_size(0), m_capacity(0),m_clBuffer(0),
|
||||
m_clContext(ctx),m_commandQueue(queue),
|
||||
m_ownsMemory(true),m_allowGrowingCapacity(true)
|
||||
{
|
||||
if (initialCapacity)
|
||||
{
|
||||
reserve(initialCapacity);
|
||||
}
|
||||
m_allowGrowingCapacity = allowGrowingCapacity;
|
||||
}
|
||||
|
||||
///this is an error-prone method with no error checking, be careful!
|
||||
void setFromOpenCLBuffer(cl_mem buffer, int sizeInElements)
|
||||
{
|
||||
deallocate();
|
||||
m_ownsMemory = false;
|
||||
m_allowGrowingCapacity = false;
|
||||
m_clBuffer = buffer;
|
||||
m_size = sizeInElements;
|
||||
m_capacity = sizeInElements;
|
||||
}
|
||||
|
||||
// we could enable this assignment, but need to make sure to avoid accidental deep copies
|
||||
// btOpenCLArray<T>& operator=(const btAlignedObjectArray<T>& src)
|
||||
// {
|
||||
// copyFromArray(src);
|
||||
// return *this;
|
||||
// }
|
||||
|
||||
|
||||
cl_mem getBufferCL() const
|
||||
{
|
||||
return m_clBuffer;
|
||||
}
|
||||
|
||||
|
||||
virtual ~btOpenCLArray()
|
||||
{
|
||||
deallocate();
|
||||
m_size=0;
|
||||
m_capacity=0;
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE void push_back(const T& _Val,bool waitForCompletion=true)
|
||||
{
|
||||
int sz = size();
|
||||
if( sz == capacity() )
|
||||
{
|
||||
reserve( allocSize(size()) );
|
||||
}
|
||||
copyFromHostPointer(&_Val, 1, sz, waitForCompletion);
|
||||
m_size++;
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE T forcedAt(int n) const
|
||||
{
|
||||
btAssert(n>=0);
|
||||
btAssert(n<capacity());
|
||||
T elem;
|
||||
copyToHostPointer(&elem,1,n,true);
|
||||
return elem;
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE T at(int n) const
|
||||
{
|
||||
btAssert(n>=0);
|
||||
btAssert(n<size());
|
||||
T elem;
|
||||
copyToHostPointer(&elem,1,n,true);
|
||||
return elem;
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE void resize(int newsize, bool copyOldContents=true)
|
||||
{
|
||||
int curSize = size();
|
||||
|
||||
if (newsize < curSize)
|
||||
{
|
||||
//leave the OpenCL memory for now
|
||||
} else
|
||||
{
|
||||
if (newsize > size())
|
||||
{
|
||||
reserve(newsize,copyOldContents);
|
||||
}
|
||||
|
||||
//leave new data uninitialized (init in debug mode?)
|
||||
//for (int i=curSize;i<newsize;i++) ...
|
||||
}
|
||||
|
||||
m_size = newsize;
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE int size() const
|
||||
{
|
||||
return m_size;
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE int capacity() const
|
||||
{
|
||||
return m_capacity;
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE void reserve(int _Count, bool copyOldContents=true)
|
||||
{ // determine new minimum length of allocated storage
|
||||
if (capacity() < _Count)
|
||||
{ // not enough room, reallocate
|
||||
|
||||
if (m_allowGrowingCapacity)
|
||||
{
|
||||
cl_int ciErrNum;
|
||||
//create a new OpenCL buffer
|
||||
int memSizeInBytes = sizeof(T)*_Count;
|
||||
cl_mem buf = clCreateBuffer(m_clContext, CL_MEM_READ_WRITE, memSizeInBytes, NULL, &ciErrNum);
|
||||
btAssert(ciErrNum==CL_SUCCESS);
|
||||
|
||||
//#define BT_ALWAYS_INITIALIZE_OPENCL_BUFFERS
|
||||
#ifdef BT_ALWAYS_INITIALIZE_OPENCL_BUFFERS
|
||||
unsigned char* src = (unsigned char*)malloc(memSizeInBytes);
|
||||
for (int i=0;i<memSizeInBytes;i++)
|
||||
src[i] = 0xbb;
|
||||
ciErrNum = clEnqueueWriteBuffer( m_commandQueue, buf, CL_TRUE, 0, memSizeInBytes, src, 0,0,0 );
|
||||
btAssert(ciErrNum==CL_SUCCESS);
|
||||
clFinish(m_commandQueue);
|
||||
free(src);
|
||||
#endif //BT_ALWAYS_INITIALIZE_OPENCL_BUFFERS
|
||||
|
||||
if (copyOldContents)
|
||||
copyToCL(buf, size());
|
||||
|
||||
//deallocate the old buffer
|
||||
deallocate();
|
||||
|
||||
m_clBuffer = buf;
|
||||
|
||||
m_capacity = _Count;
|
||||
} else
|
||||
{
|
||||
//fail: assert and
|
||||
btAssert(0);
|
||||
deallocate();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void copyToCL(cl_mem destination, int numElements, int firstElem=0, int dstOffsetInElems=0) const
|
||||
{
|
||||
if (numElements<=0)
|
||||
return;
|
||||
|
||||
btAssert(m_clBuffer);
|
||||
btAssert(destination);
|
||||
|
||||
//likely some error, destination is same as source
|
||||
btAssert(m_clBuffer != destination);
|
||||
|
||||
btAssert((firstElem+numElements)<=m_size);
|
||||
|
||||
cl_int status = 0;
|
||||
|
||||
|
||||
btAssert(numElements>0);
|
||||
btAssert(numElements<=m_size);
|
||||
|
||||
int srcOffsetBytes = sizeof(T)*firstElem;
|
||||
int dstOffsetInBytes = sizeof(T)*dstOffsetInElems;
|
||||
|
||||
status = clEnqueueCopyBuffer( m_commandQueue, m_clBuffer, destination,
|
||||
srcOffsetBytes, dstOffsetInBytes, sizeof(T)*numElements, 0, 0, 0 );
|
||||
|
||||
btAssert( status == CL_SUCCESS );
|
||||
}
|
||||
|
||||
void copyFromHost(const btAlignedObjectArray<T>& srcArray, bool waitForCompletion=true)
|
||||
{
|
||||
int newSize = srcArray.size();
|
||||
|
||||
bool copyOldContents = false;
|
||||
resize (newSize,copyOldContents);
|
||||
if (newSize)
|
||||
copyFromHostPointer(&srcArray[0],newSize,0,waitForCompletion);
|
||||
|
||||
}
|
||||
|
||||
void copyFromHostPointer(const T* src, int numElems, int destFirstElem= 0, bool waitForCompletion=true)
|
||||
{
|
||||
btAssert(numElems+destFirstElem <= capacity());
|
||||
|
||||
cl_int status = 0;
|
||||
int sizeInBytes=sizeof(T)*numElems;
|
||||
status = clEnqueueWriteBuffer( m_commandQueue, m_clBuffer, 0, sizeof(T)*destFirstElem, sizeInBytes,
|
||||
src, 0,0,0 );
|
||||
btAssert(status == CL_SUCCESS );
|
||||
if (waitForCompletion)
|
||||
clFinish(m_commandQueue);
|
||||
|
||||
}
|
||||
|
||||
|
||||
void copyToHost(btAlignedObjectArray<T>& destArray, bool waitForCompletion=true) const
|
||||
{
|
||||
destArray.resize(this->size());
|
||||
if (size())
|
||||
copyToHostPointer(&destArray[0], size(),0,waitForCompletion);
|
||||
}
|
||||
|
||||
void copyToHostPointer(T* destPtr, int numElem, int srcFirstElem=0, bool waitForCompletion=true) const
|
||||
{
|
||||
btAssert(numElem+srcFirstElem <= capacity());
|
||||
|
||||
cl_int status = 0;
|
||||
status = clEnqueueReadBuffer( m_commandQueue, m_clBuffer, 0, sizeof(T)*srcFirstElem, sizeof(T)*numElem,
|
||||
destPtr, 0,0,0 );
|
||||
btAssert( status==CL_SUCCESS );
|
||||
|
||||
if (waitForCompletion)
|
||||
clFinish(m_commandQueue);
|
||||
}
|
||||
|
||||
void copyFromOpenCLArray(const btOpenCLArray& src)
|
||||
{
|
||||
int newSize = src.size();
|
||||
resize(newSize);
|
||||
if (size())
|
||||
{
|
||||
src.copyToCL(m_clBuffer,size());
|
||||
}
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
|
||||
#endif //BT_OPENCL_ARRAY_H
|
||||
126
opencl/parallel_primitives/host/btPrefixScanCL.cpp
Normal file
126
opencl/parallel_primitives/host/btPrefixScanCL.cpp
Normal file
@@ -0,0 +1,126 @@
|
||||
#include "btPrefixScanCL.h"
|
||||
#include "btFillCL.h"
|
||||
#define BT_PREFIXSCAN_PROG_PATH "opencl/parallel_primitives/kernels/PrefixScanKernels.cl"
|
||||
|
||||
#include "btLauncherCL.h"
|
||||
#include "../../basic_initialize/btOpenCLUtils.h"
|
||||
#include "../kernels/PrefixScanKernelsCL.h"
|
||||
|
||||
/// Allocates the work buffer (initial capacity `size`) and builds the three scan
/// kernels ("LocalScanKernel", "TopLevelScanKernel", "AddOffsetKernel") from the
/// embedded prefixScanKernelsCL source. Each build result is asserted non-null.
btPrefixScanCL::btPrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size)
:m_commandQueue(queue)
{
	const char* kernelSource = prefixScanKernelsCL;
	char* extraMacros = 0;	// no additional macros are passed to the compiler
	cl_int errorCode;

	m_workBuffer = new btOpenCLArray<unsigned int>(ctx,queue,size);

	// NOTE(review): the program handle is not released in this file; check
	// whether btOpenCLUtils keeps ownership of it.
	cl_program program = btOpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &errorCode, extraMacros, BT_PREFIXSCAN_PROG_PATH);
	btAssert(program);

	m_localScanKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "LocalScanKernel", &errorCode, program, extraMacros );
	btAssert(m_localScanKernel );

	m_blockSumKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "TopLevelScanKernel", &errorCode, program, extraMacros );
	btAssert(m_blockSumKernel );

	m_propagationKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "AddOffsetKernel", &errorCode, program, extraMacros );
	btAssert(m_propagationKernel );
}
|
||||
|
||||
|
||||
/// Frees the work buffer first, then releases the three kernels created by the constructor.
btPrefixScanCL::~btPrefixScanCL()
{
	delete m_workBuffer;

	clReleaseKernel( m_localScanKernel );
	clReleaseKernel( m_blockSumKernel );
	clReleaseKernel( m_propagationKernel );
}
|
||||
|
||||
/// Rounds n up to a power of two by decrementing, OR-ing the value with itself
/// shifted right by every amount from 0 to bit-width-1, and adding one.
/// A value that already is a power of two is returned unchanged.
template<class T>
T btNextPowerOf2(T n)
{
	T smeared = n - 1;
	int shift = 0;
	while (shift < sizeof(T)*8)
	{
		smeared = smeared | (smeared >> shift);
		++shift;
	}
	return smeared + 1;
}
|
||||
|
||||
/// Runs the scan on the device in up to three kernel launches: the local scan,
/// the top-level scan over the work buffer, and (only when there is more than
/// one block) the offset propagation into dst.
/// dst and the work buffer are resized to src.size(). When sum is non-null the
/// queue is finished and element n-1 of dst is read back into *sum.
void btPrefixScanCL::execute(btOpenCLArray<unsigned int>& src, btOpenCLArray<unsigned int>& dst, int n, unsigned int* sum)
{
	// n divided by 2*BLOCK_SIZE, rounded up.
	const unsigned int blockCount = (const unsigned int)( (n+BLOCK_SIZE*2-1)/(BLOCK_SIZE*2) );

	dst.resize(src.size());
	m_workBuffer->resize(src.size());

	// Constants shared by all launches; the w component is not assigned here.
	btInt4 kernelConsts;
	kernelConsts.x = n;
	kernelConsts.y = blockCount;
	kernelConsts.z = (int)btNextPowerOf2( blockCount );

	{
		// Launch 1: m_localScanKernel over (dst, src, work buffer).
		btBufferInfoCL buffers[] = { btBufferInfoCL( dst.getBufferCL() ), btBufferInfoCL( src.getBufferCL() ), btBufferInfoCL( m_workBuffer->getBufferCL() ) };

		btLauncherCL localScan( m_commandQueue, m_localScanKernel );
		localScan.setBuffers( buffers, sizeof(buffers)/sizeof(btBufferInfoCL) );
		localScan.setConst( kernelConsts );
		localScan.launch1D( blockCount*BLOCK_SIZE, BLOCK_SIZE );
	}

	{
		// Launch 2: m_blockSumKernel over the work buffer, one work-group.
		btBufferInfoCL buffers[] = { btBufferInfoCL( m_workBuffer->getBufferCL() ) };

		btLauncherCL blockSum( m_commandQueue, m_blockSumKernel );
		blockSum.setBuffers( buffers, sizeof(buffers)/sizeof(btBufferInfoCL) );
		blockSum.setConst( kernelConsts );
		blockSum.launch1D( BLOCK_SIZE, BLOCK_SIZE );
	}

	if( blockCount > 1 )
	{
		// Launch 3: m_propagationKernel over (dst, work buffer).
		btBufferInfoCL buffers[] = { btBufferInfoCL( dst.getBufferCL() ), btBufferInfoCL( m_workBuffer->getBufferCL() ) };

		btLauncherCL propagate( m_commandQueue, m_propagationKernel );
		propagate.setBuffers( buffers, sizeof(buffers)/sizeof(btBufferInfoCL) );
		propagate.setConst( kernelConsts );
		propagate.launch1D( (blockCount-1)*BLOCK_SIZE, BLOCK_SIZE );
	}

	if( !sum )
	{
		return;
	}

	// Wait for the kernels, then fetch dst[n-1] for the caller.
	clFinish(m_commandQueue);
	dst.copyToHostPointer(sum,1,n-1,true);
}
|
||||
|
||||
|
||||
/// Host version of the scan: dst[i] receives the sum of src[0..i-1], so dst[0]
/// is 0. When sum is non-null it receives dst[n-1], i.e. the last value written
/// to dst (src[n-1] is not included in it).
void btPrefixScanCL::executeHost(btAlignedObjectArray<unsigned int>& src, btAlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum)
{
	unsigned int running = 0;

	// Only the exclusive form is implemented; the value is stored before it is updated.
	int idx = 0;
	while (idx < n)
	{
		dst[idx] = running;
		running += src[idx];
		++idx;
	}

	if( !sum )
	{
		return;
	}
	*sum = dst[n-1];
}
|
||||
37
opencl/parallel_primitives/host/btPrefixScanCL.h
Normal file
37
opencl/parallel_primitives/host/btPrefixScanCL.h
Normal file
@@ -0,0 +1,37 @@
|
||||
|
||||
#ifndef BT_PREFIX_SCAN_CL_H
|
||||
#define BT_PREFIX_SCAN_CL_H
|
||||
|
||||
#include "btOpenCLArray.h"
|
||||
#include "btBufferInfoCL.h"
|
||||
#include "btAlignedObjectArray.h"
|
||||
|
||||
class btPrefixScanCL
|
||||
{
|
||||
enum
|
||||
{
|
||||
BLOCK_SIZE = 128
|
||||
};
|
||||
|
||||
// Option m_option;
|
||||
|
||||
cl_command_queue m_commandQueue;
|
||||
|
||||
cl_kernel m_localScanKernel;
|
||||
cl_kernel m_blockSumKernel;
|
||||
cl_kernel m_propagationKernel;
|
||||
|
||||
btOpenCLArray<unsigned int>* m_workBuffer;
|
||||
|
||||
|
||||
public:
|
||||
|
||||
btPrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue,int size=0);
|
||||
|
||||
virtual ~btPrefixScanCL();
|
||||
|
||||
void execute(btOpenCLArray<unsigned int>& src, btOpenCLArray<unsigned int>& dst, int n, unsigned int* sum = 0);
|
||||
void executeHost(btAlignedObjectArray<unsigned int>& src, btAlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum);
|
||||
};
|
||||
|
||||
#endif //BT_PREFIX_SCAN_CL_H
|
||||
566
opencl/parallel_primitives/host/btQuickprof.cpp
Normal file
566
opencl/parallel_primitives/host/btQuickprof.cpp
Normal file
@@ -0,0 +1,566 @@
|
||||
/*
|
||||
|
||||
***************************************************************************************************
|
||||
**
|
||||
** profile.cpp
|
||||
**
|
||||
** Real-Time Hierarchical Profiling for Game Programming Gems 3
|
||||
**
|
||||
** by Greg Hjelstrom & Byon Garrabrant
|
||||
**
|
||||
***************************************************************************************************/
|
||||
|
||||
// Credits: The Clock class was inspired by the Timer classes in
|
||||
// Ogre (www.ogre3d.org).
|
||||
|
||||
#include "btQuickprof.h"
|
||||
|
||||
#ifndef BT_NO_PROFILE
|
||||
|
||||
|
||||
static btClock gProfileClock;
|
||||
|
||||
|
||||
#ifdef __CELLOS_LV2__
|
||||
#include <sys/sys_time.h>
|
||||
#include <sys/time_util.h>
|
||||
#include <stdio.h>
|
||||
#endif
|
||||
|
||||
#if defined (SUNOS) || defined (__SUNOS__)
|
||||
#include <stdio.h>
|
||||
#endif
|
||||
|
||||
#if defined(WIN32) || defined(_WIN32)
|
||||
|
||||
#define BT_USE_WINDOWS_TIMERS
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#define NOWINRES
|
||||
#define NOMCX
|
||||
#define NOIME
|
||||
|
||||
#ifdef _XBOX
|
||||
#include <Xtl.h>
|
||||
#else //_XBOX
|
||||
#include <windows.h>
|
||||
#endif //_XBOX
|
||||
|
||||
#include <time.h>
|
||||
|
||||
|
||||
#else //_WIN32
|
||||
#include <sys/time.h>
|
||||
#endif //_WIN32
|
||||
|
||||
// Smaller of the two arguments. The previous definition used '>' and therefore
// returned the larger one. Arguments are parenthesized; each may be evaluated
// twice, so do not pass expressions with side effects.
#define mymin(a,b) ((a) < (b) ? (a) : (b))
|
||||
|
||||
/// Platform-specific state behind btClock: the reference start time, plus the
/// extra bookkeeping the Windows timer path needs.
struct btClockData
{
#if defined(BT_USE_WINDOWS_TIMERS)
	LARGE_INTEGER mClockFrequency;	// performance-counter frequency
	DWORD mStartTick;				// GetTickCount() value taken at reset
	LONGLONG mPrevElapsedTime;		// elapsed counter value from the previous query
	LARGE_INTEGER mStartTime;		// performance-counter value taken at reset
#elif defined(__CELLOS_LV2__)
	uint64_t	mStartTime;			// timebase value taken at reset
#else
	struct timeval mStartTime;		// gettimeofday() value taken at reset
#endif
};
|
||||
|
||||
///The btClock is a portable basic clock that measures accurate time in seconds, use for profiling.
|
||||
btClock::btClock()
|
||||
{
|
||||
m_data = new btClockData;
|
||||
#ifdef BT_USE_WINDOWS_TIMERS
|
||||
QueryPerformanceFrequency(&m_data->mClockFrequency);
|
||||
#endif
|
||||
reset();
|
||||
}
|
||||
|
||||
/// Frees the clock state allocated by the constructors.
btClock::~btClock()
{
	delete m_data;
}
|
||||
|
||||
/// Deep copy: the new clock gets its own btClockData holding a copy of other's state.
btClock::btClock(const btClock& other)
	: m_data(new btClockData(*other.m_data))
{
}
|
||||
|
||||
/// Copies other's clock state into this clock's existing btClockData; the
/// pointers themselves are never shared.
btClock& btClock::operator=(const btClock& other)
{
	btClockData& ownState = *m_data;
	ownState = *other.m_data;
	return *this;
}
|
||||
|
||||
|
||||
/// Resets the initial reference time.
|
||||
void btClock::reset()
|
||||
{
|
||||
#ifdef BT_USE_WINDOWS_TIMERS
|
||||
QueryPerformanceCounter(&m_data->mStartTime);
|
||||
m_data->mStartTick = GetTickCount();
|
||||
m_data->mPrevElapsedTime = 0;
|
||||
#else
|
||||
#ifdef __CELLOS_LV2__
|
||||
|
||||
typedef uint64_t ClockSize;
|
||||
ClockSize newTime;
|
||||
//__asm __volatile__( "mftb %0" : "=r" (newTime) : : "memory");
|
||||
SYS_TIMEBASE_GET( newTime );
|
||||
m_data->mStartTime = newTime;
|
||||
#else
|
||||
gettimeofday(&m_data->mStartTime, 0);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
/// Returns the time in ms since the last call to reset or since
|
||||
/// the btClock was created.
|
||||
unsigned long int btClock::getTimeMilliseconds()
|
||||
{
|
||||
#ifdef BT_USE_WINDOWS_TIMERS
|
||||
LARGE_INTEGER currentTime;
|
||||
QueryPerformanceCounter(¤tTime);
|
||||
LONGLONG elapsedTime = currentTime.QuadPart -
|
||||
m_data->mStartTime.QuadPart;
|
||||
// Compute the number of millisecond ticks elapsed.
|
||||
unsigned long msecTicks = (unsigned long)(1000 * elapsedTime /
|
||||
m_data->mClockFrequency.QuadPart);
|
||||
// Check for unexpected leaps in the Win32 performance counter.
|
||||
// (This is caused by unexpected data across the PCI to ISA
|
||||
// bridge, aka south bridge. See Microsoft KB274323.)
|
||||
unsigned long elapsedTicks = GetTickCount() - m_data->mStartTick;
|
||||
signed long msecOff = (signed long)(msecTicks - elapsedTicks);
|
||||
if (msecOff < -100 || msecOff > 100)
|
||||
{
|
||||
// Adjust the starting time forwards.
|
||||
LONGLONG msecAdjustment = mymin(msecOff *
|
||||
m_data->mClockFrequency.QuadPart / 1000, elapsedTime -
|
||||
m_data->mPrevElapsedTime);
|
||||
m_data->mStartTime.QuadPart += msecAdjustment;
|
||||
elapsedTime -= msecAdjustment;
|
||||
|
||||
// Recompute the number of millisecond ticks elapsed.
|
||||
msecTicks = (unsigned long)(1000 * elapsedTime /
|
||||
m_data->mClockFrequency.QuadPart);
|
||||
}
|
||||
|
||||
// Store the current elapsed time for adjustments next time.
|
||||
m_data->mPrevElapsedTime = elapsedTime;
|
||||
|
||||
return msecTicks;
|
||||
#else
|
||||
|
||||
#ifdef __CELLOS_LV2__
|
||||
uint64_t freq=sys_time_get_timebase_frequency();
|
||||
double dFreq=((double) freq) / 1000.0;
|
||||
typedef uint64_t ClockSize;
|
||||
ClockSize newTime;
|
||||
SYS_TIMEBASE_GET( newTime );
|
||||
//__asm __volatile__( "mftb %0" : "=r" (newTime) : : "memory");
|
||||
|
||||
return (unsigned long int)((double(newTime-m_data->mStartTime)) / dFreq);
|
||||
#else
|
||||
|
||||
struct timeval currentTime;
|
||||
gettimeofday(¤tTime, 0);
|
||||
return (currentTime.tv_sec - m_data->mStartTime.tv_sec) * 1000 +
|
||||
(currentTime.tv_usec - m_data->mStartTime.tv_usec) / 1000;
|
||||
#endif //__CELLOS_LV2__
|
||||
#endif
|
||||
}
|
||||
|
||||
/// Returns the time in us since the last call to reset or since
|
||||
/// the Clock was created.
|
||||
unsigned long int btClock::getTimeMicroseconds()
|
||||
{
|
||||
#ifdef BT_USE_WINDOWS_TIMERS
|
||||
LARGE_INTEGER currentTime;
|
||||
QueryPerformanceCounter(¤tTime);
|
||||
LONGLONG elapsedTime = currentTime.QuadPart -
|
||||
m_data->mStartTime.QuadPart;
|
||||
|
||||
// Compute the number of millisecond ticks elapsed.
|
||||
unsigned long msecTicks = (unsigned long)(1000 * elapsedTime /
|
||||
m_data->mClockFrequency.QuadPart);
|
||||
|
||||
// Check for unexpected leaps in the Win32 performance counter.
|
||||
// (This is caused by unexpected data across the PCI to ISA
|
||||
// bridge, aka south bridge. See Microsoft KB274323.)
|
||||
unsigned long elapsedTicks = GetTickCount() - m_data->mStartTick;
|
||||
signed long msecOff = (signed long)(msecTicks - elapsedTicks);
|
||||
if (msecOff < -100 || msecOff > 100)
|
||||
{
|
||||
// Adjust the starting time forwards.
|
||||
LONGLONG msecAdjustment = mymin(msecOff *
|
||||
m_data->mClockFrequency.QuadPart / 1000, elapsedTime -
|
||||
m_data->mPrevElapsedTime);
|
||||
m_data->mStartTime.QuadPart += msecAdjustment;
|
||||
elapsedTime -= msecAdjustment;
|
||||
}
|
||||
|
||||
// Store the current elapsed time for adjustments next time.
|
||||
m_data->mPrevElapsedTime = elapsedTime;
|
||||
|
||||
// Convert to microseconds.
|
||||
unsigned long usecTicks = (unsigned long)(1000000 * elapsedTime /
|
||||
m_data->mClockFrequency.QuadPart);
|
||||
|
||||
return usecTicks;
|
||||
#else
|
||||
|
||||
#ifdef __CELLOS_LV2__
|
||||
uint64_t freq=sys_time_get_timebase_frequency();
|
||||
double dFreq=((double) freq)/ 1000000.0;
|
||||
typedef uint64_t ClockSize;
|
||||
ClockSize newTime;
|
||||
//__asm __volatile__( "mftb %0" : "=r" (newTime) : : "memory");
|
||||
SYS_TIMEBASE_GET( newTime );
|
||||
|
||||
return (unsigned long int)((double(newTime-m_data->mStartTime)) / dFreq);
|
||||
#else
|
||||
|
||||
struct timeval currentTime;
|
||||
gettimeofday(¤tTime, 0);
|
||||
return (currentTime.tv_sec - m_data->mStartTime.tv_sec) * 1000000 +
|
||||
(currentTime.tv_usec - m_data->mStartTime.tv_usec);
|
||||
#endif//__CELLOS_LV2__
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
inline void Profile_Get_Ticks(unsigned long int * ticks)
|
||||
{
|
||||
*ticks = gProfileClock.getTimeMicroseconds();
|
||||
}
|
||||
|
||||
/// Divisor applied to tick differences. Ticks come from getTimeMicroseconds(),
/// so dividing by 1000 yields milliseconds (1000000.f would yield seconds).
inline float Profile_Get_Tick_Rate(void)
{
	const float ticksPerMillisecond = 1000.f;
	return ticksPerMillisecond;
}
|
||||
|
||||
|
||||
|
||||
/***************************************************************************************************
|
||||
**
|
||||
** CProfileNode
|
||||
**
|
||||
***************************************************************************************************/
|
||||
|
||||
/***********************************************************************************************
|
||||
* INPUT: *
|
||||
* name - pointer to a static string which is the name of this profile node *
|
||||
* parent - parent pointer *
|
||||
* *
|
||||
* WARNINGS: *
|
||||
* The name is assumed to be a static pointer, only the pointer is stored and compared for *
|
||||
* efficiency reasons. *
|
||||
*=============================================================================================*/
|
||||
// Builds a node with zeroed statistics and no children or siblings; only the
// name pointer is stored, the string itself is not copied.
CProfileNode::CProfileNode( const char * name, CProfileNode * parent )
	: Name( name )
	, TotalCalls( 0 )
	, TotalTime( 0 )
	, StartTime( 0 )
	, RecursionCounter( 0 )
	, Parent( parent )
	, Child( NULL )
	, Sibling( NULL )
	, m_userPtr(0)
{
	Reset();
}
|
||||
|
||||
|
||||
void CProfileNode::CleanupMemory()
|
||||
{
|
||||
delete ( Child);
|
||||
Child = NULL;
|
||||
delete ( Sibling);
|
||||
Sibling = NULL;
|
||||
}
|
||||
|
||||
// Deleting a node deletes its first child and its next sibling, and through
// their destructors everything linked from them.
CProfileNode::~CProfileNode( void )
{
	delete Child;
	delete Sibling;
}
|
||||
|
||||
|
||||
/***********************************************************************************************
|
||||
* INPUT: *
|
||||
* name - static string pointer to the name of the node we are searching for *
|
||||
* *
|
||||
* WARNINGS: *
|
||||
* All profile names are assumed to be static strings so this function uses pointer compares *
|
||||
* to find the named node. *
|
||||
*=============================================================================================*/
|
||||
// Returns the direct child whose Name pointer equals `name`; when there is
// none, a new child is created and linked in at the head of the child list.
CProfileNode * CProfileNode::Get_Sub_Node( const char * name )
{
	// Pointer comparison only: names are never compared by content.
	for ( CProfileNode * candidate = Child; candidate; candidate = candidate->Sibling ) {
		if ( candidate->Name == name ) {
			return candidate;
		}
	}

	// Not found: prepend a fresh node.
	CProfileNode * created = new CProfileNode( name, this );
	created->Sibling = Child;
	Child = created;
	return created;
}
|
||||
|
||||
|
||||
void CProfileNode::Reset( void )
|
||||
{
|
||||
TotalCalls = 0;
|
||||
TotalTime = 0.0f;
|
||||
|
||||
|
||||
if ( Child ) {
|
||||
Child->Reset();
|
||||
}
|
||||
if ( Sibling ) {
|
||||
Sibling->Reset();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void CProfileNode::Call( void )
|
||||
{
|
||||
TotalCalls++;
|
||||
if (RecursionCounter++ == 0) {
|
||||
Profile_Get_Ticks(&StartTime);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
bool CProfileNode::Return( void )
|
||||
{
|
||||
if ( --RecursionCounter == 0 && TotalCalls != 0 ) {
|
||||
unsigned long int time;
|
||||
Profile_Get_Ticks(&time);
|
||||
time-=StartTime;
|
||||
TotalTime += (float)time / Profile_Get_Tick_Rate();
|
||||
}
|
||||
return ( RecursionCounter == 0 );
|
||||
}
|
||||
|
||||
|
||||
/***************************************************************************************************
|
||||
**
|
||||
** CProfileIterator
|
||||
**
|
||||
***************************************************************************************************/
|
||||
// Positions the iterator on `start` as the parent, with its first child as the
// current child. `start` is dereferenced, so it must not be NULL.
CProfileIterator::CProfileIterator( CProfileNode * start )
{
	CurrentParent = start;
	CurrentChild = start->Get_Child();
}
|
||||
|
||||
|
||||
void CProfileIterator::First(void)
|
||||
{
|
||||
CurrentChild = CurrentParent->Get_Child();
|
||||
}
|
||||
|
||||
|
||||
void CProfileIterator::Next(void)
|
||||
{
|
||||
CurrentChild = CurrentChild->Get_Sibling();
|
||||
}
|
||||
|
||||
|
||||
bool CProfileIterator::Is_Done(void)
|
||||
{
|
||||
return CurrentChild == NULL;
|
||||
}
|
||||
|
||||
|
||||
void CProfileIterator::Enter_Child( int index )
|
||||
{
|
||||
CurrentChild = CurrentParent->Get_Child();
|
||||
while ( (CurrentChild != NULL) && (index != 0) ) {
|
||||
index--;
|
||||
CurrentChild = CurrentChild->Get_Sibling();
|
||||
}
|
||||
|
||||
if ( CurrentChild != NULL ) {
|
||||
CurrentParent = CurrentChild;
|
||||
CurrentChild = CurrentParent->Get_Child();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void CProfileIterator::Enter_Parent( void )
|
||||
{
|
||||
if ( CurrentParent->Get_Parent() != NULL ) {
|
||||
CurrentParent = CurrentParent->Get_Parent();
|
||||
}
|
||||
CurrentChild = CurrentParent->Get_Child();
|
||||
}
|
||||
|
||||
|
||||
/***************************************************************************************************
|
||||
**
|
||||
** CProfileManager
|
||||
**
|
||||
***************************************************************************************************/
|
||||
|
||||
CProfileNode CProfileManager::Root( "Root", NULL );
|
||||
CProfileNode * CProfileManager::CurrentNode = &CProfileManager::Root;
|
||||
int CProfileManager::FrameCounter = 0;
|
||||
unsigned long int CProfileManager::ResetTime = 0;
|
||||
|
||||
|
||||
/***********************************************************************************************
|
||||
* CProfileManager::Start_Profile -- Begin a named profile *
|
||||
* *
|
||||
* Steps one level deeper into the tree, if a child already exists with the specified name *
|
||||
* then it accumulates the profiling; otherwise a new child node is added to the profile tree. *
|
||||
* *
|
||||
* INPUT: *
|
||||
* name - name of this profiling record *
|
||||
* *
|
||||
* WARNINGS: *
|
||||
* The string used is assumed to be a static string; pointer compares are used throughout *
|
||||
* the profiling code for efficiency. *
|
||||
*=============================================================================================*/
|
||||
void CProfileManager::Start_Profile( const char * name )
|
||||
{
|
||||
if (name != CurrentNode->Get_Name()) {
|
||||
CurrentNode = CurrentNode->Get_Sub_Node( name );
|
||||
}
|
||||
|
||||
CurrentNode->Call();
|
||||
}
|
||||
|
||||
|
||||
/***********************************************************************************************
|
||||
* CProfileManager::Stop_Profile -- Stop timing and record the results. *
|
||||
*=============================================================================================*/
|
||||
void CProfileManager::Stop_Profile( void )
|
||||
{
|
||||
// Return will indicate whether we should back up to our parent (we may
|
||||
// be profiling a recursive function)
|
||||
if (CurrentNode->Return()) {
|
||||
CurrentNode = CurrentNode->Get_Parent();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/***********************************************************************************************
|
||||
* CProfileManager::Reset -- Reset the contents of the profiling system *
|
||||
* *
|
||||
* This resets everything except for the tree structure. All of the timing data is reset. *
|
||||
*=============================================================================================*/
|
||||
void CProfileManager::Reset( void )
|
||||
{
|
||||
gProfileClock.reset();
|
||||
Root.Reset();
|
||||
Root.Call();
|
||||
FrameCounter = 0;
|
||||
Profile_Get_Ticks(&ResetTime);
|
||||
}
|
||||
|
||||
|
||||
/***********************************************************************************************
|
||||
* CProfileManager::Increment_Frame_Counter -- Increment the frame counter *
|
||||
*=============================================================================================*/
|
||||
void CProfileManager::Increment_Frame_Counter( void )
|
||||
{
|
||||
FrameCounter++;
|
||||
}
|
||||
|
||||
|
||||
/***********************************************************************************************
|
||||
* CProfileManager::Get_Time_Since_Reset -- returns the elapsed time since last reset *
|
||||
*=============================================================================================*/
|
||||
float CProfileManager::Get_Time_Since_Reset( void )
|
||||
{
|
||||
unsigned long int time;
|
||||
Profile_Get_Ticks(&time);
|
||||
time -= ResetTime;
|
||||
return (float)time / Profile_Get_Tick_Rate();
|
||||
}
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
// Prints the children of the iterator's current parent, one line each, followed
// by the time not attributed to any child, then recurses into every child with
// the indentation increased by 3. Each output line is prefixed by `spacing` dots.
void	CProfileManager::dumpRecursive(CProfileIterator* profileIterator, int spacing)
{
	profileIterator->First();
	if (profileIterator->Is_Done())
		return;

	// The root has no recorded total, so the time since the last reset is used for it.
	float accumulated_time = 0;
	float parent_time = profileIterator->Is_Root() ? CProfileManager::Get_Time_Since_Reset() : profileIterator->Get_Current_Parent_Total_Time();
	int frames_since_reset = CProfileManager::Get_Frame_Count_Since_Reset();

	for (int dot = 0; dot < spacing; ++dot)
		printf(".");
	printf("----------------------------------\n");

	for (int dot = 0; dot < spacing; ++dot)
		printf(".");
	printf("Profiling: %s (total running time: %.3f ms) ---\n",	profileIterator->Get_Current_Parent_Name(), parent_time );

	int numChildren = 0;
	while (!profileIterator->Is_Done())
	{
		const int childIndex = numChildren++;

		float current_total_time = profileIterator->Get_Current_Total_Time();
		accumulated_time += current_total_time;
		// Share of the parent's time, in percent; 0 when the parent time is negligible.
		float fraction = parent_time > SIMD_EPSILON ? (current_total_time / parent_time) * 100 : 0.f;

		for (int dot = 0; dot < spacing; ++dot)
			printf(".");
		printf("%d -- %s (%.2f %%) :: %.3f ms / frame (%d calls)\n",childIndex, profileIterator->Get_Current_Name(), fraction,(current_total_time / (double)frames_since_reset),profileIterator->Get_Current_Total_Calls());

		profileIterator->Next();
	}

	// The children should never add up to more than their parent.
	if (parent_time < accumulated_time)
	{
		printf("what's wrong\n");
	}

	for (int dot = 0; dot < spacing; ++dot)
		printf(".");
	printf("%s (%.3f %%) :: %.3f ms\n", "Unaccounted:",parent_time > SIMD_EPSILON ? ((parent_time - accumulated_time) / parent_time) * 100 : 0.f, parent_time - accumulated_time);

	// Recurse into each child, restoring the iterator to this level afterwards.
	for (int child = 0; child < numChildren; ++child)
	{
		profileIterator->Enter_Child(child);
		dumpRecursive(profileIterator,spacing+3);
		profileIterator->Enter_Parent();
	}
}
|
||||
|
||||
|
||||
|
||||
void CProfileManager::dumpAll()
|
||||
{
|
||||
CProfileIterator* profileIterator = 0;
|
||||
profileIterator = CProfileManager::Get_Iterator();
|
||||
|
||||
dumpRecursive(profileIterator,0);
|
||||
|
||||
CProfileManager::Release_Iterator(profileIterator);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
#endif //BT_NO_PROFILE
|
||||
203
opencl/parallel_primitives/host/btQuickprof.h
Normal file
203
opencl/parallel_primitives/host/btQuickprof.h
Normal file
@@ -0,0 +1,203 @@
|
||||
|
||||
/***************************************************************************************************
|
||||
**
|
||||
** Real-Time Hierarchical Profiling for Game Programming Gems 3
|
||||
**
|
||||
** by Greg Hjelstrom & Byon Garrabrant
|
||||
**
|
||||
***************************************************************************************************/
|
||||
|
||||
// Credits: The Clock class was inspired by the Timer classes in
|
||||
// Ogre (www.ogre3d.org).
|
||||
|
||||
|
||||
|
||||
#ifndef BT_QUICK_PROF_H
|
||||
#define BT_QUICK_PROF_H
|
||||
|
||||
//To disable built-in profiling, please comment out next line
|
||||
//#define BT_NO_PROFILE 1
|
||||
#ifndef BT_NO_PROFILE
|
||||
#include <stdio.h>//@todo remove this, backwards compatibility
|
||||
#include "btScalar.h"
|
||||
#include "btAlignedAllocator.h"
|
||||
#include <new>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#define USE_BT_CLOCK 1
|
||||
|
||||
#ifdef USE_BT_CLOCK
|
||||
|
||||
///The btClock is a portable basic clock for profiling. It reports the time elapsed since
///the clock was created or last reset(), in milliseconds or in microseconds.
class btClock
{
public:
	btClock();

	btClock(const btClock& other);
	btClock& operator=(const btClock& other);

	~btClock();

	/// Resets the initial reference time.
	void reset();

	/// Returns the time in ms since the last call to reset or since
	/// the btClock was created.
	unsigned long int getTimeMilliseconds();

	/// Returns the time in us since the last call to reset or since
	/// the btClock was created.
	unsigned long int getTimeMicroseconds();

private:
	/// Implementation data; btClockData is only declared here, not defined.
	struct btClockData* m_data;
};
|
||||
|
||||
#endif //USE_BT_CLOCK
|
||||
|
||||
|
||||
|
||||
|
||||
///A node in the Profile Hierarchy Tree.
///Every node carries a name, its call count and accumulated time, links to its
///Parent, Child and Sibling nodes, and an opaque user pointer.
class CProfileNode
{
public:
	CProfileNode( const char * name, CProfileNode * parent );
	~CProfileNode( void );

	CProfileNode * Get_Sub_Node( const char * name );

	// Tree links
	CProfileNode * Get_Parent( void )		{ return Parent; }
	CProfileNode * Get_Sibling( void )		{ return Sibling; }
	CProfileNode * Get_Child( void )		{ return Child; }

	void		CleanupMemory();
	void		Reset( void );
	void		Call( void );
	bool		Return( void );

	// Recorded data
	const char *	Get_Name( void )		{ return Name; }
	int		Get_Total_Calls( void )		{ return TotalCalls; }
	float		Get_Total_Time( void )		{ return TotalTime; }

	// Opaque pointer stored on behalf of the caller; this class only stores and returns it.
	void*		GetUserPointer() const		{ return m_userPtr; }
	void		SetUserPointer(void* ptr)	{ m_userPtr = ptr; }

protected:
	const char *	Name;
	int		TotalCalls;
	float		TotalTime;
	unsigned long int	StartTime;
	int		RecursionCounter;

	CProfileNode *	Parent;
	CProfileNode *	Child;
	CProfileNode *	Sibling;
	void*		m_userPtr;
};
|
||||
|
||||
///An iterator to navigate through the tree
|
||||
class CProfileIterator
|
||||
{
|
||||
public:
|
||||
// Access all the children of the current parent
|
||||
void First(void);
|
||||
void Next(void);
|
||||
bool Is_Done(void);
|
||||
bool Is_Root(void) { return (CurrentParent->Get_Parent() == 0); }
|
||||
|
||||
void Enter_Child( int index ); // Make the given child the new parent
|
||||
void Enter_Largest_Child( void ); // Make the largest child the new parent
|
||||
void Enter_Parent( void ); // Make the current parent's parent the new parent
|
||||
|
||||
// Access the current child
|
||||
const char * Get_Current_Name( void ) { return CurrentChild->Get_Name(); }
|
||||
int Get_Current_Total_Calls( void ) { return CurrentChild->Get_Total_Calls(); }
|
||||
float Get_Current_Total_Time( void ) { return CurrentChild->Get_Total_Time(); }
|
||||
|
||||
void* Get_Current_UserPointer( void ) { return CurrentChild->GetUserPointer(); }
|
||||
void Set_Current_UserPointer(void* ptr) {CurrentChild->SetUserPointer(ptr);}
|
||||
// Access the current parent
|
||||
const char * Get_Current_Parent_Name( void ) { return CurrentParent->Get_Name(); }
|
||||
int Get_Current_Parent_Total_Calls( void ) { return CurrentParent->Get_Total_Calls(); }
|
||||
float Get_Current_Parent_Total_Time( void ) { return CurrentParent->Get_Total_Time(); }
|
||||
|
||||
|
||||
|
||||
protected:
|
||||
|
||||
CProfileNode * CurrentParent;
|
||||
CProfileNode * CurrentChild;
|
||||
|
||||
|
||||
CProfileIterator( CProfileNode * start );
|
||||
friend class CProfileManager;
|
||||
};
|
||||
|
||||
|
||||
///The Manager for the Profile system
|
||||
class CProfileManager {
|
||||
public:
|
||||
static void Start_Profile( const char * name );
|
||||
static void Stop_Profile( void );
|
||||
|
||||
static void CleanupMemory(void)
|
||||
{
|
||||
Root.CleanupMemory();
|
||||
}
|
||||
|
||||
static void Reset( void );
|
||||
static void Increment_Frame_Counter( void );
|
||||
static int Get_Frame_Count_Since_Reset( void ) { return FrameCounter; }
|
||||
static float Get_Time_Since_Reset( void );
|
||||
|
||||
static CProfileIterator * Get_Iterator( void )
|
||||
{
|
||||
|
||||
return new CProfileIterator( &Root );
|
||||
}
|
||||
static void Release_Iterator( CProfileIterator * iterator ) { delete ( iterator); }
|
||||
|
||||
static void dumpRecursive(CProfileIterator* profileIterator, int spacing);
|
||||
|
||||
static void dumpAll();
|
||||
|
||||
private:
|
||||
static CProfileNode Root;
|
||||
static CProfileNode * CurrentNode;
|
||||
static int FrameCounter;
|
||||
static unsigned long int ResetTime;
|
||||
};
|
||||
|
||||
|
||||
///ProfileSampleClass is a simple way to profile a function's scope
|
||||
///Use the BT_PROFILE macro at the start of scope to time
|
||||
class CProfileSample {
|
||||
public:
|
||||
CProfileSample( const char * name )
|
||||
{
|
||||
CProfileManager::Start_Profile( name );
|
||||
}
|
||||
|
||||
~CProfileSample( void )
|
||||
{
|
||||
CProfileManager::Stop_Profile();
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
#define BT_PROFILE( name ) CProfileSample __profile( name )
|
||||
|
||||
#else
|
||||
|
||||
#define BT_PROFILE( name )
|
||||
|
||||
#endif //#ifndef BT_NO_PROFILE
|
||||
|
||||
|
||||
|
||||
#endif //BT_QUICK_PROF_H
|
||||
|
||||
|
||||
712
opencl/parallel_primitives/host/btRadixSort32CL.cpp
Normal file
712
opencl/parallel_primitives/host/btRadixSort32CL.cpp
Normal file
@@ -0,0 +1,712 @@
|
||||
|
||||
#include "btRadixSort32CL.h"
|
||||
#include "btLauncherCL.h"
|
||||
#include "../../basic_initialize/btOpenCLUtils.h"
|
||||
#include "btPrefixScanCL.h"
|
||||
#include "btFillCL.h"
|
||||
|
||||
#define RADIXSORT32_PATH "opencl/parallel_primitives/kernels/RadixSort32Kernels.cl"
|
||||
|
||||
#include "../kernels/RadixSort32KernelsCL.h"
|
||||
|
||||
/// Creates the scratch buffers, the scan and fill helpers, and the OpenCL kernels
/// used by the sort.
///
/// @param ctx              OpenCL context the buffers and kernels are created in
/// @param device           device the kernels are compiled for; CPU devices get the
///                         "...Serial" scatter kernels
/// @param queue            command queue used for all launches
/// @param initialCapacity  when > 0, the scratch buffers are pre-sized to this many elements
btRadixSort32CL::btRadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity)
:m_commandQueue(queue)
{
	btOpenCLDeviceInfo info;
	btOpenCLUtils::getDeviceInfo(device,&info);
	m_deviceCPU = (info.m_deviceType & CL_DEVICE_TYPE_CPU)!=0;

	m_workBuffer1 = new btOpenCLArray<unsigned int>(ctx,queue);
	m_workBuffer2 = new btOpenCLArray<unsigned int>(ctx,queue);
	m_workBuffer3 = new btOpenCLArray<btSortData>(ctx,queue);
	m_workBuffer3a = new btOpenCLArray<unsigned int>(ctx,queue);
	m_workBuffer4 = new btOpenCLArray<btSortData>(ctx,queue);
	m_workBuffer4a = new btOpenCLArray<unsigned int>(ctx,queue);

	if (initialCapacity>0)
	{
		m_workBuffer1->resize(initialCapacity);
		m_workBuffer3->resize(initialCapacity);
		m_workBuffer3a->resize(initialCapacity);
		m_workBuffer4->resize(initialCapacity);
		m_workBuffer4a->resize(initialCapacity);
	}

	m_scan = new btPrefixScanCL(ctx,device,queue);
	m_fill = new btFillCL(ctx,device,queue);

	const char* additionalMacros = "";

	cl_int pErrNum;
	const char* kernelSource = radixSort32KernelsCL;

	// Build the program once and create every kernel from it.
	cl_program sortProg = btOpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, RADIXSORT32_PATH);
	btAssert(sortProg);

	m_streamCountSortDataKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "StreamCountSortDataKernel", &pErrNum, sortProg,additionalMacros );
	btAssert(m_streamCountSortDataKernel );

	m_streamCountKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "StreamCountKernel", &pErrNum, sortProg,additionalMacros );
	btAssert(m_streamCountKernel);

	// CPU devices use the serial scatter kernels, every other device the parallel ones.
	const char* scatterSortDataName = m_deviceCPU ? "SortAndScatterSortDataKernelSerial" : "SortAndScatterSortDataKernel";
	const char* scatterName = m_deviceCPU ? "SortAndScatterKernelSerial" : "SortAndScatterKernel";

	m_sortAndScatterSortDataKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, scatterSortDataName, &pErrNum, sortProg,additionalMacros );
	btAssert(m_sortAndScatterSortDataKernel);

	m_sortAndScatterKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, scatterName, &pErrNum, sortProg,additionalMacros );
	btAssert(m_sortAndScatterKernel);

	m_prefixScanKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "PrefixScanKernel", &pErrNum, sortProg,additionalMacros );
	btAssert(m_prefixScanKernel);

	// The program handle is not stored anywhere, so drop our reference now instead of
	// leaking it. Per the OpenCL specification the program object is only deleted once
	// the kernels created from it have been released too.
	if (sortProg)
	{
		clReleaseProgram(sortProg);
	}
}
|
||||
|
||||
/// Frees the helpers and scratch buffers allocated by the constructor and releases the kernels.
btRadixSort32CL::~btRadixSort32CL()
{
	delete m_scan;
	delete m_fill;

	delete m_workBuffer1;
	delete m_workBuffer2;
	delete m_workBuffer3;
	delete m_workBuffer3a;
	delete m_workBuffer4;
	delete m_workBuffer4a;

	// Same release order as before, expressed as a table.
	cl_kernel ownedKernels[] =
	{
		m_streamCountSortDataKernel,
		m_streamCountKernel,
		m_sortAndScatterSortDataKernel,
		m_sortAndScatterKernel,
		m_prefixScanKernel
	};
	const int numOwnedKernels = (int)(sizeof(ownedKernels)/sizeof(ownedKernels[0]));
	for (int k=0;k<numOwnedKernels;k++)
	{
		clReleaseKernel(ownedKernels[k]);
	}
}
|
||||
|
||||
/// Host-side (CPU) radix sort of key/value pairs by m_key, 8 bits per pass,
/// starting at the lowest bits.
///
/// @param inout     pairs to sort; holds the sorted result on return
/// @param sortBits  number of low key bits to sort on; one pass is run for every started
///                  group of 8 bits
///
/// Each pass scatters from one buffer into the other, so after an odd number of passes
/// the result sits in the temporary buffer and is copied back into inout.
void btRadixSort32CL::executeHost(btAlignedObjectArray<btSortData>& inout, int sortBits /* = 32 */)
{
	int n = inout.size();
	if (n == 0)
	{
		// Nothing to sort; also avoids taking the address of element 0 of an empty array.
		return;
	}

	const int BITS_PER_PASS = 8;
	const int NUM_TABLES = (1<<BITS_PER_PASS);

	int tables[NUM_TABLES];		// histogram, then start offset of every bucket
	int counter[NUM_TABLES];	// elements already written to every bucket

	btSortData* src = &inout[0];
	btAlignedObjectArray<btSortData> workbuffer;
	workbuffer.resize(inout.size());
	btSortData* dst = &workbuffer[0];

	int count=0;
	for(int startBit=0; startBit<sortBits; startBit+=BITS_PER_PASS)
	{
		for(int i=0; i<NUM_TABLES; i++)
		{
			tables[i] = 0;
		}

		// histogram of the current 8-bit digit
		for(int i=0; i<n; i++)
		{
			int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES-1);
			tables[tableIdx]++;
		}
//#define TEST
#ifdef TEST
		printf("histogram size=%d\n",NUM_TABLES);
		for (int i=0;i<NUM_TABLES;i++)
		{
			if (tables[i]!=0)
			{
				printf("tables[%d]=%d]\n",i,tables[i]);
			}
		}
#endif //TEST
		// prefix scan: turn the counts into the start offset of each bucket
		int sum = 0;
		for(int i=0; i<NUM_TABLES; i++)
		{
			int iData = tables[i];
			tables[i] = sum;
			sum += iData;
			counter[i] = 0;
		}

		// distribute: elements keep their relative order inside a bucket
		for(int i=0; i<n; i++)
		{
			int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES-1);

			dst[tables[tableIdx] + counter[tableIdx]] = src[i];
			counter[tableIdx] ++;
		}

		btSwap( src, dst );
		count++;
	}

	if (count&1)
	{
		// Odd number of passes: src now points at workbuffer (the sorted data) and dst
		// at inout, so copy the result back.
		for(int i=0; i<n; i++)
		{
			dst[i] = src[i];
		}
	}
}
|
||||
|
||||
/// Sorts an OpenCL array on the host: copies it to a host array, sorts that with the
/// btAlignedObjectArray overload of executeHost(), and copies the result back.
void btRadixSort32CL::executeHost(btOpenCLArray<btSortData>& keyValuesInOut, int sortBits /* = 32 */)
{
	btAlignedObjectArray<btSortData> hostCopy;

	keyValuesInOut.copyToHost(hostCopy);
	executeHost(hostCopy,sortBits);
	keyValuesInOut.copyFromHost(hostCopy);
}
|
||||
|
||||
/// Overload taking separate key and value input/output arrays.
/// NOTE(review): the body is empty, so this overload currently does nothing and
/// keysOut / valuesOut are left untouched. Callers that need a key/value sort
/// have to use the btSortData overload instead.
void btRadixSort32CL::execute(btOpenCLArray<unsigned int>& keysIn, btOpenCLArray<unsigned int>& keysOut, btOpenCLArray<unsigned int>& valuesIn,
|
||||
btOpenCLArray<unsigned int>& valuesOut, int n, int sortBits)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
//#define DEBUG_RADIXSORT
|
||||
//#define DEBUG_RADIXSORT2
|
||||
|
||||
|
||||
/// Sorts key/value pairs on the OpenCL device, 4 key bits per pass starting at bit 0.
///
/// @param keyValuesInOut  pairs to sort; holds the sorted result on return
/// @param sortBits        number of low key bits to sort on; must be a multiple of 4
///
/// The kernels work on a multiple of DATA_ALIGNMENT elements. Inputs of any other size
/// are copied into m_workBuffer4 and padded with 0xffffffff entries, which are cut off
/// again when the result is copied back (this relies on the padding sorting to the end).
///
/// Every pass runs three steps: per-work-group digit histogram, prefix scan of the
/// histogram, local sort and scatter into the destination buffer. Source and destination
/// swap after every pass. Whenever the final data does not already live in
/// keyValuesInOut (padded input and/or an odd number of passes) it is copied back at the end.
void btRadixSort32CL::execute(btOpenCLArray<btSortData>& keyValuesInOut, int sortBits /* = 32 */)
{
	int originalSize = keyValuesInOut.size();
	int workingSize = originalSize;

	int dataAlignment = DATA_ALIGNMENT;

#ifdef DEBUG_RADIXSORT2
	btAlignedObjectArray<btSortData>   test2;
	keyValuesInOut.copyToHost(test2);
	printf("numElem = %d\n",test2.size());
	for (int i=0;i<test2.size();i++)
	{
		printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
		printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
	}
#endif //DEBUG_RADIXSORT2

	btOpenCLArray<btSortData>* src = 0;

	if (workingSize%dataAlignment)
	{
		// Pad up to the next multiple of the alignment inside m_workBuffer4.
		workingSize += dataAlignment-(workingSize%dataAlignment);
		m_workBuffer4->copyFromOpenCLArray(keyValuesInOut);
		m_workBuffer4->resize(workingSize);
		btSortData fillValue;
		fillValue.m_key = 0xffffffff;
		fillValue.m_value = 0xffffffff;

		// Fill the padding on the device; btFillCL is handed the pairs as btInt2.
		m_fill->execute((btOpenCLArray<btInt2>&)*m_workBuffer4,(btInt2&)fillValue,workingSize-originalSize,originalSize);

		src = m_workBuffer4;
	} else
	{
		src = &keyValuesInOut;
		m_workBuffer4->resize(0);
	}

	btAssert( workingSize%DATA_ALIGNMENT == 0 );
	// One histogram entry per bucket and work group.
	int minCap = NUM_BUCKET*NUM_WGS;

	int n = workingSize;

	m_workBuffer1->resize(minCap);
	m_workBuffer3->resize(workingSize);

//	ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
	btAssert( BITS_PER_PASS == 4 );
	btAssert( WG_SIZE == 64 );
	btAssert( (sortBits&0x3) == 0 );

	btOpenCLArray<btSortData>* dst = m_workBuffer3;

	btOpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
	btOpenCLArray<unsigned int>* destHisto = m_workBuffer2;

	int nWGs = NUM_WGS;
	btConstData cdata;

	{
		int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256
		int nBlocks = (n+blockSize-1)/(blockSize);
		cdata.m_n = n;
		cdata.m_nWGs = NUM_WGS;
		cdata.m_startBit = 0;
		cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs;
		if( nBlocks < NUM_WGS )
		{
			// Fewer blocks than work groups: one block per group, fewer scatter groups.
			cdata.m_nBlocksPerWG = 1;
			nWGs = nBlocks;
		}
	}

	//fast prefix scan is not working properly on Mac OSX yet
#ifdef _WIN32
	const bool fastScan=!m_deviceCPU;//only use fast scan on GPU
#else
	const bool fastScan=false;
#endif

	for(int ib=0; ib<sortBits; ib+=4)
	{
#ifdef DEBUG_RADIXSORT2
		keyValuesInOut.copyToHost(test2);
		printf("numElem = %d\n",test2.size());
		for (int i=0;i<test2.size();i++)
		{
			if (test2[i].m_key != test2[i].m_value)
			{
				printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
				printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
			}
		}
#endif //DEBUG_RADIXSORT2

		cdata.m_startBit = ib;

		if (src->size())
		{// per work group histogram of the current digit
			btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( srcHisto->getBufferCL() ) };
			btLauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel);

			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
			launcher.setConst(  cdata );

			int num = NUM_WGS*WG_SIZE;
			launcher.launch1D( num, WG_SIZE );
		}

#ifdef DEBUG_RADIXSORT
		btAlignedObjectArray<unsigned int> testHist;
		srcHisto->copyToHost(testHist);
		printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
		for (int i=0;i<testHist.size();i++)
		{
			if (testHist[i]!=0)
				printf("testHist[%d]=%d\n",i,testHist[i]);
		}
#endif //DEBUG_RADIXSORT

		if (fastScan)
		{// prefix scan group histogram
			btBufferInfoCL bInfo[] = { btBufferInfoCL( srcHisto->getBufferCL() ) };
			btLauncherCL launcher( m_commandQueue, m_prefixScanKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
			launcher.setConst(  cdata );
			launcher.launch1D( 128, 128 );
			// the fast scan works in place
			destHisto = srcHisto;
		}else
		{
			//unsigned int sum; //for debugging
			m_scan->execute(*srcHisto,*destHisto,minCap,0);//,&sum);
		}

#ifdef DEBUG_RADIXSORT
		destHisto->copyToHost(testHist);
		printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
		for (int i=0;i<testHist.size();i++)
		{
			if (testHist[i]!=0)
				printf("testHist[%d]=%d\n",i,testHist[i]);
		}

		for (int i=0;i<testHist.size();i+=NUM_WGS)
		{
			printf("testHist[%d]=%d\n",i/NUM_WGS,testHist[i]);
		}
#endif //DEBUG_RADIXSORT

		if (src->size())
		{// local sort and distribute
			btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( destHisto->getBufferCL(), true ), btBufferInfoCL( dst->getBufferCL() )};
			btLauncherCL launcher( m_commandQueue, m_sortAndScatterSortDataKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
			launcher.setConst(  cdata );
			launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
		}

#ifdef DEBUG_RADIXSORT
		destHisto->copyToHost(testHist);
		printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
		for (int i=0;i<testHist.size();i++)
		{
			if (testHist[i]!=0)
				printf("testHist[%d]=%d\n",i,testHist[i]);
		}
#endif //DEBUG_RADIXSORT

		// The scatter output becomes the input of the next pass.
		btSwap(src, dst );
		btSwap(srcHisto,destHisto);

#ifdef DEBUG_RADIXSORT2
		keyValuesInOut.copyToHost(test2);
		printf("numElem = %d\n",test2.size());
		for (int i=0;i<test2.size();i++)
		{
			if (test2[i].m_key != test2[i].m_value)
			{
				printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
				printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
			}
		}
#endif //DEBUG_RADIXSORT2
	}

	// src points at the buffer that holds the final data. That is a work buffer when the
	// input was padded and/or the number of passes was odd: trim the padding and copy back.
	if (src != &keyValuesInOut)
	{
		src->resize(originalSize);
		keyValuesInOut.copyFromOpenCLArray(*src);
	}

#ifdef DEBUG_RADIXSORT
	{
		btAlignedObjectArray<btSortData> sortedHost;
		keyValuesInOut.copyToHost(sortedHost);

		printf("numElem = %d\n",sortedHost.size());
		for (int i=0;i<sortedHost.size();i++)
		{
			printf("test2[%d].m_key=%d\n",i,sortedHost[i].m_key);
			printf("test2[%d].m_value=%d\n",i,sortedHost[i].m_value);
		}
	}
#endif
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/// Keys-only sort on the OpenCL device, 4 key bits per pass starting at bit 0.
///
/// @param keysInOut  keys to sort; holds the sorted result on return
/// @param sortBits   number of low key bits to sort on; must be a multiple of 4
///
/// Inputs whose size is not a multiple of DATA_ALIGNMENT are copied into m_workBuffer4a
/// and padded with 0xffffffff, which is cut off again when the result is copied back
/// (this relies on the padding sorting to the end). Source and destination buffers swap
/// after every pass. Whenever the final data does not already live in keysInOut (padded
/// input and/or an odd number of passes) it is copied back at the end.
void btRadixSort32CL::execute(btOpenCLArray<unsigned int>& keysInOut, int sortBits /* = 32 */)
{
	int originalSize = keysInOut.size();
	int workingSize = originalSize;

	int dataAlignment = DATA_ALIGNMENT;

	btOpenCLArray<unsigned int>* src = 0;

	if (workingSize%dataAlignment)
	{
		// Pad up to the next multiple of the alignment inside m_workBuffer4a.
		workingSize += dataAlignment-(workingSize%dataAlignment);
		m_workBuffer4a->copyFromOpenCLArray(keysInOut);
		m_workBuffer4a->resize(workingSize);
		unsigned int fillValue = 0xffffffff;

		m_fill->execute(*m_workBuffer4a,fillValue,workingSize-originalSize,originalSize);

		src = m_workBuffer4a;
	} else
	{
		src = &keysInOut;
		m_workBuffer4a->resize(0);
	}

	btAssert( workingSize%DATA_ALIGNMENT == 0 );
	// One histogram entry per bucket and work group.
	int minCap = NUM_BUCKET*NUM_WGS;

	int n = workingSize;

	m_workBuffer1->resize(minCap);
	// NOTE(review): m_workBuffer3 (btSortData) is not used by this keys-only path; the
	// resize is kept from the original code.
	m_workBuffer3->resize(workingSize);
	m_workBuffer3a->resize(workingSize);

//	ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
	btAssert( BITS_PER_PASS == 4 );
	btAssert( WG_SIZE == 64 );
	btAssert( (sortBits&0x3) == 0 );

	btOpenCLArray<unsigned int>* dst = m_workBuffer3a;

	btOpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
	btOpenCLArray<unsigned int>* destHisto = m_workBuffer2;

	int nWGs = NUM_WGS;
	btConstData cdata;

	{
		int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256
		int nBlocks = (n+blockSize-1)/(blockSize);
		cdata.m_n = n;
		cdata.m_nWGs = NUM_WGS;
		cdata.m_startBit = 0;
		cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs;
		if( nBlocks < NUM_WGS )
		{
			// Fewer blocks than work groups: one block per group, fewer scatter groups.
			cdata.m_nBlocksPerWG = 1;
			nWGs = nBlocks;
		}
	}

	//fast prefix scan is not working properly on Mac OSX yet
#ifdef _WIN32
	const bool fastScan=!m_deviceCPU;
#else
	const bool fastScan=false;
#endif

	for(int ib=0; ib<sortBits; ib+=4)
	{
		cdata.m_startBit = ib;

		if (src->size())
		{// per work group histogram of the current digit
			btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( srcHisto->getBufferCL() ) };
			btLauncherCL launcher(m_commandQueue, m_streamCountKernel);

			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
			launcher.setConst(  cdata );

			int num = NUM_WGS*WG_SIZE;
			launcher.launch1D( num, WG_SIZE );
		}

		if (fastScan)
		{// prefix scan group histogram
			btBufferInfoCL bInfo[] = { btBufferInfoCL( srcHisto->getBufferCL() ) };
			btLauncherCL launcher( m_commandQueue, m_prefixScanKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
			launcher.setConst(  cdata );
			launcher.launch1D( 128, 128 );
			// the fast scan works in place
			destHisto = srcHisto;
		}else
		{
			//unsigned int sum; //for debugging
			m_scan->execute(*srcHisto,*destHisto,minCap,0);//,&sum);
		}

		if (src->size())
		{// local sort and distribute
			btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( destHisto->getBufferCL(), true ), btBufferInfoCL( dst->getBufferCL() )};
			btLauncherCL launcher( m_commandQueue, m_sortAndScatterKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
			launcher.setConst(  cdata );
			launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
		}

		// The scatter output becomes the input of the next pass.
		btSwap(src, dst );
		btSwap(srcHisto,destHisto);
	}

	// src points at the buffer that holds the final data. That is a work buffer when the
	// input was padded and/or the number of passes was odd: trim the padding and copy back.
	if (src != &keysInOut)
	{
		src->resize(originalSize);
		keysInOut.copyFromOpenCLArray(*src);
	}
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
85
opencl/parallel_primitives/host/btRadixSort32CL.h
Normal file
85
opencl/parallel_primitives/host/btRadixSort32CL.h
Normal file
@@ -0,0 +1,85 @@
|
||||
|
||||
#ifndef BT_RADIXSORT32_H
|
||||
#define BT_RADIXSORT32_H
|
||||
|
||||
#include "btOpenCLArray.h"
|
||||
|
||||
/// One key/value pair handled by btRadixSort32CL; the sort orders pairs by m_key.
struct btSortData
{
	int	m_key;		// sort key
	int	m_value;	// payload that travels with the key
};
|
||||
#include "btBufferInfoCL.h"
|
||||
|
||||
class btRadixSort32CL
|
||||
{
|
||||
|
||||
btOpenCLArray<unsigned int>* m_workBuffer1;
|
||||
btOpenCLArray<unsigned int>* m_workBuffer2;
|
||||
|
||||
btOpenCLArray<btSortData>* m_workBuffer3;
|
||||
btOpenCLArray<btSortData>* m_workBuffer4;
|
||||
|
||||
btOpenCLArray<unsigned int>* m_workBuffer3a;
|
||||
btOpenCLArray<unsigned int>* m_workBuffer4a;
|
||||
|
||||
cl_command_queue m_commandQueue;
|
||||
|
||||
cl_kernel m_streamCountSortDataKernel;
|
||||
cl_kernel m_streamCountKernel;
|
||||
|
||||
cl_kernel m_prefixScanKernel;
|
||||
cl_kernel m_sortAndScatterSortDataKernel;
|
||||
cl_kernel m_sortAndScatterKernel;
|
||||
|
||||
|
||||
bool m_deviceCPU;
|
||||
|
||||
class btPrefixScanCL* m_scan;
|
||||
class btFillCL* m_fill;
|
||||
|
||||
public:
|
||||
struct btConstData
|
||||
{
|
||||
int m_n;
|
||||
int m_nWGs;
|
||||
int m_startBit;
|
||||
int m_nBlocksPerWG;
|
||||
};
|
||||
enum
|
||||
{
|
||||
DATA_ALIGNMENT = 256,
|
||||
WG_SIZE = 64,
|
||||
BLOCK_SIZE = 256,
|
||||
ELEMENTS_PER_WORK_ITEM = (BLOCK_SIZE/WG_SIZE),
|
||||
BITS_PER_PASS = 4,
|
||||
NUM_BUCKET=(1<<BITS_PER_PASS),
|
||||
// if you change this, change nPerWI in kernel as well
|
||||
NUM_WGS = 20*6, // cypress
|
||||
// NUM_WGS = 24*6, // cayman
|
||||
// NUM_WGS = 32*4, // nv
|
||||
};
|
||||
|
||||
|
||||
private:
|
||||
|
||||
|
||||
public:
|
||||
|
||||
btRadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity =0);
|
||||
|
||||
virtual ~btRadixSort32CL();
|
||||
|
||||
void execute(btOpenCLArray<unsigned int>& keysIn, btOpenCLArray<unsigned int>& keysOut, btOpenCLArray<unsigned int>& valuesIn,
|
||||
btOpenCLArray<unsigned int>& valuesOut, int n, int sortBits = 32);
|
||||
|
||||
///keys only
|
||||
void execute(btOpenCLArray<unsigned int>& keysInOut, int sortBits = 32 );
|
||||
|
||||
void execute(btOpenCLArray<btSortData>& keyValuesInOut, int sortBits = 32 );
|
||||
void executeHost(btOpenCLArray<btSortData>& keyValuesInOut, int sortBits = 32);
|
||||
void executeHost(btAlignedObjectArray<btSortData>& keyValuesInOut, int sortBits = 32);
|
||||
|
||||
};
|
||||
#endif //BT_RADIXSORT32_H
|
||||
|
||||
660
opencl/parallel_primitives/host/btScalar.h
Normal file
660
opencl/parallel_primitives/host/btScalar.h
Normal file
@@ -0,0 +1,660 @@
|
||||
/*
|
||||
Copyright (c) 2003-2009 Erwin Coumans http://bullet.googlecode.com
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
|
||||
|
||||
#ifndef BT_SCALAR_H
|
||||
#define BT_SCALAR_H
|
||||
|
||||
#ifdef BT_MANAGED_CODE
|
||||
//Aligned data types not supported in managed code
|
||||
#pragma unmanaged
|
||||
#endif
|
||||
|
||||
|
||||
#include <math.h>
|
||||
#include <stdlib.h>//size_t for MSVC 6.0
|
||||
#include <float.h>
|
||||
|
||||
/* SVN $Revision$ on $Date$ from http://bullet.googlecode.com*/
|
||||
#define BT_BULLET_VERSION 281

/// Returns the library version number, i.e. the value of BT_BULLET_VERSION.
inline int btGetVersion()
{
	const int bulletVersion = BT_BULLET_VERSION;
	return bulletVersion;
}
|
||||
|
||||
#if defined(DEBUG) || defined (_DEBUG)
|
||||
#define BT_DEBUG
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef _WIN32
|
||||
|
||||
#if defined(__MINGW32__) || defined(__CYGWIN__) || (defined (_MSC_VER) && _MSC_VER < 1300)
|
||||
|
||||
#define SIMD_FORCE_INLINE inline
|
||||
#define ATTRIBUTE_ALIGNED16(a) a
|
||||
#define ATTRIBUTE_ALIGNED64(a) a
|
||||
#define ATTRIBUTE_ALIGNED128(a) a
|
||||
#else
|
||||
//#define BT_HAS_ALIGNED_ALLOCATOR
|
||||
#pragma warning(disable : 4324) // disable padding warning
|
||||
// #pragma warning(disable:4530) // Disable the exception disable but used in MSCV Stl warning.
|
||||
// #pragma warning(disable:4996) //Turn off warnings about deprecated C routines
|
||||
// #pragma warning(disable:4786) // Disable the "debug name too long" warning
|
||||
|
||||
#define SIMD_FORCE_INLINE __forceinline
|
||||
#define ATTRIBUTE_ALIGNED16(a) __declspec(align(16)) a
|
||||
#define ATTRIBUTE_ALIGNED64(a) __declspec(align(64)) a
|
||||
#define ATTRIBUTE_ALIGNED128(a) __declspec (align(128)) a
|
||||
#ifdef _XBOX
|
||||
#define BT_USE_VMX128
|
||||
|
||||
#include <ppcintrinsics.h>
|
||||
#define BT_HAVE_NATIVE_FSEL
|
||||
#define btFsel(a,b,c) __fsel((a),(b),(c))
|
||||
#else
|
||||
|
||||
#if (defined (_WIN32) && (_MSC_VER) && _MSC_VER >= 1400) && (!defined (BT_USE_DOUBLE_PRECISION))
|
||||
#define BT_USE_SSE
|
||||
#ifdef BT_USE_SSE
|
||||
//BT_USE_SSE_IN_API is disabled under Windows by default, because
|
||||
//it makes it harder to integrate Bullet into your application under Windows
|
||||
//(structures embedding Bullet structs/classes need to be 16-byte aligned)
|
||||
//with relatively little performance gain
|
||||
//If you are not embedding Bullet data in your classes, or you make sure that those classes are aligned on 16-byte boundaries,
|
||||
//you can manually enable this line or set it in the build system for a bit of performance gain (a few percent, dependent on usage)
|
||||
//#define BT_USE_SSE_IN_API
|
||||
#endif //BT_USE_SSE
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
|
||||
#endif//_XBOX
|
||||
|
||||
#endif //__MINGW32__
|
||||
|
||||
#ifdef BT_DEBUG
|
||||
#ifdef _MSC_VER
|
||||
#include <stdio.h>
|
||||
// Debug assert for MSVC: prints the file, line and failing expression, then breaks into the debugger.
// File name and expression text are passed as "%s" arguments instead of being pasted into the
// format string, so a '%' inside the asserted expression (e.g. btAssert(a % b == 0)) is printed
// literally rather than being parsed as a conversion specifier.
#define btAssert(x) { if(!(x)){printf("Assert %s:%d (%s)\n", __FILE__, __LINE__, #x);__debugbreak(); }}
|
||||
#else//_MSC_VER
|
||||
#include <assert.h>
|
||||
#define btAssert assert
|
||||
#endif//_MSC_VER
|
||||
#else
|
||||
#define btAssert(x)
|
||||
#endif
|
||||
//btFullAssert is optional, slows down a lot
|
||||
#define btFullAssert(x)
|
||||
|
||||
#define btLikely(_c) _c
|
||||
#define btUnlikely(_c) _c
|
||||
|
||||
#else
|
||||
|
||||
#if defined (__CELLOS_LV2__)
|
||||
#define SIMD_FORCE_INLINE inline __attribute__((always_inline))
|
||||
#define ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16)))
|
||||
#define ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64)))
|
||||
#define ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128)))
|
||||
#ifndef assert
|
||||
#include <assert.h>
|
||||
#endif
|
||||
#ifdef BT_DEBUG
|
||||
#ifdef __SPU__
|
||||
#include <spu_printf.h>
|
||||
#define printf spu_printf
|
||||
// Debug assert for the SPU: prints the file, line and failing expression, then executes spu_hcmpeq(0,0).
// File name and expression text are passed as "%s" arguments instead of being pasted into the
// format string, so a '%' inside the asserted expression is printed literally.
#define btAssert(x) {if(!(x)){printf("Assert %s:%d (%s)\n", __FILE__, __LINE__, #x);spu_hcmpeq(0,0);}}
|
||||
#else
|
||||
#define btAssert assert
|
||||
#endif
|
||||
|
||||
#else
|
||||
#define btAssert(x)
|
||||
#endif
|
||||
//btFullAssert is optional, slows down a lot
|
||||
#define btFullAssert(x)
|
||||
|
||||
#define btLikely(_c) _c
|
||||
#define btUnlikely(_c) _c
|
||||
|
||||
#else
|
||||
|
||||
#ifdef USE_LIBSPE2
|
||||
|
||||
#define SIMD_FORCE_INLINE __inline
|
||||
#define ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16)))
|
||||
#define ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64)))
|
||||
#define ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128)))
|
||||
#ifndef assert
|
||||
#include <assert.h>
|
||||
#endif
|
||||
#ifdef BT_DEBUG
|
||||
#define btAssert assert
|
||||
#else
|
||||
#define btAssert(x)
|
||||
#endif
|
||||
//btFullAssert is optional, slows down a lot
|
||||
#define btFullAssert(x)
|
||||
|
||||
|
||||
#define btLikely(_c) __builtin_expect((_c), 1)
|
||||
#define btUnlikely(_c) __builtin_expect((_c), 0)
|
||||
|
||||
|
||||
#else
|
||||
//non-windows systems
|
||||
|
||||
#if (defined (__APPLE__) && (!defined (BT_USE_DOUBLE_PRECISION)))
|
||||
#if defined (__i386__) || defined (__x86_64__)
|
||||
#define BT_USE_SSE
|
||||
//BT_USE_SSE_IN_API is enabled on Mac OSX by default, because memory is automatically aligned on 16-byte boundaries
|
||||
//if apps run into issues, we will disable the next line
|
||||
#define BT_USE_SSE_IN_API
|
||||
#ifdef BT_USE_SSE
|
||||
// include appropriate SSE level
|
||||
#if defined (__SSE4_1__)
|
||||
#include <smmintrin.h>
|
||||
#elif defined (__SSSE3__)
|
||||
#include <tmmintrin.h>
|
||||
#elif defined (__SSE3__)
|
||||
#include <pmmintrin.h>
|
||||
#else
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
#endif //BT_USE_SSE
|
||||
#elif defined( __armv7__ )
|
||||
#ifdef __clang__
|
||||
#define BT_USE_NEON 1
|
||||
|
||||
#if defined BT_USE_NEON && defined (__clang__)
|
||||
#include <arm_neon.h>
|
||||
#endif//BT_USE_NEON
|
||||
#endif //__clang__
|
||||
#endif //(__i386__ || __x86_64__) / __armv7__
|
||||
|
||||
#define SIMD_FORCE_INLINE inline __attribute__ ((always_inline))
|
||||
///@todo: check out alignment methods for other platforms/compilers
|
||||
#define ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16)))
|
||||
#define ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64)))
|
||||
#define ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128)))
|
||||
#ifndef assert
|
||||
#include <assert.h>
|
||||
#endif
|
||||
|
||||
#if defined(DEBUG) || defined (_DEBUG)
|
||||
#if defined (__i386__) || defined (__x86_64__)
|
||||
#include <stdio.h>
|
||||
#define btAssert(x)\
|
||||
{\
|
||||
if(!(x))\
|
||||
{\
|
||||
printf("Assert %s in line %d, file %s\n",#x, __LINE__, __FILE__);\
|
||||
asm volatile ("int3");\
|
||||
}\
|
||||
}
|
||||
#else//defined (__i386__) || defined (__x86_64__)
|
||||
#define btAssert assert
|
||||
#endif//defined (__i386__) || defined (__x86_64__)
|
||||
#else//defined(DEBUG) || defined (_DEBUG)
|
||||
#define btAssert(x)
|
||||
#endif//defined(DEBUG) || defined (_DEBUG)
|
||||
|
||||
//btFullAssert is optional, slows down a lot
|
||||
#define btFullAssert(x)
|
||||
#define btLikely(_c) _c
|
||||
#define btUnlikely(_c) _c
|
||||
|
||||
#else
|
||||
|
||||
#define SIMD_FORCE_INLINE inline
|
||||
///@todo: check out alignment methods for other platforms/compilers
|
||||
///#define ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16)))
|
||||
///#define ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64)))
|
||||
///#define ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128)))
|
||||
#define ATTRIBUTE_ALIGNED16(a) a
|
||||
#define ATTRIBUTE_ALIGNED64(a) a
|
||||
#define ATTRIBUTE_ALIGNED128(a) a
|
||||
#ifndef assert
|
||||
#include <assert.h>
|
||||
#endif
|
||||
|
||||
#if defined(DEBUG) || defined (_DEBUG)
|
||||
#define btAssert assert
|
||||
#else
|
||||
#define btAssert(x)
|
||||
#endif
|
||||
|
||||
//btFullAssert is optional, slows down a lot
|
||||
#define btFullAssert(x)
|
||||
#define btLikely(_c) _c
|
||||
#define btUnlikely(_c) _c
|
||||
#endif //__APPLE__
|
||||
|
||||
#endif // LIBSPE2
|
||||
|
||||
#endif //__CELLOS_LV2__
|
||||
#endif
|
||||
|
||||
|
||||
///The btScalar type abstracts floating point numbers, to easily switch between double and single floating point precision.
|
||||
#if defined(BT_USE_DOUBLE_PRECISION)
|
||||
typedef double btScalar;
|
||||
//this number could be bigger in double precision
|
||||
#define BT_LARGE_FLOAT 1e30
|
||||
#else
|
||||
typedef float btScalar;
|
||||
//keep BT_LARGE_FLOAT*BT_LARGE_FLOAT < FLT_MAX
|
||||
#define BT_LARGE_FLOAT 1e18f
|
||||
#endif
|
||||
|
||||
#ifdef BT_USE_SSE
|
||||
typedef __m128 btSimdFloat4;
|
||||
#endif//BT_USE_SSE
|
||||
|
||||
#if defined BT_USE_SSE_IN_API && defined (BT_USE_SSE)
|
||||
#ifdef _WIN32
|
||||
|
||||
#ifndef BT_NAN
|
||||
static int btNanMask = 0x7F800001;
|
||||
#define BT_NAN (*(float*)&btNanMask)
|
||||
#endif
|
||||
|
||||
#ifndef BT_INFINITY
|
||||
static int btInfinityMask = 0x7F800000;
|
||||
#define BT_INFINITY (*(float*)&btInfinityMask)
|
||||
#endif
|
||||
|
||||
inline __m128 operator + (const __m128 A, const __m128 B)
|
||||
{
|
||||
return _mm_add_ps(A, B);
|
||||
}
|
||||
|
||||
inline __m128 operator - (const __m128 A, const __m128 B)
|
||||
{
|
||||
return _mm_sub_ps(A, B);
|
||||
}
|
||||
|
||||
inline __m128 operator * (const __m128 A, const __m128 B)
|
||||
{
|
||||
return _mm_mul_ps(A, B);
|
||||
}
|
||||
|
||||
#define btCastfTo128i(a) (_mm_castps_si128(a))
|
||||
#define btCastfTo128d(a) (_mm_castps_pd(a))
|
||||
#define btCastiTo128f(a) (_mm_castsi128_ps(a))
|
||||
#define btCastdTo128f(a) (_mm_castpd_ps(a))
|
||||
#define btCastdTo128i(a) (_mm_castpd_si128(a))
|
||||
#define btAssign128(r0,r1,r2,r3) _mm_setr_ps(r0,r1,r2,r3)
|
||||
|
||||
#else//_WIN32
|
||||
|
||||
#define btCastfTo128i(a) ((__m128i)(a))
|
||||
#define btCastfTo128d(a) ((__m128d)(a))
|
||||
#define btCastiTo128f(a) ((__m128) (a))
|
||||
#define btCastdTo128f(a) ((__m128) (a))
|
||||
#define btCastdTo128i(a) ((__m128i)(a))
|
||||
#define btAssign128(r0,r1,r2,r3) (__m128){r0,r1,r2,r3}
|
||||
#define BT_INFINITY INFINITY
|
||||
#define BT_NAN NAN
|
||||
#endif//_WIN32
|
||||
#endif //BT_USE_SSE_IN_API
|
||||
|
||||
#ifdef BT_USE_NEON
|
||||
#include <arm_neon.h>
|
||||
|
||||
typedef float32x4_t btSimdFloat4;
|
||||
#define BT_INFINITY INFINITY
|
||||
#define BT_NAN NAN
|
||||
#define btAssign128(r0,r1,r2,r3) (float32x4_t){r0,r1,r2,r3}
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#define BT_DECLARE_ALIGNED_ALLOCATOR() \
|
||||
SIMD_FORCE_INLINE void* operator new(size_t sizeInBytes) { return btAlignedAlloc(sizeInBytes,16); } \
|
||||
SIMD_FORCE_INLINE void operator delete(void* ptr) { btAlignedFree(ptr); } \
|
||||
SIMD_FORCE_INLINE void* operator new(size_t, void* ptr) { return ptr; } \
|
||||
SIMD_FORCE_INLINE void operator delete(void*, void*) { } \
|
||||
SIMD_FORCE_INLINE void* operator new[](size_t sizeInBytes) { return btAlignedAlloc(sizeInBytes,16); } \
|
||||
SIMD_FORCE_INLINE void operator delete[](void* ptr) { btAlignedFree(ptr); } \
|
||||
SIMD_FORCE_INLINE void* operator new[](size_t, void* ptr) { return ptr; } \
|
||||
SIMD_FORCE_INLINE void operator delete[](void*, void*) { } \
|
||||
|
||||
|
||||
|
||||
#if defined(BT_USE_DOUBLE_PRECISION) || defined(BT_FORCE_DOUBLE_FUNCTIONS)
|
||||
|
||||
// Math wrappers used when BT_USE_DOUBLE_PRECISION or BT_FORCE_DOUBLE_FUNCTIONS is defined:
// each one forwards to the double-precision <math.h> routine.

/// Square root via sqrt().
SIMD_FORCE_INLINE btScalar btSqrt(btScalar x)
{
	return sqrt(x);
}

/// Absolute value via fabs().
SIMD_FORCE_INLINE btScalar btFabs(btScalar x)
{
	return fabs(x);
}

/// Cosine via cos().
SIMD_FORCE_INLINE btScalar btCos(btScalar x)
{
	return cos(x);
}

/// Sine via sin().
SIMD_FORCE_INLINE btScalar btSin(btScalar x)
{
	return sin(x);
}

/// Tangent via tan().
SIMD_FORCE_INLINE btScalar btTan(btScalar x)
{
	return tan(x);
}

/// Arc cosine; inputs below -1 or above 1 are clamped to that bound before calling acos().
SIMD_FORCE_INLINE btScalar btAcos(btScalar x)
{
	const btScalar clamped = (x < btScalar(-1)) ? btScalar(-1) : ((x > btScalar(1)) ? btScalar(1) : x);
	return acos(clamped);
}

/// Arc sine; inputs below -1 or above 1 are clamped to that bound before calling asin().
SIMD_FORCE_INLINE btScalar btAsin(btScalar x)
{
	const btScalar clamped = (x < btScalar(-1)) ? btScalar(-1) : ((x > btScalar(1)) ? btScalar(1) : x);
	return asin(clamped);
}

/// Arc tangent via atan().
SIMD_FORCE_INLINE btScalar btAtan(btScalar x)
{
	return atan(x);
}

/// Two-argument arc tangent. The arguments are forwarded in the order given, so the
/// parameter named x is passed as atan2()'s first argument and y as its second.
SIMD_FORCE_INLINE btScalar btAtan2(btScalar x, btScalar y)
{
	return atan2(x, y);
}

/// Exponential via exp().
SIMD_FORCE_INLINE btScalar btExp(btScalar x)
{
	return exp(x);
}

/// Natural logarithm via log().
SIMD_FORCE_INLINE btScalar btLog(btScalar x)
{
	return log(x);
}

/// x raised to the power y via pow().
SIMD_FORCE_INLINE btScalar btPow(btScalar x,btScalar y)
{
	return pow(x,y);
}

/// Floating-point remainder of x/y via fmod().
SIMD_FORCE_INLINE btScalar btFmod(btScalar x,btScalar y)
{
	return fmod(x,y);
}
|
||||
|
||||
#else
|
||||
|
||||
/// Square root of y.
/// By default this forwards to sqrtf(). When USE_APPROXIMATION is defined it instead
/// builds an estimate of 1/sqrt(y) from the bit pattern of y, refines it with five
/// iterations of x = 1.5*x - x*x*x*(y/2), and returns that estimate multiplied by y.
SIMD_FORCE_INLINE btScalar btSqrt(btScalar y)
{
#ifdef USE_APPROXIMATION
	double x, z, tempf;
	// The bit trick rewrites the 32-bit word stored at byte offset 4 of the double
	// (its high word on a little-endian machine, which is what the original code
	// addressed with "+ 1"). The word is moved in and out byte by byte: this avoids
	// aliasing the double through an integer pointer, and it no longer depends on
	// "unsigned long" being 32 bits wide (with a 64-bit unsigned long the old
	// pointer arithmetic addressed memory past the end of tempf).
	// NOTE(review): still assumes a 64-bit double, a 32-bit unsigned int and
	// little-endian byte order - confirm before enabling on other targets.
	unsigned int highWord = 0;
	unsigned char* highWordBytes = (unsigned char*)&highWord;
	unsigned char* tempHighBytes = ((unsigned char*)&tempf) + sizeof(unsigned int);
	unsigned int i;

	tempf = y;
	for (i = 0; i < sizeof(unsigned int); ++i)
	{
		highWordBytes[i] = tempHighBytes[i];
	}
	highWord = (0xbfcdd90a - highWord)>>1; /* estimate of 1/sqrt(y) */
	for (i = 0; i < sizeof(unsigned int); ++i)
	{
		tempHighBytes[i] = highWordBytes[i];
	}
	x = tempf;
	z = y*btScalar(0.5);
	x = (btScalar(1.5)*x)-(x*x)*(x*z); /* iteration formula */
	x = (btScalar(1.5)*x)-(x*x)*(x*z);
	x = (btScalar(1.5)*x)-(x*x)*(x*z);
	x = (btScalar(1.5)*x)-(x*x)*(x*z);
	x = (btScalar(1.5)*x)-(x*x)*(x*z);
	return x*y;
#else
	return sqrtf(y);
#endif
}
|
||||
// Single-precision math wrappers: each one forwards to the float <math.h> routine.

/// Absolute value via fabsf().
SIMD_FORCE_INLINE btScalar btFabs(btScalar x)
{
	return fabsf(x);
}

/// Cosine via cosf().
SIMD_FORCE_INLINE btScalar btCos(btScalar x)
{
	return cosf(x);
}

/// Sine via sinf().
SIMD_FORCE_INLINE btScalar btSin(btScalar x)
{
	return sinf(x);
}

/// Tangent via tanf().
SIMD_FORCE_INLINE btScalar btTan(btScalar x)
{
	return tanf(x);
}

/// Arc cosine; inputs below -1 or above 1 are clamped to that bound before calling acosf().
SIMD_FORCE_INLINE btScalar btAcos(btScalar x)
{
	const btScalar clamped = (x < btScalar(-1)) ? btScalar(-1) : ((x > btScalar(1)) ? btScalar(1) : x);
	return acosf(clamped);
}

/// Arc sine; inputs below -1 or above 1 are clamped to that bound before calling asinf().
SIMD_FORCE_INLINE btScalar btAsin(btScalar x)
{
	const btScalar clamped = (x < btScalar(-1)) ? btScalar(-1) : ((x > btScalar(1)) ? btScalar(1) : x);
	return asinf(clamped);
}

/// Arc tangent via atanf().
SIMD_FORCE_INLINE btScalar btAtan(btScalar x)
{
	return atanf(x);
}

/// Two-argument arc tangent. The arguments are forwarded in the order given, so the
/// parameter named x is passed as atan2f()'s first argument and y as its second.
SIMD_FORCE_INLINE btScalar btAtan2(btScalar x, btScalar y)
{
	return atan2f(x, y);
}

/// Exponential via expf().
SIMD_FORCE_INLINE btScalar btExp(btScalar x)
{
	return expf(x);
}

/// Natural logarithm via logf().
SIMD_FORCE_INLINE btScalar btLog(btScalar x)
{
	return logf(x);
}

/// x raised to the power y via powf().
SIMD_FORCE_INLINE btScalar btPow(btScalar x,btScalar y)
{
	return powf(x,y);
}

/// Floating-point remainder of x/y via fmodf().
SIMD_FORCE_INLINE btScalar btFmod(btScalar x,btScalar y)
{
	return fmodf(x,y);
}
|
||||
|
||||
#endif
|
||||
|
||||
#define SIMD_2_PI btScalar(6.283185307179586232)
|
||||
#define SIMD_PI (SIMD_2_PI * btScalar(0.5))
|
||||
#define SIMD_HALF_PI (SIMD_2_PI * btScalar(0.25))
|
||||
#define SIMD_RADS_PER_DEG (SIMD_2_PI / btScalar(360.0))
|
||||
#define SIMD_DEGS_PER_RAD (btScalar(360.0) / SIMD_2_PI)
|
||||
#define SIMDSQRT12 btScalar(0.7071067811865475244008443621048490)
|
||||
|
||||
#define btRecipSqrt(x) ((btScalar)(btScalar(1.0)/btSqrt(btScalar(x)))) /* reciprocal square root */
|
||||
|
||||
|
||||
#ifdef BT_USE_DOUBLE_PRECISION
|
||||
#define SIMD_EPSILON DBL_EPSILON
|
||||
#define SIMD_INFINITY DBL_MAX
|
||||
#else
|
||||
#define SIMD_EPSILON FLT_EPSILON
|
||||
#define SIMD_INFINITY FLT_MAX
|
||||
#endif
|
||||
|
||||
/// Computes an angle for the point (x, y) without calling a trigonometric routine:
/// a ratio r of (x -/+ |y|) terms is formed and the result is offset - (pi/4)*r, where
/// the offset is pi/4 for x >= 0 and 3*pi/4 otherwise; the sign of y is applied last.
/// Presumably intended as a cheap approximation of atan2(y, x).
/// Note: x == 0 together with y == 0 makes the ratio 0/0.
SIMD_FORCE_INLINE btScalar btAtan2Fast(btScalar y, btScalar x)
{
	const btScalar quarterPi = SIMD_PI / 4.0f;
	const btScalar threeQuarterPi = 3.0f * quarterPi;
	const btScalar absY = btFabs(y);

	btScalar unsignedAngle;
	if (x >= 0.0f)
	{
		const btScalar ratio = (x - absY) / (x + absY);
		unsignedAngle = quarterPi - quarterPi * ratio;
	}
	else
	{
		const btScalar ratio = (x + absY) / (absY - x);
		unsignedAngle = threeQuarterPi - quarterPi * ratio;
	}

	if (y < 0.0f)
	{
		return -unsignedAngle;
	}
	return unsignedAngle;
}
|
||||
|
||||
/// True when |x| is strictly smaller than SIMD_EPSILON.
SIMD_FORCE_INLINE bool btFuzzyZero(btScalar x)
{
	const btScalar magnitude = btFabs(x);
	return magnitude < SIMD_EPSILON;
}

/// True when a lies in the closed interval [-eps, eps]; false for NaN.
SIMD_FORCE_INLINE bool btEqual(btScalar a, btScalar eps)
{
	if (!(a <= eps))
	{
		return false;
	}
	return !(a < -eps);
}

/// True when "a <= eps" does not hold, i.e. when a is strictly greater than eps
/// (and also when the comparison fails because an operand is NaN).
SIMD_FORCE_INLINE bool btGreaterEqual (btScalar a, btScalar eps)
{
	const bool notAbove = (a <= eps);
	return !notAbove;
}

/// Returns 1 when x is below zero, otherwise 0.
SIMD_FORCE_INLINE int btIsNegative(btScalar x)
{
	if (x < btScalar(0.0))
	{
		return 1;
	}
	return 0;
}
|
||||
|
||||
/// Converts degrees to radians by multiplying with SIMD_RADS_PER_DEG.
SIMD_FORCE_INLINE btScalar btRadians(btScalar x)
{
	return x * SIMD_RADS_PER_DEG;
}

/// Converts radians to degrees by multiplying with SIMD_DEGS_PER_RAD.
SIMD_FORCE_INLINE btScalar btDegrees(btScalar x)
{
	return x * SIMD_DEGS_PER_RAD;
}
|
||||
|
||||
#define BT_DECLARE_HANDLE(name) typedef struct name##__ { int unused; } *name
|
||||
|
||||
#ifndef btFsel
|
||||
SIMD_FORCE_INLINE btScalar btFsel(btScalar a, btScalar b, btScalar c)
|
||||
{
|
||||
return a >= 0 ? b : c;
|
||||
}
|
||||
#endif
|
||||
#define btFsels(a,b,c) (btScalar)btFsel(a,b,c)
|
||||
|
||||
|
||||
/// Detects byte order at run time by storing the value 1 in a long and
/// inspecting the byte at its lowest address.
SIMD_FORCE_INLINE bool btMachineIsLittleEndian()
{
	long int probe = 1;
	const char *firstByte = (const char *) &probe;
	// On a little-endian machine the least significant byte is stored first.
	return firstByte[0] == 1;
}
|
||||
|
||||
|
||||
|
||||
///btSelect avoids branches, which makes performance much better for consoles like Playstation 3 and XBox 360
|
||||
///Thanks Phil Knight. See also http://www.cellperformance.com/articles/2006/04/more_techniques_for_eliminatin_1.html
|
||||
SIMD_FORCE_INLINE unsigned btSelect(unsigned condition, unsigned valueIfConditionNonZero, unsigned valueIfConditionZero)
|
||||
{
|
||||
// Set testNz to 0xFFFFFFFF if condition is nonzero, 0x00000000 if condition is zero
|
||||
// Rely on positive value or'ed with its negative having sign bit on
|
||||
// and zero value or'ed with its negative (which is still zero) having sign bit off
|
||||
// Use arithmetic shift right, shifting the sign bit through all 32 bits
|
||||
unsigned testNz = (unsigned)(((int)condition | -(int)condition) >> 31);
|
||||
unsigned testEqz = ~testNz;
|
||||
return ((valueIfConditionNonZero & testNz) | (valueIfConditionZero & testEqz));
|
||||
}
|
||||
/// Branch-free select for int values: returns valueIfConditionNonZero when
/// condition != 0, otherwise valueIfConditionZero.
SIMD_FORCE_INLINE int btSelect(unsigned condition, int valueIfConditionNonZero, int valueIfConditionZero)
{
	// Mask is all ones when condition is nonzero and zero otherwise. It is computed in
	// unsigned arithmetic: negating the value as a signed int overflows for 0x80000000,
	// and right-shifting a negative int is implementation-defined.
	// Like the original, this relies on unsigned being 32 bits wide.
	unsigned testNz = 0u - ((condition | (0u - condition)) >> 31);
	unsigned testEqz = ~testNz;
	return static_cast<int>((valueIfConditionNonZero & testNz) | (valueIfConditionZero & testEqz));
}
|
||||
/// Select for float values: returns valueIfConditionNonZero when condition != 0,
/// otherwise valueIfConditionZero. With BT_HAVE_NATIVE_FSEL the choice is delegated
/// to btFsel, keyed on (condition - 1).
SIMD_FORCE_INLINE float btSelect(unsigned condition, float valueIfConditionNonZero, float valueIfConditionZero)
{
#ifdef BT_HAVE_NATIVE_FSEL
	return (float)btFsel((btScalar)condition - btScalar(1.0f), valueIfConditionNonZero, valueIfConditionZero);
#else
	if (condition != 0)
	{
		return valueIfConditionNonZero;
	}
	return valueIfConditionZero;
#endif
}
|
||||
|
||||
/// Exchanges the values of a and b through a temporary copy of a.
template<typename T> SIMD_FORCE_INLINE void btSwap(T& a, T& b)
{
	T previousA = a;
	a = b;
	b = previousA;
}
|
||||
|
||||
|
||||
//PCK: endian swapping functions
|
||||
SIMD_FORCE_INLINE unsigned btSwapEndian(unsigned val)
|
||||
{
|
||||
return (((val & 0xff000000) >> 24) | ((val & 0x00ff0000) >> 8) | ((val & 0x0000ff00) << 8) | ((val & 0x000000ff) << 24));
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE unsigned short btSwapEndian(unsigned short val)
|
||||
{
|
||||
return static_cast<unsigned short>(((val & 0xff00) >> 8) | ((val & 0x00ff) << 8));
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE unsigned btSwapEndian(int val)
|
||||
{
|
||||
return btSwapEndian((unsigned)val);
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE unsigned short btSwapEndian(short val)
|
||||
{
|
||||
return btSwapEndian((unsigned short) val);
|
||||
}
|
||||
|
||||
///btSwapFloat uses using char pointers to swap the endianness
|
||||
////btSwapFloat/btSwapDouble will NOT return a float, because the machine might 'correct' invalid floating point values
|
||||
///Not all values of sign/exponent/mantissa are valid floating point numbers according to IEEE 754.
|
||||
///When a floating point unit is faced with an invalid value, it may actually change the value, or worse, throw an exception.
|
||||
///In most systems, running user mode code, you wouldn't get an exception, but instead the hardware/os/runtime will 'fix' the number for you.
|
||||
///so instead of returning a float/double, we return integer/long long integer
|
||||
SIMD_FORCE_INLINE unsigned int btSwapEndianFloat(float d)
|
||||
{
|
||||
unsigned int a = 0;
|
||||
unsigned char *dst = (unsigned char *)&a;
|
||||
unsigned char *src = (unsigned char *)&d;
|
||||
|
||||
dst[0] = src[3];
|
||||
dst[1] = src[2];
|
||||
dst[2] = src[1];
|
||||
dst[3] = src[0];
|
||||
return a;
|
||||
}
|
||||
|
||||
// unswap using char pointers
|
||||
SIMD_FORCE_INLINE float btUnswapEndianFloat(unsigned int a)
|
||||
{
|
||||
float d = 0.0f;
|
||||
unsigned char *src = (unsigned char *)&a;
|
||||
unsigned char *dst = (unsigned char *)&d;
|
||||
|
||||
dst[0] = src[3];
|
||||
dst[1] = src[2];
|
||||
dst[2] = src[1];
|
||||
dst[3] = src[0];
|
||||
|
||||
return d;
|
||||
}
|
||||
|
||||
|
||||
// swap using char pointers
|
||||
SIMD_FORCE_INLINE void btSwapEndianDouble(double d, unsigned char* dst)
|
||||
{
|
||||
unsigned char *src = (unsigned char *)&d;
|
||||
|
||||
dst[0] = src[7];
|
||||
dst[1] = src[6];
|
||||
dst[2] = src[5];
|
||||
dst[3] = src[4];
|
||||
dst[4] = src[3];
|
||||
dst[5] = src[2];
|
||||
dst[6] = src[1];
|
||||
dst[7] = src[0];
|
||||
|
||||
}
|
||||
|
||||
// unswap using char pointers
|
||||
SIMD_FORCE_INLINE double btUnswapEndianDouble(const unsigned char *src)
|
||||
{
|
||||
double d = 0.0;
|
||||
unsigned char *dst = (unsigned char *)&d;
|
||||
|
||||
dst[0] = src[7];
|
||||
dst[1] = src[6];
|
||||
dst[2] = src[5];
|
||||
dst[3] = src[4];
|
||||
dst[4] = src[3];
|
||||
dst[5] = src[2];
|
||||
dst[6] = src[1];
|
||||
dst[7] = src[0];
|
||||
|
||||
return d;
|
||||
}
|
||||
|
||||
// returns normalized value in range [-SIMD_PI, SIMD_PI]
|
||||
SIMD_FORCE_INLINE btScalar btNormalizeAngle(btScalar angleInRadians)
|
||||
{
|
||||
angleInRadians = btFmod(angleInRadians, SIMD_2_PI);
|
||||
if(angleInRadians < -SIMD_PI)
|
||||
{
|
||||
return angleInRadians + SIMD_2_PI;
|
||||
}
|
||||
else if(angleInRadians > SIMD_PI)
|
||||
{
|
||||
return angleInRadians - SIMD_2_PI;
|
||||
}
|
||||
else
|
||||
{
|
||||
return angleInRadians;
|
||||
}
|
||||
}
|
||||
|
||||
///Minimal base type that carries an integer type tag supplied at construction.
struct btTypedObject
{
	/// Stores objectType as the tag.
	btTypedObject(int objectType)
		: m_objectType(objectType)
	{
	}

	/// The tag passed to the constructor.
	int m_objectType;

	/// Returns the stored tag.
	inline int getObjectType() const
	{
		const int tag = m_objectType;
		return tag;
	}
};
|
||||
|
||||
|
||||
|
||||
///Aligns a pointer upwards to the provided alignment: returns the first address
///at or after unalignedPtr that is a multiple of alignment.
///alignment must be a power of two, because the rounding is done with the bit mask ~(alignment - 1).
template <typename T>T* btAlignPointer(T* unalignedPtr, size_t alignment)
{
	const size_t bit_mask = ~(alignment - 1);

	// Convert explicitly instead of writing one union member and reading another,
	// which C++ does not define. The cast is rejected at compile time if size_t
	// cannot hold a pointer value.
	size_t address = reinterpret_cast<size_t>(unalignedPtr);
	address += alignment-1;
	address &= bit_mask;
	return reinterpret_cast<T*>(address);
}
|
||||
|
||||
#endif //BT_SCALAR_H
|
||||
26
opencl/parallel_primitives/host/premake4.lua
Normal file
26
opencl/parallel_primitives/host/premake4.lua
Normal file
@@ -0,0 +1,26 @@
|
||||
-- Declares the static library project "OpenCL_lib_parallel_primitives_host_<vendor>".
-- Nothing is declared when findOpenCL(vendor) returns a false value.
function createProject(vendor)
	hasCL = findOpenCL(vendor)
	if (not hasCL) then
		return
	end

	project ("OpenCL_lib_parallel_primitives_host_" .. vendor)

	initOpenCL(vendor)

	kind "StaticLib"
	targetdir "../../../lib"
	includedirs { "." }
	files { "**.cpp", "**.h" }
end

-- One project per vendor, in this order.
for _, vendor in ipairs({ "AMD", "Intel", "NVIDIA", "Apple" }) do
	createProject(vendor)
end
|
||||
106
opencl/parallel_primitives/kernels/BoundSearchKernels.cl
Normal file
106
opencl/parallel_primitives/kernels/BoundSearchKernels.cl
Normal file
@@ -0,0 +1,106 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
|
||||
typedef unsigned int u32;
|
||||
#define GET_GROUP_IDX get_group_id(0)
|
||||
#define GET_LOCAL_IDX get_local_id(0)
|
||||
#define GET_GLOBAL_IDX get_global_id(0)
|
||||
#define GET_GROUP_SIZE get_local_size(0)
|
||||
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
|
||||
|
||||
typedef struct
|
||||
{
|
||||
u32 m_key;
|
||||
u32 m_value;
|
||||
}SortData;
|
||||
|
||||
|
||||
|
||||
typedef struct
|
||||
{
|
||||
u32 m_nSrc;
|
||||
u32 m_nDst;
|
||||
u32 m_padding[2];
|
||||
} ConstBuffer;
|
||||
|
||||
|
||||
|
||||
// For every index i < nSrc whose key differs from the key of the preceding
// element, stores i in dst[key]. The element before src[0] is treated as
// having key (u32)(-1).
// Keys that are >= nDst are skipped instead of being written.
// NOTE(review): this assumes dst holds nDst entries, as the clamp to nDst-1 in the
// loop this code replaced suggests - confirm against the host code.
__attribute__((reqd_work_group_size(64,1,1)))
__kernel
void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst,
					unsigned int nSrc, unsigned int nDst)
{
	int gIdx = GET_GLOBAL_IDX;

	if( gIdx < nSrc )
	{
		u32 prevKey = (gIdx==0)? (u32)(-1): src[gIdx-1].m_key;
		u32 curKey = src[gIdx].m_key;

		if( prevKey != curKey && curKey < nDst )
		{
			dst[curKey] = gIdx;
		}
	}
}
|
||||
|
||||
|
||||
// For every index i in [1, nSrc] whose preceding element src[i-1] has a key that
// differs from the key of src[i], stores i in dst[key of src[i-1]]. The element
// after the last one is treated as having key nDst.
// Keys that are >= nDst are skipped instead of being written.
// NOTE(review): this assumes dst holds nDst entries - confirm against the host code.
__attribute__((reqd_work_group_size(64,1,1)))
__kernel
void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst,
					unsigned int nSrc, unsigned int nDst)
{
	int gIdx = GET_GLOBAL_IDX+1;

	if( gIdx < nSrc+1 )
	{
		u32 curKey = src[gIdx-1].m_key;
		u32 nextKey = (gIdx==nSrc)? nDst: src[gIdx].m_key;

		if( curKey != nextKey && curKey < nDst )
		{
			dst[curKey] = gIdx;
		}
	}
}
|
||||
|
||||
// Element-wise difference: C[i] = A[i] - B[i] for every work-item index i < nDst.
// nSrc is accepted but not used.
__attribute__((reqd_work_group_size(64,1,1)))
__kernel
void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C,
					unsigned int nSrc, unsigned int nDst)
{
	int gIdx = GET_GLOBAL_IDX;

	// work-items past the end of the output do nothing
	if( !(gIdx < nDst) )
	{
		return;
	}

	u32 minuend = A[gIdx];
	u32 subtrahend = B[gIdx];
	C[gIdx] = minuend - subtrahend;
}
|
||||
|
||||
110
opencl/parallel_primitives/kernels/BoundSearchKernelsCL.h
Normal file
110
opencl/parallel_primitives/kernels/BoundSearchKernelsCL.h
Normal file
@@ -0,0 +1,110 @@
|
||||
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
|
||||
static const char* boundSearchKernelsCL= \
|
||||
"/*\n"
|
||||
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
|
||||
"\n"
|
||||
"This software is provided 'as-is', without any express or implied warranty.\n"
|
||||
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
|
||||
"Permission is granted to anyone to use this software for any purpose, \n"
|
||||
"including commercial applications, and to alter it and redistribute it freely, \n"
|
||||
"subject to the following restrictions:\n"
|
||||
"\n"
|
||||
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
|
||||
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
|
||||
"3. This notice may not be removed or altered from any source distribution.\n"
|
||||
"*/\n"
|
||||
"//Originally written by Takahiro Harada\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"typedef unsigned int u32;\n"
|
||||
"#define GET_GROUP_IDX get_group_id(0)\n"
|
||||
"#define GET_LOCAL_IDX get_local_id(0)\n"
|
||||
"#define GET_GLOBAL_IDX get_global_id(0)\n"
|
||||
"#define GET_GROUP_SIZE get_local_size(0)\n"
|
||||
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
|
||||
"\n"
|
||||
"typedef struct\n"
|
||||
"{\n"
|
||||
" u32 m_key; \n"
|
||||
" u32 m_value;\n"
|
||||
"}SortData;\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"typedef struct\n"
|
||||
"{\n"
|
||||
" u32 m_nSrc;\n"
|
||||
" u32 m_nDst;\n"
|
||||
" u32 m_padding[2];\n"
|
||||
"} ConstBuffer;\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"__attribute__((reqd_work_group_size(64,1,1)))\n"
|
||||
"__kernel\n"
|
||||
"void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, \n"
|
||||
" unsigned int nSrc, unsigned int nDst)\n"
|
||||
"{\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
"\n"
|
||||
" if( gIdx < nSrc )\n"
|
||||
" {\n"
|
||||
" SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1);\n"
|
||||
" SortData end; end.m_key = nDst; end.m_value = nDst;\n"
|
||||
"\n"
|
||||
" SortData iData = (gIdx==0)? first: src[gIdx-1];\n"
|
||||
" SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
|
||||
"\n"
|
||||
" if( iData.m_key != jData.m_key )\n"
|
||||
" {\n"
|
||||
"// for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)\n"
|
||||
" u32 k = jData.m_key;\n"
|
||||
" {\n"
|
||||
" dst[k] = gIdx;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"__attribute__((reqd_work_group_size(64,1,1)))\n"
|
||||
"__kernel\n"
|
||||
"void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, \n"
|
||||
" unsigned int nSrc, unsigned int nDst)\n"
|
||||
"{\n"
|
||||
" int gIdx = GET_GLOBAL_IDX+1;\n"
|
||||
"\n"
|
||||
" if( gIdx < nSrc+1 )\n"
|
||||
" {\n"
|
||||
" SortData first; first.m_key = 0; first.m_value = 0;\n"
|
||||
" SortData end; end.m_key = nDst; end.m_value = nDst;\n"
|
||||
"\n"
|
||||
" SortData iData = src[gIdx-1];\n"
|
||||
" SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
|
||||
"\n"
|
||||
" if( iData.m_key != jData.m_key )\n"
|
||||
" {\n"
|
||||
" u32 k = iData.m_key;\n"
|
||||
" {\n"
|
||||
" dst[k] = gIdx;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"__attribute__((reqd_work_group_size(64,1,1)))\n"
|
||||
"__kernel\n"
|
||||
"void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, \n"
|
||||
" unsigned int nSrc, unsigned int nDst)\n"
|
||||
"{\n"
|
||||
" int gIdx = GET_GLOBAL_IDX;\n"
|
||||
" \n"
|
||||
"\n"
|
||||
" if( gIdx < nDst )\n"
|
||||
" {\n"
|
||||
" C[gIdx] = A[gIdx] - B[gIdx];\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"\n"
|
||||
;
|
||||
128
opencl/parallel_primitives/kernels/CopyKernels.cl
Normal file
128
opencl/parallel_primitives/kernels/CopyKernels.cl
Normal file
@@ -0,0 +1,128 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
// Extensions requested for this program: vendor printf and 32-bit local atomics.
#pragma OPENCL EXTENSION cl_amd_printf : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
|
||||
|
||||
typedef unsigned int u32;
|
||||
// Short aliases for the dimension-0 work-item queries.
#define GET_GROUP_IDX get_group_id(0)
|
||||
#define GET_LOCAL_IDX get_local_id(0)
|
||||
#define GET_GLOBAL_IDX get_global_id(0)
|
||||
#define GET_GROUP_SIZE get_local_size(0)
|
||||
// Work-group barrier / fence on local memory.
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
|
||||
#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
|
||||
// Atomic increment helpers; AtomInc1 also stores the value returned by atom_inc in 'out'.
// None of the copy kernels in this file use them.
#define AtomInc(x) atom_inc(&(x))
|
||||
#define AtomInc1(x, out) out = atom_inc(&(x))
|
||||
|
||||
// Vector "constructors": make_int2(a, b) expands to the OpenCL literal (int2)(a, b).
#define make_uint4 (uint4)
|
||||
#define make_uint2 (uint2)
|
||||
#define make_int2 (int2)
|
||||
|
||||
// By-value kernel argument shared by all copy kernels below.
typedef struct
|
||||
{
|
||||
// Upper bound the kernels compare their element index against.
int m_n;
|
||||
// Never read by the kernels in this file; presumably pads the struct to four ints - confirm against the host side.
int m_padding[3];
|
||||
} ConstBuffer;
|
||||
|
||||
|
||||
|
||||
// Copies float4 elements from src to dst, one element per work-item.
// Work-items whose global index is not below cb.m_n do nothing.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void Copy1F4Kernel(__global float4* dst, __global float4* src,
					ConstBuffer cb)
{
	const int elem = GET_GLOBAL_IDX;

	if( elem >= cb.m_n )
		return;

	dst[ elem ] = src[ elem ];
}
|
||||
|
||||
// Copies float4 elements from src to dst, two consecutive elements per work-item.
// cb.m_n is treated as the element count, consistent with Copy1F4Kernel.
//
// Each element is bounds-checked on its own. The previous guard
// (2*gIdx <= cb.m_n) admitted a work-item whose first index equals m_n and
// never checked the second index, so elements m_n and m_n+1 could be read
// from src and written to dst.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void Copy2F4Kernel(__global float4* dst, __global float4* src,
					ConstBuffer cb)
{
	int gIdx = GET_GLOBAL_IDX;

	int idx0 = gIdx*2+0;
	int idx1 = gIdx*2+1;

	if( idx0 < cb.m_n )
	{
		dst[ idx0 ] = src[ idx0 ];
	}
	// With an odd m_n, the last active work-item copies only its first element.
	if( idx1 < cb.m_n )
	{
		dst[ idx1 ] = src[ idx1 ];
	}
}
|
||||
|
||||
// Copies float4 elements from src to dst, four consecutive elements per work-item.
// cb.m_n is treated as the element count, consistent with Copy1F4Kernel.
//
// Each element is bounds-checked on its own. The previous guard
// (4*gIdx <= cb.m_n) only looked at the first of the four indices, and with
// '<=', so up to four elements past the end could be read and written.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void Copy4F4Kernel(__global float4* dst, __global float4* src,
					ConstBuffer cb)
{
	int gIdx = GET_GLOBAL_IDX;
	int base = gIdx*4;

	for( int k=0; k<4; k++ )
	{
		int idx = base + k;
		// Covers the partial tail when m_n is not a multiple of 4.
		if( idx < cb.m_n )
		{
			dst[ idx ] = src[ idx ];
		}
	}
}
|
||||
|
||||
// Copies float elements from srcF1 to dstF1, one element per work-item.
// Work-items whose global index is not below cb.m_n do nothing.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void CopyF1Kernel(__global float* dstF1, __global float* srcF1,
					ConstBuffer cb)
{
	const int elem = GET_GLOBAL_IDX;

	if( elem >= cb.m_n )
		return;

	dstF1[ elem ] = srcF1[ elem ];
}
|
||||
|
||||
// Copies float2 elements from srcF2 to dstF2, one element per work-item.
// Work-items whose global index is not below cb.m_n do nothing.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2,
					ConstBuffer cb)
{
	const int elem = GET_GLOBAL_IDX;

	if( elem >= cb.m_n )
		return;

	dstF2[ elem ] = srcF2[ elem ];
}
|
||||
|
||||
132
opencl/parallel_primitives/kernels/CopyKernelsCL.h
Normal file
132
opencl/parallel_primitives/kernels/CopyKernelsCL.h
Normal file
@@ -0,0 +1,132 @@
|
||||
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
|
||||
/*
 * OpenCL program source for the copy kernels (Copy1F4Kernel, Copy2F4Kernel,
 * Copy4F4Kernel, CopyF1Kernel, CopyF2Kernel), embedded as one C string.
 *
 * Copy2F4Kernel and Copy4F4Kernel bounds-check every element they touch.
 * The earlier guards (2*gIdx and 4*gIdx compared with '<=' against m_n)
 * allowed reads and writes past element m_n-1.
 * Keep CopyKernels.cl in step with this text when regenerating.
 */
static const char* copyKernelsCL=
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"\n"
"#define make_uint4 (uint4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"\n"
"typedef struct\n"
"{\n"
"	int m_n;\n"
"	int m_padding[3];\n"
"} ConstBuffer;\n"
"\n"
"\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy1F4Kernel(__global float4* dst, __global float4* src, \n"
"					ConstBuffer cb)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"\n"
"	if( gIdx < cb.m_n )\n"
"	{\n"
"		float4 a0 = src[gIdx];\n"
"\n"
"		dst[ gIdx ] = a0;\n"
"	}\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy2F4Kernel(__global float4* dst, __global float4* src, \n"
"					ConstBuffer cb)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"\n"
"	int idx0 = gIdx*2+0;\n"
"	int idx1 = gIdx*2+1;\n"
"\n"
"	if( idx0 < cb.m_n )\n"
"	{\n"
"		dst[ idx0 ] = src[ idx0 ];\n"
"	}\n"
"	if( idx1 < cb.m_n )\n"
"	{\n"
"		dst[ idx1 ] = src[ idx1 ];\n"
"	}\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy4F4Kernel(__global float4* dst, __global float4* src, \n"
"					ConstBuffer cb)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	int base = gIdx*4;\n"
"\n"
"	for( int k=0; k<4; k++ )\n"
"	{\n"
"		int idx = base + k;\n"
"		if( idx < cb.m_n )\n"
"		{\n"
"			dst[ idx ] = src[ idx ];\n"
"		}\n"
"	}\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void CopyF1Kernel(__global float* dstF1, __global float* srcF1, \n"
"					ConstBuffer cb)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"\n"
"	if( gIdx < cb.m_n )\n"
"	{\n"
"		float a0 = srcF1[gIdx];\n"
"\n"
"		dstF1[ gIdx ] = a0;\n"
"	}\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, \n"
"					ConstBuffer cb)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"\n"
"	if( gIdx < cb.m_n )\n"
"	{\n"
"		float2 a0 = srcF2[gIdx];\n"
"\n"
"		dstF2[ gIdx ] = a0;\n"
"	}\n"
"}\n"
"\n"
"\n"
;
|
||||
107
opencl/parallel_primitives/kernels/FillKernels.cl
Normal file
107
opencl/parallel_primitives/kernels/FillKernels.cl
Normal file
@@ -0,0 +1,107 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
|
||||
// Extensions requested for this program: vendor printf and 32-bit local atomics.
#pragma OPENCL EXTENSION cl_amd_printf : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
|
||||
|
||||
typedef unsigned int u32;
|
||||
// Short aliases for the dimension-0 work-item queries.
#define GET_GROUP_IDX get_group_id(0)
|
||||
#define GET_LOCAL_IDX get_local_id(0)
|
||||
#define GET_GLOBAL_IDX get_global_id(0)
|
||||
#define GET_GROUP_SIZE get_local_size(0)
|
||||
// Work-group barrier / fence on local memory.
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
|
||||
#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
|
||||
// Atomic increment helpers; AtomInc1 also stores the value returned by atom_inc in 'out'.
// None of the fill kernels in this file use them.
#define AtomInc(x) atom_inc(&(x))
|
||||
#define AtomInc1(x, out) out = atom_inc(&(x))
|
||||
|
||||
// Vector "constructors": make_int2(a, b) expands to the OpenCL literal (int2)(a, b).
#define make_uint4 (uint4)
|
||||
#define make_uint2 (uint2)
|
||||
#define make_int2 (int2)
|
||||
|
||||
// Fill-parameter record. The kernels in this file take value/count/offset as
// separate arguments and do not reference this type.
typedef struct
|
||||
{
|
||||
// The fill value, viewable as signed, unsigned or float data.
union
|
||||
{
|
||||
int4 m_data;
|
||||
uint4 m_unsignedData;
|
||||
float m_floatData;
|
||||
};
|
||||
int m_offset;
|
||||
int m_n;
|
||||
int m_padding[2];
|
||||
} ConstBuffer;
|
||||
|
||||
|
||||
// Writes 'value' to dstInt[offset .. offset+num_elements-1], one element per work-item.
// Work-items whose global index is not below num_elements do nothing.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void FillIntKernel(__global int* dstInt, int num_elements, int value, const int offset)
{
	const int item = GET_GLOBAL_IDX;

	if( item >= num_elements )
		return;

	dstInt[ offset+item ] = value;
}
|
||||
|
||||
// Writes 'value' to dstFloat[offset .. offset+num_elements-1], one element per work-item.
// Work-items whose global index is not below num_elements do nothing.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void FillFloatKernel(__global float* dstFloat, int num_elements, float value, const int offset)
{
	const int item = GET_GLOBAL_IDX;

	if( item >= num_elements )
		return;

	dstFloat[ offset+item ] = value;
}
|
||||
|
||||
// Writes 'value' to dstInt[offset .. offset+num-1], one element per work-item.
// Work-items whose global index is not below num do nothing.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset)
{
	const int item = GET_GLOBAL_IDX;

	if( item >= num )
		return;

	dstInt[ offset+item ] = value;
}
|
||||
|
||||
// Writes 'value' to dstInt2[offset .. offset+num-1], one int2 per work-item.
// Work-items whose global index is not below num do nothing.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void FillInt2Kernel(__global int2* dstInt2, const int num, const int2 value, const int offset)
{
	const int item = GET_GLOBAL_IDX;

	if( item >= num )
		return;

	// Rebuilt component-wise, as before; the stored vector equals 'value'.
	const int2 filled = (int2)( value.x, value.y );
	dstInt2[ item + offset ] = filled;
}
|
||||
|
||||
// Writes 'value' to dstInt4[offset .. offset+num-1], one int4 per work-item.
// Work-items whose global index is not below num do nothing.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void FillInt4Kernel(__global int4* dstInt4, const int num, const int4 value, const int offset)
{
	const int item = GET_GLOBAL_IDX;

	if( item >= num )
		return;

	dstInt4[ offset+item ] = value;
}
|
||||
|
||||
111
opencl/parallel_primitives/kernels/FillKernelsCL.h
Normal file
111
opencl/parallel_primitives/kernels/FillKernelsCL.h
Normal file
@@ -0,0 +1,111 @@
|
||||
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
|
||||
/*
 * OpenCL program source for the fill kernels (FillIntKernel, FillFloatKernel,
 * FillUnsignedIntKernel, FillInt2Kernel, FillInt4Kernel), embedded as one
 * C string built from adjacent literals, one per source line.
 */
static const char* fillKernelsCL=
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"\n"
"\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"\n"
"#define make_uint4 (uint4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"\n"
"typedef struct\n"
"{\n"
"	union\n"
"	{\n"
"		int4 m_data;\n"
"		uint4 m_unsignedData;\n"
"		float	m_floatData;\n"
"	};\n"
"	int m_offset;\n"
"	int m_n;\n"
"	int m_padding[2];\n"
"} ConstBuffer;\n"
"\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillIntKernel(__global int* dstInt, 			int num_elements, int value, const int offset)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"\n"
"	if( gIdx < num_elements )\n"
"	{\n"
"		dstInt[ offset+gIdx ] = value;\n"
"	}\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillFloatKernel(__global float* dstFloat, 	int num_elements, float value, const int offset)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"\n"
"	if( gIdx < num_elements )\n"
"	{\n"
"		dstFloat[ offset+gIdx ] = value;\n"
"	}\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"\n"
"	if( gIdx < num )\n"
"	{\n"
"		dstInt[ offset+gIdx ] = value;\n"
"	}\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillInt2Kernel(__global int2* dstInt2, 	const int num, const int2 value, const int offset)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"\n"
"	if( gIdx < num )\n"
"	{\n"
"		dstInt2[ gIdx + offset] = make_int2( value.x, value.y );\n"
"	}\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillInt4Kernel(__global int4* dstInt4, 		const int num, const int4 value, const int offset)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"\n"
"	if( gIdx < num )\n"
"	{\n"
"		dstInt4[ offset+gIdx ] = value;\n"
"	}\n"
"}\n"
"\n"
"\n"
;
|
||||
154
opencl/parallel_primitives/kernels/PrefixScanKernels.cl
Normal file
154
opencl/parallel_primitives/kernels/PrefixScanKernels.cl
Normal file
@@ -0,0 +1,154 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
|
||||
typedef unsigned int u32;
|
||||
// Short aliases for the dimension-0 work-item queries.
#define GET_GROUP_IDX get_group_id(0)
|
||||
#define GET_LOCAL_IDX get_local_id(0)
|
||||
#define GET_GLOBAL_IDX get_global_id(0)
|
||||
#define GET_GROUP_SIZE get_local_size(0)
|
||||
// Work-group barrier on local memory.
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
|
||||
|
||||
// takahiro end
|
||||
// Work-group size required by every kernel in this file.
#define WG_SIZE 128
|
||||
// The kernels receive their parameters as a uint4 'cb'. These macros make
// cb.m_numElems / cb.m_numBlocks / cb.m_numScanBlocks read the x / y / z
// components, mirroring the field order of the struct commented out below.
#define m_numElems x
|
||||
#define m_numBlocks y
|
||||
#define m_numScanBlocks z
|
||||
|
||||
/*typedef struct
|
||||
{
|
||||
uint m_numElems;
|
||||
uint m_numBlocks;
|
||||
uint m_numScanBlocks;
|
||||
uint m_padding[1];
|
||||
} ConstBuffer;
|
||||
*/
|
||||
|
||||
// In-place exclusive prefix sum over data[0..n-1] in local memory, done
// cooperatively by the whole work-group (up-sweep, clear last slot, down-sweep).
//
//   data   local-memory array holding the n values; overwritten with the scan.
//   n      number of elements; the index arithmetic presumably requires a
//          power of two - confirm against the host code.
//   lIdx   local id of the calling work-item.
//   lSize  stride used to spread elements over work-items (callers pass the
//          work-group size).
//
// Returns the sum of all n inputs in the work-item with lIdx == 0, and 0 in
// every other work-item. (Previously the other work-items returned an
// uninitialized value.)
//
// Contains barriers: every work-item of the group must call it with the same n.
u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize)
{
	u32 blocksum = 0;
	int offset = 1;

	// Up-sweep: build partial sums, halving the number of active pairs each pass.
	for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)
	{
		GROUP_LDS_BARRIER;
		for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)
		{
			int ai = offset*(2*iIdx+1)-1;
			int bi = offset*(2*iIdx+2)-1;
			data[bi] += data[ai];
		}
	}

	GROUP_LDS_BARRIER;

	// The last slot now holds the total; save it and seed the down-sweep with 0.
	if( lIdx == 0 )
	{
		blocksum = data[ n-1 ];
		data[ n-1 ] = 0;
	}

	GROUP_LDS_BARRIER;

	// Down-sweep: push the partial sums back down, doubling the active pairs each pass.
	offset >>= 1;
	for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )
	{
		GROUP_LDS_BARRIER;
		for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )
		{
			int ai = offset*(2*iIdx+1)-1;
			int bi = offset*(2*iIdx+2)-1;
			u32 temp = data[ai];
			data[ai] = data[bi];
			data[bi] += temp;
		}
	}
	GROUP_LDS_BARRIER;

	return blocksum;
}
|
||||
|
||||
// Per-work-group exclusive scan. Each work-item loads two consecutive elements
// of src (zero where the index is not below cb.m_numElems), the group scans the
// 2*WG_SIZE values in local memory, and the results are written to dst.
// The group's total is stored in sumBuffer[group id].
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
__kernel
void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer,
						uint4 cb)
{
	__local u32 ldsData[WG_SIZE*2];

	int gIdx = GET_GLOBAL_IDX;
	int lIdx = GET_LOCAL_IDX;

	// Global and local slots owned by this work-item.
	const int gEven = 2*gIdx;
	const int gOdd = 2*gIdx + 1;
	const int lEven = 2*lIdx;
	const int lOdd = 2*lIdx + 1;

	u32 first = 0;
	u32 second = 0;
	if( gEven < cb.m_numElems )
	{
		first = src[gEven];
	}
	if( gOdd < cb.m_numElems )
	{
		second = src[gOdd];
	}
	ldsData[lEven] = first;
	ldsData[lOdd] = second;

	u32 sum = ScanExclusive(ldsData, WG_SIZE*2, lIdx, GET_GROUP_SIZE);

	// Only local work-item 0 publishes the group total.
	if( lIdx == 0 )
	{
		sumBuffer[GET_GROUP_IDX] = sum;
	}

	if( gEven < cb.m_numElems )
	{
		dst[gEven] = ldsData[lEven];
	}
	if( gOdd < cb.m_numElems )
	{
		dst[gOdd] = ldsData[lOdd];
	}
}
|
||||
|
||||
// Adds blockSum[b] to every element of block b in dst, where a block is
// 2*WG_SIZE consecutive elements and work-group g handles block g+1
// (block 0 is never touched by this kernel). The last block is clipped to
// cb.m_numElems.
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
__kernel
void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, uint4 cb)
{
	const u32 elemsPerBlock = WG_SIZE*2;

	int block = GET_GROUP_IDX+1;
	int lane = GET_LOCAL_IDX;

	u32 carry = blockSum[block];

	int stop = min((block+1)*(elemsPerBlock), cb.m_numElems);
	int i = block*elemsPerBlock+lane;
	while( i < stop )
	{
		dst[i] += carry;
		i += GET_GROUP_SIZE;
	}
}
|
||||
|
||||
// Exclusive scan over the first cb.m_numBlocks entries of dst.
// The values are staged in local memory, zero-padded up to cb.m_numScanBlocks
// entries, scanned there and written back; the total is stored in
// dst[cb.m_numBlocks] by the work-item with global id 0.
// The staging array holds 2048 entries; nothing here checks m_numScanBlocks against it.
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
__kernel
void TopLevelScanKernel(__global u32* dst, uint4 cb)
{
	__local u32 ldsData[2048];
	int gIdx = GET_GLOBAL_IDX;
	int lIdx = GET_LOCAL_IDX;
	int lSize = GET_GROUP_SIZE;

	// Stage the inputs, padding with zeros past m_numBlocks.
	int i = lIdx;
	while( i < cb.m_numScanBlocks )
	{
		u32 v = 0;
		if( i < cb.m_numBlocks )
		{
			v = dst[i];
		}
		ldsData[i] = v;
		i += lSize;
	}

	GROUP_LDS_BARRIER;

	u32 sum = ScanExclusive(ldsData, cb.m_numScanBlocks, lIdx, lSize);

	// Write the scanned values back.
	int j = lIdx;
	while( j < cb.m_numBlocks )
	{
		dst[j] = ldsData[j];
		j += lSize;
	}

	if( gIdx == 0 )
	{
		dst[cb.m_numBlocks] = sum;
	}
}
|
||||
158
opencl/parallel_primitives/kernels/PrefixScanKernelsCL.h
Normal file
158
opencl/parallel_primitives/kernels/PrefixScanKernelsCL.h
Normal file
@@ -0,0 +1,158 @@
|
||||
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
|
||||
/*
 * OpenCL program source for the prefix-scan kernels (ScanExclusive helper,
 * LocalScanKernel, AddOffsetKernel, TopLevelScanKernel), embedded as one
 * C string.
 *
 * ScanExclusive initializes 'blocksum' to 0, so work-items other than local
 * id 0 return 0 instead of an uninitialized value.
 * Keep PrefixScanKernels.cl in step with this text when regenerating.
 */
static const char* prefixScanKernelsCL=
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"\n"
"\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"\n"
"// takahiro end\n"
"#define WG_SIZE 128 \n"
"#define m_numElems x\n"
"#define m_numBlocks y\n"
"#define m_numScanBlocks z\n"
"\n"
"/*typedef struct\n"
"{\n"
"	uint m_numElems;\n"
"	uint m_numBlocks;\n"
"	uint m_numScanBlocks;\n"
"	uint m_padding[1];\n"
"} ConstBuffer;\n"
"*/\n"
"\n"
"u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize)\n"
"{\n"
"	u32 blocksum = 0;\n"
"	int offset = 1;\n"
"	for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
"	{\n"
"		GROUP_LDS_BARRIER;\n"
"		for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
"		{\n"
"			int ai = offset*(2*iIdx+1)-1;\n"
"			int bi = offset*(2*iIdx+2)-1;\n"
"			data[bi] += data[ai];\n"
"		}\n"
"	}\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"	if( lIdx == 0 )\n"
"	{\n"
"		blocksum = data[ n-1 ];\n"
"		data[ n-1 ] = 0;\n"
"	}\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"	offset >>= 1;\n"
"	for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
"	{\n"
"		GROUP_LDS_BARRIER;\n"
"		for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
"		{\n"
"			int ai = offset*(2*iIdx+1)-1;\n"
"			int bi = offset*(2*iIdx+2)-1;\n"
"			u32 temp = data[ai];\n"
"			data[ai] = data[bi];\n"
"			data[bi] += temp;\n"
"		}\n"
"	}\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"	return blocksum;\n"
"}\n"
"\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer,\n"
"						uint4 cb)\n"
"{\n"
"	__local u32 ldsData[WG_SIZE*2];\n"
"\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	int lIdx = GET_LOCAL_IDX;\n"
"\n"
"	ldsData[2*lIdx]     = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n"
"	ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n"
"\n"
"	u32 sum = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
"\n"
"	if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;\n"
"\n"
"	if( (2*gIdx) < cb.m_numElems )\n"
"	{\n"
"		dst[2*gIdx]     = ldsData[2*lIdx];\n"
"	}\n"
"	if( (2*gIdx + 1) < cb.m_numElems )\n"
"	{\n"
"		dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
"	}\n"
"}\n"
"\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, uint4 cb)\n"
"{\n"
"	const u32 blockSize = WG_SIZE*2;\n"
"\n"
"	int myIdx = GET_GROUP_IDX+1;\n"
"	int lIdx = GET_LOCAL_IDX;\n"
"\n"
"	u32 iBlockSum = blockSum[myIdx];\n"
"\n"
"	int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n"
"	for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
"	{\n"
"		dst[i] += iBlockSum;\n"
"	}\n"
"}\n"
"\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void TopLevelScanKernel(__global u32* dst, uint4 cb)\n"
"{\n"
"	__local u32 ldsData[2048];\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	int lIdx = GET_LOCAL_IDX;\n"
"	int lSize = GET_GROUP_SIZE;\n"
"\n"
"	for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n"
"	{\n"
"		ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n"
"	}\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"	u32 sum = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
"\n"
"	for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n"
"	{\n"
"		dst[i] = ldsData[i];\n"
"	}\n"
"\n"
"	if( gIdx == 0 )\n"
"	{\n"
"		dst[cb.m_numBlocks] = sum;\n"
"	}\n"
"}\n"
"\n"
;
|
||||
1071
opencl/parallel_primitives/kernels/RadixSort32Kernels.cl
Normal file
1071
opencl/parallel_primitives/kernels/RadixSort32Kernels.cl
Normal file
File diff suppressed because it is too large
Load Diff
1074
opencl/parallel_primitives/kernels/RadixSort32KernelsCL.h
Normal file
1074
opencl/parallel_primitives/kernels/RadixSort32KernelsCL.h
Normal file
File diff suppressed because it is too large
Load Diff
379
opencl/parallel_primitives/test/main.cpp
Normal file
379
opencl/parallel_primitives/test/main.cpp
Normal file
@@ -0,0 +1,379 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include "../basic_initialize/btOpenCLUtils.h"
|
||||
#include "../host/btFillCL.h"
|
||||
#include "../host/btBoundSearchCL.h"
|
||||
#include "../host/btRadixSort32CL.h"
|
||||
#include "../host/btPrefixScanCL.h"
|
||||
#include "../host/CommandLineArgs.h"
|
||||
|
||||
#include "../host/btMinMax.h"
|
||||
int g_nPassed = 0;
|
||||
int g_nFailed = 0;
|
||||
bool g_testFailed = 0;
|
||||
|
||||
#define TEST_INIT g_testFailed = 0;
|
||||
#define TEST_ASSERT(x) if( !(x) ){g_testFailed = 1;}
|
||||
#define TEST_REPORT(testName) printf("[%s] %s\n",(g_testFailed)?"X":"O", testName); if(g_testFailed) g_nFailed++; else g_nPassed++;
|
||||
#define NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment))
|
||||
|
||||
cl_context g_context=0;
|
||||
cl_device_id g_device=0;
|
||||
cl_command_queue g_queue =0;
|
||||
const char* g_deviceName = 0;
|
||||
|
||||
void initCL(int preferredDeviceIndex, int preferredPlatformIndex)
|
||||
{
|
||||
void* glCtx=0;
|
||||
void* glDC = 0;
|
||||
int ciErrNum = 0;
|
||||
//bound search and radix sort only work on GPU right now (assume 32 or 64 width workgroup without barriers)
|
||||
|
||||
cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
|
||||
|
||||
g_context = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
int numDev = btOpenCLUtils::getNumDevices(g_context);
|
||||
if (numDev>0)
|
||||
{
|
||||
btOpenCLDeviceInfo info;
|
||||
g_device= btOpenCLUtils::getDevice(g_context,0);
|
||||
g_queue = clCreateCommandQueue(g_context, g_device, 0, &ciErrNum);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
btOpenCLUtils::printDeviceInfo(g_device);
|
||||
btOpenCLUtils::getDeviceInfo(g_device,&info);
|
||||
g_deviceName = info.m_deviceName;
|
||||
}
|
||||
}
|
||||
|
||||
void exitCL()
|
||||
{
|
||||
clReleaseCommandQueue(g_queue);
|
||||
clReleaseContext(g_context);
|
||||
}
|
||||
|
||||
|
||||
inline void fillIntTest()
|
||||
{
|
||||
TEST_INIT;
|
||||
|
||||
btFillCL* fillCL = new btFillCL(g_context,g_device,g_queue);
|
||||
int maxSize=1024*256;
|
||||
btOpenCLArray<int> intBuffer(g_context,g_queue,maxSize);
|
||||
intBuffer.resize(maxSize);
|
||||
|
||||
#define NUM_TESTS 7
|
||||
|
||||
int dx = maxSize/NUM_TESTS;
|
||||
for (int iter=0;iter<NUM_TESTS;iter++)
|
||||
{
|
||||
int size = btMin( 11+dx*iter, maxSize );
|
||||
|
||||
int value = 2;
|
||||
|
||||
|
||||
int offset=0;
|
||||
fillCL->execute(intBuffer,value,size,offset);
|
||||
|
||||
btAlignedObjectArray<int> hostBuf2;
|
||||
hostBuf2.resize(size);
|
||||
fillCL->executeHost(hostBuf2,value,size,offset);
|
||||
|
||||
btAlignedObjectArray<int> hostBuf;
|
||||
intBuffer.copyToHost(hostBuf);
|
||||
|
||||
for(int i=0; i<size; i++)
|
||||
{
|
||||
TEST_ASSERT( hostBuf[i] == hostBuf2[i] );
|
||||
TEST_ASSERT( hostBuf[i] == hostBuf2[i] );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
delete fillCL;
|
||||
|
||||
TEST_REPORT( "fillIntTest" );
|
||||
}
|
||||
|
||||
|
||||
// Seeds the C library random number generator used by getRandom.
__inline
void seedRandom(int seed)
{
	const unsigned int seedValue = (unsigned int)seed;
	srand( seedValue );
}
|
||||
|
||||
/// Returns minV plus a pseudo-random fraction of (maxV - minV), converted to T.
/// The fraction comes from rand() and has a resolution of 1/10000.
template<typename T>
__inline
T getRandom(const T& minV, const T& maxV)
{
	// rand()%10000 is 0..9999, so the fraction lies in [0, 1).
	const float fraction = (rand()%10000)/10000.f;
	const T span = maxV - minV;
	return (T)(minV + fraction*span);
}
|
||||
|
||||
struct btSortDataCompare
|
||||
{
|
||||
inline bool operator()(const btSortData& first, const btSortData& second) const
|
||||
{
|
||||
return (first.m_key < second.m_key) || (first.m_key==second.m_key && first.m_value < second.m_value);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
void boundSearchTest( )
|
||||
{
|
||||
TEST_INIT;
|
||||
|
||||
int maxSize = 1024*256;
|
||||
int bucketSize = 256;
|
||||
|
||||
btOpenCLArray<btSortData> srcCL(g_context,g_queue,maxSize);
|
||||
btOpenCLArray<unsigned int> upperCL(g_context,g_queue,maxSize);
|
||||
btOpenCLArray<unsigned int> lowerCL(g_context,g_queue,maxSize);
|
||||
|
||||
btAlignedObjectArray<btSortData> srcHost;
|
||||
btAlignedObjectArray<unsigned int> upperHost;
|
||||
btAlignedObjectArray<unsigned int> lowerHost;
|
||||
btAlignedObjectArray<unsigned int> upperHostCompare;
|
||||
btAlignedObjectArray<unsigned int> lowerHostCompare;
|
||||
|
||||
btBoundSearchCL* search = new btBoundSearchCL(g_context,g_device,g_queue, maxSize);
|
||||
|
||||
|
||||
int dx = maxSize/NUM_TESTS;
|
||||
for(int iter=0; iter<NUM_TESTS; iter++)
|
||||
{
|
||||
|
||||
int size = btMin( 128+dx*iter, maxSize );
|
||||
|
||||
upperHost.resize(bucketSize);
|
||||
lowerHost.resize(bucketSize);
|
||||
upperHostCompare.resize(bucketSize);
|
||||
lowerHostCompare.resize(bucketSize);
|
||||
|
||||
srcHost.resize(size);
|
||||
|
||||
for(int i=0; i<size; i++)
|
||||
{
|
||||
btSortData v;
|
||||
// v.m_key = i<2? 0 : 5;
|
||||
v.m_key = getRandom(0,bucketSize);
|
||||
|
||||
v.m_value = i;
|
||||
srcHost.at(i) = v;
|
||||
}
|
||||
|
||||
srcHost.quickSort(btSortDataCompare());
|
||||
srcCL.copyFromHost(srcHost);
|
||||
|
||||
{
|
||||
|
||||
for(int i=0; i<bucketSize; i++)
|
||||
{
|
||||
lowerHost[i] = -1;
|
||||
lowerHostCompare[i] = -1;
|
||||
upperHost[i] = -1;
|
||||
upperHostCompare[i] = -1;
|
||||
}
|
||||
upperCL.copyFromHost(upperHost);
|
||||
lowerCL.copyFromHost(lowerHost);
|
||||
}
|
||||
|
||||
search->execute(srcCL,size,upperCL,bucketSize,btBoundSearchCL::BOUND_UPPER);
|
||||
search->execute(srcCL,size,lowerCL,bucketSize,btBoundSearchCL::BOUND_LOWER);
|
||||
|
||||
search->executeHost(srcHost,size,upperHostCompare,bucketSize,btBoundSearchCL::BOUND_UPPER);
|
||||
search->executeHost(srcHost,size,lowerHostCompare,bucketSize,btBoundSearchCL::BOUND_LOWER);
|
||||
|
||||
lowerCL.copyToHost(lowerHost);
|
||||
upperCL.copyToHost(upperHost);
|
||||
for(int i=0; i<bucketSize; i++)
|
||||
{
|
||||
TEST_ASSERT(upperHostCompare[i] == upperHost[i]);
|
||||
TEST_ASSERT(lowerHostCompare[i] == lowerHost[i]);
|
||||
}
|
||||
/*
|
||||
for(int i=1; i<bucketSize; i++)
|
||||
{
|
||||
int lhi_1 = lowerHost[i-1];
|
||||
int lhi = lowerHost[i];
|
||||
|
||||
for(int j=lhi_1; j<lhi; j++)
|
||||
//for(int j=lowerHost[i-1]; j<lowerHost[i]; j++)
|
||||
{
|
||||
TEST_ASSERT( srcHost[j].m_key < i );
|
||||
}
|
||||
}
|
||||
|
||||
for(int i=0; i<bucketSize; i++)
|
||||
{
|
||||
int jMin = (i==0)?0:upperHost[i-1];
|
||||
for(int j=jMin; j<upperHost[i]; j++)
|
||||
{
|
||||
TEST_ASSERT( srcHost[j].m_key <= i );
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
for(int i=0; i<bucketSize; i++)
|
||||
{
|
||||
int lhi = lowerHost[i];
|
||||
int uhi = upperHost[i];
|
||||
|
||||
for(int j=lhi; j<uhi; j++)
|
||||
{
|
||||
if ( srcHost[j].m_key != i )
|
||||
{
|
||||
printf("error %d != %d\n",srcHost[j].m_key,i);
|
||||
}
|
||||
TEST_ASSERT( srcHost[j].m_key == i );
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
delete search;
|
||||
|
||||
TEST_REPORT( "boundSearchTest" );
|
||||
}
|
||||
|
||||
|
||||
void prefixScanTest()
|
||||
{
|
||||
TEST_INIT;
|
||||
|
||||
int maxSize = 1024*256;
|
||||
|
||||
btAlignedObjectArray<unsigned int> buf0Host;
|
||||
btAlignedObjectArray<unsigned int> buf1Host;
|
||||
|
||||
btOpenCLArray<unsigned int> buf2CL(g_context,g_queue,maxSize);
|
||||
btOpenCLArray<unsigned int> buf3CL(g_context,g_queue,maxSize);
|
||||
|
||||
|
||||
btPrefixScanCL* scan = new btPrefixScanCL(g_context,g_device,g_queue,maxSize);
|
||||
|
||||
int dx = maxSize/NUM_TESTS;
|
||||
for(int iter=0; iter<NUM_TESTS; iter++)
|
||||
{
|
||||
int size = btMin( 128+dx*iter, maxSize );
|
||||
buf0Host.resize(size);
|
||||
buf1Host.resize(size);
|
||||
|
||||
for(int i=0; i<size; i++)
|
||||
buf0Host[i] = 1;
|
||||
|
||||
buf2CL.copyFromHost( buf0Host);
|
||||
|
||||
unsigned int sumHost, sumGPU;
|
||||
|
||||
scan->executeHost(buf0Host, buf1Host, size, &sumHost );
|
||||
scan->execute( buf2CL, buf3CL, size, &sumGPU );
|
||||
|
||||
buf3CL.copyToHost(buf0Host);
|
||||
|
||||
TEST_ASSERT( sumHost == sumGPU );
|
||||
for(int i=0; i<size; i++)
|
||||
TEST_ASSERT( buf1Host[i] == buf0Host[i] );
|
||||
}
|
||||
|
||||
delete scan;
|
||||
|
||||
TEST_REPORT( "scanTest" );
|
||||
}
|
||||
|
||||
|
||||
bool radixSortTest()
|
||||
{
|
||||
TEST_INIT;
|
||||
|
||||
int maxSize = 1024*256;
|
||||
|
||||
btAlignedObjectArray<btSortData> buf0Host;
|
||||
buf0Host.resize(maxSize);
|
||||
btAlignedObjectArray<btSortData> buf1Host;
|
||||
buf1Host.resize(maxSize );
|
||||
btOpenCLArray<btSortData> buf2CL(g_context,g_queue,maxSize);
|
||||
|
||||
btRadixSort32CL* sort = new btRadixSort32CL(g_context,g_device,g_queue,maxSize);
|
||||
|
||||
int dx = maxSize/NUM_TESTS;
|
||||
for(int iter=0; iter<NUM_TESTS; iter++)
|
||||
{
|
||||
int size = btMin( 128+dx*iter, maxSize-512 );
|
||||
size = NEXTMULTIPLEOF( size, 512 );//not necessary
|
||||
|
||||
buf0Host.resize(size);
|
||||
|
||||
for(int i=0; i<size; i++)
|
||||
{
|
||||
btSortData v;
|
||||
v.m_key = getRandom(0,0xff);
|
||||
v.m_value = i;
|
||||
buf0Host[i] = v;
|
||||
}
|
||||
|
||||
buf2CL.copyFromHost( buf0Host);
|
||||
|
||||
|
||||
sort->executeHost( buf0Host);
|
||||
sort->execute(buf2CL);
|
||||
|
||||
buf2CL.copyToHost(buf1Host);
|
||||
|
||||
for(int i=0; i<size; i++)
|
||||
{
|
||||
TEST_ASSERT( buf0Host[i].m_value == buf1Host[i].m_value && buf0Host[i].m_key == buf1Host[i].m_key );
|
||||
}
|
||||
}
|
||||
|
||||
delete sort;
|
||||
|
||||
TEST_REPORT( "radixSort" );
|
||||
|
||||
return g_testFailed;
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
int preferredDeviceIndex = -1;
|
||||
int preferredPlatformIndex = -1;
|
||||
|
||||
CommandLineArgs args(argc, argv);
|
||||
args.GetCmdLineArgument("deviceId", preferredDeviceIndex);
|
||||
args.GetCmdLineArgument("platformId", preferredPlatformIndex);
|
||||
|
||||
initCL(preferredDeviceIndex,preferredPlatformIndex);
|
||||
|
||||
fillIntTest();
|
||||
|
||||
boundSearchTest();
|
||||
|
||||
prefixScanTest();
|
||||
|
||||
radixSortTest();
|
||||
|
||||
exitCL();
|
||||
|
||||
printf("%d tests passed, %d tests failed\n",g_nPassed, g_nFailed);
|
||||
printf("End, press <enter>\n");
|
||||
getchar();
|
||||
}
|
||||
|
||||
41
opencl/parallel_primitives/test/premake4.lua
Normal file
41
opencl/parallel_primitives/test/premake4.lua
Normal file
@@ -0,0 +1,41 @@
|
||||
-- Defines the "OpenCL_primitives_test_<vendor>" console application when
-- findOpenCL reports an OpenCL SDK for the given vendor; does nothing otherwise.
function createProject(vendor)
	-- 'local' keeps the probe result out of the global environment.
	local hasCL = findOpenCL(vendor)

	if (hasCL) then

		project ("OpenCL_primitives_test_" .. vendor)

		initOpenCL(vendor)

		language "C++"

		kind "ConsoleApp"
		targetdir "../../../bin"
		includedirs {".",".."}

		files {
			"main.cpp",
			"../../basic_initialize/btOpenCLInclude.h",
			"../../basic_initialize/btOpenCLUtils.cpp",
			"../../basic_initialize/btOpenCLUtils.h",
			"../host/btFillCL.cpp",
			"../host/btFillCL.h",
			"../host/btBoundSearchCL.cpp",
			"../host/btBoundSearchCL.h",
			"../host/btPrefixScanCL.cpp",
			"../host/btPrefixScanCL.h",
			"../host/btRadixSort32CL.cpp",
			"../host/btRadixSort32CL.h",
			"../host/btAlignedAllocator.cpp",
			"../host/btAlignedAllocator.h",
			"../host/btAlignedObjectArray.h",
		}

	end
end
|
||||
|
||||
createProject("AMD")
|
||||
createProject("Intel")
|
||||
createProject("NVIDIA")
|
||||
createProject("Apple")
|
||||
116
opencl/reduce/main.cpp
Normal file
116
opencl/reduce/main.cpp
Normal file
@@ -0,0 +1,116 @@
|
||||
///original author: Erwin Coumans
|
||||
#include "btOpenCLUtils.h"
|
||||
#include "../parallel_primitives/host/btOpenCLArray.h"
|
||||
#include "../parallel_primitives/host/btLauncherCL.h"
|
||||
#include <stdio.h>
|
||||
|
||||
|
||||
#define MSTRINGIFY(A) #A

// OpenCL source of the ReduceGlobal kernel.
// Each work-group sums its slice of d_in in place (d_in is modified) by
// repeatedly folding the upper half of the group onto the lower half, then
// work item 0 writes the group total to d_out[group id].
// The halving loop only covers every element when the work-group size is a
// power of two. An element is only folded in when its index is below
// numElements, so a partially filled last group does not read past the input.
// No comments are placed inside MSTRINGIFY so the stringified text stays
// exactly the kernel code.
const char* kernelString= MSTRINGIFY(
__kernel void ReduceGlobal(__global int* d_in, __global int* d_out, int numElements)
{
	int myId = get_global_id(0);
	int tid = get_local_id(0);

	int ls = get_local_size(0);
	for (int s=ls/2;s>0;s>>=1)
	{
		if (tid<s && myId+s<numElements)
		{
			d_in[myId] += d_in[myId+s];
		}
		barrier(CLK_GLOBAL_MEM_FENCE);
	}
	if (tid==0 && myId<numElements)
	{
		d_out[get_group_id(0)]=d_in[myId];
	}
}
);
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
int ciErrNum = 0;
|
||||
int preferred_device = -1;
|
||||
int preferred_platform = -1;
|
||||
cl_platform_id platformId;
|
||||
cl_context ctx;
|
||||
cl_command_queue queue;
|
||||
cl_device_id device;
|
||||
cl_kernel addKernel;
|
||||
ctx = btOpenCLUtils::createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum,0,0,preferred_device,preferred_platform,&platformId);
|
||||
btOpenCLUtils::printPlatformInfo(platformId);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
if (!ctx) {
|
||||
printf("No OpenCL capable GPU found!");
|
||||
return 0;
|
||||
}
|
||||
|
||||
device = btOpenCLUtils::getDevice(ctx,0);
|
||||
queue = clCreateCommandQueue(ctx, device, 0, &ciErrNum);
|
||||
addKernel = btOpenCLUtils::compileCLKernelFromString(ctx,device,kernelString,"ReduceGlobal",&ciErrNum);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
int numElements = 1024*1024;
|
||||
btOpenCLArray<int> a(ctx,queue);
|
||||
btOpenCLArray<int> b(ctx,queue);
|
||||
btAlignedObjectArray<int> hostA;
|
||||
btAlignedObjectArray<int> hostB;
|
||||
|
||||
for (int i=0;i<numElements;i++)
|
||||
{
|
||||
hostA.push_back(1);
|
||||
hostB.push_back(0.f);
|
||||
}
|
||||
a.copyFromHost(hostA);
|
||||
b.copyFromHost(hostB);
|
||||
|
||||
int hostSum= 0;
|
||||
for (int i=0;i<numElements;i++)
|
||||
{
|
||||
hostSum += hostA.at(i);
|
||||
}
|
||||
b.resize(numElements);
|
||||
|
||||
{
|
||||
btLauncherCL launcher( queue, addKernel);
|
||||
launcher.setBuffer( a.getBufferCL());
|
||||
launcher.setBuffer( b.getBufferCL());
|
||||
launcher.setConst( numElements );
|
||||
launcher.launch1D( numElements,1024);
|
||||
}
|
||||
clFinish(queue);
|
||||
{
|
||||
btLauncherCL launcher( queue, addKernel);
|
||||
launcher.setBuffer( b.getBufferCL());
|
||||
launcher.setBuffer( a.getBufferCL());
|
||||
launcher.setConst( 1024 );
|
||||
launcher.launch1D( 1024,1024);
|
||||
}
|
||||
clFinish(queue);
|
||||
|
||||
printf("hostSum = %d\n", hostSum);
|
||||
|
||||
int clSum = a.at(0);
|
||||
printf("clSum = %d\n", clSum );
|
||||
if (hostSum != clSum)
|
||||
{
|
||||
printf("Incorrect result\n");
|
||||
} else
|
||||
{
|
||||
printf("Correct result\n");
|
||||
}
|
||||
|
||||
|
||||
clReleaseCommandQueue(queue);
|
||||
clReleaseContext(ctx);
|
||||
printf("press key\n");
|
||||
getchar();
|
||||
return 0;
|
||||
}
|
||||
37
opencl/reduce/premake4.lua
Normal file
37
opencl/reduce/premake4.lua
Normal file
@@ -0,0 +1,37 @@
|
||||
|
||||
-- Defines the "OpenCL_reduce_<vendor>" console application, linked against
-- the vendor's parallel-primitives host library. Nothing is defined when
-- findOpenCL reports no OpenCL SDK for the vendor.
function createProject (vendor)

	if not findOpenCL(vendor) then
		return
	end

	project ( "OpenCL_reduce_" .. vendor)

	initOpenCL(vendor)

	language "C++"
	kind "ConsoleApp"
	targetdir "../../bin"

	links { "OpenCL_lib_parallel_primitives_host_" .. vendor }

	includedirs { "../basic_initialize" }

	files {
		"main.cpp",
		"../basic_initialize/btOpenCLUtils.cpp",
		"../basic_initialize/btOpenCLUtils.h"
	}

end
|
||||
|
||||
createProject("AMD")
|
||||
createProject("NVIDIA")
|
||||
createProject("Intel")
|
||||
createProject("Apple")
|
||||
16
opencl/vector_add/VectorAddKernels.cl
Normal file
16
opencl/vector_add/VectorAddKernels.cl
Normal file
@@ -0,0 +1,16 @@
|
||||
|
||||
|
||||
// Element-wise addition of two float8 arrays: c[i] = a[i] + b[i].
// numElements is the number of float8 elements; work items whose global id
// is at or beyond it do nothing, so the global size may be rounded up.
__kernel void VectorAdd(__global const float8* a, __global const float8* b, __global float8* c, int numElements)
{
	// Index of this work item's float8 element.
	const int gid = get_global_id(0);

	if (gid < numElements)
	{
		c[gid] = a[gid] + b[gid];
	}
}
|
||||
20
opencl/vector_add/VectorAddKernels.h
Normal file
20
opencl/vector_add/VectorAddKernels.h
Normal file
@@ -0,0 +1,20 @@
|
||||
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
|
||||
// OpenCL source of the VectorAdd kernel as one string constant, built from
// adjacent string literals (one per source line).
static const char *vectorAddCL =
	"\n"
	"\n"
	"__kernel void VectorAdd(__global const float8* a, __global const float8* b, __global float8* c, int numElements)\n"
	"{\n"
	" // get oct-float index into global data array\n"
	" int iGID = get_global_id(0);\n"
	" if (iGID>=numElements)\n"
	" return;\n"
	"\n"
	" float8 aGID = a[iGID];\n"
	" float8 bGID = b[iGID];\n"
	"\n"
	" float8 result = aGID + bGID;\n"
	" // write back out to GMEM\n"
	" c[iGID] = result;\n"
	"}\n"
	"\n";
|
||||
408
opencl/vector_add/main.cpp
Normal file
408
opencl/vector_add/main.cpp
Normal file
@@ -0,0 +1,408 @@
|
||||
|
||||
///VectorAdd sample, from the NVidia JumpStart Guide
|
||||
///http://developer.download.nvidia.com/OpenCL/NVIDIA_OpenCL_JumpStart_Guide.pdf
|
||||
|
||||
///Instead of #include <CL/cl.h> we include <MiniCL/cl.h>
|
||||
///Apart from this include file, all other code should compile and work on OpenCL compliant implementation
|
||||
|
||||
|
||||
#define LOAD_FROM_FILE
|
||||
|
||||
#ifdef __APPLE__
|
||||
#include <OpenCL/OpenCL.h>
|
||||
#else
|
||||
#include <CL/cl.h>
|
||||
#endif //__APPLE__
|
||||
#ifdef _WIN32
|
||||
#pragma warning (disable:4996)
|
||||
#endif
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#define GRID3DOCL_CHECKERROR(a, b) if((a)!=(b)) { printf("3D GRID OCL Error : %d\n", (a)); btAssert((a) == (b)); }
|
||||
size_t wgSize;
|
||||
|
||||
#include "VectorAddKernels.h"
|
||||
|
||||
#ifdef CL_PLATFORM_INTEL
|
||||
const char* preferredPlatform = "Intel(R) Corporation";
|
||||
#elif defined CL_PLATFORM_AMD
|
||||
const char* preferredPlatform = "Advanced Micro Devices, Inc.";
|
||||
#elif defined CL_PLATFORM_NVIDIA
|
||||
const char* preferredPlatform = "NVIDIA Corporation";
|
||||
#else
|
||||
const char* preferredPlatform = "Unknown";
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/*
 * Reads a whole file and returns it, prefixed with cPreamble, as one
 * NUL-terminated string in a buffer obtained from malloc().
 *
 * cFilename      path of the file to read (opened in binary mode)
 * cPreamble      text placed in front of the file contents; NULL is treated
 *                as an empty preamble
 * szFinalLength  if not NULL, receives the combined length (preamble + file)
 *                without the terminating NUL; left untouched on failure
 *
 * Returns the buffer, which the caller must free(), or NULL when the file
 * cannot be opened, measured or read completely, or when memory runs out.
 */
char* loadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
{
	/* All locals are declared before the first goto so the jumps are valid in C++ too. */
	FILE* pFileStream = NULL;
	char* cSourceString = NULL;
	size_t szPreambleLength = 0;
	size_t szSourceLength = 0;
	long lFileLength = 0;

	if (cFilename == NULL)
	{
		return NULL;
	}
	if (cPreamble != NULL)
	{
		szPreambleLength = strlen(cPreamble);
	}

	// open the OpenCL source code file
	pFileStream = fopen(cFilename, "rb");
	if (pFileStream == NULL)
	{
		return NULL;
	}

	// get the length of the source code; ftell reports failure as -1
	if (fseek(pFileStream, 0, SEEK_END) != 0)
	{
		goto fail;
	}
	lFileLength = ftell(pFileStream);
	if (lFileLength < 0 || fseek(pFileStream, 0, SEEK_SET) != 0)
	{
		goto fail;
	}
	szSourceLength = (size_t)lFileLength;

	// allocate a buffer for preamble + source + terminator and fill it
	cSourceString = (char*)malloc(szSourceLength + szPreambleLength + 1);
	if (cSourceString == NULL)
	{
		goto fail;
	}
	if (szPreambleLength > 0)
	{
		memcpy(cSourceString, cPreamble, szPreambleLength);
	}
	// a short read would leave uninitialised bytes in the result, so it is an error
	if (szSourceLength > 0 &&
		fread(cSourceString + szPreambleLength, 1, szSourceLength, pFileStream) != szSourceLength)
	{
		goto fail;
	}

	fclose(pFileStream);

	cSourceString[szSourceLength + szPreambleLength] = '\0';
	if (szFinalLength != NULL)
	{
		*szFinalLength = szSourceLength + szPreambleLength;
	}
	return cSourceString;

fail:
	free(cSourceString);
	fclose(pFileStream);
	return NULL;
}
|
||||
|
||||
size_t workitem_size[3];
|
||||
|
||||
/// Prints the name, type flags, compute-unit count and maximum work-item
/// sizes of an OpenCL device to stdout. The work-item sizes are also stored
/// in the file-level workitem_size array.
/// Every queried value is zero-initialised first, so a failing
/// clGetDeviceInfo call prints an empty/zero value instead of garbage.
void printDevInfo(cl_device_id device)
{
	char device_string[1024] = {0};
	cl_device_type type = 0;
	cl_uint compute_units = 0;

	clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_string), device_string, NULL);
	device_string[sizeof(device_string)-1] = '\0';
	printf( " Device %s:\n", device_string);

	// CL_DEVICE_TYPE is a bit field, so more than one line may be printed
	clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(type), &type, NULL);
	if( type & CL_DEVICE_TYPE_CPU )
	{
		printf(" CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_CPU");
	}
	if( type & CL_DEVICE_TYPE_GPU )
	{
		printf( " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_GPU");
	}
	if( type & CL_DEVICE_TYPE_ACCELERATOR )
	{
		printf( " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR");
	}
	if( type & CL_DEVICE_TYPE_DEFAULT )
	{
		printf( " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT");
	}

	// CL_DEVICE_MAX_COMPUTE_UNITS (cl_uint is unsigned, hence %u)
	clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
	printf( " CL_DEVICE_MAX_COMPUTE_UNITS:\t%u\n", compute_units);

	// CL_DEVICE_MAX_WORK_ITEM_SIZES: size_t values, printed through
	// unsigned long so the format matches on 32- and 64-bit builds
	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), workitem_size, NULL);
	printf( " CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%lu / %lu / %lu \n",
		(unsigned long)workitem_size[0], (unsigned long)workitem_size[1], (unsigned long)workitem_size[2]);
}
|
||||
|
||||
|
||||
|
||||
|
||||
// Main function
|
||||
// *********************************************************************
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
void *srcA, *srcB, *dst; // Host buffers for OpenCL test
|
||||
cl_context cxGPUContext; // OpenCL context
|
||||
cl_command_queue cqCommandQue; // OpenCL command que
|
||||
cl_device_id* cdDevices; // OpenCL device list
|
||||
cl_program cpProgram; // OpenCL program
|
||||
cl_kernel ckKernel; // OpenCL kernel
|
||||
cl_mem cmMemObjs[3]; // OpenCL memory buffer objects: 3 for device
|
||||
size_t szGlobalWorkSize[1]; // 1D var for Total # of work items
|
||||
size_t szLocalWorkSize[1]; // 1D var for # of work items in the work group
|
||||
size_t szParmDataBytes; // Byte size of context information
|
||||
cl_int ciErr1, ciErr2; // Error code var
|
||||
|
||||
|
||||
int iTestN = 100000 * 8; // Size of Vectors to process
|
||||
|
||||
int actualGlobalSize = iTestN / 8;
|
||||
|
||||
|
||||
// set Global and Local work size dimensions
|
||||
szGlobalWorkSize[0] = iTestN >> 3; // do 8 computations per work item
|
||||
szLocalWorkSize[0]= iTestN>>3;
|
||||
|
||||
|
||||
// Allocate and initialize host arrays
|
||||
srcA = (void *)malloc (sizeof(cl_float) * iTestN);
|
||||
srcB = (void *)malloc (sizeof(cl_float) * iTestN);
|
||||
dst = (void *)malloc (sizeof(cl_float) * iTestN);
|
||||
|
||||
int i;
|
||||
|
||||
// Initialize arrays with some values
|
||||
for (i=0;i<iTestN;i++)
|
||||
{
|
||||
((cl_float*)srcA)[i] = cl_float(i);
|
||||
((cl_float*)srcB)[i] = 2;
|
||||
((cl_float*)dst)[i]=-1;
|
||||
}
|
||||
|
||||
|
||||
cl_uint numPlatforms;
|
||||
cl_platform_id platform = NULL;
|
||||
cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms);
|
||||
|
||||
if (0 < numPlatforms)
|
||||
{
|
||||
cl_platform_id* platforms = new cl_platform_id[numPlatforms];
|
||||
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
|
||||
|
||||
for (unsigned i = 0; i < numPlatforms; ++i)
|
||||
{
|
||||
char pbuf[100];
|
||||
status = clGetPlatformInfo(platforms[i],
|
||||
CL_PLATFORM_VENDOR,
|
||||
sizeof(pbuf),
|
||||
pbuf,
|
||||
NULL);
|
||||
|
||||
platform = platforms[i];
|
||||
if (!strcmp(pbuf, preferredPlatform))
|
||||
{
|
||||
printf("Found platform %s\n", preferredPlatform);
|
||||
break;
|
||||
}
|
||||
}
|
||||
delete[] platforms;
|
||||
}
|
||||
|
||||
cl_context_properties cps[3] =
|
||||
{
|
||||
CL_CONTEXT_PLATFORM,
|
||||
(cl_context_properties)platform,
|
||||
0
|
||||
};
|
||||
|
||||
// Create OpenCL context & context
|
||||
cxGPUContext = clCreateContextFromType(cps, CL_DEVICE_TYPE_ALL, NULL, NULL, &ciErr1); //could also be CL_DEVICE_TYPE_GPU
|
||||
|
||||
// Query all devices available to the context
|
||||
ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
|
||||
cdDevices = (cl_device_id*)malloc(szParmDataBytes);
|
||||
ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
|
||||
if (cdDevices)
|
||||
{
|
||||
printDevInfo(cdDevices[0]);
|
||||
}
|
||||
|
||||
// Create a command queue for first device the context reported
|
||||
cqCommandQue = clCreateCommandQueue(cxGPUContext, cdDevices[0], 0, &ciErr2);
|
||||
ciErr1 |= ciErr2;
|
||||
|
||||
// Allocate the OpenCL source and result buffer memory objects on the device GMEM
|
||||
cmMemObjs[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * szGlobalWorkSize[0], srcA, &ciErr2);
|
||||
ciErr1 |= ciErr2;
|
||||
cmMemObjs[1] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * szGlobalWorkSize[0], srcB, &ciErr2);
|
||||
ciErr1 |= ciErr2;
|
||||
cmMemObjs[2] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float8) * szGlobalWorkSize[0], NULL, &ciErr2);
|
||||
ciErr1 |= ciErr2;
|
||||
|
||||
///create kernels from binary
|
||||
int numDevices = 1;
|
||||
::size_t* lengths = (::size_t*) malloc(numDevices * sizeof(::size_t));
|
||||
const unsigned char** images = (const unsigned char**) malloc(numDevices * sizeof(const void*));
|
||||
|
||||
for (i = 0; i < numDevices; ++i) {
|
||||
images[i] = 0;
|
||||
lengths[i] = 0;
|
||||
}
|
||||
|
||||
|
||||
// Read the OpenCL kernel in from source file
|
||||
const char* cSourceFile = "opencl/vector_add/VectorAddKernels.cl";
|
||||
|
||||
|
||||
const char* cPathAndName = cSourceFile;
|
||||
#ifdef LOAD_FROM_FILE
|
||||
size_t szKernelLength;
|
||||
|
||||
const char* cSourceCL =0;
|
||||
char relativeFileName[1024];
|
||||
|
||||
{
|
||||
const char* prefix[]={"../","../../","../../../","../../../../"};
|
||||
int numPrefixes = sizeof(prefix)/sizeof(char*);
|
||||
|
||||
for (int i=0;!cSourceCL && i<numPrefixes;i++)
|
||||
{
|
||||
|
||||
sprintf(relativeFileName,"%s%s",prefix[i],cSourceFile);
|
||||
cSourceCL = loadProgSource(relativeFileName, "", &szKernelLength);
|
||||
if (cSourceCL)
|
||||
{
|
||||
printf("Loaded program source: %s\n", relativeFileName);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!cSourceCL)
|
||||
{
|
||||
printf("Couldn't find file %s, exiting\n",cSourceFile);
|
||||
exit(0);
|
||||
}
|
||||
#else
|
||||
const char* cSourceCL = vectorAddCL;
|
||||
size_t szKernelLength = strlen(cSourceCL);
|
||||
#endif //LOAD_FROM_FILE
|
||||
|
||||
|
||||
|
||||
// Create the program
|
||||
cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1);
|
||||
printf("clCreateProgramWithSource...\n");
|
||||
if (ciErr1 != CL_SUCCESS)
|
||||
{
|
||||
printf("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
// Build the program with 'mad' Optimization option
|
||||
#ifdef MAC
|
||||
char* flags = "-cl-mad-enable -DMAC ";
|
||||
#else
|
||||
char flags[1024]={0};
|
||||
#ifdef CL_PLATFORM_INTEL
|
||||
sprintf(flags,"-g -s \"%s\"","C:/develop/experiments/opencl/vector_add/VectorAddKernels.cl");
|
||||
#endif//CL_PLATFORM_INTEL
|
||||
|
||||
#endif//MAC
|
||||
ciErr1 = clBuildProgram(cpProgram, 0, NULL, flags, NULL, NULL);
|
||||
printf("clBuildProgram...\n");
|
||||
if (ciErr1 != CL_SUCCESS)
|
||||
{
|
||||
printf("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
// Create the kernel
|
||||
ckKernel = clCreateKernel(cpProgram, "VectorAdd", &ciErr1);
|
||||
printf("clCreateKernel (VectorAdd)...\n");
|
||||
if (ciErr1 != CL_SUCCESS)
|
||||
{
|
||||
printf("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
|
||||
cl_int ciErrNum;
|
||||
|
||||
ciErrNum = clGetKernelWorkGroupInfo(ckKernel, cdDevices[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL);
|
||||
if (ciErrNum != CL_SUCCESS)
|
||||
{
|
||||
printf("cannot get workgroup size\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// Set the Argument values
|
||||
ciErr1 |= clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmMemObjs[0]);
|
||||
ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmMemObjs[1]);
|
||||
ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmMemObjs[2]);
|
||||
ciErr1 |= clSetKernelArg(ckKernel, 3, sizeof(int), (void*)&actualGlobalSize);
|
||||
|
||||
printf("Press ENTER to quit\n");
|
||||
getchar();
|
||||
|
||||
int workgroupSize = wgSize;
|
||||
if(workgroupSize <= 0)
|
||||
{ // let OpenCL library calculate workgroup size
|
||||
size_t globalWorkSize[2];
|
||||
globalWorkSize[0] = actualGlobalSize;
|
||||
globalWorkSize[1] = 1;
|
||||
|
||||
// Copy input data from host to GPU and launch kernel
|
||||
ciErr1 |= clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, globalWorkSize, NULL, 0,0,0 );
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
size_t localWorkSize[2], globalWorkSize[2];
|
||||
//workgroupSize = btMin(workgroupSize, actualGlobalSize);
|
||||
int num_t = actualGlobalSize / workgroupSize;
|
||||
int num_g = num_t * workgroupSize;
|
||||
if(num_g < actualGlobalSize)
|
||||
{
|
||||
num_t++;
|
||||
//this can cause problems -> processing outside of the buffer
|
||||
//make sure to check kernel
|
||||
}
|
||||
|
||||
size_t globalThreads[] = {num_t * workgroupSize};
|
||||
size_t localThreads[] = {workgroupSize};
|
||||
|
||||
|
||||
localWorkSize[0] = workgroupSize;
|
||||
globalWorkSize[0] = num_t * workgroupSize;
|
||||
localWorkSize[1] = 1;
|
||||
globalWorkSize[1] = 1;
|
||||
|
||||
// Copy input data from host to GPU and launch kernel
|
||||
ciErr1 |= clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, globalThreads, localThreads, 0, NULL, NULL);
|
||||
|
||||
}
|
||||
|
||||
if (ciErrNum != CL_SUCCESS)
|
||||
{
|
||||
printf("cannot clEnqueueNDRangeKernel\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
clFinish(cqCommandQue);
|
||||
// Read back results and check accumulated errors
|
||||
ciErr1 |= clEnqueueReadBuffer(cqCommandQue, cmMemObjs[2], CL_TRUE, 0, sizeof(cl_float8) * szGlobalWorkSize[0], dst, 0, NULL, NULL);
|
||||
|
||||
// Release kernel, program, and memory objects
|
||||
// NOTE: Most properly this should be done at any of the exit points above, but it is omitted elsewhere for clarity.
|
||||
free(cdDevices);
|
||||
clReleaseKernel(ckKernel);
|
||||
clReleaseProgram(cpProgram);
|
||||
clReleaseCommandQueue(cqCommandQue);
|
||||
clReleaseContext(cxGPUContext);
|
||||
|
||||
|
||||
// print the results
|
||||
int iErrorCount = 0;
|
||||
for (i = 0; i < iTestN; i++)
|
||||
{
|
||||
if (((float*)dst)[i] != ((float*)srcA)[i]+((float*)srcB)[i])
|
||||
iErrorCount++;
|
||||
}
|
||||
|
||||
if (iErrorCount)
|
||||
{
|
||||
printf("Validation FAILED\n");
|
||||
} else
|
||||
{
|
||||
printf("Validation SUCCESSFULL\n");
|
||||
}
|
||||
// Free host memory, close log and return success
|
||||
for (i = 0; i < 3; i++)
|
||||
{
|
||||
clReleaseMemObject(cmMemObjs[i]);
|
||||
}
|
||||
|
||||
free(srcA);
|
||||
free(srcB);
|
||||
free (dst);
|
||||
printf("Press ENTER to quit\n");
|
||||
getchar();
|
||||
}
|
||||
|
||||
|
||||
28
opencl/vector_add/premake4.lua
Normal file
28
opencl/vector_add/premake4.lua
Normal file
@@ -0,0 +1,28 @@
|
||||
-- Defines the "OpenCL_VectorAdd_<vendor>" console application when findOpenCL
-- reports an OpenCL SDK for the given vendor; does nothing otherwise.
function createProject(vendor)

	-- 'local' keeps the probe result out of the global environment.
	local hasCL = findOpenCL(vendor)

	if (hasCL) then

		project ("OpenCL_VectorAdd_" .. vendor)

		initOpenCL(vendor)

		language "C++"

		kind "ConsoleApp"
		targetdir "../../bin"

		files {
			"main.cpp",
			"../basic_initialize/btOpenCLUtils.cpp",
			"../basic_initialize/btOpenCLUtils.h"
		}

	end
end
|
||||
|
||||
createProject("AMD")
|
||||
createProject("Intel")
|
||||
createProject("NVIDIA")
|
||||
createProject("Apple")
|
||||
69
opencl/vector_add_simplified/main.cpp
Normal file
69
opencl/vector_add_simplified/main.cpp
Normal file
@@ -0,0 +1,69 @@
|
||||
///original author: Erwin Coumans
|
||||
#include "btOpenCLUtils.h"
|
||||
#include "../parallel_primitives/host/btOpenCLArray.h"
|
||||
#include "../parallel_primitives/host/btLauncherCL.h"
|
||||
#include <stdio.h>
|
||||
|
||||
|
||||
#define MSTRINGIFY(A) #A

// OpenCL source of the VectorAdd kernel: c[i] = a[i] + b[i] for every global
// id below numElements; work items at or beyond numElements return at once.
// Only the layout differs from a plain listing: the preprocessor collapses
// whitespace between tokens when stringifying, so the resulting text is the
// kernel code on a single line.
const char* kernelString = MSTRINGIFY(
	__kernel void VectorAdd(__global const float* a, __global const float* b, __global float* c, int numElements)
	{
		int iGID = get_global_id(0);
		if (iGID>=numElements)
			return;

		float aGID = a[iGID];
		float bGID = b[iGID];
		float result = aGID + bGID;

		c[iGID] = result;
	}
);
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
int ciErrNum = 0;
|
||||
int preferred_device = -1;
|
||||
int preferred_platform = -1;
|
||||
cl_platform_id platformId;
|
||||
cl_context ctx;
|
||||
cl_command_queue queue;
|
||||
cl_device_id device;
|
||||
cl_kernel addKernel;
|
||||
ctx = btOpenCLUtils::createContextFromType(CL_DEVICE_TYPE_GPU, &ciErrNum,0,0,preferred_device,preferred_platform,&platformId);
|
||||
btOpenCLUtils::printPlatformInfo(platformId);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
if (!ctx) {
|
||||
printf("No OpenCL capable GPU found!");
|
||||
return 0;
|
||||
}
|
||||
|
||||
device = btOpenCLUtils::getDevice(ctx,0);
|
||||
queue = clCreateCommandQueue(ctx, device, 0, &ciErrNum);
|
||||
addKernel = btOpenCLUtils::compileCLKernelFromString(ctx,device,kernelString,"VectorAdd",&ciErrNum);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
int numElements = 32;
|
||||
btOpenCLArray<float> a(ctx,queue);
|
||||
btOpenCLArray<float> b(ctx,queue);
|
||||
btOpenCLArray<float> c(ctx,queue);
|
||||
for (int i=0;i<numElements;i++)
|
||||
{
|
||||
a.push_back(float(i));
|
||||
b.push_back(float(i));
|
||||
}
|
||||
|
||||
c.resize(numElements);
|
||||
btLauncherCL launcher( queue, addKernel);
|
||||
launcher.setBuffer( a.getBufferCL());
|
||||
launcher.setBuffer( b.getBufferCL());
|
||||
launcher.setBuffer( c.getBufferCL());
|
||||
launcher.setConst( numElements );
|
||||
launcher.launch1D( numElements);
|
||||
for (int i=0;i<numElements;i++)
|
||||
{
|
||||
float v = c.at(i);
|
||||
printf("c[%d]=%f\n",i,v);
|
||||
}
|
||||
clReleaseCommandQueue(queue);
|
||||
clReleaseContext(ctx);
|
||||
return 0;
|
||||
}
|
||||
37
opencl/vector_add_simplified/premake4.lua
Normal file
37
opencl/vector_add_simplified/premake4.lua
Normal file
@@ -0,0 +1,37 @@
|
||||
|
||||
-- Defines the "OpenCL_vector_add_simplified_<vendor>" console application,
-- linked against the vendor's parallel-primitives host library. Nothing is
-- defined when findOpenCL reports no OpenCL SDK for the vendor.
function createProject (vendor)

	if not findOpenCL(vendor) then
		return
	end

	project ( "OpenCL_vector_add_simplified_" .. vendor)

	initOpenCL(vendor)

	language "C++"
	kind "ConsoleApp"
	targetdir "../../bin"

	links { "OpenCL_lib_parallel_primitives_host_" .. vendor }

	includedirs { "../basic_initialize" }

	files {
		"main.cpp",
		"../basic_initialize/btOpenCLUtils.cpp",
		"../basic_initialize/btOpenCLUtils.h"
	}

end
|
||||
|
||||
createProject("AMD")
|
||||
createProject("NVIDIA")
|
||||
createProject("Intel")
|
||||
createProject("Apple")
|
||||
Reference in New Issue
Block a user