add AMD version of VectorAdd_OpenCL, next to MiniCL (has issues with workgroup size)

This commit is contained in:
erwin.coumans
2010-06-25 00:19:22 +00:00
parent 498da0721b
commit 5db1c008bd
7 changed files with 296 additions and 41 deletions

View File

@@ -0,0 +1,37 @@
IF (INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
INCLUDE_DIRECTORIES( $ENV{==ATISTREAMSDKROOT=}/include )
IF (CMAKE_CL_64)
SET(CMAKE_ATISTREAMSDK_LIBPATH $ENV{==ATISTREAMSDKROOT=}/lib/x86_64 )
ELSE(CMAKE_CL_64)
SET(CMAKE_ATISTREAMSDK_LIBPATH $ENV{==ATISTREAMSDKROOT=}/lib/x86 )
ENDIF(CMAKE_CL_64)
ELSE()
INCLUDE_DIRECTORIES( $ENV{ATISTREAMSDKROOT}/include )
IF (CMAKE_CL_64)
SET(CMAKE_ATISTREAMSDK_LIBPATH $ENV{ATISTREAMSDKROOT}/lib/x86_64 )
ELSE(CMAKE_CL_64)
SET(CMAKE_ATISTREAMSDK_LIBPATH $ENV{ATISTREAMSDKROOT}/lib/x86 )
ENDIF(CMAKE_CL_64)
ENDIF()
INCLUDE_DIRECTORIES(
${BULLET_PHYSICS_SOURCE_DIR}/src
)
LINK_LIBRARIES(
BulletMultiThreaded LinearMath
${CMAKE_ATISTREAMSDK_LIBPATH}/OpenCL.lib
)
ADD_EXECUTABLE(AppVectorAdd_AMD
../MiniCL_VectorAdd.cpp
../VectorAddKernels.cl
)
IF (UNIX)
TARGET_LINK_LIBRARIES(AppVectorAdd_AMD pthread)
ENDIF(UNIX)

View File

@@ -0,0 +1,16 @@
IF(BUILD_MINICL_OPENCL_DEMOS)
SUBDIRS( MiniCL )
ENDIF()
IF(BUILD_AMD_OPENCL_DEMOS)
SUBDIRS(AMD)
ENDIF()
IF(BUILD_NVIDIA_OPENCL_DEMOS)
SUBDIRS(NVidia)
ENDIF()
IF(APPLE)
SUBDIRS(Apple)
ENDIF()

View File

@@ -0,0 +1,22 @@
# AppMiniCLVectorAdd is a very basic test for MiniCL.
ADD_DEFINITIONS(-DUSE_MINICL)
INCLUDE_DIRECTORIES(
${BULLET_PHYSICS_SOURCE_DIR}/src
)
LINK_LIBRARIES(
BulletMultiThreaded LinearMath
)
ADD_EXECUTABLE(AppVectorAdd_Mini
../MiniCL_VectorAdd.cpp
../VectorAddKernels.cl
)
IF (UNIX)
TARGET_LINK_LIBRARIES(AppVectorAdd_Mini pthread)
ENDIF(UNIX)

View File

@@ -0,0 +1,388 @@
///VectorAdd sample, from the NVidia JumpStart Guide
///http://developer.download.nvidia.com/OpenCL/NVIDIA_OpenCL_JumpStart_Guide.pdf
///Instead of #include <CL/cl.h> we include <MiniCL/cl.h>
///Apart from this include file, all other code should compile and work on OpenCL compliant implementation
#ifdef USE_MINICL
#include "MiniCL/cl.h"
#else //USE_MINICL
#ifdef __APPLE__
#include <OpenCL/OpenCL.h>
#else
#include <CL/cl.h>
#endif //__APPLE__
#endif//USE_MINICL
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include "LinearMath/btMinMax.h"
#define GRID3DOCL_CHECKERROR(a, b) if((a)!=(b)) { printf("3D GRID OCL Error : %d\n", (a)); btAssert((a) == (b)); }
size_t wgSize;
char* loadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
{
// locals
FILE* pFileStream = NULL;
size_t szSourceLength;
// open the OpenCL source code file
#ifdef _WIN32 // Windows version
if(fopen_s(&pFileStream, cFilename, "rb") != 0)
{
return NULL;
}
#else // Linux version
pFileStream = fopen(cFilename, "rb");
if(pFileStream == 0)
{
return NULL;
}
#endif
size_t szPreambleLength = strlen(cPreamble);
// get the length of the source code
fseek(pFileStream, 0, SEEK_END);
szSourceLength = ftell(pFileStream);
fseek(pFileStream, 0, SEEK_SET);
// allocate a buffer for the source code string and read it in
char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1);
memcpy(cSourceString, cPreamble, szPreambleLength);
fread((cSourceString) + szPreambleLength, szSourceLength, 1, pFileStream);
// close the file and return the total length of the combined (preamble + source) string
fclose(pFileStream);
if(szFinalLength != 0)
{
*szFinalLength = szSourceLength + szPreambleLength;
}
cSourceString[szSourceLength + szPreambleLength] = '\0';
return cSourceString;
}
size_t workitem_size[3];
void printDevInfo(cl_device_id device)
{
char device_string[1024];
clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
printf( " Device %s:\n", device_string);
// CL_DEVICE_INFO
cl_device_type type;
clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(type), &type, NULL);
if( type & CL_DEVICE_TYPE_CPU )
printf(" CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_CPU");
if( type & CL_DEVICE_TYPE_GPU )
printf( " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_GPU");
if( type & CL_DEVICE_TYPE_ACCELERATOR )
printf( " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR");
if( type & CL_DEVICE_TYPE_DEFAULT )
printf( " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT");
// CL_DEVICE_MAX_COMPUTE_UNITS
cl_uint compute_units;
clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
printf( " CL_DEVICE_MAX_COMPUTE_UNITS:\t%d\n", compute_units);
// CL_DEVICE_MAX_WORK_GROUP_SIZE
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), &workitem_size, NULL);
printf( " CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%d / %d / %d \n", workitem_size[0], workitem_size[1], workitem_size[2]);
}
// Main function
// *********************************************************************
int main(int argc, char **argv)
{
void *srcA, *srcB, *dst; // Host buffers for OpenCL test
cl_context cxGPUContext; // OpenCL context
cl_command_queue cqCommandQue; // OpenCL command que
cl_device_id* cdDevices; // OpenCL device list
cl_program cpProgram; // OpenCL program
cl_kernel ckKernel; // OpenCL kernel
cl_mem cmMemObjs[3]; // OpenCL memory buffer objects: 3 for device
size_t szGlobalWorkSize[1]; // 1D var for Total # of work items
size_t szLocalWorkSize[1]; // 1D var for # of work items in the work group
size_t szParmDataBytes; // Byte size of context information
cl_int ciErr1, ciErr2; // Error code var
int iTestN = 100000 * 8; // Size of Vectors to process
int actualGlobalSize = iTestN>>3;
// set Global and Local work size dimensions
szGlobalWorkSize[0] = iTestN >> 3; // do 8 computations per work item
szLocalWorkSize[0]= iTestN>>3;
// Allocate and initialize host arrays
srcA = (void *)malloc (sizeof(cl_float) * iTestN);
srcB = (void *)malloc (sizeof(cl_float) * iTestN);
dst = (void *)malloc (sizeof(cl_float) * iTestN);
int i;
// Initialize arrays with some values
for (i=0;i<iTestN;i++)
{
((cl_float*)srcA)[i] = cl_float(i);
((cl_float*)srcB)[i] = 2;
((cl_float*)dst)[i]=-1;
}
cl_uint numPlatforms;
cl_platform_id platform = NULL;
cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms);
if (0 < numPlatforms)
{
cl_platform_id* platforms = new cl_platform_id[numPlatforms];
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
for (unsigned i = 0; i < numPlatforms; ++i)
{
char pbuf[100];
status = clGetPlatformInfo(platforms[i],
CL_PLATFORM_VENDOR,
sizeof(pbuf),
pbuf,
NULL);
platform = platforms[i];
if (!strcmp(pbuf, "Advanced Micro Devices, Inc."))
{
break;
}
}
delete[] platforms;
}
cl_context_properties cps[3] =
{
CL_CONTEXT_PLATFORM,
(cl_context_properties)platform,
0
};
// Create OpenCL context & context
cxGPUContext = clCreateContextFromType(cps, CL_DEVICE_TYPE_CPU, NULL, NULL, &ciErr1); //could also be CL_DEVICE_TYPE_GPU
// Query all devices available to the context
ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
cdDevices = (cl_device_id*)malloc(szParmDataBytes);
ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
if (cdDevices)
{
printDevInfo(cdDevices[0]);
}
// Create a command queue for first device the context reported
cqCommandQue = clCreateCommandQueue(cxGPUContext, cdDevices[0], 0, &ciErr2);
ciErr1 |= ciErr2;
// Allocate the OpenCL source and result buffer memory objects on the device GMEM
cmMemObjs[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * szGlobalWorkSize[0], srcA, &ciErr2);
ciErr1 |= ciErr2;
cmMemObjs[1] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * szGlobalWorkSize[0], srcB, &ciErr2);
ciErr1 |= ciErr2;
cmMemObjs[2] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float8) * szGlobalWorkSize[0], NULL, &ciErr2);
ciErr1 |= ciErr2;
///create kernels from binary
int numDevices = 1;
::size_t* lengths = (::size_t*) malloc(numDevices * sizeof(::size_t));
const unsigned char** images = (const unsigned char**) malloc(numDevices * sizeof(const void*));
for (i = 0; i < numDevices; ++i) {
images[i] = 0;
lengths[i] = 0;
}
// Read the OpenCL kernel in from source file
const char* cSourceFile = "VectorAddKernels.cl";
printf("loadProgSource (%s)...\n", cSourceFile);
const char* cPathAndName = cSourceFile;
size_t szKernelLength;
char* cSourceCL = loadProgSource(cPathAndName, "", &szKernelLength);
// Create the program
cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1);
printf("clCreateProgramWithSource...\n");
if (ciErr1 != CL_SUCCESS)
{
printf("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
exit(0);
}
// Build the program with 'mad' Optimization option
#ifdef MAC
char* flags = "-cl-mad-enable -DMAC";
#else
const char* flags = "";//"-cl-mad-enable";
#endif
ciErr1 = clBuildProgram(cpProgram, 0, NULL, flags, NULL, NULL);
printf("clBuildProgram...\n");
if (ciErr1 != CL_SUCCESS)
{
printf("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
exit(0);
}
// Create the kernel
ckKernel = clCreateKernel(cpProgram, "VectorAdd", &ciErr1);
printf("clCreateKernel (VectorAdd)...\n");
if (ciErr1 != CL_SUCCESS)
{
printf("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
exit(0);
}
cl_int ciErrNum;
ciErrNum = clGetKernelWorkGroupInfo(ckKernel, cdDevices[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL);
if (ciErrNum != CL_SUCCESS)
{
printf("cannot get workgroup size\n");
exit(0);
}
// Set the Argument values
ciErr1 |= clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmMemObjs[0]);
ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmMemObjs[1]);
ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmMemObjs[2]);
int workgroupSize = wgSize;
if(workgroupSize <= 0)
{ // let OpenCL library calculate workgroup size
size_t globalWorkSize[2];
globalWorkSize[0] = actualGlobalSize;
globalWorkSize[1] = 1;
// Copy input data from host to GPU and launch kernel
ciErr1 |= clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, globalWorkSize, NULL, 0,0,0 );
}
else
{
size_t localWorkSize[2], globalWorkSize[2];
workgroupSize = btMin(workgroupSize, actualGlobalSize);
int num_t = actualGlobalSize / workgroupSize;
int num_g = num_t * workgroupSize;
if(num_g < actualGlobalSize)
{
num_t++;
//this can cause problems -> processing outside of the buffer
}
size_t globalThreads[] = {actualGlobalSize};//num_t * workgroupSize};
size_t localThreads[] = {workgroupSize};
localWorkSize[0] = workgroupSize;
globalWorkSize[0] = num_t * workgroupSize;
localWorkSize[1] = 1;
globalWorkSize[1] = 1;
/* size_t localWorkSize[2], globalWorkSize[2];
workgroupSize = workgroupSize < actualGlobalSize ? workgroupSize : actualGlobalSize;
int num_t = actualGlobalSize / workgroupSize;
int num_g = num_t * workgroupSize;
if(num_g < actualGlobalSize)
{
num_t++;
}
localWorkSize[0] = workgroupSize;
globalWorkSize[0] = num_t * workgroupSize;
localWorkSize[1] = 1;
globalWorkSize[1] = 1;
*/
// Copy input data from host to GPU and launch kernel
ciErr1 |= clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, globalThreads, localThreads, 0, NULL, NULL);
}
if (ciErrNum != CL_SUCCESS)
{
printf("cannot clEnqueueNDRangeKernel\n");
exit(0);
}
clFinish(cqCommandQue);
// Read back results and check accumulated errors
ciErr1 |= clEnqueueReadBuffer(cqCommandQue, cmMemObjs[2], CL_TRUE, 0, sizeof(cl_float8) * szGlobalWorkSize[0], dst, 0, NULL, NULL);
// Release kernel, program, and memory objects
// NOTE: Most properly this should be done at any of the exit points above, but it is omitted elsewhere for clarity.
free(cdDevices);
clReleaseKernel(ckKernel);
clReleaseProgram(cpProgram);
clReleaseCommandQueue(cqCommandQue);
clReleaseContext(cxGPUContext);
// print the results
int iErrorCount = 0;
for (i = 0; i < iTestN; i++)
{
if (((float*)dst)[i] != ((float*)srcA)[i]+((float*)srcB)[i])
iErrorCount++;
}
if (iErrorCount)
{
printf("MiniCL validation FAILED\n");
} else
{
printf("MiniCL validation SUCCESSFULL\n");
}
// Free host memory, close log and return success
for (i = 0; i < 3; i++)
{
clReleaseMemObject(cmMemObjs[i]);
}
free(srcA);
free(srcB);
free (dst);
printf("Press ENTER to quit\n");
getchar();
}
#ifdef USE_MINICL
#include "MiniCL/cl_MiniCL_Defs.h"
extern "C"
{
#include "VectorAddKernels.cl"
}
MINICL_REGISTER(VectorAdd)
#endif//USE_MINICL

View File

@@ -0,0 +1,47 @@
/*
Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
Copyright (C) 2006 - 2009 Sony Computer Entertainment Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
///GUID_ARG is only used by MiniCL to pass in the guid used by its get_global_id implementation
#ifndef GUID_ARG
#define GUID_ARG
#endif
///////////////////////////////////////////////////
// OpenCL Kernel Function for element by element vector addition
__kernel void VectorAdd(__global const float8* a, __global const float8* b, __global float8* c GUID_ARG)
{
// get oct-float index into global data array
int iGID = get_global_id(0);
// read inputs into registers
float8 f8InA = a[iGID];
float8 f8InB = b[iGID];
float8 f8Out = (float8)0.0f;
// add the vector elements
f8Out.s0 = f8InA.s0 + f8InB.s0;
f8Out.s1 = f8InA.s1 + f8InB.s1;
f8Out.s2 = f8InA.s2 + f8InB.s2;
f8Out.s3 = f8InA.s3 + f8InB.s3;
f8Out.s4 = f8InA.s4 + f8InB.s4;
f8Out.s5 = f8InA.s5 + f8InB.s5;
f8Out.s6 = f8InA.s6 + f8InB.s6;
f8Out.s7 = f8InA.s7 + f8InB.s7;
// write back out to GMEM
c[get_global_id(0)] = f8Out;
}