Added MiniCL, a limited subset of OpenCL, the open standard for parallel programming of heterogeneous systems.

MiniCL includes a cross-platform run-time frontend based on pthreads, Win32 Threads, or libspe2 for Cell SPU.
It is there to bridge the gap until OpenCL is more widely available.

See Bullet/Demos/VectorAdd, influenced by NVidia OpenCL Jumpstart Guide:
http://developer.download.nvidia.com/OpenCL/NVIDIA_OpenCL_JumpStart_Guide.pdf
This commit is contained in:
erwin.coumans
2009-05-22 01:43:37 +00:00
parent 2f1014268b
commit fb6146f0be
10 changed files with 2361 additions and 0 deletions

View File

@@ -0,0 +1,16 @@
# AppMiniCLVectorAdd is a very basic test for MiniCL.
INCLUDE_DIRECTORIES(
	# fix: removed a stray '}' that corrupted the include path
	${BULLET_PHYSICS_SOURCE_DIR}/src
)
LINK_LIBRARIES(
	BulletMultiThreaded LinearMath
)
ADD_EXECUTABLE(AppMiniCLVectorAdd
	MiniCL_VectorAdd.cpp
	MiniCL.cpp
)

View File

@@ -0,0 +1,346 @@
#include <MiniCL/cl.h>
#define __PHYSICS_COMMON_H__ 1
#ifdef WIN32
#include "BulletMultiThreaded/Win32ThreadSupport.h"
#else
#include "BulletMultiThreaded/SequentialThreadSupport.h"
#endif
#include "BulletMultiThreaded/MiniCLTaskScheduler.h"
#include "BulletMultiThreaded/MiniCLTask/MiniCLTask.h"
#include "LinearMath/btMinMax.h"
/*
m_threadSupportCollision = new Win32ThreadSupport(Win32ThreadSupport::Win32ThreadConstructionInfo(
"collision",
processCollisionTask,
createCollisionLocalStoreMemory,
maxNumOutstandingTasks));
if (!m_spuCollisionTaskProcess)
m_spuCollisionTaskProcess = new SpuCollisionTaskProcess(m_threadInterface,m_maxNumOutstandingTasks);
m_spuCollisionTaskProcess->initialize2(dispatchInfo.m_useEpa);
m_spuCollisionTaskProcess->addWorkToTask(pairPtr,i,endIndex);
//make sure all SPU work is done
m_spuCollisionTaskProcess->flush2();
*/
/// Reports fixed properties of the single simulated CPU device.
/// Only CL_DEVICE_NAME, CL_DEVICE_TYPE, CL_DEVICE_MAX_COMPUTE_UNITS and
/// CL_DEVICE_MAX_WORK_ITEM_SIZES are supported; anything else prints an error.
/// The device handle is ignored: MiniCL only ever has one device.
CL_API_ENTRY cl_int CL_API_CALL clGetDeviceInfo(
    cl_device_id /* device */,
    cl_device_info param_name,
    size_t param_value_size,
    void* param_value,
    size_t* /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0
{
    switch (param_name)
    {
    case CL_DEVICE_NAME:
        {
            char deviceName[] = "CPU";
            int nameLen = (int)strlen(deviceName) + 1; // including terminating 0
            assert(param_value_size > strlen(deviceName));
            // fix: '<' rejected a buffer of exactly nameLen bytes, which is large enough
            if (nameLen <= (int)param_value_size)
            {
                sprintf((char*)param_value, "CPU");
            }
            else
            {
                // fix: printing a size_t with %d is undefined behavior on LP64 targets; cast explicitly
                printf("error: param_value_size should be at least %d, but it is %d\n", nameLen, (int)param_value_size);
            }
            break;
        }
    case CL_DEVICE_TYPE:
        {
            if (param_value_size >= sizeof(cl_device_type))
            {
                cl_device_type* deviceType = (cl_device_type*)param_value;
                *deviceType = CL_DEVICE_TYPE_CPU;
            }
            else
            {
                printf("error: param_value_size should be at least %d\n", (int)sizeof(cl_device_type));
            }
            break;
        }
    case CL_DEVICE_MAX_COMPUTE_UNITS:
        {
            if (param_value_size >= sizeof(cl_uint))
            {
                cl_uint* numUnits = (cl_uint*)param_value;
                *numUnits = 4; // fixed number of simulated compute units
            }
            else
            {
                printf("error: param_value_size should be at least %d\n", (int)sizeof(cl_uint));
            }
            break;
        }
    case CL_DEVICE_MAX_WORK_ITEM_SIZES:
        {
            size_t workitem_size[3];
            if (param_value_size >= sizeof(workitem_size))
            {
                size_t* workItemSize = (size_t*)param_value;
                workItemSize[0] = 64;
                workItemSize[1] = 24;
                workItemSize[2] = 16;
            }
            else
            {
                // fix: the message previously reported sizeof(cl_uint); the caller
                // actually needs room for all three size_t entries
                printf("error: param_value_size should be at least %d\n", (int)sizeof(workitem_size));
            }
            break;
        }
    default:
        {
            printf("error: unsupported param_name:%d\n", param_name);
        }
    }
    return 0;
}
/// Releases a MiniCL memory object.
/// A cl_mem in MiniCL is a plain host pointer obtained from malloc() in
/// clCreateBuffer, so free it here; the original stub leaked every buffer.
CL_API_ENTRY cl_int CL_API_CALL clReleaseMemObject(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0
{
    free((void*)memobj);
    return 0;
}
/// No-op release: a MiniCL command queue is just the context's task scheduler
/// (see clCreateCommandQueue), which clReleaseContext owns and destroys.
CL_API_ENTRY cl_int CL_API_CALL clReleaseCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0
{
    return 0;
}
/// No-op release: a MiniCL program is the context/scheduler itself
/// (see clCreateProgramWithBinary), so there is nothing separate to free.
CL_API_ENTRY cl_int CL_API_CALL clReleaseProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0
{
    return 0;
}
/// Destroys a kernel handle.
/// clCreateKernel allocates a MiniCLKernel with 'new'; delete it here so
/// every create/release pair is balanced (the original stub leaked it).
CL_API_ENTRY cl_int CL_API_CALL clReleaseKernel(cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0
{
    delete (MiniCLKernel*)kernel;
    return 0;
}
// Enqueued Commands APIs
/// Copies 'cb' bytes from a device buffer back to host memory.
/// The read is always blocking in MiniCL: the scheduler is flushed first so
/// all outstanding work items have completed. Events are not supported.
CL_API_ENTRY cl_int CL_API_CALL clEnqueueReadBuffer(cl_command_queue command_queue,
    cl_mem buffer,
    cl_bool /* blocking_read */,
    size_t offset,
    size_t cb,
    void* ptr,
    cl_uint /* num_events_in_wait_list */,
    const cl_event* /* event_wait_list */,
    cl_event* /* event */) CL_API_SUFFIX__VERSION_1_0
{
    // A MiniCL command queue is really the task scheduler, and a cl_mem is a
    // plain host pointer (see clCreateCommandQueue/clCreateBuffer).
    MiniCLTaskScheduler* scheduler = (MiniCLTaskScheduler*)command_queue;
    ///wait for all work items to be completed
    scheduler->flush();
    // fix: the original ignored 'offset' and always copied from the start of the buffer
    memcpy(ptr, (const char*)buffer + offset, cb);
    return 0;
}
/// Launches a kernel over an N-dimensional range by slicing each dimension's
/// work items into tasks for the MiniCL scheduler. The global work offset,
/// local work size and events are ignored.
CL_API_ENTRY cl_int CL_API_CALL clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
    cl_kernel clKernel,
    cl_uint work_dim,
    const size_t* /* global_work_offset */,
    const size_t* global_work_size,
    const size_t* /* local_work_size */,
    cl_uint /* num_events_in_wait_list */,
    const cl_event* /* event_wait_list */,
    cl_event* /* event */) CL_API_SUFFIX__VERSION_1_0
{
    MiniCLKernel* kernel = (MiniCLKernel*)clKernel;
    // fix: the loop index was a signed int compared against the unsigned work_dim
    for (cl_uint ii = 0; ii < work_dim; ii++)
    {
        int maxTask = kernel->m_scheduler->getMaxNumOutstandingTasks();
        int numWorkItems = (int)global_work_size[ii];
        // At minimum 64 work items per task, to amortize scheduling overhead.
        // Performance Hint: tweak this number during benchmarking.
        int numWorkItemsPerTask = btMax(64, numWorkItems / maxTask);
        for (int t = 0; t < numWorkItems;)
        {
            // Issue [t, endIndex) as one task; arguments were stored by
            // clSetKernelArg in m_argData/m_argSizes.
            int endIndex = (t + numWorkItemsPerTask) < numWorkItems ? t + numWorkItemsPerTask : numWorkItems;
            kernel->m_scheduler->issueTask(t, endIndex, kernel->m_kernelProgramCommandId, (char*)&kernel->m_argData[0][0], kernel->m_argSizes);
            t = endIndex;
        }
    }
    return 0;
}
/// Stores a kernel argument by value into the kernel's argument table.
/// Buffer arguments are passed as the cl_mem pointer itself (the caller
/// passes &cmMemObj with arg_size == sizeof(cl_mem)).
CL_API_ENTRY cl_int CL_API_CALL clSetKernelArg(cl_kernel clKernel,
    cl_uint arg_index,
    size_t arg_size,
    const void* arg_value) CL_API_SUFFIX__VERSION_1_0
{
    MiniCLKernel* kernel = (MiniCLKernel*)clKernel;
    assert(arg_size < MINICL_MAX_ARGLENGTH);
    // NOTE(review): if m_argData has exactly MINI_CL_MAX_ARG slots this check
    // should be '>=' -- confirm against the MiniCLKernel declaration.
    if (arg_index > MINI_CL_MAX_ARG)
    {
        printf("error: clSetKernelArg arg_index (%d) exceeds %d\n", arg_index, MINI_CL_MAX_ARG);
    }
    else
    {
        if (arg_size >= MINICL_MAX_ARGLENGTH)
        {
            // fix: printing a size_t with %d is undefined behavior on LP64 targets; cast explicitly
            printf("error: clSetKernelArg argdata too large: %d (maximum is %d)\n", (int)arg_size, MINICL_MAX_ARGLENGTH);
        }
        else
        {
            memcpy(kernel->m_argData[arg_index], arg_value, arg_size);
            kernel->m_argSizes[arg_index] = arg_size;
        }
    }
    return 0;
}
// Kernel Object APIs
/// Creates a kernel handle by looking up the registered program command id
/// for 'kernel_name' in the scheduler. The error-code out-parameter is ignored.
/// The returned handle must be released with clReleaseKernel.
CL_API_ENTRY cl_kernel CL_API_CALL clCreateKernel(cl_program program,
    const char* kernel_name,
    cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0
{
    // A MiniCL program handle is really the task scheduler itself.
    MiniCLTaskScheduler* taskScheduler = (MiniCLTaskScheduler*)program;
    MiniCLKernel* newKernel = new MiniCLKernel();
    newKernel->m_scheduler = taskScheduler;
    newKernel->m_kernelProgramCommandId = taskScheduler->findProgramCommandIdByName(kernel_name);
    return (cl_kernel)newKernel;
}
/// No-op: MiniCL "programs" are native code already compiled into the host
/// executable, so there is nothing to build; always reports success.
/// fix: commented out the unused pfn_notify parameter name (it triggered an
/// unused-parameter warning and was inconsistent with the sibling stubs).
CL_API_ENTRY cl_int CL_API_CALL clBuildProgram(cl_program /* program */,
    cl_uint /* num_devices */,
    const cl_device_id* /* device_list */,
    const char* /* options */,
    void (* /* pfn_notify */)(cl_program /* program */, void* /* user_data */),
    void* /* user_data */) CL_API_SUFFIX__VERSION_1_0
{
    return 0;
}
/// MiniCL "binaries" are native code linked into the host executable, so
/// there is nothing to load: the context (the task scheduler) itself serves
/// as the program handle. All binary-related arguments are ignored.
CL_API_ENTRY cl_program CL_API_CALL clCreateProgramWithBinary(cl_context context,
    cl_uint /* num_devices */,
    const cl_device_id* /* device_list */,
    const size_t* /* lengths */,
    const unsigned char** /* binaries */,
    cl_int* /* binary_status */,
    cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0
{
    cl_program program = (cl_program)context;
    return program;
}
// Memory Object APIs
/// Allocates a "device" buffer, which in MiniCL is plain host memory.
/// Honors CL_MEM_COPY_HOST_PTR by copying the initial contents from host_ptr.
/// Release with clReleaseMemObject.
CL_API_ENTRY cl_mem CL_API_CALL clCreateBuffer(cl_context /* context */,
    cl_mem_flags flags,
    size_t size,
    void* host_ptr,
    cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0
{
    cl_mem buf = (cl_mem)malloc(size);
    // fix: the original never checked the allocation (memcpy into NULL on
    // failure) and never wrote the caller's error-code out-parameter
    if (!buf)
    {
        if (errcode_ret)
            *errcode_ret = -1; // MiniCL has no error-code enum; non-zero signals failure
        return 0;
    }
    if ((flags & CL_MEM_COPY_HOST_PTR) && host_ptr)
    {
        memcpy(buf, host_ptr, size);
    }
    if (errcode_ret)
        *errcode_ret = 0;
    return buf;
}
// Command Queue APIs
/// Returns the context itself as the command queue: MiniCL has one queue per
/// context (the task scheduler) and ignores the device, properties and
/// error-code out-parameter.
CL_API_ENTRY cl_command_queue CL_API_CALL clCreateCommandQueue(cl_context context,
    cl_device_id /* device */,
    cl_command_queue_properties /* properties */,
    cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0
{
    cl_command_queue queue = (cl_command_queue)context;
    return queue;
}
/// Reports context information. Only CL_CONTEXT_DEVICES is supported: the
/// "device list" is a placeholder string; callers only need a non-empty blob
/// whose first element they can pass back into the other MiniCL entry points
/// (which ignore the device handle anyway).
extern CL_API_ENTRY cl_int CL_API_CALL clGetContextInfo(cl_context /* context */,
    cl_context_info param_name,
    size_t param_value_size,
    void* param_value,
    size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0
{
    switch (param_name)
    {
    case CL_CONTEXT_DEVICES:
        {
            if (!param_value_size)
            {
                // Size query: report the bytes needed, including the 0 terminator.
                // fix: derive the length from the string instead of a hard-coded
                // 13, and guard against a NULL out-pointer before dereferencing.
                if (param_value_size_ret)
                    *param_value_size_ret = sizeof("MiniCL_Test.");
            }
            else
            {
                sprintf((char*)param_value, "MiniCL_Test.");
            }
            break;
        };
    default:
        {
            printf("unsupported\n");
        }
    }
    return 0;
}
/// Creates the MiniCL "context": a task scheduler backed by Win32 threads on
/// Windows and by a sequential (single-threaded) fallback elsewhere. The
/// requested device type is ignored -- MiniCL always runs on the host CPU.
/// Release with clReleaseContext.
/// fix: commented out the unused pfn_notify parameter name (unused-parameter
/// warning; the callback is never invoked).
CL_API_ENTRY cl_context CL_API_CALL clCreateContextFromType(cl_context_properties* /* properties */,
    cl_device_type /* device_type */,
    void (* /* pfn_notify */)(const char*, const void*, size_t, void*),
    void* /* user_data */,
    cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0
{
    int maxNumOutstandingTasks = 4;
#ifdef WIN32
    Win32ThreadSupport* threadSupport = new Win32ThreadSupport(Win32ThreadSupport::Win32ThreadConstructionInfo(
        "MiniCL",
        processMiniCLTask, //processCollisionTask,
        createMiniCLLocalStoreMemory,//createCollisionLocalStoreMemory,
        maxNumOutstandingTasks));
#else
    SequentialThreadSupport::SequentialThreadConstructionInfo stc("MiniCL",processMiniCLTask,createMiniCLLocalStoreMemory);
    SequentialThreadSupport* threadSupport = new SequentialThreadSupport(stc);
#endif
    // The scheduler doubles as the cl_context handle; clReleaseContext
    // destroys both the scheduler and its thread support.
    MiniCLTaskScheduler* scheduler = new MiniCLTaskScheduler(threadSupport,maxNumOutstandingTasks);
    return (cl_context)scheduler;
}
/// Destroys the MiniCL context, i.e. the task scheduler created in
/// clCreateContextFromType, together with its thread support.
CL_API_ENTRY cl_int CL_API_CALL clReleaseContext(cl_context context) CL_API_SUFFIX__VERSION_1_0
{
    MiniCLTaskScheduler* taskScheduler = (MiniCLTaskScheduler*)context;
    // Fetch the thread interface before tearing down the scheduler, then
    // delete the scheduler first (it is serviced by those threads).
    btThreadSupportInterface* threads = taskScheduler->getThreadSupportInterface();
    delete taskScheduler;
    delete threads;
    return 0;
}

View File

@@ -0,0 +1,172 @@
///VectorAdd sample, from the NVidia JumpStart Guide
///http://developer.download.nvidia.com/OpenCL/NVIDIA_OpenCL_JumpStart_Guide.pdf
///Instead of #include <CL/cl.h> we include <MiniCL/cl.h>
///Apart from this include file, all other code should compile and work on OpenCL compliant implementation
#include <MiniCL/cl.h>
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
/// Queries and prints the basic properties of an OpenCL device:
/// name, device type, compute-unit count and maximum work-item sizes.
void printDevInfo(cl_device_id device)
{
    char device_string[1024];
    // fix: pass the array (char*) rather than its address (char(*)[1024]);
    // same bytes, but the correct pointer type for the void* parameter
    clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_string), device_string, NULL);
    printf( " Device %s:\n", device_string);

    // CL_DEVICE_TYPE (may have multiple bits set, so test each one)
    cl_device_type type;
    clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(type), &type, NULL);
    if( type & CL_DEVICE_TYPE_CPU )
        printf(" CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_CPU");
    if( type & CL_DEVICE_TYPE_GPU )
        printf( " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_GPU");
    if( type & CL_DEVICE_TYPE_ACCELERATOR )
        printf( " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR");
    if( type & CL_DEVICE_TYPE_DEFAULT )
        printf( " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT");

    // CL_DEVICE_MAX_COMPUTE_UNITS
    cl_uint compute_units;
    clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
    printf( " CL_DEVICE_MAX_COMPUTE_UNITS:\t%d\n", compute_units);

    // CL_DEVICE_MAX_WORK_ITEM_SIZES (the comment previously mislabeled this
    // as CL_DEVICE_MAX_WORK_GROUP_SIZE)
    size_t workitem_size[3];
    clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), &workitem_size, NULL);
    // fix: printing size_t with %d is undefined behavior on LP64 targets; cast explicitly
    printf( " CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%d / %d / %d \n", (int)workitem_size[0], (int)workitem_size[1], (int)workitem_size[2]);
}
// Main function
// *********************************************************************
/// VectorAdd demo entry point: fills two host vectors, adds them with the
/// MiniCL "VectorAdd" kernel, reads the result back, and validates it on the
/// host. Returns 0 on successful validation, 1 otherwise.
int main(int argc, char **argv)
{
    void *srcA, *srcB, *dst;        // Host buffers for OpenCL test
    cl_context cxGPUContext;        // OpenCL context
    cl_command_queue cqCommandQue;  // OpenCL command que
    cl_device_id* cdDevices;        // OpenCL device list
    cl_program cpProgram;           // OpenCL program
    cl_kernel ckKernel;             // OpenCL kernel
    cl_mem cmMemObjs[3];            // OpenCL memory buffer objects: 3 for device
    size_t szGlobalWorkSize[1];     // 1D var for Total # of work items
    size_t szLocalWorkSize[1];      // 1D var for # of work items in the work group
    size_t szParmDataBytes;         // Byte size of context information
    cl_int ciErr1, ciErr2;          // Error code var
    int iTestN = 100000 * 8;        // Size of Vectors to process

    // fix: ciErr1/ciErr2 were OR'ed into below without ever being initialized
    ciErr1 = 0;
    ciErr2 = 0;

    // set Global and Local work size dimensions
    szGlobalWorkSize[0] = iTestN >> 3;  // do 8 computations per work item
    szLocalWorkSize[0]= iTestN>>3;

    // Allocate and initialize host arrays
    srcA = (void *)malloc (sizeof(cl_float) * iTestN);
    srcB = (void *)malloc (sizeof(cl_float) * iTestN);
    dst  = (void *)malloc (sizeof(cl_float) * iTestN);

    int i;
    // Initialize arrays with some values: dst is pre-filled with -1 so a
    // kernel that never ran is detected by the validation loop below
    for (i=0;i<iTestN;i++)
    {
        ((cl_float*)srcA)[i] = cl_float(i);
        ((cl_float*)srcB)[i] = 2;
        ((cl_float*)dst)[i]=-1;
    }

    // Create OpenCL context
    cxGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_CPU, NULL, NULL, &ciErr1); //could also be CL_DEVICE_TYPE_GPU

    // Query all devices available to the context (first call gets the size,
    // second call fills the list)
    ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
    cdDevices = (cl_device_id*)malloc(szParmDataBytes);
    ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
    if (cdDevices)
    {
        printDevInfo(cdDevices[0]);
    }

    // Create a command queue for first device the context reported
    cqCommandQue = clCreateCommandQueue(cxGPUContext, cdDevices[0], 0, &ciErr2);
    ciErr1 |= ciErr2;

    // Allocate the OpenCL source and result buffer memory objects on the device GMEM
    cmMemObjs[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * szGlobalWorkSize[0], srcA, &ciErr2);
    ciErr1 |= ciErr2;
    cmMemObjs[1] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * szGlobalWorkSize[0], srcB, &ciErr2);
    ciErr1 |= ciErr2;
    cmMemObjs[2] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float8) * szGlobalWorkSize[0], NULL, &ciErr2);
    ciErr1 |= ciErr2;

    ///create kernels from binary
    int numDevices = 1;
    cl_int err;
    ::size_t* lengths = (::size_t*) malloc(numDevices * sizeof(::size_t));
    const unsigned char** images = (const unsigned char**) malloc(numDevices * sizeof(const void*));
    for (i = 0; i < numDevices; ++i) {
        images[i] = 0;
        lengths[i] = 0;
    }
    cpProgram = clCreateProgramWithBinary(cxGPUContext, numDevices,cdDevices,lengths, images, 0, &err);
    // fix: 'lengths' and 'images' were leaked; the implementation does not
    // retain them past the call
    free(lengths);
    free(images);

    // Build the executable program from a binary
    ciErr1 |= clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL);

    // Create the kernel
    ckKernel = clCreateKernel(cpProgram, "VectorAdd", &ciErr1);

    // Set the Argument values
    ciErr1 |= clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmMemObjs[0]);
    ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmMemObjs[1]);
    ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmMemObjs[2]);

    // Copy input data from host to GPU and launch kernel
    ciErr1 |= clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);

    // Read back results and check accumulated errors
    ciErr1 |= clEnqueueReadBuffer(cqCommandQue, cmMemObjs[2], CL_TRUE, 0, sizeof(cl_float8) * szGlobalWorkSize[0], dst, 0, NULL, NULL);

    // Release kernel, program, and memory objects
    // NOTE: Most properly this should be done at any of the exit points above, but it is omitted elsewhere for clarity.
    free(cdDevices);
    clReleaseKernel(ckKernel);
    clReleaseProgram(cpProgram);
    clReleaseCommandQueue(cqCommandQue);
    clReleaseContext(cxGPUContext);

    // Validate: every dst element must equal srcA + srcB
    int iErrorCount = 0;
    for (i = 0; i < iTestN; i++)
    {
        if (((float*)dst)[i] != ((float*)srcA)[i]+((float*)srcB)[i])
            iErrorCount++;
    }

    if (iErrorCount)
    {
        printf("MiniCL validation FAILED\n");
    } else
    {
        // fix: output message was misspelled "SUCCESSFULL"
        printf("MiniCL validation SUCCESSFUL\n");
    }

    // Free host memory, close log and return success
    for (i = 0; i < 3; i++)
    {
        clReleaseMemObject(cmMemObjs[i]);
    }
    free(srcA);
    free(srcB);
    free (dst);

    // fix: report validation failure through the process exit status
    return iErrorCount ? 1 : 0;
}