add AMD version of VectorAdd_OpenCL, next to MiniCL (has issues with workgroup size)
This commit is contained in:
@@ -25,7 +25,7 @@ else (CMAKE_SIZEOF_VOID_P MATCHES "8")
|
|||||||
SUBDIRS( ${SharedDemoSubdirs}
|
SUBDIRS( ${SharedDemoSubdirs}
|
||||||
ThreadingDemo
|
ThreadingDemo
|
||||||
MultiThreadedDemo
|
MultiThreadedDemo
|
||||||
MiniCL_VectorAdd
|
VectorAdd_OpenCL
|
||||||
ParticlesOpenCL
|
ParticlesOpenCL
|
||||||
)
|
)
|
||||||
endif (CMAKE_SIZEOF_VOID_P MATCHES "8")
|
endif (CMAKE_SIZEOF_VOID_P MATCHES "8")
|
||||||
@@ -46,7 +46,7 @@ SUBDIRS(
|
|||||||
GenericJointDemo
|
GenericJointDemo
|
||||||
SerializeDemo
|
SerializeDemo
|
||||||
SoftDemo
|
SoftDemo
|
||||||
MiniCL_VectorAdd
|
VectorAdd_OpenCL
|
||||||
)
|
)
|
||||||
|
|
||||||
ENDIF (USE_GLUT)
|
ENDIF (USE_GLUT)
|
||||||
@@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
INCLUDE_DIRECTORIES(
|
INCLUDE_DIRECTORIES(
|
||||||
${BULLET_PHYSICS_SOURCE_DIR}/src
|
${BULLET_PHYSICS_SOURCE_DIR}/src
|
||||||
${BULLET_PHYSICS_SOURCE_DIR}/Extras/OpenCL/Shared
|
${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL
|
||||||
${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenGL
|
${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenGL
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -51,15 +51,15 @@ IF (USE_GLUT)
|
|||||||
${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/btParticlesSharedDefs.h
|
${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/btParticlesSharedDefs.h
|
||||||
${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/btParticlesSharedTypes.h
|
${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/btParticlesSharedTypes.h
|
||||||
${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/ParticlesDemo.h
|
${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/ParticlesDemo.h
|
||||||
${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/btOclUtils.h
|
${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclUtils.h
|
||||||
${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/shaders.h
|
${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/shaders.h
|
||||||
${BULLET_PHYSICS_SOURCE_DIR}/Extras/OpenCL/Shared/btOclCommon.h
|
${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclCommon.h
|
||||||
${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/btOclUtils.cpp
|
${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclUtils.cpp
|
||||||
${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/btParticlesDemoDynamicsWorld.cpp
|
${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/btParticlesDemoDynamicsWorld.cpp
|
||||||
${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/main.cpp
|
${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/main.cpp
|
||||||
${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/ParticlesDemo.cpp
|
${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/ParticlesDemo.cpp
|
||||||
${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/shaders.cpp
|
${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/shaders.cpp
|
||||||
${BULLET_PHYSICS_SOURCE_DIR}/Extras/OpenCL/Shared/btOclCommon.cpp
|
${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclCommon.cpp
|
||||||
${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/ParticlesOCL.cl
|
${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/ParticlesOCL.cl
|
||||||
)
|
)
|
||||||
ELSE (USE_GLUT)
|
ELSE (USE_GLUT)
|
||||||
|
|||||||
37
Demos/VectorAdd_OpenCL/AMD/CMakeLists.txt
Normal file
37
Demos/VectorAdd_OpenCL/AMD/CMakeLists.txt
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
IF (INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
|
||||||
|
INCLUDE_DIRECTORIES( $ENV{==ATISTREAMSDKROOT=}/include )
|
||||||
|
IF (CMAKE_CL_64)
|
||||||
|
SET(CMAKE_ATISTREAMSDK_LIBPATH $ENV{==ATISTREAMSDKROOT=}/lib/x86_64 )
|
||||||
|
ELSE(CMAKE_CL_64)
|
||||||
|
SET(CMAKE_ATISTREAMSDK_LIBPATH $ENV{==ATISTREAMSDKROOT=}/lib/x86 )
|
||||||
|
ENDIF(CMAKE_CL_64)
|
||||||
|
ELSE()
|
||||||
|
INCLUDE_DIRECTORIES( $ENV{ATISTREAMSDKROOT}/include )
|
||||||
|
IF (CMAKE_CL_64)
|
||||||
|
SET(CMAKE_ATISTREAMSDK_LIBPATH $ENV{ATISTREAMSDKROOT}/lib/x86_64 )
|
||||||
|
ELSE(CMAKE_CL_64)
|
||||||
|
SET(CMAKE_ATISTREAMSDK_LIBPATH $ENV{ATISTREAMSDKROOT}/lib/x86 )
|
||||||
|
ENDIF(CMAKE_CL_64)
|
||||||
|
ENDIF()
|
||||||
|
|
||||||
|
INCLUDE_DIRECTORIES(
|
||||||
|
${BULLET_PHYSICS_SOURCE_DIR}/src
|
||||||
|
)
|
||||||
|
|
||||||
|
LINK_LIBRARIES(
|
||||||
|
BulletMultiThreaded LinearMath
|
||||||
|
${CMAKE_ATISTREAMSDK_LIBPATH}/OpenCL.lib
|
||||||
|
)
|
||||||
|
|
||||||
|
ADD_EXECUTABLE(AppVectorAdd_AMD
|
||||||
|
../MiniCL_VectorAdd.cpp
|
||||||
|
../VectorAddKernels.cl
|
||||||
|
)
|
||||||
|
|
||||||
|
IF (UNIX)
|
||||||
|
TARGET_LINK_LIBRARIES(AppVectorAdd_AMD pthread)
|
||||||
|
ENDIF(UNIX)
|
||||||
|
|
||||||
16
Demos/VectorAdd_OpenCL/CMakeLists.txt
Normal file
16
Demos/VectorAdd_OpenCL/CMakeLists.txt
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
|
||||||
|
IF(BUILD_MINICL_OPENCL_DEMOS)
|
||||||
|
SUBDIRS( MiniCL )
|
||||||
|
ENDIF()
|
||||||
|
|
||||||
|
IF(BUILD_AMD_OPENCL_DEMOS)
|
||||||
|
SUBDIRS(AMD)
|
||||||
|
ENDIF()
|
||||||
|
|
||||||
|
IF(BUILD_NVIDIA_OPENCL_DEMOS)
|
||||||
|
SUBDIRS(NVidia)
|
||||||
|
ENDIF()
|
||||||
|
|
||||||
|
IF(APPLE)
|
||||||
|
SUBDIRS(Apple)
|
||||||
|
ENDIF()
|
||||||
@@ -1,20 +1,22 @@
|
|||||||
# AppMiniCLVectorAdd is a very basic test for MiniCL.
|
# AppMiniCLVectorAdd is a very basic test for MiniCL.
|
||||||
|
|
||||||
|
|
||||||
INCLUDE_DIRECTORIES(
|
ADD_DEFINITIONS(-DUSE_MINICL)
|
||||||
${BULLET_PHYSICS_SOURCE_DIR}/src
|
|
||||||
)
|
INCLUDE_DIRECTORIES(
|
||||||
|
${BULLET_PHYSICS_SOURCE_DIR}/src
|
||||||
LINK_LIBRARIES(
|
)
|
||||||
BulletMultiThreaded LinearMath
|
|
||||||
)
|
LINK_LIBRARIES(
|
||||||
|
BulletMultiThreaded LinearMath
|
||||||
ADD_EXECUTABLE(AppMiniCLVectorAdd
|
)
|
||||||
MiniCL_VectorAdd.cpp
|
|
||||||
VectorAddKernels.cl
|
ADD_EXECUTABLE(AppVectorAdd_Mini
|
||||||
)
|
../MiniCL_VectorAdd.cpp
|
||||||
|
../VectorAddKernels.cl
|
||||||
IF (UNIX)
|
)
|
||||||
TARGET_LINK_LIBRARIES(AppMiniCLVectorAdd pthread)
|
|
||||||
ENDIF(UNIX)
|
IF (UNIX)
|
||||||
|
TARGET_LINK_LIBRARIES(AppVectorAdd_Mini pthread)
|
||||||
|
ENDIF(UNIX)
|
||||||
|
|
||||||
@@ -5,18 +5,77 @@
|
|||||||
///Instead of #include <CL/cl.h> we include <MiniCL/cl.h>
|
///Instead of #include <CL/cl.h> we include <MiniCL/cl.h>
|
||||||
///Apart from this include file, all other code should compile and work on OpenCL compliant implementation
|
///Apart from this include file, all other code should compile and work on OpenCL compliant implementation
|
||||||
|
|
||||||
#define USE_MINICL 1
|
|
||||||
#ifdef USE_MINICL
|
#ifdef USE_MINICL
|
||||||
|
|
||||||
#include "MiniCL/cl.h"
|
#include "MiniCL/cl.h"
|
||||||
#else //USE_MINICL
|
#else //USE_MINICL
|
||||||
|
#ifdef __APPLE__
|
||||||
|
#include <OpenCL/OpenCL.h>
|
||||||
|
#else
|
||||||
#include <CL/cl.h>
|
#include <CL/cl.h>
|
||||||
|
#endif //__APPLE__
|
||||||
#endif//USE_MINICL
|
#endif//USE_MINICL
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include "LinearMath/btMinMax.h"
|
||||||
|
#define GRID3DOCL_CHECKERROR(a, b) if((a)!=(b)) { printf("3D GRID OCL Error : %d\n", (a)); btAssert((a) == (b)); }
|
||||||
|
size_t wgSize;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
char* loadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
|
||||||
|
{
|
||||||
|
// locals
|
||||||
|
FILE* pFileStream = NULL;
|
||||||
|
size_t szSourceLength;
|
||||||
|
|
||||||
|
// open the OpenCL source code file
|
||||||
|
#ifdef _WIN32 // Windows version
|
||||||
|
if(fopen_s(&pFileStream, cFilename, "rb") != 0)
|
||||||
|
{
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
#else // Linux version
|
||||||
|
pFileStream = fopen(cFilename, "rb");
|
||||||
|
if(pFileStream == 0)
|
||||||
|
{
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
size_t szPreambleLength = strlen(cPreamble);
|
||||||
|
|
||||||
|
// get the length of the source code
|
||||||
|
fseek(pFileStream, 0, SEEK_END);
|
||||||
|
szSourceLength = ftell(pFileStream);
|
||||||
|
fseek(pFileStream, 0, SEEK_SET);
|
||||||
|
|
||||||
|
// allocate a buffer for the source code string and read it in
|
||||||
|
char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1);
|
||||||
|
memcpy(cSourceString, cPreamble, szPreambleLength);
|
||||||
|
fread((cSourceString) + szPreambleLength, szSourceLength, 1, pFileStream);
|
||||||
|
|
||||||
|
// close the file and return the total length of the combined (preamble + source) string
|
||||||
|
fclose(pFileStream);
|
||||||
|
if(szFinalLength != 0)
|
||||||
|
{
|
||||||
|
*szFinalLength = szSourceLength + szPreambleLength;
|
||||||
|
}
|
||||||
|
cSourceString[szSourceLength + szPreambleLength] = '\0';
|
||||||
|
|
||||||
|
return cSourceString;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t workitem_size[3];
|
||||||
|
|
||||||
void printDevInfo(cl_device_id device)
|
void printDevInfo(cl_device_id device)
|
||||||
{
|
{
|
||||||
char device_string[1024];
|
char device_string[1024];
|
||||||
@@ -42,7 +101,7 @@ void printDevInfo(cl_device_id device)
|
|||||||
printf( " CL_DEVICE_MAX_COMPUTE_UNITS:\t%d\n", compute_units);
|
printf( " CL_DEVICE_MAX_COMPUTE_UNITS:\t%d\n", compute_units);
|
||||||
|
|
||||||
// CL_DEVICE_MAX_WORK_GROUP_SIZE
|
// CL_DEVICE_MAX_WORK_GROUP_SIZE
|
||||||
size_t workitem_size[3];
|
|
||||||
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), &workitem_size, NULL);
|
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), &workitem_size, NULL);
|
||||||
printf( " CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%d / %d / %d \n", workitem_size[0], workitem_size[1], workitem_size[2]);
|
printf( " CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%d / %d / %d \n", workitem_size[0], workitem_size[1], workitem_size[2]);
|
||||||
|
|
||||||
@@ -68,11 +127,13 @@ int main(int argc, char **argv)
|
|||||||
cl_int ciErr1, ciErr2; // Error code var
|
cl_int ciErr1, ciErr2; // Error code var
|
||||||
int iTestN = 100000 * 8; // Size of Vectors to process
|
int iTestN = 100000 * 8; // Size of Vectors to process
|
||||||
|
|
||||||
|
int actualGlobalSize = iTestN>>3;
|
||||||
|
|
||||||
// set Global and Local work size dimensions
|
// set Global and Local work size dimensions
|
||||||
szGlobalWorkSize[0] = iTestN >> 3; // do 8 computations per work item
|
szGlobalWorkSize[0] = iTestN >> 3; // do 8 computations per work item
|
||||||
szLocalWorkSize[0]= iTestN>>3;
|
szLocalWorkSize[0]= iTestN>>3;
|
||||||
|
|
||||||
|
|
||||||
// Allocate and initialize host arrays
|
// Allocate and initialize host arrays
|
||||||
srcA = (void *)malloc (sizeof(cl_float) * iTestN);
|
srcA = (void *)malloc (sizeof(cl_float) * iTestN);
|
||||||
srcB = (void *)malloc (sizeof(cl_float) * iTestN);
|
srcB = (void *)malloc (sizeof(cl_float) * iTestN);
|
||||||
@@ -88,8 +149,43 @@ int main(int argc, char **argv)
|
|||||||
((cl_float*)dst)[i]=-1;
|
((cl_float*)dst)[i]=-1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
cl_uint numPlatforms;
|
||||||
|
cl_platform_id platform = NULL;
|
||||||
|
cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms);
|
||||||
|
|
||||||
|
if (0 < numPlatforms)
|
||||||
|
{
|
||||||
|
cl_platform_id* platforms = new cl_platform_id[numPlatforms];
|
||||||
|
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
|
||||||
|
|
||||||
|
for (unsigned i = 0; i < numPlatforms; ++i)
|
||||||
|
{
|
||||||
|
char pbuf[100];
|
||||||
|
status = clGetPlatformInfo(platforms[i],
|
||||||
|
CL_PLATFORM_VENDOR,
|
||||||
|
sizeof(pbuf),
|
||||||
|
pbuf,
|
||||||
|
NULL);
|
||||||
|
|
||||||
|
platform = platforms[i];
|
||||||
|
if (!strcmp(pbuf, "Advanced Micro Devices, Inc."))
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
delete[] platforms;
|
||||||
|
}
|
||||||
|
|
||||||
|
cl_context_properties cps[3] =
|
||||||
|
{
|
||||||
|
CL_CONTEXT_PLATFORM,
|
||||||
|
(cl_context_properties)platform,
|
||||||
|
0
|
||||||
|
};
|
||||||
|
|
||||||
// Create OpenCL context & context
|
// Create OpenCL context & context
|
||||||
cxGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_CPU, NULL, NULL, &ciErr1); //could also be CL_DEVICE_TYPE_GPU
|
cxGPUContext = clCreateContextFromType(cps, CL_DEVICE_TYPE_CPU, NULL, NULL, &ciErr1); //could also be CL_DEVICE_TYPE_GPU
|
||||||
|
|
||||||
// Query all devices available to the context
|
// Query all devices available to the context
|
||||||
ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
|
ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
|
||||||
@@ -114,7 +210,6 @@ int main(int argc, char **argv)
|
|||||||
|
|
||||||
///create kernels from binary
|
///create kernels from binary
|
||||||
int numDevices = 1;
|
int numDevices = 1;
|
||||||
cl_int err;
|
|
||||||
::size_t* lengths = (::size_t*) malloc(numDevices * sizeof(::size_t));
|
::size_t* lengths = (::size_t*) malloc(numDevices * sizeof(::size_t));
|
||||||
const unsigned char** images = (const unsigned char**) malloc(numDevices * sizeof(const void*));
|
const unsigned char** images = (const unsigned char**) malloc(numDevices * sizeof(const void*));
|
||||||
|
|
||||||
@@ -123,22 +218,125 @@ int main(int argc, char **argv)
|
|||||||
lengths[i] = 0;
|
lengths[i] = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
cpProgram = clCreateProgramWithBinary(cxGPUContext, numDevices,cdDevices,lengths, images, 0, &err);
|
|
||||||
|
// Read the OpenCL kernel in from source file
|
||||||
// Build the executable program from a binary
|
const char* cSourceFile = "VectorAddKernels.cl";
|
||||||
ciErr1 |= clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL);
|
|
||||||
|
printf("loadProgSource (%s)...\n", cSourceFile);
|
||||||
|
const char* cPathAndName = cSourceFile;
|
||||||
|
size_t szKernelLength;
|
||||||
|
char* cSourceCL = loadProgSource(cPathAndName, "", &szKernelLength);
|
||||||
|
|
||||||
|
// Create the program
|
||||||
|
cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1);
|
||||||
|
printf("clCreateProgramWithSource...\n");
|
||||||
|
if (ciErr1 != CL_SUCCESS)
|
||||||
|
{
|
||||||
|
printf("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build the program with 'mad' Optimization option
|
||||||
|
#ifdef MAC
|
||||||
|
char* flags = "-cl-mad-enable -DMAC";
|
||||||
|
#else
|
||||||
|
const char* flags = "";//"-cl-mad-enable";
|
||||||
|
#endif
|
||||||
|
ciErr1 = clBuildProgram(cpProgram, 0, NULL, flags, NULL, NULL);
|
||||||
|
printf("clBuildProgram...\n");
|
||||||
|
if (ciErr1 != CL_SUCCESS)
|
||||||
|
{
|
||||||
|
printf("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
|
||||||
// Create the kernel
|
// Create the kernel
|
||||||
ckKernel = clCreateKernel(cpProgram, "VectorAdd", &ciErr1);
|
ckKernel = clCreateKernel(cpProgram, "VectorAdd", &ciErr1);
|
||||||
|
printf("clCreateKernel (VectorAdd)...\n");
|
||||||
|
if (ciErr1 != CL_SUCCESS)
|
||||||
|
{
|
||||||
|
printf("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
cl_int ciErrNum;
|
||||||
|
|
||||||
|
ciErrNum = clGetKernelWorkGroupInfo(ckKernel, cdDevices[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL);
|
||||||
|
if (ciErrNum != CL_SUCCESS)
|
||||||
|
{
|
||||||
|
printf("cannot get workgroup size\n");
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// Set the Argument values
|
// Set the Argument values
|
||||||
ciErr1 |= clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmMemObjs[0]);
|
ciErr1 |= clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmMemObjs[0]);
|
||||||
ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmMemObjs[1]);
|
ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmMemObjs[1]);
|
||||||
ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmMemObjs[2]);
|
ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmMemObjs[2]);
|
||||||
|
|
||||||
// Copy input data from host to GPU and launch kernel
|
|
||||||
ciErr1 |= clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
|
|
||||||
|
int workgroupSize = wgSize;
|
||||||
|
if(workgroupSize <= 0)
|
||||||
|
{ // let OpenCL library calculate workgroup size
|
||||||
|
size_t globalWorkSize[2];
|
||||||
|
globalWorkSize[0] = actualGlobalSize;
|
||||||
|
globalWorkSize[1] = 1;
|
||||||
|
|
||||||
|
// Copy input data from host to GPU and launch kernel
|
||||||
|
ciErr1 |= clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, globalWorkSize, NULL, 0,0,0 );
|
||||||
|
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
size_t localWorkSize[2], globalWorkSize[2];
|
||||||
|
workgroupSize = btMin(workgroupSize, actualGlobalSize);
|
||||||
|
int num_t = actualGlobalSize / workgroupSize;
|
||||||
|
int num_g = num_t * workgroupSize;
|
||||||
|
if(num_g < actualGlobalSize)
|
||||||
|
{
|
||||||
|
num_t++;
|
||||||
|
//this can cause problems -> processing outside of the buffer
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t globalThreads[] = {actualGlobalSize};//num_t * workgroupSize};
|
||||||
|
size_t localThreads[] = {workgroupSize};
|
||||||
|
|
||||||
|
|
||||||
|
localWorkSize[0] = workgroupSize;
|
||||||
|
globalWorkSize[0] = num_t * workgroupSize;
|
||||||
|
localWorkSize[1] = 1;
|
||||||
|
globalWorkSize[1] = 1;
|
||||||
|
|
||||||
|
/* size_t localWorkSize[2], globalWorkSize[2];
|
||||||
|
workgroupSize = workgroupSize < actualGlobalSize ? workgroupSize : actualGlobalSize;
|
||||||
|
int num_t = actualGlobalSize / workgroupSize;
|
||||||
|
int num_g = num_t * workgroupSize;
|
||||||
|
if(num_g < actualGlobalSize)
|
||||||
|
{
|
||||||
|
num_t++;
|
||||||
|
}
|
||||||
|
localWorkSize[0] = workgroupSize;
|
||||||
|
globalWorkSize[0] = num_t * workgroupSize;
|
||||||
|
localWorkSize[1] = 1;
|
||||||
|
globalWorkSize[1] = 1;
|
||||||
|
*/
|
||||||
|
|
||||||
|
// Copy input data from host to GPU and launch kernel
|
||||||
|
ciErr1 |= clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, globalThreads, localThreads, 0, NULL, NULL);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ciErrNum != CL_SUCCESS)
|
||||||
|
{
|
||||||
|
printf("cannot clEnqueueNDRangeKernel\n");
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
clFinish(cqCommandQue);
|
||||||
// Read back results and check accumulated errors
|
// Read back results and check accumulated errors
|
||||||
ciErr1 |= clEnqueueReadBuffer(cqCommandQue, cmMemObjs[2], CL_TRUE, 0, sizeof(cl_float8) * szGlobalWorkSize[0], dst, 0, NULL, NULL);
|
ciErr1 |= clEnqueueReadBuffer(cqCommandQue, cmMemObjs[2], CL_TRUE, 0, sizeof(cl_float8) * szGlobalWorkSize[0], dst, 0, NULL, NULL);
|
||||||
|
|
||||||
@@ -175,6 +373,8 @@ int main(int argc, char **argv)
|
|||||||
free(srcA);
|
free(srcA);
|
||||||
free(srcB);
|
free(srcB);
|
||||||
free (dst);
|
free (dst);
|
||||||
|
printf("Press ENTER to quit\n");
|
||||||
|
getchar();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
Reference in New Issue
Block a user