Removed to bitonic sort and ios, they were not used and confuse Bram :)

http://bulletphysics.org/Bullet/phpBB3/viewtopic.php?t=9570
This commit is contained in:
erwin coumans
2013-11-25 21:38:44 -08:00
parent beb9e98cd3
commit ec39c2f210
6 changed files with 6 additions and 557 deletions

View File

@@ -13,11 +13,6 @@
end
newoption
{
trigger = "ios",
description = "Enable iOS target (requires xcode4)"
}
newoption
{
@@ -64,24 +59,12 @@
postfix=""
if _ACTION == "xcode4" then
if _OPTIONS["ios"] then
postfix = "ios";
xcodebuildsettings
{
'CODE_SIGN_IDENTITY = "iPhone Developer"',
"SDKROOT = iphoneos",
'ARCHS = "armv7"',
'TARGETED_DEVICE_FAMILY = "1,2"',
'VALID_ARCHS = "armv7"',
}
else
xcodebuildsettings
{
'ARCHS = "$(ARCHS_STANDARD_32_BIT) $(ARCHS_STANDARD_64_BIT)"',
'VALID_ARCHS = "x86_64 i386"',
}
end
end
flags { "NoRTTI", "NoExceptions"}
@@ -99,8 +82,6 @@
language "C++"
if not _OPTIONS["ios"] then
include "../Demos3/GpuDemos"
-- include "../Demos3/CpuDemos"
@@ -126,7 +107,7 @@ include "../Demos3/SimpleOpenGL3"
-- include "../test/OpenCL/NarrowphaseCollision"
include "../test/OpenCL/ParallelPrimitives"
include "../test/OpenCL/RadixSortBenchmark"
include "../test/OpenCL/BitonicSort"
include "../src/Bullet3Dynamics"
include "../src/Bullet3Common"
@@ -166,4 +147,3 @@ include "../Demos3/SimpleOpenGL3"
include "../Demos3"
end
end

View File

@@ -1,171 +0,0 @@
MSTRINGIFY(
/*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
*
* NVIDIA Corporation and its licensors retain all intellectual property and
* proprietary rights in and to this software and related documentation.
* Any use, reproduction, disclosure, or distribution of this software
* and related documentation without an express license agreement from
* NVIDIA Corporation is strictly prohibited.
*
* Please refer to the applicable NVIDIA end user license agreement (EULA)
* associated with this source code for terms and conditions that govern
* your use of this NVIDIA software.
*
*/
inline void ComparatorPrivate(int2* keyA, int2* keyB, uint dir)
{
if((keyA[0].x > keyB[0].x) == dir)
{
int2 tmp = *keyA;
*keyA = *keyB;
*keyB = tmp;
}
}
inline void ComparatorLocal(__local int2* keyA, __local int2* keyB, uint dir)
{
if((keyA[0].x > keyB[0].x) == dir)
{
int2 tmp = *keyA;
*keyA = *keyB;
*keyB = tmp;
}
}
////////////////////////////////////////////////////////////////////////////////
// Monolithic bitonic sort kernel for short arrays fitting into local memory
////////////////////////////////////////////////////////////////////////////////
__kernel void kBitonicSortCellIdLocal(__global int2* pKey, uint arrayLength, uint dir GUID_ARG)
{
__local int2 l_key[1024U];
int localSizeLimit = get_local_size(0) * 2;
//Offset to the beginning of subbatch and load data
pKey += get_group_id(0) * localSizeLimit + get_local_id(0);
l_key[get_local_id(0) + 0] = pKey[ 0];
l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)];
for(uint size = 2; size < arrayLength; size <<= 1)
{
//Bitonic merge
uint ddd = dir ^ ( (get_local_id(0) & (size / 2)) != 0 );
for(uint stride = size / 2; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
}
}
//ddd == dir for the last bitonic merge step
{
for(uint stride = arrayLength / 2; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], dir);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
pKey[ 0] = l_key[get_local_id(0) + 0];
pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)];
}
////////////////////////////////////////////////////////////////////////////////
// Bitonic sort kernel for large arrays (not fitting into local memory)
////////////////////////////////////////////////////////////////////////////////
//Bottom-level bitonic sort
//Almost the same as bitonicSortLocal with the only exception
//of even / odd subarrays (of LOCAL_SIZE_LIMIT points) being
//sorted in opposite directions
__kernel void kBitonicSortCellIdLocal1(__global int2* pKey GUID_ARG)
{
__local int2 l_key[1024U];
uint localSizeLimit = get_local_size(0) * 2;
//Offset to the beginning of subarray and load data
pKey += get_group_id(0) * localSizeLimit + get_local_id(0);
l_key[get_local_id(0) + 0] = pKey[ 0];
l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)];
uint comparatorI = get_global_id(0) & ((localSizeLimit / 2) - 1);
for(uint size = 2; size < localSizeLimit; size <<= 1)
{
//Bitonic merge
uint ddd = (comparatorI & (size / 2)) != 0;
for(uint stride = size / 2; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
}
}
//Odd / even arrays of localSizeLimit elements
//sorted in opposite directions
{
uint ddd = (get_group_id(0) & 1);
for(uint stride = localSizeLimit / 2; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
pKey[ 0] = l_key[get_local_id(0) + 0];
pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)];
}
//Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT
__kernel void kBitonicSortCellIdMergeGlobal(__global int2* pKey, uint arrayLength, uint size, uint stride, uint dir GUID_ARG)
{
uint global_comparatorI = get_global_id(0);
uint comparatorI = global_comparatorI & (arrayLength / 2 - 1);
//Bitonic merge
uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );
uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));
int2 keyA = pKey[pos + 0];
int2 keyB = pKey[pos + stride];
ComparatorPrivate(&keyA, &keyB, ddd);
pKey[pos + 0] = keyA;
pKey[pos + stride] = keyB;
}
//Combined bitonic merge steps for
//'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. LOCAL_SIZE_LIMIT / 2]
__kernel void kBitonicSortCellIdMergeLocal(__global int2* pKey, uint arrayLength, uint stride, uint size, uint dir GUID_ARG)
{
__local int2 l_key[1024U];
int localSizeLimit = get_local_size(0) * 2;
pKey += get_group_id(0) * localSizeLimit + get_local_id(0);
l_key[get_local_id(0) + 0] = pKey[ 0];
l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)];
//Bitonic merge
uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1);
uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );
for(; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
}
barrier(CLK_LOCAL_MEM_FENCE);
pKey[ 0] = l_key[get_local_id(0) + 0];
pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)];
}
);

View File

@@ -1,90 +0,0 @@
#include "b3BitonicSort.h"
#include "Bullet3Common/b3Scalar.h"
//Note: logically shared with BitonicSort OpenCL code!
// TODO : get parameter from OpenCL and pass it to kernel (needed for platforms other than NVIDIA)
void bitonicSortNv(cl_mem pKey, int arrayLength, b3BitonicSortInfo& info)
{
if(arrayLength < 2)
return;
//Only power-of-two array lengths are supported so far
info.dir = (info.dir != 0);
cl_int ciErrNum;
size_t localWorkSize, globalWorkSize;
int res = -1;
cl_int clerr=clGetKernelWorkGroupInfo (info.bitonicSortLocal1,info.dev,CL_KERNEL_WORK_GROUP_SIZE,sizeof(size_t),&res,NULL);
if((clerr==CL_SUCCESS)&&(res>0))
info.localSizeLimit=res;
if(arrayLength <= info.localSizeLimit)
{
b3Assert( ( arrayLength) % info.localSizeLimit == 0);
//Launch bitonicSortLocal
ciErrNum = clSetKernelArg(info.bitonicSortLocal, 0, sizeof(cl_mem), (void *)&pKey);
ciErrNum |= clSetKernelArg(info.bitonicSortLocal, 1, sizeof(cl_uint), (void *)&arrayLength);
ciErrNum |= clSetKernelArg(info.bitonicSortLocal, 2, sizeof(cl_uint), (void *)&info.dir);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
localWorkSize = info.localSizeLimit / 2;
globalWorkSize = arrayLength / 2;
ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortLocal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
}
else
{
//Launch bitonicSortLocal1
ciErrNum = clSetKernelArg(info.bitonicSortLocal1, 0, sizeof(cl_mem), (void *)&pKey);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
localWorkSize = info.localSizeLimit / 2;
globalWorkSize = arrayLength / 2;
ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortLocal1, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
for(unsigned int size = 2 * info.localSizeLimit; size <= arrayLength; size <<= 1)
{
for(unsigned stride = size / 2; stride > 0; stride >>= 1)
{
if(stride >= info.localSizeLimit)
{
//Launch bitonicMergeGlobal
ciErrNum = clSetKernelArg(info.bitonicSortMergeGlobal, 0, sizeof(cl_mem), (void *)&pKey);
ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 1, sizeof(cl_uint), (void *)&arrayLength);
ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 2, sizeof(cl_uint), (void *)&size);
ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 3, sizeof(cl_uint), (void *)&stride);
ciErrNum |= clSetKernelArg(info.bitonicSortMergeGlobal, 4, sizeof(cl_uint), (void *)&info.dir);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
localWorkSize = info.localSizeLimit / 4;
globalWorkSize = arrayLength / 2;
ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortMergeGlobal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
}
else
{
//Launch bitonicMergeLocal
ciErrNum = clSetKernelArg(info.bitonicSortMergeLocal, 0, sizeof(cl_mem), (void *)&pKey);
ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 1, sizeof(cl_uint), (void *)&arrayLength);
ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 2, sizeof(cl_uint), (void *)&stride);
ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 3, sizeof(cl_uint), (void *)&size);
ciErrNum |= clSetKernelArg(info.bitonicSortMergeLocal, 4, sizeof(cl_uint), (void *)&info.dir);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
localWorkSize = info.localSizeLimit / 2;
globalWorkSize = arrayLength / 2;
ciErrNum = clEnqueueNDRangeKernel(info.m_cqCommandQue, info.bitonicSortMergeLocal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
break;
}
}
}
}
}

View File

@@ -1,35 +0,0 @@
#ifndef B3_BITONIC_SORT_H
#define B3_BITONIC_SORT_H
#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
struct b3BitonicSortInfo
{
cl_command_queue m_cqCommandQue;
cl_device_id dev;
cl_kernel bitonicSortLocal;
cl_kernel bitonicSortLocal1;
cl_kernel bitonicSortMergeGlobal;
cl_kernel bitonicSortMergeLocal;
unsigned int dir;
unsigned int localSizeLimit;
b3BitonicSortInfo()
{
dev = 0;
m_cqCommandQue = 0;
bitonicSortLocal=0;
bitonicSortLocal1=0;
bitonicSortMergeGlobal=0;
bitonicSortMergeLocal=0;
dir = 1;
localSizeLimit = 1024U;
}
};
void bitonicSortNv(cl_mem pKey, int arrayLength, b3BitonicSortInfo& info);
#endif //B3_BITONIC_SORT_H

View File

@@ -1,196 +0,0 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
///original author: Erwin Coumans
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
#include "Bullet3Common/shared/b3Int2.h"
#include "../btgui/Timing/b3Clock.h"
#include "b3BitonicSort.h"
#include <stdio.h>
int numSuccess=0;
int numFailed=0;
cl_context g_cxMainContext;
cl_command_queue g_cqCommandQue;
#define MSTRINGIFY(A) #A
static const char* kernelSource=
#include "BitonicSort.cl"
static bool compareFunc(const b3Int2& p, const b3Int2& q)
{
return (p.x < q.x) || ((p.x == q.x) && ((p.y < q.y)));
}
int main(int argc, char* argv[])
{
int ciErrNum = 0;
b3Clock clock;
cl_device_type deviceType = CL_DEVICE_TYPE_GPU;//ALL;
const char* vendorSDK = b3OpenCLUtils::getSdkVendorName();
printf("This program was compiled using the %s OpenCL SDK\n",vendorSDK);
int numPlatforms = b3OpenCLUtils::getNumPlatforms();
printf("Num Platforms = %d\n", numPlatforms);
for (int i=0;i<numPlatforms;i++)
{
cl_platform_id platform = b3OpenCLUtils::getPlatform(i);
b3OpenCLPlatformInfo platformInfo;
b3OpenCLUtils::getPlatformInfo(platform,&platformInfo);
printf("--------------------------------\n");
printf("Platform info for platform nr %d:\n",i);
printf(" CL_PLATFORM_VENDOR: \t\t\t%s\n",platformInfo.m_platformVendor);
printf(" CL_PLATFORM_NAME: \t\t\t%s\n",platformInfo.m_platformName);
printf(" CL_PLATFORM_VERSION: \t\t\t%s\n",platformInfo.m_platformVersion);
cl_context context = b3OpenCLUtils::createContextFromPlatform(platform,deviceType,&ciErrNum);
if (context)
{
int numDevices = b3OpenCLUtils::getNumDevices(context);
printf("Num Devices = %d\n", numDevices);
for (int j=0;j<numDevices;j++)
{
cl_device_id dev = b3OpenCLUtils::getDevice(context,j);
b3OpenCLDeviceInfo devInfo;
b3OpenCLUtils::getDeviceInfo(dev,&devInfo);
printf("m_deviceName = %s\n",devInfo.m_deviceName);
//b3OpenCLUtils::printDeviceInfo(dev);
g_cqCommandQue = clCreateCommandQueue(context, dev, 0, &ciErrNum);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
b3BitonicSortInfo info;
info.bitonicSortLocal = b3OpenCLUtils::compileCLKernelFromString(context,dev,kernelSource,"kBitonicSortCellIdLocal",&ciErrNum,0,"");
oclCHECKERROR(ciErrNum, CL_SUCCESS);
info.bitonicSortLocal1 = b3OpenCLUtils::compileCLKernelFromString(context,dev,kernelSource,"kBitonicSortCellIdLocal1",&ciErrNum,0,"");
oclCHECKERROR(ciErrNum, CL_SUCCESS);
info.bitonicSortMergeGlobal = b3OpenCLUtils::compileCLKernelFromString(context,dev,kernelSource,"kBitonicSortCellIdMergeGlobal",&ciErrNum,0,"");
oclCHECKERROR(ciErrNum, CL_SUCCESS);
info.bitonicSortMergeLocal = b3OpenCLUtils::compileCLKernelFromString(context,dev,kernelSource,"kBitonicSortCellIdMergeLocal",&ciErrNum,0,"");
oclCHECKERROR(ciErrNum, CL_SUCCESS);
info.m_cqCommandQue = g_cqCommandQue;
info.dev = dev;
b3OpenCLArray<b3Int2> keyValuesGPU(context,g_cqCommandQue);
b3AlignedObjectArray<b3Int2> keyValuesCPU;
b3AlignedObjectArray<b3Int2> keyValuesGold;
int numValues = 8*1024*1024;//2048;//1024;
keyValuesCPU.resize(numValues);
for (int i=0;i<numValues;i++)
{
b3Int2 v;
v.x = numValues+1-i;
v.y = i*i;
keyValuesCPU[i] = v;
}
keyValuesGPU.copyFromHost(keyValuesCPU);
keyValuesGPU.copyToHost(keyValuesGold);
keyValuesGold.quickSort(compareFunc);
unsigned int batch = 1;
unsigned int arrayLength = keyValuesGPU.size();
for (int i=0;i<10;i++)
{
keyValuesGPU.copyFromHost(keyValuesCPU);
clFinish(info.m_cqCommandQue);
unsigned long pre=clock.getTimeMilliseconds();
bitonicSortNv(keyValuesGPU.getBufferCL(), arrayLength, info);
clFinish(info.m_cqCommandQue);
unsigned long post=clock.getTimeMilliseconds();
printf("GPU sort took %d ms\n",post-pre);
}
keyValuesGPU.copyToHost(keyValuesCPU);
int success=1;
for (int i=0;i<numValues;i++)
{
if (keyValuesCPU[i].x != keyValuesGold[i].x)
success = 0;
if (keyValuesCPU[i].y != keyValuesGold[i].y)
success = 0;
}
if (success)
{
printf("Correct\n");
numSuccess++;
} else
{
printf("Sort Failed\n");
numFailed++;
}
}
clReleaseContext(context);
}
}
///Easier method to initialize OpenCL using createContextFromType for a GPU
deviceType = CL_DEVICE_TYPE_GPU;
void* glCtx=0;
void* glDC = 0;
printf("Initialize OpenCL using b3OpenCLUtils::createContextFromType for CL_DEVICE_TYPE_GPU\n");
g_cxMainContext = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, glCtx, glDC);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
if (g_cxMainContext)
{
int numDev = b3OpenCLUtils::getNumDevices(g_cxMainContext);
for (int i=0;i<numDev;i++)
{
cl_device_id device;
device = b3OpenCLUtils::getDevice(g_cxMainContext,i);
b3OpenCLDeviceInfo clInfo;
b3OpenCLUtils::getDeviceInfo(device,&clInfo);
b3OpenCLUtils::printDeviceInfo(device);
// create a command-queue
g_cqCommandQue = clCreateCommandQueue(g_cxMainContext, device, 0, &ciErrNum);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
//normally you would create and execute kernels using this command queue
clReleaseCommandQueue(g_cqCommandQue);
}
clReleaseContext(g_cxMainContext);
}
else {
printf("No OpenCL capable GPU found!");
}
printf("numSuccess=%d\n",numSuccess);
printf("numFailed=%d\n",numFailed);
printf("press <Enter>\n");
getchar();
return 0;
}

View File

@@ -1,39 +0,0 @@
function createProject(vendor)
hasCL = findOpenCL(vendor)
if (hasCL) then
project ("Test_BitonicSort_" .. vendor)
initOpenCL(vendor)
language "C++"
kind "ConsoleApp"
targetdir "../../../bin"
includedirs {"../../../src"}
files {
"main.cpp",
"b3BitonicSort.cpp",
"../../../src/Bullet3Common/b3AlignedAllocator.cpp",
"../../../src/Bullet3Common/b3AlignedAllocator.h",
"../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.cpp",
"../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.h",
"../../../src/Bullet3Common/b3Logging.cpp",
"../../../src/Bullet3Common/b3Logging.h",
"../../../btgui/Timing/b3Clock.cpp",
"../../../btgui/Timing/b3Clock.h",
}
end
end
createProject("clew")
createProject("Apple")
createProject("AMD")
createProject("Intel")
createProject("NVIDIA")