import opencl_course source for a start
This commit is contained in:
92
opencl/parallel_primitives/host/CommandLineArgs.h
Normal file
92
opencl/parallel_primitives/host/CommandLineArgs.h
Normal file
@@ -0,0 +1,92 @@
|
||||
#ifndef COMMAND_LINE_ARGS_H
|
||||
#define COMMAND_LINE_ARGS_H
|
||||
|
||||
/******************************************************************************
|
||||
* Command-line parsing
|
||||
******************************************************************************/
|
||||
#include <map>
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
#include <cstring>
|
||||
#include <sstream>
|
||||
class CommandLineArgs
|
||||
{
|
||||
protected:
|
||||
|
||||
std::map<std::string, std::string> pairs;
|
||||
|
||||
public:
|
||||
|
||||
// Constructor
|
||||
CommandLineArgs(int argc, char **argv)
|
||||
{
|
||||
using namespace std;
|
||||
|
||||
for (int i = 1; i < argc; i++)
|
||||
{
|
||||
string arg = argv[i];
|
||||
|
||||
if ((arg[0] != '-') || (arg[1] != '-')) {
|
||||
continue;
|
||||
}
|
||||
|
||||
string::size_type pos;
|
||||
string key, val;
|
||||
if ((pos = arg.find( '=')) == string::npos) {
|
||||
key = string(arg, 2, arg.length() - 2);
|
||||
val = "";
|
||||
} else {
|
||||
key = string(arg, 2, pos - 2);
|
||||
val = string(arg, pos + 1, arg.length() - 1);
|
||||
}
|
||||
pairs[key] = val;
|
||||
}
|
||||
}
|
||||
|
||||
bool CheckCmdLineFlag(const char* arg_name)
|
||||
{
|
||||
using namespace std;
|
||||
map<string, string>::iterator itr;
|
||||
if ((itr = pairs.find(arg_name)) != pairs.end()) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void GetCmdLineArgument(const char *arg_name, T &val);
|
||||
|
||||
int ParsedArgc()
|
||||
{
|
||||
return pairs.size();
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
void CommandLineArgs::GetCmdLineArgument(const char *arg_name, T &val)
|
||||
{
|
||||
using namespace std;
|
||||
map<string, string>::iterator itr;
|
||||
if ((itr = pairs.find(arg_name)) != pairs.end()) {
|
||||
istringstream strstream(itr->second);
|
||||
strstream >> val;
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
void CommandLineArgs::GetCmdLineArgument<char*>(const char* arg_name, char* &val)
|
||||
{
|
||||
using namespace std;
|
||||
map<string, string>::iterator itr;
|
||||
if ((itr = pairs.find(arg_name)) != pairs.end()) {
|
||||
|
||||
string s = itr->second;
|
||||
val = (char*) malloc(sizeof(char) * (s.length() + 1));
|
||||
std::strcpy(val, s.c_str());
|
||||
|
||||
} else {
|
||||
val = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
#endif //COMMAND_LINE_ARGS_H
|
||||
181
opencl/parallel_primitives/host/btAlignedAllocator.cpp
Normal file
181
opencl/parallel_primitives/host/btAlignedAllocator.cpp
Normal file
@@ -0,0 +1,181 @@
|
||||
/*
|
||||
Bullet Continuous Collision Detection and Physics Library
|
||||
Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
#include "btAlignedAllocator.h"
|
||||
|
||||
int gNumAlignedAllocs = 0;
|
||||
int gNumAlignedFree = 0;
|
||||
int gTotalBytesAlignedAllocs = 0;//detect memory leaks
|
||||
|
||||
static void *btAllocDefault(size_t size)
|
||||
{
|
||||
return malloc(size);
|
||||
}
|
||||
|
||||
static void btFreeDefault(void *ptr)
|
||||
{
|
||||
free(ptr);
|
||||
}
|
||||
|
||||
static btAllocFunc *sAllocFunc = btAllocDefault;
|
||||
static btFreeFunc *sFreeFunc = btFreeDefault;
|
||||
|
||||
|
||||
|
||||
#if defined (BT_HAS_ALIGNED_ALLOCATOR)
|
||||
#include <malloc.h>
|
||||
static void *btAlignedAllocDefault(size_t size, int alignment)
|
||||
{
|
||||
return _aligned_malloc(size, (size_t)alignment);
|
||||
}
|
||||
|
||||
static void btAlignedFreeDefault(void *ptr)
|
||||
{
|
||||
_aligned_free(ptr);
|
||||
}
|
||||
#elif defined(__CELLOS_LV2__)
|
||||
#include <stdlib.h>
|
||||
|
||||
static inline void *btAlignedAllocDefault(size_t size, int alignment)
|
||||
{
|
||||
return memalign(alignment, size);
|
||||
}
|
||||
|
||||
static inline void btAlignedFreeDefault(void *ptr)
|
||||
{
|
||||
free(ptr);
|
||||
}
|
||||
#else
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
static inline void *btAlignedAllocDefault(size_t size, int alignment)
|
||||
{
|
||||
void *ret;
|
||||
char *real;
|
||||
real = (char *)sAllocFunc(size + sizeof(void *) + (alignment-1));
|
||||
if (real) {
|
||||
ret = btAlignPointer(real + sizeof(void *),alignment);
|
||||
*((void **)(ret)-1) = (void *)(real);
|
||||
} else {
|
||||
ret = (void *)(real);
|
||||
}
|
||||
return (ret);
|
||||
}
|
||||
|
||||
static inline void btAlignedFreeDefault(void *ptr)
|
||||
{
|
||||
void* real;
|
||||
|
||||
if (ptr) {
|
||||
real = *((void **)(ptr)-1);
|
||||
sFreeFunc(real);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
static btAlignedAllocFunc *sAlignedAllocFunc = btAlignedAllocDefault;
|
||||
static btAlignedFreeFunc *sAlignedFreeFunc = btAlignedFreeDefault;
|
||||
|
||||
void btAlignedAllocSetCustomAligned(btAlignedAllocFunc *allocFunc, btAlignedFreeFunc *freeFunc)
|
||||
{
|
||||
sAlignedAllocFunc = allocFunc ? allocFunc : btAlignedAllocDefault;
|
||||
sAlignedFreeFunc = freeFunc ? freeFunc : btAlignedFreeDefault;
|
||||
}
|
||||
|
||||
void btAlignedAllocSetCustom(btAllocFunc *allocFunc, btFreeFunc *freeFunc)
|
||||
{
|
||||
sAllocFunc = allocFunc ? allocFunc : btAllocDefault;
|
||||
sFreeFunc = freeFunc ? freeFunc : btFreeDefault;
|
||||
}
|
||||
|
||||
#ifdef BT_DEBUG_MEMORY_ALLOCATIONS
|
||||
//this generic allocator provides the total allocated number of bytes
|
||||
#include <stdio.h>
|
||||
|
||||
void* btAlignedAllocInternal (size_t size, int alignment,int line,char* filename)
|
||||
{
|
||||
void *ret;
|
||||
char *real;
|
||||
|
||||
gTotalBytesAlignedAllocs += size;
|
||||
gNumAlignedAllocs++;
|
||||
|
||||
|
||||
real = (char *)sAllocFunc(size + 2*sizeof(void *) + (alignment-1));
|
||||
if (real) {
|
||||
ret = (void*) btAlignPointer(real + 2*sizeof(void *), alignment);
|
||||
*((void **)(ret)-1) = (void *)(real);
|
||||
*((int*)(ret)-2) = size;
|
||||
|
||||
} else {
|
||||
ret = (void *)(real);//??
|
||||
}
|
||||
|
||||
printf("allocation#%d at address %x, from %s,line %d, size %d\n",gNumAlignedAllocs,real, filename,line,size);
|
||||
|
||||
int* ptr = (int*)ret;
|
||||
*ptr = 12;
|
||||
return (ret);
|
||||
}
|
||||
|
||||
void btAlignedFreeInternal (void* ptr,int line,char* filename)
|
||||
{
|
||||
|
||||
void* real;
|
||||
gNumAlignedFree++;
|
||||
|
||||
if (ptr) {
|
||||
real = *((void **)(ptr)-1);
|
||||
int size = *((int*)(ptr)-2);
|
||||
gTotalBytesAlignedAllocs -= size;
|
||||
|
||||
printf("free #%d at address %x, from %s,line %d, size %d\n",gNumAlignedFree,real, filename,line,size);
|
||||
|
||||
sFreeFunc(real);
|
||||
} else
|
||||
{
|
||||
printf("NULL ptr\n");
|
||||
}
|
||||
}
|
||||
|
||||
#else //BT_DEBUG_MEMORY_ALLOCATIONS
|
||||
|
||||
void* btAlignedAllocInternal (size_t size, int alignment)
|
||||
{
|
||||
gNumAlignedAllocs++;
|
||||
void* ptr;
|
||||
ptr = sAlignedAllocFunc(size, alignment);
|
||||
// printf("btAlignedAllocInternal %d, %x\n",size,ptr);
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void btAlignedFreeInternal (void* ptr)
|
||||
{
|
||||
if (!ptr)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
gNumAlignedFree++;
|
||||
// printf("btAlignedFreeInternal %x\n",ptr);
|
||||
sAlignedFreeFunc(ptr);
|
||||
}
|
||||
|
||||
#endif //BT_DEBUG_MEMORY_ALLOCATIONS
|
||||
|
||||
107
opencl/parallel_primitives/host/btAlignedAllocator.h
Normal file
107
opencl/parallel_primitives/host/btAlignedAllocator.h
Normal file
@@ -0,0 +1,107 @@
|
||||
/*
|
||||
Bullet Continuous Collision Detection and Physics Library
|
||||
Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
#ifndef BT_ALIGNED_ALLOCATOR
|
||||
#define BT_ALIGNED_ALLOCATOR
|
||||
|
||||
///we probably replace this with our own aligned memory allocator
|
||||
///so we replace _aligned_malloc and _aligned_free with our own
|
||||
///that is better portable and more predictable
|
||||
|
||||
#include "btScalar.h"
|
||||
//#define BT_DEBUG_MEMORY_ALLOCATIONS 1
|
||||
#ifdef BT_DEBUG_MEMORY_ALLOCATIONS
|
||||
|
||||
#define btAlignedAlloc(a,b) \
|
||||
btAlignedAllocInternal(a,b,__LINE__,__FILE__)
|
||||
|
||||
#define btAlignedFree(ptr) \
|
||||
btAlignedFreeInternal(ptr,__LINE__,__FILE__)
|
||||
|
||||
void* btAlignedAllocInternal (size_t size, int alignment,int line,char* filename);
|
||||
|
||||
void btAlignedFreeInternal (void* ptr,int line,char* filename);
|
||||
|
||||
#else
|
||||
void* btAlignedAllocInternal (size_t size, int alignment);
|
||||
void btAlignedFreeInternal (void* ptr);
|
||||
|
||||
#define btAlignedAlloc(size,alignment) btAlignedAllocInternal(size,alignment)
|
||||
#define btAlignedFree(ptr) btAlignedFreeInternal(ptr)
|
||||
|
||||
#endif
|
||||
typedef int size_type;
|
||||
|
||||
typedef void *(btAlignedAllocFunc)(size_t size, int alignment);
|
||||
typedef void (btAlignedFreeFunc)(void *memblock);
|
||||
typedef void *(btAllocFunc)(size_t size);
|
||||
typedef void (btFreeFunc)(void *memblock);
|
||||
|
||||
///The developer can let all Bullet memory allocations go through a custom memory allocator, using btAlignedAllocSetCustom
|
||||
void btAlignedAllocSetCustom(btAllocFunc *allocFunc, btFreeFunc *freeFunc);
|
||||
///If the developer has already an custom aligned allocator, then btAlignedAllocSetCustomAligned can be used. The default aligned allocator pre-allocates extra memory using the non-aligned allocator, and instruments it.
|
||||
void btAlignedAllocSetCustomAligned(btAlignedAllocFunc *allocFunc, btAlignedFreeFunc *freeFunc);
|
||||
|
||||
|
||||
///The btAlignedAllocator is a portable class for aligned memory allocations.
|
||||
///Default implementations for unaligned and aligned allocations can be overridden by a custom allocator using btAlignedAllocSetCustom and btAlignedAllocSetCustomAligned.
|
||||
template < typename T , unsigned Alignment >
|
||||
class btAlignedAllocator {
|
||||
|
||||
typedef btAlignedAllocator< T , Alignment > self_type;
|
||||
|
||||
public:
|
||||
|
||||
//just going down a list:
|
||||
btAlignedAllocator() {}
|
||||
/*
|
||||
btAlignedAllocator( const self_type & ) {}
|
||||
*/
|
||||
|
||||
template < typename Other >
|
||||
btAlignedAllocator( const btAlignedAllocator< Other , Alignment > & ) {}
|
||||
|
||||
typedef const T* const_pointer;
|
||||
typedef const T& const_reference;
|
||||
typedef T* pointer;
|
||||
typedef T& reference;
|
||||
typedef T value_type;
|
||||
|
||||
pointer address ( reference ref ) const { return &ref; }
|
||||
const_pointer address ( const_reference ref ) const { return &ref; }
|
||||
pointer allocate ( size_type n , const_pointer * hint = 0 ) {
|
||||
(void)hint;
|
||||
return reinterpret_cast< pointer >(btAlignedAlloc( sizeof(value_type) * n , Alignment ));
|
||||
}
|
||||
void construct ( pointer ptr , const value_type & value ) { new (ptr) value_type( value ); }
|
||||
void deallocate( pointer ptr ) {
|
||||
btAlignedFree( reinterpret_cast< void * >( ptr ) );
|
||||
}
|
||||
void destroy ( pointer ptr ) { ptr->~value_type(); }
|
||||
|
||||
|
||||
template < typename O > struct rebind {
|
||||
typedef btAlignedAllocator< O , Alignment > other;
|
||||
};
|
||||
template < typename O >
|
||||
self_type & operator=( const btAlignedAllocator< O , Alignment > & ) { return *this; }
|
||||
|
||||
friend bool operator==( const self_type & , const self_type & ) { return true; }
|
||||
};
|
||||
|
||||
|
||||
|
||||
#endif //BT_ALIGNED_ALLOCATOR
|
||||
|
||||
511
opencl/parallel_primitives/host/btAlignedObjectArray.h
Normal file
511
opencl/parallel_primitives/host/btAlignedObjectArray.h
Normal file
@@ -0,0 +1,511 @@
|
||||
/*
|
||||
Bullet Continuous Collision Detection and Physics Library
|
||||
Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef BT_OBJECT_ARRAY__
|
||||
#define BT_OBJECT_ARRAY__
|
||||
|
||||
#include "btScalar.h" // has definitions like SIMD_FORCE_INLINE
|
||||
#include "btAlignedAllocator.h"
|
||||
|
||||
///If the platform doesn't support placement new, you can disable BT_USE_PLACEMENT_NEW
|
||||
///then the btAlignedObjectArray doesn't support objects with virtual methods, and non-trivial constructors/destructors
|
||||
///You can enable BT_USE_MEMCPY, then swapping elements in the array will use memcpy instead of operator=
|
||||
///see discussion here: http://continuousphysics.com/Bullet/phpBB2/viewtopic.php?t=1231 and
|
||||
///http://www.continuousphysics.com/Bullet/phpBB2/viewtopic.php?t=1240
|
||||
|
||||
#define BT_USE_PLACEMENT_NEW 1
|
||||
//#define BT_USE_MEMCPY 1 //disable, because it is cumbersome to find out for each platform where memcpy is defined. It can be in <memory.h> or <string.h> or otherwise...
|
||||
#define BT_ALLOW_ARRAY_COPY_OPERATOR // enabling this can accidently perform deep copies of data if you are not careful
|
||||
|
||||
#ifdef BT_USE_MEMCPY
|
||||
#include <memory.h>
|
||||
#include <string.h>
|
||||
#endif //BT_USE_MEMCPY
|
||||
|
||||
#ifdef BT_USE_PLACEMENT_NEW
|
||||
#include <new> //for placement new
|
||||
#endif //BT_USE_PLACEMENT_NEW
|
||||
|
||||
|
||||
///The btAlignedObjectArray template class uses a subset of the stl::vector interface for its methods
|
||||
///It is developed to replace stl::vector to avoid portability issues, including STL alignment issues to add SIMD/SSE data
|
||||
template <typename T>
|
||||
//template <class T>
|
||||
class btAlignedObjectArray
|
||||
{
|
||||
btAlignedAllocator<T , 16> m_allocator;
|
||||
|
||||
int m_size;
|
||||
int m_capacity;
|
||||
T* m_data;
|
||||
//PCK: added this line
|
||||
bool m_ownsMemory;
|
||||
|
||||
#ifdef BT_ALLOW_ARRAY_COPY_OPERATOR
|
||||
public:
|
||||
SIMD_FORCE_INLINE btAlignedObjectArray<T>& operator=(const btAlignedObjectArray<T> &other)
|
||||
{
|
||||
copyFromArray(other);
|
||||
return *this;
|
||||
}
|
||||
#else//BT_ALLOW_ARRAY_COPY_OPERATOR
|
||||
private:
|
||||
SIMD_FORCE_INLINE btAlignedObjectArray<T>& operator=(const btAlignedObjectArray<T> &other);
|
||||
#endif//BT_ALLOW_ARRAY_COPY_OPERATOR
|
||||
|
||||
protected:
|
||||
SIMD_FORCE_INLINE int allocSize(int size)
|
||||
{
|
||||
return (size ? size*2 : 1);
|
||||
}
|
||||
SIMD_FORCE_INLINE void copy(int start,int end, T* dest) const
|
||||
{
|
||||
int i;
|
||||
for (i=start;i<end;++i)
|
||||
#ifdef BT_USE_PLACEMENT_NEW
|
||||
new (&dest[i]) T(m_data[i]);
|
||||
#else
|
||||
dest[i] = m_data[i];
|
||||
#endif //BT_USE_PLACEMENT_NEW
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE void init()
|
||||
{
|
||||
//PCK: added this line
|
||||
m_ownsMemory = true;
|
||||
m_data = 0;
|
||||
m_size = 0;
|
||||
m_capacity = 0;
|
||||
}
|
||||
SIMD_FORCE_INLINE void destroy(int first,int last)
|
||||
{
|
||||
int i;
|
||||
for (i=first; i<last;i++)
|
||||
{
|
||||
m_data[i].~T();
|
||||
}
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE void* allocate(int size)
|
||||
{
|
||||
if (size)
|
||||
return m_allocator.allocate(size);
|
||||
return 0;
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE void deallocate()
|
||||
{
|
||||
if(m_data) {
|
||||
//PCK: enclosed the deallocation in this block
|
||||
if (m_ownsMemory)
|
||||
{
|
||||
m_allocator.deallocate(m_data);
|
||||
}
|
||||
m_data = 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
public:
|
||||
|
||||
btAlignedObjectArray()
|
||||
{
|
||||
init();
|
||||
}
|
||||
|
||||
~btAlignedObjectArray()
|
||||
{
|
||||
clear();
|
||||
}
|
||||
|
||||
///Generally it is best to avoid using the copy constructor of an btAlignedObjectArray, and use a (const) reference to the array instead.
|
||||
btAlignedObjectArray(const btAlignedObjectArray& otherArray)
|
||||
{
|
||||
init();
|
||||
|
||||
int otherSize = otherArray.size();
|
||||
resize (otherSize);
|
||||
otherArray.copy(0, otherSize, m_data);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// return the number of elements in the array
|
||||
SIMD_FORCE_INLINE int size() const
|
||||
{
|
||||
return m_size;
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE const T& at(int n) const
|
||||
{
|
||||
btAssert(n>=0);
|
||||
btAssert(n<size());
|
||||
return m_data[n];
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE T& at(int n)
|
||||
{
|
||||
btAssert(n>=0);
|
||||
btAssert(n<size());
|
||||
return m_data[n];
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE const T& operator[](int n) const
|
||||
{
|
||||
btAssert(n>=0);
|
||||
btAssert(n<size());
|
||||
return m_data[n];
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE T& operator[](int n)
|
||||
{
|
||||
btAssert(n>=0);
|
||||
btAssert(n<size());
|
||||
return m_data[n];
|
||||
}
|
||||
|
||||
|
||||
///clear the array, deallocated memory. Generally it is better to use array.resize(0), to reduce performance overhead of run-time memory (de)allocations.
|
||||
SIMD_FORCE_INLINE void clear()
|
||||
{
|
||||
destroy(0,size());
|
||||
|
||||
deallocate();
|
||||
|
||||
init();
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE void pop_back()
|
||||
{
|
||||
btAssert(m_size>0);
|
||||
m_size--;
|
||||
m_data[m_size].~T();
|
||||
}
|
||||
|
||||
|
||||
///resize changes the number of elements in the array. If the new size is larger, the new elements will be constructed using the optional second argument.
|
||||
///when the new number of elements is smaller, the destructor will be called, but memory will not be freed, to reduce performance overhead of run-time memory (de)allocations.
|
||||
SIMD_FORCE_INLINE void resizeNoInitialize(int newsize)
|
||||
{
|
||||
int curSize = size();
|
||||
|
||||
if (newsize < curSize)
|
||||
{
|
||||
} else
|
||||
{
|
||||
if (newsize > size())
|
||||
{
|
||||
reserve(newsize);
|
||||
}
|
||||
//leave this uninitialized
|
||||
}
|
||||
m_size = newsize;
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE void resize(int newsize, const T& fillData=T())
|
||||
{
|
||||
int curSize = size();
|
||||
|
||||
if (newsize < curSize)
|
||||
{
|
||||
for(int i = newsize; i < curSize; i++)
|
||||
{
|
||||
m_data[i].~T();
|
||||
}
|
||||
} else
|
||||
{
|
||||
if (newsize > size())
|
||||
{
|
||||
reserve(newsize);
|
||||
}
|
||||
#ifdef BT_USE_PLACEMENT_NEW
|
||||
for (int i=curSize;i<newsize;i++)
|
||||
{
|
||||
new ( &m_data[i]) T(fillData);
|
||||
}
|
||||
#endif //BT_USE_PLACEMENT_NEW
|
||||
|
||||
}
|
||||
|
||||
m_size = newsize;
|
||||
}
|
||||
SIMD_FORCE_INLINE T& expandNonInitializing( )
|
||||
{
|
||||
int sz = size();
|
||||
if( sz == capacity() )
|
||||
{
|
||||
reserve( allocSize(size()) );
|
||||
}
|
||||
m_size++;
|
||||
|
||||
return m_data[sz];
|
||||
}
|
||||
|
||||
|
||||
SIMD_FORCE_INLINE T& expand( const T& fillValue=T())
|
||||
{
|
||||
int sz = size();
|
||||
if( sz == capacity() )
|
||||
{
|
||||
reserve( allocSize(size()) );
|
||||
}
|
||||
m_size++;
|
||||
#ifdef BT_USE_PLACEMENT_NEW
|
||||
new (&m_data[sz]) T(fillValue); //use the in-place new (not really allocating heap memory)
|
||||
#endif
|
||||
|
||||
return m_data[sz];
|
||||
}
|
||||
|
||||
|
||||
SIMD_FORCE_INLINE void push_back(const T& _Val)
|
||||
{
|
||||
int sz = size();
|
||||
if( sz == capacity() )
|
||||
{
|
||||
reserve( allocSize(size()) );
|
||||
}
|
||||
|
||||
#ifdef BT_USE_PLACEMENT_NEW
|
||||
new ( &m_data[m_size] ) T(_Val);
|
||||
#else
|
||||
m_data[size()] = _Val;
|
||||
#endif //BT_USE_PLACEMENT_NEW
|
||||
|
||||
m_size++;
|
||||
}
|
||||
|
||||
|
||||
/// return the pre-allocated (reserved) elements, this is at least as large as the total number of elements,see size() and reserve()
|
||||
SIMD_FORCE_INLINE int capacity() const
|
||||
{
|
||||
return m_capacity;
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE void reserve(int _Count)
|
||||
{ // determine new minimum length of allocated storage
|
||||
if (capacity() < _Count)
|
||||
{ // not enough room, reallocate
|
||||
T* s = (T*)allocate(_Count);
|
||||
|
||||
copy(0, size(), s);
|
||||
|
||||
destroy(0,size());
|
||||
|
||||
deallocate();
|
||||
|
||||
//PCK: added this line
|
||||
m_ownsMemory = true;
|
||||
|
||||
m_data = s;
|
||||
|
||||
m_capacity = _Count;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
class less
|
||||
{
|
||||
public:
|
||||
|
||||
bool operator() ( const T& a, const T& b )
|
||||
{
|
||||
return ( a < b );
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template <typename L>
|
||||
void quickSortInternal(const L& CompareFunc,int lo, int hi)
|
||||
{
|
||||
// lo is the lower index, hi is the upper index
|
||||
// of the region of array a that is to be sorted
|
||||
int i=lo, j=hi;
|
||||
T x=m_data[(lo+hi)/2];
|
||||
|
||||
// partition
|
||||
do
|
||||
{
|
||||
while (CompareFunc(m_data[i],x))
|
||||
i++;
|
||||
while (CompareFunc(x,m_data[j]))
|
||||
j--;
|
||||
if (i<=j)
|
||||
{
|
||||
swap(i,j);
|
||||
i++; j--;
|
||||
}
|
||||
} while (i<=j);
|
||||
|
||||
// recursion
|
||||
if (lo<j)
|
||||
quickSortInternal( CompareFunc, lo, j);
|
||||
if (i<hi)
|
||||
quickSortInternal( CompareFunc, i, hi);
|
||||
}
|
||||
|
||||
|
||||
template <typename L>
|
||||
void quickSort(const L& CompareFunc)
|
||||
{
|
||||
//don't sort 0 or 1 elements
|
||||
if (size()>1)
|
||||
{
|
||||
quickSortInternal(CompareFunc,0,size()-1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
///heap sort from http://www.csse.monash.edu.au/~lloyd/tildeAlgDS/Sort/Heap/
|
||||
template <typename L>
|
||||
void downHeap(T *pArr, int k, int n, const L& CompareFunc)
|
||||
{
|
||||
/* PRE: a[k+1..N] is a heap */
|
||||
/* POST: a[k..N] is a heap */
|
||||
|
||||
T temp = pArr[k - 1];
|
||||
/* k has child(s) */
|
||||
while (k <= n/2)
|
||||
{
|
||||
int child = 2*k;
|
||||
|
||||
if ((child < n) && CompareFunc(pArr[child - 1] , pArr[child]))
|
||||
{
|
||||
child++;
|
||||
}
|
||||
/* pick larger child */
|
||||
if (CompareFunc(temp , pArr[child - 1]))
|
||||
{
|
||||
/* move child up */
|
||||
pArr[k - 1] = pArr[child - 1];
|
||||
k = child;
|
||||
}
|
||||
else
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
pArr[k - 1] = temp;
|
||||
} /*downHeap*/
|
||||
|
||||
void swap(int index0,int index1)
|
||||
{
|
||||
#ifdef BT_USE_MEMCPY
|
||||
char temp[sizeof(T)];
|
||||
memcpy(temp,&m_data[index0],sizeof(T));
|
||||
memcpy(&m_data[index0],&m_data[index1],sizeof(T));
|
||||
memcpy(&m_data[index1],temp,sizeof(T));
|
||||
#else
|
||||
T temp = m_data[index0];
|
||||
m_data[index0] = m_data[index1];
|
||||
m_data[index1] = temp;
|
||||
#endif //BT_USE_PLACEMENT_NEW
|
||||
|
||||
}
|
||||
|
||||
template <typename L>
|
||||
void heapSort(const L& CompareFunc)
|
||||
{
|
||||
/* sort a[0..N-1], N.B. 0 to N-1 */
|
||||
int k;
|
||||
int n = m_size;
|
||||
for (k = n/2; k > 0; k--)
|
||||
{
|
||||
downHeap(m_data, k, n, CompareFunc);
|
||||
}
|
||||
|
||||
/* a[1..N] is now a heap */
|
||||
while ( n>=1 )
|
||||
{
|
||||
swap(0,n-1); /* largest of a[0..n-1] */
|
||||
|
||||
|
||||
n = n - 1;
|
||||
/* restore a[1..i-1] heap */
|
||||
downHeap(m_data, 1, n, CompareFunc);
|
||||
}
|
||||
}
|
||||
|
||||
///non-recursive binary search, assumes sorted array
|
||||
int findBinarySearch(const T& key) const
|
||||
{
|
||||
int first = 0;
|
||||
int last = size()-1;
|
||||
|
||||
//assume sorted array
|
||||
while (first <= last) {
|
||||
int mid = (first + last) / 2; // compute mid point.
|
||||
if (key > m_data[mid])
|
||||
first = mid + 1; // repeat search in top half.
|
||||
else if (key < m_data[mid])
|
||||
last = mid - 1; // repeat search in bottom half.
|
||||
else
|
||||
return mid; // found it. return position /////
|
||||
}
|
||||
return size(); // failed to find key
|
||||
}
|
||||
|
||||
|
||||
int findLinearSearch(const T& key) const
|
||||
{
|
||||
int index=size();
|
||||
int i;
|
||||
|
||||
for (i=0;i<size();i++)
|
||||
{
|
||||
if (m_data[i] == key)
|
||||
{
|
||||
index = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return index;
|
||||
}
|
||||
|
||||
void remove(const T& key)
|
||||
{
|
||||
|
||||
int findIndex = findLinearSearch(key);
|
||||
if (findIndex<size())
|
||||
{
|
||||
swap( findIndex,size()-1);
|
||||
pop_back();
|
||||
}
|
||||
}
|
||||
|
||||
//PCK: whole function
|
||||
void initializeFromBuffer(void *buffer, int size, int capacity)
|
||||
{
|
||||
clear();
|
||||
m_ownsMemory = false;
|
||||
m_data = (T*)buffer;
|
||||
m_size = size;
|
||||
m_capacity = capacity;
|
||||
}
|
||||
|
||||
void copyFromArray(const btAlignedObjectArray& otherArray)
|
||||
{
|
||||
int otherSize = otherArray.size();
|
||||
resize (otherSize);
|
||||
otherArray.copy(0, otherSize, m_data);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
#endif //BT_OBJECT_ARRAY__
|
||||
213
opencl/parallel_primitives/host/btBoundSearchCL.cpp
Normal file
213
opencl/parallel_primitives/host/btBoundSearchCL.cpp
Normal file
@@ -0,0 +1,213 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
//Host-code rewritten by Erwin Coumans
|
||||
|
||||
#define BOUNDSEARCH_PATH "opencl/parallel_primitives/kernels/BoundSearchKernels.cl"
|
||||
#define KERNEL0 "SearchSortDataLowerKernel"
|
||||
#define KERNEL1 "SearchSortDataUpperKernel"
|
||||
#define KERNEL2 "SubtractKernel"
|
||||
|
||||
|
||||
#include "btBoundSearchCL.h"
|
||||
#include "../../basic_initialize/btOpenCLUtils.h"
|
||||
#include "btLauncherCL.h"
|
||||
#include "../kernels/BoundSearchKernelsCL.h"
|
||||
|
||||
btBoundSearchCL::btBoundSearchCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int maxSize)
|
||||
:m_context(ctx),
|
||||
m_device(device),
|
||||
m_queue(queue)
|
||||
{
|
||||
|
||||
const char* additionalMacros = "";
|
||||
const char* srcFileNameForCaching="";
|
||||
|
||||
cl_int pErrNum;
|
||||
const char* kernelSource = boundSearchKernelsCL;
|
||||
|
||||
cl_program boundSearchProg = btOpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, BOUNDSEARCH_PATH);
|
||||
btAssert(boundSearchProg);
|
||||
|
||||
m_lowerSortDataKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SearchSortDataLowerKernel", &pErrNum, boundSearchProg,additionalMacros );
|
||||
btAssert(m_lowerSortDataKernel );
|
||||
|
||||
m_upperSortDataKernel= btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SearchSortDataUpperKernel", &pErrNum, boundSearchProg,additionalMacros );
|
||||
btAssert(m_upperSortDataKernel);
|
||||
|
||||
m_subtractKernel = 0;
|
||||
|
||||
if( maxSize )
|
||||
{
|
||||
m_subtractKernel= btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SubtractKernel", &pErrNum, boundSearchProg,additionalMacros );
|
||||
btAssert(m_subtractKernel);
|
||||
}
|
||||
|
||||
//m_constBuffer = new btOpenCLArray<btInt4>( device, 1, BufferBase::BUFFER_CONST );
|
||||
|
||||
m_lower = (maxSize == 0)? 0: new btOpenCLArray<unsigned int>(ctx,queue,maxSize );
|
||||
m_upper = (maxSize == 0)? 0: new btOpenCLArray<unsigned int>(ctx,queue, maxSize );
|
||||
|
||||
m_filler = new btFillCL(ctx,device,queue);
|
||||
}
|
||||
|
||||
btBoundSearchCL::~btBoundSearchCL()
|
||||
{
|
||||
|
||||
delete m_lower;
|
||||
delete m_upper;
|
||||
delete m_filler;
|
||||
|
||||
clReleaseKernel(m_lowerSortDataKernel);
|
||||
clReleaseKernel(m_upperSortDataKernel);
|
||||
clReleaseKernel(m_subtractKernel);
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
void btBoundSearchCL::execute(btOpenCLArray<btSortData>& src, int nSrc, btOpenCLArray<unsigned int>& dst, int nDst, Option option )
|
||||
{
|
||||
btInt4 constBuffer;
|
||||
constBuffer.x = nSrc;
|
||||
constBuffer.y = nDst;
|
||||
|
||||
if( option == BOUND_LOWER )
|
||||
{
|
||||
btBufferInfoCL bInfo[] = { btBufferInfoCL( src.getBufferCL(), true ), btBufferInfoCL( dst.getBufferCL()) };
|
||||
|
||||
btLauncherCL launcher( m_queue, m_lowerSortDataKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
|
||||
launcher.setConst( nSrc );
|
||||
launcher.setConst( nDst );
|
||||
|
||||
launcher.launch1D( nSrc, 64 );
|
||||
}
|
||||
else if( option == BOUND_UPPER )
|
||||
{
|
||||
btBufferInfoCL bInfo[] = { btBufferInfoCL( src.getBufferCL(), true ), btBufferInfoCL( dst.getBufferCL() ) };
|
||||
|
||||
btLauncherCL launcher(m_queue, m_upperSortDataKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
|
||||
launcher.setConst( nSrc );
|
||||
launcher.setConst( nDst );
|
||||
|
||||
launcher.launch1D( nSrc, 64 );
|
||||
}
|
||||
else if( option == COUNT )
|
||||
{
|
||||
btAssert( m_lower );
|
||||
btAssert( m_upper );
|
||||
btAssert( m_lower->capacity() <= (int)nDst );
|
||||
btAssert( m_upper->capacity() <= (int)nDst );
|
||||
|
||||
int zero = 0;
|
||||
m_filler->execute( *m_lower, zero, nDst );
|
||||
m_filler->execute( *m_upper, zero, nDst );
|
||||
|
||||
execute( src, nSrc, *m_lower, nDst, BOUND_LOWER );
|
||||
execute( src, nSrc, *m_upper, nDst, BOUND_UPPER );
|
||||
|
||||
{
|
||||
btBufferInfoCL bInfo[] = { btBufferInfoCL( m_upper->getBufferCL(), true ), btBufferInfoCL( m_lower->getBufferCL(), true ), btBufferInfoCL( dst.getBufferCL() ) };
|
||||
|
||||
btLauncherCL launcher( m_queue, m_subtractKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
|
||||
launcher.setConst( nSrc );
|
||||
launcher.setConst( nDst );
|
||||
|
||||
launcher.launch1D( nDst, 64 );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
btAssert( 0 );
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
void btBoundSearchCL::executeHost( btAlignedObjectArray<btSortData>& src, int nSrc,
|
||||
btAlignedObjectArray<unsigned int>& dst, int nDst, Option option )
|
||||
{
|
||||
|
||||
|
||||
for(int i=0; i<nSrc-1; i++)
|
||||
btAssert( src[i].m_key <= src[i+1].m_key );
|
||||
|
||||
btSortData minData,zeroData,maxData;
|
||||
minData.m_key = -1;
|
||||
minData.m_value = -1;
|
||||
zeroData.m_key=0;
|
||||
zeroData.m_value=0;
|
||||
maxData.m_key = nDst;
|
||||
maxData.m_value = nDst;
|
||||
|
||||
if( option == BOUND_LOWER )
|
||||
{
|
||||
for(int i=0; i<nSrc; i++)
|
||||
{
|
||||
btSortData& iData = (i==0)? minData: src[i-1];
|
||||
btSortData& jData = (i==nSrc)? maxData: src[i];
|
||||
|
||||
if( iData.m_key != jData.m_key )
|
||||
{
|
||||
int k = jData.m_key;
|
||||
{
|
||||
dst[k] = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else if( option == BOUND_UPPER )
|
||||
{
|
||||
for(int i=1; i<nSrc+1; i++)
|
||||
{
|
||||
btSortData& iData = src[i-1];
|
||||
btSortData& jData = (i==nSrc)? maxData: src[i];
|
||||
|
||||
if( iData.m_key != jData.m_key )
|
||||
{
|
||||
int k = iData.m_key;
|
||||
{
|
||||
dst[k] = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else if( option == COUNT )
|
||||
{
|
||||
btAlignedObjectArray<unsigned int> lower;
|
||||
lower.resize(nDst );
|
||||
btAlignedObjectArray<unsigned int> upper;
|
||||
upper.resize(nDst );
|
||||
|
||||
for(int i=0; i<nDst; i++)
|
||||
{
|
||||
lower[i] = upper[i] = 0;
|
||||
}
|
||||
|
||||
executeHost( src, nSrc, lower, nDst, BOUND_LOWER );
|
||||
executeHost( src, nSrc, upper, nDst, BOUND_UPPER );
|
||||
|
||||
for( int i=0; i<nDst; i++)
|
||||
{
|
||||
dst[i] = upper[i] - lower[i];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
btAssert( 0 );
|
||||
}
|
||||
}
|
||||
67
opencl/parallel_primitives/host/btBoundSearchCL.h
Normal file
67
opencl/parallel_primitives/host/btBoundSearchCL.h
Normal file
@@ -0,0 +1,67 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Takahiro Harada
|
||||
|
||||
#ifndef BT_BOUNDSEARCH_H
|
||||
#define BT_BOUNDSEARCH_H
|
||||
|
||||
#pragma once
|
||||
|
||||
/*#include <Adl/Adl.h>
|
||||
#include <AdlPrimitives/Math/Math.h>
|
||||
#include <AdlPrimitives/Sort/SortData.h>
|
||||
#include <AdlPrimitives/Fill/Fill.h>
|
||||
*/
|
||||
|
||||
#include "btOpenCLArray.h"
|
||||
#include "btFillCL.h"
|
||||
#include "btRadixSort32CL.h" //for btSortData (perhaps move it?)
|
||||
class btBoundSearchCL
|
||||
{
|
||||
public:
|
||||
|
||||
enum Option
|
||||
{
|
||||
BOUND_LOWER,
|
||||
BOUND_UPPER,
|
||||
COUNT,
|
||||
};
|
||||
|
||||
cl_context m_context;
|
||||
cl_device_id m_device;
|
||||
cl_command_queue m_queue;
|
||||
|
||||
|
||||
cl_kernel m_lowerSortDataKernel;
|
||||
cl_kernel m_upperSortDataKernel;
|
||||
cl_kernel m_subtractKernel;
|
||||
|
||||
btOpenCLArray<btInt4>* m_constbtOpenCLArray;
|
||||
btOpenCLArray<unsigned int>* m_lower;
|
||||
btOpenCLArray<unsigned int>* m_upper;
|
||||
|
||||
btFillCL* m_filler;
|
||||
|
||||
btBoundSearchCL(cl_context context, cl_device_id device, cl_command_queue queue, int size);
|
||||
|
||||
virtual ~btBoundSearchCL();
|
||||
|
||||
// src has to be src[i].m_key <= src[i+1].m_key
|
||||
void execute( btOpenCLArray<btSortData>& src, int nSrc, btOpenCLArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER );
|
||||
|
||||
void executeHost( btAlignedObjectArray<btSortData>& src, int nSrc, btAlignedObjectArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER);
|
||||
};
|
||||
|
||||
|
||||
#endif //BT_BOUNDSEARCH_H
|
||||
19
opencl/parallel_primitives/host/btBufferInfoCL.h
Normal file
19
opencl/parallel_primitives/host/btBufferInfoCL.h
Normal file
@@ -0,0 +1,19 @@
|
||||
|
||||
#ifndef BT_BUFFER_INFO_CL_H
|
||||
#define BT_BUFFER_INFO_CL_H
|
||||
|
||||
#include "btOpenCLArray.h"
|
||||
|
||||
|
||||
struct btBufferInfoCL
|
||||
{
|
||||
//btBufferInfoCL(){}
|
||||
|
||||
// template<typename T>
|
||||
btBufferInfoCL(cl_mem buff, bool isReadOnly = false): m_clBuffer(buff), m_isReadOnly(isReadOnly){}
|
||||
|
||||
cl_mem m_clBuffer;
|
||||
bool m_isReadOnly;
|
||||
};
|
||||
|
||||
#endif //BT_BUFFER_INFO_CL_H
|
||||
126
opencl/parallel_primitives/host/btFillCL.cpp
Normal file
126
opencl/parallel_primitives/host/btFillCL.cpp
Normal file
@@ -0,0 +1,126 @@
|
||||
#include "btFillCL.h"
|
||||
#include "../../basic_initialize/btOpenCLUtils.h"
|
||||
#include "btBufferInfoCL.h"
|
||||
#include "btLauncherCL.h"
|
||||
|
||||
#define FILL_CL_PROGRAM_PATH "opencl/parallel_primitives/kernels/FillKernels.cl"
|
||||
|
||||
#include "../kernels/FillKernelsCL.h"
|
||||
|
||||
btFillCL::btFillCL(cl_context ctx, cl_device_id device, cl_command_queue queue)
|
||||
:m_commandQueue(queue)
|
||||
{
|
||||
const char* kernelSource = fillKernelsCL;
|
||||
cl_int pErrNum;
|
||||
const char* additionalMacros = "";
|
||||
|
||||
cl_program fillProg = btOpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, FILL_CL_PROGRAM_PATH);
|
||||
btAssert(fillProg);
|
||||
|
||||
m_fillIntKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillIntKernel", &pErrNum, fillProg,additionalMacros );
|
||||
btAssert(m_fillIntKernel);
|
||||
|
||||
m_fillUnsignedIntKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillUnsignedIntKernel", &pErrNum, fillProg,additionalMacros );
|
||||
btAssert(m_fillIntKernel);
|
||||
|
||||
m_fillFloatKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillFloatKernel", &pErrNum, fillProg,additionalMacros );
|
||||
btAssert(m_fillFloatKernel);
|
||||
|
||||
|
||||
|
||||
m_fillKernelInt2 = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillInt2Kernel", &pErrNum, fillProg,additionalMacros );
|
||||
btAssert(m_fillKernelInt2);
|
||||
|
||||
}
|
||||
|
||||
btFillCL::~btFillCL()
|
||||
{
|
||||
clReleaseKernel(m_fillKernelInt2);
|
||||
clReleaseKernel(m_fillIntKernel);
|
||||
clReleaseKernel(m_fillUnsignedIntKernel);
|
||||
clReleaseKernel(m_fillFloatKernel);
|
||||
|
||||
}
|
||||
|
||||
void btFillCL::execute(btOpenCLArray<float>& src, const float value, int n, int offset)
|
||||
{
|
||||
btAssert( n>0 );
|
||||
|
||||
{
|
||||
btLauncherCL launcher( m_commandQueue, m_fillFloatKernel );
|
||||
launcher.setBuffer( src.getBufferCL());
|
||||
launcher.setConst( n );
|
||||
launcher.setConst( value );
|
||||
launcher.setConst( offset);
|
||||
|
||||
launcher.launch1D( n );
|
||||
}
|
||||
}
|
||||
|
||||
void btFillCL::execute(btOpenCLArray<int>& src, const int value, int n, int offset)
|
||||
{
|
||||
btAssert( n>0 );
|
||||
|
||||
|
||||
{
|
||||
btLauncherCL launcher( m_commandQueue, m_fillIntKernel );
|
||||
launcher.setBuffer(src.getBufferCL());
|
||||
launcher.setConst( n);
|
||||
launcher.setConst( value);
|
||||
launcher.setConst( offset);
|
||||
launcher.launch1D( n );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void btFillCL::execute(btOpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset)
|
||||
{
|
||||
btAssert( n>0 );
|
||||
|
||||
{
|
||||
btBufferInfoCL bInfo[] = { btBufferInfoCL( src.getBufferCL() ) };
|
||||
|
||||
btLauncherCL launcher( m_commandQueue, m_fillUnsignedIntKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
|
||||
launcher.setConst( n );
|
||||
launcher.setConst(value);
|
||||
launcher.setConst(offset);
|
||||
|
||||
launcher.launch1D( n );
|
||||
}
|
||||
}
|
||||
|
||||
void btFillCL::executeHost(btAlignedObjectArray<btInt2> &src, const btInt2 &value, int n, int offset)
|
||||
{
|
||||
for (int i=0;i<n;i++)
|
||||
{
|
||||
src[i+offset]=value;
|
||||
}
|
||||
}
|
||||
|
||||
void btFillCL::executeHost(btAlignedObjectArray<int> &src, const int value, int n, int offset)
|
||||
{
|
||||
for (int i=0;i<n;i++)
|
||||
{
|
||||
src[i+offset]=value;
|
||||
}
|
||||
}
|
||||
|
||||
void btFillCL::execute(btOpenCLArray<btInt2> &src, const btInt2 &value, int n, int offset)
|
||||
{
|
||||
btAssert( n>0 );
|
||||
|
||||
|
||||
{
|
||||
btBufferInfoCL bInfo[] = { btBufferInfoCL( src.getBufferCL() ) };
|
||||
|
||||
btLauncherCL launcher(m_commandQueue, m_fillKernelInt2);
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
|
||||
launcher.setConst(n);
|
||||
launcher.setConst(value);
|
||||
launcher.setConst(offset);
|
||||
|
||||
//( constBuffer );
|
||||
launcher.launch1D( n );
|
||||
}
|
||||
}
|
||||
137
opencl/parallel_primitives/host/btFillCL.h
Normal file
137
opencl/parallel_primitives/host/btFillCL.h
Normal file
@@ -0,0 +1,137 @@
|
||||
#ifndef BT_FILL_CL_H
|
||||
#define BT_FILL_CL_H
|
||||
|
||||
#include "btOpenCLArray.h"
|
||||
#include "btScalar.h"
|
||||
|
||||
ATTRIBUTE_ALIGNED16(struct) btUnsignedInt4
|
||||
{
|
||||
BT_DECLARE_ALIGNED_ALLOCATOR();
|
||||
|
||||
union
|
||||
{
|
||||
struct
|
||||
{
|
||||
unsigned int x,y,z,w;
|
||||
};
|
||||
struct
|
||||
{
|
||||
unsigned int s[4];
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
ATTRIBUTE_ALIGNED16(struct) btInt4
|
||||
{
|
||||
BT_DECLARE_ALIGNED_ALLOCATOR();
|
||||
|
||||
union
|
||||
{
|
||||
struct
|
||||
{
|
||||
int x,y,z,w;
|
||||
};
|
||||
struct
|
||||
{
|
||||
int s[4];
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
struct btUnsignedInt2
|
||||
{
|
||||
union
|
||||
{
|
||||
struct
|
||||
{
|
||||
unsigned int x,y;
|
||||
};
|
||||
struct
|
||||
{
|
||||
unsigned int s[2];
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
struct btInt2
|
||||
{
|
||||
union
|
||||
{
|
||||
struct
|
||||
{
|
||||
int x,y;
|
||||
};
|
||||
struct
|
||||
{
|
||||
int s[2];
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
SIMD_FORCE_INLINE btInt4 btMakeInt4(int x, int y, int z, int w = 0)
|
||||
{
|
||||
btInt4 v;
|
||||
v.s[0] = x; v.s[1] = y; v.s[2] = z; v.s[3] = w;
|
||||
return v;
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE btUnsignedInt4 btMakeUnsignedInt4(unsigned int x, unsigned int y, unsigned int z, unsigned int w = 0)
|
||||
{
|
||||
btUnsignedInt4 v;
|
||||
v.s[0] = x; v.s[1] = y; v.s[2] = z; v.s[3] = w;
|
||||
return v;
|
||||
}
|
||||
|
||||
class btFillCL
|
||||
{
|
||||
|
||||
cl_command_queue m_commandQueue;
|
||||
|
||||
cl_kernel m_fillKernelInt2;
|
||||
cl_kernel m_fillIntKernel;
|
||||
cl_kernel m_fillUnsignedIntKernel;
|
||||
cl_kernel m_fillFloatKernel;
|
||||
|
||||
public:
|
||||
|
||||
struct btConstData
|
||||
{
|
||||
union
|
||||
{
|
||||
btInt4 m_data;
|
||||
btUnsignedInt4 m_UnsignedData;
|
||||
};
|
||||
int m_offset;
|
||||
int m_n;
|
||||
int m_padding[2];
|
||||
};
|
||||
|
||||
protected:
|
||||
|
||||
public:
|
||||
|
||||
btFillCL(cl_context ctx, cl_device_id device, cl_command_queue queue);
|
||||
|
||||
virtual ~btFillCL();
|
||||
|
||||
void execute(btOpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset = 0);
|
||||
|
||||
void execute(btOpenCLArray<int>& src, const int value, int n, int offset = 0);
|
||||
|
||||
void execute(btOpenCLArray<float>& src, const float value, int n, int offset = 0);
|
||||
|
||||
void execute(btOpenCLArray<btInt2>& src, const btInt2& value, int n, int offset = 0);
|
||||
|
||||
void executeHost(btAlignedObjectArray<btInt2> &src, const btInt2 &value, int n, int offset);
|
||||
|
||||
void executeHost(btAlignedObjectArray<int> &src, const int value, int n, int offset);
|
||||
|
||||
// void execute(btOpenCLArray<btInt4>& src, const btInt4& value, int n, int offset = 0);
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#endif //BT_FILL_CL_H
|
||||
450
opencl/parallel_primitives/host/btHashMap.h
Normal file
450
opencl/parallel_primitives/host/btHashMap.h
Normal file
@@ -0,0 +1,450 @@
|
||||
/*
|
||||
Bullet Continuous Collision Detection and Physics Library
|
||||
Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef BT_HASH_MAP_H
|
||||
#define BT_HASH_MAP_H
|
||||
|
||||
#include "btAlignedObjectArray.h"
|
||||
|
||||
///very basic hashable string implementation, compatible with btHashMap
|
||||
struct btHashString
|
||||
{
|
||||
const char* m_string;
|
||||
unsigned int m_hash;
|
||||
|
||||
SIMD_FORCE_INLINE unsigned int getHash()const
|
||||
{
|
||||
return m_hash;
|
||||
}
|
||||
|
||||
btHashString(const char* name)
|
||||
:m_string(name)
|
||||
{
|
||||
/* magic numbers from http://www.isthe.com/chongo/tech/comp/fnv/ */
|
||||
static const unsigned int InitialFNV = 2166136261u;
|
||||
static const unsigned int FNVMultiple = 16777619u;
|
||||
|
||||
/* Fowler / Noll / Vo (FNV) Hash */
|
||||
unsigned int hash = InitialFNV;
|
||||
|
||||
for(int i = 0; m_string[i]; i++)
|
||||
{
|
||||
hash = hash ^ (m_string[i]); /* xor the low 8 bits */
|
||||
hash = hash * FNVMultiple; /* multiply by the magic number */
|
||||
}
|
||||
m_hash = hash;
|
||||
}
|
||||
|
||||
int portableStringCompare(const char* src, const char* dst) const
|
||||
{
|
||||
int ret = 0 ;
|
||||
|
||||
while( ! (ret = *(unsigned char *)src - *(unsigned char *)dst) && *dst)
|
||||
++src, ++dst;
|
||||
|
||||
if ( ret < 0 )
|
||||
ret = -1 ;
|
||||
else if ( ret > 0 )
|
||||
ret = 1 ;
|
||||
|
||||
return( ret );
|
||||
}
|
||||
|
||||
bool equals(const btHashString& other) const
|
||||
{
|
||||
return (m_string == other.m_string) ||
|
||||
(0==portableStringCompare(m_string,other.m_string));
|
||||
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
const int BT_HASH_NULL=0xffffffff;
|
||||
|
||||
|
||||
class btHashInt
|
||||
{
|
||||
int m_uid;
|
||||
public:
|
||||
btHashInt(int uid) :m_uid(uid)
|
||||
{
|
||||
}
|
||||
|
||||
int getUid1() const
|
||||
{
|
||||
return m_uid;
|
||||
}
|
||||
|
||||
void setUid1(int uid)
|
||||
{
|
||||
m_uid = uid;
|
||||
}
|
||||
|
||||
bool equals(const btHashInt& other) const
|
||||
{
|
||||
return getUid1() == other.getUid1();
|
||||
}
|
||||
//to our success
|
||||
SIMD_FORCE_INLINE unsigned int getHash()const
|
||||
{
|
||||
int key = m_uid;
|
||||
// Thomas Wang's hash
|
||||
key += ~(key << 15); key ^= (key >> 10); key += (key << 3); key ^= (key >> 6); key += ~(key << 11); key ^= (key >> 16);
|
||||
return key;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
class btHashPtr
|
||||
{
|
||||
|
||||
union
|
||||
{
|
||||
const void* m_pointer;
|
||||
int m_hashValues[2];
|
||||
};
|
||||
|
||||
public:
|
||||
|
||||
btHashPtr(const void* ptr)
|
||||
:m_pointer(ptr)
|
||||
{
|
||||
}
|
||||
|
||||
const void* getPointer() const
|
||||
{
|
||||
return m_pointer;
|
||||
}
|
||||
|
||||
bool equals(const btHashPtr& other) const
|
||||
{
|
||||
return getPointer() == other.getPointer();
|
||||
}
|
||||
|
||||
//to our success
|
||||
SIMD_FORCE_INLINE unsigned int getHash()const
|
||||
{
|
||||
const bool VOID_IS_8 = ((sizeof(void*)==8));
|
||||
|
||||
int key = VOID_IS_8? m_hashValues[0]+m_hashValues[1] : m_hashValues[0];
|
||||
|
||||
// Thomas Wang's hash
|
||||
key += ~(key << 15); key ^= (key >> 10); key += (key << 3); key ^= (key >> 6); key += ~(key << 11); key ^= (key >> 16);
|
||||
return key;
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
|
||||
|
||||
template <class Value>
|
||||
class btHashKeyPtr
|
||||
{
|
||||
int m_uid;
|
||||
public:
|
||||
|
||||
btHashKeyPtr(int uid) :m_uid(uid)
|
||||
{
|
||||
}
|
||||
|
||||
int getUid1() const
|
||||
{
|
||||
return m_uid;
|
||||
}
|
||||
|
||||
bool equals(const btHashKeyPtr<Value>& other) const
|
||||
{
|
||||
return getUid1() == other.getUid1();
|
||||
}
|
||||
|
||||
//to our success
|
||||
SIMD_FORCE_INLINE unsigned int getHash()const
|
||||
{
|
||||
int key = m_uid;
|
||||
// Thomas Wang's hash
|
||||
key += ~(key << 15); key ^= (key >> 10); key += (key << 3); key ^= (key >> 6); key += ~(key << 11); key ^= (key >> 16);
|
||||
return key;
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
|
||||
|
||||
template <class Value>
|
||||
class btHashKey
|
||||
{
|
||||
int m_uid;
|
||||
public:
|
||||
|
||||
btHashKey(int uid) :m_uid(uid)
|
||||
{
|
||||
}
|
||||
|
||||
int getUid1() const
|
||||
{
|
||||
return m_uid;
|
||||
}
|
||||
|
||||
bool equals(const btHashKey<Value>& other) const
|
||||
{
|
||||
return getUid1() == other.getUid1();
|
||||
}
|
||||
//to our success
|
||||
SIMD_FORCE_INLINE unsigned int getHash()const
|
||||
{
|
||||
int key = m_uid;
|
||||
// Thomas Wang's hash
|
||||
key += ~(key << 15); key ^= (key >> 10); key += (key << 3); key ^= (key >> 6); key += ~(key << 11); key ^= (key >> 16);
|
||||
return key;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
///The btHashMap template class implements a generic and lightweight hashmap.
|
||||
///A basic sample of how to use btHashMap is located in Demos\BasicDemo\main.cpp
|
||||
template <class Key, class Value>
|
||||
class btHashMap
|
||||
{
|
||||
|
||||
protected:
|
||||
btAlignedObjectArray<int> m_hashTable;
|
||||
btAlignedObjectArray<int> m_next;
|
||||
|
||||
btAlignedObjectArray<Value> m_valueArray;
|
||||
btAlignedObjectArray<Key> m_keyArray;
|
||||
|
||||
void growTables(const Key& /*key*/)
|
||||
{
|
||||
int newCapacity = m_valueArray.capacity();
|
||||
|
||||
if (m_hashTable.size() < newCapacity)
|
||||
{
|
||||
//grow hashtable and next table
|
||||
int curHashtableSize = m_hashTable.size();
|
||||
|
||||
m_hashTable.resize(newCapacity);
|
||||
m_next.resize(newCapacity);
|
||||
|
||||
int i;
|
||||
|
||||
for (i= 0; i < newCapacity; ++i)
|
||||
{
|
||||
m_hashTable[i] = BT_HASH_NULL;
|
||||
}
|
||||
for (i = 0; i < newCapacity; ++i)
|
||||
{
|
||||
m_next[i] = BT_HASH_NULL;
|
||||
}
|
||||
|
||||
for(i=0;i<curHashtableSize;i++)
|
||||
{
|
||||
//const Value& value = m_valueArray[i];
|
||||
//const Key& key = m_keyArray[i];
|
||||
|
||||
int hashValue = m_keyArray[i].getHash() & (m_valueArray.capacity()-1); // New hash value with new mask
|
||||
m_next[i] = m_hashTable[hashValue];
|
||||
m_hashTable[hashValue] = i;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
void insert(const Key& key, const Value& value) {
|
||||
int hash = key.getHash() & (m_valueArray.capacity()-1);
|
||||
|
||||
//replace value if the key is already there
|
||||
int index = findIndex(key);
|
||||
if (index != BT_HASH_NULL)
|
||||
{
|
||||
m_valueArray[index]=value;
|
||||
return;
|
||||
}
|
||||
|
||||
int count = m_valueArray.size();
|
||||
int oldCapacity = m_valueArray.capacity();
|
||||
m_valueArray.push_back(value);
|
||||
m_keyArray.push_back(key);
|
||||
|
||||
int newCapacity = m_valueArray.capacity();
|
||||
if (oldCapacity < newCapacity)
|
||||
{
|
||||
growTables(key);
|
||||
//hash with new capacity
|
||||
hash = key.getHash() & (m_valueArray.capacity()-1);
|
||||
}
|
||||
m_next[count] = m_hashTable[hash];
|
||||
m_hashTable[hash] = count;
|
||||
}
|
||||
|
||||
void remove(const Key& key) {
|
||||
|
||||
int hash = key.getHash() & (m_valueArray.capacity()-1);
|
||||
|
||||
int pairIndex = findIndex(key);
|
||||
|
||||
if (pairIndex ==BT_HASH_NULL)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// Remove the pair from the hash table.
|
||||
int index = m_hashTable[hash];
|
||||
btAssert(index != BT_HASH_NULL);
|
||||
|
||||
int previous = BT_HASH_NULL;
|
||||
while (index != pairIndex)
|
||||
{
|
||||
previous = index;
|
||||
index = m_next[index];
|
||||
}
|
||||
|
||||
if (previous != BT_HASH_NULL)
|
||||
{
|
||||
btAssert(m_next[previous] == pairIndex);
|
||||
m_next[previous] = m_next[pairIndex];
|
||||
}
|
||||
else
|
||||
{
|
||||
m_hashTable[hash] = m_next[pairIndex];
|
||||
}
|
||||
|
||||
// We now move the last pair into spot of the
|
||||
// pair being removed. We need to fix the hash
|
||||
// table indices to support the move.
|
||||
|
||||
int lastPairIndex = m_valueArray.size() - 1;
|
||||
|
||||
// If the removed pair is the last pair, we are done.
|
||||
if (lastPairIndex == pairIndex)
|
||||
{
|
||||
m_valueArray.pop_back();
|
||||
m_keyArray.pop_back();
|
||||
return;
|
||||
}
|
||||
|
||||
// Remove the last pair from the hash table.
|
||||
int lastHash = m_keyArray[lastPairIndex].getHash() & (m_valueArray.capacity()-1);
|
||||
|
||||
index = m_hashTable[lastHash];
|
||||
btAssert(index != BT_HASH_NULL);
|
||||
|
||||
previous = BT_HASH_NULL;
|
||||
while (index != lastPairIndex)
|
||||
{
|
||||
previous = index;
|
||||
index = m_next[index];
|
||||
}
|
||||
|
||||
if (previous != BT_HASH_NULL)
|
||||
{
|
||||
btAssert(m_next[previous] == lastPairIndex);
|
||||
m_next[previous] = m_next[lastPairIndex];
|
||||
}
|
||||
else
|
||||
{
|
||||
m_hashTable[lastHash] = m_next[lastPairIndex];
|
||||
}
|
||||
|
||||
// Copy the last pair into the remove pair's spot.
|
||||
m_valueArray[pairIndex] = m_valueArray[lastPairIndex];
|
||||
m_keyArray[pairIndex] = m_keyArray[lastPairIndex];
|
||||
|
||||
// Insert the last pair into the hash table
|
||||
m_next[pairIndex] = m_hashTable[lastHash];
|
||||
m_hashTable[lastHash] = pairIndex;
|
||||
|
||||
m_valueArray.pop_back();
|
||||
m_keyArray.pop_back();
|
||||
|
||||
}
|
||||
|
||||
|
||||
int size() const
|
||||
{
|
||||
return m_valueArray.size();
|
||||
}
|
||||
|
||||
const Value* getAtIndex(int index) const
|
||||
{
|
||||
btAssert(index < m_valueArray.size());
|
||||
|
||||
return &m_valueArray[index];
|
||||
}
|
||||
|
||||
Value* getAtIndex(int index)
|
||||
{
|
||||
btAssert(index < m_valueArray.size());
|
||||
|
||||
return &m_valueArray[index];
|
||||
}
|
||||
|
||||
Value* operator[](const Key& key) {
|
||||
return find(key);
|
||||
}
|
||||
|
||||
const Value* find(const Key& key) const
|
||||
{
|
||||
int index = findIndex(key);
|
||||
if (index == BT_HASH_NULL)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
return &m_valueArray[index];
|
||||
}
|
||||
|
||||
Value* find(const Key& key)
|
||||
{
|
||||
int index = findIndex(key);
|
||||
if (index == BT_HASH_NULL)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
return &m_valueArray[index];
|
||||
}
|
||||
|
||||
|
||||
int findIndex(const Key& key) const
|
||||
{
|
||||
unsigned int hash = key.getHash() & (m_valueArray.capacity()-1);
|
||||
|
||||
if (hash >= (unsigned int)m_hashTable.size())
|
||||
{
|
||||
return BT_HASH_NULL;
|
||||
}
|
||||
|
||||
int index = m_hashTable[hash];
|
||||
while ((index != BT_HASH_NULL) && key.equals(m_keyArray[index]) == false)
|
||||
{
|
||||
index = m_next[index];
|
||||
}
|
||||
return index;
|
||||
}
|
||||
|
||||
void clear()
|
||||
{
|
||||
m_hashTable.clear();
|
||||
m_next.clear();
|
||||
m_valueArray.clear();
|
||||
m_keyArray.clear();
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
#endif //BT_HASH_MAP_H
|
||||
363
opencl/parallel_primitives/host/btLauncherCL.h
Normal file
363
opencl/parallel_primitives/host/btLauncherCL.h
Normal file
@@ -0,0 +1,363 @@
|
||||
|
||||
#ifndef BT_LAUNCHER_CL_H
|
||||
#define BT_LAUNCHER_CL_H
|
||||
|
||||
#include "btBufferInfoCL.h"
|
||||
#include "btMinMax.h"
|
||||
#include "btOpenCLArray.h"
|
||||
#include <stdio.h>
|
||||
|
||||
#ifdef _WIN32
|
||||
#pragma warning(disable :4996)
|
||||
#endif
|
||||
#define BT_CL_MAX_ARG_SIZE 16
|
||||
struct btKernelArgData
|
||||
{
|
||||
int m_isBuffer;
|
||||
int m_argIndex;
|
||||
int m_argSizeInBytes;
|
||||
union
|
||||
{
|
||||
cl_mem m_clBuffer;
|
||||
unsigned char m_argData[BT_CL_MAX_ARG_SIZE];
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
class btLauncherCL
|
||||
{
|
||||
|
||||
cl_command_queue m_commandQueue;
|
||||
cl_kernel m_kernel;
|
||||
int m_idx;
|
||||
|
||||
btAlignedObjectArray<btKernelArgData> m_kernelArguments;
|
||||
|
||||
|
||||
int m_serializationSizeInBytes;
|
||||
|
||||
public:
|
||||
|
||||
btAlignedObjectArray<btOpenCLArray<unsigned char>* > m_arrays;
|
||||
|
||||
btLauncherCL(cl_command_queue queue, cl_kernel kernel)
|
||||
:m_commandQueue(queue),
|
||||
m_kernel(kernel),
|
||||
m_idx(0)
|
||||
{
|
||||
m_serializationSizeInBytes = sizeof(int);
|
||||
}
|
||||
|
||||
virtual ~btLauncherCL()
|
||||
{
|
||||
for (int i=0;i<m_arrays.size();i++)
|
||||
{
|
||||
clReleaseMemObject(m_arrays[i]->getBufferCL());
|
||||
}
|
||||
}
|
||||
|
||||
inline void setBuffer( cl_mem clBuffer)
|
||||
{
|
||||
|
||||
btKernelArgData kernelArg;
|
||||
kernelArg.m_argIndex = m_idx;
|
||||
kernelArg.m_isBuffer = 1;
|
||||
kernelArg.m_clBuffer = clBuffer;
|
||||
|
||||
cl_mem_info param_name = CL_MEM_SIZE;
|
||||
size_t param_value;
|
||||
size_t sizeInBytes = sizeof(size_t);
|
||||
size_t actualSizeInBytes;
|
||||
cl_int err;
|
||||
err = clGetMemObjectInfo ( kernelArg.m_clBuffer,
|
||||
param_name,
|
||||
sizeInBytes,
|
||||
¶m_value,
|
||||
&actualSizeInBytes);
|
||||
|
||||
btAssert( err == CL_SUCCESS );
|
||||
kernelArg.m_argSizeInBytes = param_value;
|
||||
|
||||
m_kernelArguments.push_back(kernelArg);
|
||||
m_serializationSizeInBytes+= sizeof(btKernelArgData);
|
||||
m_serializationSizeInBytes+=param_value;
|
||||
|
||||
cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &clBuffer);
|
||||
btAssert( status == CL_SUCCESS );
|
||||
}
|
||||
|
||||
|
||||
inline void setBuffers( btBufferInfoCL* buffInfo, int n )
|
||||
{
|
||||
for(int i=0; i<n; i++)
|
||||
{
|
||||
btKernelArgData kernelArg;
|
||||
kernelArg.m_argIndex = m_idx;
|
||||
kernelArg.m_isBuffer = 1;
|
||||
kernelArg.m_clBuffer = buffInfo[i].m_clBuffer;
|
||||
|
||||
cl_mem_info param_name = CL_MEM_SIZE;
|
||||
size_t param_value;
|
||||
size_t sizeInBytes = sizeof(size_t);
|
||||
size_t actualSizeInBytes;
|
||||
cl_int err;
|
||||
err = clGetMemObjectInfo ( kernelArg.m_clBuffer,
|
||||
param_name,
|
||||
sizeInBytes,
|
||||
¶m_value,
|
||||
&actualSizeInBytes);
|
||||
|
||||
btAssert( err == CL_SUCCESS );
|
||||
kernelArg.m_argSizeInBytes = param_value;
|
||||
|
||||
m_kernelArguments.push_back(kernelArg);
|
||||
m_serializationSizeInBytes+= sizeof(btKernelArgData);
|
||||
m_serializationSizeInBytes+=param_value;
|
||||
|
||||
cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &buffInfo[i].m_clBuffer);
|
||||
btAssert( status == CL_SUCCESS );
|
||||
}
|
||||
}
|
||||
|
||||
int getSerializationBufferSize() const
|
||||
{
|
||||
return m_serializationSizeInBytes;
|
||||
}
|
||||
|
||||
inline int deserializeArgs(unsigned char* buf, int bufSize, cl_context ctx)
|
||||
{
|
||||
int index=0;
|
||||
|
||||
int numArguments = *(int*) &buf[index];
|
||||
index+=sizeof(int);
|
||||
|
||||
for (int i=0;i<numArguments;i++)
|
||||
{
|
||||
btKernelArgData* arg = (btKernelArgData*)&buf[index];
|
||||
|
||||
index+=sizeof(btKernelArgData);
|
||||
if (arg->m_isBuffer)
|
||||
{
|
||||
btOpenCLArray<unsigned char>* clData = new btOpenCLArray<unsigned char>(ctx,m_commandQueue, arg->m_argSizeInBytes);
|
||||
clData->resize(arg->m_argSizeInBytes);
|
||||
|
||||
clData->copyFromHostPointer(&buf[index], arg->m_argSizeInBytes);
|
||||
|
||||
arg->m_clBuffer = clData->getBufferCL();
|
||||
|
||||
m_arrays.push_back(clData);
|
||||
|
||||
cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &arg->m_clBuffer);
|
||||
btAssert( status == CL_SUCCESS );
|
||||
index+=arg->m_argSizeInBytes;
|
||||
} else
|
||||
{
|
||||
cl_int status = clSetKernelArg( m_kernel, m_idx++, arg->m_argSizeInBytes, &arg->m_argData);
|
||||
btAssert( status == CL_SUCCESS );
|
||||
}
|
||||
m_kernelArguments.push_back(*arg);
|
||||
}
|
||||
m_serializationSizeInBytes = index;
|
||||
return index;
|
||||
}
|
||||
|
||||
inline int validateResults(unsigned char* goldBuffer, int goldBufferCapacity, cl_context ctx)
|
||||
{
|
||||
int index=0;
|
||||
|
||||
int numArguments = *(int*) &goldBuffer[index];
|
||||
index+=sizeof(int);
|
||||
|
||||
if (numArguments != m_kernelArguments.size())
|
||||
{
|
||||
printf("failed validation: expected %d arguments, found %d\n",numArguments, m_kernelArguments.size());
|
||||
return -1;
|
||||
}
|
||||
|
||||
for (int ii=0;ii<numArguments;ii++)
|
||||
{
|
||||
btKernelArgData* argGold = (btKernelArgData*)&goldBuffer[index];
|
||||
|
||||
if (m_kernelArguments[ii].m_argSizeInBytes != argGold->m_argSizeInBytes)
|
||||
{
|
||||
printf("failed validation: argument %d sizeInBytes expected: %d, found %d\n",ii, argGold->m_argSizeInBytes, m_kernelArguments[ii].m_argSizeInBytes);
|
||||
return -2;
|
||||
}
|
||||
|
||||
{
|
||||
int expected = argGold->m_isBuffer;
|
||||
int found = m_kernelArguments[ii].m_isBuffer;
|
||||
|
||||
if (expected != found)
|
||||
{
|
||||
printf("failed validation: argument %d isBuffer expected: %d, found %d\n",ii,expected, found);
|
||||
return -3;
|
||||
}
|
||||
}
|
||||
index+=sizeof(btKernelArgData);
|
||||
|
||||
if (argGold->m_isBuffer)
|
||||
{
|
||||
|
||||
unsigned char* memBuf= (unsigned char*) malloc(m_kernelArguments[ii].m_argSizeInBytes);
|
||||
unsigned char* goldBuf = &goldBuffer[index];
|
||||
for (int j=0;j<m_kernelArguments[j].m_argSizeInBytes;j++)
|
||||
{
|
||||
memBuf[j] = 0xaa;
|
||||
}
|
||||
|
||||
cl_int status = 0;
|
||||
status = clEnqueueReadBuffer( m_commandQueue, m_kernelArguments[ii].m_clBuffer, CL_TRUE, 0, m_kernelArguments[ii].m_argSizeInBytes,
|
||||
memBuf, 0,0,0 );
|
||||
btAssert( status==CL_SUCCESS );
|
||||
clFinish(m_commandQueue);
|
||||
|
||||
for (int b=0;b<m_kernelArguments[ii].m_argSizeInBytes;b++)
|
||||
{
|
||||
int expected = goldBuf[b];
|
||||
int found = memBuf[b];
|
||||
if (expected != found)
|
||||
{
|
||||
printf("failed validation: argument %d OpenCL data at byte position %d expected: %d, found %d\n",
|
||||
ii, b, expected, found);
|
||||
return -4;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
index+=argGold->m_argSizeInBytes;
|
||||
} else
|
||||
{
|
||||
|
||||
//compare content
|
||||
for (int b=0;b<m_kernelArguments[ii].m_argSizeInBytes;b++)
|
||||
{
|
||||
int expected = argGold->m_argData[b];
|
||||
int found =m_kernelArguments[ii].m_argData[b];
|
||||
if (expected != found)
|
||||
{
|
||||
printf("failed validation: argument %d const data at byte position %d expected: %d, found %d\n",
|
||||
ii, b, expected, found);
|
||||
return -5;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
return index;
|
||||
|
||||
}
|
||||
|
||||
inline int serializeArguments(unsigned char* destBuffer, int destBufferCapacity)
|
||||
{
|
||||
//initialize to known values
|
||||
for (int i=0;i<destBufferCapacity;i++)
|
||||
destBuffer[i] = 0xec;
|
||||
|
||||
assert(destBufferCapacity>=m_serializationSizeInBytes);
|
||||
|
||||
//todo: use the btSerializer for this to allow for 32/64bit, endianness etc
|
||||
int numArguments = m_kernelArguments.size();
|
||||
int curBufferSize = 0;
|
||||
int* dest = (int*)&destBuffer[curBufferSize];
|
||||
*dest = numArguments;
|
||||
curBufferSize += sizeof(int);
|
||||
|
||||
|
||||
|
||||
for (int i=0;i<this->m_kernelArguments.size();i++)
|
||||
{
|
||||
btKernelArgData* arg = (btKernelArgData*) &destBuffer[curBufferSize];
|
||||
*arg = m_kernelArguments[i];
|
||||
curBufferSize+=sizeof(btKernelArgData);
|
||||
if (arg->m_isBuffer==1)
|
||||
{
|
||||
//copy the OpenCL buffer content
|
||||
cl_int status = 0;
|
||||
status = clEnqueueReadBuffer( m_commandQueue, arg->m_clBuffer, 0, 0, arg->m_argSizeInBytes,
|
||||
&destBuffer[curBufferSize], 0,0,0 );
|
||||
btAssert( status==CL_SUCCESS );
|
||||
clFinish(m_commandQueue);
|
||||
curBufferSize+=arg->m_argSizeInBytes;
|
||||
}
|
||||
|
||||
}
|
||||
return curBufferSize;
|
||||
}
|
||||
|
||||
void serializeToFile(const char* fileName, int numWorkItems)
|
||||
{
|
||||
int num = numWorkItems;
|
||||
int buffSize = getSerializationBufferSize();
|
||||
unsigned char* buf = new unsigned char[buffSize+sizeof(int)];
|
||||
for (int i=0;i<buffSize+1;i++)
|
||||
{
|
||||
unsigned char* ptr = (unsigned char*)&buf[i];
|
||||
*ptr = 0xff;
|
||||
}
|
||||
int actualWrite = serializeArguments(buf,buffSize);
|
||||
|
||||
unsigned char* cptr = (unsigned char*)&buf[buffSize];
|
||||
// printf("buf[buffSize] = %d\n",*cptr);
|
||||
|
||||
assert(buf[buffSize]==0xff);//check for buffer overrun
|
||||
int* ptr = (int*)&buf[buffSize];
|
||||
|
||||
*ptr = num;
|
||||
|
||||
FILE* f = fopen(fileName,"wb");
|
||||
fwrite(buf,buffSize+sizeof(int),1,f);
|
||||
fclose(f);
|
||||
|
||||
delete[] buf;
|
||||
}
|
||||
|
||||
|
||||
template<typename T>
|
||||
inline void setConst( const T& consts )
|
||||
{
|
||||
int sz=sizeof(T);
|
||||
btAssert(sz<=BT_CL_MAX_ARG_SIZE);
|
||||
btKernelArgData kernelArg;
|
||||
kernelArg.m_argIndex = m_idx;
|
||||
kernelArg.m_isBuffer = 0;
|
||||
T* destArg = (T*)kernelArg.m_argData;
|
||||
*destArg = consts;
|
||||
kernelArg.m_argSizeInBytes = sizeof(T);
|
||||
m_kernelArguments.push_back(kernelArg);
|
||||
m_serializationSizeInBytes+=sizeof(btKernelArgData);
|
||||
|
||||
cl_int status = clSetKernelArg( m_kernel, m_idx++, sz, &consts );
|
||||
btAssert( status == CL_SUCCESS );
|
||||
}
|
||||
|
||||
inline void launch1D( int numThreads, int localSize = 64)
|
||||
{
|
||||
launch2D( numThreads, 1, localSize, 1 );
|
||||
}
|
||||
|
||||
inline void launch2D( int numThreadsX, int numThreadsY, int localSizeX, int localSizeY )
|
||||
{
|
||||
size_t gRange[3] = {1,1,1};
|
||||
size_t lRange[3] = {1,1,1};
|
||||
lRange[0] = localSizeX;
|
||||
lRange[1] = localSizeY;
|
||||
gRange[0] = btMax((size_t)1, (numThreadsX/lRange[0])+(!(numThreadsX%lRange[0])?0:1));
|
||||
gRange[0] *= lRange[0];
|
||||
gRange[1] = btMax((size_t)1, (numThreadsY/lRange[1])+(!(numThreadsY%lRange[1])?0:1));
|
||||
gRange[1] *= lRange[1];
|
||||
|
||||
cl_int status = clEnqueueNDRangeKernel( m_commandQueue,
|
||||
m_kernel, 2, NULL, gRange, lRange, 0,0,0 );
|
||||
if (status != CL_SUCCESS)
|
||||
{
|
||||
printf("Error: OpenCL status = %d\n",status);
|
||||
}
|
||||
btAssert( status == CL_SUCCESS );
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
#endif //BT_LAUNCHER_CL_H
|
||||
71
opencl/parallel_primitives/host/btMinMax.h
Normal file
71
opencl/parallel_primitives/host/btMinMax.h
Normal file
@@ -0,0 +1,71 @@
|
||||
/*
|
||||
Copyright (c) 2003-2006 Gino van den Bergen / Erwin Coumans http://continuousphysics.com/Bullet/
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
|
||||
|
||||
#ifndef BT_GEN_MINMAX_H
|
||||
#define BT_GEN_MINMAX_H
|
||||
|
||||
#include "btScalar.h"
|
||||
|
||||
template <class T>
|
||||
SIMD_FORCE_INLINE const T& btMin(const T& a, const T& b)
|
||||
{
|
||||
return a < b ? a : b ;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
SIMD_FORCE_INLINE const T& btMax(const T& a, const T& b)
|
||||
{
|
||||
return a > b ? a : b;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
SIMD_FORCE_INLINE const T& btClamped(const T& a, const T& lb, const T& ub)
|
||||
{
|
||||
return a < lb ? lb : (ub < a ? ub : a);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
SIMD_FORCE_INLINE void btSetMin(T& a, const T& b)
|
||||
{
|
||||
if (b < a)
|
||||
{
|
||||
a = b;
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
SIMD_FORCE_INLINE void btSetMax(T& a, const T& b)
|
||||
{
|
||||
if (a < b)
|
||||
{
|
||||
a = b;
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
SIMD_FORCE_INLINE void btClamp(T& a, const T& lb, const T& ub)
|
||||
{
|
||||
if (a < lb)
|
||||
{
|
||||
a = lb;
|
||||
}
|
||||
else if (ub < a)
|
||||
{
|
||||
a = ub;
|
||||
}
|
||||
}
|
||||
|
||||
#endif //BT_GEN_MINMAX_H
|
||||
274
opencl/parallel_primitives/host/btOpenCLArray.h
Normal file
274
opencl/parallel_primitives/host/btOpenCLArray.h
Normal file
@@ -0,0 +1,274 @@
|
||||
#ifndef BT_OPENCL_ARRAY_H
|
||||
#define BT_OPENCL_ARRAY_H
|
||||
|
||||
#include "btAlignedObjectArray.h"
|
||||
#include "../../basic_initialize/btOpenCLInclude.h"
|
||||
|
||||
template <typename T>
|
||||
class btOpenCLArray
|
||||
{
|
||||
int m_size;
|
||||
int m_capacity;
|
||||
cl_mem m_clBuffer;
|
||||
|
||||
cl_context m_clContext;
|
||||
cl_command_queue m_commandQueue;
|
||||
|
||||
bool m_ownsMemory;
|
||||
|
||||
bool m_allowGrowingCapacity;
|
||||
|
||||
void deallocate()
|
||||
{
|
||||
if (m_clBuffer && m_ownsMemory)
|
||||
{
|
||||
clReleaseMemObject(m_clBuffer);
|
||||
}
|
||||
m_clBuffer = 0;
|
||||
m_capacity=0;
|
||||
}
|
||||
|
||||
btOpenCLArray<T>& operator=(const btOpenCLArray<T>& src);
|
||||
|
||||
SIMD_FORCE_INLINE int allocSize(int size)
|
||||
{
|
||||
return (size ? size*2 : 1);
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
btOpenCLArray(cl_context ctx, cl_command_queue queue, int initialCapacity=0, bool allowGrowingCapacity=true)
|
||||
:m_size(0), m_capacity(0),m_clBuffer(0),
|
||||
m_clContext(ctx),m_commandQueue(queue),
|
||||
m_ownsMemory(true),m_allowGrowingCapacity(true)
|
||||
{
|
||||
if (initialCapacity)
|
||||
{
|
||||
reserve(initialCapacity);
|
||||
}
|
||||
m_allowGrowingCapacity = allowGrowingCapacity;
|
||||
}
|
||||
|
||||
///this is an error-prone method with no error checking, be careful!
|
||||
void setFromOpenCLBuffer(cl_mem buffer, int sizeInElements)
|
||||
{
|
||||
deallocate();
|
||||
m_ownsMemory = false;
|
||||
m_allowGrowingCapacity = false;
|
||||
m_clBuffer = buffer;
|
||||
m_size = sizeInElements;
|
||||
m_capacity = sizeInElements;
|
||||
}
|
||||
|
||||
// we could enable this assignment, but need to make sure to avoid accidental deep copies
|
||||
// btOpenCLArray<T>& operator=(const btAlignedObjectArray<T>& src)
|
||||
// {
|
||||
// copyFromArray(src);
|
||||
// return *this;
|
||||
// }
|
||||
|
||||
|
||||
cl_mem getBufferCL() const
|
||||
{
|
||||
return m_clBuffer;
|
||||
}
|
||||
|
||||
|
||||
virtual ~btOpenCLArray()
|
||||
{
|
||||
deallocate();
|
||||
m_size=0;
|
||||
m_capacity=0;
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE void push_back(const T& _Val,bool waitForCompletion=true)
|
||||
{
|
||||
int sz = size();
|
||||
if( sz == capacity() )
|
||||
{
|
||||
reserve( allocSize(size()) );
|
||||
}
|
||||
copyFromHostPointer(&_Val, 1, sz, waitForCompletion);
|
||||
m_size++;
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE T forcedAt(int n) const
|
||||
{
|
||||
btAssert(n>=0);
|
||||
btAssert(n<capacity());
|
||||
T elem;
|
||||
copyToHostPointer(&elem,1,n,true);
|
||||
return elem;
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE T at(int n) const
|
||||
{
|
||||
btAssert(n>=0);
|
||||
btAssert(n<size());
|
||||
T elem;
|
||||
copyToHostPointer(&elem,1,n,true);
|
||||
return elem;
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE void resize(int newsize, bool copyOldContents=true)
|
||||
{
|
||||
int curSize = size();
|
||||
|
||||
if (newsize < curSize)
|
||||
{
|
||||
//leave the OpenCL memory for now
|
||||
} else
|
||||
{
|
||||
if (newsize > size())
|
||||
{
|
||||
reserve(newsize,copyOldContents);
|
||||
}
|
||||
|
||||
//leave new data uninitialized (init in debug mode?)
|
||||
//for (int i=curSize;i<newsize;i++) ...
|
||||
}
|
||||
|
||||
m_size = newsize;
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE int size() const
|
||||
{
|
||||
return m_size;
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE int capacity() const
|
||||
{
|
||||
return m_capacity;
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE void reserve(int _Count, bool copyOldContents=true)
|
||||
{ // determine new minimum length of allocated storage
|
||||
if (capacity() < _Count)
|
||||
{ // not enough room, reallocate
|
||||
|
||||
if (m_allowGrowingCapacity)
|
||||
{
|
||||
cl_int ciErrNum;
|
||||
//create a new OpenCL buffer
|
||||
int memSizeInBytes = sizeof(T)*_Count;
|
||||
cl_mem buf = clCreateBuffer(m_clContext, CL_MEM_READ_WRITE, memSizeInBytes, NULL, &ciErrNum);
|
||||
btAssert(ciErrNum==CL_SUCCESS);
|
||||
|
||||
//#define BT_ALWAYS_INITIALIZE_OPENCL_BUFFERS
|
||||
#ifdef BT_ALWAYS_INITIALIZE_OPENCL_BUFFERS
|
||||
unsigned char* src = (unsigned char*)malloc(memSizeInBytes);
|
||||
for (int i=0;i<memSizeInBytes;i++)
|
||||
src[i] = 0xbb;
|
||||
ciErrNum = clEnqueueWriteBuffer( m_commandQueue, buf, CL_TRUE, 0, memSizeInBytes, src, 0,0,0 );
|
||||
btAssert(ciErrNum==CL_SUCCESS);
|
||||
clFinish(m_commandQueue);
|
||||
free(src);
|
||||
#endif //BT_ALWAYS_INITIALIZE_OPENCL_BUFFERS
|
||||
|
||||
if (copyOldContents)
|
||||
copyToCL(buf, size());
|
||||
|
||||
//deallocate the old buffer
|
||||
deallocate();
|
||||
|
||||
m_clBuffer = buf;
|
||||
|
||||
m_capacity = _Count;
|
||||
} else
|
||||
{
|
||||
//fail: assert and
|
||||
btAssert(0);
|
||||
deallocate();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void copyToCL(cl_mem destination, int numElements, int firstElem=0, int dstOffsetInElems=0) const
|
||||
{
|
||||
if (numElements<=0)
|
||||
return;
|
||||
|
||||
btAssert(m_clBuffer);
|
||||
btAssert(destination);
|
||||
|
||||
//likely some error, destination is same as source
|
||||
btAssert(m_clBuffer != destination);
|
||||
|
||||
btAssert((firstElem+numElements)<=m_size);
|
||||
|
||||
cl_int status = 0;
|
||||
|
||||
|
||||
btAssert(numElements>0);
|
||||
btAssert(numElements<=m_size);
|
||||
|
||||
int srcOffsetBytes = sizeof(T)*firstElem;
|
||||
int dstOffsetInBytes = sizeof(T)*dstOffsetInElems;
|
||||
|
||||
status = clEnqueueCopyBuffer( m_commandQueue, m_clBuffer, destination,
|
||||
srcOffsetBytes, dstOffsetInBytes, sizeof(T)*numElements, 0, 0, 0 );
|
||||
|
||||
btAssert( status == CL_SUCCESS );
|
||||
}
|
||||
|
||||
void copyFromHost(const btAlignedObjectArray<T>& srcArray, bool waitForCompletion=true)
|
||||
{
|
||||
int newSize = srcArray.size();
|
||||
|
||||
bool copyOldContents = false;
|
||||
resize (newSize,copyOldContents);
|
||||
if (newSize)
|
||||
copyFromHostPointer(&srcArray[0],newSize,0,waitForCompletion);
|
||||
|
||||
}
|
||||
|
||||
void copyFromHostPointer(const T* src, int numElems, int destFirstElem= 0, bool waitForCompletion=true)
|
||||
{
|
||||
btAssert(numElems+destFirstElem <= capacity());
|
||||
|
||||
cl_int status = 0;
|
||||
int sizeInBytes=sizeof(T)*numElems;
|
||||
status = clEnqueueWriteBuffer( m_commandQueue, m_clBuffer, 0, sizeof(T)*destFirstElem, sizeInBytes,
|
||||
src, 0,0,0 );
|
||||
btAssert(status == CL_SUCCESS );
|
||||
if (waitForCompletion)
|
||||
clFinish(m_commandQueue);
|
||||
|
||||
}
|
||||
|
||||
|
||||
void copyToHost(btAlignedObjectArray<T>& destArray, bool waitForCompletion=true) const
|
||||
{
|
||||
destArray.resize(this->size());
|
||||
if (size())
|
||||
copyToHostPointer(&destArray[0], size(),0,waitForCompletion);
|
||||
}
|
||||
|
||||
void copyToHostPointer(T* destPtr, int numElem, int srcFirstElem=0, bool waitForCompletion=true) const
|
||||
{
|
||||
btAssert(numElem+srcFirstElem <= capacity());
|
||||
|
||||
cl_int status = 0;
|
||||
status = clEnqueueReadBuffer( m_commandQueue, m_clBuffer, 0, sizeof(T)*srcFirstElem, sizeof(T)*numElem,
|
||||
destPtr, 0,0,0 );
|
||||
btAssert( status==CL_SUCCESS );
|
||||
|
||||
if (waitForCompletion)
|
||||
clFinish(m_commandQueue);
|
||||
}
|
||||
|
||||
void copyFromOpenCLArray(const btOpenCLArray& src)
|
||||
{
|
||||
int newSize = src.size();
|
||||
resize(newSize);
|
||||
if (size())
|
||||
{
|
||||
src.copyToCL(m_clBuffer,size());
|
||||
}
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
|
||||
#endif //BT_OPENCL_ARRAY_H
|
||||
126
opencl/parallel_primitives/host/btPrefixScanCL.cpp
Normal file
126
opencl/parallel_primitives/host/btPrefixScanCL.cpp
Normal file
@@ -0,0 +1,126 @@
|
||||
#include "btPrefixScanCL.h"
|
||||
#include "btFillCL.h"
|
||||
#define BT_PREFIXSCAN_PROG_PATH "opencl/parallel_primitives/kernels/PrefixScanKernels.cl"
|
||||
|
||||
#include "btLauncherCL.h"
|
||||
#include "../../basic_initialize/btOpenCLUtils.h"
|
||||
#include "../kernels/PrefixScanKernelsCL.h"
|
||||
|
||||
btPrefixScanCL::btPrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size)
|
||||
:m_commandQueue(queue)
|
||||
{
|
||||
const char* scanKernelSource = prefixScanKernelsCL;
|
||||
cl_int pErrNum;
|
||||
char* additionalMacros=0;
|
||||
|
||||
m_workBuffer = new btOpenCLArray<unsigned int>(ctx,queue,size);
|
||||
cl_program scanProg = btOpenCLUtils::compileCLProgramFromString( ctx, device, scanKernelSource, &pErrNum,additionalMacros, BT_PREFIXSCAN_PROG_PATH);
|
||||
btAssert(scanProg);
|
||||
|
||||
m_localScanKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg,additionalMacros );
|
||||
btAssert(m_localScanKernel );
|
||||
m_blockSumKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg,additionalMacros );
|
||||
btAssert(m_blockSumKernel );
|
||||
m_propagationKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg,additionalMacros );
|
||||
btAssert(m_propagationKernel );
|
||||
}
|
||||
|
||||
|
||||
btPrefixScanCL::~btPrefixScanCL()
|
||||
{
|
||||
delete m_workBuffer;
|
||||
clReleaseKernel(m_localScanKernel);
|
||||
clReleaseKernel(m_blockSumKernel);
|
||||
clReleaseKernel(m_propagationKernel);
|
||||
}
|
||||
|
||||
template<class T>
|
||||
T btNextPowerOf2(T n)
|
||||
{
|
||||
n -= 1;
|
||||
for(int i=0; i<sizeof(T)*8; i++)
|
||||
n = n | (n>>i);
|
||||
return n+1;
|
||||
}
|
||||
|
||||
void btPrefixScanCL::execute(btOpenCLArray<unsigned int>& src, btOpenCLArray<unsigned int>& dst, int n, unsigned int* sum)
|
||||
{
|
||||
|
||||
// btAssert( data->m_option == EXCLUSIVE );
|
||||
const unsigned int numBlocks = (const unsigned int)( (n+BLOCK_SIZE*2-1)/(BLOCK_SIZE*2) );
|
||||
|
||||
dst.resize(src.size());
|
||||
m_workBuffer->resize(src.size());
|
||||
|
||||
btInt4 constBuffer;
|
||||
constBuffer.x = n;
|
||||
constBuffer.y = numBlocks;
|
||||
constBuffer.z = (int)btNextPowerOf2( numBlocks );
|
||||
|
||||
btOpenCLArray<unsigned int>* srcNative = &src;
|
||||
btOpenCLArray<unsigned int>* dstNative = &dst;
|
||||
|
||||
{
|
||||
btBufferInfoCL bInfo[] = { btBufferInfoCL( dstNative->getBufferCL() ), btBufferInfoCL( srcNative->getBufferCL() ), btBufferInfoCL( m_workBuffer->getBufferCL() ) };
|
||||
|
||||
btLauncherCL launcher( m_commandQueue, m_localScanKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
|
||||
launcher.setConst( constBuffer );
|
||||
launcher.launch1D( numBlocks*BLOCK_SIZE, BLOCK_SIZE );
|
||||
}
|
||||
|
||||
{
|
||||
btBufferInfoCL bInfo[] = { btBufferInfoCL( m_workBuffer->getBufferCL() ) };
|
||||
|
||||
btLauncherCL launcher( m_commandQueue, m_blockSumKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
|
||||
launcher.setConst( constBuffer );
|
||||
launcher.launch1D( BLOCK_SIZE, BLOCK_SIZE );
|
||||
}
|
||||
|
||||
|
||||
if( numBlocks > 1 )
|
||||
{
|
||||
btBufferInfoCL bInfo[] = { btBufferInfoCL( dstNative->getBufferCL() ), btBufferInfoCL( m_workBuffer->getBufferCL() ) };
|
||||
btLauncherCL launcher( m_commandQueue, m_propagationKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
|
||||
launcher.setConst( constBuffer );
|
||||
launcher.launch1D( (numBlocks-1)*BLOCK_SIZE, BLOCK_SIZE );
|
||||
}
|
||||
|
||||
|
||||
if( sum )
|
||||
{
|
||||
clFinish(m_commandQueue);
|
||||
dstNative->copyToHostPointer(sum,1,n-1,true);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
void btPrefixScanCL::executeHost(btAlignedObjectArray<unsigned int>& src, btAlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum)
|
||||
{
|
||||
unsigned int s = 0;
|
||||
//if( data->m_option == EXCLUSIVE )
|
||||
{
|
||||
for(int i=0; i<n; i++)
|
||||
{
|
||||
dst[i] = s;
|
||||
s += src[i];
|
||||
}
|
||||
}
|
||||
/*else
|
||||
{
|
||||
for(int i=0; i<n; i++)
|
||||
{
|
||||
s += hSrc[i];
|
||||
hDst[i] = s;
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
if( sum )
|
||||
{
|
||||
*sum = dst[n-1];
|
||||
}
|
||||
}
|
||||
37
opencl/parallel_primitives/host/btPrefixScanCL.h
Normal file
37
opencl/parallel_primitives/host/btPrefixScanCL.h
Normal file
@@ -0,0 +1,37 @@
|
||||
|
||||
#ifndef BT_PREFIX_SCAN_CL_H
|
||||
#define BT_PREFIX_SCAN_CL_H
|
||||
|
||||
#include "btOpenCLArray.h"
|
||||
#include "btBufferInfoCL.h"
|
||||
#include "btAlignedObjectArray.h"
|
||||
|
||||
class btPrefixScanCL
|
||||
{
|
||||
enum
|
||||
{
|
||||
BLOCK_SIZE = 128
|
||||
};
|
||||
|
||||
// Option m_option;
|
||||
|
||||
cl_command_queue m_commandQueue;
|
||||
|
||||
cl_kernel m_localScanKernel;
|
||||
cl_kernel m_blockSumKernel;
|
||||
cl_kernel m_propagationKernel;
|
||||
|
||||
btOpenCLArray<unsigned int>* m_workBuffer;
|
||||
|
||||
|
||||
public:
|
||||
|
||||
btPrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue,int size=0);
|
||||
|
||||
virtual ~btPrefixScanCL();
|
||||
|
||||
void execute(btOpenCLArray<unsigned int>& src, btOpenCLArray<unsigned int>& dst, int n, unsigned int* sum = 0);
|
||||
void executeHost(btAlignedObjectArray<unsigned int>& src, btAlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum);
|
||||
};
|
||||
|
||||
#endif //BT_PREFIX_SCAN_CL_H
|
||||
566
opencl/parallel_primitives/host/btQuickprof.cpp
Normal file
566
opencl/parallel_primitives/host/btQuickprof.cpp
Normal file
@@ -0,0 +1,566 @@
|
||||
/*
|
||||
|
||||
***************************************************************************************************
|
||||
**
|
||||
** profile.cpp
|
||||
**
|
||||
** Real-Time Hierarchical Profiling for Game Programming Gems 3
|
||||
**
|
||||
** by Greg Hjelstrom & Byon Garrabrant
|
||||
**
|
||||
***************************************************************************************************/
|
||||
|
||||
// Credits: The Clock class was inspired by the Timer classes in
|
||||
// Ogre (www.ogre3d.org).
|
||||
|
||||
#include "btQuickprof.h"
|
||||
|
||||
#ifndef BT_NO_PROFILE
|
||||
|
||||
|
||||
static btClock gProfileClock;
|
||||
|
||||
|
||||
#ifdef __CELLOS_LV2__
|
||||
#include <sys/sys_time.h>
|
||||
#include <sys/time_util.h>
|
||||
#include <stdio.h>
|
||||
#endif
|
||||
|
||||
#if defined (SUNOS) || defined (__SUNOS__)
|
||||
#include <stdio.h>
|
||||
#endif
|
||||
|
||||
#if defined(WIN32) || defined(_WIN32)
|
||||
|
||||
#define BT_USE_WINDOWS_TIMERS
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#define NOWINRES
|
||||
#define NOMCX
|
||||
#define NOIME
|
||||
|
||||
#ifdef _XBOX
|
||||
#include <Xtl.h>
|
||||
#else //_XBOX
|
||||
#include <windows.h>
|
||||
#endif //_XBOX
|
||||
|
||||
#include <time.h>
|
||||
|
||||
|
||||
#else //_WIN32
|
||||
#include <sys/time.h>
|
||||
#endif //_WIN32
|
||||
|
||||
#define mymin(a,b) (a > b ? a : b)
|
||||
|
||||
struct btClockData
|
||||
{
|
||||
|
||||
#ifdef BT_USE_WINDOWS_TIMERS
|
||||
LARGE_INTEGER mClockFrequency;
|
||||
DWORD mStartTick;
|
||||
LONGLONG mPrevElapsedTime;
|
||||
LARGE_INTEGER mStartTime;
|
||||
#else
|
||||
#ifdef __CELLOS_LV2__
|
||||
uint64_t mStartTime;
|
||||
#else
|
||||
struct timeval mStartTime;
|
||||
#endif
|
||||
#endif //__CELLOS_LV2__
|
||||
|
||||
};
|
||||
|
||||
///The btClock is a portable basic clock that measures accurate time in seconds, use for profiling.
|
||||
btClock::btClock()
|
||||
{
|
||||
m_data = new btClockData;
|
||||
#ifdef BT_USE_WINDOWS_TIMERS
|
||||
QueryPerformanceFrequency(&m_data->mClockFrequency);
|
||||
#endif
|
||||
reset();
|
||||
}
|
||||
|
||||
btClock::~btClock()
|
||||
{
|
||||
delete m_data;
|
||||
}
|
||||
|
||||
btClock::btClock(const btClock& other)
|
||||
{
|
||||
m_data = new btClockData;
|
||||
*m_data = *other.m_data;
|
||||
}
|
||||
|
||||
btClock& btClock::operator=(const btClock& other)
|
||||
{
|
||||
*m_data = *other.m_data;
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
/// Resets the initial reference time.
|
||||
void btClock::reset()
|
||||
{
|
||||
#ifdef BT_USE_WINDOWS_TIMERS
|
||||
QueryPerformanceCounter(&m_data->mStartTime);
|
||||
m_data->mStartTick = GetTickCount();
|
||||
m_data->mPrevElapsedTime = 0;
|
||||
#else
|
||||
#ifdef __CELLOS_LV2__
|
||||
|
||||
typedef uint64_t ClockSize;
|
||||
ClockSize newTime;
|
||||
//__asm __volatile__( "mftb %0" : "=r" (newTime) : : "memory");
|
||||
SYS_TIMEBASE_GET( newTime );
|
||||
m_data->mStartTime = newTime;
|
||||
#else
|
||||
gettimeofday(&m_data->mStartTime, 0);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
/// Returns the time in ms since the last call to reset or since
|
||||
/// the btClock was created.
|
||||
unsigned long int btClock::getTimeMilliseconds()
|
||||
{
|
||||
#ifdef BT_USE_WINDOWS_TIMERS
|
||||
LARGE_INTEGER currentTime;
|
||||
QueryPerformanceCounter(¤tTime);
|
||||
LONGLONG elapsedTime = currentTime.QuadPart -
|
||||
m_data->mStartTime.QuadPart;
|
||||
// Compute the number of millisecond ticks elapsed.
|
||||
unsigned long msecTicks = (unsigned long)(1000 * elapsedTime /
|
||||
m_data->mClockFrequency.QuadPart);
|
||||
// Check for unexpected leaps in the Win32 performance counter.
|
||||
// (This is caused by unexpected data across the PCI to ISA
|
||||
// bridge, aka south bridge. See Microsoft KB274323.)
|
||||
unsigned long elapsedTicks = GetTickCount() - m_data->mStartTick;
|
||||
signed long msecOff = (signed long)(msecTicks - elapsedTicks);
|
||||
if (msecOff < -100 || msecOff > 100)
|
||||
{
|
||||
// Adjust the starting time forwards.
|
||||
LONGLONG msecAdjustment = mymin(msecOff *
|
||||
m_data->mClockFrequency.QuadPart / 1000, elapsedTime -
|
||||
m_data->mPrevElapsedTime);
|
||||
m_data->mStartTime.QuadPart += msecAdjustment;
|
||||
elapsedTime -= msecAdjustment;
|
||||
|
||||
// Recompute the number of millisecond ticks elapsed.
|
||||
msecTicks = (unsigned long)(1000 * elapsedTime /
|
||||
m_data->mClockFrequency.QuadPart);
|
||||
}
|
||||
|
||||
// Store the current elapsed time for adjustments next time.
|
||||
m_data->mPrevElapsedTime = elapsedTime;
|
||||
|
||||
return msecTicks;
|
||||
#else
|
||||
|
||||
#ifdef __CELLOS_LV2__
|
||||
uint64_t freq=sys_time_get_timebase_frequency();
|
||||
double dFreq=((double) freq) / 1000.0;
|
||||
typedef uint64_t ClockSize;
|
||||
ClockSize newTime;
|
||||
SYS_TIMEBASE_GET( newTime );
|
||||
//__asm __volatile__( "mftb %0" : "=r" (newTime) : : "memory");
|
||||
|
||||
return (unsigned long int)((double(newTime-m_data->mStartTime)) / dFreq);
|
||||
#else
|
||||
|
||||
struct timeval currentTime;
|
||||
gettimeofday(¤tTime, 0);
|
||||
return (currentTime.tv_sec - m_data->mStartTime.tv_sec) * 1000 +
|
||||
(currentTime.tv_usec - m_data->mStartTime.tv_usec) / 1000;
|
||||
#endif //__CELLOS_LV2__
|
||||
#endif
|
||||
}
|
||||
|
||||
/// Returns the time in us since the last call to reset or since
|
||||
/// the Clock was created.
|
||||
unsigned long int btClock::getTimeMicroseconds()
|
||||
{
|
||||
#ifdef BT_USE_WINDOWS_TIMERS
|
||||
LARGE_INTEGER currentTime;
|
||||
QueryPerformanceCounter(¤tTime);
|
||||
LONGLONG elapsedTime = currentTime.QuadPart -
|
||||
m_data->mStartTime.QuadPart;
|
||||
|
||||
// Compute the number of millisecond ticks elapsed.
|
||||
unsigned long msecTicks = (unsigned long)(1000 * elapsedTime /
|
||||
m_data->mClockFrequency.QuadPart);
|
||||
|
||||
// Check for unexpected leaps in the Win32 performance counter.
|
||||
// (This is caused by unexpected data across the PCI to ISA
|
||||
// bridge, aka south bridge. See Microsoft KB274323.)
|
||||
unsigned long elapsedTicks = GetTickCount() - m_data->mStartTick;
|
||||
signed long msecOff = (signed long)(msecTicks - elapsedTicks);
|
||||
if (msecOff < -100 || msecOff > 100)
|
||||
{
|
||||
// Adjust the starting time forwards.
|
||||
LONGLONG msecAdjustment = mymin(msecOff *
|
||||
m_data->mClockFrequency.QuadPart / 1000, elapsedTime -
|
||||
m_data->mPrevElapsedTime);
|
||||
m_data->mStartTime.QuadPart += msecAdjustment;
|
||||
elapsedTime -= msecAdjustment;
|
||||
}
|
||||
|
||||
// Store the current elapsed time for adjustments next time.
|
||||
m_data->mPrevElapsedTime = elapsedTime;
|
||||
|
||||
// Convert to microseconds.
|
||||
unsigned long usecTicks = (unsigned long)(1000000 * elapsedTime /
|
||||
m_data->mClockFrequency.QuadPart);
|
||||
|
||||
return usecTicks;
|
||||
#else
|
||||
|
||||
#ifdef __CELLOS_LV2__
|
||||
uint64_t freq=sys_time_get_timebase_frequency();
|
||||
double dFreq=((double) freq)/ 1000000.0;
|
||||
typedef uint64_t ClockSize;
|
||||
ClockSize newTime;
|
||||
//__asm __volatile__( "mftb %0" : "=r" (newTime) : : "memory");
|
||||
SYS_TIMEBASE_GET( newTime );
|
||||
|
||||
return (unsigned long int)((double(newTime-m_data->mStartTime)) / dFreq);
|
||||
#else
|
||||
|
||||
struct timeval currentTime;
|
||||
gettimeofday(¤tTime, 0);
|
||||
return (currentTime.tv_sec - m_data->mStartTime.tv_sec) * 1000000 +
|
||||
(currentTime.tv_usec - m_data->mStartTime.tv_usec);
|
||||
#endif//__CELLOS_LV2__
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
inline void Profile_Get_Ticks(unsigned long int * ticks)
|
||||
{
|
||||
*ticks = gProfileClock.getTimeMicroseconds();
|
||||
}
|
||||
|
||||
inline float Profile_Get_Tick_Rate(void)
|
||||
{
|
||||
// return 1000000.f;
|
||||
return 1000.f;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
/***************************************************************************************************
|
||||
**
|
||||
** CProfileNode
|
||||
**
|
||||
***************************************************************************************************/
|
||||
|
||||
/***********************************************************************************************
|
||||
* INPUT: *
|
||||
* name - pointer to a static string which is the name of this profile node *
|
||||
* parent - parent pointer *
|
||||
* *
|
||||
* WARNINGS: *
|
||||
* The name is assumed to be a static pointer, only the pointer is stored and compared for *
|
||||
* efficiency reasons. *
|
||||
*=============================================================================================*/
|
||||
CProfileNode::CProfileNode( const char * name, CProfileNode * parent ) :
|
||||
Name( name ),
|
||||
TotalCalls( 0 ),
|
||||
TotalTime( 0 ),
|
||||
StartTime( 0 ),
|
||||
RecursionCounter( 0 ),
|
||||
Parent( parent ),
|
||||
Child( NULL ),
|
||||
Sibling( NULL ),
|
||||
m_userPtr(0)
|
||||
{
|
||||
Reset();
|
||||
}
|
||||
|
||||
|
||||
void CProfileNode::CleanupMemory()
|
||||
{
|
||||
delete ( Child);
|
||||
Child = NULL;
|
||||
delete ( Sibling);
|
||||
Sibling = NULL;
|
||||
}
|
||||
|
||||
CProfileNode::~CProfileNode( void )
|
||||
{
|
||||
delete ( Child);
|
||||
delete ( Sibling);
|
||||
}
|
||||
|
||||
|
||||
/***********************************************************************************************
|
||||
* INPUT: *
|
||||
* name - static string pointer to the name of the node we are searching for *
|
||||
* *
|
||||
* WARNINGS: *
|
||||
* All profile names are assumed to be static strings so this function uses pointer compares *
|
||||
* to find the named node. *
|
||||
*=============================================================================================*/
|
||||
CProfileNode * CProfileNode::Get_Sub_Node( const char * name )
|
||||
{
|
||||
// Try to find this sub node
|
||||
CProfileNode * child = Child;
|
||||
while ( child ) {
|
||||
if ( child->Name == name ) {
|
||||
return child;
|
||||
}
|
||||
child = child->Sibling;
|
||||
}
|
||||
|
||||
// We didn't find it, so add it
|
||||
|
||||
CProfileNode * node = new CProfileNode( name, this );
|
||||
node->Sibling = Child;
|
||||
Child = node;
|
||||
return node;
|
||||
}
|
||||
|
||||
|
||||
void CProfileNode::Reset( void )
|
||||
{
|
||||
TotalCalls = 0;
|
||||
TotalTime = 0.0f;
|
||||
|
||||
|
||||
if ( Child ) {
|
||||
Child->Reset();
|
||||
}
|
||||
if ( Sibling ) {
|
||||
Sibling->Reset();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void CProfileNode::Call( void )
|
||||
{
|
||||
TotalCalls++;
|
||||
if (RecursionCounter++ == 0) {
|
||||
Profile_Get_Ticks(&StartTime);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
bool CProfileNode::Return( void )
|
||||
{
|
||||
if ( --RecursionCounter == 0 && TotalCalls != 0 ) {
|
||||
unsigned long int time;
|
||||
Profile_Get_Ticks(&time);
|
||||
time-=StartTime;
|
||||
TotalTime += (float)time / Profile_Get_Tick_Rate();
|
||||
}
|
||||
return ( RecursionCounter == 0 );
|
||||
}
|
||||
|
||||
|
||||
/***************************************************************************************************
|
||||
**
|
||||
** CProfileIterator
|
||||
**
|
||||
***************************************************************************************************/
|
||||
CProfileIterator::CProfileIterator( CProfileNode * start )
|
||||
{
|
||||
CurrentParent = start;
|
||||
CurrentChild = CurrentParent->Get_Child();
|
||||
}
|
||||
|
||||
|
||||
void CProfileIterator::First(void)
|
||||
{
|
||||
CurrentChild = CurrentParent->Get_Child();
|
||||
}
|
||||
|
||||
|
||||
void CProfileIterator::Next(void)
|
||||
{
|
||||
CurrentChild = CurrentChild->Get_Sibling();
|
||||
}
|
||||
|
||||
|
||||
bool CProfileIterator::Is_Done(void)
|
||||
{
|
||||
return CurrentChild == NULL;
|
||||
}
|
||||
|
||||
|
||||
void CProfileIterator::Enter_Child( int index )
|
||||
{
|
||||
CurrentChild = CurrentParent->Get_Child();
|
||||
while ( (CurrentChild != NULL) && (index != 0) ) {
|
||||
index--;
|
||||
CurrentChild = CurrentChild->Get_Sibling();
|
||||
}
|
||||
|
||||
if ( CurrentChild != NULL ) {
|
||||
CurrentParent = CurrentChild;
|
||||
CurrentChild = CurrentParent->Get_Child();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void CProfileIterator::Enter_Parent( void )
|
||||
{
|
||||
if ( CurrentParent->Get_Parent() != NULL ) {
|
||||
CurrentParent = CurrentParent->Get_Parent();
|
||||
}
|
||||
CurrentChild = CurrentParent->Get_Child();
|
||||
}
|
||||
|
||||
|
||||
/***************************************************************************************************
|
||||
**
|
||||
** CProfileManager
|
||||
**
|
||||
***************************************************************************************************/
|
||||
|
||||
CProfileNode CProfileManager::Root( "Root", NULL );
|
||||
CProfileNode * CProfileManager::CurrentNode = &CProfileManager::Root;
|
||||
int CProfileManager::FrameCounter = 0;
|
||||
unsigned long int CProfileManager::ResetTime = 0;
|
||||
|
||||
|
||||
/***********************************************************************************************
|
||||
* CProfileManager::Start_Profile -- Begin a named profile *
|
||||
* *
|
||||
* Steps one level deeper into the tree, if a child already exists with the specified name *
|
||||
* then it accumulates the profiling; otherwise a new child node is added to the profile tree. *
|
||||
* *
|
||||
* INPUT: *
|
||||
* name - name of this profiling record *
|
||||
* *
|
||||
* WARNINGS: *
|
||||
* The string used is assumed to be a static string; pointer compares are used throughout *
|
||||
* the profiling code for efficiency. *
|
||||
*=============================================================================================*/
|
||||
void CProfileManager::Start_Profile( const char * name )
|
||||
{
|
||||
if (name != CurrentNode->Get_Name()) {
|
||||
CurrentNode = CurrentNode->Get_Sub_Node( name );
|
||||
}
|
||||
|
||||
CurrentNode->Call();
|
||||
}
|
||||
|
||||
|
||||
/***********************************************************************************************
|
||||
* CProfileManager::Stop_Profile -- Stop timing and record the results. *
|
||||
*=============================================================================================*/
|
||||
void CProfileManager::Stop_Profile( void )
|
||||
{
|
||||
// Return will indicate whether we should back up to our parent (we may
|
||||
// be profiling a recursive function)
|
||||
if (CurrentNode->Return()) {
|
||||
CurrentNode = CurrentNode->Get_Parent();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/***********************************************************************************************
|
||||
* CProfileManager::Reset -- Reset the contents of the profiling system *
|
||||
* *
|
||||
* This resets everything except for the tree structure. All of the timing data is reset. *
|
||||
*=============================================================================================*/
|
||||
void CProfileManager::Reset( void )
|
||||
{
|
||||
gProfileClock.reset();
|
||||
Root.Reset();
|
||||
Root.Call();
|
||||
FrameCounter = 0;
|
||||
Profile_Get_Ticks(&ResetTime);
|
||||
}
|
||||
|
||||
|
||||
/***********************************************************************************************
|
||||
* CProfileManager::Increment_Frame_Counter -- Increment the frame counter *
|
||||
*=============================================================================================*/
|
||||
void CProfileManager::Increment_Frame_Counter( void )
|
||||
{
|
||||
FrameCounter++;
|
||||
}
|
||||
|
||||
|
||||
/***********************************************************************************************
|
||||
* CProfileManager::Get_Time_Since_Reset -- returns the elapsed time since last reset *
|
||||
*=============================================================================================*/
|
||||
float CProfileManager::Get_Time_Since_Reset( void )
|
||||
{
|
||||
unsigned long int time;
|
||||
Profile_Get_Ticks(&time);
|
||||
time -= ResetTime;
|
||||
return (float)time / Profile_Get_Tick_Rate();
|
||||
}
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
void CProfileManager::dumpRecursive(CProfileIterator* profileIterator, int spacing)
|
||||
{
|
||||
profileIterator->First();
|
||||
if (profileIterator->Is_Done())
|
||||
return;
|
||||
|
||||
float accumulated_time=0,parent_time = profileIterator->Is_Root() ? CProfileManager::Get_Time_Since_Reset() : profileIterator->Get_Current_Parent_Total_Time();
|
||||
int i;
|
||||
int frames_since_reset = CProfileManager::Get_Frame_Count_Since_Reset();
|
||||
for (i=0;i<spacing;i++) printf(".");
|
||||
printf("----------------------------------\n");
|
||||
for (i=0;i<spacing;i++) printf(".");
|
||||
printf("Profiling: %s (total running time: %.3f ms) ---\n", profileIterator->Get_Current_Parent_Name(), parent_time );
|
||||
float totalTime = 0.f;
|
||||
|
||||
|
||||
int numChildren = 0;
|
||||
|
||||
for (i = 0; !profileIterator->Is_Done(); i++,profileIterator->Next())
|
||||
{
|
||||
numChildren++;
|
||||
float current_total_time = profileIterator->Get_Current_Total_Time();
|
||||
accumulated_time += current_total_time;
|
||||
float fraction = parent_time > SIMD_EPSILON ? (current_total_time / parent_time) * 100 : 0.f;
|
||||
{
|
||||
int i; for (i=0;i<spacing;i++) printf(".");
|
||||
}
|
||||
printf("%d -- %s (%.2f %%) :: %.3f ms / frame (%d calls)\n",i, profileIterator->Get_Current_Name(), fraction,(current_total_time / (double)frames_since_reset),profileIterator->Get_Current_Total_Calls());
|
||||
totalTime += current_total_time;
|
||||
//recurse into children
|
||||
}
|
||||
|
||||
if (parent_time < accumulated_time)
|
||||
{
|
||||
printf("what's wrong\n");
|
||||
}
|
||||
for (i=0;i<spacing;i++) printf(".");
|
||||
printf("%s (%.3f %%) :: %.3f ms\n", "Unaccounted:",parent_time > SIMD_EPSILON ? ((parent_time - accumulated_time) / parent_time) * 100 : 0.f, parent_time - accumulated_time);
|
||||
|
||||
for (i=0;i<numChildren;i++)
|
||||
{
|
||||
profileIterator->Enter_Child(i);
|
||||
dumpRecursive(profileIterator,spacing+3);
|
||||
profileIterator->Enter_Parent();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
void CProfileManager::dumpAll()
|
||||
{
|
||||
CProfileIterator* profileIterator = 0;
|
||||
profileIterator = CProfileManager::Get_Iterator();
|
||||
|
||||
dumpRecursive(profileIterator,0);
|
||||
|
||||
CProfileManager::Release_Iterator(profileIterator);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
#endif //BT_NO_PROFILE
|
||||
203
opencl/parallel_primitives/host/btQuickprof.h
Normal file
203
opencl/parallel_primitives/host/btQuickprof.h
Normal file
@@ -0,0 +1,203 @@
|
||||
|
||||
/***************************************************************************************************
|
||||
**
|
||||
** Real-Time Hierarchical Profiling for Game Programming Gems 3
|
||||
**
|
||||
** by Greg Hjelstrom & Byon Garrabrant
|
||||
**
|
||||
***************************************************************************************************/
|
||||
|
||||
// Credits: The Clock class was inspired by the Timer classes in
|
||||
// Ogre (www.ogre3d.org).
|
||||
|
||||
|
||||
|
||||
#ifndef BT_QUICK_PROF_H
|
||||
#define BT_QUICK_PROF_H
|
||||
|
||||
//To disable built-in profiling, please comment out next line
|
||||
//#define BT_NO_PROFILE 1
|
||||
#ifndef BT_NO_PROFILE
|
||||
#include <stdio.h>//@todo remove this, backwards compatibility
|
||||
#include "btScalar.h"
|
||||
#include "btAlignedAllocator.h"
|
||||
#include <new>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#define USE_BT_CLOCK 1
|
||||
|
||||
#ifdef USE_BT_CLOCK
|
||||
|
||||
///The btClock is a portable basic clock that measures accurate time in seconds, use for profiling.
|
||||
class btClock
|
||||
{
|
||||
public:
|
||||
btClock();
|
||||
|
||||
btClock(const btClock& other);
|
||||
btClock& operator=(const btClock& other);
|
||||
|
||||
~btClock();
|
||||
|
||||
/// Resets the initial reference time.
|
||||
void reset();
|
||||
|
||||
/// Returns the time in ms since the last call to reset or since
|
||||
/// the btClock was created.
|
||||
unsigned long int getTimeMilliseconds();
|
||||
|
||||
/// Returns the time in us since the last call to reset or since
|
||||
/// the Clock was created.
|
||||
unsigned long int getTimeMicroseconds();
|
||||
private:
|
||||
struct btClockData* m_data;
|
||||
};
|
||||
|
||||
#endif //USE_BT_CLOCK
|
||||
|
||||
|
||||
|
||||
|
||||
///A node in the Profile Hierarchy Tree
|
||||
class CProfileNode {
|
||||
|
||||
public:
|
||||
CProfileNode( const char * name, CProfileNode * parent );
|
||||
~CProfileNode( void );
|
||||
|
||||
CProfileNode * Get_Sub_Node( const char * name );
|
||||
|
||||
CProfileNode * Get_Parent( void ) { return Parent; }
|
||||
CProfileNode * Get_Sibling( void ) { return Sibling; }
|
||||
CProfileNode * Get_Child( void ) { return Child; }
|
||||
|
||||
void CleanupMemory();
|
||||
void Reset( void );
|
||||
void Call( void );
|
||||
bool Return( void );
|
||||
|
||||
const char * Get_Name( void ) { return Name; }
|
||||
int Get_Total_Calls( void ) { return TotalCalls; }
|
||||
float Get_Total_Time( void ) { return TotalTime; }
|
||||
void* GetUserPointer() const {return m_userPtr;}
|
||||
void SetUserPointer(void* ptr) { m_userPtr = ptr;}
|
||||
protected:
|
||||
|
||||
const char * Name;
|
||||
int TotalCalls;
|
||||
float TotalTime;
|
||||
unsigned long int StartTime;
|
||||
int RecursionCounter;
|
||||
|
||||
CProfileNode * Parent;
|
||||
CProfileNode * Child;
|
||||
CProfileNode * Sibling;
|
||||
void* m_userPtr;
|
||||
};
|
||||
|
||||
///An iterator to navigate through the tree
|
||||
class CProfileIterator
|
||||
{
|
||||
public:
|
||||
// Access all the children of the current parent
|
||||
void First(void);
|
||||
void Next(void);
|
||||
bool Is_Done(void);
|
||||
bool Is_Root(void) { return (CurrentParent->Get_Parent() == 0); }
|
||||
|
||||
void Enter_Child( int index ); // Make the given child the new parent
|
||||
void Enter_Largest_Child( void ); // Make the largest child the new parent
|
||||
void Enter_Parent( void ); // Make the current parent's parent the new parent
|
||||
|
||||
// Access the current child
|
||||
const char * Get_Current_Name( void ) { return CurrentChild->Get_Name(); }
|
||||
int Get_Current_Total_Calls( void ) { return CurrentChild->Get_Total_Calls(); }
|
||||
float Get_Current_Total_Time( void ) { return CurrentChild->Get_Total_Time(); }
|
||||
|
||||
void* Get_Current_UserPointer( void ) { return CurrentChild->GetUserPointer(); }
|
||||
void Set_Current_UserPointer(void* ptr) {CurrentChild->SetUserPointer(ptr);}
|
||||
// Access the current parent
|
||||
const char * Get_Current_Parent_Name( void ) { return CurrentParent->Get_Name(); }
|
||||
int Get_Current_Parent_Total_Calls( void ) { return CurrentParent->Get_Total_Calls(); }
|
||||
float Get_Current_Parent_Total_Time( void ) { return CurrentParent->Get_Total_Time(); }
|
||||
|
||||
|
||||
|
||||
protected:
|
||||
|
||||
CProfileNode * CurrentParent;
|
||||
CProfileNode * CurrentChild;
|
||||
|
||||
|
||||
CProfileIterator( CProfileNode * start );
|
||||
friend class CProfileManager;
|
||||
};
|
||||
|
||||
|
||||
///The Manager for the Profile system
|
||||
class CProfileManager {
|
||||
public:
|
||||
static void Start_Profile( const char * name );
|
||||
static void Stop_Profile( void );
|
||||
|
||||
static void CleanupMemory(void)
|
||||
{
|
||||
Root.CleanupMemory();
|
||||
}
|
||||
|
||||
static void Reset( void );
|
||||
static void Increment_Frame_Counter( void );
|
||||
static int Get_Frame_Count_Since_Reset( void ) { return FrameCounter; }
|
||||
static float Get_Time_Since_Reset( void );
|
||||
|
||||
static CProfileIterator * Get_Iterator( void )
|
||||
{
|
||||
|
||||
return new CProfileIterator( &Root );
|
||||
}
|
||||
static void Release_Iterator( CProfileIterator * iterator ) { delete ( iterator); }
|
||||
|
||||
static void dumpRecursive(CProfileIterator* profileIterator, int spacing);
|
||||
|
||||
static void dumpAll();
|
||||
|
||||
private:
|
||||
static CProfileNode Root;
|
||||
static CProfileNode * CurrentNode;
|
||||
static int FrameCounter;
|
||||
static unsigned long int ResetTime;
|
||||
};
|
||||
|
||||
|
||||
///ProfileSampleClass is a simple way to profile a function's scope
|
||||
///Use the BT_PROFILE macro at the start of scope to time
|
||||
class CProfileSample {
|
||||
public:
|
||||
CProfileSample( const char * name )
|
||||
{
|
||||
CProfileManager::Start_Profile( name );
|
||||
}
|
||||
|
||||
~CProfileSample( void )
|
||||
{
|
||||
CProfileManager::Stop_Profile();
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
#define BT_PROFILE( name ) CProfileSample __profile( name )
|
||||
|
||||
#else
|
||||
|
||||
#define BT_PROFILE( name )
|
||||
|
||||
#endif //#ifndef BT_NO_PROFILE
|
||||
|
||||
|
||||
|
||||
#endif //BT_QUICK_PROF_H
|
||||
|
||||
|
||||
712
opencl/parallel_primitives/host/btRadixSort32CL.cpp
Normal file
712
opencl/parallel_primitives/host/btRadixSort32CL.cpp
Normal file
@@ -0,0 +1,712 @@
|
||||
|
||||
#include "btRadixSort32CL.h"
|
||||
#include "btLauncherCL.h"
|
||||
#include "../../basic_initialize/btOpenCLUtils.h"
|
||||
#include "btPrefixScanCL.h"
|
||||
#include "btFillCL.h"
|
||||
|
||||
#define RADIXSORT32_PATH "opencl/parallel_primitives/kernels/RadixSort32Kernels.cl"
|
||||
|
||||
#include "../kernels/RadixSort32KernelsCL.h"
|
||||
|
||||
btRadixSort32CL::btRadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity)
|
||||
:m_commandQueue(queue)
|
||||
{
|
||||
btOpenCLDeviceInfo info;
|
||||
btOpenCLUtils::getDeviceInfo(device,&info);
|
||||
m_deviceCPU = (info.m_deviceType & CL_DEVICE_TYPE_CPU)!=0;
|
||||
|
||||
m_workBuffer1 = new btOpenCLArray<unsigned int>(ctx,queue);
|
||||
m_workBuffer2 = new btOpenCLArray<unsigned int>(ctx,queue);
|
||||
m_workBuffer3 = new btOpenCLArray<btSortData>(ctx,queue);
|
||||
m_workBuffer3a = new btOpenCLArray<unsigned int>(ctx,queue);
|
||||
m_workBuffer4 = new btOpenCLArray<btSortData>(ctx,queue);
|
||||
m_workBuffer4a = new btOpenCLArray<unsigned int>(ctx,queue);
|
||||
|
||||
|
||||
if (initialCapacity>0)
|
||||
{
|
||||
m_workBuffer1->resize(initialCapacity);
|
||||
m_workBuffer3->resize(initialCapacity);
|
||||
m_workBuffer3a->resize(initialCapacity);
|
||||
m_workBuffer4->resize(initialCapacity);
|
||||
m_workBuffer4a->resize(initialCapacity);
|
||||
}
|
||||
|
||||
m_scan = new btPrefixScanCL(ctx,device,queue);
|
||||
m_fill = new btFillCL(ctx,device,queue);
|
||||
|
||||
const char* additionalMacros = "";
|
||||
const char* srcFileNameForCaching="";
|
||||
|
||||
cl_int pErrNum;
|
||||
const char* kernelSource = radixSort32KernelsCL;
|
||||
|
||||
cl_program sortProg = btOpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, RADIXSORT32_PATH);
|
||||
btAssert(sortProg);
|
||||
|
||||
m_streamCountSortDataKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "StreamCountSortDataKernel", &pErrNum, sortProg,additionalMacros );
|
||||
btAssert(m_streamCountSortDataKernel );
|
||||
|
||||
|
||||
|
||||
m_streamCountKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "StreamCountKernel", &pErrNum, sortProg,additionalMacros );
|
||||
btAssert(m_streamCountKernel);
|
||||
|
||||
|
||||
|
||||
if (m_deviceCPU)
|
||||
{
|
||||
|
||||
m_sortAndScatterSortDataKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterSortDataKernelSerial", &pErrNum, sortProg,additionalMacros );
|
||||
btAssert(m_sortAndScatterSortDataKernel);
|
||||
m_sortAndScatterKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterKernelSerial", &pErrNum, sortProg,additionalMacros );
|
||||
btAssert(m_sortAndScatterKernel);
|
||||
} else
|
||||
{
|
||||
m_sortAndScatterSortDataKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterSortDataKernel", &pErrNum, sortProg,additionalMacros );
|
||||
btAssert(m_sortAndScatterSortDataKernel);
|
||||
m_sortAndScatterKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterKernel", &pErrNum, sortProg,additionalMacros );
|
||||
btAssert(m_sortAndScatterKernel);
|
||||
}
|
||||
|
||||
m_prefixScanKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "PrefixScanKernel", &pErrNum, sortProg,additionalMacros );
|
||||
btAssert(m_prefixScanKernel);
|
||||
|
||||
}
|
||||
|
||||
btRadixSort32CL::~btRadixSort32CL()
|
||||
{
|
||||
delete m_scan;
|
||||
delete m_fill;
|
||||
delete m_workBuffer1;
|
||||
delete m_workBuffer2;
|
||||
delete m_workBuffer3;
|
||||
delete m_workBuffer3a;
|
||||
delete m_workBuffer4;
|
||||
delete m_workBuffer4a;
|
||||
|
||||
clReleaseKernel(m_streamCountSortDataKernel);
|
||||
clReleaseKernel(m_streamCountKernel);
|
||||
clReleaseKernel(m_sortAndScatterSortDataKernel);
|
||||
clReleaseKernel(m_sortAndScatterKernel);
|
||||
clReleaseKernel(m_prefixScanKernel);
|
||||
}
|
||||
|
||||
void btRadixSort32CL::executeHost(btAlignedObjectArray<btSortData>& inout, int sortBits /* = 32 */)
|
||||
{
|
||||
int n = inout.size();
|
||||
const int BITS_PER_PASS = 8;
|
||||
const int NUM_TABLES = (1<<BITS_PER_PASS);
|
||||
|
||||
|
||||
int tables[NUM_TABLES];
|
||||
int counter[NUM_TABLES];
|
||||
|
||||
btSortData* src = &inout[0];
|
||||
btAlignedObjectArray<btSortData> workbuffer;
|
||||
workbuffer.resize(inout.size());
|
||||
btSortData* dst = &workbuffer[0];
|
||||
|
||||
int count=0;
|
||||
for(int startBit=0; startBit<sortBits; startBit+=BITS_PER_PASS)
|
||||
{
|
||||
for(int i=0; i<NUM_TABLES; i++)
|
||||
{
|
||||
tables[i] = 0;
|
||||
}
|
||||
|
||||
for(int i=0; i<n; i++)
|
||||
{
|
||||
int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES-1);
|
||||
tables[tableIdx]++;
|
||||
}
|
||||
//#define TEST
|
||||
#ifdef TEST
|
||||
printf("histogram size=%d\n",NUM_TABLES);
|
||||
for (int i=0;i<NUM_TABLES;i++)
|
||||
{
|
||||
if (tables[i]!=0)
|
||||
{
|
||||
printf("tables[%d]=%d]\n",i,tables[i]);
|
||||
}
|
||||
|
||||
}
|
||||
#endif //TEST
|
||||
// prefix scan
|
||||
int sum = 0;
|
||||
for(int i=0; i<NUM_TABLES; i++)
|
||||
{
|
||||
int iData = tables[i];
|
||||
tables[i] = sum;
|
||||
sum += iData;
|
||||
counter[i] = 0;
|
||||
}
|
||||
|
||||
// distribute
|
||||
for(int i=0; i<n; i++)
|
||||
{
|
||||
int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES-1);
|
||||
|
||||
dst[tables[tableIdx] + counter[tableIdx]] = src[i];
|
||||
counter[tableIdx] ++;
|
||||
}
|
||||
|
||||
btSwap( src, dst );
|
||||
count++;
|
||||
}
|
||||
|
||||
if (count&1)
|
||||
{
|
||||
btAssert(0);//need to copy
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
void btRadixSort32CL::executeHost(btOpenCLArray<btSortData>& keyValuesInOut, int sortBits /* = 32 */)
|
||||
{
|
||||
|
||||
btAlignedObjectArray<btSortData> inout;
|
||||
keyValuesInOut.copyToHost(inout);
|
||||
|
||||
executeHost(inout,sortBits);
|
||||
|
||||
keyValuesInOut.copyFromHost(inout);
|
||||
}
|
||||
|
||||
void btRadixSort32CL::execute(btOpenCLArray<unsigned int>& keysIn, btOpenCLArray<unsigned int>& keysOut, btOpenCLArray<unsigned int>& valuesIn,
|
||||
btOpenCLArray<unsigned int>& valuesOut, int n, int sortBits)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
//#define DEBUG_RADIXSORT
|
||||
//#define DEBUG_RADIXSORT2
|
||||
|
||||
|
||||
void btRadixSort32CL::execute(btOpenCLArray<btSortData>& keyValuesInOut, int sortBits /* = 32 */)
|
||||
{
|
||||
|
||||
int originalSize = keyValuesInOut.size();
|
||||
int workingSize = originalSize;
|
||||
|
||||
|
||||
int dataAlignment = DATA_ALIGNMENT;
|
||||
|
||||
#ifdef DEBUG_RADIXSORT2
|
||||
btAlignedObjectArray<btSortData> test2;
|
||||
keyValuesInOut.copyToHost(test2);
|
||||
printf("numElem = %d\n",test2.size());
|
||||
for (int i=0;i<test2.size();i++)
|
||||
{
|
||||
printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
|
||||
printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
|
||||
}
|
||||
#endif //DEBUG_RADIXSORT2
|
||||
|
||||
btOpenCLArray<btSortData>* src = 0;
|
||||
|
||||
if (workingSize%dataAlignment)
|
||||
{
|
||||
workingSize += dataAlignment-(workingSize%dataAlignment);
|
||||
m_workBuffer4->copyFromOpenCLArray(keyValuesInOut);
|
||||
m_workBuffer4->resize(workingSize);
|
||||
btSortData fillValue;
|
||||
fillValue.m_key = 0xffffffff;
|
||||
fillValue.m_value = 0xffffffff;
|
||||
|
||||
#define USE_BTFILL
|
||||
#ifdef USE_BTFILL
|
||||
m_fill->execute((btOpenCLArray<btInt2>&)*m_workBuffer4,(btInt2&)fillValue,workingSize-originalSize,originalSize);
|
||||
#else
|
||||
//fill the remaining bits (very slow way, todo: fill on GPU/OpenCL side)
|
||||
|
||||
for (int i=originalSize; i<workingSize;i++)
|
||||
{
|
||||
m_workBuffer4->copyFromHostPointer(&fillValue,1,i);
|
||||
}
|
||||
#endif//USE_BTFILL
|
||||
|
||||
src = m_workBuffer4;
|
||||
} else
|
||||
{
|
||||
src = &keyValuesInOut;
|
||||
m_workBuffer4->resize(0);
|
||||
}
|
||||
|
||||
btAssert( workingSize%DATA_ALIGNMENT == 0 );
|
||||
int minCap = NUM_BUCKET*NUM_WGS;
|
||||
|
||||
|
||||
int n = workingSize;
|
||||
|
||||
m_workBuffer1->resize(minCap);
|
||||
m_workBuffer3->resize(workingSize);
|
||||
|
||||
|
||||
// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
|
||||
btAssert( BITS_PER_PASS == 4 );
|
||||
btAssert( WG_SIZE == 64 );
|
||||
btAssert( (sortBits&0x3) == 0 );
|
||||
|
||||
|
||||
|
||||
btOpenCLArray<btSortData>* dst = m_workBuffer3;
|
||||
|
||||
btOpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
|
||||
btOpenCLArray<unsigned int>* destHisto = m_workBuffer2;
|
||||
|
||||
|
||||
int nWGs = NUM_WGS;
|
||||
btConstData cdata;
|
||||
|
||||
{
|
||||
int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256
|
||||
int nBlocks = (n+blockSize-1)/(blockSize);
|
||||
cdata.m_n = n;
|
||||
cdata.m_nWGs = NUM_WGS;
|
||||
cdata.m_startBit = 0;
|
||||
cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs;
|
||||
if( nBlocks < NUM_WGS )
|
||||
{
|
||||
cdata.m_nBlocksPerWG = 1;
|
||||
nWGs = nBlocks;
|
||||
}
|
||||
}
|
||||
|
||||
int count=0;
|
||||
for(int ib=0; ib<sortBits; ib+=4)
|
||||
{
|
||||
#ifdef DEBUG_RADIXSORT2
|
||||
keyValuesInOut.copyToHost(test2);
|
||||
printf("numElem = %d\n",test2.size());
|
||||
for (int i=0;i<test2.size();i++)
|
||||
{
|
||||
if (test2[i].m_key != test2[i].m_value)
|
||||
{
|
||||
printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
|
||||
printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
|
||||
}
|
||||
}
|
||||
#endif //DEBUG_RADIXSORT2
|
||||
|
||||
cdata.m_startBit = ib;
|
||||
|
||||
if (src->size())
|
||||
{
|
||||
btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( srcHisto->getBufferCL() ) };
|
||||
btLauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel);
|
||||
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
|
||||
launcher.setConst( cdata );
|
||||
|
||||
int num = NUM_WGS*WG_SIZE;
|
||||
launcher.launch1D( num, WG_SIZE );
|
||||
}
|
||||
|
||||
|
||||
|
||||
#ifdef DEBUG_RADIXSORT
|
||||
btAlignedObjectArray<unsigned int> testHist;
|
||||
srcHisto->copyToHost(testHist);
|
||||
printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
|
||||
for (int i=0;i<testHist.size();i++)
|
||||
{
|
||||
if (testHist[i]!=0)
|
||||
printf("testHist[%d]=%d\n",i,testHist[i]);
|
||||
}
|
||||
#endif //DEBUG_RADIXSORT
|
||||
|
||||
|
||||
|
||||
//fast prefix scan is not working properly on Mac OSX yet
|
||||
#ifdef _WIN32
|
||||
bool fastScan=!m_deviceCPU;//only use fast scan on GPU
|
||||
#else
|
||||
bool fastScan=false;
|
||||
#endif
|
||||
|
||||
if (fastScan)
|
||||
{// prefix scan group histogram
|
||||
btBufferInfoCL bInfo[] = { btBufferInfoCL( srcHisto->getBufferCL() ) };
|
||||
btLauncherCL launcher( m_commandQueue, m_prefixScanKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
|
||||
launcher.setConst( cdata );
|
||||
launcher.launch1D( 128, 128 );
|
||||
destHisto = srcHisto;
|
||||
}else
|
||||
{
|
||||
//unsigned int sum; //for debugging
|
||||
m_scan->execute(*srcHisto,*destHisto,1920,0);//,&sum);
|
||||
}
|
||||
|
||||
|
||||
#ifdef DEBUG_RADIXSORT
|
||||
destHisto->copyToHost(testHist);
|
||||
printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
|
||||
for (int i=0;i<testHist.size();i++)
|
||||
{
|
||||
if (testHist[i]!=0)
|
||||
printf("testHist[%d]=%d\n",i,testHist[i]);
|
||||
}
|
||||
|
||||
for (int i=0;i<testHist.size();i+=NUM_WGS)
|
||||
{
|
||||
printf("testHist[%d]=%d\n",i/NUM_WGS,testHist[i]);
|
||||
}
|
||||
|
||||
#endif //DEBUG_RADIXSORT
|
||||
|
||||
#define USE_GPU
|
||||
#ifdef USE_GPU
|
||||
|
||||
if (src->size())
|
||||
{// local sort and distribute
|
||||
btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( destHisto->getBufferCL(), true ), btBufferInfoCL( dst->getBufferCL() )};
|
||||
btLauncherCL launcher( m_commandQueue, m_sortAndScatterSortDataKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
|
||||
launcher.setConst( cdata );
|
||||
launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
|
||||
|
||||
}
|
||||
#else
|
||||
{
|
||||
#define NUM_TABLES 16
|
||||
//#define SEQUENTIAL
|
||||
#ifdef SEQUENTIAL
|
||||
int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
|
||||
int tables[NUM_TABLES];
|
||||
int startBit = ib;
|
||||
|
||||
destHisto->copyToHost(testHist);
|
||||
btAlignedObjectArray<btSortData> srcHost;
|
||||
btAlignedObjectArray<btSortData> dstHost;
|
||||
dstHost.resize(src->size());
|
||||
|
||||
src->copyToHost(srcHost);
|
||||
|
||||
for (int i=0;i<NUM_TABLES;i++)
|
||||
{
|
||||
tables[i] = testHist[i*NUM_WGS];
|
||||
}
|
||||
|
||||
// distribute
|
||||
for(int i=0; i<n; i++)
|
||||
{
|
||||
int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
|
||||
|
||||
dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
|
||||
counter2[tableIdx] ++;
|
||||
}
|
||||
|
||||
|
||||
#else
|
||||
|
||||
int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
|
||||
|
||||
int tables[NUM_TABLES];
|
||||
btAlignedObjectArray<btSortData> dstHostOK;
|
||||
dstHostOK.resize(src->size());
|
||||
|
||||
destHisto->copyToHost(testHist);
|
||||
btAlignedObjectArray<btSortData> srcHost;
|
||||
src->copyToHost(srcHost);
|
||||
|
||||
int blockSize = 256;
|
||||
int nBlocksPerWG = cdata.m_nBlocksPerWG;
|
||||
int startBit = ib;
|
||||
|
||||
{
|
||||
for (int i=0;i<NUM_TABLES;i++)
|
||||
{
|
||||
tables[i] = testHist[i*NUM_WGS];
|
||||
}
|
||||
|
||||
// distribute
|
||||
for(int i=0; i<n; i++)
|
||||
{
|
||||
int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
|
||||
|
||||
dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
|
||||
counter2[tableIdx] ++;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
btAlignedObjectArray<btSortData> dstHost;
|
||||
dstHost.resize(src->size());
|
||||
|
||||
|
||||
int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
|
||||
|
||||
|
||||
|
||||
for (int wgIdx=0;wgIdx<NUM_WGS;wgIdx++)
|
||||
{
|
||||
int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
|
||||
|
||||
int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;
|
||||
|
||||
for(int iblock=0; iblock<btMin(cdata.m_nBlocksPerWG, nBlocks); iblock++)
|
||||
{
|
||||
for (int lIdx = 0;lIdx < 64;lIdx++)
|
||||
{
|
||||
int addr = iblock*blockSize + blockSize*cdata.m_nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;
|
||||
|
||||
// MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD
|
||||
// Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops
|
||||
// AMD: AtomInc performs better while NV prefers ++
|
||||
for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)
|
||||
{
|
||||
if( addr+j < n )
|
||||
{
|
||||
// printf ("addr+j=%d\n", addr+j);
|
||||
|
||||
int i = addr+j;
|
||||
|
||||
int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
|
||||
|
||||
int destIndex = testHist[tableIdx*NUM_WGS+wgIdx] + counter[tableIdx];
|
||||
|
||||
btSortData ok = dstHostOK[destIndex];
|
||||
|
||||
if (ok.m_key != srcHost[i].m_key)
|
||||
{
|
||||
printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key,srcHost[i].m_key );
|
||||
printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value,srcHost[i].m_value );
|
||||
}
|
||||
if (ok.m_value != srcHost[i].m_value)
|
||||
{
|
||||
|
||||
printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value,srcHost[i].m_value );
|
||||
printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key,srcHost[i].m_key );
|
||||
|
||||
}
|
||||
|
||||
dstHost[destIndex] = srcHost[i];
|
||||
counter[tableIdx] ++;
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#endif //SEQUENTIAL
|
||||
|
||||
dst->copyFromHost(dstHost);
|
||||
}
|
||||
#endif//USE_GPU
|
||||
|
||||
|
||||
|
||||
#ifdef DEBUG_RADIXSORT
|
||||
destHisto->copyToHost(testHist);
|
||||
printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
|
||||
for (int i=0;i<testHist.size();i++)
|
||||
{
|
||||
if (testHist[i]!=0)
|
||||
printf("testHist[%d]=%d\n",i,testHist[i]);
|
||||
}
|
||||
#endif //DEBUG_RADIXSORT
|
||||
btSwap(src, dst );
|
||||
btSwap(srcHisto,destHisto);
|
||||
|
||||
#ifdef DEBUG_RADIXSORT2
|
||||
keyValuesInOut.copyToHost(test2);
|
||||
printf("numElem = %d\n",test2.size());
|
||||
for (int i=0;i<test2.size();i++)
|
||||
{
|
||||
if (test2[i].m_key != test2[i].m_value)
|
||||
{
|
||||
printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
|
||||
printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
|
||||
}
|
||||
}
|
||||
#endif //DEBUG_RADIXSORT2
|
||||
|
||||
count++;
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
if (count&1)
|
||||
{
|
||||
btAssert(0);//need to copy from workbuffer to keyValuesInOut
|
||||
}
|
||||
|
||||
if (m_workBuffer4->size())
|
||||
{
|
||||
m_workBuffer4->resize(originalSize);
|
||||
keyValuesInOut.copyFromOpenCLArray(*m_workBuffer4);
|
||||
}
|
||||
|
||||
|
||||
#ifdef DEBUG_RADIXSORT
|
||||
keyValuesInOut.copyToHost(test2);
|
||||
|
||||
printf("numElem = %d\n",test2.size());
|
||||
for (int i=0;i<test2.size();i++)
|
||||
{
|
||||
printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
|
||||
printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
void btRadixSort32CL::execute(btOpenCLArray<unsigned int>& keysInOut, int sortBits /* = 32 */)
|
||||
{
|
||||
int originalSize = keysInOut.size();
|
||||
int workingSize = originalSize;
|
||||
|
||||
|
||||
int dataAlignment = DATA_ALIGNMENT;
|
||||
|
||||
btOpenCLArray<unsigned int>* src = 0;
|
||||
|
||||
if (workingSize%dataAlignment)
|
||||
{
|
||||
workingSize += dataAlignment-(workingSize%dataAlignment);
|
||||
m_workBuffer4a->copyFromOpenCLArray(keysInOut);
|
||||
m_workBuffer4a->resize(workingSize);
|
||||
unsigned int fillValue = 0xffffffff;
|
||||
|
||||
m_fill->execute(*m_workBuffer4a,fillValue,workingSize-originalSize,originalSize);
|
||||
|
||||
src = m_workBuffer4a;
|
||||
} else
|
||||
{
|
||||
src = &keysInOut;
|
||||
m_workBuffer4a->resize(0);
|
||||
}
|
||||
|
||||
|
||||
|
||||
btAssert( workingSize%DATA_ALIGNMENT == 0 );
|
||||
int minCap = NUM_BUCKET*NUM_WGS;
|
||||
|
||||
|
||||
int n = workingSize;
|
||||
|
||||
|
||||
m_workBuffer1->resize(minCap);
|
||||
m_workBuffer3->resize(workingSize);
|
||||
m_workBuffer3a->resize(workingSize);
|
||||
|
||||
// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
|
||||
btAssert( BITS_PER_PASS == 4 );
|
||||
btAssert( WG_SIZE == 64 );
|
||||
btAssert( (sortBits&0x3) == 0 );
|
||||
|
||||
|
||||
|
||||
btOpenCLArray<unsigned int>* dst = m_workBuffer3a;
|
||||
|
||||
btOpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
|
||||
btOpenCLArray<unsigned int>* destHisto = m_workBuffer2;
|
||||
|
||||
|
||||
int nWGs = NUM_WGS;
|
||||
btConstData cdata;
|
||||
|
||||
{
|
||||
int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256
|
||||
int nBlocks = (n+blockSize-1)/(blockSize);
|
||||
cdata.m_n = n;
|
||||
cdata.m_nWGs = NUM_WGS;
|
||||
cdata.m_startBit = 0;
|
||||
cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs;
|
||||
if( nBlocks < NUM_WGS )
|
||||
{
|
||||
cdata.m_nBlocksPerWG = 1;
|
||||
nWGs = nBlocks;
|
||||
}
|
||||
}
|
||||
|
||||
int count=0;
|
||||
for(int ib=0; ib<sortBits; ib+=4)
|
||||
{
|
||||
cdata.m_startBit = ib;
|
||||
|
||||
if (src->size())
|
||||
{
|
||||
btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( srcHisto->getBufferCL() ) };
|
||||
btLauncherCL launcher(m_commandQueue, m_streamCountKernel);
|
||||
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
|
||||
launcher.setConst( cdata );
|
||||
|
||||
int num = NUM_WGS*WG_SIZE;
|
||||
launcher.launch1D( num, WG_SIZE );
|
||||
}
|
||||
|
||||
|
||||
|
||||
//fast prefix scan is not working properly on Mac OSX yet
|
||||
#ifdef _WIN32
|
||||
bool fastScan=!m_deviceCPU;
|
||||
|
||||
#else
|
||||
bool fastScan=false;
|
||||
#endif
|
||||
|
||||
if (fastScan)
|
||||
{// prefix scan group histogram
|
||||
btBufferInfoCL bInfo[] = { btBufferInfoCL( srcHisto->getBufferCL() ) };
|
||||
btLauncherCL launcher( m_commandQueue, m_prefixScanKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
|
||||
launcher.setConst( cdata );
|
||||
launcher.launch1D( 128, 128 );
|
||||
destHisto = srcHisto;
|
||||
}else
|
||||
{
|
||||
//unsigned int sum; //for debugging
|
||||
m_scan->execute(*srcHisto,*destHisto,1920,0);//,&sum);
|
||||
}
|
||||
|
||||
if (src->size())
|
||||
{// local sort and distribute
|
||||
btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( destHisto->getBufferCL(), true ), btBufferInfoCL( dst->getBufferCL() )};
|
||||
btLauncherCL launcher( m_commandQueue, m_sortAndScatterKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
|
||||
launcher.setConst( cdata );
|
||||
launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
|
||||
|
||||
}
|
||||
|
||||
btSwap(src, dst );
|
||||
btSwap(srcHisto,destHisto);
|
||||
|
||||
count++;
|
||||
}
|
||||
|
||||
if (count&1)
|
||||
{
|
||||
btAssert(0);//need to copy from workbuffer to keyValuesInOut
|
||||
}
|
||||
|
||||
if (m_workBuffer4a->size())
|
||||
{
|
||||
m_workBuffer4a->resize(originalSize);
|
||||
keysInOut.copyFromOpenCLArray(*m_workBuffer4a);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
85
opencl/parallel_primitives/host/btRadixSort32CL.h
Normal file
85
opencl/parallel_primitives/host/btRadixSort32CL.h
Normal file
@@ -0,0 +1,85 @@
|
||||
|
||||
#ifndef BT_RADIXSORT32_H
|
||||
#define BT_RADIXSORT32_H
|
||||
|
||||
#include "btOpenCLArray.h"
|
||||
|
||||
struct btSortData
|
||||
{
|
||||
int m_key;
|
||||
int m_value;
|
||||
};
|
||||
#include "btBufferInfoCL.h"
|
||||
|
||||
class btRadixSort32CL
|
||||
{
|
||||
|
||||
btOpenCLArray<unsigned int>* m_workBuffer1;
|
||||
btOpenCLArray<unsigned int>* m_workBuffer2;
|
||||
|
||||
btOpenCLArray<btSortData>* m_workBuffer3;
|
||||
btOpenCLArray<btSortData>* m_workBuffer4;
|
||||
|
||||
btOpenCLArray<unsigned int>* m_workBuffer3a;
|
||||
btOpenCLArray<unsigned int>* m_workBuffer4a;
|
||||
|
||||
cl_command_queue m_commandQueue;
|
||||
|
||||
cl_kernel m_streamCountSortDataKernel;
|
||||
cl_kernel m_streamCountKernel;
|
||||
|
||||
cl_kernel m_prefixScanKernel;
|
||||
cl_kernel m_sortAndScatterSortDataKernel;
|
||||
cl_kernel m_sortAndScatterKernel;
|
||||
|
||||
|
||||
bool m_deviceCPU;
|
||||
|
||||
class btPrefixScanCL* m_scan;
|
||||
class btFillCL* m_fill;
|
||||
|
||||
public:
|
||||
struct btConstData
|
||||
{
|
||||
int m_n;
|
||||
int m_nWGs;
|
||||
int m_startBit;
|
||||
int m_nBlocksPerWG;
|
||||
};
|
||||
enum
|
||||
{
|
||||
DATA_ALIGNMENT = 256,
|
||||
WG_SIZE = 64,
|
||||
BLOCK_SIZE = 256,
|
||||
ELEMENTS_PER_WORK_ITEM = (BLOCK_SIZE/WG_SIZE),
|
||||
BITS_PER_PASS = 4,
|
||||
NUM_BUCKET=(1<<BITS_PER_PASS),
|
||||
// if you change this, change nPerWI in kernel as well
|
||||
NUM_WGS = 20*6, // cypress
|
||||
// NUM_WGS = 24*6, // cayman
|
||||
// NUM_WGS = 32*4, // nv
|
||||
};
|
||||
|
||||
|
||||
private:
|
||||
|
||||
|
||||
public:
|
||||
|
||||
btRadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity =0);
|
||||
|
||||
virtual ~btRadixSort32CL();
|
||||
|
||||
void execute(btOpenCLArray<unsigned int>& keysIn, btOpenCLArray<unsigned int>& keysOut, btOpenCLArray<unsigned int>& valuesIn,
|
||||
btOpenCLArray<unsigned int>& valuesOut, int n, int sortBits = 32);
|
||||
|
||||
///keys only
|
||||
void execute(btOpenCLArray<unsigned int>& keysInOut, int sortBits = 32 );
|
||||
|
||||
void execute(btOpenCLArray<btSortData>& keyValuesInOut, int sortBits = 32 );
|
||||
void executeHost(btOpenCLArray<btSortData>& keyValuesInOut, int sortBits = 32);
|
||||
void executeHost(btAlignedObjectArray<btSortData>& keyValuesInOut, int sortBits = 32);
|
||||
|
||||
};
|
||||
#endif //BT_RADIXSORT32_H
|
||||
|
||||
660
opencl/parallel_primitives/host/btScalar.h
Normal file
660
opencl/parallel_primitives/host/btScalar.h
Normal file
@@ -0,0 +1,660 @@
|
||||
/*
|
||||
Copyright (c) 2003-2009 Erwin Coumans http://bullet.googlecode.com
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
|
||||
|
||||
#ifndef BT_SCALAR_H
|
||||
#define BT_SCALAR_H
|
||||
|
||||
#ifdef BT_MANAGED_CODE
|
||||
//Aligned data types not supported in managed code
|
||||
#pragma unmanaged
|
||||
#endif
|
||||
|
||||
|
||||
#include <math.h>
|
||||
#include <stdlib.h>//size_t for MSVC 6.0
|
||||
#include <float.h>
|
||||
|
||||
/* SVN $Revision$ on $Date$ from http://bullet.googlecode.com*/
|
||||
#define BT_BULLET_VERSION 281
|
||||
|
||||
inline int btGetVersion()
|
||||
{
|
||||
return BT_BULLET_VERSION;
|
||||
}
|
||||
|
||||
#if defined(DEBUG) || defined (_DEBUG)
|
||||
#define BT_DEBUG
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef _WIN32
|
||||
|
||||
#if defined(__MINGW32__) || defined(__CYGWIN__) || (defined (_MSC_VER) && _MSC_VER < 1300)
|
||||
|
||||
#define SIMD_FORCE_INLINE inline
|
||||
#define ATTRIBUTE_ALIGNED16(a) a
|
||||
#define ATTRIBUTE_ALIGNED64(a) a
|
||||
#define ATTRIBUTE_ALIGNED128(a) a
|
||||
#else
|
||||
//#define BT_HAS_ALIGNED_ALLOCATOR
|
||||
#pragma warning(disable : 4324) // disable padding warning
|
||||
// #pragma warning(disable:4530) // Disable the exception disable but used in MSCV Stl warning.
|
||||
// #pragma warning(disable:4996) //Turn off warnings about deprecated C routines
|
||||
// #pragma warning(disable:4786) // Disable the "debug name too long" warning
|
||||
|
||||
#define SIMD_FORCE_INLINE __forceinline
|
||||
#define ATTRIBUTE_ALIGNED16(a) __declspec(align(16)) a
|
||||
#define ATTRIBUTE_ALIGNED64(a) __declspec(align(64)) a
|
||||
#define ATTRIBUTE_ALIGNED128(a) __declspec (align(128)) a
|
||||
#ifdef _XBOX
|
||||
#define BT_USE_VMX128
|
||||
|
||||
#include <ppcintrinsics.h>
|
||||
#define BT_HAVE_NATIVE_FSEL
|
||||
#define btFsel(a,b,c) __fsel((a),(b),(c))
|
||||
#else
|
||||
|
||||
#if (defined (_WIN32) && (_MSC_VER) && _MSC_VER >= 1400) && (!defined (BT_USE_DOUBLE_PRECISION))
|
||||
#define BT_USE_SSE
|
||||
#ifdef BT_USE_SSE
|
||||
//BT_USE_SSE_IN_API is disabled under Windows by default, because
|
||||
//it makes it harder to integrate Bullet into your application under Windows
|
||||
//(structured embedding Bullet structs/classes need to be 16-byte aligned)
|
||||
//with relatively little performance gain
|
||||
//If you are not embedded Bullet data in your classes, or make sure that you align those classes on 16-byte boundaries
|
||||
//you can manually enable this line or set it in the build system for a bit of performance gain (a few percent, dependent on usage)
|
||||
//#define BT_USE_SSE_IN_API
|
||||
#endif //BT_USE_SSE
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
|
||||
#endif//_XBOX
|
||||
|
||||
#endif //__MINGW32__
|
||||
|
||||
#ifdef BT_DEBUG
|
||||
#ifdef _MSC_VER
|
||||
#include <stdio.h>
|
||||
#define btAssert(x) { if(!(x)){printf("Assert "__FILE__ ":%u ("#x")\n", __LINE__);__debugbreak(); }}
|
||||
#else//_MSC_VER
|
||||
#include <assert.h>
|
||||
#define btAssert assert
|
||||
#endif//_MSC_VER
|
||||
#else
|
||||
#define btAssert(x)
|
||||
#endif
|
||||
//btFullAssert is optional, slows down a lot
|
||||
#define btFullAssert(x)
|
||||
|
||||
#define btLikely(_c) _c
|
||||
#define btUnlikely(_c) _c
|
||||
|
||||
#else
|
||||
|
||||
#if defined (__CELLOS_LV2__)
|
||||
#define SIMD_FORCE_INLINE inline __attribute__((always_inline))
|
||||
#define ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16)))
|
||||
#define ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64)))
|
||||
#define ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128)))
|
||||
#ifndef assert
|
||||
#include <assert.h>
|
||||
#endif
|
||||
#ifdef BT_DEBUG
|
||||
#ifdef __SPU__
|
||||
#include <spu_printf.h>
|
||||
#define printf spu_printf
|
||||
#define btAssert(x) {if(!(x)){printf("Assert "__FILE__ ":%u ("#x")\n", __LINE__);spu_hcmpeq(0,0);}}
|
||||
#else
|
||||
#define btAssert assert
|
||||
#endif
|
||||
|
||||
#else
|
||||
#define btAssert(x)
|
||||
#endif
|
||||
//btFullAssert is optional, slows down a lot
|
||||
#define btFullAssert(x)
|
||||
|
||||
#define btLikely(_c) _c
|
||||
#define btUnlikely(_c) _c
|
||||
|
||||
#else
|
||||
|
||||
#ifdef USE_LIBSPE2
|
||||
|
||||
#define SIMD_FORCE_INLINE __inline
|
||||
#define ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16)))
|
||||
#define ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64)))
|
||||
#define ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128)))
|
||||
#ifndef assert
|
||||
#include <assert.h>
|
||||
#endif
|
||||
#ifdef BT_DEBUG
|
||||
#define btAssert assert
|
||||
#else
|
||||
#define btAssert(x)
|
||||
#endif
|
||||
//btFullAssert is optional, slows down a lot
|
||||
#define btFullAssert(x)
|
||||
|
||||
|
||||
#define btLikely(_c) __builtin_expect((_c), 1)
|
||||
#define btUnlikely(_c) __builtin_expect((_c), 0)
|
||||
|
||||
|
||||
#else
|
||||
//non-windows systems
|
||||
|
||||
#if (defined (__APPLE__) && (!defined (BT_USE_DOUBLE_PRECISION)))
|
||||
#if defined (__i386__) || defined (__x86_64__)
|
||||
#define BT_USE_SSE
|
||||
//BT_USE_SSE_IN_API is enabled on Mac OSX by default, because memory is automatically aligned on 16-byte boundaries
|
||||
//if apps run into issues, we will disable the next line
|
||||
#define BT_USE_SSE_IN_API
|
||||
#ifdef BT_USE_SSE
|
||||
// include appropriate SSE level
|
||||
#if defined (__SSE4_1__)
|
||||
#include <smmintrin.h>
|
||||
#elif defined (__SSSE3__)
|
||||
#include <tmmintrin.h>
|
||||
#elif defined (__SSE3__)
|
||||
#include <pmmintrin.h>
|
||||
#else
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
#endif //BT_USE_SSE
|
||||
#elif defined( __armv7__ )
|
||||
#ifdef __clang__
|
||||
#define BT_USE_NEON 1
|
||||
|
||||
#if defined BT_USE_NEON && defined (__clang__)
|
||||
#include <arm_neon.h>
|
||||
#endif//BT_USE_NEON
|
||||
#endif //__clang__
|
||||
#endif//__arm__
|
||||
|
||||
#define SIMD_FORCE_INLINE inline __attribute__ ((always_inline))
|
||||
///@todo: check out alignment methods for other platforms/compilers
|
||||
#define ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16)))
|
||||
#define ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64)))
|
||||
#define ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128)))
|
||||
#ifndef assert
|
||||
#include <assert.h>
|
||||
#endif
|
||||
|
||||
#if defined(DEBUG) || defined (_DEBUG)
|
||||
#if defined (__i386__) || defined (__x86_64__)
|
||||
#include <stdio.h>
|
||||
#define btAssert(x)\
|
||||
{\
|
||||
if(!(x))\
|
||||
{\
|
||||
printf("Assert %s in line %d, file %s\n",#x, __LINE__, __FILE__);\
|
||||
asm volatile ("int3");\
|
||||
}\
|
||||
}
|
||||
#else//defined (__i386__) || defined (__x86_64__)
|
||||
#define btAssert assert
|
||||
#endif//defined (__i386__) || defined (__x86_64__)
|
||||
#else//defined(DEBUG) || defined (_DEBUG)
|
||||
#define btAssert(x)
|
||||
#endif//defined(DEBUG) || defined (_DEBUG)
|
||||
|
||||
//btFullAssert is optional, slows down a lot
|
||||
#define btFullAssert(x)
|
||||
#define btLikely(_c) _c
|
||||
#define btUnlikely(_c) _c
|
||||
|
||||
#else
|
||||
|
||||
#define SIMD_FORCE_INLINE inline
|
||||
///@todo: check out alignment methods for other platforms/compilers
|
||||
///#define ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16)))
|
||||
///#define ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64)))
|
||||
///#define ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128)))
|
||||
#define ATTRIBUTE_ALIGNED16(a) a
|
||||
#define ATTRIBUTE_ALIGNED64(a) a
|
||||
#define ATTRIBUTE_ALIGNED128(a) a
|
||||
#ifndef assert
|
||||
#include <assert.h>
|
||||
#endif
|
||||
|
||||
#if defined(DEBUG) || defined (_DEBUG)
|
||||
#define btAssert assert
|
||||
#else
|
||||
#define btAssert(x)
|
||||
#endif
|
||||
|
||||
//btFullAssert is optional, slows down a lot
|
||||
#define btFullAssert(x)
|
||||
#define btLikely(_c) _c
|
||||
#define btUnlikely(_c) _c
|
||||
#endif //__APPLE__
|
||||
|
||||
#endif // LIBSPE2
|
||||
|
||||
#endif //__CELLOS_LV2__
|
||||
#endif
|
||||
|
||||
|
||||
///The btScalar type abstracts floating point numbers, to easily switch between double and single floating point precision.
|
||||
#if defined(BT_USE_DOUBLE_PRECISION)
|
||||
typedef double btScalar;
|
||||
//this number could be bigger in double precision
|
||||
#define BT_LARGE_FLOAT 1e30
|
||||
#else
|
||||
typedef float btScalar;
|
||||
//keep BT_LARGE_FLOAT*BT_LARGE_FLOAT < FLT_MAX
|
||||
#define BT_LARGE_FLOAT 1e18f
|
||||
#endif
|
||||
|
||||
#ifdef BT_USE_SSE
|
||||
typedef __m128 btSimdFloat4;
|
||||
#endif//BT_USE_SSE
|
||||
|
||||
#if defined BT_USE_SSE_IN_API && defined (BT_USE_SSE)
|
||||
#ifdef _WIN32
|
||||
|
||||
#ifndef BT_NAN
|
||||
static int btNanMask = 0x7F800001;
|
||||
#define BT_NAN (*(float*)&btNanMask)
|
||||
#endif
|
||||
|
||||
#ifndef BT_INFINITY
|
||||
static int btInfinityMask = 0x7F800000;
|
||||
#define BT_INFINITY (*(float*)&btInfinityMask)
|
||||
#endif
|
||||
|
||||
inline __m128 operator + (const __m128 A, const __m128 B)
|
||||
{
|
||||
return _mm_add_ps(A, B);
|
||||
}
|
||||
|
||||
inline __m128 operator - (const __m128 A, const __m128 B)
|
||||
{
|
||||
return _mm_sub_ps(A, B);
|
||||
}
|
||||
|
||||
inline __m128 operator * (const __m128 A, const __m128 B)
|
||||
{
|
||||
return _mm_mul_ps(A, B);
|
||||
}
|
||||
|
||||
#define btCastfTo128i(a) (_mm_castps_si128(a))
|
||||
#define btCastfTo128d(a) (_mm_castps_pd(a))
|
||||
#define btCastiTo128f(a) (_mm_castsi128_ps(a))
|
||||
#define btCastdTo128f(a) (_mm_castpd_ps(a))
|
||||
#define btCastdTo128i(a) (_mm_castpd_si128(a))
|
||||
#define btAssign128(r0,r1,r2,r3) _mm_setr_ps(r0,r1,r2,r3)
|
||||
|
||||
#else//_WIN32
|
||||
|
||||
#define btCastfTo128i(a) ((__m128i)(a))
|
||||
#define btCastfTo128d(a) ((__m128d)(a))
|
||||
#define btCastiTo128f(a) ((__m128) (a))
|
||||
#define btCastdTo128f(a) ((__m128) (a))
|
||||
#define btCastdTo128i(a) ((__m128i)(a))
|
||||
#define btAssign128(r0,r1,r2,r3) (__m128){r0,r1,r2,r3}
|
||||
#define BT_INFINITY INFINITY
|
||||
#define BT_NAN NAN
|
||||
#endif//_WIN32
|
||||
#endif //BT_USE_SSE_IN_API
|
||||
|
||||
#ifdef BT_USE_NEON
|
||||
#include <arm_neon.h>
|
||||
|
||||
typedef float32x4_t btSimdFloat4;
|
||||
#define BT_INFINITY INFINITY
|
||||
#define BT_NAN NAN
|
||||
#define btAssign128(r0,r1,r2,r3) (float32x4_t){r0,r1,r2,r3}
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#define BT_DECLARE_ALIGNED_ALLOCATOR() \
|
||||
SIMD_FORCE_INLINE void* operator new(size_t sizeInBytes) { return btAlignedAlloc(sizeInBytes,16); } \
|
||||
SIMD_FORCE_INLINE void operator delete(void* ptr) { btAlignedFree(ptr); } \
|
||||
SIMD_FORCE_INLINE void* operator new(size_t, void* ptr) { return ptr; } \
|
||||
SIMD_FORCE_INLINE void operator delete(void*, void*) { } \
|
||||
SIMD_FORCE_INLINE void* operator new[](size_t sizeInBytes) { return btAlignedAlloc(sizeInBytes,16); } \
|
||||
SIMD_FORCE_INLINE void operator delete[](void* ptr) { btAlignedFree(ptr); } \
|
||||
SIMD_FORCE_INLINE void* operator new[](size_t, void* ptr) { return ptr; } \
|
||||
SIMD_FORCE_INLINE void operator delete[](void*, void*) { } \
|
||||
|
||||
|
||||
|
||||
#if defined(BT_USE_DOUBLE_PRECISION) || defined(BT_FORCE_DOUBLE_FUNCTIONS)
|
||||
|
||||
SIMD_FORCE_INLINE btScalar btSqrt(btScalar x) { return sqrt(x); }
|
||||
SIMD_FORCE_INLINE btScalar btFabs(btScalar x) { return fabs(x); }
|
||||
SIMD_FORCE_INLINE btScalar btCos(btScalar x) { return cos(x); }
|
||||
SIMD_FORCE_INLINE btScalar btSin(btScalar x) { return sin(x); }
|
||||
SIMD_FORCE_INLINE btScalar btTan(btScalar x) { return tan(x); }
|
||||
SIMD_FORCE_INLINE btScalar btAcos(btScalar x) { if (x<btScalar(-1)) x=btScalar(-1); if (x>btScalar(1)) x=btScalar(1); return acos(x); }
|
||||
SIMD_FORCE_INLINE btScalar btAsin(btScalar x) { if (x<btScalar(-1)) x=btScalar(-1); if (x>btScalar(1)) x=btScalar(1); return asin(x); }
|
||||
SIMD_FORCE_INLINE btScalar btAtan(btScalar x) { return atan(x); }
|
||||
SIMD_FORCE_INLINE btScalar btAtan2(btScalar x, btScalar y) { return atan2(x, y); }
|
||||
SIMD_FORCE_INLINE btScalar btExp(btScalar x) { return exp(x); }
|
||||
SIMD_FORCE_INLINE btScalar btLog(btScalar x) { return log(x); }
|
||||
SIMD_FORCE_INLINE btScalar btPow(btScalar x,btScalar y) { return pow(x,y); }
|
||||
SIMD_FORCE_INLINE btScalar btFmod(btScalar x,btScalar y) { return fmod(x,y); }
|
||||
|
||||
#else
|
||||
|
||||
SIMD_FORCE_INLINE btScalar btSqrt(btScalar y)
|
||||
{
|
||||
#ifdef USE_APPROXIMATION
|
||||
double x, z, tempf;
|
||||
unsigned long *tfptr = ((unsigned long *)&tempf) + 1;
|
||||
|
||||
tempf = y;
|
||||
*tfptr = (0xbfcdd90a - *tfptr)>>1; /* estimate of 1/sqrt(y) */
|
||||
x = tempf;
|
||||
z = y*btScalar(0.5);
|
||||
x = (btScalar(1.5)*x)-(x*x)*(x*z); /* iteration formula */
|
||||
x = (btScalar(1.5)*x)-(x*x)*(x*z);
|
||||
x = (btScalar(1.5)*x)-(x*x)*(x*z);
|
||||
x = (btScalar(1.5)*x)-(x*x)*(x*z);
|
||||
x = (btScalar(1.5)*x)-(x*x)*(x*z);
|
||||
return x*y;
|
||||
#else
|
||||
return sqrtf(y);
|
||||
#endif
|
||||
}
|
||||
SIMD_FORCE_INLINE btScalar btFabs(btScalar x) { return fabsf(x); }
|
||||
SIMD_FORCE_INLINE btScalar btCos(btScalar x) { return cosf(x); }
|
||||
SIMD_FORCE_INLINE btScalar btSin(btScalar x) { return sinf(x); }
|
||||
SIMD_FORCE_INLINE btScalar btTan(btScalar x) { return tanf(x); }
|
||||
SIMD_FORCE_INLINE btScalar btAcos(btScalar x) {
|
||||
if (x<btScalar(-1))
|
||||
x=btScalar(-1);
|
||||
if (x>btScalar(1))
|
||||
x=btScalar(1);
|
||||
return acosf(x);
|
||||
}
|
||||
SIMD_FORCE_INLINE btScalar btAsin(btScalar x) {
|
||||
if (x<btScalar(-1))
|
||||
x=btScalar(-1);
|
||||
if (x>btScalar(1))
|
||||
x=btScalar(1);
|
||||
return asinf(x);
|
||||
}
|
||||
SIMD_FORCE_INLINE btScalar btAtan(btScalar x) { return atanf(x); }
|
||||
SIMD_FORCE_INLINE btScalar btAtan2(btScalar x, btScalar y) { return atan2f(x, y); }
|
||||
SIMD_FORCE_INLINE btScalar btExp(btScalar x) { return expf(x); }
|
||||
SIMD_FORCE_INLINE btScalar btLog(btScalar x) { return logf(x); }
|
||||
SIMD_FORCE_INLINE btScalar btPow(btScalar x,btScalar y) { return powf(x,y); }
|
||||
SIMD_FORCE_INLINE btScalar btFmod(btScalar x,btScalar y) { return fmodf(x,y); }
|
||||
|
||||
#endif
|
||||
|
||||
#define SIMD_2_PI btScalar(6.283185307179586232)
|
||||
#define SIMD_PI (SIMD_2_PI * btScalar(0.5))
|
||||
#define SIMD_HALF_PI (SIMD_2_PI * btScalar(0.25))
|
||||
#define SIMD_RADS_PER_DEG (SIMD_2_PI / btScalar(360.0))
|
||||
#define SIMD_DEGS_PER_RAD (btScalar(360.0) / SIMD_2_PI)
|
||||
#define SIMDSQRT12 btScalar(0.7071067811865475244008443621048490)
|
||||
|
||||
#define btRecipSqrt(x) ((btScalar)(btScalar(1.0)/btSqrt(btScalar(x)))) /* reciprocal square root */
|
||||
|
||||
|
||||
#ifdef BT_USE_DOUBLE_PRECISION
|
||||
#define SIMD_EPSILON DBL_EPSILON
|
||||
#define SIMD_INFINITY DBL_MAX
|
||||
#else
|
||||
#define SIMD_EPSILON FLT_EPSILON
|
||||
#define SIMD_INFINITY FLT_MAX
|
||||
#endif
|
||||
|
||||
SIMD_FORCE_INLINE btScalar btAtan2Fast(btScalar y, btScalar x)
|
||||
{
|
||||
btScalar coeff_1 = SIMD_PI / 4.0f;
|
||||
btScalar coeff_2 = 3.0f * coeff_1;
|
||||
btScalar abs_y = btFabs(y);
|
||||
btScalar angle;
|
||||
if (x >= 0.0f) {
|
||||
btScalar r = (x - abs_y) / (x + abs_y);
|
||||
angle = coeff_1 - coeff_1 * r;
|
||||
} else {
|
||||
btScalar r = (x + abs_y) / (abs_y - x);
|
||||
angle = coeff_2 - coeff_1 * r;
|
||||
}
|
||||
return (y < 0.0f) ? -angle : angle;
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE bool btFuzzyZero(btScalar x) { return btFabs(x) < SIMD_EPSILON; }
|
||||
|
||||
SIMD_FORCE_INLINE bool btEqual(btScalar a, btScalar eps) {
|
||||
return (((a) <= eps) && !((a) < -eps));
|
||||
}
|
||||
SIMD_FORCE_INLINE bool btGreaterEqual (btScalar a, btScalar eps) {
|
||||
return (!((a) <= eps));
|
||||
}
|
||||
|
||||
|
||||
SIMD_FORCE_INLINE int btIsNegative(btScalar x) {
|
||||
return x < btScalar(0.0) ? 1 : 0;
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE btScalar btRadians(btScalar x) { return x * SIMD_RADS_PER_DEG; }
|
||||
SIMD_FORCE_INLINE btScalar btDegrees(btScalar x) { return x * SIMD_DEGS_PER_RAD; }
|
||||
|
||||
#define BT_DECLARE_HANDLE(name) typedef struct name##__ { int unused; } *name
|
||||
|
||||
#ifndef btFsel
|
||||
SIMD_FORCE_INLINE btScalar btFsel(btScalar a, btScalar b, btScalar c)
|
||||
{
|
||||
return a >= 0 ? b : c;
|
||||
}
|
||||
#endif
|
||||
#define btFsels(a,b,c) (btScalar)btFsel(a,b,c)
|
||||
|
||||
|
||||
SIMD_FORCE_INLINE bool btMachineIsLittleEndian()
|
||||
{
|
||||
long int i = 1;
|
||||
const char *p = (const char *) &i;
|
||||
if (p[0] == 1) // Lowest address contains the least significant byte
|
||||
return true;
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
|
||||
///btSelect avoids branches, which makes performance much better for consoles like Playstation 3 and XBox 360
|
||||
///Thanks Phil Knight. See also http://www.cellperformance.com/articles/2006/04/more_techniques_for_eliminatin_1.html
|
||||
SIMD_FORCE_INLINE unsigned btSelect(unsigned condition, unsigned valueIfConditionNonZero, unsigned valueIfConditionZero)
|
||||
{
|
||||
// Set testNz to 0xFFFFFFFF if condition is nonzero, 0x00000000 if condition is zero
|
||||
// Rely on positive value or'ed with its negative having sign bit on
|
||||
// and zero value or'ed with its negative (which is still zero) having sign bit off
|
||||
// Use arithmetic shift right, shifting the sign bit through all 32 bits
|
||||
unsigned testNz = (unsigned)(((int)condition | -(int)condition) >> 31);
|
||||
unsigned testEqz = ~testNz;
|
||||
return ((valueIfConditionNonZero & testNz) | (valueIfConditionZero & testEqz));
|
||||
}
|
||||
SIMD_FORCE_INLINE int btSelect(unsigned condition, int valueIfConditionNonZero, int valueIfConditionZero)
|
||||
{
|
||||
unsigned testNz = (unsigned)(((int)condition | -(int)condition) >> 31);
|
||||
unsigned testEqz = ~testNz;
|
||||
return static_cast<int>((valueIfConditionNonZero & testNz) | (valueIfConditionZero & testEqz));
|
||||
}
|
||||
SIMD_FORCE_INLINE float btSelect(unsigned condition, float valueIfConditionNonZero, float valueIfConditionZero)
|
||||
{
|
||||
#ifdef BT_HAVE_NATIVE_FSEL
|
||||
return (float)btFsel((btScalar)condition - btScalar(1.0f), valueIfConditionNonZero, valueIfConditionZero);
|
||||
#else
|
||||
return (condition != 0) ? valueIfConditionNonZero : valueIfConditionZero;
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename T> SIMD_FORCE_INLINE void btSwap(T& a, T& b)
|
||||
{
|
||||
T tmp = a;
|
||||
a = b;
|
||||
b = tmp;
|
||||
}
|
||||
|
||||
|
||||
//PCK: endian swapping functions
|
||||
SIMD_FORCE_INLINE unsigned btSwapEndian(unsigned val)
|
||||
{
|
||||
return (((val & 0xff000000) >> 24) | ((val & 0x00ff0000) >> 8) | ((val & 0x0000ff00) << 8) | ((val & 0x000000ff) << 24));
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE unsigned short btSwapEndian(unsigned short val)
|
||||
{
|
||||
return static_cast<unsigned short>(((val & 0xff00) >> 8) | ((val & 0x00ff) << 8));
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE unsigned btSwapEndian(int val)
|
||||
{
|
||||
return btSwapEndian((unsigned)val);
|
||||
}
|
||||
|
||||
SIMD_FORCE_INLINE unsigned short btSwapEndian(short val)
|
||||
{
|
||||
return btSwapEndian((unsigned short) val);
|
||||
}
|
||||
|
||||
///btSwapFloat uses using char pointers to swap the endianness
|
||||
////btSwapFloat/btSwapDouble will NOT return a float, because the machine might 'correct' invalid floating point values
|
||||
///Not all values of sign/exponent/mantissa are valid floating point numbers according to IEEE 754.
|
||||
///When a floating point unit is faced with an invalid value, it may actually change the value, or worse, throw an exception.
|
||||
///In most systems, running user mode code, you wouldn't get an exception, but instead the hardware/os/runtime will 'fix' the number for you.
|
||||
///so instead of returning a float/double, we return integer/long long integer
|
||||
SIMD_FORCE_INLINE unsigned int btSwapEndianFloat(float d)
|
||||
{
|
||||
unsigned int a = 0;
|
||||
unsigned char *dst = (unsigned char *)&a;
|
||||
unsigned char *src = (unsigned char *)&d;
|
||||
|
||||
dst[0] = src[3];
|
||||
dst[1] = src[2];
|
||||
dst[2] = src[1];
|
||||
dst[3] = src[0];
|
||||
return a;
|
||||
}
|
||||
|
||||
// unswap using char pointers
|
||||
SIMD_FORCE_INLINE float btUnswapEndianFloat(unsigned int a)
|
||||
{
|
||||
float d = 0.0f;
|
||||
unsigned char *src = (unsigned char *)&a;
|
||||
unsigned char *dst = (unsigned char *)&d;
|
||||
|
||||
dst[0] = src[3];
|
||||
dst[1] = src[2];
|
||||
dst[2] = src[1];
|
||||
dst[3] = src[0];
|
||||
|
||||
return d;
|
||||
}
|
||||
|
||||
|
||||
// swap using char pointers
|
||||
SIMD_FORCE_INLINE void btSwapEndianDouble(double d, unsigned char* dst)
|
||||
{
|
||||
unsigned char *src = (unsigned char *)&d;
|
||||
|
||||
dst[0] = src[7];
|
||||
dst[1] = src[6];
|
||||
dst[2] = src[5];
|
||||
dst[3] = src[4];
|
||||
dst[4] = src[3];
|
||||
dst[5] = src[2];
|
||||
dst[6] = src[1];
|
||||
dst[7] = src[0];
|
||||
|
||||
}
|
||||
|
||||
// unswap using char pointers
|
||||
SIMD_FORCE_INLINE double btUnswapEndianDouble(const unsigned char *src)
|
||||
{
|
||||
double d = 0.0;
|
||||
unsigned char *dst = (unsigned char *)&d;
|
||||
|
||||
dst[0] = src[7];
|
||||
dst[1] = src[6];
|
||||
dst[2] = src[5];
|
||||
dst[3] = src[4];
|
||||
dst[4] = src[3];
|
||||
dst[5] = src[2];
|
||||
dst[6] = src[1];
|
||||
dst[7] = src[0];
|
||||
|
||||
return d;
|
||||
}
|
||||
|
||||
// returns normalized value in range [-SIMD_PI, SIMD_PI]
|
||||
SIMD_FORCE_INLINE btScalar btNormalizeAngle(btScalar angleInRadians)
|
||||
{
|
||||
angleInRadians = btFmod(angleInRadians, SIMD_2_PI);
|
||||
if(angleInRadians < -SIMD_PI)
|
||||
{
|
||||
return angleInRadians + SIMD_2_PI;
|
||||
}
|
||||
else if(angleInRadians > SIMD_PI)
|
||||
{
|
||||
return angleInRadians - SIMD_2_PI;
|
||||
}
|
||||
else
|
||||
{
|
||||
return angleInRadians;
|
||||
}
|
||||
}
|
||||
|
||||
///rudimentary class to provide type info
|
||||
struct btTypedObject
|
||||
{
|
||||
btTypedObject(int objectType)
|
||||
:m_objectType(objectType)
|
||||
{
|
||||
}
|
||||
int m_objectType;
|
||||
inline int getObjectType() const
|
||||
{
|
||||
return m_objectType;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
///align a pointer to the provided alignment, upwards
|
||||
template <typename T>T* btAlignPointer(T* unalignedPtr, size_t alignment)
|
||||
{
|
||||
|
||||
struct btConvertPointerSizeT
|
||||
{
|
||||
union
|
||||
{
|
||||
T* ptr;
|
||||
size_t integer;
|
||||
};
|
||||
};
|
||||
btConvertPointerSizeT converter;
|
||||
|
||||
|
||||
const size_t bit_mask = ~(alignment - 1);
|
||||
converter.ptr = unalignedPtr;
|
||||
converter.integer += alignment-1;
|
||||
converter.integer &= bit_mask;
|
||||
return converter.ptr;
|
||||
}
|
||||
|
||||
#endif //BT_SCALAR_H
|
||||
26
opencl/parallel_primitives/host/premake4.lua
Normal file
26
opencl/parallel_primitives/host/premake4.lua
Normal file
@@ -0,0 +1,26 @@
|
||||
function createProject(vendor)
|
||||
hasCL = findOpenCL(vendor)
|
||||
|
||||
if (hasCL) then
|
||||
|
||||
project ("OpenCL_lib_parallel_primitives_host_" .. vendor)
|
||||
|
||||
initOpenCL(vendor)
|
||||
|
||||
kind "StaticLib"
|
||||
targetdir "../../../lib"
|
||||
includedirs {
|
||||
".",
|
||||
}
|
||||
files {
|
||||
"**.cpp",
|
||||
"**.h"
|
||||
}
|
||||
|
||||
end
|
||||
end
|
||||
|
||||
createProject("AMD")
|
||||
createProject("Intel")
|
||||
createProject("NVIDIA")
|
||||
createProject("Apple")
|
||||
Reference in New Issue
Block a user