import opencl_course source for a start

erwin coumans
2013-03-11 22:03:27 +01:00
commit 08272c7de5
64 changed files with 12336 additions and 0 deletions


@@ -0,0 +1,92 @@
#ifndef COMMAND_LINE_ARGS_H
#define COMMAND_LINE_ARGS_H
/******************************************************************************
* Command-line parsing
******************************************************************************/
#include <map>
#include <algorithm>
#include <string>
#include <cstring>
#include <sstream>
class CommandLineArgs
{
protected:
std::map<std::string, std::string> pairs;
public:
// Constructor
CommandLineArgs(int argc, char **argv)
{
using namespace std;
for (int i = 1; i < argc; i++)
{
string arg = argv[i];
if ((arg[0] != '-') || (arg[1] != '-')) {
continue;
}
string::size_type pos;
string key, val;
if ((pos = arg.find( '=')) == string::npos) {
key = string(arg, 2, arg.length() - 2);
val = "";
} else {
key = string(arg, 2, pos - 2);
val = string(arg, pos + 1, arg.length() - 1);
}
pairs[key] = val;
}
}
bool CheckCmdLineFlag(const char* arg_name)
{
using namespace std;
map<string, string>::iterator itr;
if ((itr = pairs.find(arg_name)) != pairs.end()) {
return true;
}
return false;
}
template <typename T>
void GetCmdLineArgument(const char *arg_name, T &val);
int ParsedArgc()
{
return pairs.size();
}
};
template <typename T>
void CommandLineArgs::GetCmdLineArgument(const char *arg_name, T &val)
{
using namespace std;
map<string, string>::iterator itr;
if ((itr = pairs.find(arg_name)) != pairs.end()) {
istringstream strstream(itr->second);
strstream >> val;
}
}
template <>
inline void CommandLineArgs::GetCmdLineArgument<char*>(const char* arg_name, char* &val)
{
using namespace std;
map<string, string>::iterator itr;
if ((itr = pairs.find(arg_name)) != pairs.end()) {
string s = itr->second;
val = (char*) malloc(sizeof(char) * (s.length() + 1));
std::strcpy(val, s.c_str());
} else {
val = NULL;
}
}
#endif //COMMAND_LINE_ARGS_H
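A minimal usage sketch for the parser above; the flag names, the include file name and the demo program are invented for illustration. CheckCmdLineFlag tests for the presence of a --flag, and GetCmdLineArgument parses --key=value into a typed variable, leaving it untouched when the flag is absent.

// Hypothetical demo invocation: ./demo --deviceId=1 --benchmark
#include "CommandLineArgs.h" // assumed file name matching the include guard above
#include <cstdio>

int main(int argc, char** argv)
{
    CommandLineArgs args(argc, argv);

    int deviceId = 0;
    args.GetCmdLineArgument("deviceId", deviceId); // stays 0 if --deviceId=... is missing
    bool benchmark = args.CheckCmdLineFlag("benchmark");

    printf("deviceId=%d benchmark=%d parsedArgs=%d\n", deviceId, (int)benchmark, args.ParsedArgc());
    return 0;
}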


@@ -0,0 +1,181 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include "btAlignedAllocator.h"
int gNumAlignedAllocs = 0;
int gNumAlignedFree = 0;
int gTotalBytesAlignedAllocs = 0;//detect memory leaks
static void *btAllocDefault(size_t size)
{
return malloc(size);
}
static void btFreeDefault(void *ptr)
{
free(ptr);
}
static btAllocFunc *sAllocFunc = btAllocDefault;
static btFreeFunc *sFreeFunc = btFreeDefault;
#if defined (BT_HAS_ALIGNED_ALLOCATOR)
#include <malloc.h>
static void *btAlignedAllocDefault(size_t size, int alignment)
{
return _aligned_malloc(size, (size_t)alignment);
}
static void btAlignedFreeDefault(void *ptr)
{
_aligned_free(ptr);
}
#elif defined(__CELLOS_LV2__)
#include <stdlib.h>
static inline void *btAlignedAllocDefault(size_t size, int alignment)
{
return memalign(alignment, size);
}
static inline void btAlignedFreeDefault(void *ptr)
{
free(ptr);
}
#else
static inline void *btAlignedAllocDefault(size_t size, int alignment)
{
void *ret;
char *real;
real = (char *)sAllocFunc(size + sizeof(void *) + (alignment-1));
if (real) {
ret = btAlignPointer(real + sizeof(void *),alignment);
*((void **)(ret)-1) = (void *)(real);
} else {
ret = (void *)(real);
}
return (ret);
}
static inline void btAlignedFreeDefault(void *ptr)
{
void* real;
if (ptr) {
real = *((void **)(ptr)-1);
sFreeFunc(real);
}
}
#endif
static btAlignedAllocFunc *sAlignedAllocFunc = btAlignedAllocDefault;
static btAlignedFreeFunc *sAlignedFreeFunc = btAlignedFreeDefault;
void btAlignedAllocSetCustomAligned(btAlignedAllocFunc *allocFunc, btAlignedFreeFunc *freeFunc)
{
sAlignedAllocFunc = allocFunc ? allocFunc : btAlignedAllocDefault;
sAlignedFreeFunc = freeFunc ? freeFunc : btAlignedFreeDefault;
}
void btAlignedAllocSetCustom(btAllocFunc *allocFunc, btFreeFunc *freeFunc)
{
sAllocFunc = allocFunc ? allocFunc : btAllocDefault;
sFreeFunc = freeFunc ? freeFunc : btFreeDefault;
}
#ifdef BT_DEBUG_MEMORY_ALLOCATIONS
//this generic allocator provides the total allocated number of bytes
#include <stdio.h>
void* btAlignedAllocInternal (size_t size, int alignment, int line, const char* filename)
{
void *ret;
char *real;
gTotalBytesAlignedAllocs += size;
gNumAlignedAllocs++;
real = (char *)sAllocFunc(size + 2*sizeof(void *) + (alignment-1));
if (real) {
ret = (void*) btAlignPointer(real + 2*sizeof(void *), alignment);
*((void **)(ret)-1) = (void *)(real);
*((int*)(ret)-2) = size;
} else {
ret = (void *)(real);//??
}
printf("allocation#%d at address %x, from %s,line %d, size %d\n",gNumAlignedAllocs,real, filename,line,size);
int* ptr = (int*)ret;
*ptr = 12;
return (ret);
}
void btAlignedFreeInternal (void* ptr, int line, const char* filename)
{
void* real;
gNumAlignedFree++;
if (ptr) {
real = *((void **)(ptr)-1);
int size = *((int*)(ptr)-2);
gTotalBytesAlignedAllocs -= size;
printf("free #%d at address %x, from %s,line %d, size %d\n",gNumAlignedFree,real, filename,line,size);
sFreeFunc(real);
} else
{
printf("NULL ptr\n");
}
}
#else //BT_DEBUG_MEMORY_ALLOCATIONS
void* btAlignedAllocInternal (size_t size, int alignment)
{
gNumAlignedAllocs++;
void* ptr;
ptr = sAlignedAllocFunc(size, alignment);
// printf("btAlignedAllocInternal %d, %x\n",size,ptr);
return ptr;
}
void btAlignedFreeInternal (void* ptr)
{
if (!ptr)
{
return;
}
gNumAlignedFree++;
// printf("btAlignedFreeInternal %x\n",ptr);
sAlignedFreeFunc(ptr);
}
#endif //BT_DEBUG_MEMORY_ALLOCATIONS


@@ -0,0 +1,107 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef BT_ALIGNED_ALLOCATOR
#define BT_ALIGNED_ALLOCATOR
///we will probably replace this with our own aligned memory allocator,
///so we replace _aligned_malloc and _aligned_free with our own
///that is more portable and more predictable
#include "btScalar.h"
//#define BT_DEBUG_MEMORY_ALLOCATIONS 1
#ifdef BT_DEBUG_MEMORY_ALLOCATIONS
#define btAlignedAlloc(a,b) \
btAlignedAllocInternal(a,b,__LINE__,__FILE__)
#define btAlignedFree(ptr) \
btAlignedFreeInternal(ptr,__LINE__,__FILE__)
void* btAlignedAllocInternal (size_t size, int alignment, int line, const char* filename);
void btAlignedFreeInternal (void* ptr, int line, const char* filename);
#else
void* btAlignedAllocInternal (size_t size, int alignment);
void btAlignedFreeInternal (void* ptr);
#define btAlignedAlloc(size,alignment) btAlignedAllocInternal(size,alignment)
#define btAlignedFree(ptr) btAlignedFreeInternal(ptr)
#endif
typedef int size_type;
typedef void *(btAlignedAllocFunc)(size_t size, int alignment);
typedef void (btAlignedFreeFunc)(void *memblock);
typedef void *(btAllocFunc)(size_t size);
typedef void (btFreeFunc)(void *memblock);
///The developer can let all Bullet memory allocations go through a custom memory allocator, using btAlignedAllocSetCustom
void btAlignedAllocSetCustom(btAllocFunc *allocFunc, btFreeFunc *freeFunc);
///If the developer already has a custom aligned allocator, then btAlignedAllocSetCustomAligned can be used. The default aligned allocator pre-allocates extra memory using the non-aligned allocator, and instruments it.
void btAlignedAllocSetCustomAligned(btAlignedAllocFunc *allocFunc, btAlignedFreeFunc *freeFunc);
///The btAlignedAllocator is a portable class for aligned memory allocations.
///Default implementations for unaligned and aligned allocations can be overridden by a custom allocator using btAlignedAllocSetCustom and btAlignedAllocSetCustomAligned.
template < typename T , unsigned Alignment >
class btAlignedAllocator {
typedef btAlignedAllocator< T , Alignment > self_type;
public:
//just going down a list:
btAlignedAllocator() {}
/*
btAlignedAllocator( const self_type & ) {}
*/
template < typename Other >
btAlignedAllocator( const btAlignedAllocator< Other , Alignment > & ) {}
typedef const T* const_pointer;
typedef const T& const_reference;
typedef T* pointer;
typedef T& reference;
typedef T value_type;
pointer address ( reference ref ) const { return &ref; }
const_pointer address ( const_reference ref ) const { return &ref; }
pointer allocate ( size_type n , const_pointer * hint = 0 ) {
(void)hint;
return reinterpret_cast< pointer >(btAlignedAlloc( sizeof(value_type) * n , Alignment ));
}
void construct ( pointer ptr , const value_type & value ) { new (ptr) value_type( value ); }
void deallocate( pointer ptr ) {
btAlignedFree( reinterpret_cast< void * >( ptr ) );
}
void destroy ( pointer ptr ) { ptr->~value_type(); }
template < typename O > struct rebind {
typedef btAlignedAllocator< O , Alignment > other;
};
template < typename O >
self_type & operator=( const btAlignedAllocator< O , Alignment > & ) { return *this; }
friend bool operator==( const self_type & , const self_type & ) { return true; }
};
#endif //BT_ALIGNED_ALLOCATOR
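A short sketch of how the hooks above are meant to be used: install a custom base allocator with btAlignedAllocSetCustom, then request a 16-byte aligned block through the btAlignedAlloc/btAlignedFree macros. The counting wrapper names are invented for the example.

#include "btAlignedAllocator.h"
#include <cstdlib>
#include <cstdio>

// Hypothetical counting wrappers installed as Bullet's base (unaligned) allocator.
static int s_numAllocs = 0;
static void* countingAlloc(size_t size) { s_numAllocs++; return malloc(size); }
static void countingFree(void* ptr) { free(ptr); }

int main()
{
    btAlignedAllocSetCustom(countingAlloc, countingFree);

    // 16-byte aligned block; on platforms without a native _aligned_malloc the
    // default aligned allocator pads and bookkeeps on top of countingAlloc.
    void* block = btAlignedAlloc(1024, 16);
    printf("base allocations so far: %d, block at %p\n", s_numAllocs, block);
    btAlignedFree(block);
    return 0;
}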


@@ -0,0 +1,511 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef BT_OBJECT_ARRAY__
#define BT_OBJECT_ARRAY__
#include "btScalar.h" // has definitions like SIMD_FORCE_INLINE
#include "btAlignedAllocator.h"
///If the platform doesn't support placement new, you can disable BT_USE_PLACEMENT_NEW;
///in that case btAlignedObjectArray doesn't support objects with virtual methods or non-trivial constructors/destructors.
///You can enable BT_USE_MEMCPY, so that swapping elements in the array uses memcpy instead of operator=
///see discussion here: http://continuousphysics.com/Bullet/phpBB2/viewtopic.php?t=1231 and
///http://www.continuousphysics.com/Bullet/phpBB2/viewtopic.php?t=1240
#define BT_USE_PLACEMENT_NEW 1
//#define BT_USE_MEMCPY 1 //disable, because it is cumbersome to find out for each platform where memcpy is defined. It can be in <memory.h> or <string.h> or otherwise...
#define BT_ALLOW_ARRAY_COPY_OPERATOR // enabling this can accidentally perform deep copies of data if you are not careful
#ifdef BT_USE_MEMCPY
#include <memory.h>
#include <string.h>
#endif //BT_USE_MEMCPY
#ifdef BT_USE_PLACEMENT_NEW
#include <new> //for placement new
#endif //BT_USE_PLACEMENT_NEW
///The btAlignedObjectArray template class uses a subset of the std::vector interface for its methods
///It is developed to replace std::vector to avoid portability issues, including STL alignment issues when adding SIMD/SSE data
template <typename T>
//template <class T>
class btAlignedObjectArray
{
btAlignedAllocator<T , 16> m_allocator;
int m_size;
int m_capacity;
T* m_data;
//PCK: added this line
bool m_ownsMemory;
#ifdef BT_ALLOW_ARRAY_COPY_OPERATOR
public:
SIMD_FORCE_INLINE btAlignedObjectArray<T>& operator=(const btAlignedObjectArray<T> &other)
{
copyFromArray(other);
return *this;
}
#else//BT_ALLOW_ARRAY_COPY_OPERATOR
private:
SIMD_FORCE_INLINE btAlignedObjectArray<T>& operator=(const btAlignedObjectArray<T> &other);
#endif//BT_ALLOW_ARRAY_COPY_OPERATOR
protected:
SIMD_FORCE_INLINE int allocSize(int size)
{
return (size ? size*2 : 1);
}
SIMD_FORCE_INLINE void copy(int start,int end, T* dest) const
{
int i;
for (i=start;i<end;++i)
#ifdef BT_USE_PLACEMENT_NEW
new (&dest[i]) T(m_data[i]);
#else
dest[i] = m_data[i];
#endif //BT_USE_PLACEMENT_NEW
}
SIMD_FORCE_INLINE void init()
{
//PCK: added this line
m_ownsMemory = true;
m_data = 0;
m_size = 0;
m_capacity = 0;
}
SIMD_FORCE_INLINE void destroy(int first,int last)
{
int i;
for (i=first; i<last;i++)
{
m_data[i].~T();
}
}
SIMD_FORCE_INLINE void* allocate(int size)
{
if (size)
return m_allocator.allocate(size);
return 0;
}
SIMD_FORCE_INLINE void deallocate()
{
if(m_data) {
//PCK: enclosed the deallocation in this block
if (m_ownsMemory)
{
m_allocator.deallocate(m_data);
}
m_data = 0;
}
}
public:
btAlignedObjectArray()
{
init();
}
~btAlignedObjectArray()
{
clear();
}
///Generally it is best to avoid using the copy constructor of a btAlignedObjectArray, and use a (const) reference to the array instead.
btAlignedObjectArray(const btAlignedObjectArray& otherArray)
{
init();
int otherSize = otherArray.size();
resize (otherSize);
otherArray.copy(0, otherSize, m_data);
}
/// return the number of elements in the array
SIMD_FORCE_INLINE int size() const
{
return m_size;
}
SIMD_FORCE_INLINE const T& at(int n) const
{
btAssert(n>=0);
btAssert(n<size());
return m_data[n];
}
SIMD_FORCE_INLINE T& at(int n)
{
btAssert(n>=0);
btAssert(n<size());
return m_data[n];
}
SIMD_FORCE_INLINE const T& operator[](int n) const
{
btAssert(n>=0);
btAssert(n<size());
return m_data[n];
}
SIMD_FORCE_INLINE T& operator[](int n)
{
btAssert(n>=0);
btAssert(n<size());
return m_data[n];
}
///clear the array, deallocating the memory. Generally it is better to use array.resize(0), to reduce the performance overhead of run-time memory (de)allocations.
SIMD_FORCE_INLINE void clear()
{
destroy(0,size());
deallocate();
init();
}
SIMD_FORCE_INLINE void pop_back()
{
btAssert(m_size>0);
m_size--;
m_data[m_size].~T();
}
///resize changes the number of elements in the array. If the new size is larger, the new elements will be constructed using the optional second argument.
///When the new number of elements is smaller, the destructor will be called, but memory will not be freed, to reduce the performance overhead of run-time memory (de)allocations.
SIMD_FORCE_INLINE void resizeNoInitialize(int newsize)
{
int curSize = size();
if (newsize < curSize)
{
} else
{
if (newsize > size())
{
reserve(newsize);
}
//leave this uninitialized
}
m_size = newsize;
}
SIMD_FORCE_INLINE void resize(int newsize, const T& fillData=T())
{
int curSize = size();
if (newsize < curSize)
{
for(int i = newsize; i < curSize; i++)
{
m_data[i].~T();
}
} else
{
if (newsize > size())
{
reserve(newsize);
}
#ifdef BT_USE_PLACEMENT_NEW
for (int i=curSize;i<newsize;i++)
{
new ( &m_data[i]) T(fillData);
}
#endif //BT_USE_PLACEMENT_NEW
}
m_size = newsize;
}
SIMD_FORCE_INLINE T& expandNonInitializing( )
{
int sz = size();
if( sz == capacity() )
{
reserve( allocSize(size()) );
}
m_size++;
return m_data[sz];
}
SIMD_FORCE_INLINE T& expand( const T& fillValue=T())
{
int sz = size();
if( sz == capacity() )
{
reserve( allocSize(size()) );
}
m_size++;
#ifdef BT_USE_PLACEMENT_NEW
new (&m_data[sz]) T(fillValue); //use the in-place new (not really allocating heap memory)
#endif
return m_data[sz];
}
SIMD_FORCE_INLINE void push_back(const T& _Val)
{
int sz = size();
if( sz == capacity() )
{
reserve( allocSize(size()) );
}
#ifdef BT_USE_PLACEMENT_NEW
new ( &m_data[m_size] ) T(_Val);
#else
m_data[size()] = _Val;
#endif //BT_USE_PLACEMENT_NEW
m_size++;
}
/// return the number of pre-allocated (reserved) elements; this is at least as large as the current number of elements, see size() and reserve()
SIMD_FORCE_INLINE int capacity() const
{
return m_capacity;
}
SIMD_FORCE_INLINE void reserve(int _Count)
{ // determine new minimum length of allocated storage
if (capacity() < _Count)
{ // not enough room, reallocate
T* s = (T*)allocate(_Count);
copy(0, size(), s);
destroy(0,size());
deallocate();
//PCK: added this line
m_ownsMemory = true;
m_data = s;
m_capacity = _Count;
}
}
class less
{
public:
bool operator() ( const T& a, const T& b )
{
return ( a < b );
}
};
template <typename L>
void quickSortInternal(const L& CompareFunc,int lo, int hi)
{
// lo is the lower index, hi is the upper index
// of the region of array a that is to be sorted
int i=lo, j=hi;
T x=m_data[(lo+hi)/2];
// partition
do
{
while (CompareFunc(m_data[i],x))
i++;
while (CompareFunc(x,m_data[j]))
j--;
if (i<=j)
{
swap(i,j);
i++; j--;
}
} while (i<=j);
// recursion
if (lo<j)
quickSortInternal( CompareFunc, lo, j);
if (i<hi)
quickSortInternal( CompareFunc, i, hi);
}
template <typename L>
void quickSort(const L& CompareFunc)
{
//don't sort 0 or 1 elements
if (size()>1)
{
quickSortInternal(CompareFunc,0,size()-1);
}
}
///heap sort from http://www.csse.monash.edu.au/~lloyd/tildeAlgDS/Sort/Heap/
template <typename L>
void downHeap(T *pArr, int k, int n, const L& CompareFunc)
{
/* PRE: a[k+1..N] is a heap */
/* POST: a[k..N] is a heap */
T temp = pArr[k - 1];
/* k has child(s) */
while (k <= n/2)
{
int child = 2*k;
if ((child < n) && CompareFunc(pArr[child - 1] , pArr[child]))
{
child++;
}
/* pick larger child */
if (CompareFunc(temp , pArr[child - 1]))
{
/* move child up */
pArr[k - 1] = pArr[child - 1];
k = child;
}
else
{
break;
}
}
pArr[k - 1] = temp;
} /*downHeap*/
void swap(int index0,int index1)
{
#ifdef BT_USE_MEMCPY
char temp[sizeof(T)];
memcpy(temp,&m_data[index0],sizeof(T));
memcpy(&m_data[index0],&m_data[index1],sizeof(T));
memcpy(&m_data[index1],temp,sizeof(T));
#else
T temp = m_data[index0];
m_data[index0] = m_data[index1];
m_data[index1] = temp;
#endif //BT_USE_MEMCPY
}
template <typename L>
void heapSort(const L& CompareFunc)
{
/* sort a[0..N-1], N.B. 0 to N-1 */
int k;
int n = m_size;
for (k = n/2; k > 0; k--)
{
downHeap(m_data, k, n, CompareFunc);
}
/* a[1..N] is now a heap */
while ( n>=1 )
{
swap(0,n-1); /* largest of a[0..n-1] */
n = n - 1;
/* restore a[1..i-1] heap */
downHeap(m_data, 1, n, CompareFunc);
}
}
///non-recursive binary search, assumes sorted array
int findBinarySearch(const T& key) const
{
int first = 0;
int last = size()-1;
//assume sorted array
while (first <= last) {
int mid = (first + last) / 2; // compute mid point.
if (key > m_data[mid])
first = mid + 1; // repeat search in top half.
else if (key < m_data[mid])
last = mid - 1; // repeat search in bottom half.
else
return mid; // found it. return position /////
}
return size(); // failed to find key
}
int findLinearSearch(const T& key) const
{
int index=size();
int i;
for (i=0;i<size();i++)
{
if (m_data[i] == key)
{
index = i;
break;
}
}
return index;
}
void remove(const T& key)
{
int findIndex = findLinearSearch(key);
if (findIndex<size())
{
swap( findIndex,size()-1);
pop_back();
}
}
//PCK: whole function
void initializeFromBuffer(void *buffer, int size, int capacity)
{
clear();
m_ownsMemory = false;
m_data = (T*)buffer;
m_size = size;
m_capacity = capacity;
}
void copyFromArray(const btAlignedObjectArray& otherArray)
{
int otherSize = otherArray.size();
resize (otherSize);
otherArray.copy(0, otherSize, m_data);
}
};
#endif //BT_OBJECT_ARRAY__
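A minimal sketch of the std::vector-like subset described above: push_back, quickSort with a comparison functor, findBinarySearch, and resize(0) to shrink without freeing. The functor is defined locally; its operator() is const because quickSortInternal takes the comparator by const reference.

#include "btAlignedObjectArray.h"
#include <cstdio>

// comparison functor for quickSort/heapSort
struct IntLess
{
    bool operator()(const int& a, const int& b) const { return a < b; }
};

int main()
{
    btAlignedObjectArray<int> arr;
    arr.push_back(3);
    arr.push_back(1);
    arr.push_back(2);

    arr.quickSort(IntLess());
    int pos = arr.findBinarySearch(2); // returns size() when the key is not found
    printf("size=%d, key 2 found at index %d\n", arr.size(), pos);

    arr.resize(0); // keeps the allocation; cheaper than clear() when the array is reused
    return 0;
}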


@@ -0,0 +1,213 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
//Host-code rewritten by Erwin Coumans
#define BOUNDSEARCH_PATH "opencl/parallel_primitives/kernels/BoundSearchKernels.cl"
#define KERNEL0 "SearchSortDataLowerKernel"
#define KERNEL1 "SearchSortDataUpperKernel"
#define KERNEL2 "SubtractKernel"
#include "btBoundSearchCL.h"
#include "../../basic_initialize/btOpenCLUtils.h"
#include "btLauncherCL.h"
#include "../kernels/BoundSearchKernelsCL.h"
btBoundSearchCL::btBoundSearchCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int maxSize)
:m_context(ctx),
m_device(device),
m_queue(queue)
{
const char* additionalMacros = "";
const char* srcFileNameForCaching="";
cl_int pErrNum;
const char* kernelSource = boundSearchKernelsCL;
cl_program boundSearchProg = btOpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, BOUNDSEARCH_PATH);
btAssert(boundSearchProg);
m_lowerSortDataKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SearchSortDataLowerKernel", &pErrNum, boundSearchProg,additionalMacros );
btAssert(m_lowerSortDataKernel );
m_upperSortDataKernel= btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SearchSortDataUpperKernel", &pErrNum, boundSearchProg,additionalMacros );
btAssert(m_upperSortDataKernel);
m_subtractKernel = 0;
if( maxSize )
{
m_subtractKernel= btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SubtractKernel", &pErrNum, boundSearchProg,additionalMacros );
btAssert(m_subtractKernel);
}
//m_constBuffer = new btOpenCLArray<btInt4>( device, 1, BufferBase::BUFFER_CONST );
m_lower = (maxSize == 0)? 0: new btOpenCLArray<unsigned int>(ctx,queue,maxSize );
m_upper = (maxSize == 0)? 0: new btOpenCLArray<unsigned int>(ctx,queue, maxSize );
m_filler = new btFillCL(ctx,device,queue);
}
btBoundSearchCL::~btBoundSearchCL()
{
delete m_lower;
delete m_upper;
delete m_filler;
clReleaseKernel(m_lowerSortDataKernel);
clReleaseKernel(m_upperSortDataKernel);
clReleaseKernel(m_subtractKernel);
}
void btBoundSearchCL::execute(btOpenCLArray<btSortData>& src, int nSrc, btOpenCLArray<unsigned int>& dst, int nDst, Option option )
{
btInt4 constBuffer;
constBuffer.x = nSrc;
constBuffer.y = nDst;
if( option == BOUND_LOWER )
{
btBufferInfoCL bInfo[] = { btBufferInfoCL( src.getBufferCL(), true ), btBufferInfoCL( dst.getBufferCL()) };
btLauncherCL launcher( m_queue, m_lowerSortDataKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
launcher.setConst( nSrc );
launcher.setConst( nDst );
launcher.launch1D( nSrc, 64 );
}
else if( option == BOUND_UPPER )
{
btBufferInfoCL bInfo[] = { btBufferInfoCL( src.getBufferCL(), true ), btBufferInfoCL( dst.getBufferCL() ) };
btLauncherCL launcher(m_queue, m_upperSortDataKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
launcher.setConst( nSrc );
launcher.setConst( nDst );
launcher.launch1D( nSrc, 64 );
}
else if( option == COUNT )
{
btAssert( m_lower );
btAssert( m_upper );
btAssert( m_lower->capacity() <= (int)nDst );
btAssert( m_upper->capacity() <= (int)nDst );
int zero = 0;
m_filler->execute( *m_lower, zero, nDst );
m_filler->execute( *m_upper, zero, nDst );
execute( src, nSrc, *m_lower, nDst, BOUND_LOWER );
execute( src, nSrc, *m_upper, nDst, BOUND_UPPER );
{
btBufferInfoCL bInfo[] = { btBufferInfoCL( m_upper->getBufferCL(), true ), btBufferInfoCL( m_lower->getBufferCL(), true ), btBufferInfoCL( dst.getBufferCL() ) };
btLauncherCL launcher( m_queue, m_subtractKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
launcher.setConst( nSrc );
launcher.setConst( nDst );
launcher.launch1D( nDst, 64 );
}
}
else
{
btAssert( 0 );
}
}
void btBoundSearchCL::executeHost( btAlignedObjectArray<btSortData>& src, int nSrc,
btAlignedObjectArray<unsigned int>& dst, int nDst, Option option )
{
for(int i=0; i<nSrc-1; i++)
btAssert( src[i].m_key <= src[i+1].m_key );
btSortData minData,zeroData,maxData;
minData.m_key = -1;
minData.m_value = -1;
zeroData.m_key=0;
zeroData.m_value=0;
maxData.m_key = nDst;
maxData.m_value = nDst;
if( option == BOUND_LOWER )
{
for(int i=0; i<nSrc; i++)
{
btSortData& iData = (i==0)? minData: src[i-1];
btSortData& jData = (i==nSrc)? maxData: src[i];
if( iData.m_key != jData.m_key )
{
int k = jData.m_key;
{
dst[k] = i;
}
}
}
}
else if( option == BOUND_UPPER )
{
for(int i=1; i<nSrc+1; i++)
{
btSortData& iData = src[i-1];
btSortData& jData = (i==nSrc)? maxData: src[i];
if( iData.m_key != jData.m_key )
{
int k = iData.m_key;
{
dst[k] = i;
}
}
}
}
else if( option == COUNT )
{
btAlignedObjectArray<unsigned int> lower;
lower.resize(nDst );
btAlignedObjectArray<unsigned int> upper;
upper.resize(nDst );
for(int i=0; i<nDst; i++)
{
lower[i] = upper[i] = 0;
}
executeHost( src, nSrc, lower, nDst, BOUND_LOWER );
executeHost( src, nSrc, upper, nDst, BOUND_UPPER );
for( int i=0; i<nDst; i++)
{
dst[i] = upper[i] - lower[i];
}
}
else
{
btAssert( 0 );
}
}


@@ -0,0 +1,67 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#ifndef BT_BOUNDSEARCH_H
#define BT_BOUNDSEARCH_H
#pragma once
/*#include <Adl/Adl.h>
#include <AdlPrimitives/Math/Math.h>
#include <AdlPrimitives/Sort/SortData.h>
#include <AdlPrimitives/Fill/Fill.h>
*/
#include "btOpenCLArray.h"
#include "btFillCL.h"
#include "btRadixSort32CL.h" //for btSortData (perhaps move it?)
class btBoundSearchCL
{
public:
enum Option
{
BOUND_LOWER,
BOUND_UPPER,
COUNT,
};
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
cl_kernel m_lowerSortDataKernel;
cl_kernel m_upperSortDataKernel;
cl_kernel m_subtractKernel;
btOpenCLArray<btInt4>* m_constbtOpenCLArray;
btOpenCLArray<unsigned int>* m_lower;
btOpenCLArray<unsigned int>* m_upper;
btFillCL* m_filler;
btBoundSearchCL(cl_context context, cl_device_id device, cl_command_queue queue, int size);
virtual ~btBoundSearchCL();
// src must be sorted by key: src[i].m_key <= src[i+1].m_key
void execute( btOpenCLArray<btSortData>& src, int nSrc, btOpenCLArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER );
void executeHost( btAlignedObjectArray<btSortData>& src, int nSrc, btAlignedObjectArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER);
};
#endif //BT_BOUNDSEARCH_H
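Since the GPU path needs a live OpenCL context, the host fallback is the easiest way to see what this class computes. The sketch below is a hypothetical helper; it assumes a btBoundSearchCL built elsewhere and the m_key/m_value members of btSortData that executeHost above relies on.

#include "btBoundSearchCL.h"

// Host-only sketch: with keys already sorted ascending, COUNT leaves dst[k] equal to
// the number of entries whose key is k (upper bound minus lower bound).
void countKeysOnHost(btBoundSearchCL& search)
{
    btAlignedObjectArray<btSortData> src;
    int keys[6] = {0, 0, 1, 3, 3, 3};
    for (int i = 0; i < 6; i++)
    {
        btSortData d;
        d.m_key = keys[i];
        d.m_value = i;
        src.push_back(d);
    }

    btAlignedObjectArray<unsigned int> counts;
    counts.resize(4);
    search.executeHost(src, src.size(), counts, 4, btBoundSearchCL::COUNT);
    // counts now holds {2, 1, 0, 3}
}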


@@ -0,0 +1,19 @@
#ifndef BT_BUFFER_INFO_CL_H
#define BT_BUFFER_INFO_CL_H
#include "btOpenCLArray.h"
struct btBufferInfoCL
{
//btBufferInfoCL(){}
// template<typename T>
btBufferInfoCL(cl_mem buff, bool isReadOnly = false): m_clBuffer(buff), m_isReadOnly(isReadOnly){}
cl_mem m_clBuffer;
bool m_isReadOnly;
};
#endif //BT_BUFFER_INFO_CL_H


@@ -0,0 +1,126 @@
#include "btFillCL.h"
#include "../../basic_initialize/btOpenCLUtils.h"
#include "btBufferInfoCL.h"
#include "btLauncherCL.h"
#define FILL_CL_PROGRAM_PATH "opencl/parallel_primitives/kernels/FillKernels.cl"
#include "../kernels/FillKernelsCL.h"
btFillCL::btFillCL(cl_context ctx, cl_device_id device, cl_command_queue queue)
:m_commandQueue(queue)
{
const char* kernelSource = fillKernelsCL;
cl_int pErrNum;
const char* additionalMacros = "";
cl_program fillProg = btOpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, FILL_CL_PROGRAM_PATH);
btAssert(fillProg);
m_fillIntKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillIntKernel", &pErrNum, fillProg,additionalMacros );
btAssert(m_fillIntKernel);
m_fillUnsignedIntKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillUnsignedIntKernel", &pErrNum, fillProg,additionalMacros );
btAssert(m_fillIntKernel);
m_fillFloatKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillFloatKernel", &pErrNum, fillProg,additionalMacros );
btAssert(m_fillFloatKernel);
m_fillKernelInt2 = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillInt2Kernel", &pErrNum, fillProg,additionalMacros );
btAssert(m_fillKernelInt2);
}
btFillCL::~btFillCL()
{
clReleaseKernel(m_fillKernelInt2);
clReleaseKernel(m_fillIntKernel);
clReleaseKernel(m_fillUnsignedIntKernel);
clReleaseKernel(m_fillFloatKernel);
}
void btFillCL::execute(btOpenCLArray<float>& src, const float value, int n, int offset)
{
btAssert( n>0 );
{
btLauncherCL launcher( m_commandQueue, m_fillFloatKernel );
launcher.setBuffer( src.getBufferCL());
launcher.setConst( n );
launcher.setConst( value );
launcher.setConst( offset);
launcher.launch1D( n );
}
}
void btFillCL::execute(btOpenCLArray<int>& src, const int value, int n, int offset)
{
btAssert( n>0 );
{
btLauncherCL launcher( m_commandQueue, m_fillIntKernel );
launcher.setBuffer(src.getBufferCL());
launcher.setConst( n);
launcher.setConst( value);
launcher.setConst( offset);
launcher.launch1D( n );
}
}
void btFillCL::execute(btOpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset)
{
btAssert( n>0 );
{
btBufferInfoCL bInfo[] = { btBufferInfoCL( src.getBufferCL() ) };
btLauncherCL launcher( m_commandQueue, m_fillUnsignedIntKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
launcher.setConst( n );
launcher.setConst(value);
launcher.setConst(offset);
launcher.launch1D( n );
}
}
void btFillCL::executeHost(btAlignedObjectArray<btInt2> &src, const btInt2 &value, int n, int offset)
{
for (int i=0;i<n;i++)
{
src[i+offset]=value;
}
}
void btFillCL::executeHost(btAlignedObjectArray<int> &src, const int value, int n, int offset)
{
for (int i=0;i<n;i++)
{
src[i+offset]=value;
}
}
void btFillCL::execute(btOpenCLArray<btInt2> &src, const btInt2 &value, int n, int offset)
{
btAssert( n>0 );
{
btBufferInfoCL bInfo[] = { btBufferInfoCL( src.getBufferCL() ) };
btLauncherCL launcher(m_commandQueue, m_fillKernelInt2);
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
launcher.setConst(n);
launcher.setConst(value);
launcher.setConst(offset);
//( constBuffer );
launcher.launch1D( n );
}
}


@@ -0,0 +1,137 @@
#ifndef BT_FILL_CL_H
#define BT_FILL_CL_H
#include "btOpenCLArray.h"
#include "btScalar.h"
ATTRIBUTE_ALIGNED16(struct) btUnsignedInt4
{
BT_DECLARE_ALIGNED_ALLOCATOR();
union
{
struct
{
unsigned int x,y,z,w;
};
struct
{
unsigned int s[4];
};
};
};
ATTRIBUTE_ALIGNED16(struct) btInt4
{
BT_DECLARE_ALIGNED_ALLOCATOR();
union
{
struct
{
int x,y,z,w;
};
struct
{
int s[4];
};
};
};
struct btUnsignedInt2
{
union
{
struct
{
unsigned int x,y;
};
struct
{
unsigned int s[2];
};
};
};
struct btInt2
{
union
{
struct
{
int x,y;
};
struct
{
int s[2];
};
};
};
SIMD_FORCE_INLINE btInt4 btMakeInt4(int x, int y, int z, int w = 0)
{
btInt4 v;
v.s[0] = x; v.s[1] = y; v.s[2] = z; v.s[3] = w;
return v;
}
SIMD_FORCE_INLINE btUnsignedInt4 btMakeUnsignedInt4(unsigned int x, unsigned int y, unsigned int z, unsigned int w = 0)
{
btUnsignedInt4 v;
v.s[0] = x; v.s[1] = y; v.s[2] = z; v.s[3] = w;
return v;
}
class btFillCL
{
cl_command_queue m_commandQueue;
cl_kernel m_fillKernelInt2;
cl_kernel m_fillIntKernel;
cl_kernel m_fillUnsignedIntKernel;
cl_kernel m_fillFloatKernel;
public:
struct btConstData
{
union
{
btInt4 m_data;
btUnsignedInt4 m_UnsignedData;
};
int m_offset;
int m_n;
int m_padding[2];
};
protected:
public:
btFillCL(cl_context ctx, cl_device_id device, cl_command_queue queue);
virtual ~btFillCL();
void execute(btOpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset = 0);
void execute(btOpenCLArray<int>& src, const int value, int n, int offset = 0);
void execute(btOpenCLArray<float>& src, const float value, int n, int offset = 0);
void execute(btOpenCLArray<btInt2>& src, const btInt2& value, int n, int offset = 0);
void executeHost(btAlignedObjectArray<btInt2> &src, const btInt2 &value, int n, int offset);
void executeHost(btAlignedObjectArray<int> &src, const int value, int n, int offset);
// void execute(btOpenCLArray<btInt4>& src, const btInt4& value, int n, int offset = 0);
};
#endif //BT_FILL_CL_H
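A small sketch of the host-side fallbacks and the int2/int4 helpers declared above. The GPU execute() overloads and the btFillCL constructor both need a live OpenCL context/device/queue, so this hypothetical helper assumes an instance created elsewhere.

#include "btFillCL.h"

// Fill a sub-range of a host array and exercise the helper structs.
void fillOnHost(btFillCL& filler)
{
    btAlignedObjectArray<int> data;
    data.resize(8, 0);
    filler.executeHost(data, 7, 4, 2); // writes the value 7 into data[2..5]

    btInt4 quad = btMakeInt4(10, 20, 30); // w defaults to 0
    btInt2 pair;
    pair.x = quad.x;
    pair.y = quad.y;

    btAlignedObjectArray<btInt2> pairs;
    pairs.resize(8);
    filler.executeHost(pairs, pair, 8, 0); // fills pairs[0..7] with {10, 20}
}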


@@ -0,0 +1,450 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2009 Erwin Coumans http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef BT_HASH_MAP_H
#define BT_HASH_MAP_H
#include "btAlignedObjectArray.h"
///very basic hashable string implementation, compatible with btHashMap
struct btHashString
{
const char* m_string;
unsigned int m_hash;
SIMD_FORCE_INLINE unsigned int getHash()const
{
return m_hash;
}
btHashString(const char* name)
:m_string(name)
{
/* magic numbers from http://www.isthe.com/chongo/tech/comp/fnv/ */
static const unsigned int InitialFNV = 2166136261u;
static const unsigned int FNVMultiple = 16777619u;
/* Fowler / Noll / Vo (FNV) Hash */
unsigned int hash = InitialFNV;
for(int i = 0; m_string[i]; i++)
{
hash = hash ^ (m_string[i]); /* xor the low 8 bits */
hash = hash * FNVMultiple; /* multiply by the magic number */
}
m_hash = hash;
}
int portableStringCompare(const char* src, const char* dst) const
{
int ret = 0 ;
while( ! (ret = *(unsigned char *)src - *(unsigned char *)dst) && *dst)
++src, ++dst;
if ( ret < 0 )
ret = -1 ;
else if ( ret > 0 )
ret = 1 ;
return( ret );
}
bool equals(const btHashString& other) const
{
return (m_string == other.m_string) ||
(0==portableStringCompare(m_string,other.m_string));
}
};
const int BT_HASH_NULL=0xffffffff;
class btHashInt
{
int m_uid;
public:
btHashInt(int uid) :m_uid(uid)
{
}
int getUid1() const
{
return m_uid;
}
void setUid1(int uid)
{
m_uid = uid;
}
bool equals(const btHashInt& other) const
{
return getUid1() == other.getUid1();
}
//to our success
SIMD_FORCE_INLINE unsigned int getHash()const
{
int key = m_uid;
// Thomas Wang's hash
key += ~(key << 15); key ^= (key >> 10); key += (key << 3); key ^= (key >> 6); key += ~(key << 11); key ^= (key >> 16);
return key;
}
};
class btHashPtr
{
union
{
const void* m_pointer;
int m_hashValues[2];
};
public:
btHashPtr(const void* ptr)
:m_pointer(ptr)
{
}
const void* getPointer() const
{
return m_pointer;
}
bool equals(const btHashPtr& other) const
{
return getPointer() == other.getPointer();
}
//to our success
SIMD_FORCE_INLINE unsigned int getHash()const
{
const bool VOID_IS_8 = ((sizeof(void*)==8));
int key = VOID_IS_8? m_hashValues[0]+m_hashValues[1] : m_hashValues[0];
// Thomas Wang's hash
key += ~(key << 15); key ^= (key >> 10); key += (key << 3); key ^= (key >> 6); key += ~(key << 11); key ^= (key >> 16);
return key;
}
};
template <class Value>
class btHashKeyPtr
{
int m_uid;
public:
btHashKeyPtr(int uid) :m_uid(uid)
{
}
int getUid1() const
{
return m_uid;
}
bool equals(const btHashKeyPtr<Value>& other) const
{
return getUid1() == other.getUid1();
}
//to our success
SIMD_FORCE_INLINE unsigned int getHash()const
{
int key = m_uid;
// Thomas Wang's hash
key += ~(key << 15); key ^= (key >> 10); key += (key << 3); key ^= (key >> 6); key += ~(key << 11); key ^= (key >> 16);
return key;
}
};
template <class Value>
class btHashKey
{
int m_uid;
public:
btHashKey(int uid) :m_uid(uid)
{
}
int getUid1() const
{
return m_uid;
}
bool equals(const btHashKey<Value>& other) const
{
return getUid1() == other.getUid1();
}
//to our success
SIMD_FORCE_INLINE unsigned int getHash()const
{
int key = m_uid;
// Thomas Wang's hash
key += ~(key << 15); key ^= (key >> 10); key += (key << 3); key ^= (key >> 6); key += ~(key << 11); key ^= (key >> 16);
return key;
}
};
///The btHashMap template class implements a generic and lightweight hashmap.
///A basic sample of how to use btHashMap is located in Demos\BasicDemo\main.cpp
template <class Key, class Value>
class btHashMap
{
protected:
btAlignedObjectArray<int> m_hashTable;
btAlignedObjectArray<int> m_next;
btAlignedObjectArray<Value> m_valueArray;
btAlignedObjectArray<Key> m_keyArray;
void growTables(const Key& /*key*/)
{
int newCapacity = m_valueArray.capacity();
if (m_hashTable.size() < newCapacity)
{
//grow hashtable and next table
int curHashtableSize = m_hashTable.size();
m_hashTable.resize(newCapacity);
m_next.resize(newCapacity);
int i;
for (i= 0; i < newCapacity; ++i)
{
m_hashTable[i] = BT_HASH_NULL;
}
for (i = 0; i < newCapacity; ++i)
{
m_next[i] = BT_HASH_NULL;
}
for(i=0;i<curHashtableSize;i++)
{
//const Value& value = m_valueArray[i];
//const Key& key = m_keyArray[i];
int hashValue = m_keyArray[i].getHash() & (m_valueArray.capacity()-1); // New hash value with new mask
m_next[i] = m_hashTable[hashValue];
m_hashTable[hashValue] = i;
}
}
}
public:
void insert(const Key& key, const Value& value) {
int hash = key.getHash() & (m_valueArray.capacity()-1);
//replace value if the key is already there
int index = findIndex(key);
if (index != BT_HASH_NULL)
{
m_valueArray[index]=value;
return;
}
int count = m_valueArray.size();
int oldCapacity = m_valueArray.capacity();
m_valueArray.push_back(value);
m_keyArray.push_back(key);
int newCapacity = m_valueArray.capacity();
if (oldCapacity < newCapacity)
{
growTables(key);
//hash with new capacity
hash = key.getHash() & (m_valueArray.capacity()-1);
}
m_next[count] = m_hashTable[hash];
m_hashTable[hash] = count;
}
void remove(const Key& key) {
int hash = key.getHash() & (m_valueArray.capacity()-1);
int pairIndex = findIndex(key);
if (pairIndex ==BT_HASH_NULL)
{
return;
}
// Remove the pair from the hash table.
int index = m_hashTable[hash];
btAssert(index != BT_HASH_NULL);
int previous = BT_HASH_NULL;
while (index != pairIndex)
{
previous = index;
index = m_next[index];
}
if (previous != BT_HASH_NULL)
{
btAssert(m_next[previous] == pairIndex);
m_next[previous] = m_next[pairIndex];
}
else
{
m_hashTable[hash] = m_next[pairIndex];
}
// We now move the last pair into the spot of the
// pair being removed. We need to fix the hash
// table indices to support the move.
int lastPairIndex = m_valueArray.size() - 1;
// If the removed pair is the last pair, we are done.
if (lastPairIndex == pairIndex)
{
m_valueArray.pop_back();
m_keyArray.pop_back();
return;
}
// Remove the last pair from the hash table.
int lastHash = m_keyArray[lastPairIndex].getHash() & (m_valueArray.capacity()-1);
index = m_hashTable[lastHash];
btAssert(index != BT_HASH_NULL);
previous = BT_HASH_NULL;
while (index != lastPairIndex)
{
previous = index;
index = m_next[index];
}
if (previous != BT_HASH_NULL)
{
btAssert(m_next[previous] == lastPairIndex);
m_next[previous] = m_next[lastPairIndex];
}
else
{
m_hashTable[lastHash] = m_next[lastPairIndex];
}
// Copy the last pair into the removed pair's spot.
m_valueArray[pairIndex] = m_valueArray[lastPairIndex];
m_keyArray[pairIndex] = m_keyArray[lastPairIndex];
// Insert the last pair into the hash table
m_next[pairIndex] = m_hashTable[lastHash];
m_hashTable[lastHash] = pairIndex;
m_valueArray.pop_back();
m_keyArray.pop_back();
}
int size() const
{
return m_valueArray.size();
}
const Value* getAtIndex(int index) const
{
btAssert(index < m_valueArray.size());
return &m_valueArray[index];
}
Value* getAtIndex(int index)
{
btAssert(index < m_valueArray.size());
return &m_valueArray[index];
}
Value* operator[](const Key& key) {
return find(key);
}
const Value* find(const Key& key) const
{
int index = findIndex(key);
if (index == BT_HASH_NULL)
{
return NULL;
}
return &m_valueArray[index];
}
Value* find(const Key& key)
{
int index = findIndex(key);
if (index == BT_HASH_NULL)
{
return NULL;
}
return &m_valueArray[index];
}
int findIndex(const Key& key) const
{
unsigned int hash = key.getHash() & (m_valueArray.capacity()-1);
if (hash >= (unsigned int)m_hashTable.size())
{
return BT_HASH_NULL;
}
int index = m_hashTable[hash];
while ((index != BT_HASH_NULL) && key.equals(m_keyArray[index]) == false)
{
index = m_next[index];
}
return index;
}
void clear()
{
m_hashTable.clear();
m_next.clear();
m_valueArray.clear();
m_keyArray.clear();
}
};
#endif //BT_HASH_MAP_H
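A small sketch of the basic btHashMap usage the comment above refers to, using the btHashInt and btHashString key types defined in this header; the key/value choices are arbitrary.

#include "btHashMap.h"
#include <cstdio>

int main()
{
    // integer keys (hashed with Thomas Wang's mix above)
    btHashMap<btHashInt, float> massById;
    massById.insert(btHashInt(7), 1.5f);
    const float* mass = massById.find(btHashInt(7));
    if (mass)
        printf("mass of body 7 = %f\n", *mass);

    // string keys: btHashString hashes with FNV and equals() compares the characters
    btHashMap<btHashString, int> idByName;
    idByName.insert(btHashString("ground"), 0);
    idByName.remove(btHashString("ground"));
    printf("entries left: %d\n", idByName.size());
    return 0;
}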


@@ -0,0 +1,363 @@
#ifndef BT_LAUNCHER_CL_H
#define BT_LAUNCHER_CL_H
#include "btBufferInfoCL.h"
#include "btMinMax.h"
#include "btOpenCLArray.h"
#include <stdio.h>
#ifdef _WIN32
#pragma warning(disable :4996)
#endif
#define BT_CL_MAX_ARG_SIZE 16
struct btKernelArgData
{
int m_isBuffer;
int m_argIndex;
int m_argSizeInBytes;
union
{
cl_mem m_clBuffer;
unsigned char m_argData[BT_CL_MAX_ARG_SIZE];
};
};
class btLauncherCL
{
cl_command_queue m_commandQueue;
cl_kernel m_kernel;
int m_idx;
btAlignedObjectArray<btKernelArgData> m_kernelArguments;
int m_serializationSizeInBytes;
public:
btAlignedObjectArray<btOpenCLArray<unsigned char>* > m_arrays;
btLauncherCL(cl_command_queue queue, cl_kernel kernel)
:m_commandQueue(queue),
m_kernel(kernel),
m_idx(0)
{
m_serializationSizeInBytes = sizeof(int);
}
virtual ~btLauncherCL()
{
for (int i=0;i<m_arrays.size();i++)
{
clReleaseMemObject(m_arrays[i]->getBufferCL());
}
}
inline void setBuffer( cl_mem clBuffer)
{
btKernelArgData kernelArg;
kernelArg.m_argIndex = m_idx;
kernelArg.m_isBuffer = 1;
kernelArg.m_clBuffer = clBuffer;
cl_mem_info param_name = CL_MEM_SIZE;
size_t param_value;
size_t sizeInBytes = sizeof(size_t);
size_t actualSizeInBytes;
cl_int err;
err = clGetMemObjectInfo ( kernelArg.m_clBuffer,
param_name,
sizeInBytes,
&param_value,
&actualSizeInBytes);
btAssert( err == CL_SUCCESS );
kernelArg.m_argSizeInBytes = param_value;
m_kernelArguments.push_back(kernelArg);
m_serializationSizeInBytes+= sizeof(btKernelArgData);
m_serializationSizeInBytes+=param_value;
cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &clBuffer);
btAssert( status == CL_SUCCESS );
}
inline void setBuffers( btBufferInfoCL* buffInfo, int n )
{
for(int i=0; i<n; i++)
{
btKernelArgData kernelArg;
kernelArg.m_argIndex = m_idx;
kernelArg.m_isBuffer = 1;
kernelArg.m_clBuffer = buffInfo[i].m_clBuffer;
cl_mem_info param_name = CL_MEM_SIZE;
size_t param_value;
size_t sizeInBytes = sizeof(size_t);
size_t actualSizeInBytes;
cl_int err;
err = clGetMemObjectInfo ( kernelArg.m_clBuffer,
param_name,
sizeInBytes,
&param_value,
&actualSizeInBytes);
btAssert( err == CL_SUCCESS );
kernelArg.m_argSizeInBytes = param_value;
m_kernelArguments.push_back(kernelArg);
m_serializationSizeInBytes+= sizeof(btKernelArgData);
m_serializationSizeInBytes+=param_value;
cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &buffInfo[i].m_clBuffer);
btAssert( status == CL_SUCCESS );
}
}
int getSerializationBufferSize() const
{
return m_serializationSizeInBytes;
}
inline int deserializeArgs(unsigned char* buf, int bufSize, cl_context ctx)
{
int index=0;
int numArguments = *(int*) &buf[index];
index+=sizeof(int);
for (int i=0;i<numArguments;i++)
{
btKernelArgData* arg = (btKernelArgData*)&buf[index];
index+=sizeof(btKernelArgData);
if (arg->m_isBuffer)
{
btOpenCLArray<unsigned char>* clData = new btOpenCLArray<unsigned char>(ctx,m_commandQueue, arg->m_argSizeInBytes);
clData->resize(arg->m_argSizeInBytes);
clData->copyFromHostPointer(&buf[index], arg->m_argSizeInBytes);
arg->m_clBuffer = clData->getBufferCL();
m_arrays.push_back(clData);
cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &arg->m_clBuffer);
btAssert( status == CL_SUCCESS );
index+=arg->m_argSizeInBytes;
} else
{
cl_int status = clSetKernelArg( m_kernel, m_idx++, arg->m_argSizeInBytes, &arg->m_argData);
btAssert( status == CL_SUCCESS );
}
m_kernelArguments.push_back(*arg);
}
m_serializationSizeInBytes = index;
return index;
}
inline int validateResults(unsigned char* goldBuffer, int goldBufferCapacity, cl_context ctx)
{
int index=0;
int numArguments = *(int*) &goldBuffer[index];
index+=sizeof(int);
if (numArguments != m_kernelArguments.size())
{
printf("failed validation: expected %d arguments, found %d\n",numArguments, m_kernelArguments.size());
return -1;
}
for (int ii=0;ii<numArguments;ii++)
{
btKernelArgData* argGold = (btKernelArgData*)&goldBuffer[index];
if (m_kernelArguments[ii].m_argSizeInBytes != argGold->m_argSizeInBytes)
{
printf("failed validation: argument %d sizeInBytes expected: %d, found %d\n",ii, argGold->m_argSizeInBytes, m_kernelArguments[ii].m_argSizeInBytes);
return -2;
}
{
int expected = argGold->m_isBuffer;
int found = m_kernelArguments[ii].m_isBuffer;
if (expected != found)
{
printf("failed validation: argument %d isBuffer expected: %d, found %d\n",ii,expected, found);
return -3;
}
}
index+=sizeof(btKernelArgData);
if (argGold->m_isBuffer)
{
unsigned char* memBuf= (unsigned char*) malloc(m_kernelArguments[ii].m_argSizeInBytes);
unsigned char* goldBuf = &goldBuffer[index];
for (int j=0;j<m_kernelArguments[j].m_argSizeInBytes;j++)
{
memBuf[j] = 0xaa;
}
cl_int status = 0;
status = clEnqueueReadBuffer( m_commandQueue, m_kernelArguments[ii].m_clBuffer, CL_TRUE, 0, m_kernelArguments[ii].m_argSizeInBytes,
memBuf, 0,0,0 );
btAssert( status==CL_SUCCESS );
clFinish(m_commandQueue);
for (int b=0;b<m_kernelArguments[ii].m_argSizeInBytes;b++)
{
int expected = goldBuf[b];
int found = memBuf[b];
if (expected != found)
{
printf("failed validation: argument %d OpenCL data at byte position %d expected: %d, found %d\n",
ii, b, expected, found);
return -4;
}
}
index+=argGold->m_argSizeInBytes;
} else
{
//compare content
for (int b=0;b<m_kernelArguments[ii].m_argSizeInBytes;b++)
{
int expected = argGold->m_argData[b];
int found =m_kernelArguments[ii].m_argData[b];
if (expected != found)
{
printf("failed validation: argument %d const data at byte position %d expected: %d, found %d\n",
ii, b, expected, found);
return -5;
}
}
}
}
return index;
}
inline int serializeArguments(unsigned char* destBuffer, int destBufferCapacity)
{
//initialize to known values
for (int i=0;i<destBufferCapacity;i++)
destBuffer[i] = 0xec;
assert(destBufferCapacity>=m_serializationSizeInBytes);
//todo: use the btSerializer for this to allow for 32/64bit, endianness etc
int numArguments = m_kernelArguments.size();
int curBufferSize = 0;
int* dest = (int*)&destBuffer[curBufferSize];
*dest = numArguments;
curBufferSize += sizeof(int);
for (int i=0;i<this->m_kernelArguments.size();i++)
{
btKernelArgData* arg = (btKernelArgData*) &destBuffer[curBufferSize];
*arg = m_kernelArguments[i];
curBufferSize+=sizeof(btKernelArgData);
if (arg->m_isBuffer==1)
{
//copy the OpenCL buffer content
cl_int status = 0;
status = clEnqueueReadBuffer( m_commandQueue, arg->m_clBuffer, 0, 0, arg->m_argSizeInBytes,
&destBuffer[curBufferSize], 0,0,0 );
btAssert( status==CL_SUCCESS );
clFinish(m_commandQueue);
curBufferSize+=arg->m_argSizeInBytes;
}
}
return curBufferSize;
}
void serializeToFile(const char* fileName, int numWorkItems)
{
int num = numWorkItems;
int buffSize = getSerializationBufferSize();
unsigned char* buf = new unsigned char[buffSize+sizeof(int)];
for (int i=0;i<buffSize+1;i++)
{
unsigned char* ptr = (unsigned char*)&buf[i];
*ptr = 0xff;
}
int actualWrite = serializeArguments(buf,buffSize);
unsigned char* cptr = (unsigned char*)&buf[buffSize];
// printf("buf[buffSize] = %d\n",*cptr);
assert(buf[buffSize]==0xff);//check for buffer overrun
int* ptr = (int*)&buf[buffSize];
*ptr = num;
FILE* f = fopen(fileName,"wb");
fwrite(buf,buffSize+sizeof(int),1,f);
fclose(f);
delete[] buf;
}
template<typename T>
inline void setConst( const T& consts )
{
int sz=sizeof(T);
btAssert(sz<=BT_CL_MAX_ARG_SIZE);
btKernelArgData kernelArg;
kernelArg.m_argIndex = m_idx;
kernelArg.m_isBuffer = 0;
T* destArg = (T*)kernelArg.m_argData;
*destArg = consts;
kernelArg.m_argSizeInBytes = sizeof(T);
m_kernelArguments.push_back(kernelArg);
m_serializationSizeInBytes+=sizeof(btKernelArgData);
cl_int status = clSetKernelArg( m_kernel, m_idx++, sz, &consts );
btAssert( status == CL_SUCCESS );
}
inline void launch1D( int numThreads, int localSize = 64)
{
launch2D( numThreads, 1, localSize, 1 );
}
inline void launch2D( int numThreadsX, int numThreadsY, int localSizeX, int localSizeY )
{
size_t gRange[3] = {1,1,1};
size_t lRange[3] = {1,1,1};
lRange[0] = localSizeX;
lRange[1] = localSizeY;
gRange[0] = btMax((size_t)1, (numThreadsX/lRange[0])+(!(numThreadsX%lRange[0])?0:1));
gRange[0] *= lRange[0];
gRange[1] = btMax((size_t)1, (numThreadsY/lRange[1])+(!(numThreadsY%lRange[1])?0:1));
gRange[1] *= lRange[1];
cl_int status = clEnqueueNDRangeKernel( m_commandQueue,
m_kernel, 2, NULL, gRange, lRange, 0,0,0 );
if (status != CL_SUCCESS)
{
printf("Error: OpenCL status = %d\n",status);
}
btAssert( status == CL_SUCCESS );
}
};
#endif //BT_LAUNCHER_CL_H


@@ -0,0 +1,71 @@
/*
Copyright (c) 2003-2006 Gino van den Bergen / Erwin Coumans http://continuousphysics.com/Bullet/
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef BT_GEN_MINMAX_H
#define BT_GEN_MINMAX_H
#include "btScalar.h"
template <class T>
SIMD_FORCE_INLINE const T& btMin(const T& a, const T& b)
{
return a < b ? a : b ;
}
template <class T>
SIMD_FORCE_INLINE const T& btMax(const T& a, const T& b)
{
return a > b ? a : b;
}
template <class T>
SIMD_FORCE_INLINE const T& btClamped(const T& a, const T& lb, const T& ub)
{
return a < lb ? lb : (ub < a ? ub : a);
}
template <class T>
SIMD_FORCE_INLINE void btSetMin(T& a, const T& b)
{
if (b < a)
{
a = b;
}
}
template <class T>
SIMD_FORCE_INLINE void btSetMax(T& a, const T& b)
{
if (a < b)
{
a = b;
}
}
template <class T>
SIMD_FORCE_INLINE void btClamp(T& a, const T& lb, const T& ub)
{
if (a < lb)
{
a = lb;
}
else if (ub < a)
{
a = ub;
}
}
#endif //BT_GEN_MINMAX_H
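For completeness, a tiny sketch of the in-place variants above, which modify their first argument instead of returning a value.

#include "btMinMax.h"
#include <cstdio>

int main()
{
    float v = 1.75f;
    btClamp(v, 0.0f, 1.0f); // in place: v becomes 1.0
    int a = 3;
    btSetMax(a, 10);        // in place: a becomes 10
    printf("v=%f a=%d min=%d\n", v, a, btMin(4, 7));
    return 0;
}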


@@ -0,0 +1,274 @@
#ifndef BT_OPENCL_ARRAY_H
#define BT_OPENCL_ARRAY_H
#include "btAlignedObjectArray.h"
#include "../../basic_initialize/btOpenCLInclude.h"
template <typename T>
class btOpenCLArray
{
int m_size;
int m_capacity;
cl_mem m_clBuffer;
cl_context m_clContext;
cl_command_queue m_commandQueue;
bool m_ownsMemory;
bool m_allowGrowingCapacity;
void deallocate()
{
if (m_clBuffer && m_ownsMemory)
{
clReleaseMemObject(m_clBuffer);
}
m_clBuffer = 0;
m_capacity=0;
}
btOpenCLArray<T>& operator=(const btOpenCLArray<T>& src);
SIMD_FORCE_INLINE int allocSize(int size)
{
return (size ? size*2 : 1);
}
public:
btOpenCLArray(cl_context ctx, cl_command_queue queue, int initialCapacity=0, bool allowGrowingCapacity=true)
:m_size(0), m_capacity(0),m_clBuffer(0),
m_clContext(ctx),m_commandQueue(queue),
m_ownsMemory(true),m_allowGrowingCapacity(true)
{
if (initialCapacity)
{
reserve(initialCapacity);
}
m_allowGrowingCapacity = allowGrowingCapacity;
}
///this is an error-prone method with no error checking, be careful!
void setFromOpenCLBuffer(cl_mem buffer, int sizeInElements)
{
deallocate();
m_ownsMemory = false;
m_allowGrowingCapacity = false;
m_clBuffer = buffer;
m_size = sizeInElements;
m_capacity = sizeInElements;
}
// we could enable this assignment, but need to make sure to avoid accidental deep copies
// btOpenCLArray<T>& operator=(const btAlignedObjectArray<T>& src)
// {
// copyFromArray(src);
// return *this;
// }
cl_mem getBufferCL() const
{
return m_clBuffer;
}
virtual ~btOpenCLArray()
{
deallocate();
m_size=0;
m_capacity=0;
}
SIMD_FORCE_INLINE void push_back(const T& _Val,bool waitForCompletion=true)
{
int sz = size();
if( sz == capacity() )
{
reserve( allocSize(size()) );
}
copyFromHostPointer(&_Val, 1, sz, waitForCompletion);
m_size++;
}
SIMD_FORCE_INLINE T forcedAt(int n) const
{
btAssert(n>=0);
btAssert(n<capacity());
T elem;
copyToHostPointer(&elem,1,n,true);
return elem;
}
SIMD_FORCE_INLINE T at(int n) const
{
btAssert(n>=0);
btAssert(n<size());
T elem;
copyToHostPointer(&elem,1,n,true);
return elem;
}
SIMD_FORCE_INLINE void resize(int newsize, bool copyOldContents=true)
{
int curSize = size();
if (newsize < curSize)
{
//leave the OpenCL memory for now
} else
{
if (newsize > size())
{
reserve(newsize,copyOldContents);
}
//leave new data uninitialized (init in debug mode?)
//for (int i=curSize;i<newsize;i++) ...
}
m_size = newsize;
}
SIMD_FORCE_INLINE int size() const
{
return m_size;
}
SIMD_FORCE_INLINE int capacity() const
{
return m_capacity;
}
SIMD_FORCE_INLINE void reserve(int _Count, bool copyOldContents=true)
{ // determine new minimum length of allocated storage
if (capacity() < _Count)
{ // not enough room, reallocate
if (m_allowGrowingCapacity)
{
cl_int ciErrNum;
//create a new OpenCL buffer
int memSizeInBytes = sizeof(T)*_Count;
cl_mem buf = clCreateBuffer(m_clContext, CL_MEM_READ_WRITE, memSizeInBytes, NULL, &ciErrNum);
btAssert(ciErrNum==CL_SUCCESS);
//#define BT_ALWAYS_INITIALIZE_OPENCL_BUFFERS
#ifdef BT_ALWAYS_INITIALIZE_OPENCL_BUFFERS
unsigned char* src = (unsigned char*)malloc(memSizeInBytes);
for (int i=0;i<memSizeInBytes;i++)
src[i] = 0xbb;
ciErrNum = clEnqueueWriteBuffer( m_commandQueue, buf, CL_TRUE, 0, memSizeInBytes, src, 0,0,0 );
btAssert(ciErrNum==CL_SUCCESS);
clFinish(m_commandQueue);
free(src);
#endif //BT_ALWAYS_INITIALIZE_OPENCL_BUFFERS
if (copyOldContents)
copyToCL(buf, size());
//deallocate the old buffer
deallocate();
m_clBuffer = buf;
m_capacity = _Count;
} else
{
//fail: assert and deallocate
btAssert(0);
deallocate();
}
}
}
void copyToCL(cl_mem destination, int numElements, int firstElem=0, int dstOffsetInElems=0) const
{
if (numElements<=0)
return;
btAssert(m_clBuffer);
btAssert(destination);
//likely some error, destination is same as source
btAssert(m_clBuffer != destination);
btAssert((firstElem+numElements)<=m_size);
cl_int status = 0;
btAssert(numElements>0);
btAssert(numElements<=m_size);
int srcOffsetBytes = sizeof(T)*firstElem;
int dstOffsetInBytes = sizeof(T)*dstOffsetInElems;
status = clEnqueueCopyBuffer( m_commandQueue, m_clBuffer, destination,
srcOffsetBytes, dstOffsetInBytes, sizeof(T)*numElements, 0, 0, 0 );
btAssert( status == CL_SUCCESS );
}
void copyFromHost(const btAlignedObjectArray<T>& srcArray, bool waitForCompletion=true)
{
int newSize = srcArray.size();
bool copyOldContents = false;
resize (newSize,copyOldContents);
if (newSize)
copyFromHostPointer(&srcArray[0],newSize,0,waitForCompletion);
}
void copyFromHostPointer(const T* src, int numElems, int destFirstElem= 0, bool waitForCompletion=true)
{
btAssert(numElems+destFirstElem <= capacity());
cl_int status = 0;
int sizeInBytes=sizeof(T)*numElems;
status = clEnqueueWriteBuffer( m_commandQueue, m_clBuffer, 0, sizeof(T)*destFirstElem, sizeInBytes,
src, 0,0,0 );
btAssert(status == CL_SUCCESS );
if (waitForCompletion)
clFinish(m_commandQueue);
}
void copyToHost(btAlignedObjectArray<T>& destArray, bool waitForCompletion=true) const
{
destArray.resize(this->size());
if (size())
copyToHostPointer(&destArray[0], size(),0,waitForCompletion);
}
void copyToHostPointer(T* destPtr, int numElem, int srcFirstElem=0, bool waitForCompletion=true) const
{
btAssert(numElem+srcFirstElem <= capacity());
cl_int status = 0;
status = clEnqueueReadBuffer( m_commandQueue, m_clBuffer, 0, sizeof(T)*srcFirstElem, sizeof(T)*numElem,
destPtr, 0,0,0 );
btAssert( status==CL_SUCCESS );
if (waitForCompletion)
clFinish(m_commandQueue);
}
void copyFromOpenCLArray(const btOpenCLArray& src)
{
int newSize = src.size();
resize(newSize);
if (size())
{
src.copyToCL(m_clBuffer,size());
}
}
};
#endif //BT_OPENCL_ARRAY_H
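Editorial note: the container above is essentially a GPU-resident counterpart of btAlignedObjectArray. A minimal usage sketch follows; it assumes ctx and queue are a valid cl_context and cl_command_queue created during the usual OpenCL initialization (the variable names are illustrative, not part of this file):

btOpenCLArray<int> gpuInts(ctx, queue);
btAlignedObjectArray<int> hostInts;
hostInts.push_back(7);
hostInts.push_back(13);
gpuInts.copyFromHost(hostInts);    //resizes the device buffer and uploads both values
gpuInts.push_back(42);             //grows the capacity if needed and appends one element
int first = gpuInts.at(0);         //blocking single-element read back (7)
btAlignedObjectArray<int> readBack;
gpuInts.copyToHost(readBack);      //downloads all three elements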

View File

@@ -0,0 +1,126 @@
#include "btPrefixScanCL.h"
#include "btFillCL.h"
#define BT_PREFIXSCAN_PROG_PATH "opencl/parallel_primitives/kernels/PrefixScanKernels.cl"
#include "btLauncherCL.h"
#include "../../basic_initialize/btOpenCLUtils.h"
#include "../kernels/PrefixScanKernelsCL.h"
btPrefixScanCL::btPrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size)
:m_commandQueue(queue)
{
const char* scanKernelSource = prefixScanKernelsCL;
cl_int pErrNum;
char* additionalMacros=0;
m_workBuffer = new btOpenCLArray<unsigned int>(ctx,queue,size);
cl_program scanProg = btOpenCLUtils::compileCLProgramFromString( ctx, device, scanKernelSource, &pErrNum,additionalMacros, BT_PREFIXSCAN_PROG_PATH);
btAssert(scanProg);
m_localScanKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg,additionalMacros );
btAssert(m_localScanKernel );
m_blockSumKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg,additionalMacros );
btAssert(m_blockSumKernel );
m_propagationKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg,additionalMacros );
btAssert(m_propagationKernel );
}
btPrefixScanCL::~btPrefixScanCL()
{
delete m_workBuffer;
clReleaseKernel(m_localScanKernel);
clReleaseKernel(m_blockSumKernel);
clReleaseKernel(m_propagationKernel);
}
template<class T>
T btNextPowerOf2(T n)
{
n -= 1;
for(int i=0; i<sizeof(T)*8; i++)
n = n | (n>>i);
return n+1;
}
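//Editorial example: btNextPowerOf2 smears the highest set bit downwards and adds one,
//so btNextPowerOf2(20u) == 32 and btNextPowerOf2(64u) == 64 (powers of two are unchanged).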
void btPrefixScanCL::execute(btOpenCLArray<unsigned int>& src, btOpenCLArray<unsigned int>& dst, int n, unsigned int* sum)
{
// btAssert( data->m_option == EXCLUSIVE );
const unsigned int numBlocks = (const unsigned int)( (n+BLOCK_SIZE*2-1)/(BLOCK_SIZE*2) );
dst.resize(src.size());
m_workBuffer->resize(src.size());
btInt4 constBuffer;
constBuffer.x = n;
constBuffer.y = numBlocks;
constBuffer.z = (int)btNextPowerOf2( numBlocks );
btOpenCLArray<unsigned int>* srcNative = &src;
btOpenCLArray<unsigned int>* dstNative = &dst;
{
btBufferInfoCL bInfo[] = { btBufferInfoCL( dstNative->getBufferCL() ), btBufferInfoCL( srcNative->getBufferCL() ), btBufferInfoCL( m_workBuffer->getBufferCL() ) };
btLauncherCL launcher( m_commandQueue, m_localScanKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
launcher.setConst( constBuffer );
launcher.launch1D( numBlocks*BLOCK_SIZE, BLOCK_SIZE );
}
{
btBufferInfoCL bInfo[] = { btBufferInfoCL( m_workBuffer->getBufferCL() ) };
btLauncherCL launcher( m_commandQueue, m_blockSumKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
launcher.setConst( constBuffer );
launcher.launch1D( BLOCK_SIZE, BLOCK_SIZE );
}
if( numBlocks > 1 )
{
btBufferInfoCL bInfo[] = { btBufferInfoCL( dstNative->getBufferCL() ), btBufferInfoCL( m_workBuffer->getBufferCL() ) };
btLauncherCL launcher( m_commandQueue, m_propagationKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
launcher.setConst( constBuffer );
launcher.launch1D( (numBlocks-1)*BLOCK_SIZE, BLOCK_SIZE );
}
if( sum )
{
clFinish(m_commandQueue);
dstNative->copyToHostPointer(sum,1,n-1,true);
}
}
void btPrefixScanCL::executeHost(btAlignedObjectArray<unsigned int>& src, btAlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum)
{
unsigned int s = 0;
//if( data->m_option == EXCLUSIVE )
{
for(int i=0; i<n; i++)
{
dst[i] = s;
s += src[i];
}
}
/*else
{
for(int i=0; i<n; i++)
{
s += hSrc[i];
hDst[i] = s;
}
}
*/
if( sum )
{
*sum = dst[n-1];
}
}
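Editorial note: as written, executeHost performs an exclusive prefix scan: dst[i] receives the sum of src[0..i-1], and *sum receives dst[n-1] (the exclusive prefix of the last element, not the grand total). A small host-only sketch, assuming scanner is a btPrefixScanCL instance constructed elsewhere:

btAlignedObjectArray<unsigned int> src, dst;
unsigned int in[4] = {3, 1, 4, 1};
for (int i=0; i<4; i++) { src.push_back(in[i]); dst.push_back(0); } //dst must already have n slots
unsigned int last = 0;
scanner.executeHost(src, dst, 4, &last); //no GPU work is involved in this path
//dst is now {0, 3, 4, 8} and last == 8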

View File

@@ -0,0 +1,37 @@
#ifndef BT_PREFIX_SCAN_CL_H
#define BT_PREFIX_SCAN_CL_H
#include "btOpenCLArray.h"
#include "btBufferInfoCL.h"
#include "btAlignedObjectArray.h"
class btPrefixScanCL
{
enum
{
BLOCK_SIZE = 128
};
// Option m_option;
cl_command_queue m_commandQueue;
cl_kernel m_localScanKernel;
cl_kernel m_blockSumKernel;
cl_kernel m_propagationKernel;
btOpenCLArray<unsigned int>* m_workBuffer;
public:
btPrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue,int size=0);
virtual ~btPrefixScanCL();
void execute(btOpenCLArray<unsigned int>& src, btOpenCLArray<unsigned int>& dst, int n, unsigned int* sum = 0);
void executeHost(btAlignedObjectArray<unsigned int>& src, btAlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum);
};
#endif //BT_PREFIX_SCAN_CL_H
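Editorial note: a hedged sketch of the device path declared above, assuming ctx, device and queue come from the usual OpenCL initialization and that hostValues holds the unsigned ints to scan:

btPrefixScanCL scan(ctx, device, queue, hostValues.size());
btOpenCLArray<unsigned int> srcGpu(ctx, queue), dstGpu(ctx, queue);
srcGpu.copyFromHost(hostValues);
unsigned int lastPrefix = 0;
scan.execute(srcGpu, dstGpu, srcGpu.size(), &lastPrefix); //exclusive scan on the device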

View File

@@ -0,0 +1,566 @@
/*
***************************************************************************************************
**
** profile.cpp
**
** Real-Time Hierarchical Profiling for Game Programming Gems 3
**
** by Greg Hjelstrom & Byon Garrabrant
**
***************************************************************************************************/
// Credits: The Clock class was inspired by the Timer classes in
// Ogre (www.ogre3d.org).
#include "btQuickprof.h"
#ifndef BT_NO_PROFILE
static btClock gProfileClock;
#ifdef __CELLOS_LV2__
#include <sys/sys_time.h>
#include <sys/time_util.h>
#include <stdio.h>
#endif
#if defined (SUNOS) || defined (__SUNOS__)
#include <stdio.h>
#endif
#if defined(WIN32) || defined(_WIN32)
#define BT_USE_WINDOWS_TIMERS
#define WIN32_LEAN_AND_MEAN
#define NOWINRES
#define NOMCX
#define NOIME
#ifdef _XBOX
#include <Xtl.h>
#else //_XBOX
#include <windows.h>
#endif //_XBOX
#include <time.h>
#else //_WIN32
#include <sys/time.h>
#endif //_WIN32
#define mymin(a,b) ((a) < (b) ? (a) : (b))
struct btClockData
{
#ifdef BT_USE_WINDOWS_TIMERS
LARGE_INTEGER mClockFrequency;
DWORD mStartTick;
LONGLONG mPrevElapsedTime;
LARGE_INTEGER mStartTime;
#else
#ifdef __CELLOS_LV2__
uint64_t mStartTime;
#else
struct timeval mStartTime;
#endif
#endif //__CELLOS_LV2__
};
///The btClock is a portable basic clock that measures accurate time in milliseconds and microseconds; use it for profiling.
btClock::btClock()
{
m_data = new btClockData;
#ifdef BT_USE_WINDOWS_TIMERS
QueryPerformanceFrequency(&m_data->mClockFrequency);
#endif
reset();
}
btClock::~btClock()
{
delete m_data;
}
btClock::btClock(const btClock& other)
{
m_data = new btClockData;
*m_data = *other.m_data;
}
btClock& btClock::operator=(const btClock& other)
{
*m_data = *other.m_data;
return *this;
}
/// Resets the initial reference time.
void btClock::reset()
{
#ifdef BT_USE_WINDOWS_TIMERS
QueryPerformanceCounter(&m_data->mStartTime);
m_data->mStartTick = GetTickCount();
m_data->mPrevElapsedTime = 0;
#else
#ifdef __CELLOS_LV2__
typedef uint64_t ClockSize;
ClockSize newTime;
//__asm __volatile__( "mftb %0" : "=r" (newTime) : : "memory");
SYS_TIMEBASE_GET( newTime );
m_data->mStartTime = newTime;
#else
gettimeofday(&m_data->mStartTime, 0);
#endif
#endif
}
/// Returns the time in ms since the last call to reset or since
/// the btClock was created.
unsigned long int btClock::getTimeMilliseconds()
{
#ifdef BT_USE_WINDOWS_TIMERS
LARGE_INTEGER currentTime;
QueryPerformanceCounter(&currentTime);
LONGLONG elapsedTime = currentTime.QuadPart -
m_data->mStartTime.QuadPart;
// Compute the number of millisecond ticks elapsed.
unsigned long msecTicks = (unsigned long)(1000 * elapsedTime /
m_data->mClockFrequency.QuadPart);
// Check for unexpected leaps in the Win32 performance counter.
// (This is caused by unexpected data across the PCI to ISA
// bridge, aka south bridge. See Microsoft KB274323.)
unsigned long elapsedTicks = GetTickCount() - m_data->mStartTick;
signed long msecOff = (signed long)(msecTicks - elapsedTicks);
if (msecOff < -100 || msecOff > 100)
{
// Adjust the starting time forwards.
LONGLONG msecAdjustment = mymin(msecOff *
m_data->mClockFrequency.QuadPart / 1000, elapsedTime -
m_data->mPrevElapsedTime);
m_data->mStartTime.QuadPart += msecAdjustment;
elapsedTime -= msecAdjustment;
// Recompute the number of millisecond ticks elapsed.
msecTicks = (unsigned long)(1000 * elapsedTime /
m_data->mClockFrequency.QuadPart);
}
// Store the current elapsed time for adjustments next time.
m_data->mPrevElapsedTime = elapsedTime;
return msecTicks;
#else
#ifdef __CELLOS_LV2__
uint64_t freq=sys_time_get_timebase_frequency();
double dFreq=((double) freq) / 1000.0;
typedef uint64_t ClockSize;
ClockSize newTime;
SYS_TIMEBASE_GET( newTime );
//__asm __volatile__( "mftb %0" : "=r" (newTime) : : "memory");
return (unsigned long int)((double(newTime-m_data->mStartTime)) / dFreq);
#else
struct timeval currentTime;
gettimeofday(&currentTime, 0);
return (currentTime.tv_sec - m_data->mStartTime.tv_sec) * 1000 +
(currentTime.tv_usec - m_data->mStartTime.tv_usec) / 1000;
#endif //__CELLOS_LV2__
#endif
}
/// Returns the time in us since the last call to reset or since
/// the Clock was created.
unsigned long int btClock::getTimeMicroseconds()
{
#ifdef BT_USE_WINDOWS_TIMERS
LARGE_INTEGER currentTime;
QueryPerformanceCounter(&currentTime);
LONGLONG elapsedTime = currentTime.QuadPart -
m_data->mStartTime.QuadPart;
// Compute the number of millisecond ticks elapsed.
unsigned long msecTicks = (unsigned long)(1000 * elapsedTime /
m_data->mClockFrequency.QuadPart);
// Check for unexpected leaps in the Win32 performance counter.
// (This is caused by unexpected data across the PCI to ISA
// bridge, aka south bridge. See Microsoft KB274323.)
unsigned long elapsedTicks = GetTickCount() - m_data->mStartTick;
signed long msecOff = (signed long)(msecTicks - elapsedTicks);
if (msecOff < -100 || msecOff > 100)
{
// Adjust the starting time forwards.
LONGLONG msecAdjustment = mymin(msecOff *
m_data->mClockFrequency.QuadPart / 1000, elapsedTime -
m_data->mPrevElapsedTime);
m_data->mStartTime.QuadPart += msecAdjustment;
elapsedTime -= msecAdjustment;
}
// Store the current elapsed time for adjustments next time.
m_data->mPrevElapsedTime = elapsedTime;
// Convert to microseconds.
unsigned long usecTicks = (unsigned long)(1000000 * elapsedTime /
m_data->mClockFrequency.QuadPart);
return usecTicks;
#else
#ifdef __CELLOS_LV2__
uint64_t freq=sys_time_get_timebase_frequency();
double dFreq=((double) freq)/ 1000000.0;
typedef uint64_t ClockSize;
ClockSize newTime;
//__asm __volatile__( "mftb %0" : "=r" (newTime) : : "memory");
SYS_TIMEBASE_GET( newTime );
return (unsigned long int)((double(newTime-m_data->mStartTime)) / dFreq);
#else
struct timeval currentTime;
gettimeofday(&currentTime, 0);
return (currentTime.tv_sec - m_data->mStartTime.tv_sec) * 1000000 +
(currentTime.tv_usec - m_data->mStartTime.tv_usec);
#endif//__CELLOS_LV2__
#endif
}
inline void Profile_Get_Ticks(unsigned long int * ticks)
{
*ticks = gProfileClock.getTimeMicroseconds();
}
inline float Profile_Get_Tick_Rate(void)
{
	//ticks from Profile_Get_Ticks are microseconds, so a rate of 1000 reports times in milliseconds
	//	return 1000000.f;
	return 1000.f;
}
/***************************************************************************************************
**
** CProfileNode
**
***************************************************************************************************/
/***********************************************************************************************
* INPUT: *
* name - pointer to a static string which is the name of this profile node *
* parent - parent pointer *
* *
* WARNINGS: *
* The name is assumed to be a static pointer, only the pointer is stored and compared for *
* efficiency reasons. *
*=============================================================================================*/
CProfileNode::CProfileNode( const char * name, CProfileNode * parent ) :
Name( name ),
TotalCalls( 0 ),
TotalTime( 0 ),
StartTime( 0 ),
RecursionCounter( 0 ),
Parent( parent ),
Child( NULL ),
Sibling( NULL ),
m_userPtr(0)
{
Reset();
}
void CProfileNode::CleanupMemory()
{
delete ( Child);
Child = NULL;
delete ( Sibling);
Sibling = NULL;
}
CProfileNode::~CProfileNode( void )
{
delete ( Child);
delete ( Sibling);
}
/***********************************************************************************************
* INPUT: *
* name - static string pointer to the name of the node we are searching for *
* *
* WARNINGS: *
* All profile names are assumed to be static strings so this function uses pointer compares *
* to find the named node. *
*=============================================================================================*/
CProfileNode * CProfileNode::Get_Sub_Node( const char * name )
{
// Try to find this sub node
CProfileNode * child = Child;
while ( child ) {
if ( child->Name == name ) {
return child;
}
child = child->Sibling;
}
// We didn't find it, so add it
CProfileNode * node = new CProfileNode( name, this );
node->Sibling = Child;
Child = node;
return node;
}
void CProfileNode::Reset( void )
{
TotalCalls = 0;
TotalTime = 0.0f;
if ( Child ) {
Child->Reset();
}
if ( Sibling ) {
Sibling->Reset();
}
}
void CProfileNode::Call( void )
{
TotalCalls++;
if (RecursionCounter++ == 0) {
Profile_Get_Ticks(&StartTime);
}
}
bool CProfileNode::Return( void )
{
if ( --RecursionCounter == 0 && TotalCalls != 0 ) {
unsigned long int time;
Profile_Get_Ticks(&time);
time-=StartTime;
TotalTime += (float)time / Profile_Get_Tick_Rate();
}
return ( RecursionCounter == 0 );
}
/***************************************************************************************************
**
** CProfileIterator
**
***************************************************************************************************/
CProfileIterator::CProfileIterator( CProfileNode * start )
{
CurrentParent = start;
CurrentChild = CurrentParent->Get_Child();
}
void CProfileIterator::First(void)
{
CurrentChild = CurrentParent->Get_Child();
}
void CProfileIterator::Next(void)
{
CurrentChild = CurrentChild->Get_Sibling();
}
bool CProfileIterator::Is_Done(void)
{
return CurrentChild == NULL;
}
void CProfileIterator::Enter_Child( int index )
{
CurrentChild = CurrentParent->Get_Child();
while ( (CurrentChild != NULL) && (index != 0) ) {
index--;
CurrentChild = CurrentChild->Get_Sibling();
}
if ( CurrentChild != NULL ) {
CurrentParent = CurrentChild;
CurrentChild = CurrentParent->Get_Child();
}
}
void CProfileIterator::Enter_Parent( void )
{
if ( CurrentParent->Get_Parent() != NULL ) {
CurrentParent = CurrentParent->Get_Parent();
}
CurrentChild = CurrentParent->Get_Child();
}
/***************************************************************************************************
**
** CProfileManager
**
***************************************************************************************************/
CProfileNode CProfileManager::Root( "Root", NULL );
CProfileNode * CProfileManager::CurrentNode = &CProfileManager::Root;
int CProfileManager::FrameCounter = 0;
unsigned long int CProfileManager::ResetTime = 0;
/***********************************************************************************************
* CProfileManager::Start_Profile -- Begin a named profile *
* *
* Steps one level deeper into the tree, if a child already exists with the specified name *
* then it accumulates the profiling; otherwise a new child node is added to the profile tree. *
* *
* INPUT: *
* name - name of this profiling record *
* *
* WARNINGS: *
* The string used is assumed to be a static string; pointer compares are used throughout *
* the profiling code for efficiency. *
*=============================================================================================*/
void CProfileManager::Start_Profile( const char * name )
{
if (name != CurrentNode->Get_Name()) {
CurrentNode = CurrentNode->Get_Sub_Node( name );
}
CurrentNode->Call();
}
/***********************************************************************************************
* CProfileManager::Stop_Profile -- Stop timing and record the results. *
*=============================================================================================*/
void CProfileManager::Stop_Profile( void )
{
// Return will indicate whether we should back up to our parent (we may
// be profiling a recursive function)
if (CurrentNode->Return()) {
CurrentNode = CurrentNode->Get_Parent();
}
}
/***********************************************************************************************
* CProfileManager::Reset -- Reset the contents of the profiling system *
* *
* This resets everything except for the tree structure. All of the timing data is reset. *
*=============================================================================================*/
void CProfileManager::Reset( void )
{
gProfileClock.reset();
Root.Reset();
Root.Call();
FrameCounter = 0;
Profile_Get_Ticks(&ResetTime);
}
/***********************************************************************************************
* CProfileManager::Increment_Frame_Counter -- Increment the frame counter *
*=============================================================================================*/
void CProfileManager::Increment_Frame_Counter( void )
{
FrameCounter++;
}
/***********************************************************************************************
* CProfileManager::Get_Time_Since_Reset -- returns the elapsed time since last reset *
*=============================================================================================*/
float CProfileManager::Get_Time_Since_Reset( void )
{
unsigned long int time;
Profile_Get_Ticks(&time);
time -= ResetTime;
return (float)time / Profile_Get_Tick_Rate();
}
#include <stdio.h>
void CProfileManager::dumpRecursive(CProfileIterator* profileIterator, int spacing)
{
profileIterator->First();
if (profileIterator->Is_Done())
return;
float accumulated_time=0,parent_time = profileIterator->Is_Root() ? CProfileManager::Get_Time_Since_Reset() : profileIterator->Get_Current_Parent_Total_Time();
int i;
int frames_since_reset = CProfileManager::Get_Frame_Count_Since_Reset();
for (i=0;i<spacing;i++) printf(".");
printf("----------------------------------\n");
for (i=0;i<spacing;i++) printf(".");
printf("Profiling: %s (total running time: %.3f ms) ---\n", profileIterator->Get_Current_Parent_Name(), parent_time );
float totalTime = 0.f;
int numChildren = 0;
for (i = 0; !profileIterator->Is_Done(); i++,profileIterator->Next())
{
numChildren++;
float current_total_time = profileIterator->Get_Current_Total_Time();
accumulated_time += current_total_time;
float fraction = parent_time > SIMD_EPSILON ? (current_total_time / parent_time) * 100 : 0.f;
{
int i; for (i=0;i<spacing;i++) printf(".");
}
printf("%d -- %s (%.2f %%) :: %.3f ms / frame (%d calls)\n",i, profileIterator->Get_Current_Name(), fraction,(current_total_time / (double)frames_since_reset),profileIterator->Get_Current_Total_Calls());
totalTime += current_total_time;
//recurse into children
}
if (parent_time < accumulated_time)
{
printf("what's wrong\n");
}
for (i=0;i<spacing;i++) printf(".");
printf("%s (%.3f %%) :: %.3f ms\n", "Unaccounted:",parent_time > SIMD_EPSILON ? ((parent_time - accumulated_time) / parent_time) * 100 : 0.f, parent_time - accumulated_time);
for (i=0;i<numChildren;i++)
{
profileIterator->Enter_Child(i);
dumpRecursive(profileIterator,spacing+3);
profileIterator->Enter_Parent();
}
}
void CProfileManager::dumpAll()
{
CProfileIterator* profileIterator = 0;
profileIterator = CProfileManager::Get_Iterator();
dumpRecursive(profileIterator,0);
CProfileManager::Release_Iterator(profileIterator);
}
#endif //BT_NO_PROFILE

View File

@@ -0,0 +1,203 @@
/***************************************************************************************************
**
** Real-Time Hierarchical Profiling for Game Programming Gems 3
**
** by Greg Hjelstrom & Byon Garrabrant
**
***************************************************************************************************/
// Credits: The Clock class was inspired by the Timer classes in
// Ogre (www.ogre3d.org).
#ifndef BT_QUICK_PROF_H
#define BT_QUICK_PROF_H
//To disable built-in profiling, uncomment the next line
//#define BT_NO_PROFILE 1
#ifndef BT_NO_PROFILE
#include <stdio.h>//@todo remove this, backwards compatibility
#include "btScalar.h"
#include "btAlignedAllocator.h"
#include <new>
#define USE_BT_CLOCK 1
#ifdef USE_BT_CLOCK
///The btClock is a portable basic clock that measures accurate time in milliseconds and microseconds; use it for profiling.
class btClock
{
public:
btClock();
btClock(const btClock& other);
btClock& operator=(const btClock& other);
~btClock();
/// Resets the initial reference time.
void reset();
/// Returns the time in ms since the last call to reset or since
/// the btClock was created.
unsigned long int getTimeMilliseconds();
/// Returns the time in us since the last call to reset or since
/// the Clock was created.
unsigned long int getTimeMicroseconds();
private:
struct btClockData* m_data;
};
#endif //USE_BT_CLOCK
///A node in the Profile Hierarchy Tree
class CProfileNode {
public:
CProfileNode( const char * name, CProfileNode * parent );
~CProfileNode( void );
CProfileNode * Get_Sub_Node( const char * name );
CProfileNode * Get_Parent( void ) { return Parent; }
CProfileNode * Get_Sibling( void ) { return Sibling; }
CProfileNode * Get_Child( void ) { return Child; }
void CleanupMemory();
void Reset( void );
void Call( void );
bool Return( void );
const char * Get_Name( void ) { return Name; }
int Get_Total_Calls( void ) { return TotalCalls; }
float Get_Total_Time( void ) { return TotalTime; }
void* GetUserPointer() const {return m_userPtr;}
void SetUserPointer(void* ptr) { m_userPtr = ptr;}
protected:
const char * Name;
int TotalCalls;
float TotalTime;
unsigned long int StartTime;
int RecursionCounter;
CProfileNode * Parent;
CProfileNode * Child;
CProfileNode * Sibling;
void* m_userPtr;
};
///An iterator to navigate through the tree
class CProfileIterator
{
public:
// Access all the children of the current parent
void First(void);
void Next(void);
bool Is_Done(void);
bool Is_Root(void) { return (CurrentParent->Get_Parent() == 0); }
void Enter_Child( int index ); // Make the given child the new parent
void Enter_Largest_Child( void ); // Make the largest child the new parent
void Enter_Parent( void ); // Make the current parent's parent the new parent
// Access the current child
const char * Get_Current_Name( void ) { return CurrentChild->Get_Name(); }
int Get_Current_Total_Calls( void ) { return CurrentChild->Get_Total_Calls(); }
float Get_Current_Total_Time( void ) { return CurrentChild->Get_Total_Time(); }
void* Get_Current_UserPointer( void ) { return CurrentChild->GetUserPointer(); }
void Set_Current_UserPointer(void* ptr) {CurrentChild->SetUserPointer(ptr);}
// Access the current parent
const char * Get_Current_Parent_Name( void ) { return CurrentParent->Get_Name(); }
int Get_Current_Parent_Total_Calls( void ) { return CurrentParent->Get_Total_Calls(); }
float Get_Current_Parent_Total_Time( void ) { return CurrentParent->Get_Total_Time(); }
protected:
CProfileNode * CurrentParent;
CProfileNode * CurrentChild;
CProfileIterator( CProfileNode * start );
friend class CProfileManager;
};
///The Manager for the Profile system
class CProfileManager {
public:
static void Start_Profile( const char * name );
static void Stop_Profile( void );
static void CleanupMemory(void)
{
Root.CleanupMemory();
}
static void Reset( void );
static void Increment_Frame_Counter( void );
static int Get_Frame_Count_Since_Reset( void ) { return FrameCounter; }
static float Get_Time_Since_Reset( void );
static CProfileIterator * Get_Iterator( void )
{
return new CProfileIterator( &Root );
}
static void Release_Iterator( CProfileIterator * iterator ) { delete ( iterator); }
static void dumpRecursive(CProfileIterator* profileIterator, int spacing);
static void dumpAll();
private:
static CProfileNode Root;
static CProfileNode * CurrentNode;
static int FrameCounter;
static unsigned long int ResetTime;
};
///CProfileSample is a simple way to profile a function's scope
///Use the BT_PROFILE macro at the start of the scope you want to time
class CProfileSample {
public:
CProfileSample( const char * name )
{
CProfileManager::Start_Profile( name );
}
~CProfileSample( void )
{
CProfileManager::Stop_Profile();
}
};
#define BT_PROFILE( name ) CProfileSample __profile( name )
#else
#define BT_PROFILE( name )
#endif //#ifndef BT_NO_PROFILE
#endif //BT_QUICK_PROF_H
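Editorial note: a minimal sketch of the intended usage, assuming BT_NO_PROFILE is not defined; stepSimulation and the scope names are illustrative only:

void stepSimulation() //hypothetical function, for illustration
{
	BT_PROFILE("stepSimulation"); //a CProfileSample times this whole scope
	{
		BT_PROFILE("collision"); //nested scopes become child nodes in the profile tree
		//... work ...
	}
}
//once per frame: CProfileManager::Increment_Frame_Counter();
//to print the gathered hierarchy: CProfileManager::dumpAll();
//to clear the timing data (the tree structure is kept): CProfileManager::Reset();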

View File

@@ -0,0 +1,712 @@
#include "btRadixSort32CL.h"
#include "btLauncherCL.h"
#include "../../basic_initialize/btOpenCLUtils.h"
#include "btPrefixScanCL.h"
#include "btFillCL.h"
#define RADIXSORT32_PATH "opencl/parallel_primitives/kernels/RadixSort32Kernels.cl"
#include "../kernels/RadixSort32KernelsCL.h"
btRadixSort32CL::btRadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity)
:m_commandQueue(queue)
{
btOpenCLDeviceInfo info;
btOpenCLUtils::getDeviceInfo(device,&info);
m_deviceCPU = (info.m_deviceType & CL_DEVICE_TYPE_CPU)!=0;
m_workBuffer1 = new btOpenCLArray<unsigned int>(ctx,queue);
m_workBuffer2 = new btOpenCLArray<unsigned int>(ctx,queue);
m_workBuffer3 = new btOpenCLArray<btSortData>(ctx,queue);
m_workBuffer3a = new btOpenCLArray<unsigned int>(ctx,queue);
m_workBuffer4 = new btOpenCLArray<btSortData>(ctx,queue);
m_workBuffer4a = new btOpenCLArray<unsigned int>(ctx,queue);
if (initialCapacity>0)
{
m_workBuffer1->resize(initialCapacity);
m_workBuffer3->resize(initialCapacity);
m_workBuffer3a->resize(initialCapacity);
m_workBuffer4->resize(initialCapacity);
m_workBuffer4a->resize(initialCapacity);
}
m_scan = new btPrefixScanCL(ctx,device,queue);
m_fill = new btFillCL(ctx,device,queue);
const char* additionalMacros = "";
const char* srcFileNameForCaching="";
cl_int pErrNum;
const char* kernelSource = radixSort32KernelsCL;
cl_program sortProg = btOpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, RADIXSORT32_PATH);
btAssert(sortProg);
m_streamCountSortDataKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "StreamCountSortDataKernel", &pErrNum, sortProg,additionalMacros );
btAssert(m_streamCountSortDataKernel );
m_streamCountKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "StreamCountKernel", &pErrNum, sortProg,additionalMacros );
btAssert(m_streamCountKernel);
if (m_deviceCPU)
{
m_sortAndScatterSortDataKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterSortDataKernelSerial", &pErrNum, sortProg,additionalMacros );
btAssert(m_sortAndScatterSortDataKernel);
m_sortAndScatterKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterKernelSerial", &pErrNum, sortProg,additionalMacros );
btAssert(m_sortAndScatterKernel);
} else
{
m_sortAndScatterSortDataKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterSortDataKernel", &pErrNum, sortProg,additionalMacros );
btAssert(m_sortAndScatterSortDataKernel);
m_sortAndScatterKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterKernel", &pErrNum, sortProg,additionalMacros );
btAssert(m_sortAndScatterKernel);
}
m_prefixScanKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "PrefixScanKernel", &pErrNum, sortProg,additionalMacros );
btAssert(m_prefixScanKernel);
}
btRadixSort32CL::~btRadixSort32CL()
{
delete m_scan;
delete m_fill;
delete m_workBuffer1;
delete m_workBuffer2;
delete m_workBuffer3;
delete m_workBuffer3a;
delete m_workBuffer4;
delete m_workBuffer4a;
clReleaseKernel(m_streamCountSortDataKernel);
clReleaseKernel(m_streamCountKernel);
clReleaseKernel(m_sortAndScatterSortDataKernel);
clReleaseKernel(m_sortAndScatterKernel);
clReleaseKernel(m_prefixScanKernel);
}
void btRadixSort32CL::executeHost(btAlignedObjectArray<btSortData>& inout, int sortBits /* = 32 */)
{
int n = inout.size();
const int BITS_PER_PASS = 8;
const int NUM_TABLES = (1<<BITS_PER_PASS);
int tables[NUM_TABLES];
int counter[NUM_TABLES];
btSortData* src = &inout[0];
btAlignedObjectArray<btSortData> workbuffer;
workbuffer.resize(inout.size());
btSortData* dst = &workbuffer[0];
int count=0;
for(int startBit=0; startBit<sortBits; startBit+=BITS_PER_PASS)
{
for(int i=0; i<NUM_TABLES; i++)
{
tables[i] = 0;
}
for(int i=0; i<n; i++)
{
int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES-1);
tables[tableIdx]++;
}
//#define TEST
#ifdef TEST
printf("histogram size=%d\n",NUM_TABLES);
for (int i=0;i<NUM_TABLES;i++)
{
if (tables[i]!=0)
{
printf("tables[%d]=%d]\n",i,tables[i]);
}
}
#endif //TEST
// prefix scan
int sum = 0;
for(int i=0; i<NUM_TABLES; i++)
{
int iData = tables[i];
tables[i] = sum;
sum += iData;
counter[i] = 0;
}
// distribute
for(int i=0; i<n; i++)
{
int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES-1);
dst[tables[tableIdx] + counter[tableIdx]] = src[i];
counter[tableIdx] ++;
}
btSwap( src, dst );
count++;
}
if (count&1)
{
		btAssert(0);//odd number of passes: need to copy the workbuffer back into inout
}
}
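//Editorial worked example of a single 8-bit pass above (startBit == 0) for the keys {5, 3, 5, 1}:
//  histogram:      tables[1] = 1, tables[3] = 1, tables[5] = 2 (all other buckets are 0)
//  exclusive scan: tables[1] = 0, tables[3] = 1, tables[5] = 2
//  distribute:     key 1 -> slot 0, key 3 -> slot 1, the two 5s -> slots 2 and 3 (stable order)
//giving {1, 3, 5, 5} in the work buffer for that pass.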
void btRadixSort32CL::executeHost(btOpenCLArray<btSortData>& keyValuesInOut, int sortBits /* = 32 */)
{
btAlignedObjectArray<btSortData> inout;
keyValuesInOut.copyToHost(inout);
executeHost(inout,sortBits);
keyValuesInOut.copyFromHost(inout);
}
void btRadixSort32CL::execute(btOpenCLArray<unsigned int>& keysIn, btOpenCLArray<unsigned int>& keysOut, btOpenCLArray<unsigned int>& valuesIn,
btOpenCLArray<unsigned int>& valuesOut, int n, int sortBits)
{
	//not implemented in this version; use the keys-only or btSortData execute overloads instead
}
//#define DEBUG_RADIXSORT
//#define DEBUG_RADIXSORT2
void btRadixSort32CL::execute(btOpenCLArray<btSortData>& keyValuesInOut, int sortBits /* = 32 */)
{
int originalSize = keyValuesInOut.size();
int workingSize = originalSize;
int dataAlignment = DATA_ALIGNMENT;
#ifdef DEBUG_RADIXSORT2
btAlignedObjectArray<btSortData> test2;
keyValuesInOut.copyToHost(test2);
printf("numElem = %d\n",test2.size());
for (int i=0;i<test2.size();i++)
{
printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
}
#endif //DEBUG_RADIXSORT2
btOpenCLArray<btSortData>* src = 0;
if (workingSize%dataAlignment)
{
workingSize += dataAlignment-(workingSize%dataAlignment);
m_workBuffer4->copyFromOpenCLArray(keyValuesInOut);
m_workBuffer4->resize(workingSize);
btSortData fillValue;
fillValue.m_key = 0xffffffff;
fillValue.m_value = 0xffffffff;
#define USE_BTFILL
#ifdef USE_BTFILL
m_fill->execute((btOpenCLArray<btInt2>&)*m_workBuffer4,(btInt2&)fillValue,workingSize-originalSize,originalSize);
#else
//fill the remaining elements (very slow way, todo: fill on the GPU/OpenCL side)
for (int i=originalSize; i<workingSize;i++)
{
m_workBuffer4->copyFromHostPointer(&fillValue,1,i);
}
#endif//USE_BTFILL
src = m_workBuffer4;
} else
{
src = &keyValuesInOut;
m_workBuffer4->resize(0);
}
btAssert( workingSize%DATA_ALIGNMENT == 0 );
int minCap = NUM_BUCKET*NUM_WGS;
int n = workingSize;
m_workBuffer1->resize(minCap);
m_workBuffer3->resize(workingSize);
// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
btAssert( BITS_PER_PASS == 4 );
btAssert( WG_SIZE == 64 );
btAssert( (sortBits&0x3) == 0 );
btOpenCLArray<btSortData>* dst = m_workBuffer3;
btOpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
btOpenCLArray<unsigned int>* destHisto = m_workBuffer2;
int nWGs = NUM_WGS;
btConstData cdata;
{
int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256
int nBlocks = (n+blockSize-1)/(blockSize);
cdata.m_n = n;
cdata.m_nWGs = NUM_WGS;
cdata.m_startBit = 0;
cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs;
if( nBlocks < NUM_WGS )
{
cdata.m_nBlocksPerWG = 1;
nWGs = nBlocks;
}
}
int count=0;
for(int ib=0; ib<sortBits; ib+=4)
{
#ifdef DEBUG_RADIXSORT2
keyValuesInOut.copyToHost(test2);
printf("numElem = %d\n",test2.size());
for (int i=0;i<test2.size();i++)
{
if (test2[i].m_key != test2[i].m_value)
{
printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
}
}
#endif //DEBUG_RADIXSORT2
cdata.m_startBit = ib;
if (src->size())
{
btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( srcHisto->getBufferCL() ) };
btLauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel);
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
launcher.setConst( cdata );
int num = NUM_WGS*WG_SIZE;
launcher.launch1D( num, WG_SIZE );
}
#ifdef DEBUG_RADIXSORT
btAlignedObjectArray<unsigned int> testHist;
srcHisto->copyToHost(testHist);
printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
for (int i=0;i<testHist.size();i++)
{
if (testHist[i]!=0)
printf("testHist[%d]=%d\n",i,testHist[i]);
}
#endif //DEBUG_RADIXSORT
//fast prefix scan is not working properly on Mac OSX yet
#ifdef _WIN32
bool fastScan=!m_deviceCPU;//only use fast scan on GPU
#else
bool fastScan=false;
#endif
if (fastScan)
{// prefix scan group histogram
btBufferInfoCL bInfo[] = { btBufferInfoCL( srcHisto->getBufferCL() ) };
btLauncherCL launcher( m_commandQueue, m_prefixScanKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
launcher.setConst( cdata );
launcher.launch1D( 128, 128 );
destHisto = srcHisto;
}else
{
//unsigned int sum; //for debugging
			m_scan->execute(*srcHisto,*destHisto,1920,0);//,&sum); //note: 1920 == NUM_BUCKET*NUM_WGS
}
#ifdef DEBUG_RADIXSORT
destHisto->copyToHost(testHist);
printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
for (int i=0;i<testHist.size();i++)
{
if (testHist[i]!=0)
printf("testHist[%d]=%d\n",i,testHist[i]);
}
for (int i=0;i<testHist.size();i+=NUM_WGS)
{
printf("testHist[%d]=%d\n",i/NUM_WGS,testHist[i]);
}
#endif //DEBUG_RADIXSORT
#define USE_GPU
#ifdef USE_GPU
if (src->size())
{// local sort and distribute
btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( destHisto->getBufferCL(), true ), btBufferInfoCL( dst->getBufferCL() )};
btLauncherCL launcher( m_commandQueue, m_sortAndScatterSortDataKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
launcher.setConst( cdata );
launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
}
#else
{
#define NUM_TABLES 16
//#define SEQUENTIAL
#ifdef SEQUENTIAL
int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
int tables[NUM_TABLES];
int startBit = ib;
destHisto->copyToHost(testHist);
btAlignedObjectArray<btSortData> srcHost;
btAlignedObjectArray<btSortData> dstHost;
dstHost.resize(src->size());
src->copyToHost(srcHost);
for (int i=0;i<NUM_TABLES;i++)
{
tables[i] = testHist[i*NUM_WGS];
}
// distribute
for(int i=0; i<n; i++)
{
int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
counter2[tableIdx] ++;
}
#else
int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
int tables[NUM_TABLES];
btAlignedObjectArray<btSortData> dstHostOK;
dstHostOK.resize(src->size());
destHisto->copyToHost(testHist);
btAlignedObjectArray<btSortData> srcHost;
src->copyToHost(srcHost);
int blockSize = 256;
int nBlocksPerWG = cdata.m_nBlocksPerWG;
int startBit = ib;
{
for (int i=0;i<NUM_TABLES;i++)
{
tables[i] = testHist[i*NUM_WGS];
}
// distribute
for(int i=0; i<n; i++)
{
int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
counter2[tableIdx] ++;
}
}
btAlignedObjectArray<btSortData> dstHost;
dstHost.resize(src->size());
int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
for (int wgIdx=0;wgIdx<NUM_WGS;wgIdx++)
{
int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;
for(int iblock=0; iblock<btMin(cdata.m_nBlocksPerWG, nBlocks); iblock++)
{
for (int lIdx = 0;lIdx < 64;lIdx++)
{
int addr = iblock*blockSize + blockSize*cdata.m_nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;
// MY_HISTOGRAM( localKeys.x )++ is much more expensive than an atomic add, as it requires a read and a write, while atomics can just add on AMD
// Using registers didn't perform well. It seems that using localKeys for addressing requires a lot of ALU ops
// AMD: AtomInc performs better while NV prefers ++
for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)
{
if( addr+j < n )
{
// printf ("addr+j=%d\n", addr+j);
int i = addr+j;
int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
int destIndex = testHist[tableIdx*NUM_WGS+wgIdx] + counter[tableIdx];
btSortData ok = dstHostOK[destIndex];
if (ok.m_key != srcHost[i].m_key)
{
printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key,srcHost[i].m_key );
printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value,srcHost[i].m_value );
}
if (ok.m_value != srcHost[i].m_value)
{
printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value,srcHost[i].m_value );
printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key,srcHost[i].m_key );
}
dstHost[destIndex] = srcHost[i];
counter[tableIdx] ++;
}
}
}
}
}
#endif //SEQUENTIAL
dst->copyFromHost(dstHost);
}
#endif//USE_GPU
#ifdef DEBUG_RADIXSORT
destHisto->copyToHost(testHist);
printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
for (int i=0;i<testHist.size();i++)
{
if (testHist[i]!=0)
printf("testHist[%d]=%d\n",i,testHist[i]);
}
#endif //DEBUG_RADIXSORT
btSwap(src, dst );
btSwap(srcHisto,destHisto);
#ifdef DEBUG_RADIXSORT2
keyValuesInOut.copyToHost(test2);
printf("numElem = %d\n",test2.size());
for (int i=0;i<test2.size();i++)
{
if (test2[i].m_key != test2[i].m_value)
{
printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
}
}
#endif //DEBUG_RADIXSORT2
count++;
}
if (count&1)
{
btAssert(0);//need to copy from workbuffer to keyValuesInOut
}
if (m_workBuffer4->size())
{
m_workBuffer4->resize(originalSize);
keyValuesInOut.copyFromOpenCLArray(*m_workBuffer4);
}
#ifdef DEBUG_RADIXSORT
keyValuesInOut.copyToHost(test2);
printf("numElem = %d\n",test2.size());
for (int i=0;i<test2.size();i++)
{
printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
}
#endif
}
void btRadixSort32CL::execute(btOpenCLArray<unsigned int>& keysInOut, int sortBits /* = 32 */)
{
int originalSize = keysInOut.size();
int workingSize = originalSize;
int dataAlignment = DATA_ALIGNMENT;
btOpenCLArray<unsigned int>* src = 0;
if (workingSize%dataAlignment)
{
workingSize += dataAlignment-(workingSize%dataAlignment);
m_workBuffer4a->copyFromOpenCLArray(keysInOut);
m_workBuffer4a->resize(workingSize);
unsigned int fillValue = 0xffffffff;
m_fill->execute(*m_workBuffer4a,fillValue,workingSize-originalSize,originalSize);
src = m_workBuffer4a;
} else
{
src = &keysInOut;
m_workBuffer4a->resize(0);
}
btAssert( workingSize%DATA_ALIGNMENT == 0 );
int minCap = NUM_BUCKET*NUM_WGS;
int n = workingSize;
m_workBuffer1->resize(minCap);
m_workBuffer3->resize(workingSize);
m_workBuffer3a->resize(workingSize);
// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
btAssert( BITS_PER_PASS == 4 );
btAssert( WG_SIZE == 64 );
btAssert( (sortBits&0x3) == 0 );
btOpenCLArray<unsigned int>* dst = m_workBuffer3a;
btOpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
btOpenCLArray<unsigned int>* destHisto = m_workBuffer2;
int nWGs = NUM_WGS;
btConstData cdata;
{
int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256
int nBlocks = (n+blockSize-1)/(blockSize);
cdata.m_n = n;
cdata.m_nWGs = NUM_WGS;
cdata.m_startBit = 0;
cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs;
if( nBlocks < NUM_WGS )
{
cdata.m_nBlocksPerWG = 1;
nWGs = nBlocks;
}
}
int count=0;
for(int ib=0; ib<sortBits; ib+=4)
{
cdata.m_startBit = ib;
if (src->size())
{
btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( srcHisto->getBufferCL() ) };
btLauncherCL launcher(m_commandQueue, m_streamCountKernel);
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
launcher.setConst( cdata );
int num = NUM_WGS*WG_SIZE;
launcher.launch1D( num, WG_SIZE );
}
//fast prefix scan is not working properly on Mac OSX yet
#ifdef _WIN32
bool fastScan=!m_deviceCPU;
#else
bool fastScan=false;
#endif
if (fastScan)
{// prefix scan group histogram
btBufferInfoCL bInfo[] = { btBufferInfoCL( srcHisto->getBufferCL() ) };
btLauncherCL launcher( m_commandQueue, m_prefixScanKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
launcher.setConst( cdata );
launcher.launch1D( 128, 128 );
destHisto = srcHisto;
}else
{
//unsigned int sum; //for debugging
m_scan->execute(*srcHisto,*destHisto,1920,0);//,&sum);
}
if (src->size())
{// local sort and distribute
btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( destHisto->getBufferCL(), true ), btBufferInfoCL( dst->getBufferCL() )};
btLauncherCL launcher( m_commandQueue, m_sortAndScatterKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
launcher.setConst( cdata );
launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
}
btSwap(src, dst );
btSwap(srcHisto,destHisto);
count++;
}
if (count&1)
{
btAssert(0);//need to copy from workbuffer to keyValuesInOut
}
if (m_workBuffer4a->size())
{
m_workBuffer4a->resize(originalSize);
keysInOut.copyFromOpenCLArray(*m_workBuffer4a);
}
}

View File

@@ -0,0 +1,85 @@
#ifndef BT_RADIXSORT32_H
#define BT_RADIXSORT32_H
#include "btOpenCLArray.h"
struct btSortData
{
int m_key;
int m_value;
};
#include "btBufferInfoCL.h"
class btRadixSort32CL
{
btOpenCLArray<unsigned int>* m_workBuffer1;
btOpenCLArray<unsigned int>* m_workBuffer2;
btOpenCLArray<btSortData>* m_workBuffer3;
btOpenCLArray<btSortData>* m_workBuffer4;
btOpenCLArray<unsigned int>* m_workBuffer3a;
btOpenCLArray<unsigned int>* m_workBuffer4a;
cl_command_queue m_commandQueue;
cl_kernel m_streamCountSortDataKernel;
cl_kernel m_streamCountKernel;
cl_kernel m_prefixScanKernel;
cl_kernel m_sortAndScatterSortDataKernel;
cl_kernel m_sortAndScatterKernel;
bool m_deviceCPU;
class btPrefixScanCL* m_scan;
class btFillCL* m_fill;
public:
struct btConstData
{
int m_n;
int m_nWGs;
int m_startBit;
int m_nBlocksPerWG;
};
enum
{
DATA_ALIGNMENT = 256,
WG_SIZE = 64,
BLOCK_SIZE = 256,
ELEMENTS_PER_WORK_ITEM = (BLOCK_SIZE/WG_SIZE),
BITS_PER_PASS = 4,
NUM_BUCKET=(1<<BITS_PER_PASS),
// if you change this, change nPerWI in kernel as well
NUM_WGS = 20*6, // cypress
// NUM_WGS = 24*6, // cayman
// NUM_WGS = 32*4, // nv
};
private:
public:
btRadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity =0);
virtual ~btRadixSort32CL();
void execute(btOpenCLArray<unsigned int>& keysIn, btOpenCLArray<unsigned int>& keysOut, btOpenCLArray<unsigned int>& valuesIn,
btOpenCLArray<unsigned int>& valuesOut, int n, int sortBits = 32);
///keys only
void execute(btOpenCLArray<unsigned int>& keysInOut, int sortBits = 32 );
void execute(btOpenCLArray<btSortData>& keyValuesInOut, int sortBits = 32 );
void executeHost(btOpenCLArray<btSortData>& keyValuesInOut, int sortBits = 32);
void executeHost(btAlignedObjectArray<btSortData>& keyValuesInOut, int sortBits = 32);
};
#endif //BT_RADIXSORT32_H
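Editorial note: a hedged usage sketch for the sorter declared above, assuming ctx, device and queue come from the usual OpenCL initialization:

btRadixSort32CL sorter(ctx, device, queue);
btAlignedObjectArray<btSortData> pairs;
btSortData d;
d.m_key = 42; d.m_value = 0; pairs.push_back(d);
d.m_key = 7;  d.m_value = 1; pairs.push_back(d);
sorter.executeHost(pairs);           //host reference path: pairs are now ordered by m_key (7, then 42)
btOpenCLArray<btSortData> gpuPairs(ctx, queue);
gpuPairs.copyFromHost(pairs);
sorter.execute(gpuPairs);            //device path: full 32-bit key/value sort in place
gpuPairs.copyToHost(pairs);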

View File

@@ -0,0 +1,660 @@
/*
Copyright (c) 2003-2009 Erwin Coumans http://bullet.googlecode.com
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef BT_SCALAR_H
#define BT_SCALAR_H
#ifdef BT_MANAGED_CODE
//Aligned data types not supported in managed code
#pragma unmanaged
#endif
#include <math.h>
#include <stdlib.h>//size_t for MSVC 6.0
#include <float.h>
/* SVN $Revision$ on $Date$ from http://bullet.googlecode.com*/
#define BT_BULLET_VERSION 281
inline int btGetVersion()
{
return BT_BULLET_VERSION;
}
#if defined(DEBUG) || defined (_DEBUG)
#define BT_DEBUG
#endif
#ifdef _WIN32
#if defined(__MINGW32__) || defined(__CYGWIN__) || (defined (_MSC_VER) && _MSC_VER < 1300)
#define SIMD_FORCE_INLINE inline
#define ATTRIBUTE_ALIGNED16(a) a
#define ATTRIBUTE_ALIGNED64(a) a
#define ATTRIBUTE_ALIGNED128(a) a
#else
//#define BT_HAS_ALIGNED_ALLOCATOR
#pragma warning(disable : 4324) // disable padding warning
// #pragma warning(disable:4530) // Disable the exception disable but used in MSCV Stl warning.
// #pragma warning(disable:4996) //Turn off warnings about deprecated C routines
// #pragma warning(disable:4786) // Disable the "debug name too long" warning
#define SIMD_FORCE_INLINE __forceinline
#define ATTRIBUTE_ALIGNED16(a) __declspec(align(16)) a
#define ATTRIBUTE_ALIGNED64(a) __declspec(align(64)) a
#define ATTRIBUTE_ALIGNED128(a) __declspec (align(128)) a
#ifdef _XBOX
#define BT_USE_VMX128
#include <ppcintrinsics.h>
#define BT_HAVE_NATIVE_FSEL
#define btFsel(a,b,c) __fsel((a),(b),(c))
#else
#if (defined (_WIN32) && (_MSC_VER) && _MSC_VER >= 1400) && (!defined (BT_USE_DOUBLE_PRECISION))
#define BT_USE_SSE
#ifdef BT_USE_SSE
//BT_USE_SSE_IN_API is disabled under Windows by default, because
//it makes it harder to integrate Bullet into your application under Windows
//(structures that embed Bullet structs/classes need to be 16-byte aligned)
//with relatively little performance gain
//If you are not embedding Bullet data in your classes, or if you make sure that you align those classes on 16-byte boundaries
//you can manually enable this line or set it in the build system for a bit of performance gain (a few percent, dependent on usage)
//#define BT_USE_SSE_IN_API
#endif //BT_USE_SSE
#include <emmintrin.h>
#endif
#endif//_XBOX
#endif //__MINGW32__
#ifdef BT_DEBUG
#ifdef _MSC_VER
#include <stdio.h>
#define btAssert(x) { if(!(x)){printf("Assert "__FILE__ ":%u ("#x")\n", __LINE__);__debugbreak(); }}
#else//_MSC_VER
#include <assert.h>
#define btAssert assert
#endif//_MSC_VER
#else
#define btAssert(x)
#endif
//btFullAssert is optional, slows down a lot
#define btFullAssert(x)
#define btLikely(_c) _c
#define btUnlikely(_c) _c
#else
#if defined (__CELLOS_LV2__)
#define SIMD_FORCE_INLINE inline __attribute__((always_inline))
#define ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16)))
#define ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64)))
#define ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128)))
#ifndef assert
#include <assert.h>
#endif
#ifdef BT_DEBUG
#ifdef __SPU__
#include <spu_printf.h>
#define printf spu_printf
#define btAssert(x) {if(!(x)){printf("Assert "__FILE__ ":%u ("#x")\n", __LINE__);spu_hcmpeq(0,0);}}
#else
#define btAssert assert
#endif
#else
#define btAssert(x)
#endif
//btFullAssert is optional, slows down a lot
#define btFullAssert(x)
#define btLikely(_c) _c
#define btUnlikely(_c) _c
#else
#ifdef USE_LIBSPE2
#define SIMD_FORCE_INLINE __inline
#define ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16)))
#define ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64)))
#define ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128)))
#ifndef assert
#include <assert.h>
#endif
#ifdef BT_DEBUG
#define btAssert assert
#else
#define btAssert(x)
#endif
//btFullAssert is optional, slows down a lot
#define btFullAssert(x)
#define btLikely(_c) __builtin_expect((_c), 1)
#define btUnlikely(_c) __builtin_expect((_c), 0)
#else
//non-windows systems
#if (defined (__APPLE__) && (!defined (BT_USE_DOUBLE_PRECISION)))
#if defined (__i386__) || defined (__x86_64__)
#define BT_USE_SSE
//BT_USE_SSE_IN_API is enabled on Mac OSX by default, because memory is automatically aligned on 16-byte boundaries
//if apps run into issues, we will disable the next line
#define BT_USE_SSE_IN_API
#ifdef BT_USE_SSE
// include appropriate SSE level
#if defined (__SSE4_1__)
#include <smmintrin.h>
#elif defined (__SSSE3__)
#include <tmmintrin.h>
#elif defined (__SSE3__)
#include <pmmintrin.h>
#else
#include <emmintrin.h>
#endif
#endif //BT_USE_SSE
#elif defined( __armv7__ )
#ifdef __clang__
#define BT_USE_NEON 1
#if defined BT_USE_NEON && defined (__clang__)
#include <arm_neon.h>
#endif//BT_USE_NEON
#endif //__clang__
#endif//__arm__
#define SIMD_FORCE_INLINE inline __attribute__ ((always_inline))
///@todo: check out alignment methods for other platforms/compilers
#define ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16)))
#define ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64)))
#define ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128)))
#ifndef assert
#include <assert.h>
#endif
#if defined(DEBUG) || defined (_DEBUG)
#if defined (__i386__) || defined (__x86_64__)
#include <stdio.h>
#define btAssert(x)\
{\
if(!(x))\
{\
printf("Assert %s in line %d, file %s\n",#x, __LINE__, __FILE__);\
asm volatile ("int3");\
}\
}
#else//defined (__i386__) || defined (__x86_64__)
#define btAssert assert
#endif//defined (__i386__) || defined (__x86_64__)
#else//defined(DEBUG) || defined (_DEBUG)
#define btAssert(x)
#endif//defined(DEBUG) || defined (_DEBUG)
//btFullAssert is optional, slows down a lot
#define btFullAssert(x)
#define btLikely(_c) _c
#define btUnlikely(_c) _c
#else
#define SIMD_FORCE_INLINE inline
///@todo: check out alignment methods for other platforms/compilers
///#define ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16)))
///#define ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64)))
///#define ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128)))
#define ATTRIBUTE_ALIGNED16(a) a
#define ATTRIBUTE_ALIGNED64(a) a
#define ATTRIBUTE_ALIGNED128(a) a
#ifndef assert
#include <assert.h>
#endif
#if defined(DEBUG) || defined (_DEBUG)
#define btAssert assert
#else
#define btAssert(x)
#endif
//btFullAssert is optional, slows down a lot
#define btFullAssert(x)
#define btLikely(_c) _c
#define btUnlikely(_c) _c
#endif //__APPLE__
#endif // LIBSPE2
#endif //__CELLOS_LV2__
#endif
///The btScalar type abstracts floating point numbers, to easily switch between double and single floating point precision.
#if defined(BT_USE_DOUBLE_PRECISION)
typedef double btScalar;
//this number could be bigger in double precision
#define BT_LARGE_FLOAT 1e30
#else
typedef float btScalar;
//keep BT_LARGE_FLOAT*BT_LARGE_FLOAT < FLT_MAX
#define BT_LARGE_FLOAT 1e18f
#endif
#ifdef BT_USE_SSE
typedef __m128 btSimdFloat4;
#endif//BT_USE_SSE
#if defined BT_USE_SSE_IN_API && defined (BT_USE_SSE)
#ifdef _WIN32
#ifndef BT_NAN
static int btNanMask = 0x7F800001;
#define BT_NAN (*(float*)&btNanMask)
#endif
#ifndef BT_INFINITY
static int btInfinityMask = 0x7F800000;
#define BT_INFINITY (*(float*)&btInfinityMask)
#endif
inline __m128 operator + (const __m128 A, const __m128 B)
{
return _mm_add_ps(A, B);
}
inline __m128 operator - (const __m128 A, const __m128 B)
{
return _mm_sub_ps(A, B);
}
inline __m128 operator * (const __m128 A, const __m128 B)
{
return _mm_mul_ps(A, B);
}
#define btCastfTo128i(a) (_mm_castps_si128(a))
#define btCastfTo128d(a) (_mm_castps_pd(a))
#define btCastiTo128f(a) (_mm_castsi128_ps(a))
#define btCastdTo128f(a) (_mm_castpd_ps(a))
#define btCastdTo128i(a) (_mm_castpd_si128(a))
#define btAssign128(r0,r1,r2,r3) _mm_setr_ps(r0,r1,r2,r3)
#else//_WIN32
#define btCastfTo128i(a) ((__m128i)(a))
#define btCastfTo128d(a) ((__m128d)(a))
#define btCastiTo128f(a) ((__m128) (a))
#define btCastdTo128f(a) ((__m128) (a))
#define btCastdTo128i(a) ((__m128i)(a))
#define btAssign128(r0,r1,r2,r3) (__m128){r0,r1,r2,r3}
#define BT_INFINITY INFINITY
#define BT_NAN NAN
#endif//_WIN32
#endif //BT_USE_SSE_IN_API
#ifdef BT_USE_NEON
#include <arm_neon.h>
typedef float32x4_t btSimdFloat4;
#define BT_INFINITY INFINITY
#define BT_NAN NAN
#define btAssign128(r0,r1,r2,r3) (float32x4_t){r0,r1,r2,r3}
#endif
#define BT_DECLARE_ALIGNED_ALLOCATOR() \
SIMD_FORCE_INLINE void* operator new(size_t sizeInBytes) { return btAlignedAlloc(sizeInBytes,16); } \
SIMD_FORCE_INLINE void operator delete(void* ptr) { btAlignedFree(ptr); } \
SIMD_FORCE_INLINE void* operator new(size_t, void* ptr) { return ptr; } \
SIMD_FORCE_INLINE void operator delete(void*, void*) { } \
SIMD_FORCE_INLINE void* operator new[](size_t sizeInBytes) { return btAlignedAlloc(sizeInBytes,16); } \
SIMD_FORCE_INLINE void operator delete[](void* ptr) { btAlignedFree(ptr); } \
SIMD_FORCE_INLINE void* operator new[](size_t, void* ptr) { return ptr; } \
	SIMD_FORCE_INLINE void operator delete[](void*, void*) { }
#if defined(BT_USE_DOUBLE_PRECISION) || defined(BT_FORCE_DOUBLE_FUNCTIONS)
SIMD_FORCE_INLINE btScalar btSqrt(btScalar x) { return sqrt(x); }
SIMD_FORCE_INLINE btScalar btFabs(btScalar x) { return fabs(x); }
SIMD_FORCE_INLINE btScalar btCos(btScalar x) { return cos(x); }
SIMD_FORCE_INLINE btScalar btSin(btScalar x) { return sin(x); }
SIMD_FORCE_INLINE btScalar btTan(btScalar x) { return tan(x); }
SIMD_FORCE_INLINE btScalar btAcos(btScalar x) { if (x<btScalar(-1)) x=btScalar(-1); if (x>btScalar(1)) x=btScalar(1); return acos(x); }
SIMD_FORCE_INLINE btScalar btAsin(btScalar x) { if (x<btScalar(-1)) x=btScalar(-1); if (x>btScalar(1)) x=btScalar(1); return asin(x); }
SIMD_FORCE_INLINE btScalar btAtan(btScalar x) { return atan(x); }
SIMD_FORCE_INLINE btScalar btAtan2(btScalar x, btScalar y) { return atan2(x, y); }
SIMD_FORCE_INLINE btScalar btExp(btScalar x) { return exp(x); }
SIMD_FORCE_INLINE btScalar btLog(btScalar x) { return log(x); }
SIMD_FORCE_INLINE btScalar btPow(btScalar x,btScalar y) { return pow(x,y); }
SIMD_FORCE_INLINE btScalar btFmod(btScalar x,btScalar y) { return fmod(x,y); }
#else
SIMD_FORCE_INLINE btScalar btSqrt(btScalar y)
{
#ifdef USE_APPROXIMATION
double x, z, tempf;
unsigned long *tfptr = ((unsigned long *)&tempf) + 1;
tempf = y;
*tfptr = (0xbfcdd90a - *tfptr)>>1; /* estimate of 1/sqrt(y) */
x = tempf;
z = y*btScalar(0.5);
x = (btScalar(1.5)*x)-(x*x)*(x*z); /* iteration formula */
x = (btScalar(1.5)*x)-(x*x)*(x*z);
x = (btScalar(1.5)*x)-(x*x)*(x*z);
x = (btScalar(1.5)*x)-(x*x)*(x*z);
x = (btScalar(1.5)*x)-(x*x)*(x*z);
return x*y;
#else
return sqrtf(y);
#endif
}
SIMD_FORCE_INLINE btScalar btFabs(btScalar x) { return fabsf(x); }
SIMD_FORCE_INLINE btScalar btCos(btScalar x) { return cosf(x); }
SIMD_FORCE_INLINE btScalar btSin(btScalar x) { return sinf(x); }
SIMD_FORCE_INLINE btScalar btTan(btScalar x) { return tanf(x); }
SIMD_FORCE_INLINE btScalar btAcos(btScalar x) {
if (x<btScalar(-1))
x=btScalar(-1);
if (x>btScalar(1))
x=btScalar(1);
return acosf(x);
}
SIMD_FORCE_INLINE btScalar btAsin(btScalar x) {
if (x<btScalar(-1))
x=btScalar(-1);
if (x>btScalar(1))
x=btScalar(1);
return asinf(x);
}
SIMD_FORCE_INLINE btScalar btAtan(btScalar x) { return atanf(x); }
SIMD_FORCE_INLINE btScalar btAtan2(btScalar x, btScalar y) { return atan2f(x, y); }
SIMD_FORCE_INLINE btScalar btExp(btScalar x) { return expf(x); }
SIMD_FORCE_INLINE btScalar btLog(btScalar x) { return logf(x); }
SIMD_FORCE_INLINE btScalar btPow(btScalar x,btScalar y) { return powf(x,y); }
SIMD_FORCE_INLINE btScalar btFmod(btScalar x,btScalar y) { return fmodf(x,y); }
#endif
#define SIMD_2_PI btScalar(6.283185307179586232)
#define SIMD_PI (SIMD_2_PI * btScalar(0.5))
#define SIMD_HALF_PI (SIMD_2_PI * btScalar(0.25))
#define SIMD_RADS_PER_DEG (SIMD_2_PI / btScalar(360.0))
#define SIMD_DEGS_PER_RAD (btScalar(360.0) / SIMD_2_PI)
#define SIMDSQRT12 btScalar(0.7071067811865475244008443621048490)
#define btRecipSqrt(x) ((btScalar)(btScalar(1.0)/btSqrt(btScalar(x)))) /* reciprocal square root */
#ifdef BT_USE_DOUBLE_PRECISION
#define SIMD_EPSILON DBL_EPSILON
#define SIMD_INFINITY DBL_MAX
#else
#define SIMD_EPSILON FLT_EPSILON
#define SIMD_INFINITY FLT_MAX
#endif
SIMD_FORCE_INLINE btScalar btAtan2Fast(btScalar y, btScalar x)
{
btScalar coeff_1 = SIMD_PI / 4.0f;
btScalar coeff_2 = 3.0f * coeff_1;
btScalar abs_y = btFabs(y);
btScalar angle;
if (x >= 0.0f) {
btScalar r = (x - abs_y) / (x + abs_y);
angle = coeff_1 - coeff_1 * r;
} else {
btScalar r = (x + abs_y) / (abs_y - x);
angle = coeff_2 - coeff_1 * r;
}
return (y < 0.0f) ? -angle : angle;
}
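//Editorial note: btAtan2Fast is a fast approximation of atan2; for example
//btAtan2Fast(1.0f, 1.0f) returns SIMD_PI/4 (about 0.785). Prefer btAtan2 when full precision matters.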
SIMD_FORCE_INLINE bool btFuzzyZero(btScalar x) { return btFabs(x) < SIMD_EPSILON; }
SIMD_FORCE_INLINE bool btEqual(btScalar a, btScalar eps) {
return (((a) <= eps) && !((a) < -eps));
}
SIMD_FORCE_INLINE bool btGreaterEqual (btScalar a, btScalar eps) {
return (!((a) <= eps));
}
SIMD_FORCE_INLINE int btIsNegative(btScalar x) {
return x < btScalar(0.0) ? 1 : 0;
}
SIMD_FORCE_INLINE btScalar btRadians(btScalar x) { return x * SIMD_RADS_PER_DEG; }
SIMD_FORCE_INLINE btScalar btDegrees(btScalar x) { return x * SIMD_DEGS_PER_RAD; }
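///Usage sketch for the angle conversions (names below are illustrative; equality is approximate due to floating point).
#if 0
void btAngleConversionExample()
{
	btScalar rightAngle = btRadians(btScalar(90.0));   // ~= SIMD_HALF_PI
	btScalar halfTurn   = btDegrees(SIMD_PI);          // ~= btScalar(180.0)
	(void)rightAngle; (void)halfTurn;
}
#endif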
#define BT_DECLARE_HANDLE(name) typedef struct name##__ { int unused; } *name
#ifndef btFsel
SIMD_FORCE_INLINE btScalar btFsel(btScalar a, btScalar b, btScalar c)
{
return a >= 0 ? b : c;
}
#endif
#define btFsels(a,b,c) (btScalar)btFsel(a,b,c)
SIMD_FORCE_INLINE bool btMachineIsLittleEndian()
{
long int i = 1;
const char *p = (const char *) &i;
if (p[0] == 1) // Lowest address contains the least significant byte
return true;
else
return false;
}
///btSelect avoids branches, which makes performance much better on consoles like the PlayStation 3 and Xbox 360.
///Thanks Phil Knight. See also http://www.cellperformance.com/articles/2006/04/more_techniques_for_eliminatin_1.html
SIMD_FORCE_INLINE unsigned btSelect(unsigned condition, unsigned valueIfConditionNonZero, unsigned valueIfConditionZero)
{
// Set testNz to 0xFFFFFFFF if condition is nonzero, 0x00000000 if condition is zero
// Rely on positive value or'ed with its negative having sign bit on
// and zero value or'ed with its negative (which is still zero) having sign bit off
// Use arithmetic shift right, shifting the sign bit through all 32 bits
unsigned testNz = (unsigned)(((int)condition | -(int)condition) >> 31);
unsigned testEqz = ~testNz;
return ((valueIfConditionNonZero & testNz) | (valueIfConditionZero & testEqz));
}
SIMD_FORCE_INLINE int btSelect(unsigned condition, int valueIfConditionNonZero, int valueIfConditionZero)
{
unsigned testNz = (unsigned)(((int)condition | -(int)condition) >> 31);
unsigned testEqz = ~testNz;
return static_cast<int>((valueIfConditionNonZero & testNz) | (valueIfConditionZero & testEqz));
}
SIMD_FORCE_INLINE float btSelect(unsigned condition, float valueIfConditionNonZero, float valueIfConditionZero)
{
#ifdef BT_HAVE_NATIVE_FSEL
return (float)btFsel((btScalar)condition - btScalar(1.0f), valueIfConditionNonZero, valueIfConditionZero);
#else
return (condition != 0) ? valueIfConditionNonZero : valueIfConditionZero;
#endif
}
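///Usage sketch for btSelect (the helper name is illustrative): a branch-free minimum. The comparison yields 0 or 1,
///which btSelect expands into a full 32-bit mask, so the selection itself needs no conditional jump.
#if 0
SIMD_FORCE_INLINE unsigned btBranchlessMin(unsigned a, unsigned b)
{
	// a < b (nonzero condition) selects a, otherwise b
	return btSelect((unsigned)(a < b), a, b);
}
#endif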
template<typename T> SIMD_FORCE_INLINE void btSwap(T& a, T& b)
{
T tmp = a;
a = b;
b = tmp;
}
//PCK: endian swapping functions
SIMD_FORCE_INLINE unsigned btSwapEndian(unsigned val)
{
return (((val & 0xff000000) >> 24) | ((val & 0x00ff0000) >> 8) | ((val & 0x0000ff00) << 8) | ((val & 0x000000ff) << 24));
}
SIMD_FORCE_INLINE unsigned short btSwapEndian(unsigned short val)
{
return static_cast<unsigned short>(((val & 0xff00) >> 8) | ((val & 0x00ff) << 8));
}
SIMD_FORCE_INLINE unsigned btSwapEndian(int val)
{
return btSwapEndian((unsigned)val);
}
SIMD_FORCE_INLINE unsigned short btSwapEndian(short val)
{
return btSwapEndian((unsigned short) val);
}
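///Usage sketch for the integer byte swaps (the constants are illustrative): applying the swap twice restores the original value.
#if 0
void btSwapEndianExample()
{
	unsigned swapped = btSwapEndian(0x12345678u);               // == 0x78563412
	unsigned short s = btSwapEndian((unsigned short)0xABCDu);   // == 0xCDAB
	(void)swapped; (void)s;
}
#endif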
///btSwapEndianFloat swaps the endianness of a float using char pointers.
///btSwapEndianFloat/btSwapEndianDouble will NOT return a float or double, because the machine might 'correct' invalid floating point values.
///Not every combination of sign/exponent/mantissa bits is a valid floating point number according to IEEE 754.
///When a floating point unit is handed such an invalid value, it may silently change it or, worse, raise an exception.
///On most systems, user mode code never sees the exception; instead the hardware/OS/runtime quietly 'fixes' the number.
///So instead of returning a float/double, btSwapEndianFloat returns an unsigned integer and btSwapEndianDouble writes the raw bytes into a caller-provided buffer.
SIMD_FORCE_INLINE unsigned int btSwapEndianFloat(float d)
{
unsigned int a = 0;
unsigned char *dst = (unsigned char *)&a;
unsigned char *src = (unsigned char *)&d;
dst[0] = src[3];
dst[1] = src[2];
dst[2] = src[1];
dst[3] = src[0];
return a;
}
// unswap using char pointers
SIMD_FORCE_INLINE float btUnswapEndianFloat(unsigned int a)
{
float d = 0.0f;
unsigned char *src = (unsigned char *)&a;
unsigned char *dst = (unsigned char *)&d;
dst[0] = src[3];
dst[1] = src[2];
dst[2] = src[1];
dst[3] = src[0];
return d;
}
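///Usage sketch for the float byte swaps (the value 1.5f is illustrative): the byte-reversed bits travel as an
///unsigned int, never as a float, so the FPU never touches a potentially invalid bit pattern; reversing twice
///restores the original value.
#if 0
void btFloatEndianExample()
{
	float original = 1.5f;
	unsigned int wireBits = btSwapEndianFloat(original);   // bytes reversed, held as an integer
	float restored = btUnswapEndianFloat(wireBits);        // reversed again: restored == original
	(void)restored;
}
#endif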
// swap using char pointers
SIMD_FORCE_INLINE void btSwapEndianDouble(double d, unsigned char* dst)
{
unsigned char *src = (unsigned char *)&d;
dst[0] = src[7];
dst[1] = src[6];
dst[2] = src[5];
dst[3] = src[4];
dst[4] = src[3];
dst[5] = src[2];
dst[6] = src[1];
dst[7] = src[0];
}
// unswap using char pointers
SIMD_FORCE_INLINE double btUnswapEndianDouble(const unsigned char *src)
{
double d = 0.0;
unsigned char *dst = (unsigned char *)&d;
dst[0] = src[7];
dst[1] = src[6];
dst[2] = src[5];
dst[3] = src[4];
dst[4] = src[3];
dst[5] = src[2];
dst[6] = src[1];
dst[7] = src[0];
return d;
}
// returns normalized value in range [-SIMD_PI, SIMD_PI]
SIMD_FORCE_INLINE btScalar btNormalizeAngle(btScalar angleInRadians)
{
angleInRadians = btFmod(angleInRadians, SIMD_2_PI);
if(angleInRadians < -SIMD_PI)
{
return angleInRadians + SIMD_2_PI;
}
else if(angleInRadians > SIMD_PI)
{
return angleInRadians - SIMD_2_PI;
}
else
{
return angleInRadians;
}
}
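///Usage sketch for btNormalizeAngle (inputs are illustrative; equality is approximate due to floating point).
#if 0
void btNormalizeAngleExample()
{
	// 2*pi + pi/2 wraps down to pi/2.
	btScalar a = btNormalizeAngle(SIMD_2_PI + SIMD_HALF_PI);    // ~= SIMD_HALF_PI
	// -1.75*pi wraps up to +0.25*pi.
	btScalar b = btNormalizeAngle(btScalar(-1.75) * SIMD_PI);   // ~= btScalar(0.25) * SIMD_PI
	(void)a; (void)b;
}
#endif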
///rudimentary class to provide run-time type info via an integer tag
struct btTypedObject
{
btTypedObject(int objectType)
:m_objectType(objectType)
{
}
int m_objectType;
inline int getObjectType() const
{
return m_objectType;
}
};
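///Usage sketch for btTypedObject (the type code and struct name are hypothetical): a derived object stores an
///application-defined tag so it can be identified through a btTypedObject pointer before being cast back.
#if 0
enum { MY_DEMO_OBJECT_TYPE = 1 };  // hypothetical application-defined code

struct MyDemoObject : public btTypedObject
{
	MyDemoObject() : btTypedObject(MY_DEMO_OBJECT_TYPE) {}
};
#endif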
///align a pointer to the provided alignment, upwards (the alignment must be a power of two)
template <typename T>T* btAlignPointer(T* unalignedPtr, size_t alignment)
{
struct btConvertPointerSizeT
{
union
{
T* ptr;
size_t integer;
};
};
btConvertPointerSizeT converter;
const size_t bit_mask = ~(alignment - 1);
converter.ptr = unalignedPtr;
converter.integer += alignment-1;
converter.integer &= bit_mask;
return converter.ptr;
}
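///Usage sketch for btAlignPointer (buffer size and alignment are illustrative): carve a 16-byte-aligned region out
///of an over-allocated buffer, and always free the original, unaligned pointer.
#if 0
#include <cstdlib>
void btAlignPointerExample()
{
	const size_t alignment = 16;
	// Over-allocate so there is room to round the pointer up.
	unsigned char* raw = (unsigned char*)malloc(128 + alignment - 1);
	unsigned char* aligned = btAlignPointer(raw, alignment);
	// ((size_t)aligned % alignment) == 0 and aligned >= raw at this point.
	free(raw);
}
#endif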
#endif //BT_SCALAR_H

View File

@@ -0,0 +1,26 @@
function createProject(vendor)
hasCL = findOpenCL(vendor)
if (hasCL) then
project ("OpenCL_lib_parallel_primitives_host_" .. vendor)
initOpenCL(vendor)
kind "StaticLib"
targetdir "../../../lib"
includedirs {
".",
}
files {
"**.cpp",
"**.h"
}
end
end
createProject("AMD")
createProject("Intel")
createProject("NVIDIA")
createProject("Apple")