Updated CDTestFramework with the OPCODE Array SAP test.

Thanks Pierre Terdiman for the latest update.
This commit is contained in:
erwin.coumans
2008-09-01 18:46:57 +00:00
parent f655eff89f
commit 932de57d4c
41 changed files with 6385 additions and 410 deletions

View File

@@ -1,32 +1,16 @@
/*
* ICE / OPCODE - Optimized Collision Detection
* http://www.codercorner.com/Opcode.htm
*
* Copyright (c) 2001-2008 Pierre Terdiman, pierre@codercorner.com
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/**
* Contains misc. useful macros & defines.
* \file IceUtils.h
* \author Pierre Terdiman (collected from various sources)
* \author Pierre Terdiman (personal code + collected from various sources)
* \date April, 4, 2000
*/
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Include Guard
#ifndef __ICEUTILS_H__
#define __ICEUTILS_H__
#ifndef ICEUTILS_H
#define ICEUTILS_H
// Run-once helpers. Code written between START_RUNONCE and END_RUNONCE is guarded by a
// block-scope static flag and is skipped once that flag has been set.
// START_RUNONCE opens two scopes that only END_RUNONCE closes, so the macros must be used as a pair.
// The flag is set by END_RUNONCE, i.e. only after the guarded code has run to its end.
#define START_RUNONCE { static bool __RunOnce__ = false; if(!__RunOnce__){
#define END_RUNONCE __RunOnce__ = true;}}
@@ -40,7 +24,7 @@ subject to the following restrictions:
n = ((n >> 4) & 0x0f0f0f0f) | ((n << 4) & 0xf0f0f0f0);
n = ((n >> 8) & 0x00ff00ff) | ((n << 8) & 0xff00ff00);
n = ((n >> 16) & 0x0000ffff) | ((n << 16) & 0xffff0000);
// Etc for larger intergers (64 bits in Java)
// Etc for larger integers (64 bits in Java)
// NOTE: the >> operation must be unsigned! (>>> in java)
}
@@ -48,15 +32,15 @@ subject to the following restrictions:
inline_ udword CountBits(udword n)
{
	// The number of set bits in a k-bit field always fits in k bits, so adjacent fields can be
	// summed in place without spilling into each other: bit pairs first, then nibbles, bytes
	// and finally the two 16-bit halves.
	// NOTE: the right shifts must be unsigned (logical) shifts (>>> in Java).
	const udword Pairs		= ((n >> 1) & 0x55555555) + (n & 0x55555555);
	const udword Nibbles	= ((Pairs >> 2) & 0x33333333) + (Pairs & 0x33333333);
	const udword Bytes		= ((Nibbles >> 4) & 0x0f0f0f0f) + (Nibbles & 0x0f0f0f0f);
	const udword Halves		= ((Bytes >> 8) & 0x00ff00ff) + (Bytes & 0x00ff00ff);
	const udword Total		= ((Halves >> 16) & 0x0000ffff) + (Halves & 0x0000ffff);
	// The same pattern extends to wider integers with one more step per doubling of the width.
	return Total;
}
@@ -70,9 +54,44 @@ subject to the following restrictions:
return (bits * 0x01010101) >> 24;
}
// "Population Count (Ones Count)
// The population count of a binary integer value x is the number of one bits in the value. Although many machines have
// single instructions for this, the single instructions are usually microcoded loops that test a bit per cycle; a log-time
// algorithm coded in C is often faster. The following code uses a variable-precision SWAR algorithm to perform a tree
// reduction adding the bits in a 32-bit value:"
inline_ udword ones32(udword x)
{
	// SWAR tree reduction: the word is treated as packed fields whose widths double at each step.
	// The first step maps every 2-bit field to the number of ones it contains by subtracting
	// the field's high bit from the field itself.
	const udword Pairs		= x - ((x >> 1) & 0x55555555);
	const udword Nibbles	= (Pairs & 0x33333333) + ((Pairs >> 2) & 0x33333333);
	const udword Bytes		= (Nibbles + (Nibbles >> 4)) & 0x0f0f0f0f;
	// From here on the per-byte counts are small enough that no masking is needed between
	// steps; only the low 6 bits of the final sum are meaningful.
	const udword Halves		= Bytes + (Bytes >> 8);
	const udword Total		= Halves + (Halves >> 16);
	return Total & 0x0000003f;

	// Notes carried over from the quoted source:
	// - for multi-word bit sets, the last reduction steps only use part of the word and could
	//   be shared across several words;
	// - the last three steps can be replaced with "return (x * 0x01010101) >> 24;" (suggested by
	//   the AMD Athlon optimization guide), which the quoted author measured as 6% slower with GCC
	//   on a 1.2GHz Athlon because GCC emitted a shift/add sequence instead of a multiply.
}
// "Trailing Zero Count
// Given the Least Significant 1 Bit and Population Count (Ones Count) algorithms, it is trivial to combine them to
// construct a trailing zero count (as pointed-out by Joe Bowbeer):"
inline_ udword tzc(sdword x)
{
	// Trailing zero count of x, built from "lowest set bit" and "population count".
	// The computation is done on the unsigned bit pattern: negating a signed value is undefined
	// for the most negative input, whereas unsigned negation simply wraps.
	const udword Bits		= (udword)x;
	const udword LowestSet	= Bits & (0u - Bits);	// isolates the least significant 1 bit (0 if x==0)
	// Subtracting one turns that single bit into a run of ones covering exactly the trailing
	// zeros. For x==0 the subtraction wraps to all ones, so the full word width is returned.
	return ones32(LowestSet - 1u);
}
//! Spread out bits. EG 00001111 -> 0101010101
//! 00001010 -> 0100010000
//! This is used to interleve to intergers to produce a `Morten Key'
//! This is used to interleave two integers to produce a `Morton Key'
//! used in Space Filling Curves (See DrDobbs Journal, July 1999)
//! Order is important.
inline_ void SpreadBits(udword& n)
@@ -84,12 +103,12 @@ subject to the following restrictions:
n = ( n & 0x11111111) | (( n & 0x22222222) << 1);
}
// Next Largest Power of 2
// "Next Largest Power of 2
// Given a binary integer value x, the next largest power of 2 can be computed by a SWAR algorithm
// that recursively "folds" the upper bits into the lower bits. This process yields a bit vector with
// the same most significant 1 as x, but all 1's below it. Adding 1 to that value yields the next
// largest power of 2. For a 32-bit value:
inline_ udword nlpo2(udword x)
// largest power of 2. For a 32-bit value:"
inline_ udword NextPowerOfTwo(udword x)
{
x |= (x >> 1);
x |= (x >> 2);
@@ -131,24 +150,45 @@ subject to the following restrictions:
inline_ char LittleEndian()
{
	// Store 1 in an int and read back the byte at its lowest address: that byte is 1 when the
	// least significant byte comes first in memory.
	int Probe = 1;
	char* FirstByte = (char*)&Probe;
	return FirstByte[0];
}
//!< Alternative abs function
inline_ udword abs_(sdword x)
{
	// Branchless absolute value, computed on the unsigned bit pattern so that no step can
	// overflow a signed integer and no negative value is right-shifted.
	// Like the original, this assumes 32-bit operands (sign bit at position 31).
	const udword Bits = (udword)x;
	const udword Mask = 0u - (Bits >> 31);	// all ones when x is negative, zero otherwise
	// (Bits ^ Mask) - Mask is "invert and add one" for negative inputs and a no-op otherwise.
	// The most negative input yields 0x80000000, which is its magnitude as an unsigned value.
	return (Bits ^ Mask) - Mask;
}
// "Integer Minimum or Maximum
// Given 2's complement integer values x and y, the minimum can be computed without any branches as
// x+(((y-x)>>(WORDBITS-1))&(y-x)).
// Logically, this works because the shift by (WORDBITS-1) replicates the sign bit to create a mask
// -- be aware, however, that the C language does not require that shifts are signed even if their
// operands are signed, so there is a potential portability problem. Additionally, one might think
// that a shift by any number greater than or equal to WORDBITS would have the same effect, but many
// instruction sets have shifts that behave strangely when such shift distances are specified.
// Of course, maximum can be computed using the same trick:
// x-(((x-y)>>(WORDBITS-1))&(x-y))."
//!< Alternative min function
inline_ sdword min_(sdword a, sdword b)
{
	// Branchless minimum. The mask comes from the comparison result rather than from the sign
	// of (b-a): that subtraction overflows when the operands are far apart, and the overflow
	// made the previous form pick the wrong operand.
	const sdword Mask = -(sdword)(a < b);	// all ones when a<b, zero otherwise
	return b ^ ((a ^ b) & Mask);			// b ^ (a^b) == a when the mask is set, b otherwise
}
//!< Alternative max function
inline_ sdword max_(sdword a, sdword b)
{
	// Branchless maximum. The mask comes from the comparison result rather than from the sign
	// of (a-b): that subtraction overflows when the operands are far apart, and the overflow
	// made the previous form pick the wrong operand.
	const sdword Mask = -(sdword)(a < b);	// all ones when a<b, zero otherwise
	return a ^ ((a ^ b) & Mask);			// a ^ (a^b) == b when the mask is set, a otherwise
}
// "Integer Selection
// A branchless, lookup-free, alternative to code like if (a<b) x=c; else x=d; is ((((a-b) >> (WORDBITS-1)) & (c^d)) ^ d).
// This code assumes that the shift is signed, which, of course, C does not promise."
inline_ sdword IntegerSelection(sdword a, sdword b, sdword c, sdword d)
{
	// Branchless equivalent of "(a<b) ? c : d".
	// The mask is built from the comparison itself instead of the sign of (a-b), so the result
	// stays correct when a-b would overflow and no signed right shift is required.
	const sdword Mask = -(sdword)(a < b);	// all ones when a<b, zero otherwise
	return ((c ^ d) & Mask) ^ d;			// (c^d)^d == c when the mask is set, d otherwise
}
// Determine if one of the bytes in a 4 byte word is zero.
// Returns a non-zero mask (not necessarily 1) when at least one byte of x is zero, and 0 otherwise.
// Adding 0xfefefeff subtracts 1 from every byte modulo 2^32 (assuming a 32-bit udword); the
// "& ~x & 0x80808080" part keeps only high bits that the subtraction turned on.
inline_ BOOL HasNullByte(udword x)
{
	return ((x + 0xfefefeff) & (~x) & 0x80808080);
}
// To find the smallest 1 bit in a word EG: ~~~~~~10---0 => 0----010---0
// (~w + 1) is the two's complement negation of w written with unsigned operations only, so the
// expression is the usual "w & -w" idiom. Returns 0 when w is 0.
inline_ udword LowestOneBit(udword w)
{
	return ((w) & (~(w)+1));
}
// Most Significant 1 Bit
// "Most Significant 1 Bit
// Given a binary integer value x, the most significant 1 bit (highest numbered element of a bit set)
// can be computed using a SWAR algorithm that recursively "folds" the upper bits into the lower bits.
// This process yields a bit vector with the same most significant 1 as x, but all 1's below it.
// Bitwise AND of the original value with the complement of the "folded" value shifted down by one
// yields the most significant bit. For a 32-bit value:
// yields the most significant bit. For a 32-bit value:"
inline_ udword msb32(udword x)
{
x |= (x >> 1);
@@ -159,6 +199,23 @@ subject to the following restrictions:
return (x & ~(x >> 1));
}
// "Gray Code Conversion
// A Gray code is any binary coding sequence in which only a single bit position changes as we move from one value to the next.
// There are many such codes, but the traditional one is computed such that the Kth Gray code is K^(K>>1).
//
// The well-known algorithm for conversion from Gray to binary is a linear sequence of XORs that makes it seem each bit must be
// dealt with separately. Fortunately, that is equivalent to a parallel prefix XOR that can be computed using SWAR techniques
// in log time. For 32-bit Gray code values produced as described above, the conversion from Gray code back to unsigned binary is:"
inline_ udword g2b(udword gray)
{
	// Parallel prefix XOR: fold the word onto itself with halving shift distances
	// (16, 8, 4, 2, 1) so that every bit ends up XORed with all the bits above it.
	for(udword Shift = 16; Shift != 0; Shift >>= 1)
	{
		gray ^= (gray >> Shift);
	}
	return gray;
}
/*
"Just call it repeatedly with various input values and always with the same variable as "memory".
The sharpness determines the degree of filtering, where 0 completely filters out the input, and 1
@@ -181,9 +238,9 @@ subject to the following restrictions:
return memory = val * sharpness + memory * (1.0f - sharpness);
}
//! If you can guarantee that your input domain (i.e. value of x) is slightly
//! "If you can guarantee that your input domain (i.e. value of x) is slightly
//! limited (abs(x) must be < ((1<<31u)-32767)), then you can use the
//! following code to clamp the resulting value into [-32768,+32767] range:
//! following code to clamp the resulting value into [-32768,+32767] range:"
inline_ int ClampToInt16(int x)
{
// ASSERT(abs(x) < (int)((1<<31u)-32767));
@@ -219,8 +276,12 @@ subject to the following restrictions:
//! Byte offset of Member inside Class, as a size_t.
//! The offset is obtained by taking the address of the member through a null Class pointer;
//! the standard offsetof() macro expresses the same thing.
//! The whole expansion is parenthesized so the macro behaves as a single operand
//! (e.g. "sizeof OFFSET_OF(C, m)" applies to the complete expression).
#define OFFSET_OF(Class, Member)	((size_t)&(((Class*)0)->Member))
//! Number of elements in the array p.
//! Only meaningful for true arrays: a pointer (including an array function parameter) gives
//! sizeof(pointer)/sizeof(element) instead of an element count.
#if !defined(_XBOX)
	// Already defined on Xbox.
	#define ARRAYSIZE(p)	(sizeof(p)/sizeof((p)[0]))
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/**
@@ -236,7 +297,11 @@ subject to the following restrictions:
// True when the integer value x is a multiple of 4 (resp. 8), i.e. its low 2 (resp. 3) bits are zero.
// The argument is parenthesized so expressions using low-precedence operators such as | or ^
// are tested as a whole.
#define IS_ALIGNED_4(x)		(((x)&3)==0)
#define IS_ALIGNED_8(x)		(((x)&7)==0)
// Updates a pointer with "stride" bytes: ptr is advanced in place by stride bytes.
inline_ void UpdatePtr(void*& ptr, udword stride)
{
	ptr = ((ubyte*)ptr) + stride;
}

// From Jon Watte IIRC
// Reads one byte at ptr through a volatile pointer and discards it, so the read itself cannot
// be optimized away. ptr must therefore point to readable memory.
inline_ void _prefetch(void const* ptr)
{
	(void)*(char const volatile *)ptr;
}
// Compute implicit coords from an index:
// The idea is to get back 2D coords from a 1D index.
@@ -269,4 +334,44 @@ subject to the following restrictions:
Compute2DCoords(u, v, i - (w * nbu_nbv), nbu);
}
#endif // __ICEUTILS_H__
// Calling fsincos instead of fsin+fcos. Twice faster (per the original author).
// \param		c	[out] receives the cosine of f
// \param		s	[out] receives the sine of f
// \param		f	[in] angle, as consumed by the x87 fsincos instruction
inline_ void FSinCos(float& c, float& s, float f)
{
	float LocalCos, LocalSin;
	float Local = f;
#if defined(WIN32) && !defined(__GNUC__)
	// MSVC-style inline assembly. After fsincos the cosine is on top of the FPU stack and the
	// sine just below it, hence the cosine is popped first.
	_asm	fld		Local
	_asm	fsincos
	_asm	fstp	LocalCos
	_asm	fstp	LocalSin
#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	// GCC/Clang on x86. An asm template cannot refer to C locals by name, so the operands are
	// bound through constraints: "0" loads the angle into ST(0), "=t" reads ST(0) (cosine) and
	// "=u" reads ST(1) (sine) once the instruction has executed.
	__asm__("fsincos" : "=t" (LocalCos), "=u" (LocalSin) : "0" (Local));
#elif defined(__GNUC__)
	// No x87 unit on this target: use the compiler builtins, which need no extra header.
	LocalCos = __builtin_cosf(Local);
	LocalSin = __builtin_sinf(Local);
#else
	// Previously this case silently returned uninitialized values.
	#error "FSinCos: no implementation for this compiler/target"
#endif
	c = LocalCos;
	s = LocalSin;
}
// Modulo3 macros. See http://www.codercorner.com/Modulo3.htm
// For i in {0,1,2}, sets j = (i+1)%3 and k = (i+2)%3 using a packed lookup constant:
// the bytes of 0x01000201 are read starting at byte i. k is used as scratch storage.
// The expansion is a single comma expression so the macro is one statement (safe after an
// unbraced "if"), and every argument is parenthesized so expressions can be passed for i.
// The trailing semicolon is kept so call sites written without one still compile.
#define GET_NEXT_INDICES(i, j, k)			\
	((k) = 0x01000201,						\
	 (k) >>= ((i)<<3),						\
	 (j) = (k) & 0xff,						\
	 (k) >>= 8,								\
	 (k) &= 0xff);
// Variant of GET_NEXT_INDICES without scratch use of k: for i in {0,1,2}, sets j = (i+1)%3 and
// k = (i+2)%3 by extracting 2-bit fields from the constants 9 and 18. i is evaluated twice.
// The expansion is a single comma expression with parenthesized arguments; the trailing
// semicolon is kept so call sites written without one still compile.
#define GET_NEXT_INDICES2(i, j, k)			\
	((j) = ( 9 >> ((i)<<1)) & 3,			\
	 (k) = (18 >> ((i)<<1)) & 3);
// 0=>1, 1=>2, 2=>0
inline_ udword Modulo3(udword i)
{
	ASSERT(i==0 || i==1 || i==2);
	// The constant 9 packs the three answers as 2-bit fields (field 0 = 1, field 1 = 2,
	// field 2 = 0); select field i and mask it out.
	const udword FieldShift = i << 1;
	return (9 >> FieldShift) & 3;
}
#endif // ICEUTILS_H