(Re)enable SSE by default (using intrinsics); it should compile fine on all Intel platforms.
This commit is contained in:
@@ -663,24 +663,24 @@ Benchmarking dbvt...
|
|||||||
Leaves: 8192
|
Leaves: 8192
|
||||||
sizeof(btDbvtVolume): 32 bytes
|
sizeof(btDbvtVolume): 32 bytes
|
||||||
sizeof(btDbvtNode): 44 bytes
|
sizeof(btDbvtNode): 44 bytes
|
||||||
[1] btDbvtVolume intersections: 3537 ms (0%)
|
[1] btDbvtVolume intersections: 3499 ms (-1%)
|
||||||
[2] btDbvtVolume merges: 1945 ms (0%)
|
[2] btDbvtVolume merges: 1934 ms (0%)
|
||||||
[3] btDbvt::collideTT: 6646 ms (0%)
|
[3] btDbvt::collideTT: 5485 ms (-21%)
|
||||||
[4] btDbvt::collideTT self: 3389 ms (0%)
|
[4] btDbvt::collideTT self: 2814 ms (-20%)
|
||||||
[5] btDbvt::collideTT xform: 7505 ms (0%)
|
[5] btDbvt::collideTT xform: 7379 ms (-1%)
|
||||||
[6] btDbvt::collideTT xform,self: 7480 ms (0%)
|
[6] btDbvt::collideTT xform,self: 7270 ms (-2%)
|
||||||
[7] btDbvt::collideRAY: 6307 ms (0%),(332511 r/s)
|
[7] btDbvt::collideRAY: 6314 ms (0%),(332143 r/s)
|
||||||
[8] insert/remove: 2105 ms (-3%),(996271 ir/s)
|
[8] insert/remove: 2093 ms (0%),(1001983 ir/s)
|
||||||
[9] updates (teleport): 1943 ms (0%),(1079337 u/s)
|
[9] updates (teleport): 1879 ms (-3%),(1116100 u/s)
|
||||||
[10] updates (jitter): 1301 ms (0%),(1611953 u/s)
|
[10] updates (jitter): 1244 ms (-4%),(1685813 u/s)
|
||||||
[11] optimize (incremental): 2510 ms (0%),(1671000 o/s)
|
[11] optimize (incremental): 2514 ms (0%),(1668000 o/s)
|
||||||
[12] btDbvtVolume notequal: 3677 ms (0%)
|
[12] btDbvtVolume notequal: 3659 ms (0%)
|
||||||
[13] culling(OCL+fullsort): 2231 ms (0%),(458 t/s)
|
[13] culling(OCL+fullsort): 2218 ms (0%),(461 t/s)
|
||||||
[14] culling(OCL+qsort): 3500 ms (0%),(2340 t/s)
|
[14] culling(OCL+qsort): 3688 ms (5%),(2221 t/s)
|
||||||
[15] culling(KDOP+qsort): 1151 ms (0%),(7117 t/s)
|
[15] culling(KDOP+qsort): 1139 ms (-1%),(7192 t/s)
|
||||||
[16] insert/remove batch(256): 5138 ms (0%),(816330 bir/s)
|
[16] insert/remove batch(256): 5092 ms (0%),(823704 bir/s)
|
||||||
[17] btDbvtVolume proximity: 2842 ms (0%)
|
[17] btDbvtVolume proximity: 2887 ms (1%)
|
||||||
[18] btDbvtVolume select: 3390 ms (0%)
|
[18] btDbvtVolume select: 3419 ms (0%)
|
||||||
*/
|
*/
|
||||||
|
|
||||||
struct btDbvtBenchmark
|
struct btDbvtBenchmark
|
||||||
@@ -782,12 +782,12 @@ static const btScalar cfgVolumeCenterScale = 100;
|
|||||||
static const btScalar cfgVolumeExentsBase = 1;
|
static const btScalar cfgVolumeExentsBase = 1;
|
||||||
static const btScalar cfgVolumeExentsScale = 4;
|
static const btScalar cfgVolumeExentsScale = 4;
|
||||||
static const int cfgLeaves = 8192;
|
static const int cfgLeaves = 8192;
|
||||||
static const bool cfgEnable = true;
|
static const bool cfgEnable = false;
|
||||||
|
|
||||||
//[1] btDbvtVolume intersections
|
//[1] btDbvtVolume intersections
|
||||||
bool cfgBenchmark1_Enable = cfgEnable;
|
bool cfgBenchmark1_Enable = cfgEnable;
|
||||||
static const int cfgBenchmark1_Iterations = 8;
|
static const int cfgBenchmark1_Iterations = 8;
|
||||||
static const int cfgBenchmark1_Reference = 3537;
|
static const int cfgBenchmark1_Reference = 3499;
|
||||||
//[2] btDbvtVolume merges
|
//[2] btDbvtVolume merges
|
||||||
bool cfgBenchmark2_Enable = cfgEnable;
|
bool cfgBenchmark2_Enable = cfgEnable;
|
||||||
static const int cfgBenchmark2_Iterations = 4;
|
static const int cfgBenchmark2_Iterations = 4;
|
||||||
@@ -795,21 +795,21 @@ static const int cfgBenchmark2_Reference = 1945;
|
|||||||
//[3] btDbvt::collideTT
|
//[3] btDbvt::collideTT
|
||||||
bool cfgBenchmark3_Enable = cfgEnable;
|
bool cfgBenchmark3_Enable = cfgEnable;
|
||||||
static const int cfgBenchmark3_Iterations = 512;
|
static const int cfgBenchmark3_Iterations = 512;
|
||||||
static const int cfgBenchmark3_Reference = 6646;
|
static const int cfgBenchmark3_Reference = 5485;
|
||||||
//[4] btDbvt::collideTT self
|
//[4] btDbvt::collideTT self
|
||||||
bool cfgBenchmark4_Enable = cfgEnable;
|
bool cfgBenchmark4_Enable = cfgEnable;
|
||||||
static const int cfgBenchmark4_Iterations = 512;
|
static const int cfgBenchmark4_Iterations = 512;
|
||||||
static const int cfgBenchmark4_Reference = 3389;
|
static const int cfgBenchmark4_Reference = 2814;
|
||||||
//[5] btDbvt::collideTT xform
|
//[5] btDbvt::collideTT xform
|
||||||
bool cfgBenchmark5_Enable = cfgEnable;
|
bool cfgBenchmark5_Enable = cfgEnable;
|
||||||
static const int cfgBenchmark5_Iterations = 512;
|
static const int cfgBenchmark5_Iterations = 512;
|
||||||
static const btScalar cfgBenchmark5_OffsetScale = 2;
|
static const btScalar cfgBenchmark5_OffsetScale = 2;
|
||||||
static const int cfgBenchmark5_Reference = 7505;
|
static const int cfgBenchmark5_Reference = 7379;
|
||||||
//[6] btDbvt::collideTT xform,self
|
//[6] btDbvt::collideTT xform,self
|
||||||
bool cfgBenchmark6_Enable = cfgEnable;
|
bool cfgBenchmark6_Enable = cfgEnable;
|
||||||
static const int cfgBenchmark6_Iterations = 512;
|
static const int cfgBenchmark6_Iterations = 512;
|
||||||
static const btScalar cfgBenchmark6_OffsetScale = 2;
|
static const btScalar cfgBenchmark6_OffsetScale = 2;
|
||||||
static const int cfgBenchmark6_Reference = 7480;
|
static const int cfgBenchmark6_Reference = 7270;
|
||||||
//[7] btDbvt::collideRAY
|
//[7] btDbvt::collideRAY
|
||||||
bool cfgBenchmark7_Enable = cfgEnable;
|
bool cfgBenchmark7_Enable = cfgEnable;
|
||||||
static const int cfgBenchmark7_Passes = 32;
|
static const int cfgBenchmark7_Passes = 32;
|
||||||
@@ -824,13 +824,13 @@ static const int cfgBenchmark8_Reference = 2105;
|
|||||||
bool cfgBenchmark9_Enable = cfgEnable;
|
bool cfgBenchmark9_Enable = cfgEnable;
|
||||||
static const int cfgBenchmark9_Passes = 32;
|
static const int cfgBenchmark9_Passes = 32;
|
||||||
static const int cfgBenchmark9_Iterations = 65536;
|
static const int cfgBenchmark9_Iterations = 65536;
|
||||||
static const int cfgBenchmark9_Reference = 1943;
|
static const int cfgBenchmark9_Reference = 1879;
|
||||||
//[10] updates (jitter)
|
//[10] updates (jitter)
|
||||||
bool cfgBenchmark10_Enable = cfgEnable;
|
bool cfgBenchmark10_Enable = cfgEnable;
|
||||||
static const btScalar cfgBenchmark10_Scale = cfgVolumeCenterScale/10000;
|
static const btScalar cfgBenchmark10_Scale = cfgVolumeCenterScale/10000;
|
||||||
static const int cfgBenchmark10_Passes = 32;
|
static const int cfgBenchmark10_Passes = 32;
|
||||||
static const int cfgBenchmark10_Iterations = 65536;
|
static const int cfgBenchmark10_Iterations = 65536;
|
||||||
static const int cfgBenchmark10_Reference = 1301;
|
static const int cfgBenchmark10_Reference = 1244;
|
||||||
//[11] optimize (incremental)
|
//[11] optimize (incremental)
|
||||||
bool cfgBenchmark11_Enable = cfgEnable;
|
bool cfgBenchmark11_Enable = cfgEnable;
|
||||||
static const int cfgBenchmark11_Passes = 64;
|
static const int cfgBenchmark11_Passes = 64;
|
||||||
@@ -862,7 +862,7 @@ bool cfgBenchmark17_Enable = cfgEnable;
|
|||||||
static const int cfgBenchmark17_Iterations = 8;
|
static const int cfgBenchmark17_Iterations = 8;
|
||||||
static const int cfgBenchmark17_Reference = 2842;
|
static const int cfgBenchmark17_Reference = 2842;
|
||||||
//[18] select
|
//[18] select
|
||||||
bool cfgBenchmark18_Enable = cfgEnable;
|
bool cfgBenchmark18_Enable = true;
|
||||||
static const int cfgBenchmark18_Iterations = 4;
|
static const int cfgBenchmark18_Iterations = 4;
|
||||||
static const int cfgBenchmark18_Reference = 3390;
|
static const int cfgBenchmark18_Reference = 3390;
|
||||||
|
|
||||||
|
|||||||
@@ -41,6 +41,9 @@ subject to the following restrictions:
|
|||||||
#define DBVT_USE_TEMPLATE 0
|
#define DBVT_USE_TEMPLATE 0
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// Use only intrinsics instead of inline asm
|
||||||
|
#define DBVT_USE_INTRINSIC_SSE 1
|
||||||
|
|
||||||
// Using memmove for collideOCL
|
// Using memmove for collideOCL
|
||||||
#define DBVT_USE_MEMMOVE 1
|
#define DBVT_USE_MEMMOVE 1
|
||||||
|
|
||||||
@@ -58,18 +61,18 @@ subject to the following restrictions:
|
|||||||
|
|
||||||
// Specific methods implementation
|
// Specific methods implementation
|
||||||
|
|
||||||
//disabled by default, it breaks certain compilers and platforms (Intel compiler on Win32 and any compiler on Windows 64 bits)
|
#ifdef WIN32
|
||||||
//#define WIN32_USE_SSE 1
|
|
||||||
#ifdef WIN32_USE_SSE
|
|
||||||
#define DBVT_PROXIMITY_IMPL DBVT_IMPL_SSE
|
|
||||||
#define DBVT_SELECT_IMPL DBVT_IMPL_SSE
|
#define DBVT_SELECT_IMPL DBVT_IMPL_SSE
|
||||||
#define DBVT_MERGE_IMPL DBVT_IMPL_SSE
|
#define DBVT_MERGE_IMPL DBVT_IMPL_SSE
|
||||||
#else
|
#else
|
||||||
#define DBVT_PROXIMITY_IMPL DBVT_IMPL_GENERIC
|
|
||||||
#define DBVT_SELECT_IMPL DBVT_IMPL_GENERIC
|
#define DBVT_SELECT_IMPL DBVT_IMPL_GENERIC
|
||||||
#define DBVT_MERGE_IMPL DBVT_IMPL_GENERIC
|
#define DBVT_MERGE_IMPL DBVT_IMPL_GENERIC
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if (DBVT_SELECT_IMPL==DBVT_IMPL_SSE)||(DBVT_MERGE_IMPL==DBVT_IMPL_SSE)
|
||||||
|
#include <xmmintrin.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
//
|
//
|
||||||
// Auto config and checks
|
// Auto config and checks
|
||||||
//
|
//
|
||||||
@@ -107,10 +110,6 @@ subject to the following restrictions:
|
|||||||
#error "DBVT_ENABLE_BENCHMARK undefined"
|
#error "DBVT_ENABLE_BENCHMARK undefined"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef DBVT_PROXIMITY_IMPL
|
|
||||||
#error "DBVT_PROXIMITY_IMPL undefined"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef DBVT_SELECT_IMPL
|
#ifndef DBVT_SELECT_IMPL
|
||||||
#error "DBVT_SELECT_IMPL undefined"
|
#error "DBVT_SELECT_IMPL undefined"
|
||||||
#endif
|
#endif
|
||||||
@@ -561,32 +560,8 @@ return(txmax>0);
|
|||||||
DBVT_INLINE btScalar Proximity( const btDbvtAabbMm& a,
|
DBVT_INLINE btScalar Proximity( const btDbvtAabbMm& a,
|
||||||
const btDbvtAabbMm& b)
|
const btDbvtAabbMm& b)
|
||||||
{
|
{
|
||||||
#if DBVT_PROXIMITY_IMPL == DBVT_IMPL_SSE
|
|
||||||
DBVT_ALIGN btScalar r[1];
|
|
||||||
static DBVT_ALIGN const unsigned __int32 mask[]={0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff};
|
|
||||||
__asm
|
|
||||||
{
|
|
||||||
mov eax,a
|
|
||||||
mov ecx,b
|
|
||||||
movaps xmm0,[eax]
|
|
||||||
movaps xmm2,[ecx]
|
|
||||||
movaps xmm1,[eax+16]
|
|
||||||
movaps xmm3,[ecx+16]
|
|
||||||
addps xmm0,xmm1
|
|
||||||
addps xmm2,xmm3
|
|
||||||
subps xmm0,xmm2
|
|
||||||
andps xmm0,mask
|
|
||||||
movhlps xmm1,xmm0
|
|
||||||
addps xmm0,xmm1
|
|
||||||
pshufd xmm1,xmm0,1
|
|
||||||
addss xmm0,xmm1
|
|
||||||
movss r,xmm0
|
|
||||||
}
|
|
||||||
return(r[0]);
|
|
||||||
#else
|
|
||||||
const btVector3 d=(a.mi+a.mx)-(b.mi+b.mx);
|
const btVector3 d=(a.mi+a.mx)-(b.mi+b.mx);
|
||||||
return(btFabs(d.x())+btFabs(d.y())+btFabs(d.z()));
|
return(btFabs(d.x())+btFabs(d.y())+btFabs(d.z()));
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
@@ -595,36 +570,57 @@ DBVT_INLINE int Select( const btDbvtAabbMm& o,
|
|||||||
const btDbvtAabbMm& b)
|
const btDbvtAabbMm& b)
|
||||||
{
|
{
|
||||||
#if DBVT_SELECT_IMPL == DBVT_IMPL_SSE
|
#if DBVT_SELECT_IMPL == DBVT_IMPL_SSE
|
||||||
DBVT_ALIGN __int32 r[1];
|
|
||||||
static DBVT_ALIGN const unsigned __int32 mask[]={0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff};
|
static DBVT_ALIGN const unsigned __int32 mask[]={0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff};
|
||||||
__asm
|
// TODO: the intrinsic version is 11% slower
|
||||||
{
|
#if DBVT_USE_INTRINSIC_SSE
|
||||||
mov eax,o
|
__m128 omi(_mm_load_ps(o.mi));
|
||||||
mov ecx,a
|
omi=_mm_add_ps(omi,_mm_load_ps(o.mx));
|
||||||
mov edx,b
|
__m128 ami(_mm_load_ps(a.mi));
|
||||||
movaps xmm0,[eax]
|
ami=_mm_add_ps(ami,_mm_load_ps(a.mx));
|
||||||
movaps xmm5,mask
|
ami=_mm_sub_ps(ami,omi);
|
||||||
addps xmm0,[eax+16]
|
ami=_mm_and_ps(ami,_mm_load_ps((const float*)mask));
|
||||||
movaps xmm1,[ecx]
|
__m128 bmi(_mm_load_ps(b.mi));
|
||||||
movaps xmm2,[edx]
|
bmi=_mm_add_ps(bmi,_mm_load_ps(b.mx));
|
||||||
addps xmm1,[ecx+16]
|
bmi=_mm_sub_ps(bmi,omi);
|
||||||
addps xmm2,[edx+16]
|
bmi=_mm_and_ps(bmi,_mm_load_ps((const float*)mask));
|
||||||
subps xmm1,xmm0
|
__m128 t0(_mm_movehl_ps(ami,ami));
|
||||||
subps xmm2,xmm0
|
ami=_mm_add_ps(ami,t0);
|
||||||
andps xmm1,xmm5
|
ami=_mm_add_ss(ami,_mm_shuffle_ps(ami,ami,1));
|
||||||
andps xmm2,xmm5
|
__m128 t1(_mm_movehl_ps(bmi,bmi));
|
||||||
movhlps xmm3,xmm1
|
bmi=_mm_add_ps(bmi,t1);
|
||||||
movhlps xmm4,xmm2
|
bmi=_mm_add_ss(bmi,_mm_shuffle_ps(bmi,bmi,1));
|
||||||
addps xmm1,xmm3
|
return(_mm_cmple_ss(bmi,ami).m128_u32[0]&1);
|
||||||
addps xmm2,xmm4
|
#else
|
||||||
pshufd xmm3,xmm1,1
|
DBVT_ALIGN __int32 r[1];
|
||||||
pshufd xmm4,xmm2,1
|
__asm
|
||||||
addss xmm1,xmm3
|
{
|
||||||
addss xmm2,xmm4
|
mov eax,o
|
||||||
cmpless xmm2,xmm1
|
mov ecx,a
|
||||||
movss r,xmm2
|
mov edx,b
|
||||||
}
|
movaps xmm0,[eax]
|
||||||
return(r[0]&1);
|
movaps xmm5,mask
|
||||||
|
addps xmm0,[eax+16]
|
||||||
|
movaps xmm1,[ecx]
|
||||||
|
movaps xmm2,[edx]
|
||||||
|
addps xmm1,[ecx+16]
|
||||||
|
addps xmm2,[edx+16]
|
||||||
|
subps xmm1,xmm0
|
||||||
|
subps xmm2,xmm0
|
||||||
|
andps xmm1,xmm5
|
||||||
|
andps xmm2,xmm5
|
||||||
|
movhlps xmm3,xmm1
|
||||||
|
movhlps xmm4,xmm2
|
||||||
|
addps xmm1,xmm3
|
||||||
|
addps xmm2,xmm4
|
||||||
|
pshufd xmm3,xmm1,1
|
||||||
|
pshufd xmm4,xmm2,1
|
||||||
|
addss xmm1,xmm3
|
||||||
|
addss xmm2,xmm4
|
||||||
|
cmpless xmm2,xmm1
|
||||||
|
movss r,xmm2
|
||||||
|
}
|
||||||
|
return(r[0]&1);
|
||||||
|
#endif
|
||||||
#else
|
#else
|
||||||
return(Proximity(o,a)<Proximity(o,b)?0:1);
|
return(Proximity(o,a)<Proximity(o,b)?0:1);
|
||||||
#endif
|
#endif
|
||||||
@@ -636,20 +632,14 @@ DBVT_INLINE void Merge( const btDbvtAabbMm& a,
|
|||||||
btDbvtAabbMm& r)
|
btDbvtAabbMm& r)
|
||||||
{
|
{
|
||||||
#if DBVT_MERGE_IMPL==DBVT_IMPL_SSE
|
#if DBVT_MERGE_IMPL==DBVT_IMPL_SSE
|
||||||
__asm
|
__m128 ami(_mm_load_ps(a.mi));
|
||||||
{
|
__m128 amx(_mm_load_ps(a.mx));
|
||||||
mov eax,a
|
__m128 bmi(_mm_load_ps(b.mi));
|
||||||
mov edx,b
|
__m128 bmx(_mm_load_ps(b.mx));
|
||||||
mov ecx,r
|
ami=_mm_min_ps(ami,bmi);
|
||||||
movaps xmm0,[eax+0]
|
amx=_mm_max_ps(amx,bmx);
|
||||||
movaps xmm1,[edx+0]
|
_mm_store_ps(r.mi,ami);
|
||||||
movaps xmm2,[eax+16]
|
_mm_store_ps(r.mx,amx);
|
||||||
movaps xmm3,[edx+16]
|
|
||||||
minps xmm0,xmm1
|
|
||||||
maxps xmm2,xmm3
|
|
||||||
movaps [ecx+0],xmm0
|
|
||||||
movaps [ecx+16],xmm2
|
|
||||||
}
|
|
||||||
#else
|
#else
|
||||||
for(int i=0;i<3;++i)
|
for(int i=0;i<3;++i)
|
||||||
{
|
{
|
||||||
@@ -1098,7 +1088,9 @@ if(root)
|
|||||||
#undef DBVT_IPOLICY
|
#undef DBVT_IPOLICY
|
||||||
#undef DBVT_CHECKTYPE
|
#undef DBVT_CHECKTYPE
|
||||||
#undef DBVT_IMPL_GENERIC
|
#undef DBVT_IMPL_GENERIC
|
||||||
#undef DBVT_IMPL_FPU0x86
|
|
||||||
#undef DBVT_IMPL_SSE
|
#undef DBVT_IMPL_SSE
|
||||||
|
#undef DBVT_USE_INTRINSIC_SSE
|
||||||
|
#undef DBVT_SELECT_IMPL
|
||||||
|
#undef DBVT_MERGE_IMPL
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
Reference in New Issue
Block a user