From bb50a2038fae6cfa5435529f3656615c441daa1b Mon Sep 17 00:00:00 2001 From: id0x1234 Date: Sun, 7 Sep 2008 09:10:43 +0000 Subject: [PATCH] (re)Enable SSE by default (using intrinsics) should compile fine under all Intel platforms. --- .../BroadphaseCollision/btDbvt.cpp | 54 +++---- .../BroadphaseCollision/btDbvt.h | 146 +++++++++--------- 2 files changed, 96 insertions(+), 104 deletions(-) diff --git a/src/BulletCollision/BroadphaseCollision/btDbvt.cpp b/src/BulletCollision/BroadphaseCollision/btDbvt.cpp index 15b90d987..4c5d451a7 100644 --- a/src/BulletCollision/BroadphaseCollision/btDbvt.cpp +++ b/src/BulletCollision/BroadphaseCollision/btDbvt.cpp @@ -663,24 +663,24 @@ Benchmarking dbvt... Leaves: 8192 sizeof(btDbvtVolume): 32 bytes sizeof(btDbvtNode): 44 bytes -[1] btDbvtVolume intersections: 3537 ms (0%) -[2] btDbvtVolume merges: 1945 ms (0%) -[3] btDbvt::collideTT: 6646 ms (0%) -[4] btDbvt::collideTT self: 3389 ms (0%) -[5] btDbvt::collideTT xform: 7505 ms (0%) -[6] btDbvt::collideTT xform,self: 7480 ms (0%) -[7] btDbvt::collideRAY: 6307 ms (0%),(332511 r/s) -[8] insert/remove: 2105 ms (-3%),(996271 ir/s) -[9] updates (teleport): 1943 ms (0%),(1079337 u/s) -[10] updates (jitter): 1301 ms (0%),(1611953 u/s) -[11] optimize (incremental): 2510 ms (0%),(1671000 o/s) -[12] btDbvtVolume notequal: 3677 ms (0%) -[13] culling(OCL+fullsort): 2231 ms (0%),(458 t/s) -[14] culling(OCL+qsort): 3500 ms (0%),(2340 t/s) -[15] culling(KDOP+qsort): 1151 ms (0%),(7117 t/s) -[16] insert/remove batch(256): 5138 ms (0%),(816330 bir/s) -[17] btDbvtVolume proximity: 2842 ms (0%) -[18] btDbvtVolume select: 3390 ms (0%) +[1] btDbvtVolume intersections: 3499 ms (-1%) +[2] btDbvtVolume merges: 1934 ms (0%) +[3] btDbvt::collideTT: 5485 ms (-21%) +[4] btDbvt::collideTT self: 2814 ms (-20%) +[5] btDbvt::collideTT xform: 7379 ms (-1%) +[6] btDbvt::collideTT xform,self: 7270 ms (-2%) +[7] btDbvt::collideRAY: 6314 ms (0%),(332143 r/s) +[8] insert/remove: 2093 ms (0%),(1001983 ir/s) +[9] updates (teleport): 1879 ms (-3%),(1116100 u/s) +[10] updates (jitter): 1244 ms (-4%),(1685813 u/s) +[11] optimize (incremental): 2514 ms (0%),(1668000 o/s) +[12] btDbvtVolume notequal: 3659 ms (0%) +[13] culling(OCL+fullsort): 2218 ms (0%),(461 t/s) +[14] culling(OCL+qsort): 3688 ms (5%),(2221 t/s) +[15] culling(KDOP+qsort): 1139 ms (-1%),(7192 t/s) +[16] insert/remove batch(256): 5092 ms (0%),(823704 bir/s) +[17] btDbvtVolume proximity: 2887 ms (1%) +[18] btDbvtVolume select: 3419 ms (0%) */ struct btDbvtBenchmark @@ -782,12 +782,12 @@ static const btScalar cfgVolumeCenterScale = 100; static const btScalar cfgVolumeExentsBase = 1; static const btScalar cfgVolumeExentsScale = 4; static const int cfgLeaves = 8192; -static const bool cfgEnable = true; +static const bool cfgEnable = false; //[1] btDbvtVolume intersections bool cfgBenchmark1_Enable = cfgEnable; static const int cfgBenchmark1_Iterations = 8; -static const int cfgBenchmark1_Reference = 3537; +static const int cfgBenchmark1_Reference = 3499; //[2] btDbvtVolume merges bool cfgBenchmark2_Enable = cfgEnable; static const int cfgBenchmark2_Iterations = 4; @@ -795,21 +795,21 @@ static const int cfgBenchmark2_Reference = 1945; //[3] btDbvt::collideTT bool cfgBenchmark3_Enable = cfgEnable; static const int cfgBenchmark3_Iterations = 512; -static const int cfgBenchmark3_Reference = 6646; +static const int cfgBenchmark3_Reference = 5485; //[4] btDbvt::collideTT self bool cfgBenchmark4_Enable = cfgEnable; static const int cfgBenchmark4_Iterations = 512; -static const int cfgBenchmark4_Reference = 3389; +static const int cfgBenchmark4_Reference = 2814; //[5] btDbvt::collideTT xform bool cfgBenchmark5_Enable = cfgEnable; static const int cfgBenchmark5_Iterations = 512; static const btScalar cfgBenchmark5_OffsetScale = 2; -static const int cfgBenchmark5_Reference = 7505; +static const int cfgBenchmark5_Reference = 7379; //[6] btDbvt::collideTT xform,self bool cfgBenchmark6_Enable = cfgEnable; static const int cfgBenchmark6_Iterations = 512; static const btScalar cfgBenchmark6_OffsetScale = 2; -static const int cfgBenchmark6_Reference = 7480; +static const int cfgBenchmark6_Reference = 7270; //[7] btDbvt::collideRAY bool cfgBenchmark7_Enable = cfgEnable; static const int cfgBenchmark7_Passes = 32; @@ -824,13 +824,13 @@ static const int cfgBenchmark8_Reference = 2105; bool cfgBenchmark9_Enable = cfgEnable; static const int cfgBenchmark9_Passes = 32; static const int cfgBenchmark9_Iterations = 65536; -static const int cfgBenchmark9_Reference = 1943; +static const int cfgBenchmark9_Reference = 1879; //[10] updates (jitter) bool cfgBenchmark10_Enable = cfgEnable; static const btScalar cfgBenchmark10_Scale = cfgVolumeCenterScale/10000; static const int cfgBenchmark10_Passes = 32; static const int cfgBenchmark10_Iterations = 65536; -static const int cfgBenchmark10_Reference = 1301; +static const int cfgBenchmark10_Reference = 1244; //[11] optimize (incremental) bool cfgBenchmark11_Enable = cfgEnable; static const int cfgBenchmark11_Passes = 64; @@ -862,7 +862,7 @@ bool cfgBenchmark17_Enable = cfgEnable; static const int cfgBenchmark17_Iterations = 8; static const int cfgBenchmark17_Reference = 2842; //[18] select -bool cfgBenchmark18_Enable = cfgEnable; +bool cfgBenchmark18_Enable = true; static const int cfgBenchmark18_Iterations = 4; static const int cfgBenchmark18_Reference = 3390; diff --git a/src/BulletCollision/BroadphaseCollision/btDbvt.h b/src/BulletCollision/BroadphaseCollision/btDbvt.h index beeba3fd2..84ae80ba8 100644 --- a/src/BulletCollision/BroadphaseCollision/btDbvt.h +++ b/src/BulletCollision/BroadphaseCollision/btDbvt.h @@ -41,6 +41,9 @@ subject to the following restrictions: #define DBVT_USE_TEMPLATE 0 #endif +// Use only intrinsics instead of inline asm +#define DBVT_USE_INTRINSIC_SSE 1 + // Using memmov for collideOCL #define DBVT_USE_MEMMOVE 1 @@ -58,18 +61,18 @@ subject to the following restrictions: // Specific methods implementation -//disabled by default, it breaks certain compilers and platforms (Intel compiler on Win32 and any compiler on Windows 64 bits) -//#define WIN32_USE_SSE 1 -#ifdef WIN32_USE_SSE -#define DBVT_PROXIMITY_IMPL DBVT_IMPL_SSE +#ifdef WIN32 #define DBVT_SELECT_IMPL DBVT_IMPL_SSE #define DBVT_MERGE_IMPL DBVT_IMPL_SSE #else -#define DBVT_PROXIMITY_IMPL DBVT_IMPL_GENERIC #define DBVT_SELECT_IMPL DBVT_IMPL_GENERIC #define DBVT_MERGE_IMPL DBVT_IMPL_GENERIC #endif +#if (DBVT_SELECT_IMPL==DBVT_IMPL_SSE)||(DBVT_MERGE_IMPL==DBVT_IMPL_SSE) +#include +#endif + // // Auto config and checks // @@ -107,10 +110,6 @@ subject to the following restrictions: #error "DBVT_ENABLE_BENCHMARK undefined" #endif -#ifndef DBVT_PROXIMITY_IMPL -#error "DBVT_PROXIMITY_IMPL undefined" -#endif - #ifndef DBVT_SELECT_IMPL #error "DBVT_SELECT_IMPL undefined" #endif @@ -561,32 +560,8 @@ return(txmax>0); DBVT_INLINE btScalar Proximity( const btDbvtAabbMm& a, const btDbvtAabbMm& b) { -#if DBVT_PROXIMITY_IMPL == DBVT_IMPL_SSE -DBVT_ALIGN btScalar r[1]; -static DBVT_ALIGN const unsigned __int32 mask[]={0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff}; -__asm - { - mov eax,a - mov ecx,b - movaps xmm0,[eax] - movaps xmm2,[ecx] - movaps xmm1,[eax+16] - movaps xmm3,[ecx+16] - addps xmm0,xmm1 - addps xmm2,xmm3 - subps xmm0,xmm2 - andps xmm0,mask - movhlps xmm1,xmm0 - addps xmm0,xmm1 - pshufd xmm1,xmm0,1 - addss xmm0,xmm1 - movss r,xmm0 - } -return(r[0]); -#else const btVector3 d=(a.mi+a.mx)-(b.mi+b.mx); return(btFabs(d.x())+btFabs(d.y())+btFabs(d.z())); -#endif } // @@ -595,36 +570,57 @@ DBVT_INLINE int Select( const btDbvtAabbMm& o, const btDbvtAabbMm& b) { #if DBVT_SELECT_IMPL == DBVT_IMPL_SSE -DBVT_ALIGN __int32 r[1]; static DBVT_ALIGN const unsigned __int32 mask[]={0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff}; -__asm - { - mov eax,o - mov ecx,a - mov edx,b - movaps xmm0,[eax] - movaps xmm5,mask - addps xmm0,[eax+16] - movaps xmm1,[ecx] - movaps xmm2,[edx] - addps xmm1,[ecx+16] - addps xmm2,[edx+16] - subps xmm1,xmm0 - subps xmm2,xmm0 - andps xmm1,xmm5 - andps xmm2,xmm5 - movhlps xmm3,xmm1 - movhlps xmm4,xmm2 - addps xmm1,xmm3 - addps xmm2,xmm4 - pshufd xmm3,xmm1,1 - pshufd xmm4,xmm2,1 - addss xmm1,xmm3 - addss xmm2,xmm4 - cmpless xmm2,xmm1 - movss r,xmm2 - } -return(r[0]&1); + // TODO: the intrinsic version is 11% slower + #if DBVT_USE_INTRINSIC_SSE + __m128 omi(_mm_load_ps(o.mi)); + omi=_mm_add_ps(omi,_mm_load_ps(o.mx)); + __m128 ami(_mm_load_ps(a.mi)); + ami=_mm_add_ps(ami,_mm_load_ps(a.mx)); + ami=_mm_sub_ps(ami,omi); + ami=_mm_and_ps(ami,_mm_load_ps((const float*)mask)); + __m128 bmi(_mm_load_ps(b.mi)); + bmi=_mm_add_ps(bmi,_mm_load_ps(b.mx)); + bmi=_mm_sub_ps(bmi,omi); + bmi=_mm_and_ps(bmi,_mm_load_ps((const float*)mask)); + __m128 t0(_mm_movehl_ps(ami,ami)); + ami=_mm_add_ps(ami,t0); + ami=_mm_add_ss(ami,_mm_shuffle_ps(ami,ami,1)); + __m128 t1(_mm_movehl_ps(bmi,bmi)); + bmi=_mm_add_ps(bmi,t1); + bmi=_mm_add_ss(bmi,_mm_shuffle_ps(bmi,bmi,1)); + return(_mm_cmple_ss(bmi,ami).m128_u32[0]&1); + #else + DBVT_ALIGN __int32 r[1]; + __asm + { + mov eax,o + mov ecx,a + mov edx,b + movaps xmm0,[eax] + movaps xmm5,mask + addps xmm0,[eax+16] + movaps xmm1,[ecx] + movaps xmm2,[edx] + addps xmm1,[ecx+16] + addps xmm2,[edx+16] + subps xmm1,xmm0 + subps xmm2,xmm0 + andps xmm1,xmm5 + andps xmm2,xmm5 + movhlps xmm3,xmm1 + movhlps xmm4,xmm2 + addps xmm1,xmm3 + addps xmm2,xmm4 + pshufd xmm3,xmm1,1 + pshufd xmm4,xmm2,1 + addss xmm1,xmm3 + addss xmm2,xmm4 + cmpless xmm2,xmm1 + movss r,xmm2 + } + return(r[0]&1); + #endif #else return(Proximity(o,a)