diff --git a/src/Bullet3Common/b3Vector3.cpp b/src/Bullet3Common/b3Vector3.cpp index e43f13897..5f5ac4ac0 100644 --- a/src/Bullet3Common/b3Vector3.cpp +++ b/src/Bullet3Common/b3Vector3.cpp @@ -1,16 +1,16 @@ /* Copyright (c) 2011-213 Apple Inc. http://bulletphysics.org - + This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software. - Permission is granted to anyone to use this software for any purpose, - including commercial applications, and to alter it and redistribute it freely, + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions: - + 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 3. This notice may not be removed or altered from any source distribution. - + This source version has been altered. */ @@ -47,35 +47,35 @@ long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, fl float4 vvec = _mm_loadu_ps( vec ); float4 vHi = b3CastiTo128f(_mm_shuffle_epi32( b3CastfTo128i( vvec), 0xaa )); /// zzzz float4 vLo = _mm_movelh_ps( vvec, vvec ); /// xyxy - + long maxIndex = -1L; - + size_t segment = 0; float4 stack_array[ STACK_ARRAY_COUNT ]; - + #if DEBUG - memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) ); + // memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) ); #endif - + size_t index; float4 max; // Faster loop without cleanup code for full tiles - for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 ) + for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 ) { max = dotMax; - - for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 ) + + for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 ) { // do four dot products at a time. Carefully avoid touching the w element. float4 v0 = vertices[0]; float4 v1 = vertices[1]; float4 v2 = vertices[2]; float4 v3 = vertices[3]; vertices += 4; - + float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - + lo0 = lo0*vLo; lo1 = lo1*vLo; float4 z = _mm_shuffle_ps(hi0, hi1, 0x88); @@ -86,17 +86,17 @@ long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, fl x = x+z; stack_array[index] = x; max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan - + v0 = vertices[0]; v1 = vertices[1]; v2 = vertices[2]; v3 = vertices[3]; vertices += 4; - + lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - + lo0 = lo0*vLo; lo1 = lo1*vLo; z = _mm_shuffle_ps(hi0, hi1, 0x88); @@ -107,17 +107,17 @@ long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, fl x = x+z; stack_array[index+1] = x; max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan - + v0 = vertices[0]; v1 = vertices[1]; v2 = vertices[2]; v3 = vertices[3]; vertices += 4; - + lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - + lo0 = lo0*vLo; lo1 = lo1*vLo; z = _mm_shuffle_ps(hi0, hi1, 0x88); @@ -128,17 +128,17 @@ long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, fl x = x+z; stack_array[index+2] = x; max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan - + v0 = vertices[0]; v1 = vertices[1]; v2 = vertices[2]; v3 = vertices[3]; vertices += 4; - + lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - + lo0 = lo0*vLo; lo1 = lo1*vLo; z = _mm_shuffle_ps(hi0, hi1, 0x88); @@ -149,20 +149,20 @@ long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, fl x = x+z; stack_array[index+3] = x; max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan - + // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way. } - + // If we found a new max if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax))) - { + { // copy the new max across all lanes of our max accumulator max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e)); max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1)); - + dotMax = max; - - // find first occurrence of that max + + // find first occurrence of that max size_t test; for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ ) // local_count must be a multiple of 4 {} @@ -170,29 +170,29 @@ long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, fl maxIndex = 4*index + segment + indexTable[test]; } } - + // account for work we've already done count -= segment; - + // Deal with the last < STACK_ARRAY_COUNT vectors max = dotMax; index = 0; - - + + if( b3Unlikely( count > 16) ) { - for( ; index + 4 <= count / 4; index+=4 ) + for( ; index + 4 <= count / 4; index+=4 ) { // do four dot products at a time. Carefully avoid touching the w element. float4 v0 = vertices[0]; float4 v1 = vertices[1]; float4 v2 = vertices[2]; float4 v3 = vertices[3]; vertices += 4; - + float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - + lo0 = lo0*vLo; lo1 = lo1*vLo; float4 z = _mm_shuffle_ps(hi0, hi1, 0x88); @@ -203,17 +203,17 @@ long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, fl x = x+z; stack_array[index] = x; max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan - + v0 = vertices[0]; v1 = vertices[1]; v2 = vertices[2]; v3 = vertices[3]; vertices += 4; - + lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - + lo0 = lo0*vLo; lo1 = lo1*vLo; z = _mm_shuffle_ps(hi0, hi1, 0x88); @@ -224,17 +224,17 @@ long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, fl x = x+z; stack_array[index+1] = x; max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan - + v0 = vertices[0]; v1 = vertices[1]; v2 = vertices[2]; v3 = vertices[3]; vertices += 4; - + lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - + lo0 = lo0*vLo; lo1 = lo1*vLo; z = _mm_shuffle_ps(hi0, hi1, 0x88); @@ -245,17 +245,17 @@ long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, fl x = x+z; stack_array[index+2] = x; max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan - + v0 = vertices[0]; v1 = vertices[1]; v2 = vertices[2]; v3 = vertices[3]; vertices += 4; - + lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - + lo0 = lo0*vLo; lo1 = lo1*vLo; z = _mm_shuffle_ps(hi0, hi1, 0x88); @@ -266,11 +266,11 @@ long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, fl x = x+z; stack_array[index+3] = x; max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan - + // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way. } } - + size_t localCount = (count & -4L) - 4*index; if( localCount ) { @@ -314,19 +314,19 @@ long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, fl index += localCount/4; #else { - for( unsigned int i=0; i 16) ) { - for( ; index + 4 <= count / 4; index+=4 ) + for( ; index + 4 <= count / 4; index+=4 ) { // do four dot products at a time. Carefully avoid touching the w element. float4 v0 = vertices[0]; float4 v1 = vertices[1]; float4 v2 = vertices[2]; float4 v3 = vertices[3]; vertices += 4; - + float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - + lo0 = lo0*vLo; lo1 = lo1*vLo; float4 z = _mm_shuffle_ps(hi0, hi1, 0x88); @@ -589,17 +589,17 @@ long b3_mindot_large( const float *vv, const float *vec, unsigned long count, fl x = x+z; stack_array[index] = x; min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan - + v0 = vertices[0]; v1 = vertices[1]; v2 = vertices[2]; v3 = vertices[3]; vertices += 4; - + lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - + lo0 = lo0*vLo; lo1 = lo1*vLo; z = _mm_shuffle_ps(hi0, hi1, 0x88); @@ -610,17 +610,17 @@ long b3_mindot_large( const float *vv, const float *vec, unsigned long count, fl x = x+z; stack_array[index+1] = x; min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan - + v0 = vertices[0]; v1 = vertices[1]; v2 = vertices[2]; v3 = vertices[3]; vertices += 4; - + lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - + lo0 = lo0*vLo; lo1 = lo1*vLo; z = _mm_shuffle_ps(hi0, hi1, 0x88); @@ -631,17 +631,17 @@ long b3_mindot_large( const float *vv, const float *vec, unsigned long count, fl x = x+z; stack_array[index+2] = x; min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan - + v0 = vertices[0]; v1 = vertices[1]; v2 = vertices[2]; v3 = vertices[3]; vertices += 4; - + lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3 - + lo0 = lo0*vLo; lo1 = lo1*vLo; z = _mm_shuffle_ps(hi0, hi1, 0x88); @@ -652,22 +652,22 @@ long b3_mindot_large( const float *vv, const float *vec, unsigned long count, fl x = x+z; stack_array[index+3] = x; min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan - + // It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way. } } - + size_t localCount = (count & -4L) - 4*index; if( localCount ) { - - + + #ifdef __APPLE__ vertices += localCount; // counter the offset float4 t0, t1, t2, t3, t4; size_t byteIndex = -(localCount) * sizeof(float); float4 * sap = &stack_array[index + localCount / 4]; - + asm volatile ( ".align 4 \n\ 0: movaps %[min], %[t2] // move min out of the way to avoid propagating NaNs in min \n\ @@ -702,19 +702,19 @@ long b3_mindot_large( const float *vv, const float *vec, unsigned long count, fl index += localCount/4; #else { - for( unsigned int i=0; i