diff --git a/src/Bullet3Common/b3Vector3.cpp b/src/Bullet3Common/b3Vector3.cpp
index e43f13897..5f5ac4ac0 100644
--- a/src/Bullet3Common/b3Vector3.cpp
+++ b/src/Bullet3Common/b3Vector3.cpp
@@ -1,16 +1,16 @@
 /*
  Copyright (c) 2011-213 Apple Inc. http://bulletphysics.org
- 
+
  This software is provided 'as-is', without any express or implied warranty.
  In no event will the authors be held liable for any damages arising from the use of this software.
- Permission is granted to anyone to use this software for any purpose, 
- including commercial applications, and to alter it and redistribute it freely, 
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it freely,
  subject to the following restrictions:
- 
+
  1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.
- 
+
  This source version has been altered.
  */
 
@@ -47,35 +47,35 @@ long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, fl
     float4 vvec = _mm_loadu_ps( vec );
     float4 vHi = b3CastiTo128f(_mm_shuffle_epi32( b3CastfTo128i( vvec), 0xaa ));          /// zzzz
     float4 vLo = _mm_movelh_ps( vvec, vvec );                               /// xyxy
-    
+
     long maxIndex = -1L;
-    
+
     size_t segment = 0;
     float4 stack_array[ STACK_ARRAY_COUNT ];
-    
+
 #if DEBUG
-    memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
+    // memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
 #endif
-    
+
     size_t index;
     float4 max;
     // Faster loop without cleanup code for full tiles
-    for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 ) 
+    for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
     {
         max = dotMax;
-        
-        for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )   
+
+        for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )
         { // do four dot products at a time. Carefully avoid touching the w element.
             float4 v0 = vertices[0];
             float4 v1 = vertices[1];
             float4 v2 = vertices[2];
             float4 v3 = vertices[3];            vertices += 4;
-            
+
             float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
             float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
             float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
             float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-            
+
             lo0 = lo0*vLo;
             lo1 = lo1*vLo;
             float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
@@ -86,17 +86,17 @@ long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, fl
             x = x+z;
             stack_array[index] = x;
             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
-            
+
             v0 = vertices[0];
             v1 = vertices[1];
             v2 = vertices[2];
             v3 = vertices[3];            vertices += 4;
-            
+
             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-            
+
             lo0 = lo0*vLo;
             lo1 = lo1*vLo;
             z = _mm_shuffle_ps(hi0, hi1, 0x88);
@@ -107,17 +107,17 @@ long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, fl
             x = x+z;
             stack_array[index+1] = x;
             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
-            
+
             v0 = vertices[0];
             v1 = vertices[1];
             v2 = vertices[2];
             v3 = vertices[3];            vertices += 4;
-            
+
             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-            
+
             lo0 = lo0*vLo;
             lo1 = lo1*vLo;
             z = _mm_shuffle_ps(hi0, hi1, 0x88);
@@ -128,17 +128,17 @@ long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, fl
             x = x+z;
             stack_array[index+2] = x;
             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
-            
+
             v0 = vertices[0];
             v1 = vertices[1];
             v2 = vertices[2];
             v3 = vertices[3];            vertices += 4;
-            
+
             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-            
+
             lo0 = lo0*vLo;
             lo1 = lo1*vLo;
             z = _mm_shuffle_ps(hi0, hi1, 0x88);
@@ -149,20 +149,20 @@ long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, fl
             x = x+z;
             stack_array[index+3] = x;
             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
-            
+
             // It is too costly to keep the index of the max here. We will look for it again later.  We save a lot of work this way.
         }
-        
+
         // If we found a new max
         if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
-        { 
+        {
             // copy the new max across all lanes of our max accumulator
             max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
             max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));
-            
+
             dotMax = max;
-            
-            // find first occurrence of that max  
+
+            // find first occurrence of that max
             size_t test;
             for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ )   // local_count must be a multiple of 4
             {}
@@ -170,29 +170,29 @@ long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, fl
             maxIndex = 4*index + segment + indexTable[test];
         }
     }
-    
+
     // account for work we've already done
     count -= segment;
-    
+
     // Deal with the last < STACK_ARRAY_COUNT vectors
     max = dotMax;
     index = 0;
-    
-    
+
+
     if( b3Unlikely( count > 16) )
     {
-        for( ; index + 4 <= count / 4; index+=4 )   
+        for( ; index + 4 <= count / 4; index+=4 )
         { // do four dot products at a time. Carefully avoid touching the w element.
             float4 v0 = vertices[0];
             float4 v1 = vertices[1];
             float4 v2 = vertices[2];
             float4 v3 = vertices[3];            vertices += 4;
-            
+
             float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
             float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
             float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
             float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-            
+
             lo0 = lo0*vLo;
             lo1 = lo1*vLo;
             float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
@@ -203,17 +203,17 @@ long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, fl
             x = x+z;
             stack_array[index] = x;
             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
-            
+
             v0 = vertices[0];
             v1 = vertices[1];
             v2 = vertices[2];
             v3 = vertices[3];            vertices += 4;
-            
+
             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-            
+
             lo0 = lo0*vLo;
             lo1 = lo1*vLo;
             z = _mm_shuffle_ps(hi0, hi1, 0x88);
@@ -224,17 +224,17 @@ long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, fl
             x = x+z;
             stack_array[index+1] = x;
             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
-            
+
             v0 = vertices[0];
             v1 = vertices[1];
             v2 = vertices[2];
             v3 = vertices[3];            vertices += 4;
-            
+
             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-            
+
             lo0 = lo0*vLo;
             lo1 = lo1*vLo;
             z = _mm_shuffle_ps(hi0, hi1, 0x88);
@@ -245,17 +245,17 @@ long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, fl
             x = x+z;
             stack_array[index+2] = x;
             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
-            
+
             v0 = vertices[0];
             v1 = vertices[1];
             v2 = vertices[2];
             v3 = vertices[3];            vertices += 4;
-            
+
             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-            
+
             lo0 = lo0*vLo;
             lo1 = lo1*vLo;
             z = _mm_shuffle_ps(hi0, hi1, 0x88);
@@ -266,11 +266,11 @@ long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, fl
             x = x+z;
             stack_array[index+3] = x;
             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
-            
+
             // It is too costly to keep the index of the max here. We will look for it again later.  We save a lot of work this way.
         }
     }
-    
+
     size_t localCount = (count & -4L) - 4*index;
     if( localCount )
     {
@@ -314,19 +314,19 @@ long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, fl
         index += localCount/4;
 #else
         {
-            for( unsigned int i=0; i<localCount/4; i++,index++)   
+            for( unsigned int i=0; i<localCount/4; i++,index++)
             { // do four dot products at a time. Carefully avoid touching the w element.
                 float4 v0 = vertices[0];
                 float4 v1 = vertices[1];
                 float4 v2 = vertices[2];
-                float4 v3 = vertices[3];            
+                float4 v3 = vertices[3];
                 vertices += 4;
-                
+
                 float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
                 float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
                 float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
                 float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-                
+
                 lo0 = lo0*vLo;
                 lo1 = lo1*vLo;
                 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
@@ -353,7 +353,7 @@ long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, fl
                 v0 = vertices[0];
                 v1 = vertices[1];
                 v2 = vertices[2];
-                
+
                 // Calculate 3 dot products, transpose, duplicate v2
                 float4 lo0 = _mm_movelh_ps( v0, v1);        // xyxy.lo
                 float4 hi0 = _mm_movehl_ps( v1, v0);        // z?z?.lo
@@ -396,28 +396,28 @@ long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, fl
         max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
         index++;
     }
-    
-    // if we found a new max. 
+
+    // if we found a new max.
     if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
     { // we found a new max. Search for it
       // find max across the max vector, place in all elements of max -- big latency hit here
         max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
         max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));
-        
+
         // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
-        // this where it actually makes a difference is handled in the early out at the top of the function, 
-        // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced 
+        // this where it actually makes a difference is handled in the early out at the top of the function,
+        // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
         // complexity, and removed it.
-        
+
         dotMax = max;
-        
-        // scan for the first occurence of max in the array  
+
+        // scan for the first occurence of max in the array
         size_t test;
         for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ )   // local_count must be a multiple of 4
         {}
         maxIndex = 4*index + segment + indexTable[test];
     }
-    
+
     _mm_store_ss( dotResult, dotMax);
     return maxIndex;
 }
@@ -433,35 +433,35 @@ long b3_mindot_large( const float *vv, const float *vec, unsigned long count, fl
     float4 vvec = _mm_loadu_ps( vec );
     float4 vHi = b3CastiTo128f(_mm_shuffle_epi32( b3CastfTo128i( vvec), 0xaa ));          /// zzzz
     float4 vLo = _mm_movelh_ps( vvec, vvec );                               /// xyxy
-    
+
     long minIndex = -1L;
 
     size_t segment = 0;
     float4 stack_array[ STACK_ARRAY_COUNT ];
-    
+
 #if DEBUG
-    memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
+    // memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
 #endif
-    
+
     size_t index;
     float4 min;
     // Faster loop without cleanup code for full tiles
-    for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 ) 
+    for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
     {
         min = dotmin;
-        
-        for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )   
+
+        for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )
         { // do four dot products at a time. Carefully avoid touching the w element.
             float4 v0 = vertices[0];
             float4 v1 = vertices[1];
             float4 v2 = vertices[2];
             float4 v3 = vertices[3];            vertices += 4;
-            
+
             float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
             float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
             float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
             float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-            
+
             lo0 = lo0*vLo;
             lo1 = lo1*vLo;
             float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
@@ -472,17 +472,17 @@ long b3_mindot_large( const float *vv, const float *vec, unsigned long count, fl
             x = x+z;
             stack_array[index] = x;
             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
-            
+
             v0 = vertices[0];
             v1 = vertices[1];
             v2 = vertices[2];
             v3 = vertices[3];            vertices += 4;
-            
+
             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-            
+
             lo0 = lo0*vLo;
             lo1 = lo1*vLo;
             z = _mm_shuffle_ps(hi0, hi1, 0x88);
@@ -493,17 +493,17 @@ long b3_mindot_large( const float *vv, const float *vec, unsigned long count, fl
             x = x+z;
             stack_array[index+1] = x;
             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
-            
+
             v0 = vertices[0];
             v1 = vertices[1];
             v2 = vertices[2];
             v3 = vertices[3];            vertices += 4;
-            
+
             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-            
+
             lo0 = lo0*vLo;
             lo1 = lo1*vLo;
             z = _mm_shuffle_ps(hi0, hi1, 0x88);
@@ -514,17 +514,17 @@ long b3_mindot_large( const float *vv, const float *vec, unsigned long count, fl
             x = x+z;
             stack_array[index+2] = x;
             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
-            
+
             v0 = vertices[0];
             v1 = vertices[1];
             v2 = vertices[2];
             v3 = vertices[3];            vertices += 4;
-            
+
             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-            
+
             lo0 = lo0*vLo;
             lo1 = lo1*vLo;
             z = _mm_shuffle_ps(hi0, hi1, 0x88);
@@ -535,20 +535,20 @@ long b3_mindot_large( const float *vv, const float *vec, unsigned long count, fl
             x = x+z;
             stack_array[index+3] = x;
             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
-            
+
             // It is too costly to keep the index of the min here. We will look for it again later.  We save a lot of work this way.
         }
-        
+
         // If we found a new min
         if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
-        { 
+        {
             // copy the new min across all lanes of our min accumulator
             min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
             min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));
-            
+
             dotmin = min;
-            
-            // find first occurrence of that min  
+
+            // find first occurrence of that min
             size_t test;
             for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ )   // local_count must be a multiple of 4
             {}
@@ -556,29 +556,29 @@ long b3_mindot_large( const float *vv, const float *vec, unsigned long count, fl
             minIndex = 4*index + segment + indexTable[test];
         }
     }
-    
+
     // account for work we've already done
     count -= segment;
-    
+
     // Deal with the last < STACK_ARRAY_COUNT vectors
     min = dotmin;
     index = 0;
-    
-    
+
+
     if(b3Unlikely( count > 16) )
     {
-        for( ; index + 4 <= count / 4; index+=4 )   
+        for( ; index + 4 <= count / 4; index+=4 )
         { // do four dot products at a time. Carefully avoid touching the w element.
             float4 v0 = vertices[0];
             float4 v1 = vertices[1];
             float4 v2 = vertices[2];
             float4 v3 = vertices[3];            vertices += 4;
-            
+
             float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
             float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
             float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
             float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-            
+
             lo0 = lo0*vLo;
             lo1 = lo1*vLo;
             float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
@@ -589,17 +589,17 @@ long b3_mindot_large( const float *vv, const float *vec, unsigned long count, fl
             x = x+z;
             stack_array[index] = x;
             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
-            
+
             v0 = vertices[0];
             v1 = vertices[1];
             v2 = vertices[2];
             v3 = vertices[3];            vertices += 4;
-            
+
             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-            
+
             lo0 = lo0*vLo;
             lo1 = lo1*vLo;
             z = _mm_shuffle_ps(hi0, hi1, 0x88);
@@ -610,17 +610,17 @@ long b3_mindot_large( const float *vv, const float *vec, unsigned long count, fl
             x = x+z;
             stack_array[index+1] = x;
             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
-            
+
             v0 = vertices[0];
             v1 = vertices[1];
             v2 = vertices[2];
             v3 = vertices[3];            vertices += 4;
-            
+
             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-            
+
             lo0 = lo0*vLo;
             lo1 = lo1*vLo;
             z = _mm_shuffle_ps(hi0, hi1, 0x88);
@@ -631,17 +631,17 @@ long b3_mindot_large( const float *vv, const float *vec, unsigned long count, fl
             x = x+z;
             stack_array[index+2] = x;
             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
-            
+
             v0 = vertices[0];
             v1 = vertices[1];
             v2 = vertices[2];
             v3 = vertices[3];            vertices += 4;
-            
+
             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-            
+
             lo0 = lo0*vLo;
             lo1 = lo1*vLo;
             z = _mm_shuffle_ps(hi0, hi1, 0x88);
@@ -652,22 +652,22 @@ long b3_mindot_large( const float *vv, const float *vec, unsigned long count, fl
             x = x+z;
             stack_array[index+3] = x;
             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
-            
+
             // It is too costly to keep the index of the min here. We will look for it again later.  We save a lot of work this way.
         }
     }
-    
+
     size_t localCount = (count & -4L) - 4*index;
     if( localCount )
     {
-        
-        
+
+
 #ifdef __APPLE__
         vertices += localCount;      // counter the offset
         float4 t0, t1, t2, t3, t4;
         size_t byteIndex = -(localCount) * sizeof(float);
         float4 * sap = &stack_array[index + localCount / 4];
-        
+
         asm volatile
         (   ".align 4                                                                   \n\
              0: movaps  %[min], %[t2]                            // move min out of the way to avoid propagating NaNs in min \n\
@@ -702,19 +702,19 @@ long b3_mindot_large( const float *vv, const float *vec, unsigned long count, fl
         index += localCount/4;
 #else
         {
-            for( unsigned int i=0; i<localCount/4; i++,index++)   
+            for( unsigned int i=0; i<localCount/4; i++,index++)
             { // do four dot products at a time. Carefully avoid touching the w element.
                 float4 v0 = vertices[0];
                 float4 v1 = vertices[1];
                 float4 v2 = vertices[2];
-                float4 v3 = vertices[3];            
+                float4 v3 = vertices[3];
                 vertices += 4;
-                
+
                 float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
                 float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
                 float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
                 float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
-                
+
                 lo0 = lo0*vLo;
                 lo1 = lo1*vLo;
                 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
@@ -730,7 +730,7 @@ long b3_mindot_large( const float *vv, const float *vec, unsigned long count, fl
 
 #endif
     }
-    
+
     // process the last few points
     if( count & 3 )
     {
@@ -742,7 +742,7 @@ long b3_mindot_large( const float *vv, const float *vec, unsigned long count, fl
                 v0 = vertices[0];
                 v1 = vertices[1];
                 v2 = vertices[2];
-                
+
                 // Calculate 3 dot products, transpose, duplicate v2
                 float4 lo0 = _mm_movelh_ps( v0, v1);        // xyxy.lo
                 float4 hi0 = _mm_movehl_ps( v1, v0);        // z?z?.lo
@@ -785,28 +785,28 @@ long b3_mindot_large( const float *vv, const float *vec, unsigned long count, fl
         min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
         index++;
     }
-    
-    // if we found a new min. 
+
+    // if we found a new min.
     if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
     { // we found a new min. Search for it
       // find min across the min vector, place in all elements of min -- big latency hit here
         min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
         min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));
-        
+
         // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
-        // this where it actually makes a difference is handled in the early out at the top of the function, 
-        // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced 
+        // this where it actually makes a difference is handled in the early out at the top of the function,
+        // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
         // complexity, and removed it.
-        
+
         dotmin = min;
-        
-        // scan for the first occurence of min in the array  
+
+        // scan for the first occurence of min in the array
         size_t test;
         for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ )   // local_count must be a multiple of 4
         {}
         minIndex = 4*index + segment + indexTable[test];
     }
-    
+
     _mm_store_ss( dotResult, dotmin);
     return minIndex;
 }
@@ -835,7 +835,7 @@ static long b3_maxdot_large_sel( const float *vv, const float *vec, unsigned lon
         b3_maxdot_large = _maxdot_large_v1;
     else
         b3_maxdot_large = _maxdot_large_v0;
-    
+
     return b3_maxdot_large(vv, vec, count, dotResult);
 }
 
@@ -845,7 +845,7 @@ static long b3_mindot_large_sel( const float *vv, const float *vec, unsigned lon
         b3_mindot_large = _mindot_large_v1;
     else
         b3_mindot_large = _mindot_large_v0;
-    
+
     return b3_mindot_large(vv, vec, count, dotResult);
 }
 
@@ -874,51 +874,51 @@ long b3_maxdot_large_v0( const float *vv, const float *vec, unsigned long count,
         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
-        
+
         float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
         float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
         float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
         float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
-        
+
         float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
         float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
         float32x2_t zLo = vmul_f32( z0.val[0], vHi);
         float32x2_t zHi = vmul_f32( z1.val[0], vHi);
-        
+
         float32x2_t rLo = vpadd_f32( xy0, xy1);
         float32x2_t rHi = vpadd_f32( xy2, xy3);
         rLo = vadd_f32(rLo, zLo);
         rHi = vadd_f32(rHi, zHi);
-        
+
         uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
         uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
         dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
         dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
         iLo = vbsl_u32(maskLo, indexLo, iLo);
         iHi = vbsl_u32(maskHi, indexHi, iHi);
-        indexLo = vadd_u32(indexLo, four); 
+        indexLo = vadd_u32(indexLo, four);
         indexHi = vadd_u32(indexHi, four);
 
         v0 = vld1q_f32_aligned_postincrement( vv );
         v1 = vld1q_f32_aligned_postincrement( vv );
         v2 = vld1q_f32_aligned_postincrement( vv );
         v3 = vld1q_f32_aligned_postincrement( vv );
-        
+
         xy0 = vmul_f32( vget_low_f32(v0), vLo);
         xy1 = vmul_f32( vget_low_f32(v1), vLo);
         xy2 = vmul_f32( vget_low_f32(v2), vLo);
         xy3 = vmul_f32( vget_low_f32(v3), vLo);
-        
+
         z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
         z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
         zLo = vmul_f32( z0.val[0], vHi);
         zHi = vmul_f32( z1.val[0], vHi);
-        
+
         rLo = vpadd_f32( xy0, xy1);
         rHi = vpadd_f32( xy2, xy3);
         rLo = vadd_f32(rLo, zLo);
         rHi = vadd_f32(rHi, zHi);
-        
+
         maskLo = vcgt_f32( rLo, dotMaxLo );
         maskHi = vcgt_f32( rHi, dotMaxHi );
         dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
@@ -935,22 +935,22 @@ long b3_maxdot_large_v0( const float *vv, const float *vec, unsigned long count,
         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
-        
+
         float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
         float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
         float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
         float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
-        
+
         float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
         float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
         float32x2_t zLo = vmul_f32( z0.val[0], vHi);
         float32x2_t zHi = vmul_f32( z1.val[0], vHi);
-        
+
         float32x2_t rLo = vpadd_f32( xy0, xy1);
         float32x2_t rHi = vpadd_f32( xy2, xy3);
         rLo = vadd_f32(rLo, zLo);
         rHi = vadd_f32(rHi, zHi);
-        
+
         uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
         uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
         dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
@@ -960,7 +960,7 @@ long b3_maxdot_large_v0( const float *vv, const float *vec, unsigned long count,
         indexLo = vadd_u32(indexLo, four);
         indexHi = vadd_u32(indexHi, four);
     }
-    
+
     switch( count & 3 )
     {
         case 3:
@@ -968,20 +968,20 @@ long b3_maxdot_large_v0( const float *vv, const float *vec, unsigned long count,
             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
             float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
-            
+
             float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
             float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
             float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
-            
+
             float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
             float32x2_t zLo = vmul_f32( z0.val[0], vHi);
             float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);
-            
+
             float32x2_t rLo = vpadd_f32( xy0, xy1);
             float32x2_t rHi = vpadd_f32( xy2, xy2);
             rLo = vadd_f32(rLo, zLo);
             rHi = vadd_f32(rHi, zHi);
-            
+
             uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
             uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
             dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
@@ -994,16 +994,16 @@ long b3_maxdot_large_v0( const float *vv, const float *vec, unsigned long count,
         {
             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
-            
+
             float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
             float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
-            
+
             float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
             float32x2_t zLo = vmul_f32( z0.val[0], vHi);
-            
+
             float32x2_t rLo = vpadd_f32( xy0, xy1);
             rLo = vadd_f32(rLo, zLo);
-            
+
             uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
             dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
             iLo = vbsl_u32(maskLo, indexLo, iLo);
@@ -1022,23 +1022,23 @@ long b3_maxdot_large_v0( const float *vv, const float *vec, unsigned long count,
             iLo = vbsl_u32(maskLo, indexLo, iLo);
         }
             break;
-        
+
         default:
             break;
     }
-    
+
     // select best answer between hi and lo results
     uint32x2_t mask = vcgt_f32( dotMaxHi, dotMaxLo );
     dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
     iLo = vbsl_u32(mask, iHi, iLo);
-    
+
     // select best answer between even and odd results
     dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
     iHi = vdup_lane_u32(iLo, 1);
     mask = vcgt_f32( dotMaxHi, dotMaxLo );
     dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
     iLo = vbsl_u32(mask, iHi, iLo);
-    
+
     *dotResult = vget_lane_f32( dotMaxLo, 0);
     return vget_lane_u32(iLo, 0);
 }
@@ -1053,7 +1053,7 @@ long b3_maxdot_large_v1( const float *vv, const float *vec, unsigned long count,
     uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
     uint32x4_t index = (uint32x4_t) { -1, -1, -1, -1 };
     float32x4_t maxDot = (float32x4_t) { -B3_INFINITY, -B3_INFINITY, -B3_INFINITY, -B3_INFINITY };
-    
+
     unsigned long i = 0;
     for( ; i + 8 <= count; i += 8 )
     {
@@ -1061,23 +1061,23 @@ long b3_maxdot_large_v1( const float *vv, const float *vec, unsigned long count,
         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
-        
+
         // the next two lines should resolve to a single vswp d, d
         float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
         float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
         // the next two lines should resolve to a single vswp d, d
         float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
         float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
-        
+
         xy0 = vmulq_f32(xy0, vLo);
         xy1 = vmulq_f32(xy1, vLo);
-        
+
         float32x4x2_t zb = vuzpq_f32( z0, z1);
         float32x4_t z = vmulq_f32( zb.val[0], vHi);
         float32x4x2_t xy = vuzpq_f32( xy0, xy1);
         float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
         x = vaddq_f32(x, z);
-        
+
         uint32x4_t mask = vcgtq_f32(x, maxDot);
         maxDot = vbslq_f32( mask, x, maxDot);
         index = vbslq_u32(mask, local_index, index);
@@ -1087,23 +1087,23 @@ long b3_maxdot_large_v1( const float *vv, const float *vec, unsigned long count,
         v1 = vld1q_f32_aligned_postincrement( vv );
         v2 = vld1q_f32_aligned_postincrement( vv );
         v3 = vld1q_f32_aligned_postincrement( vv );
-        
+
         // the next two lines should resolve to a single vswp d, d
         xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
         xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
         // the next two lines should resolve to a single vswp d, d
         z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
         z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
-        
+
         xy0 = vmulq_f32(xy0, vLo);
         xy1 = vmulq_f32(xy1, vLo);
-        
+
         zb = vuzpq_f32( z0, z1);
         z = vmulq_f32( zb.val[0], vHi);
         xy = vuzpq_f32( xy0, xy1);
         x = vaddq_f32(xy.val[0], xy.val[1]);
         x = vaddq_f32(x, z);
-        
+
         mask = vcgtq_f32(x, maxDot);
         maxDot = vbslq_f32( mask, x, maxDot);
         index = vbslq_u32(mask, local_index, index);
@@ -1123,45 +1123,45 @@ long b3_maxdot_large_v1( const float *vv, const float *vec, unsigned long count,
         // the next two lines should resolve to a single vswp d, d
         float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
         float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
-        
+
         xy0 = vmulq_f32(xy0, vLo);
         xy1 = vmulq_f32(xy1, vLo);
-        
+
         float32x4x2_t zb = vuzpq_f32( z0, z1);
         float32x4_t z = vmulq_f32( zb.val[0], vHi);
         float32x4x2_t xy = vuzpq_f32( xy0, xy1);
         float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
         x = vaddq_f32(x, z);
-        
+
         uint32x4_t mask = vcgtq_f32(x, maxDot);
         maxDot = vbslq_f32( mask, x, maxDot);
         index = vbslq_u32(mask, local_index, index);
         local_index = vaddq_u32(local_index, four);
     }
-    
+
     switch (count & 3) {
         case 3:
         {
             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
             float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
-            
+
             // the next two lines should resolve to a single vswp d, d
             float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
             float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
             // the next two lines should resolve to a single vswp d, d
             float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
             float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));
-            
+
             xy0 = vmulq_f32(xy0, vLo);
             xy1 = vmulq_f32(xy1, vLo);
-            
+
             float32x4x2_t zb = vuzpq_f32( z0, z1);
             float32x4_t z = vmulq_f32( zb.val[0], vHi);
             float32x4x2_t xy = vuzpq_f32( xy0, xy1);
             float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
             x = vaddq_f32(x, z);
-            
+
             uint32x4_t mask = vcgtq_f32(x, maxDot);
             maxDot = vbslq_f32( mask, x, maxDot);
             index = vbslq_u32(mask, local_index, index);
@@ -1173,20 +1173,20 @@ long b3_maxdot_large_v1( const float *vv, const float *vec, unsigned long count,
         {
             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
-            
+
             // the next two lines should resolve to a single vswp d, d
             float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
             // the next two lines should resolve to a single vswp d, d
             float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
-            
+
             xy0 = vmulq_f32(xy0, vLo);
-            
+
             float32x4x2_t zb = vuzpq_f32( z0, z0);
             float32x4_t z = vmulq_f32( zb.val[0], vHi);
             float32x4x2_t xy = vuzpq_f32( xy0, xy0);
             float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
             x = vaddq_f32(x, z);
-            
+
             uint32x4_t mask = vcgtq_f32(x, maxDot);
             maxDot = vbslq_f32( mask, x, maxDot);
             index = vbslq_u32(mask, local_index, index);
@@ -1197,19 +1197,19 @@ long b3_maxdot_large_v1( const float *vv, const float *vec, unsigned long count,
         case 1:
         {
             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
-            
+
             // the next two lines should resolve to a single vswp d, d
             float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
             // the next two lines should resolve to a single vswp d, d
-            float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0); 
-            
+            float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
+
             xy0 = vmulq_f32(xy0, vLo);
-            
+
             z = vmulq_f32( z, vHi);
             float32x4x2_t xy = vuzpq_f32( xy0, xy0);
             float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
             x = vaddq_f32(x, z);
-            
+
             uint32x4_t mask = vcgtq_f32(x, maxDot);
             maxDot = vbslq_f32( mask, x, maxDot);
             index = vbslq_u32(mask, local_index, index);
@@ -1220,23 +1220,23 @@ long b3_maxdot_large_v1( const float *vv, const float *vec, unsigned long count,
         default:
             break;
     }
-    
-    
+
+
     // select best answer between hi and lo results
     uint32x2_t mask = vcgt_f32( vget_high_f32(maxDot), vget_low_f32(maxDot));
     float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
     uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
-    
+
     // select best answer between even and odd results
     float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
     uint32x2_t indexHi = vdup_lane_u32(index2, 1);
     mask = vcgt_f32( maxDotO, maxDot2 );
     maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
     index2 = vbsl_u32(mask, indexHi, index2);
-    
+
     *dotResult = vget_lane_f32( maxDot2, 0);
     return vget_lane_u32(index2, 0);
-    
+
 }
 
 long b3_mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
@@ -1252,29 +1252,29 @@ long b3_mindot_large_v0( const float *vv, const float *vec, unsigned long count,
     uint32x2_t iLo = (uint32x2_t) {-1, -1};
     uint32x2_t iHi = (uint32x2_t) {-1, -1};
     const uint32x2_t four = (uint32x2_t) {4,4};
-    
+
     for( ; i+8 <= count; i+= 8 )
     {
         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
-        
+
         float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
         float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
         float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
         float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
-        
+
         float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
         float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
         float32x2_t zLo = vmul_f32( z0.val[0], vHi);
         float32x2_t zHi = vmul_f32( z1.val[0], vHi);
-        
+
         float32x2_t rLo = vpadd_f32( xy0, xy1);
         float32x2_t rHi = vpadd_f32( xy2, xy3);
         rLo = vadd_f32(rLo, zLo);
         rHi = vadd_f32(rHi, zHi);
-        
+
         uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
         uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
         dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
@@ -1283,27 +1283,27 @@ long b3_mindot_large_v0( const float *vv, const float *vec, unsigned long count,
         iHi = vbsl_u32(maskHi, indexHi, iHi);
         indexLo = vadd_u32(indexLo, four);
         indexHi = vadd_u32(indexHi, four);
-        
+
         v0 = vld1q_f32_aligned_postincrement( vv );
         v1 = vld1q_f32_aligned_postincrement( vv );
         v2 = vld1q_f32_aligned_postincrement( vv );
         v3 = vld1q_f32_aligned_postincrement( vv );
-        
+
         xy0 = vmul_f32( vget_low_f32(v0), vLo);
         xy1 = vmul_f32( vget_low_f32(v1), vLo);
         xy2 = vmul_f32( vget_low_f32(v2), vLo);
         xy3 = vmul_f32( vget_low_f32(v3), vLo);
-        
+
         z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
         z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
         zLo = vmul_f32( z0.val[0], vHi);
         zHi = vmul_f32( z1.val[0], vHi);
-        
+
         rLo = vpadd_f32( xy0, xy1);
         rHi = vpadd_f32( xy2, xy3);
         rLo = vadd_f32(rLo, zLo);
         rHi = vadd_f32(rHi, zHi);
-        
+
         maskLo = vclt_f32( rLo, dotMinLo );
         maskHi = vclt_f32( rHi, dotMinHi );
         dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
@@ -1320,22 +1320,22 @@ long b3_mindot_large_v0( const float *vv, const float *vec, unsigned long count,
         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
-        
+
         float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
         float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
         float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
         float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
-        
+
         float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
         float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
         float32x2_t zLo = vmul_f32( z0.val[0], vHi);
         float32x2_t zHi = vmul_f32( z1.val[0], vHi);
-        
+
         float32x2_t rLo = vpadd_f32( xy0, xy1);
         float32x2_t rHi = vpadd_f32( xy2, xy3);
         rLo = vadd_f32(rLo, zLo);
         rHi = vadd_f32(rHi, zHi);
-        
+
         uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
         uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
         dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
@@ -1352,20 +1352,20 @@ long b3_mindot_large_v0( const float *vv, const float *vec, unsigned long count,
             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
             float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
-            
+
             float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
             float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
             float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
-            
+
             float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
             float32x2_t zLo = vmul_f32( z0.val[0], vHi);
             float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);
-            
+
             float32x2_t rLo = vpadd_f32( xy0, xy1);
             float32x2_t rHi = vpadd_f32( xy2, xy2);
             rLo = vadd_f32(rLo, zLo);
             rHi = vadd_f32(rHi, zHi);
-            
+
             uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
             uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
             dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
@@ -1378,16 +1378,16 @@ long b3_mindot_large_v0( const float *vv, const float *vec, unsigned long count,
         {
             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
-            
+
             float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
             float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
-            
+
             float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
             float32x2_t zLo = vmul_f32( z0.val[0], vHi);
-            
+
             float32x2_t rLo = vpadd_f32( xy0, xy1);
             rLo = vadd_f32(rLo, zLo);
-            
+
             uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
             dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
             iLo = vbsl_u32(maskLo, indexLo, iLo);
@@ -1406,23 +1406,23 @@ long b3_mindot_large_v0( const float *vv, const float *vec, unsigned long count,
             iLo = vbsl_u32(maskLo, indexLo, iLo);
         }
             break;
-            
+
         default:
             break;
     }
-    
+
     // select best answer between hi and lo results
     uint32x2_t mask = vclt_f32( dotMinHi, dotMinLo );
     dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
     iLo = vbsl_u32(mask, iHi, iLo);
-    
+
     // select best answer between even and odd results
     dotMinHi = vdup_lane_f32(dotMinLo, 1);
     iHi = vdup_lane_u32(iLo, 1);
     mask = vclt_f32( dotMinHi, dotMinLo );
     dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
     iLo = vbsl_u32(mask, iHi, iLo);
-    
+
     *dotResult = vget_lane_f32( dotMinLo, 0);
     return vget_lane_u32(iLo, 0);
 }
@@ -1436,7 +1436,7 @@ long b3_mindot_large_v1( const float *vv, const float *vec, unsigned long count,
     uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
     uint32x4_t index = (uint32x4_t) { -1, -1, -1, -1 };
     float32x4_t minDot = (float32x4_t) { B3_INFINITY, B3_INFINITY, B3_INFINITY, B3_INFINITY };
-    
+
     unsigned long i = 0;
     for( ; i + 8 <= count; i += 8 )
     {
@@ -1444,182 +1444,182 @@ long b3_mindot_large_v1( const float *vv, const float *vec, unsigned long count,
         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
-        
+
         // the next two lines should resolve to a single vswp d, d
         float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
         float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
         // the next two lines should resolve to a single vswp d, d
         float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
         float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
-        
+
         xy0 = vmulq_f32(xy0, vLo);
         xy1 = vmulq_f32(xy1, vLo);
-        
+
         float32x4x2_t zb = vuzpq_f32( z0, z1);
         float32x4_t z = vmulq_f32( zb.val[0], vHi);
         float32x4x2_t xy = vuzpq_f32( xy0, xy1);
         float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
         x = vaddq_f32(x, z);
-        
+
         uint32x4_t mask = vcltq_f32(x, minDot);
         minDot = vbslq_f32( mask, x, minDot);
         index = vbslq_u32(mask, local_index, index);
         local_index = vaddq_u32(local_index, four);
-        
+
         v0 = vld1q_f32_aligned_postincrement( vv );
         v1 = vld1q_f32_aligned_postincrement( vv );
         v2 = vld1q_f32_aligned_postincrement( vv );
         v3 = vld1q_f32_aligned_postincrement( vv );
-        
+
         // the next two lines should resolve to a single vswp d, d
         xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
         xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
         // the next two lines should resolve to a single vswp d, d
         z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
         z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
-        
+
         xy0 = vmulq_f32(xy0, vLo);
         xy1 = vmulq_f32(xy1, vLo);
-        
+
         zb = vuzpq_f32( z0, z1);
         z = vmulq_f32( zb.val[0], vHi);
         xy = vuzpq_f32( xy0, xy1);
         x = vaddq_f32(xy.val[0], xy.val[1]);
         x = vaddq_f32(x, z);
-        
+
         mask = vcltq_f32(x, minDot);
         minDot = vbslq_f32( mask, x, minDot);
         index = vbslq_u32(mask, local_index, index);
         local_index = vaddq_u32(local_index, four);
     }
-    
+
     for( ; i + 4 <= count; i += 4 )
     {
         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
-        
+
         // the next two lines should resolve to a single vswp d, d
         float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
         float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
         // the next two lines should resolve to a single vswp d, d
         float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
         float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
-        
+
         xy0 = vmulq_f32(xy0, vLo);
         xy1 = vmulq_f32(xy1, vLo);
-        
+
         float32x4x2_t zb = vuzpq_f32( z0, z1);
         float32x4_t z = vmulq_f32( zb.val[0], vHi);
         float32x4x2_t xy = vuzpq_f32( xy0, xy1);
         float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
         x = vaddq_f32(x, z);
-        
+
         uint32x4_t mask = vcltq_f32(x, minDot);
         minDot = vbslq_f32( mask, x, minDot);
         index = vbslq_u32(mask, local_index, index);
         local_index = vaddq_u32(local_index, four);
     }
-    
+
     switch (count & 3) {
         case 3:
         {
             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
             float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
-            
+
             // the next two lines should resolve to a single vswp d, d
             float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
             float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
             // the next two lines should resolve to a single vswp d, d
             float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
             float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));
-            
+
             xy0 = vmulq_f32(xy0, vLo);
             xy1 = vmulq_f32(xy1, vLo);
-            
+
             float32x4x2_t zb = vuzpq_f32( z0, z1);
             float32x4_t z = vmulq_f32( zb.val[0], vHi);
             float32x4x2_t xy = vuzpq_f32( xy0, xy1);
             float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
             x = vaddq_f32(x, z);
-            
+
             uint32x4_t mask = vcltq_f32(x, minDot);
             minDot = vbslq_f32( mask, x, minDot);
             index = vbslq_u32(mask, local_index, index);
             local_index = vaddq_u32(local_index, four);
         }
             break;
-            
+
         case 2:
         {
             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
-            
+
             // the next two lines should resolve to a single vswp d, d
             float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
             // the next two lines should resolve to a single vswp d, d
             float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
-            
+
             xy0 = vmulq_f32(xy0, vLo);
-            
+
             float32x4x2_t zb = vuzpq_f32( z0, z0);
             float32x4_t z = vmulq_f32( zb.val[0], vHi);
             float32x4x2_t xy = vuzpq_f32( xy0, xy0);
             float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
             x = vaddq_f32(x, z);
-            
+
             uint32x4_t mask = vcltq_f32(x, minDot);
             minDot = vbslq_f32( mask, x, minDot);
             index = vbslq_u32(mask, local_index, index);
             local_index = vaddq_u32(local_index, four);
         }
             break;
-            
+
         case 1:
         {
             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
-            
+
             // the next two lines should resolve to a single vswp d, d
             float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
             // the next two lines should resolve to a single vswp d, d
-            float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0); 
-            
+            float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
+
             xy0 = vmulq_f32(xy0, vLo);
-            
+
             z = vmulq_f32( z, vHi);
             float32x4x2_t xy = vuzpq_f32( xy0, xy0);
             float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
             x = vaddq_f32(x, z);
-            
+
             uint32x4_t mask = vcltq_f32(x, minDot);
             minDot = vbslq_f32( mask, x, minDot);
             index = vbslq_u32(mask, local_index, index);
             local_index = vaddq_u32(local_index, four);
         }
             break;
-            
+
         default:
             break;
     }
-    
-    
+
+
     // select best answer between hi and lo results
     uint32x2_t mask = vclt_f32( vget_high_f32(minDot), vget_low_f32(minDot));
     float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
     uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
-    
+
     // select best answer between even and odd results
     float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
     uint32x2_t indexHi = vdup_lane_u32(index2, 1);
     mask = vclt_f32( minDotO, minDot2 );
     minDot2 = vbsl_f32(mask, minDotO, minDot2);
     index2 = vbsl_u32(mask, indexHi, index2);
-    
+
     *dotResult = vget_lane_f32( minDot2, 0);
     return vget_lane_u32(index2, 0);
-    
+
 }
 
 #else
diff --git a/src/LinearMath/btVector3.cpp b/src/LinearMath/btVector3.cpp
index 80c7055d5..e05bdccd6 100644
--- a/src/LinearMath/btVector3.cpp
+++ b/src/LinearMath/btVector3.cpp
@@ -63,7 +63,7 @@ long _maxdot_large( const float *vv, const float *vec, unsigned long count, floa
     float4 stack_array[ STACK_ARRAY_COUNT ];
     
 #if DEBUG
-    memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
+    //memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
 #endif
     
     size_t index;
@@ -448,7 +448,7 @@ long _mindot_large( const float *vv, const float *vec, unsigned long count, floa
     float4 stack_array[ STACK_ARRAY_COUNT ];
     
 #if DEBUG
-    memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
+    //memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
 #endif
     
     size_t index;