use newton-rapson iteration for more precise normalize() method

2008-02-28 07:11:29 +00:00
parent e85ffb3323
commit f77a4f1494
3 changed files with 26 additions and 3 deletions
--- a/Extras/vectormathlibrary/include/vectormath/SSE/cpp/mat_aos.h
+++ b/Extras/vectormathlibrary/include/vectormath/SSE/cpp/mat_aos.h
@@ -2060,7 +2060,10 @@ inline Quat::Quat( const Matrix3 & tfrm )
    diagSum = vec_add( vec_add( xx_yy_zz_xx, yy_zz_xx_yy ), zz_xx_yy_zz );
    diagDiff = vec_sub( vec_sub( xx_yy_zz_xx, yy_zz_xx_yy ), zz_xx_yy_zz );
    radicand = vec_add( vec_sel( diagDiff, diagSum, select_w ), _mm_set1_ps(1.0f) );
-    invSqrt = rsqrtf4( radicand );
+ //   invSqrt = rsqrtf4( radicand );
+	invSqrt = newtonrapson_rsqrt4( radicand );
+
+	

    zy_xz_yx = vec_sel( col0, col1, select_z );									// zy_xz_yx = 00 01 12 03
    //zy_xz_yx = vec_perm( zy_xz_yx, col2, _VECTORMATH_PERM_ZAYX );
--- a/Extras/vectormathlibrary/include/vectormath/SSE/cpp/vec_aos.h
+++ b/Extras/vectormathlibrary/include/vectormath/SSE/cpp/vec_aos.h
@@ -595,11 +595,17 @@ inline const floatInVec length( const Vector3 &vec )
    return floatInVec(  _mm_sqrt_ps(_vmathVfDot3( vec.get128(), vec.get128() )), 0 );
 }

-inline const Vector3 normalize( const Vector3 &vec )
+
+inline const Vector3 normalizeApprox( const Vector3 &vec )
 {
    return Vector3( _mm_mul_ps( vec.get128(), _mm_rsqrt_ps( _vmathVfDot3( vec.get128(), vec.get128() ) ) ) );
 }

+inline const Vector3 normalize( const Vector3 &vec )
+{
+	return Vector3( _mm_mul_ps( vec.get128(), newtonrapson_rsqrt4( _vmathVfDot3( vec.get128(), vec.get128() ) ) ) );
+}
+
 inline const Vector3 cross( const Vector3 &vec0, const Vector3 &vec1 )
 {
    return Vector3( _vmathVfCross( vec0.get128(), vec1.get128() ) );
@@ -1021,11 +1027,16 @@ inline const floatInVec length( const Vector4 &vec )
    return floatInVec(  _mm_sqrt_ps(_vmathVfDot4( vec.get128(), vec.get128() )), 0 );
 }

-inline const Vector4 normalize( const Vector4 &vec )
+inline const Vector4 normalizeApprox( const Vector4 &vec )
 {
    return Vector4( _mm_mul_ps( vec.get128(), _mm_rsqrt_ps( _vmathVfDot4( vec.get128(), vec.get128() ) ) ) );
 }

+inline const Vector4 normalize( const Vector4 &vec )
+{
+    return Vector4( _mm_mul_ps( vec.get128(), newtonrapson_rsqrt4( _vmathVfDot4( vec.get128(), vec.get128() ) ) ) );
+}
+
 inline const Vector4 select( const Vector4 &vec0, const Vector4 &vec1, bool select1 )
 {
    return select( vec0, vec1, boolInVec(select1) );
--- a/Extras/vectormathlibrary/include/vectormath/SSE/cpp/vectormath_aos.h
+++ b/Extras/vectormathlibrary/include/vectormath/SSE/cpp/vectormath_aos.h
@@ -153,6 +153,15 @@ static inline __m128 vec_ctf(__m128 x, int a)
 #define recipf4(x) _mm_rcp_ps( x )
 #define negatef4(x) _mm_sub_ps( _mm_setzero_ps(), x )

+static __forceinline __m128 newtonrapson_rsqrt4( const __m128 v )
+{   
+#define _half4 _mm_setr_ps(.5f,.5f,.5f,.5f) 
+#define _three _mm_setr_ps(3.f,3.f,3.f,3.f)
+const __m128 approx = _mm_rsqrt_ps( v );   
+const __m128 muls = _mm_mul_ps(_mm_mul_ps(v, approx), approx);   
+return _mm_mul_ps(_mm_mul_ps(_half4, approx), _mm_sub_ps(_three, muls) );
+}
+
 static inline __m128 acosf4(__m128 x)
 {
    __m128 xabs = fabsf4(x);