// // Test_v3dot.cpp // BulletTest // // Copyright (c) 2011 Apple Inc. // #include "LinearMath/btScalar.h" #if defined(BT_USE_SSE_IN_API) || defined(BT_USE_NEON) #include "Test_dot3.h" #include "vector.h" #include "Utils.h" #include "main.h" #include #include #include // reference code for testing purposes static btVector3 dot3_ref(const btVector3 &, const btVector3 &, const btVector3 &, const btVector3 &); static btVector3 dot3_ref(const btVector3 &v, const btVector3 &v1, const btVector3 &v2, const btVector3 &v3) { return btVector3(v.dot(v1), v.dot(v2), v.dot(v3)); } /* SIMD_FORCE_INLINE int operator!=(const btVector3 &s, const btVector3 &v) { #ifdef __SSE__ __m128 test = _mm_cmpneq_ps( s.mVec128, v.mVec128 ); return (_mm_movemask_ps( test ) & 7) != 0; #elif defined __ARM_NEON_H uint32x4_t test = vandq_u32( vceqq_f32( s.mVec128, v.mVec128 ), (uint32x4_t){-1,-1,-1,0}); uint32x2_t t = vpadd_u32( vget_low_u32(test), vget_high_u32(test)); t = vpadd_u32(t, t); return -3 != (int32_t) vget_lane_u32(t, 0); #else return s.m_floats[0] != v.m_floats[0] || s.m_floats[1] != v.m_floats[1] || s.m_floats[2] != v.m_floats[2]; #endif } */ #define LOOPCOUNT 1000 #define NUM_CYCLES 10000 int Test_dot3(void) { btVector3 v, v1, v2, v3; #define DATA_SIZE 1024 btVector3 vec3_arr[DATA_SIZE]; btVector3 vec3_arr1[DATA_SIZE]; btVector3 vec3_arr2[DATA_SIZE]; btVector3 vec3_arr3[DATA_SIZE]; btVector3 res_arr[DATA_SIZE]; uint64_t scalarTime; uint64_t vectorTime; size_t j, k; btVector3 correct, test; for (k = 0; k < DATA_SIZE; k++) { vec3_arr[k] = btVector3(btAssign128(RANDF, RANDF, RANDF, BT_NAN)); vec3_arr1[k] = btVector3(btAssign128(RANDF, RANDF, RANDF, BT_NAN)); vec3_arr2[k] = btVector3(btAssign128(RANDF, RANDF, RANDF, BT_NAN)); vec3_arr3[k] = btVector3(btAssign128(RANDF, RANDF, RANDF, BT_NAN)); correct = dot3_ref(vec3_arr[k], vec3_arr1[k], vec3_arr2[k], vec3_arr3[k]); test = vec3_arr[k].dot3(vec3_arr1[k], vec3_arr2[k], vec3_arr3[k]); if (correct != test) { vlog("Error (%ld) - dot3 result error! *{%a, %a, %a, %a} != {%a, %a, %a, %a} \n", k, correct.x(), correct.y(), correct.z(), correct.w(), test.x(), test.y(), test.z(), test.w()); return 1; } } { uint64_t startTime, bestTime, currentTime; bestTime = -1LL; scalarTime = 0; for (j = 0; j < NUM_CYCLES; j++) { startTime = ReadTicks(); for (k = 0; k + 4 <= LOOPCOUNT; k += 4) { size_t k32 = (k & (DATA_SIZE - 1)); res_arr[k32] = dot3_ref(vec3_arr[k32], vec3_arr1[k32], vec3_arr2[k32], vec3_arr3[k32]); k32++; res_arr[k32] = dot3_ref(vec3_arr[k32], vec3_arr1[k32], vec3_arr2[k32], vec3_arr3[k32]); k32++; res_arr[k32] = dot3_ref(vec3_arr[k32], vec3_arr1[k32], vec3_arr2[k32], vec3_arr3[k32]); k32++; res_arr[k32] = dot3_ref(vec3_arr[k32], vec3_arr1[k32], vec3_arr2[k32], vec3_arr3[k32]); } currentTime = ReadTicks() - startTime; scalarTime += currentTime; if (currentTime < bestTime) bestTime = currentTime; } if (0 == gReportAverageTimes) scalarTime = bestTime; else scalarTime /= NUM_CYCLES; } { uint64_t startTime, bestTime, currentTime; bestTime = -1LL; vectorTime = 0; for (j = 0; j < NUM_CYCLES; j++) { startTime = ReadTicks(); for (k = 0; k + 4 <= LOOPCOUNT; k += 4) { size_t k32 = (k & (DATA_SIZE - 1)); res_arr[k32] = vec3_arr[k32].dot3(vec3_arr1[k32], vec3_arr2[k32], vec3_arr3[k32]); k32++; res_arr[k32] = vec3_arr[k32].dot3(vec3_arr1[k32], vec3_arr2[k32], vec3_arr3[k32]); k32++; res_arr[k32] = vec3_arr[k32].dot3(vec3_arr1[k32], vec3_arr2[k32], vec3_arr3[k32]); k32++; res_arr[k32] = vec3_arr[k32].dot3(vec3_arr1[k32], vec3_arr2[k32], vec3_arr3[k32]); } currentTime = ReadTicks() - startTime; vectorTime += currentTime; if (currentTime < bestTime) bestTime = currentTime; } if (0 == gReportAverageTimes) vectorTime = bestTime; else vectorTime /= NUM_CYCLES; } vlog("Timing:\n"); vlog(" \t scalar\t vector\n"); vlog(" \t%10.4f\t%10.4f\n", TicksToCycles(scalarTime) / LOOPCOUNT, TicksToCycles(vectorTime) / LOOPCOUNT); return 0; } #endif //BT_USE_SSE