Add PhysicsEffects to Extras. The build is only tested on Windows and Android.
The Android/NEON optimized version of Physics Effects is thanks to Graham Rhodes and Anthony Hamilton, See Issue 587
This commit is contained in:
@@ -0,0 +1,146 @@
|
||||
@
|
||||
@ Applied Research Associates Inc. (c)2011
|
||||
@
|
||||
@ Redistribution and use in source and binary forms,
|
||||
@ with or without modification, are permitted provided that the
|
||||
@ following conditions are met:
|
||||
@ * Redistributions of source code must retain the above copyright
|
||||
@ notice, this list of conditions and the following disclaimer.
|
||||
@ * Redistributions in binary form must reproduce the above copyright
|
||||
@ notice, this list of conditions and the following disclaimer in the
|
||||
@ documentation and/or other materials provided with the distribution.
|
||||
@ * Neither the name of the Applied Research Associates Inc nor the names
|
||||
@ of its contributors may be used to endorse or promote products derived
|
||||
@ from this software without specific prior written permission.
|
||||
@
|
||||
@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
@ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
@ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
@ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
@ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
@ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
@ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
@ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
@ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
@ POSSIBILITY OF SUCH DAMAGE.
|
||||
@
|
||||
.syntax unified
|
||||
.arch armv7-a
|
||||
.fpu neon
|
||||
.thumb
|
||||
.text
|
||||
.align 2
|
||||
|
||||
@ cross product, result stored directly to memory
|
||||
|
||||
.global CrossProductNeonResultInMemoryAssembly
|
||||
.thumb_func
|
||||
CrossProductNeonResultInMemoryAssembly:
|
||||
.fnstart
|
||||
vld1.32 {d18,d19}, [r1] @ input <x2,y2,z2,w2> = d18,d19
|
||||
vld1.32 {d16,d17}, [r0] @ input <x1,y1,z1,w1> = d16,d17
|
||||
@ rearrange inputs into convenient order
|
||||
vtrn.32 d18,d19 @ q9 = <x2,z2,y2,w2> = d18,d19
|
||||
vrev64.32 d16,d16 @ q8 = <y1,x1,z1,w1> = d16,d17
|
||||
vrev64.32 d18,d18 @ q9 = <z2,x2,y2,w2> = d18,d19
|
||||
vtrn.32 d16,d17 @ q8 = <y1,z1,x1,w1> = d16,d17
|
||||
@ perform first half of cross product using rearranged inputs
|
||||
vmul.f32 q10, q8, q9 @ q10 = <y1*z2,z1*x2,x1*y2,w1*w2>
|
||||
@ rearrange inputs again
|
||||
vtrn.32 d18,d19 @ q9 = <z2,y2,x2,w2> = d18,d19
|
||||
vrev64.32 d16,d16 @ q8 = <z1,y1,x1,w1> = d16,d17
|
||||
vrev64.32 d18,d18 @ q9 = <y2,z2,x2,w2> = d18,d19
|
||||
vtrn.32 d16,d17 @ q8 = <z1,x1,y1,w1> = d16,d17
|
||||
@ perform last half of cross product using rearranged inputs
|
||||
vmls.f32 q10, q8, q9 @ q10 = <y1*z2-y2*z1,z1*x2-z2*x1,x1*y2-x2*y1,w1*w2-w2*w1>
|
||||
@ and store the result
|
||||
vst1.32 {q10}, [r2]
|
||||
bx lr
|
||||
.fnend
|
||||
|
||||
@ cross product, result stored directly to memory, version 2
|
||||
|
||||
.global CrossProductNeonResultInMemoryAssembly2
|
||||
.thumb_func
|
||||
CrossProductNeonResultInMemoryAssembly2:
|
||||
.fnstart
|
||||
vld1.32 {d18[1]}, [r1]!
|
||||
vld1.32 {d19[0]}, [r1]!
|
||||
vld1.32 {d18[0]}, [r1]!
|
||||
vld1.32 {d19[1]}, [r1] @ q9 = <z2,x2,y2,w2> = d18,d19
|
||||
vld1.32 {d17[0]}, [r0]!
|
||||
vld1.32 {d16}, [r0]!
|
||||
vld1.32 {d17[1]}, [r0] @ q8 = <y1,z1,x1,w1> = d16,d17
|
||||
@ perform first half of cross product using rearranged inputs
|
||||
vmul.f32 q10, q8, q9 @ q10 = <y1*z2,z1*x2,x1*y2,w1*w2>
|
||||
@ rearrange inputs for second half
|
||||
vtrn.32 d18,d19 @ q9 = <z2,y2,x2,w2> = d18,d19
|
||||
vrev64.32 d16,d16 @ q8 = <z1,y1,x1,w1> = d16,d17
|
||||
vrev64.32 d18,d18 @ q9 = <y2,z2,x2,w2> = d18,d19
|
||||
vtrn.32 d16,d17 @ q8 = <z1,x1,y1,w1> = d16,d17
|
||||
@ perform last half of cross product using rearranged inputs
|
||||
vmls.f32 q10, q8, q9 @ q10 = <y1*z2-y2*z1,z1*x2-z2*x1,x1*y2-x2*y1,w1*w2-w2*w1>
|
||||
@ and store the result
|
||||
vst1.32 {q10}, [r2]
|
||||
bx lr
|
||||
.fnend
|
||||
|
||||
.global CrossProductNeonResultInMemoryAssembly3
|
||||
.thumb_func
|
||||
CrossProductNeonResultInMemoryAssembly3:
|
||||
.fnstart
|
||||
vld1.32 {d18[1]}, [r1]!
|
||||
vld1.32 {d19[0]}, [r1]!
|
||||
vld1.32 {d18[0]}, [r1]!
|
||||
vld1.32 {d19[1]}, [r1]! @ q9 = <z2,x2,y2,w2> = d18,d19
|
||||
vld1.32 {d17[0]}, [r0]!
|
||||
vld1.32 {d16}, [r0]!
|
||||
vld1.32 {d17[1]}, [r0]! @ q8 = <y1,z1,x1,w1> = d16,d17
|
||||
@ perform first half of cross product using rearranged inputs
|
||||
vmul.f32 q10, q8, q9 @ q10 = <y1*z2,z1*x2,x1*y2,w1*w2>
|
||||
vld1.32 {d18[1]}, [r0]!
|
||||
vld1.32 {d19[0]}, [r0]!
|
||||
vld1.32 {d18[0]}, [r0]!
|
||||
vld1.32 {d19[1]}, [r0]! @ q9 = <z2,x2,y2,w2> = d18,d19
|
||||
vld1.32 {d17[0]}, [r1]!
|
||||
vld1.32 {d16}, [r1]!
|
||||
vld1.32 {d17[1]}, [r1]! @ q8 = <y1,z1,x1,w1> = d16,d17
|
||||
@ perform last half of cross product using rearranged inputs
|
||||
vmls.f32 q10, q8, q9 @ q10 = <y1*z2-y2*z1,z1*x2-z2*x1,x1*y2-x2*y1,w1*w2-w2*w1>
|
||||
@ and store the result
|
||||
vst1.32 {q10}, [r2]
|
||||
bx lr
|
||||
.fnend
|
||||
|
||||
@ cross product, result stored directly to memory, version 3
|
||||
|
||||
@ .global CrossProductNeonResultInMemoryAssembly3
|
||||
@ .thumb_func
|
||||
@CrossProductNeonResultInMemoryAssembly3:
|
||||
@ .fnstart
|
||||
@ ldr r3, table
|
||||
@ vld1.32 {d18,d19}, [r1] @ input <x2,y2,z2,w2> = d18,d19
|
||||
@ vld1.32 {d16,d17}, [r0] @ input <x1,y1,z1,w1> = d16,d17
|
||||
@ rearrange inputs into convenient order
|
||||
@ vld1.32 {q10}, [r3]!
|
||||
@ vld1.32 {q11}, [r3]
|
||||
@ vtbl.8 q12, {d18,d19}, q10
|
||||
@ vtbl.8 q13, {d16,d17}, q11
|
||||
@ perform first half of cross product using rearranged inputs
|
||||
@ vmul.f32 q14, q12, q13
|
||||
@ rearrange inputs again
|
||||
@ vtbl.8 q12, {d18,d19}, q11
|
||||
@ vtbl.8 q13, {d16,d17}, q10
|
||||
@ perform last half of cross product using rearranged inputs
|
||||
@ vmls.f32 q14, q12, q13
|
||||
@ and store the result
|
||||
@ vst1.32 {q14}, [r2]
|
||||
@ bx lr
|
||||
@ .fnend
|
||||
|
||||
@table:
|
||||
@z2,x2,y2,w2
|
||||
@ dcb 08,09,10,11,00,01,02,03,04,05,06,07,00,00,00,00
|
||||
@y1,z1,x1,w1
|
||||
@ dcb 04,05,06,07,08,09,10,11,00,01,02,03,00,00,00,00
|
||||
@@ -0,0 +1,98 @@
|
||||
@
|
||||
@ Applied Research Associates Inc. (c)2011
|
||||
@
|
||||
@ Redistribution and use in source and binary forms,
|
||||
@ with or without modification, are permitted provided that the
|
||||
@ following conditions are met:
|
||||
@ * Redistributions of source code must retain the above copyright
|
||||
@ notice, this list of conditions and the following disclaimer.
|
||||
@ * Redistributions in binary form must reproduce the above copyright
|
||||
@ notice, this list of conditions and the following disclaimer in the
|
||||
@ documentation and/or other materials provided with the distribution.
|
||||
@ * Neither the name of the Applied Research Associates Inc nor the names
|
||||
@ of its contributors may be used to endorse or promote products derived
|
||||
@ from this software without specific prior written permission.
|
||||
@
|
||||
@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
@ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
@ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
@ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
@ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
@ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
@ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
@ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
@ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
@ POSSIBILITY OF SUCH DAMAGE.
|
||||
@
|
||||
.syntax unified
|
||||
.arch armv7-a
|
||||
.fpu neon
|
||||
.thumb
|
||||
.text
|
||||
.align 2
|
||||
|
||||
@ dot product, result returned in register
|
||||
|
||||
.global DotProductNeonResultInRegisterAssembly
|
||||
.thumb_func
|
||||
DotProductNeonResultInRegisterAssembly:
|
||||
.fnstart
|
||||
vld1.32 {d16,d17}, [r0] @ input <x1,y1,z1,w1>
|
||||
vld1.32 {d18,d19}, [r1] @ input <x2,y2,z2,w2>
|
||||
vmul.f32 d14, d16, d18 @ d14=<x1*x2,y1*y2>
|
||||
vmla.f32 d14, d17, d19 @ d14=d14+<z1*z2,w1*w2>=<x1*x2+z1*z2,y1*y2+w1*w2>
|
||||
vpadd.f32 d14, d14, d14 @ add the two halves of d14 together, result in each lane
|
||||
vmov.f32 r0, s28 @ s28 aliases the first lane of d14
|
||||
bx lr
|
||||
.fnend
|
||||
|
||||
@ dot product, result stored directly to memory
|
||||
|
||||
.global DotProductNeonResultInMemoryAssembly
|
||||
.thumb_func
|
||||
DotProductNeonResultInMemoryAssembly:
|
||||
.fnstart
|
||||
vld1.32 {d16,d17}, [r0] @ input <x1,y1,z1,w1>
|
||||
vld1.32 {d18,d19}, [r1] @ input <x2,y2,z2,w2>
|
||||
vmul.f32 d14, d16, d18 @ d14=<x1*x2,y1*y2>
|
||||
@ non-fused multiple accumulate
|
||||
vmla.f32 d14, d17, d19 @ d14=d14+<z1*z2,w1*w2>=<x1*x2+z1*z2,y1*y2+w1*w2>
|
||||
@ fused multiple accumulate - not recognized by GNU assembler
|
||||
@@ vfma.f32 {d14}, d17, d19 @ d14=d14+<z1*z2,w1*w2>=<x1*x2+z1*z2,y1*y2+w1*w2>
|
||||
vpadd.f32 d14, d14, d14 @ add the two halves of d14 together, same result in each lane
|
||||
vst1.32 {d14}, [r2]
|
||||
bx lr
|
||||
.fnend
|
||||
|
||||
@ dot product, result stored directly to memory, all inputs are contiguous in memory
|
||||
|
||||
.global DotProductNeonResultInMemoryAssembly2
|
||||
.thumb_func
|
||||
DotProductNeonResultInMemoryAssembly2:
|
||||
.fnstart
|
||||
vld1.32 {d16-d19}, [r0] @ input q8=<x1,y1,z1,w1>, q9=<x2,y2,z2,w2>
|
||||
vmul.f32 d14, d16, d18 @ d14=<x1*x2,y1*y2>
|
||||
@ non-fused multiple accumulate
|
||||
vmla.f32 d14, d17, d19 @ d14=d14+<z1*z2,w1*w2>=<x1*x2+z1*z2,y1*y2+w1*w2>
|
||||
@ fused multiple accumulate - not recognized by GNU assembler
|
||||
@@ vfma.f32 d14, d17, d19 @ d14=d14+<z1*z2,w1*w2>=<x1*x2+z1*z2,y1*y2+w1*w2>
|
||||
vpadd.f32 d14, d14, d14 @ add the two halves of d14 together, same result in each lane
|
||||
vst1.32 {d14}, [r1]
|
||||
bx lr
|
||||
.fnend
|
||||
|
||||
@ same as DotProductNeonResultInMemoryAssembly2, but alternate methodology to compute result
|
||||
|
||||
.global DotProductNeonResultInMemoryAssembly2b
|
||||
.thumb_func
|
||||
DotProductNeonResultInMemoryAssembly2b:
|
||||
.fnstart
|
||||
vld1.32 {d14-d17}, [r0] @ input q7=<x1,y1,z1,w1>, q8=<x2,y2,z2,w2>
|
||||
vmul.f32 q7, q7, q8 @ q7 = <x1*x2,y1*y2,z1*z2,w1*w2> = d14,d15 = s28,s29,s30,s31
|
||||
vpadd.f32 d14,d14,d14 @ add elements of d14 together=<x1*x2+y1*y2,x1*x2+y1*y2>
|
||||
vpadd.f32 d15,d15,d15 @ add elements of d14 together=<z1*z2+w1*w2,z1*z2+w1*w2>
|
||||
vadd.f32 d14,d14,d15 @ add d14 and d15 together, final result in both lanes
|
||||
vst1.32 {d14}, [r1]
|
||||
bx lr
|
||||
.fnend
|
||||
@@ -0,0 +1,62 @@
|
||||
@
|
||||
@ Applied Research Associates Inc. (c)2011
|
||||
@
|
||||
@ Redistribution and use in source and binary forms,
|
||||
@ with or without modification, are permitted provided that the
|
||||
@ following conditions are met:
|
||||
@ * Redistributions of source code must retain the above copyright
|
||||
@ notice, this list of conditions and the following disclaimer.
|
||||
@ * Redistributions in binary form must reproduce the above copyright
|
||||
@ notice, this list of conditions and the following disclaimer in the
|
||||
@ documentation and/or other materials provided with the distribution.
|
||||
@ * Neither the name of the Applied Research Associates Inc nor the names
|
||||
@ of its contributors may be used to endorse or promote products derived
|
||||
@ from this software without specific prior written permission.
|
||||
@
|
||||
@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
@ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
@ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
@ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
@ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
@ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
@ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
@ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
@ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
@ POSSIBILITY OF SUCH DAMAGE.
|
||||
@
|
||||
.syntax unified
|
||||
.arch armv7-a
|
||||
.fpu neon
|
||||
.thumb
|
||||
.text
|
||||
.align 2
|
||||
|
||||
@ matrix3 operator *, result stored directly to memory
|
||||
|
||||
.global Matrix3OperatorMultiplyNeon
|
||||
.thumb_func
|
||||
Matrix3OperatorMultiplyNeon:
|
||||
.fnstart
|
||||
vld1.32 {d16-d19}, [r0]! @ load first eight elements of matrix 0
|
||||
vld1.32 {d20-d21}, [r0] @ load second eight elements of matrix 0
|
||||
vld1.32 {d0-d3}, [r1]! @ load first eight elements of matrix 1
|
||||
vld1.32 {d4-d5}, [r1] @ load second eight elements of matrix 1
|
||||
|
||||
vmul.f32 q12, q8, d0[0] @ rslt col0 = (mat0 col0) * (mat1 col0 elt0)
|
||||
vmul.f32 q13, q8, d2[0] @ rslt col1 = (mat0 col0) * (mat1 col1 elt0)
|
||||
vmul.f32 q14, q8, d4[0] @ rslt col2 = (mat0 col0) * (mat1 col2 elt0)
|
||||
|
||||
vmla.f32 q12, q9, d0[1] @ rslt col0 += (mat0 col1) * (mat1 col0 elt1)
|
||||
vmla.f32 q13, q9, d2[1] @ rslt col1 += (mat0 col1) * (mat1 col1 elt1)
|
||||
vmla.f32 q14, q9, d4[1] @ rslt col2 += (mat0 col1) * (mat1 col2 elt1)
|
||||
|
||||
vmla.f32 q12, q10, d1[0] @ rslt col0 += (mat0 col2) * (mat1 col0 elt2)
|
||||
vmla.f32 q13, q10, d3[0] @ rslt col1 += (mat0 col2) * (mat1 col1 elt2)
|
||||
vmla.f32 q14, q10, d5[0] @ rslt col2 += (mat0 col2) * (mat1 col2 elt2)
|
||||
|
||||
vst1.32 {d24-d27}, [r2]! @ store first eight elements of result
|
||||
vst1.32 {d28-d29}, [r2] @ store second eight elements of result
|
||||
|
||||
bx lr
|
||||
.fnend
|
||||
@@ -0,0 +1,71 @@
|
||||
@
|
||||
@ Applied Research Associates Inc. (c)2011
|
||||
@
|
||||
@ Redistribution and use in source and binary forms,
|
||||
@ with or without modification, are permitted provided that the
|
||||
@ following conditions are met:
|
||||
@ * Redistributions of source code must retain the above copyright
|
||||
@ notice, this list of conditions and the following disclaimer.
|
||||
@ * Redistributions in binary form must reproduce the above copyright
|
||||
@ notice, this list of conditions and the following disclaimer in the
|
||||
@ documentation and/or other materials provided with the distribution.
|
||||
@ * Neither the name of the Applied Research Associates Inc nor the names
|
||||
@ of its contributors may be used to endorse or promote products derived
|
||||
@ from this software without specific prior written permission.
|
||||
@
|
||||
@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
@ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
@ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
@ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
@ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
@ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
@ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
@ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
@ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
@ POSSIBILITY OF SUCH DAMAGE.
|
||||
@
|
||||
.syntax unified
|
||||
.arch armv7-a
|
||||
.fpu neon
|
||||
.thumb
|
||||
.text
|
||||
.align 2
|
||||
|
||||
@ matrix4 operator *, result stored directly to memory
|
||||
|
||||
.global Matrix4OperatorMultiplyNeon
|
||||
.thumb_func
|
||||
Matrix4OperatorMultiplyNeon:
|
||||
.fnstart
|
||||
|
||||
vld1.32 {d16-d19}, [r0]! @ load first eight elements of matrix 0
|
||||
vld1.32 {d20-d23}, [r0] @ load second eight elements of matrix 0
|
||||
vld1.32 {d0-d3}, [r1]! @ load first eight elements of matrix 1
|
||||
vld1.32 {d4-d7}, [r1] @ load second eight elements of matrix 1
|
||||
|
||||
vmul.f32 q12, q8, d0[0] @ rslt col0 = (mat0 col0) * (mat1 col0 elt0)
|
||||
vmul.f32 q13, q8, d2[0] @ rslt col1 = (mat0 col0) * (mat1 col1 elt0)
|
||||
vmul.f32 q14, q8, d4[0] @ rslt col2 = (mat0 col0) * (mat1 col2 elt0)
|
||||
vmul.f32 q15, q8, d6[0] @ rslt col3 = (mat0 col0) * (mat1 col3 elt0)
|
||||
|
||||
vmla.f32 q12, q9, d0[1] @ rslt col0 += (mat0 col1) * (mat1 col0 elt1)
|
||||
vmla.f32 q13, q9, d2[1] @ rslt col1 += (mat0 col1) * (mat1 col1 elt1)
|
||||
vmla.f32 q14, q9, d4[1] @ rslt col2 += (mat0 col1) * (mat1 col2 elt1)
|
||||
vmla.f32 q15, q9, d6[1] @ rslt col3 += (mat0 col1) * (mat1 col3 elt1)
|
||||
|
||||
vmla.f32 q12, q10, d1[0] @ rslt col0 += (mat0 col2) * (mat1 col0 elt2)
|
||||
vmla.f32 q13, q10, d3[0] @ rslt col1 += (mat0 col2) * (mat1 col1 elt2)
|
||||
vmla.f32 q14, q10, d5[0] @ rslt col2 += (mat0 col2) * (mat1 col2 elt2)
|
||||
vmla.f32 q15, q10, d7[0] @ rslt col3 += (mat0 col2) * (mat1 col2 elt2)
|
||||
|
||||
vmla.f32 q12, q11, d1[1] @ rslt col0 += (mat0 col3) * (mat1 col0 elt3)
|
||||
vmla.f32 q13, q11, d3[1] @ rslt col1 += (mat0 col3) * (mat1 col1 elt3)
|
||||
vmla.f32 q14, q11, d5[1] @ rslt col2 += (mat0 col3) * (mat1 col2 elt3)
|
||||
vmla.f32 q15, q11, d7[1] @ rslt col3 += (mat0 col3) * (mat1 col3 elt3)
|
||||
|
||||
vst1.32 {d24-d27}, [r2]! @ store first eight elements of result
|
||||
vst1.32 {d28-d31}, [r2] @ store second eight elements of result
|
||||
|
||||
bx lr
|
||||
.fnend
|
||||
@@ -0,0 +1,81 @@
|
||||
@
|
||||
@ Applied Research Associates Inc. (c)2011
|
||||
@
|
||||
@ Redistribution and use in source and binary forms,
|
||||
@ with or without modification, are permitted provided that the
|
||||
@ following conditions are met:
|
||||
@ * Redistributions of source code must retain the above copyright
|
||||
@ notice, this list of conditions and the following disclaimer.
|
||||
@ * Redistributions in binary form must reproduce the above copyright
|
||||
@ notice, this list of conditions and the following disclaimer in the
|
||||
@ documentation and/or other materials provided with the distribution.
|
||||
@ * Neither the name of the Applied Research Associates Inc nor the names
|
||||
@ of its contributors may be used to endorse or promote products derived
|
||||
@ from this software without specific prior written permission.
|
||||
@
|
||||
@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
@ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
@ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
@ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
@ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
@ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
@ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
@ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
@ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
@ POSSIBILITY OF SUCH DAMAGE.
|
||||
@
|
||||
.syntax unified
|
||||
.arch armv7-a
|
||||
.fpu neon
|
||||
.thumb
|
||||
.text
|
||||
.align 2
|
||||
|
||||
@ orthoInverse of a transform3, result stored directly to memory
|
||||
|
||||
.global OrthoInverseTransform3Neon
|
||||
.thumb_func
|
||||
OrthoInverseTransform3Neon:
|
||||
.fnstart
|
||||
@way 1
|
||||
@ vld1.f32 {d0-d3}, [r0]! @ load first eight elements of result
|
||||
@ vld1.f32 {d4-d7}, [r0] @ load second eight elements of result
|
||||
|
||||
@ vzip.f32 d0, d2
|
||||
@ vzip.f32 d1, d3
|
||||
@ vswp.f32 d1, d4
|
||||
@ vzip.f32 d1, d3
|
||||
|
||||
@way 2 (faster)
|
||||
@
|
||||
@
|
||||
@ NOTE NOTE! This is broken! It does not use the last column of the input! Please see
|
||||
@ implementation of pfxTransform3OrthoInverseNEON in vectormath_neon_assembly_implementations.S, in
|
||||
@ include/vecmath/neon, for a correct implementation!
|
||||
@
|
||||
vld1.f32 d0[0], [r0]!
|
||||
vld1.f32 d2[0], [r0]!
|
||||
vld1.f32 d4[0], [r0]!
|
||||
vld1.f32 d1[1], [r0]!
|
||||
|
||||
vld1.f32 d0[1], [r0]!
|
||||
vld1.f32 d2[1], [r0]!
|
||||
vld1.f32 d4[1], [r0]!
|
||||
vld1.f32 d3[1], [r0]!
|
||||
|
||||
vld1.f32 d1[0], [r0]!
|
||||
vld1.f32 d3[0], [r0]!
|
||||
vld1.f32 d5[0], [r0]!
|
||||
vld1.f32 d5[1], [r0]
|
||||
|
||||
vmul.f32 q3, q0, d1[0]
|
||||
vmla.f32 q3, q1, d3[0]
|
||||
vmla.f32 q3, q2, d5[0]
|
||||
vneg.f32 q3, q3
|
||||
|
||||
vst1.f32 {d0-d3}, [r1]! @ store first eight elements of result
|
||||
vst1.f32 {d4-d7}, [r1] @ store second eight elements of result
|
||||
|
||||
bx lr
|
||||
.fnend
|
||||
@@ -0,0 +1,51 @@
|
||||
@
|
||||
@ Applied Research Associates Inc. (c)2011
|
||||
@
|
||||
@ Redistribution and use in source and binary forms,
|
||||
@ with or without modification, are permitted provided that the
|
||||
@ following conditions are met:
|
||||
@ * Redistributions of source code must retain the above copyright
|
||||
@ notice, this list of conditions and the following disclaimer.
|
||||
@ * Redistributions in binary form must reproduce the above copyright
|
||||
@ notice, this list of conditions and the following disclaimer in the
|
||||
@ documentation and/or other materials provided with the distribution.
|
||||
@ * Neither the name of the Applied Research Associates Inc nor the names
|
||||
@ of its contributors may be used to endorse or promote products derived
|
||||
@ from this software without specific prior written permission.
|
||||
@
|
||||
@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
@ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
@ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
@ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
@ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
@ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
@ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
@ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
@ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
@ POSSIBILITY OF SUCH DAMAGE.
|
||||
@
|
||||
.syntax unified
|
||||
.arch armv7-a
|
||||
.fpu neon
|
||||
.thumb
|
||||
.text
|
||||
.align 2
|
||||
|
||||
@ transform3 operator *, result stored directly to memory
|
||||
|
||||
.global Transform3OperatorMultiplyNeon
|
||||
.thumb_func
|
||||
Transform3OperatorMultiplyNeon:
|
||||
.fnstart
|
||||
|
||||
vld1.32 {d16-d19}, [r0]! @ load first eight elements of transform matrix
|
||||
vld1.32 {d20-d21}, [r0] @ load second eight elements of transform matrix
|
||||
vld1.32 {d0-d1}, [r1] @ load the four elements of vector3
|
||||
vmul.f32 q12, q8, d0[0] @ rslt col0 = (mat0 col0) * (mat1 col0 elt0)
|
||||
vmla.f32 q12, q9, d0[1] @ rslt col0 += (mat0 col1) * (mat1 col0 elt1)
|
||||
vmla.f32 q12, q10, d1[0] @ rslt col0 += (mat0 col2) * (mat1 col0 elt2)
|
||||
vst1.32 {d24-d25}, [r2] @ store first eight elements of result
|
||||
|
||||
bx lr
|
||||
.fnend
|
||||
@@ -0,0 +1,81 @@
|
||||
@
|
||||
@ Applied Research Associates Inc. (c)2011
|
||||
@
|
||||
@ Redistribution and use in source and binary forms,
|
||||
@ with or without modification, are permitted provided that the
|
||||
@ following conditions are met:
|
||||
@ * Redistributions of source code must retain the above copyright
|
||||
@ notice, this list of conditions and the following disclaimer.
|
||||
@ * Redistributions in binary form must reproduce the above copyright
|
||||
@ notice, this list of conditions and the following disclaimer in the
|
||||
@ documentation and/or other materials provided with the distribution.
|
||||
@ * Neither the name of the Applied Research Associates Inc nor the names
|
||||
@ of its contributors may be used to endorse or promote products derived
|
||||
@ from this software without specific prior written permission.
|
||||
@
|
||||
@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
@ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
@ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
@ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
@ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
@ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
@ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
@ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
@ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
@ POSSIBILITY OF SUCH DAMAGE.
|
||||
@
|
||||
.syntax unified
|
||||
.arch armv7-a
|
||||
.fpu neon
|
||||
.thumb
|
||||
.text
|
||||
.align 2
|
||||
|
||||
@ transpose a matrix3, result stored directly to memory
|
||||
|
||||
.global TransposeMatrix3Neon
|
||||
.thumb_func
|
||||
TransposeMatrix3Neon:
|
||||
.fnstart
|
||||
|
||||
@way1
|
||||
@ vld1.f32 d0[0], [r0]!
|
||||
@ vld1.f32 d2[0], [r0]!
|
||||
@ vld1.f32 d4[0], [r0]!
|
||||
@ vld1.f32 d1[1], [r0]!
|
||||
|
||||
@ vld1.f32 d0[1], [r0]!
|
||||
@ vld1.f32 d2[1], [r0]!
|
||||
@ vld1.f32 d4[1], [r0]!
|
||||
@ vld1.f32 d3[1], [r0]!
|
||||
|
||||
@ vld1.f32 d1[0], [r0]!
|
||||
@ vld1.f32 d3[0], [r0]!
|
||||
@ vld1.f32 d5[0], [r0]!
|
||||
@ vld1.f32 d5[1], [r0]
|
||||
|
||||
@way2
|
||||
@ vld2.f32 {d0-d1}, [r0]!
|
||||
@ vld2.f32 {d2-d3}, [r0]!
|
||||
@ vld2.f32 {d4-d5}, [r0]
|
||||
|
||||
@ vzip.f32 q0, q1
|
||||
@ vswp.f32 d1, d4
|
||||
@ vswp.f32 d3, d5
|
||||
@ vzip.f32 d1, d5
|
||||
|
||||
@way3 (Fastest)
|
||||
vld1.f32 {d0-d3}, [r0]! @ store first eight elements of result
|
||||
vld1.f32 {d4-d5}, [r0] @ store second eight elements of result
|
||||
|
||||
vzip.f32 d0, d2
|
||||
vzip.f32 d1, d3
|
||||
vswp.f32 d1, d4
|
||||
vzip.f32 d1, d3
|
||||
|
||||
vst1.f32 {d0-d3}, [r1]! @ store first eight elements of result
|
||||
vst1.f32 {d4-d5}, [r1] @ store second eight elements of result
|
||||
|
||||
bx lr
|
||||
.fnend
|
||||
@@ -0,0 +1,43 @@
|
||||
/*
|
||||
Applied Research Associates Inc. (c)2011
|
||||
|
||||
Redistribution and use in source and binary forms,
|
||||
with or without modification, are permitted provided that the
|
||||
following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the Applied Research Associates Inc nor the names
|
||||
of its contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef TEST_NEON_H
|
||||
#define TEST_NEON_H
|
||||
|
||||
//Neon functions to test
|
||||
void TestNeonDotProduct();
|
||||
void TestNeonCrossProduct();
|
||||
void TestNeonMatrix3OperatorMultiply();
|
||||
void TestNeonMatrix4OperatorMultiply();
|
||||
void TestNeonOrthoInverseTransform3();
|
||||
void TestNeonTransform3OperatorMultiply();
|
||||
void TestNeonTransposeMatrix3();
|
||||
void TestNeonSolveLinearConstraintRow();
|
||||
|
||||
#endif // TEST_NEON_H
|
||||
@@ -0,0 +1,229 @@
|
||||
/*
|
||||
Applied Research Associates Inc. (c)2011
|
||||
|
||||
Redistribution and use in source and binary forms,
|
||||
with or without modification, are permitted provided that the
|
||||
following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the Applied Research Associates Inc nor the names
|
||||
of its contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "test_neon.h"
|
||||
#define SCE_PFX_USE_PERFCOUNTER
|
||||
#include "physics_effects.h"
|
||||
#include <arm_neon.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <android/log.h>
|
||||
|
||||
// This works with gcc
|
||||
#define SET_ALIGNMENT(alignment) __attribute__((__aligned__((alignment))))
|
||||
|
||||
//#define PRINT_COMPUTED_VECTOR_RESULTS
|
||||
|
||||
// assembly implementations
|
||||
extern "C"
|
||||
{
|
||||
void CrossProductNeonResultInMemoryAssembly(float *a, float *b, float *pfResult);
|
||||
void CrossProductNeonResultInMemoryAssembly2(float *a, float *b, float *pfResult);
|
||||
void CrossProductNeonResultInMemoryAssembly3(float *a, float *b, float *pfResult);
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// CrossProductNeonResultInMemory
|
||||
//
|
||||
/// Performs a Vector3 style cross product using NEON intrinsics, storing the
|
||||
/// result directly into system memory.
|
||||
///
|
||||
/// @param a Input vector 1. Must point to 4 float values
|
||||
/// @param b Input vector 2. Must point to 4 float values
|
||||
/// @param pfResult [in] must point to an array of at least *4*
|
||||
/// float values. [out] The result of the cross
|
||||
/// product is contained in the first 3 lanes.
|
||||
///
|
||||
/// NOTE: The parameter types here are floats, not float32_t's. gcc
|
||||
/// sometimes doesn't interpret float32_t's correctly. In particular,
|
||||
/// if the type of pfResult is set to float32_t*, gcc will throw an
|
||||
/// internal compiler error (ICE) for this code. In memory, float32_t
|
||||
/// and float are equivalent, so can cast between them explicitly.
|
||||
//----------------------------------------------------------------------------
|
||||
void CrossProductNeonResultInMemory(float *a, float *b, float *pfResult)
|
||||
{
|
||||
float32x4_t v1 = {a[1],a[2],a[0], 0.0f};
|
||||
float32x4_t v2 = {b[2],b[0],b[1], 0.0f};
|
||||
float32x4_t v3 = {a[2],a[0],a[1], 0.0f};
|
||||
float32x4_t v4 = {b[1],b[2],b[0], 0.0f};
|
||||
v1 = vmulq_f32(v1, v2);
|
||||
v1 = vmlsq_f32(v1, v3, v4);
|
||||
vst1q_f32((float32_t*)pfResult, v1);
|
||||
}
|
||||
|
||||
void CrossProductNeonResultInMemoryCPPAssembly(float *a, float *b, float *result) {
|
||||
asm volatile(
|
||||
"vld1.32 {d18[1]}, [r1]! \n\t"
|
||||
"vld1.32 {d19[0]}, [r1]! \n\t"
|
||||
"vld1.32 {d18[0]}, [r1]! \n\t"
|
||||
"vld1.32 {d19[1]}, [r1] \n\t"
|
||||
"vld1.32 {d17[0]}, [r0]! \n\t"
|
||||
"vld1.32 {d16}, [r0]! \n\t"
|
||||
"vld1.32 {d17[1]}, [r0] \n\t"
|
||||
"vmul.f32 q10, q8, q9 \n\t"
|
||||
"vtrn.32 d18,d19 \n\t"
|
||||
"vrev64.32 d16,d16 \n\t"
|
||||
"vrev64.32 d18,d18 \n\t"
|
||||
"vtrn.32 d16,d17 \n\t"
|
||||
"vmls.f32 q10, q8, q9 \n\t"
|
||||
"vst1.32 {q10}, [r2] \n\t"
|
||||
);
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// CrossProductScalarResultInMemory
|
||||
//
|
||||
/// Performs a Vector3 style cross product using scalar math, storing the
|
||||
/// result directly into system memory.
|
||||
///
|
||||
/// @param a Input vector 1. Must point to 4 float values
|
||||
/// @param b Input vector 2. Must point to 4 float values
|
||||
/// @param pfResult [in] pointer to a float. [out] Contains the
|
||||
/// result, dotproduct(a,b)
|
||||
//----------------------------------------------------------------------------
|
||||
void CrossProductScalarResultInMemory(float *a, float *b, float *pfResult)
|
||||
{
|
||||
pfResult[0] = a[1]*b[2] - a[2]*b[1];
|
||||
pfResult[1] = a[2]*b[0] - a[0]*b[2];
|
||||
pfResult[2] = a[0]*b[1] - a[1]*b[0];
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// TestFastNeonCrossProduct
|
||||
//
|
||||
/// Run timing study of the cross product functions above, writing the
|
||||
/// results to the Android verbose log.
|
||||
//----------------------------------------------------------------------------
|
||||
void TestNeonCrossProduct()
|
||||
{
|
||||
float SET_ALIGNMENT(64) data[] = {float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
|
||||
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f};
|
||||
|
||||
float *a = &data[0];
|
||||
float *b = &data[4];
|
||||
|
||||
char szMsg[256];
|
||||
|
||||
sprintf(szMsg, "");
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg,"---------------------------------------");
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg, "Cross product test inputs A=<%f,%f,%f>, B=<%f,%f,%f>",
|
||||
a[0], a[1], a[2], b[0], b[1], b[2]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
|
||||
float SET_ALIGNMENT(64) fResult[4];
|
||||
|
||||
double dTimeSpan, dRefTimeSpan;;
|
||||
unsigned int uiNumTries = 10000000;
|
||||
unsigned int i;
|
||||
|
||||
sce::PhysicsEffects::PfxPerfCounter pc;
|
||||
|
||||
// profile scalar cross product with direct memory return
|
||||
fResult[0] = 0.0f;
|
||||
pc.countBegin("");
|
||||
for(i = 0; i < uiNumTries; i++)
|
||||
{
|
||||
CrossProductScalarResultInMemory(a, b, fResult);
|
||||
}
|
||||
pc.countEnd();
|
||||
dTimeSpan = pc.getCountTime(0);
|
||||
pc.resetCount();
|
||||
dRefTimeSpan = dTimeSpan;
|
||||
sprintf(szMsg, "Time to do %i calls for CrossProductScalarResultInMemory: %f secs, speedup: %5.2f, result value=<%f,%f,%f>",
|
||||
uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
|
||||
|
||||
|
||||
// profile NEON assembly volatile cross product with direct memory return
|
||||
fResult[0] = 0.0f;
|
||||
pc.countBegin("");
|
||||
for(i = 0; i < uiNumTries; i++)
|
||||
{
|
||||
CrossProductNeonResultInMemoryCPPAssembly(a, b, fResult);
|
||||
}
|
||||
pc.countEnd();
|
||||
dTimeSpan = pc.getCountTime(0);
|
||||
pc.resetCount();
|
||||
sprintf(szMsg, "Time to do %i calls for CrossProductNeonResultInMemoryFast: %f secs, speedup: %5.2f, result value=<%f,%f,%f>",
|
||||
uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
|
||||
|
||||
|
||||
// profile NEON cross product with direct memory return, assembly version
|
||||
fResult[0] = 0.0f;
|
||||
pc.countBegin("");
|
||||
for(i = 0; i < uiNumTries; i++)
|
||||
{
|
||||
CrossProductNeonResultInMemoryAssembly(a, b, fResult);
|
||||
}
|
||||
pc.countEnd();
|
||||
dTimeSpan = pc.getCountTime(0);
|
||||
pc.resetCount();
|
||||
sprintf(szMsg, "Time to do %i calls for CrossProductNeonResultInMemoryAssembly: %f secs, speedup: %5.2f, result value=<%f,%f,%f>",
|
||||
uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
|
||||
|
||||
|
||||
// profile NEON cross product with direct memory return, assembly version 2
|
||||
fResult[0] = 0.0f;
|
||||
pc.countBegin("");
|
||||
for(i = 0; i < uiNumTries; i++)
|
||||
{
|
||||
CrossProductNeonResultInMemoryAssembly2(a, b, fResult);
|
||||
}
|
||||
pc.countEnd();
|
||||
dTimeSpan = pc.getCountTime(0);
|
||||
pc.resetCount();
|
||||
|
||||
sprintf(szMsg, "Time to do %i calls for CrossProductNeonResultInMemoryAssembly2: %f secs, speedup: %5.2f, result value=<%f,%f,%f>",
|
||||
uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
|
||||
|
||||
|
||||
// profile NEON cross product with direct memory return, assembly version 3
|
||||
fResult[0] = 0.0f;
|
||||
pc.countBegin("");
|
||||
for(i = 0; i < uiNumTries; i++)
|
||||
{
|
||||
CrossProductNeonResultInMemoryAssembly3(a, b, fResult);
|
||||
}
|
||||
pc.countEnd();
|
||||
dTimeSpan = pc.getCountTime(0);
|
||||
pc.resetCount();
|
||||
|
||||
sprintf(szMsg, "Time to do %i calls for CrossProductNeonResultInMemoryAssembly3: %f secs, speedup: %5.2f, result value=<%f,%f,%f>",
|
||||
uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
}
|
||||
@@ -0,0 +1,330 @@
|
||||
/*
|
||||
Applied Research Associates Inc. (c)2011
|
||||
|
||||
Redistribution and use in source and binary forms,
|
||||
with or without modification, are permitted provided that the
|
||||
following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the Applied Research Associates Inc nor the names
|
||||
of its contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "test_neon.h"
|
||||
#define SCE_PFX_USE_PERFCOUNTER
|
||||
#include "physics_effects.h"
|
||||
#include <arm_neon.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <android/log.h>
|
||||
|
||||
// This works with gcc
|
||||
#define SET_ALIGNMENT(alignment) __attribute__((__aligned__((alignment))))
|
||||
|
||||
//#define PRINT_COMPUTED_VECTOR_RESULTS
|
||||
|
||||
// assembly implementations
|
||||
extern "C"
|
||||
{
|
||||
float32_t DotProductNeonResultInRegisterAssembly(float *a, float *b);
|
||||
void DotProductNeonResultInMemoryAssembly(float *a, float *b, float *pfResult);
|
||||
void DotProductNeonResultInMemoryAssembly2(float *data, float *pfResult);
|
||||
void DotProductNeonResultInMemoryAssembly2b(float *data, float *pfResult);
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// DotProductNeonResultInRegister
|
||||
//
|
||||
/// Performs a Vector4 style dot product using NEON intrinsics, returning
|
||||
/// the result in an ARM register.
|
||||
///
|
||||
/// @param a Input vector 1. Must point to 4 float values
|
||||
/// @param b Input vector 2. Must point to 4 float values
|
||||
///
|
||||
/// @return dotproduct(a,b)
|
||||
//----------------------------------------------------------------------------
|
||||
float32_t DotProductNeonResultInRegister(float *a, float *b)
|
||||
{
|
||||
float32_t result[2];
|
||||
float32x2_t v1a = vld1_f32((float32_t*)a);
|
||||
float32x2_t v1b = vld1_f32((float32_t*)(a + 2));
|
||||
float32x2_t v2a = vld1_f32((float32_t*)b);
|
||||
float32x2_t v2b = vld1_f32((float32_t*)(b + 2));
|
||||
v1a = vmul_f32(v1a, v2a);
|
||||
v1a = vmla_f32(v1a, v1b, v2b);
|
||||
v1a = vpadd_f32(v1a, v1a);
|
||||
vst1_f32(result, v1a);
|
||||
return(result[0]);
|
||||
}
|
||||
|
||||
// pfResult must point to memory large enough to store *two*
|
||||
// float32_t values
|
||||
//----------------------------------------------------------------------------
|
||||
// DotProductNeonResultInMemory
|
||||
//
|
||||
/// Performs a Vector4 style dot product using NEON intrinsics, storing the
|
||||
/// result directly into system memory.
|
||||
///
|
||||
/// @param a Input vector 1. Must point to 4 float values
|
||||
/// @param b Input vector 2. Must point to 4 float values
|
||||
/// @param pfResult [in] must point to an array of at least *2*
|
||||
/// float values. [out] The result of the dot
|
||||
/// product is contained in both slots of the array.
|
||||
/// Recommended to use the first one though they
|
||||
/// have the same value
|
||||
///
|
||||
/// NOTE: The parameter types here are floats, not float32_t's. gcc
|
||||
/// sometimes doesn't interpret float32_t's correctly. In particular,
|
||||
/// if the type of pfResult is set to float32_t*, gcc will throw an
|
||||
/// internal compiler error (ICE) for this code. In memory, float32_t
|
||||
/// and float are equivalent, so can cast between them explicitly.
|
||||
//----------------------------------------------------------------------------
|
||||
void DotProductNeonResultInMemory(float *a, float *b, float *pfResult)
|
||||
{
|
||||
float32x2_t v1a = vld1_f32((float32_t*)a);
|
||||
float32x2_t v1b = vld1_f32((float32_t*)(a + 2));
|
||||
float32x2_t v2a = vld1_f32((float32_t*)b);
|
||||
float32x2_t v2b = vld1_f32((float32_t*)(b + 2));
|
||||
v1a = vmul_f32(v1a, v2a);
|
||||
v1a = vmla_f32(v1a, v1b, v2b);
|
||||
v1a = vpadd_f32(v1a, v1a);
|
||||
vst1_f32((float32_t*)pfResult, v1a);
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// DotProductScalarResultInRegister
|
||||
//
|
||||
/// Performs a Vector4 style dot product using scalar math, and returning
|
||||
/// the result in an ARM register.
|
||||
///
|
||||
/// @param a Input vector 1. Must point to 4 float values
|
||||
/// @param b Input vector 2. Must point to 4 float values
|
||||
///
|
||||
/// @return dotproduct(a,b)
|
||||
//----------------------------------------------------------------------------
|
||||
float DotProductScalarResultInRegister(float *a, float *b)
|
||||
{
|
||||
return(a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3]);
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// DotProductScalarResultInMemory
|
||||
//
|
||||
/// Performs a Vector4 style dot product using scalar math, storing the
|
||||
/// result directly into system memory.
|
||||
///
|
||||
/// @param a Input vector 1. Must point to 4 float values
|
||||
/// @param b Input vector 2. Must point to 4 float values
|
||||
/// @param pfResult [in] pointer to a float. [out] Contains the
|
||||
/// result, dotproduct(a,b)
|
||||
//----------------------------------------------------------------------------
|
||||
void DotProductScalarResultInMemory(float *a, float *b, float *pfResult)
|
||||
{
|
||||
*pfResult = a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// TestFastNeonDotProduct
|
||||
//
|
||||
/// Run timing study of the four dot product functions above, writing the
|
||||
/// results to the Android verbose log.
|
||||
//----------------------------------------------------------------------------
|
||||
void TestNeonDotProduct()
|
||||
{
|
||||
float SET_ALIGNMENT(64) data[] = {float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),
|
||||
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX)};
|
||||
|
||||
float *a = &data[0];
|
||||
float *b = &data[4];
|
||||
|
||||
char szMsg[256];
|
||||
|
||||
sprintf(szMsg, "");
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg,"---------------------------------------");
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
|
||||
#ifdef PRINT_COMPUTED_VECTOR_RESULTS
|
||||
sprintf(szMsg, "Dot product test inputs A=<%f,%f,%f>, B=<%f,%f,%f>",
|
||||
a[0], a[1], a[2], b[0], b[1], b[2]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
#endif
|
||||
|
||||
float SET_ALIGNMENT(64) fResult[2];
|
||||
|
||||
sce::PhysicsEffects::PfxPerfCounter pc;
|
||||
double dTimeSpan, dRefTimeSpan;
|
||||
unsigned int uiNumTries = 10000000;
|
||||
unsigned int i;
|
||||
|
||||
// profile scalar dot product with register return
|
||||
fResult[0] = 0.0f;
|
||||
pc.countBegin("");
|
||||
for(i = 0; i < uiNumTries; i++)
|
||||
{
|
||||
fResult[0] = DotProductScalarResultInRegister(a, b); // C++
|
||||
}
|
||||
pc.countEnd();
|
||||
dTimeSpan = pc.getCountTime(0);
|
||||
pc.resetCount();
|
||||
dRefTimeSpan = dTimeSpan;
|
||||
#ifdef PRINT_COMPUTED_VECTOR_RESULTS
|
||||
sprintf(szMsg, "Time to do %i calls to scalar dot product return in register: %f, result value=%f",
|
||||
uiNumTries, dTimeSpan, fResult[0]);
|
||||
#else
|
||||
sprintf(szMsg, "Sclr C++ Dot Product Register Return: %10.7f secs, REF TIME", dTimeSpan);
|
||||
#endif
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
|
||||
// profile NEON dot product with register return
|
||||
fResult[0] = 0.0f;
|
||||
pc.countBegin("");
|
||||
for(i = 0; i < uiNumTries; i++)
|
||||
{
|
||||
fResult[0] = DotProductNeonResultInRegister(a, b); // C++
|
||||
}
|
||||
pc.countEnd();
|
||||
dTimeSpan = pc.getCountTime(0);
|
||||
pc.resetCount();
|
||||
#ifdef PRINT_COMPUTED_VECTOR_RESULTS
|
||||
sprintf(szMsg, "Time to do %i calls to NEON dot product return in register: %f, result value=%f",
|
||||
uiNumTries, dTimeSpan, fResult[0]);
|
||||
#else
|
||||
sprintf(szMsg, "Neon C++ Dot Product Register Return: %10.7f secs, speedup: %5.2f", dTimeSpan, dRefTimeSpan/dTimeSpan);
|
||||
#endif
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
|
||||
// profile NEON dot product with register return, assembly version
|
||||
fResult[0] = 0.0f;
|
||||
pc.countBegin("");
|
||||
for(i = 0; i < uiNumTries; i++)
|
||||
{
|
||||
fResult[0] = DotProductNeonResultInRegisterAssembly(a, b);
|
||||
}
|
||||
pc.countEnd();
|
||||
dTimeSpan = pc.getCountTime(0);
|
||||
pc.resetCount();
|
||||
|
||||
#ifdef PRINT_COMPUTED_VECTOR_RESULTS
|
||||
sprintf(szMsg, "Time to do %i calls to NEON dot product return in register (assembly version): %f, result value=%f",
|
||||
uiNumTries, dTimeSpan, fResult[0]);
|
||||
#else
|
||||
sprintf(szMsg, "Neon Asm Dot Product Register Return: %10.7f secs, speedup: %5.2f", dTimeSpan, dRefTimeSpan/dTimeSpan);
|
||||
#endif
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
|
||||
// profile scalar dot product with direct memory return
|
||||
fResult[0] = 0.0f;
|
||||
pc.countBegin("");
|
||||
for(i = 0; i < uiNumTries; i++)
|
||||
{
|
||||
DotProductScalarResultInMemory(a, b, fResult);
|
||||
}
|
||||
pc.countEnd();
|
||||
dTimeSpan = pc.getCountTime(0);
|
||||
pc.resetCount();
|
||||
|
||||
dRefTimeSpan = dTimeSpan;
|
||||
#ifdef PRINT_COMPUTED_VECTOR_RESULTS
|
||||
sprintf(szMsg, "Time to do %i calls to scalar dot product return in memory: %f, result value=%f",
|
||||
uiNumTries, dTimeSpan, fResult[0]);
|
||||
#else
|
||||
sprintf(szMsg, " Sclr C++ Dot Product Memory Return: %10.7f secs, REF TIME", dTimeSpan);
|
||||
#endif
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
|
||||
// profile NEON dot product with direct memory return
|
||||
fResult[0] = 0.0f;
|
||||
pc.countBegin("");
|
||||
for(i = 0; i < uiNumTries; i++)
|
||||
{
|
||||
DotProductNeonResultInMemory(a, b, fResult);
|
||||
}
|
||||
pc.countEnd();
|
||||
dTimeSpan = pc.getCountTime(0);
|
||||
pc.resetCount();
|
||||
|
||||
#ifdef PRINT_COMPUTED_VECTOR_RESULTS
|
||||
sprintf(szMsg, "Time to do %i calls to NEON dot product return in memory: %f, result value=%f",
|
||||
uiNumTries, dTimeSpan, fResult[0]);
|
||||
#else
|
||||
sprintf(szMsg, " Neon C++ Dot Product Memory Return: %10.7f secs, speedup: %5.2f", dTimeSpan, dRefTimeSpan/dTimeSpan);
|
||||
#endif
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
|
||||
// profile NEON dot product with direct memory return, assembly version
|
||||
fResult[0] = 0.0f;
|
||||
pc.countBegin("");
|
||||
for(i = 0; i < uiNumTries; i++)
|
||||
{
|
||||
DotProductNeonResultInMemoryAssembly(a, b, fResult);
|
||||
}
|
||||
pc.countEnd();
|
||||
dTimeSpan = pc.getCountTime(0);
|
||||
pc.resetCount();
|
||||
|
||||
#ifdef PRINT_COMPUTED_VECTOR_RESULTS
|
||||
sprintf(szMsg, "Time to do %i calls to NEON dot product return in memory (assembly version): %f, result value=%f",
|
||||
uiNumTries, dTimeSpan, fResult[0]);
|
||||
#else
|
||||
sprintf(szMsg, " Neon Asm Dot Product Memory Return: %10.7f secs, speedup: %5.2f", dTimeSpan, dRefTimeSpan/dTimeSpan);
|
||||
#endif
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
|
||||
// profile NEON dot product with direct memory return, assembly version with both
|
||||
// inputs in contiguous memory block (array of structures)
|
||||
fResult[0] = 0.0f;
|
||||
pc.countBegin("");
|
||||
for(i = 0; i < uiNumTries; i++)
|
||||
{
|
||||
DotProductNeonResultInMemoryAssembly2(data, fResult);
|
||||
}
|
||||
pc.countEnd();
|
||||
dTimeSpan = pc.getCountTime(0);
|
||||
pc.resetCount();
|
||||
|
||||
#ifdef PRINT_COMPUTED_VECTOR_RESULTS
|
||||
sprintf(szMsg, "Time to do %i calls to NEON dot product return in memory (assembly version - array of structures): %f, result value=%f",
|
||||
uiNumTries, dTimeSpan, fResult[0]);
|
||||
#else
|
||||
sprintf(szMsg, " Neon Asm2 Dot Product Memory Return: %10.7f secs, speedup: %5.2f", dTimeSpan, dRefTimeSpan/dTimeSpan);
|
||||
#endif
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
|
||||
// profile NEON dot product with direct memory return, assembly version with both
|
||||
// inputs in contiguous memory block (array of structures), alternate methodology
|
||||
fResult[0] = 0.0f;
|
||||
pc.countBegin("");
|
||||
for(i = 0; i < uiNumTries; i++)
|
||||
{
|
||||
DotProductNeonResultInMemoryAssembly2b(data, fResult);
|
||||
}
|
||||
pc.countEnd();
|
||||
dTimeSpan = pc.getCountTime(0);
|
||||
pc.resetCount();
|
||||
|
||||
#ifdef PRINT_COMPUTED_VECTOR_RESULTS
|
||||
sprintf(szMsg, "Time to do %i calls to NEON dot product return in memory (assembly version - alternate methodology): %f, result value=%f",
|
||||
uiNumTries, dTimeSpan, fResult[0]);
|
||||
#else
|
||||
sprintf(szMsg, " Neon Asm3 Dot Product Memory Return: %10.7f secs, speedup: %5.2f", dTimeSpan, dRefTimeSpan/dTimeSpan);
|
||||
#endif
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
}
|
||||
@@ -0,0 +1,173 @@
|
||||
/*
|
||||
Applied Research Associates Inc. (c)2011
|
||||
|
||||
Redistribution and use in source and binary forms,
|
||||
with or without modification, are permitted provided that the
|
||||
following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the Applied Research Associates Inc nor the names
|
||||
of its contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "test_neon.h"
|
||||
#define SCE_PFX_USE_PERFCOUNTER
|
||||
#include "physics_effects.h"
|
||||
#include <arm_neon.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <android/log.h>
|
||||
|
||||
// This works with gcc
|
||||
#define SET_ALIGNMENT(alignment) __attribute__((__aligned__((alignment))))
|
||||
|
||||
// assembly implementations
|
||||
extern "C"
|
||||
{
|
||||
void Matrix3OperatorMultiplyNeon(float *mCol, float *mat, float *pfResult);
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// Matrix3OperatorMultiplyScalar
|
||||
//
|
||||
/// Performs a Matrix3 Vector3 multiply using scalar math, storing the
|
||||
/// result directly into system memory.
|
||||
///
|
||||
/// @param mCol Matrix3 Columns. Must point to 3x 4 float values
|
||||
/// @param mat Matrix3 Columns. Must point to 3x 4 float values
|
||||
/// @param pfResult [in] pointer to a float(matrix 3). [out] Contains the result
|
||||
//----------------------------------------------------------------------------
|
||||
//inline const Vector3 Matrix3::operator *( const Vector3 & vec ) const
|
||||
//{
|
||||
// return Vector3(
|
||||
// ( ( ( mCol0.getX() * vec.getX() ) + ( mCol1.getX() * vec.getY() ) ) + ( mCol2.getX() * vec.getZ() ) ),
|
||||
// ( ( ( mCol0.getY() * vec.getX() ) + ( mCol1.getY() * vec.getY() ) ) + ( mCol2.getY() * vec.getZ() ) ),
|
||||
// ( ( ( mCol0.getZ() * vec.getX() ) + ( mCol1.getZ() * vec.getY() ) ) + ( mCol2.getZ() * vec.getZ() ) )
|
||||
// );
|
||||
//}
|
||||
//
|
||||
//inline const Matrix3 Matrix3::operator *( const Matrix3 & mat ) const
|
||||
//{
|
||||
// return Matrix3(
|
||||
// ( *this * mat.mCol0 ),
|
||||
// ( *this * mat.mCol1 ),
|
||||
// ( *this * mat.mCol2 )
|
||||
// );
|
||||
//}
|
||||
void Matrix3OperatorMultiplyScalar(float *mCol, float *mat, float *pfResult)
|
||||
{
|
||||
pfResult[0] = ( ( ( mCol[0] * mat[0] ) + ( mCol[4] * mat[1] ) ) + ( mCol[8] * mat[2] ) );
|
||||
pfResult[1] = ( ( ( mCol[1] * mat[0] ) + ( mCol[5] * mat[1] ) ) + ( mCol[9] * mat[2] ) );
|
||||
pfResult[2] = ( ( ( mCol[2] * mat[0] ) + ( mCol[6] * mat[1] ) ) + ( mCol[10] * mat[2] ) );
|
||||
pfResult[3] = 0.0f;
|
||||
|
||||
pfResult[4] = ( ( ( mCol[0] * mat[4] ) + ( mCol[4] * mat[5] ) ) + ( mCol[8] * mat[6] ) );
|
||||
pfResult[5] = ( ( ( mCol[1] * mat[4] ) + ( mCol[5] * mat[5] ) ) + ( mCol[9] * mat[6] ) );
|
||||
pfResult[6] = ( ( ( mCol[2] * mat[4] ) + ( mCol[6] * mat[5] ) ) + ( mCol[10] * mat[6] ) );
|
||||
pfResult[7] = 0.0f;
|
||||
|
||||
pfResult[8] = ( ( ( mCol[0] * mat[8] ) + ( mCol[4] * mat[9] ) ) + ( mCol[8] * mat[10] ) );
|
||||
pfResult[9] = ( ( ( mCol[1] * mat[8] ) + ( mCol[5] * mat[9] ) ) + ( mCol[9] * mat[10] ) );
|
||||
pfResult[10] = ( ( ( mCol[2] * mat[8] ) + ( mCol[6] * mat[9] ) ) + ( mCol[10] * mat[10] ) );
|
||||
pfResult[11] = 0.0f;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// TestNeonMatrix3OperatorMultiply
|
||||
//
|
||||
/// Run timing study of the NeonMatrix3OperatorMultiply product functions above, writing the
|
||||
/// results to the Android verbose log.
|
||||
//----------------------------------------------------------------------------
|
||||
void TestNeonMatrix3OperatorMultiply()
|
||||
{
|
||||
float SET_ALIGNMENT(64) data1[] = {float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
|
||||
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
|
||||
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f};
|
||||
float SET_ALIGNMENT(64) data2[] = {float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
|
||||
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
|
||||
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f};
|
||||
float *mCol = &data1[0];
|
||||
float *mat = &data2[0];
|
||||
float SET_ALIGNMENT(64) fResult[12];
|
||||
|
||||
sce::PhysicsEffects::PfxPerfCounter pc;
|
||||
double dTimeSpan, dRefTimeSpan;;
|
||||
unsigned int uiNumTries = 10000000;
|
||||
unsigned int i;
|
||||
|
||||
char szMsg[256];
|
||||
|
||||
sprintf(szMsg, "");
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg,"---------------------------------------");
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg,"TestNeonMatrix3OperatorMultiply Start");
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg, "Test input mCol: <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
|
||||
mCol[0], mCol[1], mCol[2], mCol[3],
|
||||
mCol[4], mCol[5], mCol[6], mCol[7],
|
||||
mCol[8], mCol[9], mCol[10], mCol[11]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg, "Test input mat: <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
|
||||
mat[0], mat[1], mat[2], mat[3],
|
||||
mat[4], mat[5], mat[6], mat[7],
|
||||
mat[8], mat[9], mat[10], mat[11]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
|
||||
|
||||
|
||||
// profile scalar NeonMatrix3OperatorMultiplyScalar with direct memory return, c++ version
|
||||
fResult[0] = 0.0f;
|
||||
pc.countBegin("");
|
||||
for(i = 0; i < uiNumTries; i++)
|
||||
{
|
||||
Matrix3OperatorMultiplyScalar(mCol, mat, fResult);
|
||||
}
|
||||
pc.countEnd();
|
||||
dTimeSpan = pc.getCountTime(0);
|
||||
pc.resetCount();
|
||||
dRefTimeSpan = dTimeSpan;
|
||||
sprintf(szMsg, "Time to do %i calls for Matrix3OperatorMultiplyScalar: %f secs, speedup: %5.2f, result value=<%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
|
||||
uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2], fResult[3],
|
||||
fResult[4], fResult[5], fResult[6], fResult[7],
|
||||
fResult[8], fResult[9], fResult[10], fResult[11]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
|
||||
|
||||
// profile NEON Matrix3OperatorMultiplyNeon with direct memory return, assembly version
|
||||
fResult[0] = 0.0f;
|
||||
pc.countBegin("");
|
||||
for(i = 0; i < uiNumTries; i++)
|
||||
{
|
||||
Matrix3OperatorMultiplyNeon(mCol, mat, fResult);
|
||||
}
|
||||
pc.countEnd();
|
||||
dTimeSpan = pc.getCountTime(0);
|
||||
pc.resetCount();
|
||||
sprintf(szMsg, "Time to do %i calls for Matrix3OperatorMultiplyNeon: %f secs, speedup: %5.2f, result value=<%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
|
||||
uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2], fResult[3],
|
||||
fResult[4], fResult[5], fResult[6], fResult[7],
|
||||
fResult[8], fResult[9], fResult[10], fResult[11]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
|
||||
|
||||
sprintf(szMsg,"TestNeonMatrix3OperatorMultiply End");
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
}
|
||||
@@ -0,0 +1,188 @@
|
||||
/*
|
||||
Applied Research Associates Inc. (c)2011
|
||||
|
||||
Redistribution and use in source and binary forms,
|
||||
with or without modification, are permitted provided that the
|
||||
following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the Applied Research Associates Inc nor the names
|
||||
of its contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "test_neon.h"
|
||||
#define SCE_PFX_USE_PERFCOUNTER
|
||||
#include "physics_effects.h"
|
||||
#include <arm_neon.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <android/log.h>
|
||||
|
||||
// This works with gcc
|
||||
#define SET_ALIGNMENT(alignment) __attribute__((__aligned__((alignment))))
|
||||
|
||||
// assembly implementations
|
||||
extern "C"
|
||||
{
|
||||
void Matrix4OperatorMultiplyNeon(float *mCol, float *mat, float *pfResult);
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// Matrix4OperatorMultiplyScalar
|
||||
//
|
||||
/// Performs a Matrix4 vector3 multiply using scalar math, storing the
|
||||
/// result directly into system memory.
|
||||
///
|
||||
/// @param mCol Input matrix 1. Must point to 4x 4 float values
|
||||
/// @param mat Input matrix 2. Must point to 4x 4 float values
|
||||
/// @param pfResult [in] pointer to a float(matrix). [out] Contains the result
|
||||
//----------------------------------------------------------------------------
|
||||
//inline const Vector4 Matrix4::operator *( const Vector4 & vec ) const
|
||||
//{
|
||||
// return Vector4(
|
||||
// ( ( ( ( mCol0.getX() * vec.getX() ) + ( mCol1.getX() * vec.getY() ) ) + ( mCol2.getX() * vec.getZ() ) ) + ( mCol3.getX() * vec.getW() ) ),
|
||||
// ( ( ( ( mCol0.getY() * vec.getX() ) + ( mCol1.getY() * vec.getY() ) ) + ( mCol2.getY() * vec.getZ() ) ) + ( mCol3.getY() * vec.getW() ) ),
|
||||
// ( ( ( ( mCol0.getZ() * vec.getX() ) + ( mCol1.getZ() * vec.getY() ) ) + ( mCol2.getZ() * vec.getZ() ) ) + ( mCol3.getZ() * vec.getW() ) ),
|
||||
// ( ( ( ( mCol0.getW() * vec.getX() ) + ( mCol1.getW() * vec.getY() ) ) + ( mCol2.getW() * vec.getZ() ) ) + ( mCol3.getW() * vec.getW() ) )
|
||||
// );
|
||||
//}
|
||||
//inline const Matrix4 Matrix4::operator *( const Matrix4 & mat ) const
|
||||
//{
|
||||
// return Matrix4(
|
||||
// ( *this * mat.mCol0 ),
|
||||
// ( *this * mat.mCol1 ),
|
||||
// ( *this * mat.mCol2 ),
|
||||
// ( *this * mat.mCol3 )
|
||||
// );
|
||||
//}
|
||||
void Matrix4OperatorMultiplyScalar(float *mCol, float *mat, float *pfResult)
|
||||
{
|
||||
pfResult[0] = ( ( ( mCol[0] * mat[0] ) + ( mCol[4] * mat[1] ) ) + ( mCol[8] * mat[2] ) + ( mCol[12] * mat[3] ) );
|
||||
pfResult[1] = ( ( ( mCol[1] * mat[0] ) + ( mCol[5] * mat[1] ) ) + ( mCol[9] * mat[2] ) + ( mCol[13] * mat[3] ) );
|
||||
pfResult[2] = ( ( ( mCol[2] * mat[0] ) + ( mCol[6] * mat[1] ) ) + ( mCol[10] * mat[2] ) + ( mCol[14] * mat[3] ) );
|
||||
pfResult[3] = ( ( ( mCol[3] * mat[0] ) + ( mCol[7] * mat[1] ) ) + ( mCol[11] * mat[2] ) + ( mCol[15] * mat[3] ) );
|
||||
|
||||
pfResult[4] = ( ( ( mCol[0] * mat[4] ) + ( mCol[4] * mat[5] ) ) + ( mCol[8] * mat[6] ) + ( mCol[12] * mat[7] ) );
|
||||
pfResult[5] = ( ( ( mCol[1] * mat[4] ) + ( mCol[5] * mat[5] ) ) + ( mCol[9] * mat[6] ) + ( mCol[13] * mat[7] ) );
|
||||
pfResult[6] = ( ( ( mCol[2] * mat[4] ) + ( mCol[6] * mat[5] ) ) + ( mCol[10] * mat[6] ) + ( mCol[14] * mat[7] ) );
|
||||
pfResult[7] = ( ( ( mCol[3] * mat[4] ) + ( mCol[7] * mat[5] ) ) + ( mCol[11] * mat[6] ) + ( mCol[15] * mat[7] ) );
|
||||
|
||||
pfResult[8] = ( ( ( mCol[0] * mat[8] ) + ( mCol[4] * mat[9] ) ) + ( mCol[8] * mat[10] ) + ( mCol[12] * mat[11] ) );
|
||||
pfResult[9] = ( ( ( mCol[1] * mat[8] ) + ( mCol[5] * mat[9] ) ) + ( mCol[9] * mat[10] ) + ( mCol[13] * mat[11] ) );
|
||||
pfResult[10] = ( ( ( mCol[2] * mat[8] ) + ( mCol[6] * mat[9] ) ) + ( mCol[10] * mat[10] ) + ( mCol[14] * mat[11] ) );
|
||||
pfResult[11] = ( ( ( mCol[3] * mat[8] ) + ( mCol[7] * mat[9] ) ) + ( mCol[11] * mat[10] ) + ( mCol[15] * mat[11] ) );
|
||||
|
||||
pfResult[12] = ( ( ( mCol[0] * mat[12] ) + ( mCol[4] * mat[13] ) ) + ( mCol[8] * mat[14] ) + ( mCol[12] * mat[15] ) );
|
||||
pfResult[13] = ( ( ( mCol[1] * mat[12] ) + ( mCol[5] * mat[13] ) ) + ( mCol[9] * mat[14] ) + ( mCol[13] * mat[15] ) );
|
||||
pfResult[14] = ( ( ( mCol[2] * mat[12] ) + ( mCol[6] * mat[13] ) ) + ( mCol[10] * mat[14] ) + ( mCol[14] * mat[15] ) );
|
||||
pfResult[15] = ( ( ( mCol[3] * mat[12] ) + ( mCol[7] * mat[13] ) ) + ( mCol[11] * mat[14] ) + ( mCol[15] * mat[15] ) );
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// TestNeonMatrix4OperatorMultiply
|
||||
//
|
||||
/// Run timing study of the matrix4 vector3 multiply operator from above, writing the
|
||||
/// results to the Android verbose log.
|
||||
//----------------------------------------------------------------------------
|
||||
void TestNeonMatrix4OperatorMultiply()
|
||||
{
|
||||
float SET_ALIGNMENT(64) data1[] = {float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),
|
||||
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),
|
||||
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),
|
||||
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX)};
|
||||
float SET_ALIGNMENT(64) data2[] = {float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),
|
||||
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),
|
||||
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),
|
||||
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX)};
|
||||
float *mCol = &data1[0];
|
||||
float *mat = &data2[0];
|
||||
|
||||
char szMsg[256];
|
||||
|
||||
sprintf(szMsg, "");
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg,"---------------------------------------");
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg,"TestNeonMatrix4OperatorMultiply Start");
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg, "Test input mCol: <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
|
||||
mCol[0], mCol[1], mCol[2], mCol[3],
|
||||
mCol[4], mCol[5], mCol[6], mCol[7],
|
||||
mCol[8], mCol[9], mCol[10], mCol[11],
|
||||
mCol[12], mCol[13], mCol[14], mCol[15]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg, "Test input mat: <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
|
||||
mat[0], mat[1], mat[2], mat[3],
|
||||
mat[4], mat[5], mat[6], mat[7],
|
||||
mat[8], mat[9], mat[10], mat[11],
|
||||
mCol[12], mCol[13], mCol[14], mCol[15]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
|
||||
float SET_ALIGNMENT(64) fResult[16];
|
||||
|
||||
|
||||
sce::PhysicsEffects::PfxPerfCounter pc;
|
||||
double dTimeSpan, dRefTimeSpan;;
|
||||
unsigned int uiNumTries = 10000000;
|
||||
unsigned int i;
|
||||
|
||||
// profile scalar Matrix4OperatorMultiplyScalar with direct memory return, c++ version
|
||||
fResult[0] = 0.0f;
|
||||
pc.countBegin("");
|
||||
for(i = 0; i < uiNumTries; i++)
|
||||
{
|
||||
Matrix4OperatorMultiplyScalar(mCol, mat, fResult);
|
||||
}
|
||||
pc.countEnd();
|
||||
dTimeSpan = pc.getCountTime(0);
|
||||
pc.resetCount();
|
||||
dRefTimeSpan = dTimeSpan;
|
||||
sprintf(szMsg, "Time to do %i calls for Matrix4OperatorMultiplyScalar: %f secs, speedup: %5.2f",
|
||||
uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg, "result value: <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
|
||||
fResult[0], fResult[1], fResult[2], fResult[3],
|
||||
fResult[4], fResult[5], fResult[6], fResult[7],
|
||||
fResult[8], fResult[9], fResult[10], fResult[11],
|
||||
fResult[12], fResult[13], fResult[14], fResult[15]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
|
||||
|
||||
// profile NEON Matrix4OperatorMultiplyNeon with direct memory return, assembly version
|
||||
fResult[0] = 0.0f;
|
||||
pc.countBegin("");
|
||||
for(i = 0; i < uiNumTries; i++)
|
||||
{
|
||||
Matrix4OperatorMultiplyNeon(mCol, mat, fResult);
|
||||
}
|
||||
pc.countEnd();
|
||||
dTimeSpan = pc.getCountTime(0);
|
||||
pc.resetCount();
|
||||
sprintf(szMsg, "Time to do %i calls for Matrix4OperatorMultiplyNeon: %f secs, speedup: %5.2f, result value=<%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
|
||||
uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2], fResult[3],
|
||||
fResult[4], fResult[5], fResult[6], fResult[7],
|
||||
fResult[8], fResult[9], fResult[10], fResult[11],
|
||||
fResult[12], fResult[13], fResult[14], fResult[15]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
|
||||
|
||||
sprintf(szMsg,"TestNeonMatrix4OperatorMultiply End");
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
}
|
||||
@@ -0,0 +1,173 @@
|
||||
/*
|
||||
Applied Research Associates Inc. (c)2011
|
||||
|
||||
Redistribution and use in source and binary forms,
|
||||
with or without modification, are permitted provided that the
|
||||
following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the Applied Research Associates Inc nor the names
|
||||
of its contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "test_neon.h"
|
||||
#define SCE_PFX_USE_PERFCOUNTER
|
||||
#include "physics_effects.h"
|
||||
#include <arm_neon.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <android/log.h>
|
||||
|
||||
// This works with gcc
|
||||
#define SET_ALIGNMENT(alignment) __attribute__((__aligned__((alignment))))
|
||||
|
||||
// assembly implementations
|
||||
extern "C"
|
||||
{
|
||||
void OrthoInverseTransform3Neon(float *trfm, float *pfResult);
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// OrthoInverseTransform3Scalar
|
||||
//
|
||||
/// Performs a ortho inverse of a transform3 using scalar math, storing the
|
||||
/// result directly into system memory.
|
||||
///
|
||||
/// @param trfm Input transform3. Must point to 4x 4 float values
|
||||
/// @param pfResult [in] pointer to a float(tranform3). [out] Contains the result
|
||||
//----------------------------------------------------------------------------
|
||||
//inline const Transform3 orthoInverse( const Transform3 & tfrm )
|
||||
//{
|
||||
// Vector3 inv0, inv1, inv2;
|
||||
// inv0 = Vector3( tfrm.getCol0().getX(), tfrm.getCol1().getX(), tfrm.getCol2().getX() );
|
||||
// inv1 = Vector3( tfrm.getCol0().getY(), tfrm.getCol1().getY(), tfrm.getCol2().getY() );
|
||||
// inv2 = Vector3( tfrm.getCol0().getZ(), tfrm.getCol1().getZ(), tfrm.getCol2().getZ() );
|
||||
// return Transform3(
|
||||
// inv0,
|
||||
// inv1,
|
||||
// inv2,
|
||||
// Vector3( ( -( ( inv0 * tfrm.getCol3().getX() ) + ( ( inv1 * tfrm.getCol3().getY() ) + ( inv2 * tfrm.getCol3().getZ() ) ) ) ) )
|
||||
// );
|
||||
//}
|
||||
void OrthoInverseTransform3Scalar(float *trfm, float *pfResult)
|
||||
{
|
||||
pfResult[0] = trfm[0];
|
||||
pfResult[1] = trfm[4];
|
||||
pfResult[2] = trfm[8];
|
||||
pfResult[3] = 0.0f;
|
||||
|
||||
pfResult[4] = trfm[1];
|
||||
pfResult[5] = trfm[5];
|
||||
pfResult[6] = trfm[9];
|
||||
pfResult[7] = 0.0f;
|
||||
|
||||
pfResult[8] = trfm[2];
|
||||
pfResult[9] = trfm[6];
|
||||
pfResult[10] = trfm[10];
|
||||
pfResult[11] = 0.0f;
|
||||
|
||||
pfResult[12] = ( -( ( trfm[0] * trfm[8]) + ( ( trfm[1] * trfm[9]) + ( trfm[2] * trfm[10]) ) ) );
|
||||
pfResult[13] = ( -( ( trfm[4] * trfm[8]) + ( ( trfm[5] * trfm[9]) + ( trfm[6] * trfm[10]) ) ) );
|
||||
pfResult[14] = ( -( ( trfm[8] * trfm[8]) + ( ( trfm[9] * trfm[9]) + ( trfm[10] * trfm[10]) ) ) );
|
||||
pfResult[15] = 0.0f;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// TestNeonOrthoInverseTransform3
|
||||
//
|
||||
/// Run timing study of the orthoinverse on transform3 functions above, writing the
|
||||
/// results to the Android verbose log.
|
||||
//----------------------------------------------------------------------------
|
||||
void TestNeonOrthoInverseTransform3()
|
||||
{
|
||||
float SET_ALIGNMENT(64) data1[] = {float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
|
||||
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
|
||||
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
|
||||
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f};
|
||||
float *trfm = &data1[0];
|
||||
|
||||
char szMsg[256];
|
||||
|
||||
sprintf(szMsg, "");
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg,"---------------------------------------");
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg,"TestNeonOrthoInverseTransform3 Start");
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg, "Test input trfm: <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>>",
|
||||
trfm[0], trfm[1], trfm[2], trfm[3],
|
||||
trfm[4], trfm[5], trfm[6], trfm[7],
|
||||
trfm[8], trfm[9], trfm[10], trfm[11],
|
||||
trfm[12], trfm[13], trfm[14], trfm[15]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
|
||||
float SET_ALIGNMENT(64) fResult[16];
|
||||
|
||||
|
||||
sce::PhysicsEffects::PfxPerfCounter pc;
|
||||
double dTimeSpan, dRefTimeSpan;;
|
||||
unsigned int uiNumTries = 10000000;
|
||||
unsigned int i;
|
||||
|
||||
// profile scalar OrthoInverseTransform3Scalar with direct memory return, c++ version
|
||||
fResult[0] = 0.0f;
|
||||
pc.countBegin("");
|
||||
for(i = 0; i < uiNumTries; i++)
|
||||
{
|
||||
OrthoInverseTransform3Scalar(trfm, fResult);
|
||||
}
|
||||
pc.countEnd();
|
||||
dTimeSpan = pc.getCountTime(0);
|
||||
pc.resetCount();
|
||||
dRefTimeSpan = dTimeSpan;
|
||||
sprintf(szMsg, "Time to do %i calls for OrthoInverseTransform3Scalar: %f secs, speedup: %5.2f",
|
||||
uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg, "result value=<%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
|
||||
fResult[0], fResult[1], fResult[2], fResult[3],
|
||||
fResult[4], fResult[5], fResult[6], fResult[7],
|
||||
fResult[8], fResult[9], fResult[10], fResult[11],
|
||||
fResult[12], fResult[13], fResult[14], fResult[15]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
|
||||
|
||||
// profile NEON OrthoInverseTransform3Neon with direct memory return, assembly version
|
||||
fResult[0] = 0.0f;
|
||||
pc.countBegin("");
|
||||
for(i = 0; i < uiNumTries; i++)
|
||||
{
|
||||
OrthoInverseTransform3Neon(trfm, fResult);
|
||||
}
|
||||
pc.countEnd();
|
||||
dTimeSpan = pc.getCountTime(0);
|
||||
pc.resetCount();
|
||||
sprintf(szMsg, "Time to do %i calls for OrthoInverseTransform3Neon: %f secs, speedup: %5.2f",
|
||||
uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg, "result value=<%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
|
||||
fResult[0], fResult[1], fResult[2], fResult[3],
|
||||
fResult[4], fResult[5], fResult[6], fResult[7],
|
||||
fResult[8], fResult[9], fResult[10], fResult[11],
|
||||
fResult[12], fResult[13], fResult[14], fResult[15]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
|
||||
sprintf(szMsg,"TestNeonOrthoInverseTransform3 End");
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
}
|
||||
@@ -0,0 +1,477 @@
|
||||
/*
|
||||
Applied Research Associates Inc. (c)2011
|
||||
|
||||
Redistribution and use in source and binary forms,
|
||||
with or without modification, are permitted provided that the
|
||||
following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the Applied Research Associates Inc nor the names
|
||||
of its contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "test_neon.h"
|
||||
#define SCE_PFX_USE_PERFCOUNTER
|
||||
#include "physics_effects.h"
|
||||
#include <arm_neon.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <android/log.h>
|
||||
|
||||
// This works with gcc
|
||||
#define SET_ALIGNMENT(alignment) __attribute__((__aligned__((alignment))))
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// pfxSolveLinearConstraintRowScalar
|
||||
//
|
||||
/// Performs
|
||||
///
|
||||
/// @param Input
|
||||
//----------------------------------------------------------------------------
|
||||
void pfxSolveLinearConstraintRowScalar(float *constraint, // 8 floats
|
||||
float *deltaLinearVelocityA, // 4 floats
|
||||
float *deltaAngularVelocityA, // 4 floats
|
||||
float massInvA, // 1 floats
|
||||
float *inertiaInvA, // 12 floats
|
||||
float *rA, // 4 floats
|
||||
float *deltaLinearVelocityB, // 4 floats
|
||||
float *deltaAngularVelocityB, // 4 floats
|
||||
float massInvB, // 1 floats
|
||||
float *inertiaInvB, // 12 floats
|
||||
float *rB) // 4 floats
|
||||
{
|
||||
//const PfxVector3 normal(pfxReadVector3(constraint.m_normal)); //fourth element is not zero and it need to be in assembly
|
||||
float SET_ALIGNMENT(64) normalRead[] = {constraint[0],constraint[1],constraint[2],0.0f};
|
||||
float *normal = &normalRead[0];
|
||||
//PfxFloat deltaImpulse = constraint.m_rhs;
|
||||
float deltaImpulse = constraint[4];
|
||||
//PfxVector3 dVA = deltaLinearVelocityA + cross(deltaAngularVelocityA,rA);
|
||||
float SET_ALIGNMENT(64) dVARead[] = {deltaLinearVelocityA[0] + (deltaAngularVelocityA[1]*rA[2] - deltaAngularVelocityA[2]*rA[1]),
|
||||
deltaLinearVelocityA[1] + (deltaAngularVelocityA[2]*rA[0] - deltaAngularVelocityA[0]*rA[2]),
|
||||
deltaLinearVelocityA[2] + (deltaAngularVelocityA[0]*rA[1] - deltaAngularVelocityA[1]*rA[0]),
|
||||
0.0f};
|
||||
float *dVA = &dVARead[0];
|
||||
//PfxVector3 dVB = deltaLinearVelocityB + cross(deltaAngularVelocityB,rB);
|
||||
float SET_ALIGNMENT(64) dVBRead[] = {deltaLinearVelocityB[0] + (deltaAngularVelocityB[1]*rB[2] - deltaAngularVelocityB[2]*rB[1]),
|
||||
deltaLinearVelocityB[1] + (deltaAngularVelocityB[2]*rB[0] - deltaAngularVelocityB[0]*rB[2]),
|
||||
deltaLinearVelocityB[2] + (deltaAngularVelocityB[0]*rB[1] - deltaAngularVelocityB[1]*rB[0]),
|
||||
0.0f};
|
||||
float *dVB = &dVBRead[0];
|
||||
//deltaImpulse -= constraint.m_jacDiagInv * dot(normal,dVA-dVB);
|
||||
deltaImpulse -= constraint[4] * (normal[0]*(dVA[0]-dVB[0]) + normal[1]*(dVA[1]-dVB[1]) + normal[2]*(dVA[2]-dVB[2]));
|
||||
//PfxFloat oldImpulse = constraint.m_accumImpulse;
|
||||
float oldImpulse = constraint[7];
|
||||
//constraint.m_accumImpulse = SCE_PFX_CLAMP(oldImpulse + deltaImpulse,constraint.m_lowerLimit,constraint.m_upperLimit);
|
||||
if((oldImpulse + deltaImpulse) > constraint[6]){
|
||||
if(constraint[6] > constraint[5]){
|
||||
constraint[7] = constraint[6];
|
||||
}else {
|
||||
constraint[7] = constraint[5];
|
||||
}
|
||||
} else {
|
||||
if(oldImpulse + deltaImpulse > constraint[5]){
|
||||
constraint[7] = oldImpulse + deltaImpulse;
|
||||
}else {
|
||||
constraint[7] = constraint[5];
|
||||
}
|
||||
}
|
||||
//deltaImpulse = constraint.m_accumImpulse - oldImpulse;
|
||||
deltaImpulse = constraint[7] - oldImpulse;
|
||||
//deltaLinearVelocityA += deltaImpulse * massInvA * normal;
|
||||
deltaLinearVelocityA[0] += deltaImpulse * massInvA * normal[0];
|
||||
deltaLinearVelocityA[1] += deltaImpulse * massInvA * normal[1];
|
||||
deltaLinearVelocityA[2] += deltaImpulse * massInvA * normal[2];
|
||||
deltaLinearVelocityA[3] += deltaImpulse * massInvA * normal[3];
|
||||
|
||||
//deltaAngularVelocityA += deltaImpulse * inertiaInvA * cross(rA,normal);
|
||||
deltaAngularVelocityA[0] += deltaImpulse * ((inertiaInvA[0] * (rA[1]*normal[2] - rA[2]*normal[1])) +
|
||||
(inertiaInvA[4] * (rA[2]*normal[0] - rA[0]*normal[2])) +//
|
||||
(inertiaInvA[8] * (rA[0]*normal[1] - rA[1]*normal[0])));
|
||||
deltaAngularVelocityA[1] += deltaImpulse * ((inertiaInvA[1] * (rA[1]*normal[2] - rA[2]*normal[1])) +
|
||||
(inertiaInvA[5] * (rA[2]*normal[0] - rA[0]*normal[2])) +
|
||||
(inertiaInvA[9] * (rA[0]*normal[1] - rA[1]*normal[0])));
|
||||
deltaAngularVelocityA[2] += deltaImpulse * ((inertiaInvA[2] * (rA[1]*normal[2] - rA[2]*normal[1])) +
|
||||
(inertiaInvA[6] * (rA[2]*normal[0] - rA[0]*normal[2])) +
|
||||
(inertiaInvA[10] * (rA[0]*normal[1] - rA[1]*normal[0])));
|
||||
deltaAngularVelocityA[3] += deltaImpulse * ((inertiaInvA[3] * (rA[1]*normal[2] - rA[2]*normal[1])) +
|
||||
(inertiaInvA[7] * (rA[2]*normal[0] - rA[0]*normal[2])) +
|
||||
(inertiaInvA[11] * (rA[0]*normal[1] - rA[1]*normal[0])));
|
||||
//deltaLinearVelocityB -= deltaImpulse * massInvB * normal;
|
||||
deltaLinearVelocityB[0] -= deltaImpulse * massInvB * normal[0];
|
||||
deltaLinearVelocityB[1] -= deltaImpulse * massInvB * normal[1];
|
||||
deltaLinearVelocityB[2] -= deltaImpulse * massInvB * normal[2];
|
||||
deltaLinearVelocityB[3] -= deltaImpulse * massInvB * normal[3];
|
||||
|
||||
//deltaAngularVelocityB -= deltaImpulse * inertiaInvB * cross(rB,normal);
|
||||
deltaAngularVelocityB[0] -= deltaImpulse * ((inertiaInvB[0] * (rB[1]*normal[2] - rB[2]*normal[1])) +
|
||||
(inertiaInvB[4] * (rB[2]*normal[0] - rB[0]*normal[2])) +
|
||||
(inertiaInvB[8] * (rB[0]*normal[1] - rB[1]*normal[0])));
|
||||
deltaAngularVelocityB[1] -= deltaImpulse * ((inertiaInvB[1] * (rB[1]*normal[2] - rB[2]*normal[1])) +
|
||||
(inertiaInvB[5] * (rB[2]*normal[0] - rB[0]*normal[2])) +
|
||||
(inertiaInvB[9] * (rB[0]*normal[1] - rB[1]*normal[0])));
|
||||
deltaAngularVelocityB[2] -= deltaImpulse * ((inertiaInvB[2] * (rB[1]*normal[2] - rB[2]*normal[1])) +
|
||||
(inertiaInvB[6] * (rB[2]*normal[0] - rB[0]*normal[2])) +
|
||||
(inertiaInvB[10] * (rB[0]*normal[1] - rB[1]*normal[0])));
|
||||
deltaAngularVelocityB[3] -= deltaImpulse * ((inertiaInvB[3] * (rB[1]*normal[2] - rB[2]*normal[1])) +
|
||||
(inertiaInvB[7] * (rB[2]*normal[0] - rB[0]*normal[2])) +
|
||||
(inertiaInvB[11] * (rB[0]*normal[1] - rB[1]*normal[0])));
|
||||
}
|
||||
|
||||
/*//////////////////////////////////////Source Code///////////////////////////////////////
|
||||
void pfxSolveLinearConstraintRow(
|
||||
PfxConstraintRow &constraint, // 8 floats
|
||||
PfxVector3 &deltaLinearVelocityA, // 4 floats
|
||||
PfxVector3 &deltaAngularVelocityA, // 4 floats
|
||||
PfxFloat massInvA, // 1 float
|
||||
const PfxMatrix3 &inertiaInvA, // 12 floats
|
||||
const PfxVector3 &rA, // 4 floats
|
||||
PfxVector3 &deltaLinearVelocityB, // 4 floats
|
||||
PfxVector3 &deltaAngularVelocityB, // 4 floats
|
||||
PfxFloat massInvB, // 1 float
|
||||
const PfxMatrix3 &inertiaInvB, // 12 floats
|
||||
const PfxVector3 &rB) // 4 floats
|
||||
{
|
||||
// PfxConstraintRow structure:
|
||||
// PfxFloat m_normal[3];
|
||||
// PfxFloat m_rhs;
|
||||
// PfxFloat m_jacDiagInv;
|
||||
// PfxFloat m_lowerLimit;
|
||||
// PfxFloat m_upperLimit;
|
||||
// PfxFloat m_accumImpulse;
|
||||
const PfxVector3 normal(pfxReadVector3(constraint.m_normal));
|
||||
PfxFloat deltaImpulse = constraint.m_rhs;
|
||||
PfxVector3 dVA = deltaLinearVelocityA + cross(deltaAngularVelocityA,rA);
|
||||
PfxVector3 dVB = deltaLinearVelocityB + cross(deltaAngularVelocityB,rB);
|
||||
deltaImpulse -= constraint.m_jacDiagInv * dot(normal,dVA-dVB);
|
||||
PfxFloat oldImpulse = constraint.m_accumImpulse;
|
||||
constraint.m_accumImpulse = SCE_PFX_CLAMP(oldImpulse + deltaImpulse,constraint.m_lowerLimit,constraint.m_upperLimit);
|
||||
deltaImpulse = constraint.m_accumImpulse - oldImpulse;
|
||||
deltaLinearVelocityA += deltaImpulse * massInvA * normal;
|
||||
deltaAngularVelocityA += deltaImpulse * inertiaInvA * cross(rA,normal);
|
||||
deltaLinearVelocityB -= deltaImpulse * massInvB * normal;
|
||||
deltaAngularVelocityB -= deltaImpulse * inertiaInvB * cross(rB,normal);
|
||||
}
|
||||
*/
|
||||
|
||||
void pfxSolveLinearConstraintRowNeonInline(float *constraint, // 8 floats
|
||||
float *deltaLinearVelocityA, // 4 floats
|
||||
float *deltaAngularVelocityA, // 4 floats
|
||||
float *massInvA, // 1 floats
|
||||
float *inertiaInvA, // 12 floats
|
||||
float *rA, // 4 floats
|
||||
float *deltaLinearVelocityB, // 4 floats
|
||||
float *deltaAngularVelocityB, // 4 floats
|
||||
float *massInvB, // 1 floats
|
||||
float *inertiaInvB, // 12 floats
|
||||
float *rB) // 4 floats
|
||||
{
|
||||
// "r" (constraint), //%0
|
||||
// "r" (deltaLinearVelocityA), //%1
|
||||
// "r" (deltaAngularVelocityA), //%2
|
||||
// "r" (massInvA), //%3
|
||||
// "r" (inertiaInvA), //%4
|
||||
// "r" (rA), //%5
|
||||
// "r" (deltaLinearVelocityB), //%6
|
||||
// "r" (deltaAngularVelocityB), //%7
|
||||
// "r" (massInvB), //%8
|
||||
// "r" (inertiaInvB), //%9
|
||||
// "r" (rB) //%10
|
||||
asm volatile
|
||||
(
|
||||
//Loads beforehand so the normal vector will be able to have zero in the 4th element
|
||||
"vld1.32 {q0}, [%1] \n\t" //LOAD => deltaLinearVelocityA ----------------> q0
|
||||
"vld1.32 {q1}, [%2] \n\t" //LOAD => deltaAngularVelocityA ---------------> q1
|
||||
"vld1.32 {q2}, [%6] \n\t" //LOAD => deltaLinearVelocityB ----------------> q2
|
||||
"vld1.32 {q3}, [%7] \n\t" //LOAD => deltaAngularVelocityB ---------------> q3
|
||||
//const PfxVector3 normal(pfxReadVector3(constraint.m_normal));
|
||||
"vld1.32 {q4}, [%0]! \n\t" //LOAD => constraint normal--------------------> q4
|
||||
"vdup.32 d10, d1[1] \n\t" //set 4th element on normal vector to zero
|
||||
"vtrn.32 d9, d10 \n\t" //LOAD => deltaImpulse ------------------------> q5 (d10[0] only)
|
||||
//PfxFloat deltaImpulse = constraint.m_rhs;
|
||||
"vld1.32 {d12}, [%0]! \n\t" //LOAD => constraint variables ----------------> q6
|
||||
"vld1.32 {d13[0]}, [%0]! \n\t" //constraint variables
|
||||
"vld1.32 {d13[1]}, [%0] \n\t" //constraint load of remaining variables loaded this way keep pointer to m_accumilate in order to store back in later
|
||||
//PfxVector3 dVA = deltaLinearVelocityA + cross(deltaAngularVelocityA,rA);
|
||||
//PfxVector3 dVB = deltaLinearVelocityB + cross(deltaAngularVelocityB,rB);
|
||||
"vld1.32 {d14[1]}, [%5] \n\t" //LOAD => rA for cross product use 1A ---------> q7 (save these for use later)
|
||||
"vld1.32 {d18[1]}, [%10] \n\t" //LOAD => rB for cross product use 1B ---------> q9
|
||||
"vld1.32 {d17[0]}, [%5]! \n\t" //LOAD => rA for cross product use 2A ---------> q8
|
||||
"vld1.32 {d21[0]}, [%10]! \n\t" //LOAD => rB for cross product use 2B ---------> q10
|
||||
"vld1.32 {d16}, [%5] \n\t" //rA 2
|
||||
"vld1.32 {d20}, [%10] \n\t" //rB 2
|
||||
"vld1.32 {d15[0]}, [%5]! \n\t" //rA 1
|
||||
"vld1.32 {d19[0]}, [%10]! \n\t" //rB 1
|
||||
"vld1.32 {d14[0]}, [%5]! \n\t" //rA 1
|
||||
"vld1.32 {d18[0]}, [%10]! \n\t" //rB 1
|
||||
"vld1.32 {d15[1]}, [%5] \n\t" //rA 1
|
||||
"vld1.32 {d19[1]}, [%10] \n\t" //rB 1
|
||||
"vld1.32 {d17[1]}, [%5]! \n\t" //rA 2
|
||||
"vld1.32 {d21[1]}, [%10]! \n\t" //rB 2
|
||||
"vdup.32 q12, d1[1] \n\t" //set deltaAngularVelocityB 2
|
||||
"vdup.32 q13, d1[1] \n\t" //set deltaAngularVelocityB 1
|
||||
"vdup.32 q14, d1[1] \n\t" //set deltaAngularVelocityA 1
|
||||
"vdup.32 q15, d1[1] \n\t" //set deltaAngularVelocityA 2
|
||||
"vadd.f32 q14, q14, q1 \n\t"
|
||||
"vadd.f32 q13, q13, q3 \n\t"
|
||||
"vadd.f32 q15, q15, q1 \n\t"
|
||||
"vadd.f32 q12, q12, q3 \n\t"
|
||||
"vrev64.32 d28, d28 \n\t" //set deltaAngularVelocityA 1
|
||||
"vrev64.32 d26, d26 \n\t" //set deltaAngularVelocityB 1
|
||||
"vtrn.32 d30, d31 \n\t" //set deltaAngularVelocityA 2
|
||||
"vtrn.32 d24, d25 \n\t" //set deltaAngularVelocityB 2
|
||||
"vtrn.32 d28, d29 \n\t" //set deltaAngularVelocityA 1
|
||||
"vtrn.32 d26, d27 \n\t" //set deltaAngularVelocityB 1
|
||||
"vrev64.32 d30, d30 \n\t" //set deltaAngularVelocityA 2
|
||||
"vrev64.32 d24, d24 \n\t" //set deltaAngularVelocityB 2
|
||||
"vmul.f32 q14, q7, q14 \n\t" //operation for cross product 1A
|
||||
"vmul.f32 q13, q9, q13 \n\t" //operation for cross product 1B
|
||||
"vmls.f32 q14, q8, q15 \n\t" //operation for cross product 2A
|
||||
"vmls.f32 q13, q10, q12 \n\t" //operation for cross product 2B
|
||||
"vadd.f32 q14, q14, q0 \n\t" //operation for adding cross to linearVelocityA
|
||||
"vadd.f32 q13, q13, q2 \n\t" //operation for adding cross to linearVelocityB
|
||||
//LOAD => dVA --------------------------------> q14
|
||||
//LOAD => dVB --------------------------------> q13
|
||||
//FREE q11, q12, q15, d11
|
||||
//deltaImpulse -= constraint.m_jacDiagInv * dot(normal,dVA-dVB);
|
||||
"vsub.f32 q11, q14, q13 \n\t" //TEMP q11 => dVA-dVB for dot product
|
||||
/*find*/ "vmul.f32 q11, q11, q4 \n\t" //operation for dot product
|
||||
/*fastest*/ "vpadd.f32 d22, d22, d22 \n\t" //operation for dot product
|
||||
/*dot*/ "vpadd.f32 d23, d23, d23 \n\t" //operation for dot product
|
||||
"vadd.f32 d22, d22, d23 \n\t" //operation for dot product
|
||||
"vmul.f32 d22, d22, d12[0] \n\t" //m_jacDiagInv times dot product result
|
||||
"vsub.f32 d10, d10, d22 \n\t" //subtract result from deltaImpule
|
||||
//PfxFloat oldImpulse = constraint.m_accumImpulse;
|
||||
"vdup.32 d11, d13[1] \n\t" //LOAD => oldImpulse -------------------------> q5 (d11 only)
|
||||
//constraint.m_accumImpulse = SCE_PFX_CLAMP(oldImpulse + deltaImpulse,constraint.m_lowerLimit,constraint.m_upperLimit);
|
||||
"vdup.32 d23, d13[0] \n\t" //TEMP q11 => m_upperLimit (d23 only)
|
||||
"vdup.32 d24, d12[1] \n\t" //TEMP q12 => m_lowerLimit (d34 only)
|
||||
"vadd.f32 d25, d10, d11 \n\t" //TEMP q12 => deltaImpulse + oldImplues (d25 only)
|
||||
"vmin.f32 d25, d25, d23 \n\t" //operation MIN(v,b)
|
||||
"vmax.f32 d22, d25, d24 \n\t" //operation MAX(a,MIN(v,b)
|
||||
"vst1.32 {d22[0]}, [%0] \n\t" //store m_accumImpulse (incremented so that it can be reloaded for cross product later)
|
||||
//deltaImpulse = constraint.m_accumImpulse - oldImpulse;
|
||||
"vsub.f32 d10, d22, d11 \n\t" //operation to calculate new deltaImpule
|
||||
//FREE q6, q11, q12, q13, q14, q15, d11
|
||||
//deltaLinearVelocityB -= deltaImpulse * massInvB * normal;
|
||||
//deltaLinearVelocityA += deltaImpulse * massInvA * normal;
|
||||
"vld1.32 {d11[0]}, [%3] \n\t" //LOAD => massInvA ---------------------------> q5 (d11[0] only)
|
||||
"vld1.32 {d11[1]}, [%8] \n\t" //LOAD => massInvB ---------------------------> q5 (d11[1] only)
|
||||
"vmul.f32 q11, q4, d11[0] \n\t" //TEMP q11 => operation normal times massInvA A
|
||||
"vmul.f32 q12, q4, d11[1] \n\t" //TEMP q12 => operation normal times massInvB B
|
||||
"vmul.f32 q11, q11, d10[0] \n\t" //TEMP q11 => operation result times DeltaImpulse A
|
||||
"vmul.f32 q12, q12, d10[0] \n\t" //TEMP q12 => operation result times DeltaImpulse B
|
||||
"vadd.f32 q0, q0, q11 \n\t" //operation create new deltaLinearVelocityA A
|
||||
"vsub.f32 q2, q2, q12 \n\t" //operation create new deltaLinearVelocityB B
|
||||
"vst1.32 {q0}, [%1] \n\t" //store the new deltaLinearVelocityA A
|
||||
"vst1.32 {q2}, [%6] \n\t" //store the new deltaLinearVelocityB B
|
||||
//FREE q0, q2, q6, q11, q12, q13, q14, q15, d11
|
||||
//deltaAngularVelocityA += deltaImpulse * inertiaInvA * cross(rA,normal);
|
||||
//deltaAngularVelocityB -= deltaImpulse * inertiaInvB * cross(rB,normal);
|
||||
"vdup.32 q14, d1[1] \n\t" //set normal cross load
|
||||
"vdup.32 q15, d1[1] \n\t" //
|
||||
"vadd.f32 q14, q14, q4 \n\t" //
|
||||
"vadd.f32 q15, q15, q4 \n\t" //
|
||||
"vrev64.32 d28, d28 \n\t" //
|
||||
"vtrn.32 d30, d31 \n\t" //
|
||||
"vtrn.32 d28, d29 \n\t" //
|
||||
"vrev64.32 d30, d30 \n\t" //
|
||||
"vmul.f32 q0, q8, q15 \n\t" //operation for cross product A
|
||||
"vmul.f32 q2, q10, q15 \n\t" //operation for cross product B
|
||||
"vmls.f32 q0, q14, q7 \n\t" //operation for cross product A
|
||||
"vmls.f32 q2, q14, q9 \n\t" //operation for cross product B
|
||||
//LOAD => cross product result A ------------> q0
|
||||
//LOAD => cross product result B ------------> q2
|
||||
//FREE q6, q7, q8, q9, q10, q11, q12, q13, q14, q15, d11
|
||||
|
||||
"vld1.32 {q13-q14}, [%4]! \n\t" //LOAD => inertiaInvA col0, col1 A ----------> q13, q14
|
||||
"vld1.32 {q9-q10}, [%9]! \n\t" //LOAD => inertiaInvB col0, col1 B -----------> q9, q10
|
||||
"vld1.32 {q15}, [%4] \n\t" //LOAD => inertiaInvA col2 A ----------------> q5
|
||||
"vld1.32 {q11}, [%9] \n\t" //LOAD => inertiaInvB col2 B -----------------> q11
|
||||
"vmul.f32 q13, q13, d0[0] \n\t" //operation inertiaInvA col0 = (col0) * (crossA elem0) A
|
||||
"vmul.f32 q9, q9, d4[0] \n\t" //operation inertiaInvB col0 = (col0) * (crossB elem0) B
|
||||
"vmla.f32 q13, q14, d0[1] \n\t" //operation inertiaInvA col1 = (col1) * (crossA elem1) A
|
||||
"vmla.f32 q9, q10, d4[1] \n\t" //operation inertiaInvB col1 = (col1) * (crossB elem1) B
|
||||
"vmla.f32 q13, q15, d1[0] \n\t" //operation inertiaInvA col2 = (col2) * (crossA elem2) A
|
||||
"vmla.f32 q9, q11, d5[0] \n\t" //operation inertiaInvB col2 = (col2) * (crossB elem2) B
|
||||
"vmul.f32 q13, q13, d10[0] \n\t" //operation inertiaInvA times deltaImpulse A
|
||||
"vmul.f32 q9, q9, d10[0] \n\t" //operation inertiaInvB times deltaImpulse B
|
||||
"vadd.f32 q1, q1, q13 \n\t" //operation accumulate the deltaAngularVelocityA A
|
||||
"vsub.f32 q3, q3, q9 \n\t" //operation accumulate the deltaAngularVelocityB B
|
||||
"vst1.32 {q1}, [%2] \n\t" //store deltaAngularVelocityA A
|
||||
"vst1.32 {q3}, [%7] \n\t" //store deltaAngularVelocityB B
|
||||
: // NO outputs! It is important to *not* put anything here. (Putting something here forces use of r0, which wreak havok
|
||||
: "r" (constraint), "r" (deltaLinearVelocityA), "r" (deltaAngularVelocityA), "r" (massInvA), "r" (inertiaInvA), "r" (rA), "r" (deltaLinearVelocityB), "r" (deltaAngularVelocityB), "r" (massInvB), "r" (inertiaInvB), "r" (rB) //inputs
|
||||
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" // clobbers
|
||||
);
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// TestNeonSolveLinearConstraintRow
|
||||
//
|
||||
/// Run timing study of the Linear Constraint Row Solver from above, writing the
|
||||
/// results to the Android verbose log.
|
||||
//----------------------------------------------------------------------------
|
||||
void TestNeonSolveLinearConstraintRow()
|
||||
{
|
||||
//Scalar
|
||||
float SET_ALIGNMENT(64) constraintDataScalar[] = {0.0f,1.0f,2.0f,3.0f,1.0f,2.0f,3.0f,4.0f};
|
||||
float SET_ALIGNMENT(64) deltaLinearVelocityADataScalar[] = {1.0f,2.0f,3.0f,0.0f};
|
||||
float SET_ALIGNMENT(64) deltaAngularVelocityADataScalar[] = {2.0f,3.0f,4.0f,0.0f};
|
||||
float SET_ALIGNMENT(64) inertiaInvADataScalar[] = {4.0f,5.0f,6.0f,0.0f,5.0f,6.0f,7.0f,0.0f,6.0f,7.0f,8.0f,0.0f};
|
||||
float SET_ALIGNMENT(64) rADataScalar[] = {5.0f,6.0f,7.0f,0.0f};
|
||||
float SET_ALIGNMENT(64) deltaLinearVelocityBDataScalar[] = {6.0f,7.0f,8.0f,0.0f};
|
||||
float SET_ALIGNMENT(64) deltaAngularVelocityBDataScalar[] = {9.0f,8.0f,7.0f,0.0f};
|
||||
float SET_ALIGNMENT(64) inertiaInvBDataScalar[] = {7.0f,6.0f,5.0f,0.0f,9.0f,8.0f,7.0f,0.0f,6.0f,7.0f,8.0f,0.0f};
|
||||
float SET_ALIGNMENT(64) rBDataScalar[] = {6.0f,5.0f,4.0f,0.0f};
|
||||
|
||||
float *constraintScalar = &constraintDataScalar[0];
|
||||
float *deltaLinearVelocityAScalar = &deltaLinearVelocityADataScalar[0];
|
||||
float *deltaAngularVelocityAScalar = &deltaAngularVelocityADataScalar[0];
|
||||
float massInvAScalar = 3.0f;
|
||||
float *inertiaInvAScalar = &inertiaInvADataScalar[0];
|
||||
float *rAScalar = &rADataScalar[0];
|
||||
float *deltaLinearVelocityBScalar = &deltaLinearVelocityBDataScalar[0];
|
||||
float *deltaAngularVelocityBScalar = &deltaAngularVelocityBDataScalar[0];
|
||||
float massInvBScalar = 8.0f;
|
||||
float *inertiaInvBScalar = &inertiaInvBDataScalar[0];
|
||||
float *rBScalar = &rBDataScalar[0];
|
||||
|
||||
//Neon
|
||||
float SET_ALIGNMENT(64) constraintDataNeon[] = {0.0f,1.0f,2.0f,3.0f,1.0f,2.0f,3.0f,4.0f};
|
||||
float SET_ALIGNMENT(64) deltaLinearVelocityADataNeon[] = {1.0f,2.0f,3.0f,0.0f};
|
||||
float SET_ALIGNMENT(64) deltaAngularVelocityADataNeon[] = {2.0f,3.0f,4.0f,0.0f};
|
||||
float SET_ALIGNMENT(64) massInvADataNeon[] = {3.0f};
|
||||
float SET_ALIGNMENT(64) inertiaInvADataNeon[] = {4.0f,5.0f,6.0f,0.0f,5.0f,6.0f,7.0f,0.0f,6.0f,7.0f,8.0f,0.0f};
|
||||
float SET_ALIGNMENT(64) rADataNeon[] = {5.0f,6.0f,7.0f,0.0f};
|
||||
float SET_ALIGNMENT(64) deltaLinearVelocityBDataNeon[] = {6.0f,7.0f,8.0f,0.0f};
|
||||
float SET_ALIGNMENT(64) deltaAngularVelocityBDataNeon[] = {9.0f,8.0f,7.0f,0.0f};
|
||||
float SET_ALIGNMENT(64) massInvBDataNeon[] = {8.0f};
|
||||
float SET_ALIGNMENT(64) inertiaInvBDataNeon[] = {7.0f,6.0f,5.0f,0.0f,9.0f,8.0f,7.0f,0.0f,6.0f,7.0f,8.0f,0.0f};
|
||||
float SET_ALIGNMENT(64) rBDataNeon[] = {6.0f,5.0f,4.0f,0.0f};
|
||||
|
||||
float *constraintNeon = &constraintDataNeon[0];
|
||||
float *deltaLinearVelocityANeon = &deltaLinearVelocityADataNeon[0];
|
||||
float *deltaAngularVelocityANeon = &deltaAngularVelocityADataNeon[0];
|
||||
float *massInvANeon = &massInvADataNeon[0];
|
||||
float *inertiaInvANeon = &inertiaInvADataNeon[0];
|
||||
float *rANeon = &rADataNeon[0];
|
||||
float *deltaLinearVelocityBNeon = &deltaLinearVelocityBDataNeon[0];
|
||||
float *deltaAngularVelocityBNeon = &deltaAngularVelocityBDataNeon[0];
|
||||
float *massInvBNeon = &massInvBDataNeon[0];
|
||||
float *inertiaInvBNeon = &inertiaInvBDataNeon[0];
|
||||
float *rBNeon = &rBDataNeon[0];
|
||||
|
||||
char szMsg[256];
|
||||
|
||||
sprintf(szMsg, "");
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg,"---------------------------------------");
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg,"TestNeonSolveLinearConstraintRow Start");
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
|
||||
|
||||
sce::PhysicsEffects::PfxPerfCounter pc;
|
||||
double dTimeSpan, dRefTimeSpan;;
|
||||
unsigned int uiNumTries = 10000000;
|
||||
unsigned int i;
|
||||
|
||||
// profile pfxSolveLinearConstraintRowScalar with direct memory return, c++ version
|
||||
pc.countBegin("");
|
||||
for(i = 0; i < uiNumTries; i++)
|
||||
{
|
||||
pfxSolveLinearConstraintRowScalar(
|
||||
constraintScalar,
|
||||
deltaLinearVelocityAScalar,
|
||||
deltaAngularVelocityAScalar,
|
||||
massInvAScalar,
|
||||
inertiaInvAScalar,
|
||||
rAScalar,
|
||||
deltaLinearVelocityBScalar,
|
||||
deltaAngularVelocityBScalar,
|
||||
massInvBScalar,
|
||||
inertiaInvBScalar,
|
||||
rBScalar
|
||||
);
|
||||
}
|
||||
pc.countEnd();
|
||||
dTimeSpan = pc.getCountTime(0);
|
||||
pc.resetCount();
|
||||
dRefTimeSpan = dTimeSpan;
|
||||
sprintf(szMsg, "Time to do %i calls for pfxSolveLinearConstraintRowScalar: %f secs, speedup: %5.2f",
|
||||
uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg, "constraint.m_accumImpulse = <%f>", constraintScalar[7]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg, "deltaLinearVelocityA = <%f,%f,%f,%f>",
|
||||
deltaLinearVelocityAScalar[0], deltaLinearVelocityAScalar[1], deltaLinearVelocityAScalar[2], deltaLinearVelocityAScalar[3]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg, "deltaAngularVelocityA = <%f,%f,%f,%f>",
|
||||
deltaAngularVelocityAScalar[0], deltaAngularVelocityAScalar[1], deltaAngularVelocityAScalar[2], deltaAngularVelocityAScalar[3]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg, "deltaLinearVelocityB = <%f,%f,%f,%f>",
|
||||
deltaLinearVelocityBScalar[0], deltaLinearVelocityBScalar[1], deltaLinearVelocityBScalar[2], deltaLinearVelocityBScalar[3]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg, "deltaAngularVelocityB = <%f,%f,%f,%f>",
|
||||
deltaAngularVelocityBScalar[0], deltaAngularVelocityBScalar[1], deltaAngularVelocityBScalar[2], deltaAngularVelocityBScalar[3]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
|
||||
|
||||
// profile NEON pfxSolveLinearConstraintRowNeonInline with direct memory return, inline assembly version
|
||||
pc.countBegin("");
|
||||
for(i = 0; i < uiNumTries; i++)
|
||||
{
|
||||
pfxSolveLinearConstraintRowNeonInline(constraintNeon,
|
||||
deltaLinearVelocityANeon,
|
||||
deltaAngularVelocityANeon,
|
||||
massInvANeon,
|
||||
inertiaInvANeon,
|
||||
rANeon,
|
||||
deltaLinearVelocityBNeon,
|
||||
deltaAngularVelocityBNeon,
|
||||
massInvBNeon,
|
||||
inertiaInvBNeon,
|
||||
rBNeon);
|
||||
}
|
||||
pc.countEnd();
|
||||
dTimeSpan = pc.getCountTime(0);
|
||||
pc.resetCount();
|
||||
sprintf(szMsg, "Time to do %i calls for pfxSolveLinearConstraintRowNeon: %f secs, speedup: %5.2f",
|
||||
uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg, "constraint.m_accumImpulse = <%f>", constraintNeon[7]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg, "deltaLinearVelocityA = <%f,%f,%f,%f>",
|
||||
deltaLinearVelocityANeon[0], deltaLinearVelocityANeon[1], deltaLinearVelocityANeon[2], deltaLinearVelocityANeon[3]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg, "deltaAngularVelocityA = <%f,%f,%f,%f>",
|
||||
deltaAngularVelocityANeon[0], deltaAngularVelocityANeon[1], deltaAngularVelocityANeon[2], deltaAngularVelocityANeon[3]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg, "deltaLinearVelocityB = <%f,%f,%f,%f>",
|
||||
deltaLinearVelocityBNeon[0], deltaLinearVelocityBNeon[1], deltaLinearVelocityBNeon[2], deltaLinearVelocityBNeon[3]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg, "deltaAngularVelocityB = <%f,%f,%f,%f>",
|
||||
deltaAngularVelocityBNeon[0], deltaAngularVelocityBNeon[1], deltaAngularVelocityBNeon[2], deltaAngularVelocityBNeon[3]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
|
||||
|
||||
sprintf(szMsg,"TestNeonSolveLinearConstraintRow End");
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
}
|
||||
@@ -0,0 +1,150 @@
|
||||
/*
|
||||
Applied Research Associates Inc. (c)2011
|
||||
|
||||
Redistribution and use in source and binary forms,
|
||||
with or without modification, are permitted provided that the
|
||||
following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the Applied Research Associates Inc nor the names
|
||||
of its contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "test_neon.h"
|
||||
#define SCE_PFX_USE_PERFCOUNTER
|
||||
#include "physics_effects.h"
|
||||
#include <arm_neon.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <android/log.h>
|
||||
|
||||
// This works with gcc
|
||||
#define SET_ALIGNMENT(alignment) __attribute__((__aligned__((alignment))))
|
||||
|
||||
// assembly implementations
|
||||
extern "C"
|
||||
{
|
||||
void Transform3OperatorMultiplyNeon(float *trfm, float *vec3, float *pfResult);
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// Transform3OperatorMultiplyScalar
|
||||
//
|
||||
/// Performs a multiply operation on a tranform3 using scalar math, storing the
|
||||
/// result directly into system memory.
|
||||
///
|
||||
/// @param trfm Input transform. Must point to 4x 4 float values
|
||||
/// @param vec3 Input vector3. Must point to 4 float values
|
||||
/// @param pfResult [in] pointer to a float(vector3). [out] Contains the result
|
||||
//----------------------------------------------------------------------------
|
||||
//inline const Vector3 Transform3::operator *( const Vector3 & vec ) const
|
||||
//{
|
||||
// return Vector3(
|
||||
// ( ( ( mCol0.getX() * vec.getX() ) + ( mCol1.getX() * vec.getY() ) ) + ( mCol2.getX() * vec.getZ() ) ),
|
||||
// ( ( ( mCol0.getY() * vec.getX() ) + ( mCol1.getY() * vec.getY() ) ) + ( mCol2.getY() * vec.getZ() ) ),
|
||||
// ( ( ( mCol0.getZ() * vec.getX() ) + ( mCol1.getZ() * vec.getY() ) ) + ( mCol2.getZ() * vec.getZ() ) )
|
||||
// );
|
||||
//}
|
||||
void Transform3OperatorMultiplyScalar(float *trfm, float *vec3, float *pfResult)
|
||||
{
|
||||
pfResult[0] = ( ( ( trfm[0] * vec3[0] ) + ( trfm[4] * vec3[1] ) ) + ( trfm[8] * vec3[2] ) );
|
||||
pfResult[1] = ( ( ( trfm[1] * vec3[0] ) + ( trfm[5] * vec3[1] ) ) + ( trfm[9] * vec3[2] ) );
|
||||
pfResult[2] = ( ( ( trfm[2] * vec3[0] ) + ( trfm[6] * vec3[1] ) ) + ( trfm[10] * vec3[2] ) );
|
||||
pfResult[3] = 0.0f;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// TestNeonTransform3OperatorMultiply
|
||||
//
|
||||
/// Run timing study of the Tranform3 multiply operator from above, writing the
|
||||
/// results to the Android verbose log.
|
||||
//----------------------------------------------------------------------------
|
||||
void TestNeonTransform3OperatorMultiply()
|
||||
{
|
||||
float SET_ALIGNMENT(64) data1[] = {float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
|
||||
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
|
||||
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
|
||||
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f};
|
||||
float SET_ALIGNMENT(64) data2[] = {float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f};
|
||||
|
||||
float *trfm = &data1[0];
|
||||
float *vec3 = &data2[0];
|
||||
|
||||
|
||||
char szMsg[256];
|
||||
|
||||
sprintf(szMsg, "");
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg,"---------------------------------------");
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg,"TestNeonTransform3OperatorMultiply Start");
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg, "Test input vec3: <%f,%f,%f,%f>",
|
||||
vec3[0], vec3[1], vec3[2], vec3[3]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg, "Test input trfm: <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>>",
|
||||
trfm[0], trfm[1], trfm[2], trfm[3],
|
||||
trfm[4], trfm[5], trfm[6], trfm[7],
|
||||
trfm[8], trfm[9], trfm[10], trfm[11],
|
||||
trfm[12], trfm[13], trfm[14], trfm[15]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
|
||||
float SET_ALIGNMENT(64) fResult[4];
|
||||
|
||||
|
||||
sce::PhysicsEffects::PfxPerfCounter pc;
|
||||
double dTimeSpan, dRefTimeSpan;;
|
||||
unsigned int uiNumTries = 10000000;
|
||||
unsigned int i;
|
||||
|
||||
// profile scalar Transform3OperatorMultiplyScalar with direct memory return, c++ version
|
||||
fResult[0] = 0.0f;
|
||||
pc.countBegin("");
|
||||
for(i = 0; i < uiNumTries; i++)
|
||||
{
|
||||
Transform3OperatorMultiplyScalar(trfm, vec3, fResult);
|
||||
}
|
||||
pc.countEnd();
|
||||
dTimeSpan = pc.getCountTime(0);
|
||||
pc.resetCount();
|
||||
dRefTimeSpan = dTimeSpan;
|
||||
sprintf(szMsg, "Time to do %i calls for Transform3OperatorMultiplyScalar: %f secs, speedup: %5.2f, result value=<%f,%f,%f,%f>",
|
||||
uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2], fResult[3]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
|
||||
|
||||
// profile NEON Transform3OperatorMultiplyNeon with direct memory return, assembly version
|
||||
fResult[0] = 0.0f;
|
||||
pc.countBegin("");
|
||||
for(i = 0; i < uiNumTries; i++)
|
||||
{
|
||||
Transform3OperatorMultiplyNeon(trfm, vec3, fResult);
|
||||
}
|
||||
pc.countEnd();
|
||||
dTimeSpan = pc.getCountTime(0);
|
||||
pc.resetCount();
|
||||
sprintf(szMsg, "Time to do %i calls for Transform3OperatorMultiplyNeon: %f secs, speedup: %5.2f, result value=<%f,%f,%f,%f>",
|
||||
uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2], fResult[3]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
|
||||
|
||||
sprintf(szMsg,"TestNeonTransform3OperatorMultiply End");
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
}
|
||||
@@ -0,0 +1,154 @@
|
||||
/*
|
||||
Applied Research Associates Inc. (c)2011
|
||||
|
||||
Redistribution and use in source and binary forms,
|
||||
with or without modification, are permitted provided that the
|
||||
following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the Applied Research Associates Inc nor the names
|
||||
of its contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "test_neon.h"
|
||||
#define SCE_PFX_USE_PERFCOUNTER
|
||||
#include "physics_effects.h"
|
||||
#include <arm_neon.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <android/log.h>
|
||||
|
||||
// This works with gcc
|
||||
#define SET_ALIGNMENT(alignment) __attribute__((__aligned__((alignment))))
|
||||
|
||||
// assembly implementations
|
||||
extern "C"
|
||||
{
|
||||
void TransposeMatrix3Neon(float *mCol, float *pfResult);
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// TransposeMatrix3Scalar
|
||||
//
|
||||
/// Performs a transpose on a matrix3 using scalar math, storing the
|
||||
/// result directly into system memory.
|
||||
///
|
||||
/// @param mCol Input matrix. Must point to 3x 4 float values
|
||||
/// @param pfResult [in] pointer to a float(matrix3). [out] Contains the result
|
||||
//----------------------------------------------------------------------------
|
||||
//inline const Matrix3 transpose( const Matrix3 & mat )
|
||||
//{
|
||||
// return Matrix3(
|
||||
// Vector3( mat.getCol0().getX(), mat.getCol1().getX(), mat.getCol2().getX() ),
|
||||
// Vector3( mat.getCol0().getY(), mat.getCol1().getY(), mat.getCol2().getY() ),
|
||||
// Vector3( mat.getCol0().getZ(), mat.getCol1().getZ(), mat.getCol2().getZ() )
|
||||
// );
|
||||
//}
|
||||
void TransposeMatrix3Scalar(float *mCol, float *pfResult)
|
||||
{
|
||||
pfResult[0] = mCol[0];
|
||||
pfResult[1] = mCol[4];
|
||||
pfResult[2] = mCol[8];
|
||||
pfResult[3] = 0.0f;
|
||||
|
||||
pfResult[4] = mCol[1];
|
||||
pfResult[5] = mCol[5];
|
||||
pfResult[6] = mCol[9];
|
||||
pfResult[7] = 0.0f;
|
||||
|
||||
pfResult[8] = mCol[2];
|
||||
pfResult[9] = mCol[6];
|
||||
pfResult[10] = mCol[10];
|
||||
pfResult[11] = 0.0f;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// TestNeonTransposeMatrix3
|
||||
//
|
||||
/// Run timing study of the matrix3 transpose functions above, writing the
|
||||
/// results to the Android verbose log.
|
||||
//----------------------------------------------------------------------------
|
||||
void TestNeonTransposeMatrix3()
|
||||
{
|
||||
float SET_ALIGNMENT(64) data[] = {float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
|
||||
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
|
||||
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f};
|
||||
float *mCol = &data[0];
|
||||
|
||||
char szMsg[256];
|
||||
|
||||
sprintf(szMsg, "");
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg,"---------------------------------------");
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg,"TestNeonTransposeMatrix3 Start");
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
sprintf(szMsg, "Test input mCol: <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
|
||||
mCol[0], mCol[1], mCol[2], mCol[3],
|
||||
mCol[4], mCol[5], mCol[6], mCol[7],
|
||||
mCol[8], mCol[9], mCol[10], mCol[11]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
|
||||
float SET_ALIGNMENT(64) fResult[12];
|
||||
|
||||
|
||||
sce::PhysicsEffects::PfxPerfCounter pc;
|
||||
double dTimeSpan, dRefTimeSpan;;
|
||||
unsigned int uiNumTries = 10000000;
|
||||
unsigned int i;
|
||||
|
||||
// profile scalar TransposeMatrix3Scalar with direct memory return, c++ version
|
||||
fResult[0] = 0.0f;
|
||||
pc.countBegin("");
|
||||
for(i = 0; i < uiNumTries; i++)
|
||||
{
|
||||
TransposeMatrix3Scalar(mCol, fResult);
|
||||
}
|
||||
pc.countEnd();
|
||||
dTimeSpan = pc.getCountTime(0);
|
||||
pc.resetCount();
|
||||
dRefTimeSpan = dTimeSpan;
|
||||
sprintf(szMsg, "Time to do %i calls for TransposeMatrix3Scalar: %f secs, speedup: %5.2f, result value=<%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
|
||||
uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2], fResult[3],
|
||||
fResult[4], fResult[5], fResult[6], fResult[7],
|
||||
fResult[8], fResult[9], fResult[10], fResult[11]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
|
||||
|
||||
// profile NEON TransposeMatrix3Neon with direct memory return, assembly version
|
||||
fResult[0] = 0.0f;
|
||||
pc.countBegin("");
|
||||
for(i = 0; i < uiNumTries; i++)
|
||||
{
|
||||
TransposeMatrix3Neon(mCol, fResult);
|
||||
}
|
||||
pc.countEnd();
|
||||
dTimeSpan = pc.getCountTime(0);
|
||||
pc.resetCount();
|
||||
sprintf(szMsg, "Time to do %i calls for TransposeMatrix3Neon: %f secs, speedup: %5.2f, result value=<%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
|
||||
uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2], fResult[3],
|
||||
fResult[4], fResult[5], fResult[6], fResult[7],
|
||||
fResult[8], fResult[9], fResult[10], fResult[11]);
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
|
||||
|
||||
sprintf(szMsg,"TestNeonTransposeMatrix3 End");
|
||||
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
|
||||
}
|
||||
Reference in New Issue
Block a user