Add PhysicsEffects to Extras. The build is only tested on Windows and Android.

The Android/NEON optimized version of Physics Effects is thanks to Graham Rhodes and Anthony Hamilton, See Issue 587
This commit is contained in:
erwin.coumans
2012-03-05 04:59:58 +00:00
parent 6cf8dfc202
commit a93a661b94
462 changed files with 86626 additions and 0 deletions

View File

@@ -0,0 +1,146 @@
@
@ Applied Research Associates Inc. (c)2011
@
@ Redistribution and use in source and binary forms,
@ with or without modification, are permitted provided that the
@ following conditions are met:
@ * Redistributions of source code must retain the above copyright
@ notice, this list of conditions and the following disclaimer.
@ * Redistributions in binary form must reproduce the above copyright
@ notice, this list of conditions and the following disclaimer in the
@ documentation and/or other materials provided with the distribution.
@ * Neither the name of the Applied Research Associates Inc nor the names
@ of its contributors may be used to endorse or promote products derived
@ from this software without specific prior written permission.
@
@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
@ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
@ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
@ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
@ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
@ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
@ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
@ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
@ POSSIBILITY OF SUCH DAMAGE.
@
.syntax unified
.arch armv7-a
.fpu neon
.thumb
.text
.align 2
@ cross product, result stored directly to memory
.global CrossProductNeonResultInMemoryAssembly
.thumb_func
CrossProductNeonResultInMemoryAssembly:
.fnstart
vld1.32 {d18,d19}, [r1] @ input <x2,y2,z2,w2> = d18,d19
vld1.32 {d16,d17}, [r0] @ input <x1,y1,z1,w1> = d16,d17
@ rearrange inputs into convenient order
vtrn.32 d18,d19 @ q9 = <x2,z2,y2,w2> = d18,d19
vrev64.32 d16,d16 @ q8 = <y1,x1,z1,w1> = d16,d17
vrev64.32 d18,d18 @ q9 = <z2,x2,y2,w2> = d18,d19
vtrn.32 d16,d17 @ q8 = <y1,z1,x1,w1> = d16,d17
@ perform first half of cross product using rearranged inputs
vmul.f32 q10, q8, q9 @ q10 = <y1*z2,z1*x2,x1*y2,w1*w2>
@ rearrange inputs again
vtrn.32 d18,d19 @ q9 = <z2,y2,x2,w2> = d18,d19
vrev64.32 d16,d16 @ q8 = <z1,y1,x1,w1> = d16,d17
vrev64.32 d18,d18 @ q9 = <y2,z2,x2,w2> = d18,d19
vtrn.32 d16,d17 @ q8 = <z1,x1,y1,w1> = d16,d17
@ perform last half of cross product using rearranged inputs
vmls.f32 q10, q8, q9 @ q10 = <y1*z2-y2*z1,z1*x2-z2*x1,x1*y2-x2*y1,w1*w2-w2*w1>
@ and store the result
vst1.32 {q10}, [r2]
bx lr
.fnend
@ cross product, result stored directly to memory, version 2
.global CrossProductNeonResultInMemoryAssembly2
.thumb_func
CrossProductNeonResultInMemoryAssembly2:
.fnstart
vld1.32 {d18[1]}, [r1]!
vld1.32 {d19[0]}, [r1]!
vld1.32 {d18[0]}, [r1]!
vld1.32 {d19[1]}, [r1] @ q9 = <z2,x2,y2,w2> = d18,d19
vld1.32 {d17[0]}, [r0]!
vld1.32 {d16}, [r0]!
vld1.32 {d17[1]}, [r0] @ q8 = <y1,z1,x1,w1> = d16,d17
@ perform first half of cross product using rearranged inputs
vmul.f32 q10, q8, q9 @ q10 = <y1*z2,z1*x2,x1*y2,w1*w2>
@ rearrange inputs for second half
vtrn.32 d18,d19 @ q9 = <z2,y2,x2,w2> = d18,d19
vrev64.32 d16,d16 @ q8 = <z1,y1,x1,w1> = d16,d17
vrev64.32 d18,d18 @ q9 = <y2,z2,x2,w2> = d18,d19
vtrn.32 d16,d17 @ q8 = <z1,x1,y1,w1> = d16,d17
@ perform last half of cross product using rearranged inputs
vmls.f32 q10, q8, q9 @ q10 = <y1*z2-y2*z1,z1*x2-z2*x1,x1*y2-x2*y1,w1*w2-w2*w1>
@ and store the result
vst1.32 {q10}, [r2]
bx lr
.fnend
.global CrossProductNeonResultInMemoryAssembly3
.thumb_func
CrossProductNeonResultInMemoryAssembly3:
.fnstart
vld1.32 {d18[1]}, [r1]!
vld1.32 {d19[0]}, [r1]!
vld1.32 {d18[0]}, [r1]!
vld1.32 {d19[1]}, [r1]! @ q9 = <z2,x2,y2,w2> = d18,d19
vld1.32 {d17[0]}, [r0]!
vld1.32 {d16}, [r0]!
vld1.32 {d17[1]}, [r0]! @ q8 = <y1,z1,x1,w1> = d16,d17
@ perform first half of cross product using rearranged inputs
vmul.f32 q10, q8, q9 @ q10 = <y1*z2,z1*x2,x1*y2,w1*w2>
vld1.32 {d18[1]}, [r0]!
vld1.32 {d19[0]}, [r0]!
vld1.32 {d18[0]}, [r0]!
vld1.32 {d19[1]}, [r0]! @ q9 = <z2,x2,y2,w2> = d18,d19
vld1.32 {d17[0]}, [r1]!
vld1.32 {d16}, [r1]!
vld1.32 {d17[1]}, [r1]! @ q8 = <y1,z1,x1,w1> = d16,d17
@ perform last half of cross product using rearranged inputs
vmls.f32 q10, q8, q9 @ q10 = <y1*z2-y2*z1,z1*x2-z2*x1,x1*y2-x2*y1,w1*w2-w2*w1>
@ and store the result
vst1.32 {q10}, [r2]
bx lr
.fnend
@ cross product, result stored directly to memory, version 3
@ .global CrossProductNeonResultInMemoryAssembly3
@ .thumb_func
@CrossProductNeonResultInMemoryAssembly3:
@ .fnstart
@ ldr r3, table
@ vld1.32 {d18,d19}, [r1] @ input <x2,y2,z2,w2> = d18,d19
@ vld1.32 {d16,d17}, [r0] @ input <x1,y1,z1,w1> = d16,d17
@ rearrange inputs into convenient order
@ vld1.32 {q10}, [r3]!
@ vld1.32 {q11}, [r3]
@ vtbl.8 q12, {d18,d19}, q10
@ vtbl.8 q13, {d16,d17}, q11
@ perform first half of cross product using rearranged inputs
@ vmul.f32 q14, q12, q13
@ rearrange inputs again
@ vtbl.8 q12, {d18,d19}, q11
@ vtbl.8 q13, {d16,d17}, q10
@ perform last half of cross product using rearranged inputs
@ vmls.f32 q14, q12, q13
@ and store the result
@ vst1.32 {q14}, [r2]
@ bx lr
@ .fnend
@table:
@z2,x2,y2,w2
@ dcb 08,09,10,11,00,01,02,03,04,05,06,07,00,00,00,00
@y1,z1,x1,w1
@ dcb 04,05,06,07,08,09,10,11,00,01,02,03,00,00,00,00

View File

@@ -0,0 +1,98 @@
@
@ Applied Research Associates Inc. (c)2011
@
@ Redistribution and use in source and binary forms,
@ with or without modification, are permitted provided that the
@ following conditions are met:
@ * Redistributions of source code must retain the above copyright
@ notice, this list of conditions and the following disclaimer.
@ * Redistributions in binary form must reproduce the above copyright
@ notice, this list of conditions and the following disclaimer in the
@ documentation and/or other materials provided with the distribution.
@ * Neither the name of the Applied Research Associates Inc nor the names
@ of its contributors may be used to endorse or promote products derived
@ from this software without specific prior written permission.
@
@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
@ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
@ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
@ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
@ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
@ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
@ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
@ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
@ POSSIBILITY OF SUCH DAMAGE.
@
.syntax unified
.arch armv7-a
.fpu neon
.thumb
.text
.align 2
@ dot product, result returned in register
.global DotProductNeonResultInRegisterAssembly
.thumb_func
DotProductNeonResultInRegisterAssembly:
.fnstart
vld1.32 {d16,d17}, [r0] @ input <x1,y1,z1,w1>
vld1.32 {d18,d19}, [r1] @ input <x2,y2,z2,w2>
vmul.f32 d14, d16, d18 @ d14=<x1*x2,y1*y2>
vmla.f32 d14, d17, d19 @ d14=d14+<z1*z2,w1*w2>=<x1*x2+z1*z2,y1*y2+w1*w2>
vpadd.f32 d14, d14, d14 @ add the two halves of d14 together, result in each lane
vmov.f32 r0, s28 @ s28 aliases the first lane of d14
bx lr
.fnend
@ dot product, result stored directly to memory
.global DotProductNeonResultInMemoryAssembly
.thumb_func
DotProductNeonResultInMemoryAssembly:
.fnstart
vld1.32 {d16,d17}, [r0] @ input <x1,y1,z1,w1>
vld1.32 {d18,d19}, [r1] @ input <x2,y2,z2,w2>
vmul.f32 d14, d16, d18 @ d14=<x1*x2,y1*y2>
@ non-fused multiple accumulate
vmla.f32 d14, d17, d19 @ d14=d14+<z1*z2,w1*w2>=<x1*x2+z1*z2,y1*y2+w1*w2>
@ fused multiple accumulate - not recognized by GNU assembler
@@ vfma.f32 {d14}, d17, d19 @ d14=d14+<z1*z2,w1*w2>=<x1*x2+z1*z2,y1*y2+w1*w2>
vpadd.f32 d14, d14, d14 @ add the two halves of d14 together, same result in each lane
vst1.32 {d14}, [r2]
bx lr
.fnend
@ dot product, result stored directly to memory, all inputs are contiguous in memory
.global DotProductNeonResultInMemoryAssembly2
.thumb_func
DotProductNeonResultInMemoryAssembly2:
.fnstart
vld1.32 {d16-d19}, [r0] @ input q8=<x1,y1,z1,w1>, q9=<x2,y2,z2,w2>
vmul.f32 d14, d16, d18 @ d14=<x1*x2,y1*y2>
@ non-fused multiple accumulate
vmla.f32 d14, d17, d19 @ d14=d14+<z1*z2,w1*w2>=<x1*x2+z1*z2,y1*y2+w1*w2>
@ fused multiple accumulate - not recognized by GNU assembler
@@ vfma.f32 d14, d17, d19 @ d14=d14+<z1*z2,w1*w2>=<x1*x2+z1*z2,y1*y2+w1*w2>
vpadd.f32 d14, d14, d14 @ add the two halves of d14 together, same result in each lane
vst1.32 {d14}, [r1]
bx lr
.fnend
@ same as DotProductNeonResultInMemoryAssembly2, but alternate methodology to compute result
.global DotProductNeonResultInMemoryAssembly2b
.thumb_func
DotProductNeonResultInMemoryAssembly2b:
.fnstart
vld1.32 {d14-d17}, [r0] @ input q7=<x1,y1,z1,w1>, q8=<x2,y2,z2,w2>
vmul.f32 q7, q7, q8 @ q7 = <x1*x2,y1*y2,z1*z2,w1*w2> = d14,d15 = s28,s29,s30,s31
vpadd.f32 d14,d14,d14 @ add elements of d14 together=<x1*x2+y1*y2,x1*x2+y1*y2>
vpadd.f32 d15,d15,d15 @ add elements of d14 together=<z1*z2+w1*w2,z1*z2+w1*w2>
vadd.f32 d14,d14,d15 @ add d14 and d15 together, final result in both lanes
vst1.32 {d14}, [r1]
bx lr
.fnend

View File

@@ -0,0 +1,62 @@
@
@ Applied Research Associates Inc. (c)2011
@
@ Redistribution and use in source and binary forms,
@ with or without modification, are permitted provided that the
@ following conditions are met:
@ * Redistributions of source code must retain the above copyright
@ notice, this list of conditions and the following disclaimer.
@ * Redistributions in binary form must reproduce the above copyright
@ notice, this list of conditions and the following disclaimer in the
@ documentation and/or other materials provided with the distribution.
@ * Neither the name of the Applied Research Associates Inc nor the names
@ of its contributors may be used to endorse or promote products derived
@ from this software without specific prior written permission.
@
@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
@ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
@ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
@ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
@ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
@ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
@ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
@ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
@ POSSIBILITY OF SUCH DAMAGE.
@
.syntax unified
.arch armv7-a
.fpu neon
.thumb
.text
.align 2
@ matrix3 operator *, result stored directly to memory
.global Matrix3OperatorMultiplyNeon
.thumb_func
Matrix3OperatorMultiplyNeon:
.fnstart
vld1.32 {d16-d19}, [r0]! @ load first eight elements of matrix 0
vld1.32 {d20-d21}, [r0] @ load second eight elements of matrix 0
vld1.32 {d0-d3}, [r1]! @ load first eight elements of matrix 1
vld1.32 {d4-d5}, [r1] @ load second eight elements of matrix 1
vmul.f32 q12, q8, d0[0] @ rslt col0 = (mat0 col0) * (mat1 col0 elt0)
vmul.f32 q13, q8, d2[0] @ rslt col1 = (mat0 col0) * (mat1 col1 elt0)
vmul.f32 q14, q8, d4[0] @ rslt col2 = (mat0 col0) * (mat1 col2 elt0)
vmla.f32 q12, q9, d0[1] @ rslt col0 += (mat0 col1) * (mat1 col0 elt1)
vmla.f32 q13, q9, d2[1] @ rslt col1 += (mat0 col1) * (mat1 col1 elt1)
vmla.f32 q14, q9, d4[1] @ rslt col2 += (mat0 col1) * (mat1 col2 elt1)
vmla.f32 q12, q10, d1[0] @ rslt col0 += (mat0 col2) * (mat1 col0 elt2)
vmla.f32 q13, q10, d3[0] @ rslt col1 += (mat0 col2) * (mat1 col1 elt2)
vmla.f32 q14, q10, d5[0] @ rslt col2 += (mat0 col2) * (mat1 col2 elt2)
vst1.32 {d24-d27}, [r2]! @ store first eight elements of result
vst1.32 {d28-d29}, [r2] @ store second eight elements of result
bx lr
.fnend

View File

@@ -0,0 +1,71 @@
@
@ Applied Research Associates Inc. (c)2011
@
@ Redistribution and use in source and binary forms,
@ with or without modification, are permitted provided that the
@ following conditions are met:
@ * Redistributions of source code must retain the above copyright
@ notice, this list of conditions and the following disclaimer.
@ * Redistributions in binary form must reproduce the above copyright
@ notice, this list of conditions and the following disclaimer in the
@ documentation and/or other materials provided with the distribution.
@ * Neither the name of the Applied Research Associates Inc nor the names
@ of its contributors may be used to endorse or promote products derived
@ from this software without specific prior written permission.
@
@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
@ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
@ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
@ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
@ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
@ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
@ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
@ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
@ POSSIBILITY OF SUCH DAMAGE.
@
.syntax unified
.arch armv7-a
.fpu neon
.thumb
.text
.align 2
@ matrix4 operator *, result stored directly to memory
.global Matrix4OperatorMultiplyNeon
.thumb_func
Matrix4OperatorMultiplyNeon:
.fnstart
vld1.32 {d16-d19}, [r0]! @ load first eight elements of matrix 0
vld1.32 {d20-d23}, [r0] @ load second eight elements of matrix 0
vld1.32 {d0-d3}, [r1]! @ load first eight elements of matrix 1
vld1.32 {d4-d7}, [r1] @ load second eight elements of matrix 1
vmul.f32 q12, q8, d0[0] @ rslt col0 = (mat0 col0) * (mat1 col0 elt0)
vmul.f32 q13, q8, d2[0] @ rslt col1 = (mat0 col0) * (mat1 col1 elt0)
vmul.f32 q14, q8, d4[0] @ rslt col2 = (mat0 col0) * (mat1 col2 elt0)
vmul.f32 q15, q8, d6[0] @ rslt col3 = (mat0 col0) * (mat1 col3 elt0)
vmla.f32 q12, q9, d0[1] @ rslt col0 += (mat0 col1) * (mat1 col0 elt1)
vmla.f32 q13, q9, d2[1] @ rslt col1 += (mat0 col1) * (mat1 col1 elt1)
vmla.f32 q14, q9, d4[1] @ rslt col2 += (mat0 col1) * (mat1 col2 elt1)
vmla.f32 q15, q9, d6[1] @ rslt col3 += (mat0 col1) * (mat1 col3 elt1)
vmla.f32 q12, q10, d1[0] @ rslt col0 += (mat0 col2) * (mat1 col0 elt2)
vmla.f32 q13, q10, d3[0] @ rslt col1 += (mat0 col2) * (mat1 col1 elt2)
vmla.f32 q14, q10, d5[0] @ rslt col2 += (mat0 col2) * (mat1 col2 elt2)
vmla.f32 q15, q10, d7[0] @ rslt col3 += (mat0 col2) * (mat1 col2 elt2)
vmla.f32 q12, q11, d1[1] @ rslt col0 += (mat0 col3) * (mat1 col0 elt3)
vmla.f32 q13, q11, d3[1] @ rslt col1 += (mat0 col3) * (mat1 col1 elt3)
vmla.f32 q14, q11, d5[1] @ rslt col2 += (mat0 col3) * (mat1 col2 elt3)
vmla.f32 q15, q11, d7[1] @ rslt col3 += (mat0 col3) * (mat1 col3 elt3)
vst1.32 {d24-d27}, [r2]! @ store first eight elements of result
vst1.32 {d28-d31}, [r2] @ store second eight elements of result
bx lr
.fnend

View File

@@ -0,0 +1,81 @@
@
@ Applied Research Associates Inc. (c)2011
@
@ Redistribution and use in source and binary forms,
@ with or without modification, are permitted provided that the
@ following conditions are met:
@ * Redistributions of source code must retain the above copyright
@ notice, this list of conditions and the following disclaimer.
@ * Redistributions in binary form must reproduce the above copyright
@ notice, this list of conditions and the following disclaimer in the
@ documentation and/or other materials provided with the distribution.
@ * Neither the name of the Applied Research Associates Inc nor the names
@ of its contributors may be used to endorse or promote products derived
@ from this software without specific prior written permission.
@
@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
@ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
@ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
@ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
@ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
@ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
@ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
@ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
@ POSSIBILITY OF SUCH DAMAGE.
@
.syntax unified
.arch armv7-a
.fpu neon
.thumb
.text
.align 2
@ orthoInverse of a transform3, result stored directly to memory
.global OrthoInverseTransform3Neon
.thumb_func
OrthoInverseTransform3Neon:
.fnstart
@way 1
@ vld1.f32 {d0-d3}, [r0]! @ load first eight elements of result
@ vld1.f32 {d4-d7}, [r0] @ load second eight elements of result
@ vzip.f32 d0, d2
@ vzip.f32 d1, d3
@ vswp.f32 d1, d4
@ vzip.f32 d1, d3
@way 2 (faster)
@
@
@ NOTE NOTE! This is broken! It does not use the last column of the input! Please see
@ implementation of pfxTransform3OrthoInverseNEON in vectormath_neon_assembly_implementations.S, in
@ include/vecmath/neon, for a correct implementation!
@
vld1.f32 d0[0], [r0]!
vld1.f32 d2[0], [r0]!
vld1.f32 d4[0], [r0]!
vld1.f32 d1[1], [r0]!
vld1.f32 d0[1], [r0]!
vld1.f32 d2[1], [r0]!
vld1.f32 d4[1], [r0]!
vld1.f32 d3[1], [r0]!
vld1.f32 d1[0], [r0]!
vld1.f32 d3[0], [r0]!
vld1.f32 d5[0], [r0]!
vld1.f32 d5[1], [r0]
vmul.f32 q3, q0, d1[0]
vmla.f32 q3, q1, d3[0]
vmla.f32 q3, q2, d5[0]
vneg.f32 q3, q3
vst1.f32 {d0-d3}, [r1]! @ store first eight elements of result
vst1.f32 {d4-d7}, [r1] @ store second eight elements of result
bx lr
.fnend

View File

@@ -0,0 +1,51 @@
@
@ Applied Research Associates Inc. (c)2011
@
@ Redistribution and use in source and binary forms,
@ with or without modification, are permitted provided that the
@ following conditions are met:
@ * Redistributions of source code must retain the above copyright
@ notice, this list of conditions and the following disclaimer.
@ * Redistributions in binary form must reproduce the above copyright
@ notice, this list of conditions and the following disclaimer in the
@ documentation and/or other materials provided with the distribution.
@ * Neither the name of the Applied Research Associates Inc nor the names
@ of its contributors may be used to endorse or promote products derived
@ from this software without specific prior written permission.
@
@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
@ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
@ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
@ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
@ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
@ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
@ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
@ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
@ POSSIBILITY OF SUCH DAMAGE.
@
.syntax unified
.arch armv7-a
.fpu neon
.thumb
.text
.align 2
@ transform3 operator *, result stored directly to memory
.global Transform3OperatorMultiplyNeon
.thumb_func
Transform3OperatorMultiplyNeon:
.fnstart
vld1.32 {d16-d19}, [r0]! @ load first eight elements of transform matrix
vld1.32 {d20-d21}, [r0] @ load second eight elements of transform matrix
vld1.32 {d0-d1}, [r1] @ load the four elements of vector3
vmul.f32 q12, q8, d0[0] @ rslt col0 = (mat0 col0) * (mat1 col0 elt0)
vmla.f32 q12, q9, d0[1] @ rslt col0 += (mat0 col1) * (mat1 col0 elt1)
vmla.f32 q12, q10, d1[0] @ rslt col0 += (mat0 col2) * (mat1 col0 elt2)
vst1.32 {d24-d25}, [r2] @ store first eight elements of result
bx lr
.fnend

View File

@@ -0,0 +1,81 @@
@
@ Applied Research Associates Inc. (c)2011
@
@ Redistribution and use in source and binary forms,
@ with or without modification, are permitted provided that the
@ following conditions are met:
@ * Redistributions of source code must retain the above copyright
@ notice, this list of conditions and the following disclaimer.
@ * Redistributions in binary form must reproduce the above copyright
@ notice, this list of conditions and the following disclaimer in the
@ documentation and/or other materials provided with the distribution.
@ * Neither the name of the Applied Research Associates Inc nor the names
@ of its contributors may be used to endorse or promote products derived
@ from this software without specific prior written permission.
@
@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
@ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
@ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
@ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
@ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
@ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
@ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
@ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
@ POSSIBILITY OF SUCH DAMAGE.
@
.syntax unified
.arch armv7-a
.fpu neon
.thumb
.text
.align 2
@ transpose a matrix3, result stored directly to memory
.global TransposeMatrix3Neon
.thumb_func
TransposeMatrix3Neon:
.fnstart
@way1
@ vld1.f32 d0[0], [r0]!
@ vld1.f32 d2[0], [r0]!
@ vld1.f32 d4[0], [r0]!
@ vld1.f32 d1[1], [r0]!
@ vld1.f32 d0[1], [r0]!
@ vld1.f32 d2[1], [r0]!
@ vld1.f32 d4[1], [r0]!
@ vld1.f32 d3[1], [r0]!
@ vld1.f32 d1[0], [r0]!
@ vld1.f32 d3[0], [r0]!
@ vld1.f32 d5[0], [r0]!
@ vld1.f32 d5[1], [r0]
@way2
@ vld2.f32 {d0-d1}, [r0]!
@ vld2.f32 {d2-d3}, [r0]!
@ vld2.f32 {d4-d5}, [r0]
@ vzip.f32 q0, q1
@ vswp.f32 d1, d4
@ vswp.f32 d3, d5
@ vzip.f32 d1, d5
@way3 (Fastest)
vld1.f32 {d0-d3}, [r0]! @ store first eight elements of result
vld1.f32 {d4-d5}, [r0] @ store second eight elements of result
vzip.f32 d0, d2
vzip.f32 d1, d3
vswp.f32 d1, d4
vzip.f32 d1, d3
vst1.f32 {d0-d3}, [r1]! @ store first eight elements of result
vst1.f32 {d4-d5}, [r1] @ store second eight elements of result
bx lr
.fnend

View File

@@ -0,0 +1,43 @@
/*
Applied Research Associates Inc. (c)2011
Redistribution and use in source and binary forms,
with or without modification, are permitted provided that the
following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the Applied Research Associates Inc nor the names
of its contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef TEST_NEON_H
#define TEST_NEON_H
//Neon functions to test
void TestNeonDotProduct();
void TestNeonCrossProduct();
void TestNeonMatrix3OperatorMultiply();
void TestNeonMatrix4OperatorMultiply();
void TestNeonOrthoInverseTransform3();
void TestNeonTransform3OperatorMultiply();
void TestNeonTransposeMatrix3();
void TestNeonSolveLinearConstraintRow();
#endif // TEST_NEON_H

View File

@@ -0,0 +1,229 @@
/*
Applied Research Associates Inc. (c)2011
Redistribution and use in source and binary forms,
with or without modification, are permitted provided that the
following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the Applied Research Associates Inc nor the names
of its contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
#include "test_neon.h"
#define SCE_PFX_USE_PERFCOUNTER
#include "physics_effects.h"
#include <arm_neon.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <android/log.h>
// This works with gcc
#define SET_ALIGNMENT(alignment) __attribute__((__aligned__((alignment))))
//#define PRINT_COMPUTED_VECTOR_RESULTS
// assembly implementations
extern "C"
{
void CrossProductNeonResultInMemoryAssembly(float *a, float *b, float *pfResult);
void CrossProductNeonResultInMemoryAssembly2(float *a, float *b, float *pfResult);
void CrossProductNeonResultInMemoryAssembly3(float *a, float *b, float *pfResult);
}
//----------------------------------------------------------------------------
// CrossProductNeonResultInMemory
//
/// Performs a Vector3 style cross product using NEON intrinsics, storing the
/// result directly into system memory.
///
/// @param a Input vector 1. Must point to 4 float values
/// @param b Input vector 2. Must point to 4 float values
/// @param pfResult [in] must point to an array of at least *4*
/// float values. [out] The result of the cross
/// product is contained in the first 3 lanes.
///
/// NOTE: The parameter types here are floats, not float32_t's. gcc
/// sometimes doesn't interpret float32_t's correctly. In particular,
/// if the type of pfResult is set to float32_t*, gcc will throw an
/// internal compiler error (ICE) for this code. In memory, float32_t
/// and float are equivalent, so can cast between them explicitly.
//----------------------------------------------------------------------------
void CrossProductNeonResultInMemory(float *a, float *b, float *pfResult)
{
float32x4_t v1 = {a[1],a[2],a[0], 0.0f};
float32x4_t v2 = {b[2],b[0],b[1], 0.0f};
float32x4_t v3 = {a[2],a[0],a[1], 0.0f};
float32x4_t v4 = {b[1],b[2],b[0], 0.0f};
v1 = vmulq_f32(v1, v2);
v1 = vmlsq_f32(v1, v3, v4);
vst1q_f32((float32_t*)pfResult, v1);
}
void CrossProductNeonResultInMemoryCPPAssembly(float *a, float *b, float *result) {
asm volatile(
"vld1.32 {d18[1]}, [r1]! \n\t"
"vld1.32 {d19[0]}, [r1]! \n\t"
"vld1.32 {d18[0]}, [r1]! \n\t"
"vld1.32 {d19[1]}, [r1] \n\t"
"vld1.32 {d17[0]}, [r0]! \n\t"
"vld1.32 {d16}, [r0]! \n\t"
"vld1.32 {d17[1]}, [r0] \n\t"
"vmul.f32 q10, q8, q9 \n\t"
"vtrn.32 d18,d19 \n\t"
"vrev64.32 d16,d16 \n\t"
"vrev64.32 d18,d18 \n\t"
"vtrn.32 d16,d17 \n\t"
"vmls.f32 q10, q8, q9 \n\t"
"vst1.32 {q10}, [r2] \n\t"
);
}
//----------------------------------------------------------------------------
// CrossProductScalarResultInMemory
//
/// Performs a Vector3 style cross product using scalar math, storing the
/// result directly into system memory.
///
/// @param a Input vector 1. Must point to 4 float values
/// @param b Input vector 2. Must point to 4 float values
/// @param pfResult [in] pointer to a float. [out] Contains the
/// result, dotproduct(a,b)
//----------------------------------------------------------------------------
void CrossProductScalarResultInMemory(float *a, float *b, float *pfResult)
{
pfResult[0] = a[1]*b[2] - a[2]*b[1];
pfResult[1] = a[2]*b[0] - a[0]*b[2];
pfResult[2] = a[0]*b[1] - a[1]*b[0];
}
//----------------------------------------------------------------------------
// TestFastNeonCrossProduct
//
/// Run timing study of the cross product functions above, writing the
/// results to the Android verbose log.
//----------------------------------------------------------------------------
void TestNeonCrossProduct()
{
float SET_ALIGNMENT(64) data[] = {float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f};
float *a = &data[0];
float *b = &data[4];
char szMsg[256];
sprintf(szMsg, "");
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg,"---------------------------------------");
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg, "Cross product test inputs A=<%f,%f,%f>, B=<%f,%f,%f>",
a[0], a[1], a[2], b[0], b[1], b[2]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
float SET_ALIGNMENT(64) fResult[4];
double dTimeSpan, dRefTimeSpan;;
unsigned int uiNumTries = 10000000;
unsigned int i;
sce::PhysicsEffects::PfxPerfCounter pc;
// profile scalar cross product with direct memory return
fResult[0] = 0.0f;
pc.countBegin("");
for(i = 0; i < uiNumTries; i++)
{
CrossProductScalarResultInMemory(a, b, fResult);
}
pc.countEnd();
dTimeSpan = pc.getCountTime(0);
pc.resetCount();
dRefTimeSpan = dTimeSpan;
sprintf(szMsg, "Time to do %i calls for CrossProductScalarResultInMemory: %f secs, speedup: %5.2f, result value=<%f,%f,%f>",
uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
// profile NEON assembly volatile cross product with direct memory return
fResult[0] = 0.0f;
pc.countBegin("");
for(i = 0; i < uiNumTries; i++)
{
CrossProductNeonResultInMemoryCPPAssembly(a, b, fResult);
}
pc.countEnd();
dTimeSpan = pc.getCountTime(0);
pc.resetCount();
sprintf(szMsg, "Time to do %i calls for CrossProductNeonResultInMemoryFast: %f secs, speedup: %5.2f, result value=<%f,%f,%f>",
uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
// profile NEON cross product with direct memory return, assembly version
fResult[0] = 0.0f;
pc.countBegin("");
for(i = 0; i < uiNumTries; i++)
{
CrossProductNeonResultInMemoryAssembly(a, b, fResult);
}
pc.countEnd();
dTimeSpan = pc.getCountTime(0);
pc.resetCount();
sprintf(szMsg, "Time to do %i calls for CrossProductNeonResultInMemoryAssembly: %f secs, speedup: %5.2f, result value=<%f,%f,%f>",
uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
// profile NEON cross product with direct memory return, assembly version 2
fResult[0] = 0.0f;
pc.countBegin("");
for(i = 0; i < uiNumTries; i++)
{
CrossProductNeonResultInMemoryAssembly2(a, b, fResult);
}
pc.countEnd();
dTimeSpan = pc.getCountTime(0);
pc.resetCount();
sprintf(szMsg, "Time to do %i calls for CrossProductNeonResultInMemoryAssembly2: %f secs, speedup: %5.2f, result value=<%f,%f,%f>",
uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
// profile NEON cross product with direct memory return, assembly version 3
fResult[0] = 0.0f;
pc.countBegin("");
for(i = 0; i < uiNumTries; i++)
{
CrossProductNeonResultInMemoryAssembly3(a, b, fResult);
}
pc.countEnd();
dTimeSpan = pc.getCountTime(0);
pc.resetCount();
sprintf(szMsg, "Time to do %i calls for CrossProductNeonResultInMemoryAssembly3: %f secs, speedup: %5.2f, result value=<%f,%f,%f>",
uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
}

View File

@@ -0,0 +1,330 @@
/*
Applied Research Associates Inc. (c)2011
Redistribution and use in source and binary forms,
with or without modification, are permitted provided that the
following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the Applied Research Associates Inc nor the names
of its contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
#include "test_neon.h"
#define SCE_PFX_USE_PERFCOUNTER
#include "physics_effects.h"
#include <arm_neon.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <android/log.h>
// This works with gcc
#define SET_ALIGNMENT(alignment) __attribute__((__aligned__((alignment))))
//#define PRINT_COMPUTED_VECTOR_RESULTS
// assembly implementations
extern "C"
{
float32_t DotProductNeonResultInRegisterAssembly(float *a, float *b);
void DotProductNeonResultInMemoryAssembly(float *a, float *b, float *pfResult);
void DotProductNeonResultInMemoryAssembly2(float *data, float *pfResult);
void DotProductNeonResultInMemoryAssembly2b(float *data, float *pfResult);
}
//----------------------------------------------------------------------------
// DotProductNeonResultInRegister
//
/// Performs a Vector4 style dot product using NEON intrinsics, returning
/// the result in an ARM register.
///
/// @param a Input vector 1. Must point to 4 float values
/// @param b Input vector 2. Must point to 4 float values
///
/// @return dotproduct(a,b)
//----------------------------------------------------------------------------
float32_t DotProductNeonResultInRegister(float *a, float *b)
{
float32_t result[2];
float32x2_t v1a = vld1_f32((float32_t*)a);
float32x2_t v1b = vld1_f32((float32_t*)(a + 2));
float32x2_t v2a = vld1_f32((float32_t*)b);
float32x2_t v2b = vld1_f32((float32_t*)(b + 2));
v1a = vmul_f32(v1a, v2a);
v1a = vmla_f32(v1a, v1b, v2b);
v1a = vpadd_f32(v1a, v1a);
vst1_f32(result, v1a);
return(result[0]);
}
// pfResult must point to memory large enough to store *two*
// float32_t values
//----------------------------------------------------------------------------
// DotProductNeonResultInMemory
//
/// Performs a Vector4 style dot product using NEON intrinsics, storing the
/// result directly into system memory.
///
/// @param a Input vector 1. Must point to 4 float values
/// @param b Input vector 2. Must point to 4 float values
/// @param pfResult [in] must point to an array of at least *2*
/// float values. [out] The result of the dot
/// product is contained in both slots of the array.
/// Recommended to use the first one though they
/// have the same value
///
/// NOTE: The parameter types here are floats, not float32_t's. gcc
/// sometimes doesn't interpret float32_t's correctly. In particular,
/// if the type of pfResult is set to float32_t*, gcc will throw an
/// internal compiler error (ICE) for this code. In memory, float32_t
/// and float are equivalent, so can cast between them explicitly.
//----------------------------------------------------------------------------
void DotProductNeonResultInMemory(float *a, float *b, float *pfResult)
{
float32x2_t v1a = vld1_f32((float32_t*)a);
float32x2_t v1b = vld1_f32((float32_t*)(a + 2));
float32x2_t v2a = vld1_f32((float32_t*)b);
float32x2_t v2b = vld1_f32((float32_t*)(b + 2));
v1a = vmul_f32(v1a, v2a);
v1a = vmla_f32(v1a, v1b, v2b);
v1a = vpadd_f32(v1a, v1a);
vst1_f32((float32_t*)pfResult, v1a);
}
//----------------------------------------------------------------------------
// DotProductScalarResultInRegister
//
/// Performs a Vector4 style dot product using scalar math, and returning
/// the result in an ARM register.
///
/// @param a Input vector 1. Must point to 4 float values
/// @param b Input vector 2. Must point to 4 float values
///
/// @return dotproduct(a,b)
//----------------------------------------------------------------------------
float DotProductScalarResultInRegister(float *a, float *b)
{
return(a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3]);
}
//----------------------------------------------------------------------------
// DotProductScalarResultInMemory
//
/// Performs a Vector4 style dot product using scalar math, storing the
/// result directly into system memory.
///
/// @param a Input vector 1. Must point to 4 float values
/// @param b Input vector 2. Must point to 4 float values
/// @param pfResult [in] pointer to a float. [out] Contains the
/// result, dotproduct(a,b)
//----------------------------------------------------------------------------
void DotProductScalarResultInMemory(float *a, float *b, float *pfResult)
{
*pfResult = a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];
}
//----------------------------------------------------------------------------
// TestFastNeonDotProduct
//
/// Run timing study of the four dot product functions above, writing the
/// results to the Android verbose log.
//----------------------------------------------------------------------------
void TestNeonDotProduct()
{
float SET_ALIGNMENT(64) data[] = {float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX)};
float *a = &data[0];
float *b = &data[4];
char szMsg[256];
sprintf(szMsg, "");
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg,"---------------------------------------");
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
#ifdef PRINT_COMPUTED_VECTOR_RESULTS
sprintf(szMsg, "Dot product test inputs A=<%f,%f,%f>, B=<%f,%f,%f>",
a[0], a[1], a[2], b[0], b[1], b[2]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
#endif
float SET_ALIGNMENT(64) fResult[2];
sce::PhysicsEffects::PfxPerfCounter pc;
double dTimeSpan, dRefTimeSpan;
unsigned int uiNumTries = 10000000;
unsigned int i;
// profile scalar dot product with register return
fResult[0] = 0.0f;
pc.countBegin("");
for(i = 0; i < uiNumTries; i++)
{
fResult[0] = DotProductScalarResultInRegister(a, b); // C++
}
pc.countEnd();
dTimeSpan = pc.getCountTime(0);
pc.resetCount();
dRefTimeSpan = dTimeSpan;
#ifdef PRINT_COMPUTED_VECTOR_RESULTS
sprintf(szMsg, "Time to do %i calls to scalar dot product return in register: %f, result value=%f",
uiNumTries, dTimeSpan, fResult[0]);
#else
sprintf(szMsg, "Sclr C++ Dot Product Register Return: %10.7f secs, REF TIME", dTimeSpan);
#endif
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
// profile NEON dot product with register return
fResult[0] = 0.0f;
pc.countBegin("");
for(i = 0; i < uiNumTries; i++)
{
fResult[0] = DotProductNeonResultInRegister(a, b); // C++
}
pc.countEnd();
dTimeSpan = pc.getCountTime(0);
pc.resetCount();
#ifdef PRINT_COMPUTED_VECTOR_RESULTS
sprintf(szMsg, "Time to do %i calls to NEON dot product return in register: %f, result value=%f",
uiNumTries, dTimeSpan, fResult[0]);
#else
sprintf(szMsg, "Neon C++ Dot Product Register Return: %10.7f secs, speedup: %5.2f", dTimeSpan, dRefTimeSpan/dTimeSpan);
#endif
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
// profile NEON dot product with register return, assembly version
fResult[0] = 0.0f;
pc.countBegin("");
for(i = 0; i < uiNumTries; i++)
{
fResult[0] = DotProductNeonResultInRegisterAssembly(a, b);
}
pc.countEnd();
dTimeSpan = pc.getCountTime(0);
pc.resetCount();
#ifdef PRINT_COMPUTED_VECTOR_RESULTS
sprintf(szMsg, "Time to do %i calls to NEON dot product return in register (assembly version): %f, result value=%f",
uiNumTries, dTimeSpan, fResult[0]);
#else
sprintf(szMsg, "Neon Asm Dot Product Register Return: %10.7f secs, speedup: %5.2f", dTimeSpan, dRefTimeSpan/dTimeSpan);
#endif
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
// profile scalar dot product with direct memory return
fResult[0] = 0.0f;
pc.countBegin("");
for(i = 0; i < uiNumTries; i++)
{
DotProductScalarResultInMemory(a, b, fResult);
}
pc.countEnd();
dTimeSpan = pc.getCountTime(0);
pc.resetCount();
dRefTimeSpan = dTimeSpan;
#ifdef PRINT_COMPUTED_VECTOR_RESULTS
sprintf(szMsg, "Time to do %i calls to scalar dot product return in memory: %f, result value=%f",
uiNumTries, dTimeSpan, fResult[0]);
#else
sprintf(szMsg, " Sclr C++ Dot Product Memory Return: %10.7f secs, REF TIME", dTimeSpan);
#endif
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
// profile NEON dot product with direct memory return
fResult[0] = 0.0f;
pc.countBegin("");
for(i = 0; i < uiNumTries; i++)
{
DotProductNeonResultInMemory(a, b, fResult);
}
pc.countEnd();
dTimeSpan = pc.getCountTime(0);
pc.resetCount();
#ifdef PRINT_COMPUTED_VECTOR_RESULTS
sprintf(szMsg, "Time to do %i calls to NEON dot product return in memory: %f, result value=%f",
uiNumTries, dTimeSpan, fResult[0]);
#else
sprintf(szMsg, " Neon C++ Dot Product Memory Return: %10.7f secs, speedup: %5.2f", dTimeSpan, dRefTimeSpan/dTimeSpan);
#endif
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
// profile NEON dot product with direct memory return, assembly version
fResult[0] = 0.0f;
pc.countBegin("");
for(i = 0; i < uiNumTries; i++)
{
DotProductNeonResultInMemoryAssembly(a, b, fResult);
}
pc.countEnd();
dTimeSpan = pc.getCountTime(0);
pc.resetCount();
#ifdef PRINT_COMPUTED_VECTOR_RESULTS
sprintf(szMsg, "Time to do %i calls to NEON dot product return in memory (assembly version): %f, result value=%f",
uiNumTries, dTimeSpan, fResult[0]);
#else
sprintf(szMsg, " Neon Asm Dot Product Memory Return: %10.7f secs, speedup: %5.2f", dTimeSpan, dRefTimeSpan/dTimeSpan);
#endif
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
// profile NEON dot product with direct memory return, assembly version with both
// inputs in contiguous memory block (array of structures)
fResult[0] = 0.0f;
pc.countBegin("");
for(i = 0; i < uiNumTries; i++)
{
DotProductNeonResultInMemoryAssembly2(data, fResult);
}
pc.countEnd();
dTimeSpan = pc.getCountTime(0);
pc.resetCount();
#ifdef PRINT_COMPUTED_VECTOR_RESULTS
sprintf(szMsg, "Time to do %i calls to NEON dot product return in memory (assembly version - array of structures): %f, result value=%f",
uiNumTries, dTimeSpan, fResult[0]);
#else
sprintf(szMsg, " Neon Asm2 Dot Product Memory Return: %10.7f secs, speedup: %5.2f", dTimeSpan, dRefTimeSpan/dTimeSpan);
#endif
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
// profile NEON dot product with direct memory return, assembly version with both
// inputs in contiguous memory block (array of structures), alternate methodology
fResult[0] = 0.0f;
pc.countBegin("");
for(i = 0; i < uiNumTries; i++)
{
DotProductNeonResultInMemoryAssembly2b(data, fResult);
}
pc.countEnd();
dTimeSpan = pc.getCountTime(0);
pc.resetCount();
#ifdef PRINT_COMPUTED_VECTOR_RESULTS
sprintf(szMsg, "Time to do %i calls to NEON dot product return in memory (assembly version - alternate methodology): %f, result value=%f",
uiNumTries, dTimeSpan, fResult[0]);
#else
sprintf(szMsg, " Neon Asm3 Dot Product Memory Return: %10.7f secs, speedup: %5.2f", dTimeSpan, dRefTimeSpan/dTimeSpan);
#endif
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
}

View File

@@ -0,0 +1,173 @@
/*
Applied Research Associates Inc. (c)2011
Redistribution and use in source and binary forms,
with or without modification, are permitted provided that the
following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the Applied Research Associates Inc nor the names
of its contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
#include "test_neon.h"
#define SCE_PFX_USE_PERFCOUNTER
#include "physics_effects.h"
#include <arm_neon.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <android/log.h>
// This works with gcc
#define SET_ALIGNMENT(alignment) __attribute__((__aligned__((alignment))))
// assembly implementations
extern "C"
{
void Matrix3OperatorMultiplyNeon(float *mCol, float *mat, float *pfResult);
}
//----------------------------------------------------------------------------
// Matrix3OperatorMultiplyScalar
//
/// Performs a Matrix3 Vector3 multiply using scalar math, storing the
/// result directly into system memory.
///
/// @param mCol Matrix3 Columns. Must point to 3x 4 float values
/// @param mat Matrix3 Columns. Must point to 3x 4 float values
/// @param pfResult [in] pointer to a float(matrix 3). [out] Contains the result
//----------------------------------------------------------------------------
//inline const Vector3 Matrix3::operator *( const Vector3 & vec ) const
//{
// return Vector3(
// ( ( ( mCol0.getX() * vec.getX() ) + ( mCol1.getX() * vec.getY() ) ) + ( mCol2.getX() * vec.getZ() ) ),
// ( ( ( mCol0.getY() * vec.getX() ) + ( mCol1.getY() * vec.getY() ) ) + ( mCol2.getY() * vec.getZ() ) ),
// ( ( ( mCol0.getZ() * vec.getX() ) + ( mCol1.getZ() * vec.getY() ) ) + ( mCol2.getZ() * vec.getZ() ) )
// );
//}
//
//inline const Matrix3 Matrix3::operator *( const Matrix3 & mat ) const
//{
// return Matrix3(
// ( *this * mat.mCol0 ),
// ( *this * mat.mCol1 ),
// ( *this * mat.mCol2 )
// );
//}
void Matrix3OperatorMultiplyScalar(float *mCol, float *mat, float *pfResult)
{
pfResult[0] = ( ( ( mCol[0] * mat[0] ) + ( mCol[4] * mat[1] ) ) + ( mCol[8] * mat[2] ) );
pfResult[1] = ( ( ( mCol[1] * mat[0] ) + ( mCol[5] * mat[1] ) ) + ( mCol[9] * mat[2] ) );
pfResult[2] = ( ( ( mCol[2] * mat[0] ) + ( mCol[6] * mat[1] ) ) + ( mCol[10] * mat[2] ) );
pfResult[3] = 0.0f;
pfResult[4] = ( ( ( mCol[0] * mat[4] ) + ( mCol[4] * mat[5] ) ) + ( mCol[8] * mat[6] ) );
pfResult[5] = ( ( ( mCol[1] * mat[4] ) + ( mCol[5] * mat[5] ) ) + ( mCol[9] * mat[6] ) );
pfResult[6] = ( ( ( mCol[2] * mat[4] ) + ( mCol[6] * mat[5] ) ) + ( mCol[10] * mat[6] ) );
pfResult[7] = 0.0f;
pfResult[8] = ( ( ( mCol[0] * mat[8] ) + ( mCol[4] * mat[9] ) ) + ( mCol[8] * mat[10] ) );
pfResult[9] = ( ( ( mCol[1] * mat[8] ) + ( mCol[5] * mat[9] ) ) + ( mCol[9] * mat[10] ) );
pfResult[10] = ( ( ( mCol[2] * mat[8] ) + ( mCol[6] * mat[9] ) ) + ( mCol[10] * mat[10] ) );
pfResult[11] = 0.0f;
}
//----------------------------------------------------------------------------
// TestNeonMatrix3OperatorMultiply
//
/// Run timing study of the NeonMatrix3OperatorMultiply product functions above, writing the
/// results to the Android verbose log.
//----------------------------------------------------------------------------
void TestNeonMatrix3OperatorMultiply()
{
float SET_ALIGNMENT(64) data1[] = {float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f};
float SET_ALIGNMENT(64) data2[] = {float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f};
float *mCol = &data1[0];
float *mat = &data2[0];
float SET_ALIGNMENT(64) fResult[12];
sce::PhysicsEffects::PfxPerfCounter pc;
double dTimeSpan, dRefTimeSpan;;
unsigned int uiNumTries = 10000000;
unsigned int i;
char szMsg[256];
sprintf(szMsg, "");
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg,"---------------------------------------");
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg,"TestNeonMatrix3OperatorMultiply Start");
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg, "Test input mCol: <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
mCol[0], mCol[1], mCol[2], mCol[3],
mCol[4], mCol[5], mCol[6], mCol[7],
mCol[8], mCol[9], mCol[10], mCol[11]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg, "Test input mat: <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
mat[0], mat[1], mat[2], mat[3],
mat[4], mat[5], mat[6], mat[7],
mat[8], mat[9], mat[10], mat[11]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
// profile scalar NeonMatrix3OperatorMultiplyScalar with direct memory return, c++ version
fResult[0] = 0.0f;
pc.countBegin("");
for(i = 0; i < uiNumTries; i++)
{
Matrix3OperatorMultiplyScalar(mCol, mat, fResult);
}
pc.countEnd();
dTimeSpan = pc.getCountTime(0);
pc.resetCount();
dRefTimeSpan = dTimeSpan;
sprintf(szMsg, "Time to do %i calls for Matrix3OperatorMultiplyScalar: %f secs, speedup: %5.2f, result value=<%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2], fResult[3],
fResult[4], fResult[5], fResult[6], fResult[7],
fResult[8], fResult[9], fResult[10], fResult[11]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
// profile NEON Matrix3OperatorMultiplyNeon with direct memory return, assembly version
fResult[0] = 0.0f;
pc.countBegin("");
for(i = 0; i < uiNumTries; i++)
{
Matrix3OperatorMultiplyNeon(mCol, mat, fResult);
}
pc.countEnd();
dTimeSpan = pc.getCountTime(0);
pc.resetCount();
sprintf(szMsg, "Time to do %i calls for Matrix3OperatorMultiplyNeon: %f secs, speedup: %5.2f, result value=<%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2], fResult[3],
fResult[4], fResult[5], fResult[6], fResult[7],
fResult[8], fResult[9], fResult[10], fResult[11]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg,"TestNeonMatrix3OperatorMultiply End");
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
}

View File

@@ -0,0 +1,188 @@
/*
Applied Research Associates Inc. (c)2011
Redistribution and use in source and binary forms,
with or without modification, are permitted provided that the
following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the Applied Research Associates Inc nor the names
of its contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
#include "test_neon.h"
#define SCE_PFX_USE_PERFCOUNTER
#include "physics_effects.h"
#include <arm_neon.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <android/log.h>
// This works with gcc
#define SET_ALIGNMENT(alignment) __attribute__((__aligned__((alignment))))
// assembly implementations
extern "C"
{
void Matrix4OperatorMultiplyNeon(float *mCol, float *mat, float *pfResult);
}
//----------------------------------------------------------------------------
// Matrix4OperatorMultiplyScalar
//
/// Performs a Matrix4 vector3 multiply using scalar math, storing the
/// result directly into system memory.
///
/// @param mCol Input matrix 1. Must point to 4x 4 float values
/// @param mat Input matrix 2. Must point to 4x 4 float values
/// @param pfResult [in] pointer to a float(matrix). [out] Contains the result
//----------------------------------------------------------------------------
//inline const Vector4 Matrix4::operator *( const Vector4 & vec ) const
//{
// return Vector4(
// ( ( ( ( mCol0.getX() * vec.getX() ) + ( mCol1.getX() * vec.getY() ) ) + ( mCol2.getX() * vec.getZ() ) ) + ( mCol3.getX() * vec.getW() ) ),
// ( ( ( ( mCol0.getY() * vec.getX() ) + ( mCol1.getY() * vec.getY() ) ) + ( mCol2.getY() * vec.getZ() ) ) + ( mCol3.getY() * vec.getW() ) ),
// ( ( ( ( mCol0.getZ() * vec.getX() ) + ( mCol1.getZ() * vec.getY() ) ) + ( mCol2.getZ() * vec.getZ() ) ) + ( mCol3.getZ() * vec.getW() ) ),
// ( ( ( ( mCol0.getW() * vec.getX() ) + ( mCol1.getW() * vec.getY() ) ) + ( mCol2.getW() * vec.getZ() ) ) + ( mCol3.getW() * vec.getW() ) )
// );
//}
//inline const Matrix4 Matrix4::operator *( const Matrix4 & mat ) const
//{
// return Matrix4(
// ( *this * mat.mCol0 ),
// ( *this * mat.mCol1 ),
// ( *this * mat.mCol2 ),
// ( *this * mat.mCol3 )
// );
//}
void Matrix4OperatorMultiplyScalar(float *mCol, float *mat, float *pfResult)
{
pfResult[0] = ( ( ( mCol[0] * mat[0] ) + ( mCol[4] * mat[1] ) ) + ( mCol[8] * mat[2] ) + ( mCol[12] * mat[3] ) );
pfResult[1] = ( ( ( mCol[1] * mat[0] ) + ( mCol[5] * mat[1] ) ) + ( mCol[9] * mat[2] ) + ( mCol[13] * mat[3] ) );
pfResult[2] = ( ( ( mCol[2] * mat[0] ) + ( mCol[6] * mat[1] ) ) + ( mCol[10] * mat[2] ) + ( mCol[14] * mat[3] ) );
pfResult[3] = ( ( ( mCol[3] * mat[0] ) + ( mCol[7] * mat[1] ) ) + ( mCol[11] * mat[2] ) + ( mCol[15] * mat[3] ) );
pfResult[4] = ( ( ( mCol[0] * mat[4] ) + ( mCol[4] * mat[5] ) ) + ( mCol[8] * mat[6] ) + ( mCol[12] * mat[7] ) );
pfResult[5] = ( ( ( mCol[1] * mat[4] ) + ( mCol[5] * mat[5] ) ) + ( mCol[9] * mat[6] ) + ( mCol[13] * mat[7] ) );
pfResult[6] = ( ( ( mCol[2] * mat[4] ) + ( mCol[6] * mat[5] ) ) + ( mCol[10] * mat[6] ) + ( mCol[14] * mat[7] ) );
pfResult[7] = ( ( ( mCol[3] * mat[4] ) + ( mCol[7] * mat[5] ) ) + ( mCol[11] * mat[6] ) + ( mCol[15] * mat[7] ) );
pfResult[8] = ( ( ( mCol[0] * mat[8] ) + ( mCol[4] * mat[9] ) ) + ( mCol[8] * mat[10] ) + ( mCol[12] * mat[11] ) );
pfResult[9] = ( ( ( mCol[1] * mat[8] ) + ( mCol[5] * mat[9] ) ) + ( mCol[9] * mat[10] ) + ( mCol[13] * mat[11] ) );
pfResult[10] = ( ( ( mCol[2] * mat[8] ) + ( mCol[6] * mat[9] ) ) + ( mCol[10] * mat[10] ) + ( mCol[14] * mat[11] ) );
pfResult[11] = ( ( ( mCol[3] * mat[8] ) + ( mCol[7] * mat[9] ) ) + ( mCol[11] * mat[10] ) + ( mCol[15] * mat[11] ) );
pfResult[12] = ( ( ( mCol[0] * mat[12] ) + ( mCol[4] * mat[13] ) ) + ( mCol[8] * mat[14] ) + ( mCol[12] * mat[15] ) );
pfResult[13] = ( ( ( mCol[1] * mat[12] ) + ( mCol[5] * mat[13] ) ) + ( mCol[9] * mat[14] ) + ( mCol[13] * mat[15] ) );
pfResult[14] = ( ( ( mCol[2] * mat[12] ) + ( mCol[6] * mat[13] ) ) + ( mCol[10] * mat[14] ) + ( mCol[14] * mat[15] ) );
pfResult[15] = ( ( ( mCol[3] * mat[12] ) + ( mCol[7] * mat[13] ) ) + ( mCol[11] * mat[14] ) + ( mCol[15] * mat[15] ) );
}
//----------------------------------------------------------------------------
// TestNeonMatrix4OperatorMultiply
//
/// Run timing study of the matrix4 vector3 multiply operator from above, writing the
/// results to the Android verbose log.
//----------------------------------------------------------------------------
void TestNeonMatrix4OperatorMultiply()
{
float SET_ALIGNMENT(64) data1[] = {float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX)};
float SET_ALIGNMENT(64) data2[] = {float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX)};
float *mCol = &data1[0];
float *mat = &data2[0];
char szMsg[256];
sprintf(szMsg, "");
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg,"---------------------------------------");
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg,"TestNeonMatrix4OperatorMultiply Start");
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg, "Test input mCol: <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
mCol[0], mCol[1], mCol[2], mCol[3],
mCol[4], mCol[5], mCol[6], mCol[7],
mCol[8], mCol[9], mCol[10], mCol[11],
mCol[12], mCol[13], mCol[14], mCol[15]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg, "Test input mat: <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
mat[0], mat[1], mat[2], mat[3],
mat[4], mat[5], mat[6], mat[7],
mat[8], mat[9], mat[10], mat[11],
mCol[12], mCol[13], mCol[14], mCol[15]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
float SET_ALIGNMENT(64) fResult[16];
sce::PhysicsEffects::PfxPerfCounter pc;
double dTimeSpan, dRefTimeSpan;;
unsigned int uiNumTries = 10000000;
unsigned int i;
// profile scalar Matrix4OperatorMultiplyScalar with direct memory return, c++ version
fResult[0] = 0.0f;
pc.countBegin("");
for(i = 0; i < uiNumTries; i++)
{
Matrix4OperatorMultiplyScalar(mCol, mat, fResult);
}
pc.countEnd();
dTimeSpan = pc.getCountTime(0);
pc.resetCount();
dRefTimeSpan = dTimeSpan;
sprintf(szMsg, "Time to do %i calls for Matrix4OperatorMultiplyScalar: %f secs, speedup: %5.2f",
uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg, "result value: <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
fResult[0], fResult[1], fResult[2], fResult[3],
fResult[4], fResult[5], fResult[6], fResult[7],
fResult[8], fResult[9], fResult[10], fResult[11],
fResult[12], fResult[13], fResult[14], fResult[15]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
// profile NEON Matrix4OperatorMultiplyNeon with direct memory return, assembly version
fResult[0] = 0.0f;
pc.countBegin("");
for(i = 0; i < uiNumTries; i++)
{
Matrix4OperatorMultiplyNeon(mCol, mat, fResult);
}
pc.countEnd();
dTimeSpan = pc.getCountTime(0);
pc.resetCount();
sprintf(szMsg, "Time to do %i calls for Matrix4OperatorMultiplyNeon: %f secs, speedup: %5.2f, result value=<%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2], fResult[3],
fResult[4], fResult[5], fResult[6], fResult[7],
fResult[8], fResult[9], fResult[10], fResult[11],
fResult[12], fResult[13], fResult[14], fResult[15]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg,"TestNeonMatrix4OperatorMultiply End");
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
}

View File

@@ -0,0 +1,173 @@
/*
Applied Research Associates Inc. (c)2011
Redistribution and use in source and binary forms,
with or without modification, are permitted provided that the
following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the Applied Research Associates Inc nor the names
of its contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
#include "test_neon.h"
#define SCE_PFX_USE_PERFCOUNTER
#include "physics_effects.h"
#include <arm_neon.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <android/log.h>
// This works with gcc
#define SET_ALIGNMENT(alignment) __attribute__((__aligned__((alignment))))
// assembly implementations
extern "C"
{
void OrthoInverseTransform3Neon(float *trfm, float *pfResult);
}
//----------------------------------------------------------------------------
// OrthoInverseTransform3Scalar
//
/// Performs a ortho inverse of a transform3 using scalar math, storing the
/// result directly into system memory.
///
/// @param trfm Input transform3. Must point to 4x 4 float values
/// @param pfResult [in] pointer to a float(tranform3). [out] Contains the result
//----------------------------------------------------------------------------
//inline const Transform3 orthoInverse( const Transform3 & tfrm )
//{
// Vector3 inv0, inv1, inv2;
// inv0 = Vector3( tfrm.getCol0().getX(), tfrm.getCol1().getX(), tfrm.getCol2().getX() );
// inv1 = Vector3( tfrm.getCol0().getY(), tfrm.getCol1().getY(), tfrm.getCol2().getY() );
// inv2 = Vector3( tfrm.getCol0().getZ(), tfrm.getCol1().getZ(), tfrm.getCol2().getZ() );
// return Transform3(
// inv0,
// inv1,
// inv2,
// Vector3( ( -( ( inv0 * tfrm.getCol3().getX() ) + ( ( inv1 * tfrm.getCol3().getY() ) + ( inv2 * tfrm.getCol3().getZ() ) ) ) ) )
// );
//}
void OrthoInverseTransform3Scalar(float *trfm, float *pfResult)
{
pfResult[0] = trfm[0];
pfResult[1] = trfm[4];
pfResult[2] = trfm[8];
pfResult[3] = 0.0f;
pfResult[4] = trfm[1];
pfResult[5] = trfm[5];
pfResult[6] = trfm[9];
pfResult[7] = 0.0f;
pfResult[8] = trfm[2];
pfResult[9] = trfm[6];
pfResult[10] = trfm[10];
pfResult[11] = 0.0f;
pfResult[12] = ( -( ( trfm[0] * trfm[8]) + ( ( trfm[1] * trfm[9]) + ( trfm[2] * trfm[10]) ) ) );
pfResult[13] = ( -( ( trfm[4] * trfm[8]) + ( ( trfm[5] * trfm[9]) + ( trfm[6] * trfm[10]) ) ) );
pfResult[14] = ( -( ( trfm[8] * trfm[8]) + ( ( trfm[9] * trfm[9]) + ( trfm[10] * trfm[10]) ) ) );
pfResult[15] = 0.0f;
}
//----------------------------------------------------------------------------
// TestNeonOrthoInverseTransform3
//
/// Run timing study of the orthoinverse on transform3 functions above, writing the
/// results to the Android verbose log.
//----------------------------------------------------------------------------
void TestNeonOrthoInverseTransform3()
{
float SET_ALIGNMENT(64) data1[] = {float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f};
float *trfm = &data1[0];
char szMsg[256];
sprintf(szMsg, "");
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg,"---------------------------------------");
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg,"TestNeonOrthoInverseTransform3 Start");
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg, "Test input trfm: <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>>",
trfm[0], trfm[1], trfm[2], trfm[3],
trfm[4], trfm[5], trfm[6], trfm[7],
trfm[8], trfm[9], trfm[10], trfm[11],
trfm[12], trfm[13], trfm[14], trfm[15]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
float SET_ALIGNMENT(64) fResult[16];
sce::PhysicsEffects::PfxPerfCounter pc;
double dTimeSpan, dRefTimeSpan;;
unsigned int uiNumTries = 10000000;
unsigned int i;
// profile scalar OrthoInverseTransform3Scalar with direct memory return, c++ version
fResult[0] = 0.0f;
pc.countBegin("");
for(i = 0; i < uiNumTries; i++)
{
OrthoInverseTransform3Scalar(trfm, fResult);
}
pc.countEnd();
dTimeSpan = pc.getCountTime(0);
pc.resetCount();
dRefTimeSpan = dTimeSpan;
sprintf(szMsg, "Time to do %i calls for OrthoInverseTransform3Scalar: %f secs, speedup: %5.2f",
uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg, "result value=<%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
fResult[0], fResult[1], fResult[2], fResult[3],
fResult[4], fResult[5], fResult[6], fResult[7],
fResult[8], fResult[9], fResult[10], fResult[11],
fResult[12], fResult[13], fResult[14], fResult[15]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
// profile NEON OrthoInverseTransform3Neon with direct memory return, assembly version
fResult[0] = 0.0f;
pc.countBegin("");
for(i = 0; i < uiNumTries; i++)
{
OrthoInverseTransform3Neon(trfm, fResult);
}
pc.countEnd();
dTimeSpan = pc.getCountTime(0);
pc.resetCount();
sprintf(szMsg, "Time to do %i calls for OrthoInverseTransform3Neon: %f secs, speedup: %5.2f",
uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg, "result value=<%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
fResult[0], fResult[1], fResult[2], fResult[3],
fResult[4], fResult[5], fResult[6], fResult[7],
fResult[8], fResult[9], fResult[10], fResult[11],
fResult[12], fResult[13], fResult[14], fResult[15]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg,"TestNeonOrthoInverseTransform3 End");
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
}

View File

@@ -0,0 +1,477 @@
/*
Applied Research Associates Inc. (c)2011
Redistribution and use in source and binary forms,
with or without modification, are permitted provided that the
following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the Applied Research Associates Inc nor the names
of its contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
#include "test_neon.h"
#define SCE_PFX_USE_PERFCOUNTER
#include "physics_effects.h"
#include <arm_neon.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <android/log.h>
// This works with gcc
#define SET_ALIGNMENT(alignment) __attribute__((__aligned__((alignment))))
//----------------------------------------------------------------------------
// pfxSolveLinearConstraintRowScalar
//
/// Performs
///
/// @param Input
//----------------------------------------------------------------------------
void pfxSolveLinearConstraintRowScalar(float *constraint, // 8 floats
float *deltaLinearVelocityA, // 4 floats
float *deltaAngularVelocityA, // 4 floats
float massInvA, // 1 floats
float *inertiaInvA, // 12 floats
float *rA, // 4 floats
float *deltaLinearVelocityB, // 4 floats
float *deltaAngularVelocityB, // 4 floats
float massInvB, // 1 floats
float *inertiaInvB, // 12 floats
float *rB) // 4 floats
{
//const PfxVector3 normal(pfxReadVector3(constraint.m_normal)); //fourth element is not zero and it need to be in assembly
float SET_ALIGNMENT(64) normalRead[] = {constraint[0],constraint[1],constraint[2],0.0f};
float *normal = &normalRead[0];
//PfxFloat deltaImpulse = constraint.m_rhs;
float deltaImpulse = constraint[4];
//PfxVector3 dVA = deltaLinearVelocityA + cross(deltaAngularVelocityA,rA);
float SET_ALIGNMENT(64) dVARead[] = {deltaLinearVelocityA[0] + (deltaAngularVelocityA[1]*rA[2] - deltaAngularVelocityA[2]*rA[1]),
deltaLinearVelocityA[1] + (deltaAngularVelocityA[2]*rA[0] - deltaAngularVelocityA[0]*rA[2]),
deltaLinearVelocityA[2] + (deltaAngularVelocityA[0]*rA[1] - deltaAngularVelocityA[1]*rA[0]),
0.0f};
float *dVA = &dVARead[0];
//PfxVector3 dVB = deltaLinearVelocityB + cross(deltaAngularVelocityB,rB);
float SET_ALIGNMENT(64) dVBRead[] = {deltaLinearVelocityB[0] + (deltaAngularVelocityB[1]*rB[2] - deltaAngularVelocityB[2]*rB[1]),
deltaLinearVelocityB[1] + (deltaAngularVelocityB[2]*rB[0] - deltaAngularVelocityB[0]*rB[2]),
deltaLinearVelocityB[2] + (deltaAngularVelocityB[0]*rB[1] - deltaAngularVelocityB[1]*rB[0]),
0.0f};
float *dVB = &dVBRead[0];
//deltaImpulse -= constraint.m_jacDiagInv * dot(normal,dVA-dVB);
deltaImpulse -= constraint[4] * (normal[0]*(dVA[0]-dVB[0]) + normal[1]*(dVA[1]-dVB[1]) + normal[2]*(dVA[2]-dVB[2]));
//PfxFloat oldImpulse = constraint.m_accumImpulse;
float oldImpulse = constraint[7];
//constraint.m_accumImpulse = SCE_PFX_CLAMP(oldImpulse + deltaImpulse,constraint.m_lowerLimit,constraint.m_upperLimit);
if((oldImpulse + deltaImpulse) > constraint[6]){
if(constraint[6] > constraint[5]){
constraint[7] = constraint[6];
}else {
constraint[7] = constraint[5];
}
} else {
if(oldImpulse + deltaImpulse > constraint[5]){
constraint[7] = oldImpulse + deltaImpulse;
}else {
constraint[7] = constraint[5];
}
}
//deltaImpulse = constraint.m_accumImpulse - oldImpulse;
deltaImpulse = constraint[7] - oldImpulse;
//deltaLinearVelocityA += deltaImpulse * massInvA * normal;
deltaLinearVelocityA[0] += deltaImpulse * massInvA * normal[0];
deltaLinearVelocityA[1] += deltaImpulse * massInvA * normal[1];
deltaLinearVelocityA[2] += deltaImpulse * massInvA * normal[2];
deltaLinearVelocityA[3] += deltaImpulse * massInvA * normal[3];
//deltaAngularVelocityA += deltaImpulse * inertiaInvA * cross(rA,normal);
deltaAngularVelocityA[0] += deltaImpulse * ((inertiaInvA[0] * (rA[1]*normal[2] - rA[2]*normal[1])) +
(inertiaInvA[4] * (rA[2]*normal[0] - rA[0]*normal[2])) +//
(inertiaInvA[8] * (rA[0]*normal[1] - rA[1]*normal[0])));
deltaAngularVelocityA[1] += deltaImpulse * ((inertiaInvA[1] * (rA[1]*normal[2] - rA[2]*normal[1])) +
(inertiaInvA[5] * (rA[2]*normal[0] - rA[0]*normal[2])) +
(inertiaInvA[9] * (rA[0]*normal[1] - rA[1]*normal[0])));
deltaAngularVelocityA[2] += deltaImpulse * ((inertiaInvA[2] * (rA[1]*normal[2] - rA[2]*normal[1])) +
(inertiaInvA[6] * (rA[2]*normal[0] - rA[0]*normal[2])) +
(inertiaInvA[10] * (rA[0]*normal[1] - rA[1]*normal[0])));
deltaAngularVelocityA[3] += deltaImpulse * ((inertiaInvA[3] * (rA[1]*normal[2] - rA[2]*normal[1])) +
(inertiaInvA[7] * (rA[2]*normal[0] - rA[0]*normal[2])) +
(inertiaInvA[11] * (rA[0]*normal[1] - rA[1]*normal[0])));
//deltaLinearVelocityB -= deltaImpulse * massInvB * normal;
deltaLinearVelocityB[0] -= deltaImpulse * massInvB * normal[0];
deltaLinearVelocityB[1] -= deltaImpulse * massInvB * normal[1];
deltaLinearVelocityB[2] -= deltaImpulse * massInvB * normal[2];
deltaLinearVelocityB[3] -= deltaImpulse * massInvB * normal[3];
//deltaAngularVelocityB -= deltaImpulse * inertiaInvB * cross(rB,normal);
deltaAngularVelocityB[0] -= deltaImpulse * ((inertiaInvB[0] * (rB[1]*normal[2] - rB[2]*normal[1])) +
(inertiaInvB[4] * (rB[2]*normal[0] - rB[0]*normal[2])) +
(inertiaInvB[8] * (rB[0]*normal[1] - rB[1]*normal[0])));
deltaAngularVelocityB[1] -= deltaImpulse * ((inertiaInvB[1] * (rB[1]*normal[2] - rB[2]*normal[1])) +
(inertiaInvB[5] * (rB[2]*normal[0] - rB[0]*normal[2])) +
(inertiaInvB[9] * (rB[0]*normal[1] - rB[1]*normal[0])));
deltaAngularVelocityB[2] -= deltaImpulse * ((inertiaInvB[2] * (rB[1]*normal[2] - rB[2]*normal[1])) +
(inertiaInvB[6] * (rB[2]*normal[0] - rB[0]*normal[2])) +
(inertiaInvB[10] * (rB[0]*normal[1] - rB[1]*normal[0])));
deltaAngularVelocityB[3] -= deltaImpulse * ((inertiaInvB[3] * (rB[1]*normal[2] - rB[2]*normal[1])) +
(inertiaInvB[7] * (rB[2]*normal[0] - rB[0]*normal[2])) +
(inertiaInvB[11] * (rB[0]*normal[1] - rB[1]*normal[0])));
}
/*//////////////////////////////////////Source Code///////////////////////////////////////
void pfxSolveLinearConstraintRow(
PfxConstraintRow &constraint, // 8 floats
PfxVector3 &deltaLinearVelocityA, // 4 floats
PfxVector3 &deltaAngularVelocityA, // 4 floats
PfxFloat massInvA, // 1 float
const PfxMatrix3 &inertiaInvA, // 12 floats
const PfxVector3 &rA, // 4 floats
PfxVector3 &deltaLinearVelocityB, // 4 floats
PfxVector3 &deltaAngularVelocityB, // 4 floats
PfxFloat massInvB, // 1 float
const PfxMatrix3 &inertiaInvB, // 12 floats
const PfxVector3 &rB) // 4 floats
{
// PfxConstraintRow structure:
// PfxFloat m_normal[3];
// PfxFloat m_rhs;
// PfxFloat m_jacDiagInv;
// PfxFloat m_lowerLimit;
// PfxFloat m_upperLimit;
// PfxFloat m_accumImpulse;
const PfxVector3 normal(pfxReadVector3(constraint.m_normal));
PfxFloat deltaImpulse = constraint.m_rhs;
PfxVector3 dVA = deltaLinearVelocityA + cross(deltaAngularVelocityA,rA);
PfxVector3 dVB = deltaLinearVelocityB + cross(deltaAngularVelocityB,rB);
deltaImpulse -= constraint.m_jacDiagInv * dot(normal,dVA-dVB);
PfxFloat oldImpulse = constraint.m_accumImpulse;
constraint.m_accumImpulse = SCE_PFX_CLAMP(oldImpulse + deltaImpulse,constraint.m_lowerLimit,constraint.m_upperLimit);
deltaImpulse = constraint.m_accumImpulse - oldImpulse;
deltaLinearVelocityA += deltaImpulse * massInvA * normal;
deltaAngularVelocityA += deltaImpulse * inertiaInvA * cross(rA,normal);
deltaLinearVelocityB -= deltaImpulse * massInvB * normal;
deltaAngularVelocityB -= deltaImpulse * inertiaInvB * cross(rB,normal);
}
*/
void pfxSolveLinearConstraintRowNeonInline(float *constraint, // 8 floats
float *deltaLinearVelocityA, // 4 floats
float *deltaAngularVelocityA, // 4 floats
float *massInvA, // 1 floats
float *inertiaInvA, // 12 floats
float *rA, // 4 floats
float *deltaLinearVelocityB, // 4 floats
float *deltaAngularVelocityB, // 4 floats
float *massInvB, // 1 floats
float *inertiaInvB, // 12 floats
float *rB) // 4 floats
{
// "r" (constraint), //%0
// "r" (deltaLinearVelocityA), //%1
// "r" (deltaAngularVelocityA), //%2
// "r" (massInvA), //%3
// "r" (inertiaInvA), //%4
// "r" (rA), //%5
// "r" (deltaLinearVelocityB), //%6
// "r" (deltaAngularVelocityB), //%7
// "r" (massInvB), //%8
// "r" (inertiaInvB), //%9
// "r" (rB) //%10
asm volatile
(
//Loads beforehand so the normal vector will be able to have zero in the 4th element
"vld1.32 {q0}, [%1] \n\t" //LOAD => deltaLinearVelocityA ----------------> q0
"vld1.32 {q1}, [%2] \n\t" //LOAD => deltaAngularVelocityA ---------------> q1
"vld1.32 {q2}, [%6] \n\t" //LOAD => deltaLinearVelocityB ----------------> q2
"vld1.32 {q3}, [%7] \n\t" //LOAD => deltaAngularVelocityB ---------------> q3
//const PfxVector3 normal(pfxReadVector3(constraint.m_normal));
"vld1.32 {q4}, [%0]! \n\t" //LOAD => constraint normal--------------------> q4
"vdup.32 d10, d1[1] \n\t" //set 4th element on normal vector to zero
"vtrn.32 d9, d10 \n\t" //LOAD => deltaImpulse ------------------------> q5 (d10[0] only)
//PfxFloat deltaImpulse = constraint.m_rhs;
"vld1.32 {d12}, [%0]! \n\t" //LOAD => constraint variables ----------------> q6
"vld1.32 {d13[0]}, [%0]! \n\t" //constraint variables
"vld1.32 {d13[1]}, [%0] \n\t" //constraint load of remaining variables loaded this way keep pointer to m_accumilate in order to store back in later
//PfxVector3 dVA = deltaLinearVelocityA + cross(deltaAngularVelocityA,rA);
//PfxVector3 dVB = deltaLinearVelocityB + cross(deltaAngularVelocityB,rB);
"vld1.32 {d14[1]}, [%5] \n\t" //LOAD => rA for cross product use 1A ---------> q7 (save these for use later)
"vld1.32 {d18[1]}, [%10] \n\t" //LOAD => rB for cross product use 1B ---------> q9
"vld1.32 {d17[0]}, [%5]! \n\t" //LOAD => rA for cross product use 2A ---------> q8
"vld1.32 {d21[0]}, [%10]! \n\t" //LOAD => rB for cross product use 2B ---------> q10
"vld1.32 {d16}, [%5] \n\t" //rA 2
"vld1.32 {d20}, [%10] \n\t" //rB 2
"vld1.32 {d15[0]}, [%5]! \n\t" //rA 1
"vld1.32 {d19[0]}, [%10]! \n\t" //rB 1
"vld1.32 {d14[0]}, [%5]! \n\t" //rA 1
"vld1.32 {d18[0]}, [%10]! \n\t" //rB 1
"vld1.32 {d15[1]}, [%5] \n\t" //rA 1
"vld1.32 {d19[1]}, [%10] \n\t" //rB 1
"vld1.32 {d17[1]}, [%5]! \n\t" //rA 2
"vld1.32 {d21[1]}, [%10]! \n\t" //rB 2
"vdup.32 q12, d1[1] \n\t" //set deltaAngularVelocityB 2
"vdup.32 q13, d1[1] \n\t" //set deltaAngularVelocityB 1
"vdup.32 q14, d1[1] \n\t" //set deltaAngularVelocityA 1
"vdup.32 q15, d1[1] \n\t" //set deltaAngularVelocityA 2
"vadd.f32 q14, q14, q1 \n\t"
"vadd.f32 q13, q13, q3 \n\t"
"vadd.f32 q15, q15, q1 \n\t"
"vadd.f32 q12, q12, q3 \n\t"
"vrev64.32 d28, d28 \n\t" //set deltaAngularVelocityA 1
"vrev64.32 d26, d26 \n\t" //set deltaAngularVelocityB 1
"vtrn.32 d30, d31 \n\t" //set deltaAngularVelocityA 2
"vtrn.32 d24, d25 \n\t" //set deltaAngularVelocityB 2
"vtrn.32 d28, d29 \n\t" //set deltaAngularVelocityA 1
"vtrn.32 d26, d27 \n\t" //set deltaAngularVelocityB 1
"vrev64.32 d30, d30 \n\t" //set deltaAngularVelocityA 2
"vrev64.32 d24, d24 \n\t" //set deltaAngularVelocityB 2
"vmul.f32 q14, q7, q14 \n\t" //operation for cross product 1A
"vmul.f32 q13, q9, q13 \n\t" //operation for cross product 1B
"vmls.f32 q14, q8, q15 \n\t" //operation for cross product 2A
"vmls.f32 q13, q10, q12 \n\t" //operation for cross product 2B
"vadd.f32 q14, q14, q0 \n\t" //operation for adding cross to linearVelocityA
"vadd.f32 q13, q13, q2 \n\t" //operation for adding cross to linearVelocityB
//LOAD => dVA --------------------------------> q14
//LOAD => dVB --------------------------------> q13
//FREE q11, q12, q15, d11
//deltaImpulse -= constraint.m_jacDiagInv * dot(normal,dVA-dVB);
"vsub.f32 q11, q14, q13 \n\t" //TEMP q11 => dVA-dVB for dot product
/*find*/ "vmul.f32 q11, q11, q4 \n\t" //operation for dot product
/*fastest*/ "vpadd.f32 d22, d22, d22 \n\t" //operation for dot product
/*dot*/ "vpadd.f32 d23, d23, d23 \n\t" //operation for dot product
"vadd.f32 d22, d22, d23 \n\t" //operation for dot product
"vmul.f32 d22, d22, d12[0] \n\t" //m_jacDiagInv times dot product result
"vsub.f32 d10, d10, d22 \n\t" //subtract result from deltaImpule
//PfxFloat oldImpulse = constraint.m_accumImpulse;
"vdup.32 d11, d13[1] \n\t" //LOAD => oldImpulse -------------------------> q5 (d11 only)
//constraint.m_accumImpulse = SCE_PFX_CLAMP(oldImpulse + deltaImpulse,constraint.m_lowerLimit,constraint.m_upperLimit);
"vdup.32 d23, d13[0] \n\t" //TEMP q11 => m_upperLimit (d23 only)
"vdup.32 d24, d12[1] \n\t" //TEMP q12 => m_lowerLimit (d34 only)
"vadd.f32 d25, d10, d11 \n\t" //TEMP q12 => deltaImpulse + oldImplues (d25 only)
"vmin.f32 d25, d25, d23 \n\t" //operation MIN(v,b)
"vmax.f32 d22, d25, d24 \n\t" //operation MAX(a,MIN(v,b)
"vst1.32 {d22[0]}, [%0] \n\t" //store m_accumImpulse (incremented so that it can be reloaded for cross product later)
//deltaImpulse = constraint.m_accumImpulse - oldImpulse;
"vsub.f32 d10, d22, d11 \n\t" //operation to calculate new deltaImpule
//FREE q6, q11, q12, q13, q14, q15, d11
//deltaLinearVelocityB -= deltaImpulse * massInvB * normal;
//deltaLinearVelocityA += deltaImpulse * massInvA * normal;
"vld1.32 {d11[0]}, [%3] \n\t" //LOAD => massInvA ---------------------------> q5 (d11[0] only)
"vld1.32 {d11[1]}, [%8] \n\t" //LOAD => massInvB ---------------------------> q5 (d11[1] only)
"vmul.f32 q11, q4, d11[0] \n\t" //TEMP q11 => operation normal times massInvA A
"vmul.f32 q12, q4, d11[1] \n\t" //TEMP q12 => operation normal times massInvB B
"vmul.f32 q11, q11, d10[0] \n\t" //TEMP q11 => operation result times DeltaImpulse A
"vmul.f32 q12, q12, d10[0] \n\t" //TEMP q12 => operation result times DeltaImpulse B
"vadd.f32 q0, q0, q11 \n\t" //operation create new deltaLinearVelocityA A
"vsub.f32 q2, q2, q12 \n\t" //operation create new deltaLinearVelocityB B
"vst1.32 {q0}, [%1] \n\t" //store the new deltaLinearVelocityA A
"vst1.32 {q2}, [%6] \n\t" //store the new deltaLinearVelocityB B
//FREE q0, q2, q6, q11, q12, q13, q14, q15, d11
//deltaAngularVelocityA += deltaImpulse * inertiaInvA * cross(rA,normal);
//deltaAngularVelocityB -= deltaImpulse * inertiaInvB * cross(rB,normal);
"vdup.32 q14, d1[1] \n\t" //set normal cross load
"vdup.32 q15, d1[1] \n\t" //
"vadd.f32 q14, q14, q4 \n\t" //
"vadd.f32 q15, q15, q4 \n\t" //
"vrev64.32 d28, d28 \n\t" //
"vtrn.32 d30, d31 \n\t" //
"vtrn.32 d28, d29 \n\t" //
"vrev64.32 d30, d30 \n\t" //
"vmul.f32 q0, q8, q15 \n\t" //operation for cross product A
"vmul.f32 q2, q10, q15 \n\t" //operation for cross product B
"vmls.f32 q0, q14, q7 \n\t" //operation for cross product A
"vmls.f32 q2, q14, q9 \n\t" //operation for cross product B
//LOAD => cross product result A ------------> q0
//LOAD => cross product result B ------------> q2
//FREE q6, q7, q8, q9, q10, q11, q12, q13, q14, q15, d11
"vld1.32 {q13-q14}, [%4]! \n\t" //LOAD => inertiaInvA col0, col1 A ----------> q13, q14
"vld1.32 {q9-q10}, [%9]! \n\t" //LOAD => inertiaInvB col0, col1 B -----------> q9, q10
"vld1.32 {q15}, [%4] \n\t" //LOAD => inertiaInvA col2 A ----------------> q5
"vld1.32 {q11}, [%9] \n\t" //LOAD => inertiaInvB col2 B -----------------> q11
"vmul.f32 q13, q13, d0[0] \n\t" //operation inertiaInvA col0 = (col0) * (crossA elem0) A
"vmul.f32 q9, q9, d4[0] \n\t" //operation inertiaInvB col0 = (col0) * (crossB elem0) B
"vmla.f32 q13, q14, d0[1] \n\t" //operation inertiaInvA col1 = (col1) * (crossA elem1) A
"vmla.f32 q9, q10, d4[1] \n\t" //operation inertiaInvB col1 = (col1) * (crossB elem1) B
"vmla.f32 q13, q15, d1[0] \n\t" //operation inertiaInvA col2 = (col2) * (crossA elem2) A
"vmla.f32 q9, q11, d5[0] \n\t" //operation inertiaInvB col2 = (col2) * (crossB elem2) B
"vmul.f32 q13, q13, d10[0] \n\t" //operation inertiaInvA times deltaImpulse A
"vmul.f32 q9, q9, d10[0] \n\t" //operation inertiaInvB times deltaImpulse B
"vadd.f32 q1, q1, q13 \n\t" //operation accumulate the deltaAngularVelocityA A
"vsub.f32 q3, q3, q9 \n\t" //operation accumulate the deltaAngularVelocityB B
"vst1.32 {q1}, [%2] \n\t" //store deltaAngularVelocityA A
"vst1.32 {q3}, [%7] \n\t" //store deltaAngularVelocityB B
: // NO outputs! It is important to *not* put anything here. (Putting something here forces use of r0, which wreak havok
: "r" (constraint), "r" (deltaLinearVelocityA), "r" (deltaAngularVelocityA), "r" (massInvA), "r" (inertiaInvA), "r" (rA), "r" (deltaLinearVelocityB), "r" (deltaAngularVelocityB), "r" (massInvB), "r" (inertiaInvB), "r" (rB) //inputs
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" // clobbers
);
}
//----------------------------------------------------------------------------
// TestNeonSolveLinearConstraintRow
//
/// Run timing study of the Linear Constraint Row Solver from above, writing the
/// results to the Android verbose log.
//----------------------------------------------------------------------------
void TestNeonSolveLinearConstraintRow()
{
//Scalar
float SET_ALIGNMENT(64) constraintDataScalar[] = {0.0f,1.0f,2.0f,3.0f,1.0f,2.0f,3.0f,4.0f};
float SET_ALIGNMENT(64) deltaLinearVelocityADataScalar[] = {1.0f,2.0f,3.0f,0.0f};
float SET_ALIGNMENT(64) deltaAngularVelocityADataScalar[] = {2.0f,3.0f,4.0f,0.0f};
float SET_ALIGNMENT(64) inertiaInvADataScalar[] = {4.0f,5.0f,6.0f,0.0f,5.0f,6.0f,7.0f,0.0f,6.0f,7.0f,8.0f,0.0f};
float SET_ALIGNMENT(64) rADataScalar[] = {5.0f,6.0f,7.0f,0.0f};
float SET_ALIGNMENT(64) deltaLinearVelocityBDataScalar[] = {6.0f,7.0f,8.0f,0.0f};
float SET_ALIGNMENT(64) deltaAngularVelocityBDataScalar[] = {9.0f,8.0f,7.0f,0.0f};
float SET_ALIGNMENT(64) inertiaInvBDataScalar[] = {7.0f,6.0f,5.0f,0.0f,9.0f,8.0f,7.0f,0.0f,6.0f,7.0f,8.0f,0.0f};
float SET_ALIGNMENT(64) rBDataScalar[] = {6.0f,5.0f,4.0f,0.0f};
float *constraintScalar = &constraintDataScalar[0];
float *deltaLinearVelocityAScalar = &deltaLinearVelocityADataScalar[0];
float *deltaAngularVelocityAScalar = &deltaAngularVelocityADataScalar[0];
float massInvAScalar = 3.0f;
float *inertiaInvAScalar = &inertiaInvADataScalar[0];
float *rAScalar = &rADataScalar[0];
float *deltaLinearVelocityBScalar = &deltaLinearVelocityBDataScalar[0];
float *deltaAngularVelocityBScalar = &deltaAngularVelocityBDataScalar[0];
float massInvBScalar = 8.0f;
float *inertiaInvBScalar = &inertiaInvBDataScalar[0];
float *rBScalar = &rBDataScalar[0];
//Neon
float SET_ALIGNMENT(64) constraintDataNeon[] = {0.0f,1.0f,2.0f,3.0f,1.0f,2.0f,3.0f,4.0f};
float SET_ALIGNMENT(64) deltaLinearVelocityADataNeon[] = {1.0f,2.0f,3.0f,0.0f};
float SET_ALIGNMENT(64) deltaAngularVelocityADataNeon[] = {2.0f,3.0f,4.0f,0.0f};
float SET_ALIGNMENT(64) massInvADataNeon[] = {3.0f};
float SET_ALIGNMENT(64) inertiaInvADataNeon[] = {4.0f,5.0f,6.0f,0.0f,5.0f,6.0f,7.0f,0.0f,6.0f,7.0f,8.0f,0.0f};
float SET_ALIGNMENT(64) rADataNeon[] = {5.0f,6.0f,7.0f,0.0f};
float SET_ALIGNMENT(64) deltaLinearVelocityBDataNeon[] = {6.0f,7.0f,8.0f,0.0f};
float SET_ALIGNMENT(64) deltaAngularVelocityBDataNeon[] = {9.0f,8.0f,7.0f,0.0f};
float SET_ALIGNMENT(64) massInvBDataNeon[] = {8.0f};
float SET_ALIGNMENT(64) inertiaInvBDataNeon[] = {7.0f,6.0f,5.0f,0.0f,9.0f,8.0f,7.0f,0.0f,6.0f,7.0f,8.0f,0.0f};
float SET_ALIGNMENT(64) rBDataNeon[] = {6.0f,5.0f,4.0f,0.0f};
float *constraintNeon = &constraintDataNeon[0];
float *deltaLinearVelocityANeon = &deltaLinearVelocityADataNeon[0];
float *deltaAngularVelocityANeon = &deltaAngularVelocityADataNeon[0];
float *massInvANeon = &massInvADataNeon[0];
float *inertiaInvANeon = &inertiaInvADataNeon[0];
float *rANeon = &rADataNeon[0];
float *deltaLinearVelocityBNeon = &deltaLinearVelocityBDataNeon[0];
float *deltaAngularVelocityBNeon = &deltaAngularVelocityBDataNeon[0];
float *massInvBNeon = &massInvBDataNeon[0];
float *inertiaInvBNeon = &inertiaInvBDataNeon[0];
float *rBNeon = &rBDataNeon[0];
char szMsg[256];
sprintf(szMsg, "");
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg,"---------------------------------------");
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg,"TestNeonSolveLinearConstraintRow Start");
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sce::PhysicsEffects::PfxPerfCounter pc;
double dTimeSpan, dRefTimeSpan;;
unsigned int uiNumTries = 10000000;
unsigned int i;
// profile pfxSolveLinearConstraintRowScalar with direct memory return, c++ version
pc.countBegin("");
for(i = 0; i < uiNumTries; i++)
{
pfxSolveLinearConstraintRowScalar(
constraintScalar,
deltaLinearVelocityAScalar,
deltaAngularVelocityAScalar,
massInvAScalar,
inertiaInvAScalar,
rAScalar,
deltaLinearVelocityBScalar,
deltaAngularVelocityBScalar,
massInvBScalar,
inertiaInvBScalar,
rBScalar
);
}
pc.countEnd();
dTimeSpan = pc.getCountTime(0);
pc.resetCount();
dRefTimeSpan = dTimeSpan;
sprintf(szMsg, "Time to do %i calls for pfxSolveLinearConstraintRowScalar: %f secs, speedup: %5.2f",
uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg, "constraint.m_accumImpulse = <%f>", constraintScalar[7]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg, "deltaLinearVelocityA = <%f,%f,%f,%f>",
deltaLinearVelocityAScalar[0], deltaLinearVelocityAScalar[1], deltaLinearVelocityAScalar[2], deltaLinearVelocityAScalar[3]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg, "deltaAngularVelocityA = <%f,%f,%f,%f>",
deltaAngularVelocityAScalar[0], deltaAngularVelocityAScalar[1], deltaAngularVelocityAScalar[2], deltaAngularVelocityAScalar[3]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg, "deltaLinearVelocityB = <%f,%f,%f,%f>",
deltaLinearVelocityBScalar[0], deltaLinearVelocityBScalar[1], deltaLinearVelocityBScalar[2], deltaLinearVelocityBScalar[3]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg, "deltaAngularVelocityB = <%f,%f,%f,%f>",
deltaAngularVelocityBScalar[0], deltaAngularVelocityBScalar[1], deltaAngularVelocityBScalar[2], deltaAngularVelocityBScalar[3]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
// profile NEON pfxSolveLinearConstraintRowNeonInline with direct memory return, inline assembly version
pc.countBegin("");
for(i = 0; i < uiNumTries; i++)
{
pfxSolveLinearConstraintRowNeonInline(constraintNeon,
deltaLinearVelocityANeon,
deltaAngularVelocityANeon,
massInvANeon,
inertiaInvANeon,
rANeon,
deltaLinearVelocityBNeon,
deltaAngularVelocityBNeon,
massInvBNeon,
inertiaInvBNeon,
rBNeon);
}
pc.countEnd();
dTimeSpan = pc.getCountTime(0);
pc.resetCount();
sprintf(szMsg, "Time to do %i calls for pfxSolveLinearConstraintRowNeon: %f secs, speedup: %5.2f",
uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg, "constraint.m_accumImpulse = <%f>", constraintNeon[7]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg, "deltaLinearVelocityA = <%f,%f,%f,%f>",
deltaLinearVelocityANeon[0], deltaLinearVelocityANeon[1], deltaLinearVelocityANeon[2], deltaLinearVelocityANeon[3]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg, "deltaAngularVelocityA = <%f,%f,%f,%f>",
deltaAngularVelocityANeon[0], deltaAngularVelocityANeon[1], deltaAngularVelocityANeon[2], deltaAngularVelocityANeon[3]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg, "deltaLinearVelocityB = <%f,%f,%f,%f>",
deltaLinearVelocityBNeon[0], deltaLinearVelocityBNeon[1], deltaLinearVelocityBNeon[2], deltaLinearVelocityBNeon[3]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg, "deltaAngularVelocityB = <%f,%f,%f,%f>",
deltaAngularVelocityBNeon[0], deltaAngularVelocityBNeon[1], deltaAngularVelocityBNeon[2], deltaAngularVelocityBNeon[3]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg,"TestNeonSolveLinearConstraintRow End");
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
}

View File

@@ -0,0 +1,150 @@
/*
Applied Research Associates Inc. (c)2011
Redistribution and use in source and binary forms,
with or without modification, are permitted provided that the
following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the Applied Research Associates Inc nor the names
of its contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
#include "test_neon.h"
#define SCE_PFX_USE_PERFCOUNTER
#include "physics_effects.h"
#include <arm_neon.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <android/log.h>
// This works with gcc
#define SET_ALIGNMENT(alignment) __attribute__((__aligned__((alignment))))
// assembly implementations
extern "C"
{
void Transform3OperatorMultiplyNeon(float *trfm, float *vec3, float *pfResult);
}
//----------------------------------------------------------------------------
// Transform3OperatorMultiplyScalar
//
/// Performs a multiply operation on a tranform3 using scalar math, storing the
/// result directly into system memory.
///
/// @param trfm Input transform. Must point to 4x 4 float values
/// @param vec3 Input vector3. Must point to 4 float values
/// @param pfResult [in] pointer to a float(vector3). [out] Contains the result
//----------------------------------------------------------------------------
//inline const Vector3 Transform3::operator *( const Vector3 & vec ) const
//{
// return Vector3(
// ( ( ( mCol0.getX() * vec.getX() ) + ( mCol1.getX() * vec.getY() ) ) + ( mCol2.getX() * vec.getZ() ) ),
// ( ( ( mCol0.getY() * vec.getX() ) + ( mCol1.getY() * vec.getY() ) ) + ( mCol2.getY() * vec.getZ() ) ),
// ( ( ( mCol0.getZ() * vec.getX() ) + ( mCol1.getZ() * vec.getY() ) ) + ( mCol2.getZ() * vec.getZ() ) )
// );
//}
void Transform3OperatorMultiplyScalar(float *trfm, float *vec3, float *pfResult)
{
pfResult[0] = ( ( ( trfm[0] * vec3[0] ) + ( trfm[4] * vec3[1] ) ) + ( trfm[8] * vec3[2] ) );
pfResult[1] = ( ( ( trfm[1] * vec3[0] ) + ( trfm[5] * vec3[1] ) ) + ( trfm[9] * vec3[2] ) );
pfResult[2] = ( ( ( trfm[2] * vec3[0] ) + ( trfm[6] * vec3[1] ) ) + ( trfm[10] * vec3[2] ) );
pfResult[3] = 0.0f;
}
//----------------------------------------------------------------------------
// TestNeonTransform3OperatorMultiply
//
/// Run timing study of the Tranform3 multiply operator from above, writing the
/// results to the Android verbose log.
//----------------------------------------------------------------------------
void TestNeonTransform3OperatorMultiply()
{
float SET_ALIGNMENT(64) data1[] = {float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f};
float SET_ALIGNMENT(64) data2[] = {float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f};
float *trfm = &data1[0];
float *vec3 = &data2[0];
char szMsg[256];
sprintf(szMsg, "");
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg,"---------------------------------------");
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg,"TestNeonTransform3OperatorMultiply Start");
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg, "Test input vec3: <%f,%f,%f,%f>",
vec3[0], vec3[1], vec3[2], vec3[3]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg, "Test input trfm: <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>>",
trfm[0], trfm[1], trfm[2], trfm[3],
trfm[4], trfm[5], trfm[6], trfm[7],
trfm[8], trfm[9], trfm[10], trfm[11],
trfm[12], trfm[13], trfm[14], trfm[15]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
float SET_ALIGNMENT(64) fResult[4];
sce::PhysicsEffects::PfxPerfCounter pc;
double dTimeSpan, dRefTimeSpan;;
unsigned int uiNumTries = 10000000;
unsigned int i;
// profile scalar Transform3OperatorMultiplyScalar with direct memory return, c++ version
fResult[0] = 0.0f;
pc.countBegin("");
for(i = 0; i < uiNumTries; i++)
{
Transform3OperatorMultiplyScalar(trfm, vec3, fResult);
}
pc.countEnd();
dTimeSpan = pc.getCountTime(0);
pc.resetCount();
dRefTimeSpan = dTimeSpan;
sprintf(szMsg, "Time to do %i calls for Transform3OperatorMultiplyScalar: %f secs, speedup: %5.2f, result value=<%f,%f,%f,%f>",
uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2], fResult[3]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
// profile NEON Transform3OperatorMultiplyNeon with direct memory return, assembly version
fResult[0] = 0.0f;
pc.countBegin("");
for(i = 0; i < uiNumTries; i++)
{
Transform3OperatorMultiplyNeon(trfm, vec3, fResult);
}
pc.countEnd();
dTimeSpan = pc.getCountTime(0);
pc.resetCount();
sprintf(szMsg, "Time to do %i calls for Transform3OperatorMultiplyNeon: %f secs, speedup: %5.2f, result value=<%f,%f,%f,%f>",
uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2], fResult[3]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg,"TestNeonTransform3OperatorMultiply End");
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
}

View File

@@ -0,0 +1,154 @@
/*
Applied Research Associates Inc. (c)2011
Redistribution and use in source and binary forms,
with or without modification, are permitted provided that the
following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the Applied Research Associates Inc nor the names
of its contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
#include "test_neon.h"
#define SCE_PFX_USE_PERFCOUNTER
#include "physics_effects.h"
#include <arm_neon.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <android/log.h>
// This works with gcc
#define SET_ALIGNMENT(alignment) __attribute__((__aligned__((alignment))))
// assembly implementations
extern "C"
{
void TransposeMatrix3Neon(float *mCol, float *pfResult);
}
//----------------------------------------------------------------------------
// TransposeMatrix3Scalar
//
/// Performs a transpose on a matrix3 using scalar math, storing the
/// result directly into system memory.
///
/// @param mCol Input matrix. Must point to 3x 4 float values
/// @param pfResult [in] pointer to a float(matrix3). [out] Contains the result
//----------------------------------------------------------------------------
//inline const Matrix3 transpose( const Matrix3 & mat )
//{
// return Matrix3(
// Vector3( mat.getCol0().getX(), mat.getCol1().getX(), mat.getCol2().getX() ),
// Vector3( mat.getCol0().getY(), mat.getCol1().getY(), mat.getCol2().getY() ),
// Vector3( mat.getCol0().getZ(), mat.getCol1().getZ(), mat.getCol2().getZ() )
// );
//}
void TransposeMatrix3Scalar(float *mCol, float *pfResult)
{
pfResult[0] = mCol[0];
pfResult[1] = mCol[4];
pfResult[2] = mCol[8];
pfResult[3] = 0.0f;
pfResult[4] = mCol[1];
pfResult[5] = mCol[5];
pfResult[6] = mCol[9];
pfResult[7] = 0.0f;
pfResult[8] = mCol[2];
pfResult[9] = mCol[6];
pfResult[10] = mCol[10];
pfResult[11] = 0.0f;
}
//----------------------------------------------------------------------------
// TestNeonTransposeMatrix3
//
/// Run timing study of the matrix3 transpose functions above, writing the
/// results to the Android verbose log.
//----------------------------------------------------------------------------
void TestNeonTransposeMatrix3()
{
float SET_ALIGNMENT(64) data[] = {float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f};
float *mCol = &data[0];
char szMsg[256];
sprintf(szMsg, "");
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg,"---------------------------------------");
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg,"TestNeonTransposeMatrix3 Start");
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg, "Test input mCol: <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
mCol[0], mCol[1], mCol[2], mCol[3],
mCol[4], mCol[5], mCol[6], mCol[7],
mCol[8], mCol[9], mCol[10], mCol[11]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
float SET_ALIGNMENT(64) fResult[12];
sce::PhysicsEffects::PfxPerfCounter pc;
double dTimeSpan, dRefTimeSpan;;
unsigned int uiNumTries = 10000000;
unsigned int i;
// profile scalar TransposeMatrix3Scalar with direct memory return, c++ version
fResult[0] = 0.0f;
pc.countBegin("");
for(i = 0; i < uiNumTries; i++)
{
TransposeMatrix3Scalar(mCol, fResult);
}
pc.countEnd();
dTimeSpan = pc.getCountTime(0);
pc.resetCount();
dRefTimeSpan = dTimeSpan;
sprintf(szMsg, "Time to do %i calls for TransposeMatrix3Scalar: %f secs, speedup: %5.2f, result value=<%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2], fResult[3],
fResult[4], fResult[5], fResult[6], fResult[7],
fResult[8], fResult[9], fResult[10], fResult[11]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
// profile NEON TransposeMatrix3Neon with direct memory return, assembly version
fResult[0] = 0.0f;
pc.countBegin("");
for(i = 0; i < uiNumTries; i++)
{
TransposeMatrix3Neon(mCol, fResult);
}
pc.countEnd();
dTimeSpan = pc.getCountTime(0);
pc.resetCount();
sprintf(szMsg, "Time to do %i calls for TransposeMatrix3Neon: %f secs, speedup: %5.2f, result value=<%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2], fResult[3],
fResult[4], fResult[5], fResult[6], fResult[7],
fResult[8], fResult[9], fResult[10], fResult[11]);
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
sprintf(szMsg,"TestNeonTransposeMatrix3 End");
__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
}