Add PhysicsEffects to Extras. The build is only tested on Windows and Android.

The Android/NEON optimized version of Physics Effects is thanks to Graham Rhodes and Anthony Hamilton, See Issue 587
2012-03-05 04:59:58 +00:00
parent 6cf8dfc202
commit a93a661b94
462 changed files with 86626 additions and 0 deletions
--- a/Extras/PhysicsEffects/sample/test_ARM_NEON_performance/neon_cross_product.S
+++ b/Extras/PhysicsEffects/sample/test_ARM_NEON_performance/neon_cross_product.S
@@ -0,0 +1,146 @@
+@
+@ Applied Research Associates Inc. (c)2011
+@
+@ Redistribution and use in source and binary forms,
+@   with or without modification, are permitted provided that the
+@   following conditions are met:
+@    * Redistributions of source code must retain the above copyright
+@      notice, this list of conditions and the following disclaimer.
+@    * Redistributions in binary form must reproduce the above copyright
+@      notice, this list of conditions and the following disclaimer in the
+@      documentation and/or other materials provided with the distribution.
+@    * Neither the name of the Applied Research Associates Inc nor the names
+@      of its contributors may be used to endorse or promote products derived
+@      from this software without specific prior written permission.
+@
+@   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+@   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+@   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+@   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+@   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+@   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+@   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+@   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+@   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+@   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+@   POSSIBILITY OF SUCH DAMAGE.
+@
+	.syntax unified
+	.arch armv7-a
+	.fpu neon
+	.thumb
+	.text
+	.align 2
+
+@ cross product, result stored directly to memory
+
+	.global	CrossProductNeonResultInMemoryAssembly
+	.thumb_func
+CrossProductNeonResultInMemoryAssembly:
+	.fnstart
+	vld1.32	{d18,d19}, [r1]	@ input <x2,y2,z2,w2> = d18,d19
+	vld1.32	{d16,d17}, [r0]	@ input <x1,y1,z1,w1> = d16,d17
+@ rearrange inputs into convenient order
+	vtrn.32 d18,d19			@  q9 = <x2,z2,y2,w2> = d18,d19
+	vrev64.32 d16,d16		@  q8 = <y1,x1,z1,w1> = d16,d17
+	vrev64.32 d18,d18		@  q9 = <z2,x2,y2,w2> = d18,d19
+	vtrn.32 d16,d17			@  q8 = <y1,z1,x1,w1> = d16,d17
+@ perform first half of cross product using rearranged inputs
+	vmul.f32 q10, q8, q9	@ q10 = <y1*z2,z1*x2,x1*y2,w1*w2>
+@ rearrange inputs again
+	vtrn.32 d18,d19			@  q9 = <z2,y2,x2,w2> = d18,d19
+	vrev64.32 d16,d16		@  q8 = <z1,y1,x1,w1> = d16,d17
+	vrev64.32 d18,d18		@  q9 = <y2,z2,x2,w2> = d18,d19
+	vtrn.32 d16,d17			@  q8 = <z1,x1,y1,w1> = d16,d17
+@ perform last half of cross product using rearranged inputs
+	vmls.f32 q10, q8, q9	@ q10 = <y1*z2-y2*z1,z1*x2-z2*x1,x1*y2-x2*y1,w1*w2-w2*w1>
+@ and store the result
+	vst1.32	{q10}, [r2]
+	bx	lr
+	.fnend
+
+@ cross product, result stored directly to memory, version 2
+
+	.global	CrossProductNeonResultInMemoryAssembly2
+	.thumb_func
+CrossProductNeonResultInMemoryAssembly2:
+	.fnstart
+	vld1.32 {d18[1]}, [r1]!
+	vld1.32 {d19[0]}, [r1]!
+	vld1.32 {d18[0]}, [r1]!
+	vld1.32 {d19[1]}, [r1]	@  q9 = <z2,x2,y2,w2> = d18,d19
+	vld1.32 {d17[0]}, [r0]!
+	vld1.32 {d16}, [r0]!
+	vld1.32 {d17[1]}, [r0]	@  q8 = <y1,z1,x1,w1> = d16,d17
+@ perform first half of cross product using rearranged inputs
+	vmul.f32 q10, q8, q9	@ q10 = <y1*z2,z1*x2,x1*y2,w1*w2>
+@ rearrange inputs for second half
+	vtrn.32 d18,d19			@  q9 = <z2,y2,x2,w2> = d18,d19
+	vrev64.32 d16,d16		@  q8 = <z1,y1,x1,w1> = d16,d17
+	vrev64.32 d18,d18		@  q9 = <y2,z2,x2,w2> = d18,d19
+	vtrn.32 d16,d17			@  q8 = <z1,x1,y1,w1> = d16,d17
+@ perform last half of cross product using rearranged inputs
+	vmls.f32 q10, q8, q9	@ q10 = <y1*z2-y2*z1,z1*x2-z2*x1,x1*y2-x2*y1,w1*w2-w2*w1>
+@ and store the result
+	vst1.32	{q10}, [r2]
+	bx	lr
+	.fnend
+
+	.global	CrossProductNeonResultInMemoryAssembly3
+	.thumb_func
+CrossProductNeonResultInMemoryAssembly3:
+	.fnstart
+	vld1.32 {d18[1]}, [r1]!
+	vld1.32 {d19[0]}, [r1]!
+	vld1.32 {d18[0]}, [r1]!
+	vld1.32 {d19[1]}, [r1]!	@  q9 = <z2,x2,y2,w2> = d18,d19
+	vld1.32 {d17[0]}, [r0]!
+	vld1.32 {d16}, [r0]!
+	vld1.32 {d17[1]}, [r0]!	@  q8 = <y1,z1,x1,w1> = d16,d17
+@ perform first half of cross product using rearranged inputs
+	vmul.f32 q10, q8, q9	@ q10 = <y1*z2,z1*x2,x1*y2,w1*w2>
+	vld1.32 {d18[1]}, [r0]!
+	vld1.32 {d19[0]}, [r0]!
+	vld1.32 {d18[0]}, [r0]!
+	vld1.32 {d19[1]}, [r0]!	@  q9 = <z2,x2,y2,w2> = d18,d19
+	vld1.32 {d17[0]}, [r1]!
+	vld1.32 {d16}, [r1]!
+	vld1.32 {d17[1]}, [r1]!	@  q8 = <y1,z1,x1,w1> = d16,d17
+@ perform last half of cross product using rearranged inputs
+	vmls.f32 q10, q8, q9	@ q10 = <y1*z2-y2*z1,z1*x2-z2*x1,x1*y2-x2*y1,w1*w2-w2*w1>
+@ and store the result
+	vst1.32	{q10}, [r2]
+	bx	lr
+	.fnend
+
+@ cross product, result stored directly to memory, version 3
+
+@	.global	CrossProductNeonResultInMemoryAssembly3
+@	.thumb_func
+@CrossProductNeonResultInMemoryAssembly3:
+@	.fnstart
+@	ldr r3, table
+@	vld1.32	{d18,d19}, [r1]	@ input <x2,y2,z2,w2> = d18,d19
+@	vld1.32	{d16,d17}, [r0]	@ input <x1,y1,z1,w1> = d16,d17
+@ rearrange inputs into convenient order
+@	vld1.32 {q10}, [r3]!
+@	vld1.32 {q11}, [r3]
+@	vtbl.8 q12, {d18,d19}, q10
+@	vtbl.8 q13, {d16,d17}, q11
+@ perform first half of cross product using rearranged inputs
+@	vmul.f32 q14, q12, q13
+@ rearrange inputs again
+@	vtbl.8 q12, {d18,d19}, q11
+@	vtbl.8 q13, {d16,d17}, q10
+@ perform last half of cross product using rearranged inputs
+@	vmls.f32 q14, q12, q13
+@ and store the result
+@	vst1.32	{q14}, [r2]
+@	bx	lr
+@	.fnend
+
+@table:
+@z2,x2,y2,w2
+@	dcb 08,09,10,11,00,01,02,03,04,05,06,07,00,00,00,00
+@y1,z1,x1,w1
+@	dcb 04,05,06,07,08,09,10,11,00,01,02,03,00,00,00,00
--- a/Extras/PhysicsEffects/sample/test_ARM_NEON_performance/neon_dot_product.S
+++ b/Extras/PhysicsEffects/sample/test_ARM_NEON_performance/neon_dot_product.S
@@ -0,0 +1,98 @@
+@
+@ Applied Research Associates Inc. (c)2011
+@
+@ Redistribution and use in source and binary forms,
+@   with or without modification, are permitted provided that the
+@   following conditions are met:
+@    * Redistributions of source code must retain the above copyright
+@      notice, this list of conditions and the following disclaimer.
+@    * Redistributions in binary form must reproduce the above copyright
+@      notice, this list of conditions and the following disclaimer in the
+@      documentation and/or other materials provided with the distribution.
+@    * Neither the name of the Applied Research Associates Inc nor the names
+@      of its contributors may be used to endorse or promote products derived
+@      from this software without specific prior written permission.
+@
+@   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+@   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+@   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+@   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+@   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+@   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+@   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+@   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+@   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+@   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+@   POSSIBILITY OF SUCH DAMAGE.
+@
+	.syntax unified
+	.arch armv7-a
+	.fpu neon
+	.thumb
+	.text
+	.align 2
+
+@ dot product, result returned in register
+
+	.global	DotProductNeonResultInRegisterAssembly
+	.thumb_func
+DotProductNeonResultInRegisterAssembly:
+	.fnstart
+	vld1.32	{d16,d17}, [r0]	@ input <x1,y1,z1,w1>
+	vld1.32	{d18,d19}, [r1]	@ input <x2,y2,z2,w2>
+	vmul.f32 d14, d16, d18	@ d14=<x1*x2,y1*y2>
+	vmla.f32 d14, d17, d19	@ d14=d14+<z1*z2,w1*w2>=<x1*x2+z1*z2,y1*y2+w1*w2>
+	vpadd.f32 d14, d14, d14	@ add the two halves of d14 together, result in each lane
+	vmov.f32 r0, s28		@ s28 aliases the first lane of d14
+	bx	lr
+	.fnend
+
+@ dot product, result stored directly to memory
+
+	.global	DotProductNeonResultInMemoryAssembly
+	.thumb_func
+DotProductNeonResultInMemoryAssembly:
+	.fnstart
+	vld1.32	{d16,d17}, [r0]	@ input <x1,y1,z1,w1>
+	vld1.32	{d18,d19}, [r1]	@ input <x2,y2,z2,w2>
+	vmul.f32 d14, d16, d18	@ d14=<x1*x2,y1*y2>
+@ non-fused multiple accumulate
+	vmla.f32 d14, d17, d19	@ d14=d14+<z1*z2,w1*w2>=<x1*x2+z1*z2,y1*y2+w1*w2>
+@ fused multiple accumulate - not recognized by GNU assembler
+@@	vfma.f32 {d14}, d17, d19	@ d14=d14+<z1*z2,w1*w2>=<x1*x2+z1*z2,y1*y2+w1*w2>
+	vpadd.f32 d14, d14, d14	@ add the two halves of d14 together, same result in each lane
+	vst1.32	{d14}, [r2]
+	bx	lr
+	.fnend
+
+@ dot product, result stored directly to memory, all inputs are contiguous in memory
+
+	.global	DotProductNeonResultInMemoryAssembly2
+	.thumb_func
+DotProductNeonResultInMemoryAssembly2:
+	.fnstart
+	vld1.32	{d16-d19}, [r0]	@ input q8=<x1,y1,z1,w1>, q9=<x2,y2,z2,w2>
+	vmul.f32 d14, d16, d18	@ d14=<x1*x2,y1*y2>
+@ non-fused multiple accumulate
+	vmla.f32 d14, d17, d19	@ d14=d14+<z1*z2,w1*w2>=<x1*x2+z1*z2,y1*y2+w1*w2>
+@ fused multiple accumulate - not recognized by GNU assembler
+@@	vfma.f32 d14, d17, d19	@ d14=d14+<z1*z2,w1*w2>=<x1*x2+z1*z2,y1*y2+w1*w2>
+	vpadd.f32 d14, d14, d14	@ add the two halves of d14 together, same result in each lane
+	vst1.32	{d14}, [r1]
+	bx	lr
+	.fnend
+
+@ same as DotProductNeonResultInMemoryAssembly2, but alternate methodology to compute result
+
+	.global	DotProductNeonResultInMemoryAssembly2b
+	.thumb_func
+DotProductNeonResultInMemoryAssembly2b:
+	.fnstart
+	vld1.32	{d14-d17}, [r0]	@ input q7=<x1,y1,z1,w1>, q8=<x2,y2,z2,w2>
+	vmul.f32 q7, q7, q8		@ q7 = <x1*x2,y1*y2,z1*z2,w1*w2> = d14,d15 = s28,s29,s30,s31
+	vpadd.f32 d14,d14,d14	@ add elements of d14 together=<x1*x2+y1*y2,x1*x2+y1*y2>
+	vpadd.f32 d15,d15,d15	@ add elements of d14 together=<z1*z2+w1*w2,z1*z2+w1*w2>
+	vadd.f32 d14,d14,d15	@ add d14 and d15 together, final result in both lanes
+	vst1.32	{d14}, [r1]
+	bx	lr
+	.fnend
--- a/Extras/PhysicsEffects/sample/test_ARM_NEON_performance/neon_matrix3_operator_multiply.S
+++ b/Extras/PhysicsEffects/sample/test_ARM_NEON_performance/neon_matrix3_operator_multiply.S
@@ -0,0 +1,62 @@
+@
+@ Applied Research Associates Inc. (c)2011
+@
+@ Redistribution and use in source and binary forms,
+@   with or without modification, are permitted provided that the
+@   following conditions are met:
+@    * Redistributions of source code must retain the above copyright
+@      notice, this list of conditions and the following disclaimer.
+@    * Redistributions in binary form must reproduce the above copyright
+@      notice, this list of conditions and the following disclaimer in the
+@      documentation and/or other materials provided with the distribution.
+@    * Neither the name of the Applied Research Associates Inc nor the names
+@      of its contributors may be used to endorse or promote products derived
+@      from this software without specific prior written permission.
+@
+@   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+@   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+@   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+@   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+@   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+@   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+@   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+@   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+@   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+@   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+@   POSSIBILITY OF SUCH DAMAGE.
+@
+	.syntax unified
+	.arch armv7-a
+	.fpu neon
+	.thumb
+	.text
+	.align 2
+
+@ matrix3 operator *, result stored directly to memory
+
+	.global	Matrix3OperatorMultiplyNeon
+	.thumb_func
+Matrix3OperatorMultiplyNeon:
+	.fnstart
+	vld1.32     {d16-d19}, [r0]!            @ load first eight elements of matrix 0
+    vld1.32     {d20-d21}, [r0]             @ load second eight elements of matrix 0
+    vld1.32     {d0-d3}, [r1]!              @ load first eight elements of matrix 1
+    vld1.32     {d4-d5}, [r1]               @ load second eight elements of matrix 1
+
+	vmul.f32    q12, q8, d0[0]              @ rslt col0  = (mat0 col0) * (mat1 col0 elt0)
+    vmul.f32    q13, q8, d2[0]              @ rslt col1  = (mat0 col0) * (mat1 col1 elt0)
+    vmul.f32    q14, q8, d4[0]              @ rslt col2  = (mat0 col0) * (mat1 col2 elt0)
+
+    vmla.f32    q12, q9, d0[1]              @ rslt col0 += (mat0 col1) * (mat1 col0 elt1)
+    vmla.f32    q13, q9, d2[1]              @ rslt col1 += (mat0 col1) * (mat1 col1 elt1)
+    vmla.f32    q14, q9, d4[1]              @ rslt col2 += (mat0 col1) * (mat1 col2 elt1)
+
+    vmla.f32    q12, q10, d1[0]             @ rslt col0 += (mat0 col2) * (mat1 col0 elt2)
+    vmla.f32    q13, q10, d3[0]             @ rslt col1 += (mat0 col2) * (mat1 col1 elt2)
+    vmla.f32    q14, q10, d5[0]             @ rslt col2 += (mat0 col2) * (mat1 col2 elt2)
+
+    vst1.32     {d24-d27}, [r2]!            @ store first eight elements of result
+    vst1.32     {d28-d29}, [r2]             @ store second eight elements of result
+
+	bx	lr
+	.fnend
--- a/Extras/PhysicsEffects/sample/test_ARM_NEON_performance/neon_matrix4_operator_multiply.S
+++ b/Extras/PhysicsEffects/sample/test_ARM_NEON_performance/neon_matrix4_operator_multiply.S
@@ -0,0 +1,71 @@
+@
+@ Applied Research Associates Inc. (c)2011
+@
+@ Redistribution and use in source and binary forms,
+@   with or without modification, are permitted provided that the
+@   following conditions are met:
+@    * Redistributions of source code must retain the above copyright
+@      notice, this list of conditions and the following disclaimer.
+@    * Redistributions in binary form must reproduce the above copyright
+@      notice, this list of conditions and the following disclaimer in the
+@      documentation and/or other materials provided with the distribution.
+@    * Neither the name of the Applied Research Associates Inc nor the names
+@      of its contributors may be used to endorse or promote products derived
+@      from this software without specific prior written permission.
+@
+@   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+@   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+@   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+@   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+@   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+@   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+@   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+@   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+@   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+@   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+@   POSSIBILITY OF SUCH DAMAGE.
+@
+	.syntax unified
+	.arch armv7-a
+	.fpu neon
+	.thumb
+	.text
+	.align 2
+
+@ matrix4 operator *, result stored directly to memory
+
+	.global	Matrix4OperatorMultiplyNeon
+	.thumb_func
+Matrix4OperatorMultiplyNeon:
+	.fnstart
+
+	vld1.32     {d16-d19}, [r0]!            @ load first eight elements of matrix 0
+    vld1.32     {d20-d23}, [r0]             @ load second eight elements of matrix 0
+    vld1.32     {d0-d3}, [r1]!              @ load first eight elements of matrix 1
+    vld1.32     {d4-d7}, [r1]               @ load second eight elements of matrix 1
+
+    vmul.f32    q12, q8, d0[0]              @ rslt col0  = (mat0 col0) * (mat1 col0 elt0)
+    vmul.f32    q13, q8, d2[0]              @ rslt col1  = (mat0 col0) * (mat1 col1 elt0)
+    vmul.f32    q14, q8, d4[0]              @ rslt col2  = (mat0 col0) * (mat1 col2 elt0)
+    vmul.f32    q15, q8, d6[0]              @ rslt col3  = (mat0 col0) * (mat1 col3 elt0)
+
+    vmla.f32    q12, q9, d0[1]              @ rslt col0 += (mat0 col1) * (mat1 col0 elt1)
+    vmla.f32    q13, q9, d2[1]              @ rslt col1 += (mat0 col1) * (mat1 col1 elt1)
+    vmla.f32    q14, q9, d4[1]              @ rslt col2 += (mat0 col1) * (mat1 col2 elt1)
+    vmla.f32    q15, q9, d6[1]              @ rslt col3 += (mat0 col1) * (mat1 col3 elt1)
+
+    vmla.f32    q12, q10, d1[0]             @ rslt col0 += (mat0 col2) * (mat1 col0 elt2)
+    vmla.f32    q13, q10, d3[0]             @ rslt col1 += (mat0 col2) * (mat1 col1 elt2)
+    vmla.f32    q14, q10, d5[0]             @ rslt col2 += (mat0 col2) * (mat1 col2 elt2)
+    vmla.f32    q15, q10, d7[0]             @ rslt col3 += (mat0 col2) * (mat1 col2 elt2)
+
+    vmla.f32    q12, q11, d1[1]             @ rslt col0 += (mat0 col3) * (mat1 col0 elt3)
+    vmla.f32    q13, q11, d3[1]             @ rslt col1 += (mat0 col3) * (mat1 col1 elt3)
+    vmla.f32    q14, q11, d5[1]             @ rslt col2 += (mat0 col3) * (mat1 col2 elt3)
+    vmla.f32    q15, q11, d7[1]             @ rslt col3 += (mat0 col3) * (mat1 col3 elt3)
+
+    vst1.32     {d24-d27}, [r2]!            @ store first eight elements of result
+    vst1.32     {d28-d31}, [r2]             @ store second eight elements of result
+
+	bx	lr
+	.fnend
--- a/Extras/PhysicsEffects/sample/test_ARM_NEON_performance/neon_orthoInverse_transform3.S
+++ b/Extras/PhysicsEffects/sample/test_ARM_NEON_performance/neon_orthoInverse_transform3.S
@@ -0,0 +1,81 @@
+@
+@ Applied Research Associates Inc. (c)2011
+@
+@ Redistribution and use in source and binary forms,
+@   with or without modification, are permitted provided that the
+@   following conditions are met:
+@    * Redistributions of source code must retain the above copyright
+@      notice, this list of conditions and the following disclaimer.
+@    * Redistributions in binary form must reproduce the above copyright
+@      notice, this list of conditions and the following disclaimer in the
+@      documentation and/or other materials provided with the distribution.
+@    * Neither the name of the Applied Research Associates Inc nor the names
+@      of its contributors may be used to endorse or promote products derived
+@      from this software without specific prior written permission.
+@
+@   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+@   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+@   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+@   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+@   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+@   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+@   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+@   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+@   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+@   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+@   POSSIBILITY OF SUCH DAMAGE.
+@
+	.syntax unified
+	.arch armv7-a
+	.fpu neon
+	.thumb
+	.text
+	.align 2
+
+@ orthoInverse of a transform3, result stored directly to memory
+
+	.global	OrthoInverseTransform3Neon
+	.thumb_func
+OrthoInverseTransform3Neon:
+	.fnstart
+@way 1
+@	vld1.f32 {d0-d3}, [r0]!   @ load first eight elements of result
+@   vld1.f32 {d4-d7}, [r0]    @ load second eight elements of result
+
+@   vzip.f32 d0, d2
+@	vzip.f32 d1, d3
+@	vswp.f32 d1, d4
+@	vzip.f32 d1, d3
+
+@way 2 (faster)
+@
+@
+@ NOTE NOTE! This is broken! It does not use the last column of the input! Please see
+@ implementation of pfxTransform3OrthoInverseNEON in vectormath_neon_assembly_implementations.S, in
+@ include/vecmath/neon, for a correct implementation!
+@
+	vld1.f32 d0[0], [r0]!
+	vld1.f32 d2[0], [r0]!
+	vld1.f32 d4[0], [r0]!
+	vld1.f32 d1[1], [r0]!
+
+	vld1.f32 d0[1], [r0]!
+	vld1.f32 d2[1], [r0]!
+	vld1.f32 d4[1], [r0]!
+	vld1.f32 d3[1], [r0]!
+
+	vld1.f32 d1[0], [r0]!
+	vld1.f32 d3[0], [r0]!
+	vld1.f32 d5[0], [r0]!
+	vld1.f32 d5[1], [r0]
+
+	vmul.f32 q3, q0, d1[0]
+	vmla.f32 q3, q1, d3[0]
+	vmla.f32 q3, q2, d5[0]
+	vneg.f32 q3, q3
+
+	vst1.f32 {d0-d3}, [r1]!   @ store first eight elements of result
+    vst1.f32 {d4-d7}, [r1]    @ store second eight elements of result
+
+	bx	lr
+	.fnend
--- a/Extras/PhysicsEffects/sample/test_ARM_NEON_performance/neon_transform3_operator_multiply.S
+++ b/Extras/PhysicsEffects/sample/test_ARM_NEON_performance/neon_transform3_operator_multiply.S
@@ -0,0 +1,51 @@
+@
+@ Applied Research Associates Inc. (c)2011
+@
+@ Redistribution and use in source and binary forms,
+@   with or without modification, are permitted provided that the
+@   following conditions are met:
+@    * Redistributions of source code must retain the above copyright
+@      notice, this list of conditions and the following disclaimer.
+@    * Redistributions in binary form must reproduce the above copyright
+@      notice, this list of conditions and the following disclaimer in the
+@      documentation and/or other materials provided with the distribution.
+@    * Neither the name of the Applied Research Associates Inc nor the names
+@      of its contributors may be used to endorse or promote products derived
+@      from this software without specific prior written permission.
+@
+@   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+@   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+@   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+@   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+@   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+@   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+@   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+@   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+@   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+@   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+@   POSSIBILITY OF SUCH DAMAGE.
+@
+	.syntax unified
+	.arch armv7-a
+	.fpu neon
+	.thumb
+	.text
+	.align 2
+
+@ transform3 operator *, result stored directly to memory
+
+	.global	Transform3OperatorMultiplyNeon
+	.thumb_func
+Transform3OperatorMultiplyNeon:
+	.fnstart
+
+	vld1.32     {d16-d19}, [r0]!            @ load first eight elements of transform matrix
+    vld1.32     {d20-d21}, [r0]             @ load second eight elements of transform matrix
+    vld1.32     {d0-d1}, [r1]               @ load the four elements of vector3
+	vmul.f32    q12, q8, d0[0]              @ rslt col0  = (mat0 col0) * (mat1 col0 elt0)
+    vmla.f32    q12, q9, d0[1]              @ rslt col0 += (mat0 col1) * (mat1 col0 elt1)
+    vmla.f32    q12, q10, d1[0]             @ rslt col0 += (mat0 col2) * (mat1 col0 elt2)
+    vst1.32     {d24-d25}, [r2]             @ store first eight elements of result
+
+	bx	lr
+	.fnend
--- a/Extras/PhysicsEffects/sample/test_ARM_NEON_performance/neon_transpose_matrix3.S
+++ b/Extras/PhysicsEffects/sample/test_ARM_NEON_performance/neon_transpose_matrix3.S
@@ -0,0 +1,81 @@
+@
+@ Applied Research Associates Inc. (c)2011
+@
+@ Redistribution and use in source and binary forms,
+@   with or without modification, are permitted provided that the
+@   following conditions are met:
+@    * Redistributions of source code must retain the above copyright
+@      notice, this list of conditions and the following disclaimer.
+@    * Redistributions in binary form must reproduce the above copyright
+@      notice, this list of conditions and the following disclaimer in the
+@      documentation and/or other materials provided with the distribution.
+@    * Neither the name of the Applied Research Associates Inc nor the names
+@      of its contributors may be used to endorse or promote products derived
+@      from this software without specific prior written permission.
+@
+@   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+@   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+@   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+@   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+@   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+@   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+@   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+@   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+@   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+@   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+@   POSSIBILITY OF SUCH DAMAGE.
+@
+	.syntax unified
+	.arch armv7-a
+	.fpu neon
+	.thumb
+	.text
+	.align 2
+
+@ transpose a matrix3, result stored directly to memory
+
+	.global	TransposeMatrix3Neon
+	.thumb_func
+TransposeMatrix3Neon:
+	.fnstart
+
+@way1
+@	vld1.f32 d0[0], [r0]!
+@	vld1.f32 d2[0], [r0]!
+@	vld1.f32 d4[0], [r0]!
+@	vld1.f32 d1[1], [r0]!
+
+@	vld1.f32 d0[1], [r0]!
+@	vld1.f32 d2[1], [r0]!
+@	vld1.f32 d4[1], [r0]!
+@	vld1.f32 d3[1], [r0]!
+
+@	vld1.f32 d1[0], [r0]!
+@	vld1.f32 d3[0], [r0]!
+@	vld1.f32 d5[0], [r0]!
+@	vld1.f32 d5[1], [r0]
+
+@way2
+@	vld2.f32 {d0-d1}, [r0]!
+@	vld2.f32 {d2-d3}, [r0]!
+@	vld2.f32 {d4-d5}, [r0]
+
+@	vzip.f32 q0, q1
+@	vswp.f32 d1, d4
+@	vswp.f32 d3, d5
+@	vzip.f32 d1, d5
+
+@way3 (Fastest)
+	vld1.f32 {d0-d3}, [r0]!   @ store first eight elements of result
+    vld1.f32 {d4-d5}, [r0]    @ store second eight elements of result
+
+    vzip.f32 d0, d2
+	vzip.f32 d1, d3
+	vswp.f32 d1, d4
+	vzip.f32 d1, d3
+
+	vst1.f32 {d0-d3}, [r1]!   @ store first eight elements of result
+    vst1.f32 {d4-d5}, [r1]    @ store second eight elements of result
+
+	bx	lr
+	.fnend
--- a/Extras/PhysicsEffects/sample/test_ARM_NEON_performance/test_neon.h
+++ b/Extras/PhysicsEffects/sample/test_ARM_NEON_performance/test_neon.h
@@ -0,0 +1,43 @@
+/*
+ Applied Research Associates Inc. (c)2011
+
+ Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Applied Research Associates Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#ifndef TEST_NEON_H
+#define TEST_NEON_H
+
+//Neon functions to test
+void TestNeonDotProduct();
+void TestNeonCrossProduct();
+void TestNeonMatrix3OperatorMultiply();
+void TestNeonMatrix4OperatorMultiply();
+void TestNeonOrthoInverseTransform3();
+void TestNeonTransform3OperatorMultiply();
+void TestNeonTransposeMatrix3();
+void TestNeonSolveLinearConstraintRow();
+
+#endif // TEST_NEON_H
--- a/Extras/PhysicsEffects/sample/test_ARM_NEON_performance/test_neon_cross_product.cpp
+++ b/Extras/PhysicsEffects/sample/test_ARM_NEON_performance/test_neon_cross_product.cpp
@@ -0,0 +1,229 @@
+/*
+ Applied Research Associates Inc. (c)2011
+
+ Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Applied Research Associates Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "test_neon.h"
+#define SCE_PFX_USE_PERFCOUNTER
+#include "physics_effects.h"
+#include <arm_neon.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <android/log.h>
+
+// This works with gcc
+#define SET_ALIGNMENT(alignment)   __attribute__((__aligned__((alignment))))
+
+//#define PRINT_COMPUTED_VECTOR_RESULTS
+
+// assembly implementations
+extern "C"
+{
+	void CrossProductNeonResultInMemoryAssembly(float *a, float *b, float *pfResult);
+	void CrossProductNeonResultInMemoryAssembly2(float *a, float *b, float *pfResult);
+	void CrossProductNeonResultInMemoryAssembly3(float *a, float *b, float *pfResult);
+}
+
+//----------------------------------------------------------------------------
+//  CrossProductNeonResultInMemory
+//
+/// Performs a Vector3 style cross product using NEON intrinsics, storing the
+/// result directly into system memory.
+///
+/// @param  a         Input vector 1. Must point to 4 float values
+/// @param  b         Input vector 2. Must point to 4 float values
+/// @param  pfResult  [in] must point to an array of at least *4*
+///                   float values. [out] The result of the cross
+///                   product is contained in the first 3 lanes.
+///
+/// NOTE: The parameter types here are floats, not float32_t's. gcc
+/// sometimes doesn't interpret float32_t's correctly. In particular,
+/// if the type of pfResult is set to float32_t*, gcc will throw an
+/// internal compiler error (ICE) for this code. In memory, float32_t
+/// and float are equivalent, so can cast between them explicitly.
+//----------------------------------------------------------------------------
+void CrossProductNeonResultInMemory(float *a, float *b, float *pfResult)
+{
+	float32x4_t v1 = {a[1],a[2],a[0], 0.0f};
+	float32x4_t v2 = {b[2],b[0],b[1], 0.0f};
+	float32x4_t v3 = {a[2],a[0],a[1], 0.0f};
+	float32x4_t v4 = {b[1],b[2],b[0], 0.0f};
+	v1 = vmulq_f32(v1, v2);
+	v1 = vmlsq_f32(v1, v3, v4);
+	vst1q_f32((float32_t*)pfResult, v1);
+}
+
+void CrossProductNeonResultInMemoryCPPAssembly(float *a, float *b, float *result) {
+    asm volatile(
+		"vld1.32 {d18[1]}, [r1]!	 \n\t"
+		"vld1.32 {d19[0]}, [r1]!	 \n\t"
+		"vld1.32 {d18[0]}, [r1]!	 \n\t"
+		"vld1.32 {d19[1]}, [r1]		 \n\t"
+		"vld1.32 {d17[0]}, [r0]!	 \n\t"
+		"vld1.32 {d16}, [r0]!	 	 \n\t"
+		"vld1.32 {d17[1]}, [r0]		 \n\t"
+		"vmul.f32 q10, q8, q9		 \n\t"
+		"vtrn.32 d18,d19			 \n\t"
+		"vrev64.32 d16,d16			 \n\t"
+		"vrev64.32 d18,d18			 \n\t"
+		"vtrn.32 d16,d17			 \n\t"
+		"vmls.f32 q10, q8, q9		 \n\t"
+		"vst1.32	{q10}, [r2]	 	 \n\t"
+    );
+}
+
+//----------------------------------------------------------------------------
+//  CrossProductScalarResultInMemory
+//
+/// Performs a Vector3 style cross product using scalar math, storing the
+/// result directly into system memory.
+///
+/// @param  a         Input vector 1. Must point to 4 float values
+/// @param  b         Input vector 2. Must point to 4 float values
+/// @param  pfResult  [in] pointer to a float. [out] Contains the
+///                   result, dotproduct(a,b)
+//----------------------------------------------------------------------------
+void CrossProductScalarResultInMemory(float *a, float *b, float *pfResult)
+{
+	pfResult[0] = a[1]*b[2] - a[2]*b[1];
+	pfResult[1] = a[2]*b[0] - a[0]*b[2];
+	pfResult[2] = a[0]*b[1] - a[1]*b[0];
+}
+
+//----------------------------------------------------------------------------
+//  TestFastNeonCrossProduct
+//
+/// Run timing study of the cross product functions above, writing the
+/// results to the Android verbose log.
+//----------------------------------------------------------------------------
+void TestNeonCrossProduct()
+{
+	float SET_ALIGNMENT(64) data[] = {float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
+										float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f};
+
+	float *a = &data[0];
+	float *b = &data[4];
+
+	char szMsg[256];
+
+	sprintf(szMsg, "");
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg,"---------------------------------------");
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg, "Cross product test inputs A=<%f,%f,%f>, B=<%f,%f,%f>",
+					a[0], a[1], a[2], b[0], b[1], b[2]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+
+	float SET_ALIGNMENT(64) fResult[4];
+    
+	double dTimeSpan, dRefTimeSpan;; 
+	unsigned int uiNumTries = 10000000;
+	unsigned int i;
+
+	sce::PhysicsEffects::PfxPerfCounter pc;
+
+// profile scalar cross product with direct memory return
+	fResult[0] = 0.0f;
+	pc.countBegin("");
+	for(i = 0; i < uiNumTries; i++)
+	{
+		CrossProductScalarResultInMemory(a, b, fResult);
+	}
+	pc.countEnd();
+	dTimeSpan = pc.getCountTime(0);
+	pc.resetCount();
+	dRefTimeSpan = dTimeSpan;
+	sprintf(szMsg, "Time to do %i calls for CrossProductScalarResultInMemory: %f secs, speedup: %5.2f, result value=<%f,%f,%f>",
+					uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+
+
+
+// profile NEON assembly volatile cross product with direct memory return
+	fResult[0] = 0.0f;
+	pc.countBegin("");
+	for(i = 0; i < uiNumTries; i++)
+	{
+		CrossProductNeonResultInMemoryCPPAssembly(a, b, fResult);
+	}
+	pc.countEnd();
+	dTimeSpan = pc.getCountTime(0);
+	pc.resetCount();
+	sprintf(szMsg, "Time to do %i calls for CrossProductNeonResultInMemoryFast: %f secs, speedup: %5.2f, result value=<%f,%f,%f>",
+					uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+
+
+
+// profile NEON cross product with direct memory return, assembly version
+	fResult[0] = 0.0f;
+	pc.countBegin("");
+	for(i = 0; i < uiNumTries; i++)
+	{
+		CrossProductNeonResultInMemoryAssembly(a, b, fResult);
+	}
+	pc.countEnd();
+	dTimeSpan = pc.getCountTime(0);
+	pc.resetCount();
+	sprintf(szMsg, "Time to do %i calls for CrossProductNeonResultInMemoryAssembly: %f secs, speedup: %5.2f, result value=<%f,%f,%f>",
+					uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+
+
+
+// profile NEON cross product with direct memory return, assembly version 2
+	fResult[0] = 0.0f;
+	pc.countBegin("");
+	for(i = 0; i < uiNumTries; i++)
+	{
+		CrossProductNeonResultInMemoryAssembly2(a, b, fResult);
+	}
+	pc.countEnd();
+	dTimeSpan = pc.getCountTime(0);
+	pc.resetCount();
+
+	sprintf(szMsg, "Time to do %i calls for CrossProductNeonResultInMemoryAssembly2: %f secs, speedup: %5.2f, result value=<%f,%f,%f>",
+					uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+
+
+
+// profile NEON cross product with direct memory return, assembly version 3
+	fResult[0] = 0.0f;
+	pc.countBegin("");
+	for(i = 0; i < uiNumTries; i++)
+	{
+		CrossProductNeonResultInMemoryAssembly3(a, b, fResult);
+	}
+	pc.countEnd();
+	dTimeSpan = pc.getCountTime(0);
+	pc.resetCount();
+
+	sprintf(szMsg, "Time to do %i calls for CrossProductNeonResultInMemoryAssembly3: %f secs, speedup: %5.2f, result value=<%f,%f,%f>",
+					uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+}
--- a/Extras/PhysicsEffects/sample/test_ARM_NEON_performance/test_neon_dot_product.cpp
+++ b/Extras/PhysicsEffects/sample/test_ARM_NEON_performance/test_neon_dot_product.cpp
@@ -0,0 +1,330 @@
+/*
+ Applied Research Associates Inc. (c)2011
+
+ Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Applied Research Associates Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "test_neon.h"
+#define SCE_PFX_USE_PERFCOUNTER
+#include "physics_effects.h"
+#include <arm_neon.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <android/log.h>
+
+// This works with gcc
+#define SET_ALIGNMENT(alignment)   __attribute__((__aligned__((alignment))))
+
+//#define PRINT_COMPUTED_VECTOR_RESULTS
+
+// assembly implementations
+extern "C"
+{
+	float32_t DotProductNeonResultInRegisterAssembly(float *a, float *b);
+	void DotProductNeonResultInMemoryAssembly(float *a, float *b, float *pfResult);
+	void DotProductNeonResultInMemoryAssembly2(float *data, float *pfResult);
+	void DotProductNeonResultInMemoryAssembly2b(float *data, float *pfResult);
+}
+
+//----------------------------------------------------------------------------
+//  DotProductNeonResultInRegister
+//
+/// Performs a Vector4 style dot product using NEON intrinsics, returning
+/// the result in an ARM register.
+///
+/// @param  a     Input vector 1. Must point to 4 float values
+/// @param  b     Input vector 2. Must point to 4 float values
+///
+/// @return dotproduct(a,b)
+//----------------------------------------------------------------------------
+float32_t DotProductNeonResultInRegister(float *a, float *b)
+{
+	float32_t result[2];
+	float32x2_t v1a = vld1_f32((float32_t*)a);
+	float32x2_t v1b = vld1_f32((float32_t*)(a + 2));
+	float32x2_t v2a = vld1_f32((float32_t*)b);
+	float32x2_t v2b = vld1_f32((float32_t*)(b + 2));
+	v1a = vmul_f32(v1a, v2a);
+	v1a = vmla_f32(v1a, v1b, v2b);
+	v1a = vpadd_f32(v1a, v1a);
+	vst1_f32(result, v1a);
+	return(result[0]);
+}
+
+// pfResult must point to memory large enough to store *two*
+// float32_t values
+//----------------------------------------------------------------------------
+//  DotProductNeonResultInMemory
+//
+/// Performs a Vector4 style dot product using NEON intrinsics, storing the
+/// result directly into system memory.
+///
+/// @param  a         Input vector 1. Must point to 4 float values
+/// @param  b         Input vector 2. Must point to 4 float values
+/// @param  pfResult  [in] must point to an array of at least *2*
+///                   float values. [out] The result of the dot
+///                   product is contained in both slots of the array.
+///                   Recommended to use the first one though they
+///                   have the same value
+///
+/// NOTE: The parameter types here are floats, not float32_t's. gcc
+/// sometimes doesn't interpret float32_t's correctly. In particular,
+/// if the type of pfResult is set to float32_t*, gcc will throw an
+/// internal compiler error (ICE) for this code. In memory, float32_t
+/// and float are equivalent, so can cast between them explicitly.
+//----------------------------------------------------------------------------
+void DotProductNeonResultInMemory(float *a, float *b, float *pfResult)
+{
+	float32x2_t v1a = vld1_f32((float32_t*)a);
+	float32x2_t v1b = vld1_f32((float32_t*)(a + 2));
+	float32x2_t v2a = vld1_f32((float32_t*)b);
+	float32x2_t v2b = vld1_f32((float32_t*)(b + 2));
+	v1a = vmul_f32(v1a, v2a);
+	v1a = vmla_f32(v1a, v1b, v2b);
+	v1a = vpadd_f32(v1a, v1a);
+	vst1_f32((float32_t*)pfResult, v1a);
+}
+
+//----------------------------------------------------------------------------
+//  DotProductScalarResultInRegister
+//
+/// Performs a Vector4 style dot product using scalar math, and returning
+/// the result in an ARM register.
+///
+/// @param  a     Input vector 1. Must point to 4 float values
+/// @param  b     Input vector 2. Must point to 4 float values
+///
+/// @return dotproduct(a,b)
+//----------------------------------------------------------------------------
+float DotProductScalarResultInRegister(float *a, float *b)
+{
+	return(a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3]);
+}
+
+//----------------------------------------------------------------------------
+//  DotProductScalarResultInMemory
+//
+/// Performs a Vector4 style dot product using scalar math, storing the
+/// result directly into system memory.
+///
+/// @param  a         Input vector 1. Must point to 4 float values
+/// @param  b         Input vector 2. Must point to 4 float values
+/// @param  pfResult  [in] pointer to a float. [out] Contains the
+///                   result, dotproduct(a,b)
+//----------------------------------------------------------------------------
+void DotProductScalarResultInMemory(float *a, float *b, float *pfResult)
+{
+	*pfResult = a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];
+}
+
+//----------------------------------------------------------------------------
+//  TestFastNeonDotProduct
+//
+/// Run timing study of the four dot product functions above, writing the
+/// results to the Android verbose log.
+//----------------------------------------------------------------------------
+void TestNeonDotProduct()
+{
+	float SET_ALIGNMENT(64) data[] = {float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),
+										float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX)};
+
+	float *a = &data[0];
+	float *b = &data[4];
+
+	char szMsg[256];
+	
+	sprintf(szMsg, "");
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg,"---------------------------------------");
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+
+#ifdef PRINT_COMPUTED_VECTOR_RESULTS
+	sprintf(szMsg, "Dot product test inputs A=<%f,%f,%f>, B=<%f,%f,%f>",
+					a[0], a[1], a[2], b[0], b[1], b[2]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+#endif
+
+	float SET_ALIGNMENT(64) fResult[2];
+    
+	sce::PhysicsEffects::PfxPerfCounter pc;
+	double dTimeSpan, dRefTimeSpan;
+	unsigned int uiNumTries = 10000000;
+	unsigned int i;
+
+	// profile scalar dot product with register return
+	fResult[0] = 0.0f;
+	pc.countBegin("");
+	for(i = 0; i < uiNumTries; i++)
+	{
+		fResult[0] = DotProductScalarResultInRegister(a, b); // C++
+	}
+	pc.countEnd();
+	dTimeSpan = pc.getCountTime(0);
+	pc.resetCount();
+	dRefTimeSpan = dTimeSpan;
+#ifdef PRINT_COMPUTED_VECTOR_RESULTS
+	sprintf(szMsg, "Time to do %i calls to scalar dot product return in register: %f, result value=%f",
+					uiNumTries, dTimeSpan, fResult[0]);
+#else
+	sprintf(szMsg, "Sclr C++ Dot Product Register Return: %10.7f secs, REF TIME", dTimeSpan);
+#endif
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+
+	// profile NEON dot product with register return
+	fResult[0] = 0.0f;
+	pc.countBegin("");
+	for(i = 0; i < uiNumTries; i++)
+	{
+		fResult[0] = DotProductNeonResultInRegister(a, b); // C++
+	}
+	pc.countEnd();
+	dTimeSpan = pc.getCountTime(0);
+	pc.resetCount();
+#ifdef PRINT_COMPUTED_VECTOR_RESULTS
+	sprintf(szMsg, "Time to do %i calls to NEON dot product return in register: %f, result value=%f",
+					uiNumTries, dTimeSpan, fResult[0]);
+#else
+	sprintf(szMsg, "Neon C++ Dot Product Register Return: %10.7f secs, speedup: %5.2f", dTimeSpan, dRefTimeSpan/dTimeSpan);
+#endif
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+
+	// profile NEON dot product with register return, assembly version
+	fResult[0] = 0.0f;
+	pc.countBegin("");
+	for(i = 0; i < uiNumTries; i++)
+	{
+		fResult[0] = DotProductNeonResultInRegisterAssembly(a, b);
+	}
+	pc.countEnd();
+	dTimeSpan = pc.getCountTime(0);
+	pc.resetCount();
+
+#ifdef PRINT_COMPUTED_VECTOR_RESULTS
+	sprintf(szMsg, "Time to do %i calls to NEON dot product return in register (assembly version): %f, result value=%f",
+					uiNumTries, dTimeSpan, fResult[0]);
+#else
+	sprintf(szMsg, "Neon Asm Dot Product Register Return: %10.7f secs, speedup: %5.2f", dTimeSpan, dRefTimeSpan/dTimeSpan);
+#endif
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+
+	// profile scalar dot product with direct memory return
+	fResult[0] = 0.0f;
+	pc.countBegin("");
+	for(i = 0; i < uiNumTries; i++)
+	{
+		DotProductScalarResultInMemory(a, b, fResult);
+	}
+	pc.countEnd();
+	dTimeSpan = pc.getCountTime(0);
+	pc.resetCount();
+
+	dRefTimeSpan = dTimeSpan;
+#ifdef PRINT_COMPUTED_VECTOR_RESULTS
+	sprintf(szMsg, "Time to do %i calls to scalar dot product return in memory: %f, result value=%f",
+					uiNumTries, dTimeSpan, fResult[0]);
+#else
+	sprintf(szMsg, "  Sclr C++ Dot Product Memory Return: %10.7f secs, REF TIME", dTimeSpan);
+#endif
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+
+	// profile NEON dot product with direct memory return
+	fResult[0] = 0.0f;
+	pc.countBegin("");
+	for(i = 0; i < uiNumTries; i++)
+	{
+		DotProductNeonResultInMemory(a, b, fResult);
+	}
+	pc.countEnd();
+	dTimeSpan = pc.getCountTime(0);
+	pc.resetCount();
+
+#ifdef PRINT_COMPUTED_VECTOR_RESULTS
+	sprintf(szMsg, "Time to do %i calls to NEON dot product return in memory: %f, result value=%f",
+					uiNumTries, dTimeSpan, fResult[0]);
+#else
+	sprintf(szMsg, "  Neon C++ Dot Product Memory Return: %10.7f secs, speedup: %5.2f", dTimeSpan, dRefTimeSpan/dTimeSpan);
+#endif
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+
+	// profile NEON dot product with direct memory return, assembly version
+	fResult[0] = 0.0f;
+	pc.countBegin("");
+	for(i = 0; i < uiNumTries; i++)
+	{
+		DotProductNeonResultInMemoryAssembly(a, b, fResult);
+	}
+	pc.countEnd();
+	dTimeSpan = pc.getCountTime(0);
+	pc.resetCount();
+
+#ifdef PRINT_COMPUTED_VECTOR_RESULTS
+	sprintf(szMsg, "Time to do %i calls to NEON dot product return in memory (assembly version): %f, result value=%f",
+					uiNumTries, dTimeSpan, fResult[0]);
+#else
+	sprintf(szMsg, "  Neon Asm Dot Product Memory Return: %10.7f secs, speedup: %5.2f", dTimeSpan, dRefTimeSpan/dTimeSpan);
+#endif
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+
+	// profile NEON dot product with direct memory return, assembly version with both
+	// inputs in contiguous memory block (array of structures)
+	fResult[0] = 0.0f;
+	pc.countBegin("");
+	for(i = 0; i < uiNumTries; i++)
+	{
+		DotProductNeonResultInMemoryAssembly2(data, fResult);
+	}
+	pc.countEnd();
+	dTimeSpan = pc.getCountTime(0);
+	pc.resetCount();
+
+#ifdef PRINT_COMPUTED_VECTOR_RESULTS
+	sprintf(szMsg, "Time to do %i calls to NEON dot product return in memory (assembly version - array of structures): %f, result value=%f",
+					uiNumTries, dTimeSpan, fResult[0]);
+#else
+	sprintf(szMsg, " Neon Asm2 Dot Product Memory Return: %10.7f secs, speedup: %5.2f", dTimeSpan, dRefTimeSpan/dTimeSpan);
+#endif
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+
+	// profile NEON dot product with direct memory return, assembly version with both
+	// inputs in contiguous memory block (array of structures), alternate methodology
+	fResult[0] = 0.0f;
+	pc.countBegin("");
+	for(i = 0; i < uiNumTries; i++)
+	{
+		DotProductNeonResultInMemoryAssembly2b(data, fResult);
+	}
+	pc.countEnd();
+	dTimeSpan = pc.getCountTime(0);
+	pc.resetCount();
+
+#ifdef PRINT_COMPUTED_VECTOR_RESULTS
+	sprintf(szMsg, "Time to do %i calls to NEON dot product return in memory (assembly version - alternate methodology): %f, result value=%f",
+					uiNumTries, dTimeSpan, fResult[0]);
+#else
+	sprintf(szMsg, " Neon Asm3 Dot Product Memory Return: %10.7f secs, speedup: %5.2f", dTimeSpan, dRefTimeSpan/dTimeSpan);
+#endif
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+}
--- a/Extras/PhysicsEffects/sample/test_ARM_NEON_performance/test_neon_matrix3_operator_multiply.cpp
+++ b/Extras/PhysicsEffects/sample/test_ARM_NEON_performance/test_neon_matrix3_operator_multiply.cpp
@@ -0,0 +1,173 @@
+/*
+ Applied Research Associates Inc. (c)2011
+
+ Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Applied Research Associates Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "test_neon.h"
+#define SCE_PFX_USE_PERFCOUNTER
+#include "physics_effects.h"
+#include <arm_neon.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <android/log.h>
+
+// This works with gcc
+#define SET_ALIGNMENT(alignment)   __attribute__((__aligned__((alignment))))
+
+// assembly implementations
+extern "C"
+{
+	void Matrix3OperatorMultiplyNeon(float *mCol, float *mat, float *pfResult);
+}
+
+//----------------------------------------------------------------------------
+//  Matrix3OperatorMultiplyScalar
+//
+/// Performs a Matrix3 Vector3 multiply using scalar math, storing the
+/// result directly into system memory.
+///
+/// @param  mCol         Matrix3 Columns. Must point to 3x 4 float values
+/// @param  mat          Matrix3 Columns. Must point to 3x 4 float values
+/// @param  pfResult     [in] pointer to a float(matrix 3). [out] Contains the result
+//----------------------------------------------------------------------------
+//inline const Vector3 Matrix3::operator *( const Vector3 & vec ) const
+//{
+//    return Vector3(
+//        ( ( ( mCol0.getX() * vec.getX() ) + ( mCol1.getX() * vec.getY() ) ) + ( mCol2.getX() * vec.getZ() ) ),
+//        ( ( ( mCol0.getY() * vec.getX() ) + ( mCol1.getY() * vec.getY() ) ) + ( mCol2.getY() * vec.getZ() ) ),
+//        ( ( ( mCol0.getZ() * vec.getX() ) + ( mCol1.getZ() * vec.getY() ) ) + ( mCol2.getZ() * vec.getZ() ) )
+//    );
+//}
+//
+//inline const Matrix3 Matrix3::operator *( const Matrix3 & mat ) const
+//{
+//    return Matrix3(
+//        ( *this * mat.mCol0 ),
+//        ( *this * mat.mCol1 ),
+//        ( *this * mat.mCol2 )
+//    );
+//}
+void Matrix3OperatorMultiplyScalar(float *mCol, float *mat, float *pfResult)
+{
+	pfResult[0] = ( ( ( mCol[0] * mat[0] ) + ( mCol[4] * mat[1] ) ) + ( mCol[8] * mat[2] ) );
+	pfResult[1] = ( ( ( mCol[1] * mat[0] ) + ( mCol[5] * mat[1] ) ) + ( mCol[9] * mat[2] ) );
+	pfResult[2] = ( ( ( mCol[2] * mat[0] ) + ( mCol[6] * mat[1] ) ) + ( mCol[10] * mat[2] ) );
+	pfResult[3] = 0.0f;
+
+	pfResult[4] = ( ( ( mCol[0] * mat[4] ) + ( mCol[4] * mat[5] ) ) + ( mCol[8] * mat[6] ) );
+	pfResult[5] = ( ( ( mCol[1] * mat[4] ) + ( mCol[5] * mat[5] ) ) + ( mCol[9] * mat[6] ) );
+	pfResult[6] = ( ( ( mCol[2] * mat[4] ) + ( mCol[6] * mat[5] ) ) + ( mCol[10] * mat[6] ) );
+	pfResult[7] = 0.0f;
+
+	pfResult[8] = ( ( ( mCol[0] * mat[8] ) + ( mCol[4] * mat[9] ) ) + ( mCol[8] * mat[10] ) );
+	pfResult[9] = ( ( ( mCol[1] * mat[8] ) + ( mCol[5] * mat[9] ) ) + ( mCol[9] * mat[10] ) );
+	pfResult[10] = ( ( ( mCol[2] * mat[8] ) + ( mCol[6] * mat[9] ) ) + ( mCol[10] * mat[10] ) );
+	pfResult[11] = 0.0f;
+}
+
+//----------------------------------------------------------------------------
+//  TestNeonMatrix3OperatorMultiply
+//
+/// Run timing study of the NeonMatrix3OperatorMultiply product functions above, writing the
+/// results to the Android verbose log.
+//----------------------------------------------------------------------------
+void TestNeonMatrix3OperatorMultiply()
+{
+	float SET_ALIGNMENT(64) data1[] = {float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
+									   float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
+									   float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f};
+	float SET_ALIGNMENT(64) data2[] = {float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
+									   float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
+									   float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f};
+	float *mCol = &data1[0];
+	float *mat = &data2[0];
+	float SET_ALIGNMENT(64) fResult[12];
+
+	sce::PhysicsEffects::PfxPerfCounter pc;
+	double dTimeSpan, dRefTimeSpan;;
+	unsigned int uiNumTries = 10000000;
+	unsigned int i;
+
+	char szMsg[256];
+
+	sprintf(szMsg, "");
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg,"---------------------------------------");
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg,"TestNeonMatrix3OperatorMultiply Start");
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg, "Test input mCol:  <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
+			mCol[0], mCol[1], mCol[2], mCol[3],
+			mCol[4], mCol[5], mCol[6], mCol[7],
+			mCol[8], mCol[9], mCol[10], mCol[11]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg, "Test input mat: <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
+			mat[0], mat[1], mat[2], mat[3],
+			mat[4], mat[5], mat[6], mat[7],
+			mat[8], mat[9], mat[10], mat[11]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+
+
+
+// profile scalar NeonMatrix3OperatorMultiplyScalar with direct memory return, c++ version
+	fResult[0] = 0.0f;
+	pc.countBegin("");
+	for(i = 0; i < uiNumTries; i++)
+	{
+		Matrix3OperatorMultiplyScalar(mCol, mat, fResult);
+	}
+	pc.countEnd();
+	dTimeSpan = pc.getCountTime(0);
+	pc.resetCount();
+	dRefTimeSpan = dTimeSpan;
+	sprintf(szMsg, "Time to do %i calls for Matrix3OperatorMultiplyScalar: %f secs, speedup: %5.2f, result value=<%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
+					uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2], fResult[3],
+																   fResult[4], fResult[5], fResult[6], fResult[7],
+																   fResult[8], fResult[9], fResult[10], fResult[11]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+
+
+// profile NEON Matrix3OperatorMultiplyNeon with direct memory return, assembly version
+	fResult[0] = 0.0f;
+	pc.countBegin("");
+	for(i = 0; i < uiNumTries; i++)
+	{
+		Matrix3OperatorMultiplyNeon(mCol, mat, fResult);
+	}
+	pc.countEnd();
+	dTimeSpan = pc.getCountTime(0);
+	pc.resetCount();
+	sprintf(szMsg, "Time to do %i calls for Matrix3OperatorMultiplyNeon: %f secs, speedup: %5.2f, result value=<%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
+					uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2], fResult[3],
+																   fResult[4], fResult[5], fResult[6], fResult[7],
+																   fResult[8], fResult[9], fResult[10], fResult[11]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+
+
+	sprintf(szMsg,"TestNeonMatrix3OperatorMultiply End");
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+}
--- a/Extras/PhysicsEffects/sample/test_ARM_NEON_performance/test_neon_matrix4_operator_multiply.cpp
+++ b/Extras/PhysicsEffects/sample/test_ARM_NEON_performance/test_neon_matrix4_operator_multiply.cpp
@@ -0,0 +1,188 @@
+/*
+ Applied Research Associates Inc. (c)2011
+
+ Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Applied Research Associates Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "test_neon.h"
+#define SCE_PFX_USE_PERFCOUNTER
+#include "physics_effects.h"
+#include <arm_neon.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <android/log.h>
+
+// This works with gcc
+#define SET_ALIGNMENT(alignment)   __attribute__((__aligned__((alignment))))
+
+// assembly implementations
+extern "C"
+{
+	void Matrix4OperatorMultiplyNeon(float *mCol, float *mat, float *pfResult);
+}
+
+//----------------------------------------------------------------------------
+//  Matrix4OperatorMultiplyScalar
+//
+/// Performs a Matrix4 vector3 multiply using scalar math, storing the
+/// result directly into system memory.
+///
+/// @param  mCol      Input matrix 1. Must point to 4x 4 float values
+/// @param  mat       Input matrix 2. Must point to 4x 4 float values
+/// @param  pfResult  [in] pointer to a float(matrix). [out] Contains the result
+//----------------------------------------------------------------------------
+//inline const Vector4 Matrix4::operator *( const Vector4 & vec ) const
+//{
+//    return Vector4(
+//        ( ( ( ( mCol0.getX() * vec.getX() ) + ( mCol1.getX() * vec.getY() ) ) + ( mCol2.getX() * vec.getZ() ) ) + ( mCol3.getX() * vec.getW() ) ),
+//        ( ( ( ( mCol0.getY() * vec.getX() ) + ( mCol1.getY() * vec.getY() ) ) + ( mCol2.getY() * vec.getZ() ) ) + ( mCol3.getY() * vec.getW() ) ),
+//        ( ( ( ( mCol0.getZ() * vec.getX() ) + ( mCol1.getZ() * vec.getY() ) ) + ( mCol2.getZ() * vec.getZ() ) ) + ( mCol3.getZ() * vec.getW() ) ),
+//        ( ( ( ( mCol0.getW() * vec.getX() ) + ( mCol1.getW() * vec.getY() ) ) + ( mCol2.getW() * vec.getZ() ) ) + ( mCol3.getW() * vec.getW() ) )
+//    );
+//}
+//inline const Matrix4 Matrix4::operator *( const Matrix4 & mat ) const
+//{
+//    return Matrix4(
+//        ( *this * mat.mCol0 ),
+//        ( *this * mat.mCol1 ),
+//        ( *this * mat.mCol2 ),
+//        ( *this * mat.mCol3 )
+//    );
+//}
+void Matrix4OperatorMultiplyScalar(float *mCol, float *mat, float *pfResult)
+{
+	pfResult[0] = ( ( ( mCol[0] * mat[0] ) + ( mCol[4] * mat[1] ) ) + ( mCol[8] * mat[2] )  + ( mCol[12] * mat[3] ) );
+	pfResult[1] = ( ( ( mCol[1] * mat[0] ) + ( mCol[5] * mat[1] ) ) + ( mCol[9] * mat[2] )  + ( mCol[13] * mat[3] ) );
+	pfResult[2] = ( ( ( mCol[2] * mat[0] ) + ( mCol[6] * mat[1] ) ) + ( mCol[10] * mat[2] )  + ( mCol[14] * mat[3] ) );
+	pfResult[3] = ( ( ( mCol[3] * mat[0] ) + ( mCol[7] * mat[1] ) ) + ( mCol[11] * mat[2] )  + ( mCol[15] * mat[3] ) );
+
+	pfResult[4] = ( ( ( mCol[0] * mat[4] ) + ( mCol[4] * mat[5] ) ) + ( mCol[8] * mat[6] )  + ( mCol[12] * mat[7] ) );
+	pfResult[5] = ( ( ( mCol[1] * mat[4] ) + ( mCol[5] * mat[5] ) ) + ( mCol[9] * mat[6] )  + ( mCol[13] * mat[7] ) );
+	pfResult[6] = ( ( ( mCol[2] * mat[4] ) + ( mCol[6] * mat[5] ) ) + ( mCol[10] * mat[6] )  + ( mCol[14] * mat[7] ) );
+	pfResult[7] = ( ( ( mCol[3] * mat[4] ) + ( mCol[7] * mat[5] ) ) + ( mCol[11] * mat[6] )  + ( mCol[15] * mat[7] ) );
+
+	pfResult[8] = ( ( ( mCol[0] * mat[8] ) + ( mCol[4] * mat[9] ) ) + ( mCol[8] * mat[10] )  + ( mCol[12] * mat[11] ) );
+	pfResult[9] = ( ( ( mCol[1] * mat[8] ) + ( mCol[5] * mat[9] ) ) + ( mCol[9] * mat[10] )  + ( mCol[13] * mat[11] ) );
+	pfResult[10] = ( ( ( mCol[2] * mat[8] ) + ( mCol[6] * mat[9] ) ) + ( mCol[10] * mat[10] )  + ( mCol[14] * mat[11] ) );
+	pfResult[11] = ( ( ( mCol[3] * mat[8] ) + ( mCol[7] * mat[9] ) ) + ( mCol[11] * mat[10] )  + ( mCol[15] * mat[11] ) );
+
+	pfResult[12] = ( ( ( mCol[0] * mat[12] ) + ( mCol[4] * mat[13] ) ) + ( mCol[8] * mat[14] )  + ( mCol[12] * mat[15] ) );
+	pfResult[13] = ( ( ( mCol[1] * mat[12] ) + ( mCol[5] * mat[13] ) ) + ( mCol[9] * mat[14] )  + ( mCol[13] * mat[15] ) );
+	pfResult[14] = ( ( ( mCol[2] * mat[12] ) + ( mCol[6] * mat[13] ) ) + ( mCol[10] * mat[14] )  + ( mCol[14] * mat[15] ) );
+	pfResult[15] = ( ( ( mCol[3] * mat[12] ) + ( mCol[7] * mat[13] ) ) + ( mCol[11] * mat[14] )  + ( mCol[15] * mat[15] ) );
+}
+
+//----------------------------------------------------------------------------
+//  TestNeonMatrix4OperatorMultiply
+//
+/// Run timing study of the matrix4 vector3 multiply operator from above, writing the
+/// results to the Android verbose log.
+//----------------------------------------------------------------------------
+void TestNeonMatrix4OperatorMultiply()
+{
+	float SET_ALIGNMENT(64) data1[] = {float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),
+									   float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),
+									   float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),
+									   float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX)};
+	float SET_ALIGNMENT(64) data2[] = {float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),
+									   float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),
+									   float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),
+									   float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX)};
+	float *mCol = &data1[0];
+	float *mat = &data2[0];
+
+	char szMsg[256];
+
+	sprintf(szMsg, "");
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg,"---------------------------------------");
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg,"TestNeonMatrix4OperatorMultiply Start");
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg, "Test input mCol:  <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
+			mCol[0], mCol[1], mCol[2], mCol[3],
+			mCol[4], mCol[5], mCol[6], mCol[7],
+			mCol[8], mCol[9], mCol[10], mCol[11],
+			mCol[12], mCol[13], mCol[14], mCol[15]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg, "Test input mat: <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
+			mat[0], mat[1], mat[2], mat[3],
+			mat[4], mat[5], mat[6], mat[7],
+			mat[8], mat[9], mat[10], mat[11],
+			mCol[12], mCol[13], mCol[14], mCol[15]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+
+	float SET_ALIGNMENT(64) fResult[16];
+    
+
+	sce::PhysicsEffects::PfxPerfCounter pc;
+	double dTimeSpan, dRefTimeSpan;; 
+	unsigned int uiNumTries = 10000000;
+	unsigned int i;
+
+// profile scalar Matrix4OperatorMultiplyScalar with direct memory return, c++ version
+	fResult[0] = 0.0f;
+	pc.countBegin("");
+	for(i = 0; i < uiNumTries; i++)
+	{
+		Matrix4OperatorMultiplyScalar(mCol, mat, fResult);
+	}
+	pc.countEnd();
+	dTimeSpan = pc.getCountTime(0);
+	pc.resetCount();
+	dRefTimeSpan = dTimeSpan;
+	sprintf(szMsg, "Time to do %i calls for Matrix4OperatorMultiplyScalar: %f secs, speedup: %5.2f",
+					uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg, "result value: <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
+			fResult[0], fResult[1], fResult[2], fResult[3],
+		    fResult[4], fResult[5], fResult[6], fResult[7],
+		    fResult[8], fResult[9], fResult[10], fResult[11],
+		    fResult[12], fResult[13], fResult[14], fResult[15]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+
+
+// profile NEON Matrix4OperatorMultiplyNeon with direct memory return, assembly version
+	fResult[0] = 0.0f;
+	pc.countBegin("");
+	for(i = 0; i < uiNumTries; i++)
+	{
+		Matrix4OperatorMultiplyNeon(mCol, mat, fResult);
+	}
+	pc.countEnd();
+	dTimeSpan = pc.getCountTime(0);
+	pc.resetCount();
+	sprintf(szMsg, "Time to do %i calls for Matrix4OperatorMultiplyNeon: %f secs, speedup: %5.2f, result value=<%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
+					uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2], fResult[3],
+																   fResult[4], fResult[5], fResult[6], fResult[7],
+																   fResult[8], fResult[9], fResult[10], fResult[11],
+																   fResult[12], fResult[13], fResult[14], fResult[15]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+
+
+	sprintf(szMsg,"TestNeonMatrix4OperatorMultiply End");
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+}
--- a/Extras/PhysicsEffects/sample/test_ARM_NEON_performance/test_neon_orthoInverse_transform3.cpp
+++ b/Extras/PhysicsEffects/sample/test_ARM_NEON_performance/test_neon_orthoInverse_transform3.cpp
@@ -0,0 +1,173 @@
+/*
+ Applied Research Associates Inc. (c)2011
+
+ Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Applied Research Associates Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "test_neon.h"
+#define SCE_PFX_USE_PERFCOUNTER
+#include "physics_effects.h"
+#include <arm_neon.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <android/log.h>
+
+// This works with gcc
+#define SET_ALIGNMENT(alignment)   __attribute__((__aligned__((alignment))))
+
+// assembly implementations
+extern "C"
+{
+	void OrthoInverseTransform3Neon(float *trfm, float *pfResult);
+}
+
+//----------------------------------------------------------------------------
+//  OrthoInverseTransform3Scalar
+//
+/// Performs a ortho inverse of a transform3 using scalar math, storing the
+/// result directly into system memory.
+///
+/// @param  trfm      Input transform3. Must point to 4x 4 float values
+/// @param  pfResult  [in] pointer to a float(tranform3). [out] Contains the result
+//----------------------------------------------------------------------------
+//inline const Transform3 orthoInverse( const Transform3 & tfrm )
+//{
+//    Vector3 inv0, inv1, inv2;
+//    inv0 = Vector3( tfrm.getCol0().getX(), tfrm.getCol1().getX(), tfrm.getCol2().getX() );
+//    inv1 = Vector3( tfrm.getCol0().getY(), tfrm.getCol1().getY(), tfrm.getCol2().getY() );
+//    inv2 = Vector3( tfrm.getCol0().getZ(), tfrm.getCol1().getZ(), tfrm.getCol2().getZ() );
+//    return Transform3(
+//        inv0,
+//        inv1,
+//        inv2,
+//        Vector3( ( -( ( inv0 * tfrm.getCol3().getX() ) + ( ( inv1 * tfrm.getCol3().getY() ) + ( inv2 * tfrm.getCol3().getZ() ) ) ) ) )
+//    );
+//}
+void OrthoInverseTransform3Scalar(float *trfm, float *pfResult)
+{
+	pfResult[0] = trfm[0];
+	pfResult[1] = trfm[4];
+	pfResult[2] = trfm[8];
+	pfResult[3] = 0.0f;
+
+	pfResult[4] = trfm[1];
+	pfResult[5] = trfm[5];
+	pfResult[6] = trfm[9];
+	pfResult[7] = 0.0f;
+
+	pfResult[8] = trfm[2];
+	pfResult[9] = trfm[6];
+	pfResult[10] = trfm[10];
+	pfResult[11] = 0.0f;
+
+	pfResult[12] = ( -( ( trfm[0] * trfm[8]) + ( ( trfm[1] * trfm[9]) + ( trfm[2] * trfm[10]) ) ) );
+	pfResult[13] = ( -( ( trfm[4] * trfm[8]) + ( ( trfm[5] * trfm[9]) + ( trfm[6] * trfm[10]) ) ) );
+	pfResult[14] = ( -( ( trfm[8] * trfm[8]) + ( ( trfm[9] * trfm[9]) + ( trfm[10] * trfm[10]) ) ) );
+	pfResult[15] = 0.0f;
+}
+
+//----------------------------------------------------------------------------
+//  TestNeonOrthoInverseTransform3
+//
+/// Run timing study of the orthoinverse on transform3 functions above, writing the
+/// results to the Android verbose log.
+//----------------------------------------------------------------------------
+void TestNeonOrthoInverseTransform3()
+{
+	float SET_ALIGNMENT(64) data1[] = {float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
+									   float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
+									   float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
+									   float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f};
+	float *trfm = &data1[0];
+
+	char szMsg[256];
+
+	sprintf(szMsg, "");
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg,"---------------------------------------");
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg,"TestNeonOrthoInverseTransform3 Start");
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg, "Test input trfm: <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>>",
+					trfm[0], trfm[1], trfm[2], trfm[3],
+					trfm[4], trfm[5], trfm[6], trfm[7],
+					trfm[8], trfm[9], trfm[10], trfm[11],
+					trfm[12], trfm[13], trfm[14], trfm[15]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+
+	float SET_ALIGNMENT(64) fResult[16];
+    
+
+	sce::PhysicsEffects::PfxPerfCounter pc;
+	double dTimeSpan, dRefTimeSpan;; 
+	unsigned int uiNumTries = 10000000;
+	unsigned int i;
+
+// profile scalar OrthoInverseTransform3Scalar with direct memory return, c++ version
+	fResult[0] = 0.0f;
+	pc.countBegin("");
+	for(i = 0; i < uiNumTries; i++)
+	{
+		OrthoInverseTransform3Scalar(trfm, fResult);
+	}
+	pc.countEnd();
+	dTimeSpan = pc.getCountTime(0);
+	pc.resetCount();
+	dRefTimeSpan = dTimeSpan;
+	sprintf(szMsg, "Time to do %i calls for OrthoInverseTransform3Scalar: %f secs, speedup: %5.2f",
+					uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg, "result value=<%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
+				   fResult[0], fResult[1], fResult[2], fResult[3],
+				   fResult[4], fResult[5], fResult[6], fResult[7],
+				   fResult[8], fResult[9], fResult[10], fResult[11],
+				   fResult[12], fResult[13], fResult[14], fResult[15]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+
+
+// profile NEON OrthoInverseTransform3Neon with direct memory return, assembly version
+	fResult[0] = 0.0f;
+	pc.countBegin("");
+	for(i = 0; i < uiNumTries; i++)
+	{
+		OrthoInverseTransform3Neon(trfm, fResult);
+	}
+	pc.countEnd();
+	dTimeSpan = pc.getCountTime(0);
+	pc.resetCount();
+	sprintf(szMsg, "Time to do %i calls for OrthoInverseTransform3Neon: %f secs, speedup: %5.2f",
+					uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg, "result value=<%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
+					fResult[0], fResult[1], fResult[2], fResult[3],
+				    fResult[4], fResult[5], fResult[6], fResult[7],
+				    fResult[8], fResult[9], fResult[10], fResult[11],
+				    fResult[12], fResult[13], fResult[14], fResult[15]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+
+	sprintf(szMsg,"TestNeonOrthoInverseTransform3 End");
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+}
--- a/Extras/PhysicsEffects/sample/test_ARM_NEON_performance/test_neon_solve_linear_constraint_row.cpp
+++ b/Extras/PhysicsEffects/sample/test_ARM_NEON_performance/test_neon_solve_linear_constraint_row.cpp
@@ -0,0 +1,477 @@
+/*
+ Applied Research Associates Inc. (c)2011
+
+ Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Applied Research Associates Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "test_neon.h"
+#define SCE_PFX_USE_PERFCOUNTER
+#include "physics_effects.h"
+#include <arm_neon.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <android/log.h>
+
+// This works with gcc
+#define SET_ALIGNMENT(alignment)   __attribute__((__aligned__((alignment))))
+
+//----------------------------------------------------------------------------
+//  pfxSolveLinearConstraintRowScalar
+//
+/// Performs
+///
+/// @param           Input
+//----------------------------------------------------------------------------
+void pfxSolveLinearConstraintRowScalar(float *constraint,				// 8 floats
+									   float *deltaLinearVelocityA,		// 4 floats
+									   float *deltaAngularVelocityA,	// 4 floats
+									   float massInvA,					// 1 floats
+									   float *inertiaInvA,				// 12 floats
+									   float *rA,						// 4 floats
+									   float *deltaLinearVelocityB,		// 4 floats
+									   float *deltaAngularVelocityB,	// 4 floats
+									   float massInvB,					// 1 floats
+									   float *inertiaInvB,				// 12 floats
+									   float *rB)						// 4 floats
+{
+//const PfxVector3 normal(pfxReadVector3(constraint.m_normal)); //fourth element is not zero and it need to be in assembly
+	float SET_ALIGNMENT(64) normalRead[] = {constraint[0],constraint[1],constraint[2],0.0f};
+	float *normal = &normalRead[0];
+//PfxFloat deltaImpulse = constraint.m_rhs;
+	float deltaImpulse = constraint[4];
+//PfxVector3 dVA = deltaLinearVelocityA + cross(deltaAngularVelocityA,rA);
+	float SET_ALIGNMENT(64) dVARead[] = {deltaLinearVelocityA[0] + (deltaAngularVelocityA[1]*rA[2] - deltaAngularVelocityA[2]*rA[1]),
+										 deltaLinearVelocityA[1] + (deltaAngularVelocityA[2]*rA[0] - deltaAngularVelocityA[0]*rA[2]),
+										 deltaLinearVelocityA[2] + (deltaAngularVelocityA[0]*rA[1] - deltaAngularVelocityA[1]*rA[0]),
+										 0.0f};
+	float *dVA = &dVARead[0];
+//PfxVector3 dVB = deltaLinearVelocityB + cross(deltaAngularVelocityB,rB);
+	float SET_ALIGNMENT(64) dVBRead[] = {deltaLinearVelocityB[0] + (deltaAngularVelocityB[1]*rB[2] - deltaAngularVelocityB[2]*rB[1]),
+										 deltaLinearVelocityB[1] + (deltaAngularVelocityB[2]*rB[0] - deltaAngularVelocityB[0]*rB[2]),
+										 deltaLinearVelocityB[2] + (deltaAngularVelocityB[0]*rB[1] - deltaAngularVelocityB[1]*rB[0]),
+										 0.0f};
+	float *dVB = &dVBRead[0];
+//deltaImpulse -= constraint.m_jacDiagInv * dot(normal,dVA-dVB);
+	deltaImpulse -= constraint[4] * (normal[0]*(dVA[0]-dVB[0]) + normal[1]*(dVA[1]-dVB[1]) + normal[2]*(dVA[2]-dVB[2]));
+//PfxFloat oldImpulse = constraint.m_accumImpulse;
+	float oldImpulse = constraint[7];
+//constraint.m_accumImpulse = SCE_PFX_CLAMP(oldImpulse + deltaImpulse,constraint.m_lowerLimit,constraint.m_upperLimit);
+	if((oldImpulse + deltaImpulse) > constraint[6]){
+		if(constraint[6] > constraint[5]){
+			constraint[7] = constraint[6];
+		}else {
+			constraint[7] = constraint[5];
+		}
+	} else {
+		if(oldImpulse + deltaImpulse > constraint[5]){
+			constraint[7] = oldImpulse + deltaImpulse;
+		}else {
+			constraint[7] = constraint[5];
+		}
+	}
+//deltaImpulse = constraint.m_accumImpulse - oldImpulse;
+	deltaImpulse = constraint[7] - oldImpulse;
+//deltaLinearVelocityA += deltaImpulse * massInvA * normal;
+	deltaLinearVelocityA[0] += deltaImpulse * massInvA * normal[0];
+	deltaLinearVelocityA[1] += deltaImpulse * massInvA * normal[1];
+	deltaLinearVelocityA[2] += deltaImpulse * massInvA * normal[2];
+	deltaLinearVelocityA[3] += deltaImpulse * massInvA * normal[3];
+
+//deltaAngularVelocityA += deltaImpulse * inertiaInvA * cross(rA,normal);
+	deltaAngularVelocityA[0] += deltaImpulse * ((inertiaInvA[0] * (rA[1]*normal[2] - rA[2]*normal[1])) +
+											    (inertiaInvA[4] * (rA[2]*normal[0] - rA[0]*normal[2])) +//
+											    (inertiaInvA[8] * (rA[0]*normal[1] - rA[1]*normal[0])));
+	deltaAngularVelocityA[1] += deltaImpulse * ((inertiaInvA[1] * (rA[1]*normal[2] - rA[2]*normal[1])) +
+												(inertiaInvA[5] * (rA[2]*normal[0] - rA[0]*normal[2])) +
+												(inertiaInvA[9] * (rA[0]*normal[1] - rA[1]*normal[0])));
+	deltaAngularVelocityA[2] += deltaImpulse * ((inertiaInvA[2] * (rA[1]*normal[2] - rA[2]*normal[1])) +
+												(inertiaInvA[6] * (rA[2]*normal[0] - rA[0]*normal[2])) +
+												(inertiaInvA[10] * (rA[0]*normal[1] - rA[1]*normal[0])));
+	deltaAngularVelocityA[3] += deltaImpulse * ((inertiaInvA[3] * (rA[1]*normal[2] - rA[2]*normal[1])) +
+												(inertiaInvA[7] * (rA[2]*normal[0] - rA[0]*normal[2])) +
+												(inertiaInvA[11] * (rA[0]*normal[1] - rA[1]*normal[0])));
+//deltaLinearVelocityB -= deltaImpulse * massInvB * normal;
+	deltaLinearVelocityB[0] -= deltaImpulse * massInvB * normal[0];
+	deltaLinearVelocityB[1] -= deltaImpulse * massInvB * normal[1];
+	deltaLinearVelocityB[2] -= deltaImpulse * massInvB * normal[2];
+	deltaLinearVelocityB[3] -= deltaImpulse * massInvB * normal[3];
+
+//deltaAngularVelocityB -= deltaImpulse * inertiaInvB * cross(rB,normal);
+	deltaAngularVelocityB[0] -= deltaImpulse * ((inertiaInvB[0] * (rB[1]*normal[2] - rB[2]*normal[1])) +
+											    (inertiaInvB[4] * (rB[2]*normal[0] - rB[0]*normal[2])) +
+											    (inertiaInvB[8] * (rB[0]*normal[1] - rB[1]*normal[0])));
+	deltaAngularVelocityB[1] -= deltaImpulse * ((inertiaInvB[1] * (rB[1]*normal[2] - rB[2]*normal[1])) +
+												(inertiaInvB[5] * (rB[2]*normal[0] - rB[0]*normal[2])) +
+												(inertiaInvB[9] * (rB[0]*normal[1] - rB[1]*normal[0])));
+	deltaAngularVelocityB[2] -= deltaImpulse * ((inertiaInvB[2] * (rB[1]*normal[2] - rB[2]*normal[1])) +
+												(inertiaInvB[6] * (rB[2]*normal[0] - rB[0]*normal[2])) +
+												(inertiaInvB[10] * (rB[0]*normal[1] - rB[1]*normal[0])));
+	deltaAngularVelocityB[3] -= deltaImpulse * ((inertiaInvB[3] * (rB[1]*normal[2] - rB[2]*normal[1])) +
+												(inertiaInvB[7] * (rB[2]*normal[0] - rB[0]*normal[2])) +
+												(inertiaInvB[11] * (rB[0]*normal[1] - rB[1]*normal[0])));
+}
+
+/*//////////////////////////////////////Source Code///////////////////////////////////////
+void pfxSolveLinearConstraintRow(
+	  PfxConstraintRow &constraint,      	// 8 floats
+	  PfxVector3 &deltaLinearVelocityA,		// 4 floats
+	  PfxVector3 &deltaAngularVelocityA,	// 4 floats
+	  PfxFloat massInvA,					// 1 float
+const PfxMatrix3 &inertiaInvA,				// 12 floats
+const PfxVector3 &rA,						// 4 floats
+	  PfxVector3 &deltaLinearVelocityB,		// 4 floats
+	  PfxVector3 &deltaAngularVelocityB,	// 4 floats
+	  PfxFloat massInvB,					// 1 float
+const PfxMatrix3 &inertiaInvB,				// 12 floats
+const PfxVector3 &rB)						// 4 floats
+{
+	//  PfxConstraintRow structure:
+	//		PfxFloat m_normal[3];
+	//		PfxFloat m_rhs;
+	//		PfxFloat m_jacDiagInv;
+	//		PfxFloat m_lowerLimit;
+	//		PfxFloat m_upperLimit;
+	//		PfxFloat m_accumImpulse;
+const PfxVector3 normal(pfxReadVector3(constraint.m_normal));
+PfxFloat deltaImpulse = constraint.m_rhs;
+PfxVector3 dVA = deltaLinearVelocityA + cross(deltaAngularVelocityA,rA);
+PfxVector3 dVB = deltaLinearVelocityB + cross(deltaAngularVelocityB,rB);
+deltaImpulse -= constraint.m_jacDiagInv * dot(normal,dVA-dVB);
+PfxFloat oldImpulse = constraint.m_accumImpulse;
+constraint.m_accumImpulse = SCE_PFX_CLAMP(oldImpulse + deltaImpulse,constraint.m_lowerLimit,constraint.m_upperLimit);
+deltaImpulse = constraint.m_accumImpulse - oldImpulse;
+deltaLinearVelocityA += deltaImpulse * massInvA * normal;
+deltaAngularVelocityA += deltaImpulse * inertiaInvA * cross(rA,normal);
+deltaLinearVelocityB -= deltaImpulse * massInvB * normal;
+deltaAngularVelocityB -= deltaImpulse * inertiaInvB * cross(rB,normal);
+}
+*/
+
+void pfxSolveLinearConstraintRowNeonInline(float *constraint,		// 8 floats
+								  float *deltaLinearVelocityA,		// 4 floats
+								  float *deltaAngularVelocityA,		// 4 floats
+								  float *massInvA,					// 1 floats
+								  float *inertiaInvA,				// 12 floats
+								  float *rA,						// 4 floats
+								  float *deltaLinearVelocityB,		// 4 floats
+								  float *deltaAngularVelocityB,		// 4 floats
+								  float *massInvB,					// 1 floats
+								  float *inertiaInvB,				// 12 floats
+								  float *rB)						// 4 floats
+{
+//	  "r" (constraint), 			//%0
+//	  "r" (deltaLinearVelocityA), 	//%1
+//	  "r" (deltaAngularVelocityA), 	//%2
+//	  "r" (massInvA), 				//%3
+//	  "r" (inertiaInvA), 			//%4
+//	  "r" (rA), 					//%5
+//	  "r" (deltaLinearVelocityB), 	//%6
+//	  "r" (deltaAngularVelocityB),  //%7
+//	  "r" (massInvB), 				//%8
+//	  "r" (inertiaInvB), 			//%9
+//	  "r" (rB) 						//%10
+	asm volatile
+			(
+			//Loads beforehand so the normal vector will be able to have zero in the 4th element
+				"vld1.32 	{q0}, [%1]			\n\t" //LOAD => deltaLinearVelocityA ----------------> q0
+				"vld1.32	{q1}, [%2]			\n\t" //LOAD => deltaAngularVelocityA ---------------> q1
+				"vld1.32	{q2}, [%6]			\n\t" //LOAD => deltaLinearVelocityB ----------------> q2
+				"vld1.32	{q3}, [%7]			\n\t" //LOAD => deltaAngularVelocityB ---------------> q3
+	//const PfxVector3 normal(pfxReadVector3(constraint.m_normal));
+				"vld1.32	{q4}, [%0]!			\n\t" //LOAD => constraint normal--------------------> q4
+				"vdup.32	d10, d1[1]			\n\t" //set 4th element on normal vector to zero
+				"vtrn.32	d9, d10				\n\t" //LOAD => deltaImpulse ------------------------> q5 (d10[0] only)
+	//PfxFloat deltaImpulse = constraint.m_rhs;
+				"vld1.32	{d12}, [%0]!		\n\t" //LOAD => constraint variables ----------------> q6
+				"vld1.32	{d13[0]}, [%0]!		\n\t" //constraint  variables
+				"vld1.32	{d13[1]}, [%0]		\n\t" //constraint load of remaining variables loaded this way keep pointer to m_accumilate in order to store back in later
+	//PfxVector3 dVA = deltaLinearVelocityA + cross(deltaAngularVelocityA,rA);
+	//PfxVector3 dVB = deltaLinearVelocityB + cross(deltaAngularVelocityB,rB);
+				"vld1.32 	{d14[1]}, [%5]		\n\t" //LOAD => rA for cross product use 1A ---------> q7 (save these for use later)
+				"vld1.32 	{d18[1]}, [%10]		\n\t" //LOAD => rB for cross product use 1B ---------> q9
+				"vld1.32 	{d17[0]}, [%5]!		\n\t" //LOAD => rA for cross product use 2A ---------> q8
+				"vld1.32 	{d21[0]}, [%10]!	\n\t" //LOAD => rB for cross product use 2B ---------> q10
+				"vld1.32 	{d16}, [%5]			\n\t" //rA 2
+				"vld1.32 	{d20}, [%10]		\n\t" //rB 2
+				"vld1.32 	{d15[0]}, [%5]!		\n\t" //rA 1
+				"vld1.32 	{d19[0]}, [%10]!	\n\t" //rB 1
+				"vld1.32 	{d14[0]}, [%5]!		\n\t" //rA 1
+				"vld1.32 	{d18[0]}, [%10]!	\n\t" //rB 1
+				"vld1.32 	{d15[1]}, [%5]		\n\t" //rA 1
+				"vld1.32 	{d19[1]}, [%10]		\n\t" //rB 1
+				"vld1.32 	{d17[1]}, [%5]!		\n\t" //rA 2
+				"vld1.32 	{d21[1]}, [%10]!	\n\t" //rB 2
+				"vdup.32	q12, d1[1]			\n\t" //set deltaAngularVelocityB 2
+				"vdup.32	q13, d1[1]			\n\t" //set deltaAngularVelocityB 1
+				"vdup.32	q14, d1[1]			\n\t" //set deltaAngularVelocityA 1
+				"vdup.32	q15, d1[1]			\n\t" //set deltaAngularVelocityA 2
+				"vadd.f32 	q14, q14, q1		\n\t"
+				"vadd.f32 	q13, q13, q3		\n\t"
+				"vadd.f32 	q15, q15, q1		\n\t"
+				"vadd.f32 	q12, q12, q3		\n\t"
+				"vrev64.32	d28, d28			\n\t" //set deltaAngularVelocityA 1
+				"vrev64.32	d26, d26			\n\t" //set deltaAngularVelocityB 1
+				"vtrn.32	d30, d31			\n\t" //set deltaAngularVelocityA 2
+				"vtrn.32	d24, d25			\n\t" //set deltaAngularVelocityB 2
+				"vtrn.32	d28, d29			\n\t" //set deltaAngularVelocityA 1
+				"vtrn.32	d26, d27			\n\t" //set deltaAngularVelocityB 1
+				"vrev64.32	d30, d30			\n\t" //set deltaAngularVelocityA 2
+				"vrev64.32	d24, d24			\n\t" //set deltaAngularVelocityB 2
+				"vmul.f32 	q14, q7, q14		\n\t" //operation for cross product 1A
+				"vmul.f32 	q13, q9, q13		\n\t" //operation for cross product 1B
+				"vmls.f32 	q14, q8, q15		\n\t" //operation for cross product 2A
+				"vmls.f32 	q13, q10, q12		\n\t" //operation for cross product 2B
+				"vadd.f32 	q14, q14, q0		\n\t" //operation for adding cross to linearVelocityA
+				"vadd.f32 	q13, q13, q2		\n\t" //operation for adding cross to linearVelocityB
+													  //LOAD => dVA --------------------------------> q14
+													  //LOAD => dVB --------------------------------> q13
+													  //FREE q11, q12, q15, d11
+	//deltaImpulse -= constraint.m_jacDiagInv * dot(normal,dVA-dVB);
+				"vsub.f32 	q11, q14, q13		\n\t" //TEMP q11 => dVA-dVB for dot product
+	/*find*/	"vmul.f32 	q11, q11, q4		\n\t" //operation for dot product
+	/*fastest*/	"vpadd.f32 	d22, d22, d22		\n\t" //operation for dot product
+	/*dot*/		"vpadd.f32 	d23, d23, d23		\n\t" //operation for dot product
+				"vadd.f32 	d22, d22, d23		\n\t" //operation for dot product
+				"vmul.f32 	d22, d22, d12[0]	\n\t" //m_jacDiagInv times dot product result
+				"vsub.f32 	d10, d10, d22		\n\t" //subtract result from deltaImpule
+	//PfxFloat oldImpulse = constraint.m_accumImpulse;
+				"vdup.32	d11, d13[1]			\n\t" //LOAD => oldImpulse -------------------------> q5 (d11 only)
+	//constraint.m_accumImpulse = SCE_PFX_CLAMP(oldImpulse + deltaImpulse,constraint.m_lowerLimit,constraint.m_upperLimit);
+				"vdup.32	d23, d13[0]			\n\t" //TEMP q11 => m_upperLimit (d23 only)
+				"vdup.32	d24, d12[1]			\n\t" //TEMP q12 => m_lowerLimit (d34 only)
+				"vadd.f32   d25, d10, d11		\n\t" //TEMP q12 => deltaImpulse + oldImplues (d25 only)
+				"vmin.f32   d25, d25, d23		\n\t" //operation MIN(v,b)
+				"vmax.f32	d22, d25, d24		\n\t" //operation MAX(a,MIN(v,b)
+				"vst1.32    {d22[0]}, [%0]		\n\t" //store m_accumImpulse (incremented so that it can be reloaded for cross product later)
+	//deltaImpulse = constraint.m_accumImpulse - oldImpulse;
+				"vsub.f32 	d10, d22, d11		\n\t" //operation to calculate new deltaImpule
+													  //FREE q6, q11, q12, q13, q14, q15, d11
+	//deltaLinearVelocityB -= deltaImpulse * massInvB * normal;
+	//deltaLinearVelocityA += deltaImpulse * massInvA * normal;
+				"vld1.32 	{d11[0]}, [%3]		\n\t" //LOAD => massInvA ---------------------------> q5 (d11[0] only)
+				"vld1.32 	{d11[1]}, [%8]		\n\t" //LOAD => massInvB ---------------------------> q5 (d11[1] only)
+				"vmul.f32	q11, q4, d11[0]		\n\t" //TEMP q11 => operation normal times massInvA A
+				"vmul.f32	q12, q4, d11[1]		\n\t" //TEMP q12 => operation normal times massInvB B
+				"vmul.f32	q11, q11, d10[0]	\n\t" //TEMP q11 => operation result times DeltaImpulse A
+				"vmul.f32	q12, q12, d10[0]	\n\t" //TEMP q12 => operation result times DeltaImpulse B
+				"vadd.f32	q0, q0, q11			\n\t" //operation create new deltaLinearVelocityA A
+				"vsub.f32	q2, q2, q12			\n\t" //operation create new deltaLinearVelocityB B
+				"vst1.32	{q0}, [%1]			\n\t" //store the new deltaLinearVelocityA A
+				"vst1.32	{q2}, [%6]			\n\t" //store the new deltaLinearVelocityB B
+													  //FREE q0, q2, q6, q11, q12, q13, q14, q15, d11
+	//deltaAngularVelocityA += deltaImpulse * inertiaInvA * cross(rA,normal);
+	//deltaAngularVelocityB -= deltaImpulse * inertiaInvB * cross(rB,normal);
+				"vdup.32	q14, d1[1]			\n\t" //set normal cross load
+				"vdup.32	q15, d1[1]			\n\t" //
+				"vadd.f32 	q14, q14, q4		\n\t" //
+				"vadd.f32 	q15, q15, q4		\n\t" //
+				"vrev64.32	d28, d28			\n\t" //
+				"vtrn.32	d30, d31			\n\t" //
+				"vtrn.32	d28, d29			\n\t" //
+				"vrev64.32	d30, d30			\n\t" //
+				"vmul.f32 	q0, q8, q15			\n\t" //operation for cross product A
+				"vmul.f32 	q2, q10, q15		\n\t" //operation for cross product B
+				"vmls.f32 	q0, q14, q7			\n\t" //operation for cross product A
+				"vmls.f32 	q2, q14, q9			\n\t" //operation for cross product B
+										  	  	  	  //LOAD => cross product result A ------------> q0
+													  //LOAD => cross product result B ------------> q2
+													  //FREE q6, q7, q8, q9, q10, q11, q12, q13, q14, q15, d11
+
+				"vld1.32    {q13-q14}, [%4]! 	\n\t" //LOAD => inertiaInvA col0, col1 A ----------> q13, q14
+				"vld1.32    {q9-q10}, [%9]!	 	\n\t" //LOAD => inertiaInvB col0, col1 B -----------> q9, q10
+				"vld1.32    {q15}, [%4]   	    \n\t" //LOAD => inertiaInvA col2 A ----------------> q5
+				"vld1.32    {q11}, [%9]   	    \n\t" //LOAD => inertiaInvB col2 B -----------------> q11
+				"vmul.f32   q13, q13, d0[0]    	\n\t" //operation inertiaInvA col0 = (col0) * (crossA elem0) A
+				"vmul.f32   q9, q9, d4[0]      	\n\t" //operation inertiaInvB col0 = (col0) * (crossB elem0) B
+				"vmla.f32   q13, q14, d0[1]    	\n\t" //operation inertiaInvA col1 = (col1) * (crossA elem1) A
+				"vmla.f32   q9, q10, d4[1]     	\n\t" //operation inertiaInvB col1 = (col1) * (crossB elem1) B
+				"vmla.f32   q13, q15, d1[0]    	\n\t" //operation inertiaInvA col2 = (col2) * (crossA elem2) A
+				"vmla.f32   q9, q11, d5[0]     	\n\t" //operation inertiaInvB col2 = (col2) * (crossB elem2) B
+				"vmul.f32   q13, q13, d10[0]    \n\t" //operation inertiaInvA times deltaImpulse A
+				"vmul.f32   q9, q9, d10[0]      \n\t" //operation inertiaInvB times deltaImpulse B
+				"vadd.f32   q1, q1, q13      	\n\t" //operation accumulate the deltaAngularVelocityA A
+				"vsub.f32   q3, q3, q9      	\n\t" //operation accumulate the deltaAngularVelocityB B
+				"vst1.32    {q1}, [%2]      	\n\t" //store deltaAngularVelocityA A
+				"vst1.32    {q3}, [%7]      	\n\t" //store deltaAngularVelocityB B
+				:	// NO outputs! It is important to *not* put anything here. (Putting something here forces use of r0, which wreak havok
+				: "r" (constraint), "r" (deltaLinearVelocityA), "r" (deltaAngularVelocityA), "r" (massInvA), "r" (inertiaInvA), "r" (rA), "r" (deltaLinearVelocityB), "r" (deltaAngularVelocityB), "r" (massInvB), "r" (inertiaInvB), "r" (rB) //inputs
+				: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" // clobbers
+			);
+}
+
+//----------------------------------------------------------------------------
+//  TestNeonSolveLinearConstraintRow
+//
+/// Run timing study of the Linear Constraint Row Solver from above, writing the
+/// results to the Android verbose log.
+//----------------------------------------------------------------------------
+void TestNeonSolveLinearConstraintRow()
+{
+	//Scalar
+	float SET_ALIGNMENT(64) constraintDataScalar[] = 				{0.0f,1.0f,2.0f,3.0f,1.0f,2.0f,3.0f,4.0f};
+	float SET_ALIGNMENT(64) deltaLinearVelocityADataScalar[] = 		{1.0f,2.0f,3.0f,0.0f};
+	float SET_ALIGNMENT(64) deltaAngularVelocityADataScalar[] = 	{2.0f,3.0f,4.0f,0.0f};
+	float SET_ALIGNMENT(64) inertiaInvADataScalar[] = 				{4.0f,5.0f,6.0f,0.0f,5.0f,6.0f,7.0f,0.0f,6.0f,7.0f,8.0f,0.0f};
+	float SET_ALIGNMENT(64) rADataScalar[] = 						{5.0f,6.0f,7.0f,0.0f};
+	float SET_ALIGNMENT(64) deltaLinearVelocityBDataScalar[] = 		{6.0f,7.0f,8.0f,0.0f};
+	float SET_ALIGNMENT(64) deltaAngularVelocityBDataScalar[] = 	{9.0f,8.0f,7.0f,0.0f};
+	float SET_ALIGNMENT(64) inertiaInvBDataScalar[] = 				{7.0f,6.0f,5.0f,0.0f,9.0f,8.0f,7.0f,0.0f,6.0f,7.0f,8.0f,0.0f};
+	float SET_ALIGNMENT(64) rBDataScalar[] = 						{6.0f,5.0f,4.0f,0.0f};
+
+	float *constraintScalar = &constraintDataScalar[0];
+	float *deltaLinearVelocityAScalar = &deltaLinearVelocityADataScalar[0];
+	float *deltaAngularVelocityAScalar = &deltaAngularVelocityADataScalar[0];
+	float massInvAScalar = 3.0f;
+	float *inertiaInvAScalar = &inertiaInvADataScalar[0];
+	float *rAScalar = &rADataScalar[0];
+	float *deltaLinearVelocityBScalar = &deltaLinearVelocityBDataScalar[0];
+	float *deltaAngularVelocityBScalar = &deltaAngularVelocityBDataScalar[0];
+	float massInvBScalar = 8.0f;
+	float *inertiaInvBScalar = &inertiaInvBDataScalar[0];
+	float *rBScalar = &rBDataScalar[0];
+
+	//Neon
+	float SET_ALIGNMENT(64) constraintDataNeon[] = 				{0.0f,1.0f,2.0f,3.0f,1.0f,2.0f,3.0f,4.0f};
+	float SET_ALIGNMENT(64) deltaLinearVelocityADataNeon[] = 	{1.0f,2.0f,3.0f,0.0f};
+	float SET_ALIGNMENT(64) deltaAngularVelocityADataNeon[] = 	{2.0f,3.0f,4.0f,0.0f};
+	float SET_ALIGNMENT(64) massInvADataNeon[] = 				{3.0f};
+	float SET_ALIGNMENT(64) inertiaInvADataNeon[] = 			{4.0f,5.0f,6.0f,0.0f,5.0f,6.0f,7.0f,0.0f,6.0f,7.0f,8.0f,0.0f};
+	float SET_ALIGNMENT(64) rADataNeon[] = 						{5.0f,6.0f,7.0f,0.0f};
+	float SET_ALIGNMENT(64) deltaLinearVelocityBDataNeon[] = 	{6.0f,7.0f,8.0f,0.0f};
+	float SET_ALIGNMENT(64) deltaAngularVelocityBDataNeon[] = 	{9.0f,8.0f,7.0f,0.0f};
+	float SET_ALIGNMENT(64) massInvBDataNeon[] = 				{8.0f};
+	float SET_ALIGNMENT(64) inertiaInvBDataNeon[] = 			{7.0f,6.0f,5.0f,0.0f,9.0f,8.0f,7.0f,0.0f,6.0f,7.0f,8.0f,0.0f};
+	float SET_ALIGNMENT(64) rBDataNeon[] = 						{6.0f,5.0f,4.0f,0.0f};
+
+	float *constraintNeon = &constraintDataNeon[0];
+	float *deltaLinearVelocityANeon = &deltaLinearVelocityADataNeon[0];
+	float *deltaAngularVelocityANeon = &deltaAngularVelocityADataNeon[0];
+	float *massInvANeon = &massInvADataNeon[0];
+	float *inertiaInvANeon = &inertiaInvADataNeon[0];
+	float *rANeon = &rADataNeon[0];
+	float *deltaLinearVelocityBNeon = &deltaLinearVelocityBDataNeon[0];
+	float *deltaAngularVelocityBNeon = &deltaAngularVelocityBDataNeon[0];
+	float *massInvBNeon = &massInvBDataNeon[0];
+	float *inertiaInvBNeon = &inertiaInvBDataNeon[0];
+	float *rBNeon = &rBDataNeon[0];
+
+	char szMsg[256];
+
+	sprintf(szMsg, "");
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg,"---------------------------------------");
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg,"TestNeonSolveLinearConstraintRow Start");
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+
+
+	sce::PhysicsEffects::PfxPerfCounter pc;
+	double dTimeSpan, dRefTimeSpan;; 
+	unsigned int uiNumTries = 10000000;
+	unsigned int i;
+
+// profile pfxSolveLinearConstraintRowScalar with direct memory return, c++ version
+	pc.countBegin("");
+	for(i = 0; i < uiNumTries; i++)
+	{
+		pfxSolveLinearConstraintRowScalar(
+				 	 constraintScalar,
+					 deltaLinearVelocityAScalar,
+					 deltaAngularVelocityAScalar,
+					 massInvAScalar,
+					 inertiaInvAScalar,
+					 rAScalar,
+					 deltaLinearVelocityBScalar,
+					 deltaAngularVelocityBScalar,
+					 massInvBScalar,
+					 inertiaInvBScalar,
+					 rBScalar
+					 );
+	}
+	pc.countEnd();
+	dTimeSpan = pc.getCountTime(0);
+	pc.resetCount();
+	dRefTimeSpan = dTimeSpan;
+	sprintf(szMsg, "Time to do %i calls for pfxSolveLinearConstraintRowScalar: %f secs, speedup: %5.2f",
+					uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg, "constraint.m_accumImpulse = <%f>", constraintScalar[7]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg, "deltaLinearVelocityA = <%f,%f,%f,%f>",
+					deltaLinearVelocityAScalar[0], deltaLinearVelocityAScalar[1], deltaLinearVelocityAScalar[2], deltaLinearVelocityAScalar[3]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg, "deltaAngularVelocityA = <%f,%f,%f,%f>",
+					deltaAngularVelocityAScalar[0], deltaAngularVelocityAScalar[1], deltaAngularVelocityAScalar[2], deltaAngularVelocityAScalar[3]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg, "deltaLinearVelocityB = <%f,%f,%f,%f>",
+					deltaLinearVelocityBScalar[0], deltaLinearVelocityBScalar[1], deltaLinearVelocityBScalar[2], deltaLinearVelocityBScalar[3]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg, "deltaAngularVelocityB = <%f,%f,%f,%f>",
+					deltaAngularVelocityBScalar[0], deltaAngularVelocityBScalar[1], deltaAngularVelocityBScalar[2], deltaAngularVelocityBScalar[3]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+
+
+// profile NEON pfxSolveLinearConstraintRowNeonInline with direct memory return, inline assembly version
+	pc.countBegin("");
+	for(i = 0; i < uiNumTries; i++)
+	{
+		pfxSolveLinearConstraintRowNeonInline(constraintNeon,
+											  deltaLinearVelocityANeon,
+											  deltaAngularVelocityANeon,
+											  massInvANeon,
+											  inertiaInvANeon,
+											  rANeon,
+											  deltaLinearVelocityBNeon,
+											  deltaAngularVelocityBNeon,
+											  massInvBNeon,
+											  inertiaInvBNeon,
+											  rBNeon);
+	}
+	pc.countEnd();
+	dTimeSpan = pc.getCountTime(0);
+	pc.resetCount();
+	sprintf(szMsg, "Time to do %i calls for pfxSolveLinearConstraintRowNeon: %f secs, speedup: %5.2f",
+					uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg, "constraint.m_accumImpulse = <%f>", constraintNeon[7]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg, "deltaLinearVelocityA = <%f,%f,%f,%f>",
+					deltaLinearVelocityANeon[0], deltaLinearVelocityANeon[1], deltaLinearVelocityANeon[2], deltaLinearVelocityANeon[3]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg, "deltaAngularVelocityA = <%f,%f,%f,%f>",
+					deltaAngularVelocityANeon[0], deltaAngularVelocityANeon[1], deltaAngularVelocityANeon[2], deltaAngularVelocityANeon[3]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg, "deltaLinearVelocityB = <%f,%f,%f,%f>",
+					deltaLinearVelocityBNeon[0], deltaLinearVelocityBNeon[1], deltaLinearVelocityBNeon[2], deltaLinearVelocityBNeon[3]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg, "deltaAngularVelocityB = <%f,%f,%f,%f>",
+					deltaAngularVelocityBNeon[0], deltaAngularVelocityBNeon[1], deltaAngularVelocityBNeon[2], deltaAngularVelocityBNeon[3]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+
+
+	sprintf(szMsg,"TestNeonSolveLinearConstraintRow End");
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+}
--- a/Extras/PhysicsEffects/sample/test_ARM_NEON_performance/test_neon_transform3_operator_multiply.cpp
+++ b/Extras/PhysicsEffects/sample/test_ARM_NEON_performance/test_neon_transform3_operator_multiply.cpp
@@ -0,0 +1,150 @@
+/*
+ Applied Research Associates Inc. (c)2011
+
+ Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Applied Research Associates Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "test_neon.h"
+#define SCE_PFX_USE_PERFCOUNTER
+#include "physics_effects.h"
+#include <arm_neon.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <android/log.h>
+
+// This works with gcc
+#define SET_ALIGNMENT(alignment)   __attribute__((__aligned__((alignment))))
+
+// assembly implementations
+extern "C"
+{
+	void Transform3OperatorMultiplyNeon(float *trfm, float *vec3, float *pfResult);
+}
+
+//----------------------------------------------------------------------------
+//  Transform3OperatorMultiplyScalar
+//
+/// Performs a multiply operation on a tranform3 using scalar math, storing the
+/// result directly into system memory.
+///
+/// @param  trfm      Input transform. Must point to 4x 4 float values
+/// @param  vec3      Input vector3. Must point to 4 float values
+/// @param  pfResult  [in] pointer to a float(vector3). [out] Contains the result
+//----------------------------------------------------------------------------
+//inline const Vector3 Transform3::operator *( const Vector3 & vec ) const
+//{
+//    return Vector3(
+//        ( ( ( mCol0.getX() * vec.getX() ) + ( mCol1.getX() * vec.getY() ) ) + ( mCol2.getX() * vec.getZ() ) ),
+//        ( ( ( mCol0.getY() * vec.getX() ) + ( mCol1.getY() * vec.getY() ) ) + ( mCol2.getY() * vec.getZ() ) ),
+//        ( ( ( mCol0.getZ() * vec.getX() ) + ( mCol1.getZ() * vec.getY() ) ) + ( mCol2.getZ() * vec.getZ() ) )
+//    );
+//}
+void Transform3OperatorMultiplyScalar(float *trfm, float *vec3, float *pfResult)
+{
+	pfResult[0] = ( ( ( trfm[0] * vec3[0] ) + ( trfm[4] * vec3[1] ) ) + ( trfm[8] * vec3[2] ) );
+	pfResult[1] = ( ( ( trfm[1] * vec3[0] ) + ( trfm[5] * vec3[1] ) ) + ( trfm[9] * vec3[2] ) );
+	pfResult[2] = ( ( ( trfm[2] * vec3[0] ) + ( trfm[6] * vec3[1] ) ) + ( trfm[10] * vec3[2] ) );
+	pfResult[3] = 0.0f;
+}
+
+//----------------------------------------------------------------------------
+//  TestNeonTransform3OperatorMultiply
+//
+/// Run timing study of the Tranform3 multiply operator from above, writing the
+/// results to the Android verbose log.
+//----------------------------------------------------------------------------
+void TestNeonTransform3OperatorMultiply()
+{
+	float SET_ALIGNMENT(64) data1[] = {float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
+									   float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
+									   float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
+									   float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f};
+	float SET_ALIGNMENT(64) data2[] = {float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f};
+
+	float *trfm = &data1[0];
+	float *vec3 = &data2[0];
+
+
+	char szMsg[256];
+
+	sprintf(szMsg, "");
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg,"---------------------------------------");
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg,"TestNeonTransform3OperatorMultiply Start");
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg, "Test input vec3: <%f,%f,%f,%f>",
+					vec3[0], vec3[1], vec3[2], vec3[3]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg, "Test input trfm: <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>>",
+					trfm[0], trfm[1], trfm[2], trfm[3],
+					trfm[4], trfm[5], trfm[6], trfm[7],
+					trfm[8], trfm[9], trfm[10], trfm[11],
+					trfm[12], trfm[13], trfm[14], trfm[15]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+
+	float SET_ALIGNMENT(64) fResult[4];
+    
+
+	sce::PhysicsEffects::PfxPerfCounter pc;
+	double dTimeSpan, dRefTimeSpan;; 
+	unsigned int uiNumTries = 10000000;
+	unsigned int i;
+
+// profile scalar Transform3OperatorMultiplyScalar with direct memory return, c++ version
+	fResult[0] = 0.0f;
+	pc.countBegin("");
+	for(i = 0; i < uiNumTries; i++)
+	{
+		Transform3OperatorMultiplyScalar(trfm, vec3, fResult);
+	}
+	pc.countEnd();
+	dTimeSpan = pc.getCountTime(0);
+	pc.resetCount();
+	dRefTimeSpan = dTimeSpan;
+	sprintf(szMsg, "Time to do %i calls for Transform3OperatorMultiplyScalar: %f secs, speedup: %5.2f, result value=<%f,%f,%f,%f>",
+					uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2], fResult[3]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+
+
+// profile NEON Transform3OperatorMultiplyNeon with direct memory return, assembly version
+	fResult[0] = 0.0f;
+	pc.countBegin("");
+	for(i = 0; i < uiNumTries; i++)
+	{
+		Transform3OperatorMultiplyNeon(trfm, vec3, fResult);
+	}
+	pc.countEnd();
+	dTimeSpan = pc.getCountTime(0);
+	pc.resetCount();
+	sprintf(szMsg, "Time to do %i calls for Transform3OperatorMultiplyNeon: %f secs, speedup: %5.2f, result value=<%f,%f,%f,%f>",
+					uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2], fResult[3]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+
+
+	sprintf(szMsg,"TestNeonTransform3OperatorMultiply End");
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+}
--- a/Extras/PhysicsEffects/sample/test_ARM_NEON_performance/test_neon_transpose_matrix3.cpp
+++ b/Extras/PhysicsEffects/sample/test_ARM_NEON_performance/test_neon_transpose_matrix3.cpp
@@ -0,0 +1,154 @@
+/*
+ Applied Research Associates Inc. (c)2011
+
+ Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Applied Research Associates Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "test_neon.h"
+#define SCE_PFX_USE_PERFCOUNTER
+#include "physics_effects.h"
+#include <arm_neon.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <android/log.h>
+
+// This works with gcc
+#define SET_ALIGNMENT(alignment)   __attribute__((__aligned__((alignment))))
+
+// assembly implementations
+extern "C"
+{
+	void TransposeMatrix3Neon(float *mCol, float *pfResult);
+}
+
+//----------------------------------------------------------------------------
+//  TransposeMatrix3Scalar
+//
+/// Performs a transpose on a matrix3 using scalar math, storing the
+/// result directly into system memory.
+///
+/// @param  mCol      Input matrix. Must point to 3x 4 float values
+/// @param  pfResult  [in] pointer to a float(matrix3). [out] Contains the result
+//----------------------------------------------------------------------------
+//inline const Matrix3 transpose( const Matrix3 & mat )
+//{
+//    return Matrix3(
+//        Vector3( mat.getCol0().getX(), mat.getCol1().getX(), mat.getCol2().getX() ),
+//        Vector3( mat.getCol0().getY(), mat.getCol1().getY(), mat.getCol2().getY() ),
+//        Vector3( mat.getCol0().getZ(), mat.getCol1().getZ(), mat.getCol2().getZ() )
+//    );
+//}
+void TransposeMatrix3Scalar(float *mCol, float *pfResult)
+{
+	pfResult[0] = mCol[0];
+	pfResult[1] = mCol[4];
+	pfResult[2] = mCol[8];
+	pfResult[3] = 0.0f;
+
+	pfResult[4] = mCol[1];
+	pfResult[5] = mCol[5];
+	pfResult[6] = mCol[9];
+	pfResult[7] = 0.0f;
+
+	pfResult[8] = mCol[2];
+	pfResult[9] = mCol[6];
+	pfResult[10] = mCol[10];
+	pfResult[11] = 0.0f;
+}
+
+//----------------------------------------------------------------------------
+//  TestNeonTransposeMatrix3
+//
+/// Run timing study of the matrix3 transpose functions above, writing the
+/// results to the Android verbose log.
+//----------------------------------------------------------------------------
+void TestNeonTransposeMatrix3()
+{
+	float SET_ALIGNMENT(64) data[] = {float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
+									  float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f,
+									  float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),float(rand())/float(RAND_MAX),0.0f};
+	float *mCol = &data[0];
+
+	char szMsg[256];
+
+	sprintf(szMsg, "");
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg,"---------------------------------------");
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg,"TestNeonTransposeMatrix3 Start");
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+	sprintf(szMsg, "Test input mCol:  <%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
+			mCol[0], mCol[1], mCol[2], mCol[3],
+			mCol[4], mCol[5], mCol[6], mCol[7],
+			mCol[8], mCol[9], mCol[10], mCol[11]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+
+	float SET_ALIGNMENT(64) fResult[12];
+    
+
+	sce::PhysicsEffects::PfxPerfCounter pc;
+	double dTimeSpan, dRefTimeSpan;; 
+	unsigned int uiNumTries = 10000000;
+	unsigned int i;
+
+// profile scalar TransposeMatrix3Scalar with direct memory return, c++ version
+	fResult[0] = 0.0f;
+	pc.countBegin("");
+	for(i = 0; i < uiNumTries; i++)
+	{
+		TransposeMatrix3Scalar(mCol, fResult);
+	}
+	pc.countEnd();
+	dTimeSpan = pc.getCountTime(0);
+	pc.resetCount();
+	dRefTimeSpan = dTimeSpan;
+	sprintf(szMsg, "Time to do %i calls for TransposeMatrix3Scalar: %f secs, speedup: %5.2f, result value=<%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
+					uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2], fResult[3],
+																   fResult[4], fResult[5], fResult[6], fResult[7],
+																   fResult[8], fResult[9], fResult[10], fResult[11]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+
+
+// profile NEON TransposeMatrix3Neon with direct memory return, assembly version
+	fResult[0] = 0.0f;
+	pc.countBegin("");
+	for(i = 0; i < uiNumTries; i++)
+	{
+		TransposeMatrix3Neon(mCol, fResult);
+	}
+	pc.countEnd();
+	dTimeSpan = pc.getCountTime(0);
+	pc.resetCount();
+	sprintf(szMsg, "Time to do %i calls for TransposeMatrix3Neon: %f secs, speedup: %5.2f, result value=<%f,%f,%f,%f> <%f,%f,%f,%f> <%f,%f,%f,%f>",
+					uiNumTries, dTimeSpan, dRefTimeSpan/dTimeSpan, fResult[0], fResult[1], fResult[2], fResult[3],
+																   fResult[4], fResult[5], fResult[6], fResult[7],
+																   fResult[8], fResult[9], fResult[10], fResult[11]);
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+
+
+	sprintf(szMsg,"TestNeonTransposeMatrix3 End");
+	__android_log_write(ANDROID_LOG_VERBOSE,"PHYSICS TIMING STUDY", szMsg);
+}