/*
 * thr_neon_fmla_4s -- throughput microbenchmark for FMLA (vector), .4s form.
 *
 * Target / syntax: AArch64, GNU assembler, AAPCS64 calling convention.
 *
 * In:    x0 = number of loop repetitions.
 *             A count of 0 skips the timed loop entirely (without the guard
 *             the counter would wrap to 2^64-1 and the call would
 *             effectively never return).
 * Out:   x0 = floating-point operations executed per loop repetition:
 *             UNROLL * 32 FMLA instructions * 8 flops
 *             (4 single-precision lanes, one multiply + one add per lane).
 * Clobb: v0-v7, v16-v31 (caller-saved). The low halves of v8-v15 are
 *        callee-saved and are spilled/restored here. No general-purpose
 *        register other than x0 is written.
 * Stack: one 80-byte frame (frame record + d8-d15); sp stays 16-byte aligned.
 */
        .text
        .type   thr_neon_fmla_4s, %function
        .global thr_neon_fmla_4s

        /* Benchmark shape; the return value is derived from these. */
        .equ    THR_NEON_FMLA_4S_UNROLL,         100    // copies of the 32-instruction group per repetition
        .equ    THR_NEON_FMLA_4S_INSTS_PER_GROUP, 32    // one FMLA per destination register v0..v31
        .equ    THR_NEON_FMLA_4S_FLOPS_PER_INST,   8    // 4 lanes * (mul + add)

thr_neon_fmla_4s:
        // Frame record plus room for d8-d15 in a single stack adjustment.
        stp     x29, x30, [sp, #-80]!
        mov     x29, sp

        // v8-v15 are written below; AAPCS64 requires their low 64 bits
        // (d8-d15) to be preserved across the call.
        stp     d8,  d9,  [sp, #16]
        stp     d10, d11, [sp, #32]
        stp     d12, d13, [sp, #48]
        stp     d14, d15, [sp, #64]

        // Give every SIMD register a defined starting value (1.0 in each lane).
        fmov    v0.4s,   #1.0
        mov     v1.16b,  v0.16b
        mov     v2.16b,  v0.16b
        mov     v3.16b,  v0.16b
        mov     v4.16b,  v0.16b
        mov     v5.16b,  v0.16b
        mov     v6.16b,  v0.16b
        mov     v7.16b,  v0.16b
        mov     v8.16b,  v0.16b
        mov     v9.16b,  v0.16b
        mov     v10.16b, v0.16b
        mov     v11.16b, v0.16b
        mov     v12.16b, v0.16b
        mov     v13.16b, v0.16b
        mov     v14.16b, v0.16b
        mov     v15.16b, v0.16b
        mov     v16.16b, v0.16b
        mov     v17.16b, v0.16b
        mov     v18.16b, v0.16b
        mov     v19.16b, v0.16b
        mov     v20.16b, v0.16b
        mov     v21.16b, v0.16b
        mov     v22.16b, v0.16b
        mov     v23.16b, v0.16b
        mov     v24.16b, v0.16b
        mov     v25.16b, v0.16b
        mov     v26.16b, v0.16b
        mov     v27.16b, v0.16b
        mov     v28.16b, v0.16b
        mov     v29.16b, v0.16b
        mov     v30.16b, v0.16b
        mov     v31.16b, v0.16b

        // Zero repetitions requested: nothing to measure.
        cbz     x0, .Lthr_neon_fmla_4s_done

.Lthr_neon_fmla_4s_loop:
        sub     x0, x0, #1                      // x0 = repetitions still to run after this one

        // 32 FMLAs, one per destination register. Every register that is
        // read was last written at least 16 instructions earlier, and each
        // accumulator is rewritten only once per group, so consecutive
        // instructions are independent of each other.
        .rept   THR_NEON_FMLA_4S_UNROLL
        fmla    v0.4s,  v8.4s,  v16.4s
        fmla    v1.4s,  v9.4s,  v17.4s
        fmla    v2.4s,  v10.4s, v18.4s
        fmla    v3.4s,  v11.4s, v19.4s
        fmla    v4.4s,  v12.4s, v20.4s
        fmla    v5.4s,  v13.4s, v21.4s
        fmla    v6.4s,  v14.4s, v22.4s
        fmla    v7.4s,  v15.4s, v23.4s

        fmla    v8.4s,  v16.4s, v24.4s
        fmla    v9.4s,  v17.4s, v25.4s
        fmla    v10.4s, v18.4s, v26.4s
        fmla    v11.4s, v19.4s, v27.4s
        fmla    v12.4s, v20.4s, v28.4s
        fmla    v13.4s, v21.4s, v29.4s
        fmla    v14.4s, v22.4s, v30.4s
        fmla    v15.4s, v23.4s, v31.4s

        fmla    v16.4s, v24.4s, v0.4s
        fmla    v17.4s, v25.4s, v1.4s
        fmla    v18.4s, v26.4s, v2.4s
        fmla    v19.4s, v27.4s, v3.4s
        fmla    v20.4s, v28.4s, v4.4s
        fmla    v21.4s, v29.4s, v5.4s
        fmla    v22.4s, v30.4s, v6.4s
        fmla    v23.4s, v31.4s, v7.4s

        fmla    v24.4s, v0.4s,  v8.4s
        fmla    v25.4s, v1.4s,  v9.4s
        fmla    v26.4s, v2.4s,  v10.4s
        fmla    v27.4s, v3.4s,  v11.4s
        fmla    v28.4s, v4.4s,  v12.4s
        fmla    v29.4s, v5.4s,  v13.4s
        fmla    v30.4s, v6.4s,  v14.4s
        fmla    v31.4s, v7.4s,  v15.4s
        .endr

        cbnz    x0, .Lthr_neon_fmla_4s_loop

.Lthr_neon_fmla_4s_done:
        // Restore the callee-saved SIMD halves, then tear down the frame.
        ldp     d14, d15, [sp, #64]
        ldp     d12, d13, [sp, #48]
        ldp     d10, d11, [sp, #32]
        ldp     d8,  d9,  [sp, #16]
        ldp     x29, x30, [sp], #80

        // Return value: flops per loop repetition (independent of the input count).
        mov     x0, #(THR_NEON_FMLA_4S_UNROLL * THR_NEON_FMLA_4S_INSTS_PER_GROUP * THR_NEON_FMLA_4S_FLOPS_PER_INST)
        ret

        .size   thr_neon_fmla_4s, (. - thr_neon_fmla_4s)