.text .type gemm_neon_16_6_1, %function .global gemm_neon_16_6_1 /* * Performs the matrix-multiplication C+=A*B * with the shapes (16x6) = (16x1) * (1x6). * The input-data is of type float. * * @param X0 Pointer to column-major A. * @param X1 Pointer to column-major B. * @param X2 Pointer to column-major C. * @param X3 Leading dimension of A (unused). * @param X4 Leading dimension of B. * @param X5 Leading dimension of C. * @param X6 Batch-reduce stride between A matrices (unused). * @param X7 Batch-reduce stride between B matrices (unused). */ gemm_neon_16_6_1: /* * Prologue: PCS */ // save frame pointer and link register stp fp, lr, [sp, #-16]! // update frame pointer to current stack pointer mov fp, sp // save callee-saved registers stp x19, x20, [sp, #-16]! stp x21, x22, [sp, #-16]! stp x23, x24, [sp, #-16]! stp x25, x26, [sp, #-16]! stp x27, x28, [sp, #-16]! stp d8, d9, [sp, #-16]! stp d10, d11, [sp, #-16]! stp d12, d13, [sp, #-16]! stp d14, d15, [sp, #-16]! // hold addresses to A, B, C in work registers mov x7, x0 // A mov x8, x1 // B mov x9, x2 // C // convert strides to bytes lsl x3, x3, #2 // stride of A (unused) lsl x4, x4, #2 // stride of B lsl x5, x5, #2 // stride of C /* * Part 1: * Load 16*6 accumulator. */ ld1 { v0.4s, v1.4s, v2.4s, v3.4s}, [x9] add x9, x9, x5 ld1 { v4.4s, v5.4s, v6.4s, v7.4s}, [x9] add x9, x9, x5 ld1 { v8.4s, v9.4s, v10.4s, v11.4s}, [x9] add x9, x9, x5 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] add x9, x9, x5 ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x9] add x9, x9, x5 ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x9] mov x9, x2 /* * Part 2: * Stream A and B. * Execute fused-multiply-adds (FMAs). */ // load 16 values of A ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x7] // load first value of B // each value is mutliplied by 16 values of A ldr s28, [x8] add x8, x8, x4 // perform the fmas fmla v0.4s, v24.4s, v28.s[0] fmla v1.4s, v25.4s, v28.s[0] fmla v2.4s, v26.4s, v28.s[0] fmla v3.4s, v27.4s, v28.s[0] // load second value of B ldr s29, [x8] add x8, x8, x4 // perform the fmas fmla v4.4s, v24.4s, v29.s[0] fmla v5.4s, v25.4s, v29.s[0] fmla v6.4s, v26.4s, v29.s[0] fmla v7.4s, v27.4s, v29.s[0] // load third value of B ldr s30, [x8] add x8, x8, x4 // perform the fmas fmla v8.4s, v24.4s, v30.s[0] fmla v9.4s, v25.4s, v30.s[0] fmla v10.4s, v26.4s, v30.s[0] fmla v11.4s, v27.4s, v30.s[0] // load fourth value of B ldr s31, [x8] add x8, x8, x4 // perform the fmas fmla v12.4s, v24.4s, v31.s[0] fmla v13.4s, v25.4s, v31.s[0] fmla v14.4s, v26.4s, v31.s[0] fmla v15.4s, v27.4s, v31.s[0] // load fifth value of B ldr s28, [x8] add x8, x8, x4 // perform the fmas fmla v16.4s, v24.4s, v28.s[0] fmla v17.4s, v25.4s, v28.s[0] fmla v18.4s, v26.4s, v28.s[0] fmla v19.4s, v27.4s, v28.s[0] // load sixth value of B ldr s29, [x8] add x8, x8, x4 // perform the fmas fmla v20.4s, v24.4s, v29.s[0] fmla v21.4s, v25.4s, v29.s[0] fmla v22.4s, v26.4s, v29.s[0] fmla v23.4s, v27.4s, v29.s[0] /* * Part 3: * Store 16*6 accumulator. */ st1 { v0.4s, v1.4s, v2.4s, v3.4s}, [x9] add x9, x9, x5 st1 { v4.4s, v5.4s, v6.4s, v7.4s}, [x9] add x9, x9, x5 st1 { v8.4s, v9.4s, v10.4s, v11.4s}, [x9] add x9, x9, x5 st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9] add x9, x9, x5 st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x9] add x9, x9, x5 st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x9] /* * Epilogue: PCS */ // restore callee-saved registers ldp d14, d15, [sp], #16 ldp d12, d13, [sp], #16 ldp d10, d11, [sp], #16 ldp d8, d9, [sp], #16 ldp x27, x28, [sp], #16 ldp x25, x26, [sp], #16 ldp x23, x24, [sp], #16 ldp x21, x22, [sp], #16 ldp x19, x20, [sp], #16 // restore frame pointer and link register ldp fp, lr, [sp], #16 ret .size gemm_neon_16_6_1, (. - gemm_neon_16_6_1)