.text
    .type gemm_neon_16_6_1, %function
    .global gemm_neon_16_6_1
    /*
     * gemm_neon_16_6_1 -- single-precision NEON micro-kernel (AArch64, GNU syntax).
     *
     * Computes the rank-1 update C += A * B with the shapes
     * (16x6) += (16x1) * (1x6). All matrices are column-major floats.
     *
     * Leading dimensions are given in elements (floats); they are converted
     * to byte strides inside the kernel.
     *
     * @param X0 Pointer to column-major A (16 contiguous floats are read).
     * @param X1 Pointer to column-major B (one float per column is read).
     * @param X2 Pointer to column-major C (16 floats per column, read and written).
     * @param X3 Leading dimension of A (unused).
     * @param X4 Leading dimension of B.
     * @param X5 Leading dimension of C.
     * @param X6 Batch-reduce stride between A matrices (unused).
     * @param X7 Batch-reduce stride between B matrices (unused).
     *
     * Register roles:
     *   v0-v23   accumulator; column j of C lives in v(4j)..v(4j+3)
     *   v24-v27  the 16 values of A
     *   v28-v31  scalars of B (lane 0), used round-robin
     *   x1       walking pointer into B
     *   x9 / x2  walking pointer into C for the load / store pass
     *
     * Clobbers: x1, x2, x4, x5, x9, v0-v7, v16-v31 and the upper 64 bits
     *           of v8-v15. The lower 64 bits of v8-v15 (d8-d15) are saved
     *           and restored.
     * Stack:    80 bytes (frame record + d8-d15).
     */
gemm_neon_16_6_1:
    /*
     * Prologue: PCS
     */
    // allocate the whole 80-byte frame at once and store the frame record
    // (frame pointer, link register) at its base
    stp x29, x30, [sp, #-80]!
    // update frame pointer to current stack pointer
    mov x29, sp

    // v8-v15 are used as accumulators, so their callee-saved lower halves
    // must be preserved; no callee-saved general-purpose register is written
    stp  d8,  d9, [sp, #16]
    stp d10, d11, [sp, #32]
    stp d12, d13, [sp, #48]
    stp d14, d15, [sp, #64]

    // convert leading dimensions from elements to bytes (4 bytes per float)
    lsl x4, x4, #2 // stride of B
    lsl x5, x5, #2 // stride of C

    /*
     * Part 1:
     * Load 16*6 accumulator.
     * x9 walks over the columns of C; x2 keeps the base for the store pass.
     */
    mov x9, x2
    ld1 { v0.4s,  v1.4s,  v2.4s,  v3.4s}, [x9], x5
    ld1 { v4.4s,  v5.4s,  v6.4s,  v7.4s}, [x9], x5
    ld1 { v8.4s,  v9.4s, v10.4s, v11.4s}, [x9], x5
    ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9], x5
    ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x9], x5
    ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x9]

    /*
     * Part 2:
     * Stream A and B.
     * Execute fused-multiply-adds (FMAs).
     */
    // load 16 values of A
    ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x0]

    // load first value of B
    // each value of B is multiplied by the 16 values of A
    ldr s28, [x1]
    add x1, x1, x4

    // column 0 of C
    fmla  v0.4s, v24.4s, v28.s[0]
    fmla  v1.4s, v25.4s, v28.s[0]
    fmla  v2.4s, v26.4s, v28.s[0]
    fmla  v3.4s, v27.4s, v28.s[0]

    // load second value of B
    ldr s29, [x1]
    add x1, x1, x4

    // column 1 of C
    fmla  v4.4s, v24.4s, v29.s[0]
    fmla  v5.4s, v25.4s, v29.s[0]
    fmla  v6.4s, v26.4s, v29.s[0]
    fmla  v7.4s, v27.4s, v29.s[0]

    // load third value of B
    ldr s30, [x1]
    add x1, x1, x4

    // column 2 of C
    fmla  v8.4s, v24.4s, v30.s[0]
    fmla  v9.4s, v25.4s, v30.s[0]
    fmla v10.4s, v26.4s, v30.s[0]
    fmla v11.4s, v27.4s, v30.s[0]

    // load fourth value of B
    ldr s31, [x1]
    add x1, x1, x4

    // column 3 of C
    fmla v12.4s, v24.4s, v31.s[0]
    fmla v13.4s, v25.4s, v31.s[0]
    fmla v14.4s, v26.4s, v31.s[0]
    fmla v15.4s, v27.4s, v31.s[0]

    // load fifth value of B
    ldr s28, [x1]
    add x1, x1, x4

    // column 4 of C
    fmla v16.4s, v24.4s, v28.s[0]
    fmla v17.4s, v25.4s, v28.s[0]
    fmla v18.4s, v26.4s, v28.s[0]
    fmla v19.4s, v27.4s, v28.s[0]

    // load sixth (last) value of B; the pointer is not advanced any further
    ldr s29, [x1]

    // column 5 of C
    fmla v20.4s, v24.4s, v29.s[0]
    fmla v21.4s, v25.4s, v29.s[0]
    fmla v22.4s, v26.4s, v29.s[0]
    fmla v23.4s, v27.4s, v29.s[0]

    /*
     * Part 3:
     * Store 16*6 accumulator.
     * x2 still holds the base address of C and walks over its columns.
     */
    st1 { v0.4s,  v1.4s,  v2.4s,  v3.4s}, [x2], x5
    st1 { v4.4s,  v5.4s,  v6.4s,  v7.4s}, [x2], x5
    st1 { v8.4s,  v9.4s, v10.4s, v11.4s}, [x2], x5
    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x2], x5
    st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x2], x5
    st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x2]

    /*
     * Epilogue: PCS
     */
    // restore callee-saved SIMD registers
    ldp d14, d15, [sp, #64]
    ldp d12, d13, [sp, #48]
    ldp d10, d11, [sp, #32]
    ldp  d8,  d9, [sp, #16]

    // restore frame pointer and link register, release the frame
    ldp x29, x30, [sp], #80

    ret
    .size gemm_neon_16_6_1, (. - gemm_neon_16_6_1)