// AArch64 ASIMD, 16 x 12 x 4 matrix multiplication (16,12,4)
.text
.align 4
.type gemm_asm_asimd_16_12_4, %function
.global gemm_asm_asimd_16_12_4
gemm_asm_asimd_16_12_4:
// store
// we do not use x19-x30, so there is nothing to save here
// in addition, the registers v8-v15 have been remapped to v24-v31, so the vector registers do not need to be saved either
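// (per the AAPCS64, x19-x28 and the low 64 bits of v8-v15 are callee-saved;
//  v0-v7 and v16-v31 may be clobbered freely, hence the remapping)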
// load all of C
ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x2]
// use a new register so that we do not have to reset x2 afterwards (saves one instruction)
add x5,x2,#16*4
ld1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x5]
add x5,x5,#16*4
ld1 { v24.4s, v25.4s, v26.4s, v27.4s }, [x5]
add x5,x5,#16*4
ld1 { v28.4s, v29.4s, v30.4s, v31.4s }, [x5]
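// (v0-v7 and v24-v31 now hold a contiguous 16x4 block of C as accumulators;
//  each ld1 above pulls in 16 consecutive floats, presumably one column of C
//  stored column-major with leading dimension 16, and the #16*4 = 64-byte
//  adds step x5 to the next column)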
// 3 iterations
mov x3, #3
schleife:
// load B
ld1 { v16.4s }, [x1]
add x1, x1, #12*4
ld1 { v17.4s }, [x1]
add x1, x1, #12*4
ld1 { v18.4s }, [x1]
add x1, x1, #12*4
ld1 { v19.4s }, [x1]
// rewind, then advance one block
sub x1, x1, #((3*12-4)*4)
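// (v16-v19 each hold 4 consecutive floats of B, the loads being spaced 12
//  floats apart, presumably one 4-element chunk from each of four columns of
//  a B stored with leading dimension 12; the three "add #12*4" above moved x1
//  forward by 3*12 floats, so subtracting (3*12-4)*4 bytes leaves a net
//  advance of 4 floats per loop iteration)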
// load A, first column for the first 4 columns of computations
ld1 { v20.4s, v21.4s, v22.4s, v23.4s }, [x0]
add x0,x0,#64
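// (16 consecutive floats of A, presumably one full column; x0 then advances
//  64 bytes to the next column, as it does before each of the blocks below)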
// compute
fmla v0.4s, v20.4s, v16.s[0]
fmla v1.4s, v21.4s, v16.s[0]
fmla v2.4s, v22.4s, v16.s[0]
fmla v3.4s, v23.4s, v16.s[0]
fmla v4.4s, v20.4s, v17.s[0]
fmla v5.4s, v21.4s, v17.s[0]
fmla v6.4s, v22.4s, v17.s[0]
fmla v7.4s, v23.4s, v17.s[0]
fmla v24.4s, v20.4s, v18.s[0]
fmla v25.4s, v21.4s, v18.s[0]
fmla v26.4s, v22.4s, v18.s[0]
fmla v27.4s, v23.4s, v18.s[0]
fmla v28.4s, v20.4s, v19.s[0]
fmla v29.4s, v21.4s, v19.s[0]
fmla v30.4s, v22.4s, v19.s[0]
fmla v31.4s, v23.4s, v19.s[0]
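// (each block of 16 fmla-by-element instructions is one rank-1 update: the 16
//  A values in v20-v23, scaled by one broadcast lane of v16-v19, are
//  accumulated into the four C column groups v0-v3, v4-v7, v24-v27, v28-v31;
//  the lane index [0]-[3] selects which of the four loaded B values is used)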
ld1 { v20.4s, v21.4s, v22.4s, v23.4s }, [x0]
add x0,x0,#64
fmla v0.4s, v20.4s, v16.s[1]
fmla v1.4s, v21.4s, v16.s[1]
fmla v2.4s, v22.4s, v16.s[1]
fmla v3.4s, v23.4s, v16.s[1]
fmla v4.4s, v20.4s, v17.s[1]
fmla v5.4s, v21.4s, v17.s[1]
fmla v6.4s, v22.4s, v17.s[1]
fmla v7.4s, v23.4s, v17.s[1]
fmla v24.4s, v20.4s, v18.s[1]
fmla v25.4s, v21.4s, v18.s[1]
fmla v26.4s, v22.4s, v18.s[1]
fmla v27.4s, v23.4s, v18.s[1]
fmla v28.4s, v20.4s, v19.s[1]
fmla v29.4s, v21.4s, v19.s[1]
fmla v30.4s, v22.4s, v19.s[1]
fmla v31.4s, v23.4s, v19.s[1]
ld1 { v20.4s, v21.4s, v22.4s, v23.4s }, [x0]
add x0,x0,#64
fmla v0.4s, v20.4s, v16.s[2]
fmla v1.4s, v21.4s, v16.s[2]
fmla v2.4s, v22.4s, v16.s[2]
fmla v3.4s, v23.4s, v16.s[2]
fmla v4.4s, v20.4s, v17.s[2]
fmla v5.4s, v21.4s, v17.s[2]
fmla v6.4s, v22.4s, v17.s[2]
fmla v7.4s, v23.4s, v17.s[2]
fmla v24.4s, v20.4s, v18.s[2]
fmla v25.4s, v21.4s, v18.s[2]
fmla v26.4s, v22.4s, v18.s[2]
fmla v27.4s, v23.4s, v18.s[2]
fmla v28.4s, v20.4s, v19.s[2]
fmla v29.4s, v21.4s, v19.s[2]
fmla v30.4s, v22.4s, v19.s[2]
fmla v31.4s, v23.4s, v19.s[2]
ld1 { v20.4s, v21.4s, v22.4s, v23.4s }, [x0]
add x0,x0,#64
fmla v0.4s, v20.4s, v16.s[3]
fmla v1.4s, v21.4s, v16.s[3]
fmla v2.4s, v22.4s, v16.s[3]
fmla v3.4s, v23.4s, v16.s[3]
fmla v4.4s, v20.4s, v17.s[3]
fmla v5.4s, v21.4s, v17.s[3]
fmla v6.4s, v22.4s, v17.s[3]
fmla v7.4s, v23.4s, v17.s[3]
fmla v24.4s, v20.4s, v18.s[3]
fmla v25.4s, v21.4s, v18.s[3]
fmla v26.4s, v22.4s, v18.s[3]
fmla v27.4s, v23.4s, v18.s[3]
fmla v28.4s, v20.4s, v19.s[3]
fmla v29.4s, v21.4s, v19.s[3]
fmla v30.4s, v22.4s, v19.s[3]
fmla v31.4s, v23.4s, v19.s[3]
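// (one loop iteration thus performs four such rank-1 updates, consuming four
//  16-float blocks of A and all four lanes of v16-v19; with 3 iterations that
//  makes 12 updates in total)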
// x3--
sub x3,x3,#1
// repeat if x3 != 0
cbnz x3, schleife
// store C
st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x2]
add x2,x2,#64
st1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x2]
add x2,x2,#64
st1 { v24.4s, v25.4s, v26.4s, v27.4s }, [x2]
add x2,x2,#64
st1 { v28.4s, v29.4s, v30.4s, v31.4s }, [x2]
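// (the stores mirror the loads of C at the top of the function; x2 itself is
//  advanced here, since it is not needed again afterwards)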
// restore: we did not have to save anything, so there is nothing to restore
ret
.size gemm_asm_asimd_16_12_4, (. - gemm_asm_asimd_16_12_4)
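
For reference, here is a minimal C sketch of what the kernel appears to compute, based on the register usage above: a single-precision update C += A * B for a 16x12 matrix A, a 12x4 matrix B and a 16x4 result block C, all assumed column-major with leading dimensions 16, 12 and 16. The argument order (x0 = A, x1 = B, x2 = C) follows the AAPCS64; the exact dimensions, the storage layout and the reference/driver code are inferred for illustration, not stated in the listing.

#include <math.h>
#include <stdio.h>

/* kernel from the listing above; argument order inferred from x0/x1/x2 usage */
void gemm_asm_asimd_16_12_4(float const *a, float const *b, float *c);

/* reference for the assumed operation: C(16x4) += A(16x12) * B(12x4),
   column-major, leading dimensions 16 / 12 / 16 */
static void gemm_ref_16_12_4(float const *a, float const *b, float *c) {
    for (int n = 0; n < 4; n++)           /* columns of C and B   */
        for (int k = 0; k < 12; k++)      /* depth of the product */
            for (int m = 0; m < 16; m++)  /* rows of C and A      */
                c[m + n * 16] += a[m + k * 16] * b[k + n * 12];
}

int main(void) {
    float a[16 * 12], b[12 * 4], c_asm[16 * 4], c_ref[16 * 4];
    for (int i = 0; i < 16 * 12; i++) a[i] = (float)(i % 7) - 3.0f;
    for (int i = 0; i < 12 * 4;  i++) b[i] = (float)(i % 5) - 2.0f;
    for (int i = 0; i < 16 * 4;  i++) c_asm[i] = c_ref[i] = 1.0f;

    gemm_asm_asimd_16_12_4(a, b, c_asm);
    gemm_ref_16_12_4(a, b, c_ref);

    float max_err = 0.0f;
    for (int i = 0; i < 16 * 4; i++) {
        float e = fabsf(c_asm[i] - c_ref[i]);
        if (e > max_err) max_err = e;
    }
    printf("max abs error vs. reference: %g\n", max_err);
    return 0;
}

Assembling the listing together with this driver on an AArch64 machine and comparing the outputs would confirm or refute the assumed shapes; the reference function name and driver are made up for illustration only.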