120
19,4,4
// ---------------------------------------------------------------------------
// gemm_asm_asimd_19_4_4
//
// Single-precision update C += A * B written with ASIMD fmla instructions.
// A and C are each read as 4 columns of 19 floats (column stride 19*4
// bytes); B is read as 4 columns of 4 floats (column stride 4*4 bytes).
//
// In:     x0 = A, x1 = B, x2 = C
//         (presumably called from C as
//          void gemm_asm_asimd_19_4_4(const float *a, const float *b,
//                                     float *c); confirm against the caller)
// Out:    the 19x4 block at C is overwritten with the updated values
// Clobb:  x0, x1, x2, x5, v0-v7, v16-v31
// Stack:  unused. Neither x19-x30 nor v8-v15 are touched (the accumulators
//         that would have lived in v8-v15 sit in v24-v31 instead), so there
//         is nothing to save on entry or restore on exit.
//
// Layout of the work:
//   1. rows 0..15 of every column, four 4s vectors per column
//   2. rows 16..18 of every column, one 2s vector plus one scalar per column
// ---------------------------------------------------------------------------
        .text
        .p2align 4                              // 16-byte alignment of the entry point

        .equ    GEMM19_COL,  19*4               // bytes between two columns of A or C
        .equ    GEMM19_BCOL, 4*4                // bytes between two columns of B
        .equ    GEMM19_BULK, 16*4               // bytes per column covered by step 1
        .equ    GEMM19_PAIR, 2*4                // bytes of the 2s part of the tail

        // One k-step of the tail (rows 16..18).
        //   apair = vector register whose low 2 lanes hold rows 16..17 of A column k
        //   alast = scalar register holding row 18 of A column k
        //   lane  = k, selects the B element inside v16..v19
        // Accumulators: d0/s1, d2/s3, d4/s5, d6/s7 for C columns 0..3.
        .macro  gemm_asm_asimd_19_4_4_tail apair, alast, lane
        fmla    v0.2s, \apair\().2s, v16.s[\lane]
        fmla    s1, \alast, v16.s[\lane]
        fmla    v2.2s, \apair\().2s, v17.s[\lane]
        fmla    s3, \alast, v17.s[\lane]
        fmla    v4.2s, \apair\().2s, v18.s[\lane]
        fmla    s5, \alast, v18.s[\lane]
        fmla    v6.2s, \apair\().2s, v19.s[\lane]
        fmla    s7, \alast, v19.s[\lane]
        .endm

        .type   gemm_asm_asimd_19_4_4, %function
        .global gemm_asm_asimd_19_4_4
gemm_asm_asimd_19_4_4:
        // ---- step 1: rows 0..15 --------------------------------------------

        // Load rows 0..15 of all four C columns into the accumulators.
        // x5 walks the columns so that x2 still points at C for the store
        // below (saves one instruction that would otherwise rewind x2).
        ld1     {v0.4s, v1.4s, v2.4s, v3.4s}, [x2]
        add     x5, x2, #GEMM19_COL
        ld1     {v4.4s, v5.4s, v6.4s, v7.4s}, [x5]
        add     x5, x5, #GEMM19_COL
        ld1     {v24.4s, v25.4s, v26.4s, v27.4s}, [x5]
        add     x5, x5, #GEMM19_COL
        ld1     {v28.4s, v29.4s, v30.4s, v31.4s}, [x5]

        // Load B completely: column j ends up in v(16+j), its lane k is the
        // factor that pairs A column k with C column j.
        ld1     {v16.4s}, [x1]
        add     x1, x1, #GEMM19_BCOL
        ld1     {v17.4s}, [x1]
        add     x1, x1, #GEMM19_BCOL
        ld1     {v18.4s}, [x1]
        add     x1, x1, #GEMM19_BCOL
        ld1     {v19.4s}, [x1]

        // For k = 0..3: load rows 0..15 of A column k into v20-v23, advance
        // x0 to the next A column, then accumulate into all four C columns.
        .irp    k, 0, 1, 2, 3
        ld1     {v20.4s, v21.4s, v22.4s, v23.4s}, [x0]
        add     x0, x0, #GEMM19_COL
        fmla    v0.4s, v20.4s, v16.s[\k]
        fmla    v1.4s, v21.4s, v16.s[\k]
        fmla    v2.4s, v22.4s, v16.s[\k]
        fmla    v3.4s, v23.4s, v16.s[\k]
        fmla    v4.4s, v20.4s, v17.s[\k]
        fmla    v5.4s, v21.4s, v17.s[\k]
        fmla    v6.4s, v22.4s, v17.s[\k]
        fmla    v7.4s, v23.4s, v17.s[\k]
        fmla    v24.4s, v20.4s, v18.s[\k]
        fmla    v25.4s, v21.4s, v18.s[\k]
        fmla    v26.4s, v22.4s, v18.s[\k]
        fmla    v27.4s, v23.4s, v18.s[\k]
        fmla    v28.4s, v20.4s, v19.s[\k]
        fmla    v29.4s, v21.4s, v19.s[\k]
        fmla    v30.4s, v22.4s, v19.s[\k]
        fmla    v31.4s, v23.4s, v19.s[\k]
        .endr

        // Write rows 0..15 of all four C columns back; x2 ends on column 3.
        st1     {v0.4s, v1.4s, v2.4s, v3.4s}, [x2]
        add     x2, x2, #GEMM19_COL
        st1     {v4.4s, v5.4s, v6.4s, v7.4s}, [x2]
        add     x2, x2, #GEMM19_COL
        st1     {v24.4s, v25.4s, v26.4s, v27.4s}, [x2]
        add     x2, x2, #GEMM19_COL
        st1     {v28.4s, v29.4s, v30.4s, v31.4s}, [x2]

        // ---- step 2: rows 16..18 -------------------------------------------

        // Move x2 from column 3, row 0 back to column 0, row 16 and load the
        // remaining three values of every C column (d = rows 16..17,
        // s = row 18). The two post-index steps add up to one column stride.
        sub     x2, x2, #(3*GEMM19_COL - GEMM19_BULK)
        ldr     d0, [x2], #GEMM19_PAIR
        ldr     s1, [x2], #(GEMM19_COL - GEMM19_PAIR)
        ldr     d2, [x2], #GEMM19_PAIR
        ldr     s3, [x2], #(GEMM19_COL - GEMM19_PAIR)
        ldr     d4, [x2], #GEMM19_PAIR
        ldr     s5, [x2], #(GEMM19_COL - GEMM19_PAIR)
        ldr     d6, [x2], #GEMM19_PAIR
        ldr     s7, [x2], #(GEMM19_COL - GEMM19_PAIR)

        // x0 sits four columns past A after step 1; move it to column 0,
        // row 16 and load the remaining three values of every A column.
        // v24-v27 are free again because step 1 has already stored them.
        sub     x0, x0, #(4*GEMM19_COL - GEMM19_BULK)
        ldr     d20, [x0], #GEMM19_PAIR
        ldr     s21, [x0], #(GEMM19_COL - GEMM19_PAIR)
        ldr     d22, [x0], #GEMM19_PAIR
        ldr     s23, [x0], #(GEMM19_COL - GEMM19_PAIR)
        ldr     d24, [x0], #GEMM19_PAIR
        ldr     s25, [x0], #(GEMM19_COL - GEMM19_PAIR)
        ldr     d26, [x0], #GEMM19_PAIR
        ldr     s27, [x0], #(GEMM19_COL - GEMM19_PAIR)

        // Accumulate the tail, one A column (k-step) at a time.
        gemm_asm_asimd_19_4_4_tail v20, s21, 0
        gemm_asm_asimd_19_4_4_tail v22, s23, 1
        gemm_asm_asimd_19_4_4_tail v24, s25, 2
        gemm_asm_asimd_19_4_4_tail v26, s27, 3

        // The tail loads left x2 four columns further on; rewind it to
        // column 0, row 16 and write the tail back. This rewind could be
        // avoided by using pre-indexed instead of post-indexed addressing,
        // but then the offsets must be watched so that they stay encodable.
        sub     x2, x2, #(4*GEMM19_COL)
        str     d0, [x2], #GEMM19_PAIR
        str     s1, [x2], #(GEMM19_COL - GEMM19_PAIR)
        str     d2, [x2], #GEMM19_PAIR
        str     s3, [x2], #(GEMM19_COL - GEMM19_PAIR)
        str     d4, [x2], #GEMM19_PAIR
        str     s5, [x2], #(GEMM19_COL - GEMM19_PAIR)
        str     d6, [x2], #GEMM19_PAIR
        str     s7, [x2]

        // Nothing was saved on entry, so there is nothing to restore.
        ret
        .size   gemm_asm_asimd_19_4_4, (. - gemm_asm_asimd_19_4_4)

        .purgem gemm_asm_asimd_19_4_4_tail      // keep the helper macro local to this kernel
AARCH64, 16 x 12 x 4 Matrix-Multiplication
Running Code:
Source Code: