1
32,32,32
//-----------------------------------------------------------------------
// gemm_asm_asimd_32_32_32
//
// Single-precision matrix kernel C += A * B with M = N = K = 32.
// All three matrices are addressed column-major with a column stride of
// 32 floats (128 bytes).
//
// Target: AArch64, GNU as syntax, ASIMD.
//
// In:    x0 = A, x1 = B, x2 = C
//        (presumably the first three pointer arguments of the C caller;
//         the prototype is not visible here - confirm against the caller)
// Out:   C updated in memory
// Clobb: x0-x6 (x0-x2 are advanced in place), v0-v7, v16-v31
//
// No stack frame and no save/restore code: x19-x30 are never touched,
// and the accumulators that would naturally sit in v8-v15 are placed in
// v24-v31 instead, so no callee-saved register is written.
//
// Blocking: the kernel works on a 16x4 tile of C (16 vector accumulators)
// and walks K in steps of 4.
//   v0-v3   : tile column 0, rows 0..15
//   v4-v7   : tile column 1
//   v24-v27 : tile column 2
//   v28-v31 : tile column 3
//   v16-v19 : four consecutive K entries of B for tile columns 0..3
//   v20-v23 : 16 rows of the current column of A
//-----------------------------------------------------------------------

        .equ    GEMM32_COL_BYTES,  32*4     // one matrix column: 32 floats
        .equ    GEMM32_MBLK_BYTES, 16*4     // 16 rows handled per m iteration
        .equ    GEMM32_KBLK_BYTES, 4*4      // 4 k values handled per k iteration

        .equ    GEMM32_N_ITERS, 8           // 32 columns / 4 per iteration
        .equ    GEMM32_M_ITERS, 2           // 32 rows / 16 per iteration
        .equ    GEMM32_K_ITERS, 8           // 32 k values / 4 per iteration

        // B was stepped 3 columns to the right while loading; go back to
        // the first tile column but 4 rows (k values) further down.
        .equ    GEMM32_B_REWIND, 3*GEMM32_COL_BYTES - GEMM32_KBLK_BYTES
        // C was stepped 3 columns to the right while storing; go back to
        // the first tile column but 16 rows further down.
        .equ    GEMM32_C_REWIND, 3*GEMM32_COL_BYTES - GEMM32_MBLK_BYTES
        // A was stepped through all 32 columns by the k loop; go back to
        // column 0 but 16 rows further down.
        .equ    GEMM32_A_REWIND, 32*GEMM32_COL_BYTES - GEMM32_MBLK_BYTES

        // One rank-1 update of the 16x4 accumulator tile.
        // Loads 16 rows of the current A column into v20-v23, steps x0 to
        // the next A column, then accumulates that column scaled by
        // element \lane of v16..v19 into tile columns 0..3.
        .macro  gemm32_rank1_update lane
        ld1     { v20.4s, v21.4s, v22.4s, v23.4s }, [x0]
        add     x0, x0, #GEMM32_COL_BYTES
        fmla    v0.4s,  v20.4s, v16.s[\lane]
        fmla    v1.4s,  v21.4s, v16.s[\lane]
        fmla    v2.4s,  v22.4s, v16.s[\lane]
        fmla    v3.4s,  v23.4s, v16.s[\lane]
        fmla    v4.4s,  v20.4s, v17.s[\lane]
        fmla    v5.4s,  v21.4s, v17.s[\lane]
        fmla    v6.4s,  v22.4s, v17.s[\lane]
        fmla    v7.4s,  v23.4s, v17.s[\lane]
        fmla    v24.4s, v20.4s, v18.s[\lane]
        fmla    v25.4s, v21.4s, v18.s[\lane]
        fmla    v26.4s, v22.4s, v18.s[\lane]
        fmla    v27.4s, v23.4s, v18.s[\lane]
        fmla    v28.4s, v20.4s, v19.s[\lane]
        fmla    v29.4s, v21.4s, v19.s[\lane]
        fmla    v30.4s, v22.4s, v19.s[\lane]
        fmla    v31.4s, v23.4s, v19.s[\lane]
        .endm

        .text
        .align  4
        .type   gemm_asm_asimd_32_32_32, %function
        .global gemm_asm_asimd_32_32_32
gemm_asm_asimd_32_32_32:
        // x5 = remaining n iterations (4 columns of C each)
        mov     x5, #GEMM32_N_ITERS

.Lgemm32_n_loop:
        // x4 = remaining m iterations (16 rows of C each)
        mov     x4, #GEMM32_M_ITERS

.Lgemm32_m_loop:
        // Load the whole 16x4 tile of C. The column walk uses x6 as a
        // scratch pointer so that x2 still points at the tile origin and
        // needs no rewind before the k loop.
        ld1     { v0.4s, v1.4s, v2.4s, v3.4s }, [x2]
        add     x6, x2, #GEMM32_COL_BYTES
        ld1     { v4.4s, v5.4s, v6.4s, v7.4s }, [x6]
        add     x6, x6, #GEMM32_COL_BYTES
        ld1     { v24.4s, v25.4s, v26.4s, v27.4s }, [x6]
        add     x6, x6, #GEMM32_COL_BYTES
        ld1     { v28.4s, v29.4s, v30.4s, v31.4s }, [x6]

        // x3 = remaining k iterations (4 k values each)
        mov     x3, #GEMM32_K_ITERS

.Lgemm32_k_loop:
        // Load B: 4 consecutive k values from each of the 4 tile columns.
        ld1     { v16.4s }, [x1]
        add     x1, x1, #GEMM32_COL_BYTES
        ld1     { v17.4s }, [x1]
        add     x1, x1, #GEMM32_COL_BYTES
        ld1     { v18.4s }, [x1]
        add     x1, x1, #GEMM32_COL_BYTES
        ld1     { v19.4s }, [x1]
        // Back to tile column 0, advanced by one k block.
        sub     x1, x1, #GEMM32_B_REWIND

        // Four rank-1 updates, one per k value held in v16..v19.
        gemm32_rank1_update 0
        gemm32_rank1_update 1
        gemm32_rank1_update 2
        gemm32_rank1_update 3

        sub     x3, x3, #1
        cbnz    x3, .Lgemm32_k_loop

        // Write the finished 16x4 tile back to C.
        st1     { v0.4s, v1.4s, v2.4s, v3.4s }, [x2]
        add     x2, x2, #GEMM32_COL_BYTES
        st1     { v4.4s, v5.4s, v6.4s, v7.4s }, [x2]
        add     x2, x2, #GEMM32_COL_BYTES
        st1     { v24.4s, v25.4s, v26.4s, v27.4s }, [x2]
        add     x2, x2, #GEMM32_COL_BYTES
        st1     { v28.4s, v29.4s, v30.4s, v31.4s }, [x2]

        // Set up the next m block: C and A move 16 rows down within the
        // same columns, B returns to the top (k = 0) of its 4 columns.
        sub     x2, x2, #GEMM32_C_REWIND
        sub     x0, x0, #GEMM32_A_REWIND
        sub     x1, x1, #GEMM32_COL_BYTES

        sub     x4, x4, #1
        cbnz    x4, .Lgemm32_m_loop

        // Set up the next n block.
        // A: undo the 2 * 16 rows of m progress, back to row 0.
        sub     x0, x0, #GEMM32_COL_BYTES
        // B: advance by 4 columns.
        add     x1, x1, #4*GEMM32_COL_BYTES
        // C: x2 already sits 32 rows (= one column) past the tile origin,
        // so 3 more columns give a total advance of 4 columns.
        add     x2, x2, #3*GEMM32_COL_BYTES

        sub     x5, x5, #1
        cbnz    x5, .Lgemm32_n_loop

        // Nothing was saved on entry, so there is nothing to restore.
        ret
        .size gemm_asm_asimd_32_32_32, (. - gemm_asm_asimd_32_32_32)

        .purgem gemm32_rank1_update
AARCH64, 16 x 12 x 4 Matrix-Multiplication
Running Code:
Source Code: