1

32,32,32

// -----------------------------------------------------------------------------
// void gemm_asm_asimd_32_32_32(float const *a, float const *b, float *c);
//
// Single-precision C += A * B for 32 x 32 matrices using AArch64 Advanced SIMD.
// Layout implied by the addressing below: the elements of one column are
// contiguous, and consecutive columns of A, B and C are 32 floats (128 bytes)
// apart.
//
// Syntax: GNU as, AArch64.
// ABI:    AAPCS64, leaf function, no stack usage.
// In:     x0 = A (only read)
//         x1 = B (only read)
//         x2 = C (read, accumulated into, written back)
// Out:    none; C is updated in place.
// Clobb:  x0-x6, v0-v7, v16-v31.
//         x19-x30 and v8-v15 are never touched, so nothing has to be saved or
//         restored. No flag-setting instruction is used.
//
// Blocking: C is processed in tiles of 16 rows x 4 columns that stay in
// registers for the whole k loop.
//   n loop: 8 iterations, 4 columns of C/B each
//   m loop: 2 iterations, 16 rows of C/A each
//   k loop: 8 iterations, 4 values of k each (unrolled 4x)
//
// Register roles inside the loops:
//   x3 / x4 / x5   remaining k / m / n iterations
//   x6             scratch pointer used while loading the C tile
//   v0-v3          C tile, column 0 (16 rows)
//   v4-v7          C tile, column 1
//   v24-v27        C tile, column 2
//   v28-v31        C tile, column 3
//   v16-v19        B[k..k+3] for tile columns 0..3
//   v20-v23        16 rows of the current column of A
// -----------------------------------------------------------------------------

        .equ    GEMM32_ELEM_BYTES, 4            // sizeof(float)
        .equ    GEMM32_DIM,        32           // M = N = K = column stride in elements
        .equ    GEMM32_COL_BYTES,  GEMM32_DIM * GEMM32_ELEM_BYTES
        .equ    GEMM32_TILE_M,     16           // rows per register tile
        .equ    GEMM32_TILE_N,     4            // columns per register tile
        .equ    GEMM32_TILE_K,     4            // k values consumed per k-loop iteration
        .equ    GEMM32_M_BLOCKS,   2            // GEMM32_DIM / GEMM32_TILE_M
        .equ    GEMM32_N_BLOCKS,   8            // GEMM32_DIM / GEMM32_TILE_N
        .equ    GEMM32_K_BLOCKS,   8            // GEMM32_DIM / GEMM32_TILE_K

        // One rank-1 update of the 16x4 C tile:
        //   load 16 rows of the current A column, advance x0 to the next A
        //   column, then C(:, j) += A(:, k) * B(k, j) for the 4 tile columns,
        //   where B(k, j) is lane \lane of v16 + j.
        .macro  gemm32_rank1_step lane
        ld1     {v20.4s, v21.4s, v22.4s, v23.4s}, [x0]
        add     x0, x0, #GEMM32_COL_BYTES

        fmla    v0.4s,  v20.4s, v16.s[\lane]
        fmla    v1.4s,  v21.4s, v16.s[\lane]
        fmla    v2.4s,  v22.4s, v16.s[\lane]
        fmla    v3.4s,  v23.4s, v16.s[\lane]

        fmla    v4.4s,  v20.4s, v17.s[\lane]
        fmla    v5.4s,  v21.4s, v17.s[\lane]
        fmla    v6.4s,  v22.4s, v17.s[\lane]
        fmla    v7.4s,  v23.4s, v17.s[\lane]

        fmla    v24.4s, v20.4s, v18.s[\lane]
        fmla    v25.4s, v21.4s, v18.s[\lane]
        fmla    v26.4s, v22.4s, v18.s[\lane]
        fmla    v27.4s, v23.4s, v18.s[\lane]

        fmla    v28.4s, v20.4s, v19.s[\lane]
        fmla    v29.4s, v21.4s, v19.s[\lane]
        fmla    v30.4s, v22.4s, v19.s[\lane]
        fmla    v31.4s, v23.4s, v19.s[\lane]
        .endm

        .text
        .p2align 4
        .global gemm_asm_asimd_32_32_32
        .type   gemm_asm_asimd_32_32_32, %function
gemm_asm_asimd_32_32_32:
        mov     x5, #GEMM32_N_BLOCKS

.Lgemm32_n_loop:
        mov     x4, #GEMM32_M_BLOCKS

.Lgemm32_m_loop:
        // Load the 16x4 C tile. x6 walks over the columns so that x2 keeps
        // pointing at the tile origin for the store after the k loop.
        ld1     {v0.4s, v1.4s, v2.4s, v3.4s}, [x2]
        add     x6, x2, #GEMM32_COL_BYTES
        ld1     {v4.4s, v5.4s, v6.4s, v7.4s}, [x6]
        add     x6, x6, #GEMM32_COL_BYTES
        ld1     {v24.4s, v25.4s, v26.4s, v27.4s}, [x6]
        add     x6, x6, #GEMM32_COL_BYTES
        ld1     {v28.4s, v29.4s, v30.4s, v31.4s}, [x6]

        mov     x3, #GEMM32_K_BLOCKS

.Lgemm32_k_loop:
        // Load B[k..k+3] for each of the 4 tile columns.
        ld1     {v16.4s}, [x1]
        add     x1, x1, #GEMM32_COL_BYTES
        ld1     {v17.4s}, [x1]
        add     x1, x1, #GEMM32_COL_BYTES
        ld1     {v18.4s}, [x1]
        add     x1, x1, #GEMM32_COL_BYTES
        ld1     {v19.4s}, [x1]
        // Back to the first tile column of B, 4 rows (k values) further down.
        sub     x1, x1, #(3 * GEMM32_COL_BYTES - GEMM32_TILE_K * GEMM32_ELEM_BYTES)

        // Four rank-1 updates, one per k value just loaded.
        gemm32_rank1_step 0
        gemm32_rank1_step 1
        gemm32_rank1_step 2
        gemm32_rank1_step 3

        sub     x3, x3, #1
        cbnz    x3, .Lgemm32_k_loop

        // Write the finished C tile back, column by column.
        st1     {v0.4s, v1.4s, v2.4s, v3.4s}, [x2]
        add     x2, x2, #GEMM32_COL_BYTES
        st1     {v4.4s, v5.4s, v6.4s, v7.4s}, [x2]
        add     x2, x2, #GEMM32_COL_BYTES
        st1     {v24.4s, v25.4s, v26.4s, v27.4s}, [x2]
        add     x2, x2, #GEMM32_COL_BYTES
        st1     {v28.4s, v29.4s, v30.4s, v31.4s}, [x2]

        // C: back to the first tile column, 16 rows further down.
        sub     x2, x2, #(3 * GEMM32_COL_BYTES - GEMM32_TILE_M * GEMM32_ELEM_BYTES)
        // A: the k loop advanced x0 by 32 columns; rewind them and move
        // 16 rows further down.
        sub     x0, x0, #(GEMM32_DIM * GEMM32_COL_BYTES - GEMM32_TILE_M * GEMM32_ELEM_BYTES)
        // B: the k loop advanced x1 by 32 rows; rewind to row 0 of the same
        // columns.
        sub     x1, x1, #(GEMM32_DIM * GEMM32_ELEM_BYTES)

        sub     x4, x4, #1
        cbnz    x4, .Lgemm32_m_loop

        // A: both row tiles are done, rewind the 32 rows to the start of A.
        sub     x0, x0, #(GEMM32_DIM * GEMM32_ELEM_BYTES)
        // B: advance to the next group of 4 columns.
        add     x1, x1, #(GEMM32_TILE_N * GEMM32_COL_BYTES)
        // C: the two row tiles already moved x2 one full column ahead, so
        // only 3 more columns are needed to reach the next group of 4.
        add     x2, x2, #(3 * GEMM32_COL_BYTES)

        sub     x5, x5, #1
        cbnz    x5, .Lgemm32_n_loop

        // Nothing was saved, so there is nothing to restore.
        ret
        .size   gemm_asm_asimd_32_32_32, (. - gemm_asm_asimd_32_32_32)

        // Keep the helper macro private to this kernel.
        .purgem gemm32_rank1_step

AArch64, 32 x 32 x 32 matrix multiplication

Running Code:

Source Code: