AArch64, 16 x 12 x 4 Matrix Multiplication

Running Code:
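
The kernel computes C += A * B in single precision for column-major operands, with A 16x12, B 12x4 and C 16x4; the shapes follow from the address arithmetic in the source below. A minimal driver sketch for calling and checking it (the C prototype is an assumption based on the argument registers x0-x2, and the integer-valued test data keeps the comparison exact despite the fused multiply-adds):

	#include <stdio.h>

	// assumed prototype: x0 = A, x1 = B, x2 = C (see the register use in the kernel)
	void gemm_asm_asimd_16_12_4(float const *a, float const *b, float *c);

	int main(void) {
		float a[16 * 12], b[12 * 4], c[16 * 4], c_ref[16 * 4];

		// small integer-valued test data
		for (int i = 0; i < 16 * 12; i++) a[i] = (float)(i % 7) - 3.0f;
		for (int i = 0; i < 12 * 4; i++)  b[i] = (float)(i % 5) - 2.0f;
		for (int i = 0; i < 16 * 4; i++)  c[i] = c_ref[i] = 1.0f;

		gemm_asm_asimd_16_12_4(a, b, c);

		// scalar reference: C += A * B, all matrices column-major
		for (int n = 0; n < 4; n++)
			for (int k = 0; k < 12; k++)
				for (int m = 0; m < 16; m++)
					c_ref[m + 16 * n] += a[m + 16 * k] * b[k + 12 * n];

		for (int i = 0; i < 16 * 4; i++)
			if (c[i] != c_ref[i]) { printf("mismatch at %d\n", i); return 1; }
		printf("ok\n");
		return 0;
	}

Built on an AArch64 machine e.g. with gcc driver.c gemm_asm_asimd_16_12_4.s -o driver (file names are illustrative).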


Source Code:


	
	.text
	.align 4
	.type   gemm_asm_asimd_16_12_4, %function
	.global gemm_asm_asimd_16_12_4
gemm_asm_asimd_16_12_4:
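	// C += A * B in single precision; all operands column-major
	// (shapes inferred from the address arithmetic below):
	//   x0 = A, 16x12, lda = 16
	//   x1 = B, 12x4,  ldb = 12
	//   x2 = C, 16x4,  ldc = 16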

	// save
	// we do not use x19-x30, so there is nothing to save here
	// in addition, v8-v15 were remapped to v24-v31, so the callee-saved vector registers do not need saving either


	// load C in full (16x4 = 64 floats)
	ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x2]
	// use a fresh register so that x2 does not have to be reset for the store at the end (saves an instruction)
	add x5,x2,#16*4
	ld1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x5]
	add x5,x5,#16*4
	ld1 { v24.4s, v25.4s, v26.4s, v27.4s }, [x5]
	add x5,x5,#16*4
	ld1 { v28.4s, v29.4s, v30.4s, v31.4s }, [x5]
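	// accumulators: v0-v3 = C[:,0], v4-v7 = C[:,1], v24-v27 = C[:,2], v28-v31 = C[:,3]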


	// 3 iterations over the k dimension (4 columns of A / 4 rows of B each)
	mov x3, #3

schleife:

	// load B: rows k..k+3 of all 4 columns (one column of B is 12 floats)
	ld1 { v16.4s }, [x1]
	add x1, x1, #12*4
	ld1 { v17.4s }, [x1]
	add x1, x1, #12*4
	ld1 { v18.4s }, [x1]
	add x1, x1, #12*4
	ld1 { v19.4s }, [x1]
	// rewind the three column strides, then advance to the next 4-row block
	sub x1, x1, #((3*12-4)*4)

	// load A: first column of this block, feeding the first update of all four C columns
	ld1 { v20.4s, v21.4s, v22.4s, v23.4s }, [x0]
	add x0,x0,#64

	// compute: rank-1 update C[:, j] += A[:, k] * B[k, j] for all four columns j,
	// broadcasting one B element per fmla via the lane selector
	fmla  v0.4s, v20.4s, v16.s[0]
	fmla  v1.4s, v21.4s, v16.s[0]
	fmla  v2.4s, v22.4s, v16.s[0]
	fmla  v3.4s, v23.4s, v16.s[0]

	fmla  v4.4s, v20.4s, v17.s[0]
	fmla  v5.4s, v21.4s, v17.s[0]
	fmla  v6.4s, v22.4s, v17.s[0]
	fmla  v7.4s, v23.4s, v17.s[0]

	fmla v24.4s, v20.4s, v18.s[0]
	fmla v25.4s, v21.4s, v18.s[0]
	fmla v26.4s, v22.4s, v18.s[0]
	fmla v27.4s, v23.4s, v18.s[0]

	fmla v28.4s, v20.4s, v19.s[0]
	fmla v29.4s, v21.4s, v19.s[0]
	fmla v30.4s, v22.4s, v19.s[0]
	fmla v31.4s, v23.4s, v19.s[0]


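	// second column of the A block; B contributions now come from lane [1]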
	ld1 { v20.4s, v21.4s, v22.4s, v23.4s }, [x0]
	add x0,x0,#64

	fmla  v0.4s, v20.4s, v16.s[1]
	fmla  v1.4s, v21.4s, v16.s[1]
	fmla  v2.4s, v22.4s, v16.s[1]
	fmla  v3.4s, v23.4s, v16.s[1]

	fmla  v4.4s, v20.4s, v17.s[1]
	fmla  v5.4s, v21.4s, v17.s[1]
	fmla  v6.4s, v22.4s, v17.s[1]
	fmla  v7.4s, v23.4s, v17.s[1]

	fmla v24.4s, v20.4s, v18.s[1]
	fmla v25.4s, v21.4s, v18.s[1]
	fmla v26.4s, v22.4s, v18.s[1]
	fmla v27.4s, v23.4s, v18.s[1]

	fmla v28.4s, v20.4s, v19.s[1]
	fmla v29.4s, v21.4s, v19.s[1]
	fmla v30.4s, v22.4s, v19.s[1]
	fmla v31.4s, v23.4s, v19.s[1]


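	// third column; lane [2]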
	ld1 { v20.4s, v21.4s, v22.4s, v23.4s }, [x0]
	add x0,x0,#64

	fmla  v0.4s, v20.4s, v16.s[2]
	fmla  v1.4s, v21.4s, v16.s[2]
	fmla  v2.4s, v22.4s, v16.s[2]
	fmla  v3.4s, v23.4s, v16.s[2]

	fmla  v4.4s, v20.4s, v17.s[2]
	fmla  v5.4s, v21.4s, v17.s[2]
	fmla  v6.4s, v22.4s, v17.s[2]
	fmla  v7.4s, v23.4s, v17.s[2]

	fmla v24.4s, v20.4s, v18.s[2]
	fmla v25.4s, v21.4s, v18.s[2]
	fmla v26.4s, v22.4s, v18.s[2]
	fmla v27.4s, v23.4s, v18.s[2]

	fmla v28.4s, v20.4s, v19.s[2]
	fmla v29.4s, v21.4s, v19.s[2]
	fmla v30.4s, v22.4s, v19.s[2]
	fmla v31.4s, v23.4s, v19.s[2]


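	// fourth column; lane [3]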
	ld1 { v20.4s, v21.4s, v22.4s, v23.4s }, [x0]
	add x0,x0,#64

	fmla  v0.4s, v20.4s, v16.s[3]
	fmla  v1.4s, v21.4s, v16.s[3]
	fmla  v2.4s, v22.4s, v16.s[3]
	fmla  v3.4s, v23.4s, v16.s[3]

	fmla  v4.4s, v20.4s, v17.s[3]
	fmla  v5.4s, v21.4s, v17.s[3]
	fmla  v6.4s, v22.4s, v17.s[3]
	fmla  v7.4s, v23.4s, v17.s[3]

	fmla v24.4s, v20.4s, v18.s[3]
	fmla v25.4s, v21.4s, v18.s[3]
	fmla v26.4s, v22.4s, v18.s[3]
	fmla v27.4s, v23.4s, v18.s[3]

	fmla v28.4s, v20.4s, v19.s[3]
	fmla v29.4s, v21.4s, v19.s[3]
	fmla v30.4s, v22.4s, v19.s[3]
	fmla v31.4s, v23.4s, v19.s[3]


	
	// x3--
	sub x3,x3,#1
	// repeat if x3 != 0
	cbnz x3, schleife
	
	// store C
	st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x2]
	add x2,x2,#64
	st1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x2]
	add x2,x2,#64
	st1 { v24.4s, v25.4s, v26.4s, v27.4s }, [x2]
	add x2,x2,#64
	st1 { v28.4s, v29.4s, v30.4s, v31.4s }, [x2]
	
	// restore: we saved nothing, so there is nothing to restore
	ret
	
.size gemm_asm_asimd_16_12_4, (. - gemm_asm_asimd_16_12_4)
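
A quick operation count: each loop iteration issues 4 * 16 = 64 fmla instructions, 192 over the 3 iterations; at 4 lanes per instruction that is 768 fused multiply-adds, i.e. 2 * 16 * 4 * 12 = 1536 floating-point operations per call, against 768 bytes of A, 192 bytes of B, and 256 bytes of C loaded and stored once each.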