AArch64: 19 x 4 x 4 Matrix Multiplication
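
The kernel computes C += A * B for single-precision, column-major matrices with M = 19, N = 4, K = 4; A and C have a leading dimension of 19, B a leading dimension of 4. As a sketch of the assumed C-level contract (the prototype and the reference loop below are inferred from the register usage, not taken from the original build files):

	#include <stddef.h>

	// assumed prototype of the assembly kernel:
	//   a: 19x4 (x0), b: 4x4 (x1), c: 19x4 (x2); all column-major floats
	void gemm_asm_asimd_19_4_4(const float *a, const float *b, float *c);

	// plain C reference of what the kernel computes (C += A * B)
	void gemm_ref_19_4_4(const float *a, const float *b, float *c) {
		for (size_t n = 0; n < 4; n++)
			for (size_t k = 0; k < 4; k++)
				for (size_t m = 0; m < 19; m++)
					c[m + n * 19] += a[m + k * 19] * b[k + n * 4];
	}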

Source Code:


	.text
	.align 4
	.type   gemm_asm_asimd_19_4_4, %function
	.global gemm_asm_asimd_19_4_4
gemm_asm_asimd_19_4_4:
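
	// arguments (AAPCS64): x0 = A (19x4), x1 = B (4x4), x2 = C (19x4); single-precision, column-major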

	// prologue: save callee-saved registers
	// we do not use x19-x30, so they need not be saved
	// also, the registers v8-v15 were remapped to v24-v31, so no vector registers need to be saved either


	// load C: rows 0-15 of each of the four columns (rows 16-18 are handled at the end)
	ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x2]
	// use a scratch register so that x2 does not have to be rewound afterwards (saves one instruction)
	add x5,x2,#19*4
	ld1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x5]
	add x5,x5,#19*4
	ld1 { v24.4s, v25.4s, v26.4s, v27.4s }, [x5]
	add x5,x5,#19*4
	ld1 { v28.4s, v29.4s, v30.4s, v31.4s }, [x5]
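
	// register layout for C (rows 0-15): column 0 -> v0-v3, column 1 -> v4-v7,
	// column 2 -> v24-v27, column 3 -> v28-v31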

	// load B: one 4-element column per register
	ld1 { v16.4s }, [x1]
	add x1, x1, #4*4
	ld1 { v17.4s }, [x1]
	add x1, x1, #4*4
	ld1 { v18.4s }, [x1]
	add x1, x1, #4*4
	ld1 { v19.4s }, [x1]
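
	// v16-v19 now hold the four columns of B, i.e. v(16+j).s[k] = B[k][j]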

	// load A: column 0 (rows 0-15), used for the k = 0 update of all four columns of C
	ld1 { v20.4s, v21.4s, v22.4s, v23.4s }, [x0]
	add x0,x0,#19*4

	// compute: k = 0 rank-1 update, column j of C += column 0 of A * B[0][j]
	fmla  v0.4s, v20.4s, v16.s[0]
	fmla  v1.4s, v21.4s, v16.s[0]
	fmla  v2.4s, v22.4s, v16.s[0]
	fmla  v3.4s, v23.4s, v16.s[0]

	fmla  v4.4s, v20.4s, v17.s[0]
	fmla  v5.4s, v21.4s, v17.s[0]
	fmla  v6.4s, v22.4s, v17.s[0]
	fmla  v7.4s, v23.4s, v17.s[0]

	fmla v24.4s, v20.4s, v18.s[0]
	fmla v25.4s, v21.4s, v18.s[0]
	fmla v26.4s, v22.4s, v18.s[0]
	fmla v27.4s, v23.4s, v18.s[0]

	fmla v28.4s, v20.4s, v19.s[0]
	fmla v29.4s, v21.4s, v19.s[0]
	fmla v30.4s, v22.4s, v19.s[0]
	fmla v31.4s, v23.4s, v19.s[0]


	// k = 1: column 1 of A, element 1 of each B column
	ld1 { v20.4s, v21.4s, v22.4s, v23.4s }, [x0]
	add x0,x0,#19*4

	fmla  v0.4s, v20.4s, v16.s[1]
	fmla  v1.4s, v21.4s, v16.s[1]
	fmla  v2.4s, v22.4s, v16.s[1]
	fmla  v3.4s, v23.4s, v16.s[1]

	fmla  v4.4s, v20.4s, v17.s[1]
	fmla  v5.4s, v21.4s, v17.s[1]
	fmla  v6.4s, v22.4s, v17.s[1]
	fmla  v7.4s, v23.4s, v17.s[1]

	fmla v24.4s, v20.4s, v18.s[1]
	fmla v25.4s, v21.4s, v18.s[1]
	fmla v26.4s, v22.4s, v18.s[1]
	fmla v27.4s, v23.4s, v18.s[1]

	fmla v28.4s, v20.4s, v19.s[1]
	fmla v29.4s, v21.4s, v19.s[1]
	fmla v30.4s, v22.4s, v19.s[1]
	fmla v31.4s, v23.4s, v19.s[1]


	// k = 2
	ld1 { v20.4s, v21.4s, v22.4s, v23.4s }, [x0]
	add x0,x0,#19*4

	fmla  v0.4s, v20.4s, v16.s[2]
	fmla  v1.4s, v21.4s, v16.s[2]
	fmla  v2.4s, v22.4s, v16.s[2]
	fmla  v3.4s, v23.4s, v16.s[2]

	fmla  v4.4s, v20.4s, v17.s[2]
	fmla  v5.4s, v21.4s, v17.s[2]
	fmla  v6.4s, v22.4s, v17.s[2]
	fmla  v7.4s, v23.4s, v17.s[2]

	fmla v24.4s, v20.4s, v18.s[2]
	fmla v25.4s, v21.4s, v18.s[2]
	fmla v26.4s, v22.4s, v18.s[2]
	fmla v27.4s, v23.4s, v18.s[2]

	fmla v28.4s, v20.4s, v19.s[2]
	fmla v29.4s, v21.4s, v19.s[2]
	fmla v30.4s, v22.4s, v19.s[2]
	fmla v31.4s, v23.4s, v19.s[2]


	// k = 3
	ld1 { v20.4s, v21.4s, v22.4s, v23.4s }, [x0]
	add x0,x0,#19*4

	fmla  v0.4s, v20.4s, v16.s[3]
	fmla  v1.4s, v21.4s, v16.s[3]
	fmla  v2.4s, v22.4s, v16.s[3]
	fmla  v3.4s, v23.4s, v16.s[3]

	fmla  v4.4s, v20.4s, v17.s[3]
	fmla  v5.4s, v21.4s, v17.s[3]
	fmla  v6.4s, v22.4s, v17.s[3]
	fmla  v7.4s, v23.4s, v17.s[3]

	fmla v24.4s, v20.4s, v18.s[3]
	fmla v25.4s, v21.4s, v18.s[3]
	fmla v26.4s, v22.4s, v18.s[3]
	fmla v27.4s, v23.4s, v18.s[3]

	fmla v28.4s, v20.4s, v19.s[3]
	fmla v29.4s, v21.4s, v19.s[3]
	fmla v30.4s, v22.4s, v19.s[3]
	fmla v31.4s, v23.4s, v19.s[3]
	
	// store C (rows 0-15 of each column)
	st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x2]
	add x2,x2,#19*4
	st1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x2]
	add x2,x2,#19*4
	st1 { v24.4s, v25.4s, v26.4s, v27.4s }, [x2]
	add x2,x2,#19*4
	st1 { v28.4s, v29.4s, v30.4s, v31.4s }, [x2]
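
	// v0-v7 and v24-v31 no longer hold live C values after these stores and can be reused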
	
	
	// compute the last values: rows 16-18 of each column
	// load the remainder of C: per column, one d-register (rows 16-17) and one s-register (row 18)
	sub x2,x2,#(19*3-16)*4 // rewind from the start of column 3 (57 floats past C) to row 16 of column 0
	ldr d0, [x2], #2*4 // d0 = v0.2s: rows 16-17 of column 0
	ldr s1, [x2], #17*4 // row 18; the 17-float post-increment jumps to row 16 of the next column
	ldr d2, [x2], #2*4
	ldr s3, [x2], #17*4
	ldr d4, [x2], #2*4
	ldr s5, [x2], #17*4
	ldr d6, [x2], #2*4
	ldr s7, [x2], #17*4
		
	// load the remainder of A (rows 16-18 of each column), same d/s pattern as for C
	sub x0,x0,#(19*4-16)*4 // rewind from the end of A (76 floats) to row 16 of column 0
	ldr d20, [x0], #2*4
	ldr s21, [x0], #17*4
	ldr d22, [x0], #2*4
	ldr s23, [x0], #17*4
	ldr d24, [x0], #2*4
	ldr s25, [x0], #17*4
	ldr d26, [x0], #2*4
	ldr s27, [x0], #17*4
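	// (v24-v27 previously held column 2 of C, which has already been stored back)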
	
	// compute the remainder: vector FMLA covers rows 16-17, scalar by-element FMLA
	// (e.g. fmla s1, s21, v16.s[0] computes s1 += s21 * B[0][0]) covers row 18
	// part 1: k = 0
	fmla v0.2s, v20.2s, v16.s[0]
	fmla s1,    s21,    v16.s[0]
	
	fmla v2.2s, v20.2s, v17.s[0]
	fmla s3,    s21,    v17.s[0]
	
	fmla v4.2s, v20.2s, v18.s[0]
	fmla s5,    s21,    v18.s[0]
	
	fmla v6.2s, v20.2s, v19.s[0]
	fmla s7,    s21,    v19.s[0]
	
	
	// part 2: k = 1
	fmla v0.2s, v22.2s, v16.s[1]
	fmla s1,    s23,    v16.s[1]
	
	fmla v2.2s, v22.2s, v17.s[1]
	fmla s3,    s23,    v17.s[1]
	
	fmla v4.2s, v22.2s, v18.s[1]
	fmla s5,    s23,    v18.s[1]
	
	fmla v6.2s, v22.2s, v19.s[1]
	fmla s7,    s23,    v19.s[1]
	
	
	// part 3: k = 2
	fmla v0.2s, v24.2s, v16.s[2]
	fmla s1,    s25,    v16.s[2]
	
	fmla v2.2s, v24.2s, v17.s[2]
	fmla s3,    s25,    v17.s[2]
	
	fmla v4.2s, v24.2s, v18.s[2]
	fmla s5,    s25,    v18.s[2]
	
	fmla v6.2s, v24.2s, v19.s[2]
	fmla s7,    s25,    v19.s[2]
	
	
	// part 4: k = 3
	fmla v0.2s, v26.2s, v16.s[3]
	fmla s1,    s27,    v16.s[3]
	
	fmla v2.2s, v26.2s, v17.s[3]
	fmla s3,    s27,    v17.s[3]
	
	fmla v4.2s, v26.2s, v18.s[3]
	fmla s5,    s27,    v18.s[3]
	
	fmla v6.2s, v26.2s, v19.s[3]
	fmla s7,    s27,    v19.s[3]
	
	
	// store the remainder of C
	sub x2,x2,#(19*4)*4 // rewind to row 16 of column 0; could be avoided by using pre-indexed instead of post-indexed addressing (but be careful: at some point the offset gets too large)
	str d0, [x2], #2*4
	str s1, [x2], #17*4
	str d2, [x2], #2*4
	str s3, [x2], #17*4
	str d4, [x2], #2*4
	str s5, [x2], #17*4
	str d6, [x2], #2*4
	str s7, [x2]
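	// no post-increment on the last store: x2 is not needed afterwards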
	
	// epilogue: nothing was saved, so nothing has to be restored

	ret
	.size gemm_asm_asimd_19_4_4, (. - gemm_asm_asimd_19_4_4)
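
For completeness, a minimal test driver (hypothetical; the initialization values are made up) that validates the kernel against the C reference sketched above:

	#include <math.h>
	#include <stdio.h>

	// defined in the assembly listing and in the C sketch above, respectively
	void gemm_asm_asimd_19_4_4(const float *a, const float *b, float *c);
	void gemm_ref_19_4_4(const float *a, const float *b, float *c);

	int main(void) {
		float a[19 * 4], b[4 * 4], c_asm[19 * 4], c_ref[19 * 4];
		for (int i = 0; i < 19 * 4; i++) a[i] = (float)(i % 7) - 3.0f;
		for (int i = 0; i < 4 * 4; i++)  b[i] = (float)(i % 5) - 2.0f;
		for (int i = 0; i < 19 * 4; i++) c_asm[i] = c_ref[i] = (float)(i % 3);

		gemm_asm_asimd_19_4_4(a, b, c_asm);
		gemm_ref_19_4_4(a, b, c_ref);

		// compare element-wise; values are small integers, so a tight tolerance suffices
		for (int i = 0; i < 19 * 4; i++) {
			if (fabsf(c_asm[i] - c_ref[i]) > 1e-5f) {
				printf("mismatch at %d: %f vs %f\n", i, c_asm[i], c_ref[i]);
				return 1;
			}
		}
		printf("ok\n");
		return 0;
	}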