AArch64, 32 x 32 x 32 Matrix Multiplication

Running Code:


Source Code:


	
	.text
	.p2align 4
	.type   gemm_asm_asimd_32_32_32, %function
	.global gemm_asm_asimd_32_32_32

	//------------------------------------------------------------------
	// gemm_asm_asimd_32_32_32
	//
	// Single-precision 32 x 32 x 32 matrix multiply-accumulate, C += A * B,
	// written for GNU as, AArch64 with ASIMD.
	//
	// Presumed C prototype (TODO confirm against the caller):
	//   void gemm_asm_asimd_32_32_32(const float *a, const float *b, float *c);
	//
	// In:    x0 = A, x1 = B, x2 = C. All three are addressed with a fixed
	//        stride of 32 floats (128 bytes) from one column to the next.
	// Out:   C is updated in memory. On return x0 holds its entry value,
	//        while x1 and x2 have each been advanced by 32 columns.
	// Clobb: x1-x6, v0-v7, v16-v31.
	// Stack: none (leaf routine). x19-x30 are never touched, and the
	//        accumulators that would naturally sit in v8-v15 live in
	//        v24-v31 instead, so nothing has to be saved or restored.
	//
	// Blocking: each pass of the m-loop keeps a 16-row x 4-column tile of C
	// in v0-v7 / v24-v31 and sweeps all 32 values of k over it, 4 per pass
	// of the k-loop.
	//------------------------------------------------------------------

	.equ    GEMM32_DIM,        32                       // m = n = k = column stride in elements
	.equ    GEMM32_F32_BYTES,  4                        // sizeof(float)
	.equ    GEMM32_COL_BYTES,  GEMM32_DIM * GEMM32_F32_BYTES    // 128 bytes from column to column
	.equ    GEMM32_TILE_ROWS,  16                       // rows of C held in registers
	.equ    GEMM32_TILE_COLS,  4                        // columns of C held in registers
	.equ    GEMM32_K_STEP,     4                        // k values consumed per k-loop pass
	.equ    GEMM32_N_PASSES,   8                        // 32 columns / 4 per tile
	.equ    GEMM32_M_PASSES,   2                        // 32 rows    / 16 per tile
	.equ    GEMM32_K_PASSES,   8                        // 32 k       / 4 per pass

	// One k-step on the register tile: load 16 rows of the current A column,
	// step x0 to the next A column, then add that column scaled by lane
	// \lane of v16..v19 (one B value per C column) onto the 16 accumulators.
	.macro  gemm32_rank1_update lane
	ld1     { v20.4s, v21.4s, v22.4s, v23.4s }, [x0]
	add     x0, x0, #GEMM32_COL_BYTES

	fmla    v0.4s,  v20.4s, v16.s[\lane]            // C column 0
	fmla    v1.4s,  v21.4s, v16.s[\lane]
	fmla    v2.4s,  v22.4s, v16.s[\lane]
	fmla    v3.4s,  v23.4s, v16.s[\lane]

	fmla    v4.4s,  v20.4s, v17.s[\lane]            // C column 1
	fmla    v5.4s,  v21.4s, v17.s[\lane]
	fmla    v6.4s,  v22.4s, v17.s[\lane]
	fmla    v7.4s,  v23.4s, v17.s[\lane]

	fmla    v24.4s, v20.4s, v18.s[\lane]            // C column 2
	fmla    v25.4s, v21.4s, v18.s[\lane]
	fmla    v26.4s, v22.4s, v18.s[\lane]
	fmla    v27.4s, v23.4s, v18.s[\lane]

	fmla    v28.4s, v20.4s, v19.s[\lane]            // C column 3
	fmla    v29.4s, v21.4s, v19.s[\lane]
	fmla    v30.4s, v22.4s, v19.s[\lane]
	fmla    v31.4s, v23.4s, v19.s[\lane]
	.endm

gemm_asm_asimd_32_32_32:

	// x5 = column tiles of C still to do
	mov     x5, #GEMM32_N_PASSES
.Lgemm32_n_loop:

	// x4 = row tiles still to do inside the current column tile
	mov     x4, #GEMM32_M_PASSES
.Lgemm32_m_loop:

	// Load the 16 x 4 tile of C. The columns after the first are walked
	// through x6 so that x2 keeps pointing at the top of the tile.
	ld1     { v0.4s, v1.4s, v2.4s, v3.4s }, [x2]
	add     x6, x2, #GEMM32_COL_BYTES
	ld1     { v4.4s, v5.4s, v6.4s, v7.4s }, [x6]
	add     x6, x6, #GEMM32_COL_BYTES
	ld1     { v24.4s, v25.4s, v26.4s, v27.4s }, [x6]
	add     x6, x6, #GEMM32_COL_BYTES
	ld1     { v28.4s, v29.4s, v30.4s, v31.4s }, [x6]

	// x3 = k passes still to do
	mov     x3, #GEMM32_K_PASSES
.Lgemm32_k_loop:

	// Load 4 consecutive k entries from each of the 4 current B columns.
	ld1     { v16.4s }, [x1]
	add     x1, x1, #GEMM32_COL_BYTES
	ld1     { v17.4s }, [x1]
	add     x1, x1, #GEMM32_COL_BYTES
	ld1     { v18.4s }, [x1]
	add     x1, x1, #GEMM32_COL_BYTES
	ld1     { v19.4s }, [x1]
	// Go back to the first of the 4 columns, but 4 elements further down,
	// ready for the next k pass.
	sub     x1, x1, #((GEMM32_TILE_COLS - 1) * GEMM32_COL_BYTES - GEMM32_K_STEP * GEMM32_F32_BYTES)

	// Four k-steps, one per lane of the B vectors just loaded.
	gemm32_rank1_update 0
	gemm32_rank1_update 1
	gemm32_rank1_update 2
	gemm32_rank1_update 3

	sub     x3, x3, #1
	cbnz    x3, .Lgemm32_k_loop

	// Write the finished tile back; this time x2 itself walks the columns.
	st1     { v0.4s, v1.4s, v2.4s, v3.4s }, [x2]
	add     x2, x2, #GEMM32_COL_BYTES
	st1     { v4.4s, v5.4s, v6.4s, v7.4s }, [x2]
	add     x2, x2, #GEMM32_COL_BYTES
	st1     { v24.4s, v25.4s, v26.4s, v27.4s }, [x2]
	add     x2, x2, #GEMM32_COL_BYTES
	st1     { v28.4s, v29.4s, v30.4s, v31.4s }, [x2]

	// C: back to the first column of the tile, 16 rows further down.
	sub     x2, x2, #((GEMM32_TILE_COLS - 1) * GEMM32_COL_BYTES - GEMM32_TILE_ROWS * GEMM32_F32_BYTES)

	// A: the k-loop walked all 32 columns; rewind them, 16 rows further down.
	sub     x0, x0, #(GEMM32_DIM * GEMM32_COL_BYTES - GEMM32_TILE_ROWS * GEMM32_F32_BYTES)

	// B: the k-loop moved 32 elements down the column; back to its top.
	sub     x1, x1, #(GEMM32_DIM * GEMM32_F32_BYTES)

	sub     x4, x4, #1
	cbnz    x4, .Lgemm32_m_loop

	// A: the two row tiles moved x0 down by 32 elements; back to the start of A.
	sub     x0, x0, #(GEMM32_DIM * GEMM32_F32_BYTES)

	// B: on to the next 4 columns.
	add     x1, x1, #(GEMM32_TILE_COLS * GEMM32_COL_BYTES)

	// C: the two row tiles already moved x2 by one full column, so 3 more
	// columns reach the next column tile.
	add     x2, x2, #((GEMM32_TILE_COLS - 1) * GEMM32_COL_BYTES)

	sub     x5, x5, #1
	cbnz    x5, .Lgemm32_n_loop

	// Nothing was saved on entry, so there is nothing to restore.
	ret

.size gemm_asm_asimd_32_32_32, (. - gemm_asm_asimd_32_32_32)

	// Keep the helper macro from leaking into code assembled after this routine.
	.purgem gemm32_rank1_update