Blame thirdparty/openblas/xianyi-OpenBLAS-e6e87a2/kernel/mips64/trsm_kernel_RN_loongson3a.S

kusano 2b45e8
#define REALNAME ASMNAME
kusano 2b45e8
kusano 2b45e8
#define ASSEMBLER
kusano 2b45e8
#include "common.h"
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
/*
 * Register aliases for the kernel below.
 *
 * M, N, K: dimension arguments. M is shifted/masked into the row-tile
 * counter (M >> 2, M & 2, M & 1), N into the column-panel counter
 * (N >> 2, N & 2), and K is the depth that KK is subtracted from when the
 * panel pointers are advanced.
 */
#define M	$4
kusano 2b45e8
#define	N	$5
kusano 2b45e8
#define	K	$6
kusano 2b45e8
/* A and B: base pointers of the two packed input panels; AO and BO are
   reset from them at the start of each panel/tile. */
#define A	$8
kusano 2b45e8
#define B	$9
kusano 2b45e8
/* C: output pointer, advanced one panel of columns at a time.
   LDC: column stride of C, shifted left by BASE_SHIFT at entry. */
#define C	$10
kusano 2b45e8
#define LDC	$11
kusano 2b45e8
kusano 2b45e8
/* AO, BO: working pointers that walk the current A and B panels. */
#define AO	$12
kusano 2b45e8
#define BO	$13
kusano 2b45e8
kusano 2b45e8
/* Loop counters: I counts row tiles, J counts column panels, L counts
   depth iterations (and doubles as a scratch byte offset). */
#define I	$2
kusano 2b45e8
#define J	$3
kusano 2b45e8
#define L	$7
kusano 2b45e8
kusano 2b45e8
/* CO1..CO4: write pointers for consecutive columns of C, each LDC apart.
   CO3 and CO4 live in $16/$17, which the entry code spills to the stack. */
#define CO1	$14
kusano 2b45e8
#define CO2	$15
kusano 2b45e8
#define CO3	$16
kusano 2b45e8
#define CO4	$17
kusano 2b45e8
kusano 2b45e8
/* OFFSET: loaded from the caller stack at entry. KK: starts as -OFFSET and
   grows by 4 after each 4-column panel. TEMP: scratch for K - KK.
   AORIG: not referenced in the visible part of this file. */
#define OFFSET	$22
kusano 2b45e8
#define KK	$23
kusano 2b45e8
#define TEMP	$24
kusano 2b45e8
#define AORIG	$25
kusano 2b45e8
kusano 2b45e8
/* a1..a4: A operands for the current depth step.
   Note that a3..a8 map to $f26..$f31. */
#define a1	$f0
kusano 2b45e8
#define a2	$f1
kusano 2b45e8
#define a3	$f26
kusano 2b45e8
#define a4	$f27
kusano 2b45e8
kusano 2b45e8
/* a5..a8: second set of A operands used by the 4x unrolled loops. */
#define a5	$f28
kusano 2b45e8
#define a6	$f29
kusano 2b45e8
#define	a7	$f30
kusano 2b45e8
#define	a8	$f31
kusano 2b45e8
kusano 2b45e8
/* b1..b4: B operands for the current depth step; also reused as scratch
   when the solve stage reloads values from AO and BO. */
#define b1	$f2
kusano 2b45e8
#define b2	$f3
kusano 2b45e8
#define b3	$f4
kusano 2b45e8
#define b4	$f5
kusano 2b45e8
kusano 2b45e8
/* b5..b8: second set of B operands used by the 4x unrolled loops. */
#define b5	$f6
kusano 2b45e8
#define b6	$f7
kusano 2b45e8
#define b7	$f8
kusano 2b45e8
#define b8	$f9
kusano 2b45e8
kusano 2b45e8
/* tRC: accumulator for row R and column C of the tile; the loops update it
   from A operand R and B operand C (for example t21 uses a2 and b1).
   Column 1 accumulators: */
#define t11	$f10
kusano 2b45e8
#define t21	$f11
kusano 2b45e8
#define t31	$f12
kusano 2b45e8
#define	t41	$f13
kusano 2b45e8
kusano 2b45e8
/* Column 2 accumulators. */
#define t12	$f14
kusano 2b45e8
#define	t22	$f15
kusano 2b45e8
#define t32	$f16
kusano 2b45e8
#define	t42	$f17
kusano 2b45e8
kusano 2b45e8
/* Column 3 accumulators. */
#define	t13	$f18
kusano 2b45e8
#define	t23	$f19
kusano 2b45e8
#define	t33	$f20
kusano 2b45e8
#define	t43	$f21
kusano 2b45e8
kusano 2b45e8
/* Column 4 accumulators. */
#define	t14	$f22
kusano 2b45e8
#define	t24	$f23
kusano 2b45e8
#define	t34	$f24
kusano 2b45e8
#define t44	$f25
kusano 2b45e8
kusano 2b45e8
	#	Function entry: reserve the save area, spill registers, read OFFSET from
	#	the caller stack, scale LDC, and set up the 4-column panel counter J.
	PROLOGUE
kusano 2b45e8
	
kusano 2b45e8
	daddiu	$sp, $sp, -144					#	reserve a 144-byte save area
kusano 2b45e8
kusano 2b45e8
	SDARG	$16,   0($sp)					#	$16/$17 are used as CO3/CO4
kusano 2b45e8
	SDARG	$17,   8($sp)
kusano 2b45e8
	SDARG	$18,  16($sp)
kusano 2b45e8
	SDARG	$19,  24($sp)
kusano 2b45e8
	SDARG	$20,  32($sp)
kusano 2b45e8
	SDARG	$21,  40($sp)
kusano 2b45e8
	sdc1	$f24, 48($sp)					#	$f24/$f25 are used as t34/t44
kusano 2b45e8
	sdc1	$f25, 56($sp)
kusano 2b45e8
	sdc1	$f26, 64($sp)					#	$f26-$f28 are used as a3-a5
kusano 2b45e8
	sdc1	$f27, 72($sp)
kusano 2b45e8
	sdc1	$f28, 80($sp)
kusano 2b45e8
	#	NOTE(review): $f29-$f31 (the a6-a8 aliases) are written by the unrolled
	#	loops but are not saved here - confirm they are caller-saved under every
	#	ABI this file is built for.
kusano 2b45e8
	SDARG	$22,  88($sp)					#	$22-$25 are used as OFFSET, KK, TEMP, AORIG
kusano 2b45e8
	SDARG	$23,  96($sp)
kusano 2b45e8
	SDARG	$24, 104($sp)
kusano 2b45e8
	SDARG	$25, 112($sp)
kusano 2b45e8
kusano 2b45e8
#ifndef __64BIT__
kusano 2b45e8
	#	NOTE(review): 112($sp) is the same slot that received $25 just above, so
	#	this store overwrites the saved $25 when __64BIT__ is not defined -
	#	confirm against the restore sequence before relying on this path.
	sdc1	$f20,112($sp)
kusano 2b45e8
	sdc1	$f21,120($sp)
kusano 2b45e8
	sdc1	$f22,128($sp)
kusano 2b45e8
	sdc1	$f23,136($sp)
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
											#	RN: panels are processed top to bottom, left to right
kusano 2b45e8
	.align	3
kusano 2b45e8
	LDARG	OFFSET, 144($sp)				#	first slot above the 144-byte frame: the last (stack-passed) parameter
kusano 2b45e8
	dsll	LDC, LDC, BASE_SHIFT			#	scale LDC (presumably elements to bytes; BASE_SHIFT comes from common.h)
kusano 2b45e8
kusano 2b45e8
	neg	KK, OFFSET							#	KK = -OFFSET (original note: for RN OFFSET always 0)
kusano 2b45e8
kusano 2b45e8
	dsra	J,  N, 2						#	J = N / 4, the number of full 4-column panels
kusano 2b45e8
	blez	J, .L30							#	no full panel: go straight to the N & 2 case
kusano 2b45e8
	NOP										#	fills the slot after the branch
kusano 2b45e8
kusano 2b45e8
.L10:
kusano 2b45e8
	daddiu	J, J, -1
kusano 2b45e8
	
kusano 2b45e8
	move	CO1, C
kusano 2b45e8
	daddu	CO2, C,   LDC
kusano 2b45e8
	daddu	CO3, CO2, LDC
kusano 2b45e8
	daddu	CO4, CO3, LDC
kusano 2b45e8
	
kusano 2b45e8
	move	AO, A							#	A is the retangular matrix and B is the trigular matrix	
kusano 2b45e8
	daddu	C,  CO4, LDC					#	Fixed pointer C
kusano 2b45e8
kusano 2b45e8
	dsra	I,  M, 2						#	I=MC/4
kusano 2b45e8
	blez	I, .L20
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L11:
kusano 2b45e8
	MTC	$0,  t11							#	clear results registers
kusano 2b45e8
	MOV	t21, t11
kusano 2b45e8
	MOV	t31, t11
kusano 2b45e8
	MOV	t41, t11
kusano 2b45e8
kusano 2b45e8
	MOV	t12, t11
kusano 2b45e8
	MOV	t22, t11
kusano 2b45e8
	MOV	t32, t11
kusano 2b45e8
	MOV	t42, t11
kusano 2b45e8
kusano 2b45e8
	MOV	t13, t11
kusano 2b45e8
	MOV	t23, t11
kusano 2b45e8
	MOV	t33, t11
kusano 2b45e8
	MOV	t43, t11
kusano 2b45e8
	
kusano 2b45e8
	MOV	t14, t11
kusano 2b45e8
	MOV	t24, t11
kusano 2b45e8
	MOV	t34, t11
kusano 2b45e8
	MOV	t44, t11
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	AO point to the beginning address of sa
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)					#	get 4 a	
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)
kusano 2b45e8
	LD	a4,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(B)					#	B point to the beginning address of every panel Bj
kusano 2b45e8
	LD	b2,  1 * SIZE(B)					#	get 4 b
kusano 2b45e8
	LD	b3,  2 * SIZE(B)
kusano 2b45e8
	LD	b4,  3 * SIZE(B)
kusano 2b45e8
kusano 2b45e8
	dsra	L,  KK, 2						#	L=KK/4, KK is the length of the retangular data part of Bj
kusano 2b45e8
	blez	L, .L15
kusano 2b45e8
	move	BO,  B							#	reset B
kusano 2b45e8
kusano 2b45e8
.L12:
kusano 2b45e8
	LD	a5,  4 * SIZE(AO)
kusano 2b45e8
	LD	a6,  5 * SIZE(AO)
kusano 2b45e8
	LD	a7,  6 * SIZE(AO)
kusano 2b45e8
	LD	a8,  7 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b5,  4 * SIZE(BO)
kusano 2b45e8
	LD	b6,  5 * SIZE(BO)
kusano 2b45e8
	LD	b7,  6 * SIZE(BO)
kusano 2b45e8
	LD	b8,  7 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b1
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
	MADD	t31, t31, a3, b1
kusano 2b45e8
	MADD	t41, t41, a4, b1
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a1, b2
kusano 2b45e8
	MADD	t22, t22, a2, b2
kusano 2b45e8
	MADD	t32, t32, a3, b2
kusano 2b45e8
	MADD	t42, t42, a4, b2
kusano 2b45e8
kusano 2b45e8
	MADD	t13, t13, a1, b3
kusano 2b45e8
	MADD	t23, t23, a2, b3
kusano 2b45e8
	MADD	t33, t33, a3, b3
kusano 2b45e8
	MADD	t43, t43, a4, b3
kusano 2b45e8
kusano 2b45e8
	MADD	t14, t14, a1, b4
kusano 2b45e8
	MADD	t24, t24, a2, b4
kusano 2b45e8
	MADD	t34, t34, a3, b4
kusano 2b45e8
	MADD	t44, t44, a4, b4			#	fisrt 	
kusano 2b45e8
kusano 2b45e8
	LD	a1,  8 * SIZE(AO)
kusano 2b45e8
	LD	a2,  9 * SIZE(AO)
kusano 2b45e8
	LD	a3,  10 * SIZE(AO)
kusano 2b45e8
	LD	a4,  11 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  8 * SIZE(BO)
kusano 2b45e8
	LD	b2,  9 * SIZE(BO)
kusano 2b45e8
	LD	b3,  10 * SIZE(BO)
kusano 2b45e8
	LD	b4,  11 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b5
kusano 2b45e8
	MADD	t21, t21, a6, b5
kusano 2b45e8
	MADD	t31, t31, a7, b5
kusano 2b45e8
	MADD	t41, t41, a8, b5
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a5, b6
kusano 2b45e8
	MADD	t22, t22, a6, b6
kusano 2b45e8
	MADD	t32, t32, a7, b6
kusano 2b45e8
	MADD	t42, t42, a8, b6
kusano 2b45e8
kusano 2b45e8
	MADD	t13, t13, a5, b7
kusano 2b45e8
	MADD	t23, t23, a6, b7
kusano 2b45e8
	MADD	t33, t33, a7, b7
kusano 2b45e8
	MADD	t43, t43, a8, b7
kusano 2b45e8
kusano 2b45e8
	MADD	t14, t14, a5, b8
kusano 2b45e8
	MADD	t24, t24, a6, b8
kusano 2b45e8
	MADD	t34, t34, a7, b8
kusano 2b45e8
	MADD	t44, t44, a8, b8			#	second
kusano 2b45e8
kusano 2b45e8
	LD	a5,  12 * SIZE(AO)
kusano 2b45e8
	LD	a6,  13 * SIZE(AO)
kusano 2b45e8
	LD	a7,  14 * SIZE(AO)
kusano 2b45e8
	LD	a8,  15 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b5,  12 * SIZE(BO)
kusano 2b45e8
	LD	b6,  13 * SIZE(BO)
kusano 2b45e8
	LD	b7,  14 * SIZE(BO)
kusano 2b45e8
	LD	b8,  15 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b1
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
	MADD	t31, t31, a3, b1
kusano 2b45e8
	MADD	t41, t41, a4, b1
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a1, b2
kusano 2b45e8
	MADD	t22, t22, a2, b2
kusano 2b45e8
	MADD	t32, t32, a3, b2
kusano 2b45e8
	MADD	t42, t42, a4, b2
kusano 2b45e8
kusano 2b45e8
	MADD	t13, t13, a1, b3
kusano 2b45e8
	MADD	t23, t23, a2, b3
kusano 2b45e8
	MADD	t33, t33, a3, b3
kusano 2b45e8
	MADD	t43, t43, a4, b3
kusano 2b45e8
kusano 2b45e8
	MADD	t14, t14, a1, b4
kusano 2b45e8
	MADD	t24, t24, a2, b4
kusano 2b45e8
	MADD	t34, t34, a3, b4
kusano 2b45e8
	MADD	t44, t44, a4, b4			#	third
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO, 16 * SIZE			#	AO += 4mr*4kr	
kusano 2b45e8
	daddiu	BO, BO, 16 * SIZE			#	BP += 4nr*4kr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)
kusano 2b45e8
	LD	a4,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
	LD	b4,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b5
kusano 2b45e8
	MADD	t21, t21, a6, b5
kusano 2b45e8
	MADD	t31, t31, a7, b5
kusano 2b45e8
	MADD	t41, t41, a8, b5
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a5, b6
kusano 2b45e8
	MADD	t22, t22, a6, b6
kusano 2b45e8
	MADD	t32, t32, a7, b6
kusano 2b45e8
	MADD	t42, t42, a8, b6
kusano 2b45e8
kusano 2b45e8
	MADD	t13, t13, a5, b7
kusano 2b45e8
	MADD	t23, t23, a6, b7
kusano 2b45e8
	MADD	t33, t33, a7, b7
kusano 2b45e8
	MADD	t43, t43, a8, b7
kusano 2b45e8
kusano 2b45e8
	MADD	t14, t14, a5, b8
kusano 2b45e8
	MADD	t24, t24, a6, b8
kusano 2b45e8
	MADD	t34, t34, a7, b8
kusano 2b45e8
	MADD	t44, t44, a8, b8			#	fouth
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L12
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L15:
kusano 2b45e8
	andi	L, KK, 3					#	deal with kc remainder part
kusano 2b45e8
	blez	L, .L18
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
	.align	3
kusano 2b45e8
.L16:
kusano 2b45e8
	MADD	t11, t11, a1, b1
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
	MADD	t31, t31, a3, b1
kusano 2b45e8
	MADD	t41, t41, a4, b1
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a1, b2
kusano 2b45e8
	MADD	t22, t22, a2, b2
kusano 2b45e8
	MADD	t32, t32, a3, b2
kusano 2b45e8
	MADD	t42, t42, a4, b2
kusano 2b45e8
kusano 2b45e8
	MADD	t13, t13, a1, b3
kusano 2b45e8
	MADD	t23, t23, a2, b3
kusano 2b45e8
	MADD	t33, t33, a3, b3
kusano 2b45e8
	MADD	t43, t43, a4, b3
kusano 2b45e8
kusano 2b45e8
	MADD	t14, t14, a1, b4
kusano 2b45e8
	MADD	t24, t24, a2, b4
kusano 2b45e8
	MADD	t34, t34, a3, b4
kusano 2b45e8
	MADD	t44, t44, a4, b4		
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO, 4 * SIZE			#	AO += 4mr	
kusano 2b45e8
	daddiu	BO, BO, 4 * SIZE			#	BP += 4nr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)
kusano 2b45e8
	LD	a4,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
	LD	b4,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L16
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L18:									#	.L18 always deal with the trigular data part
kusano 2b45e8
 	LD	b1,  0 * SIZE(AO)				#	for RN & RT A is the result matrix
kusano 2b45e8
	LD	b2,  1 * SIZE(AO)				#	Fixed results
kusano 2b45e8
	LD	b3,  2 * SIZE(AO)				
kusano 2b45e8
	LD	b4,  3 * SIZE(AO)				#	sa stored as col major
kusano 2b45e8
kusano 2b45e8
	SUB	t11, b1, t11
kusano 2b45e8
	SUB	t21, b2, t21
kusano 2b45e8
	SUB	t31, b3, t31
kusano 2b45e8
	SUB	t41, b4, t41
kusano 2b45e8
kusano 2b45e8
 	LD	b5,  4 * SIZE(AO)
kusano 2b45e8
	LD	b6,  5 * SIZE(AO)
kusano 2b45e8
	LD	b7,  6 * SIZE(AO)
kusano 2b45e8
	LD	b8,  7 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	SUB	t12, b5, t12
kusano 2b45e8
	SUB	t22, b6, t22
kusano 2b45e8
	SUB	t32, b7, t32
kusano 2b45e8
	SUB	t42, b8, t42
kusano 2b45e8
kusano 2b45e8
 	LD	b1,  8 * SIZE(AO)
kusano 2b45e8
	LD	b2,  9 * SIZE(AO)
kusano 2b45e8
	LD	b3, 10 * SIZE(AO)
kusano 2b45e8
	LD	b4, 11 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	SUB	t13, b1, t13
kusano 2b45e8
	SUB	t23, b2, t23
kusano 2b45e8
	SUB	t33, b3, t33
kusano 2b45e8
	SUB	t43, b4, t43
kusano 2b45e8
kusano 2b45e8
 	LD	b5, 12 * SIZE(AO)
kusano 2b45e8
	LD	b6, 13 * SIZE(AO)
kusano 2b45e8
	LD	b7, 14 * SIZE(AO)
kusano 2b45e8
	LD	b8, 15 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	SUB	t14, b5, t14
kusano 2b45e8
	SUB	t24, b6, t24
kusano 2b45e8
	SUB	t34, b7, t34
kusano 2b45e8
	SUB	t44, b8, t44
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)				#	BO point to the beginning of the trigular data part of Bj
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
	LD	b4,  3 * SIZE(BO)
kusano 2b45e8
	MUL	t11, b1, t11
kusano 2b45e8
	MUL	t21, b1, t21
kusano 2b45e8
	MUL	t31, b1, t31
kusano 2b45e8
	MUL	t41, b1, t41
kusano 2b45e8
	NMSUB	t12, t12, b2, t11
kusano 2b45e8
	NMSUB	t22, t22, b2, t21
kusano 2b45e8
	NMSUB	t32, t32, b2, t31
kusano 2b45e8
	NMSUB	t42, t42, b2, t41
kusano 2b45e8
	NMSUB	t13, t13, b3, t11
kusano 2b45e8
	NMSUB	t23, t23, b3, t21
kusano 2b45e8
	NMSUB	t33, t33, b3, t31
kusano 2b45e8
	NMSUB	t43, t43, b3, t41
kusano 2b45e8
	NMSUB	t14, t14, b4, t11
kusano 2b45e8
	NMSUB	t24, t24, b4, t21
kusano 2b45e8
	NMSUB	t34, t34, b4, t31
kusano 2b45e8
	NMSUB	t44, t44, b4, t41
kusano 2b45e8
kusano 2b45e8
	
kusano 2b45e8
	LD	b5,  5 * SIZE(BO)
kusano 2b45e8
	LD	b6,  6 * SIZE(BO)
kusano 2b45e8
	LD	b7,  7 * SIZE(BO)
kusano 2b45e8
	MUL	t12, b5, t12
kusano 2b45e8
	MUL	t22, b5, t22
kusano 2b45e8
	MUL	t32, b5, t32
kusano 2b45e8
	MUL	t42, b5, t42
kusano 2b45e8
	NMSUB	t13, t13, b6, t12
kusano 2b45e8
	NMSUB	t23, t23, b6, t22
kusano 2b45e8
	NMSUB	t33, t33, b6, t32
kusano 2b45e8
	NMSUB	t43, t43, b6, t42
kusano 2b45e8
	NMSUB	t14, t14, b7, t12
kusano 2b45e8
	NMSUB	t24, t24, b7, t22
kusano 2b45e8
	NMSUB	t34, t34, b7, t32
kusano 2b45e8
	NMSUB	t44, t44, b7, t42
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b8,  10 * SIZE(BO)
kusano 2b45e8
	LD	b1,  11 * SIZE(BO)
kusano 2b45e8
	MUL	t13, b8, t13
kusano 2b45e8
	MUL	t23, b8, t23
kusano 2b45e8
	MUL	t33, b8, t33
kusano 2b45e8
	MUL	t43, b8, t43
kusano 2b45e8
	NMSUB	t14, t14, b1, t13
kusano 2b45e8
	NMSUB	t24, t24, b1, t23
kusano 2b45e8
	NMSUB	t34, t34, b1, t33
kusano 2b45e8
	NMSUB	t44, t44, b1, t43
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b2,  15 * SIZE(BO)
kusano 2b45e8
	MUL	t14, b2, t14
kusano 2b45e8
	MUL	t24, b2, t24
kusano 2b45e8
	MUL	t34, b2, t34
kusano 2b45e8
	MUL	t44, b2, t44
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(AO)				#	update packed blockA for follow-up compute
kusano 2b45e8
	ST	t21,  1 * SIZE(AO)
kusano 2b45e8
	ST	t31,  2 * SIZE(AO)
kusano 2b45e8
	ST	t41,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	ST	t12,  4 * SIZE(AO)
kusano 2b45e8
	ST	t22,  5 * SIZE(AO)
kusano 2b45e8
	ST	t32,  6 * SIZE(AO)
kusano 2b45e8
	ST	t42,  7 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	ST	t13,  8 * SIZE(AO)
kusano 2b45e8
	ST	t23,  9 * SIZE(AO)
kusano 2b45e8
	ST	t33, 10 * SIZE(AO)
kusano 2b45e8
	ST	t43, 11 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	ST	t14, 12 * SIZE(AO)
kusano 2b45e8
	ST	t24, 13 * SIZE(AO)
kusano 2b45e8
	ST	t34, 14 * SIZE(AO)
kusano 2b45e8
	ST	t44, 15 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(CO1)				#	write back results
kusano 2b45e8
	ST	t21,  1 * SIZE(CO1)
kusano 2b45e8
	ST	t31,  2 * SIZE(CO1)
kusano 2b45e8
	ST	t41,  3 * SIZE(CO1)
kusano 2b45e8
	
kusano 2b45e8
	ST	t12,  0 * SIZE(CO2)
kusano 2b45e8
	ST	t22,  1 * SIZE(CO2)
kusano 2b45e8
	ST	t32,  2 * SIZE(CO2)
kusano 2b45e8
	ST	t42,  3 * SIZE(CO2)
kusano 2b45e8
	
kusano 2b45e8
	ST	t13,  0 * SIZE(CO3)
kusano 2b45e8
	ST	t23,  1 * SIZE(CO3)
kusano 2b45e8
	ST	t33,  2 * SIZE(CO3)
kusano 2b45e8
	ST	t43,  3 * SIZE(CO3)
kusano 2b45e8
	
kusano 2b45e8
	ST	t14,  0 * SIZE(CO4)
kusano 2b45e8
	ST	t24,  1 * SIZE(CO4)
kusano 2b45e8
	ST	t34,  2 * SIZE(CO4)
kusano 2b45e8
	ST	t44,  3 * SIZE(CO4)
kusano 2b45e8
kusano 2b45e8
	daddiu	CO1, CO1, 4 * SIZE			#	fixed address
kusano 2b45e8
	daddiu	CO2, CO2, 4 * SIZE
kusano 2b45e8
	daddiu	CO3, CO3, 4 * SIZE
kusano 2b45e8
	daddiu	CO4, CO4, 4 * SIZE
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	dsubu	TEMP, K, KK					#	temp = kc - retangular data length of every panel
kusano 2b45e8
	dsll	L,    TEMP, 2 + BASE_SHIFT
kusano 2b45e8
	dsll	TEMP, TEMP, 2 + BASE_SHIFT
kusano 2b45e8
	daddu	AO, AO, L					#	move AO to the end of this panel. also the beginning of next panel
kusano 2b45e8
	daddu	BO, BO, TEMP				#	move BO to the end of this panel
kusano 2b45e8
kusano 2b45e8
	daddiu	I, I, -1
kusano 2b45e8
	bgtz	I, .L11
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L20:									#	2-row remainder tile of the current 4-column panel
kusano 2b45e8
	andi	I,  M, 2					#	mr=2: taken only when bit 1 of M is set
kusano 2b45e8
	blez	I, .L50						#	otherwise try the 1-row remainder
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
	MTC	$0,  t11							#	clear results registers
kusano 2b45e8
	MOV	t21, t11
kusano 2b45e8
	MOV	t31, t11						#	t3x/t4x are cleared as well, although the 2-row loops only update t1x/t2x
kusano 2b45e8
	MOV	t41, t11
kusano 2b45e8
kusano 2b45e8
	MOV	t12, t11
kusano 2b45e8
	MOV	t22, t11
kusano 2b45e8
	MOV	t32, t11
kusano 2b45e8
	MOV	t42, t11
kusano 2b45e8
kusano 2b45e8
	MOV	t13, t11
kusano 2b45e8
	MOV	t23, t11
kusano 2b45e8
	MOV	t33, t11
kusano 2b45e8
	MOV	t43, t11
kusano 2b45e8
	
kusano 2b45e8
	MOV	t14, t11
kusano 2b45e8
	MOV	t24, t11
kusano 2b45e8
	MOV	t34, t11
kusano 2b45e8
	MOV	t44, t11
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	AO points at the current position in packed A
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)					#	preload 2 A values (one per row of the tile)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(B)					#	B points to the beginning address of every panel Bj
kusano 2b45e8
	LD	b2,  1 * SIZE(B)					#	preload 4 B values (one per column)
kusano 2b45e8
	LD	b3,  2 * SIZE(B)
kusano 2b45e8
	LD	b4,  3 * SIZE(B)
kusano 2b45e8
kusano 2b45e8
	dsra	L,  KK, 2						#	L=KK/4, KK is the length of the rectangular data part of Bj
kusano 2b45e8
	blez	L, .L25							#	fewer than 4 depth steps: skip the unrolled loop
kusano 2b45e8
	move	BO,  B							#	reset BO to the panel start; .L25 needs it on the taken path too
kusano 2b45e8
kusano 2b45e8
.L22:
kusano 2b45e8
	LD	a5,  2 * SIZE(AO)
kusano 2b45e8
	LD	a6,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b5,  4 * SIZE(BO)
kusano 2b45e8
	LD	b6,  5 * SIZE(BO)
kusano 2b45e8
	LD	b7,  6 * SIZE(BO)
kusano 2b45e8
	LD	b8,  7 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b1
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a1, b2
kusano 2b45e8
	MADD	t22, t22, a2, b2
kusano 2b45e8
kusano 2b45e8
	MADD	t13, t13, a1, b3
kusano 2b45e8
	MADD	t23, t23, a2, b3
kusano 2b45e8
kusano 2b45e8
	MADD	t14, t14, a1, b4
kusano 2b45e8
	MADD	t24, t24, a2, b4
kusano 2b45e8
kusano 2b45e8
	LD	a3,  4 * SIZE(AO)
kusano 2b45e8
	LD	a4,  5 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  8 * SIZE(BO)
kusano 2b45e8
	LD	b2,  9 * SIZE(BO)
kusano 2b45e8
	LD	b3,  10 * SIZE(BO)
kusano 2b45e8
	LD	b4,  11 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b5
kusano 2b45e8
	MADD	t21, t21, a6, b5
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a5, b6
kusano 2b45e8
	MADD	t22, t22, a6, b6
kusano 2b45e8
kusano 2b45e8
	MADD	t13, t13, a5, b7
kusano 2b45e8
	MADD	t23, t23, a6, b7
kusano 2b45e8
kusano 2b45e8
	MADD	t14, t14, a5, b8
kusano 2b45e8
	MADD	t24, t24, a6, b8
kusano 2b45e8
kusano 2b45e8
	LD	a7,  6 * SIZE(AO)
kusano 2b45e8
	LD	a8,  7 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b5,  12 * SIZE(BO)
kusano 2b45e8
	LD	b6,  13 * SIZE(BO)
kusano 2b45e8
	LD	b7,  14 * SIZE(BO)
kusano 2b45e8
	LD	b8,  15 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a3, b1
kusano 2b45e8
	MADD	t21, t21, a4, b1
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a3, b2
kusano 2b45e8
	MADD	t22, t22, a4, b2
kusano 2b45e8
kusano 2b45e8
	MADD	t13, t13, a3, b3
kusano 2b45e8
	MADD	t23, t23, a4, b3
kusano 2b45e8
kusano 2b45e8
	MADD	t14, t14, a3, b4
kusano 2b45e8
	MADD	t24, t24, a4, b4
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO, 8 * SIZE			#	AO += 2mr*4kr	
kusano 2b45e8
	daddiu	BO, BO, 16 * SIZE			#	BP += 4nr*4kr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
	LD	b4,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a7, b5
kusano 2b45e8
	MADD	t21, t21, a8, b5
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a7, b6
kusano 2b45e8
	MADD	t22, t22, a8, b6
kusano 2b45e8
kusano 2b45e8
	MADD	t13, t13, a7, b7
kusano 2b45e8
	MADD	t23, t23, a8, b7
kusano 2b45e8
kusano 2b45e8
	MADD	t14, t14, a7, b8
kusano 2b45e8
	MADD	t24, t24, a8, b8
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L22
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L25:
kusano 2b45e8
	andi	L, KK, 3					#	deal with kc remainder part
kusano 2b45e8
	blez	L, .L28
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
	.align	3
kusano 2b45e8
.L26:
kusano 2b45e8
	MADD	t11, t11, a1, b1
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a1, b2
kusano 2b45e8
	MADD	t22, t22, a2, b2
kusano 2b45e8
kusano 2b45e8
	MADD	t13, t13, a1, b3
kusano 2b45e8
	MADD	t23, t23, a2, b3
kusano 2b45e8
kusano 2b45e8
	MADD	t14, t14, a1, b4
kusano 2b45e8
	MADD	t24, t24, a2, b4
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO, 2 * SIZE			#	AO += 2mr	
kusano 2b45e8
	daddiu	BO, BO, 4 * SIZE			#	BP += 4nr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
	LD	b4,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L26
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L28:									#	2-row tile: solve against the triangular part of the B panel
kusano 2b45e8
 	LD	b1,  0 * SIZE(AO)				#	for RN & RT the packed A block supplies the values to solve for
kusano 2b45e8
	LD	b2,  1 * SIZE(AO)				#	two rows of column 1
kusano 2b45e8
kusano 2b45e8
	SUB	t11, b1, t11					#	presumably t = stored value - accumulated product (SUB comes from common.h)
kusano 2b45e8
	SUB	t21, b2, t21
kusano 2b45e8
kusano 2b45e8
 	LD	b5,  2 * SIZE(AO)				#	two rows of column 2
kusano 2b45e8
	LD	b6,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	SUB	t12, b5, t12
kusano 2b45e8
	SUB	t22, b6, t22
kusano 2b45e8
kusano 2b45e8
	LD	b3,  4 * SIZE(AO)				#	two rows of column 3
kusano 2b45e8
	LD	b4,  5 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	SUB	t13, b3, t13
kusano 2b45e8
	SUB	t23, b4, t23
kusano 2b45e8
kusano 2b45e8
	LD	b7,  6 * SIZE(AO)				#	two rows of column 4
kusano 2b45e8
	LD	b8,  7 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	SUB	t14, b7, t14
kusano 2b45e8
	SUB	t24, b8, t24
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)				#	BO points to the beginning of the triangular data part of Bj
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)				#	entries 0..3: first row of the 4x4 block
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
	LD	b4,  3 * SIZE(BO)
kusano 2b45e8
	MUL	t11, b1, t11					#	column 1 is scaled by entry 0; a multiply suggests the diagonal is stored inverted - TODO confirm against the packing code
kusano 2b45e8
	MUL	t21, b1, t21
kusano 2b45e8
	NMSUB	t12, t12, b2, t11			#	presumably t12 -= b2 * t11: remove the column 1 contribution from columns 2..4
kusano 2b45e8
	NMSUB	t22, t22, b2, t21
kusano 2b45e8
	NMSUB	t13, t13, b3, t11
kusano 2b45e8
	NMSUB	t23, t23, b3, t21
kusano 2b45e8
	NMSUB	t14, t14, b4, t11
kusano 2b45e8
	NMSUB	t24, t24, b4, t21
kusano 2b45e8
kusano 2b45e8
	
kusano 2b45e8
	LD	b5,  5 * SIZE(BO)				#	entries 5..7: second row from its diagonal onward
kusano 2b45e8
	LD	b6,  6 * SIZE(BO)
kusano 2b45e8
	LD	b7,  7 * SIZE(BO)
kusano 2b45e8
	MUL	t12, b5, t12					#	column 2 is scaled by entry 5
kusano 2b45e8
	MUL	t22, b5, t22
kusano 2b45e8
	NMSUB	t13, t13, b6, t12			#	remove the column 2 contribution from columns 3..4
kusano 2b45e8
	NMSUB	t23, t23, b6, t22
kusano 2b45e8
	NMSUB	t14, t14, b7, t12
kusano 2b45e8
	NMSUB	t24, t24, b7, t22
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b8,  10 * SIZE(BO)				#	entries 10..11: third row from its diagonal onward
kusano 2b45e8
	LD	b1,  11 * SIZE(BO)
kusano 2b45e8
	MUL	t13, b8, t13					#	column 3 is scaled by entry 10
kusano 2b45e8
	MUL	t23, b8, t23
kusano 2b45e8
	NMSUB	t14, t14, b1, t13			#	remove the column 3 contribution from column 4
kusano 2b45e8
	NMSUB	t24, t24, b1, t23
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b2,  15 * SIZE(BO)				#	entry 15: last diagonal entry
kusano 2b45e8
	MUL	t14, b2, t14					#	column 4 is scaled by entry 15
kusano 2b45e8
	MUL	t24, b2, t24
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(AO)				#	update packed blockA for follow-up compute
kusano 2b45e8
	ST	t21,  1 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	ST	t12,  2 * SIZE(AO)
kusano 2b45e8
	ST	t22,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	ST	t13,  4 * SIZE(AO)
kusano 2b45e8
	ST	t23,  5 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	ST	t14,  6 * SIZE(AO)
kusano 2b45e8
	ST	t24,  7 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(CO1)				#	write back results, two rows per C column
kusano 2b45e8
	ST	t21,  1 * SIZE(CO1)
kusano 2b45e8
	
kusano 2b45e8
	ST	t12,  0 * SIZE(CO2)
kusano 2b45e8
	ST	t22,  1 * SIZE(CO2)
kusano 2b45e8
	
kusano 2b45e8
	ST	t13,  0 * SIZE(CO3)
kusano 2b45e8
	ST	t23,  1 * SIZE(CO3)
kusano 2b45e8
	
kusano 2b45e8
	ST	t14,  0 * SIZE(CO4)
kusano 2b45e8
	ST	t24,  1 * SIZE(CO4)
kusano 2b45e8
kusano 2b45e8
	daddiu	CO1, CO1, 2 * SIZE			#	step every C column pointer past the 2 rows just written
kusano 2b45e8
	daddiu	CO2, CO2, 2 * SIZE			#	mr=2
kusano 2b45e8
	daddiu	CO3, CO3, 2 * SIZE
kusano 2b45e8
	daddiu	CO4, CO4, 2 * SIZE
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	dsubu	TEMP, K, KK					#	temp = kc - rectangular data length of every panel
kusano 2b45e8
	dsll	L,    TEMP, 1 + BASE_SHIFT	#	mr=2: A offset = TEMP * 2, further shifted by BASE_SHIFT
kusano 2b45e8
	dsll	TEMP, TEMP, 2 + BASE_SHIFT	#	nr=4: B offset = TEMP * 4, further shifted by BASE_SHIFT
kusano 2b45e8
	daddu	AO, AO, L					#	move AO to the end of this panel. also the beginning of next panel
kusano 2b45e8
	daddu	BO, BO, TEMP				#	move BO to the end of this panel
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L50:									
kusano 2b45e8
	andi	I,  M, 1					#	mr=1
kusano 2b45e8
	blez	I, .L29
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
	MTC	$0,  t11							#	clear results registers
kusano 2b45e8
	MOV	t21, t11
kusano 2b45e8
	MOV	t31, t11
kusano 2b45e8
	MOV	t41, t11
kusano 2b45e8
kusano 2b45e8
	MOV	t12, t11
kusano 2b45e8
	MOV	t22, t11
kusano 2b45e8
	MOV	t32, t11
kusano 2b45e8
	MOV	t42, t11
kusano 2b45e8
kusano 2b45e8
	MOV	t13, t11
kusano 2b45e8
	MOV	t23, t11
kusano 2b45e8
	MOV	t33, t11
kusano 2b45e8
	MOV	t43, t11
kusano 2b45e8
	
kusano 2b45e8
	MOV	t14, t11
kusano 2b45e8
	MOV	t24, t11
kusano 2b45e8
	MOV	t34, t11
kusano 2b45e8
	MOV	t44, t11
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	AO point to the beginning address of sa
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(B)					#	B point to the beginning address of every panel Bj
kusano 2b45e8
	LD	b2,  1 * SIZE(B)					#	get 4 b
kusano 2b45e8
	LD	b3,  2 * SIZE(B)
kusano 2b45e8
	LD	b4,  3 * SIZE(B)
kusano 2b45e8
kusano 2b45e8
	dsra	L,  KK, 2						#	L=KK/4, KK is the length of the retangular data part of Bj
kusano 2b45e8
	blez	L, .L55
kusano 2b45e8
	move	BO,  B							#	reset B
kusano 2b45e8
kusano 2b45e8
.L52:
kusano 2b45e8
	LD	a5,  1 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b5,  4 * SIZE(BO)
kusano 2b45e8
	LD	b6,  5 * SIZE(BO)
kusano 2b45e8
	LD	b7,  6 * SIZE(BO)
kusano 2b45e8
	LD	b8,  7 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b1
kusano 2b45e8
	MADD	t12, t12, a1, b2
kusano 2b45e8
	MADD	t13, t13, a1, b3
kusano 2b45e8
	MADD	t14, t14, a1, b4
kusano 2b45e8
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  8 * SIZE(BO)
kusano 2b45e8
	LD	b2,  9 * SIZE(BO)
kusano 2b45e8
	LD	b3,  10 * SIZE(BO)
kusano 2b45e8
	LD	b4,  11 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b5
kusano 2b45e8
	MADD	t12, t12, a5, b6
kusano 2b45e8
	MADD	t13, t13, a5, b7
kusano 2b45e8
	MADD	t14, t14, a5, b8
kusano 2b45e8
kusano 2b45e8
	LD	a7,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b5,  12 * SIZE(BO)
kusano 2b45e8
	LD	b6,  13 * SIZE(BO)
kusano 2b45e8
	LD	b7,  14 * SIZE(BO)
kusano 2b45e8
	LD	b8,  15 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a3, b1
kusano 2b45e8
	MADD	t12, t12, a3, b2
kusano 2b45e8
	MADD	t13, t13, a3, b3
kusano 2b45e8
	MADD	t14, t14, a3, b4
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO, 4 * SIZE			#	AO += 1mr*4kr	
kusano 2b45e8
	daddiu	BO, BO, 16 * SIZE			#	BP += 4nr*4kr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
	LD	b4,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a7, b5
kusano 2b45e8
	MADD	t12, t12, a7, b6
kusano 2b45e8
	MADD	t13, t13, a7, b7
kusano 2b45e8
	MADD	t14, t14, a7, b8
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L52
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L55:
kusano 2b45e8
	andi	L, KK, 3					#	deal with kc remainder part
kusano 2b45e8
	blez	L, .L58
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
	.align	3
kusano 2b45e8
.L56:
kusano 2b45e8
	MADD	t11, t11, a1, b1
kusano 2b45e8
	MADD	t12, t12, a1, b2
kusano 2b45e8
	MADD	t13, t13, a1, b3
kusano 2b45e8
	MADD	t14, t14, a1, b4
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO, 1 * SIZE			#	AO += 1mr	
kusano 2b45e8
	daddiu	BO, BO, 4 * SIZE			#	BP += 4nr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
	LD	b4,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L56
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L58:									#	1-row tile: solve against the triangular part of the B panel
kusano 2b45e8
 	LD	b1,  0 * SIZE(AO)				#	for RN & RT the packed A block supplies the values to solve for
kusano 2b45e8
 	LD	b5,  1 * SIZE(AO)				#	one value per column: b1, b5, b3, b7 feed columns 1..4
kusano 2b45e8
	LD	b3,  2 * SIZE(AO)
kusano 2b45e8
	LD	b7,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	SUB	t11, b1, t11					#	presumably t = stored value - accumulated product (SUB comes from common.h)
kusano 2b45e8
	SUB	t12, b5, t12
kusano 2b45e8
	SUB	t13, b3, t13
kusano 2b45e8
	SUB	t14, b7, t14
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)				#	BO points to the beginning of the triangular data part of Bj
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)				#	entries 0..3: first row of the 4x4 block
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
	LD	b4,  3 * SIZE(BO)
kusano 2b45e8
	MUL	t11, b1, t11					#	column 1 is scaled by entry 0; a multiply suggests the diagonal is stored inverted - TODO confirm against the packing code
kusano 2b45e8
	NMSUB	t12, t12, b2, t11			#	presumably t12 -= b2 * t11: remove the column 1 contribution from columns 2..4
kusano 2b45e8
	NMSUB	t13, t13, b3, t11
kusano 2b45e8
	NMSUB	t14, t14, b4, t11
kusano 2b45e8
kusano 2b45e8
	
kusano 2b45e8
	LD	b5,  5 * SIZE(BO)				#	entries 5..7: second row from its diagonal onward
kusano 2b45e8
	LD	b6,  6 * SIZE(BO)
kusano 2b45e8
	LD	b7,  7 * SIZE(BO)
kusano 2b45e8
	MUL	t12, b5, t12					#	column 2 is scaled by entry 5
kusano 2b45e8
	NMSUB	t13, t13, b6, t12			#	remove the column 2 contribution from columns 3..4
kusano 2b45e8
	NMSUB	t14, t14, b7, t12
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b8,  10 * SIZE(BO)				#	entries 10..11: third row from its diagonal onward
kusano 2b45e8
	LD	b1,  11 * SIZE(BO)
kusano 2b45e8
	MUL	t13, b8, t13					#	column 3 is scaled by entry 10
kusano 2b45e8
	NMSUB	t14, t14, b1, t13			#	remove the column 3 contribution from column 4
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b2,  15 * SIZE(BO)				#	entry 15: last diagonal entry
kusano 2b45e8
	MUL	t14, b2, t14					#	column 4 is scaled by entry 15
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(AO)				#	update packed blockA for follow-up compute
kusano 2b45e8
	ST	t12,  1 * SIZE(AO)
kusano 2b45e8
	ST	t13,  2 * SIZE(AO)
kusano 2b45e8
	ST	t14,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(CO1)				#	write back results, one row per C column
kusano 2b45e8
	ST	t12,  0 * SIZE(CO2)
kusano 2b45e8
	ST	t13,  0 * SIZE(CO3)
kusano 2b45e8
	ST	t14,  0 * SIZE(CO4)
kusano 2b45e8
kusano 2b45e8
	daddiu	CO1, CO1, 1 * SIZE			#	step every C column pointer past the single row just written
kusano 2b45e8
	daddiu	CO2, CO2, 1 * SIZE			#	mr=1
kusano 2b45e8
	daddiu	CO3, CO3, 1 * SIZE
kusano 2b45e8
	daddiu	CO4, CO4, 1 * SIZE
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	dsubu	TEMP, K, KK					#	temp = kc - rectangular data length of every panel
kusano 2b45e8
	dsll	L,    TEMP, BASE_SHIFT	#	mr=1: A offset = TEMP shifted by BASE_SHIFT only
kusano 2b45e8
	dsll	TEMP, TEMP, 2 + BASE_SHIFT	#	nr=4: B offset = TEMP * 4, further shifted by BASE_SHIFT
kusano 2b45e8
	daddu	AO, AO, L					#	move AO to the end of this panel. also the beginning of next panel
kusano 2b45e8
	daddu	BO, BO, TEMP				#	move BO to the end of this panel
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L29:									#	tail of the 4-column panel loop
kusano 2b45e8
	move	B,  BO						#	BO was left at the end of this panel, so B now names the next panel of Bj
kusano 2b45e8
	daddiu	KK, KK,  4					#	rectangular data length increases by 4 for the next panel
kusano 2b45e8
	bgtz	J, .L10						#	J was already decremented at the top of .L10; loop while panels remain
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
	
kusano 2b45e8
.L30:
kusano 2b45e8
	andi	J,  N, 2
kusano 2b45e8
	blez	J, .L70
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
	move	CO1, C
kusano 2b45e8
	daddu	CO2, C,   LDC
kusano 2b45e8
	
kusano 2b45e8
	move	AO, A							#	A is the retangular matrix and B is the trigular matrix	
kusano 2b45e8
	daddu	C,  CO2, LDC					#	Fixed pointer C
kusano 2b45e8
kusano 2b45e8
	dsra	I,  M, 2						#	I=MC/4
kusano 2b45e8
	blez	I, .L40
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L31:									#	4-row tile of the 2-column panel: set up accumulators and preload operands
kusano 2b45e8
	MTC	$0,  t11							#	clear results registers (only the two columns this panel uses)
kusano 2b45e8
	MOV	t21, t11
kusano 2b45e8
	MOV	t31, t11
kusano 2b45e8
	MOV	t41, t11
kusano 2b45e8
kusano 2b45e8
	MOV	t12, t11
kusano 2b45e8
	MOV	t22, t11
kusano 2b45e8
	MOV	t32, t11
kusano 2b45e8
	MOV	t42, t11
kusano 2b45e8
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	AO points at the current position in packed A
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)					#	preload 4 A values (one per row of the tile)
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)
kusano 2b45e8
	LD	a4,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(B)					#	B points to the beginning address of every panel Bj
kusano 2b45e8
	LD	b2,  1 * SIZE(B)					#	preload 2 B values (one per column)
kusano 2b45e8
kusano 2b45e8
	dsra	L,  KK, 2						#	L=KK/4, KK is the length of the rectangular data part of Bj
kusano 2b45e8
	blez	L, .L35							#	fewer than 4 depth steps: skip the unrolled loop
kusano 2b45e8
	move	BO,  B							#	reset BO to the panel start; placed directly after the branch
kusano 2b45e8
kusano 2b45e8
.L32:									#	4x2 main loop: 4 steps of k per pass, L passes
	#	Each step consumes 4 values of A and 2 values of B. The loads for the next
	#	step are issued before the MADDs of the current step.
	#	MADD d, c, x, y is read here as d = c + x * y (macro from common.h, confirm).
kusano 2b45e8
	LD	a5,  4 * SIZE(AO)				#	A values for step 2
kusano 2b45e8
	LD	a6,  5 * SIZE(AO)
kusano 2b45e8
	LD	a7,  6 * SIZE(AO)
kusano 2b45e8
	LD	a8,  7 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b5,  2 * SIZE(BO)				#	B values for step 2
kusano 2b45e8
	LD	b6,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b1			#	step 1, column 1
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
	MADD	t31, t31, a3, b1
kusano 2b45e8
	MADD	t41, t41, a4, b1
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a1, b2			#	step 1, column 2
kusano 2b45e8
	MADD	t22, t22, a2, b2
kusano 2b45e8
	MADD	t32, t32, a3, b2
kusano 2b45e8
	MADD	t42, t42, a4, b2
kusano 2b45e8
kusano 2b45e8
	LD	a1,  8 * SIZE(AO)				#	A values for step 3
kusano 2b45e8
	LD	a2,  9 * SIZE(AO)
kusano 2b45e8
	LD	a3,  10 * SIZE(AO)
kusano 2b45e8
	LD	a4,  11 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b3,  4 * SIZE(BO)				#	B values for step 3
kusano 2b45e8
	LD	b4,  5 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b5			#	step 2, column 1
kusano 2b45e8
	MADD	t21, t21, a6, b5
kusano 2b45e8
	MADD	t31, t31, a7, b5
kusano 2b45e8
	MADD	t41, t41, a8, b5
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a5, b6			#	step 2, column 2
kusano 2b45e8
	MADD	t22, t22, a6, b6
kusano 2b45e8
	MADD	t32, t32, a7, b6
kusano 2b45e8
	MADD	t42, t42, a8, b6
kusano 2b45e8
kusano 2b45e8
	LD	a5,  12 * SIZE(AO)				#	A values for step 4
kusano 2b45e8
	LD	a6,  13 * SIZE(AO)
kusano 2b45e8
	LD	a7,  14 * SIZE(AO)
kusano 2b45e8
	LD	a8,  15 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b7,  6 * SIZE(BO)				#	B values for step 4
kusano 2b45e8
	LD	b8,  7 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b3			#	step 3, column 1
kusano 2b45e8
	MADD	t21, t21, a2, b3
kusano 2b45e8
	MADD	t31, t31, a3, b3
kusano 2b45e8
	MADD	t41, t41, a4, b3
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a1, b4			#	step 3, column 2
kusano 2b45e8
	MADD	t22, t22, a2, b4
kusano 2b45e8
	MADD	t32, t32, a3, b4
kusano 2b45e8
	MADD	t42, t42, a4, b4
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO, 16 * SIZE			#	AO += 4mr*4kr	
kusano 2b45e8
	daddiu	BO, BO,  8 * SIZE			#	BO += 2nr*4kr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)				#	preload step 1 of the next pass (or the first remainder step)
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)
kusano 2b45e8
	LD	a4,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b7			#	step 4, column 1
kusano 2b45e8
	MADD	t21, t21, a6, b7
kusano 2b45e8
	MADD	t31, t31, a7, b7
kusano 2b45e8
	MADD	t41, t41, a8, b7
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a5, b8			#	step 4, column 2
kusano 2b45e8
	MADD	t22, t22, a6, b8
kusano 2b45e8
	MADD	t32, t32, a7, b8
kusano 2b45e8
	MADD	t42, t42, a8, b8
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1					#	one pass done
kusano 2b45e8
	bgtz	L, .L32						#	loop while passes remain
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L35:									#	4x2 remainder: the KK & 3 steps left after the unrolled loop
kusano 2b45e8
	andi	L, KK, 3					#	L = KK mod 4
kusano 2b45e8
	blez	L, .L38						#	nothing left: go to the solve step at .L38
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
	.align	3
kusano 2b45e8
.L36:									#	one step of k per pass; a1..a4, b1, b2 are already loaded on entry
kusano 2b45e8
	MADD	t11, t11, a1, b1			#	column 1 accumulators
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
	MADD	t31, t31, a3, b1
kusano 2b45e8
	MADD	t41, t41, a4, b1
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a1, b2			#	column 2 accumulators
kusano 2b45e8
	MADD	t22, t22, a2, b2
kusano 2b45e8
	MADD	t32, t32, a3, b2
kusano 2b45e8
	MADD	t42, t42, a4, b2
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO, 4 * SIZE			#	AO += 4mr	
kusano 2b45e8
	daddiu	BO, BO, 2 * SIZE			#	BO += 2nr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)				#	reload operands for the next pass; after the last pass .L38 does not use them
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)
kusano 2b45e8
	LD	a4,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L36						#	loop while remainder steps remain
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L38:									#	.L38 always deals with the triangular data part (4x2 tile)
	#	With t holding the accumulated products, this block computes
	#	  c(:,1) = AO[0..3] - t(:,1)      c(:,2) = AO[4..7] - t(:,2)
	#	  x(:,1) = BO[0] * c(:,1)
	#	  x(:,2) = BO[3] * (c(:,2) - BO[1] * x(:,1))
	#	and stores x back into AO and out through CO1 / CO2.
	#	SUB d, x, y and NMSUB d, c, x, y are read as d = x - y and d = c - x * y
	#	(macros from common.h, confirm).
	#	NOTE(review): BO[0] and BO[3] are multiplied rather than divided, so the packed
	#	B presumably stores reciprocals of the diagonal; confirm against the packing code.
kusano 2b45e8
 	LD	b1,  0 * SIZE(AO)				#	for RN & RT A is the result matrix
kusano 2b45e8
	LD	b2,  1 * SIZE(AO)				#	right-hand-side values, column 1
kusano 2b45e8
	LD	b3,  2 * SIZE(AO)				
kusano 2b45e8
	LD	b4,  3 * SIZE(AO)				#	sa stored as col major
kusano 2b45e8
kusano 2b45e8
	SUB	t11, b1, t11					#	column 1: subtract the accumulated products
kusano 2b45e8
	SUB	t21, b2, t21
kusano 2b45e8
	SUB	t31, b3, t31
kusano 2b45e8
	SUB	t41, b4, t41
kusano 2b45e8
kusano 2b45e8
 	LD	b5,  4 * SIZE(AO)				#	right-hand-side values, column 2
kusano 2b45e8
	LD	b6,  5 * SIZE(AO)
kusano 2b45e8
	LD	b7,  6 * SIZE(AO)
kusano 2b45e8
	LD	b8,  7 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	SUB	t12, b5, t12					#	column 2: subtract the accumulated products
kusano 2b45e8
	SUB	t22, b6, t22
kusano 2b45e8
	SUB	t32, b7, t32
kusano 2b45e8
	SUB	t42, b8, t42
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)				#	BO points to the beginning of the triangular data part of Bj
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)				#	off-diagonal entry
kusano 2b45e8
	MUL	t11, b1, t11					#	column 1 scaled by BO[0]
kusano 2b45e8
	MUL	t21, b1, t21
kusano 2b45e8
	MUL	t31, b1, t31
kusano 2b45e8
	MUL	t41, b1, t41
kusano 2b45e8
	NMSUB	t12, t12, b2, t11			#	column 2 -= BO[1] * solved column 1
kusano 2b45e8
	NMSUB	t22, t22, b2, t21
kusano 2b45e8
	NMSUB	t32, t32, b2, t31
kusano 2b45e8
	NMSUB	t42, t42, b2, t41
kusano 2b45e8
	
kusano 2b45e8
	LD	b5,  3 * SIZE(BO)				#	second diagonal entry; BO[2] is not read
kusano 2b45e8
	MUL	t12, b5, t12					#	column 2 scaled by BO[3]
kusano 2b45e8
	MUL	t22, b5, t22
kusano 2b45e8
	MUL	t32, b5, t32
kusano 2b45e8
	MUL	t42, b5, t42
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(AO)				#	update packed blockA for follow-up compute
kusano 2b45e8
	ST	t21,  1 * SIZE(AO)
kusano 2b45e8
	ST	t31,  2 * SIZE(AO)
kusano 2b45e8
	ST	t41,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	ST	t12,  4 * SIZE(AO)
kusano 2b45e8
	ST	t22,  5 * SIZE(AO)
kusano 2b45e8
	ST	t32,  6 * SIZE(AO)
kusano 2b45e8
	ST	t42,  7 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(CO1)				#	write back results, column 1
kusano 2b45e8
	ST	t21,  1 * SIZE(CO1)
kusano 2b45e8
	ST	t31,  2 * SIZE(CO1)
kusano 2b45e8
	ST	t41,  3 * SIZE(CO1)
kusano 2b45e8
	
kusano 2b45e8
	ST	t12,  0 * SIZE(CO2)				#	write back results, column 2
kusano 2b45e8
	ST	t22,  1 * SIZE(CO2)
kusano 2b45e8
	ST	t32,  2 * SIZE(CO2)
kusano 2b45e8
	ST	t42,  3 * SIZE(CO2)
kusano 2b45e8
kusano 2b45e8
	daddiu	CO1, CO1, 4 * SIZE			#	advance both output pointers by the 4 rows just written
kusano 2b45e8
	daddiu	CO2, CO2, 4 * SIZE
kusano 2b45e8
kusano 2b45e8
	dsubu	TEMP, K, KK					#	TEMP = K - KK: steps of k not consumed by the rectangular part
kusano 2b45e8
	dsll	L,    TEMP, 2 + BASE_SHIFT	#	L = TEMP * 4 elements, shifted to bytes (mr=4)
kusano 2b45e8
	dsll	TEMP, TEMP, 1 + BASE_SHIFT	#	TEMP = TEMP * 2 elements, shifted to bytes (nr=2)
kusano 2b45e8
	daddu	AO, AO, L					#	move AO to the end of this panel, also the beginning of the next panel
kusano 2b45e8
	daddu	BO, BO, TEMP				#	move BO to the end of this panel
kusano 2b45e8
kusano 2b45e8
	daddiu	I, I, -1					#	one 4-row tile done
kusano 2b45e8
	bgtz	I, .L31						#	next 4-row tile
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L40:									#	2x2 tile setup (mr=2, nr=2), taken only when bit 1 of M is set
kusano 2b45e8
	andi	I, M,2						#	I = M & 2
kusano 2b45e8
	blez	I,.L60						#	bit clear: go to the 1-row case at .L60
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
	MTC	$0,  t11							#	t11 = 0, taken from integer register $0
kusano 2b45e8
	MOV	t21, t11						#	copy the zero into the other three accumulators
kusano 2b45e8
kusano 2b45e8
	MOV	t12, t11
kusano 2b45e8
	MOV	t22, t11
kusano 2b45e8
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	AO points to the start of this packed A panel (sa)
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)					#	preload 2 values of A
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(B)					#	B points to the start of the current panel Bj
kusano 2b45e8
	LD	b2,  1 * SIZE(B)					#	preload 2 values of B
kusano 2b45e8
kusano 2b45e8
	dsra	L,  KK, 2						#	L = KK / 4; KK is the length of the rectangular data part of Bj
kusano 2b45e8
	blez	L, .L45						#	fewer than 4 steps: go to the remainder handling at .L45
kusano 2b45e8
	move	BO,  B							#	BO = B; placed after the branch, presumably its delay slot so it runs on both paths (needs noreorder, confirm in PROLOGUE)
kusano 2b45e8
kusano 2b45e8
.L42:									#	2x2 main loop: 4 steps of k per pass, L passes
	#	Each step consumes 2 values of A and 2 values of B. The loads for the next
	#	step are issued before the MADDs of the current step.
	#	MADD d, c, x, y is read here as d = c + x * y (macro from common.h, confirm).
kusano 2b45e8
	LD	a5,  2 * SIZE(AO)				#	operands for step 2
kusano 2b45e8
	LD	a6,  3 * SIZE(AO)
kusano 2b45e8
	LD	b5,  2 * SIZE(BO)
kusano 2b45e8
	LD	b6,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b1			#	step 1
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
	MADD	t12, t12, a1, b2
kusano 2b45e8
	MADD	t22, t22, a2, b2
kusano 2b45e8
kusano 2b45e8
	LD	a3,  4 * SIZE(AO)				#	operands for step 3
kusano 2b45e8
	LD	a4,  5 * SIZE(AO)
kusano 2b45e8
	LD	b3,  4 * SIZE(BO)
kusano 2b45e8
	LD	b4,  5 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b5			#	step 2
kusano 2b45e8
	MADD	t21, t21, a6, b5
kusano 2b45e8
	MADD	t12, t12, a5, b6
kusano 2b45e8
	MADD	t22, t22, a6, b6
kusano 2b45e8
kusano 2b45e8
	LD	a7,  6 * SIZE(AO)				#	operands for step 4
kusano 2b45e8
	LD	a8,  7 * SIZE(AO)
kusano 2b45e8
	LD	b7,  6 * SIZE(BO)
kusano 2b45e8
	LD	b8,  7 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a3, b3			#	step 3
kusano 2b45e8
	MADD	t21, t21, a4, b3
kusano 2b45e8
	MADD	t12, t12, a3, b4
kusano 2b45e8
	MADD	t22, t22, a4, b4
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO,  8 * SIZE			#	AO += 2mr*4kr	
kusano 2b45e8
	daddiu	BO, BO,  8 * SIZE			#	BO += 2nr*4kr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)				#	preload step 1 of the next pass (or the first remainder step)
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a7, b7			#	step 4
kusano 2b45e8
	MADD	t21, t21, a8, b7
kusano 2b45e8
	MADD	t12, t12, a7, b8
kusano 2b45e8
	MADD	t22, t22, a8, b8
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1					#	one pass done
kusano 2b45e8
	bgtz	L, .L42						#	loop while passes remain
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L45:									#	2x2 remainder: the KK & 3 steps left after the unrolled loop
kusano 2b45e8
	andi	L, KK, 3					#	L = KK mod 4
kusano 2b45e8
	blez	L, .L48						#	nothing left: go to the solve step at .L48
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
	.align	3
kusano 2b45e8
.L46:									#	one step of k per pass; a1, a2, b1, b2 are already loaded on entry
kusano 2b45e8
	MADD	t11, t11, a1, b1
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
	MADD	t12, t12, a1, b2
kusano 2b45e8
	MADD	t22, t22, a2, b2
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO, 2 * SIZE			#	AO += 2mr	
kusano 2b45e8
	daddiu	BO, BO, 2 * SIZE			#	BO += 2nr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)				#	reload operands for the next pass; after the last pass .L48 does not use them
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L46						#	loop while remainder steps remain
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L48:									#	.L48 always deals with the triangular data part (2x2 tile)
	#	With t holding the accumulated products, this block computes
	#	  c(:,1) = AO[0..1] - t(:,1)      c(:,2) = AO[2..3] - t(:,2)
	#	  x(:,1) = BO[0] * c(:,1)
	#	  x(:,2) = BO[3] * (c(:,2) - BO[1] * x(:,1))
	#	and stores x back into AO and out through CO1 / CO2.
	#	SUB d, x, y and NMSUB d, c, x, y are read as d = x - y and d = c - x * y
	#	(macros from common.h, confirm).
	#	NOTE(review): BO[0] and BO[3] are multiplied rather than divided, so the packed
	#	B presumably stores reciprocals of the diagonal; confirm against the packing code.
kusano 2b45e8
 	LD	b1,  0 * SIZE(AO)				#	for RN & RT A is the result matrix
kusano 2b45e8
	LD	b2,  1 * SIZE(AO)				#	right-hand-side values, column 1
kusano 2b45e8
kusano 2b45e8
	SUB	t11, b1, t11					#	column 1: subtract the accumulated products
kusano 2b45e8
	SUB	t21, b2, t21
kusano 2b45e8
kusano 2b45e8
 	LD	b5,  2 * SIZE(AO)				#	right-hand-side values, column 2
kusano 2b45e8
	LD	b6,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	SUB	t12, b5, t12					#	column 2: subtract the accumulated products
kusano 2b45e8
	SUB	t22, b6, t22
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)				#	BO points to the beginning of the triangular data part of Bj
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)				#	off-diagonal entry
kusano 2b45e8
	MUL	t11, b1, t11					#	column 1 scaled by BO[0]
kusano 2b45e8
	MUL	t21, b1, t21
kusano 2b45e8
	NMSUB	t12, t12, b2, t11			#	column 2 -= BO[1] * solved column 1
kusano 2b45e8
	NMSUB	t22, t22, b2, t21
kusano 2b45e8
	
kusano 2b45e8
	LD	b5,  3 * SIZE(BO)				#	second diagonal entry; BO[2] is not read
kusano 2b45e8
	MUL	t12, b5, t12					#	column 2 scaled by BO[3]
kusano 2b45e8
	MUL	t22, b5, t22
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(AO)				#	update packed blockA for follow-up compute
kusano 2b45e8
	ST	t21,  1 * SIZE(AO)
kusano 2b45e8
	ST	t12,  2 * SIZE(AO)
kusano 2b45e8
	ST	t22,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(CO1)				#	write back results
kusano 2b45e8
	ST	t21,  1 * SIZE(CO1)
kusano 2b45e8
	ST	t12,  0 * SIZE(CO2)
kusano 2b45e8
	ST	t22,  1 * SIZE(CO2)
kusano 2b45e8
kusano 2b45e8
	daddiu	CO1, CO1, 2 * SIZE			#	advance both output pointers by the 2 rows just written
kusano 2b45e8
	daddiu	CO2, CO2, 2 * SIZE
kusano 2b45e8
kusano 2b45e8
	dsubu	TEMP, K, KK					#	TEMP = K - KK: steps of k not consumed by the rectangular part
kusano 2b45e8
	dsll	L,    TEMP, 1 + BASE_SHIFT	#	L = TEMP * 2 elements, shifted to bytes (mr=2)
kusano 2b45e8
	dsll	TEMP, TEMP, 1 + BASE_SHIFT	#	TEMP = TEMP * 2 elements, shifted to bytes (nr=2)
kusano 2b45e8
	daddu	AO, AO, L					#	move AO to the end of this panel, also the beginning of the next panel
kusano 2b45e8
	daddu	BO, BO, TEMP				#	move BO to the end of this panel
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L60:									#	1x2 tile setup (mr=1, nr=2), taken only when bit 0 of M is set
kusano 2b45e8
	andi	I,M,1						#	I = M & 1
kusano 2b45e8
	blez	I,.L39						#	bit clear: this panel is finished, go to .L39
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
	MTC	$0,  t11							#	t11 = 0, taken from integer register $0
kusano 2b45e8
	MOV	t12, t11						#	t12 = 0 as well
kusano 2b45e8
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	AO points to the start of this packed A panel (sa); preload 1 value
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(B)					#	B points to the start of the current panel Bj
kusano 2b45e8
	LD	b2,  1 * SIZE(B)					#	preload 2 values of B
kusano 2b45e8
kusano 2b45e8
	dsra	L,  KK, 2						#	L = KK / 4; KK is the length of the rectangular data part of Bj
kusano 2b45e8
	blez	L, .L65						#	fewer than 4 steps: go to the remainder handling at .L65
kusano 2b45e8
	move	BO,  B							#	BO = B; placed after the branch, presumably its delay slot so it runs on both paths (needs noreorder, confirm in PROLOGUE)
kusano 2b45e8
kusano 2b45e8
.L62:									#	1x2 main loop: 4 steps of k per pass, L passes
	#	Each step consumes 1 value of A and 2 values of B. The loads for the next
	#	step are issued before the MADDs of the current step.
	#	MADD d, c, x, y is read here as d = c + x * y (macro from common.h, confirm).
kusano 2b45e8
	LD	a5,  1 * SIZE(AO)				#	operands for step 2
kusano 2b45e8
	LD	b5,  2 * SIZE(BO)
kusano 2b45e8
	LD	b6,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b1			#	step 1
kusano 2b45e8
	MADD	t12, t12, a1, b2
kusano 2b45e8
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)				#	operands for step 3
kusano 2b45e8
	LD	b3,  4 * SIZE(BO)
kusano 2b45e8
	LD	b4,  5 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b5			#	step 2
kusano 2b45e8
	MADD	t12, t12, a5, b6
kusano 2b45e8
kusano 2b45e8
	LD	a7,  3 * SIZE(AO)				#	operands for step 4
kusano 2b45e8
	LD	b7,  6 * SIZE(BO)
kusano 2b45e8
	LD	b8,  7 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a3, b3			#	step 3
kusano 2b45e8
	MADD	t12, t12, a3, b4
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO,  4 * SIZE			#	AO += 1mr*4kr	
kusano 2b45e8
	daddiu	BO, BO,  8 * SIZE			#	BO += 2nr*4kr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)				#	preload step 1 of the next pass (or the first remainder step)
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a7, b7			#	step 4
kusano 2b45e8
	MADD	t12, t12, a7, b8
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1					#	one pass done
kusano 2b45e8
	bgtz	L, .L62						#	loop while passes remain
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L65:									#	1x2 remainder: the KK & 3 steps left after the unrolled loop
kusano 2b45e8
	andi	L, KK, 3					#	L = KK mod 4
kusano 2b45e8
	blez	L, .L68						#	nothing left: go to the solve step at .L68
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
	.align	3
kusano 2b45e8
.L66:									#	one step of k per pass; a1, b1, b2 are already loaded on entry
kusano 2b45e8
	MADD	t11, t11, a1, b1
kusano 2b45e8
	MADD	t12, t12, a1, b2
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO, 1 * SIZE			#	AO += mr	
kusano 2b45e8
	daddiu	BO, BO, 2 * SIZE			#	BO += 2nr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)				#	reload operands for the next pass; after the last pass .L68 does not use them
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L66						#	loop while remainder steps remain
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L68:									#	.L68 always deals with the triangular data part (1x2 tile)
	#	With t holding the accumulated products, this block computes
	#	  c1 = AO[0] - t11                c2 = AO[1] - t12
	#	  x1 = BO[0] * c1
	#	  x2 = BO[3] * (c2 - BO[1] * x1)
	#	and stores x1, x2 back into AO and out through CO1 / CO2.
	#	SUB d, x, y and NMSUB d, c, x, y are read as d = x - y and d = c - x * y
	#	(macros from common.h, confirm).
	#	NOTE(review): BO[0] and BO[3] are multiplied rather than divided, so the packed
	#	B presumably stores reciprocals of the diagonal; confirm against the packing code.
kusano 2b45e8
 	LD	b1,  0 * SIZE(AO)				#	for RN & RT A is the result matrix
kusano 2b45e8
	LD	b5,  1 * SIZE(AO)				#	right-hand-side value for column 2
kusano 2b45e8
kusano 2b45e8
	SUB	t11, b1, t11					#	subtract the accumulated products
kusano 2b45e8
	SUB	t12, b5, t12
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)				#	BO points to the beginning of the triangular data part of Bj
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)				#	off-diagonal entry
kusano 2b45e8
	MUL	t11, b1, t11					#	column 1 scaled by BO[0]
kusano 2b45e8
	NMSUB	t12, t12, b2, t11			#	column 2 -= BO[1] * solved column 1
kusano 2b45e8
	
kusano 2b45e8
	LD	b5,  3 * SIZE(BO)				#	second diagonal entry; BO[2] is not read
kusano 2b45e8
	MUL	t12, b5, t12					#	column 2 scaled by BO[3]
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(AO)				#	update packed blockA for follow-up compute
kusano 2b45e8
	ST	t12,  1 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(CO1)				#	write back results
kusano 2b45e8
	ST	t12,  0 * SIZE(CO2)
kusano 2b45e8
kusano 2b45e8
	daddiu	CO1, CO1, 1 * SIZE			#	advance both output pointers by the 1 row just written
kusano 2b45e8
	daddiu	CO2, CO2, 1 * SIZE
kusano 2b45e8
kusano 2b45e8
	dsubu	TEMP, K, KK					#	TEMP = K - KK: steps of k not consumed by the rectangular part
kusano 2b45e8
	dsll	L,    TEMP, BASE_SHIFT		#	L = TEMP * 1 element, shifted to bytes (mr=1)
kusano 2b45e8
	dsll	TEMP, TEMP, 1 + BASE_SHIFT	#	TEMP = TEMP * 2 elements, shifted to bytes (nr=2)
kusano 2b45e8
	daddu	AO, AO, L					#	move AO to the end of this panel, also the beginning of the next panel
kusano 2b45e8
	daddu	BO, BO, TEMP				#	move BO to the end of this panel
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L39:									#	end of the 2-column panel; execution falls through to .L70
kusano 2b45e8
	move	B,  BO						#	B = BO: continue with the next panel of Bj
kusano 2b45e8
	daddiu	KK, KK,  2					#	KK += 2: the rectangular data length grows by 2
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
kusano 2b45e8
.L70:									#	1-column panel (nr=1), taken only when bit 0 of N is set
kusano 2b45e8
	andi	J,  N, 1					#	J = N & 1
kusano 2b45e8
	blez	J, .L999					#	bit clear: nothing left, go to the epilogue
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
	move	CO1, C						#	CO1 = C, output pointer for the single column
kusano 2b45e8
	move	AO, A						#	AO = A: restart at the first packed A panel
kusano 2b45e8
kusano 2b45e8
	daddu	C,  CO1, LDC				#	C = CO1 + LDC: move C past this column
kusano 2b45e8
kusano 2b45e8
	dsra	I,  M, 2					#	I = M / 4: number of 4-row tiles
kusano 2b45e8
	blez	I, .L80						#	no 4-row tile: go to the 2-row case at .L80
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L71:									#	4x1 tile setup (mr=4, nr=1); re-entered once per 4-row tile
kusano 2b45e8
	MTC	$0,  t11							#	t11 = 0, taken from integer register $0
kusano 2b45e8
	MOV	t21, t11						#	copy the zero into the other three accumulators
kusano 2b45e8
	MOV	t31, t11
kusano 2b45e8
	MOV	t41, t11
kusano 2b45e8
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	AO points to the start of this packed A panel (sa)
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)					#	preload 4 values of A
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)
kusano 2b45e8
	LD	a4,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(B)					#	B points to the start of the current panel Bj; preload 1 value
kusano 2b45e8
kusano 2b45e8
	dsra	L,  KK, 2						#	L = KK / 4; KK is the length of the rectangular data part of Bj
kusano 2b45e8
	blez	L, .L75						#	fewer than 4 steps: go to the remainder handling at .L75
kusano 2b45e8
	move	BO,  B							#	BO = B; placed after the branch, presumably its delay slot so it runs on both paths (needs noreorder, confirm in PROLOGUE)
kusano 2b45e8
kusano 2b45e8
.L72:									#	4x1 main loop: 4 steps of k per pass, L passes
	#	Each step consumes 4 values of A and 1 value of B. The loads for the next
	#	step are issued before the MADDs of the current step.
	#	MADD d, c, x, y is read here as d = c + x * y (macro from common.h, confirm).
kusano 2b45e8
	LD	a5,  4 * SIZE(AO)				#	A values for step 2
kusano 2b45e8
	LD	a6,  5 * SIZE(AO)
kusano 2b45e8
	LD	a7,  6 * SIZE(AO)
kusano 2b45e8
	LD	a8,  7 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b5,  1 * SIZE(BO)				#	B value for step 2
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b1			#	step 1
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
	MADD	t31, t31, a3, b1
kusano 2b45e8
	MADD	t41, t41, a4, b1
kusano 2b45e8
kusano 2b45e8
	LD	a1,  8 * SIZE(AO)				#	A values for step 3
kusano 2b45e8
	LD	a2,  9 * SIZE(AO)
kusano 2b45e8
	LD	a3,  10 * SIZE(AO)
kusano 2b45e8
	LD	a4,  11 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)				#	B value for step 3
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b5			#	step 2
kusano 2b45e8
	MADD	t21, t21, a6, b5
kusano 2b45e8
	MADD	t31, t31, a7, b5
kusano 2b45e8
	MADD	t41, t41, a8, b5
kusano 2b45e8
kusano 2b45e8
	LD	a5,  12 * SIZE(AO)				#	A values for step 4
kusano 2b45e8
	LD	a6,  13 * SIZE(AO)
kusano 2b45e8
	LD	a7,  14 * SIZE(AO)
kusano 2b45e8
	LD	a8,  15 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b7,  3 * SIZE(BO)				#	B value for step 4
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b3			#	step 3
kusano 2b45e8
	MADD	t21, t21, a2, b3
kusano 2b45e8
	MADD	t31, t31, a3, b3
kusano 2b45e8
	MADD	t41, t41, a4, b3
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO, 16 * SIZE			#	AO += 4mr*4kr	
kusano 2b45e8
	daddiu	BO, BO,  4 * SIZE			#	BO += 1nr*4kr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)				#	preload step 1 of the next pass (or the first remainder step)
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)
kusano 2b45e8
	LD	a4,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b7			#	step 4
kusano 2b45e8
	MADD	t21, t21, a6, b7
kusano 2b45e8
	MADD	t31, t31, a7, b7
kusano 2b45e8
	MADD	t41, t41, a8, b7
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1					#	one pass done
kusano 2b45e8
	bgtz	L, .L72						#	loop while passes remain
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L75:									#	4x1 remainder: the KK & 3 steps left after the unrolled loop
kusano 2b45e8
	andi	L, KK, 3					#	L = KK mod 4
kusano 2b45e8
	blez	L, .L78						#	nothing left: go to the solve step at .L78
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
	.align	3
kusano 2b45e8
.L76:									#	one step of k per pass; a1..a4 and b1 are already loaded on entry
kusano 2b45e8
	MADD	t11, t11, a1, b1
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
	MADD	t31, t31, a3, b1
kusano 2b45e8
	MADD	t41, t41, a4, b1
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO, 4 * SIZE			#	AO += 4mr	
kusano 2b45e8
	daddiu	BO, BO, 1 * SIZE			#	BO += 1nr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)				#	reload operands for the next pass; after the last pass .L78 does not use them
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)
kusano 2b45e8
	LD	a4,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L76						#	loop while remainder steps remain
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L78:									#	.L78 always deals with the triangular data part (4x1 tile)
	#	With t holding the accumulated products, this block computes
	#	  x(r) = BO[0] * (AO[r] - t(r))   for the 4 rows r
	#	and stores x back into AO and out through CO1.
	#	SUB d, x, y is read as d = x - y (macro from common.h, confirm).
	#	NOTE(review): BO[0] is multiplied rather than divided, so the packed B
	#	presumably stores the reciprocal of the diagonal; confirm against the packing code.
kusano 2b45e8
 	LD	b1,  0 * SIZE(AO)				#	for RN & RT A is the result matrix
kusano 2b45e8
	LD	b2,  1 * SIZE(AO)				#	right-hand-side values
kusano 2b45e8
	LD	b3,  2 * SIZE(AO)				
kusano 2b45e8
	LD	b4,  3 * SIZE(AO)				#	sa stored as col major
kusano 2b45e8
kusano 2b45e8
	SUB	t11, b1, t11					#	subtract the accumulated products
kusano 2b45e8
	SUB	t21, b2, t21
kusano 2b45e8
	SUB	t31, b3, t31
kusano 2b45e8
	SUB	t41, b4, t41
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)				#	BO points to the beginning of the triangular data part of Bj
kusano 2b45e8
	MUL	t11, b1, t11					#	scale by BO[0]
kusano 2b45e8
	MUL	t21, b1, t21
kusano 2b45e8
	MUL	t31, b1, t31
kusano 2b45e8
	MUL	t41, b1, t41
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(AO)				#	update packed blockA for follow-up compute
kusano 2b45e8
	ST	t21,  1 * SIZE(AO)
kusano 2b45e8
	ST	t31,  2 * SIZE(AO)
kusano 2b45e8
	ST	t41,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(CO1)				#	write back results
kusano 2b45e8
	ST	t21,  1 * SIZE(CO1)
kusano 2b45e8
	ST	t31,  2 * SIZE(CO1)
kusano 2b45e8
	ST	t41,  3 * SIZE(CO1)
kusano 2b45e8
	
kusano 2b45e8
kusano 2b45e8
	daddiu	CO1, CO1, 4 * SIZE			#	advance the output pointer by the 4 rows just written
kusano 2b45e8
kusano 2b45e8
	dsubu	TEMP, K, KK					#	TEMP = K - KK: steps of k not consumed by the rectangular part
kusano 2b45e8
	dsll	L,    TEMP, 2 + BASE_SHIFT	#	L = TEMP * 4 elements, shifted to bytes (mr=4)
kusano 2b45e8
	dsll	TEMP, TEMP, BASE_SHIFT		#	TEMP = TEMP * 1 element, shifted to bytes (nr=1)
kusano 2b45e8
	daddu	AO, AO, L					#	move AO to the end of this panel, also the beginning of the next panel
kusano 2b45e8
	daddu	BO, BO, TEMP				#	move BO to the end of this panel
kusano 2b45e8
kusano 2b45e8
	daddiu	I, I, -1					#	one 4-row tile done
kusano 2b45e8
	bgtz	I, .L71						#	next 4-row tile
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L80:									#	2x1 tile setup (mr=2, nr=1), taken only when bit 1 of M is set
kusano 2b45e8
	andi	I, M, 2						#	I = M & 2
kusano 2b45e8
	blez	I, .L90						#	bit clear: go to the 1-row case at .L90
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
	MTC	$0,  t11							#	t11 = 0, taken from integer register $0
kusano 2b45e8
	MOV	t21, t11						#	t21 = 0 as well
kusano 2b45e8
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	AO points to the start of this packed A panel (sa)
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)					#	preload 2 values of A
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(B)					#	B points to the start of the current panel Bj; preload 1 value
kusano 2b45e8
kusano 2b45e8
	dsra	L,  KK, 2						#	L = KK / 4; KK is the length of the rectangular data part of Bj
kusano 2b45e8
	blez	L, .L85						#	fewer than 4 steps: go to the remainder handling at .L85
kusano 2b45e8
	move	BO,  B							#	BO = B; placed after the branch, presumably its delay slot so it runs on both paths (needs noreorder, confirm in PROLOGUE)
kusano 2b45e8
kusano 2b45e8
.L82:									#	2x1 main loop: 4 steps of k per pass, L passes
	#	Each step consumes 2 values of A and 1 value of B. The loads for the next
	#	step are issued before the MADDs of the current step.
	#	MADD d, c, x, y is read here as d = c + x * y (macro from common.h, confirm).
kusano 2b45e8
	LD	a5,  2 * SIZE(AO)				#	A values for step 2
kusano 2b45e8
	LD	a6,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b5,  1 * SIZE(BO)				#	B value for step 2
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b1			#	step 1
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
kusano 2b45e8
	LD	a3,  4 * SIZE(AO)				#	A values for step 3
kusano 2b45e8
	LD	a4,  5 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)				#	B value for step 3
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b5			#	step 2
kusano 2b45e8
	MADD	t21, t21, a6, b5
kusano 2b45e8
kusano 2b45e8
	LD	a7,  6 * SIZE(AO)				#	A values for step 4
kusano 2b45e8
	LD	a8,  7 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b7,  3 * SIZE(BO)				#	B value for step 4
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a3, b3			#	step 3
kusano 2b45e8
	MADD	t21, t21, a4, b3
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO,  8 * SIZE			#	AO += 2mr*4kr	
kusano 2b45e8
	daddiu	BO, BO,  4 * SIZE			#	BO += 1nr*4kr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)				#	preload step 1 of the next pass (or the first remainder step)
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a7, b7			#	step 4
kusano 2b45e8
	MADD	t21, t21, a8, b7
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1					#	one pass done
kusano 2b45e8
	bgtz	L, .L82						#	loop while passes remain
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L85:									#	2x1 remainder: the KK & 3 steps left after the unrolled loop
kusano 2b45e8
	andi	L, KK, 3					#	L = KK mod 4
kusano 2b45e8
	blez	L, .L88						#	nothing left: go to the solve step at .L88
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
	.align	3
kusano 2b45e8
.L86:									#	one step of k per pass; a1, a2 and b1 are already loaded on entry
kusano 2b45e8
	MADD	t11, t11, a1, b1
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO, 2 * SIZE			#	AO += 2mr	
kusano 2b45e8
	daddiu	BO, BO, 1 * SIZE			#	BO += 1nr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)				#	reload operands for the next pass; after the last pass .L88 does not use them
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L86						#	loop while remainder steps remain
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L88:									#	.L88 always deals with the triangular data part (2x1 tile)
	#	With t holding the accumulated products, this block computes
	#	  x(r) = BO[0] * (AO[r] - t(r))   for the 2 rows r
	#	and stores x back into AO and out through CO1.
	#	SUB d, x, y is read as d = x - y (macro from common.h, confirm).
	#	NOTE(review): BO[0] is multiplied rather than divided, so the packed B
	#	presumably stores the reciprocal of the diagonal; confirm against the packing code.
kusano 2b45e8
 	LD	b1,  0 * SIZE(AO)				#	for RN & RT A is the result matrix
kusano 2b45e8
	LD	b2,  1 * SIZE(AO)				#	right-hand-side values
kusano 2b45e8
kusano 2b45e8
	SUB	t11, b1, t11					#	subtract the accumulated products
kusano 2b45e8
	SUB	t21, b2, t21
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)				#	BO points to the beginning of the triangular data part of Bj
kusano 2b45e8
	MUL	t11, b1, t11					#	scale by BO[0]
kusano 2b45e8
	MUL	t21, b1, t21
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(AO)				#	update packed blockA for follow-up compute
kusano 2b45e8
	ST	t21,  1 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(CO1)				#	write back results
kusano 2b45e8
	ST	t21,  1 * SIZE(CO1)
kusano 2b45e8
	
kusano 2b45e8
kusano 2b45e8
	daddiu	CO1, CO1, 2 * SIZE			#	advance the output pointer by the 2 rows just written
kusano 2b45e8
kusano 2b45e8
	dsubu	TEMP, K, KK					#	TEMP = K - KK: steps of k not consumed by the rectangular part
kusano 2b45e8
	dsll	L,    TEMP, 1 + BASE_SHIFT	#	L = TEMP * 2 elements, shifted to bytes (mr=2)
kusano 2b45e8
	dsll	TEMP, TEMP, BASE_SHIFT		#	TEMP = TEMP * 1 element, shifted to bytes (nr=1)
kusano 2b45e8
	daddu	AO, AO, L					#	move AO to the end of this panel, also the beginning of the next panel
kusano 2b45e8
	daddu	BO, BO, TEMP				#	move BO to the end of this panel
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L90:									#	1x1 tile setup (mr=1, nr=1), taken only when bit 0 of M is set
kusano 2b45e8
	andi	I, M, 1						#	I = M & 1
kusano 2b45e8
	blez	I, .L79						#	bit clear: this panel is finished, go to .L79
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
	MTC	$0,  t11							#	t11 = 0, taken from integer register $0
kusano 2b45e8
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	AO points to the start of this packed A panel (sa); preload 1 value
kusano 2b45e8
	LD	b1,  0 * SIZE(B)					#	B points to the start of the current panel Bj; preload 1 value
kusano 2b45e8
kusano 2b45e8
	dsra	L,  KK, 2						#	L = KK / 4; KK is the length of the rectangular data part of Bj
kusano 2b45e8
	blez	L, .L95						#	fewer than 4 steps: go to the remainder handling at .L95
kusano 2b45e8
	move	BO,  B							#	BO = B; placed after the branch, presumably its delay slot so it runs on both paths (needs noreorder, confirm in PROLOGUE)
kusano 2b45e8
kusano 2b45e8
.L92:									#	1x1 main loop: 4 steps of k per pass, L passes
	#	Each step consumes 1 value of A and 1 value of B. The loads for the next
	#	step are issued before the MADD of the current step.
	#	MADD d, c, x, y is read here as d = c + x * y (macro from common.h, confirm).
kusano 2b45e8
	LD	a5,  1 * SIZE(AO)				#	operands for step 2
kusano 2b45e8
	LD	b5,  1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b1			#	step 1
kusano 2b45e8
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)				#	operands for step 3
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b5			#	step 2
kusano 2b45e8
kusano 2b45e8
	LD	a7,  3 * SIZE(AO)				#	operands for step 4
kusano 2b45e8
	LD	b7,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a3, b3			#	step 3
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO,  4 * SIZE			#	AO += 1mr*4kr	
kusano 2b45e8
	daddiu	BO, BO,  4 * SIZE			#	BO += 1nr*4kr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)				#	preload step 1 of the next pass (or the first remainder step)
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a7, b7			#	step 4
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1					#	one pass done
kusano 2b45e8
	bgtz	L, .L92						#	loop while passes remain
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L95:									#	1x1 remainder: the KK & 3 steps left after the unrolled loop
kusano 2b45e8
	andi	L, KK, 3					#	L = KK mod 4
kusano 2b45e8
	blez	L, .L98						#	nothing left: go to the solve step at .L98
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
	.align	3
kusano 2b45e8
.L96:									#	one step of k per pass; a1 and b1 are already loaded on entry
kusano 2b45e8
	MADD	t11, t11, a1, b1
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO, 1 * SIZE			#	AO += 1mr	
kusano 2b45e8
	daddiu	BO, BO, 1 * SIZE			#	BO += 1nr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)				#	reload operands for the next pass; after the last pass .L98 does not use them
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L96						#	loop while remainder steps remain
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L98:									#	.L98 always deals with the triangular data part (1x1 tile)
	#	With t11 holding the accumulated products, this block computes
	#	  x = BO[0] * (AO[0] - t11)
	#	and stores x back into AO and out through CO1.
	#	SUB d, x, y is read as d = x - y (macro from common.h, confirm).
	#	NOTE(review): BO[0] is multiplied rather than divided, so the packed B
	#	presumably stores the reciprocal of the diagonal; confirm against the packing code.
kusano 2b45e8
 	LD	b1,  0 * SIZE(AO)				#	for RN & RT A is the result matrix
kusano 2b45e8
kusano 2b45e8
	SUB	t11, b1, t11					#	subtract the accumulated products
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)				#	BO points to the beginning of the triangular data part of Bj
kusano 2b45e8
	MUL	t11, b1, t11					#	scale by BO[0]
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(AO)				#	update packed blockA for follow-up compute
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(CO1)				#	write back results
kusano 2b45e8
	
kusano 2b45e8
kusano 2b45e8
	daddiu	CO1, CO1, 1 * SIZE			#	advance the output pointer by the 1 row just written
kusano 2b45e8
kusano 2b45e8
	dsubu	TEMP, K, KK					#	TEMP = K - KK: steps of k not consumed by the rectangular part
kusano 2b45e8
	dsll	L,    TEMP, BASE_SHIFT		#	L = TEMP * 1 element, shifted to bytes (mr=1)
kusano 2b45e8
	dsll	TEMP, TEMP, BASE_SHIFT		#	TEMP = TEMP * 1 element, shifted to bytes (nr=1)
kusano 2b45e8
	daddu	AO, AO, L					#	move AO to the end of this panel, also the beginning of the next panel
kusano 2b45e8
	daddu	BO, BO, TEMP				#	move BO to the end of this panel
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L79:									#	end of the 1-column panel; execution falls through to .L999
kusano 2b45e8
	move	B,  BO						#	B = BO; B is not read again before the return below
kusano 2b45e8
	daddiu	KK, KK, 1					#	KK += 1; $23 (KK) is reloaded from the stack in the epilogue
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L999:									#	common exit: reload saved registers, release the 144-byte frame, return
	#	LDARG is a macro from common.h; presumably an integer load sized for the ABI (confirm).
	#	NOTE(review): a6, a7 and a8 are defined as $f29, $f30 and $f31 and are written by the
	#	kernel loops, but only $f24..$f28 (plus $f20..$f23 without __64BIT__) are reloaded
	#	here. Confirm against the PROLOGUE save list and the callee-saved set of the target ABI.
kusano 2b45e8
	LDARG	$16,   0($sp)				#	$16 and $17 are CO3 and CO4
kusano 2b45e8
	LDARG	$17,   8($sp)
kusano 2b45e8
	LDARG	$18,  16($sp)
kusano 2b45e8
	LDARG	$19,  24($sp)
kusano 2b45e8
	LDARG	$20,  32($sp)
kusano 2b45e8
	LDARG	$21,  40($sp)
kusano 2b45e8
	ldc1	$f24, 48($sp)				#	$f24 = t34, $f25 = t44, $f26..$f28 = a3..a5
kusano 2b45e8
	ldc1	$f25, 56($sp)
kusano 2b45e8
	ldc1	$f26, 64($sp)
kusano 2b45e8
	ldc1	$f27, 72($sp)
kusano 2b45e8
	ldc1	$f28, 80($sp)
kusano 2b45e8
kusano 2b45e8
	LDARG	$22,  88($sp)				#	$22..$25 are OFFSET, KK, TEMP and AORIG
kusano 2b45e8
	LDARG	$23,  96($sp)
kusano 2b45e8
	LDARG	$24, 104($sp)
kusano 2b45e8
	LDARG	$25, 112($sp)
kusano 2b45e8
kusano 2b45e8
#ifndef __64BIT__
kusano 2b45e8
	#	NOTE(review): $f20 is reloaded from 112($sp), the same offset used for $25 just above.
	#	Confirm the PROLOGUE layout so that these two slots do not alias.
	ldc1	$f20,112($sp)
kusano 2b45e8
	ldc1	$f21,120($sp)
kusano 2b45e8
	ldc1	$f22,128($sp)
kusano 2b45e8
	ldc1	$f23,136($sp)
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
	j	$31								#	return through $31
kusano 2b45e8
	daddiu	$sp, $sp, 144				#	pop the 144-byte frame; placed after the jump, presumably in its delay slot
kusano 2b45e8
kusano 2b45e8
	EPILOGUE