Blame thirdparty/openblas/xianyi-OpenBLAS-e6e87a2/kernel/mips64/trsm_kernel_LN_loongson3a.S

kusano 2b45e8
#define REALNAME ASMNAME
kusano 2b45e8
kusano 2b45e8
#define ASSEMBLER
kusano 2b45e8
#include "common.h"
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
/*
 * Argument registers as they are used by the code below: M, N and K are
 * loop extents, A, B and C are base addresses, and LDC is shifted left by
 * BASE_SHIFT before it is added to the C column pointers.
 */
#define M	$4
kusano 2b45e8
#define	N	$5
kusano 2b45e8
#define	K	$6
kusano 2b45e8
#define A	$8
kusano 2b45e8
#define B	$9
kusano 2b45e8
#define C	$10
kusano 2b45e8
#define LDC	$11
kusano 2b45e8
kusano 2b45e8
/* AO and BO are the working addresses used for the LD/ST accesses into the A and B panels. */
#define AO	$12
kusano 2b45e8
#define BO	$13
kusano 2b45e8
kusano 2b45e8
/* Loop counters: I counts row blocks, J counts column panels, L counts inner k iterations. */
#define I	$2
kusano 2b45e8
#define J	$3
kusano 2b45e8
#define L	$7
kusano 2b45e8
kusano 2b45e8
/* CO1..CO4 address up to four consecutive columns of C, each LDC after the previous one. */
#define CO1	$14
kusano 2b45e8
#define CO2	$15
kusano 2b45e8
#define CO3	$16
kusano 2b45e8
#define CO4	$17
kusano 2b45e8
kusano 2b45e8
/*
 * OFFSET is loaded from the stack in the entry sequence; KK starts at
 * M + OFFSET for each column panel and is decremented after each row block;
 * TEMP is scratch; AORIG tracks the start of the current A panel.
 */
#define OFFSET	$22
kusano 2b45e8
#define KK	$23
kusano 2b45e8
#define TEMP	$24
kusano 2b45e8
#define AORIG	$25
kusano 2b45e8
kusano 2b45e8
/* a1..a8: floating-point registers loaded from the A panel through AO. */
#define a1	$f0
kusano 2b45e8
#define a2	$f1
kusano 2b45e8
#define a3	$f2
kusano 2b45e8
#define a4	$f3
kusano 2b45e8
#define a5	$f4
kusano 2b45e8
#define a6	$f5
kusano 2b45e8
#define a7	$f6
kusano 2b45e8
#define	a8	$f7
kusano 2b45e8
kusano 2b45e8
/*
 * b1..b8: floating-point registers loaded from the B panel through BO.
 * They are also reused as scratch for A values while the triangular part
 * is solved.
 */
#define b1	$f8
kusano 2b45e8
#define b2	$f9
kusano 2b45e8
#define b3	$f10
kusano 2b45e8
#define b4	$f11
kusano 2b45e8
#define b5	$f12
kusano 2b45e8
#define b6	$f13
kusano 2b45e8
#define b7	$f14
kusano 2b45e8
#define b8	$f15
kusano 2b45e8
kusano 2b45e8
/*
 * tRC accumulators: R is the row (1..4) inside the tile and C is the column
 * (1..4); tRC is stored to row R of column pointer CO<C>.
 */
#define t11	$f16
kusano 2b45e8
#define t21	$f17
kusano 2b45e8
#define t31	$f18
kusano 2b45e8
#define t41	$f19
kusano 2b45e8
kusano 2b45e8
#define t12	$f20
kusano 2b45e8
#define t22	$f21
kusano 2b45e8
#define t32	$f22
kusano 2b45e8
#define t42	$f23
kusano 2b45e8
kusano 2b45e8
#define t13	$f24
kusano 2b45e8
#define t23	$f25
kusano 2b45e8
#define t33	$f26
kusano 2b45e8
#define t43	$f27
kusano 2b45e8
kusano 2b45e8
#define t14	$f28
kusano 2b45e8
#define t24	$f29
kusano 2b45e8
#define t34	$f30
kusano 2b45e8
#define t44	$f31
kusano 2b45e8
kusano 2b45e8
/*
 * NOTE(review): ALPHA maps to $f15, the same register as b8, and is not
 * referenced in the portion of the file reviewed here.
 */
#define ALPHA	$f15
kusano 2b45e8
kusano 2b45e8
	PROLOGUE
kusano 2b45e8
	
kusano 2b45e8
	#	Entry sequence: allocate a 144-byte frame, spill the registers listed
	#	below, then move A and C to the end of their data, since the row
	#	blocks are processed from the bottom upwards.
	daddiu	$sp, $sp, -144
kusano 2b45e8
kusano 2b45e8
	SDARG	$16,   0($sp)
kusano 2b45e8
	SDARG	$17,   8($sp)
kusano 2b45e8
	SDARG	$18,  16($sp)
kusano 2b45e8
	SDARG	$19,  24($sp)
kusano 2b45e8
	SDARG	$20,  32($sp)
kusano 2b45e8
	SDARG	$21,  40($sp)
kusano 2b45e8
	sdc1	$f24, 48($sp)
kusano 2b45e8
	sdc1	$f25, 56($sp)
kusano 2b45e8
	sdc1	$f26, 64($sp)
kusano 2b45e8
	sdc1	$f27, 72($sp)
kusano 2b45e8
	sdc1	$f28, 80($sp)
kusano 2b45e8
	#	NOTE(review): $f29-$f31 (t24, t34, t44) are written later in this
	#	kernel but are not stored here; confirm this matches the ABI in use.
kusano 2b45e8
	SDARG	$22,  88($sp)
kusano 2b45e8
	SDARG	$23,  96($sp)
kusano 2b45e8
	SDARG	$24, 104($sp)
kusano 2b45e8
	SDARG	$25, 112($sp)
kusano 2b45e8
kusano 2b45e8
#ifndef __64BIT__
kusano 2b45e8
	#	NOTE(review): 112($sp) was already written by the SDARG of $25 above,
	#	so when __64BIT__ is not defined this store overwrites that slot.
	#	Check the matching restore sequence (outside the reviewed range)
	#	before changing either offset.
	sdc1	$f20,112($sp)
kusano 2b45e8
	sdc1	$f21,120($sp)
kusano 2b45e8
	sdc1	$f22,128($sp)
kusano 2b45e8
	sdc1	$f23,136($sp)
kusano 2b45e8
#endif
kusano 2b45e8
											#	LN: row blocks are computed from bottom to top
kusano 2b45e8
	LDARG	OFFSET, 144($sp)					#	OFFSET comes from the first slot above this frame
kusano 2b45e8
	dsll	LDC, LDC, BASE_SHIFT			#	ldc shifted left by BASE_SHIFT (presumably elements to bytes)
kusano 2b45e8
kusano 2b45e8
	mult	M, K							#	NOTE(review): mult is the 32-bit multiply; assumes M*K fits
kusano 2b45e8
	mflo	TEMP							#	TEMP=MC*KC
kusano 2b45e8
kusano 2b45e8
	dsll	TEMP, TEMP, BASE_SHIFT			#	same scaling as applied to LDC
kusano 2b45e8
	daddu	A, A, TEMP						#	A moves to the end of sa
kusano 2b45e8
kusano 2b45e8
	dsll	TEMP, M, BASE_SHIFT
kusano 2b45e8
	daddu	C, C, TEMP						#	C+=MC
kusano 2b45e8
kusano 2b45e8
	dsra	J,  N, 2						#	j = nc/4
kusano 2b45e8
	blez	J, .L30							#	fewer than four columns: go to the nr=2 check
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
.L10:										#	column panel of four (nr=4); J was set to N >> 2 before the loop
kusano 2b45e8
	daddiu	J, J, -1
kusano 2b45e8
	move	CO1, C							#	CO1..CO4: the four C columns of this panel, LDC apart
kusano 2b45e8
	daddu	CO2, C,   LDC
kusano 2b45e8
	daddu	CO3, CO2, LDC
kusano 2b45e8
	daddu	CO4, CO3, LDC
kusano 2b45e8
kusano 2b45e8
	MTC	$0,  t11							#	clear result registers: zero t11, then copy it to the others
kusano 2b45e8
	MOV	t21, t11
kusano 2b45e8
	MOV	t31, t11
kusano 2b45e8
	MOV	t41, t11
kusano 2b45e8
	MOV	t12, t11
kusano 2b45e8
	MOV	t22, t11
kusano 2b45e8
	MOV	t32, t11
kusano 2b45e8
	MOV	t42, t11
kusano 2b45e8
kusano 2b45e8
	daddu	KK, M, OFFSET					#	KK = M + OFFSET; K - KK is the length of the rectangular data part of panel Ai
kusano 2b45e8
	move	AORIG, A						#	reset A
kusano 2b45e8
kusano 2b45e8
	daddu	C,  CO4, LDC					#	advance C past these four columns, ready for the next panel
kusano 2b45e8
	
kusano 2b45e8
	andi	I,  M, 1						#	mr=1,nr=4: odd M, handle the single leftover row first
kusano 2b45e8
	blez	I, .L50							#	bit 0 clear: continue with the mr=2 check
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
	dsll	TEMP,   K, BASE_SHIFT			#	mr=1
kusano 2b45e8
	dsubu	AORIG, AORIG, TEMP				#	AORIG point to the beginning address of Ai
kusano 2b45e8
kusano 2b45e8
	dsll	L,    KK, BASE_SHIFT			#	mr=1	
kusano 2b45e8
	dsll	TEMP, KK, 2 + BASE_SHIFT		#	nr=4
kusano 2b45e8
kusano 2b45e8
	daddu	AO, AORIG, L					#	AO point to the rectangular data part
kusano 2b45e8
	daddu	BO, B,     TEMP
kusano 2b45e8
kusano 2b45e8
	dsubu	TEMP, K, KK
kusano 2b45e8
kusano 2b45e8
	MOV	t13, t11							#	mr=2
kusano 2b45e8
	MOV	t23, t11
kusano 2b45e8
	MOV	t33, t11
kusano 2b45e8
	MOV	t43, t11
kusano 2b45e8
	MOV	t14, t11
kusano 2b45e8
	MOV	t24, t11
kusano 2b45e8
	MOV	t34, t11
kusano 2b45e8
	MOV	t44, t11
kusano 2b45e8
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	this part compute the rectangular data part of Ai
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)					#	get 4b
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
	LD	b4,  3 * SIZE(BO)				
kusano 2b45e8
kusano 2b45e8
	dsra	L,  TEMP, 2
kusano 2b45e8
	blez	L, .L55
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align	3
kusano 2b45e8
.L52:
kusano 2b45e8
	LD	a5,  1 * SIZE(AO)					
kusano 2b45e8
kusano 2b45e8
	LD	b5,  4 * SIZE(BO)
kusano 2b45e8
	LD	b6,  5 * SIZE(BO)
kusano 2b45e8
	LD	b7,  6 * SIZE(BO)
kusano 2b45e8
	LD	b8,  7 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b1				#	1st compute
kusano 2b45e8
	MADD	t12, t12, a1, b2
kusano 2b45e8
	MADD	t13, t13, a1, b3
kusano 2b45e8
	MADD	t14, t14, a1, b4
kusano 2b45e8
kusano 2b45e8
	LD	a3,   2 * SIZE(AO)
kusano 2b45e8
	LD	b1,   8 * SIZE(BO)
kusano 2b45e8
	LD	b2,   9 * SIZE(BO)
kusano 2b45e8
	LD	b3,  10 * SIZE(BO)
kusano 2b45e8
	LD	b4,  11 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b5				#	2ed compute
kusano 2b45e8
	MADD	t12, t12, a5, b6
kusano 2b45e8
	MADD	t13, t13, a5, b7
kusano 2b45e8
	MADD	t14, t14, a5, b8
kusano 2b45e8
kusano 2b45e8
	LD	a7,   3 * SIZE(AO)
kusano 2b45e8
	LD	b5,  12 * SIZE(BO)
kusano 2b45e8
	LD	b6,  13 * SIZE(BO)
kusano 2b45e8
	LD	b7,  14 * SIZE(BO)
kusano 2b45e8
	LD	b8,  15 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a3, b1				#	3rd compute
kusano 2b45e8
	MADD	t12, t12, a3, b2
kusano 2b45e8
	MADD	t13, t13, a3, b3
kusano 2b45e8
	MADD	t14, t14, a3, b4
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO,  4 * SIZE				#	AO += 1mr*4kr	
kusano 2b45e8
	daddiu	BO, BO, 16 * SIZE				#	BO += 4nr*4kr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	next 
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
	LD	b4,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a7, b5				#	4th compute
kusano 2b45e8
	MADD	t12, t12, a7, b6
kusano 2b45e8
	MADD	t13, t13, a7, b7
kusano 2b45e8
	MADD	t14, t14, a7, b8
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L52
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L55:										#	k tail of the 1x4 tile: TEMP & 3 leftover steps (TEMP = K - KK)
kusano 2b45e8
	andi	L, TEMP, 3
kusano 2b45e8
	blez	L, .L58							#	no leftover steps: go on to the triangular part
kusano 2b45e8
	nop
kusano 2b45e8
	
kusano 2b45e8
	.align	3
kusano 2b45e8
.L56:
kusano 2b45e8
	MADD	t11, t11, a1, b1				#	single k step: a1 against b1..b4
kusano 2b45e8
	MADD	t12, t12, a1, b2
kusano 2b45e8
	MADD	t13, t13, a1, b3
kusano 2b45e8
	MADD	t14, t14, a1, b4
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO,  1 * SIZE				#	AO += 1mr	
kusano 2b45e8
	daddiu	BO, BO,  4 * SIZE				#	BO += 4nr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	preload operands for the next step
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
	LD	b4,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L56
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L58:										#	deal with the triangular part
kusano 2b45e8
	daddiu	TEMP, KK, -1
kusano 2b45e8
	dsll	L,    TEMP, BASE_SHIFT			#	mr=1
kusano 2b45e8
	dsll	TEMP, TEMP, 2 + BASE_SHIFT
kusano 2b45e8
	daddu	AO, AORIG, L					#	Ao point to the triangular data part
kusano 2b45e8
	daddu	BO, B,     TEMP
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
	LD	b4,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	SUB	t11, b1, t11
kusano 2b45e8
	SUB	t12, b2, t12
kusano 2b45e8
	SUB	t13, b3, t13
kusano 2b45e8
	SUB	t14, b4, t14
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b3,  0 * SIZE(AO)
kusano 2b45e8
	MUL	t11, b3, t11
kusano 2b45e8
	MUL	t12, b3, t12
kusano 2b45e8
	MUL	t13, b3, t13
kusano 2b45e8
	MUL	t14, b3, t14
kusano 2b45e8
kusano 2b45e8
	daddiu	CO1, CO1, -1 * SIZE
kusano 2b45e8
	daddiu	CO2, CO2, -1 * SIZE
kusano 2b45e8
	daddiu	CO3, CO3, -1 * SIZE
kusano 2b45e8
	daddiu	CO4, CO4, -1 * SIZE
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(BO)
kusano 2b45e8
	ST	t12,  1 * SIZE(BO)
kusano 2b45e8
	ST	t13,  2 * SIZE(BO)
kusano 2b45e8
	ST	t14,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(CO1)
kusano 2b45e8
	ST	t12,  0 * SIZE(CO2)
kusano 2b45e8
	ST	t13,  0 * SIZE(CO3)
kusano 2b45e8
	ST	t14,  0 * SIZE(CO4)
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	daddiu	KK, KK, -1							#	the length of rectangular data part increases by 1
kusano 2b45e8
	MTC	$0,  t11							#	clear result registers
kusano 2b45e8
	MOV	t21, t11
kusano 2b45e8
	MOV	t31, t11
kusano 2b45e8
	MOV	t41, t11
kusano 2b45e8
	MOV	t12, t11
kusano 2b45e8
	MOV	t22, t11
kusano 2b45e8
	MOV	t32, t11
kusano 2b45e8
	MOV	t42, t11
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	
kusano 2b45e8
.L50:	
kusano 2b45e8
	andi	I,  M, 2						#	mr=2,nr=4	
kusano 2b45e8
	blez	I, .L20
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
	dsll	TEMP,   K, 1 + BASE_SHIFT
kusano 2b45e8
	dsubu	AORIG, AORIG, TEMP				#	AORIG point to the beginning address of Ai
kusano 2b45e8
kusano 2b45e8
	dsll	L,    KK, 1 + BASE_SHIFT		
kusano 2b45e8
	dsll	TEMP, KK, 2 + BASE_SHIFT
kusano 2b45e8
kusano 2b45e8
	daddu	AO, AORIG, L					#	AO point to the rectangular data part
kusano 2b45e8
	daddu	BO, B,     TEMP
kusano 2b45e8
kusano 2b45e8
	dsubu	TEMP, K, KK
kusano 2b45e8
kusano 2b45e8
	MOV	t13, t11							#	mr=2
kusano 2b45e8
	MOV	t23, t11
kusano 2b45e8
	MOV	t33, t11
kusano 2b45e8
	MOV	t43, t11
kusano 2b45e8
	MOV	t14, t11
kusano 2b45e8
	MOV	t24, t11
kusano 2b45e8
	MOV	t34, t11
kusano 2b45e8
	MOV	t44, t11
kusano 2b45e8
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	this part compute the rectangular data part of Ai
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)					#	mr*KK with nr*KK
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)					#	get 4b
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
	LD	b4,  3 * SIZE(BO)				
kusano 2b45e8
kusano 2b45e8
	dsra	L,  TEMP, 2
kusano 2b45e8
	blez	L, .L25
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align	3
kusano 2b45e8
.L22:
kusano 2b45e8
	LD	a5,  2 * SIZE(AO)					
kusano 2b45e8
	LD	a6,  3 * SIZE(AO)
kusano 2b45e8
	LD	b5,  4 * SIZE(BO)
kusano 2b45e8
	LD	b6,  5 * SIZE(BO)
kusano 2b45e8
	LD	b7,  6 * SIZE(BO)
kusano 2b45e8
	LD	b8,  7 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b1				#	1st compute
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
	MADD	t12, t12, a1, b2
kusano 2b45e8
	MADD	t22, t22, a2, b2
kusano 2b45e8
	MADD	t13, t13, a1, b3
kusano 2b45e8
	MADD	t23, t23, a2, b3
kusano 2b45e8
	MADD	t14, t14, a1, b4
kusano 2b45e8
	MADD	t24, t24, a2, b4
kusano 2b45e8
kusano 2b45e8
	LD	a3,   4 * SIZE(AO)
kusano 2b45e8
	LD	a4,   5 * SIZE(AO)
kusano 2b45e8
	LD	b1,   8 * SIZE(BO)
kusano 2b45e8
	LD	b2,   9 * SIZE(BO)
kusano 2b45e8
	LD	b3,  10 * SIZE(BO)
kusano 2b45e8
	LD	b4,  11 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b5				#	2ed compute
kusano 2b45e8
	MADD	t21, t21, a6, b5
kusano 2b45e8
	MADD	t12, t12, a5, b6
kusano 2b45e8
	MADD	t22, t22, a6, b6
kusano 2b45e8
	MADD	t13, t13, a5, b7
kusano 2b45e8
	MADD	t23, t23, a6, b7
kusano 2b45e8
	MADD	t14, t14, a5, b8
kusano 2b45e8
	MADD	t24, t24, a6, b8
kusano 2b45e8
kusano 2b45e8
	LD	a7,   6 * SIZE(AO)
kusano 2b45e8
	LD	a8,   7 * SIZE(AO)
kusano 2b45e8
	LD	b5,  12 * SIZE(BO)
kusano 2b45e8
	LD	b6,  13 * SIZE(BO)
kusano 2b45e8
	LD	b7,  14 * SIZE(BO)
kusano 2b45e8
	LD	b8,  15 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a3, b1				#	3rd compute
kusano 2b45e8
	MADD	t21, t21, a4, b1
kusano 2b45e8
	MADD	t12, t12, a3, b2
kusano 2b45e8
	MADD	t22, t22, a4, b2
kusano 2b45e8
	MADD	t13, t13, a3, b3
kusano 2b45e8
	MADD	t23, t23, a4, b3
kusano 2b45e8
	MADD	t14, t14, a3, b4
kusano 2b45e8
	MADD	t24, t24, a4, b4
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO,  8 * SIZE				#	AO += 2mr*4kr	
kusano 2b45e8
	daddiu	BO, BO, 16 * SIZE				#	BO += 4nr*4kr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	next 
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
	LD	b4,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a7, b5				#	4th compute
kusano 2b45e8
	MADD	t21, t21, a8, b5
kusano 2b45e8
	MADD	t12, t12, a7, b6
kusano 2b45e8
	MADD	t22, t22, a8, b6
kusano 2b45e8
	MADD	t13, t13, a7, b7
kusano 2b45e8
	MADD	t23, t23, a8, b7
kusano 2b45e8
	MADD	t14, t14, a7, b8
kusano 2b45e8
	MADD	t24, t24, a8, b8
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L22
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L25:										#	k tail of the 2x4 tile: TEMP & 3 leftover steps (TEMP = K - KK)
kusano 2b45e8
	andi	L, TEMP, 3
kusano 2b45e8
	blez	L, .L28							#	no leftover steps: go on to the triangular part
kusano 2b45e8
	nop
kusano 2b45e8
	
kusano 2b45e8
	.align	3
kusano 2b45e8
.L26:
kusano 2b45e8
	MADD	t11, t11, a1, b1				#	single k step: a1 and a2 against b1..b4
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
	MADD	t12, t12, a1, b2
kusano 2b45e8
	MADD	t22, t22, a2, b2
kusano 2b45e8
	MADD	t13, t13, a1, b3
kusano 2b45e8
	MADD	t23, t23, a2, b3
kusano 2b45e8
	MADD	t14, t14, a1, b4
kusano 2b45e8
	MADD	t24, t24, a2, b4
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO,  2 * SIZE				#	AO += 2mr	
kusano 2b45e8
	daddiu	BO, BO,  4 * SIZE				#	BO += 4nr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	preload operands for the next step
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
	LD	b4,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L26
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L28:										#	deal with the triangular part
kusano 2b45e8
	daddiu	TEMP, KK, -2
kusano 2b45e8
	dsll	L,    TEMP, 1 + BASE_SHIFT
kusano 2b45e8
	dsll	TEMP, TEMP, 2 + BASE_SHIFT
kusano 2b45e8
	daddu	AO, AORIG, L					#	Ao point to the triangular data part
kusano 2b45e8
	daddu	BO, B,     TEMP
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
	LD	b4,  3 * SIZE(BO)
kusano 2b45e8
	LD	b5,  4 * SIZE(BO)
kusano 2b45e8
	LD	b6,  5 * SIZE(BO)
kusano 2b45e8
	LD	b7,  6 * SIZE(BO)
kusano 2b45e8
	LD	b8,  7 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	SUB	t11, b1, t11
kusano 2b45e8
	SUB	t12, b2, t12
kusano 2b45e8
	SUB	t13, b3, t13
kusano 2b45e8
	SUB	t14, b4, t14
kusano 2b45e8
	SUB	t21, b5, t21
kusano 2b45e8
	SUB	t22, b6, t22
kusano 2b45e8
	SUB	t23, b7, t23
kusano 2b45e8
	SUB	t24, b8, t24
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b1,  3 * SIZE(AO)				#	computes the triangular_part		
kusano 2b45e8
	LD	b2,  2 * SIZE(AO)
kusano 2b45e8
	MUL	t21, b1, t21
kusano 2b45e8
	MUL	t22, b1, t22
kusano 2b45e8
	MUL	t23, b1, t23
kusano 2b45e8
	MUL	t24, b1, t24
kusano 2b45e8
	NMSUB	t11, t11, b2, t21
kusano 2b45e8
	NMSUB	t12, t12, b2, t22
kusano 2b45e8
	NMSUB	t13, t13, b2, t23
kusano 2b45e8
	NMSUB	t14, t14, b2, t24
kusano 2b45e8
	
kusano 2b45e8
	LD	b3,  0 * SIZE(AO)
kusano 2b45e8
	MUL	t11, b3, t11
kusano 2b45e8
	MUL	t12, b3, t12
kusano 2b45e8
	MUL	t13, b3, t13
kusano 2b45e8
	MUL	t14, b3, t14
kusano 2b45e8
kusano 2b45e8
	daddiu	CO1, CO1, -2 * SIZE
kusano 2b45e8
	daddiu	CO2, CO2, -2 * SIZE
kusano 2b45e8
	daddiu	CO3, CO3, -2 * SIZE
kusano 2b45e8
	daddiu	CO4, CO4, -2 * SIZE
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(BO)
kusano 2b45e8
	ST	t12,  1 * SIZE(BO)
kusano 2b45e8
	ST	t13,  2 * SIZE(BO)
kusano 2b45e8
	ST	t14,  3 * SIZE(BO)
kusano 2b45e8
	ST	t21,  4 * SIZE(BO)
kusano 2b45e8
	ST	t22,  5 * SIZE(BO)
kusano 2b45e8
	ST	t23,  6 * SIZE(BO)
kusano 2b45e8
	ST	t24,  7 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(CO1)
kusano 2b45e8
	ST	t21,  1 * SIZE(CO1)
kusano 2b45e8
	ST	t12,  0 * SIZE(CO2)
kusano 2b45e8
	ST	t22,  1 * SIZE(CO2)
kusano 2b45e8
	ST	t13,  0 * SIZE(CO3)
kusano 2b45e8
	ST	t23,  1 * SIZE(CO3)
kusano 2b45e8
	ST	t14,  0 * SIZE(CO4)
kusano 2b45e8
	ST	t24,  1 * SIZE(CO4)
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	daddiu	KK, KK, -2							#	the length of rectangular data part increases by 2
kusano 2b45e8
	MTC	$0,  t11							#	clear result registers
kusano 2b45e8
	MOV	t21, t11
kusano 2b45e8
	MOV	t31, t11
kusano 2b45e8
	MOV	t41, t11
kusano 2b45e8
	MOV	t12, t11
kusano 2b45e8
	MOV	t22, t11
kusano 2b45e8
	MOV	t32, t11
kusano 2b45e8
	MOV	t42, t11
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L20:
kusano 2b45e8
	dsra	I,  M, 2						#	I=MC/4
kusano 2b45e8
	blez	I, .L29
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
.L11:										#	mr=4
kusano 2b45e8
	dsll	TEMP,   K,  2 + BASE_SHIFT		#	TEMP=KC*MR*data_Byte
kusano 2b45e8
	dsubu	AORIG, AORIG, TEMP				#	AORIG point to the beginning address of panel Ai
kusano 2b45e8
	dsll	L,    KK, 2 + BASE_SHIFT		# 	KC-KK is the length of the rectangular data part of Ai	
kusano 2b45e8
	dsll	TEMP, KK, 2 + BASE_SHIFT		#	KK*NR*data_Byte
kusano 2b45e8
kusano 2b45e8
	daddu	AO, AORIG, L					#	AO point to the rectangular data part
kusano 2b45e8
	daddu	BO, B, TEMP
kusano 2b45e8
kusano 2b45e8
	dsubu	TEMP, K, KK	
kusano 2b45e8
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	this part compute the rectangular data part of Ai
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)					#	mr*KK with nr*KK
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)
kusano 2b45e8
	LD	a4,  3 * SIZE(AO)					#	get 4a
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)					#	get 4b
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
	LD	b4,  3 * SIZE(BO)				
kusano 2b45e8
kusano 2b45e8
	MOV	t13, t11							#	clear result registers
kusano 2b45e8
	MOV	t23, t11
kusano 2b45e8
	MOV	t33, t11
kusano 2b45e8
	MOV	t43, t11
kusano 2b45e8
	MOV	t14, t11
kusano 2b45e8
	MOV	t24, t11
kusano 2b45e8
	MOV	t34, t11
kusano 2b45e8
	MOV	t44, t11
kusano 2b45e8
kusano 2b45e8
	dsra	L,  TEMP, 2						#	L=(KC-offset)/4
kusano 2b45e8
	blez	L, .L15
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
	.align	3
kusano 2b45e8
.L12:
kusano 2b45e8
	LD	a5,  4 * SIZE(AO)					
kusano 2b45e8
	LD	a6,  5 * SIZE(AO)
kusano 2b45e8
	LD	a7,  6 * SIZE(AO)
kusano 2b45e8
	LD	a8,  7 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b5,  4 * SIZE(BO)
kusano 2b45e8
	LD	b6,  5 * SIZE(BO)
kusano 2b45e8
	LD	b7,  6 * SIZE(BO)
kusano 2b45e8
	LD	b8,  7 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b1				#	1st compute
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
	MADD	t31, t31, a3, b1
kusano 2b45e8
	MADD	t41, t41, a4, b1
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a1, b2
kusano 2b45e8
	MADD	t22, t22, a2, b2
kusano 2b45e8
	MADD	t32, t32, a3, b2
kusano 2b45e8
	MADD	t42, t42, a4, b2
kusano 2b45e8
kusano 2b45e8
	MADD	t13, t13, a1, b3
kusano 2b45e8
	MADD	t23, t23, a2, b3
kusano 2b45e8
	MADD	t33, t33, a3, b3
kusano 2b45e8
	MADD	t43, t43, a4, b3
kusano 2b45e8
kusano 2b45e8
	MADD	t14, t14, a1, b4
kusano 2b45e8
	MADD	t24, t24, a2, b4
kusano 2b45e8
	MADD	t34, t34, a3, b4
kusano 2b45e8
	MADD	t44, t44, a4, b4		
kusano 2b45e8
kusano 2b45e8
	LD	a1,   8 * SIZE(AO)
kusano 2b45e8
	LD	a2,   9 * SIZE(AO)
kusano 2b45e8
	LD	a3,  10 * SIZE(AO)
kusano 2b45e8
	LD	a4,  11 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,   8 * SIZE(BO)
kusano 2b45e8
	LD	b2,   9 * SIZE(BO)
kusano 2b45e8
	LD	b3,  10 * SIZE(BO)
kusano 2b45e8
	LD	b4,  11 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b5				#	2ed compute
kusano 2b45e8
	MADD	t21, t21, a6, b5
kusano 2b45e8
	MADD	t31, t31, a7, b5
kusano 2b45e8
	MADD	t41, t41, a8, b5
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a5, b6
kusano 2b45e8
	MADD	t22, t22, a6, b6
kusano 2b45e8
	MADD	t32, t32, a7, b6
kusano 2b45e8
	MADD	t42, t42, a8, b6
kusano 2b45e8
kusano 2b45e8
	MADD	t13, t13, a5, b7
kusano 2b45e8
	MADD	t23, t23, a6, b7
kusano 2b45e8
	MADD	t33, t33, a7, b7
kusano 2b45e8
	MADD	t43, t43, a8, b7
kusano 2b45e8
kusano 2b45e8
	MADD	t14, t14, a5, b8
kusano 2b45e8
	MADD	t24, t24, a6, b8
kusano 2b45e8
	MADD	t34, t34, a7, b8
kusano 2b45e8
	MADD	t44, t44, a8, b8			
kusano 2b45e8
kusano 2b45e8
	LD	a5,  12 * SIZE(AO)
kusano 2b45e8
	LD	a6,  13 * SIZE(AO)
kusano 2b45e8
	LD	a7,  14 * SIZE(AO)
kusano 2b45e8
	LD	a8,  15 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b5,  12 * SIZE(BO)
kusano 2b45e8
	LD	b6,  13 * SIZE(BO)
kusano 2b45e8
	LD	b7,  14 * SIZE(BO)
kusano 2b45e8
	LD	b8,  15 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b1				#	3rd compute
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
	MADD	t31, t31, a3, b1
kusano 2b45e8
	MADD	t41, t41, a4, b1
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a1, b2
kusano 2b45e8
	MADD	t22, t22, a2, b2
kusano 2b45e8
	MADD	t32, t32, a3, b2
kusano 2b45e8
	MADD	t42, t42, a4, b2
kusano 2b45e8
kusano 2b45e8
	MADD	t13, t13, a1, b3
kusano 2b45e8
	MADD	t23, t23, a2, b3
kusano 2b45e8
	MADD	t33, t33, a3, b3
kusano 2b45e8
	MADD	t43, t43, a4, b3
kusano 2b45e8
kusano 2b45e8
	MADD	t14, t14, a1, b4
kusano 2b45e8
	MADD	t24, t24, a2, b4
kusano 2b45e8
	MADD	t34, t34, a3, b4
kusano 2b45e8
	MADD	t44, t44, a4, b4		
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO, 16 * SIZE				#	AO += 4mr*4kr	
kusano 2b45e8
	daddiu	BO, BO, 16 * SIZE				#	BO += 4nr*4kr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	next 
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)
kusano 2b45e8
	LD	a4,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
	LD	b4,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b5				#	4th compute
kusano 2b45e8
	MADD	t21, t21, a6, b5
kusano 2b45e8
	MADD	t31, t31, a7, b5
kusano 2b45e8
	MADD	t41, t41, a8, b5
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a5, b6
kusano 2b45e8
	MADD	t22, t22, a6, b6
kusano 2b45e8
	MADD	t32, t32, a7, b6
kusano 2b45e8
	MADD	t42, t42, a8, b6
kusano 2b45e8
kusano 2b45e8
	MADD	t13, t13, a5, b7
kusano 2b45e8
	MADD	t23, t23, a6, b7
kusano 2b45e8
	MADD	t33, t33, a7, b7
kusano 2b45e8
	MADD	t43, t43, a8, b7
kusano 2b45e8
kusano 2b45e8
	MADD	t14, t14, a5, b8
kusano 2b45e8
	MADD	t24, t24, a6, b8
kusano 2b45e8
	MADD	t34, t34, a7, b8
kusano 2b45e8
	MADD	t44, t44, a8, b8			
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L12
kusano 2b45e8
	nop
kusano 2b45e8
	
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L15:
kusano 2b45e8
	andi	L, TEMP, 3
kusano 2b45e8
	blez	L, .L18
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
	.align	3
kusano 2b45e8
.L16:
kusano 2b45e8
	MADD	t11, t11, a1, b1				
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
	MADD	t31, t31, a3, b1
kusano 2b45e8
	MADD	t41, t41, a4, b1
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a1, b2
kusano 2b45e8
	MADD	t22, t22, a2, b2
kusano 2b45e8
	MADD	t32, t32, a3, b2
kusano 2b45e8
	MADD	t42, t42, a4, b2
kusano 2b45e8
kusano 2b45e8
	MADD	t13, t13, a1, b3
kusano 2b45e8
	MADD	t23, t23, a2, b3
kusano 2b45e8
	MADD	t33, t33, a3, b3
kusano 2b45e8
	MADD	t43, t43, a4, b3
kusano 2b45e8
kusano 2b45e8
	MADD	t14, t14, a1, b4
kusano 2b45e8
	MADD	t24, t24, a2, b4
kusano 2b45e8
	MADD	t34, t34, a3, b4
kusano 2b45e8
	MADD	t44, t44, a4, b4		
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO,  4 * SIZE				#	AO += 4mr	
kusano 2b45e8
	daddiu	BO, BO,  4 * SIZE				#	BO += 4nr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	next 
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)
kusano 2b45e8
	LD	a4,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
	LD	b4,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L16
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L18:										#	deal with the triangular data part of panel Ai
kusano 2b45e8
	daddiu	TEMP, KK, -4					#	
kusano 2b45e8
kusano 2b45e8
	dsll	L,    TEMP, 2 + BASE_SHIFT
kusano 2b45e8
	dsll	TEMP, TEMP, 2 + BASE_SHIFT
kusano 2b45e8
	daddu	AO, AORIG, L					#	AO point to the triangular data part
kusano 2b45e8
	daddu	BO, B, TEMP
kusano 2b45e8
	
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)					#	triangular_part*X + rectangular_part = B
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)					#	triangular_part*X = B - rectangular_part
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
	LD	b4,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	SUB	t11, b1, t11
kusano 2b45e8
	SUB	t12, b2, t12
kusano 2b45e8
	SUB	t13, b3, t13
kusano 2b45e8
	SUB	t14, b4, t14
kusano 2b45e8
kusano 2b45e8
	LD	b5,  4 * SIZE(BO)					#	sb store in row major
kusano 2b45e8
	LD	b6,  5 * SIZE(BO)
kusano 2b45e8
	LD	b7,  6 * SIZE(BO)
kusano 2b45e8
	LD	b8,  7 * SIZE(BO)
kusano 2b45e8
	
kusano 2b45e8
	SUB	t21, b5, t21
kusano 2b45e8
	SUB	t22, b6, t22
kusano 2b45e8
	SUB	t23, b7, t23
kusano 2b45e8
	SUB	t24, b8, t24
kusano 2b45e8
kusano 2b45e8
	LD	b1,  8 * SIZE(BO)
kusano 2b45e8
	LD	b2,  9 * SIZE(BO)
kusano 2b45e8
	LD	b3, 10 * SIZE(BO)
kusano 2b45e8
	LD	b4, 11 * SIZE(BO)
kusano 2b45e8
	
kusano 2b45e8
	SUB	t31, b1, t31
kusano 2b45e8
	SUB	t32, b2, t32
kusano 2b45e8
	SUB	t33, b3, t33
kusano 2b45e8
	SUB	t34, b4, t34
kusano 2b45e8
	
kusano 2b45e8
	LD	b5, 12 * SIZE(BO)
kusano 2b45e8
	LD	b6, 13 * SIZE(BO)
kusano 2b45e8
	LD	b7, 14 * SIZE(BO)
kusano 2b45e8
	LD	b8, 15 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	SUB	t41, b5, t41
kusano 2b45e8
	SUB	t42, b6, t42
kusano 2b45e8
	SUB	t43, b7, t43
kusano 2b45e8
	SUB	t44, b8, t44
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b1,  15 * SIZE(AO)
kusano 2b45e8
	LD	b2,  14 * SIZE(AO)	
kusano 2b45e8
	LD	b4,  13 * SIZE(AO)
kusano 2b45e8
	LD	b7,  12 * SIZE(AO)
kusano 2b45e8
	
kusano 2b45e8
	MUL	t41, b1, t41
kusano 2b45e8
	MUL	t42, b1, t42
kusano 2b45e8
	MUL	t43, b1, t43
kusano 2b45e8
	MUL	t44, b1, t44
kusano 2b45e8
	NMSUB	t31, t31, b2, t41
kusano 2b45e8
	NMSUB	t32, t32, b2, t42
kusano 2b45e8
	NMSUB	t33, t33, b2, t43
kusano 2b45e8
	NMSUB	t34, t34, b2, t44
kusano 2b45e8
	NMSUB	t21, t21, b4, t41
kusano 2b45e8
	NMSUB	t22, t22, b4, t42
kusano 2b45e8
	NMSUB	t23, t23, b4, t43
kusano 2b45e8
	NMSUB	t24, t24, b4, t44
kusano 2b45e8
	NMSUB	t11, t11, b7, t41
kusano 2b45e8
	NMSUB	t12, t12, b7, t42
kusano 2b45e8
	NMSUB	t13, t13, b7, t43
kusano 2b45e8
	NMSUB	t14, t14, b7, t44
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b3,  10 * SIZE(AO)	
kusano 2b45e8
	LD	b5,   9 * SIZE(AO)
kusano 2b45e8
	LD	b8,   8 * SIZE(AO)
kusano 2b45e8
	MUL	t31, b3, t31
kusano 2b45e8
	MUL	t32, b3, t32
kusano 2b45e8
	MUL	t33, b3, t33
kusano 2b45e8
	MUL	t34, b3, t34
kusano 2b45e8
	NMSUB	t21, t21, b5, t31
kusano 2b45e8
	NMSUB	t22, t22, b5, t32
kusano 2b45e8
	NMSUB	t23, t23, b5, t33
kusano 2b45e8
	NMSUB	t24, t24, b5, t34
kusano 2b45e8
	NMSUB	t11, t11, b8, t31
kusano 2b45e8
	NMSUB	t12, t12, b8, t32
kusano 2b45e8
	NMSUB	t13, t13, b8, t33
kusano 2b45e8
	NMSUB	t14, t14, b8, t34
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b6,   5 * SIZE(AO)
kusano 2b45e8
	LD	b1,   4 * SIZE(AO)
kusano 2b45e8
	MUL	t21, b6, t21
kusano 2b45e8
	MUL	t22, b6, t22
kusano 2b45e8
	MUL	t23, b6, t23
kusano 2b45e8
	MUL	t24, b6, t24
kusano 2b45e8
	NMSUB	t11, t11, b1, t21
kusano 2b45e8
	NMSUB	t12, t12, b1, t22
kusano 2b45e8
	NMSUB	t13, t13, b1, t23
kusano 2b45e8
	NMSUB	t14, t14, b1, t24
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b2,   0 * SIZE(AO)
kusano 2b45e8
	MUL	t11, b2, t11
kusano 2b45e8
	MUL	t12, b2, t12
kusano 2b45e8
	MUL	t13, b2, t13
kusano 2b45e8
	MUL	t14, b2, t14
kusano 2b45e8
kusano 2b45e8
	daddiu	CO1, CO1, -4 * SIZE				#	modify 
kusano 2b45e8
	daddiu	CO2, CO2, -4 * SIZE
kusano 2b45e8
	daddiu	CO3, CO3, -4 * SIZE
kusano 2b45e8
	daddiu	CO4, CO4, -4 * SIZE
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(BO)					#	update packed B
kusano 2b45e8
	ST	t12,  1 * SIZE(BO)
kusano 2b45e8
	ST	t13,  2 * SIZE(BO)
kusano 2b45e8
	ST	t14,  3 * SIZE(BO)
kusano 2b45e8
	ST	t21,  4 * SIZE(BO)
kusano 2b45e8
	ST	t22,  5 * SIZE(BO)
kusano 2b45e8
	ST	t23,  6 * SIZE(BO)
kusano 2b45e8
	ST	t24,  7 * SIZE(BO)
kusano 2b45e8
	ST	t31,  8 * SIZE(BO)
kusano 2b45e8
	ST	t32,  9 * SIZE(BO)
kusano 2b45e8
	ST	t33, 10 * SIZE(BO)
kusano 2b45e8
	ST	t34, 11 * SIZE(BO)
kusano 2b45e8
	ST	t41, 12 * SIZE(BO)
kusano 2b45e8
	ST	t42, 13 * SIZE(BO)
kusano 2b45e8
	ST	t43, 14 * SIZE(BO)
kusano 2b45e8
	ST	t44, 15 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(CO1)					#	write back 
kusano 2b45e8
	ST	t21,  1 * SIZE(CO1)
kusano 2b45e8
	ST	t31,  2 * SIZE(CO1)
kusano 2b45e8
	ST	t41,  3 * SIZE(CO1)
kusano 2b45e8
	ST	t12,  0 * SIZE(CO2)
kusano 2b45e8
	ST	t22,  1 * SIZE(CO2)
kusano 2b45e8
	ST	t32,  2 * SIZE(CO2)
kusano 2b45e8
	ST	t42,  3 * SIZE(CO2)
kusano 2b45e8
	ST	t13,  0 * SIZE(CO3)
kusano 2b45e8
	ST	t23,  1 * SIZE(CO3)
kusano 2b45e8
	ST	t33,  2 * SIZE(CO3)
kusano 2b45e8
	ST	t43,  3 * SIZE(CO3)
kusano 2b45e8
	ST	t14,  0 * SIZE(CO4)
kusano 2b45e8
	ST	t24,  1 * SIZE(CO4)
kusano 2b45e8
	ST	t34,  2 * SIZE(CO4)
kusano 2b45e8
	ST	t44,  3 * SIZE(CO4)
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	daddiu	KK, KK, -4						#	KC-KK is the length of the rectangular data part, LN compute from bottom to top so KK-=4
kusano 2b45e8
	daddiu	I, I, -1
kusano 2b45e8
kusano 2b45e8
	MTC	$0,  a1
kusano 2b45e8
	MOV	t11, a1
kusano 2b45e8
	MOV	t21, a1
kusano 2b45e8
	MOV	t31, a1
kusano 2b45e8
	MOV	t41, a1
kusano 2b45e8
	MOV	t12, a1
kusano 2b45e8
	MOV	t22, a1
kusano 2b45e8
	MOV	t32, a1
kusano 2b45e8
	MOV	t42, a1
kusano 2b45e8
	bgtz	I, .L11
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
kusano 2b45e8
.L29:
kusano 2b45e8
	dsll	TEMP, K, 2 + BASE_SHIFT
kusano 2b45e8
	daddu	B, B, TEMP							# B point to next Bj
kusano 2b45e8
kusano 2b45e8
	bgtz	J, .L10
kusano 2b45e8
	nop
kusano 2b45e8
	
kusano 2b45e8
	
kusano 2b45e8
	.align 3
kusano 2b45e8
.L30:										#	column panel of two, taken when bit 1 of N is set
kusano 2b45e8
	andi	J,  N, 2							#	nr=2
kusano 2b45e8
	blez	J, .L70								#	bit clear: skip this panel
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
	move	CO1, C								#	CO1, CO2: the two C columns of this panel, LDC apart
kusano 2b45e8
	daddu	CO2, C,   LDC
kusano 2b45e8
kusano 2b45e8
	MTC	$0,  t11								#	clear result registers
kusano 2b45e8
	MOV	t21, t11
kusano 2b45e8
	MOV	t31, t11
kusano 2b45e8
	MOV	t41, t11
kusano 2b45e8
kusano 2b45e8
	daddu	KK, M, OFFSET						#	KK = M + OFFSET, as at the start of the nr=4 panel
kusano 2b45e8
	move	AORIG, A							#	reset A
kusano 2b45e8
	
kusano 2b45e8
	daddu	C,  CO2, LDC						#	advance C past these two columns
kusano 2b45e8
kusano 2b45e8
	andi	I,  M, 1							#	mr=1: odd M, handle the single leftover row first
kusano 2b45e8
	blez	I, .L60								#	bit 0 clear: skip the single-row tile
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
	dsll	TEMP,   K, BASE_SHIFT
kusano 2b45e8
	dsubu	AORIG, AORIG, TEMP				#	AORIG point to the beginning address of everypanel of Ai
kusano 2b45e8
kusano 2b45e8
	dsll	L,    KK, BASE_SHIFT			#	mr=1
kusano 2b45e8
	dsll	TEMP, KK, 1 + BASE_SHIFT		#	nr=2
kusano 2b45e8
kusano 2b45e8
	daddu	AO, AORIG, L					#	AO point to rectangular data part
kusano 2b45e8
	daddu	BO, B,     TEMP
kusano 2b45e8
kusano 2b45e8
	dsubu	TEMP, K, KK
kusano 2b45e8
kusano 2b45e8
	MOV	t12, t11							#	clear result registers
kusano 2b45e8
	MOV	t22, t11
kusano 2b45e8
	MOV	t32, t11
kusano 2b45e8
	MOV	t42, t11
kusano 2b45e8
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	dsra	L,  TEMP, 2
kusano 2b45e8
	blez	L, .L65
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align	3
kusano 2b45e8
.L62:										#	1x2 tile, four k steps per iteration; L = TEMP >> 2 on entry
kusano 2b45e8
	LD	a5,  1 * SIZE(AO)					#	operands for the 2nd step
kusano 2b45e8
	LD	b5,  2 * SIZE(BO)
kusano 2b45e8
	LD	b6,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b1				#	1st compute
kusano 2b45e8
	MADD	t12, t12, a1, b2
kusano 2b45e8
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)					#	operands for the 3rd step
kusano 2b45e8
	LD	b3,  4 * SIZE(BO)
kusano 2b45e8
	LD	b4,  5 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b5				#	2nd compute
kusano 2b45e8
	MADD	t12, t12, a5, b6
kusano 2b45e8
kusano 2b45e8
	LD	a7,  3 * SIZE(AO)					#	operands for the 4th step
kusano 2b45e8
	LD	b7,  6 * SIZE(BO)
kusano 2b45e8
	LD	b8,  7 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a3, b3				#	3rd compute
kusano 2b45e8
	MADD	t12, t12, a3, b4
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO,  4 * SIZE				#	AO += 1mr*4kr	
kusano 2b45e8
	daddiu	BO, BO,  8 * SIZE				#	BO += 2nr*4kr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	preload operands for the next iteration (or the tail loop)
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a7, b7				#	4th compute
kusano 2b45e8
	MADD	t12, t12, a7, b8
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L62
kusano 2b45e8
	nop
kusano 2b45e8
	
kusano 2b45e8
	.align 3
kusano 2b45e8
kusano 2b45e8
.L65:
kusano 2b45e8
	andi	L, TEMP, 3
kusano 2b45e8
	blez	L, .L68
kusano 2b45e8
	nop
kusano 2b45e8
	
kusano 2b45e8
	.align	3
kusano 2b45e8
.L66:											#	k remainder loop for the mr=1, nr=2 tile: one k step per pass
kusano 2b45e8
	MADD	t11, t11, a1, b1				#	t11 += a1 * b1
kusano 2b45e8
	MADD	t12, t12, a1, b2				#	t12 += a1 * b2 (only a1 is loaded on this path, so there are no a2 terms)
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO,  1 * SIZE				#	AO += 1mr
kusano 2b45e8
	daddiu	BO, BO,  2 * SIZE				#	BO += 2nr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	operands for the next pass
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1						#	one remainder step done
kusano 2b45e8
	bgtz	L, .L66							#	repeat while steps remain
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
.L68:											#	solve step for the mr=1, nr=2 tile
kusano 2b45e8
	daddiu	TEMP, KK, -1					#	TEMP = KK - 1 (mr=1)
kusano 2b45e8
kusano 2b45e8
	dsll	L,    TEMP, BASE_SHIFT			#	byte offset into A: one element per k step
kusano 2b45e8
	dsll	TEMP, TEMP, 1 + BASE_SHIFT		#	byte offset into B: two elements per k step
kusano 2b45e8
	daddu	AO, AORIG, L					#	AO points to the triangular data part
kusano 2b45e8
	daddu	BO, B,     TEMP					#	BO points to the matching entries of packed B
kusano 2b45e8
	
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)					#	current values held in packed B
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	SUB	t11, b1, t11					#	t11 = b1 - accumulated products
kusano 2b45e8
	SUB	t12, b2, t12					#	t12 = b2 - accumulated products
kusano 2b45e8
kusano 2b45e8
	
kusano 2b45e8
	LD	b3,  0 * SIZE(AO)					#	A element used as the scale factor; presumably the pre-inverted diagonal - confirm against the packing routine
kusano 2b45e8
	MUL	t11, b3, t11					#	t11 = b3 * t11
kusano 2b45e8
	MUL	t12, b3, t12					#	t12 = b3 * t12
kusano 2b45e8
kusano 2b45e8
	daddiu	CO1, CO1, -1 * SIZE				#	step both C column pointers back by one element
kusano 2b45e8
	daddiu	CO2, CO2, -1 * SIZE
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(BO)					#	write the results back into packed B
kusano 2b45e8
	ST	t12,  1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(CO1)					#	and into the two C columns
kusano 2b45e8
	ST	t12,  0 * SIZE(CO2)
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	daddiu	KK, KK, -1						#	KK -= 1 (one row consumed)
kusano 2b45e8
	MTC	$0,  t11								#	clear result registers for the next tile
kusano 2b45e8
	MOV	t21, t11
kusano 2b45e8
	MOV	t31, t11
kusano 2b45e8
	MOV	t41, t11
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L60:
kusano 2b45e8
	andi	I,  M, 2
kusano 2b45e8
	blez	I, .L40
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
	dsll	TEMP,   K, 1 + BASE_SHIFT
kusano 2b45e8
	dsubu	AORIG, AORIG, TEMP				#	AORIG point to the beginning address of everypanel of Ai
kusano 2b45e8
kusano 2b45e8
	dsll	L,    KK, 1 + BASE_SHIFT		#	mr=2
kusano 2b45e8
	dsll	TEMP, KK, 1 + BASE_SHIFT		#	nr=2
kusano 2b45e8
kusano 2b45e8
	daddu	AO, AORIG, L					#	AO point to rectangular data part
kusano 2b45e8
	daddu	BO, B,     TEMP
kusano 2b45e8
kusano 2b45e8
	dsubu	TEMP, K, KK
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	MOV	t12, t11							#	clear result registers
kusano 2b45e8
	MOV	t22, t11
kusano 2b45e8
	MOV	t32, t11
kusano 2b45e8
	MOV	t42, t11
kusano 2b45e8
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	dsra	L,  TEMP, 2
kusano 2b45e8
	blez	L, .L45
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align	3
kusano 2b45e8
.L42:
kusano 2b45e8
	LD	a5,  2 * SIZE(AO)					
kusano 2b45e8
	LD	a6,  3 * SIZE(AO)
kusano 2b45e8
	LD	b5,  2 * SIZE(BO)
kusano 2b45e8
	LD	b6,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b1				#	1st compute
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
	MADD	t12, t12, a1, b2
kusano 2b45e8
	MADD	t22, t22, a2, b2
kusano 2b45e8
kusano 2b45e8
	LD	a3,  4 * SIZE(AO)
kusano 2b45e8
	LD	a4,  5 * SIZE(AO)
kusano 2b45e8
	LD	b3,  4 * SIZE(BO)
kusano 2b45e8
	LD	b4,  5 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b5				#	2ed compute
kusano 2b45e8
	MADD	t21, t21, a6, b5
kusano 2b45e8
	MADD	t12, t12, a5, b6
kusano 2b45e8
	MADD	t22, t22, a6, b6
kusano 2b45e8
kusano 2b45e8
	LD	a7,  6 * SIZE(AO)
kusano 2b45e8
	LD	a8,  7 * SIZE(AO)
kusano 2b45e8
	LD	b7,  6 * SIZE(BO)
kusano 2b45e8
	LD	b8,  7 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a3, b3				#	3rd compute
kusano 2b45e8
	MADD	t21, t21, a4, b3
kusano 2b45e8
	MADD	t12, t12, a3, b4
kusano 2b45e8
	MADD	t22, t22, a4, b4
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO,  8 * SIZE				#	AO += 2mr*4kr	
kusano 2b45e8
	daddiu	BO, BO,  8 * SIZE				#	BO += 2nr*4kr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	next 
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a7, b7				#	4th compute
kusano 2b45e8
	MADD	t21, t21, a8, b7
kusano 2b45e8
	MADD	t12, t12, a7, b8
kusano 2b45e8
	MADD	t22, t22, a8, b8
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L42
kusano 2b45e8
	nop
kusano 2b45e8
	
kusano 2b45e8
	.align 3
kusano 2b45e8
kusano 2b45e8
.L45:
kusano 2b45e8
	andi	L, TEMP, 3
kusano 2b45e8
	blez	L, .L48
kusano 2b45e8
	nop
kusano 2b45e8
	
kusano 2b45e8
	.align	3
kusano 2b45e8
.L46:											#	k remainder loop for the mr=2, nr=2 tile: one k step per pass
kusano 2b45e8
	MADD	t11, t11, a1, b1				#	t11 += a1 * b1
kusano 2b45e8
	MADD	t21, t21, a2, b1				#	t21 += a2 * b1
kusano 2b45e8
	MADD	t12, t12, a1, b2				#	t12 += a1 * b2
kusano 2b45e8
	MADD	t22, t22, a2, b2				#	t22 += a2 * b2
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO,  2 * SIZE				#	AO += 2mr
kusano 2b45e8
	daddiu	BO, BO,  2 * SIZE				#	BO += 2nr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	operands for the next pass
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1						#	one remainder step done
kusano 2b45e8
	bgtz	L, .L46							#	repeat while steps remain
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
.L48:
kusano 2b45e8
	daddiu	TEMP, KK, -2
kusano 2b45e8
kusano 2b45e8
	dsll	L,    TEMP, 1 + BASE_SHIFT
kusano 2b45e8
	dsll	TEMP, TEMP, 1 + BASE_SHIFT
kusano 2b45e8
	daddu	AO, AORIG, L					#	Ao point to the triangular data part
kusano 2b45e8
	daddu	BO, B,     TEMP
kusano 2b45e8
	
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
	LD	b4,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	SUB	t11, b1, t11
kusano 2b45e8
	SUB	t12, b2, t12
kusano 2b45e8
	SUB	t21, b3, t21
kusano 2b45e8
	SUB	t22, b4, t22
kusano 2b45e8
kusano 2b45e8
	LD	b1,  3 * SIZE(AO)				#	computes the triangular_part		
kusano 2b45e8
	LD	b2,  2 * SIZE(AO)
kusano 2b45e8
	MUL	t21, b1, t21
kusano 2b45e8
	MUL	t22, b1, t22
kusano 2b45e8
	NMSUB	t11, t11, b2, t21
kusano 2b45e8
	NMSUB	t12, t12, b2, t22
kusano 2b45e8
	
kusano 2b45e8
	LD	b3,  0 * SIZE(AO)
kusano 2b45e8
	MUL	t11, b3, t11
kusano 2b45e8
	MUL	t12, b3, t12
kusano 2b45e8
kusano 2b45e8
	daddiu	CO1, CO1, -2 * SIZE
kusano 2b45e8
	daddiu	CO2, CO2, -2 * SIZE
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(BO)
kusano 2b45e8
	ST	t12,  1 * SIZE(BO)
kusano 2b45e8
	ST	t21,  2 * SIZE(BO)
kusano 2b45e8
	ST	t22,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(CO1)
kusano 2b45e8
	ST	t21,  1 * SIZE(CO1)
kusano 2b45e8
	ST	t12,  0 * SIZE(CO2)
kusano 2b45e8
	ST	t22,  1 * SIZE(CO2)
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	daddiu	KK, KK, -2
kusano 2b45e8
	MTC	$0,  t11								#	clear result regusters
kusano 2b45e8
	MOV	t21, t11
kusano 2b45e8
	MOV	t31, t11
kusano 2b45e8
	MOV	t41, t11
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L40:
kusano 2b45e8
	dsra	I,  M, 2							#	I = mc/4
kusano 2b45e8
	blez	I, .L49
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
.L31:
kusano 2b45e8
	dsll	TEMP,   K,  2 + BASE_SHIFT
kusano 2b45e8
	dsubu	AORIG, AORIG, TEMP					#	AORIG point to the beginning address of panel Ai
kusano 2b45e8
	dsll	L,    KK, 2 + BASE_SHIFT			#	mr=4
kusano 2b45e8
	dsll	TEMP, KK, 1 + BASE_SHIFT			#	nr=2
kusano 2b45e8
kusano 2b45e8
	daddu	AO, AORIG, L						#	AO point to the rectangular data part
kusano 2b45e8
	daddu	BO, B,     TEMP
kusano 2b45e8
kusano 2b45e8
	dsubu	TEMP, K, KK
kusano 2b45e8
kusano 2b45e8
	MOV	t12, t11
kusano 2b45e8
	MOV	t22, t11
kusano 2b45e8
	MOV	t32, t11
kusano 2b45e8
	MOV	t42, t11
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	this part compute the rectangular data part of Ai
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)					#	mr*KK with nr*KK
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)
kusano 2b45e8
	LD	a4,  3 * SIZE(AO)					#	get 4a
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)					#	get 4b
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	dsra	L,  TEMP, 2
kusano 2b45e8
	blez	L, .L35
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align	3
kusano 2b45e8
.L32:
kusano 2b45e8
	LD	a5,  4 * SIZE(AO)					
kusano 2b45e8
	LD	a6,  5 * SIZE(AO)
kusano 2b45e8
	LD	a7,  6 * SIZE(AO)
kusano 2b45e8
	LD	a8,  7 * SIZE(AO)
kusano 2b45e8
	LD	b5,  2 * SIZE(BO)
kusano 2b45e8
	LD	b6,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b1				#	1st compute
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
	MADD	t31, t31, a3, b1
kusano 2b45e8
	MADD	t41, t41, a4, b1
kusano 2b45e8
	MADD	t12, t12, a1, b2
kusano 2b45e8
	MADD	t22, t22, a2, b2
kusano 2b45e8
	MADD	t32, t32, a3, b2
kusano 2b45e8
	MADD	t42, t42, a4, b2
kusano 2b45e8
kusano 2b45e8
	LD	a1,   8 * SIZE(AO)
kusano 2b45e8
	LD	a2,   9 * SIZE(AO)
kusano 2b45e8
	LD	a3,  10 * SIZE(AO)
kusano 2b45e8
	LD	a4,  11 * SIZE(AO)
kusano 2b45e8
	LD	b3,   4 * SIZE(BO)
kusano 2b45e8
	LD	b4,   5 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b5				#	2ed compute
kusano 2b45e8
	MADD	t21, t21, a6, b5
kusano 2b45e8
	MADD	t31, t31, a7, b5
kusano 2b45e8
	MADD	t41, t41, a8, b5
kusano 2b45e8
	MADD	t12, t12, a5, b6
kusano 2b45e8
	MADD	t22, t22, a6, b6
kusano 2b45e8
	MADD	t32, t32, a7, b6
kusano 2b45e8
	MADD	t42, t42, a8, b6
kusano 2b45e8
kusano 2b45e8
	LD	a5,  12 * SIZE(AO)
kusano 2b45e8
	LD	a6,  13 * SIZE(AO)
kusano 2b45e8
	LD	a7,  14 * SIZE(AO)
kusano 2b45e8
	LD	a8,  15 * SIZE(AO)
kusano 2b45e8
	LD	b7,  6 * SIZE(BO)
kusano 2b45e8
	LD	b8,  7 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b3				#	3rd compute
kusano 2b45e8
	MADD	t21, t21, a2, b3
kusano 2b45e8
	MADD	t31, t31, a3, b3
kusano 2b45e8
	MADD	t41, t41, a4, b3
kusano 2b45e8
	MADD	t12, t12, a1, b4
kusano 2b45e8
	MADD	t22, t22, a2, b4
kusano 2b45e8
	MADD	t32, t32, a3, b4
kusano 2b45e8
	MADD	t42, t42, a4, b4
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO, 16 * SIZE				#	AO += 4mr*4kr	
kusano 2b45e8
	daddiu	BO, BO,  8 * SIZE				#	BO += 2nr*4kr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	next 
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)
kusano 2b45e8
	LD	a4,  3 * SIZE(AO)
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b7				#	4th compute
kusano 2b45e8
	MADD	t21, t21, a6, b7
kusano 2b45e8
	MADD	t31, t31, a7, b7
kusano 2b45e8
	MADD	t41, t41, a8, b7
kusano 2b45e8
	MADD	t12, t12, a5, b8
kusano 2b45e8
	MADD	t22, t22, a6, b8
kusano 2b45e8
	MADD	t32, t32, a7, b8
kusano 2b45e8
	MADD	t42, t42, a8, b8
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L32
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
	
kusano 2b45e8
	.align 3
kusano 2b45e8
kusano 2b45e8
.L35:
kusano 2b45e8
	andi	L, TEMP, 3
kusano 2b45e8
	blez	L, .L38
kusano 2b45e8
	nop
kusano 2b45e8
	
kusano 2b45e8
	.align	3
kusano 2b45e8
.L36:											#	k remainder loop for the mr=4, nr=2 tile: one k step per pass
kusano 2b45e8
	MADD	t11, t11, a1, b1				#	first B column: t(i)1 += a(i) * b1
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
	MADD	t31, t31, a3, b1
kusano 2b45e8
	MADD	t41, t41, a4, b1
kusano 2b45e8
	
kusano 2b45e8
	MADD	t12, t12, a1, b2				#	second B column: t(i)2 += a(i) * b2
kusano 2b45e8
	MADD	t22, t22, a2, b2
kusano 2b45e8
	MADD	t32, t32, a3, b2
kusano 2b45e8
	MADD	t42, t42, a4, b2
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO,  4 * SIZE				#	AO += 4mr
kusano 2b45e8
	daddiu	BO, BO,  2 * SIZE				#	BO += 2nr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	operands for the next pass
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)
kusano 2b45e8
	LD	a4,  3 * SIZE(AO)
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1						#	one remainder step done
kusano 2b45e8
	bgtz	L, .L36							#	repeat while steps remain
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L38:										#
kusano 2b45e8
	daddiu	TEMP, KK, -4
kusano 2b45e8
	dsll	L,    TEMP, 2 + BASE_SHIFT		#	mr=4
kusano 2b45e8
	dsll	TEMP, TEMP, 1 + BASE_SHIFT		#	nr=2
kusano 2b45e8
	daddu	AO, AORIG, L					#	AO point to the triangular data part
kusano 2b45e8
	daddu	BO, B,     TEMP
kusano 2b45e8
kusano 2b45e8
	
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
	LD	b4,  3 * SIZE(BO)
kusano 2b45e8
	LD	b5,  4 * SIZE(BO)
kusano 2b45e8
	LD	b6,  5 * SIZE(BO)
kusano 2b45e8
	LD	b7,  6 * SIZE(BO)
kusano 2b45e8
	LD	b8,  7 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	SUB	t11, b1, t11
kusano 2b45e8
	SUB	t12, b2, t12
kusano 2b45e8
	SUB	t21, b3, t21
kusano 2b45e8
	SUB	t22, b4, t22
kusano 2b45e8
	SUB	t31, b5, t31
kusano 2b45e8
	SUB	t32, b6, t32
kusano 2b45e8
	SUB	t41, b7, t41
kusano 2b45e8
	SUB	t42, b8, t42
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b1,  15 * SIZE(AO)
kusano 2b45e8
	LD	b2,  14 * SIZE(AO)	
kusano 2b45e8
	LD	b4,  13 * SIZE(AO)
kusano 2b45e8
	LD	b7,  12 * SIZE(AO)
kusano 2b45e8
	
kusano 2b45e8
	MUL	t41, b1, t41
kusano 2b45e8
	MUL	t42, b1, t42
kusano 2b45e8
	NMSUB	t31, t31, b2, t41
kusano 2b45e8
	NMSUB	t32, t32, b2, t42
kusano 2b45e8
	NMSUB	t21, t21, b4, t41
kusano 2b45e8
	NMSUB	t22, t22, b4, t42
kusano 2b45e8
	NMSUB	t11, t11, b7, t41
kusano 2b45e8
	NMSUB	t12, t12, b7, t42
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b3,  10 * SIZE(AO)	
kusano 2b45e8
	LD	b5,   9 * SIZE(AO)
kusano 2b45e8
	LD	b8,   8 * SIZE(AO)
kusano 2b45e8
	MUL	t31, b3, t31
kusano 2b45e8
	MUL	t32, b3, t32
kusano 2b45e8
	NMSUB	t21, t21, b5, t31
kusano 2b45e8
	NMSUB	t22, t22, b5, t32
kusano 2b45e8
	NMSUB	t11, t11, b8, t31
kusano 2b45e8
	NMSUB	t12, t12, b8, t32
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b6,   5 * SIZE(AO)
kusano 2b45e8
	LD	b1,   4 * SIZE(AO)
kusano 2b45e8
	MUL	t21, b6, t21
kusano 2b45e8
	MUL	t22, b6, t22
kusano 2b45e8
	NMSUB	t11, t11, b1, t21
kusano 2b45e8
	NMSUB	t12, t12, b1, t22
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b2,   0 * SIZE(AO)
kusano 2b45e8
	MUL	t11, b2, t11
kusano 2b45e8
	MUL	t12, b2, t12
kusano 2b45e8
kusano 2b45e8
	daddiu	CO1, CO1, -4 * SIZE
kusano 2b45e8
	daddiu	CO2, CO2, -4 * SIZE
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(BO)
kusano 2b45e8
	ST	t12,  1 * SIZE(BO)
kusano 2b45e8
	ST	t21,  2 * SIZE(BO)
kusano 2b45e8
	ST	t22,  3 * SIZE(BO)
kusano 2b45e8
	ST	t31,  4 * SIZE(BO)
kusano 2b45e8
	ST	t32,  5 * SIZE(BO)
kusano 2b45e8
	ST	t41,  6 * SIZE(BO)
kusano 2b45e8
	ST	t42,  7 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(CO1)
kusano 2b45e8
	ST	t21,  1 * SIZE(CO1)
kusano 2b45e8
	ST	t31,  2 * SIZE(CO1)
kusano 2b45e8
	ST	t41,  3 * SIZE(CO1)
kusano 2b45e8
	ST	t12,  0 * SIZE(CO2)
kusano 2b45e8
	ST	t22,  1 * SIZE(CO2)
kusano 2b45e8
	ST	t32,  2 * SIZE(CO2)
kusano 2b45e8
	ST	t42,  3 * SIZE(CO2)
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	daddiu	KK, KK, -4
kusano 2b45e8
kusano 2b45e8
	MTC	$0,  t11
kusano 2b45e8
	MOV	t21, t11
kusano 2b45e8
	MOV	t31, t11
kusano 2b45e8
	MOV	t41, t11
kusano 2b45e8
kusano 2b45e8
	daddiu	I, I, -1
kusano 2b45e8
	bgtz	I, .L31
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L49:
kusano 2b45e8
	dsll	TEMP, K, 1 + BASE_SHIFT		# 	nr=2
kusano 2b45e8
	daddu	B, B, TEMP
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
kusano 2b45e8
.L70:
kusano 2b45e8
	andi	J,  N, 1					#	nr=1
kusano 2b45e8
	blez	J, .L999					#	END
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
	move	CO1, C
kusano 2b45e8
kusano 2b45e8
	daddu	KK, M, OFFSET
kusano 2b45e8
	move	AORIG, A					#	reset A
kusano 2b45e8
kusano 2b45e8
	andi	I,  M, 1					#	mr=1
kusano 2b45e8
	blez	I, .L90
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
	MTC	$0,  t11
kusano 2b45e8
kusano 2b45e8
	dsll	TEMP,   K, BASE_SHIFT			#	mr=1
kusano 2b45e8
	dsubu	AORIG, AORIG, TEMP
kusano 2b45e8
	
kusano 2b45e8
	dsll	L,    KK, BASE_SHIFT
kusano 2b45e8
kusano 2b45e8
	daddu	AO, AORIG, L					#	AO point to the rectangular data part
kusano 2b45e8
	daddu	BO, B,    L 
kusano 2b45e8
kusano 2b45e8
	dsubu	TEMP, K, KK
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	dsra	L,  TEMP, 2
kusano 2b45e8
	blez	L, .L95
kusano 2b45e8
	nop
kusano 2b45e8
	
kusano 2b45e8
	.align	3
kusano 2b45e8
.L92:
kusano 2b45e8
	LD	a5,  1 * SIZE(AO)					
kusano 2b45e8
	LD	b5,  1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b1				#	1st compute
kusano 2b45e8
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b5				#	2ed compute
kusano 2b45e8
kusano 2b45e8
	LD	a7,  3 * SIZE(AO)
kusano 2b45e8
	LD	b7,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a3, b3				#	3rd compute
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO,  4 * SIZE				#	AO += 1mr*4kr	
kusano 2b45e8
	daddiu	BO, BO,  4 * SIZE				#	BO += 1nr*4kr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	next 
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a7, b7				#	4th compute
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L92
kusano 2b45e8
	nop
kusano 2b45e8
	
kusano 2b45e8
	.align 3
kusano 2b45e8
kusano 2b45e8
.L95:
kusano 2b45e8
	andi	L, TEMP,  3
kusano 2b45e8
	blez	L, .L98
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
	.align	3
kusano 2b45e8
.L96:											#	k remainder loop for the mr=1, nr=1 tile: one k step per pass
kusano 2b45e8
	MADD	t11, t11, a1, b1				#	t11 += a1 * b1
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO,  1 * SIZE				#	AO += 1mr
kusano 2b45e8
	daddiu	BO, BO,  1 * SIZE				#	BO += 1nr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	operands for the next pass
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1						#	one remainder step done
kusano 2b45e8
	bgtz	L, .L96							#	repeat while steps remain
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L98:											#	solve step for the mr=1, nr=1 tile
kusano 2b45e8
	daddiu	TEMP, KK, -1					#	TEMP = KK - 1 (mr=1)
kusano 2b45e8
	dsll	TEMP, TEMP,  BASE_SHIFT			#	same byte offset serves A and B: one element per k step in each
kusano 2b45e8
kusano 2b45e8
	daddu	AO, AORIG, TEMP					#	AO points to the triangular data part
kusano 2b45e8
	daddu	BO, B,     TEMP					#	BO points to the matching entry of packed B
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)					#	current value held in packed B
kusano 2b45e8
kusano 2b45e8
	SUB	t11, b1, t11					#	t11 = b1 - accumulated products
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b3,  0 * SIZE(AO)					#	A element used as the scale factor; presumably the pre-inverted diagonal - confirm against the packing routine
kusano 2b45e8
	MUL	t11, b3, t11					#	t11 = b3 * t11
kusano 2b45e8
kusano 2b45e8
	daddiu	CO1, CO1, -1 * SIZE				#	step the C pointer back by one element
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(BO)					#	write the result back into packed B
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(CO1)					#	and into C
kusano 2b45e8
kusano 2b45e8
	daddiu	KK, KK, -1						#	KK -= 1 (one row consumed)
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L90:	
kusano 2b45e8
	andi	I,  M, 2
kusano 2b45e8
	blez	I, .L80
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
	MTC	$0,  t11
kusano 2b45e8
	MOV	t21, t11							#	clear result registers
kusano 2b45e8
kusano 2b45e8
	dsll	TEMP,   K, 1+BASE_SHIFT			#	mr=2
kusano 2b45e8
	dsubu	AORIG, AORIG, TEMP
kusano 2b45e8
	
kusano 2b45e8
	dsll	L,    KK, 1 + BASE_SHIFT
kusano 2b45e8
	dsll	TEMP, KK, 0 + BASE_SHIFT
kusano 2b45e8
kusano 2b45e8
	daddu	AO, AORIG, L					#	AO point to the rectangular data part
kusano 2b45e8
	daddu	BO, B,     TEMP
kusano 2b45e8
kusano 2b45e8
	dsubu	TEMP, K, KK
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	dsra	L,  TEMP, 2
kusano 2b45e8
	blez	L, .L85
kusano 2b45e8
	nop
kusano 2b45e8
	
kusano 2b45e8
	.align	3
kusano 2b45e8
.L82:
kusano 2b45e8
	LD	a5,  2 * SIZE(AO)					
kusano 2b45e8
	LD	a6,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b5,  1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b1				#	1st compute
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
kusano 2b45e8
	LD	a3,  4 * SIZE(AO)
kusano 2b45e8
	LD	a4,  5 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b5				#	2ed compute
kusano 2b45e8
	MADD	t21, t21, a6, b5
kusano 2b45e8
kusano 2b45e8
	LD	a7,  6 * SIZE(AO)
kusano 2b45e8
	LD	a8,  7 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b7,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a3, b3				#	3rd compute
kusano 2b45e8
	MADD	t21, t21, a4, b3
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO,  8 * SIZE				#	AO += 2mr*4kr	
kusano 2b45e8
	daddiu	BO, BO,  4 * SIZE				#	BO += 1nr*4kr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	next 
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a7, b7				#	4th compute
kusano 2b45e8
	MADD	t21, t21, a8, b7
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L82
kusano 2b45e8
	nop
kusano 2b45e8
	
kusano 2b45e8
	.align 3
kusano 2b45e8
kusano 2b45e8
.L85:
kusano 2b45e8
	andi	L, TEMP,  3
kusano 2b45e8
	blez	L, .L88
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
	.align	3
kusano 2b45e8
.L86:											#	k remainder loop for the mr=2, nr=1 tile: one k step per pass
kusano 2b45e8
	MADD	t11, t11, a1, b1				#	t11 += a1 * b1
kusano 2b45e8
	MADD	t21, t21, a2, b1				#	t21 += a2 * b1
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO,  2 * SIZE				#	AO += 2mr
kusano 2b45e8
	daddiu	BO, BO,  1 * SIZE				#	BO += 1nr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	operands for the next pass
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1						#	one remainder step done
kusano 2b45e8
	bgtz	L, .L86							#	repeat while steps remain
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L88:
kusano 2b45e8
	daddiu	TEMP, KK, -2					# 	mr=2
kusano 2b45e8
	dsll	L,    TEMP, 1 + BASE_SHIFT
kusano 2b45e8
	dsll	TEMP, TEMP, 0 + BASE_SHIFT
kusano 2b45e8
kusano 2b45e8
	daddu	AO, AORIG, L					#	AO point to the triangular data part
kusano 2b45e8
	daddu	BO, B,     TEMP
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	SUB	t11, b1, t11
kusano 2b45e8
	SUB	t21, b2, t21
kusano 2b45e8
kusano 2b45e8
	LD	b1,  3 * SIZE(AO)				#	computes the triangular_part		
kusano 2b45e8
	LD	b2,  2 * SIZE(AO)
kusano 2b45e8
	MUL	t21, b1, t21
kusano 2b45e8
	NMSUB	t11, t11, b2, t21
kusano 2b45e8
	
kusano 2b45e8
	LD	b3,  0 * SIZE(AO)
kusano 2b45e8
	MUL	t11, b3, t11
kusano 2b45e8
kusano 2b45e8
	daddiu	CO1, CO1, -2 * SIZE
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(BO)
kusano 2b45e8
	ST	t21,  1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(CO1)
kusano 2b45e8
	ST	t21,  1 * SIZE(CO1)
kusano 2b45e8
kusano 2b45e8
	daddiu	KK, KK, -2
kusano 2b45e8
	
kusano 2b45e8
	
kusano 2b45e8
	.align	3
kusano 2b45e8
.L80:
kusano 2b45e8
	dsra	I,  M, 2
kusano 2b45e8
	blez	I, .L89
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
.L71:
kusano 2b45e8
	dsll	TEMP,   K,  2 + BASE_SHIFT		#	mr=4
kusano 2b45e8
	dsubu	AORIG, AORIG, TEMP
kusano 2b45e8
kusano 2b45e8
	dsll	L,    KK, 2 + BASE_SHIFT		#	mr=4
kusano 2b45e8
	dsll	TEMP, KK, 0 + BASE_SHIFT		#	nr=1
kusano 2b45e8
kusano 2b45e8
	daddu	AO, AORIG, L					#	AO point to the rectangular
kusano 2b45e8
	daddu	BO, B,     TEMP
kusano 2b45e8
kusano 2b45e8
	dsubu	TEMP, K, KK
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	MTC	$0,  t11								#	clear result regusters
kusano 2b45e8
	MOV	t21, t11
kusano 2b45e8
	MOV	t31, t11
kusano 2b45e8
	MOV	t41, t11
kusano 2b45e8
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	this part compute the rectangular data part of Ai
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)					#	mr*KK with nr*KK
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)
kusano 2b45e8
	LD	a4,  3 * SIZE(AO)					#	get 4a
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)					#	get 4b
kusano 2b45e8
kusano 2b45e8
	dsra	L,  TEMP, 2
kusano 2b45e8
	blez	L, .L75
kusano 2b45e8
	nop										#	reset B
kusano 2b45e8
kusano 2b45e8
	.align	3
kusano 2b45e8
.L72:
kusano 2b45e8
	LD	a5,  4 * SIZE(AO)					
kusano 2b45e8
	LD	a6,  5 * SIZE(AO)
kusano 2b45e8
	LD	a7,  6 * SIZE(AO)
kusano 2b45e8
	LD	a8,  7 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b5,  1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b1				#	1st compute
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
	MADD	t31, t31, a3, b1
kusano 2b45e8
	MADD	t41, t41, a4, b1
kusano 2b45e8
kusano 2b45e8
	LD	a1,   8 * SIZE(AO)
kusano 2b45e8
	LD	a2,   9 * SIZE(AO)
kusano 2b45e8
	LD	a3,  10 * SIZE(AO)
kusano 2b45e8
	LD	a4,  11 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b5				#	2ed compute
kusano 2b45e8
	MADD	t21, t21, a6, b5
kusano 2b45e8
	MADD	t31, t31, a7, b5
kusano 2b45e8
	MADD	t41, t41, a8, b5
kusano 2b45e8
kusano 2b45e8
	LD	a5,  12 * SIZE(AO)
kusano 2b45e8
	LD	a6,  13 * SIZE(AO)
kusano 2b45e8
	LD	a7,  14 * SIZE(AO)
kusano 2b45e8
	LD	a8,  15 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b7,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b3				#	3rd compute
kusano 2b45e8
	MADD	t21, t21, a2, b3
kusano 2b45e8
	MADD	t31, t31, a3, b3
kusano 2b45e8
	MADD	t41, t41, a4, b3
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO, 16 * SIZE				#	AO += 4mr*4kr	
kusano 2b45e8
	daddiu	BO, BO,  4 * SIZE				#	BO += 1nr*4kr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	next 
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)
kusano 2b45e8
	LD	a4,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b7				#	4th compute
kusano 2b45e8
	MADD	t21, t21, a6, b7
kusano 2b45e8
	MADD	t31, t31, a7, b7
kusano 2b45e8
	MADD	t41, t41, a8, b7
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L72
kusano 2b45e8
	nop
kusano 2b45e8
	
kusano 2b45e8
	.align 3
kusano 2b45e8
kusano 2b45e8
.L75:
kusano 2b45e8
	andi	L, TEMP,  3
kusano 2b45e8
	blez	L, .L78
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
	.align	3
kusano 2b45e8
.L76:											#	k remainder loop for the mr=4, nr=1 tile: one k step per pass
kusano 2b45e8
	MADD	t11, t11, a1, b1				#	t(i)1 += a(i) * b1 for the four rows
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
	MADD	t31, t31, a3, b1
kusano 2b45e8
	MADD	t41, t41, a4, b1
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO,  4 * SIZE				#	AO += 4mr
kusano 2b45e8
	daddiu	BO, BO,  1 * SIZE				#	BO += 1nr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	operands for the next pass
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)
kusano 2b45e8
	LD	a4,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1						#	one remainder step done
kusano 2b45e8
	bgtz	L, .L76							#	repeat while steps remain
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
.L78:
kusano 2b45e8
	daddiu	TEMP, KK, -4				#	mr=4
kusano 2b45e8
kusano 2b45e8
	dsll	L,    TEMP, 2 + BASE_SHIFT	#	mr=4
kusano 2b45e8
	dsll	TEMP, TEMP, 0 + BASE_SHIFT	#	nr=1
kusano 2b45e8
	daddu	AO, AORIG, L				#	AO point to the triangular
kusano 2b45e8
	daddu	BO, B,     TEMP
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
	LD	b4,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	SUB	t11, b1, t11
kusano 2b45e8
	SUB	t21, b2, t21
kusano 2b45e8
	SUB	t31, b3, t31
kusano 2b45e8
	SUB	t41, b4, t41
kusano 2b45e8
kusano 2b45e8
	LD	b1,  15 * SIZE(AO)
kusano 2b45e8
	LD	b2,  14 * SIZE(AO)	
kusano 2b45e8
	LD	b4,  13 * SIZE(AO)
kusano 2b45e8
	LD	b7,  12 * SIZE(AO)
kusano 2b45e8
	MUL	t41, b1, t41
kusano 2b45e8
	NMSUB	t31, t31, b2, t41
kusano 2b45e8
	NMSUB	t21, t21, b4, t41
kusano 2b45e8
	NMSUB	t11, t11, b7, t41
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b3,  10 * SIZE(AO)	
kusano 2b45e8
	LD	b5,   9 * SIZE(AO)
kusano 2b45e8
	LD	b8,   8 * SIZE(AO)
kusano 2b45e8
	MUL	t31, b3, t31
kusano 2b45e8
	NMSUB	t21, t21, b5, t31
kusano 2b45e8
	NMSUB	t11, t11, b8, t31
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b6,   5 * SIZE(AO)
kusano 2b45e8
	LD	b1,   4 * SIZE(AO)
kusano 2b45e8
	MUL	t21, b6, t21
kusano 2b45e8
	NMSUB	t11, t11, b1, t21
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b2,   0 * SIZE(AO)
kusano 2b45e8
	MUL	t11, b2, t11
kusano 2b45e8
kusano 2b45e8
	daddiu	CO1, CO1, -4 * SIZE
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(BO)
kusano 2b45e8
	ST	t21,  1 * SIZE(BO)
kusano 2b45e8
	ST	t31,  2 * SIZE(BO)
kusano 2b45e8
	ST	t41,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(CO1)
kusano 2b45e8
	ST	t21,  1 * SIZE(CO1)
kusano 2b45e8
	ST	t31,  2 * SIZE(CO1)
kusano 2b45e8
	ST	t41,  3 * SIZE(CO1)
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	daddiu	KK, KK, -4
kusano 2b45e8
	daddiu	I, I, -1
kusano 2b45e8
	bgtz	I, .L71
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L89:
kusano 2b45e8
	dsll	TEMP, K, BASE_SHIFT			#	nr=1
kusano 2b45e8
	daddu	B, B, TEMP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
kusano 2b45e8
.L999:											#	common exit: reload saved registers from the stack frame and return
kusano 2b45e8
	LDARG	$16,   0($sp)					#	integer registers $16-$21 from frame offsets 0-40
kusano 2b45e8
	LDARG	$17,   8($sp)
kusano 2b45e8
	LDARG	$18,  16($sp)
kusano 2b45e8
	LDARG	$19,  24($sp)
kusano 2b45e8
	LDARG	$20,  32($sp)
kusano 2b45e8
	LDARG	$21,  40($sp)
kusano 2b45e8
	ldc1	$f24, 48($sp)					#	floating-point registers $f24-$f28 from frame offsets 48-80
kusano 2b45e8
	ldc1	$f25, 56($sp)
kusano 2b45e8
	ldc1	$f26, 64($sp)
kusano 2b45e8
	ldc1	$f27, 72($sp)
kusano 2b45e8
	ldc1	$f28, 80($sp)
kusano 2b45e8
kusano 2b45e8
	LDARG	$22,  88($sp)					#	integer registers $22-$25 from frame offsets 88-112
kusano 2b45e8
	LDARG	$23,  96($sp)
kusano 2b45e8
	LDARG	$24, 104($sp)
kusano 2b45e8
	LDARG	$25, 112($sp)
kusano 2b45e8
kusano 2b45e8
#ifndef __64BIT__
kusano 2b45e8
	ldc1	$f20,112($sp)					#	NOTE(review): offset 112 is also the slot read into $25 above - check the store layout in the prologue
kusano 2b45e8
	ldc1	$f21,120($sp)
kusano 2b45e8
	ldc1	$f22,128($sp)
kusano 2b45e8
	ldc1	$f23,136($sp)
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
	j	$31									#	return to the caller
kusano 2b45e8
	daddiu	$sp, $sp, 144					#	release the 144-byte frame; placed right after the jump, presumably in its delay slot - confirm noreorder mode
kusano 2b45e8
kusano 2b45e8
	EPILOGUE