Blame thirdparty/openblas/xianyi-OpenBLAS-e6e87a2/kernel/mips64/trsm_kernel_RT_loongson3a.S

kusano 2b45e8
#define REALNAME ASMNAME
kusano 2b45e8
kusano 2b45e8
#define ASSEMBLER
kusano 2b45e8
#include "common.h"
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#define M	$4
kusano 2b45e8
#define	N	$5
kusano 2b45e8
#define	K	$6
kusano 2b45e8
#define A	$8
kusano 2b45e8
#define B	$9
kusano 2b45e8
#define C	$10
kusano 2b45e8
#define LDC	$11
kusano 2b45e8
kusano 2b45e8
#define AO	$12
kusano 2b45e8
#define BO	$13
kusano 2b45e8
kusano 2b45e8
#define I	$2
kusano 2b45e8
#define J	$3
kusano 2b45e8
#define L	$7
kusano 2b45e8
kusano 2b45e8
#define CO1	$14
kusano 2b45e8
#define CO2	$15
kusano 2b45e8
#define CO3	$16
kusano 2b45e8
#define CO4	$17
kusano 2b45e8
kusano 2b45e8
#define OFFSET	$22
kusano 2b45e8
#define KK	$23
kusano 2b45e8
#define TEMP	$24
kusano 2b45e8
#define AORIG	$25
kusano 2b45e8
kusano 2b45e8
#define a1	$f0
kusano 2b45e8
#define a2	$f1
kusano 2b45e8
#define a3	$f26
kusano 2b45e8
#define a4	$f27
kusano 2b45e8
kusano 2b45e8
#define a5	$f28
kusano 2b45e8
#define a6	$f29
kusano 2b45e8
#define	a7	$f30
kusano 2b45e8
#define	a8	$f31
kusano 2b45e8
kusano 2b45e8
#define b1	$f2
kusano 2b45e8
#define b2	$f3
kusano 2b45e8
#define b3	$f4
kusano 2b45e8
#define b4	$f5
kusano 2b45e8
kusano 2b45e8
#define b5	$f6
kusano 2b45e8
#define b6	$f7
kusano 2b45e8
#define b7	$f8
kusano 2b45e8
#define b8	$f9
kusano 2b45e8
kusano 2b45e8
#define t11	$f10
kusano 2b45e8
#define t21	$f11
kusano 2b45e8
#define t31	$f12
kusano 2b45e8
#define	t41	$f13
kusano 2b45e8
kusano 2b45e8
#define t12	$f14
kusano 2b45e8
#define	t22	$f15
kusano 2b45e8
#define t32	$f16
kusano 2b45e8
#define	t42	$f17
kusano 2b45e8
kusano 2b45e8
#define	t13	$f18
kusano 2b45e8
#define	t23	$f19
kusano 2b45e8
#define	t33	$f20
kusano 2b45e8
#define	t43	$f21
kusano 2b45e8
kusano 2b45e8
#define	t14	$f22
kusano 2b45e8
#define	t24	$f23
kusano 2b45e8
#define	t34	$f24
kusano 2b45e8
#define t44	$f25
kusano 2b45e8
kusano 2b45e8
	PROLOGUE
kusano 2b45e8
	
kusano 2b45e8
	daddiu	$sp, $sp, -144					#	reserve a 144-byte frame for the register save area
kusano 2b45e8
kusano 2b45e8
	SDARG	$16,   0($sp)					#	$16 and $17 are aliased as CO3 and CO4
kusano 2b45e8
	SDARG	$17,   8($sp)
kusano 2b45e8
	SDARG	$18,  16($sp)
kusano 2b45e8
	SDARG	$19,  24($sp)
kusano 2b45e8
	SDARG	$20,  32($sp)
kusano 2b45e8
	SDARG	$21,  40($sp)
kusano 2b45e8
	sdc1	$f24, 48($sp)					#	$f24-$f28 are aliased as t34,t44,a3,a4,a5
kusano 2b45e8
	sdc1	$f25, 56($sp)
kusano 2b45e8
	sdc1	$f26, 64($sp)
kusano 2b45e8
	sdc1	$f27, 72($sp)
kusano 2b45e8
	sdc1	$f28, 80($sp)
											#	NOTE(review): a6,a7,a8 ($f29-$f31) are written by the loops below
											#	but get no save slot here - confirm against the target ABI and the epilogue
kusano 2b45e8
kusano 2b45e8
	SDARG	$22,  88($sp)					#	$22-$25 are aliased as OFFSET,KK,TEMP,AORIG
kusano 2b45e8
	SDARG	$23,  96($sp)
kusano 2b45e8
	SDARG	$24, 104($sp)
kusano 2b45e8
	SDARG	$25, 112($sp)
kusano 2b45e8
kusano 2b45e8
#ifndef __64BIT__
kusano 2b45e8
	sdc1	$f20,112($sp)					#	NOTE(review): same offset as the $25 slot above - confirm the overlap is intended
kusano 2b45e8
	sdc1	$f21,120($sp)
kusano 2b45e8
	sdc1	$f22,128($sp)
kusano 2b45e8
	sdc1	$f23,136($sp)
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align	3								#	RT variant: columns are processed from right to left
kusano 2b45e8
	LDARG	OFFSET, 144($sp)				#	OFFSET is read from 144($sp), just above the 144-byte frame
kusano 2b45e8
	dsll	LDC, LDC, BASE_SHIFT			#	LDC = LDC << BASE_SHIFT (presumably elements to bytes)
kusano 2b45e8
kusano 2b45e8
	mult	N, K
kusano 2b45e8
	mflo	TEMP							#	TEMP = N * K, taken from LO
kusano 2b45e8
kusano 2b45e8
	dsll	TEMP, TEMP, BASE_SHIFT			#	scale by the element size; B represents the triangular matrix
kusano 2b45e8
	daddu	B, B, TEMP						#	B points to the end of sb (B + N*K elements)
kusano 2b45e8
											#	Be careful: B is not affected by mc
kusano 2b45e8
	mult	N, LDC
kusano 2b45e8
	mflo	TEMP							#	TEMP = N * LDC (LDC was already scaled above)
kusano 2b45e8
	daddu	C, C, TEMP						#	C points just past the last column (C + N*LDC)
kusano 2b45e8
kusano 2b45e8
	dsubu	KK, K, OFFSET					#	KK = K - OFFSET; K-KK is the length of the rectangular part of Bj
kusano 2b45e8
kusano 2b45e8
	andi	J,  N, 1						#	J = N & 1: is there a single trailing column?
kusano 2b45e8
	blez	J, .L30							#	no: go to the nr=2 stage
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
	dsll	TEMP, K, BASE_SHIFT
kusano 2b45e8
	dsubu	B, B, TEMP						#	move B to the beginning address of Bj
kusano 2b45e8
kusano 2b45e8
	dsubu	C, C, LDC
kusano 2b45e8
kusano 2b45e8
	move	CO1, C
kusano 2b45e8
kusano 2b45e8
	move	AORIG, A
kusano 2b45e8
	
kusano 2b45e8
	dsra	I,  M, 2
kusano 2b45e8
	blez	I, .L80
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
.L31:										#	mr=4,nr=1
kusano 2b45e8
	dsll	L,    KK, 2 + BASE_SHIFT		#	mr=4	
kusano 2b45e8
	dsll	TEMP, KK, BASE_SHIFT			#	nr=1
kusano 2b45e8
	daddu	AO, AORIG, L					
kusano 2b45e8
	daddu	BO, B, TEMP						#	BO point to the retangular data part,also reset BO
kusano 2b45e8
	dsubu	TEMP, K, KK						#	temp = the length of rectangular data part 
kusano 2b45e8
kusano 2b45e8
	MTC	$0,  t11							#	clear 4 results registers
kusano 2b45e8
	MOV	t21, t11
kusano 2b45e8
	MOV	t31, t11
kusano 2b45e8
	MOV	t41, t11
kusano 2b45e8
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)
kusano 2b45e8
	LD	a4,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	dsra	L,  TEMP, 2						#	L=(KC-offset)/4
kusano 2b45e8
	blez	L, .L35
kusano 2b45e8
	NOP
kusano 2b45e8
	
kusano 2b45e8
	.align	3
kusano 2b45e8
kusano 2b45e8
.L32:
kusano 2b45e8
	LD	a5,  4 * SIZE(AO)
kusano 2b45e8
	LD	a6,  5 * SIZE(AO)
kusano 2b45e8
	LD	a7,  6 * SIZE(AO)
kusano 2b45e8
	LD	a8,  7 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b5,  1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b1
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
	MADD	t31, t31, a3, b1
kusano 2b45e8
	MADD	t41, t41, a4, b1
kusano 2b45e8
kusano 2b45e8
	LD	a1,  8 * SIZE(AO)
kusano 2b45e8
	LD	a2,  9 * SIZE(AO)
kusano 2b45e8
	LD	a3,  10 * SIZE(AO)
kusano 2b45e8
	LD	a4,  11 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b5
kusano 2b45e8
	MADD	t21, t21, a6, b5
kusano 2b45e8
	MADD	t31, t31, a7, b5
kusano 2b45e8
	MADD	t41, t41, a8, b5
kusano 2b45e8
kusano 2b45e8
	LD	a5,  12 * SIZE(AO)
kusano 2b45e8
	LD	a6,  13 * SIZE(AO)
kusano 2b45e8
	LD	a7,  14 * SIZE(AO)
kusano 2b45e8
	LD	a8,  15 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b7,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b3
kusano 2b45e8
	MADD	t21, t21, a2, b3
kusano 2b45e8
	MADD	t31, t31, a3, b3
kusano 2b45e8
	MADD	t41, t41, a4, b3
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO, 16 * SIZE			#	AO += 4mr*4kr	
kusano 2b45e8
	daddiu	BO, BO,  4 * SIZE			#	BO += 1nr*4kr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)
kusano 2b45e8
	LD	a4,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b7
kusano 2b45e8
	MADD	t21, t21, a6, b7
kusano 2b45e8
	MADD	t31, t31, a7, b7
kusano 2b45e8
	MADD	t41, t41, a8, b7
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L32
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
kusano 2b45e8
.L35:
kusano 2b45e8
	andi	L, TEMP, 3
kusano 2b45e8
	blez	L, .L38
kusano 2b45e8
	NOP
kusano 2b45e8
	.align	3
kusano 2b45e8
kusano 2b45e8
.L36:										#	mr=4,nr=1 remainder loop: one k-step per pass, (K-KK) & 3 passes
kusano 2b45e8
	MADD	t11, t11, a1, b1
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
	MADD	t31, t31, a3, b1
kusano 2b45e8
	MADD	t41, t41, a4, b1
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO, 4 * SIZE			#	AO += 4mr
kusano 2b45e8
	daddiu	BO, BO, 1 * SIZE			#	BO += 1nr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)				#	preload the operands of the next k-step
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)
kusano 2b45e8
	LD	a4,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L36
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align
kusano 2b45e8
.L38:
kusano 2b45e8
	daddiu	TEMP, KK,  -1				#	deal with the triangular data part
kusano 2b45e8
	dsll	L,    TEMP, 2 + BASE_SHIFT
kusano 2b45e8
	dsll	TEMP, TEMP, BASE_SHIFT		#	nr=1
kusano 2b45e8
	daddu	AO, AORIG, L
kusano 2b45e8
	daddu	BO, B, TEMP					#	BO point to the trigular data part
kusano 2b45e8
kusano 2b45e8
 	LD	b1,  0 * SIZE(AO)				#	fixed results
kusano 2b45e8
	LD	b2,  1 * SIZE(AO)
kusano 2b45e8
	LD	b3,  2 * SIZE(AO)
kusano 2b45e8
	LD	b4,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	SUB	t11, b1, t11
kusano 2b45e8
	SUB	t21, b2, t21
kusano 2b45e8
	SUB	t31, b3, t31
kusano 2b45e8
	SUB	t41, b4, t41
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b2,   0 * SIZE(BO)
kusano 2b45e8
	MUL	t11, b2, t11
kusano 2b45e8
	MUL	t21, b2, t21
kusano 2b45e8
	MUL	t31, b2, t31
kusano 2b45e8
	MUL	t41, b2, t41
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(AO)				# updata packed A
kusano 2b45e8
	ST	t21,  1 * SIZE(AO)
kusano 2b45e8
	ST	t31,  2 * SIZE(AO)
kusano 2b45e8
	ST	t41,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(CO1)				#	write back
kusano 2b45e8
	ST	t21,  1 * SIZE(CO1)
kusano 2b45e8
	ST	t31,  2 * SIZE(CO1)
kusano 2b45e8
	ST	t41,  3 * SIZE(CO1)
kusano 2b45e8
	
kusano 2b45e8
kusano 2b45e8
	daddiu	CO1, CO1, 4 * SIZE			#	fixed pointer
kusano 2b45e8
kusano 2b45e8
	dsll	TEMP, K, 2 + BASE_SHIFT		
kusano 2b45e8
	daddu	AORIG, AORIG, TEMP			# move to next panel Ai
kusano 2b45e8
kusano 2b45e8
	daddiu	I, I, -1
kusano 2b45e8
	bgtz	I, .L31
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L80:										#	mr=2,nr=1 tile, taken when bit 1 of M is set
kusano 2b45e8
	andi	I, M, 2
kusano 2b45e8
	blez	I, .L90
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
	dsll	L,    KK, 1 + BASE_SHIFT		#	mr=2: byte offset of k index KK in the A panel
kusano 2b45e8
	dsll	TEMP, KK, BASE_SHIFT			#	nr=1: byte offset of k index KK in the B panel
kusano 2b45e8
	daddu	AO, AORIG, L					#	AO points to the rectangular part of A
kusano 2b45e8
	daddu	BO, B, TEMP						#	BO points to the rectangular part (also resets BO)
kusano 2b45e8
	dsubu	TEMP, K, KK						#	TEMP = K - KK, the length of the rectangular part
kusano 2b45e8
kusano 2b45e8
	MTC	$0,  t11							#	clear the 2 result registers
kusano 2b45e8
	MOV	t21, t11
kusano 2b45e8
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)				#	preload the operands of k-step 0
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	dsra	L,  TEMP, 2						#	L = (K-KK)/4 passes of the unrolled loop
kusano 2b45e8
	blez	L, .L85
kusano 2b45e8
	NOP
kusano 2b45e8
	
kusano 2b45e8
	.align	3
kusano 2b45e8
kusano 2b45e8
.L82:										#	mr=2,nr=1 main loop: four k-steps per pass
	LD	a5,  2 * SIZE(AO)				#	A pair for k-step 1
	LD	a6,  3 * SIZE(AO)

	LD	b5,  1 * SIZE(BO)				#	B value for k-step 1

	MADD	t11, t11, a1, b1				#	k-step 0 (a1,a2,b1 were loaded before this pass)
	MADD	t21, t21, a2, b1

	LD	a3,  4 * SIZE(AO)				#	A pair for k-step 2
	LD	a4,  5 * SIZE(AO)

	LD	b3,  2 * SIZE(BO)				#	B value for k-step 2

	MADD	t11, t11, a5, b5				#	k-step 1
	MADD	t21, t21, a6, b5

	LD	a7,  6 * SIZE(AO)				#	A pair for k-step 3
	LD	a8,  7 * SIZE(AO)

	LD	b7,  3 * SIZE(BO)				#	B value for k-step 3

	MADD	t11, t11, a3, b3				#	k-step 2
	MADD	t21, t21, a4, b3

											#	One pass reads A offsets 0..7 (2 rows x 4 k-steps), so AO
											#	must advance by 8 elements. The former stride of 16 skipped
											#	8 elements per pass and fed the wrong data to later passes.
	daddiu	AO, AO,  8 * SIZE			#	AO += 2mr*4kr
	daddiu	BO, BO,  4 * SIZE			#	BO += 1nr*4kr

	LD	a1,  0 * SIZE(AO)				#	preload k-step 0 of the next pass
	LD	a2,  1 * SIZE(AO)

	LD	b1,  0 * SIZE(BO)

	MADD	t11, t11, a7, b7				#	k-step 3
	MADD	t21, t21, a8, b7

	daddiu	L, L, -1
	bgtz	L, .L82						#	repeat while passes remain
	NOP
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
kusano 2b45e8
.L85:
kusano 2b45e8
	andi	L, TEMP, 3
kusano 2b45e8
	blez	L, .L88
kusano 2b45e8
	NOP
kusano 2b45e8
	.align	3
kusano 2b45e8
kusano 2b45e8
.L86:
kusano 2b45e8
	MADD	t11, t11, a1, b1
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO, 2 * SIZE			#	AO += 2mr	
kusano 2b45e8
	daddiu	BO, BO, 1 * SIZE			#	BO += 1nr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L86
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align
kusano 2b45e8
.L88:
kusano 2b45e8
	daddiu	TEMP, KK,  -1				#	deal with the triangular data part
kusano 2b45e8
	dsll	L,    TEMP, 1 + BASE_SHIFT
kusano 2b45e8
	dsll	TEMP, TEMP, BASE_SHIFT		#	nr=1
kusano 2b45e8
	daddu	AO, AORIG, L
kusano 2b45e8
	daddu	BO, B, TEMP					#	BO point to the trigular data part
kusano 2b45e8
kusano 2b45e8
 	LD	b1,  0 * SIZE(AO)				#	fixed results
kusano 2b45e8
	LD	b2,  1 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	SUB	t11, b1, t11
kusano 2b45e8
	SUB	t21, b2, t21
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b2,   0 * SIZE(BO)
kusano 2b45e8
	MUL	t11, b2, t11
kusano 2b45e8
	MUL	t21, b2, t21
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(AO)				# updata packed A
kusano 2b45e8
	ST	t21,  1 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(CO1)				#	write back
kusano 2b45e8
	ST	t21,  1 * SIZE(CO1)
kusano 2b45e8
	
kusano 2b45e8
kusano 2b45e8
	daddiu	CO1, CO1, 2 * SIZE			#	fixed pointer
kusano 2b45e8
kusano 2b45e8
	dsll	TEMP, K, 1 + BASE_SHIFT		
kusano 2b45e8
	daddu	AORIG, AORIG, TEMP			# move to next panel Ai
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L90:										#	mr=1,nr=1 tile, taken when bit 0 of M is set
kusano 2b45e8
	andi	I, M, 1
kusano 2b45e8
	blez	I, .L39
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
	dsll	L,    KK, BASE_SHIFT			#	mr=1: byte offset of k index KK in the A panel
kusano 2b45e8
	dsll	TEMP, KK, BASE_SHIFT			#	nr=1: byte offset of k index KK in the B panel
kusano 2b45e8
	daddu	AO, AORIG, L					#	AO points to the rectangular part of A
kusano 2b45e8
	daddu	BO, B, TEMP						#	BO points to the rectangular part (also resets BO)
kusano 2b45e8
	dsubu	TEMP, K, KK						#	TEMP = K - KK, the length of the rectangular part
kusano 2b45e8
kusano 2b45e8
	MTC	$0,  t11							#	clear the single result register
kusano 2b45e8
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)				#	preload the operands of k-step 0
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	dsra	L,  TEMP, 2						#	L = (K-KK)/4 passes of the unrolled loop
kusano 2b45e8
	blez	L, .L95
kusano 2b45e8
	NOP
kusano 2b45e8
	
kusano 2b45e8
	.align	3
kusano 2b45e8
kusano 2b45e8
.L92:
kusano 2b45e8
	LD	a5,  1 * SIZE(AO)
kusano 2b45e8
	LD	b5,  1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b1
kusano 2b45e8
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b5
kusano 2b45e8
kusano 2b45e8
	LD	a7,  3 * SIZE(AO)
kusano 2b45e8
	LD	b7,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a3, b3
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO,  4 * SIZE			#	AO += 1mr*4kr	
kusano 2b45e8
	daddiu	BO, BO,  4 * SIZE			#	BO += 1nr*4kr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a7, b7
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L92
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
kusano 2b45e8
.L95:
kusano 2b45e8
	andi	L, TEMP, 3
kusano 2b45e8
	blez	L, .L98
kusano 2b45e8
	NOP
kusano 2b45e8
	.align	3
kusano 2b45e8
kusano 2b45e8
.L96:										#	mr=1,nr=1 remainder loop: one k-step per pass
kusano 2b45e8
	MADD	t11, t11, a1, b1
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO, 1 * SIZE			#	AO += 1mr
kusano 2b45e8
	daddiu	BO, BO, 1 * SIZE			#	BO += 1nr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)				#	preload the operands of the next k-step
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L96
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align
kusano 2b45e8
.L98:
kusano 2b45e8
	daddiu	TEMP, KK,  -1				#	deal with the triangular data part
kusano 2b45e8
	dsll	L,    TEMP, BASE_SHIFT
kusano 2b45e8
	dsll	TEMP, TEMP, BASE_SHIFT		#	nr=1
kusano 2b45e8
	daddu	AO, AORIG, L
kusano 2b45e8
	daddu	BO, B, TEMP					#	BO point to the trigular data part
kusano 2b45e8
kusano 2b45e8
 	LD	b1,  0 * SIZE(AO)				#	fixed results
kusano 2b45e8
kusano 2b45e8
	SUB	t11, b1, t11
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b2,   0 * SIZE(BO)
kusano 2b45e8
	MUL	t11, b2, t11
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(AO)				# updata packed A
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(CO1)				#	write back
kusano 2b45e8
	
kusano 2b45e8
kusano 2b45e8
	daddiu	CO1, CO1, 1 * SIZE			#	fixed pointer
kusano 2b45e8
kusano 2b45e8
	dsll	TEMP, K, BASE_SHIFT		
kusano 2b45e8
	daddu	AORIG, AORIG, TEMP			# move to next panel Ai
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L39:
kusano 2b45e8
	daddiu	KK, KK, -1						#	rectangular data length increased by 1
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L30:										#	nr=2
kusano 2b45e8
	andi	J,  N, 2
kusano 2b45e8
	blez	J, .L50
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
	dsll	TEMP, K, 1 + BASE_SHIFT			#	Kc*2nr	move B to the beginning address of Bj
kusano 2b45e8
	dsubu	B, B, TEMP
kusano 2b45e8
kusano 2b45e8
	dsll	TEMP, LDC, 1					#	C 
kusano 2b45e8
	dsubu	C, C, TEMP
kusano 2b45e8
kusano 2b45e8
	move	CO1, C
kusano 2b45e8
	daddu	CO2, C,   LDC
kusano 2b45e8
kusano 2b45e8
	move	AORIG, A
kusano 2b45e8
kusano 2b45e8
	dsra	I,  M, 2
kusano 2b45e8
	blez	I, .L60
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
.L51:										#	mr=4,nr=2
kusano 2b45e8
	dsll	L,    KK, 2 + BASE_SHIFT		#	mr=4	
kusano 2b45e8
	dsll	TEMP, KK, 1 + BASE_SHIFT		#	nr=2
kusano 2b45e8
	daddu	AO, AORIG, L					
kusano 2b45e8
	daddu	BO, B, TEMP						#	BO point to the retangular data part,also reset BO
kusano 2b45e8
	dsubu	TEMP, K, KK						#	temp = the length of rectangular data part 
kusano 2b45e8
kusano 2b45e8
	MTC	$0,  t11							#	clear 8 results registers
kusano 2b45e8
	MOV	t21, t11
kusano 2b45e8
	MOV	t31, t11
kusano 2b45e8
	MOV	t41, t11
kusano 2b45e8
	MOV	t12, t11
kusano 2b45e8
	MOV	t22, t11
kusano 2b45e8
	MOV	t32, t11
kusano 2b45e8
	MOV	t42, t11
kusano 2b45e8
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)
kusano 2b45e8
	LD	a4,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	dsra	L,  TEMP, 2						#	L=(KC-offset)/4
kusano 2b45e8
	blez	L, .L55
kusano 2b45e8
	NOP
kusano 2b45e8
	
kusano 2b45e8
	.align	3
kusano 2b45e8
kusano 2b45e8
.L52:
kusano 2b45e8
	LD	a5,  4 * SIZE(AO)
kusano 2b45e8
	LD	a6,  5 * SIZE(AO)
kusano 2b45e8
	LD	a7,  6 * SIZE(AO)
kusano 2b45e8
	LD	a8,  7 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b5,  2 * SIZE(BO)
kusano 2b45e8
	LD	b6,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b1
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
	MADD	t31, t31, a3, b1
kusano 2b45e8
	MADD	t41, t41, a4, b1
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a1, b2
kusano 2b45e8
	MADD	t22, t22, a2, b2
kusano 2b45e8
	MADD	t32, t32, a3, b2
kusano 2b45e8
	MADD	t42, t42, a4, b2
kusano 2b45e8
kusano 2b45e8
	LD	a1,  8 * SIZE(AO)
kusano 2b45e8
	LD	a2,  9 * SIZE(AO)
kusano 2b45e8
	LD	a3,  10 * SIZE(AO)
kusano 2b45e8
	LD	a4,  11 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b3,  4 * SIZE(BO)
kusano 2b45e8
	LD	b4,  5 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b5
kusano 2b45e8
	MADD	t21, t21, a6, b5
kusano 2b45e8
	MADD	t31, t31, a7, b5
kusano 2b45e8
	MADD	t41, t41, a8, b5
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a5, b6
kusano 2b45e8
	MADD	t22, t22, a6, b6
kusano 2b45e8
	MADD	t32, t32, a7, b6
kusano 2b45e8
	MADD	t42, t42, a8, b6
kusano 2b45e8
kusano 2b45e8
	LD	a5,  12 * SIZE(AO)
kusano 2b45e8
	LD	a6,  13 * SIZE(AO)
kusano 2b45e8
	LD	a7,  14 * SIZE(AO)
kusano 2b45e8
	LD	a8,  15 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b7,  6 * SIZE(BO)
kusano 2b45e8
	LD	b8,  7 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b3
kusano 2b45e8
	MADD	t21, t21, a2, b3
kusano 2b45e8
	MADD	t31, t31, a3, b3
kusano 2b45e8
	MADD	t41, t41, a4, b3
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a1, b4
kusano 2b45e8
	MADD	t22, t22, a2, b4
kusano 2b45e8
	MADD	t32, t32, a3, b4
kusano 2b45e8
	MADD	t42, t42, a4, b4
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO, 16 * SIZE			#	AO += 4mr*4kr	
kusano 2b45e8
	daddiu	BO, BO,  8 * SIZE			#	BO += 2nr*4kr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)
kusano 2b45e8
	LD	a4,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b7
kusano 2b45e8
	MADD	t21, t21, a6, b7
kusano 2b45e8
	MADD	t31, t31, a7, b7
kusano 2b45e8
	MADD	t41, t41, a8, b7
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a5, b8
kusano 2b45e8
	MADD	t22, t22, a6, b8
kusano 2b45e8
	MADD	t32, t32, a7, b8
kusano 2b45e8
	MADD	t42, t42, a8, b8
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L52
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
kusano 2b45e8
.L55:
kusano 2b45e8
	andi	L, TEMP, 3
kusano 2b45e8
	blez	L, .L58
kusano 2b45e8
	NOP
kusano 2b45e8
	.align	3
kusano 2b45e8
kusano 2b45e8
.L56:
kusano 2b45e8
	MADD	t11, t11, a1, b1
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
	MADD	t31, t31, a3, b1
kusano 2b45e8
	MADD	t41, t41, a4, b1
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a1, b2
kusano 2b45e8
	MADD	t22, t22, a2, b2
kusano 2b45e8
	MADD	t32, t32, a3, b2
kusano 2b45e8
	MADD	t42, t42, a4, b2
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO, 4 * SIZE			#	AO += 4mr	
kusano 2b45e8
	daddiu	BO, BO, 2 * SIZE			#	BO += 2nr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)
kusano 2b45e8
	LD	a4,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L56
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align
kusano 2b45e8
.L58:
kusano 2b45e8
	daddiu	TEMP, KK,  -2				#	deal with the triangular data part
kusano 2b45e8
	dsll	L,    TEMP, 2 + BASE_SHIFT
kusano 2b45e8
	dsll	TEMP, TEMP, 1 + BASE_SHIFT	#	nr=2
kusano 2b45e8
	daddu	AO, AORIG, L
kusano 2b45e8
	daddu	BO, B, TEMP					#	BO point to the trigular data part
kusano 2b45e8
kusano 2b45e8
 	LD	b1,  0 * SIZE(AO)				#	fixed results
kusano 2b45e8
	LD	b2,  1 * SIZE(AO)
kusano 2b45e8
	LD	b3,  2 * SIZE(AO)
kusano 2b45e8
	LD	b4,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	SUB	t11, b1, t11
kusano 2b45e8
	SUB	t21, b2, t21
kusano 2b45e8
	SUB	t31, b3, t31
kusano 2b45e8
	SUB	t41, b4, t41
kusano 2b45e8
kusano 2b45e8
 	LD	b5,  4 * SIZE(AO)
kusano 2b45e8
	LD	b6,  5 * SIZE(AO)
kusano 2b45e8
	LD	b7,  6 * SIZE(AO)
kusano 2b45e8
	LD	b8,  7 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	SUB	t12, b5, t12
kusano 2b45e8
	SUB	t22, b6, t22
kusano 2b45e8
	SUB	t32, b7, t32
kusano 2b45e8
	SUB	t42, b8, t42
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b8,   3 * SIZE(BO)
kusano 2b45e8
	LD	b1,   2 * SIZE(BO)
kusano 2b45e8
	MUL	t12, b8, t12
kusano 2b45e8
	MUL	t22, b8, t22
kusano 2b45e8
	MUL	t32, b8, t32
kusano 2b45e8
	MUL	t42, b8, t42
kusano 2b45e8
	NMSUB	t11, t11, b1, t12
kusano 2b45e8
	NMSUB	t21, t21, b1, t22
kusano 2b45e8
	NMSUB	t31, t31, b1, t32
kusano 2b45e8
	NMSUB	t41, t41, b1, t42
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b2,   0 * SIZE(BO)
kusano 2b45e8
	MUL	t11, b2, t11
kusano 2b45e8
	MUL	t21, b2, t21
kusano 2b45e8
	MUL	t31, b2, t31
kusano 2b45e8
	MUL	t41, b2, t41
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(AO)				# updata packed A
kusano 2b45e8
	ST	t21,  1 * SIZE(AO)
kusano 2b45e8
	ST	t31,  2 * SIZE(AO)
kusano 2b45e8
	ST	t41,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	ST	t12,  4 * SIZE(AO)
kusano 2b45e8
	ST	t22,  5 * SIZE(AO)
kusano 2b45e8
	ST	t32,  6 * SIZE(AO)
kusano 2b45e8
	ST	t42,  7 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(CO1)				#	write back
kusano 2b45e8
	ST	t21,  1 * SIZE(CO1)
kusano 2b45e8
	ST	t31,  2 * SIZE(CO1)
kusano 2b45e8
	ST	t41,  3 * SIZE(CO1)
kusano 2b45e8
	
kusano 2b45e8
	ST	t12,  0 * SIZE(CO2)
kusano 2b45e8
	ST	t22,  1 * SIZE(CO2)
kusano 2b45e8
	ST	t32,  2 * SIZE(CO2)
kusano 2b45e8
	ST	t42,  3 * SIZE(CO2)
kusano 2b45e8
	
kusano 2b45e8
	daddiu	CO1, CO1, 4 * SIZE			#	fixed pointer
kusano 2b45e8
	daddiu	CO2, CO2, 4 * SIZE
kusano 2b45e8
kusano 2b45e8
	dsll	TEMP, K, 2 + BASE_SHIFT		
kusano 2b45e8
	daddu	AORIG, AORIG, TEMP			# move to next panel Ai
kusano 2b45e8
kusano 2b45e8
	daddiu	I, I, -1
kusano 2b45e8
	bgtz	I, .L51
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L60:										#	mr=2,nr=2 tile, taken when bit 1 of M is set
kusano 2b45e8
	andi	I,  M, 2						#	mr=2
kusano 2b45e8
	blez	I, .L70
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
	dsll	L,    KK, 1 + BASE_SHIFT		#	mr=2: byte offset of k index KK in the A panel
kusano 2b45e8
	dsll	TEMP, KK, 1 + BASE_SHIFT		#	nr=2: byte offset of k index KK in the B panel
kusano 2b45e8
	daddu	AO, AORIG, L					#	AO points to the rectangular part of A
kusano 2b45e8
	daddu	BO, B, TEMP						#	BO points to the rectangular part (also resets BO)
kusano 2b45e8
	dsubu	TEMP, K, KK						#	TEMP = K - KK, the length of the rectangular part
kusano 2b45e8
kusano 2b45e8
	MTC	$0,  t11							#	clear the 4 result registers
kusano 2b45e8
	MOV	t21, t11
kusano 2b45e8
	MOV	t12, t11
kusano 2b45e8
	MOV	t22, t11
kusano 2b45e8
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)				#	preload the operands of k-step 0
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	dsra	L,  TEMP, 2						#	L = (K-KK)/4 passes of the unrolled loop
kusano 2b45e8
	blez	L, .L65
kusano 2b45e8
	NOP
kusano 2b45e8
	
kusano 2b45e8
	.align	3
kusano 2b45e8
kusano 2b45e8
.L62:
kusano 2b45e8
	LD	a5,  2 * SIZE(AO)
kusano 2b45e8
	LD	a6,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b5,  2 * SIZE(BO)
kusano 2b45e8
	LD	b6,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b1
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a1, b2
kusano 2b45e8
	MADD	t22, t22, a2, b2
kusano 2b45e8
kusano 2b45e8
	LD	a3,  4 * SIZE(AO)
kusano 2b45e8
	LD	a4,  5 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b3,  4 * SIZE(BO)
kusano 2b45e8
	LD	b4,  5 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b5
kusano 2b45e8
	MADD	t21, t21, a6, b5
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a5, b6
kusano 2b45e8
	MADD	t22, t22, a6, b6
kusano 2b45e8
kusano 2b45e8
	LD	a7,  6 * SIZE(AO)
kusano 2b45e8
	LD	a8,  7 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b7,  6 * SIZE(BO)
kusano 2b45e8
	LD	b8,  7 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a3, b3
kusano 2b45e8
	MADD	t21, t21, a4, b3
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a3, b4
kusano 2b45e8
	MADD	t22, t22, a4, b4
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO,  8 * SIZE			#	AO += 2mr*4kr	
kusano 2b45e8
	daddiu	BO, BO,  8 * SIZE			#	BO += 2nr*4kr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a7, b7
kusano 2b45e8
	MADD	t21, t21, a8, b7
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a7, b8
kusano 2b45e8
	MADD	t22, t22, a8, b8
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L62
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
kusano 2b45e8
.L65:
kusano 2b45e8
	andi	L, TEMP, 3
kusano 2b45e8
	blez	L, .L68
kusano 2b45e8
	NOP
kusano 2b45e8
	.align	3
kusano 2b45e8
kusano 2b45e8
.L66:
kusano 2b45e8
	MADD	t11, t11, a1, b1
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a1, b2
kusano 2b45e8
	MADD	t22, t22, a2, b2
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO, 2 * SIZE			#	AO += 2mr	
kusano 2b45e8
	daddiu	BO, BO, 2 * SIZE			#	BO += 2nr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L66
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align
kusano 2b45e8
.L68:
kusano 2b45e8
	daddiu	TEMP, KK,  -2				#	deal with the triangular data part
kusano 2b45e8
	dsll	L,    TEMP, 1 + BASE_SHIFT
kusano 2b45e8
	dsll	TEMP, TEMP, 1 + BASE_SHIFT	#	nr=2
kusano 2b45e8
	daddu	AO, AORIG, L
kusano 2b45e8
	daddu	BO, B, TEMP					#	BO point to the trigular data part
kusano 2b45e8
kusano 2b45e8
 	LD	b1,  0 * SIZE(AO)				#	fixed results
kusano 2b45e8
	LD	b2,  1 * SIZE(AO)
kusano 2b45e8
	LD	b3,  2 * SIZE(AO)
kusano 2b45e8
	LD	b4,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	SUB	t11, b1, t11
kusano 2b45e8
	SUB	t21, b2, t21
kusano 2b45e8
	SUB	t12, b3, t12
kusano 2b45e8
	SUB	t22, b4, t22
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b8,   3 * SIZE(BO)
kusano 2b45e8
	LD	b7,   2 * SIZE(BO)
kusano 2b45e8
	MUL	t12, b8, t12
kusano 2b45e8
	MUL	t22, b8, t22
kusano 2b45e8
	NMSUB	t11, t11, b7, t12
kusano 2b45e8
	NMSUB	t21, t21, b7, t22
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b6,   0 * SIZE(BO)
kusano 2b45e8
	MUL	t11, b6, t11
kusano 2b45e8
	MUL	t21, b6, t21
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(AO)				# updata packed A
kusano 2b45e8
	ST	t21,  1 * SIZE(AO)
kusano 2b45e8
	ST	t12,  2 * SIZE(AO)
kusano 2b45e8
	ST	t22,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(CO1)				#	write back
kusano 2b45e8
	ST	t21,  1 * SIZE(CO1)
kusano 2b45e8
	
kusano 2b45e8
	ST	t12,  0 * SIZE(CO2)
kusano 2b45e8
	ST	t22,  1 * SIZE(CO2)
kusano 2b45e8
	
kusano 2b45e8
	daddiu	CO1, CO1, 2 * SIZE			#	fixed pointer
kusano 2b45e8
	daddiu	CO2, CO2, 2 * SIZE
kusano 2b45e8
kusano 2b45e8
	dsll	TEMP, K, 1 + BASE_SHIFT		#	mr=2
kusano 2b45e8
	daddu	AORIG, AORIG, TEMP			# 	move to next panel Ai
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L70:										#	mr=1,nr=2 tile, taken when bit 0 of M is set
kusano 2b45e8
	andi	I, M, 1							#	mr=1
kusano 2b45e8
	blez	I, .L59
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
	dsll	L,    KK, BASE_SHIFT			#	mr=1: byte offset of k index KK in the A panel
kusano 2b45e8
	dsll	TEMP, KK, 1 + BASE_SHIFT		#	nr=2: byte offset of k index KK in the B panel
kusano 2b45e8
	daddu	AO, AORIG, L					#	AO points to the rectangular part of A
kusano 2b45e8
	daddu	BO, B, TEMP						#	BO points to the rectangular part (also resets BO)
kusano 2b45e8
	dsubu	TEMP, K, KK						#	TEMP = K - KK, the length of the rectangular part
kusano 2b45e8
kusano 2b45e8
	MTC	$0,  t11							#	clear the 2 result registers
kusano 2b45e8
	MOV	t12, t11
kusano 2b45e8
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)				#	preload the operands of k-step 0
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	dsra	L,  TEMP, 2						#	L = (K-KK)/4 passes of the unrolled loop
kusano 2b45e8
	blez	L, .L75
kusano 2b45e8
	NOP
kusano 2b45e8
	
kusano 2b45e8
	.align	3
kusano 2b45e8
kusano 2b45e8
.L72:
kusano 2b45e8
	LD	a5,  1 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b5,  2 * SIZE(BO)
kusano 2b45e8
	LD	b6,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b1
kusano 2b45e8
	MADD	t12, t12, a1, b2
kusano 2b45e8
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b3,  4 * SIZE(BO)
kusano 2b45e8
	LD	b4,  5 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b5
kusano 2b45e8
	MADD	t12, t12, a5, b6
kusano 2b45e8
kusano 2b45e8
	LD	a7,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b7,  6 * SIZE(BO)
kusano 2b45e8
	LD	b8,  7 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a3, b3
kusano 2b45e8
	MADD	t12, t12, a3, b4
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO,  4 * SIZE			#	AO += 1mr*4kr	
kusano 2b45e8
	daddiu	BO, BO,  8 * SIZE			#	BO += 2nr*4kr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a7, b7
kusano 2b45e8
	MADD	t12, t12, a7, b8
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L72
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
kusano 2b45e8
.L75:
kusano 2b45e8
	andi	L, TEMP, 3
kusano 2b45e8
	blez	L, .L78
kusano 2b45e8
	NOP
kusano 2b45e8
	.align	3
kusano 2b45e8
kusano 2b45e8
.L76:
kusano 2b45e8
	MADD	t11, t11, a1, b1
kusano 2b45e8
	MADD	t12, t12, a1, b2
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO, 1 * SIZE			#	AO += 1mr	
kusano 2b45e8
	daddiu	BO, BO, 2 * SIZE			#	BO += 2nr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L76
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align
kusano 2b45e8
.L78:										#	mr=1,nr=2 solve step on the triangular part
kusano 2b45e8
	daddiu	TEMP, KK,  -2				#	the triangular part starts at k index KK-2
kusano 2b45e8
	dsll	L,    TEMP, BASE_SHIFT		#	mr=1
kusano 2b45e8
	dsll	TEMP, TEMP, 1 + BASE_SHIFT	#	nr=2
kusano 2b45e8
	daddu	AO, AORIG, L
kusano 2b45e8
	daddu	BO, B, TEMP					#	BO points to the triangular part
kusano 2b45e8
kusano 2b45e8
 	LD	b1,  0 * SIZE(AO)				#	load the two stored values this tile is solved against
kusano 2b45e8
	LD	b2,  1 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	SUB	t11, b1, t11					#	combine with the accumulated products (presumably t = b - t; SUB comes from common.h)
kusano 2b45e8
	SUB	t12, b2, t12
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b8,   3 * SIZE(BO)				#	BO[3] scales column 2 (presumably a pre-inverted diagonal - TODO confirm)
kusano 2b45e8
	LD	b7,   2 * SIZE(BO)				#	BO[2] couples column 2 into column 1
kusano 2b45e8
	MUL	t12, b8, t12
kusano 2b45e8
	NMSUB	t11, t11, b7, t12
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b6,   0 * SIZE(BO)				#	BO[0] scales column 1
kusano 2b45e8
	MUL	t11, b6, t11
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(AO)				#	update packed A
kusano 2b45e8
	ST	t12,  1 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(CO1)				#	write back
kusano 2b45e8
	ST	t12,  0 * SIZE(CO2)
kusano 2b45e8
	
kusano 2b45e8
	daddiu	CO1, CO1, 1 * SIZE			#	advance both column pointers by one row
kusano 2b45e8
	daddiu	CO2, CO2, 1 * SIZE
kusano 2b45e8
kusano 2b45e8
	dsll	TEMP, K, BASE_SHIFT		#	mr=1
kusano 2b45e8
	daddu	AORIG, AORIG, TEMP			# 	move to next panel Ai
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L59:
kusano 2b45e8
	daddiu	KK, KK, -2						#	KK -= 2 after the nr=2 tiles; later tiles compute K - KK, so the rectangular part grows by 2
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L50:										# entry of the nr=4 column-panel loop
kusano 2b45e8
	dsra	J,  N, 2						#	J = NC/4 
kusano 2b45e8
	blez	J, .L999						#	no full 4-column panel: go to the epilogue
kusano 2b45e8
	NOP										#	branch delay slot
kusano 2b45e8
kusano 2b45e8
.L10:										# start of one 4-column panel; the loop is closed by the bgtz J at .L29
kusano 2b45e8
	dsll	TEMP, K, 2 + BASE_SHIFT		#	TEMP = K * 4 elements (nr=4)
kusano 2b45e8
	dsubu	B, B, TEMP						#	move B back to the beginning address of Bj
kusano 2b45e8
kusano 2b45e8
	dsll	TEMP, LDC, 2					#	TEMP = 4 * LDC (LDC is presumably already a byte stride, set before this view)
kusano 2b45e8
	dsubu	C, C, TEMP						#	move C back to the beginning address of Cj
kusano 2b45e8
kusano 2b45e8
	daddiu	J, J, -1						#	one panel consumed; tested again at .L29
kusano 2b45e8
kusano 2b45e8
	move	CO1, C							#	CO1..CO4 = the four columns of this panel, LDC apart
kusano 2b45e8
	daddu	CO2, C,   LDC
kusano 2b45e8
	daddu	CO3, CO2, LDC
kusano 2b45e8
	daddu	CO4, CO3, LDC
kusano 2b45e8
	
kusano 2b45e8
	move	AORIG, A						#	reset A
kusano 2b45e8
kusano 2b45e8
	dsra	I,  M, 2						#	I=MC/4
kusano 2b45e8
	blez	I, .L20							#	no full 4-row tile: try the 2-row tile
kusano 2b45e8
	NOP										#	branch delay slot
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L11:										# set up one 4x4 tile: pointers, 16 cleared accumulators, first operands
kusano 2b45e8
	dsll	L,    KK, 2 + BASE_SHIFT		#	mr=4	
kusano 2b45e8
	dsll	TEMP, KK, 2 + BASE_SHIFT		#	nr=4
kusano 2b45e8
	daddu	AO, AORIG, L					
kusano 2b45e8
	daddu	BO, B, TEMP						#	BO points to the rectangular data part, also resets BO
kusano 2b45e8
	dsubu	TEMP, K, KK						#	TEMP = length of the rectangular data part
kusano 2b45e8
kusano 2b45e8
	MTC	$0,  t11							#	clear 16 result registers: t11 from integer zero, the rest copied from t11
kusano 2b45e8
	MOV	t21, t11
kusano 2b45e8
	MOV	t31, t11
kusano 2b45e8
	MOV	t41, t11
kusano 2b45e8
	MOV	t12, t11
kusano 2b45e8
	MOV	t22, t11
kusano 2b45e8
	MOV	t32, t11
kusano 2b45e8
	MOV	t42, t11
kusano 2b45e8
	MOV	t13, t11
kusano 2b45e8
	MOV	t23, t11
kusano 2b45e8
	MOV	t33, t11
kusano 2b45e8
	MOV	t43, t11
kusano 2b45e8
	MOV	t14, t11
kusano 2b45e8
	MOV	t24, t11
kusano 2b45e8
	MOV	t34, t11
kusano 2b45e8
	MOV	t44, t11
kusano 2b45e8
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	preload the first four A values
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)
kusano 2b45e8
	LD	a4,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)					#	preload the first four B values
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
	LD	b4,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	dsra	L,  TEMP, 2						#	L=(KC-offset)/4
kusano 2b45e8
	blez	L, .L15							#	fewer than 4 k steps: remainder handling only
kusano 2b45e8
	NOP										#	branch delay slot
kusano 2b45e8
	
kusano 2b45e8
	.align	3
kusano 2b45e8
kusano 2b45e8
.L12:										# 4x4 tile, four k steps per pass; operands alternate between a1-a4/b1-b4 and a5-a8/b5-b8
kusano 2b45e8
	LD	a5,  4 * SIZE(AO)					#	operands of the second k step
kusano 2b45e8
	LD	a6,  5 * SIZE(AO)
kusano 2b45e8
	LD	a7,  6 * SIZE(AO)
kusano 2b45e8
	LD	a8,  7 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b5,  4 * SIZE(BO)
kusano 2b45e8
	LD	b6,  5 * SIZE(BO)
kusano 2b45e8
	LD	b7,  6 * SIZE(BO)
kusano 2b45e8
	LD	b8,  7 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b1				#	presumably tij += ai * bj (MADD is defined outside this file view)
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
	MADD	t31, t31, a3, b1
kusano 2b45e8
	MADD	t41, t41, a4, b1
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a1, b2
kusano 2b45e8
	MADD	t22, t22, a2, b2
kusano 2b45e8
	MADD	t32, t32, a3, b2
kusano 2b45e8
	MADD	t42, t42, a4, b2
kusano 2b45e8
kusano 2b45e8
	MADD	t13, t13, a1, b3
kusano 2b45e8
	MADD	t23, t23, a2, b3
kusano 2b45e8
	MADD	t33, t33, a3, b3
kusano 2b45e8
	MADD	t43, t43, a4, b3
kusano 2b45e8
kusano 2b45e8
	MADD	t14, t14, a1, b4
kusano 2b45e8
	MADD	t24, t24, a2, b4
kusano 2b45e8
	MADD	t34, t34, a3, b4
kusano 2b45e8
	MADD	t44, t44, a4, b4			#	first k step done
kusano 2b45e8
kusano 2b45e8
	LD	a1,  8 * SIZE(AO)					#	operands of the third k step
kusano 2b45e8
	LD	a2,  9 * SIZE(AO)
kusano 2b45e8
	LD	a3,  10 * SIZE(AO)
kusano 2b45e8
	LD	a4,  11 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  8 * SIZE(BO)
kusano 2b45e8
	LD	b2,  9 * SIZE(BO)
kusano 2b45e8
	LD	b3,  10 * SIZE(BO)
kusano 2b45e8
	LD	b4,  11 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b5
kusano 2b45e8
	MADD	t21, t21, a6, b5
kusano 2b45e8
	MADD	t31, t31, a7, b5
kusano 2b45e8
	MADD	t41, t41, a8, b5
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a5, b6
kusano 2b45e8
	MADD	t22, t22, a6, b6
kusano 2b45e8
	MADD	t32, t32, a7, b6
kusano 2b45e8
	MADD	t42, t42, a8, b6
kusano 2b45e8
kusano 2b45e8
	MADD	t13, t13, a5, b7
kusano 2b45e8
	MADD	t23, t23, a6, b7
kusano 2b45e8
	MADD	t33, t33, a7, b7
kusano 2b45e8
	MADD	t43, t43, a8, b7
kusano 2b45e8
kusano 2b45e8
	MADD	t14, t14, a5, b8
kusano 2b45e8
	MADD	t24, t24, a6, b8
kusano 2b45e8
	MADD	t34, t34, a7, b8
kusano 2b45e8
	MADD	t44, t44, a8, b8			#	second k step done
kusano 2b45e8
kusano 2b45e8
	LD	a5,  12 * SIZE(AO)					#	operands of the fourth k step
kusano 2b45e8
	LD	a6,  13 * SIZE(AO)
kusano 2b45e8
	LD	a7,  14 * SIZE(AO)
kusano 2b45e8
	LD	a8,  15 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b5,  12 * SIZE(BO)
kusano 2b45e8
	LD	b6,  13 * SIZE(BO)
kusano 2b45e8
	LD	b7,  14 * SIZE(BO)
kusano 2b45e8
	LD	b8,  15 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b1
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
	MADD	t31, t31, a3, b1
kusano 2b45e8
	MADD	t41, t41, a4, b1
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a1, b2
kusano 2b45e8
	MADD	t22, t22, a2, b2
kusano 2b45e8
	MADD	t32, t32, a3, b2
kusano 2b45e8
	MADD	t42, t42, a4, b2
kusano 2b45e8
kusano 2b45e8
	MADD	t13, t13, a1, b3
kusano 2b45e8
	MADD	t23, t23, a2, b3
kusano 2b45e8
	MADD	t33, t33, a3, b3
kusano 2b45e8
	MADD	t43, t43, a4, b3
kusano 2b45e8
kusano 2b45e8
	MADD	t14, t14, a1, b4
kusano 2b45e8
	MADD	t24, t24, a2, b4
kusano 2b45e8
	MADD	t34, t34, a3, b4
kusano 2b45e8
	MADD	t44, t44, a4, b4			#	third k step done
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO, 16 * SIZE			#	AO += 4mr*4kr	
kusano 2b45e8
	daddiu	BO, BO, 16 * SIZE			#	BO += 4nr*4kr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	preload the first step of the next pass (also consumed by .L16)
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)
kusano 2b45e8
	LD	a4,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
	LD	b4,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b5
kusano 2b45e8
	MADD	t21, t21, a6, b5
kusano 2b45e8
	MADD	t31, t31, a7, b5
kusano 2b45e8
	MADD	t41, t41, a8, b5
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a5, b6
kusano 2b45e8
	MADD	t22, t22, a6, b6
kusano 2b45e8
	MADD	t32, t32, a7, b6
kusano 2b45e8
	MADD	t42, t42, a8, b6
kusano 2b45e8
kusano 2b45e8
	MADD	t13, t13, a5, b7
kusano 2b45e8
	MADD	t23, t23, a6, b7
kusano 2b45e8
	MADD	t33, t33, a7, b7
kusano 2b45e8
	MADD	t43, t43, a8, b7
kusano 2b45e8
kusano 2b45e8
	MADD	t14, t14, a5, b8
kusano 2b45e8
	MADD	t24, t24, a6, b8
kusano 2b45e8
	MADD	t34, t34, a7, b8
kusano 2b45e8
	MADD	t44, t44, a8, b8			#	fourth k step done
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L12							#	repeat while L > 0
kusano 2b45e8
	NOP										#	branch delay slot
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
kusano 2b45e8
.L15:										# K remainder of the 4x4 tile
kusano 2b45e8
	andi	L, TEMP, 3						#	L = (K - KK) & 3
kusano 2b45e8
	blez	L, .L18							#	no remainder: go to the solve step
kusano 2b45e8
	NOP										#	branch delay slot
kusano 2b45e8
	.align	3
kusano 2b45e8
kusano 2b45e8
.L16:										# one k step per pass, using the operands already preloaded in a1-a4/b1-b4
kusano 2b45e8
	MADD	t11, t11, a1, b1				#	presumably tij += ai * bj (MADD is defined outside this file view)
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
	MADD	t31, t31, a3, b1
kusano 2b45e8
	MADD	t41, t41, a4, b1
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a1, b2
kusano 2b45e8
	MADD	t22, t22, a2, b2
kusano 2b45e8
	MADD	t32, t32, a3, b2
kusano 2b45e8
	MADD	t42, t42, a4, b2
kusano 2b45e8
kusano 2b45e8
	MADD	t13, t13, a1, b3
kusano 2b45e8
	MADD	t23, t23, a2, b3
kusano 2b45e8
	MADD	t33, t33, a3, b3
kusano 2b45e8
	MADD	t43, t43, a4, b3
kusano 2b45e8
kusano 2b45e8
	MADD	t14, t14, a1, b4
kusano 2b45e8
	MADD	t24, t24, a2, b4
kusano 2b45e8
	MADD	t34, t34, a3, b4
kusano 2b45e8
	MADD	t44, t44, a4, b4			#	one k step done
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO, 4 * SIZE			#	AO += 4mr	
kusano 2b45e8
	daddiu	BO, BO, 4 * SIZE			#	BO += 4nr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	preload operands for the next pass
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)
kusano 2b45e8
	LD	a4,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
	LD	b4,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L16							#	repeat while L > 0
kusano 2b45e8
	NOP										#	branch delay slot
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L18:										# solve step of the 4x4 tile (mr=4, nr=4)
kusano 2b45e8
	daddiu	TEMP, KK, -4				#	TEMP = KK - 4: index where the triangular data part starts
kusano 2b45e8
	dsll	L,    TEMP, 2 + BASE_SHIFT		#	offset into packed A, mr=4 (BASE_SHIFT presumably log2 of the element size)
kusano 2b45e8
	dsll	TEMP, TEMP, 2 + BASE_SHIFT		#	offset into packed B, nr=4
kusano 2b45e8
	daddu	AO, AORIG, L
kusano 2b45e8
	daddu	BO, B, TEMP					#	BO points to the triangular data part
kusano 2b45e8
kusano 2b45e8
 	LD	b1,  0 * SIZE(AO)				#	load the 16 values stored at AO; the accumulated sums are taken off them
kusano 2b45e8
	LD	b2,  1 * SIZE(AO)
kusano 2b45e8
	LD	b3,  2 * SIZE(AO)
kusano 2b45e8
	LD	b4,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	SUB	t11, b1, t11					#	presumably tij = loaded value - tij (SUB is defined outside this file view)
kusano 2b45e8
	SUB	t21, b2, t21
kusano 2b45e8
	SUB	t31, b3, t31
kusano 2b45e8
	SUB	t41, b4, t41
kusano 2b45e8
kusano 2b45e8
 	LD	b5,  4 * SIZE(AO)
kusano 2b45e8
	LD	b6,  5 * SIZE(AO)
kusano 2b45e8
	LD	b7,  6 * SIZE(AO)
kusano 2b45e8
	LD	b8,  7 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	SUB	t12, b5, t12
kusano 2b45e8
	SUB	t22, b6, t22
kusano 2b45e8
	SUB	t32, b7, t32
kusano 2b45e8
	SUB	t42, b8, t42
kusano 2b45e8
kusano 2b45e8
 	LD	b1,  8 * SIZE(AO)
kusano 2b45e8
	LD	b2,  9 * SIZE(AO)
kusano 2b45e8
	LD	b3, 10 * SIZE(AO)
kusano 2b45e8
	LD	b4, 11 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	SUB	t13, b1, t13
kusano 2b45e8
	SUB	t23, b2, t23
kusano 2b45e8
	SUB	t33, b3, t33
kusano 2b45e8
	SUB	t43, b4, t43
kusano 2b45e8
kusano 2b45e8
 	LD	b5, 12 * SIZE(AO)
kusano 2b45e8
	LD	b6, 13 * SIZE(AO)
kusano 2b45e8
	LD	b7, 14 * SIZE(AO)
kusano 2b45e8
	LD	b8, 15 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	SUB	t14, b5, t14
kusano 2b45e8
	SUB	t24, b6, t24
kusano 2b45e8
	SUB	t34, b7, t34
kusano 2b45e8
	SUB	t44, b8, t44
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b1, 15 * SIZE(BO)				#	row 3 of the 4x4 block at BO, read from entry (3,3) down to (3,0)
kusano 2b45e8
	LD	b2, 14 * SIZE(BO)
kusano 2b45e8
	LD	b3, 13 * SIZE(BO)
kusano 2b45e8
	LD	b4, 12 * SIZE(BO)
kusano 2b45e8
	MUL	t14, b1, t14					#	column 4 is scaled first; a multiply, not a divide - presumably the stored diagonal is already inverted, confirm against the packing code
kusano 2b45e8
	MUL	t24, b1, t24
kusano 2b45e8
	MUL	t34, b1, t34
kusano 2b45e8
	MUL	t44, b1, t44
kusano 2b45e8
	NMSUB	t13, t13, b2, t14			#	presumably column 3 -= b2 * column 4
kusano 2b45e8
	NMSUB	t23, t23, b2, t24
kusano 2b45e8
	NMSUB	t33, t33, b2, t34
kusano 2b45e8
	NMSUB	t43, t43, b2, t44
kusano 2b45e8
	NMSUB	t12, t12, b3, t14			#	presumably column 2 -= b3 * column 4
kusano 2b45e8
	NMSUB	t22, t22, b3, t24
kusano 2b45e8
	NMSUB	t32, t32, b3, t34
kusano 2b45e8
	NMSUB	t42, t42, b3, t44
kusano 2b45e8
	NMSUB	t11, t11, b4, t14			#	presumably column 1 -= b4 * column 4
kusano 2b45e8
	NMSUB	t21, t21, b4, t24
kusano 2b45e8
	NMSUB	t31, t31, b4, t34
kusano 2b45e8
	NMSUB	t41, t41, b4, t44
kusano 2b45e8
kusano 2b45e8
	
kusano 2b45e8
	LD	b5,  10 * SIZE(BO)				#	row 2 of the block: entries (2,2), (2,1), (2,0)
kusano 2b45e8
	LD	b6,   9 * SIZE(BO)
kusano 2b45e8
	LD	b7,   8 * SIZE(BO)
kusano 2b45e8
	MUL	t13, b5, t13					#	scale column 3
kusano 2b45e8
	MUL	t23, b5, t23
kusano 2b45e8
	MUL	t33, b5, t33
kusano 2b45e8
	MUL	t43, b5, t43
kusano 2b45e8
	NMSUB	t12, t12, b6, t13			#	presumably column 2 -= b6 * column 3
kusano 2b45e8
	NMSUB	t22, t22, b6, t23
kusano 2b45e8
	NMSUB	t32, t32, b6, t33
kusano 2b45e8
	NMSUB	t42, t42, b6, t43
kusano 2b45e8
	NMSUB	t11, t11, b7, t13			#	presumably column 1 -= b7 * column 3
kusano 2b45e8
	NMSUB	t21, t21, b7, t23
kusano 2b45e8
	NMSUB	t31, t31, b7, t33
kusano 2b45e8
	NMSUB	t41, t41, b7, t43
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b8,   5 * SIZE(BO)				#	row 1 of the block: entries (1,1), (1,0)
kusano 2b45e8
	LD	b1,   4 * SIZE(BO)
kusano 2b45e8
	MUL	t12, b8, t12					#	scale column 2
kusano 2b45e8
	MUL	t22, b8, t22
kusano 2b45e8
	MUL	t32, b8, t32
kusano 2b45e8
	MUL	t42, b8, t42
kusano 2b45e8
	NMSUB	t11, t11, b1, t12			#	presumably column 1 -= b1 * column 2
kusano 2b45e8
	NMSUB	t21, t21, b1, t22
kusano 2b45e8
	NMSUB	t31, t31, b1, t32
kusano 2b45e8
	NMSUB	t41, t41, b1, t42
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b2,   0 * SIZE(BO)				#	entry (0,0) of the block
kusano 2b45e8
	MUL	t11, b2, t11					#	scale column 1
kusano 2b45e8
	MUL	t21, b2, t21
kusano 2b45e8
	MUL	t31, b2, t31
kusano 2b45e8
	MUL	t41, b2, t41
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(AO)				#	update packed A with the solved values
kusano 2b45e8
	ST	t21,  1 * SIZE(AO)
kusano 2b45e8
	ST	t31,  2 * SIZE(AO)
kusano 2b45e8
	ST	t41,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	ST	t12,  4 * SIZE(AO)
kusano 2b45e8
	ST	t22,  5 * SIZE(AO)
kusano 2b45e8
	ST	t32,  6 * SIZE(AO)
kusano 2b45e8
	ST	t42,  7 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	ST	t13,  8 * SIZE(AO)
kusano 2b45e8
	ST	t23,  9 * SIZE(AO)
kusano 2b45e8
	ST	t33, 10 * SIZE(AO)
kusano 2b45e8
	ST	t43, 11 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	ST	t14, 12 * SIZE(AO)
kusano 2b45e8
	ST	t24, 13 * SIZE(AO)
kusano 2b45e8
	ST	t34, 14 * SIZE(AO)
kusano 2b45e8
	ST	t44, 15 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(CO1)				#	write back to C, one column per COn pointer
kusano 2b45e8
	ST	t21,  1 * SIZE(CO1)
kusano 2b45e8
	ST	t31,  2 * SIZE(CO1)
kusano 2b45e8
	ST	t41,  3 * SIZE(CO1)
kusano 2b45e8
	
kusano 2b45e8
	ST	t12,  0 * SIZE(CO2)
kusano 2b45e8
	ST	t22,  1 * SIZE(CO2)
kusano 2b45e8
	ST	t32,  2 * SIZE(CO2)
kusano 2b45e8
	ST	t42,  3 * SIZE(CO2)
kusano 2b45e8
	
kusano 2b45e8
	ST	t13,  0 * SIZE(CO3)
kusano 2b45e8
	ST	t23,  1 * SIZE(CO3)
kusano 2b45e8
	ST	t33,  2 * SIZE(CO3)
kusano 2b45e8
	ST	t43,  3 * SIZE(CO3)
kusano 2b45e8
	
kusano 2b45e8
	ST	t14,  0 * SIZE(CO4)
kusano 2b45e8
	ST	t24,  1 * SIZE(CO4)
kusano 2b45e8
	ST	t34,  2 * SIZE(CO4)
kusano 2b45e8
	ST	t44,  3 * SIZE(CO4)
kusano 2b45e8
kusano 2b45e8
	daddiu	CO1, CO1, 4 * SIZE			#	advance the C pointers by four elements
kusano 2b45e8
	daddiu	CO2, CO2, 4 * SIZE
kusano 2b45e8
	daddiu	CO3, CO3, 4 * SIZE
kusano 2b45e8
	daddiu	CO4, CO4, 4 * SIZE
kusano 2b45e8
kusano 2b45e8
	dsll	TEMP, K, 2 + BASE_SHIFT		#	mr=4
kusano 2b45e8
	daddu	AORIG, AORIG, TEMP			# move to next panel Ai
kusano 2b45e8
kusano 2b45e8
	daddiu	I, I, -1
kusano 2b45e8
	bgtz	I, .L11						#	next 4-row tile while I > 0
kusano 2b45e8
	NOP									#	branch delay slot
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L20:										# optional 2x4 tile: taken when bit 1 of M is set
kusano 2b45e8
	andi	I,  M, 2						#	mr=2
kusano 2b45e8
	blez	I, .L40							#	no 2-row tile: try the 1-row tile
kusano 2b45e8
	NOP										#	branch delay slot
kusano 2b45e8
kusano 2b45e8
	dsll	L,    KK, 1 + BASE_SHIFT		#	mr=2	
kusano 2b45e8
	dsll	TEMP, KK, 2 + BASE_SHIFT		#	nr=4
kusano 2b45e8
	daddu	AO, AORIG, L					
kusano 2b45e8
	daddu	BO, B, TEMP						#	BO points to the rectangular data part, also resets BO
kusano 2b45e8
	dsubu	TEMP, K, KK						#	TEMP = length of the rectangular data part
kusano 2b45e8
kusano 2b45e8
	MTC	$0,  t11							#	clear 8 result registers: t11 from integer zero, the rest copied from t11
kusano 2b45e8
	MOV	t21, t11
kusano 2b45e8
	MOV	t12, t11
kusano 2b45e8
	MOV	t22, t11
kusano 2b45e8
	MOV	t13, t11
kusano 2b45e8
	MOV	t23, t11
kusano 2b45e8
	MOV	t14, t11
kusano 2b45e8
	MOV	t24, t11
kusano 2b45e8
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	preload the first two A values
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)					#	preload the first four B values
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
	LD	b4,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	dsra	L,  TEMP, 2						#	L=(KC-offset)/4
kusano 2b45e8
	blez	L, .L25							#	fewer than 4 k steps: remainder handling only
kusano 2b45e8
	NOP										#	branch delay slot
kusano 2b45e8
	
kusano 2b45e8
	.align	3
kusano 2b45e8
kusano 2b45e8
.L22:										# 2x4 tile, four k steps per pass
kusano 2b45e8
	LD	a5,  2 * SIZE(AO)					#	operands of the second k step
kusano 2b45e8
	LD	a6,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b5,  4 * SIZE(BO)
kusano 2b45e8
	LD	b6,  5 * SIZE(BO)
kusano 2b45e8
	LD	b7,  6 * SIZE(BO)
kusano 2b45e8
	LD	b8,  7 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b1				#	first k step; presumably tij += ai * bj (MADD is defined outside this file view)
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a1, b2
kusano 2b45e8
	MADD	t22, t22, a2, b2
kusano 2b45e8
kusano 2b45e8
	MADD	t13, t13, a1, b3
kusano 2b45e8
	MADD	t23, t23, a2, b3
kusano 2b45e8
kusano 2b45e8
	MADD	t14, t14, a1, b4
kusano 2b45e8
	MADD	t24, t24, a2, b4
kusano 2b45e8
kusano 2b45e8
	LD	a3,  4 * SIZE(AO)					#	operands of the third k step
kusano 2b45e8
	LD	a4,  5 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  8 * SIZE(BO)
kusano 2b45e8
	LD	b2,  9 * SIZE(BO)
kusano 2b45e8
	LD	b3,  10 * SIZE(BO)
kusano 2b45e8
	LD	b4,  11 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b5				#	second k step
kusano 2b45e8
	MADD	t21, t21, a6, b5
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a5, b6
kusano 2b45e8
	MADD	t22, t22, a6, b6
kusano 2b45e8
kusano 2b45e8
	MADD	t13, t13, a5, b7
kusano 2b45e8
	MADD	t23, t23, a6, b7
kusano 2b45e8
kusano 2b45e8
	MADD	t14, t14, a5, b8
kusano 2b45e8
	MADD	t24, t24, a6, b8
kusano 2b45e8
kusano 2b45e8
	LD	a7,  6 * SIZE(AO)					#	operands of the fourth k step
kusano 2b45e8
	LD	a8,  7 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b5,  12 * SIZE(BO)
kusano 2b45e8
	LD	b6,  13 * SIZE(BO)
kusano 2b45e8
	LD	b7,  14 * SIZE(BO)
kusano 2b45e8
	LD	b8,  15 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a3, b1				#	third k step
kusano 2b45e8
	MADD	t21, t21, a4, b1
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a3, b2
kusano 2b45e8
	MADD	t22, t22, a4, b2
kusano 2b45e8
kusano 2b45e8
	MADD	t13, t13, a3, b3
kusano 2b45e8
	MADD	t23, t23, a4, b3
kusano 2b45e8
kusano 2b45e8
	MADD	t14, t14, a3, b4
kusano 2b45e8
	MADD	t24, t24, a4, b4
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO, 8 * SIZE			#	AO += 2mr*4kr	
kusano 2b45e8
	daddiu	BO, BO, 16 * SIZE			#	BO += 4nr*4kr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	preload the first step of the next pass (also consumed by .L26)
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
	LD	b4,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a7, b5				#	fourth k step
kusano 2b45e8
	MADD	t21, t21, a8, b5
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a7, b6
kusano 2b45e8
	MADD	t22, t22, a8, b6
kusano 2b45e8
kusano 2b45e8
	MADD	t13, t13, a7, b7
kusano 2b45e8
	MADD	t23, t23, a8, b7
kusano 2b45e8
kusano 2b45e8
	MADD	t14, t14, a7, b8
kusano 2b45e8
	MADD	t24, t24, a8, b8
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L22							#	repeat while L > 0
kusano 2b45e8
	NOP										#	branch delay slot
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
kusano 2b45e8
.L25:										# K remainder of the 2x4 tile
kusano 2b45e8
	andi	L, TEMP, 3						#	L = (K - KK) & 3
kusano 2b45e8
	blez	L, .L28							#	no remainder: go to the solve step
kusano 2b45e8
	NOP										#	branch delay slot
kusano 2b45e8
	.align	3
kusano 2b45e8
kusano 2b45e8
.L26:										# one k step per pass, using the operands already preloaded in a1-a2/b1-b4
kusano 2b45e8
	MADD	t11, t11, a1, b1				#	presumably tij += ai * bj (MADD is defined outside this file view)
kusano 2b45e8
	MADD	t21, t21, a2, b1
kusano 2b45e8
kusano 2b45e8
	MADD	t12, t12, a1, b2
kusano 2b45e8
	MADD	t22, t22, a2, b2
kusano 2b45e8
kusano 2b45e8
	MADD	t13, t13, a1, b3
kusano 2b45e8
	MADD	t23, t23, a2, b3
kusano 2b45e8
kusano 2b45e8
	MADD	t14, t14, a1, b4
kusano 2b45e8
	MADD	t24, t24, a2, b4
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO, 2 * SIZE			#	AO += 2mr	
kusano 2b45e8
	daddiu	BO, BO, 4 * SIZE			#	BO += 4nr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	preload operands for the next pass
kusano 2b45e8
	LD	a2,  1 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
	LD	b4,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L26							#	repeat while L > 0
kusano 2b45e8
	NOP										#	branch delay slot
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L28:										# solve step of the 2x4 tile (mr=2, nr=4)
kusano 2b45e8
	daddiu	TEMP, KK, -4				#	TEMP = KK - 4: index where the triangular data part starts
kusano 2b45e8
	dsll	L,    TEMP, 1 + BASE_SHIFT	#	offset into packed A, mr=2 (BASE_SHIFT presumably log2 of the element size)
kusano 2b45e8
	dsll	TEMP, TEMP, 2 + BASE_SHIFT	#	offset into packed B, nr=4
kusano 2b45e8
	daddu	AO, AORIG, L
kusano 2b45e8
	daddu	BO, B, TEMP					#	BO points to the triangular data part
kusano 2b45e8
kusano 2b45e8
 	LD	b1,  0 * SIZE(AO)				#	load the 8 values stored at AO; the accumulated sums are taken off them
kusano 2b45e8
	LD	b2,  1 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	SUB	t11, b1, t11					#	presumably tij = loaded value - tij (SUB is defined outside this file view)
kusano 2b45e8
	SUB	t21, b2, t21
kusano 2b45e8
kusano 2b45e8
 	LD	b5,  2 * SIZE(AO)
kusano 2b45e8
	LD	b6,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	SUB	t12, b5, t12
kusano 2b45e8
	SUB	t22, b6, t22
kusano 2b45e8
kusano 2b45e8
	LD	b3,  4 * SIZE(AO)
kusano 2b45e8
	LD	b4,  5 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	SUB	t13, b3, t13
kusano 2b45e8
	SUB	t23, b4, t23
kusano 2b45e8
kusano 2b45e8
	LD	b7,  6 * SIZE(AO)
kusano 2b45e8
	LD	b8,  7 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	SUB	t14, b7, t14
kusano 2b45e8
	SUB	t24, b8, t24
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b1, 15 * SIZE(BO)				#	row 3 of the 4x4 block at BO, read from entry (3,3) down to (3,0)
kusano 2b45e8
	LD	b2, 14 * SIZE(BO)
kusano 2b45e8
	LD	b3, 13 * SIZE(BO)
kusano 2b45e8
	LD	b4, 12 * SIZE(BO)
kusano 2b45e8
	MUL	t14, b1, t14					#	column 4 is scaled first; a multiply, not a divide - presumably the stored diagonal is already inverted, confirm against the packing code
kusano 2b45e8
	MUL	t24, b1, t24
kusano 2b45e8
	NMSUB	t13, t13, b2, t14			#	presumably column 3 -= b2 * column 4
kusano 2b45e8
	NMSUB	t23, t23, b2, t24
kusano 2b45e8
	NMSUB	t12, t12, b3, t14			#	presumably column 2 -= b3 * column 4
kusano 2b45e8
	NMSUB	t22, t22, b3, t24
kusano 2b45e8
	NMSUB	t11, t11, b4, t14			#	presumably column 1 -= b4 * column 4
kusano 2b45e8
	NMSUB	t21, t21, b4, t24
kusano 2b45e8
kusano 2b45e8
	
kusano 2b45e8
	LD	b5,  10 * SIZE(BO)				#	row 2 of the block: entries (2,2), (2,1), (2,0)
kusano 2b45e8
	LD	b6,   9 * SIZE(BO)
kusano 2b45e8
	LD	b7,   8 * SIZE(BO)
kusano 2b45e8
	MUL	t13, b5, t13					#	scale column 3
kusano 2b45e8
	MUL	t23, b5, t23
kusano 2b45e8
	NMSUB	t12, t12, b6, t13			#	presumably column 2 -= b6 * column 3
kusano 2b45e8
	NMSUB	t22, t22, b6, t23
kusano 2b45e8
	NMSUB	t11, t11, b7, t13			#	presumably column 1 -= b7 * column 3
kusano 2b45e8
	NMSUB	t21, t21, b7, t23
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b8,   5 * SIZE(BO)				#	row 1 of the block: entries (1,1), (1,0)
kusano 2b45e8
	LD	b1,   4 * SIZE(BO)
kusano 2b45e8
	MUL	t12, b8, t12					#	scale column 2
kusano 2b45e8
	MUL	t22, b8, t22
kusano 2b45e8
	NMSUB	t11, t11, b1, t12			#	presumably column 1 -= b1 * column 2
kusano 2b45e8
	NMSUB	t21, t21, b1, t22
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b2,   0 * SIZE(BO)				#	entry (0,0) of the block
kusano 2b45e8
	MUL	t11, b2, t11					#	scale column 1
kusano 2b45e8
	MUL	t21, b2, t21
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(AO)				#	update packed A with the solved values
kusano 2b45e8
	ST	t21,  1 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	ST	t12,  2 * SIZE(AO)
kusano 2b45e8
	ST	t22,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	ST	t13,  4 * SIZE(AO)
kusano 2b45e8
	ST	t23,  5 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	ST	t14,  6 * SIZE(AO)
kusano 2b45e8
	ST	t24,  7 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(CO1)				#	write back to C, one column per COn pointer
kusano 2b45e8
	ST	t21,  1 * SIZE(CO1)
kusano 2b45e8
	
kusano 2b45e8
	ST	t12,  0 * SIZE(CO2)
kusano 2b45e8
	ST	t22,  1 * SIZE(CO2)
kusano 2b45e8
	
kusano 2b45e8
	ST	t13,  0 * SIZE(CO3)
kusano 2b45e8
	ST	t23,  1 * SIZE(CO3)
kusano 2b45e8
	
kusano 2b45e8
	ST	t14,  0 * SIZE(CO4)
kusano 2b45e8
	ST	t24,  1 * SIZE(CO4)
kusano 2b45e8
kusano 2b45e8
	daddiu	CO1, CO1, 2 * SIZE			#	advance the C pointers by two elements
kusano 2b45e8
	daddiu	CO2, CO2, 2 * SIZE
kusano 2b45e8
	daddiu	CO3, CO3, 2 * SIZE
kusano 2b45e8
	daddiu	CO4, CO4, 2 * SIZE
kusano 2b45e8
kusano 2b45e8
	dsll	TEMP, K, 1 + BASE_SHIFT		#	mr=2
kusano 2b45e8
	daddu	AORIG, AORIG, TEMP			# 	move to next panel Ai
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L40:										# optional 1x4 tile: taken when bit 0 of M is set
kusano 2b45e8
	andi	I,  M, 1						#	mr=1
kusano 2b45e8
	blez	I, .L29							#	no 1-row tile: this panel is finished
kusano 2b45e8
	NOP										#	branch delay slot
kusano 2b45e8
kusano 2b45e8
	dsll	L,    KK, BASE_SHIFT			#	mr=1	
kusano 2b45e8
	dsll	TEMP, KK, 2 + BASE_SHIFT		#	nr=4
kusano 2b45e8
	daddu	AO, AORIG, L					
kusano 2b45e8
	daddu	BO, B, TEMP						#	BO points to the rectangular data part, also resets BO
kusano 2b45e8
	dsubu	TEMP, K, KK						#	TEMP = length of the rectangular data part
kusano 2b45e8
kusano 2b45e8
	MTC	$0,  t11							#	clear 4 result registers: t11 from integer zero, the rest copied from t11
kusano 2b45e8
	MOV	t12, t11
kusano 2b45e8
	MOV	t13, t11
kusano 2b45e8
	MOV	t14, t11
kusano 2b45e8
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	preload the first A value
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)					#	preload the first four B values
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
	LD	b4,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	dsra	L,  TEMP, 2						#	L=(KC-offset)/4
kusano 2b45e8
	blez	L, .L45							#	fewer than 4 k steps: remainder handling only
kusano 2b45e8
	NOP										#	branch delay slot
kusano 2b45e8
	
kusano 2b45e8
	.align	3
kusano 2b45e8
kusano 2b45e8
.L42:										# 1x4 tile, four k steps per pass
kusano 2b45e8
	LD	a5,  1 * SIZE(AO)					#	operands of the second k step
kusano 2b45e8
kusano 2b45e8
	LD	b5,  4 * SIZE(BO)
kusano 2b45e8
	LD	b6,  5 * SIZE(BO)
kusano 2b45e8
	LD	b7,  6 * SIZE(BO)
kusano 2b45e8
	LD	b8,  7 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a1, b1				#	first k step; presumably t1j += a * bj (MADD is defined outside this file view)
kusano 2b45e8
	MADD	t12, t12, a1, b2
kusano 2b45e8
	MADD	t13, t13, a1, b3
kusano 2b45e8
	MADD	t14, t14, a1, b4
kusano 2b45e8
kusano 2b45e8
	LD	a3,  2 * SIZE(AO)					#	operands of the third k step
kusano 2b45e8
kusano 2b45e8
	LD	b1,  8 * SIZE(BO)
kusano 2b45e8
	LD	b2,  9 * SIZE(BO)
kusano 2b45e8
	LD	b3,  10 * SIZE(BO)
kusano 2b45e8
	LD	b4,  11 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a5, b5				#	second k step
kusano 2b45e8
	MADD	t12, t12, a5, b6
kusano 2b45e8
	MADD	t13, t13, a5, b7
kusano 2b45e8
	MADD	t14, t14, a5, b8
kusano 2b45e8
kusano 2b45e8
	LD	a7,  3 * SIZE(AO)					#	operands of the fourth k step
kusano 2b45e8
kusano 2b45e8
	LD	b5,  12 * SIZE(BO)
kusano 2b45e8
	LD	b6,  13 * SIZE(BO)
kusano 2b45e8
	LD	b7,  14 * SIZE(BO)
kusano 2b45e8
	LD	b8,  15 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a3, b1				#	third k step
kusano 2b45e8
	MADD	t12, t12, a3, b2
kusano 2b45e8
	MADD	t13, t13, a3, b3
kusano 2b45e8
	MADD	t14, t14, a3, b4
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO, 4 * SIZE			#	AO += 1mr*4kr	
kusano 2b45e8
	daddiu	BO, BO, 16 * SIZE			#	BO += 4nr*4kr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	preload the first step of the next pass (also consumed by .L46)
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
	LD	b4,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MADD	t11, t11, a7, b5				#	fourth k step
kusano 2b45e8
	MADD	t12, t12, a7, b6
kusano 2b45e8
	MADD	t13, t13, a7, b7
kusano 2b45e8
	MADD	t14, t14, a7, b8
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L42							#	repeat while L > 0
kusano 2b45e8
	NOP										#	branch delay slot
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
kusano 2b45e8
.L45:										# K remainder of the 1x4 tile
kusano 2b45e8
	andi	L, TEMP, 3						#	L = (K - KK) & 3
kusano 2b45e8
	blez	L, .L48							#	no remainder: go to the solve step
kusano 2b45e8
	NOP										#	branch delay slot
kusano 2b45e8
	.align	3
kusano 2b45e8
kusano 2b45e8
.L46:										# one k step per pass, using the operands already preloaded in a1/b1-b4
kusano 2b45e8
	MADD	t11, t11, a1, b1				#	presumably t1j += a1 * bj (MADD is defined outside this file view)
kusano 2b45e8
	MADD	t12, t12, a1, b2
kusano 2b45e8
	MADD	t13, t13, a1, b3
kusano 2b45e8
	MADD	t14, t14, a1, b4
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO, 1 * SIZE			#	AO += 1mr
kusano 2b45e8
	daddiu	BO, BO, 4 * SIZE			#	BO += 4nr
kusano 2b45e8
	
kusano 2b45e8
	LD	a1,  0 * SIZE(AO)					#	preload operands for the next pass
kusano 2b45e8
kusano 2b45e8
	LD	b1,  0 * SIZE(BO)
kusano 2b45e8
	LD	b2,  1 * SIZE(BO)
kusano 2b45e8
	LD	b3,  2 * SIZE(BO)
kusano 2b45e8
	LD	b4,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	bgtz	L, .L46							#	repeat while L > 0
kusano 2b45e8
	NOP										#	branch delay slot
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L48:										# solve step of the 1x4 tile (mr=1, nr=4)
kusano 2b45e8
	daddiu	TEMP, KK, -4				#	TEMP = KK - 4: index where the triangular data part starts
kusano 2b45e8
	dsll	L,    TEMP, BASE_SHIFT		#	offset into packed A, mr=1 (BASE_SHIFT presumably log2 of the element size)
kusano 2b45e8
	dsll	TEMP, TEMP, 2 + BASE_SHIFT	#	offset into packed B, nr=4
kusano 2b45e8
	daddu	AO, AORIG, L
kusano 2b45e8
	daddu	BO, B, TEMP					#	BO points to the triangular data part
kusano 2b45e8
kusano 2b45e8
 	LD	b1,  0 * SIZE(AO)				#	load the 4 values stored at AO; the accumulated sums are taken off them
kusano 2b45e8
 	LD	b5,  1 * SIZE(AO)
kusano 2b45e8
	LD	b3,  2 * SIZE(AO)
kusano 2b45e8
	LD	b7,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	SUB	t11, b1, t11					#	presumably t1j = loaded value - t1j (SUB is defined outside this file view)
kusano 2b45e8
	SUB	t12, b5, t12
kusano 2b45e8
	SUB	t13, b3, t13
kusano 2b45e8
	SUB	t14, b7, t14
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b1, 15 * SIZE(BO)				#	row 3 of the 4x4 block at BO, read from entry (3,3) down to (3,0)
kusano 2b45e8
	LD	b2, 14 * SIZE(BO)
kusano 2b45e8
	LD	b3, 13 * SIZE(BO)
kusano 2b45e8
	LD	b4, 12 * SIZE(BO)
kusano 2b45e8
	MUL	t14, b1, t14					#	column 4 is scaled first; a multiply, not a divide - presumably the stored diagonal is already inverted, confirm against the packing code
kusano 2b45e8
	NMSUB	t13, t13, b2, t14			#	presumably t13 -= b2 * t14
kusano 2b45e8
	NMSUB	t12, t12, b3, t14			#	presumably t12 -= b3 * t14
kusano 2b45e8
	NMSUB	t11, t11, b4, t14			#	presumably t11 -= b4 * t14
kusano 2b45e8
kusano 2b45e8
	
kusano 2b45e8
	LD	b5,  10 * SIZE(BO)				#	row 2 of the block: entries (2,2), (2,1), (2,0)
kusano 2b45e8
	LD	b6,   9 * SIZE(BO)
kusano 2b45e8
	LD	b7,   8 * SIZE(BO)
kusano 2b45e8
	MUL	t13, b5, t13					#	scale column 3
kusano 2b45e8
	NMSUB	t12, t12, b6, t13			#	presumably t12 -= b6 * t13
kusano 2b45e8
	NMSUB	t11, t11, b7, t13			#	presumably t11 -= b7 * t13
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b8,   5 * SIZE(BO)				#	row 1 of the block: entries (1,1), (1,0)
kusano 2b45e8
	LD	b1,   4 * SIZE(BO)
kusano 2b45e8
	MUL	t12, b8, t12					#	scale column 2
kusano 2b45e8
	NMSUB	t11, t11, b1, t12			#	presumably t11 -= b1 * t12
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	LD	b2,   0 * SIZE(BO)				#	entry (0,0) of the block
kusano 2b45e8
	MUL	t11, b2, t11					#	scale column 1
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(AO)				#	update packed A with the solved values
kusano 2b45e8
	ST	t12,  1 * SIZE(AO)
kusano 2b45e8
	ST	t13,  2 * SIZE(AO)
kusano 2b45e8
	ST	t14,  3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(CO1)				#	write back to C, one column per COn pointer
kusano 2b45e8
	ST	t12,  0 * SIZE(CO2)
kusano 2b45e8
	ST	t13,  0 * SIZE(CO3)
kusano 2b45e8
	ST	t14,  0 * SIZE(CO4)
kusano 2b45e8
kusano 2b45e8
	daddiu	CO1, CO1, 1 * SIZE			#	advance the C pointers by one element
kusano 2b45e8
	daddiu	CO2, CO2, 1 * SIZE
kusano 2b45e8
	daddiu	CO3, CO3, 1 * SIZE
kusano 2b45e8
	daddiu	CO4, CO4, 1 * SIZE
kusano 2b45e8
kusano 2b45e8
	dsll	TEMP, K, BASE_SHIFT			#	mr=1
kusano 2b45e8
	daddu	AORIG, AORIG, TEMP			# 	move to next panel Ai
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L29:										# end of one 4-column panel
kusano 2b45e8
	daddiu	KK, KK, -4					#	KK -= 4; tiles compute K - KK, so the rectangular data part grows by 4
kusano 2b45e8
	bgtz	J, .L10						#	J was already decremented at .L10; loop while panels remain
kusano 2b45e8
	NOP									#	branch delay slot
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L999:										# epilogue: reload saved registers from the stack frame and return
kusano 2b45e8
	LDARG	$16,   0($sp)
kusano 2b45e8
	LDARG	$17,   8($sp)
kusano 2b45e8
	LDARG	$18,  16($sp)
kusano 2b45e8
	LDARG	$19,  24($sp)
kusano 2b45e8
	LDARG	$20,  32($sp)
kusano 2b45e8
	LDARG	$21,  40($sp)
kusano 2b45e8
	ldc1	$f24, 48($sp)
kusano 2b45e8
	ldc1	$f25, 56($sp)
kusano 2b45e8
	ldc1	$f26, 64($sp)
kusano 2b45e8
	ldc1	$f27, 72($sp)
kusano 2b45e8
	ldc1	$f28, 80($sp)				#	NOTE(review): a6-a8 ($f29-$f31) are written by the kernel loops but not reloaded here; confirm whether the target ABI treats them as callee-saved
kusano 2b45e8
kusano 2b45e8
	LDARG	$22,  88($sp)
kusano 2b45e8
	LDARG	$23,  96($sp)
kusano 2b45e8
	LDARG	$24, 104($sp)
kusano 2b45e8
	LDARG	$25, 112($sp)
kusano 2b45e8
kusano 2b45e8
#ifndef __64BIT__
kusano 2b45e8
	ldc1	$f20,112($sp)				#	NOTE(review): offset 112 is also used for $25 just above; confirm the slot layout against the prologue
kusano 2b45e8
	ldc1	$f21,120($sp)
kusano 2b45e8
	ldc1	$f22,128($sp)
kusano 2b45e8
	ldc1	$f23,136($sp)
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
	j	$31								#	return to the caller
kusano 2b45e8
	daddiu	$sp, $sp, 144				#	release the 144-byte frame in the jump delay slot
kusano 2b45e8
kusano 2b45e8
	EPILOGUE