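/* MIPS64 assembly kernel. This comment replaces page-scrape residue ("Blob Blame Raw") that is not valid assembler input. */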
#define REALNAME ASMNAME

#define ASSEMBLER
#include "common.h"


/* Integer arguments. M and K are multiplied to find the end of the packed A
   panel, N is split into column panels of 4, 2 and 1, and LDC is scaled by
   BASE_SHIFT before being used as the column stride of C. */
#define M	$4
#define	N	$5
#define	K	$6
#define A	$8
#define B	$9
#define C	$10
#define LDC	$11

/* Running read pointers into the packed A and packed B panels. */
#define AO	$12
#define BO	$13

/* Loop counters: I counts row tiles, J counts column panels, L counts k steps. */
#define I	$2
#define J	$3
#define L	$7

/* Write-back pointers, one per column of the current column panel of C. */
#define CO1	$14
#define CO2	$15
#define CO3	$16
#define CO4	$17

/* OFFSET is loaded from the stack at 144($sp); KK starts at M + OFFSET and is
   decremented by the tile height after each tile; TEMP is scratch; AORIG is
   the base address of the current row tile inside packed A. */
#define OFFSET	$22
#define KK	$23
#define TEMP	$24
#define AORIG	$25

/* Operands loaded from packed A. */
#define a1	$f0
#define a2	$f1
#define a3	$f2
#define a4	$f3
#define a5	$f4
#define a6	$f5
#define a7	$f6
#define	a8	$f7

/* Operands loaded from packed B; also reused as temporaries for the
   triangular coefficients read from A during the solve step. */
#define b1	$f8
#define b2	$f9
#define b3	$f10
#define b4	$f11
#define b5	$f12
#define b6	$f13
#define b7	$f14
#define b8	$f15

/* Accumulators t<row><col>: column 1 is written to CO1 at offsets 0..3. */
#define t11	$f16
#define t21	$f17
#define t31	$f18
#define t41	$f19

/* Column 2, written to CO2. */
#define t12	$f20
#define t22	$f21
#define t32	$f22
#define t42	$f23

/* Column 3, written to CO3. */
#define t13	$f24
#define t23	$f25
#define t33	$f26
#define t43	$f27

/* Column 4, written to CO4. */
#define t14	$f28
#define t24	$f29
#define t34	$f30
#define t44	$f31

/* Same register as b8; not referenced anywhere in the visible code. */
#define ALPHA	$f15

	#	Entry sequence: build a 144-byte frame, save registers, fetch OFFSET from
	#	the stack, then move A and C past their last row (rows are walked bottom-up).
	PROLOGUE
	
	daddiu	$sp, $sp, -144

	SDARG	$16,   0($sp)
	SDARG	$17,   8($sp)
	SDARG	$18,  16($sp)
	SDARG	$19,  24($sp)
	SDARG	$20,  32($sp)
	SDARG	$21,  40($sp)
	sdc1	$f24, 48($sp)
	sdc1	$f25, 56($sp)
	sdc1	$f26, 64($sp)
	sdc1	$f27, 72($sp)
	sdc1	$f28, 80($sp)
	#	NOTE(review): t24/t34/t44 ($f29-$f31) are written later in this kernel but
	#	are not saved here; confirm against the target ABI whether they must be preserved.

	SDARG	$22,  88($sp)
	SDARG	$23,  96($sp)
	SDARG	$24, 104($sp)
	SDARG	$25, 112($sp)

#ifndef __64BIT__
	#	NOTE(review): the $f20 slot (112) is the same offset as the $25 slot just above; confirm this overlap is intended.
	sdc1	$f20,112($sp)
	sdc1	$f21,120($sp)
	sdc1	$f22,128($sp)
	sdc1	$f23,136($sp)
#endif
											#	LN compute from bottom to top
	LDARG	OFFSET, 144($sp)					#	first word above the 144-byte frame
	dsll	LDC, LDC, BASE_SHIFT			#	ldc in bytes

	mult	M, K							#	NOTE(review): 32-bit multiply, only LO is read; presumably M*K fits, confirm
	mflo	TEMP							#	TEMP=MC*KC

	dsll	TEMP, TEMP, BASE_SHIFT			
	daddu	A, A, TEMP						#	A move to the end of sa

	dsll	TEMP, M, BASE_SHIFT
	daddu	C, C, TEMP						#	C+=MC

	dsra	J,  N, 2						#	j = nc/4
	blez	J, .L30							#	no four-column panel: go to the nr=2 case
	nop

.L10:										#	nr=4: start of one four-column panel
	daddiu	J, J, -1
	move	CO1, C							#	CO1..CO4 = the four output columns, LDC apart
	daddu	CO2, C,   LDC
	daddu	CO3, CO2, LDC
	daddu	CO4, CO3, LDC

	MTC	$0,  t11							#	clear result registers
	MOV	t21, t11
	MOV	t31, t11
	MOV	t41, t11
	MOV	t12, t11
	MOV	t22, t11
	MOV	t32, t11
	MOV	t42, t11

	daddu	KK, M, OFFSET					#	kc - kk is the length of the rectangular data part of panel Ai 
	move	AORIG, A						#	reset A
	daddu	C,  CO4, LDC					#	fixed pointer C, the write back address

	andi	I,  M, 1						#	odd M: a single-row tile (mr=1,nr=4) is handled first
	blez	I, .L50							#	even M: skip to the mr=2 test
	nop

	#	Tile mr=1, nr=4: one row of A against a four-column panel of packed B.
	#	Accumulate K-KK product terms first, then subtract from B and scale (.L58).
	dsll	TEMP,   K, BASE_SHIFT			#	mr=1
	dsubu	AORIG, AORIG, TEMP				#	AORIG point to the beginning address of Ai

	dsll	L,    KK, BASE_SHIFT			#	mr=1	
	dsll	TEMP, KK, 2 + BASE_SHIFT		#	nr=4

	daddu	AO, AORIG, L					#	AO point to the rectangular data part
	daddu	BO, B,     TEMP

	dsubu	TEMP, K, KK						#	TEMP = number of k steps to accumulate

	MOV	t13, t11							#	clear the column 3 and 4 accumulators (t11 is zero here)
	MOV	t23, t11
	MOV	t33, t11
	MOV	t43, t11
	MOV	t14, t11
	MOV	t24, t11
	MOV	t34, t11
	MOV	t44, t11

	LD	a1,  0 * SIZE(AO)					#	this part compute the rectangular data part of Ai

	LD	b1,  0 * SIZE(BO)					#	get 4b
	LD	b2,  1 * SIZE(BO)
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)				

	dsra	L,  TEMP, 2						#	L = TEMP/4 passes of the four-way unrolled loop
	blez	L, .L55
	nop


	.align	3
.L52:										#	four k steps per pass; loads for the next step overlap the current MADDs
	LD	a5,  1 * SIZE(AO)					

	LD	b5,  4 * SIZE(BO)
	LD	b6,  5 * SIZE(BO)
	LD	b7,  6 * SIZE(BO)
	LD	b8,  7 * SIZE(BO)

	MADD	t11, t11, a1, b1				#	1st compute
	MADD	t12, t12, a1, b2
	MADD	t13, t13, a1, b3
	MADD	t14, t14, a1, b4

	LD	a3,   2 * SIZE(AO)
	LD	b1,   8 * SIZE(BO)
	LD	b2,   9 * SIZE(BO)
	LD	b3,  10 * SIZE(BO)
	LD	b4,  11 * SIZE(BO)

	MADD	t11, t11, a5, b5				#	2nd compute
	MADD	t12, t12, a5, b6
	MADD	t13, t13, a5, b7
	MADD	t14, t14, a5, b8

	LD	a7,   3 * SIZE(AO)
	LD	b5,  12 * SIZE(BO)
	LD	b6,  13 * SIZE(BO)
	LD	b7,  14 * SIZE(BO)
	LD	b8,  15 * SIZE(BO)

	MADD	t11, t11, a3, b1				#	3rd compute
	MADD	t12, t12, a3, b2
	MADD	t13, t13, a3, b3
	MADD	t14, t14, a3, b4

	daddiu	AO, AO,  4 * SIZE				#	AO += 1mr*4kr	
	daddiu	BO, BO, 16 * SIZE				#	BO += 4nr*4kr
	
	LD	a1,  0 * SIZE(AO)					#	next 
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)

	MADD	t11, t11, a7, b5				#	4th compute
	MADD	t12, t12, a7, b6
	MADD	t13, t13, a7, b7
	MADD	t14, t14, a7, b8

	daddiu	L, L, -1
	bgtz	L, .L52
	nop


	.align 3
.L55:
	andi	L, TEMP, 3						#	leftover k steps (TEMP mod 4)
	blez	L, .L58
	nop
	
	.align	3
.L56:
	MADD	t11, t11, a1, b1				#	remainder: one k step per pass
	MADD	t12, t12, a1, b2
	MADD	t13, t13, a1, b3
	MADD	t14, t14, a1, b4

	daddiu	AO, AO,  1 * SIZE				#	AO += 1mr	
	daddiu	BO, BO,  4 * SIZE				#	BO += 4nr
	
	LD	a1,  0 * SIZE(AO)					#	next 
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)

	daddiu	L, L, -1
	bgtz	L, .L56
	nop


.L58:										#	deal with the triangular part
	daddiu	TEMP, KK, -1
	dsll	L,    TEMP, BASE_SHIFT			#	mr=1
	dsll	TEMP, TEMP, 2 + BASE_SHIFT
	daddu	AO, AORIG, L					#	Ao point to the triangular data part
	daddu	BO, B,     TEMP

	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)

	SUB	t11, b1, t11						#	right-hand side minus the accumulated products
	SUB	t12, b2, t12
	SUB	t13, b3, t13
	SUB	t14, b4, t14


	LD	b3,  0 * SIZE(AO)					#	NOTE(review): results are multiplied by this entry; presumably the packed diagonal holds a reciprocal, confirm
	MUL	t11, b3, t11
	MUL	t12, b3, t12
	MUL	t13, b3, t13
	MUL	t14, b3, t14

	daddiu	CO1, CO1, -1 * SIZE				#	step the output pointers back one row
	daddiu	CO2, CO2, -1 * SIZE
	daddiu	CO3, CO3, -1 * SIZE
	daddiu	CO4, CO4, -1 * SIZE

	ST	t11,  0 * SIZE(BO)					#	update packed B with the solved values
	ST	t12,  1 * SIZE(BO)
	ST	t13,  2 * SIZE(BO)
	ST	t14,  3 * SIZE(BO)

	ST	t11,  0 * SIZE(CO1)					#	write back to C
	ST	t12,  0 * SIZE(CO2)
	ST	t13,  0 * SIZE(CO3)
	ST	t14,  0 * SIZE(CO4)


	daddiu	KK, KK, -1							#	the length of rectangular data part increases by 1
	MTC	$0,  t11							#	clear result registers
	MOV	t21, t11
	MOV	t31, t11
	MOV	t41, t11
	MOV	t12, t11
	MOV	t22, t11
	MOV	t32, t11
	MOV	t42, t11


	
.L50:	
	#	Tile mr=2, nr=4: two rows of A against a four-column panel of packed B.
	andi	I,  M, 2						#	mr=2,nr=4	
	blez	I, .L20							#	bit 1 of M clear: no two-row tile
	nop

	dsll	TEMP,   K, 1 + BASE_SHIFT
	dsubu	AORIG, AORIG, TEMP				#	AORIG point to the beginning address of Ai

	dsll	L,    KK, 1 + BASE_SHIFT		
	dsll	TEMP, KK, 2 + BASE_SHIFT

	daddu	AO, AORIG, L					#	AO point to the rectangular data part
	daddu	BO, B,     TEMP

	dsubu	TEMP, K, KK						#	TEMP = number of k steps to accumulate

	MOV	t13, t11							#	clear the column 3 and 4 accumulators (t11 is zero here)
	MOV	t23, t11
	MOV	t33, t11
	MOV	t43, t11
	MOV	t14, t11
	MOV	t24, t11
	MOV	t34, t11
	MOV	t44, t11

	LD	a1,  0 * SIZE(AO)					#	this part compute the rectangular data part of Ai
	LD	a2,  1 * SIZE(AO)					#	mr*KK with nr*KK

	LD	b1,  0 * SIZE(BO)					#	get 4b
	LD	b2,  1 * SIZE(BO)
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)				

	dsra	L,  TEMP, 2						#	L = TEMP/4 passes of the four-way unrolled loop
	blez	L, .L25
	nop


	.align	3
.L22:										#	four k steps per pass
	LD	a5,  2 * SIZE(AO)					
	LD	a6,  3 * SIZE(AO)
	LD	b5,  4 * SIZE(BO)
	LD	b6,  5 * SIZE(BO)
	LD	b7,  6 * SIZE(BO)
	LD	b8,  7 * SIZE(BO)

	MADD	t11, t11, a1, b1				#	1st compute
	MADD	t21, t21, a2, b1
	MADD	t12, t12, a1, b2
	MADD	t22, t22, a2, b2
	MADD	t13, t13, a1, b3
	MADD	t23, t23, a2, b3
	MADD	t14, t14, a1, b4
	MADD	t24, t24, a2, b4

	LD	a3,   4 * SIZE(AO)
	LD	a4,   5 * SIZE(AO)
	LD	b1,   8 * SIZE(BO)
	LD	b2,   9 * SIZE(BO)
	LD	b3,  10 * SIZE(BO)
	LD	b4,  11 * SIZE(BO)

	MADD	t11, t11, a5, b5				#	2nd compute
	MADD	t21, t21, a6, b5
	MADD	t12, t12, a5, b6
	MADD	t22, t22, a6, b6
	MADD	t13, t13, a5, b7
	MADD	t23, t23, a6, b7
	MADD	t14, t14, a5, b8
	MADD	t24, t24, a6, b8

	LD	a7,   6 * SIZE(AO)
	LD	a8,   7 * SIZE(AO)
	LD	b5,  12 * SIZE(BO)
	LD	b6,  13 * SIZE(BO)
	LD	b7,  14 * SIZE(BO)
	LD	b8,  15 * SIZE(BO)

	MADD	t11, t11, a3, b1				#	3rd compute
	MADD	t21, t21, a4, b1
	MADD	t12, t12, a3, b2
	MADD	t22, t22, a4, b2
	MADD	t13, t13, a3, b3
	MADD	t23, t23, a4, b3
	MADD	t14, t14, a3, b4
	MADD	t24, t24, a4, b4

	daddiu	AO, AO,  8 * SIZE				#	AO += 2mr*4kr	
	daddiu	BO, BO, 16 * SIZE				#	BO += 4nr*4kr
	
	LD	a1,  0 * SIZE(AO)					#	next 
	LD	a2,  1 * SIZE(AO)
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)

	MADD	t11, t11, a7, b5				#	4th compute
	MADD	t21, t21, a8, b5
	MADD	t12, t12, a7, b6
	MADD	t22, t22, a8, b6
	MADD	t13, t13, a7, b7
	MADD	t23, t23, a8, b7
	MADD	t14, t14, a7, b8
	MADD	t24, t24, a8, b8

	daddiu	L, L, -1
	bgtz	L, .L22
	nop


	.align 3
.L25:
	andi	L, TEMP, 3						#	leftover k steps (TEMP mod 4)
	blez	L, .L28
	nop
	
	.align	3
.L26:
	MADD	t11, t11, a1, b1				#	remainder: one k step per pass
	MADD	t21, t21, a2, b1
	MADD	t12, t12, a1, b2
	MADD	t22, t22, a2, b2
	MADD	t13, t13, a1, b3
	MADD	t23, t23, a2, b3
	MADD	t14, t14, a1, b4
	MADD	t24, t24, a2, b4

	daddiu	AO, AO,  2 * SIZE				#	AO += 2mr	
	daddiu	BO, BO,  4 * SIZE				#	BO += 4nr
	
	LD	a1,  0 * SIZE(AO)					#	next 
	LD	a2,  1 * SIZE(AO)
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)

	daddiu	L, L, -1
	bgtz	L, .L26
	nop


.L28:										#	deal with the triangular part
	daddiu	TEMP, KK, -2
	dsll	L,    TEMP, 1 + BASE_SHIFT
	dsll	TEMP, TEMP, 2 + BASE_SHIFT
	daddu	AO, AORIG, L					#	Ao point to the triangular data part
	daddu	BO, B,     TEMP

	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)
	LD	b5,  4 * SIZE(BO)
	LD	b6,  5 * SIZE(BO)
	LD	b7,  6 * SIZE(BO)
	LD	b8,  7 * SIZE(BO)

	SUB	t11, b1, t11						#	right-hand side minus the accumulated products
	SUB	t12, b2, t12
	SUB	t13, b3, t13
	SUB	t14, b4, t14
	SUB	t21, b5, t21
	SUB	t22, b6, t22
	SUB	t23, b7, t23
	SUB	t24, b8, t24


	LD	b1,  3 * SIZE(AO)				#	computes the triangular_part		
	LD	b2,  2 * SIZE(AO)
	MUL	t21, b1, t21					#	second row is solved first (bottom-up)
	MUL	t22, b1, t22
	MUL	t23, b1, t23
	MUL	t24, b1, t24
	NMSUB	t11, t11, b2, t21			#	presumably t11 -= b2*t21 (NMSUB comes from common.h; confirm)
	NMSUB	t12, t12, b2, t22
	NMSUB	t13, t13, b2, t23
	NMSUB	t14, t14, b2, t24
	
	LD	b3,  0 * SIZE(AO)
	MUL	t11, b3, t11					#	then the first row
	MUL	t12, b3, t12
	MUL	t13, b3, t13
	MUL	t14, b3, t14

	daddiu	CO1, CO1, -2 * SIZE			#	step the output pointers back two rows
	daddiu	CO2, CO2, -2 * SIZE
	daddiu	CO3, CO3, -2 * SIZE
	daddiu	CO4, CO4, -2 * SIZE

	ST	t11,  0 * SIZE(BO)				#	update packed B with the solved values
	ST	t12,  1 * SIZE(BO)
	ST	t13,  2 * SIZE(BO)
	ST	t14,  3 * SIZE(BO)
	ST	t21,  4 * SIZE(BO)
	ST	t22,  5 * SIZE(BO)
	ST	t23,  6 * SIZE(BO)
	ST	t24,  7 * SIZE(BO)

	ST	t11,  0 * SIZE(CO1)				#	write back to C
	ST	t21,  1 * SIZE(CO1)
	ST	t12,  0 * SIZE(CO2)
	ST	t22,  1 * SIZE(CO2)
	ST	t13,  0 * SIZE(CO3)
	ST	t23,  1 * SIZE(CO3)
	ST	t14,  0 * SIZE(CO4)
	ST	t24,  1 * SIZE(CO4)



	daddiu	KK, KK, -2							#	the length of rectangular data part increases by 2
	MTC	$0,  t11							#	clear result registers
	MOV	t21, t11
	MOV	t31, t11
	MOV	t41, t11
	MOV	t12, t11
	MOV	t22, t11
	MOV	t32, t11
	MOV	t42, t11


.L20:
	#	Tiles mr=4, nr=4: loop .L11 runs M/4 times, each pass handling four rows.
	dsra	I,  M, 2						#	I=MC/4
	blez	I, .L29
	nop

.L11:										#	mr=4
	dsll	TEMP,   K,  2 + BASE_SHIFT		#	TEMP=KC*MR*data_Byte
	dsubu	AORIG, AORIG, TEMP				#	AORIG point to the beginning address of panel Ai
	dsll	L,    KK, 2 + BASE_SHIFT		# 	KC-KK is the length of the rectangular data part of Ai	
	dsll	TEMP, KK, 2 + BASE_SHIFT		#	KK*NR*data_Byte

	daddu	AO, AORIG, L					#	AO point to the rectangular data part
	daddu	BO, B, TEMP

	dsubu	TEMP, K, KK	

	LD	a1,  0 * SIZE(AO)					#	this part compute the rectangular data part of Ai
	LD	a2,  1 * SIZE(AO)					#	mr*KK with nr*KK
	LD	a3,  2 * SIZE(AO)
	LD	a4,  3 * SIZE(AO)					#	get 4a

	LD	b1,  0 * SIZE(BO)					#	get 4b
	LD	b2,  1 * SIZE(BO)
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)				

	MOV	t13, t11							#	clear result registers
	MOV	t23, t11
	MOV	t33, t11
	MOV	t43, t11
	MOV	t14, t11
	MOV	t24, t11
	MOV	t34, t11
	MOV	t44, t11

	dsra	L,  TEMP, 2						#	L=(KC-KK)/4
	blez	L, .L15
	nop

	.align	3
.L12:										#	four k steps per pass, 16 MADDs per step
	LD	a5,  4 * SIZE(AO)					
	LD	a6,  5 * SIZE(AO)
	LD	a7,  6 * SIZE(AO)
	LD	a8,  7 * SIZE(AO)

	LD	b5,  4 * SIZE(BO)
	LD	b6,  5 * SIZE(BO)
	LD	b7,  6 * SIZE(BO)
	LD	b8,  7 * SIZE(BO)

	MADD	t11, t11, a1, b1				#	1st compute
	MADD	t21, t21, a2, b1
	MADD	t31, t31, a3, b1
	MADD	t41, t41, a4, b1

	MADD	t12, t12, a1, b2
	MADD	t22, t22, a2, b2
	MADD	t32, t32, a3, b2
	MADD	t42, t42, a4, b2

	MADD	t13, t13, a1, b3
	MADD	t23, t23, a2, b3
	MADD	t33, t33, a3, b3
	MADD	t43, t43, a4, b3

	MADD	t14, t14, a1, b4
	MADD	t24, t24, a2, b4
	MADD	t34, t34, a3, b4
	MADD	t44, t44, a4, b4		

	LD	a1,   8 * SIZE(AO)
	LD	a2,   9 * SIZE(AO)
	LD	a3,  10 * SIZE(AO)
	LD	a4,  11 * SIZE(AO)

	LD	b1,   8 * SIZE(BO)
	LD	b2,   9 * SIZE(BO)
	LD	b3,  10 * SIZE(BO)
	LD	b4,  11 * SIZE(BO)

	MADD	t11, t11, a5, b5				#	2nd compute
	MADD	t21, t21, a6, b5
	MADD	t31, t31, a7, b5
	MADD	t41, t41, a8, b5

	MADD	t12, t12, a5, b6
	MADD	t22, t22, a6, b6
	MADD	t32, t32, a7, b6
	MADD	t42, t42, a8, b6

	MADD	t13, t13, a5, b7
	MADD	t23, t23, a6, b7
	MADD	t33, t33, a7, b7
	MADD	t43, t43, a8, b7

	MADD	t14, t14, a5, b8
	MADD	t24, t24, a6, b8
	MADD	t34, t34, a7, b8
	MADD	t44, t44, a8, b8			

	LD	a5,  12 * SIZE(AO)
	LD	a6,  13 * SIZE(AO)
	LD	a7,  14 * SIZE(AO)
	LD	a8,  15 * SIZE(AO)

	LD	b5,  12 * SIZE(BO)
	LD	b6,  13 * SIZE(BO)
	LD	b7,  14 * SIZE(BO)
	LD	b8,  15 * SIZE(BO)

	MADD	t11, t11, a1, b1				#	3rd compute
	MADD	t21, t21, a2, b1
	MADD	t31, t31, a3, b1
	MADD	t41, t41, a4, b1

	MADD	t12, t12, a1, b2
	MADD	t22, t22, a2, b2
	MADD	t32, t32, a3, b2
	MADD	t42, t42, a4, b2

	MADD	t13, t13, a1, b3
	MADD	t23, t23, a2, b3
	MADD	t33, t33, a3, b3
	MADD	t43, t43, a4, b3

	MADD	t14, t14, a1, b4
	MADD	t24, t24, a2, b4
	MADD	t34, t34, a3, b4
	MADD	t44, t44, a4, b4		

	daddiu	AO, AO, 16 * SIZE				#	AO += 4mr*4kr	
	daddiu	BO, BO, 16 * SIZE				#	BO += 4nr*4kr
	
	LD	a1,  0 * SIZE(AO)					#	next 
	LD	a2,  1 * SIZE(AO)
	LD	a3,  2 * SIZE(AO)
	LD	a4,  3 * SIZE(AO)

	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)

	MADD	t11, t11, a5, b5				#	4th compute
	MADD	t21, t21, a6, b5
	MADD	t31, t31, a7, b5
	MADD	t41, t41, a8, b5

	MADD	t12, t12, a5, b6
	MADD	t22, t22, a6, b6
	MADD	t32, t32, a7, b6
	MADD	t42, t42, a8, b6

	MADD	t13, t13, a5, b7
	MADD	t23, t23, a6, b7
	MADD	t33, t33, a7, b7
	MADD	t43, t43, a8, b7

	MADD	t14, t14, a5, b8
	MADD	t24, t24, a6, b8
	MADD	t34, t34, a7, b8
	MADD	t44, t44, a8, b8			

	daddiu	L, L, -1
	bgtz	L, .L12
	nop
	

	.align 3
.L15:
	andi	L, TEMP, 3						#	leftover k steps (TEMP mod 4)
	blez	L, .L18
	nop

	.align	3
.L16:										#	remainder: one k step per pass
	MADD	t11, t11, a1, b1				
	MADD	t21, t21, a2, b1
	MADD	t31, t31, a3, b1
	MADD	t41, t41, a4, b1

	MADD	t12, t12, a1, b2
	MADD	t22, t22, a2, b2
	MADD	t32, t32, a3, b2
	MADD	t42, t42, a4, b2

	MADD	t13, t13, a1, b3
	MADD	t23, t23, a2, b3
	MADD	t33, t33, a3, b3
	MADD	t43, t43, a4, b3

	MADD	t14, t14, a1, b4
	MADD	t24, t24, a2, b4
	MADD	t34, t34, a3, b4
	MADD	t44, t44, a4, b4		

	daddiu	AO, AO,  4 * SIZE				#	AO += 4mr	
	daddiu	BO, BO,  4 * SIZE				#	BO += 4nr
	
	LD	a1,  0 * SIZE(AO)					#	next 
	LD	a2,  1 * SIZE(AO)
	LD	a3,  2 * SIZE(AO)
	LD	a4,  3 * SIZE(AO)

	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)

	daddiu	L, L, -1
	bgtz	L, .L16
	nop


.L18:										#	deal with the triangular data part of panel Ai
	daddiu	TEMP, KK, -4					#	first of the four rows being solved

	dsll	L,    TEMP, 2 + BASE_SHIFT
	dsll	TEMP, TEMP, 2 + BASE_SHIFT
	daddu	AO, AORIG, L					#	AO point to the triangular data part
	daddu	BO, B, TEMP
	
	LD	b1,  0 * SIZE(BO)					#	triangular_part*X + rectangular_part = B
	LD	b2,  1 * SIZE(BO)					#	triangular_part*X = B - rectangular_part
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)

	SUB	t11, b1, t11
	SUB	t12, b2, t12
	SUB	t13, b3, t13
	SUB	t14, b4, t14

	LD	b5,  4 * SIZE(BO)					#	sb store in row major
	LD	b6,  5 * SIZE(BO)
	LD	b7,  6 * SIZE(BO)
	LD	b8,  7 * SIZE(BO)
	
	SUB	t21, b5, t21
	SUB	t22, b6, t22
	SUB	t23, b7, t23
	SUB	t24, b8, t24

	LD	b1,  8 * SIZE(BO)
	LD	b2,  9 * SIZE(BO)
	LD	b3, 10 * SIZE(BO)
	LD	b4, 11 * SIZE(BO)
	
	SUB	t31, b1, t31
	SUB	t32, b2, t32
	SUB	t33, b3, t33
	SUB	t34, b4, t34
	
	LD	b5, 12 * SIZE(BO)
	LD	b6, 13 * SIZE(BO)
	LD	b7, 14 * SIZE(BO)
	LD	b8, 15 * SIZE(BO)

	SUB	t41, b5, t41
	SUB	t42, b6, t42
	SUB	t43, b7, t43
	SUB	t44, b8, t44


	LD	b1,  15 * SIZE(AO)					#	fourth row is solved first, then eliminated from rows 3, 2, 1
	LD	b2,  14 * SIZE(AO)	
	LD	b4,  13 * SIZE(AO)
	LD	b7,  12 * SIZE(AO)
	
	MUL	t41, b1, t41
	MUL	t42, b1, t42
	MUL	t43, b1, t43
	MUL	t44, b1, t44
	NMSUB	t31, t31, b2, t41
	NMSUB	t32, t32, b2, t42
	NMSUB	t33, t33, b2, t43
	NMSUB	t34, t34, b2, t44
	NMSUB	t21, t21, b4, t41
	NMSUB	t22, t22, b4, t42
	NMSUB	t23, t23, b4, t43
	NMSUB	t24, t24, b4, t44
	NMSUB	t11, t11, b7, t41
	NMSUB	t12, t12, b7, t42
	NMSUB	t13, t13, b7, t43
	NMSUB	t14, t14, b7, t44



	LD	b3,  10 * SIZE(AO)					#	third row, then eliminated from rows 2 and 1
	LD	b5,   9 * SIZE(AO)
	LD	b8,   8 * SIZE(AO)
	MUL	t31, b3, t31
	MUL	t32, b3, t32
	MUL	t33, b3, t33
	MUL	t34, b3, t34
	NMSUB	t21, t21, b5, t31
	NMSUB	t22, t22, b5, t32
	NMSUB	t23, t23, b5, t33
	NMSUB	t24, t24, b5, t34
	NMSUB	t11, t11, b8, t31
	NMSUB	t12, t12, b8, t32
	NMSUB	t13, t13, b8, t33
	NMSUB	t14, t14, b8, t34



	LD	b6,   5 * SIZE(AO)					#	second row, then eliminated from row 1
	LD	b1,   4 * SIZE(AO)
	MUL	t21, b6, t21
	MUL	t22, b6, t22
	MUL	t23, b6, t23
	MUL	t24, b6, t24
	NMSUB	t11, t11, b1, t21
	NMSUB	t12, t12, b1, t22
	NMSUB	t13, t13, b1, t23
	NMSUB	t14, t14, b1, t24



	LD	b2,   0 * SIZE(AO)					#	first row
	MUL	t11, b2, t11
	MUL	t12, b2, t12
	MUL	t13, b2, t13
	MUL	t14, b2, t14

	daddiu	CO1, CO1, -4 * SIZE				#	step the output pointers back four rows
	daddiu	CO2, CO2, -4 * SIZE
	daddiu	CO3, CO3, -4 * SIZE
	daddiu	CO4, CO4, -4 * SIZE


	ST	t11,  0 * SIZE(BO)					#	update packed B
	ST	t12,  1 * SIZE(BO)
	ST	t13,  2 * SIZE(BO)
	ST	t14,  3 * SIZE(BO)
	ST	t21,  4 * SIZE(BO)
	ST	t22,  5 * SIZE(BO)
	ST	t23,  6 * SIZE(BO)
	ST	t24,  7 * SIZE(BO)
	ST	t31,  8 * SIZE(BO)
	ST	t32,  9 * SIZE(BO)
	ST	t33, 10 * SIZE(BO)
	ST	t34, 11 * SIZE(BO)
	ST	t41, 12 * SIZE(BO)
	ST	t42, 13 * SIZE(BO)
	ST	t43, 14 * SIZE(BO)
	ST	t44, 15 * SIZE(BO)

	ST	t11,  0 * SIZE(CO1)					#	write back 
	ST	t21,  1 * SIZE(CO1)
	ST	t31,  2 * SIZE(CO1)
	ST	t41,  3 * SIZE(CO1)
	ST	t12,  0 * SIZE(CO2)
	ST	t22,  1 * SIZE(CO2)
	ST	t32,  2 * SIZE(CO2)
	ST	t42,  3 * SIZE(CO2)
	ST	t13,  0 * SIZE(CO3)
	ST	t23,  1 * SIZE(CO3)
	ST	t33,  2 * SIZE(CO3)
	ST	t43,  3 * SIZE(CO3)
	ST	t14,  0 * SIZE(CO4)
	ST	t24,  1 * SIZE(CO4)
	ST	t34,  2 * SIZE(CO4)
	ST	t44,  3 * SIZE(CO4)


	daddiu	KK, KK, -4						#	KC-KK is the length of the rectangular data part, LN compute from bottom to top so KK-=4
	daddiu	I, I, -1

	MTC	$0,  a1								#	zero a1, then use it to clear the accumulators for the next tile
	MOV	t11, a1
	MOV	t21, a1
	MOV	t31, a1
	MOV	t41, a1
	MOV	t12, a1
	MOV	t22, a1
	MOV	t32, a1
	MOV	t42, a1
	bgtz	I, .L11
	nop

	.align 3

.L29:										#	end of one four-column panel
	dsll	TEMP, K, 2 + BASE_SHIFT			#	K * 4 columns, in bytes
	daddu	B, B, TEMP							# B point to next Bj

	bgtz	J, .L10							#	J was decremented at the top of .L10
	nop
	
	
	.align 3
.L30:										#	two-column panel, taken when bit 1 of N is set
	andi	J,  N, 2							#	nr=2
	blez	J, .L70
	nop

	move	CO1, C
	daddu	CO2, C,   LDC

	MTC	$0,  t11								#	clear result registers
	MOV	t21, t11
	MOV	t31, t11
	MOV	t41, t11

	daddu	KK, M, OFFSET
	move	AORIG, A							#	reset A
	
	daddu	C,  CO2, LDC						#	C now points past this panel

	andi	I,  M, 1							#	mr=1
	blez	I, .L60
	nop

	#	Tile mr=1, nr=2: set up the pointers and run the four-way unrolled k loop.
	dsll	TEMP,   K, BASE_SHIFT
	dsubu	AORIG, AORIG, TEMP				#	AORIG point to the beginning address of everypanel of Ai

	dsll	L,    KK, BASE_SHIFT			#	mr=1
	dsll	TEMP, KK, 1 + BASE_SHIFT		#	nr=2

	daddu	AO, AORIG, L					#	AO point to rectangular data part
	daddu	BO, B,     TEMP

	dsubu	TEMP, K, KK						#	TEMP = number of k steps to accumulate

	MOV	t12, t11							#	clear result registers
	MOV	t22, t11
	MOV	t32, t11
	MOV	t42, t11

	LD	a1,  0 * SIZE(AO)
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)

	dsra	L,  TEMP, 2						#	L = TEMP/4 passes
	blez	L, .L65
	nop


	.align	3
.L62:										#	four k steps per pass
	LD	a5,  1 * SIZE(AO)					
	LD	b5,  2 * SIZE(BO)
	LD	b6,  3 * SIZE(BO)

	MADD	t11, t11, a1, b1				#	1st compute
	MADD	t12, t12, a1, b2

	LD	a3,  2 * SIZE(AO)
	LD	b3,  4 * SIZE(BO)
	LD	b4,  5 * SIZE(BO)

	MADD	t11, t11, a5, b5				#	2nd compute
	MADD	t12, t12, a5, b6

	LD	a7,  3 * SIZE(AO)
	LD	b7,  6 * SIZE(BO)
	LD	b8,  7 * SIZE(BO)

	MADD	t11, t11, a3, b3				#	3rd compute
	MADD	t12, t12, a3, b4

	daddiu	AO, AO,  4 * SIZE				#	AO += 1mr*4kr	
	daddiu	BO, BO,  8 * SIZE				#	BO += 2nr*4kr
	
	LD	a1,  0 * SIZE(AO)					#	next 
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)

	MADD	t11, t11, a7, b7				#	4th compute
	MADD	t12, t12, a7, b8

	daddiu	L, L, -1
	bgtz	L, .L62
	nop
	
	.align 3

.L65:										#	mr=1, nr=2: leftover k steps (TEMP mod 4)
	andi	L, TEMP, 3
	blez	L, .L68
	nop
	
	.align	3
.L66:
	#	One k step per pass. Only a1 is loaded on the mr=1 path, so only row 1
	#	(t11, t12) is accumulated. The former MADDs into t21/t22 read a2, which
	#	this tile never loads, and their results were overwritten before any use.
	MADD	t11, t11, a1, b1				#	row 1, column 1
	MADD	t12, t12, a1, b2				#	row 1, column 2

	daddiu	AO, AO,  1 * SIZE				#	AO += mr	
	daddiu	BO, BO,  2 * SIZE				#	BO += 2nr
	
	LD	a1,  0 * SIZE(AO)					#	next 
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)

	daddiu	L, L, -1
	bgtz	L, .L66
	nop

.L68:										#	mr=1, nr=2: solve step for the single row
	daddiu	TEMP, KK, -1					#	mr=1

	dsll	L,    TEMP, BASE_SHIFT
	dsll	TEMP, TEMP, 1 + BASE_SHIFT
	daddu	AO, AORIG, L					#	Ao point to the triangular data part
	daddu	BO, B,     TEMP
	
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)

	SUB	t11, b1, t11						#	right-hand side minus the accumulated products
	SUB	t12, b2, t12

	
	LD	b3,  0 * SIZE(AO)					#	NOTE(review): results are multiplied by this entry; presumably a stored reciprocal, confirm
	MUL	t11, b3, t11
	MUL	t12, b3, t12

	daddiu	CO1, CO1, -1 * SIZE				#	step the output pointers back one row
	daddiu	CO2, CO2, -1 * SIZE


	ST	t11,  0 * SIZE(BO)					#	update packed B
	ST	t12,  1 * SIZE(BO)

	ST	t11,  0 * SIZE(CO1)					#	write back to C
	ST	t12,  0 * SIZE(CO2)


	daddiu	KK, KK, -1
	MTC	$0,  t11								#	clear result registers
	MOV	t21, t11
	MOV	t31, t11
	MOV	t41, t11




.L60:										#	tile mr=2, nr=2, taken when bit 1 of M is set
	andi	I,  M, 2
	blez	I, .L40
	nop

	dsll	TEMP,   K, 1 + BASE_SHIFT
	dsubu	AORIG, AORIG, TEMP				#	AORIG point to the beginning address of everypanel of Ai

	dsll	L,    KK, 1 + BASE_SHIFT		#	mr=2
	dsll	TEMP, KK, 1 + BASE_SHIFT		#	nr=2

	daddu	AO, AORIG, L					#	AO point to rectangular data part
	daddu	BO, B,     TEMP

	dsubu	TEMP, K, KK						#	TEMP = number of k steps to accumulate


	MOV	t12, t11							#	clear result registers
	MOV	t22, t11
	MOV	t32, t11
	MOV	t42, t11

	LD	a1,  0 * SIZE(AO)
	LD	a2,  1 * SIZE(AO)
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)

	dsra	L,  TEMP, 2						#	L = TEMP/4 passes
	blez	L, .L45
	nop


	.align	3
.L42:										#	four k steps per pass
	LD	a5,  2 * SIZE(AO)					
	LD	a6,  3 * SIZE(AO)
	LD	b5,  2 * SIZE(BO)
	LD	b6,  3 * SIZE(BO)

	MADD	t11, t11, a1, b1				#	1st compute
	MADD	t21, t21, a2, b1
	MADD	t12, t12, a1, b2
	MADD	t22, t22, a2, b2

	LD	a3,  4 * SIZE(AO)
	LD	a4,  5 * SIZE(AO)
	LD	b3,  4 * SIZE(BO)
	LD	b4,  5 * SIZE(BO)

	MADD	t11, t11, a5, b5				#	2nd compute
	MADD	t21, t21, a6, b5
	MADD	t12, t12, a5, b6
	MADD	t22, t22, a6, b6

	LD	a7,  6 * SIZE(AO)
	LD	a8,  7 * SIZE(AO)
	LD	b7,  6 * SIZE(BO)
	LD	b8,  7 * SIZE(BO)

	MADD	t11, t11, a3, b3				#	3rd compute
	MADD	t21, t21, a4, b3
	MADD	t12, t12, a3, b4
	MADD	t22, t22, a4, b4

	daddiu	AO, AO,  8 * SIZE				#	AO += 2mr*4kr	
	daddiu	BO, BO,  8 * SIZE				#	BO += 2nr*4kr
	
	LD	a1,  0 * SIZE(AO)					#	next 
	LD	a2,  1 * SIZE(AO)
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)

	MADD	t11, t11, a7, b7				#	4th compute
	MADD	t21, t21, a8, b7
	MADD	t12, t12, a7, b8
	MADD	t22, t22, a8, b8

	daddiu	L, L, -1
	bgtz	L, .L42
	nop
	
	.align 3

.L45:
	andi	L, TEMP, 3						#	leftover k steps (TEMP mod 4)
	blez	L, .L48
	nop
	
	.align	3
.L46:
	MADD	t11, t11, a1, b1				#	remainder: one k step per pass
	MADD	t21, t21, a2, b1
	MADD	t12, t12, a1, b2
	MADD	t22, t22, a2, b2

	daddiu	AO, AO,  2 * SIZE				#	AO += 2mr	
	daddiu	BO, BO,  2 * SIZE				#	BO += 2nr
	
	LD	a1,  0 * SIZE(AO)					#	next 
	LD	a2,  1 * SIZE(AO)
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)

	daddiu	L, L, -1
	bgtz	L, .L46
	nop

.L48:										#	solve step for the two rows
	daddiu	TEMP, KK, -2

	dsll	L,    TEMP, 1 + BASE_SHIFT
	dsll	TEMP, TEMP, 1 + BASE_SHIFT
	daddu	AO, AORIG, L					#	Ao point to the triangular data part
	daddu	BO, B,     TEMP
	
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)

	SUB	t11, b1, t11						#	right-hand side minus the accumulated products
	SUB	t12, b2, t12
	SUB	t21, b3, t21
	SUB	t22, b4, t22

	LD	b1,  3 * SIZE(AO)				#	computes the triangular_part		
	LD	b2,  2 * SIZE(AO)
	MUL	t21, b1, t21					#	second row is solved first
	MUL	t22, b1, t22
	NMSUB	t11, t11, b2, t21			#	then eliminated from the first row
	NMSUB	t12, t12, b2, t22
	
	LD	b3,  0 * SIZE(AO)
	MUL	t11, b3, t11
	MUL	t12, b3, t12

	daddiu	CO1, CO1, -2 * SIZE			#	step the output pointers back two rows
	daddiu	CO2, CO2, -2 * SIZE


	ST	t11,  0 * SIZE(BO)				#	update packed B
	ST	t12,  1 * SIZE(BO)
	ST	t21,  2 * SIZE(BO)
	ST	t22,  3 * SIZE(BO)

	ST	t11,  0 * SIZE(CO1)				#	write back to C
	ST	t21,  1 * SIZE(CO1)
	ST	t12,  0 * SIZE(CO2)
	ST	t22,  1 * SIZE(CO2)


	daddiu	KK, KK, -2
	MTC	$0,  t11								#	clear result registers
	MOV	t21, t11
	MOV	t31, t11
	MOV	t41, t11


.L40:										#	tiles mr=4, nr=2: loop .L31 runs M/4 times
	dsra	I,  M, 2							#	I = mc/4
	blez	I, .L49
	nop

.L31:
	dsll	TEMP,   K,  2 + BASE_SHIFT
	dsubu	AORIG, AORIG, TEMP					#	AORIG point to the beginning address of panel Ai
	dsll	L,    KK, 2 + BASE_SHIFT			#	mr=4
	dsll	TEMP, KK, 1 + BASE_SHIFT			#	nr=2

	daddu	AO, AORIG, L						#	AO point to the rectangular data part
	daddu	BO, B,     TEMP

	dsubu	TEMP, K, KK							#	TEMP = number of k steps to accumulate

	MOV	t12, t11								#	clear the column 2 accumulators (t11 is zero here)
	MOV	t22, t11
	MOV	t32, t11
	MOV	t42, t11
	LD	a1,  0 * SIZE(AO)					#	this part compute the rectangular data part of Ai
	LD	a2,  1 * SIZE(AO)					#	mr*KK with nr*KK
	LD	a3,  2 * SIZE(AO)
	LD	a4,  3 * SIZE(AO)					#	get 4a

	LD	b1,  0 * SIZE(BO)					#	get 2b
	LD	b2,  1 * SIZE(BO)

	dsra	L,  TEMP, 2						#	L = TEMP/4 passes
	blez	L, .L35
	nop


	.align	3
.L32:										#	four k steps per pass
	LD	a5,  4 * SIZE(AO)					
	LD	a6,  5 * SIZE(AO)
	LD	a7,  6 * SIZE(AO)
	LD	a8,  7 * SIZE(AO)
	LD	b5,  2 * SIZE(BO)
	LD	b6,  3 * SIZE(BO)

	MADD	t11, t11, a1, b1				#	1st compute
	MADD	t21, t21, a2, b1
	MADD	t31, t31, a3, b1
	MADD	t41, t41, a4, b1
	MADD	t12, t12, a1, b2
	MADD	t22, t22, a2, b2
	MADD	t32, t32, a3, b2
	MADD	t42, t42, a4, b2

	LD	a1,   8 * SIZE(AO)
	LD	a2,   9 * SIZE(AO)
	LD	a3,  10 * SIZE(AO)
	LD	a4,  11 * SIZE(AO)
	LD	b3,   4 * SIZE(BO)
	LD	b4,   5 * SIZE(BO)

	MADD	t11, t11, a5, b5				#	2nd compute
	MADD	t21, t21, a6, b5
	MADD	t31, t31, a7, b5
	MADD	t41, t41, a8, b5
	MADD	t12, t12, a5, b6
	MADD	t22, t22, a6, b6
	MADD	t32, t32, a7, b6
	MADD	t42, t42, a8, b6

	LD	a5,  12 * SIZE(AO)
	LD	a6,  13 * SIZE(AO)
	LD	a7,  14 * SIZE(AO)
	LD	a8,  15 * SIZE(AO)
	LD	b7,  6 * SIZE(BO)
	LD	b8,  7 * SIZE(BO)

	MADD	t11, t11, a1, b3				#	3rd compute
	MADD	t21, t21, a2, b3
	MADD	t31, t31, a3, b3
	MADD	t41, t41, a4, b3
	MADD	t12, t12, a1, b4
	MADD	t22, t22, a2, b4
	MADD	t32, t32, a3, b4
	MADD	t42, t42, a4, b4

	daddiu	AO, AO, 16 * SIZE				#	AO += 4mr*4kr	
	daddiu	BO, BO,  8 * SIZE				#	BO += 2nr*4kr
	
	LD	a1,  0 * SIZE(AO)					#	next 
	LD	a2,  1 * SIZE(AO)
	LD	a3,  2 * SIZE(AO)
	LD	a4,  3 * SIZE(AO)
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)

	MADD	t11, t11, a5, b7				#	4th compute
	MADD	t21, t21, a6, b7
	MADD	t31, t31, a7, b7
	MADD	t41, t41, a8, b7
	MADD	t12, t12, a5, b8
	MADD	t22, t22, a6, b8
	MADD	t32, t32, a7, b8
	MADD	t42, t42, a8, b8

	daddiu	L, L, -1
	bgtz	L, .L32
	nop

	
	.align 3

.L35:
	andi	L, TEMP, 3						#	leftover k steps (TEMP mod 4)
	blez	L, .L38
	nop
	
	.align	3
.L36:
	MADD	t11, t11, a1, b1				#	remainder: one k step per pass
	MADD	t21, t21, a2, b1
	MADD	t31, t31, a3, b1
	MADD	t41, t41, a4, b1
	
	MADD	t12, t12, a1, b2
	MADD	t22, t22, a2, b2
	MADD	t32, t32, a3, b2
	MADD	t42, t42, a4, b2

	daddiu	AO, AO,  4 * SIZE				#	AO += 4mr	
	daddiu	BO, BO,  2 * SIZE				#	BO += 2nr
	
	LD	a1,  0 * SIZE(AO)					#	next 
	LD	a2,  1 * SIZE(AO)
	LD	a3,  2 * SIZE(AO)
	LD	a4,  3 * SIZE(AO)
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)

	daddiu	L, L, -1
	bgtz	L, .L36
	nop


.L38:										#	solve step for the four rows
	daddiu	TEMP, KK, -4
	dsll	L,    TEMP, 2 + BASE_SHIFT		#	mr=4
	dsll	TEMP, TEMP, 1 + BASE_SHIFT		#	nr=2
	daddu	AO, AORIG, L					#	AO point to the triangular data part
	daddu	BO, B,     TEMP

	
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)
	LD	b5,  4 * SIZE(BO)
	LD	b6,  5 * SIZE(BO)
	LD	b7,  6 * SIZE(BO)
	LD	b8,  7 * SIZE(BO)

	SUB	t11, b1, t11						#	right-hand side minus the accumulated products
	SUB	t12, b2, t12
	SUB	t21, b3, t21
	SUB	t22, b4, t22
	SUB	t31, b5, t31
	SUB	t32, b6, t32
	SUB	t41, b7, t41
	SUB	t42, b8, t42


	LD	b1,  15 * SIZE(AO)					#	fourth row is solved first, then eliminated from rows 3, 2, 1
	LD	b2,  14 * SIZE(AO)	
	LD	b4,  13 * SIZE(AO)
	LD	b7,  12 * SIZE(AO)
	
	MUL	t41, b1, t41
	MUL	t42, b1, t42
	NMSUB	t31, t31, b2, t41
	NMSUB	t32, t32, b2, t42
	NMSUB	t21, t21, b4, t41
	NMSUB	t22, t22, b4, t42
	NMSUB	t11, t11, b7, t41
	NMSUB	t12, t12, b7, t42



	LD	b3,  10 * SIZE(AO)					#	third row
	LD	b5,   9 * SIZE(AO)
	LD	b8,   8 * SIZE(AO)
	MUL	t31, b3, t31
	MUL	t32, b3, t32
	NMSUB	t21, t21, b5, t31
	NMSUB	t22, t22, b5, t32
	NMSUB	t11, t11, b8, t31
	NMSUB	t12, t12, b8, t32



	LD	b6,   5 * SIZE(AO)					#	second row
	LD	b1,   4 * SIZE(AO)
	MUL	t21, b6, t21
	MUL	t22, b6, t22
	NMSUB	t11, t11, b1, t21
	NMSUB	t12, t12, b1, t22


	LD	b2,   0 * SIZE(AO)					#	first row
	MUL	t11, b2, t11
	MUL	t12, b2, t12

	daddiu	CO1, CO1, -4 * SIZE				#	step the output pointers back four rows
	daddiu	CO2, CO2, -4 * SIZE

	ST	t11,  0 * SIZE(BO)					#	update packed B
	ST	t12,  1 * SIZE(BO)
	ST	t21,  2 * SIZE(BO)
	ST	t22,  3 * SIZE(BO)
	ST	t31,  4 * SIZE(BO)
	ST	t32,  5 * SIZE(BO)
	ST	t41,  6 * SIZE(BO)
	ST	t42,  7 * SIZE(BO)

	ST	t11,  0 * SIZE(CO1)					#	write back to C
	ST	t21,  1 * SIZE(CO1)
	ST	t31,  2 * SIZE(CO1)
	ST	t41,  3 * SIZE(CO1)
	ST	t12,  0 * SIZE(CO2)
	ST	t22,  1 * SIZE(CO2)
	ST	t32,  2 * SIZE(CO2)
	ST	t42,  3 * SIZE(CO2)


	daddiu	KK, KK, -4

	MTC	$0,  t11							#	clear column 1 accumulators for the next tile
	MOV	t21, t11
	MOV	t31, t11
	MOV	t41, t11

	daddiu	I, I, -1
	bgtz	I, .L31
	nop



	.align 3
.L49:										#	end of the two-column panel
	dsll	TEMP, K, 1 + BASE_SHIFT		# 	nr=2: K * 2 columns, in bytes
	daddu	B, B, TEMP					#	B now points at the next panel

	.align 3

.L70:										#	single-column panel, taken when N is odd
	andi	J,  N, 1					#	nr=1
	blez	J, .L999					#	END
	nop

	move	CO1, C						#	only one output column in this panel

	daddu	KK, M, OFFSET
	move	AORIG, A					#	reset A

	andi	I,  M, 1					#	mr=1
	blez	I, .L90
	NOP

	#	Tile mr=1, nr=1: a single accumulator t11.
	MTC	$0,  t11

	dsll	TEMP,   K, BASE_SHIFT			#	mr=1
	dsubu	AORIG, AORIG, TEMP
	
	dsll	L,    KK, BASE_SHIFT			#	same byte offset serves A and B since mr = nr = 1

	daddu	AO, AORIG, L					#	AO point to the rectangular data part
	daddu	BO, B,    L 

	dsubu	TEMP, K, KK						#	TEMP = number of k steps to accumulate


	LD	a1,  0 * SIZE(AO)
	LD	b1,  0 * SIZE(BO)

	dsra	L,  TEMP, 2						#	L = TEMP/4 passes
	blez	L, .L95
	nop
	
	.align	3
.L92:										#	four k steps per pass
	LD	a5,  1 * SIZE(AO)					
	LD	b5,  1 * SIZE(BO)

	MADD	t11, t11, a1, b1				#	1st compute

	LD	a3,  2 * SIZE(AO)
	LD	b3,  2 * SIZE(BO)

	MADD	t11, t11, a5, b5				#	2nd compute

	LD	a7,  3 * SIZE(AO)
	LD	b7,  3 * SIZE(BO)

	MADD	t11, t11, a3, b3				#	3rd compute

	daddiu	AO, AO,  4 * SIZE				#	AO += 1mr*4kr	
	daddiu	BO, BO,  4 * SIZE				#	BO += 1nr*4kr
	
	LD	a1,  0 * SIZE(AO)					#	next 
	LD	b1,  0 * SIZE(BO)

	MADD	t11, t11, a7, b7				#	4th compute

	daddiu	L, L, -1
	bgtz	L, .L92
	nop
	
	.align 3

.L95:
	andi	L, TEMP,  3						#	leftover k steps (TEMP mod 4)
	blez	L, .L98
	nop

	.align	3
.L96:
	MADD	t11, t11, a1, b1				#	remainder: one k step per pass

	daddiu	AO, AO,  1 * SIZE				#	AO += 1mr	
	daddiu	BO, BO,  1 * SIZE				#	BO += 1nr
	
	LD	a1,  0 * SIZE(AO)					#	next 
	LD	b1,  0 * SIZE(BO)

	daddiu	L, L, -1
	bgtz	L, .L96
	nop


.L98:										#	solve step for the single element
	daddiu	TEMP, KK, -1					# 	mr=1
	dsll	TEMP, TEMP,  BASE_SHIFT

	daddu	AO, AORIG, TEMP					#	AO point to the triangular data part
	daddu	BO, B,     TEMP

	LD	b1,  0 * SIZE(BO)

	SUB	t11, b1, t11						#	right-hand side minus the accumulated products


	LD	b3,  0 * SIZE(AO)
	MUL	t11, b3, t11

	daddiu	CO1, CO1, -1 * SIZE				#	step the output pointer back one row

	ST	t11,  0 * SIZE(BO)					#	update packed B

	ST	t11,  0 * SIZE(CO1)					#	write back to C

	daddiu	KK, KK, -1


.L90:	
	#	Tile mr=2, nr=1, taken when bit 1 of M is set.
	andi	I,  M, 2
	blez	I, .L80
	NOP

	MTC	$0,  t11
	MOV	t21, t11							#	clear result registers

	dsll	TEMP,   K, 1+BASE_SHIFT			#	mr=2
	dsubu	AORIG, AORIG, TEMP
	
	dsll	L,    KK, 1 + BASE_SHIFT
	dsll	TEMP, KK, 0 + BASE_SHIFT

	daddu	AO, AORIG, L					#	AO point to the rectangular data part
	daddu	BO, B,     TEMP

	dsubu	TEMP, K, KK						#	TEMP = number of k steps to accumulate


	LD	a1,  0 * SIZE(AO)
	LD	a2,  1 * SIZE(AO)
	LD	b1,  0 * SIZE(BO)

	dsra	L,  TEMP, 2						#	L = TEMP/4 passes
	blez	L, .L85
	nop
	
	.align	3
.L82:										#	four k steps per pass
	LD	a5,  2 * SIZE(AO)					
	LD	a6,  3 * SIZE(AO)

	LD	b5,  1 * SIZE(BO)

	MADD	t11, t11, a1, b1				#	1st compute
	MADD	t21, t21, a2, b1

	LD	a3,  4 * SIZE(AO)
	LD	a4,  5 * SIZE(AO)

	LD	b3,  2 * SIZE(BO)

	MADD	t11, t11, a5, b5				#	2nd compute
	MADD	t21, t21, a6, b5

	LD	a7,  6 * SIZE(AO)
	LD	a8,  7 * SIZE(AO)

	LD	b7,  3 * SIZE(BO)

	MADD	t11, t11, a3, b3				#	3rd compute
	MADD	t21, t21, a4, b3

	daddiu	AO, AO,  8 * SIZE				#	AO += 2mr*4kr	
	daddiu	BO, BO,  4 * SIZE				#	BO += 1nr*4kr
	
	LD	a1,  0 * SIZE(AO)					#	next 
	LD	a2,  1 * SIZE(AO)

	LD	b1,  0 * SIZE(BO)

	MADD	t11, t11, a7, b7				#	4th compute
	MADD	t21, t21, a8, b7

	daddiu	L, L, -1
	bgtz	L, .L82
	nop
	
	.align 3

.L85:
	andi	L, TEMP,  3						#	leftover k steps (TEMP mod 4)
	blez	L, .L88
	nop

	.align	3
.L86:
	MADD	t11, t11, a1, b1				#	remainder: one k step per pass
	MADD	t21, t21, a2, b1

	daddiu	AO, AO,  2 * SIZE				#	AO += 2mr	
	daddiu	BO, BO,  1 * SIZE				#	BO += 1nr
	
	LD	a1,  0 * SIZE(AO)					#	next 
	LD	a2,  1 * SIZE(AO)

	LD	b1,  0 * SIZE(BO)

	daddiu	L, L, -1
	bgtz	L, .L86
	nop


.L88:										#	solve step for the two rows
	daddiu	TEMP, KK, -2					# 	mr=2
	dsll	L,    TEMP, 1 + BASE_SHIFT
	dsll	TEMP, TEMP, 0 + BASE_SHIFT

	daddu	AO, AORIG, L					#	AO point to the triangular data part
	daddu	BO, B,     TEMP

	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)

	SUB	t11, b1, t11						#	right-hand side minus the accumulated products
	SUB	t21, b2, t21

	LD	b1,  3 * SIZE(AO)				#	computes the triangular_part		
	LD	b2,  2 * SIZE(AO)
	MUL	t21, b1, t21					#	second row is solved first
	NMSUB	t11, t11, b2, t21			#	then eliminated from the first row
	
	LD	b3,  0 * SIZE(AO)
	MUL	t11, b3, t11

	daddiu	CO1, CO1, -2 * SIZE			#	step the output pointer back two rows

	ST	t11,  0 * SIZE(BO)				#	update packed B
	ST	t21,  1 * SIZE(BO)

	ST	t11,  0 * SIZE(CO1)				#	write back to C
	ST	t21,  1 * SIZE(CO1)

	daddiu	KK, KK, -2
	
	
	.align	3
.L80:										#	tiles mr=4, nr=1: loop .L71 runs M/4 times
	dsra	I,  M, 2
	blez	I, .L89
	nop

.L71:
	dsll	TEMP,   K,  2 + BASE_SHIFT		#	mr=4
	dsubu	AORIG, AORIG, TEMP

	dsll	L,    KK, 2 + BASE_SHIFT		#	mr=4
	dsll	TEMP, KK, 0 + BASE_SHIFT		#	nr=1

	daddu	AO, AORIG, L					#	AO point to the rectangular
	daddu	BO, B,     TEMP

	dsubu	TEMP, K, KK						#	TEMP = number of k steps to accumulate


	MTC	$0,  t11								#	clear result registers
	MOV	t21, t11
	MOV	t31, t11
	MOV	t41, t11

	LD	a1,  0 * SIZE(AO)					#	this part compute the rectangular data part of Ai
	LD	a2,  1 * SIZE(AO)					#	mr*KK with nr*KK
	LD	a3,  2 * SIZE(AO)
	LD	a4,  3 * SIZE(AO)					#	get 4a

	LD	b1,  0 * SIZE(BO)					#	get 1b

	dsra	L,  TEMP, 2						#	L = TEMP/4 passes
	blez	L, .L75
	nop										#	fills the slot after the branch

	.align	3
.L72:										#	four k steps per pass
	LD	a5,  4 * SIZE(AO)					
	LD	a6,  5 * SIZE(AO)
	LD	a7,  6 * SIZE(AO)
	LD	a8,  7 * SIZE(AO)

	LD	b5,  1 * SIZE(BO)

	MADD	t11, t11, a1, b1				#	1st compute
	MADD	t21, t21, a2, b1
	MADD	t31, t31, a3, b1
	MADD	t41, t41, a4, b1

	LD	a1,   8 * SIZE(AO)
	LD	a2,   9 * SIZE(AO)
	LD	a3,  10 * SIZE(AO)
	LD	a4,  11 * SIZE(AO)

	LD	b3,  2 * SIZE(BO)

	MADD	t11, t11, a5, b5				#	2nd compute
	MADD	t21, t21, a6, b5
	MADD	t31, t31, a7, b5
	MADD	t41, t41, a8, b5

	LD	a5,  12 * SIZE(AO)
	LD	a6,  13 * SIZE(AO)
	LD	a7,  14 * SIZE(AO)
	LD	a8,  15 * SIZE(AO)

	LD	b7,  3 * SIZE(BO)

	MADD	t11, t11, a1, b3				#	3rd compute
	MADD	t21, t21, a2, b3
	MADD	t31, t31, a3, b3
	MADD	t41, t41, a4, b3

	daddiu	AO, AO, 16 * SIZE				#	AO += 4mr*4kr	
	daddiu	BO, BO,  4 * SIZE				#	BO += 1nr*4kr
	
	LD	a1,  0 * SIZE(AO)					#	next 
	LD	a2,  1 * SIZE(AO)
	LD	a3,  2 * SIZE(AO)
	LD	a4,  3 * SIZE(AO)

	LD	b1,  0 * SIZE(BO)

	MADD	t11, t11, a5, b7				#	4th compute
	MADD	t21, t21, a6, b7
	MADD	t31, t31, a7, b7
	MADD	t41, t41, a8, b7

	daddiu	L, L, -1
	bgtz	L, .L72
	nop
	
	.align 3

.L75:
	andi	L, TEMP,  3						#	leftover k steps (TEMP mod 4)
	blez	L, .L78
	nop

	.align	3
.L76:
	MADD	t11, t11, a1, b1				#	remainder: one k step per pass
	MADD	t21, t21, a2, b1
	MADD	t31, t31, a3, b1
	MADD	t41, t41, a4, b1

	daddiu	AO, AO,  4 * SIZE				#	AO += 4mr	
	daddiu	BO, BO,  1 * SIZE				#	BO += 1nr
	
	LD	a1,  0 * SIZE(AO)					#	next 
	LD	a2,  1 * SIZE(AO)
	LD	a3,  2 * SIZE(AO)
	LD	a4,  3 * SIZE(AO)

	LD	b1,  0 * SIZE(BO)

	daddiu	L, L, -1
	bgtz	L, .L76
	nop

.L78:										#	solve step for the four rows
	daddiu	TEMP, KK, -4				#	mr=4

	dsll	L,    TEMP, 2 + BASE_SHIFT	#	mr=4
	dsll	TEMP, TEMP, 0 + BASE_SHIFT	#	nr=1
	daddu	AO, AORIG, L				#	AO point to the triangular
	daddu	BO, B,     TEMP

	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)

	SUB	t11, b1, t11					#	right-hand side minus the accumulated products
	SUB	t21, b2, t21
	SUB	t31, b3, t31
	SUB	t41, b4, t41

	LD	b1,  15 * SIZE(AO)				#	fourth row is solved first, then eliminated from rows 3, 2, 1
	LD	b2,  14 * SIZE(AO)	
	LD	b4,  13 * SIZE(AO)
	LD	b7,  12 * SIZE(AO)
	MUL	t41, b1, t41
	NMSUB	t31, t31, b2, t41
	NMSUB	t21, t21, b4, t41
	NMSUB	t11, t11, b7, t41



	LD	b3,  10 * SIZE(AO)				#	third row
	LD	b5,   9 * SIZE(AO)
	LD	b8,   8 * SIZE(AO)
	MUL	t31, b3, t31
	NMSUB	t21, t21, b5, t31
	NMSUB	t11, t11, b8, t31



	LD	b6,   5 * SIZE(AO)				#	second row
	LD	b1,   4 * SIZE(AO)
	MUL	t21, b6, t21
	NMSUB	t11, t11, b1, t21



	LD	b2,   0 * SIZE(AO)				#	first row
	MUL	t11, b2, t11

	daddiu	CO1, CO1, -4 * SIZE			#	step the output pointer back four rows

	ST	t11,  0 * SIZE(BO)				#	update packed B
	ST	t21,  1 * SIZE(BO)
	ST	t31,  2 * SIZE(BO)
	ST	t41,  3 * SIZE(BO)

	ST	t11,  0 * SIZE(CO1)				#	write back to C
	ST	t21,  1 * SIZE(CO1)
	ST	t31,  2 * SIZE(CO1)
	ST	t41,  3 * SIZE(CO1)


	daddiu	KK, KK, -4
	daddiu	I, I, -1
	bgtz	I, .L71
	nop


	.align 3
.L89:										#	end of the single-column panel
	dsll	TEMP, K, BASE_SHIFT			#	nr=1: K * 1 column, in bytes
	daddu	B, B, TEMP					#	B now points past this panel



	.align 3

.L999:										#	exit: reload the registers stored at entry and release the frame
	LDARG	$16,   0($sp)
	LDARG	$17,   8($sp)
	LDARG	$18,  16($sp)
	LDARG	$19,  24($sp)
	LDARG	$20,  32($sp)
	LDARG	$21,  40($sp)
	ldc1	$f24, 48($sp)
	ldc1	$f25, 56($sp)
	ldc1	$f26, 64($sp)
	ldc1	$f27, 72($sp)
	ldc1	$f28, 80($sp)
	#	NOTE(review): $f29-$f31 are written by this kernel (t24/t34/t44) but are not
	#	reloaded here; confirm against the target ABI whether they must be preserved.

	LDARG	$22,  88($sp)
	LDARG	$23,  96($sp)
	LDARG	$24, 104($sp)
	LDARG	$25, 112($sp)

#ifndef __64BIT__
	#	NOTE(review): offset 112 is read both into $25 above and into $f20 below; confirm this overlap is intended.
	ldc1	$f20,112($sp)
	ldc1	$f21,120($sp)
	ldc1	$f22,128($sp)
	ldc1	$f23,136($sp)
#endif

	j	$31
	daddiu	$sp, $sp, 144					#	frame release placed directly after the jump

	EPILOGUE