/* Web source-viewer residue ("Blob Blame Raw") was here; it is not part of the assembly source. */
#define REALNAME ASMNAME

#define ASSEMBLER
#include "common.h"


/* Problem dimensions: the code below uses M as the row count (mc),
 * N as the column count (nc) and K as the full depth of a packed panel. */
#define M	$4
#define	N	$5
#define	K	$6
/* Base pointers: packed A panels, packed B panels, output matrix C.
 * LDC is scaled by BASE_SHIFT at function entry and then used as a byte stride. */
#define A	$8
#define B	$9
#define C	$10
#define LDC	$11

/* Running pointers inside the current A panel / B panel. */
#define AO	$12
#define BO	$13

/* Loop counters: I walks row blocks, J walks column blocks, L walks k steps
 * (L is also reused as a scratch byte offset when advancing AO). */
#define I	$2
#define J	$3
#define L	$7

/* Write pointers for up to four consecutive columns of C. */
#define CO1	$14
#define CO2	$15
#define CO3	$16
#define CO4	$17

/* OFFSET is loaded from the caller's stack area; KK is the number of k steps
 * handled by the multiply-accumulate loops before the triangular solve.
 * TEMP is scratch. AORIG is not referenced in the visible code. */
#define OFFSET	$22
#define KK	$23
#define TEMP	$24
#define AORIG	$25

/* Operand registers holding elements loaded from the A panel. */
#define a1	$f0
#define a2	$f1
#define a3	$f2
#define a4	$f3
#define a5	$f4
#define a6	$f5
#define a7	$f6
#define	a8	$f7

/* Operand registers holding elements loaded from the B panel
 * (also reused for A-panel values in the smaller triangular solves). */
#define b1	$f8
#define b2	$f9
#define b3	$f10
#define b4	$f11
#define b5	$f12
#define b6	$f13
#define b7	$f14
#define b8	$f15

/* Accumulators t<row><col> for the 4x4 result tile: column 1 ... */
#define t11	$f16
#define t21	$f17
#define t31	$f18
#define t41	$f19

/* ... column 2 ... */
#define t12	$f20
#define t22	$f21
#define t32	$f22
#define t42	$f23

/* ... column 3 ... */
#define t13	$f24
#define t23	$f25
#define t33	$f26
#define t43	$f27

/* ... column 4. */
#define t14	$f28
#define t24	$f29
#define t34	$f30
#define t44	$f31

/* NOTE(review): ALPHA shares $f15 with b8 and is not referenced in the
 * visible code; b8 is overwritten freely, so ALPHA must not be relied on. */
#define ALPHA	$f15

	PROLOGUE

	/*
	 * Triangular-solve micro-kernel, LT variant (left to right, top to bottom).
	 *
	 * Columns of C are processed in blocks of 4 (.L10), 2 (.L30) and 1 (.L70);
	 * inside each column block, rows are processed in blocks of 4, 2 and 1.
	 * For every tile the code
	 *   1. accumulates A*B over the first KK k-steps (MADD loops),
	 *   2. subtracts that sum from the packed B values found at BO,
	 *   3. solves against the small triangular block found at AO; the diagonal
	 *      entries are applied with MUL, so they are presumably stored already
	 *      inverted by the packing routine (TODO confirm against the packer),
	 *   4. stores the result both back into packed B and into C,
	 *   5. advances AO/BO to the end of their panels and grows KK by the
	 *      tile height.
	 *
	 * Stack frame (176 bytes, keeps $sp 16-byte aligned):
	 *     0.. 40   $16-$21
	 *    48.. 80   $f24-$f28
	 *    88..112   $22-$25
	 *   120..136   $f29-$f31   (used as t24/t34/t44, so they must be preserved)
	 *   144..168   $f20-$f23   (only when __64BIT__ is not defined)
	 * The OFFSET argument is read from the first slot above the frame.
	 */
	daddiu	$sp, $sp, -176

	SDARG	$16,   0($sp)
	SDARG	$17,   8($sp)
	SDARG	$18,  16($sp)
	SDARG	$19,  24($sp)
	SDARG	$20,  32($sp)
	SDARG	$21,  40($sp)
	sdc1	$f24, 48($sp)
	sdc1	$f25, 56($sp)
	sdc1	$f26, 64($sp)
	sdc1	$f27, 72($sp)
	sdc1	$f28, 80($sp)

	SDARG	$22,  88($sp)
	SDARG	$23,  96($sp)
	SDARG	$24, 104($sp)
	SDARG	$25, 112($sp)

	sdc1	$f29,120($sp)	#	t24/t34/t44 live in $f29-$f31
	sdc1	$f30,128($sp)
	sdc1	$f31,136($sp)

#ifndef __64BIT__
	sdc1	$f20,144($sp)	#	own slots, no overlap with the $25 slot
	sdc1	$f21,152($sp)
	sdc1	$f22,160($sp)
	sdc1	$f23,168($sp)
#endif

	LDARG	OFFSET, 176($sp)	#	first stack slot above this frame
	dsll	LDC, LDC, BASE_SHIFT	#	ldc in bytes

	dsra	J,  N, 2	#	j = nc/4
	blez	J, .L30
	nop

.L10:	#	nr=4
	daddiu	J, J, -1
	move	CO1, C
	daddu	CO2, C,   LDC
	daddu	CO3, CO2, LDC
	daddu	CO4, CO3, LDC

	MTC	$0,  t11	#	clear result registers
	MOV	t21, t11
	MOV	t31, t11
	MOV	t41, t11
	MOV	t12, t11
	MOV	t22, t11
	MOV	t32, t11
	MOV	t42, t11

	dsra	I,  M, 2	#	i = mc/4
	move	KK, OFFSET	#	kk is the length of the rectangular data part of panel Ai
	move	AO, A	#	reset A
	daddu	C,  CO4, LDC	#	C now points at the next column block
	blez	I, .L20
	nop


.L11:	#	mr=4
	LD	a1,  0 * SIZE(AO)	#	rectangular part of Ai:
	LD	a2,  1 * SIZE(AO)	#	mr*KK with nr*KK
	LD	a3,  2 * SIZE(AO)
	LD	a4,  3 * SIZE(AO)	#	get 4a

	LD	b1,  0 * SIZE(B)	#	get 4b
	LD	b2,  1 * SIZE(B)
	LD	b3,  2 * SIZE(B)
	LD	b4,  3 * SIZE(B)

	MOV	t13, t11	#	clear result registers
	MOV	t23, t11
	MOV	t33, t11
	MOV	t43, t11
	MOV	t14, t11
	MOV	t24, t11
	MOV	t34, t11
	MOV	t44, t11

	dsra	L,  KK, 2	#	L = kk/4
	blez	L, .L15
	move	BO,  B	#	delay slot: always executed


	.align	3
.L12:
	LD	a5,  4 * SIZE(AO)
	LD	a6,  5 * SIZE(AO)
	LD	a7,  6 * SIZE(AO)
	LD	a8,  7 * SIZE(AO)

	LD	b5,  4 * SIZE(BO)
	LD	b6,  5 * SIZE(BO)
	LD	b7,  6 * SIZE(BO)
	LD	b8,  7 * SIZE(BO)

	MADD	t11, t11, a1, b1	#	1st compute
	MADD	t21, t21, a2, b1
	MADD	t31, t31, a3, b1
	MADD	t41, t41, a4, b1

	MADD	t12, t12, a1, b2
	MADD	t22, t22, a2, b2
	MADD	t32, t32, a3, b2
	MADD	t42, t42, a4, b2

	MADD	t13, t13, a1, b3
	MADD	t23, t23, a2, b3
	MADD	t33, t33, a3, b3
	MADD	t43, t43, a4, b3

	MADD	t14, t14, a1, b4
	MADD	t24, t24, a2, b4
	MADD	t34, t34, a3, b4
	MADD	t44, t44, a4, b4

	LD	a1,   8 * SIZE(AO)
	LD	a2,   9 * SIZE(AO)
	LD	a3,  10 * SIZE(AO)
	LD	a4,  11 * SIZE(AO)

	LD	b1,   8 * SIZE(BO)
	LD	b2,   9 * SIZE(BO)
	LD	b3,  10 * SIZE(BO)
	LD	b4,  11 * SIZE(BO)

	MADD	t11, t11, a5, b5	#	2nd compute
	MADD	t21, t21, a6, b5
	MADD	t31, t31, a7, b5
	MADD	t41, t41, a8, b5

	MADD	t12, t12, a5, b6
	MADD	t22, t22, a6, b6
	MADD	t32, t32, a7, b6
	MADD	t42, t42, a8, b6

	MADD	t13, t13, a5, b7
	MADD	t23, t23, a6, b7
	MADD	t33, t33, a7, b7
	MADD	t43, t43, a8, b7

	MADD	t14, t14, a5, b8
	MADD	t24, t24, a6, b8
	MADD	t34, t34, a7, b8
	MADD	t44, t44, a8, b8

	LD	a5,  12 * SIZE(AO)
	LD	a6,  13 * SIZE(AO)
	LD	a7,  14 * SIZE(AO)
	LD	a8,  15 * SIZE(AO)

	LD	b5,  12 * SIZE(BO)
	LD	b6,  13 * SIZE(BO)
	LD	b7,  14 * SIZE(BO)
	LD	b8,  15 * SIZE(BO)

	MADD	t11, t11, a1, b1	#	3rd compute
	MADD	t21, t21, a2, b1
	MADD	t31, t31, a3, b1
	MADD	t41, t41, a4, b1

	MADD	t12, t12, a1, b2
	MADD	t22, t22, a2, b2
	MADD	t32, t32, a3, b2
	MADD	t42, t42, a4, b2

	MADD	t13, t13, a1, b3
	MADD	t23, t23, a2, b3
	MADD	t33, t33, a3, b3
	MADD	t43, t43, a4, b3

	MADD	t14, t14, a1, b4
	MADD	t24, t24, a2, b4
	MADD	t34, t34, a3, b4
	MADD	t44, t44, a4, b4

	daddiu	AO, AO, 16 * SIZE	#	AO += 4mr*4kr
	daddiu	BO, BO, 16 * SIZE	#	BO += 4nr*4kr

	LD	a1,  0 * SIZE(AO)	#	preload for the next step
	LD	a2,  1 * SIZE(AO)
	LD	a3,  2 * SIZE(AO)
	LD	a4,  3 * SIZE(AO)

	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)

	MADD	t11, t11, a5, b5	#	4th compute
	MADD	t21, t21, a6, b5
	MADD	t31, t31, a7, b5
	MADD	t41, t41, a8, b5

	MADD	t12, t12, a5, b6
	MADD	t22, t22, a6, b6
	MADD	t32, t32, a7, b6
	MADD	t42, t42, a8, b6

	MADD	t13, t13, a5, b7
	MADD	t23, t23, a6, b7
	MADD	t33, t33, a7, b7
	MADD	t43, t43, a8, b7

	MADD	t14, t14, a5, b8
	MADD	t24, t24, a6, b8
	MADD	t34, t34, a7, b8
	MADD	t44, t44, a8, b8

	daddiu	L, L, -1
	bgtz	L, .L12
	nop


	.align 3
.L15:
	andi	L, KK,  3	#	remainder: kk mod 4 single k steps
	blez	L, .L18
	nop

	.align	3
.L16:
	MADD	t11, t11, a1, b1	#	one k step
	MADD	t21, t21, a2, b1
	MADD	t31, t31, a3, b1
	MADD	t41, t41, a4, b1

	MADD	t12, t12, a1, b2
	MADD	t22, t22, a2, b2
	MADD	t32, t32, a3, b2
	MADD	t42, t42, a4, b2

	MADD	t13, t13, a1, b3
	MADD	t23, t23, a2, b3
	MADD	t33, t33, a3, b3
	MADD	t43, t43, a4, b3

	MADD	t14, t14, a1, b4
	MADD	t24, t24, a2, b4
	MADD	t34, t34, a3, b4
	MADD	t44, t44, a4, b4

	daddiu	AO, AO,  4 * SIZE	#	AO += 4mr
	daddiu	BO, BO,  4 * SIZE	#	BO += 4nr

	LD	a1,  0 * SIZE(AO)	#	preload for the next step
	LD	a2,  1 * SIZE(AO)
	LD	a3,  2 * SIZE(AO)
	LD	a4,  3 * SIZE(AO)

	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)

	daddiu	L, L, -1
	bgtz	L, .L16
	nop


.L18:	#	deal with the triangular data part of panel Ai
	LD	b1,  0 * SIZE(BO)	#	triangular_part*X + rectangular_part = B
	LD	b2,  1 * SIZE(BO)	#	triangular_part*X = B - rectangular_part
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)

	SUB	t11, b1, t11
	SUB	t12, b2, t12
	SUB	t13, b3, t13
	SUB	t14, b4, t14

	LD	b5,  4 * SIZE(BO)	#	sb store in row major
	LD	b6,  5 * SIZE(BO)
	LD	b7,  6 * SIZE(BO)
	LD	b8,  7 * SIZE(BO)

	SUB	t21, b5, t21
	SUB	t22, b6, t22
	SUB	t23, b7, t23
	SUB	t24, b8, t24

	LD	b1,  8 * SIZE(BO)
	LD	b2,  9 * SIZE(BO)
	LD	b3, 10 * SIZE(BO)
	LD	b4, 11 * SIZE(BO)

	SUB	t31, b1, t31
	SUB	t32, b2, t32
	SUB	t33, b3, t33
	SUB	t34, b4, t34

	LD	b5, 12 * SIZE(BO)
	LD	b6, 13 * SIZE(BO)
	LD	b7, 14 * SIZE(BO)
	LD	b8, 15 * SIZE(BO)

	SUB	t41, b5, t41
	SUB	t42, b6, t42
	SUB	t43, b7, t43
	SUB	t44, b8, t44


	LD	a1,  0 * SIZE(AO)	#	sa stores in col major
	LD	a2,  1 * SIZE(AO)
	LD	a3,  2 * SIZE(AO)
	LD	a4,  3 * SIZE(AO)
	MUL	t11, a1, t11	#	row 1: scale by the diagonal entry
	MUL	t12, a1, t12
	MUL	t13, a1, t13
	MUL	t14, a1, t14
	NMSUB	t21, t21, a2, t11	#	eliminate row 1 from rows 2..4
	NMSUB	t22, t22, a2, t12
	NMSUB	t23, t23, a2, t13
	NMSUB	t24, t24, a2, t14
	NMSUB	t31, t31, a3, t11
	NMSUB	t32, t32, a3, t12
	NMSUB	t33, t33, a3, t13
	NMSUB	t34, t34, a3, t14
	NMSUB	t41, t41, a4, t11
	NMSUB	t42, t42, a4, t12
	NMSUB	t43, t43, a4, t13
	NMSUB	t44, t44, a4, t14


	LD	a5,  5 * SIZE(AO)
	LD	a6,  6 * SIZE(AO)
	LD	a7,  7 * SIZE(AO)
	MUL	t21, a5, t21	#	row 2
	MUL	t22, a5, t22
	MUL	t23, a5, t23
	MUL	t24, a5, t24
	NMSUB	t31, t31, a6, t21
	NMSUB	t32, t32, a6, t22
	NMSUB	t33, t33, a6, t23
	NMSUB	t34, t34, a6, t24
	NMSUB	t41, t41, a7, t21
	NMSUB	t42, t42, a7, t22
	NMSUB	t43, t43, a7, t23
	NMSUB	t44, t44, a7, t24


	LD	a8, 10 * SIZE(AO)
	LD	a1, 11 * SIZE(AO)
	MUL	t31, a8, t31	#	row 3
	MUL	t32, a8, t32
	MUL	t33, a8, t33
	MUL	t34, a8, t34
	NMSUB	t41, t41, a1, t31
	NMSUB	t42, t42, a1, t32
	NMSUB	t43, t43, a1, t33
	NMSUB	t44, t44, a1, t34


	LD	a2, 15 * SIZE(AO)
	MUL	t41, a2, t41	#	row 4
	MUL	t42, a2, t42
	MUL	t43, a2, t43
	MUL	t44, a2, t44

	ST	t11,  0 * SIZE(BO)	#	update packed B
	ST	t12,  1 * SIZE(BO)
	ST	t13,  2 * SIZE(BO)
	ST	t14,  3 * SIZE(BO)
	ST	t21,  4 * SIZE(BO)
	ST	t22,  5 * SIZE(BO)
	ST	t23,  6 * SIZE(BO)
	ST	t24,  7 * SIZE(BO)
	ST	t31,  8 * SIZE(BO)
	ST	t32,  9 * SIZE(BO)
	ST	t33, 10 * SIZE(BO)
	ST	t34, 11 * SIZE(BO)
	ST	t41, 12 * SIZE(BO)
	ST	t42, 13 * SIZE(BO)
	ST	t43, 14 * SIZE(BO)
	ST	t44, 15 * SIZE(BO)

	ST	t11,  0 * SIZE(CO1)	#	write back
	ST	t21,  1 * SIZE(CO1)
	ST	t31,  2 * SIZE(CO1)
	ST	t41,  3 * SIZE(CO1)
	ST	t12,  0 * SIZE(CO2)
	ST	t22,  1 * SIZE(CO2)
	ST	t32,  2 * SIZE(CO2)
	ST	t42,  3 * SIZE(CO2)
	ST	t13,  0 * SIZE(CO3)
	ST	t23,  1 * SIZE(CO3)
	ST	t33,  2 * SIZE(CO3)
	ST	t43,  3 * SIZE(CO3)
	ST	t14,  0 * SIZE(CO4)
	ST	t24,  1 * SIZE(CO4)
	ST	t34,  2 * SIZE(CO4)
	ST	t44,  3 * SIZE(CO4)

	daddiu	CO1, CO1, 4 * SIZE	#	advance the C pointers
	daddiu	CO2, CO2, 4 * SIZE
	daddiu	CO3, CO3, 4 * SIZE
	daddiu	CO4, CO4, 4 * SIZE

	dsubu	TEMP, K, KK
	dsll	L,    TEMP, 2 + BASE_SHIFT
	dsll	TEMP, TEMP, 2 + BASE_SHIFT
	daddu	AO, AO, L	#	mov AO to the end of panel Ai
	daddu	BO, BO, TEMP	#	mov BO to the end of panel Bj

	daddiu	KK, KK, 4	#	the length of rectangular data part increases by 4
	daddiu	I, I, -1

	MTC	$0,  a1	#	re-zero the accumulators for the next tile
	MOV	t11, a1
	MOV	t21, a1
	MOV	t31, a1
	MOV	t41, a1
	MOV	t12, a1
	MOV	t22, a1
	MOV	t32, a1
	MOV	t42, a1
	bgtz	I, .L11
	nop


	.align 3
.L20:
	andi	I,  M, 2	#	mr=2,nr=4
	blez	I, .L50
	nop

	MOV	t13, t11
	MOV	t23, t11
	MOV	t33, t11
	MOV	t43, t11
	MOV	t14, t11
	MOV	t24, t11
	MOV	t34, t11
	MOV	t44, t11

	LD	a1,  0 * SIZE(AO)	#	rectangular part of Ai:
	LD	a2,  1 * SIZE(AO)	#	mr*KK with nr*KK

	LD	b1,  0 * SIZE(B)	#	get 4b
	LD	b2,  1 * SIZE(B)
	LD	b3,  2 * SIZE(B)
	LD	b4,  3 * SIZE(B)

	dsra	L,  KK, 2
	blez	L, .L25
	move	BO,  B	#	delay slot: always executed


	.align	3
.L22:
	LD	a5,  2 * SIZE(AO)
	LD	a6,  3 * SIZE(AO)
	LD	b5,  4 * SIZE(BO)
	LD	b6,  5 * SIZE(BO)
	LD	b7,  6 * SIZE(BO)
	LD	b8,  7 * SIZE(BO)

	MADD	t11, t11, a1, b1	#	1st compute
	MADD	t21, t21, a2, b1
	MADD	t12, t12, a1, b2
	MADD	t22, t22, a2, b2
	MADD	t13, t13, a1, b3
	MADD	t23, t23, a2, b3
	MADD	t14, t14, a1, b4
	MADD	t24, t24, a2, b4

	LD	a3,   4 * SIZE(AO)
	LD	a4,   5 * SIZE(AO)
	LD	b1,   8 * SIZE(BO)
	LD	b2,   9 * SIZE(BO)
	LD	b3,  10 * SIZE(BO)
	LD	b4,  11 * SIZE(BO)

	MADD	t11, t11, a5, b5	#	2nd compute
	MADD	t21, t21, a6, b5
	MADD	t12, t12, a5, b6
	MADD	t22, t22, a6, b6
	MADD	t13, t13, a5, b7
	MADD	t23, t23, a6, b7
	MADD	t14, t14, a5, b8
	MADD	t24, t24, a6, b8

	LD	a7,   6 * SIZE(AO)
	LD	a8,   7 * SIZE(AO)
	LD	b5,  12 * SIZE(BO)
	LD	b6,  13 * SIZE(BO)
	LD	b7,  14 * SIZE(BO)
	LD	b8,  15 * SIZE(BO)

	MADD	t11, t11, a3, b1	#	3rd compute
	MADD	t21, t21, a4, b1
	MADD	t12, t12, a3, b2
	MADD	t22, t22, a4, b2
	MADD	t13, t13, a3, b3
	MADD	t23, t23, a4, b3
	MADD	t14, t14, a3, b4
	MADD	t24, t24, a4, b4

	daddiu	AO, AO,  8 * SIZE	#	AO += 2mr*4kr
	daddiu	BO, BO, 16 * SIZE	#	BO += 4nr*4kr

	LD	a1,  0 * SIZE(AO)	#	preload for the next step
	LD	a2,  1 * SIZE(AO)
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)

	MADD	t11, t11, a7, b5	#	4th compute
	MADD	t21, t21, a8, b5
	MADD	t12, t12, a7, b6
	MADD	t22, t22, a8, b6
	MADD	t13, t13, a7, b7
	MADD	t23, t23, a8, b7
	MADD	t14, t14, a7, b8
	MADD	t24, t24, a8, b8

	daddiu	L, L, -1
	bgtz	L, .L22
	nop


	.align 3
.L25:
	andi	L, KK,  3
	blez	L, .L28
	nop

	.align	3
.L26:
	MADD	t11, t11, a1, b1	#	one k step
	MADD	t21, t21, a2, b1
	MADD	t12, t12, a1, b2
	MADD	t22, t22, a2, b2
	MADD	t13, t13, a1, b3
	MADD	t23, t23, a2, b3
	MADD	t14, t14, a1, b4
	MADD	t24, t24, a2, b4

	daddiu	AO, AO,  2 * SIZE	#	AO += 2mr
	daddiu	BO, BO,  4 * SIZE	#	BO += 4nr

	LD	a1,  0 * SIZE(AO)	#	preload for the next step
	LD	a2,  1 * SIZE(AO)
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)

	daddiu	L, L, -1
	bgtz	L, .L26
	nop


.L28:	#	deal with the triangular part
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)
	LD	b5,  4 * SIZE(BO)
	LD	b6,  5 * SIZE(BO)
	LD	b7,  6 * SIZE(BO)
	LD	b8,  7 * SIZE(BO)

	SUB	t11, b1, t11
	SUB	t12, b2, t12
	SUB	t13, b3, t13
	SUB	t14, b4, t14
	SUB	t21, b5, t21
	SUB	t22, b6, t22
	SUB	t23, b7, t23
	SUB	t24, b8, t24


	LD	b1,  0 * SIZE(AO)	#	computes the triangular_part
	LD	b2,  1 * SIZE(AO)
	MUL	t11, b1, t11
	MUL	t12, b1, t12
	MUL	t13, b1, t13
	MUL	t14, b1, t14
	NMSUB	t21, t21, b2, t11
	NMSUB	t22, t22, b2, t12
	NMSUB	t23, t23, b2, t13
	NMSUB	t24, t24, b2, t14

	LD	b3,  3 * SIZE(AO)
	MUL	t21, b3, t21
	MUL	t22, b3, t22
	MUL	t23, b3, t23
	MUL	t24, b3, t24


	ST	t11,  0 * SIZE(BO)	#	update packed B
	ST	t12,  1 * SIZE(BO)
	ST	t13,  2 * SIZE(BO)
	ST	t14,  3 * SIZE(BO)
	ST	t21,  4 * SIZE(BO)
	ST	t22,  5 * SIZE(BO)
	ST	t23,  6 * SIZE(BO)
	ST	t24,  7 * SIZE(BO)

	ST	t11,  0 * SIZE(CO1)	#	write back
	ST	t21,  1 * SIZE(CO1)
	ST	t12,  0 * SIZE(CO2)
	ST	t22,  1 * SIZE(CO2)
	ST	t13,  0 * SIZE(CO3)
	ST	t23,  1 * SIZE(CO3)
	ST	t14,  0 * SIZE(CO4)
	ST	t24,  1 * SIZE(CO4)

	daddiu	CO1, CO1, 2 * SIZE
	daddiu	CO2, CO2, 2 * SIZE
	daddiu	CO3, CO3, 2 * SIZE
	daddiu	CO4, CO4, 2 * SIZE


	dsubu	TEMP, K, KK
	dsll	L,    TEMP, 1 + BASE_SHIFT
	dsll	TEMP, TEMP, 2 + BASE_SHIFT
	daddu	AO, AO, L	#	mov AO to the end of Ai
	daddu	BO, BO, TEMP	#	mov BO to the end of Bj

	daddiu	KK, KK, 2	#	the length of rectangular data part increases by 2
	MTC	$0,  a1
	MOV	t11, a1
	MOV	t21, a1
	MOV	t31, a1
	MOV	t41, a1
	MOV	t12, a1
	MOV	t22, a1
	MOV	t32, a1
	MOV	t42, a1


	.align 3
.L50:
	andi	I,  M, 1	#	mr=1,nr=4
	blez	I, .L29
	nop

	MOV	t13, t11
	MOV	t23, t11
	MOV	t33, t11
	MOV	t43, t11
	MOV	t14, t11
	MOV	t24, t11
	MOV	t34, t11
	MOV	t44, t11

	LD	a1,  0 * SIZE(AO)	#	rectangular part of Ai

	LD	b1,  0 * SIZE(B)	#	get 4b
	LD	b2,  1 * SIZE(B)
	LD	b3,  2 * SIZE(B)
	LD	b4,  3 * SIZE(B)

	dsra	L,  KK, 2
	blez	L, .L55
	move	BO,  B	#	delay slot: always executed


	.align	3
.L52:
	LD	a5,  1 * SIZE(AO)
	LD	b5,  4 * SIZE(BO)
	LD	b6,  5 * SIZE(BO)
	LD	b7,  6 * SIZE(BO)
	LD	b8,  7 * SIZE(BO)

	MADD	t11, t11, a1, b1	#	1st compute
	MADD	t12, t12, a1, b2
	MADD	t13, t13, a1, b3
	MADD	t14, t14, a1, b4

	LD	a3,   2 * SIZE(AO)
	LD	b1,   8 * SIZE(BO)
	LD	b2,   9 * SIZE(BO)
	LD	b3,  10 * SIZE(BO)
	LD	b4,  11 * SIZE(BO)

	MADD	t11, t11, a5, b5	#	2nd compute
	MADD	t12, t12, a5, b6
	MADD	t13, t13, a5, b7
	MADD	t14, t14, a5, b8

	LD	a7,   3 * SIZE(AO)
	LD	b5,  12 * SIZE(BO)
	LD	b6,  13 * SIZE(BO)
	LD	b7,  14 * SIZE(BO)
	LD	b8,  15 * SIZE(BO)

	MADD	t11, t11, a3, b1	#	3rd compute
	MADD	t12, t12, a3, b2
	MADD	t13, t13, a3, b3
	MADD	t14, t14, a3, b4

	daddiu	AO, AO,  4 * SIZE	#	AO += mr*4kr
	daddiu	BO, BO, 16 * SIZE	#	BO += 4nr*4kr

	LD	a1,  0 * SIZE(AO)	#	preload for the next step
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)

	MADD	t11, t11, a7, b5	#	4th compute
	MADD	t12, t12, a7, b6
	MADD	t13, t13, a7, b7
	MADD	t14, t14, a7, b8

	daddiu	L, L, -1
	bgtz	L, .L52
	nop


	.align 3
.L55:
	andi	L, KK,  3
	blez	L, .L58
	nop

	.align	3
.L56:
	MADD	t11, t11, a1, b1	#	one k step
	MADD	t12, t12, a1, b2
	MADD	t13, t13, a1, b3
	MADD	t14, t14, a1, b4

	daddiu	AO, AO,  1 * SIZE	#	AO += 1mr
	daddiu	BO, BO,  4 * SIZE	#	BO += 4nr

	LD	a1,  0 * SIZE(AO)	#	preload for the next step
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)

	daddiu	L, L, -1
	bgtz	L, .L56
	nop


.L58:	#	deal with the triangular part
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)

	SUB	t11, b1, t11
	SUB	t12, b2, t12
	SUB	t13, b3, t13
	SUB	t14, b4, t14


	LD	b1,  0 * SIZE(AO)	#	computes the triangular_part
	MUL	t11, b1, t11
	MUL	t12, b1, t12
	MUL	t13, b1, t13
	MUL	t14, b1, t14

	ST	t11,  0 * SIZE(BO)	#	update packed B
	ST	t12,  1 * SIZE(BO)
	ST	t13,  2 * SIZE(BO)
	ST	t14,  3 * SIZE(BO)

	ST	t11,  0 * SIZE(CO1)	#	write back
	ST	t12,  0 * SIZE(CO2)
	ST	t13,  0 * SIZE(CO3)
	ST	t14,  0 * SIZE(CO4)

	daddiu	CO1, CO1, 1 * SIZE
	daddiu	CO2, CO2, 1 * SIZE
	daddiu	CO3, CO3, 1 * SIZE
	daddiu	CO4, CO4, 1 * SIZE


	dsubu	TEMP, K, KK
	dsll	L,    TEMP, BASE_SHIFT	#	mr=1
	dsll	TEMP, TEMP, 2 + BASE_SHIFT
	daddu	AO, AO, L	#	mov AO to the end of Ai
	daddu	BO, BO, TEMP	#	mov BO to the end of Bj

	daddiu	KK, KK, 1	#	the length of rectangular data part increases by 1

	.align 3
.L29:
	move	B,  BO	#	B now points at the next packed panel Bj
	bgtz	J, .L10
	nop


	.align 3
.L30:
	andi	J,  N, 2	#	nr=2
	blez	J, .L70
	nop

	move	CO1, C
	daddu	CO2, C,   LDC

	MTC	$0,  t11	#	clear result registers
	MOV	t21, t11
	MOV	t31, t11
	MOV	t41, t11

	move	KK, OFFSET
	move	AO, A	#	reset A
	daddu	C,  CO2, LDC	#	C now points at the next column block

	dsra	I,  M, 2	#	I = mc/4
	blez	I, .L40
	nop

.L31:
	MOV	t12, t11
	MOV	t22, t11
	MOV	t32, t11
	MOV	t42, t11
	LD	a1,  0 * SIZE(AO)	#	rectangular part of Ai:
	LD	a2,  1 * SIZE(AO)	#	mr*KK with nr*KK
	LD	a3,  2 * SIZE(AO)
	LD	a4,  3 * SIZE(AO)	#	get 4a

	LD	b1,  0 * SIZE(B)	#	get 2b
	LD	b2,  1 * SIZE(B)

	dsra	L,  KK, 2	#	L=kk/4
	blez	L, .L35
	move	BO,  B	#	reset B (delay slot: always executed)


	.align	3
.L32:
	LD	a5,  4 * SIZE(AO)
	LD	a6,  5 * SIZE(AO)
	LD	a7,  6 * SIZE(AO)
	LD	a8,  7 * SIZE(AO)
	LD	b5,  2 * SIZE(BO)
	LD	b6,  3 * SIZE(BO)

	MADD	t11, t11, a1, b1	#	1st compute
	MADD	t21, t21, a2, b1
	MADD	t31, t31, a3, b1
	MADD	t41, t41, a4, b1
	MADD	t12, t12, a1, b2
	MADD	t22, t22, a2, b2
	MADD	t32, t32, a3, b2
	MADD	t42, t42, a4, b2

	LD	a1,   8 * SIZE(AO)
	LD	a2,   9 * SIZE(AO)
	LD	a3,  10 * SIZE(AO)
	LD	a4,  11 * SIZE(AO)
	LD	b3,   4 * SIZE(BO)
	LD	b4,   5 * SIZE(BO)

	MADD	t11, t11, a5, b5	#	2nd compute
	MADD	t21, t21, a6, b5
	MADD	t31, t31, a7, b5
	MADD	t41, t41, a8, b5
	MADD	t12, t12, a5, b6
	MADD	t22, t22, a6, b6
	MADD	t32, t32, a7, b6
	MADD	t42, t42, a8, b6

	LD	a5,  12 * SIZE(AO)
	LD	a6,  13 * SIZE(AO)
	LD	a7,  14 * SIZE(AO)
	LD	a8,  15 * SIZE(AO)
	LD	b7,  6 * SIZE(BO)
	LD	b8,  7 * SIZE(BO)

	MADD	t11, t11, a1, b3	#	3rd compute
	MADD	t21, t21, a2, b3
	MADD	t31, t31, a3, b3
	MADD	t41, t41, a4, b3
	MADD	t12, t12, a1, b4
	MADD	t22, t22, a2, b4
	MADD	t32, t32, a3, b4
	MADD	t42, t42, a4, b4

	daddiu	AO, AO, 16 * SIZE	#	AO += 4mr*4kr
	daddiu	BO, BO,  8 * SIZE	#	BO += 2nr*4kr

	LD	a1,  0 * SIZE(AO)	#	preload for the next step
	LD	a2,  1 * SIZE(AO)
	LD	a3,  2 * SIZE(AO)
	LD	a4,  3 * SIZE(AO)
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)

	MADD	t11, t11, a5, b7	#	4th compute
	MADD	t21, t21, a6, b7
	MADD	t31, t31, a7, b7
	MADD	t41, t41, a8, b7
	MADD	t12, t12, a5, b8
	MADD	t22, t22, a6, b8
	MADD	t32, t32, a7, b8
	MADD	t42, t42, a8, b8

	daddiu	L, L, -1
	bgtz	L, .L32
	nop


	.align 3

.L35:
	andi	L, KK,  3
	blez	L, .L38
	nop

	.align	3
.L36:
	MADD	t11, t11, a1, b1	#	one k step
	MADD	t21, t21, a2, b1
	MADD	t31, t31, a3, b1
	MADD	t41, t41, a4, b1

	MADD	t12, t12, a1, b2
	MADD	t22, t22, a2, b2
	MADD	t32, t32, a3, b2
	MADD	t42, t42, a4, b2

	daddiu	AO, AO,  4 * SIZE	#	AO += 4mr
	daddiu	BO, BO,  2 * SIZE	#	BO += 2nr

	LD	a1,  0 * SIZE(AO)	#	preload for the next step
	LD	a2,  1 * SIZE(AO)
	LD	a3,  2 * SIZE(AO)
	LD	a4,  3 * SIZE(AO)
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)

	daddiu	L, L, -1
	bgtz	L, .L36
	nop


.L38:	#	deal with the triangular part
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)
	LD	b5,  4 * SIZE(BO)
	LD	b6,  5 * SIZE(BO)
	LD	b7,  6 * SIZE(BO)
	LD	b8,  7 * SIZE(BO)

	SUB	t11, b1, t11
	SUB	t12, b2, t12
	SUB	t21, b3, t21
	SUB	t22, b4, t22
	SUB	t31, b5, t31
	SUB	t32, b6, t32
	SUB	t41, b7, t41
	SUB	t42, b8, t42

	LD	a1,  0 * SIZE(AO)	#	sa stores in col major
	LD	a2,  1 * SIZE(AO)
	LD	a3,  2 * SIZE(AO)
	LD	a4,  3 * SIZE(AO)
	MUL	t11, a1, t11
	MUL	t12, a1, t12
	NMSUB	t21, t21, a2, t11
	NMSUB	t22, t22, a2, t12
	NMSUB	t31, t31, a3, t11
	NMSUB	t32, t32, a3, t12
	NMSUB	t41, t41, a4, t11
	NMSUB	t42, t42, a4, t12


	LD	a5,  5 * SIZE(AO)
	LD	a6,  6 * SIZE(AO)
	LD	a7,  7 * SIZE(AO)
	MUL	t21, a5, t21
	MUL	t22, a5, t22
	NMSUB	t31, t31, a6, t21
	NMSUB	t32, t32, a6, t22
	NMSUB	t41, t41, a7, t21
	NMSUB	t42, t42, a7, t22


	LD	a8, 10 * SIZE(AO)
	LD	a1, 11 * SIZE(AO)
	MUL	t31, a8, t31
	MUL	t32, a8, t32
	NMSUB	t41, t41, a1, t31
	NMSUB	t42, t42, a1, t32


	LD	a2, 15 * SIZE(AO)
	MUL	t41, a2, t41
	MUL	t42, a2, t42

	ST	t11,  0 * SIZE(BO)	#	update packed B
	ST	t12,  1 * SIZE(BO)
	ST	t21,  2 * SIZE(BO)
	ST	t22,  3 * SIZE(BO)
	ST	t31,  4 * SIZE(BO)
	ST	t32,  5 * SIZE(BO)
	ST	t41,  6 * SIZE(BO)
	ST	t42,  7 * SIZE(BO)

	ST	t11,  0 * SIZE(CO1)	#	write back
	ST	t21,  1 * SIZE(CO1)
	ST	t31,  2 * SIZE(CO1)
	ST	t41,  3 * SIZE(CO1)
	ST	t12,  0 * SIZE(CO2)
	ST	t22,  1 * SIZE(CO2)
	ST	t32,  2 * SIZE(CO2)
	ST	t42,  3 * SIZE(CO2)

	daddiu	CO1, CO1, 4 * SIZE
	daddiu	CO2, CO2, 4 * SIZE

	dsubu	TEMP, K, KK
	dsll	L,    TEMP, 2 + BASE_SHIFT
	dsll	TEMP, TEMP, 1 + BASE_SHIFT
	daddu	AO, AO, L	#	move AO to the end of Ai
	daddu	BO, BO, TEMP	#	move BO to the end of Bj

	daddiu	KK, KK, 4	#	rectangular part grows by 4

	MTC	$0,  a1
	MOV	t11, a1
	MOV	t21, a1
	MOV	t31, a1
	MOV	t41, a1

	daddiu	I, I, -1
	bgtz	I, .L31
	nop


	.align 3
.L40:
	andi	I,  M, 2	#	mr=2,nr=2
	blez	I, .L60
	nop

	MOV	t12, t11	#	clear result registers
	MOV	t22, t21
	MOV	t32, t31
	MOV	t42, t41

	LD	a1,  0 * SIZE(AO)
	LD	a2,  1 * SIZE(AO)
	LD	b1,  0 * SIZE(B)
	LD	b2,  1 * SIZE(B)

	dsra	L,  KK, 2
	blez	L, .L45
	move	BO,  B	#	reset B (delay slot: always executed)


	.align	3
.L42:
	LD	a5,  2 * SIZE(AO)
	LD	a6,  3 * SIZE(AO)
	LD	b5,  2 * SIZE(BO)
	LD	b6,  3 * SIZE(BO)

	MADD	t11, t11, a1, b1	#	1st compute
	MADD	t21, t21, a2, b1
	MADD	t12, t12, a1, b2
	MADD	t22, t22, a2, b2

	LD	a3,  4 * SIZE(AO)
	LD	a4,  5 * SIZE(AO)
	LD	b3,  4 * SIZE(BO)
	LD	b4,  5 * SIZE(BO)

	MADD	t11, t11, a5, b5	#	2nd compute
	MADD	t21, t21, a6, b5
	MADD	t12, t12, a5, b6
	MADD	t22, t22, a6, b6

	LD	a7,  6 * SIZE(AO)
	LD	a8,  7 * SIZE(AO)
	LD	b7,  6 * SIZE(BO)
	LD	b8,  7 * SIZE(BO)

	MADD	t11, t11, a3, b3	#	3rd compute
	MADD	t21, t21, a4, b3
	MADD	t12, t12, a3, b4
	MADD	t22, t22, a4, b4

	daddiu	AO, AO,  8 * SIZE	#	AO += 2mr*4kr
	daddiu	BO, BO,  8 * SIZE	#	BO += 2nr*4kr

	LD	a1,  0 * SIZE(AO)	#	preload for the next step
	LD	a2,  1 * SIZE(AO)
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)

	MADD	t11, t11, a7, b7	#	4th compute
	MADD	t21, t21, a8, b7
	MADD	t12, t12, a7, b8
	MADD	t22, t22, a8, b8

	daddiu	L, L, -1
	bgtz	L, .L42
	nop

	.align 3

.L45:
	andi	L, KK,  3
	blez	L, .L48
	nop

	.align	3
.L46:
	MADD	t11, t11, a1, b1	#	one k step
	MADD	t21, t21, a2, b1
	MADD	t12, t12, a1, b2
	MADD	t22, t22, a2, b2

	daddiu	AO, AO,  2 * SIZE	#	AO += 2mr
	daddiu	BO, BO,  2 * SIZE	#	BO += 2nr

	LD	a1,  0 * SIZE(AO)	#	preload for the next step
	LD	a2,  1 * SIZE(AO)
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)

	daddiu	L, L, -1
	bgtz	L, .L46
	nop

.L48:	#	deal with the triangular part
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)

	SUB	t11, b1, t11
	SUB	t12, b2, t12
	SUB	t21, b3, t21
	SUB	t22, b4, t22

	LD	b1,  0 * SIZE(AO)	#	computes the triangular_part
	LD	b2,  1 * SIZE(AO)
	MUL	t11, b1, t11
	MUL	t12, b1, t12
	NMSUB	t21, t21, b2, t11
	NMSUB	t22, t22, b2, t12

	LD	b3,  3 * SIZE(AO)
	MUL	t21, b3, t21
	MUL	t22, b3, t22

	ST	t11,  0 * SIZE(BO)	#	update packed B
	ST	t12,  1 * SIZE(BO)
	ST	t21,  2 * SIZE(BO)
	ST	t22,  3 * SIZE(BO)

	ST	t11,  0 * SIZE(CO1)	#	write back
	ST	t21,  1 * SIZE(CO1)
	ST	t12,  0 * SIZE(CO2)
	ST	t22,  1 * SIZE(CO2)

	daddiu	CO1, CO1, 2 * SIZE
	daddiu	CO2, CO2, 2 * SIZE

	dsubu	TEMP, K, KK
	dsll	L,    TEMP, 1 + BASE_SHIFT
	dsll	TEMP, TEMP, 1 + BASE_SHIFT
	daddu	AO, AO, L
	daddu	BO, BO, TEMP

	daddiu	KK, KK, 2
	MTC	$0,  a1
	MOV	t11, a1
	MOV	t21, a1
	MOV	t31, a1
	MOV	t41, a1


	.align 3
.L60:
	andi	I,  M, 1	#	mr=1,nr=2
	blez	I, .L49
	nop

	MOV	t12, t11	#	clear result registers
	MOV	t22, t21
	MOV	t32, t31
	MOV	t42, t41

	LD	a1,  0 * SIZE(AO)
	LD	b1,  0 * SIZE(B)
	LD	b2,  1 * SIZE(B)

	dsra	L,  KK, 2
	blez	L, .L65
	move	BO,  B	#	reset B (delay slot: always executed)


	.align	3
.L62:
	LD	a5,  1 * SIZE(AO)
	LD	b5,  2 * SIZE(BO)
	LD	b6,  3 * SIZE(BO)

	MADD	t11, t11, a1, b1	#	1st compute
	MADD	t12, t12, a1, b2

	LD	a3,  2 * SIZE(AO)
	LD	b3,  4 * SIZE(BO)
	LD	b4,  5 * SIZE(BO)

	MADD	t11, t11, a5, b5	#	2nd compute
	MADD	t12, t12, a5, b6

	LD	a7,  3 * SIZE(AO)
	LD	b7,  6 * SIZE(BO)
	LD	b8,  7 * SIZE(BO)

	MADD	t11, t11, a3, b3	#	3rd compute
	MADD	t12, t12, a3, b4

	daddiu	AO, AO,  4 * SIZE	#	AO += mr*4kr
	daddiu	BO, BO,  8 * SIZE	#	BO += 2nr*4kr

	LD	a1,  0 * SIZE(AO)	#	preload for the next step
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)

	MADD	t11, t11, a7, b7	#	4th compute
	MADD	t12, t12, a7, b8

	daddiu	L, L, -1
	bgtz	L, .L62
	nop

	.align 3

.L65:
	andi	L, KK,  3
	blez	L, .L68
	nop

	.align	3
.L66:
	MADD	t11, t11, a1, b1	#	one k step
	MADD	t12, t12, a1, b2

	daddiu	AO, AO,  1 * SIZE	#	AO += 1mr
	daddiu	BO, BO,  2 * SIZE	#	BO += 2nr

	LD	a1,  0 * SIZE(AO)	#	preload for the next step
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)

	daddiu	L, L, -1
	bgtz	L, .L66
	nop

.L68:	#	deal with the triangular part
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)

	SUB	t11, b1, t11
	SUB	t12, b2, t12

	LD	b1,  0 * SIZE(AO)	#	computes the triangular_part
	MUL	t11, b1, t11
	MUL	t12, b1, t12

	ST	t11,  0 * SIZE(BO)	#	update packed B
	ST	t12,  1 * SIZE(BO)

	ST	t11,  0 * SIZE(CO1)	#	write back
	ST	t12,  0 * SIZE(CO2)

	daddiu	CO1, CO1, 1 * SIZE
	daddiu	CO2, CO2, 1 * SIZE

	dsubu	TEMP, K, KK
	dsll	L,    TEMP, BASE_SHIFT	#	mr=1
	dsll	TEMP, TEMP, 1 + BASE_SHIFT
	daddu	AO, AO, L
	daddu	BO, BO, TEMP

	daddiu	KK, KK, 1

	.align 3
.L49:
	move	B,  BO	#	B now points at the next packed panel

	.align 3

.L70:
	andi	J,  N, 1	#	nr=1
	blez	J, .L999	#	END
	nop

	move	CO1, C

	move	KK, OFFSET
	move	AO, A

	dsra	I,  M, 2
	blez	I, .L80
	nop

.L71:
	MTC	$0,  t11	#	clear result registers
	MOV	t21, t11
	MOV	t31, t11
	MOV	t41, t11

	LD	a1,  0 * SIZE(AO)	#	rectangular part of Ai:
	LD	a2,  1 * SIZE(AO)	#	mr*KK with nr*KK
	LD	a3,  2 * SIZE(AO)
	LD	a4,  3 * SIZE(AO)	#	get 4a

	LD	b1,  0 * SIZE(B)	#	get 1b

	dsra	L,  KK, 2
	blez	L, .L75
	move	BO,  B	#	reset B (delay slot: always executed)

	.align	3
.L72:
	LD	a5,  4 * SIZE(AO)
	LD	a6,  5 * SIZE(AO)
	LD	a7,  6 * SIZE(AO)
	LD	a8,  7 * SIZE(AO)

	LD	b5,  1 * SIZE(BO)

	MADD	t11, t11, a1, b1	#	1st compute
	MADD	t21, t21, a2, b1
	MADD	t31, t31, a3, b1
	MADD	t41, t41, a4, b1

	LD	a1,   8 * SIZE(AO)
	LD	a2,   9 * SIZE(AO)
	LD	a3,  10 * SIZE(AO)
	LD	a4,  11 * SIZE(AO)

	LD	b3,  2 * SIZE(BO)

	MADD	t11, t11, a5, b5	#	2nd compute
	MADD	t21, t21, a6, b5
	MADD	t31, t31, a7, b5
	MADD	t41, t41, a8, b5

	LD	a5,  12 * SIZE(AO)
	LD	a6,  13 * SIZE(AO)
	LD	a7,  14 * SIZE(AO)
	LD	a8,  15 * SIZE(AO)

	LD	b7,  3 * SIZE(BO)

	MADD	t11, t11, a1, b3	#	3rd compute
	MADD	t21, t21, a2, b3
	MADD	t31, t31, a3, b3
	MADD	t41, t41, a4, b3

	daddiu	AO, AO, 16 * SIZE	#	AO += 4mr*4kr
	daddiu	BO, BO,  4 * SIZE	#	BO += 1nr*4kr

	LD	a1,  0 * SIZE(AO)	#	preload for the next step
	LD	a2,  1 * SIZE(AO)
	LD	a3,  2 * SIZE(AO)
	LD	a4,  3 * SIZE(AO)

	LD	b1,  0 * SIZE(BO)

	MADD	t11, t11, a5, b7	#	4th compute
	MADD	t21, t21, a6, b7
	MADD	t31, t31, a7, b7
	MADD	t41, t41, a8, b7

	daddiu	L, L, -1
	bgtz	L, .L72
	nop

	.align 3

.L75:
	andi	L, KK,  3
	blez	L, .L78
	nop

	.align	3
.L76:
	MADD	t11, t11, a1, b1	#	one k step
	MADD	t21, t21, a2, b1
	MADD	t31, t31, a3, b1
	MADD	t41, t41, a4, b1

	daddiu	AO, AO,  4 * SIZE	#	AO += 4mr
	daddiu	BO, BO,  1 * SIZE	#	BO += 1nr

	LD	a1,  0 * SIZE(AO)	#	preload for the next step
	LD	a2,  1 * SIZE(AO)
	LD	a3,  2 * SIZE(AO)
	LD	a4,  3 * SIZE(AO)

	LD	b1,  0 * SIZE(BO)

	daddiu	L, L, -1
	bgtz	L, .L76
	nop

.L78:	#	deal with the triangular part
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)

	SUB	t11, b1, t11
	SUB	t21, b2, t21
	SUB	t31, b3, t31
	SUB	t41, b4, t41

	LD	a1,  0 * SIZE(AO)	#	sa stores in col major
	LD	a2,  1 * SIZE(AO)
	LD	a3,  2 * SIZE(AO)
	LD	a4,  3 * SIZE(AO)
	MUL	t11, a1, t11
	NMSUB	t21, t21, a2, t11
	NMSUB	t31, t31, a3, t11
	NMSUB	t41, t41, a4, t11

	LD	a5,  5 * SIZE(AO)
	LD	a6,  6 * SIZE(AO)
	LD	a7,  7 * SIZE(AO)
	MUL	t21, a5, t21
	NMSUB	t31, t31, a6, t21
	NMSUB	t41, t41, a7, t21

	LD	a8, 10 * SIZE(AO)
	LD	a1, 11 * SIZE(AO)
	MUL	t31, a8, t31
	NMSUB	t41, t41, a1, t31

	LD	a2, 15 * SIZE(AO)
	MUL	t41, a2, t41


	ST	t11,  0 * SIZE(BO)	#	update packed B
	ST	t21,  1 * SIZE(BO)
	ST	t31,  2 * SIZE(BO)
	ST	t41,  3 * SIZE(BO)

	ST	t11,  0 * SIZE(CO1)	#	write back
	ST	t21,  1 * SIZE(CO1)
	ST	t31,  2 * SIZE(CO1)
	ST	t41,  3 * SIZE(CO1)

	daddiu	CO1, CO1, 4 * SIZE

	dsubu	TEMP, K, KK
	dsll	L,    TEMP, 2 + BASE_SHIFT
	dsll	TEMP, TEMP, 0 + BASE_SHIFT
	daddu	AO, AO, L
	daddu	BO, BO, TEMP

	daddiu	KK, KK, 4
	daddiu	I, I, -1
	bgtz	I, .L71
	nop


	.align 3

.L80:
	andi	I,  M, 2	#	mr=2,nr=1
	blez	I, .L90
	NOP

	MTC	$0,  t11
	MOV	t21, t11	#	clear result registers

	LD	a1,  0 * SIZE(AO)
	LD	a2,  1 * SIZE(AO)
	LD	b1,  0 * SIZE(B)

	dsra	L,  KK, 2
	blez	L, .L85
	move	BO,  B	#	delay slot: always executed

	.align	3
.L82:
	LD	a5,  2 * SIZE(AO)
	LD	a6,  3 * SIZE(AO)

	LD	b5,  1 * SIZE(BO)

	MADD	t11, t11, a1, b1	#	1st compute
	MADD	t21, t21, a2, b1

	LD	a3,  4 * SIZE(AO)
	LD	a4,  5 * SIZE(AO)

	LD	b3,  2 * SIZE(BO)

	MADD	t11, t11, a5, b5	#	2nd compute
	MADD	t21, t21, a6, b5

	LD	a7,  6 * SIZE(AO)
	LD	a8,  7 * SIZE(AO)

	LD	b7,  3 * SIZE(BO)

	MADD	t11, t11, a3, b3	#	3rd compute
	MADD	t21, t21, a4, b3

	daddiu	AO, AO,  8 * SIZE	#	AO += 2mr*4kr
	daddiu	BO, BO,  4 * SIZE	#	BO += 1nr*4kr

	LD	a1,  0 * SIZE(AO)	#	preload for the next step
	LD	a2,  1 * SIZE(AO)

	LD	b1,  0 * SIZE(BO)

	MADD	t11, t11, a7, b7	#	4th compute
	MADD	t21, t21, a8, b7

	daddiu	L, L, -1
	bgtz	L, .L82
	nop

	.align 3

.L85:
	andi	L, KK,  3
	blez	L, .L88
	nop

	.align	3
.L86:
	MADD	t11, t11, a1, b1	#	one k step
	MADD	t21, t21, a2, b1

	daddiu	AO, AO,  2 * SIZE	#	AO += 2mr
	daddiu	BO, BO,  1 * SIZE	#	BO += 1nr

	LD	a1,  0 * SIZE(AO)	#	preload for the next step
	LD	a2,  1 * SIZE(AO)

	LD	b1,  0 * SIZE(BO)

	daddiu	L, L, -1
	bgtz	L, .L86
	nop


.L88:	#	deal with the triangular part
	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)

	SUB	t11, b1, t11
	SUB	t21, b2, t21

	LD	b1,  0 * SIZE(AO)	#	computes the triangular_part
	LD	b2,  1 * SIZE(AO)
	MUL	t11, b1, t11
	NMSUB	t21, t21, b2, t11

	LD	b3,  3 * SIZE(AO)
	MUL	t21, b3, t21

	ST	t11,  0 * SIZE(BO)	#	update packed B
	ST	t21,  1 * SIZE(BO)

	ST	t11,  0 * SIZE(CO1)	#	write back
	ST	t21,  1 * SIZE(CO1)


	daddiu	CO1, CO1, 2 * SIZE

	dsubu	TEMP, K, KK
	dsll	L,    TEMP, 1 + BASE_SHIFT
	dsll	TEMP, TEMP, 0 + BASE_SHIFT
	daddu	AO, AO, L
	daddu	BO, BO, TEMP

	daddiu	KK, KK, 2


	.align 3
.L90:
	andi	I,  M, 1	#	mr=1,nr=1
	blez	I, .L89
	NOP

	MTC	$0,  t11

	LD	a1,  0 * SIZE(AO)
	LD	b1,  0 * SIZE(B)

	dsra	L,  KK, 2
	blez	L, .L95
	move	BO,  B	#	delay slot: always executed

	.align	3
.L92:
	LD	a5,  1 * SIZE(AO)
	LD	b5,  1 * SIZE(BO)

	MADD	t11, t11, a1, b1	#	1st compute

	LD	a3,  2 * SIZE(AO)
	LD	b3,  2 * SIZE(BO)

	MADD	t11, t11, a5, b5	#	2nd compute

	LD	a7,  3 * SIZE(AO)
	LD	b7,  3 * SIZE(BO)

	MADD	t11, t11, a3, b3	#	3rd compute

	daddiu	AO, AO,  4 * SIZE	#	AO += 1mr*4kr
	daddiu	BO, BO,  4 * SIZE	#	BO += 1nr*4kr

	LD	a1,  0 * SIZE(AO)	#	preload for the next step
	LD	b1,  0 * SIZE(BO)

	MADD	t11, t11, a7, b7	#	4th compute

	daddiu	L, L, -1
	bgtz	L, .L92
	nop

	.align 3
.L95:
	andi	L, KK,  3
	blez	L, .L98
	nop

	.align	3
.L96:
	MADD	t11, t11, a1, b1	#	one k step

	daddiu	AO, AO,  1 * SIZE	#	AO += 1mr
	daddiu	BO, BO,  1 * SIZE	#	BO += 1nr

	LD	a1,  0 * SIZE(AO)	#	preload for the next step
	LD	b1,  0 * SIZE(BO)

	daddiu	L, L, -1
	bgtz	L, .L96
	nop


.L98:	#	deal with the triangular part
	LD	b1,  0 * SIZE(BO)

	SUB	t11, b1, t11

	LD	b1,  0 * SIZE(AO)	#	computes the triangular_part
	MUL	t11, b1, t11

	ST	t11,  0 * SIZE(BO)	#	update packed B

	ST	t11,  0 * SIZE(CO1)	#	write back


	daddiu	CO1, CO1, 1 * SIZE

	dsubu	TEMP, K, KK
	dsll	L,    TEMP,  BASE_SHIFT
	dsll	TEMP, TEMP,  BASE_SHIFT
	daddu	AO, AO, L
	daddu	BO, BO, TEMP

	daddiu	KK, KK, 1


	.align 3
.L89:
	move	B,  BO


	.align 3

.L999:
	/* Restore everything saved in the prologue; offsets must match it. */
	LDARG	$16,   0($sp)
	LDARG	$17,   8($sp)
	LDARG	$18,  16($sp)
	LDARG	$19,  24($sp)
	LDARG	$20,  32($sp)
	LDARG	$21,  40($sp)
	ldc1	$f24, 48($sp)
	ldc1	$f25, 56($sp)
	ldc1	$f26, 64($sp)
	ldc1	$f27, 72($sp)
	ldc1	$f28, 80($sp)

	LDARG	$22,  88($sp)
	LDARG	$23,  96($sp)
	LDARG	$24, 104($sp)
	LDARG	$25, 112($sp)

	ldc1	$f29,120($sp)
	ldc1	$f30,128($sp)
	ldc1	$f31,136($sp)

#ifndef __64BIT__
	ldc1	$f20,144($sp)
	ldc1	$f21,152($sp)
	ldc1	$f22,160($sp)
	ldc1	$f23,168($sp)
#endif

	j	$31
	daddiu	$sp, $sp, 176	#	delay slot: release the frame

	EPILOGUE