/* Entry symbol name and assembler-mode switch. Both are defined before
   common.h is included (presumably that header keys its assembler-side
   macros off ASSEMBLER - confirm against common.h). */
#define REALNAME ASMNAME
#define ASSEMBLER
#include "common.h"
/* Integer inputs of the kernel: problem sizes M, N, K, the packed operand
   pointers A and B, the output pointer C and its leading dimension LDC
   (LDC is scaled by BASE_SHIFT at entry of the kernel). */
#define M $4
#define N $5
#define K $6
#define A $8
#define B $9
#define C $10
#define LDC $11
/* Running pointers into the current A panel (AO) and B panel (BO). */
#define AO $12
#define BO $13
/* Loop counters: I over row tiles, J over column tiles, L over the k loop. */
#define I $2
#define J $3
#define L $7
/* Write-back pointers, one per column of the current C tile. */
#define CO1 $14
#define CO2 $15
#define CO3 $16
#define CO4 $17
/* OFFSET is loaded from the stack in the prologue; KK is initialised from
   OFFSET for every column panel and is the trip count of the k loops.
   TEMP and AORIG are scratch names (AORIG is not referenced in the
   visible kernel body). */
#define OFFSET $22
#define KK $23
#define TEMP $24
#define AORIG $25
/* Floating-point staging registers for values loaded from the A panel. */
#define a1 $f0
#define a2 $f1
#define a3 $f2
#define a4 $f3
#define a5 $f4
#define a6 $f5
#define a7 $f6
#define a8 $f7
/* Floating-point staging registers for values loaded from the B panel. */
#define b1 $f8
#define b2 $f9
#define b3 $f10
#define b4 $f11
#define b5 $f12
#define b6 $f13
#define b7 $f14
#define b8 $f15
/* Accumulators of the register tile: tRC holds row R, column C (the
   write-back stores tR1 to CO1, tR2 to CO2, and so on). */
#define t11 $f16
#define t21 $f17
#define t31 $f18
#define t41 $f19
#define t12 $f20
#define t22 $f21
#define t32 $f22
#define t42 $f23
#define t13 $f24
#define t23 $f25
#define t33 $f26
#define t43 $f27
#define t14 $f28
#define t24 $f29
#define t34 $f30
#define t44 $f31
/* ALPHA shares $f15 with b8; it is not referenced in the visible kernel body. */
#define ALPHA $f15
/*
 * TRSM micro-kernel, LT variant ("compute from left to right, top to
 * bottom").
 *
 * Inputs (see the register aliases defined above this function):
 *   M, N, K      tile counts / panel depth
 *   A, B         packed operand panels (B is updated in place)
 *   C, LDC       output matrix and its leading dimension in elements
 *   OFFSET       9th argument, read from the caller stack
 *
 * Structure: column panels of width 4, 2, 1 (labels .L10, .L30, .L70);
 * inside each, row tiles of height 4, 2, 1.  For every tile:
 *   1. accumulate the rectangular part: acc += a * b over KK steps
 *      (unrolled by 4, then a remainder loop of KK % 4 steps);
 *   2. form  b - acc  from the packed B values at BO;
 *   3. forward-substitute through the triangular block at AO.  The
 *      diagonal entries are applied with MUL, so the packed A panel
 *      presumably stores the reciprocals of the diagonal - confirm
 *      against the packing routine;
 *   4. store the solution both to packed B and to C, then advance AO/BO
 *      to the end of their panels and grow KK by the tile height.
 *
 * Branch delay slots are used explicitly (for example "move BO, B" after
 * a blez, and the stack release after "j $31"), so this code relies on
 * the assembler running in noreorder mode - presumably set by PROLOGUE.
 *
 * Stack frame (TRSM_FRAME_SIZE bytes, 16-byte aligned):
 *     0.. 40   $16-$21
 *    48.. 80   $f24-$f28
 *    88..112   $22-$25
 *   120..136   $f29-$f31   (written as t24/t34/t44; callee-saved FP regs)
 *   144..168   $f20-$f23   (only when __64BIT__ is not defined)
 * The previous 144-byte frame had no slots for $f29-$f31 and let $f20
 * share offset 112 with $25 in the non-__64BIT__ build.
 */
#define TRSM_FRAME_SIZE 176

PROLOGUE
	daddiu $sp, $sp, -TRSM_FRAME_SIZE
	SDARG $16, 0($sp)
	SDARG $17, 8($sp)
	SDARG $18, 16($sp)
	SDARG $19, 24($sp)
	SDARG $20, 32($sp)
	SDARG $21, 40($sp)
	sdc1 $f24, 48($sp)
	sdc1 $f25, 56($sp)
	sdc1 $f26, 64($sp)
	sdc1 $f27, 72($sp)
	sdc1 $f28, 80($sp)
	SDARG $22, 88($sp)
	SDARG $23, 96($sp)
	SDARG $24, 104($sp)
	SDARG $25, 112($sp)
	sdc1 $f29, 120($sp)	# t24, t34, t44 live here and must be preserved
	sdc1 $f30, 128($sp)
	sdc1 $f31, 136($sp)
#ifndef __64BIT__
	sdc1 $f20, 144($sp)	# own slots, no longer overlapping $25
	sdc1 $f21, 152($sp)
	sdc1 $f22, 160($sp)
	sdc1 $f23, 168($sp)
#endif

	/* LT compute from left to right, top to bottom */
	LDARG OFFSET, TRSM_FRAME_SIZE($sp)	# first stack argument, just above our frame
	dsll LDC, LDC, BASE_SHIFT # ldc in bytes
	dsra J, N, 2 # j = nc/4
	blez J, .L30
	nop

/* ------------------------------------------------------------------ */
/* Column panels of width 4                                           */
/* ------------------------------------------------------------------ */
.L10: # nr=4
	daddiu J, J, -1
	move CO1, C
	daddu CO2, C, LDC
	daddu CO3, CO2, LDC
	daddu CO4, CO3, LDC
	MTC $0, t11 # clear result registers
	MOV t21, t11
	MOV t31, t11
	MOV t41, t11
	MOV t12, t11
	MOV t22, t11
	MOV t32, t11
	MOV t42, t11
	dsra I, M, 2 # i = mc/4
	move KK, OFFSET # kk is the length of the rectangular data part of panel Ai
	move AO, A # reset A
	daddu C, CO4, LDC # fixed pointer C, the write back address
	blez I, .L20
	nop

/* 4x4 tile */
.L11: # mr=4
	LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai
	LD a2, 1 * SIZE(AO) # mr*KK with nr*KK
	LD a3, 2 * SIZE(AO)
	LD a4, 3 * SIZE(AO) # get 4a
	LD b1, 0 * SIZE(B) # get 4b
	LD b2, 1 * SIZE(B)
	LD b3, 2 * SIZE(B)
	LD b4, 3 * SIZE(B)
	MOV t13, t11 # clear result registers
	MOV t23, t11
	MOV t33, t11
	MOV t43, t11
	MOV t14, t11
	MOV t24, t11
	MOV t34, t11
	MOV t44, t11
	dsra L, KK, 2 # L = kk/4
	blez L, .L15
	move BO, B # delay slot: BO restarts at the head of panel Bj
	.align 3
.L12:
	LD a5, 4 * SIZE(AO)
	LD a6, 5 * SIZE(AO)
	LD a7, 6 * SIZE(AO)
	LD a8, 7 * SIZE(AO)
	LD b5, 4 * SIZE(BO)
	LD b6, 5 * SIZE(BO)
	LD b7, 6 * SIZE(BO)
	LD b8, 7 * SIZE(BO)
	MADD t11, t11, a1, b1 # 1st compute
	MADD t21, t21, a2, b1
	MADD t31, t31, a3, b1
	MADD t41, t41, a4, b1
	MADD t12, t12, a1, b2
	MADD t22, t22, a2, b2
	MADD t32, t32, a3, b2
	MADD t42, t42, a4, b2
	MADD t13, t13, a1, b3
	MADD t23, t23, a2, b3
	MADD t33, t33, a3, b3
	MADD t43, t43, a4, b3
	MADD t14, t14, a1, b4
	MADD t24, t24, a2, b4
	MADD t34, t34, a3, b4
	MADD t44, t44, a4, b4
	LD a1, 8 * SIZE(AO)
	LD a2, 9 * SIZE(AO)
	LD a3, 10 * SIZE(AO)
	LD a4, 11 * SIZE(AO)
	LD b1, 8 * SIZE(BO)
	LD b2, 9 * SIZE(BO)
	LD b3, 10 * SIZE(BO)
	LD b4, 11 * SIZE(BO)
	MADD t11, t11, a5, b5 # 2nd compute
	MADD t21, t21, a6, b5
	MADD t31, t31, a7, b5
	MADD t41, t41, a8, b5
	MADD t12, t12, a5, b6
	MADD t22, t22, a6, b6
	MADD t32, t32, a7, b6
	MADD t42, t42, a8, b6
	MADD t13, t13, a5, b7
	MADD t23, t23, a6, b7
	MADD t33, t33, a7, b7
	MADD t43, t43, a8, b7
	MADD t14, t14, a5, b8
	MADD t24, t24, a6, b8
	MADD t34, t34, a7, b8
	MADD t44, t44, a8, b8
	LD a5, 12 * SIZE(AO)
	LD a6, 13 * SIZE(AO)
	LD a7, 14 * SIZE(AO)
	LD a8, 15 * SIZE(AO)
	LD b5, 12 * SIZE(BO)
	LD b6, 13 * SIZE(BO)
	LD b7, 14 * SIZE(BO)
	LD b8, 15 * SIZE(BO)
	MADD t11, t11, a1, b1 # 3rd compute
	MADD t21, t21, a2, b1
	MADD t31, t31, a3, b1
	MADD t41, t41, a4, b1
	MADD t12, t12, a1, b2
	MADD t22, t22, a2, b2
	MADD t32, t32, a3, b2
	MADD t42, t42, a4, b2
	MADD t13, t13, a1, b3
	MADD t23, t23, a2, b3
	MADD t33, t33, a3, b3
	MADD t43, t43, a4, b3
	MADD t14, t14, a1, b4
	MADD t24, t24, a2, b4
	MADD t34, t34, a3, b4
	MADD t44, t44, a4, b4
	daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
	daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
	LD a1, 0 * SIZE(AO) # next
	LD a2, 1 * SIZE(AO)
	LD a3, 2 * SIZE(AO)
	LD a4, 3 * SIZE(AO)
	LD b1, 0 * SIZE(BO)
	LD b2, 1 * SIZE(BO)
	LD b3, 2 * SIZE(BO)
	LD b4, 3 * SIZE(BO)
	MADD t11, t11, a5, b5 # 4th compute
	MADD t21, t21, a6, b5
	MADD t31, t31, a7, b5
	MADD t41, t41, a8, b5
	MADD t12, t12, a5, b6
	MADD t22, t22, a6, b6
	MADD t32, t32, a7, b6
	MADD t42, t42, a8, b6
	MADD t13, t13, a5, b7
	MADD t23, t23, a6, b7
	MADD t33, t33, a7, b7
	MADD t43, t43, a8, b7
	MADD t14, t14, a5, b8
	MADD t24, t24, a6, b8
	MADD t34, t34, a7, b8
	MADD t44, t44, a8, b8
	daddiu L, L, -1
	bgtz L, .L12
	nop
	.align 3
.L15:
	andi L, KK, 3 # the remainder part: KK % 4
	blez L, .L18
	nop
	.align 3
.L16:
	MADD t11, t11, a1, b1 # one remainder k step
	MADD t21, t21, a2, b1
	MADD t31, t31, a3, b1
	MADD t41, t41, a4, b1
	MADD t12, t12, a1, b2
	MADD t22, t22, a2, b2
	MADD t32, t32, a3, b2
	MADD t42, t42, a4, b2
	MADD t13, t13, a1, b3
	MADD t23, t23, a2, b3
	MADD t33, t33, a3, b3
	MADD t43, t43, a4, b3
	MADD t14, t14, a1, b4
	MADD t24, t24, a2, b4
	MADD t34, t34, a3, b4
	MADD t44, t44, a4, b4
	daddiu AO, AO, 4 * SIZE # AO += 4mr
	daddiu BO, BO, 4 * SIZE # BO += 4nr
	LD a1, 0 * SIZE(AO) # next
	LD a2, 1 * SIZE(AO)
	LD a3, 2 * SIZE(AO)
	LD a4, 3 * SIZE(AO)
	LD b1, 0 * SIZE(BO)
	LD b2, 1 * SIZE(BO)
	LD b3, 2 * SIZE(BO)
	LD b4, 3 * SIZE(BO)
	daddiu L, L, -1
	bgtz L, .L16
	nop
.L18: # deal with the triangular data part of panel Ai
	LD b1, 0 * SIZE(BO) # triangular_part*X + rectangular_part = B
	LD b2, 1 * SIZE(BO) # triangular_part*X = B - rectangular_part
	LD b3, 2 * SIZE(BO)
	LD b4, 3 * SIZE(BO)
	SUB t11, b1, t11
	SUB t12, b2, t12
	SUB t13, b3, t13
	SUB t14, b4, t14
	LD b5, 4 * SIZE(BO) # sb store in row major
	LD b6, 5 * SIZE(BO)
	LD b7, 6 * SIZE(BO)
	LD b8, 7 * SIZE(BO)
	SUB t21, b5, t21
	SUB t22, b6, t22
	SUB t23, b7, t23
	SUB t24, b8, t24
	LD b1, 8 * SIZE(BO)
	LD b2, 9 * SIZE(BO)
	LD b3, 10 * SIZE(BO)
	LD b4, 11 * SIZE(BO)
	SUB t31, b1, t31
	SUB t32, b2, t32
	SUB t33, b3, t33
	SUB t34, b4, t34
	LD b5, 12 * SIZE(BO)
	LD b6, 13 * SIZE(BO)
	LD b7, 14 * SIZE(BO)
	LD b8, 15 * SIZE(BO)
	SUB t41, b5, t41
	SUB t42, b6, t42
	SUB t43, b7, t43
	SUB t44, b8, t44
	LD a1, 0 * SIZE(AO) # sa stores in col major
	LD a2, 1 * SIZE(AO)
	LD a3, 2 * SIZE(AO)
	LD a4, 3 * SIZE(AO)
	MUL t11, a1, t11 # row 1: scale by the diagonal entry
	MUL t12, a1, t12
	MUL t13, a1, t13
	MUL t14, a1, t14
	NMSUB t21, t21, a2, t11 # remove row 1 from rows 2..4
	NMSUB t22, t22, a2, t12
	NMSUB t23, t23, a2, t13
	NMSUB t24, t24, a2, t14
	NMSUB t31, t31, a3, t11
	NMSUB t32, t32, a3, t12
	NMSUB t33, t33, a3, t13
	NMSUB t34, t34, a3, t14
	NMSUB t41, t41, a4, t11
	NMSUB t42, t42, a4, t12
	NMSUB t43, t43, a4, t13
	NMSUB t44, t44, a4, t14
	LD a5, 5 * SIZE(AO)
	LD a6, 6 * SIZE(AO)
	LD a7, 7 * SIZE(AO)
	MUL t21, a5, t21 # row 2
	MUL t22, a5, t22
	MUL t23, a5, t23
	MUL t24, a5, t24
	NMSUB t31, t31, a6, t21
	NMSUB t32, t32, a6, t22
	NMSUB t33, t33, a6, t23
	NMSUB t34, t34, a6, t24
	NMSUB t41, t41, a7, t21
	NMSUB t42, t42, a7, t22
	NMSUB t43, t43, a7, t23
	NMSUB t44, t44, a7, t24
	LD a8, 10 * SIZE(AO)
	LD a1, 11 * SIZE(AO)
	MUL t31, a8, t31 # row 3
	MUL t32, a8, t32
	MUL t33, a8, t33
	MUL t34, a8, t34
	NMSUB t41, t41, a1, t31
	NMSUB t42, t42, a1, t32
	NMSUB t43, t43, a1, t33
	NMSUB t44, t44, a1, t34
	LD a2, 15 * SIZE(AO)
	MUL t41, a2, t41 # row 4
	MUL t42, a2, t42
	MUL t43, a2, t43
	MUL t44, a2, t44
	ST t11, 0 * SIZE(BO) # update packed B
	ST t12, 1 * SIZE(BO)
	ST t13, 2 * SIZE(BO)
	ST t14, 3 * SIZE(BO)
	ST t21, 4 * SIZE(BO)
	ST t22, 5 * SIZE(BO)
	ST t23, 6 * SIZE(BO)
	ST t24, 7 * SIZE(BO)
	ST t31, 8 * SIZE(BO)
	ST t32, 9 * SIZE(BO)
	ST t33, 10 * SIZE(BO)
	ST t34, 11 * SIZE(BO)
	ST t41, 12 * SIZE(BO)
	ST t42, 13 * SIZE(BO)
	ST t43, 14 * SIZE(BO)
	ST t44, 15 * SIZE(BO)
	ST t11, 0 * SIZE(CO1) # write back
	ST t21, 1 * SIZE(CO1)
	ST t31, 2 * SIZE(CO1)
	ST t41, 3 * SIZE(CO1)
	ST t12, 0 * SIZE(CO2)
	ST t22, 1 * SIZE(CO2)
	ST t32, 2 * SIZE(CO2)
	ST t42, 3 * SIZE(CO2)
	ST t13, 0 * SIZE(CO3)
	ST t23, 1 * SIZE(CO3)
	ST t33, 2 * SIZE(CO3)
	ST t43, 3 * SIZE(CO3)
	ST t14, 0 * SIZE(CO4)
	ST t24, 1 * SIZE(CO4)
	ST t34, 2 * SIZE(CO4)
	ST t44, 3 * SIZE(CO4)
	daddiu CO1, CO1, 4 * SIZE # fixed pointers
	daddiu CO2, CO2, 4 * SIZE
	daddiu CO3, CO3, 4 * SIZE
	daddiu CO4, CO4, 4 * SIZE
	dsubu TEMP, K, KK
	dsll L, TEMP, 2 + BASE_SHIFT
	dsll TEMP, TEMP, 2 + BASE_SHIFT
	daddu AO, AO, L # mov AO to the end of panel Ai
	daddu BO, BO, TEMP # mov BO to the end of panel Bj
	daddiu KK, KK, 4 # the length of rectangular data part increases by 4
	daddiu I, I, -1
	MTC $0, a1 # re-zero the accumulators shared with the next tile
	MOV t11, a1
	MOV t21, a1
	MOV t31, a1
	MOV t41, a1
	MOV t12, a1
	MOV t22, a1
	MOV t32, a1
	MOV t42, a1
	bgtz I, .L11
	nop
	.align 3

/* 2x4 tile */
.L20:
	andi I, M, 2 # mr=2,nr=4
	blez I, .L50
	nop
	MOV t13, t11 # t11 is zero here, clear the remaining accumulators
	MOV t23, t11
	MOV t33, t11
	MOV t43, t11
	MOV t14, t11
	MOV t24, t11
	MOV t34, t11
	MOV t44, t11
	LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai
	LD a2, 1 * SIZE(AO) # mr*KK with nr*KK
	LD b1, 0 * SIZE(B) # get 4b
	LD b2, 1 * SIZE(B)
	LD b3, 2 * SIZE(B)
	LD b4, 3 * SIZE(B)
	dsra L, KK, 2
	blez L, .L25
	move BO, B
	.align 3
.L22:
	LD a5, 2 * SIZE(AO)
	LD a6, 3 * SIZE(AO)
	LD b5, 4 * SIZE(BO)
	LD b6, 5 * SIZE(BO)
	LD b7, 6 * SIZE(BO)
	LD b8, 7 * SIZE(BO)
	MADD t11, t11, a1, b1 # 1st compute
	MADD t21, t21, a2, b1
	MADD t12, t12, a1, b2
	MADD t22, t22, a2, b2
	MADD t13, t13, a1, b3
	MADD t23, t23, a2, b3
	MADD t14, t14, a1, b4
	MADD t24, t24, a2, b4
	LD a3, 4 * SIZE(AO)
	LD a4, 5 * SIZE(AO)
	LD b1, 8 * SIZE(BO)
	LD b2, 9 * SIZE(BO)
	LD b3, 10 * SIZE(BO)
	LD b4, 11 * SIZE(BO)
	MADD t11, t11, a5, b5 # 2nd compute
	MADD t21, t21, a6, b5
	MADD t12, t12, a5, b6
	MADD t22, t22, a6, b6
	MADD t13, t13, a5, b7
	MADD t23, t23, a6, b7
	MADD t14, t14, a5, b8
	MADD t24, t24, a6, b8
	LD a7, 6 * SIZE(AO)
	LD a8, 7 * SIZE(AO)
	LD b5, 12 * SIZE(BO)
	LD b6, 13 * SIZE(BO)
	LD b7, 14 * SIZE(BO)
	LD b8, 15 * SIZE(BO)
	MADD t11, t11, a3, b1 # 3rd compute
	MADD t21, t21, a4, b1
	MADD t12, t12, a3, b2
	MADD t22, t22, a4, b2
	MADD t13, t13, a3, b3
	MADD t23, t23, a4, b3
	MADD t14, t14, a3, b4
	MADD t24, t24, a4, b4
	daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
	daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
	LD a1, 0 * SIZE(AO) # next
	LD a2, 1 * SIZE(AO)
	LD b1, 0 * SIZE(BO)
	LD b2, 1 * SIZE(BO)
	LD b3, 2 * SIZE(BO)
	LD b4, 3 * SIZE(BO)
	MADD t11, t11, a7, b5 # 4th compute
	MADD t21, t21, a8, b5
	MADD t12, t12, a7, b6
	MADD t22, t22, a8, b6
	MADD t13, t13, a7, b7
	MADD t23, t23, a8, b7
	MADD t14, t14, a7, b8
	MADD t24, t24, a8, b8
	daddiu L, L, -1
	bgtz L, .L22
	nop
	.align 3
.L25:
	andi L, KK, 3
	blez L, .L28
	nop
	.align 3
.L26:
	MADD t11, t11, a1, b1 # one remainder k step
	MADD t21, t21, a2, b1
	MADD t12, t12, a1, b2
	MADD t22, t22, a2, b2
	MADD t13, t13, a1, b3
	MADD t23, t23, a2, b3
	MADD t14, t14, a1, b4
	MADD t24, t24, a2, b4
	daddiu AO, AO, 2 * SIZE # AO += 2mr
	daddiu BO, BO, 4 * SIZE # BO += 4nr
	LD a1, 0 * SIZE(AO) # next
	LD a2, 1 * SIZE(AO)
	LD b1, 0 * SIZE(BO)
	LD b2, 1 * SIZE(BO)
	LD b3, 2 * SIZE(BO)
	LD b4, 3 * SIZE(BO)
	daddiu L, L, -1
	bgtz L, .L26
	nop
.L28: # deal with the triangular part
	LD b1, 0 * SIZE(BO)
	LD b2, 1 * SIZE(BO)
	LD b3, 2 * SIZE(BO)
	LD b4, 3 * SIZE(BO)
	LD b5, 4 * SIZE(BO)
	LD b6, 5 * SIZE(BO)
	LD b7, 6 * SIZE(BO)
	LD b8, 7 * SIZE(BO)
	SUB t11, b1, t11
	SUB t12, b2, t12
	SUB t13, b3, t13
	SUB t14, b4, t14
	SUB t21, b5, t21
	SUB t22, b6, t22
	SUB t23, b7, t23
	SUB t24, b8, t24
	LD b1, 0 * SIZE(AO) # computes the triangular_part
	LD b2, 1 * SIZE(AO)
	MUL t11, b1, t11
	MUL t12, b1, t12
	MUL t13, b1, t13
	MUL t14, b1, t14
	NMSUB t21, t21, b2, t11
	NMSUB t22, t22, b2, t12
	NMSUB t23, t23, b2, t13
	NMSUB t24, t24, b2, t14
	LD b3, 3 * SIZE(AO)
	MUL t21, b3, t21
	MUL t22, b3, t22
	MUL t23, b3, t23
	MUL t24, b3, t24
	ST t11, 0 * SIZE(BO)
	ST t12, 1 * SIZE(BO)
	ST t13, 2 * SIZE(BO)
	ST t14, 3 * SIZE(BO)
	ST t21, 4 * SIZE(BO)
	ST t22, 5 * SIZE(BO)
	ST t23, 6 * SIZE(BO)
	ST t24, 7 * SIZE(BO)
	ST t11, 0 * SIZE(CO1)
	ST t21, 1 * SIZE(CO1)
	ST t12, 0 * SIZE(CO2)
	ST t22, 1 * SIZE(CO2)
	ST t13, 0 * SIZE(CO3)
	ST t23, 1 * SIZE(CO3)
	ST t14, 0 * SIZE(CO4)
	ST t24, 1 * SIZE(CO4)
	daddiu CO1, CO1, 2 * SIZE
	daddiu CO2, CO2, 2 * SIZE
	daddiu CO3, CO3, 2 * SIZE
	daddiu CO4, CO4, 2 * SIZE
	dsubu TEMP, K, KK
	dsll L, TEMP, 1 + BASE_SHIFT
	dsll TEMP, TEMP, 2 + BASE_SHIFT
	daddu AO, AO, L # mov AO to the end of Ai
	daddu BO, BO, TEMP # mov BO to the end of Bj
	daddiu KK, KK, 2 # the length of rectangular data part increases by 2
	MTC $0, a1
	MOV t11, a1
	MOV t21, a1
	MOV t31, a1
	MOV t41, a1
	MOV t12, a1
	MOV t22, a1
	MOV t32, a1
	MOV t42, a1
	.align 3

/* 1x4 tile */
.L50:
	andi I, M, 1 # mr=1,nr=4
	blez I, .L29
	nop
	MOV t13, t11
	MOV t23, t11
	MOV t33, t11
	MOV t43, t11
	MOV t14, t11
	MOV t24, t11
	MOV t34, t11
	MOV t44, t11
	LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai
	LD b1, 0 * SIZE(B) # get 4b
	LD b2, 1 * SIZE(B)
	LD b3, 2 * SIZE(B)
	LD b4, 3 * SIZE(B)
	dsra L, KK, 2
	blez L, .L55
	move BO, B
	.align 3
.L52:
	LD a5, 1 * SIZE(AO)
	LD b5, 4 * SIZE(BO)
	LD b6, 5 * SIZE(BO)
	LD b7, 6 * SIZE(BO)
	LD b8, 7 * SIZE(BO)
	MADD t11, t11, a1, b1 # 1st compute
	MADD t12, t12, a1, b2
	MADD t13, t13, a1, b3
	MADD t14, t14, a1, b4
	LD a3, 2 * SIZE(AO)
	LD b1, 8 * SIZE(BO)
	LD b2, 9 * SIZE(BO)
	LD b3, 10 * SIZE(BO)
	LD b4, 11 * SIZE(BO)
	MADD t11, t11, a5, b5 # 2nd compute
	MADD t12, t12, a5, b6
	MADD t13, t13, a5, b7
	MADD t14, t14, a5, b8
	LD a7, 3 * SIZE(AO)
	LD b5, 12 * SIZE(BO)
	LD b6, 13 * SIZE(BO)
	LD b7, 14 * SIZE(BO)
	LD b8, 15 * SIZE(BO)
	MADD t11, t11, a3, b1 # 3rd compute
	MADD t12, t12, a3, b2
	MADD t13, t13, a3, b3
	MADD t14, t14, a3, b4
	daddiu AO, AO, 4 * SIZE # AO += mr*4kr
	daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
	LD a1, 0 * SIZE(AO) # next
	LD b1, 0 * SIZE(BO)
	LD b2, 1 * SIZE(BO)
	LD b3, 2 * SIZE(BO)
	LD b4, 3 * SIZE(BO)
	MADD t11, t11, a7, b5 # 4th compute
	MADD t12, t12, a7, b6
	MADD t13, t13, a7, b7
	MADD t14, t14, a7, b8
	daddiu L, L, -1
	bgtz L, .L52
	nop
	.align 3
.L55:
	andi L, KK, 3
	blez L, .L58
	nop
	.align 3
.L56:
	MADD t11, t11, a1, b1 # one remainder k step
	MADD t12, t12, a1, b2
	MADD t13, t13, a1, b3
	MADD t14, t14, a1, b4
	daddiu AO, AO, 1 * SIZE # AO += 1mr
	daddiu BO, BO, 4 * SIZE # BO += 4nr
	LD a1, 0 * SIZE(AO) # next
	LD b1, 0 * SIZE(BO)
	LD b2, 1 * SIZE(BO)
	LD b3, 2 * SIZE(BO)
	LD b4, 3 * SIZE(BO)
	daddiu L, L, -1
	bgtz L, .L56
	nop
.L58: # deal with the triangular part
	LD b1, 0 * SIZE(BO)
	LD b2, 1 * SIZE(BO)
	LD b3, 2 * SIZE(BO)
	LD b4, 3 * SIZE(BO)
	SUB t11, b1, t11
	SUB t12, b2, t12
	SUB t13, b3, t13
	SUB t14, b4, t14
	LD b1, 0 * SIZE(AO) # computes the triangular_part
	MUL t11, b1, t11
	MUL t12, b1, t12
	MUL t13, b1, t13
	MUL t14, b1, t14
	ST t11, 0 * SIZE(BO)
	ST t12, 1 * SIZE(BO)
	ST t13, 2 * SIZE(BO)
	ST t14, 3 * SIZE(BO)
	ST t11, 0 * SIZE(CO1)
	ST t12, 0 * SIZE(CO2)
	ST t13, 0 * SIZE(CO3)
	ST t14, 0 * SIZE(CO4)
	daddiu CO1, CO1, 1 * SIZE
	daddiu CO2, CO2, 1 * SIZE
	daddiu CO3, CO3, 1 * SIZE
	daddiu CO4, CO4, 1 * SIZE
	dsubu TEMP, K, KK
	dsll L, TEMP, BASE_SHIFT # mr=1
	dsll TEMP, TEMP, 2 + BASE_SHIFT
	daddu AO, AO, L # mov AO to the end of Ai
	daddu BO, BO, TEMP # mov BO to the end of Bj
	daddiu KK, KK, 1 # the length of rectangular data part increases by 1
	.align 3
.L29:
	move B, BO # fixed panel Bj
	bgtz J, .L10
	nop
	.align 3

/* ------------------------------------------------------------------ */
/* Column panel of width 2                                            */
/* ------------------------------------------------------------------ */
.L30:
	andi J, N, 2 # nr=2
	blez J, .L70
	nop
	move CO1, C
	daddu CO2, C, LDC
	MTC $0, t11 # clear result registers
	MOV t21, t11
	MOV t31, t11
	MOV t41, t11
	move KK, OFFSET
	move AO, A # reset A
	daddu C, CO2, LDC # fixed
	dsra I, M, 2 # I = mc/4
	blez I, .L40
	nop

/* 4x2 tile */
.L31:
	MOV t12, t11
	MOV t22, t11
	MOV t32, t11
	MOV t42, t11
	LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai
	LD a2, 1 * SIZE(AO) # mr*KK with nr*KK
	LD a3, 2 * SIZE(AO)
	LD a4, 3 * SIZE(AO) # get 4a
	LD b1, 0 * SIZE(B) # get 2b
	LD b2, 1 * SIZE(B)
	dsra L, KK, 2 # L=kk/4
	blez L, .L35
	move BO, B # reset B
	.align 3
.L32:
	LD a5, 4 * SIZE(AO)
	LD a6, 5 * SIZE(AO)
	LD a7, 6 * SIZE(AO)
	LD a8, 7 * SIZE(AO)
	LD b5, 2 * SIZE(BO)
	LD b6, 3 * SIZE(BO)
	MADD t11, t11, a1, b1 # 1st compute
	MADD t21, t21, a2, b1
	MADD t31, t31, a3, b1
	MADD t41, t41, a4, b1
	MADD t12, t12, a1, b2
	MADD t22, t22, a2, b2
	MADD t32, t32, a3, b2
	MADD t42, t42, a4, b2
	LD a1, 8 * SIZE(AO)
	LD a2, 9 * SIZE(AO)
	LD a3, 10 * SIZE(AO)
	LD a4, 11 * SIZE(AO)
	LD b3, 4 * SIZE(BO)
	LD b4, 5 * SIZE(BO)
	MADD t11, t11, a5, b5 # 2nd compute
	MADD t21, t21, a6, b5
	MADD t31, t31, a7, b5
	MADD t41, t41, a8, b5
	MADD t12, t12, a5, b6
	MADD t22, t22, a6, b6
	MADD t32, t32, a7, b6
	MADD t42, t42, a8, b6
	LD a5, 12 * SIZE(AO)
	LD a6, 13 * SIZE(AO)
	LD a7, 14 * SIZE(AO)
	LD a8, 15 * SIZE(AO)
	LD b7, 6 * SIZE(BO)
	LD b8, 7 * SIZE(BO)
	MADD t11, t11, a1, b3 # 3rd compute
	MADD t21, t21, a2, b3
	MADD t31, t31, a3, b3
	MADD t41, t41, a4, b3
	MADD t12, t12, a1, b4
	MADD t22, t22, a2, b4
	MADD t32, t32, a3, b4
	MADD t42, t42, a4, b4
	daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
	daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
	LD a1, 0 * SIZE(AO) # next
	LD a2, 1 * SIZE(AO)
	LD a3, 2 * SIZE(AO)
	LD a4, 3 * SIZE(AO)
	LD b1, 0 * SIZE(BO)
	LD b2, 1 * SIZE(BO)
	MADD t11, t11, a5, b7 # 4th compute
	MADD t21, t21, a6, b7
	MADD t31, t31, a7, b7
	MADD t41, t41, a8, b7
	MADD t12, t12, a5, b8
	MADD t22, t22, a6, b8
	MADD t32, t32, a7, b8
	MADD t42, t42, a8, b8
	daddiu L, L, -1
	bgtz L, .L32
	nop
	.align 3
.L35:
	andi L, KK, 3
	blez L, .L38
	nop
	.align 3
.L36:
	MADD t11, t11, a1, b1 # one remainder k step
	MADD t21, t21, a2, b1
	MADD t31, t31, a3, b1
	MADD t41, t41, a4, b1
	MADD t12, t12, a1, b2
	MADD t22, t22, a2, b2
	MADD t32, t32, a3, b2
	MADD t42, t42, a4, b2
	daddiu AO, AO, 4 * SIZE # AO += 4mr
	daddiu BO, BO, 2 * SIZE # BO += 2nr
	LD a1, 0 * SIZE(AO) # next
	LD a2, 1 * SIZE(AO)
	LD a3, 2 * SIZE(AO)
	LD a4, 3 * SIZE(AO)
	LD b1, 0 * SIZE(BO)
	LD b2, 1 * SIZE(BO)
	daddiu L, L, -1
	bgtz L, .L36
	nop
.L38: # triangular part of the 4x2 tile
	LD b1, 0 * SIZE(BO)
	LD b2, 1 * SIZE(BO)
	LD b3, 2 * SIZE(BO)
	LD b4, 3 * SIZE(BO)
	LD b5, 4 * SIZE(BO)
	LD b6, 5 * SIZE(BO)
	LD b7, 6 * SIZE(BO)
	LD b8, 7 * SIZE(BO)
	SUB t11, b1, t11
	SUB t12, b2, t12
	SUB t21, b3, t21
	SUB t22, b4, t22
	SUB t31, b5, t31
	SUB t32, b6, t32
	SUB t41, b7, t41
	SUB t42, b8, t42
	LD a1, 0 * SIZE(AO) # sa stores in col major
	LD a2, 1 * SIZE(AO)
	LD a3, 2 * SIZE(AO)
	LD a4, 3 * SIZE(AO)
	MUL t11, a1, t11
	MUL t12, a1, t12
	NMSUB t21, t21, a2, t11
	NMSUB t22, t22, a2, t12
	NMSUB t31, t31, a3, t11
	NMSUB t32, t32, a3, t12
	NMSUB t41, t41, a4, t11
	NMSUB t42, t42, a4, t12
	LD a5, 5 * SIZE(AO)
	LD a6, 6 * SIZE(AO)
	LD a7, 7 * SIZE(AO)
	MUL t21, a5, t21
	MUL t22, a5, t22
	NMSUB t31, t31, a6, t21
	NMSUB t32, t32, a6, t22
	NMSUB t41, t41, a7, t21
	NMSUB t42, t42, a7, t22
	LD a8, 10 * SIZE(AO)
	LD a1, 11 * SIZE(AO)
	MUL t31, a8, t31
	MUL t32, a8, t32
	NMSUB t41, t41, a1, t31
	NMSUB t42, t42, a1, t32
	LD a2, 15 * SIZE(AO)
	MUL t41, a2, t41
	MUL t42, a2, t42
	ST t11, 0 * SIZE(BO)
	ST t12, 1 * SIZE(BO)
	ST t21, 2 * SIZE(BO)
	ST t22, 3 * SIZE(BO)
	ST t31, 4 * SIZE(BO)
	ST t32, 5 * SIZE(BO)
	ST t41, 6 * SIZE(BO)
	ST t42, 7 * SIZE(BO)
	ST t11, 0 * SIZE(CO1)
	ST t21, 1 * SIZE(CO1)
	ST t31, 2 * SIZE(CO1)
	ST t41, 3 * SIZE(CO1)
	ST t12, 0 * SIZE(CO2)
	ST t22, 1 * SIZE(CO2)
	ST t32, 2 * SIZE(CO2)
	ST t42, 3 * SIZE(CO2)
	daddiu CO1, CO1, 4 * SIZE
	daddiu CO2, CO2, 4 * SIZE
	dsubu TEMP, K, KK
	dsll L, TEMP, 2 + BASE_SHIFT
	dsll TEMP, TEMP, 1 + BASE_SHIFT
	daddu AO, AO, L # move AO to the end of Ai
	daddu BO, BO, TEMP
	daddiu KK, KK, 4 #
	MTC $0, a1
	MOV t11, a1
	MOV t21, a1
	MOV t31, a1
	MOV t41, a1
	daddiu I, I, -1
	bgtz I, .L31
	nop
	.align 3

/* 2x2 tile */
.L40:
	andi I, M, 2
	blez I, .L60
	nop
	MOV t12, t11 # clear result registers
	MOV t22, t21
	MOV t32, t31
	MOV t42, t41
	LD a1, 0 * SIZE(AO)
	LD a2, 1 * SIZE(AO)
	LD b1, 0 * SIZE(B)
	LD b2, 1 * SIZE(B)
	dsra L, KK, 2
	blez L, .L45
	move BO, B # reset B
	.align 3
.L42:
	LD a5, 2 * SIZE(AO)
	LD a6, 3 * SIZE(AO)
	LD b5, 2 * SIZE(BO)
	LD b6, 3 * SIZE(BO)
	MADD t11, t11, a1, b1 # 1st compute
	MADD t21, t21, a2, b1
	MADD t12, t12, a1, b2
	MADD t22, t22, a2, b2
	LD a3, 4 * SIZE(AO)
	LD a4, 5 * SIZE(AO)
	LD b3, 4 * SIZE(BO)
	LD b4, 5 * SIZE(BO)
	MADD t11, t11, a5, b5 # 2nd compute
	MADD t21, t21, a6, b5
	MADD t12, t12, a5, b6
	MADD t22, t22, a6, b6
	LD a7, 6 * SIZE(AO)
	LD a8, 7 * SIZE(AO)
	LD b7, 6 * SIZE(BO)
	LD b8, 7 * SIZE(BO)
	MADD t11, t11, a3, b3 # 3rd compute
	MADD t21, t21, a4, b3
	MADD t12, t12, a3, b4
	MADD t22, t22, a4, b4
	daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
	daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
	LD a1, 0 * SIZE(AO) # next
	LD a2, 1 * SIZE(AO)
	LD b1, 0 * SIZE(BO)
	LD b2, 1 * SIZE(BO)
	MADD t11, t11, a7, b7 # 4th compute
	MADD t21, t21, a8, b7
	MADD t12, t12, a7, b8
	MADD t22, t22, a8, b8
	daddiu L, L, -1
	bgtz L, .L42
	nop
	.align 3
.L45:
	andi L, KK, 3
	blez L, .L48
	nop
	.align 3
.L46:
	MADD t11, t11, a1, b1 # one remainder k step
	MADD t21, t21, a2, b1
	MADD t12, t12, a1, b2
	MADD t22, t22, a2, b2
	daddiu AO, AO, 2 * SIZE # AO += 2mr
	daddiu BO, BO, 2 * SIZE # BO += 2nr
	LD a1, 0 * SIZE(AO) # next
	LD a2, 1 * SIZE(AO)
	LD b1, 0 * SIZE(BO)
	LD b2, 1 * SIZE(BO)
	daddiu L, L, -1
	bgtz L, .L46
	nop
.L48:
	LD b1, 0 * SIZE(BO)
	LD b2, 1 * SIZE(BO)
	LD b3, 2 * SIZE(BO)
	LD b4, 3 * SIZE(BO)
	SUB t11, b1, t11
	SUB t12, b2, t12
	SUB t21, b3, t21
	SUB t22, b4, t22
	LD b1, 0 * SIZE(AO) # computes the triangular_part
	LD b2, 1 * SIZE(AO)
	MUL t11, b1, t11
	MUL t12, b1, t12
	NMSUB t21, t21, b2, t11
	NMSUB t22, t22, b2, t12
	LD b3, 3 * SIZE(AO)
	MUL t21, b3, t21
	MUL t22, b3, t22
	ST t11, 0 * SIZE(BO)
	ST t12, 1 * SIZE(BO)
	ST t21, 2 * SIZE(BO)
	ST t22, 3 * SIZE(BO)
	ST t11, 0 * SIZE(CO1)
	ST t21, 1 * SIZE(CO1)
	ST t12, 0 * SIZE(CO2)
	ST t22, 1 * SIZE(CO2)
	daddiu CO1, CO1, 2 * SIZE
	daddiu CO2, CO2, 2 * SIZE
	dsubu TEMP, K, KK
	dsll L, TEMP, 1 + BASE_SHIFT
	dsll TEMP, TEMP, 1 + BASE_SHIFT
	daddu AO, AO, L
	daddu BO, BO, TEMP
	daddiu KK, KK, 2
	MTC $0, a1
	MOV t11, a1
	MOV t21, a1
	MOV t31, a1
	MOV t41, a1
	.align 3

/* 1x2 tile */
.L60:
	andi I, M, 1 # mr=1
	blez I, .L49
	nop
	MOV t12, t11 # clear result registers
	MOV t22, t21
	MOV t32, t31
	MOV t42, t41
	LD a1, 0 * SIZE(AO)
	LD b1, 0 * SIZE(B)
	LD b2, 1 * SIZE(B)
	dsra L, KK, 2
	blez L, .L65
	move BO, B # reset B
	.align 3
.L62:
	LD a5, 1 * SIZE(AO)
	LD b5, 2 * SIZE(BO)
	LD b6, 3 * SIZE(BO)
	MADD t11, t11, a1, b1 # 1st compute
	MADD t12, t12, a1, b2
	LD a3, 2 * SIZE(AO)
	LD b3, 4 * SIZE(BO)
	LD b4, 5 * SIZE(BO)
	MADD t11, t11, a5, b5 # 2nd compute
	MADD t12, t12, a5, b6
	LD a7, 3 * SIZE(AO)
	LD b7, 6 * SIZE(BO)
	LD b8, 7 * SIZE(BO)
	MADD t11, t11, a3, b3 # 3rd compute
	MADD t12, t12, a3, b4
	daddiu AO, AO, 4 * SIZE # AO += mr*4kr
	daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
	LD a1, 0 * SIZE(AO) # next
	LD b1, 0 * SIZE(BO)
	LD b2, 1 * SIZE(BO)
	MADD t11, t11, a7, b7 # 4th compute
	MADD t12, t12, a7, b8
	daddiu L, L, -1
	bgtz L, .L62
	nop
	.align 3
.L65:
	andi L, KK, 3
	blez L, .L68
	nop
	.align 3
.L66:
	MADD t11, t11, a1, b1 # one remainder k step
	MADD t12, t12, a1, b2
	daddiu AO, AO, 1 * SIZE # AO += 1mr
	daddiu BO, BO, 2 * SIZE # BO += 2nr
	LD a1, 0 * SIZE(AO) # next
	LD b1, 0 * SIZE(BO)
	LD b2, 1 * SIZE(BO)
	daddiu L, L, -1
	bgtz L, .L66
	nop
.L68:
	LD b1, 0 * SIZE(BO)
	LD b2, 1 * SIZE(BO)
	SUB t11, b1, t11
	SUB t12, b2, t12
	LD b1, 0 * SIZE(AO) # computes the triangular_part
	MUL t11, b1, t11
	MUL t12, b1, t12
	ST t11, 0 * SIZE(BO)
	ST t12, 1 * SIZE(BO)
	ST t11, 0 * SIZE(CO1)
	ST t12, 0 * SIZE(CO2)
	daddiu CO1, CO1, 1 * SIZE
	daddiu CO2, CO2, 1 * SIZE
	dsubu TEMP, K, KK
	dsll L, TEMP, BASE_SHIFT # mr=1
	dsll TEMP, TEMP, 1 + BASE_SHIFT
	daddu AO, AO, L
	daddu BO, BO, TEMP
	daddiu KK, KK, 1
	.align 3
.L49:
	move B, BO
	.align 3

/* ------------------------------------------------------------------ */
/* Column panel of width 1                                            */
/* ------------------------------------------------------------------ */
.L70:
	andi J, N, 1 # nr=1
	blez J, .L999 # END
	nop
	move CO1, C
	move KK, OFFSET
	move AO, A
	dsra I, M, 2
	blez I, .L80
	nop

/* 4x1 tile */
.L71:
	MTC $0, t11 # clear result registers
	MOV t21, t11
	MOV t31, t11
	MOV t41, t11
	LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai
	LD a2, 1 * SIZE(AO) # mr*KK with nr*KK
	LD a3, 2 * SIZE(AO)
	LD a4, 3 * SIZE(AO) # get 4a
	LD b1, 0 * SIZE(B) # get 1b
	dsra L, KK, 2
	blez L, .L75
	move BO, B # reset B
	.align 3
.L72:
	LD a5, 4 * SIZE(AO)
	LD a6, 5 * SIZE(AO)
	LD a7, 6 * SIZE(AO)
	LD a8, 7 * SIZE(AO)
	LD b5, 1 * SIZE(BO)
	MADD t11, t11, a1, b1 # 1st compute
	MADD t21, t21, a2, b1
	MADD t31, t31, a3, b1
	MADD t41, t41, a4, b1
	LD a1, 8 * SIZE(AO)
	LD a2, 9 * SIZE(AO)
	LD a3, 10 * SIZE(AO)
	LD a4, 11 * SIZE(AO)
	LD b3, 2 * SIZE(BO)
	MADD t11, t11, a5, b5 # 2nd compute
	MADD t21, t21, a6, b5
	MADD t31, t31, a7, b5
	MADD t41, t41, a8, b5
	LD a5, 12 * SIZE(AO)
	LD a6, 13 * SIZE(AO)
	LD a7, 14 * SIZE(AO)
	LD a8, 15 * SIZE(AO)
	LD b7, 3 * SIZE(BO)
	MADD t11, t11, a1, b3 # 3rd compute
	MADD t21, t21, a2, b3
	MADD t31, t31, a3, b3
	MADD t41, t41, a4, b3
	daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
	daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
	LD a1, 0 * SIZE(AO) # next
	LD a2, 1 * SIZE(AO)
	LD a3, 2 * SIZE(AO)
	LD a4, 3 * SIZE(AO)
	LD b1, 0 * SIZE(BO)
	MADD t11, t11, a5, b7 # 4th compute
	MADD t21, t21, a6, b7
	MADD t31, t31, a7, b7
	MADD t41, t41, a8, b7
	daddiu L, L, -1
	bgtz L, .L72
	nop
	.align 3
.L75:
	andi L, KK, 3
	blez L, .L78
	nop
	.align 3
.L76:
	MADD t11, t11, a1, b1 # one remainder k step
	MADD t21, t21, a2, b1
	MADD t31, t31, a3, b1
	MADD t41, t41, a4, b1
	daddiu AO, AO, 4 * SIZE # AO += 4mr
	daddiu BO, BO, 1 * SIZE # BO += 1nr
	LD a1, 0 * SIZE(AO) # next
	LD a2, 1 * SIZE(AO)
	LD a3, 2 * SIZE(AO)
	LD a4, 3 * SIZE(AO)
	LD b1, 0 * SIZE(BO)
	daddiu L, L, -1
	bgtz L, .L76
	nop
.L78:
	LD b1, 0 * SIZE(BO)
	LD b2, 1 * SIZE(BO)
	LD b3, 2 * SIZE(BO)
	LD b4, 3 * SIZE(BO)
	SUB t11, b1, t11
	SUB t21, b2, t21
	SUB t31, b3, t31
	SUB t41, b4, t41
	LD a1, 0 * SIZE(AO) # sa stores in col major
	LD a2, 1 * SIZE(AO)
	LD a3, 2 * SIZE(AO)
	LD a4, 3 * SIZE(AO)
	MUL t11, a1, t11
	NMSUB t21, t21, a2, t11
	NMSUB t31, t31, a3, t11
	NMSUB t41, t41, a4, t11
	LD a5, 5 * SIZE(AO)
	LD a6, 6 * SIZE(AO)
	LD a7, 7 * SIZE(AO)
	MUL t21, a5, t21
	NMSUB t31, t31, a6, t21
	NMSUB t41, t41, a7, t21
	LD a8, 10 * SIZE(AO)
	LD a1, 11 * SIZE(AO)
	MUL t31, a8, t31
	NMSUB t41, t41, a1, t31
	LD a2, 15 * SIZE(AO)
	MUL t41, a2, t41
	ST t11, 0 * SIZE(BO)
	ST t21, 1 * SIZE(BO)
	ST t31, 2 * SIZE(BO)
	ST t41, 3 * SIZE(BO)
	ST t11, 0 * SIZE(CO1)
	ST t21, 1 * SIZE(CO1)
	ST t31, 2 * SIZE(CO1)
	ST t41, 3 * SIZE(CO1)
	daddiu CO1, CO1, 4 * SIZE
	dsubu TEMP, K, KK
	dsll L, TEMP, 2 + BASE_SHIFT
	dsll TEMP, TEMP, 0 + BASE_SHIFT
	daddu AO, AO, L
	daddu BO, BO, TEMP
	daddiu KK, KK, 4
	daddiu I, I, -1
	bgtz I, .L71
	nop
	.align 3

/* 2x1 tile */
.L80:
	andi I, M, 2
	blez I, .L90
	NOP
	MTC $0, t11
	MOV t21, t11 # clear result registers
	LD a1, 0 * SIZE(AO)
	LD a2, 1 * SIZE(AO)
	LD b1, 0 * SIZE(B)
	dsra L, KK, 2
	blez L, .L85
	move BO, B
	.align 3
.L82:
	LD a5, 2 * SIZE(AO)
	LD a6, 3 * SIZE(AO)
	LD b5, 1 * SIZE(BO)
	MADD t11, t11, a1, b1 # 1st compute
	MADD t21, t21, a2, b1
	LD a3, 4 * SIZE(AO)
	LD a4, 5 * SIZE(AO)
	LD b3, 2 * SIZE(BO)
	MADD t11, t11, a5, b5 # 2nd compute
	MADD t21, t21, a6, b5
	LD a7, 6 * SIZE(AO)
	LD a8, 7 * SIZE(AO)
	LD b7, 3 * SIZE(BO)
	MADD t11, t11, a3, b3 # 3rd compute
	MADD t21, t21, a4, b3
	daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
	daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
	LD a1, 0 * SIZE(AO) # next
	LD a2, 1 * SIZE(AO)
	LD b1, 0 * SIZE(BO)
	MADD t11, t11, a7, b7 # 4th compute
	MADD t21, t21, a8, b7
	daddiu L, L, -1
	bgtz L, .L82
	nop
	.align 3
.L85:
	andi L, KK, 3
	blez L, .L88
	nop
	.align 3
.L86:
	MADD t11, t11, a1, b1 # one remainder k step
	MADD t21, t21, a2, b1
	daddiu AO, AO, 2 * SIZE # AO += 2mr
	daddiu BO, BO, 1 * SIZE # BO += 1nr
	LD a1, 0 * SIZE(AO) # next
	LD a2, 1 * SIZE(AO)
	LD b1, 0 * SIZE(BO)
	daddiu L, L, -1
	bgtz L, .L86
	nop
.L88:
	LD b1, 0 * SIZE(BO)
	LD b2, 1 * SIZE(BO)
	SUB t11, b1, t11
	SUB t21, b2, t21
	LD b1, 0 * SIZE(AO) # computes the triangular_part
	LD b2, 1 * SIZE(AO)
	MUL t11, b1, t11
	NMSUB t21, t21, b2, t11
	LD b3, 3 * SIZE(AO)
	MUL t21, b3, t21
	ST t11, 0 * SIZE(BO)
	ST t21, 1 * SIZE(BO)
	ST t11, 0 * SIZE(CO1)
	ST t21, 1 * SIZE(CO1)
	daddiu CO1, CO1, 2 * SIZE
	dsubu TEMP, K, KK
	dsll L, TEMP, 1 + BASE_SHIFT
	dsll TEMP, TEMP, 0 + BASE_SHIFT
	daddu AO, AO, L
	daddu BO, BO, TEMP
	daddiu KK, KK, 2
	.align 3

/* 1x1 tile */
.L90:
	andi I, M, 1 # mr=1
	blez I, .L89
	NOP
	MTC $0, t11
	LD a1, 0 * SIZE(AO)
	LD b1, 0 * SIZE(B)
	dsra L, KK, 2
	blez L, .L95
	move BO, B
	.align 3
.L92:
	LD a5, 1 * SIZE(AO)
	LD b5, 1 * SIZE(BO)
	MADD t11, t11, a1, b1 # 1st compute
	LD a3, 2 * SIZE(AO)
	LD b3, 2 * SIZE(BO)
	MADD t11, t11, a5, b5 # 2nd compute
	LD a7, 3 * SIZE(AO)
	LD b7, 3 * SIZE(BO)
	MADD t11, t11, a3, b3 # 3rd compute
	daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
	daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
	LD a1, 0 * SIZE(AO) # next
	LD b1, 0 * SIZE(BO)
	MADD t11, t11, a7, b7 # 4th compute
	daddiu L, L, -1
	bgtz L, .L92
	nop
	.align 3
.L95:
	andi L, KK, 3
	blez L, .L98
	nop
	.align 3
.L96:
	MADD t11, t11, a1, b1 # one remainder k step
	daddiu AO, AO, 1 * SIZE # AO += 1mr
	daddiu BO, BO, 1 * SIZE # BO += 1nr
	LD a1, 0 * SIZE(AO) # next
	LD b1, 0 * SIZE(BO)
	daddiu L, L, -1
	bgtz L, .L96
	nop
.L98:
	LD b1, 0 * SIZE(BO)
	SUB t11, b1, t11
	LD b1, 0 * SIZE(AO) # computes the triangular_part
	MUL t11, b1, t11
	ST t11, 0 * SIZE(BO)
	ST t11, 0 * SIZE(CO1)
	daddiu CO1, CO1, 1 * SIZE
	dsubu TEMP, K, KK
	dsll L, TEMP, BASE_SHIFT
	dsll TEMP, TEMP, BASE_SHIFT
	daddu AO, AO, L
	daddu BO, BO, TEMP
	daddiu KK, KK, 1
	.align 3
.L89:
	move B, BO
	.align 3

/* ------------------------------------------------------------------ */
/* Epilogue: restore every register saved in the prologue, same slots */
/* ------------------------------------------------------------------ */
.L999:
	LDARG $16, 0($sp)
	LDARG $17, 8($sp)
	LDARG $18, 16($sp)
	LDARG $19, 24($sp)
	LDARG $20, 32($sp)
	LDARG $21, 40($sp)
	ldc1 $f24, 48($sp)
	ldc1 $f25, 56($sp)
	ldc1 $f26, 64($sp)
	ldc1 $f27, 72($sp)
	ldc1 $f28, 80($sp)
	LDARG $22, 88($sp)
	LDARG $23, 96($sp)
	LDARG $24, 104($sp)
	LDARG $25, 112($sp)
	ldc1 $f29, 120($sp)
	ldc1 $f30, 128($sp)
	ldc1 $f31, 136($sp)
#ifndef __64BIT__
	ldc1 $f20, 144($sp)
	ldc1 $f21, 152($sp)
	ldc1 $f22, 160($sp)
	ldc1 $f23, 168($sp)
#endif
	j $31
	daddiu $sp, $sp, TRSM_FRAME_SIZE	# frame released in the delay slot of the return
EPILOGUE