#define REALNAME ASMNAME
#define ASSEMBLER
#include "common.h"
#define M $4
#define N $5
#define K $6
#define A $8
#define B $9
#define C $10
#define LDC $11
#define AO $12
#define BO $13
#define I $2
#define J $3
#define L $7
#define CO1 $14
#define CO2 $15
#define CO3 $16
#define CO4 $17
#define OFFSET $22
#define KK $23
#define TEMP $24
#define AORIG $25
#define a1 $f0
#define a2 $f1
#define a3 $f2
#define a4 $f3
#define a5 $f4
#define a6 $f5
#define a7 $f6
#define a8 $f7
#define b1 $f8
#define b2 $f9
#define b3 $f10
#define b4 $f11
#define b5 $f12
#define b6 $f13
#define b7 $f14
#define b8 $f15
#define t11 $f16
#define t21 $f17
#define t31 $f18
#define t41 $f19
#define t12 $f20
#define t22 $f21
#define t32 $f22
#define t42 $f23
#define t13 $f24
#define t23 $f25
#define t33 $f26
#define t43 $f27
#define t14 $f28
#define t24 $f29
#define t34 $f30
#define t44 $f31
#define ALPHA $f15
# Function entry: allocate a 144-byte frame, spill registers, then set up
# the pointers for the LN solve, which walks the rows from bottom to top.
# NOTE(review): t24/t34/t44 alias $f29-$f31, which are written later in this
# file but are not spilled here -- confirm against the target ABI.
PROLOGUE
daddiu $sp, $sp, -144 # reserve the 144-byte frame
SDARG $16, 0($sp) # $16 is used as CO3
SDARG $17, 8($sp) # $17 is used as CO4
SDARG $18, 16($sp)
SDARG $19, 24($sp)
SDARG $20, 32($sp)
SDARG $21, 40($sp)
sdc1 $f24, 48($sp) # $f24-$f28 are used as t13/t23/t33/t43/t14
sdc1 $f25, 56($sp)
sdc1 $f26, 64($sp)
sdc1 $f27, 72($sp)
sdc1 $f28, 80($sp)
SDARG $22, 88($sp) # $22 is used as OFFSET
SDARG $23, 96($sp) # $23 is used as KK
SDARG $24, 104($sp) # $24 is used as TEMP
SDARG $25, 112($sp) # $25 is used as AORIG
#ifndef __64BIT__
sdc1 $f20,112($sp) # NOTE(review): same offset as the $25 slot above, so this store overwrites it
sdc1 $f21,120($sp)
sdc1 $f22,128($sp)
sdc1 $f23,136($sp)
#endif
# LN compute from bottom to top
LDARG OFFSET, 144($sp) # read OFFSET from the first slot above this frame
dsll LDC, LDC, BASE_SHIFT # ldc, scaled by 2^BASE_SHIFT
mult M, K
mflo TEMP # TEMP=MC*KC
dsll TEMP, TEMP, BASE_SHIFT
daddu A, A, TEMP # A move to the end of sa
dsll TEMP, M, BASE_SHIFT
daddu C, C, TEMP # C+=MC
dsra J, N, 2 # j = nc/4
blez J, .L30 # no full group of 4 columns: skip the nr=4 panels
nop # branch delay slot
# Column-panel loop: each pass of .L10 handles four columns of C (nr=4).
.L10: # nr=4
daddiu J, J, -1 # one fewer 4-column panel remaining
move CO1, C # CO1..CO4 = four column pointers, LDC apart
daddu CO2, C, LDC
daddu CO3, CO2, LDC
daddu CO4, CO3, LDC
MTC $0, t11 # clear result registers
MOV t21, t11
MOV t31, t11
MOV t41, t11
MOV t12, t11
MOV t22, t11
MOV t32, t11
MOV t42, t11
daddu KK, M, OFFSET # kc - kk is the length of the rectangular data part of panel Ai
move AORIG, A # reset A
daddu C, CO4, LDC # C = CO4 + LDC, i.e. four columns further on
andi I, M, 1 # test bit 0 of M: is there a 1-row tile (mr=1,nr=4)?
blez I, .L50 # bit clear: go on to the 2-row tile
nop # branch delay slot
# 1x4 tile (reached when bit 0 of M is set): accumulate the rectangular
# product into t11..t14, then solve against one diagonal entry of the panel.
dsll TEMP, K, BASE_SHIFT # mr=1
dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of Ai
dsll L, KK, BASE_SHIFT # mr=1
dsll TEMP, KK, 2 + BASE_SHIFT # nr=4
daddu AO, AORIG, L # AO point to the rectangular data part
daddu BO, B, TEMP
dsubu TEMP, K, KK # TEMP = K - KK, the number of k steps in the rectangular part
MOV t13, t11 # clear the remaining accumulators (this tile only accumulates t11..t14)
MOV t23, t11
MOV t33, t11
MOV t43, t11
MOV t14, t11
MOV t24, t11
MOV t34, t11
MOV t44, t11
LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai
LD b1, 0 * SIZE(BO) # get 4b
LD b2, 1 * SIZE(BO)
LD b3, 2 * SIZE(BO)
LD b4, 3 * SIZE(BO)
dsra L, TEMP, 2 # L = (K - KK) / 4 unrolled passes
blez L, .L55
nop
.align 3
# Main loop, unrolled four k steps per pass; loads for the next step are
# issued between the multiply-add groups.
.L52:
LD a5, 1 * SIZE(AO)
LD b5, 4 * SIZE(BO)
LD b6, 5 * SIZE(BO)
LD b7, 6 * SIZE(BO)
LD b8, 7 * SIZE(BO)
MADD t11, t11, a1, b1 # 1st compute
MADD t12, t12, a1, b2
MADD t13, t13, a1, b3
MADD t14, t14, a1, b4
LD a3, 2 * SIZE(AO)
LD b1, 8 * SIZE(BO)
LD b2, 9 * SIZE(BO)
LD b3, 10 * SIZE(BO)
LD b4, 11 * SIZE(BO)
MADD t11, t11, a5, b5 # 2nd compute
MADD t12, t12, a5, b6
MADD t13, t13, a5, b7
MADD t14, t14, a5, b8
LD a7, 3 * SIZE(AO)
LD b5, 12 * SIZE(BO)
LD b6, 13 * SIZE(BO)
LD b7, 14 * SIZE(BO)
LD b8, 15 * SIZE(BO)
MADD t11, t11, a3, b1 # 3rd compute
MADD t12, t12, a3, b2
MADD t13, t13, a3, b3
MADD t14, t14, a3, b4
daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
LD a1, 0 * SIZE(AO) # next
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
LD b3, 2 * SIZE(BO)
LD b4, 3 * SIZE(BO)
MADD t11, t11, a7, b5 # 4th compute
MADD t12, t12, a7, b6
MADD t13, t13, a7, b7
MADD t14, t14, a7, b8
daddiu L, L, -1
bgtz L, .L52
nop
.align 3
.L55:
andi L, TEMP, 3 # L = (K - KK) mod 4 leftover k steps
blez L, .L58
nop
.align 3
.L56:
MADD t11, t11, a1, b1 # remainder loop: one k step per pass
MADD t12, t12, a1, b2
MADD t13, t13, a1, b3
MADD t14, t14, a1, b4
daddiu AO, AO, 1 * SIZE # AO += 1mr
daddiu BO, BO, 4 * SIZE # BO += 4nr
LD a1, 0 * SIZE(AO) # next
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
LD b3, 2 * SIZE(BO)
LD b4, 3 * SIZE(BO)
daddiu L, L, -1
bgtz L, .L56
nop
.L58: # deal with the triangular part
daddiu TEMP, KK, -1 # index KK - 1 selects the entry being solved
dsll L, TEMP, BASE_SHIFT # mr=1
dsll TEMP, TEMP, 2 + BASE_SHIFT # nr=4
daddu AO, AORIG, L # Ao point to the triangular data part
daddu BO, B, TEMP
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
LD b3, 2 * SIZE(BO)
LD b4, 3 * SIZE(BO)
SUB t11, b1, t11 # right-hand side minus the accumulated rectangular product
SUB t12, b2, t12
SUB t13, b3, t13
SUB t14, b4, t14
LD b3, 0 * SIZE(AO) # diagonal entry; applied with MUL, so presumably stored pre-inverted -- TODO confirm
MUL t11, b3, t11
MUL t12, b3, t12
MUL t13, b3, t13
MUL t14, b3, t14
daddiu CO1, CO1, -1 * SIZE # step each column pointer back by one element
daddiu CO2, CO2, -1 * SIZE
daddiu CO3, CO3, -1 * SIZE
daddiu CO4, CO4, -1 * SIZE
ST t11, 0 * SIZE(BO) # store the solution back into the B buffer
ST t12, 1 * SIZE(BO)
ST t13, 2 * SIZE(BO)
ST t14, 3 * SIZE(BO)
ST t11, 0 * SIZE(CO1) # and into the four columns of C
ST t12, 0 * SIZE(CO2)
ST t13, 0 * SIZE(CO3)
ST t14, 0 * SIZE(CO4)
daddiu KK, KK, -1 # the length of rectangular data part increases by 1
MTC $0, t11 # clear result registers
MOV t21, t11
MOV t31, t11
MOV t41, t11
MOV t12, t11
MOV t22, t11
MOV t32, t11
MOV t42, t11
# 2x4 tile (reached when bit 1 of M is set): accumulate the rectangular
# product into t11..t14 and t21..t24, then solve a 2x2 triangular block.
.L50:
andi I, M, 2 # mr=2,nr=4
blez I, .L20 # bit clear: go on to the 4-row tiles
nop
dsll TEMP, K, 1 + BASE_SHIFT # mr=2
dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of Ai
dsll L, KK, 1 + BASE_SHIFT # mr=2
dsll TEMP, KK, 2 + BASE_SHIFT # nr=4
daddu AO, AORIG, L # AO point to the rectangular data part
daddu BO, B, TEMP
dsubu TEMP, K, KK # TEMP = K - KK, the number of k steps in the rectangular part
MOV t13, t11 # clear the remaining accumulators
MOV t23, t11
MOV t33, t11
MOV t43, t11
MOV t14, t11
MOV t24, t11
MOV t34, t11
MOV t44, t11
LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai
LD a2, 1 * SIZE(AO) # mr*KK with nr*KK
LD b1, 0 * SIZE(BO) # get 4b
LD b2, 1 * SIZE(BO)
LD b3, 2 * SIZE(BO)
LD b4, 3 * SIZE(BO)
dsra L, TEMP, 2 # L = (K - KK) / 4 unrolled passes
blez L, .L25
nop
.align 3
# Main loop, unrolled four k steps per pass.
.L22:
LD a5, 2 * SIZE(AO)
LD a6, 3 * SIZE(AO)
LD b5, 4 * SIZE(BO)
LD b6, 5 * SIZE(BO)
LD b7, 6 * SIZE(BO)
LD b8, 7 * SIZE(BO)
MADD t11, t11, a1, b1 # 1st compute
MADD t21, t21, a2, b1
MADD t12, t12, a1, b2
MADD t22, t22, a2, b2
MADD t13, t13, a1, b3
MADD t23, t23, a2, b3
MADD t14, t14, a1, b4
MADD t24, t24, a2, b4
LD a3, 4 * SIZE(AO)
LD a4, 5 * SIZE(AO)
LD b1, 8 * SIZE(BO)
LD b2, 9 * SIZE(BO)
LD b3, 10 * SIZE(BO)
LD b4, 11 * SIZE(BO)
MADD t11, t11, a5, b5 # 2nd compute
MADD t21, t21, a6, b5
MADD t12, t12, a5, b6
MADD t22, t22, a6, b6
MADD t13, t13, a5, b7
MADD t23, t23, a6, b7
MADD t14, t14, a5, b8
MADD t24, t24, a6, b8
LD a7, 6 * SIZE(AO)
LD a8, 7 * SIZE(AO)
LD b5, 12 * SIZE(BO)
LD b6, 13 * SIZE(BO)
LD b7, 14 * SIZE(BO)
LD b8, 15 * SIZE(BO)
MADD t11, t11, a3, b1 # 3rd compute
MADD t21, t21, a4, b1
MADD t12, t12, a3, b2
MADD t22, t22, a4, b2
MADD t13, t13, a3, b3
MADD t23, t23, a4, b3
MADD t14, t14, a3, b4
MADD t24, t24, a4, b4
daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
LD b3, 2 * SIZE(BO)
LD b4, 3 * SIZE(BO)
MADD t11, t11, a7, b5 # 4th compute
MADD t21, t21, a8, b5
MADD t12, t12, a7, b6
MADD t22, t22, a8, b6
MADD t13, t13, a7, b7
MADD t23, t23, a8, b7
MADD t14, t14, a7, b8
MADD t24, t24, a8, b8
daddiu L, L, -1
bgtz L, .L22
nop
.align 3
.L25:
andi L, TEMP, 3 # L = (K - KK) mod 4 leftover k steps
blez L, .L28
nop
.align 3
.L26:
MADD t11, t11, a1, b1 # remainder loop: one k step per pass
MADD t21, t21, a2, b1
MADD t12, t12, a1, b2
MADD t22, t22, a2, b2
MADD t13, t13, a1, b3
MADD t23, t23, a2, b3
MADD t14, t14, a1, b4
MADD t24, t24, a2, b4
daddiu AO, AO, 2 * SIZE # AO += 2mr
daddiu BO, BO, 4 * SIZE # BO += 4nr
LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
LD b3, 2 * SIZE(BO)
LD b4, 3 * SIZE(BO)
daddiu L, L, -1
bgtz L, .L26
nop
.L28: # deal with the triangular part
daddiu TEMP, KK, -2 # index KK - 2 selects the 2-row block being solved
dsll L, TEMP, 1 + BASE_SHIFT # mr=2
dsll TEMP, TEMP, 2 + BASE_SHIFT # nr=4
daddu AO, AORIG, L # Ao point to the triangular data part
daddu BO, B, TEMP
LD b1, 0 * SIZE(BO) # BO[0..3] pair with row 1, BO[4..7] with row 2
LD b2, 1 * SIZE(BO)
LD b3, 2 * SIZE(BO)
LD b4, 3 * SIZE(BO)
LD b5, 4 * SIZE(BO)
LD b6, 5 * SIZE(BO)
LD b7, 6 * SIZE(BO)
LD b8, 7 * SIZE(BO)
SUB t11, b1, t11 # right-hand side minus the accumulated rectangular product
SUB t12, b2, t12
SUB t13, b3, t13
SUB t14, b4, t14
SUB t21, b5, t21
SUB t22, b6, t22
SUB t23, b7, t23
SUB t24, b8, t24
LD b1, 3 * SIZE(AO) # computes the triangular_part
LD b2, 2 * SIZE(AO)
MUL t21, b1, t21 # row 2 first (bottom to top), scaled by AO[3]
MUL t22, b1, t22
MUL t23, b1, t23
MUL t24, b1, t24
NMSUB t11, t11, b2, t21 # eliminate row 2 from row 1 using AO[2]
NMSUB t12, t12, b2, t22
NMSUB t13, t13, b2, t23
NMSUB t14, t14, b2, t24
LD b3, 0 * SIZE(AO)
MUL t11, b3, t11 # then scale row 1 by AO[0]
MUL t12, b3, t12
MUL t13, b3, t13
MUL t14, b3, t14
daddiu CO1, CO1, -2 * SIZE # step each column pointer back by two elements
daddiu CO2, CO2, -2 * SIZE
daddiu CO3, CO3, -2 * SIZE
daddiu CO4, CO4, -2 * SIZE
ST t11, 0 * SIZE(BO) # store the solution back into the B buffer
ST t12, 1 * SIZE(BO)
ST t13, 2 * SIZE(BO)
ST t14, 3 * SIZE(BO)
ST t21, 4 * SIZE(BO)
ST t22, 5 * SIZE(BO)
ST t23, 6 * SIZE(BO)
ST t24, 7 * SIZE(BO)
ST t11, 0 * SIZE(CO1) # and into the four columns of C
ST t21, 1 * SIZE(CO1)
ST t12, 0 * SIZE(CO2)
ST t22, 1 * SIZE(CO2)
ST t13, 0 * SIZE(CO3)
ST t23, 1 * SIZE(CO3)
ST t14, 0 * SIZE(CO4)
ST t24, 1 * SIZE(CO4)
daddiu KK, KK, -2 # the length of rectangular data part increases by 2
MTC $0, t11 # clear result registers
MOV t21, t11
MOV t31, t11
MOV t41, t11
MOV t12, t11
MOV t22, t11
MOV t32, t11
MOV t42, t11
# 4x4 tiles: .L11 runs M/4 times per column panel. Each pass accumulates the
# rectangular product into t11..t44, solves a 4x4 triangular block, and
# writes the result to both the B buffer and C. .L29 then advances B and
# loops back to .L10 while J is positive.
.L20:
dsra I, M, 2 # I=MC/4
blez I, .L29
nop
.L11: # mr=4
dsll TEMP, K, 2 + BASE_SHIFT # TEMP=KC*MR*data_Byte
dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of panel Ai
dsll L, KK, 2 + BASE_SHIFT # KC-KK is the length of the rectangular data part of Ai
dsll TEMP, KK, 2 + BASE_SHIFT # KK*NR*data_Byte
daddu AO, AORIG, L # AO point to the rectangular data part
daddu BO, B, TEMP
dsubu TEMP, K, KK # TEMP = K - KK, the number of k steps in the rectangular part
LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai
LD a2, 1 * SIZE(AO) # mr*KK with nr*KK
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO) # get 4a
LD b1, 0 * SIZE(BO) # get 4b
LD b2, 1 * SIZE(BO)
LD b3, 2 * SIZE(BO)
LD b4, 3 * SIZE(BO)
MOV t13, t11 # clear result registers
MOV t23, t11
MOV t33, t11
MOV t43, t11
MOV t14, t11
MOV t24, t11
MOV t34, t11
MOV t44, t11
dsra L, TEMP, 2 # L=(KC-offset)/4
blez L, .L15
nop
.align 3
# Main loop, unrolled four k steps per pass; the a1-a4/b1-b4 and a5-a8/b5-b8
# register sets alternate so loads overlap the multiply-adds.
.L12:
LD a5, 4 * SIZE(AO)
LD a6, 5 * SIZE(AO)
LD a7, 6 * SIZE(AO)
LD a8, 7 * SIZE(AO)
LD b5, 4 * SIZE(BO)
LD b6, 5 * SIZE(BO)
LD b7, 6 * SIZE(BO)
LD b8, 7 * SIZE(BO)
MADD t11, t11, a1, b1 # 1st compute
MADD t21, t21, a2, b1
MADD t31, t31, a3, b1
MADD t41, t41, a4, b1
MADD t12, t12, a1, b2
MADD t22, t22, a2, b2
MADD t32, t32, a3, b2
MADD t42, t42, a4, b2
MADD t13, t13, a1, b3
MADD t23, t23, a2, b3
MADD t33, t33, a3, b3
MADD t43, t43, a4, b3
MADD t14, t14, a1, b4
MADD t24, t24, a2, b4
MADD t34, t34, a3, b4
MADD t44, t44, a4, b4
LD a1, 8 * SIZE(AO)
LD a2, 9 * SIZE(AO)
LD a3, 10 * SIZE(AO)
LD a4, 11 * SIZE(AO)
LD b1, 8 * SIZE(BO)
LD b2, 9 * SIZE(BO)
LD b3, 10 * SIZE(BO)
LD b4, 11 * SIZE(BO)
MADD t11, t11, a5, b5 # 2nd compute
MADD t21, t21, a6, b5
MADD t31, t31, a7, b5
MADD t41, t41, a8, b5
MADD t12, t12, a5, b6
MADD t22, t22, a6, b6
MADD t32, t32, a7, b6
MADD t42, t42, a8, b6
MADD t13, t13, a5, b7
MADD t23, t23, a6, b7
MADD t33, t33, a7, b7
MADD t43, t43, a8, b7
MADD t14, t14, a5, b8
MADD t24, t24, a6, b8
MADD t34, t34, a7, b8
MADD t44, t44, a8, b8
LD a5, 12 * SIZE(AO)
LD a6, 13 * SIZE(AO)
LD a7, 14 * SIZE(AO)
LD a8, 15 * SIZE(AO)
LD b5, 12 * SIZE(BO)
LD b6, 13 * SIZE(BO)
LD b7, 14 * SIZE(BO)
LD b8, 15 * SIZE(BO)
MADD t11, t11, a1, b1 # 3rd compute
MADD t21, t21, a2, b1
MADD t31, t31, a3, b1
MADD t41, t41, a4, b1
MADD t12, t12, a1, b2
MADD t22, t22, a2, b2
MADD t32, t32, a3, b2
MADD t42, t42, a4, b2
MADD t13, t13, a1, b3
MADD t23, t23, a2, b3
MADD t33, t33, a3, b3
MADD t43, t43, a4, b3
MADD t14, t14, a1, b4
MADD t24, t24, a2, b4
MADD t34, t34, a3, b4
MADD t44, t44, a4, b4
daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
LD b3, 2 * SIZE(BO)
LD b4, 3 * SIZE(BO)
MADD t11, t11, a5, b5 # 4th compute
MADD t21, t21, a6, b5
MADD t31, t31, a7, b5
MADD t41, t41, a8, b5
MADD t12, t12, a5, b6
MADD t22, t22, a6, b6
MADD t32, t32, a7, b6
MADD t42, t42, a8, b6
MADD t13, t13, a5, b7
MADD t23, t23, a6, b7
MADD t33, t33, a7, b7
MADD t43, t43, a8, b7
MADD t14, t14, a5, b8
MADD t24, t24, a6, b8
MADD t34, t34, a7, b8
MADD t44, t44, a8, b8
daddiu L, L, -1
bgtz L, .L12
nop
.align 3
.L15:
andi L, TEMP, 3 # L = (K - KK) mod 4 leftover k steps
blez L, .L18
nop
.align 3
.L16:
MADD t11, t11, a1, b1 # remainder loop: one k step per pass
MADD t21, t21, a2, b1
MADD t31, t31, a3, b1
MADD t41, t41, a4, b1
MADD t12, t12, a1, b2
MADD t22, t22, a2, b2
MADD t32, t32, a3, b2
MADD t42, t42, a4, b2
MADD t13, t13, a1, b3
MADD t23, t23, a2, b3
MADD t33, t33, a3, b3
MADD t43, t43, a4, b3
MADD t14, t14, a1, b4
MADD t24, t24, a2, b4
MADD t34, t34, a3, b4
MADD t44, t44, a4, b4
daddiu AO, AO, 4 * SIZE # AO += 4mr
daddiu BO, BO, 4 * SIZE # BO += 4nr
LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
LD b3, 2 * SIZE(BO)
LD b4, 3 * SIZE(BO)
daddiu L, L, -1
bgtz L, .L16
nop
.L18: # deal with the triangular data part of panel Ai
daddiu TEMP, KK, -4 # index KK - 4 selects the 4-row block being solved
dsll L, TEMP, 2 + BASE_SHIFT # mr=4
dsll TEMP, TEMP, 2 + BASE_SHIFT # nr=4
daddu AO, AORIG, L # AO point to the triangular data part
daddu BO, B, TEMP
LD b1, 0 * SIZE(BO) # triangular_part*X + rectangular_part = B
LD b2, 1 * SIZE(BO) # triangular_part*X = B - rectangular_part
LD b3, 2 * SIZE(BO)
LD b4, 3 * SIZE(BO)
SUB t11, b1, t11
SUB t12, b2, t12
SUB t13, b3, t13
SUB t14, b4, t14
LD b5, 4 * SIZE(BO) # sb store in row major
LD b6, 5 * SIZE(BO)
LD b7, 6 * SIZE(BO)
LD b8, 7 * SIZE(BO)
SUB t21, b5, t21
SUB t22, b6, t22
SUB t23, b7, t23
SUB t24, b8, t24
LD b1, 8 * SIZE(BO)
LD b2, 9 * SIZE(BO)
LD b3, 10 * SIZE(BO)
LD b4, 11 * SIZE(BO)
SUB t31, b1, t31
SUB t32, b2, t32
SUB t33, b3, t33
SUB t34, b4, t34
LD b5, 12 * SIZE(BO)
LD b6, 13 * SIZE(BO)
LD b7, 14 * SIZE(BO)
LD b8, 15 * SIZE(BO)
SUB t41, b5, t41
SUB t42, b6, t42
SUB t43, b7, t43
SUB t44, b8, t44
LD b1, 15 * SIZE(AO) # back substitution, row 4 first: AO[15] scales row 4
LD b2, 14 * SIZE(AO) # AO[14], AO[13], AO[12] couple row 4 into rows 3, 2, 1
LD b4, 13 * SIZE(AO)
LD b7, 12 * SIZE(AO)
MUL t41, b1, t41
MUL t42, b1, t42
MUL t43, b1, t43
MUL t44, b1, t44
NMSUB t31, t31, b2, t41
NMSUB t32, t32, b2, t42
NMSUB t33, t33, b2, t43
NMSUB t34, t34, b2, t44
NMSUB t21, t21, b4, t41
NMSUB t22, t22, b4, t42
NMSUB t23, t23, b4, t43
NMSUB t24, t24, b4, t44
NMSUB t11, t11, b7, t41
NMSUB t12, t12, b7, t42
NMSUB t13, t13, b7, t43
NMSUB t14, t14, b7, t44
LD b3, 10 * SIZE(AO) # row 3: AO[10] scales it, AO[9] and AO[8] couple it into rows 2, 1
LD b5, 9 * SIZE(AO)
LD b8, 8 * SIZE(AO)
MUL t31, b3, t31
MUL t32, b3, t32
MUL t33, b3, t33
MUL t34, b3, t34
NMSUB t21, t21, b5, t31
NMSUB t22, t22, b5, t32
NMSUB t23, t23, b5, t33
NMSUB t24, t24, b5, t34
NMSUB t11, t11, b8, t31
NMSUB t12, t12, b8, t32
NMSUB t13, t13, b8, t33
NMSUB t14, t14, b8, t34
LD b6, 5 * SIZE(AO) # row 2: AO[5] scales it, AO[4] couples it into row 1
LD b1, 4 * SIZE(AO)
MUL t21, b6, t21
MUL t22, b6, t22
MUL t23, b6, t23
MUL t24, b6, t24
NMSUB t11, t11, b1, t21
NMSUB t12, t12, b1, t22
NMSUB t13, t13, b1, t23
NMSUB t14, t14, b1, t24
LD b2, 0 * SIZE(AO) # row 1: AO[0] scales it
MUL t11, b2, t11
MUL t12, b2, t12
MUL t13, b2, t13
MUL t14, b2, t14
daddiu CO1, CO1, -4 * SIZE # step each column pointer back by four elements
daddiu CO2, CO2, -4 * SIZE
daddiu CO3, CO3, -4 * SIZE
daddiu CO4, CO4, -4 * SIZE
ST t11, 0 * SIZE(BO) # update packed B
ST t12, 1 * SIZE(BO)
ST t13, 2 * SIZE(BO)
ST t14, 3 * SIZE(BO)
ST t21, 4 * SIZE(BO)
ST t22, 5 * SIZE(BO)
ST t23, 6 * SIZE(BO)
ST t24, 7 * SIZE(BO)
ST t31, 8 * SIZE(BO)
ST t32, 9 * SIZE(BO)
ST t33, 10 * SIZE(BO)
ST t34, 11 * SIZE(BO)
ST t41, 12 * SIZE(BO)
ST t42, 13 * SIZE(BO)
ST t43, 14 * SIZE(BO)
ST t44, 15 * SIZE(BO)
ST t11, 0 * SIZE(CO1) # write back
ST t21, 1 * SIZE(CO1)
ST t31, 2 * SIZE(CO1)
ST t41, 3 * SIZE(CO1)
ST t12, 0 * SIZE(CO2)
ST t22, 1 * SIZE(CO2)
ST t32, 2 * SIZE(CO2)
ST t42, 3 * SIZE(CO2)
ST t13, 0 * SIZE(CO3)
ST t23, 1 * SIZE(CO3)
ST t33, 2 * SIZE(CO3)
ST t43, 3 * SIZE(CO3)
ST t14, 0 * SIZE(CO4)
ST t24, 1 * SIZE(CO4)
ST t34, 2 * SIZE(CO4)
ST t44, 3 * SIZE(CO4)
daddiu KK, KK, -4 # KC-KK is the length of the rectangular data part, LN compute from bottom to top so KK-=4
daddiu I, I, -1
MTC $0, a1 # zero a1 and use it to clear the accumulators for the next tile
MOV t11, a1
MOV t21, a1
MOV t31, a1
MOV t41, a1
MOV t12, a1
MOV t22, a1
MOV t32, a1
MOV t42, a1
bgtz I, .L11
nop
.align 3
.L29:
dsll TEMP, K, 2 + BASE_SHIFT # nr=4
daddu B, B, TEMP # B point to next Bj
bgtz J, .L10 # more 4-column panels remain
nop
.align 3
# Two-column panel (reached when bit 1 of N is set): same row walk as the
# nr=4 panels, using only CO1 and CO2.
.L30:
andi J, N, 2 # nr=2
blez J, .L70 # bit clear: go on to the single-column panel
nop
move CO1, C
daddu CO2, C, LDC
MTC $0, t11 # clear result registers
MOV t21, t11
MOV t31, t11
MOV t41, t11
daddu KK, M, OFFSET
move AORIG, A # reset A
daddu C, CO2, LDC # C = CO2 + LDC, i.e. two columns further on
andi I, M, 1 # mr=1
blez I, .L60 # bit clear: go on to the 2-row tile
nop
# 1x2 tile, rectangular part: accumulate into t11 and t12.
dsll TEMP, K, BASE_SHIFT # mr=1
dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of everypanel of Ai
dsll L, KK, BASE_SHIFT # mr=1
dsll TEMP, KK, 1 + BASE_SHIFT # nr=2
daddu AO, AORIG, L # AO point to rectangular data part
daddu BO, B, TEMP
dsubu TEMP, K, KK # TEMP = K - KK, the number of k steps in the rectangular part
MOV t12, t11 # clear result registers
MOV t22, t11
MOV t32, t11
MOV t42, t11
LD a1, 0 * SIZE(AO)
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
dsra L, TEMP, 2 # L = (K - KK) / 4 unrolled passes
blez L, .L65
nop
.align 3
# Main loop, unrolled four k steps per pass.
.L62:
LD a5, 1 * SIZE(AO)
LD b5, 2 * SIZE(BO)
LD b6, 3 * SIZE(BO)
MADD t11, t11, a1, b1 # 1st compute
MADD t12, t12, a1, b2
LD a3, 2 * SIZE(AO)
LD b3, 4 * SIZE(BO)
LD b4, 5 * SIZE(BO)
MADD t11, t11, a5, b5 # 2nd compute
MADD t12, t12, a5, b6
LD a7, 3 * SIZE(AO)
LD b7, 6 * SIZE(BO)
LD b8, 7 * SIZE(BO)
MADD t11, t11, a3, b3 # 3rd compute
MADD t12, t12, a3, b4
daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
LD a1, 0 * SIZE(AO) # next
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
MADD t11, t11, a7, b7 # 4th compute
MADD t12, t12, a7, b8
daddiu L, L, -1
bgtz L, .L62
nop
.align 3
.L65:
andi L, TEMP, 3 # L = (K - KK) mod 4 leftover k steps
blez L, .L68
nop
.align 3
# Remainder loop of the 1x2 tile: one k step per pass, accumulating into
# t11 and t12 only. The tile has a single row, so only a1 is ever loaded;
# the former multiply-adds into t21/t22 read a2, which this tile never
# loads, and produced values that were never read before those registers
# were cleared again. They are dropped here, matching the unrolled loop.
.L66:
MADD t11, t11, a1, b1 # row 1, column 1
MADD t12, t12, a1, b2 # row 1, column 2
daddiu AO, AO, 1 * SIZE # AO += mr
daddiu BO, BO, 2 * SIZE # BO += 2nr
LD a1, 0 * SIZE(AO) # next
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
daddiu L, L, -1
bgtz L, .L66
nop # branch delay slot
# 1x2 tile, triangular part: subtract the accumulated product from the
# right-hand side, scale by one entry of A, and store to the B buffer and C.
.L68:
daddiu TEMP, KK, -1 # mr=1
dsll L, TEMP, BASE_SHIFT # mr=1
dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2
daddu AO, AORIG, L # Ao point to the triangular data part
daddu BO, B, TEMP
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
SUB t11, b1, t11 # right-hand side minus the accumulated rectangular product
SUB t12, b2, t12
LD b3, 0 * SIZE(AO) # diagonal entry; applied with MUL, so presumably stored pre-inverted -- TODO confirm
MUL t11, b3, t11
MUL t12, b3, t12
daddiu CO1, CO1, -1 * SIZE # step each column pointer back by one element
daddiu CO2, CO2, -1 * SIZE
ST t11, 0 * SIZE(BO) # store the solution back into the B buffer
ST t12, 1 * SIZE(BO)
ST t11, 0 * SIZE(CO1) # and into the two columns of C
ST t12, 0 * SIZE(CO2)
daddiu KK, KK, -1 # the rectangular part grows by one for the next tile
MTC $0, t11 # clear result registers
MOV t21, t11
MOV t31, t11
MOV t41, t11
# 2x2 tile (reached when bit 1 of M is set): accumulate the rectangular
# product into t11/t21/t12/t22, then solve a 2x2 triangular block.
.L60:
andi I, M, 2 # mr=2
blez I, .L40 # bit clear: go on to the 4-row tiles
nop
dsll TEMP, K, 1 + BASE_SHIFT # mr=2
dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of everypanel of Ai
dsll L, KK, 1 + BASE_SHIFT # mr=2
dsll TEMP, KK, 1 + BASE_SHIFT # nr=2
daddu AO, AORIG, L # AO point to rectangular data part
daddu BO, B, TEMP
dsubu TEMP, K, KK # TEMP = K - KK, the number of k steps in the rectangular part
MOV t12, t11 # clear result registers
MOV t22, t11
MOV t32, t11
MOV t42, t11
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
dsra L, TEMP, 2 # L = (K - KK) / 4 unrolled passes
blez L, .L45
nop
.align 3
# Main loop, unrolled four k steps per pass.
.L42:
LD a5, 2 * SIZE(AO)
LD a6, 3 * SIZE(AO)
LD b5, 2 * SIZE(BO)
LD b6, 3 * SIZE(BO)
MADD t11, t11, a1, b1 # 1st compute
MADD t21, t21, a2, b1
MADD t12, t12, a1, b2
MADD t22, t22, a2, b2
LD a3, 4 * SIZE(AO)
LD a4, 5 * SIZE(AO)
LD b3, 4 * SIZE(BO)
LD b4, 5 * SIZE(BO)
MADD t11, t11, a5, b5 # 2nd compute
MADD t21, t21, a6, b5
MADD t12, t12, a5, b6
MADD t22, t22, a6, b6
LD a7, 6 * SIZE(AO)
LD a8, 7 * SIZE(AO)
LD b7, 6 * SIZE(BO)
LD b8, 7 * SIZE(BO)
MADD t11, t11, a3, b3 # 3rd compute
MADD t21, t21, a4, b3
MADD t12, t12, a3, b4
MADD t22, t22, a4, b4
daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
MADD t11, t11, a7, b7 # 4th compute
MADD t21, t21, a8, b7
MADD t12, t12, a7, b8
MADD t22, t22, a8, b8
daddiu L, L, -1
bgtz L, .L42
nop
.align 3
.L45:
andi L, TEMP, 3 # L = (K - KK) mod 4 leftover k steps
blez L, .L48
nop
.align 3
.L46:
MADD t11, t11, a1, b1 # remainder loop: one k step per pass
MADD t21, t21, a2, b1
MADD t12, t12, a1, b2
MADD t22, t22, a2, b2
daddiu AO, AO, 2 * SIZE # AO += 2mr
daddiu BO, BO, 2 * SIZE # BO += 2nr
LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
daddiu L, L, -1
bgtz L, .L46
nop
.L48: # deal with the triangular part
daddiu TEMP, KK, -2 # index KK - 2 selects the 2-row block being solved
dsll L, TEMP, 1 + BASE_SHIFT # mr=2
dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2
daddu AO, AORIG, L # Ao point to the triangular data part
daddu BO, B, TEMP
LD b1, 0 * SIZE(BO) # BO[0..1] pair with row 1, BO[2..3] with row 2
LD b2, 1 * SIZE(BO)
LD b3, 2 * SIZE(BO)
LD b4, 3 * SIZE(BO)
SUB t11, b1, t11 # right-hand side minus the accumulated rectangular product
SUB t12, b2, t12
SUB t21, b3, t21
SUB t22, b4, t22
LD b1, 3 * SIZE(AO) # computes the triangular_part
LD b2, 2 * SIZE(AO)
MUL t21, b1, t21 # row 2 first (bottom to top), scaled by AO[3]
MUL t22, b1, t22
NMSUB t11, t11, b2, t21 # eliminate row 2 from row 1 using AO[2]
NMSUB t12, t12, b2, t22
LD b3, 0 * SIZE(AO)
MUL t11, b3, t11 # then scale row 1 by AO[0]
MUL t12, b3, t12
daddiu CO1, CO1, -2 * SIZE # step each column pointer back by two elements
daddiu CO2, CO2, -2 * SIZE
ST t11, 0 * SIZE(BO) # store the solution back into the B buffer
ST t12, 1 * SIZE(BO)
ST t21, 2 * SIZE(BO)
ST t22, 3 * SIZE(BO)
ST t11, 0 * SIZE(CO1) # and into the two columns of C
ST t21, 1 * SIZE(CO1)
ST t12, 0 * SIZE(CO2)
ST t22, 1 * SIZE(CO2)
daddiu KK, KK, -2 # the rectangular part grows by two for the next tile
MTC $0, t11 # clear result registers
MOV t21, t11
MOV t31, t11
MOV t41, t11
# 4x2 tiles: .L31 runs M/4 times. Each pass accumulates the rectangular
# product into t11..t41 and t12..t42, solves a 4x4 triangular block for the
# two columns, and writes the result to the B buffer and C. .L49 then
# advances B past this two-column panel.
.L40:
dsra I, M, 2 # I = mc/4
blez I, .L49
nop
.L31:
dsll TEMP, K, 2 + BASE_SHIFT # mr=4
dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of panel Ai
dsll L, KK, 2 + BASE_SHIFT # mr=4
dsll TEMP, KK, 1 + BASE_SHIFT # nr=2
daddu AO, AORIG, L # AO point to the rectangular data part
daddu BO, B, TEMP
dsubu TEMP, K, KK # TEMP = K - KK, the number of k steps in the rectangular part
MOV t12, t11 # clear the second-column accumulators
MOV t22, t11
MOV t32, t11
MOV t42, t11
LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai
LD a2, 1 * SIZE(AO) # mr*KK with nr*KK
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO) # get 4a
LD b1, 0 * SIZE(BO) # get 2b
LD b2, 1 * SIZE(BO)
dsra L, TEMP, 2 # L = (K - KK) / 4 unrolled passes
blez L, .L35
nop
.align 3
# Main loop, unrolled four k steps per pass.
.L32:
LD a5, 4 * SIZE(AO)
LD a6, 5 * SIZE(AO)
LD a7, 6 * SIZE(AO)
LD a8, 7 * SIZE(AO)
LD b5, 2 * SIZE(BO)
LD b6, 3 * SIZE(BO)
MADD t11, t11, a1, b1 # 1st compute
MADD t21, t21, a2, b1
MADD t31, t31, a3, b1
MADD t41, t41, a4, b1
MADD t12, t12, a1, b2
MADD t22, t22, a2, b2
MADD t32, t32, a3, b2
MADD t42, t42, a4, b2
LD a1, 8 * SIZE(AO)
LD a2, 9 * SIZE(AO)
LD a3, 10 * SIZE(AO)
LD a4, 11 * SIZE(AO)
LD b3, 4 * SIZE(BO)
LD b4, 5 * SIZE(BO)
MADD t11, t11, a5, b5 # 2nd compute
MADD t21, t21, a6, b5
MADD t31, t31, a7, b5
MADD t41, t41, a8, b5
MADD t12, t12, a5, b6
MADD t22, t22, a6, b6
MADD t32, t32, a7, b6
MADD t42, t42, a8, b6
LD a5, 12 * SIZE(AO)
LD a6, 13 * SIZE(AO)
LD a7, 14 * SIZE(AO)
LD a8, 15 * SIZE(AO)
LD b7, 6 * SIZE(BO)
LD b8, 7 * SIZE(BO)
MADD t11, t11, a1, b3 # 3rd compute
MADD t21, t21, a2, b3
MADD t31, t31, a3, b3
MADD t41, t41, a4, b3
MADD t12, t12, a1, b4
MADD t22, t22, a2, b4
MADD t32, t32, a3, b4
MADD t42, t42, a4, b4
daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
MADD t11, t11, a5, b7 # 4th compute
MADD t21, t21, a6, b7
MADD t31, t31, a7, b7
MADD t41, t41, a8, b7
MADD t12, t12, a5, b8
MADD t22, t22, a6, b8
MADD t32, t32, a7, b8
MADD t42, t42, a8, b8
daddiu L, L, -1
bgtz L, .L32
nop
.align 3
.L35:
andi L, TEMP, 3 # L = (K - KK) mod 4 leftover k steps
blez L, .L38
nop
.align 3
.L36:
MADD t11, t11, a1, b1 # remainder loop: one k step per pass
MADD t21, t21, a2, b1
MADD t31, t31, a3, b1
MADD t41, t41, a4, b1
MADD t12, t12, a1, b2
MADD t22, t22, a2, b2
MADD t32, t32, a3, b2
MADD t42, t42, a4, b2
daddiu AO, AO, 4 * SIZE # AO += 4mr
daddiu BO, BO, 2 * SIZE # BO += 2nr
LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
daddiu L, L, -1
bgtz L, .L36
nop
.L38: # deal with the triangular part
daddiu TEMP, KK, -4 # index KK - 4 selects the 4-row block being solved
dsll L, TEMP, 2 + BASE_SHIFT # mr=4
dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2
daddu AO, AORIG, L # AO point to the triangular data part
daddu BO, B, TEMP
LD b1, 0 * SIZE(BO) # BO holds two values per row: rows 1..4 at [0..1], [2..3], [4..5], [6..7]
LD b2, 1 * SIZE(BO)
LD b3, 2 * SIZE(BO)
LD b4, 3 * SIZE(BO)
LD b5, 4 * SIZE(BO)
LD b6, 5 * SIZE(BO)
LD b7, 6 * SIZE(BO)
LD b8, 7 * SIZE(BO)
SUB t11, b1, t11 # right-hand side minus the accumulated rectangular product
SUB t12, b2, t12
SUB t21, b3, t21
SUB t22, b4, t22
SUB t31, b5, t31
SUB t32, b6, t32
SUB t41, b7, t41
SUB t42, b8, t42
LD b1, 15 * SIZE(AO) # back substitution, row 4 first: AO[15] scales row 4
LD b2, 14 * SIZE(AO) # AO[14], AO[13], AO[12] couple row 4 into rows 3, 2, 1
LD b4, 13 * SIZE(AO)
LD b7, 12 * SIZE(AO)
MUL t41, b1, t41
MUL t42, b1, t42
NMSUB t31, t31, b2, t41
NMSUB t32, t32, b2, t42
NMSUB t21, t21, b4, t41
NMSUB t22, t22, b4, t42
NMSUB t11, t11, b7, t41
NMSUB t12, t12, b7, t42
LD b3, 10 * SIZE(AO) # row 3: AO[10] scales it, AO[9] and AO[8] couple it into rows 2, 1
LD b5, 9 * SIZE(AO)
LD b8, 8 * SIZE(AO)
MUL t31, b3, t31
MUL t32, b3, t32
NMSUB t21, t21, b5, t31
NMSUB t22, t22, b5, t32
NMSUB t11, t11, b8, t31
NMSUB t12, t12, b8, t32
LD b6, 5 * SIZE(AO) # row 2: AO[5] scales it, AO[4] couples it into row 1
LD b1, 4 * SIZE(AO)
MUL t21, b6, t21
MUL t22, b6, t22
NMSUB t11, t11, b1, t21
NMSUB t12, t12, b1, t22
LD b2, 0 * SIZE(AO) # row 1: AO[0] scales it
MUL t11, b2, t11
MUL t12, b2, t12
daddiu CO1, CO1, -4 * SIZE # step each column pointer back by four elements
daddiu CO2, CO2, -4 * SIZE
ST t11, 0 * SIZE(BO) # store the solution back into the B buffer
ST t12, 1 * SIZE(BO)
ST t21, 2 * SIZE(BO)
ST t22, 3 * SIZE(BO)
ST t31, 4 * SIZE(BO)
ST t32, 5 * SIZE(BO)
ST t41, 6 * SIZE(BO)
ST t42, 7 * SIZE(BO)
ST t11, 0 * SIZE(CO1) # and into the two columns of C
ST t21, 1 * SIZE(CO1)
ST t31, 2 * SIZE(CO1)
ST t41, 3 * SIZE(CO1)
ST t12, 0 * SIZE(CO2)
ST t22, 1 * SIZE(CO2)
ST t32, 2 * SIZE(CO2)
ST t42, 3 * SIZE(CO2)
daddiu KK, KK, -4 # the rectangular part grows by four for the next tile
MTC $0, t11 # clear result registers
MOV t21, t11
MOV t31, t11
MOV t41, t11
daddiu I, I, -1
bgtz I, .L31
nop
.align 3
.L49:
dsll TEMP, K, 1 + BASE_SHIFT # nr=2
daddu B, B, TEMP # advance B past this two-column panel
.align 3
# Single-column panel (reached when bit 0 of N is set); only CO1 is used.
.L70:
andi J, N, 1 # nr=1
blez J, .L999 # END
nop
move CO1, C
daddu KK, M, OFFSET
move AORIG, A # reset A
andi I, M, 1 # mr=1
blez I, .L90 # bit clear: go on to the 2-row tile
NOP
# 1x1 tile: accumulate the rectangular dot product into t11, then solve
# against one entry of A and store to the B buffer and C.
MTC $0, t11 # clear the accumulator
dsll TEMP, K, BASE_SHIFT # mr=1
dsubu AORIG, AORIG, TEMP
dsll L, KK, BASE_SHIFT # mr=1 and nr=1 share the same offset
daddu AO, AORIG, L # AO point to the rectangular data part
daddu BO, B, L
dsubu TEMP, K, KK # TEMP = K - KK, the number of k steps in the rectangular part
LD a1, 0 * SIZE(AO)
LD b1, 0 * SIZE(BO)
dsra L, TEMP, 2 # L = (K - KK) / 4 unrolled passes
blez L, .L95
nop
.align 3
# Main loop, unrolled four k steps per pass.
.L92:
LD a5, 1 * SIZE(AO)
LD b5, 1 * SIZE(BO)
MADD t11, t11, a1, b1 # 1st compute
LD a3, 2 * SIZE(AO)
LD b3, 2 * SIZE(BO)
MADD t11, t11, a5, b5 # 2nd compute
LD a7, 3 * SIZE(AO)
LD b7, 3 * SIZE(BO)
MADD t11, t11, a3, b3 # 3rd compute
daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
LD a1, 0 * SIZE(AO) # next
LD b1, 0 * SIZE(BO)
MADD t11, t11, a7, b7 # 4th compute
daddiu L, L, -1
bgtz L, .L92
nop
.align 3
.L95:
andi L, TEMP, 3 # L = (K - KK) mod 4 leftover k steps
blez L, .L98
nop
.align 3
.L96:
MADD t11, t11, a1, b1 # remainder loop: one k step per pass
daddiu AO, AO, 1 * SIZE # AO += 1mr
daddiu BO, BO, 1 * SIZE # BO += 1nr
LD a1, 0 * SIZE(AO) # next
LD b1, 0 * SIZE(BO)
daddiu L, L, -1
bgtz L, .L96
nop
.L98:
daddiu TEMP, KK, -1 # mr=1
dsll TEMP, TEMP, BASE_SHIFT
daddu AO, AORIG, TEMP # AO point to the triangular data part
daddu BO, B, TEMP
LD b1, 0 * SIZE(BO)
SUB t11, b1, t11 # right-hand side minus the accumulated rectangular product
LD b3, 0 * SIZE(AO) # diagonal entry; applied with MUL, so presumably stored pre-inverted -- TODO confirm
MUL t11, b3, t11
daddiu CO1, CO1, -1 * SIZE # step the column pointer back by one element
ST t11, 0 * SIZE(BO) # store the solution back into the B buffer
ST t11, 0 * SIZE(CO1) # and into C
daddiu KK, KK, -1 # the rectangular part grows by one for the next tile
# 2x1 tile (reached when bit 1 of M is set): accumulate into t11 and t21,
# then solve a 2x2 triangular block for the single column.
.L90:
andi I, M, 2 # mr=2
blez I, .L80 # bit clear: go on to the 4-row tiles
NOP
MTC $0, t11
MOV t21, t11 # clear result registers
dsll TEMP, K, 1+BASE_SHIFT # mr=2
dsubu AORIG, AORIG, TEMP
dsll L, KK, 1 + BASE_SHIFT # mr=2
dsll TEMP, KK, 0 + BASE_SHIFT # nr=1
daddu AO, AORIG, L # AO point to the rectangular data part
daddu BO, B, TEMP
dsubu TEMP, K, KK # TEMP = K - KK, the number of k steps in the rectangular part
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD b1, 0 * SIZE(BO)
dsra L, TEMP, 2 # L = (K - KK) / 4 unrolled passes
blez L, .L85
nop
.align 3
# Main loop, unrolled four k steps per pass.
.L82:
LD a5, 2 * SIZE(AO)
LD a6, 3 * SIZE(AO)
LD b5, 1 * SIZE(BO)
MADD t11, t11, a1, b1 # 1st compute
MADD t21, t21, a2, b1
LD a3, 4 * SIZE(AO)
LD a4, 5 * SIZE(AO)
LD b3, 2 * SIZE(BO)
MADD t11, t11, a5, b5 # 2nd compute
MADD t21, t21, a6, b5
LD a7, 6 * SIZE(AO)
LD a8, 7 * SIZE(AO)
LD b7, 3 * SIZE(BO)
MADD t11, t11, a3, b3 # 3rd compute
MADD t21, t21, a4, b3
daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD b1, 0 * SIZE(BO)
MADD t11, t11, a7, b7 # 4th compute
MADD t21, t21, a8, b7
daddiu L, L, -1
bgtz L, .L82
nop
.align 3
.L85:
andi L, TEMP, 3 # L = (K - KK) mod 4 leftover k steps
blez L, .L88
nop
.align 3
.L86:
MADD t11, t11, a1, b1 # remainder loop: one k step per pass
MADD t21, t21, a2, b1
daddiu AO, AO, 2 * SIZE # AO += 2mr
daddiu BO, BO, 1 * SIZE # BO += 1nr
LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD b1, 0 * SIZE(BO)
daddiu L, L, -1
bgtz L, .L86
nop
.L88:
daddiu TEMP, KK, -2 # mr=2
dsll L, TEMP, 1 + BASE_SHIFT # mr=2
dsll TEMP, TEMP, 0 + BASE_SHIFT # nr=1
daddu AO, AORIG, L # AO point to the triangular data part
daddu BO, B, TEMP
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
SUB t11, b1, t11 # right-hand side minus the accumulated rectangular product
SUB t21, b2, t21
LD b1, 3 * SIZE(AO) # computes the triangular_part
LD b2, 2 * SIZE(AO)
MUL t21, b1, t21 # row 2 first (bottom to top), scaled by AO[3]
NMSUB t11, t11, b2, t21 # eliminate row 2 from row 1 using AO[2]
LD b3, 0 * SIZE(AO)
MUL t11, b3, t11 # then scale row 1 by AO[0]
daddiu CO1, CO1, -2 * SIZE # step the column pointer back by two elements
ST t11, 0 * SIZE(BO) # store the solution back into the B buffer
ST t21, 1 * SIZE(BO)
ST t11, 0 * SIZE(CO1) # and into C
ST t21, 1 * SIZE(CO1)
daddiu KK, KK, -2 # the rectangular part grows by two for the next tile
.align 3
# 4x1 tiles: .L71 runs M/4 times. Each pass accumulates the rectangular
# product into t11..t41, solves a 4x4 triangular block for the single
# column, and writes the result to the B buffer and C. .L89 then advances B.
.L80:
dsra I, M, 2 # I = M / 4 tiles of four rows
blez I, .L89
nop
.L71:
dsll TEMP, K, 2 + BASE_SHIFT # mr=4
dsubu AORIG, AORIG, TEMP
dsll L, KK, 2 + BASE_SHIFT # mr=4
dsll TEMP, KK, 0 + BASE_SHIFT # nr=1
daddu AO, AORIG, L # AO point to the rectangular
daddu BO, B, TEMP
dsubu TEMP, K, KK # TEMP = K - KK, the number of k steps in the rectangular part
MTC $0, t11 # clear result registers
MOV t21, t11
MOV t31, t11
MOV t41, t11
LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai
LD a2, 1 * SIZE(AO) # mr*KK with nr*KK
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO) # get 4a
LD b1, 0 * SIZE(BO) # get 1b
dsra L, TEMP, 2 # L = (K - KK) / 4 unrolled passes
blez L, .L75
nop # branch delay slot
.align 3
# Main loop, unrolled four k steps per pass.
.L72:
LD a5, 4 * SIZE(AO)
LD a6, 5 * SIZE(AO)
LD a7, 6 * SIZE(AO)
LD a8, 7 * SIZE(AO)
LD b5, 1 * SIZE(BO)
MADD t11, t11, a1, b1 # 1st compute
MADD t21, t21, a2, b1
MADD t31, t31, a3, b1
MADD t41, t41, a4, b1
LD a1, 8 * SIZE(AO)
LD a2, 9 * SIZE(AO)
LD a3, 10 * SIZE(AO)
LD a4, 11 * SIZE(AO)
LD b3, 2 * SIZE(BO)
MADD t11, t11, a5, b5 # 2nd compute
MADD t21, t21, a6, b5
MADD t31, t31, a7, b5
MADD t41, t41, a8, b5
LD a5, 12 * SIZE(AO)
LD a6, 13 * SIZE(AO)
LD a7, 14 * SIZE(AO)
LD a8, 15 * SIZE(AO)
LD b7, 3 * SIZE(BO)
MADD t11, t11, a1, b3 # 3rd compute
MADD t21, t21, a2, b3
MADD t31, t31, a3, b3
MADD t41, t41, a4, b3
daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
LD b1, 0 * SIZE(BO)
MADD t11, t11, a5, b7 # 4th compute
MADD t21, t21, a6, b7
MADD t31, t31, a7, b7
MADD t41, t41, a8, b7
daddiu L, L, -1
bgtz L, .L72
nop
.align 3
.L75:
andi L, TEMP, 3 # L = (K - KK) mod 4 leftover k steps
blez L, .L78
nop
.align 3
.L76:
MADD t11, t11, a1, b1 # remainder loop: one k step per pass
MADD t21, t21, a2, b1
MADD t31, t31, a3, b1
MADD t41, t41, a4, b1
daddiu AO, AO, 4 * SIZE # AO += 4mr
daddiu BO, BO, 1 * SIZE # BO += 1nr
LD a1, 0 * SIZE(AO) # next
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
LD b1, 0 * SIZE(BO)
daddiu L, L, -1
bgtz L, .L76
nop
.L78:
daddiu TEMP, KK, -4 # mr=4
dsll L, TEMP, 2 + BASE_SHIFT # mr=4
dsll TEMP, TEMP, 0 + BASE_SHIFT # nr=1
daddu AO, AORIG, L # AO point to the triangular
daddu BO, B, TEMP
LD b1, 0 * SIZE(BO) # one right-hand-side value per row
LD b2, 1 * SIZE(BO)
LD b3, 2 * SIZE(BO)
LD b4, 3 * SIZE(BO)
SUB t11, b1, t11 # right-hand side minus the accumulated rectangular product
SUB t21, b2, t21
SUB t31, b3, t31
SUB t41, b4, t41
LD b1, 15 * SIZE(AO) # back substitution, row 4 first: AO[15] scales row 4
LD b2, 14 * SIZE(AO) # AO[14], AO[13], AO[12] couple row 4 into rows 3, 2, 1
LD b4, 13 * SIZE(AO)
LD b7, 12 * SIZE(AO)
MUL t41, b1, t41
NMSUB t31, t31, b2, t41
NMSUB t21, t21, b4, t41
NMSUB t11, t11, b7, t41
LD b3, 10 * SIZE(AO) # row 3: AO[10] scales it, AO[9] and AO[8] couple it into rows 2, 1
LD b5, 9 * SIZE(AO)
LD b8, 8 * SIZE(AO)
MUL t31, b3, t31
NMSUB t21, t21, b5, t31
NMSUB t11, t11, b8, t31
LD b6, 5 * SIZE(AO) # row 2: AO[5] scales it, AO[4] couples it into row 1
LD b1, 4 * SIZE(AO)
MUL t21, b6, t21
NMSUB t11, t11, b1, t21
LD b2, 0 * SIZE(AO) # row 1: AO[0] scales it
MUL t11, b2, t11
daddiu CO1, CO1, -4 * SIZE # step the column pointer back by four elements
ST t11, 0 * SIZE(BO) # store the solution back into the B buffer
ST t21, 1 * SIZE(BO)
ST t31, 2 * SIZE(BO)
ST t41, 3 * SIZE(BO)
ST t11, 0 * SIZE(CO1) # and into C
ST t21, 1 * SIZE(CO1)
ST t31, 2 * SIZE(CO1)
ST t41, 3 * SIZE(CO1)
daddiu KK, KK, -4 # the rectangular part grows by four for the next tile
daddiu I, I, -1
bgtz I, .L71
nop
.align 3
.L89:
dsll TEMP, K, BASE_SHIFT # nr=1
daddu B, B, TEMP # advance B past this single-column panel
.align 3
# Common exit: reload every register spilled at entry, release the
# 144-byte frame and return through $31.
.L999:
LDARG $16, 0($sp)
LDARG $17, 8($sp)
LDARG $18, 16($sp)
LDARG $19, 24($sp)
LDARG $20, 32($sp)
LDARG $21, 40($sp)
ldc1 $f24, 48($sp)
ldc1 $f25, 56($sp)
ldc1 $f26, 64($sp)
ldc1 $f27, 72($sp)
ldc1 $f28, 80($sp)
LDARG $22, 88($sp)
LDARG $23, 96($sp)
LDARG $24, 104($sp)
LDARG $25, 112($sp)
#ifndef __64BIT__
ldc1 $f20,112($sp) # NOTE(review): same offset as the $25 reload just above
ldc1 $f21,120($sp)
ldc1 $f22,128($sp)
ldc1 $f23,136($sp)
#endif
j $31 # return to the caller
daddiu $sp, $sp, 144 # runs in the jump delay slot: release the frame
EPILOGUE