##define REALNAME gemm
#define ASSEMBLER
#include "common.h"
/* Prefetch is issued as an ordinary load; every use below targets register $0. */
#define FETCH ld
/* Size in bytes of the stack frame allocated by the prologue. */
#define STACKSIZE 192
/*
 * Hand-assembled instruction words. Arguments are bare register NUMBERS
 * (the Fn / Rn macros), not $-prefixed names. Call sites below annotate
 * gsLQC1 as loading the FP register pair (ft, fq) from base + offset.
 * NOTE(review): presumably the Loongson gslqc1 / gssqc1 quad-word FP
 * load / store with the offset counted in 16-byte units -- confirm against
 * the Loongson-3A manual.
 */
#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
##### Parameter registers ####
/* Integer registers holding the kernel arguments. LDC is not passed in a
 * register: the prologue loads it from the caller's stack into $8. */
#define M $4
#define N $5
#define K $6
#define A $9
#define B $10
#define C $11
#define LDC $8
#### Pointer A, B, C ####
/* Working pointers. AO is $12 and BO is $13, which is why the gsLQC1 call
 * sites pass R12 / R13 as the base register number. */
#define AO $12
#define BO $13
#define CO1 $14
#define CO2 $15
#define PREA $18
#define PREB $19
#### Used registers ####
/* A1..A8 ($f0-$f7): operands loaded through AO; reused as scratch in write-back. */
#define A1 $f0
#define A2 $f1
#define A3 $f2
#define A4 $f3
#define A5 $f4
#define A6 $f5
#define A7 $f6
#define A8 $f7
/* B1..B8 ($f8-$f15): operands loaded through BO (B1,B2,B5,B6) and their PLU
 * permutations (B3,B4,B7,B8); reused as scratch in write-back. */
#define B1 $f8
#define B2 $f9
#define B3 $f10
#define B4 $f11
#define B5 $f12
#define B6 $f13
#define B7 $f14
#define B8 $f15
/* Cxy ($f16-$f31): accumulators. x = row of the tile; y = 1,2 accumulate
 * against B1/B2 (B5/B6), y = 3,4 against the PLU-permuted B3/B4 (B7/B8). */
#define C11 $f16
#define C12 $f17
#define C21 $f18
#define C22 $f19
#define C31 $f20
#define C32 $f21
#define C41 $f22
#define C42 $f23
#define C13 $f24
#define C14 $f25
#define C23 $f26
#define C24 $f27
#define C33 $f28
#define C34 $f29
#define C43 $f30
#define C44 $f31
/* Loop counters: I walks row blocks, J column blocks, L the K loop. */
#define I $2
#define J $3
#define L $7
#### Alpha register ####
/* ALPHA shares $f15 with B8, so it is spilled to the stack at .L2 before
 * B8 is first written and reloaded from there during write-back. */
#define ALPHA $f15
/* Bare FP register numbers (Fn -> n) for the fq / ft fields of the
 * hand-encoded gsLQC1 / gsSQC1 instruction words. */
#define F0 0
#define F1 1
#define F2 2
#define F3 3
#define F4 4
#define F5 5
#define F6 6
#define F7 7
#define F8 8
#define F9 9
#define F10 10
#define F11 11
#define F12 12
#define F13 13
#define F14 14
#define F15 15
#define F16 16
#define F17 17
#define F18 18
#define F19 19
#define F20 20
#define F21 21
#define F22 22
#define F23 23
#define F24 24
#define F25 25
#define F26 26
#define F27 27
#define F28 28
#define F29 29
#define F30 30
#define F31 31
/* Bare integer register numbers (Rn -> n) for the base field of the same
 * encodings; R12 / R13 are the numbers of the AO / BO registers. */
#define R12 12
#define R13 13
#define R14 14
#define R15 15
#define R16 16
#define R17 17
/* Extra integer registers used only by the TRMM build: OFFSET is loaded from
 * the caller's stack in the prologue, KK is the running offset derived from
 * it, and TEMP is scratch (also the effective K count for the tail tests). */
#if defined(TRMMKERNEL)
#define OFFSET $23
#define KK $24
#define TEMP $25
#endif
/*
 * Function entry: fetch the stack-passed arguments and spill registers.
 *
 * Frame layout (offsets from the new $sp, frame is STACKSIZE bytes):
 *     0.. 48   $16-$22
 *    56.. 88   $f24-$f28
 *    96..112   $23-$25            (TRMMKERNEL only)
 *   120..144   $f20-$f23          (only when __64BIT__ is not defined)
 *   152, 160   $f15 / $f16, stored later at .L2 and read back as
 *              alpha_r / alpha_i in the write-back code
 *
 * NOTE(review): the kernel also writes $f29-$f31 (C34, C43, C44), which are
 * not spilled here, and ST may store less than the full paired-single
 * register. Confirm against the target ABI and the epilogue, which is not
 * visible in this part of the file.
 */
PROLOGUE
/* LDC is read from the caller's frame before $sp is moved. */
LDARG LDC, 0($sp)
daddiu $sp,$sp,-STACKSIZE
sd $16, 0($sp)
sd $17, 8($sp)
sd $18, 16($sp)
sd $19, 24($sp)
sd $20, 32($sp)
sd $21, 40($sp)
sd $22, 48($sp)
ST $f24, 56($sp)
ST $f25, 64($sp)
ST $f26, 72($sp)
ST $f27, 80($sp)
ST $f28, 88($sp)
#if defined(TRMMKERNEL)
sd $23, 96($sp)
sd $24, 104($sp)
sd $25, 112($sp)
/* OFFSET sits 8 bytes above LDC in the caller's frame; $sp has already
 * moved, hence the STACKSIZE bias. */
LDARG OFFSET, STACKSIZE+8($sp)
#endif
#ifndef __64BIT__
ST $f20,120($sp)
ST $f21,128($sp)
ST $f22,136($sp)
ST $f23,144($sp)
#endif
.align 4
/*
 * .L2 -- set up the loop over pairs of columns.
 * Spills $f15 (ALPHA) to 152($sp) and $f16 to 160($sp); the write-back code
 * reloads these two slots as alpha_r and alpha_i. Both registers are reused
 * later (B8 and C11), which is why they must go to memory first.
 */
.L2:
dsra J, N, 1 # J = N / 2 (NR=2 columns per pass)
ST $f15, 152($sp)
#if defined(TRMMKERNEL) && !defined(LEFT)
neg KK, OFFSET
#endif
dsll LDC, LDC, ZBASE_SHIFT# LDC*SIZE
blez J, .L1
/* Placed after the branch: executes in its delay slot (the explicit NOPs
 * after other branches indicate noreorder mode), so it runs on both paths. */
ST $f16, 160($sp)
/*
 * .L24 -- start of one two-column panel.
 * Resets AO to the start of A, points CO1 / CO2 at the two output columns
 * (C and C + LDC), sets PREA = AO + (K << (1 + ZBASE_SHIFT)) as the A
 * prefetch pointer, and counts M / 4 row blocks into I. With no full row
 * block the code goes straight to .L22.
 */
.L24:
#if defined(TRMMKERNEL) && defined(LEFT)
move KK, OFFSET
#endif
dsra I, M, 2 # I = M / 4 (MR=8 floats = 4 complex rows)
move AO, A # Reset A
dsll PREA, K, 1 + ZBASE_SHIFT
move CO1, C
daddu CO2, C, LDC
daddu PREA, AO, PREA
blez I, .L22
/* Follows the branch (delay slot): C is advanced past both columns for the
 * next panel whether or not the branch is taken. */
daddu C, CO2, LDC
.align 4
/*
 * .L241 -- set-up for one 4-row x 2-column tile.
 * Positions BO (and, for TRMM, offsets AO / BO by KK), zeroes the sixteen
 * accumulators C11..C44, preloads the first A / B operands, builds the
 * permuted operands B3 / B4 with PLU, touches the output lines at CO1 / CO2,
 * and computes L = (effective K) / 4 for the unrolled loop. Jumps to .L242
 * when there is no full group of four K steps.
 * The TRMM and plain-GEMM variants differ only in how BO / L are derived and
 * in instruction scheduling.
 */
.L241:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move BO, B
#else
/* Skip KK steps: 4 rows per step through AO, 2 columns per step through BO. */
dsll L, KK, 2 + ZBASE_SHIFT
dsll TEMP, KK, 1 + ZBASE_SHIFT
daddu AO, AO, L
daddu BO, B, TEMP
#endif
MTC $0, C11 # CLEAR RESULTS REGISTERS
MOV C12, C11
dsll PREB, K, ZBASE_SHIFT
MOV C21, C11
MOV C22, C11
gsLQC1(R13, F9, F8, 0) # B1 B2
MOV C31, C11
MOV C32, C11
gsLQC1(R12, F1, F0, 0) # A1 A2
MOV C41, C11
MOV C42, C11
gsLQC1(R12, F3, F2, 1) # A3 A4
MOV C13, C11
MOV C14, C11
MOV C23, C11
MOV C24, C11
MOV C33, C11
MOV C34, C11
MOV C43, C11
MOV C44, C11
PLU B3, B1, B1
PLU B4, B2, B2
daddu PREB, BO, PREB
FETCH $0, 0 * SIZE(CO1)
FETCH $0, 8 * SIZE(CO1)
FETCH $0, 0 * SIZE(CO2)
FETCH $0, 8 * SIZE(CO2)
/* TEMP = number of K steps this tile actually needs. */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, K, KK
#elif defined(LEFT)
daddiu TEMP, KK, 4
#else
daddiu TEMP, KK, 2
#endif
dsra L, TEMP, 2
blez L, .L242
NOP
#else
move BO, B # Reset B
dsra L, K, 2 # L = K / 4 (loop body is unrolled 4x in K)
MTC $0, C11 # CLEAR RESULTS REGISTERS
MOV C12, C11
dsll PREB, K, ZBASE_SHIFT
MOV C21, C11
MOV C22, C11
gsLQC1(R13, F9, F8, 0) # B1 B2
MOV C31, C11
MOV C32, C11
gsLQC1(R12, F1, F0, 0) # A1 A2
MOV C41, C11
MOV C42, C11
gsLQC1(R12, F3, F2, 1) # A3 A4
MOV C13, C11
MOV C14, C11
FETCH $0, 0 * SIZE(CO1)
MOV C23, C11
MOV C24, C11
FETCH $0, 0 * SIZE(CO2)
MOV C33, C11
MOV C34, C11
MOV C43, C11
MOV C44, C11
daddu PREB, BO, PREB
PLU B3, B1, B1
PLU B4, B2, B2
FETCH $0, 8 * SIZE(CO1)
blez L, .L242
/* Delay slot: last output-line touch, executed on both paths. */
FETCH $0, 8 * SIZE(CO2)
#endif
/*
 * .L2410 -- main K loop of the 4x2 tile, four K steps per iteration.
 *
 * Register sets alternate between steps: even steps multiply A1..A4 by
 * B1..B4, odd steps multiply A5..A8 by B5..B8. While one set is consumed the
 * other is loaded, and each freshly loaded B pair is permuted with PLU to
 * form the second operand pair (B3/B4 or B7/B8).
 * Per iteration AO advances 32 * SIZE and BO 16 * SIZE; the loads issued
 * after those increments use offset 0 / 1 and fetch the operands for the
 * next iteration (or for the tail code at .L242 / .L247).
 * Statement order is a hand-tuned schedule; do not reorder.
 */
.L2410:
daddiu L, L, -1
/* ---- K step 0: A1..A4 x B1..B4 ---- */
gsLQC1(R13, F13, F12, 1) # B5 B6
MADPS C11, C11, A1, B1
MADPS C21, C21, A2, B1
gsLQC1(R12, F5, F4, 2) # A5 A6
MADPS C12, C12, A1, B2
MADPS C22, C22, A2, B2
gsLQC1(R12, F7, F6, 3) # A7 A8
MADPS C31, C31, A3, B1
MADPS C41, C41, A4, B1
FETCH $0, 0 * SIZE(PREB)
MADPS C32, C32, A3, B2
MADPS C42, C42, A4, B2
FETCH $0, 0 * SIZE(PREA)
MADPS C13, C13, A1, B3
MADPS C23, C23, A2, B3
MADPS C33, C33, A3, B3
MADPS C43, C43, A4, B3
MADPS C14, C14, A1, B4
MADPS C24, C24, A2, B4
PLU B7, B5, B5
PLU B8, B6, B6
daddu PREB, PREB, 8 * SIZE
MADPS C34, C34, A3, B4
MADPS C44, C44, A4, B4
/* ---- K step 1: A5..A8 x B5..B8 ---- */
gsLQC1(R13, F9, F8, 2) # B1 B2
MADPS C11, C11, A5, B5
MADPS C21, C21, A6, B5
gsLQC1(R12, F1, F0, 4) # A1 A2
MADPS C12, C12, A5, B6
MADPS C22, C22, A6, B6
gsLQC1(R12, F3, F2, 5) # A3 A4
MADPS C31, C31, A7, B5
MADPS C41, C41, A8, B5
FETCH $0, 8 * SIZE(PREA)
MADPS C32, C32, A7, B6
MADPS C42, C42, A8, B6
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
MADPS C14, C14, A5, B8
MADPS C24, C24, A6, B8
PLU B3, B1, B1
PLU B4, B2, B2
MADPS C34, C34, A7, B8
MADPS C44, C44, A8, B8
/* ---- K step 2: A1..A4 x B1..B4 ---- */
gsLQC1(R13, F13, F12, 3) # B5 B6
MADPS C11, C11, A1, B1
MADPS C21, C21, A2, B1
gsLQC1(R12, F5, F4, 6) # A5 A6
MADPS C12, C12, A1, B2
MADPS C22, C22, A2, B2
gsLQC1(R12, F7, F6, 7) # A7 A8
MADPS C31, C31, A3, B1
MADPS C41, C41, A4, B1
daddiu BO, BO, 4 * 4 * SIZE # 4KR*4NR
FETCH $0, 16 * SIZE(PREA)
MADPS C32, C32, A3, B2
MADPS C42, C42, A4, B2
daddiu AO, AO, 8 * 4 * SIZE # 4KR*8MR
MADPS C13, C13, A1, B3
MADPS C23, C23, A2, B3
MADPS C33, C33, A3, B3
MADPS C43, C43, A4, B3
MADPS C14, C14, A1, B4
MADPS C24, C24, A2, B4
PLU B7, B5, B5
PLU B8, B6, B6
MADPS C34, C34, A3, B4
MADPS C44, C44, A4, B4
/* ---- K step 3: A5..A8 x B5..B8; AO / BO already point at the next group ---- */
gsLQC1(R13, F9, F8, 0) # B1 B2
MADPS C11, C11, A5, B5
MADPS C21, C21, A6, B5
gsLQC1(R12, F1, F0, 0) # A1 A2
MADPS C12, C12, A5, B6
MADPS C22, C22, A6, B6
gsLQC1(R12, F3, F2, 1) # A3 A4
MADPS C31, C31, A7, B5
MADPS C41, C41, A8, B5
FETCH $0, 24 * SIZE(PREA)
MADPS C32, C32, A7, B6
MADPS C42, C42, A8, B6
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
daddu PREA, PREA, 32 * SIZE
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
MADPS C14, C14, A5, B8
MADPS C24, C24, A6, B8
PLU B3, B1, B1
PLU B4, B2, B2
MADPS C34, C34, A7, B8
bgtz L, .L2410
/* Delay slot: final accumulate of the iteration. */
MADPS C44, C44, A8, B8
.align 4
/*
 * .L242 -- K tail of the 4x2 tile: two K steps when bit 1 of the effective
 * K count (K, or TEMP for TRMM) is set; otherwise skip to .L247.
 * Same register rotation as the main loop: step 0 uses A1..A4 / B1..B4,
 * step 1 uses A5..A8 / B5..B8. AO advances 16 * SIZE and BO 8 * SIZE, and
 * the operands for a possible final single step are preloaded afterwards.
 */
.L242:
#ifndef TRMMKERNEL
andi L, K, 2
#else
andi L, TEMP, 2
#endif
blez L, .L247
NOP
gsLQC1(R13, F13, F12, 1) # B5 B6
MADPS C11, C11, A1, B1
MADPS C21, C21, A2, B1
gsLQC1(R12, F5, F4, 2) # A5 A6
MADPS C12, C12, A1, B2
MADPS C22, C22, A2, B2
gsLQC1(R12, F7, F6, 3) # A7 A8
MADPS C31, C31, A3, B1
MADPS C41, C41, A4, B1
daddiu BO, BO, 2 * 4 * SIZE # 2KR*4NR
MADPS C32, C32, A3, B2
MADPS C42, C42, A4, B2
daddiu AO, AO, 4 * 4 * SIZE
MADPS C13, C13, A1, B3
MADPS C23, C23, A2, B3
MADPS C33, C33, A3, B3
MADPS C43, C43, A4, B3
MADPS C14, C14, A1, B4
MADPS C24, C24, A2, B4
PLU B7, B5, B5
PLU B8, B6, B6
MADPS C34, C34, A3, B4
MADPS C44, C44, A4, B4
/* Second step; the offset-0 / 1 loads below refill A1..A4 / B1..B2 from the
 * already-advanced AO / BO. */
gsLQC1(R13, F9, F8, 0) # B1 B2
MADPS C11, C11, A5, B5
MADPS C21, C21, A6, B5
gsLQC1(R12, F1, F0, 0) # A1 A2
MADPS C12, C12, A5, B6
MADPS C22, C22, A6, B6
gsLQC1(R12, F3, F2, 1) # A3 A4
MADPS C31, C31, A7, B5
MADPS C41, C41, A8, B5
MADPS C32, C32, A7, B6
MADPS C42, C42, A8, B6
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
MADPS C33, C33, A7, B7
MADPS C43, C43, A8, B7
MADPS C14, C14, A5, B8
MADPS C24, C24, A6, B8
PLU B3, B1, B1
PLU B4, B2, B2
MADPS C34, C34, A7, B8
MADPS C44, C44, A8, B8
.align 4
/*
 * .L247 -- last K step of the 4x2 tile, taken when bit 0 of the effective K
 * count is set. Operands A1..A4 / B1..B4 were already loaded by the preceding
 * code, so this block only accumulates and advances AO by 8 * SIZE and BO by
 * 4 * SIZE (one K step).
 */
.L247:
#ifndef TRMMKERNEL
andi L, K, 1
#else
andi L, TEMP, 1
#endif
blez L, .L240
NOP
MADPS C11, C11, A1, B1
MADPS C21, C21, A2, B1
MADPS C12, C12, A1, B2
MADPS C22, C22, A2, B2
MADPS C31, C31, A3, B1
MADPS C41, C41, A4, B1
daddiu BO, BO, 1 * 4 * SIZE # 1KR*4NR
MADPS C32, C32, A3, B2
MADPS C42, C42, A4, B2
daddiu AO, AO, 2 * 4 * SIZE
MADPS C13, C13, A1, B3
MADPS C23, C23, A2, B3
MADPS C33, C33, A3, B3
MADPS C43, C43, A4, B3
MADPS C14, C14, A1, B4
MADPS C24, C24, A2, B4
MADPS C34, C34, A3, B4
MADPS C44, C44, A4, B4
.align 4
/*
 * .L240 -- write-back of the 4x2 tile (plain GEMM variant starts here).
 * Decrements the row-block counter, then applies CVTU to every accumulator,
 * placing the result in the now-free A1..A8 / B1..B8 registers. The case
 * blocks that follow combine each accumulator with its CVTU result.
 * NOTE(review): CVTU is defined outside this file; presumably it extracts
 * the upper half of a paired-single register -- confirm in common.h.
 */
.L240: # Write Back
#ifndef TRMMKERNEL
daddiu I, I, -1
CVTU A1, C11
CVTU A2, C21
CVTU A3, C31
CVTU A4, C41
CVTU A5, C13
CVTU A6, C23
CVTU A7, C33
CVTU A8, C43
CVTU B1, C12
CVTU B2, C22
CVTU B3, C32
CVTU B4, C42
CVTU B5, C14
CVTU B6, C24
CVTU B7, C34
CVTU B8, C44
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
/* (a + bi) * (c + di) */
/*
 * GEMM write-back, no conjugation.
 * Reduction: Cx1/Cx2 = acc - cvtu(acc)  (the "ac, bd" pair -> real part),
 *            Cx3/Cx4 = cvtu(acc) + acc  (the "ad, cb" pair -> imaginary part).
 * Update, with A1 = alpha_r and A2 = alpha_i reloaded from the stack:
 *   C.re += alpha_r * re - alpha_i * im
 *   C.im += alpha_r * im + alpha_i * re
 * (reading MADD d,a,b,c as a + b*c and NMSUB as a - b*c; both macros are
 * defined outside this file -- confirm.)
 * Column 1 (CO1) is built in B1..B8 from Cx1 / Cx3; column 2 (CO2) is built
 * in place in Cx3 (real) and Cx1 (imaginary) from Cx2 / Cx4.
 */
SUB C11, C11, A1 # ac'+'bd
SUB C21, C21, A2
# LD A1, 0 * SIZE(A) # load alpha_r
SUB C31, C31, A3
LD A1, 152($sp) # load alpha_r
SUB C41, C41, A4
LD A2, 160($sp) # load alpha_i
# LD A2, 0 * SIZE(A) # load alpha_i
ADD C13, A5, C13 # ad'+'cb
ADD C23, A6, C23
ADD C33, A7, C33
ADD C43, A8, C43
SUB C12, C12, B1
SUB C22, C22, B2
SUB C32, C32, B3
SUB C42, C42, B4
ADD C14, B5, C14
ADD C24, B6, C24
ADD C34, B7, C34
ADD C44, B8, C44
/* Column 1: even offsets are real parts, odd offsets imaginary parts. */
LD B1, 0 * SIZE(CO1)
LD B3, 2 * SIZE(CO1)
LD B5, 4 * SIZE(CO1)
LD B7, 6 * SIZE(CO1)
LD B2, 1 * SIZE(CO1)
LD B4, 3 * SIZE(CO1)
LD B6, 5 * SIZE(CO1)
LD B8, 7 * SIZE(CO1)
MADD B1, B1, C11, A1 # A1 = alpha_r
MADD B3, B3, C21, A1
MADD B5, B5, C31, A1
MADD B7, B7, C41, A1
MADD B2, B2, C13, A1
MADD B4, B4, C23, A1
MADD B6, B6, C33, A1
MADD B8, B8, C43, A1
NMSUB B1, B1, C13, A2 # A2 = alpha_i
NMSUB B3, B3, C23, A2
NMSUB B5, B5, C33, A2
NMSUB B7, B7, C43, A2
MADD B2, B2, C11, A2
MADD B4, B4, C21, A2
MADD B6, B6, C31, A2
MADD B8, B8, C41, A2
/* Column 2: Cx3 / Cx1 are free now and receive the old C values. */
LD C13, 0 * SIZE(CO2)
LD C23, 2 * SIZE(CO2)
LD C33, 4 * SIZE(CO2)
LD C43, 6 * SIZE(CO2)
LD C11, 1 * SIZE(CO2)
LD C21, 3 * SIZE(CO2)
LD C31, 5 * SIZE(CO2)
LD C41, 7 * SIZE(CO2)
MADD C13, C13, C12, A1
MADD C23, C23, C22, A1
MADD C33, C33, C32, A1
ST B1, 0 * SIZE(CO1)
MADD C43, C43, C42, A1
ST B3, 2 * SIZE(CO1)
MADD C11, C11, C14, A1
ST B5, 4 * SIZE(CO1)
MADD C21, C21, C24, A1
ST B7, 6 * SIZE(CO1)
MADD C31, C31, C34, A1
ST B2, 1 * SIZE(CO1)
MADD C41, C41, C44, A1
ST B4, 3 * SIZE(CO1)
NMSUB C13, C13, C14, A2
ST B6, 5 * SIZE(CO1)
NMSUB C23, C23, C24, A2
ST B8, 7 * SIZE(CO1)
NMSUB C33, C33, C34, A2
NMSUB C43, C43, C44, A2
MADD C11, C11, C12, A2
MADD C21, C21, C22, A2
MADD C31, C31, C32, A2
MADD C41, C41, C42, A2
ST C13, 0 * SIZE(CO2)
ST C23, 2 * SIZE(CO2)
ST C33, 4 * SIZE(CO2)
ST C43, 6 * SIZE(CO2)
ST C11, 1 * SIZE(CO2)
ST C21, 3 * SIZE(CO2)
ST C31, 5 * SIZE(CO2)
ST C41, 7 * SIZE(CO2)
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
/* (a + bi) * (c - di) */
/*
 * GEMM write-back, second operand conjugated.
 * Reduction: Cx1/Cx2 = cvtu(acc) + acc  (real part),
 *            Cx3/Cx4 = cvtu(acc) - acc  (imaginary part).
 * The alpha update and the CO1 / CO2 store pattern are the same as in the
 * non-conjugated case: re += alpha_r*re - alpha_i*im, im += alpha_r*im +
 * alpha_i*re (MADD / NMSUB semantics assumed a + b*c / a - b*c -- confirm).
 */
ADD C11, A1, C11 # ac'+'bd
ADD C21, A2, C21
# LD A1, 0 * SIZE(A) # load alpha_r
ADD C31, A3, C31
LD A1, 152($sp) # load alpha_r
ADD C41, A4, C41
LD A2, 160($sp) # load alpha_i
# LD A2, 0 * SIZE(A) # load alpha_r
SUB C13, A5, C13 # ad'+'cb
SUB C23, A6, C23
SUB C33, A7, C33
SUB C43, A8, C43
ADD C12, B1, C12
ADD C22, B2, C22
ADD C32, B3, C32
ADD C42, B4, C42
SUB C14, B5, C14
SUB C24, B6, C24
SUB C34, B7, C34
SUB C44, B8, C44
/* Column 1 through B1..B8. */
LD B1, 0 * SIZE(CO1)
LD B3, 2 * SIZE(CO1)
LD B5, 4 * SIZE(CO1)
LD B7, 6 * SIZE(CO1)
LD B2, 1 * SIZE(CO1)
LD B4, 3 * SIZE(CO1)
LD B6, 5 * SIZE(CO1)
LD B8, 7 * SIZE(CO1)
MADD B1, B1, C11, A1 # A1 = alpha_r
MADD B3, B3, C21, A1
MADD B5, B5, C31, A1
MADD B7, B7, C41, A1
MADD B2, B2, C13, A1
MADD B4, B4, C23, A1
MADD B6, B6, C33, A1
MADD B8, B8, C43, A1
NMSUB B1, B1, C13, A2 # A2 = alpha_i
NMSUB B3, B3, C23, A2
NMSUB B5, B5, C33, A2
NMSUB B7, B7, C43, A2
MADD B2, B2, C11, A2
MADD B4, B4, C21, A2
MADD B6, B6, C31, A2
MADD B8, B8, C41, A2
/* Column 2 through the recycled Cx3 / Cx1 registers. */
LD C13, 0 * SIZE(CO2)
LD C23, 2 * SIZE(CO2)
LD C33, 4 * SIZE(CO2)
LD C43, 6 * SIZE(CO2)
LD C11, 1 * SIZE(CO2)
LD C21, 3 * SIZE(CO2)
LD C31, 5 * SIZE(CO2)
LD C41, 7 * SIZE(CO2)
MADD C13, C13, C12, A1
MADD C23, C23, C22, A1
MADD C33, C33, C32, A1
ST B1, 0 * SIZE(CO1)
MADD C43, C43, C42, A1
ST B3, 2 * SIZE(CO1)
MADD C11, C11, C14, A1
ST B5, 4 * SIZE(CO1)
MADD C21, C21, C24, A1
ST B7, 6 * SIZE(CO1)
MADD C31, C31, C34, A1
ST B2, 1 * SIZE(CO1)
MADD C41, C41, C44, A1
ST B4, 3 * SIZE(CO1)
NMSUB C13, C13, C14, A2
ST B6, 5 * SIZE(CO1)
NMSUB C23, C23, C24, A2
ST B8, 7 * SIZE(CO1)
NMSUB C33, C33, C34, A2
NMSUB C43, C43, C44, A2
MADD C11, C11, C12, A2
MADD C21, C21, C22, A2
MADD C31, C31, C32, A2
MADD C41, C41, C42, A2
ST C13, 0 * SIZE(CO2)
ST C23, 2 * SIZE(CO2)
ST C33, 4 * SIZE(CO2)
ST C43, 6 * SIZE(CO2)
ST C11, 1 * SIZE(CO2)
ST C21, 3 * SIZE(CO2)
ST C31, 5 * SIZE(CO2)
ST C41, 7 * SIZE(CO2)
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
/* (a - bi) * (c + di) */
/*
 * GEMM write-back, first operand conjugated.
 * Reduction: Cx1/Cx2 = cvtu(acc) + acc  (real part),
 *            Cx3/Cx4 = acc - cvtu(acc)  (imaginary part; operand order is
 *            the reverse of the second-operand-conjugated case).
 * The alpha update and store pattern match the other GEMM cases
 * (MADD / NMSUB semantics assumed a + b*c / a - b*c -- confirm).
 */
ADD C11, A1, C11 # ac'+'bd
ADD C21, A2, C21
# LD A1, 0 * SIZE(A) # load alpha_r
ADD C31, A3, C31
LD A1, 152($sp) # load alpha_r
# LD A2, 0 * SIZE(A) # load alpha_r
ADD C41, A4, C41
LD A2, 160($sp) # load alpha_i
SUB C13, C13, A5 # ad'+'cb
SUB C23, C23, A6
SUB C33, C33, A7
SUB C43, C43, A8
ADD C12, B1, C12
ADD C22, B2, C22
ADD C32, B3, C32
ADD C42, B4, C42
SUB C14, C14, B5
SUB C24, C24, B6
SUB C34, C34, B7
SUB C44, C44, B8
/* Column 1 through B1..B8. */
LD B1, 0 * SIZE(CO1)
LD B3, 2 * SIZE(CO1)
LD B5, 4 * SIZE(CO1)
LD B7, 6 * SIZE(CO1)
LD B2, 1 * SIZE(CO1)
LD B4, 3 * SIZE(CO1)
LD B6, 5 * SIZE(CO1)
LD B8, 7 * SIZE(CO1)
MADD B1, B1, C11, A1 # A1 = alpha_r
MADD B3, B3, C21, A1
MADD B5, B5, C31, A1
MADD B7, B7, C41, A1
MADD B2, B2, C13, A1
MADD B4, B4, C23, A1
MADD B6, B6, C33, A1
MADD B8, B8, C43, A1
NMSUB B1, B1, C13, A2 # A2 = alpha_i
NMSUB B3, B3, C23, A2
NMSUB B5, B5, C33, A2
NMSUB B7, B7, C43, A2
MADD B2, B2, C11, A2
MADD B4, B4, C21, A2
MADD B6, B6, C31, A2
MADD B8, B8, C41, A2
/* Column 2 through the recycled Cx3 / Cx1 registers. */
LD C13, 0 * SIZE(CO2)
LD C23, 2 * SIZE(CO2)
LD C33, 4 * SIZE(CO2)
LD C43, 6 * SIZE(CO2)
LD C11, 1 * SIZE(CO2)
LD C21, 3 * SIZE(CO2)
LD C31, 5 * SIZE(CO2)
LD C41, 7 * SIZE(CO2)
MADD C13, C13, C12, A1
MADD C23, C23, C22, A1
MADD C33, C33, C32, A1
ST B1, 0 * SIZE(CO1)
MADD C43, C43, C42, A1
ST B3, 2 * SIZE(CO1)
MADD C11, C11, C14, A1
ST B5, 4 * SIZE(CO1)
MADD C21, C21, C24, A1
ST B7, 6 * SIZE(CO1)
MADD C31, C31, C34, A1
ST B2, 1 * SIZE(CO1)
MADD C41, C41, C44, A1
ST B4, 3 * SIZE(CO1)
NMSUB C13, C13, C14, A2
ST B6, 5 * SIZE(CO1)
NMSUB C23, C23, C24, A2
ST B8, 7 * SIZE(CO1)
NMSUB C33, C33, C34, A2
NMSUB C43, C43, C44, A2
MADD C11, C11, C12, A2
MADD C21, C21, C22, A2
MADD C31, C31, C32, A2
MADD C41, C41, C42, A2
ST C13, 0 * SIZE(CO2)
ST C23, 2 * SIZE(CO2)
ST C33, 4 * SIZE(CO2)
ST C43, 6 * SIZE(CO2)
ST C11, 1 * SIZE(CO2)
ST C21, 3 * SIZE(CO2)
ST C31, 5 * SIZE(CO2)
ST C41, 7 * SIZE(CO2)
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
/* (a - bi) * (c - di) */
/*
 * GEMM write-back, both operands conjugated.
 * Reduction: Cx1/Cx2 = acc - cvtu(acc)          (real part),
 *            Cx3/Cx4 = -(cvtu(acc) + acc)       (imaginary part, negated by
 *            the explicit NEG instructions).
 * The alpha update and store pattern match the other GEMM cases
 * (MADD / NMSUB semantics assumed a + b*c / a - b*c -- confirm).
 */
SUB C11, C11, A1 # ac'+'bd
SUB C21, C21, A2
SUB C31, C31, A3
LD A1, 152($sp) # load alpha_r
# LD A1, 0 * SIZE(A) # load alpha_r
SUB C41, C41, A4
LD A2, 160($sp)
# LD A2, 0 * SIZE(A) # load alpha_i
ADD C13, A5, C13 # ad'+'cb
ADD C23, A6, C23
ADD C33, A7, C33
ADD C43, A8, C43
SUB C12, C12, B1
SUB C22, C22, B2
SUB C32, C32, B3
SUB C42, C42, B4
ADD C14, B5, C14
ADD C24, B6, C24
ADD C34, B7, C34
ADD C44, B8, C44
NEG C13, C13
NEG C23, C23
NEG C33, C33
NEG C43, C43
NEG C14, C14
NEG C24, C24
NEG C34, C34
NEG C44, C44
/* Column 1 through B1..B8. */
LD B1, 0 * SIZE(CO1)
LD B3, 2 * SIZE(CO1)
LD B5, 4 * SIZE(CO1)
LD B7, 6 * SIZE(CO1)
LD B2, 1 * SIZE(CO1)
LD B4, 3 * SIZE(CO1)
LD B6, 5 * SIZE(CO1)
LD B8, 7 * SIZE(CO1)
MADD B1, B1, C11, A1 # A1 = alpha_r
MADD B3, B3, C21, A1
MADD B5, B5, C31, A1
MADD B7, B7, C41, A1
MADD B2, B2, C13, A1
MADD B4, B4, C23, A1
MADD B6, B6, C33, A1
MADD B8, B8, C43, A1
NMSUB B1, B1, C13, A2 # A2 = alpha_i
NMSUB B3, B3, C23, A2
NMSUB B5, B5, C33, A2
NMSUB B7, B7, C43, A2
MADD B2, B2, C11, A2
MADD B4, B4, C21, A2
MADD B6, B6, C31, A2
MADD B8, B8, C41, A2
/* Column 2 through the recycled Cx3 / Cx1 registers. */
LD C13, 0 * SIZE(CO2)
LD C43, 6 * SIZE(CO2)
LD C23, 2 * SIZE(CO2)
LD C33, 4 * SIZE(CO2)
LD C11, 1 * SIZE(CO2)
LD C21, 3 * SIZE(CO2)
LD C31, 5 * SIZE(CO2)
LD C41, 7 * SIZE(CO2)
MADD C13, C13, C12, A1
ST B1, 0 * SIZE(CO1)
MADD C23, C23, C22, A1
ST B3, 2 * SIZE(CO1)
MADD C33, C33, C32, A1
ST B5, 4 * SIZE(CO1)
MADD C43, C43, C42, A1
ST B7, 6 * SIZE(CO1)
MADD C11, C11, C14, A1
ST B2, 1 * SIZE(CO1)
MADD C21, C21, C24, A1
ST B4, 3 * SIZE(CO1)
MADD C31, C31, C34, A1
ST B6, 5 * SIZE(CO1)
MADD C41, C41, C44, A1
ST B8, 7 * SIZE(CO1)
NMSUB C13, C13, C14, A2
NMSUB C23, C23, C24, A2
NMSUB C33, C33, C34, A2
NMSUB C43, C43, C44, A2
MADD C11, C11, C12, A2
MADD C21, C21, C22, A2
MADD C31, C31, C32, A2
MADD C41, C41, C42, A2
ST C13, 0 * SIZE(CO2)
ST C23, 2 * SIZE(CO2)
ST C33, 4 * SIZE(CO2)
ST C43, 6 * SIZE(CO2)
ST C11, 1 * SIZE(CO2)
ST C21, 3 * SIZE(CO2)
ST C31, 5 * SIZE(CO2)
ST C41, 7 * SIZE(CO2)
#endif
#else
/*
 * TRMM variant of the 4x2 write-back. Same counter decrement and CVTU pass
 * as the GEMM variant; the case blocks below differ in that they overwrite C
 * with alpha * result (MUL instead of load + MADD) rather than accumulating.
 */
daddiu I, I, -1
CVTU A1, C11
CVTU A2, C21
CVTU A3, C31
CVTU A4, C41
CVTU A5, C13
CVTU A6, C23
CVTU A7, C33
CVTU A8, C43
CVTU B1, C12
CVTU B2, C22
CVTU B3, C32
CVTU B4, C42
CVTU B5, C14
CVTU B6, C24
CVTU B7, C34
CVTU B8, C44
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
/* (a + bi) * (c + di) */
/*
 * TRMM write-back, no conjugation. C is not read:
 *   C.re = alpha_r * re - alpha_i * im
 *   C.im = alpha_r * im + alpha_i * re
 * with re = acc - cvtu(acc) (Cx1/Cx2) and im = cvtu(acc) + acc (Cx3/Cx4).
 * Column 1 is formed in B1..B8, column 2 in the recycled Cx3 / Cx1.
 * (MADD / NMSUB semantics assumed a + b*c / a - b*c -- confirm.)
 */
SUB C11, C11, A1 # ac'+'bd
SUB C21, C21, A2
SUB C31, C31, A3
LD A1, 152($sp) # load alpha_r
SUB C41, C41, A4
# LD A1, 0 * SIZE(A) # load alpha_r
LD A2, 160($sp) # load alpha_i
ADD C13, A5, C13 # ad'+'cb
ADD C23, A6, C23
# LD A2, 0 * SIZE(A) # load alpha_i
ADD C33, A7, C33
ADD C43, A8, C43
SUB C12, C12, B1
SUB C22, C22, B2
SUB C32, C32, B3
SUB C42, C42, B4
ADD C14, B5, C14
ADD C24, B6, C24
ADD C34, B7, C34
ADD C44, B8, C44
/* Column 1. */
MUL B1, C11, A1 # A1 = alpha_r
MUL B3, C21, A1
MUL B5, C31, A1
MUL B7, C41, A1
MUL B2, C13, A1
MUL B4, C23, A1
MUL B6, C33, A1
MUL B8, C43, A1
NMSUB B1, B1, C13, A2 # A2 = alpha_i
NMSUB B3, B3, C23, A2
NMSUB B5, B5, C33, A2
NMSUB B7, B7, C43, A2
MADD B2, B2, C11, A2
MADD B4, B4, C21, A2
MADD B6, B6, C31, A2
MADD B8, B8, C41, A2
/* Column 2, interleaved with the column-1 stores. */
ST B1, 0 * SIZE(CO1)
MUL C13, C12, A1
MUL C23, C22, A1
ST B3, 2 * SIZE(CO1)
MUL C33, C32, A1
MUL C43, C42, A1
ST B5, 4 * SIZE(CO1)
MUL C11, C14, A1
MUL C21, C24, A1
ST B7, 6 * SIZE(CO1)
MUL C31, C34, A1
MUL C41, C44, A1
ST B2, 1 * SIZE(CO1)
NMSUB C13, C13, C14, A2
NMSUB C23, C23, C24, A2
ST B4, 3 * SIZE(CO1)
NMSUB C33, C33, C34, A2
NMSUB C43, C43, C44, A2
ST B6, 5 * SIZE(CO1)
MADD C11, C11, C12, A2
MADD C21, C21, C22, A2
ST B8, 7 * SIZE(CO1)
MADD C31, C31, C32, A2
MADD C41, C41, C42, A2
ST C13, 0 * SIZE(CO2)
ST C23, 2 * SIZE(CO2)
ST C33, 4 * SIZE(CO2)
ST C43, 6 * SIZE(CO2)
ST C11, 1 * SIZE(CO2)
ST C21, 3 * SIZE(CO2)
ST C31, 5 * SIZE(CO2)
ST C41, 7 * SIZE(CO2)
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
/* (a + bi) * (c - di) */
/*
 * TRMM write-back, second operand conjugated. C is not read:
 *   C.re = alpha_r * re - alpha_i * im,  C.im = alpha_r * im + alpha_i * re
 * with re = cvtu(acc) + acc (Cx1/Cx2) and im = cvtu(acc) - acc (Cx3/Cx4).
 * (MADD / NMSUB semantics assumed a + b*c / a - b*c -- confirm.)
 */
ADD C11, A1, C11 # ac'+'bd
ADD C21, A2, C21
# LD A1, 0 * SIZE(A) # load alpha_r
ADD C31, A3, C31
LD A1, 152($sp) # load alpha_r
ADD C41, A4, C41
LD A2, 160($sp) # load alpha_i
# LD A2, 0 * SIZE(A) # load alpha_r
SUB C13, A5, C13 # ad'+'cb
SUB C23, A6, C23
SUB C33, A7, C33
SUB C43, A8, C43
ADD C12, B1, C12
ADD C22, B2, C22
ADD C32, B3, C32
ADD C42, B4, C42
SUB C14, B5, C14
SUB C24, B6, C24
SUB C34, B7, C34
SUB C44, B8, C44
/* Column 1. */
MUL B1, C11, A1 # A1 = alpha_r
MUL B3, C21, A1
MUL B5, C31, A1
MUL B7, C41, A1
MUL B2, C13, A1
MUL B4, C23, A1
MUL B6, C33, A1
MUL B8, C43, A1
NMSUB B1, B1, C13, A2 # A2 = alpha_i
NMSUB B3, B3, C23, A2
NMSUB B5, B5, C33, A2
NMSUB B7, B7, C43, A2
MADD B2, B2, C11, A2
MADD B4, B4, C21, A2
MADD B6, B6, C31, A2
MADD B8, B8, C41, A2
/* Column 2, interleaved with the column-1 stores. */
MUL C13, C12, A1
MUL C23, C22, A1
ST B1, 0 * SIZE(CO1)
MUL C33, C32, A1
MUL C43, C42, A1
ST B3, 2 * SIZE(CO1)
MUL C11, C14, A1
MUL C21, C24, A1
ST B5, 4 * SIZE(CO1)
MUL C31, C34, A1
MUL C41, C44, A1
ST B7, 6 * SIZE(CO1)
NMSUB C13, C13, C14, A2
NMSUB C23, C23, C24, A2
ST B2, 1 * SIZE(CO1)
NMSUB C33, C33, C34, A2
NMSUB C43, C43, C44, A2
ST B4, 3 * SIZE(CO1)
MADD C11, C11, C12, A2
MADD C21, C21, C22, A2
ST B6, 5 * SIZE(CO1)
MADD C31, C31, C32, A2
MADD C41, C41, C42, A2
ST B8, 7 * SIZE(CO1)
ST C13, 0 * SIZE(CO2)
ST C23, 2 * SIZE(CO2)
ST C33, 4 * SIZE(CO2)
ST C43, 6 * SIZE(CO2)
ST C11, 1 * SIZE(CO2)
ST C21, 3 * SIZE(CO2)
ST C31, 5 * SIZE(CO2)
ST C41, 7 * SIZE(CO2)
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
/* (a - bi) * (c + di) */
/*
 * TRMM write-back, first operand conjugated. C is not read:
 *   C.re = alpha_r * re - alpha_i * im,  C.im = alpha_r * im + alpha_i * re
 * with re = cvtu(acc) + acc (Cx1/Cx2) and im = acc - cvtu(acc) (Cx3/Cx4).
 * (MADD / NMSUB semantics assumed a + b*c / a - b*c -- confirm.)
 */
ADD C11, A1, C11 # ac'+'bd
ADD C21, A2, C21
# LD A1, 0 * SIZE(A) # load alpha_r
ADD C31, A3, C31
LD A1, 152($sp) # load alpha_r
# LD A2, 0 * SIZE(A) # load alpha_r
ADD C41, A4, C41
LD A2, 160($sp) # load alpha_i
SUB C13, C13, A5 # ad'+'cb
SUB C23, C23, A6
SUB C33, C33, A7
SUB C43, C43, A8
ADD C12, B1, C12
ADD C22, B2, C22
ADD C32, B3, C32
ADD C42, B4, C42
SUB C14, C14, B5
SUB C24, C24, B6
SUB C34, C34, B7
SUB C44, C44, B8
/* Column 1. */
MUL B1, C11, A1 # A1 = alpha_r
MUL B3, C21, A1
MUL B5, C31, A1
MUL B7, C41, A1
MUL B2, C13, A1
MUL B4, C23, A1
MUL B6, C33, A1
MUL B8, C43, A1
NMSUB B1, B1, C13, A2 # A2 = alpha_i
NMSUB B3, B3, C23, A2
NMSUB B5, B5, C33, A2
NMSUB B7, B7, C43, A2
MADD B2, B2, C11, A2
MADD B4, B4, C21, A2
MADD B6, B6, C31, A2
MADD B8, B8, C41, A2
/* Column 2, interleaved with the column-1 stores. */
MUL C13, C12, A1
MUL C23, C22, A1
ST B1, 0 * SIZE(CO1)
MUL C33, C32, A1
MUL C43, C42, A1
ST B3, 2 * SIZE(CO1)
MUL C11, C14, A1
MUL C21, C24, A1
ST B5, 4 * SIZE(CO1)
MUL C31, C34, A1
MUL C41, C44, A1
ST B7, 6 * SIZE(CO1)
NMSUB C13, C13, C14, A2
NMSUB C23, C23, C24, A2
ST B2, 1 * SIZE(CO1)
NMSUB C33, C33, C34, A2
NMSUB C43, C43, C44, A2
ST B4, 3 * SIZE(CO1)
MADD C11, C11, C12, A2
MADD C21, C21, C22, A2
ST B6, 5 * SIZE(CO1)
MADD C31, C31, C32, A2
MADD C41, C41, C42, A2
ST B8, 7 * SIZE(CO1)
ST C13, 0 * SIZE(CO2)
ST C23, 2 * SIZE(CO2)
ST C33, 4 * SIZE(CO2)
ST C43, 6 * SIZE(CO2)
ST C11, 1 * SIZE(CO2)
ST C21, 3 * SIZE(CO2)
ST C31, 5 * SIZE(CO2)
ST C41, 7 * SIZE(CO2)
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
/* (a - bi) * (c - di) */
/*
 * TRMM write-back, both operands conjugated. C is not read:
 *   C.re = alpha_r * re - alpha_i * im,  C.im = alpha_r * im + alpha_i * re
 * with re = acc - cvtu(acc) (Cx1/Cx2) and im = -(cvtu(acc) + acc) (Cx3/Cx4,
 * negated by the explicit NEG instructions).
 * (MADD / NMSUB semantics assumed a + b*c / a - b*c -- confirm.)
 */
SUB C11, C11, A1 # ac'+'bd
SUB C21, C21, A2
SUB C31, C31, A3
LD A1, 152($sp) # load alpha_r
# LD A1, 0 * SIZE(A) # load alpha_r
SUB C41, C41, A4
LD A2, 160($sp)
# LD A2, 0 * SIZE(A) # load alpha_i
ADD C13, A5, C13 # ad'+'cb
ADD C23, A6, C23
ADD C33, A7, C33
ADD C43, A8, C43
SUB C12, C12, B1
SUB C22, C22, B2
SUB C32, C32, B3
SUB C42, C42, B4
ADD C14, B5, C14
ADD C24, B6, C24
ADD C34, B7, C34
ADD C44, B8, C44
NEG C13, C13
NEG C23, C23
NEG C33, C33
NEG C43, C43
NEG C14, C14
NEG C24, C24
NEG C34, C34
NEG C44, C44
/* Column 1. */
MUL B1, C11, A1 # A1 = alpha_r
MUL B3, C21, A1
MUL B5, C31, A1
MUL B7, C41, A1
MUL B2, C13, A1
MUL B4, C23, A1
MUL B6, C33, A1
MUL B8, C43, A1
NMSUB B1, B1, C13, A2 # A2 = alpha_i
NMSUB B3, B3, C23, A2
NMSUB B5, B5, C33, A2
NMSUB B7, B7, C43, A2
MADD B2, B2, C11, A2
MADD B4, B4, C21, A2
MADD B6, B6, C31, A2
MADD B8, B8, C41, A2
/* Column 2, interleaved with the column-1 stores. */
ST B1, 0 * SIZE(CO1)
MUL C13, C12, A1
MUL C23, C22, A1
ST B3, 2 * SIZE(CO1)
MUL C33, C32, A1
MUL C43, C42, A1
ST B5, 4 * SIZE(CO1)
MUL C11, C14, A1
MUL C21, C24, A1
ST B7, 6 * SIZE(CO1)
MUL C31, C34, A1
MUL C41, C44, A1
ST B2, 1 * SIZE(CO1)
NMSUB C13, C13, C14, A2
NMSUB C23, C23, C24, A2
ST B4, 3 * SIZE(CO1)
NMSUB C33, C33, C34, A2
NMSUB C43, C43, C44, A2
ST B6, 5 * SIZE(CO1)
MADD C11, C11, C12, A2
MADD C21, C21, C22, A2
ST B8, 7 * SIZE(CO1)
MADD C31, C31, C32, A2
MADD C41, C41, C42, A2
ST C13, 0 * SIZE(CO2)
ST C23, 2 * SIZE(CO2)
ST C33, 4 * SIZE(CO2)
ST C43, 6 * SIZE(CO2)
ST C11, 1 * SIZE(CO2)
ST C21, 3 * SIZE(CO2)
ST C31, 5 * SIZE(CO2)
ST C41, 7 * SIZE(CO2)
#endif
/*
 * End of the 4x2 tile.
 * TRMM only: step AO / BO over the K range this tile did not consume
 * (K - KK minus the tile height 4 for LEFT, or minus the tile width 2
 * otherwise; scaled by 4 rows for AO and 2 columns for BO), and for LEFT
 * advance KK by the four rows just finished.
 * Both variants: move CO1 / CO2 to the next four output elements
 * (8 * SIZE each) and loop while row blocks remain.
 */
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, K, KK
#ifdef LEFT
daddiu TEMP, TEMP, -4
#else
daddiu TEMP, TEMP, -2
#endif
dsll L, TEMP, 2 + ZBASE_SHIFT
dsll TEMP, TEMP, 1 + ZBASE_SHIFT
daddu AO, AO, L
daddu BO, BO, TEMP
#endif
#ifdef LEFT
daddiu KK, KK, 4
#endif
#endif
daddiu CO1, CO1, 8 * SIZE
bgtz I, .L241
/* Delay slot: CO2 is advanced on both the taken and fall-through paths. */
daddiu CO2, CO2, 8 * SIZE
.align 4
/* .L22 -- remaining rows: run the 2-row x 2-column tile when bit 1 of M is
 * set, otherwise skip to .L21. */
.L22:
andi I, M, 2 # MR=4 floats = 2 complex rows
blez I, .L21
NOP
.align 4
/*
 * .L221 -- set-up for the 2-row x 2-column tile.
 * Positions BO (TRMM: offsets AO / BO by KK, two rows and two columns per K
 * step), zeroes the eight accumulators C11..C24, preloads A1/A2 and B1/B2,
 * forms B3 / B4 with PLU, touches the output lines, and computes
 * L = (effective K) / 4. Jumps to .L222 when no full group of four K steps
 * exists.
 */
.L221:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move BO, B
#else
dsll TEMP, KK, 1 + ZBASE_SHIFT # NR=2
daddu AO, AO, TEMP
daddu BO, B, TEMP
#endif
MTC $0, C11 # CLEAR RESULTS REGISTERS
MOV C12, C11
MOV C21, C11
MOV C22, C11
gsLQC1(R13, F9, F8, 0) # B1 B2
gsLQC1(R12, F1, F0, 0) # A1 A2
MOV C13, C11
MOV C14, C11
MOV C23, C11
FETCH $0, 0 * SIZE(CO1)
FETCH $0, 8 * SIZE(CO1)
MOV C24, C11
FETCH $0, 0 * SIZE(CO2)
FETCH $0, 8 * SIZE(CO2)
PLU B3, B1, B1
PLU B4, B2, B2
/* TEMP = number of K steps this tile actually needs. */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, K, KK
#elif defined(LEFT)
daddiu TEMP, KK, 2 # MR=2
#else
daddiu TEMP, KK, 2 # NR=2
#endif
dsra L, TEMP, 2
blez L, .L222
NOP
#else
move BO, B # Reset B
dsra L, K, 2 # L = K / 4 (loop body is unrolled 4x in K)
MTC $0, C11 # CLEAR RESULTS REGISTERS
MOV C12, C11
MOV C21, C11
MOV C22, C11
gsLQC1(R13, F9, F8, 0) # B1 B2
gsLQC1(R12, F1, F0, 0) # A1 A2
MOV C13, C11
MOV C14, C11
MOV C23, C11
FETCH $0, 0 * SIZE(CO1)
FETCH $0, 8 * SIZE(CO1)
MOV C24, C11
FETCH $0, 0 * SIZE(CO2)
FETCH $0, 8 * SIZE(CO2)
PLU B3, B1, B1
blez L, .L222
/* Delay slot: completes the B4 permutation on both paths. */
PLU B4, B2, B2
#endif
/*
 * .L2210 -- main K loop of the 2x2 tile, four K steps per iteration.
 * Each step consumes one A pair and one B pair plus its PLU permutation:
 *   step 0: A1,A2 x B1..B4      step 1: A3,A4 x B5..B8
 *   step 2: A5,A6 x B1..B4      step 3: A7,A8 x B5..B8
 * AO and BO both advance 16 * SIZE per iteration; the offset-0 loads issued
 * after the increments fetch the operands of the next iteration or of the
 * tail code. Statement order is a hand-tuned schedule; do not reorder.
 */
.L2210:
daddiu L, L, -1
gsLQC1(R13, F13, F12, 1) # B5 B6
MADPS C11, C11, A1, B1
MADPS C21, C21, A2, B1
gsLQC1(R12, F3, F2, 1) # A3 A4
MADPS C12, C12, A1, B2
MADPS C22, C22, A2, B2
MADPS C13, C13, A1, B3
MADPS C23, C23, A2, B3
MADPS C14, C14, A1, B4
MADPS C24, C24, A2, B4
gsLQC1(R12, F5, F4, 2) # A5 A6
PLU B7, B5, B5
PLU B8, B6, B6
gsLQC1(R13, F9, F8, 2) # B1 B2
MADPS C11, C11, A3, B5
MADPS C21, C21, A4, B5
MADPS C12, C12, A3, B6
MADPS C22, C22, A4, B6
MADPS C13, C13, A3, B7
MADPS C23, C23, A4, B7
MADPS C14, C14, A3, B8
MADPS C24, C24, A4, B8
gsLQC1(R12, F7, F6, 3) # A7 A8
PLU B3, B1, B1
PLU B4, B2, B2
gsLQC1(R13, F13, F12, 3) # B5 B6
MADPS C11, C11, A5, B1
MADPS C21, C21, A6, B1
MADPS C12, C12, A5, B2
MADPS C22, C22, A6, B2
daddiu BO, BO, 4 * 4 * SIZE # 4KR*4NR
daddiu AO, AO, 4 * 4 * SIZE # 4KR*4MR
MADPS C13, C13, A5, B3
MADPS C23, C23, A6, B3
MADPS C14, C14, A5, B4
MADPS C24, C24, A6, B4
gsLQC1(R12, F1, F0, 0) # A1 A2
PLU B7, B5, B5
PLU B8, B6, B6
gsLQC1(R13, F9, F8, 0) # B1 B2
MADPS C11, C11, A7, B5
MADPS C21, C21, A8, B5
MADPS C12, C12, A7, B6
MADPS C22, C22, A8, B6
MADPS C13, C13, A7, B7
MADPS C23, C23, A8, B7
MADPS C14, C14, A7, B8
MADPS C24, C24, A8, B8
PLU B3, B1, B1
bgtz L, .L2210
/* Delay slot: completes the B4 permutation for the next step. */
PLU B4, B2, B2
.align 4
/*
 * .L222 -- K tail of the 2x2 tile: two K steps when bit 1 of the effective K
 * count (K, or TEMP for TRMM) is set; otherwise skip to .L227.
 * Step 0 uses A1,A2 x B1..B4, step 1 uses A3,A4 x B5..B8. AO and BO both
 * advance 8 * SIZE, then A1/A2 and B1/B2 are reloaded (and B3/B4 rebuilt)
 * for a possible final single step.
 */
.L222:
#ifndef TRMMKERNEL
andi L, K, 2
#else
andi L, TEMP, 2
#endif
blez L, .L227
NOP
gsLQC1(R13, F13, F12, 1) # B5 B6
MADPS C11, C11, A1, B1
MADPS C21, C21, A2, B1
gsLQC1(R12, F3, F2, 1) # A3 A4
MADPS C12, C12, A1, B2
MADPS C22, C22, A2, B2
MADPS C13, C13, A1, B3
MADPS C23, C23, A2, B3
MADPS C14, C14, A1, B4
MADPS C24, C24, A2, B4
PLU B7, B5, B5
PLU B8, B6, B6
daddiu BO, BO, 2 * 4 * SIZE
daddiu AO, AO, 2 * 4 * SIZE
MADPS C11, C11, A3, B5
MADPS C21, C21, A4, B5
gsLQC1(R13, F9, F8, 0) # B1 B2
MADPS C12, C12, A3, B6
MADPS C22, C22, A4, B6
gsLQC1(R12, F1, F0, 0) # A1 A2
MADPS C13, C13, A3, B7
MADPS C23, C23, A4, B7
MADPS C14, C14, A3, B8
MADPS C24, C24, A4, B8
PLU B3, B1, B1
PLU B4, B2, B2
.align 4
/*
 * .L227 -- last K step of the 2x2 tile, taken when bit 0 of the effective K
 * count is set. A1/A2 and B1..B4 are already loaded; the block accumulates
 * once and advances AO and BO by 4 * SIZE each.
 */
.L227:
#ifndef TRMMKERNEL
andi L, K, 1
#else
andi L, TEMP, 1
#endif
blez L, .L220
NOP
MADPS C11, C11, A1, B1
MADPS C21, C21, A2, B1
daddiu BO, BO, 4 * SIZE
daddiu AO, AO, 4 * SIZE
MADPS C12, C12, A1, B2
MADPS C22, C22, A2, B2
MADPS C13, C13, A1, B3
MADPS C23, C23, A2, B3
MADPS C14, C14, A1, B4
MADPS C24, C24, A2, B4
.align 4
/*
 * .L220 -- write-back of the 2x2 tile (plain GEMM variant starts here).
 * Decrements I and applies CVTU to each of the eight accumulators, leaving
 * the results in A1..A8 for the case blocks that follow.
 * NOTE(review): CVTU is defined outside this file; presumably it extracts
 * the upper half of a paired-single register -- confirm in common.h.
 */
.L220: # Write Back
#ifndef TRMMKERNEL
daddiu I, I, -1
CVTU A1, C11
CVTU A2, C21
CVTU A3, C13
CVTU A4, C23
CVTU A5, C12
CVTU A6, C22
CVTU A7, C14
CVTU A8, C24
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
/* (a + bi) * (c + di) */
/*
 * GEMM 2x2 write-back, no conjugation.
 * re = acc - cvtu(acc) (Cx1/Cx2), im = cvtu(acc) + acc (Cx3/Cx4); then
 *   C.re += alpha_r * re - alpha_i * im,  C.im += alpha_r * im + alpha_i * re
 * Column 1 (CO1) goes through B1..B4, column 2 (CO2) through B5..B8.
 * (MADD / NMSUB semantics assumed a + b*c / a - b*c -- confirm.)
 */
SUB C11, C11, A1 # ac'+'bd
SUB C21, C21, A2
ADD C13, A3, C13 # ad'+'cb
ADD C23, A4, C23
# LD A1, 0 * SIZE(A) # load alpha_r
LD A1, 152($sp) # load alpha_r
LD A2, 160($sp) # load alpha_i
# LD A2, 0 * SIZE(A) # load alpha_i
SUB C12, C12, A5
SUB C22, C22, A6
ADD C14, A7, C14
ADD C24, A8, C24
LD B1, 0 * SIZE(CO1)
LD B3, 2 * SIZE(CO1)
LD B2, 1 * SIZE(CO1)
LD B4, 3 * SIZE(CO1)
MADD B1, B1, C11, A1 # A1 = alpha_r
MADD B3, B3, C21, A1
MADD B2, B2, C13, A1
MADD B4, B4, C23, A1
NMSUB B1, B1, C13, A2 # A2 = alpha_i
NMSUB B3, B3, C23, A2
MADD B2, B2, C11, A2
MADD B4, B4, C21, A2
LD B5, 0 * SIZE(CO2)
LD B7, 2 * SIZE(CO2)
LD B6, 1 * SIZE(CO2)
LD B8, 3 * SIZE(CO2)
MADD B5, B5, C12, A1
MADD B7, B7, C22, A1
ST B1, 0 * SIZE(CO1)
ST B3, 2 * SIZE(CO1)
MADD B6, B6, C14, A1
MADD B8, B8, C24, A1
ST B2, 1 * SIZE(CO1)
ST B4, 3 * SIZE(CO1)
NMSUB B5, B5, C14, A2
NMSUB B7, B7, C24, A2
MADD B6, B6, C12, A2
MADD B8, B8, C22, A2
ST B5, 0 * SIZE(CO2)
ST B7, 2 * SIZE(CO2)
ST B6, 1 * SIZE(CO2)
ST B8, 3 * SIZE(CO2)
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
/* (a + bi) * (c - di) */
/*
 * GEMM 2x2 write-back, second operand conjugated.
 * re = cvtu(acc) + acc (Cx1/Cx2), im = cvtu(acc) - acc (Cx3/Cx4); then
 *   C.re += alpha_r * re - alpha_i * im,  C.im += alpha_r * im + alpha_i * re
 * (MADD / NMSUB semantics assumed a + b*c / a - b*c -- confirm.)
 */
ADD C11, A1, C11 # ac'+'bd
ADD C21, A2, C21
SUB C13, A3, C13 # ad'+'cb
SUB C23, A4, C23
# LD A1, 0 * SIZE(A) # load alpha_r
LD A1, 152($sp) # load alpha_r
LD A2, 160($sp) # load alpha_i
# LD A2, 0 * SIZE(A) # load alpha_r
ADD C12, A5, C12
ADD C22, A6, C22
SUB C14, A7, C14
SUB C24, A8, C24
LD B1, 0 * SIZE(CO1)
LD B3, 2 * SIZE(CO1)
LD B2, 1 * SIZE(CO1)
LD B4, 3 * SIZE(CO1)
MADD B1, B1, C11, A1 # A1 = alpha_r
MADD B3, B3, C21, A1
MADD B2, B2, C13, A1
MADD B4, B4, C23, A1
NMSUB B1, B1, C13, A2 # A2 = alpha_i
NMSUB B3, B3, C23, A2
MADD B2, B2, C11, A2
MADD B4, B4, C21, A2
LD B5, 0 * SIZE(CO2)
LD B7, 2 * SIZE(CO2)
LD B6, 1 * SIZE(CO2)
LD B8, 3 * SIZE(CO2)
MADD B5, B5, C12, A1
MADD B7, B7, C22, A1
ST B1, 0 * SIZE(CO1)
ST B3, 2 * SIZE(CO1)
MADD B6, B6, C14, A1
MADD B8, B8, C24, A1
ST B2, 1 * SIZE(CO1)
ST B4, 3 * SIZE(CO1)
NMSUB B5, B5, C14, A2
NMSUB B7, B7, C24, A2
MADD B6, B6, C12, A2
MADD B8, B8, C22, A2
ST B5, 0 * SIZE(CO2)
ST B7, 2 * SIZE(CO2)
ST B6, 1 * SIZE(CO2)
ST B8, 3 * SIZE(CO2)
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
/* (a - bi) * (c + di) */
/*
 * GEMM 2x2 write-back, first operand conjugated.
 * re = cvtu(acc) + acc (Cx1/Cx2), im = acc - cvtu(acc) (Cx3/Cx4); then
 *   C.re += alpha_r * re - alpha_i * im,  C.im += alpha_r * im + alpha_i * re
 * (MADD / NMSUB semantics assumed a + b*c / a - b*c -- confirm.)
 */
ADD C11, A1, C11 # ac'+'bd
ADD C21, A2, C21
SUB C13, C13, A3 # ad'+'cb
SUB C23, C23, A4
# LD A1, 0 * SIZE(A) # load alpha_r
LD A1, 152($sp) # load alpha_r
# LD A2, 0 * SIZE(A) # load alpha_r
LD A2, 160($sp) # load alpha_i
ADD C12, A5, C12
ADD C22, A6, C22
SUB C14, C14, A7
SUB C24, C24, A8
LD B1, 0 * SIZE(CO1)
LD B3, 2 * SIZE(CO1)
LD B2, 1 * SIZE(CO1)
LD B4, 3 * SIZE(CO1)
MADD B1, B1, C11, A1 # A1 = alpha_r
MADD B3, B3, C21, A1
MADD B2, B2, C13, A1
MADD B4, B4, C23, A1
NMSUB B1, B1, C13, A2 # A2 = alpha_i
NMSUB B3, B3, C23, A2
MADD B2, B2, C11, A2
MADD B4, B4, C21, A2
LD B5, 0 * SIZE(CO2)
LD B7, 2 * SIZE(CO2)
LD B6, 1 * SIZE(CO2)
LD B8, 3 * SIZE(CO2)
MADD B5, B5, C12, A1
MADD B7, B7, C22, A1
ST B1, 0 * SIZE(CO1)
ST B3, 2 * SIZE(CO1)
MADD B6, B6, C14, A1
MADD B8, B8, C24, A1
ST B2, 1 * SIZE(CO1)
ST B4, 3 * SIZE(CO1)
NMSUB B5, B5, C14, A2
NMSUB B7, B7, C24, A2
MADD B6, B6, C12, A2
MADD B8, B8, C22, A2
ST B5, 0 * SIZE(CO2)
ST B7, 2 * SIZE(CO2)
ST B6, 1 * SIZE(CO2)
ST B8, 3 * SIZE(CO2)
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
/* (a - bi) * (c - di) */
/*
 * GEMM 2x2 write-back, both operands conjugated.
 * re = acc - cvtu(acc) (Cx1/Cx2), im = -(cvtu(acc) + acc) (Cx3/Cx4, negated
 * by the explicit NEG instructions); then
 *   C.re += alpha_r * re - alpha_i * im,  C.im += alpha_r * im + alpha_i * re
 * (MADD / NMSUB semantics assumed a + b*c / a - b*c -- confirm.)
 */
SUB C11, C11, A1 # ac'+'bd
SUB C21, C21, A2
ADD C13, A3, C13 # ad'+'cb
ADD C23, A4, C23
LD A1, 152($sp) # load alpha_r
# LD A1, 0 * SIZE(A) # load alpha_r
LD A2, 160($sp)
# LD A2, 0 * SIZE(A) # load alpha_i
SUB C12, C12, A5
SUB C22, C22, A6
ADD C14, A7, C14
ADD C24, A8, C24
NEG C13, C13
NEG C23, C23
NEG C14, C14
NEG C24, C24
LD B1, 0 * SIZE(CO1)
LD B3, 2 * SIZE(CO1)
LD B2, 1 * SIZE(CO1)
LD B4, 3 * SIZE(CO1)
MADD B1, B1, C11, A1 # A1 = alpha_r
MADD B3, B3, C21, A1
MADD B2, B2, C13, A1
MADD B4, B4, C23, A1
NMSUB B1, B1, C13, A2 # A2 = alpha_i
NMSUB B3, B3, C23, A2
MADD B2, B2, C11, A2
MADD B4, B4, C21, A2
LD B5, 0 * SIZE(CO2)
LD B7, 2 * SIZE(CO2)
LD B6, 1 * SIZE(CO2)
LD B8, 3 * SIZE(CO2)
MADD B5, B5, C12, A1
MADD B7, B7, C22, A1
ST B1, 0 * SIZE(CO1)
ST B3, 2 * SIZE(CO1)
MADD B6, B6, C14, A1
MADD B8, B8, C24, A1
ST B2, 1 * SIZE(CO1)
ST B4, 3 * SIZE(CO1)
NMSUB B5, B5, C14, A2
NMSUB B7, B7, C24, A2
MADD B6, B6, C12, A2
MADD B8, B8, C22, A2
ST B5, 0 * SIZE(CO2)
ST B7, 2 * SIZE(CO2)
ST B6, 1 * SIZE(CO2)
ST B8, 3 * SIZE(CO2)
#endif
#else
/*
 * TRMM variant of the 2x2 write-back. Same counter decrement and CVTU pass
 * as the GEMM variant; the case blocks below overwrite C with
 * alpha * result (MUL) instead of accumulating into it.
 */
daddiu I, I, -1
CVTU A1, C11
CVTU A2, C21
CVTU A3, C13
CVTU A4, C23
CVTU A5, C12
CVTU A6, C22
CVTU A7, C14
CVTU A8, C24
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
/* (a + bi) * (c + di) */
/*
 * TRMM 2x2 write-back, no conjugation. C is not read:
 *   C.re = alpha_r * re - alpha_i * im,  C.im = alpha_r * im + alpha_i * re
 * with re = acc - cvtu(acc) (Cx1/Cx2), im = cvtu(acc) + acc (Cx3/Cx4).
 * (MADD / NMSUB semantics assumed a + b*c / a - b*c -- confirm.)
 */
SUB C11, C11, A1 # ac'+'bd
SUB C21, C21, A2
ADD C13, A3, C13 # ad'+'cb
ADD C23, A4, C23
# LD A1, 0 * SIZE(A) # load alpha_r
LD A1, 152($sp) # load alpha_r
LD A2, 160($sp) # load alpha_i
# LD A2, 0 * SIZE(A) # load alpha_i
SUB C12, C12, A5
SUB C22, C22, A6
ADD C14, A7, C14
ADD C24, A8, C24
MUL B1, C11, A1 # A1 = alpha_r
MUL B3, C21, A1
MUL B2, C13, A1
MUL B4, C23, A1
NMSUB B1, B1, C13, A2 # A2 = alpha_i
NMSUB B3, B3, C23, A2
MADD B2, B2, C11, A2
MADD B4, B4, C21, A2
MUL B5, C12, A1
MUL B7, C22, A1
ST B1, 0 * SIZE(CO1)
ST B3, 2 * SIZE(CO1)
MUL B6, C14, A1
MUL B8, C24, A1
ST B2, 1 * SIZE(CO1)
ST B4, 3 * SIZE(CO1)
NMSUB B5, B5, C14, A2
NMSUB B7, B7, C24, A2
MADD B6, B6, C12, A2
MADD B8, B8, C22, A2
ST B5, 0 * SIZE(CO2)
ST B7, 2 * SIZE(CO2)
ST B6, 1 * SIZE(CO2)
ST B8, 3 * SIZE(CO2)
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
/* (a + bi) * (c - di) */
ADD C11, A1, C11 # ac'+'bd
ADD C21, A2, C21
SUB C13, A3, C13 # ad'+'cb
SUB C23, A4, C23
# LD A1, 0 * SIZE(A) # load alpha_r
LD A1, 152($sp) # load alpha_r
LD A2, 160($sp) # load alpha_i
# LD A2, 0 * SIZE(A) # load alpha_r
ADD C12, A5, C12
ADD C22, A6, C22
SUB C14, A7, C14
SUB C24, A8, C24
MUL B1, C11, A1 # A1 = alpha_r
MUL B3, C21, A1
MUL B2, C13, A1
MUL B4, C23, A1
NMSUB B1, B1, C13, A2 # A2 = alpha_i
NMSUB B3, B3, C23, A2
MADD B2, B2, C11, A2
MADD B4, B4, C21, A2
MUL B5, C12, A1
MUL B7, C22, A1
ST B1, 0 * SIZE(CO1)
ST B3, 2 * SIZE(CO1)
MUL B6, C14, A1
MUL B8, C24, A1
ST B2, 1 * SIZE(CO1)
ST B4, 3 * SIZE(CO1)
NMSUB B5, B5, C14, A2
NMSUB B7, B7, C24, A2
MADD B6, B6, C12, A2
MADD B8, B8, C22, A2
ST B5, 0 * SIZE(CO2)
ST B7, 2 * SIZE(CO2)
ST B6, 1 * SIZE(CO2)
ST B8, 3 * SIZE(CO2)
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
/* (a - bi) * (c + di) */
ADD C11, A1, C11 # ac'+'bd
ADD C21, A2, C21
SUB C13, C13, A3 # ad'+'cb
SUB C23, C23, A4
# LD A1, 0 * SIZE(A) # load alpha_r
LD A1, 152($sp) # load alpha_r
# LD A2, 0 * SIZE(A) # load alpha_r
LD A2, 160($sp) # load alpha_i
ADD C12, A5, C12
ADD C22, A6, C22
SUB C14, C14, A7
SUB C24, C24, A8
MUL B1, C11, A1 # A1 = alpha_r
MUL B3, C21, A1
MUL B2, C13, A1
MUL B4, C23, A1
NMSUB B1, B1, C13, A2 # A2 = alpha_i
NMSUB B3, B3, C23, A2
MADD B2, B2, C11, A2
MADD B4, B4, C21, A2
MUL B5, C12, A1
MUL B7, C22, A1
ST B1, 0 * SIZE(CO1)
ST B3, 2 * SIZE(CO1)
MUL B6, C14, A1
MUL B8, C24, A1
ST B2, 1 * SIZE(CO1)
ST B4, 3 * SIZE(CO1)
NMSUB B5, B5, C14, A2
NMSUB B7, B7, C24, A2
MADD B6, B6, C12, A2
MADD B8, B8, C22, A2
ST B5, 0 * SIZE(CO2)
ST B7, 2 * SIZE(CO2)
ST B6, 1 * SIZE(CO2)
ST B8, 3 * SIZE(CO2)
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
/* (a - bi) * (c - di) */
SUB C11, C11, A1 # ac'+'bd
SUB C21, C21, A2
ADD C13, A3, C13 # ad'+'cb
ADD C23, A4, C23
LD A1, 152($sp) # load alpha_r
# LD A1, 0 * SIZE(A) # load alpha_r
LD A2, 160($sp)
# LD A2, 0 * SIZE(A) # load alpha_i
SUB C12, C12, A5
SUB C22, C22, A6
ADD C14, A7, C14
ADD C24, A8, C24
NEG C13, C13
NEG C23, C23
NEG C14, C14
NEG C24, C24
MUL B1, C11, A1 # A1 = alpha_r
MUL B3, C21, A1
MUL B2, C13, A1
MUL B4, C23, A1
NMSUB B1, B1, C13, A2 # A2 = alpha_i
NMSUB B3, B3, C23, A2
MADD B2, B2, C11, A2
MADD B4, B4, C21, A2
MUL B5, C12, A1
MUL B7, C22, A1
ST B1, 0 * SIZE(CO1)
ST B3, 2 * SIZE(CO1)
MUL B6, C14, A1
MUL B8, C24, A1
ST B2, 1 * SIZE(CO1)
ST B4, 3 * SIZE(CO1)
NMSUB B5, B5, C14, A2
NMSUB B7, B7, C24, A2
MADD B6, B6, C12, A2
MADD B8, B8, C22, A2
ST B5, 0 * SIZE(CO2)
ST B7, 2 * SIZE(CO2)
ST B6, 1 * SIZE(CO2)
ST B8, 3 * SIZE(CO2)
#endif
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, K, KK
#ifdef LEFT
daddiu TEMP, TEMP, -2
#else
daddiu TEMP, TEMP, -2
#endif
dsll TEMP, TEMP, 1 + ZBASE_SHIFT
daddu AO, AO, TEMP
daddu BO, BO, TEMP
#endif
#ifdef LEFT
daddiu KK, KK, 2
#endif
#endif
daddiu CO1, CO1, 4 * SIZE
daddiu CO2, CO2, 4 * SIZE
.align 4
/* M & 1 tail of the two-column (CO1/CO2) panel: one remaining row of C. */
.L21:
andi I, M, 1 # I = M & 1
blez I, .L20 # no odd row left: go finish this column pair
NOP
.align 4
/*
 * .L211: set up the 1x2 tile (one complex row of A against two columns of B).
 * Clears the accumulators C11/C12/C13/C14, preloads the first A pair (A1/A2
 * from AO, register $12) and B pair (B1/B2 from BO, register $13), builds the
 * PLU-permuted copies B3/B4, touches the two destination rows of C through
 * FETCH (defined as ld, here with $0 as destination), and leaves in L the
 * trip count of the loop at .L2110.
 * NOTE(review): PLU is presumably plu.ps, i.e. B3 is B1 with its two halves
 * exchanged - confirm against common.h.
 */
.L211:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move BO, B
#else
dsll L, KK, ZBASE_SHIFT # MR=1: byte offset of KK k-steps in A
dsll TEMP, KK, 1 + ZBASE_SHIFT # NR=2: byte offset of KK k-steps in B
daddu AO, AO, L
daddu BO, B, TEMP
#endif
MTC $0, C11 # clear result registers
MOV C12, C11
gsLQC1(R13, F9, F8, 0) # B1 B2
gsLQC1(R12, F1, F0, 0) # A1 A2
MOV C13, C11
MOV C14, C11
FETCH $0, 0 * SIZE(CO1)
FETCH $0, 0 * SIZE(CO2)
PLU B3, B1, B1
PLU B4, B2, B2
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, K, KK
#elif defined(LEFT)
daddiu TEMP, KK, 1 # MR=1
#else
daddiu TEMP, KK, 2 # NR=2
#endif
dsra L, TEMP, 2 # L = TEMP / 4
blez L, .L212
NOP
#else
move BO, B # Reset B
dsra L, K, 2 # L = K / 4
MTC $0, C11 # clear result registers
MOV C12, C11
gsLQC1(R13, F9, F8, 0) # B1 B2
gsLQC1(R12, F1, F0, 0) # A1 A2
MOV C13, C11
MOV C14, C11
FETCH $0, 0 * SIZE(CO1)
FETCH $0, 0 * SIZE(CO2)
PLU B3, B1, B1
blez L, .L212
PLU B4, B2, B2 # executed in the slot after the branch on both paths
#endif
/*
 * .L2110: main K loop of the 1x2 tile; each pass multiplies A1, A2, A3, A4
 * in turn against a freshly loaded B pair and advances AO/BO once.
 * Entry state: A1/A2 and B1/B2 loaded, B3/B4 = PLU copies of B1/B2.
 * Loads for later steps are interleaved with the MADPS chain, so the
 * instruction order must not be changed casually.
 */
.L2110:
daddiu L, L, -1
gsLQC1(R13, F13, F12, 1) # B5 B6 (the macro writes $f12/$f13)
MADPS C11, C11, A1, B1
MADPS C12, C12, A1, B2
MADPS C13, C13, A1, B3
MADPS C14, C14, A1, B4
PLU B7, B5, B5
PLU B8, B6, B6
gsLQC1(R13, F9, F8, 2) # B1 B2
MADPS C11, C11, A2, B5
MADPS C12, C12, A2, B6
gsLQC1(R12, F3, F2, 1) # A3 A4
MADPS C13, C13, A2, B7
MADPS C14, C14, A2, B8
PLU B3, B1, B1
PLU B4, B2, B2
gsLQC1(R13, F13, F12, 3) # B5 B6 (the macro writes $f12/$f13)
MADPS C11, C11, A3, B1
MADPS C12, C12, A3, B2
daddiu BO, BO, 4 * 4 * SIZE # 4 k-steps of B, 4 * SIZE each
daddiu AO, AO, 2 * 4 * SIZE # 4 k-steps of A, 2 * SIZE each
MADPS C13, C13, A3, B3
MADPS C14, C14, A3, B4
PLU B7, B5, B5
PLU B8, B6, B6
gsLQC1(R13, F9, F8, 0) # B1 B2 for the next pass (BO already advanced)
MADPS C11, C11, A4, B5
MADPS C12, C12, A4, B6
gsLQC1(R12, F1, F0, 0) # A1 A2 for the next pass (AO already advanced)
MADPS C13, C13, A4, B7
MADPS C14, C14, A4, B8
PLU B3, B1, B1
bgtz L, .L2110
PLU B4, B2, B2 # placed after the branch; runs whether or not it is taken
.align 4
/*
 * .L212: K & 2 remainder of the 1x2 tile - two more k-steps using A1 and A2,
 * then reload A1/A2 and B1/B2 (and B3/B4) for the K & 1 step at .L217.
 * In the TRMM build the remainder is taken from TEMP instead of K.
 */
.L212:
#ifndef TRMMKERNEL
andi L, K, 2
#else
andi L, TEMP, 2
#endif
blez L, .L217
NOP
gsLQC1(R13, F13, F12, 1) # B5 B6 (the macro writes $f12/$f13)
MADPS C11, C11, A1, B1
MADPS C12, C12, A1, B2
MADPS C13, C13, A1, B3
MADPS C14, C14, A1, B4
PLU B7, B5, B5
PLU B8, B6, B6
daddiu BO, BO, 2 * 4 * SIZE # 2 k-steps of B
MADPS C11, C11, A2, B5
MADPS C12, C12, A2, B6
daddiu AO, AO, 4 * SIZE # 2 k-steps of A
MADPS C13, C13, A2, B7
MADPS C14, C14, A2, B8
gsLQC1(R12, F1, F0, 0) # A1 A2 (reload after AO was advanced)
gsLQC1(R13, F9, F8, 0) # B1 B2
PLU B3, B1, B1
PLU B4, B2, B2
.align 4
/*
 * .L217: K & 1 remainder of the 1x2 tile - a single k-step with the A1,
 * B1..B4 values left in registers by the preceding code.
 */
.L217:
#ifndef TRMMKERNEL
andi L, K, 1
#else
andi L, TEMP, 1
#endif
blez L, .L210
NOP
MADPS C11, C11, A1, B1
daddiu BO, BO, 4 * SIZE # one k-step of B
MADPS C12, C12, A1, B2
daddiu AO, AO, 2 * SIZE # one k-step of A
MADPS C13, C13, A1, B3
MADPS C14, C14, A1, B4
.align 4
/*
 * .L210: write back the 1x2 tile.
 * C11/C13 hold the column-0 sums (A*B1 and A*B3), C12/C14 the column-1 sums
 * (A*B2 and A*B4). CVTU copies one half of each accumulator into a scratch
 * register (presumably cvt.s.pu, the upper half - TODO confirm in common.h)
 * so that scalar ADD/SUB can form the real and imaginary parts for the
 * conjugation case selected at build time.
 * alpha_r and alpha_i are reloaded from 152($sp) and 160($sp) into A4 and A2.
 *   without TRMMKERNEL: C is loaded, updated with alpha * sum, stored back
 *   with TRMMKERNEL   : alpha * sum is stored directly, then AO/BO are moved
 *                       past the k-steps the loops did not consume and KK is
 *                       bumped when LEFT is defined.
 */
.L210: # Write Back
#ifndef TRMMKERNEL
daddiu I, I, -1
CVTU A1, C11
CVTU A3, C13
CVTU A5, C12
CVTU A7, C14
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
/* (a + bi) * (c + di) */
SUB C11, C11, A1 # ac'+'bd
ADD C13, A3, C13 # ad'+'cb
# LD A1, 0 * SIZE(A) # load alpha_r
LD A4, 152($sp) # load alpha_r
LD A2, 160($sp) # load alpha_i
# LD A2, 0 * SIZE(A) # load alpha_i
SUB C12, C12, A5
ADD C14, A7, C14
LD B1, 0 * SIZE(CO1)
LD B2, 1 * SIZE(CO1)
MADD B1, B1, C11, A4 # A4 = alpha_r
MADD B2, B2, C13, A4
NMSUB B1, B1, C13, A2 # A2 = alpha_i
MADD B2, B2, C11, A2
LD B5, 0 * SIZE(CO2)
LD B6, 1 * SIZE(CO2)
MADD B5, B5, C12, A4
ST B1, 0 * SIZE(CO1)
MADD B6, B6, C14, A4
ST B2, 1 * SIZE(CO1)
NMSUB B5, B5, C14, A2
MADD B6, B6, C12, A2
ST B5, 0 * SIZE(CO2)
ST B6, 1 * SIZE(CO2)
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
/* (a + bi) * (c - di) */
ADD C11, A1, C11 # ac'+'bd
SUB C13, A3, C13 # ad'+'cb
# LD A1, 0 * SIZE(A) # load alpha_r
LD A4, 152($sp) # load alpha_r
LD A2, 160($sp) # load alpha_i
# LD A2, 0 * SIZE(A) # load alpha_r
ADD C12, A5, C12
SUB C14, A7, C14
LD B1, 0 * SIZE(CO1)
LD B2, 1 * SIZE(CO1)
MADD B1, B1, C11, A4 # A4 = alpha_r
MADD B2, B2, C13, A4
NMSUB B1, B1, C13, A2 # A2 = alpha_i
MADD B2, B2, C11, A2
LD B5, 0 * SIZE(CO2)
LD B6, 1 * SIZE(CO2)
MADD B5, B5, C12, A4
ST B1, 0 * SIZE(CO1)
MADD B6, B6, C14, A4
ST B2, 1 * SIZE(CO1)
NMSUB B5, B5, C14, A2
MADD B6, B6, C12, A2
ST B5, 0 * SIZE(CO2)
ST B6, 1 * SIZE(CO2)
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
/* (a - bi) * (c + di) */
ADD C11, A1, C11 # ac'+'bd
SUB C13, C13, A3 # ad'+'cb
# LD A1, 0 * SIZE(A) # load alpha_r
LD A4, 152($sp) # load alpha_r
# LD A2, 0 * SIZE(A) # load alpha_r
LD A2, 160($sp) # load alpha_i
ADD C12, A5, C12
SUB C14, C14, A7
LD B1, 0 * SIZE(CO1)
LD B2, 1 * SIZE(CO1)
MADD B1, B1, C11, A4 # A4 = alpha_r
MADD B2, B2, C13, A4
NMSUB B1, B1, C13, A2 # A2 = alpha_i
MADD B2, B2, C11, A2
LD B5, 0 * SIZE(CO2)
LD B6, 1 * SIZE(CO2)
MADD B5, B5, C12, A4
ST B1, 0 * SIZE(CO1)
MADD B6, B6, C14, A4
ST B2, 1 * SIZE(CO1)
NMSUB B5, B5, C14, A2
MADD B6, B6, C12, A2
ST B5, 0 * SIZE(CO2)
ST B6, 1 * SIZE(CO2)
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
/* (a - bi) * (c - di) */
SUB C11, C11, A1 # ac'+'bd
ADD C13, A3, C13 # ad'+'cb
LD A4, 152($sp) # load alpha_r
# LD A1, 0 * SIZE(A) # load alpha_r
LD A2, 160($sp) # load alpha_i
# LD A2, 0 * SIZE(A) # load alpha_i
SUB C12, C12, A5
ADD C14, A7, C14
NEG C13, C13 # imaginary part is negated in this case
LD B1, 0 * SIZE(CO1)
LD B2, 1 * SIZE(CO1)
NEG C14, C14
MADD B1, B1, C11, A4 # A4 = alpha_r
MADD B2, B2, C13, A4
NMSUB B1, B1, C13, A2 # A2 = alpha_i
MADD B2, B2, C11, A2
LD B5, 0 * SIZE(CO2)
LD B6, 1 * SIZE(CO2)
MADD B5, B5, C12, A4
ST B1, 0 * SIZE(CO1)
MADD B6, B6, C14, A4
ST B2, 1 * SIZE(CO1)
NMSUB B5, B5, C14, A2
MADD B6, B6, C12, A2
ST B5, 0 * SIZE(CO2)
ST B6, 1 * SIZE(CO2)
#endif
#else
daddiu I, I, -1
CVTU A1, C11
CVTU A3, C13
CVTU A5, C12
CVTU A7, C14
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
/* (a + bi) * (c + di) */
SUB C11, C11, A1 # ac'+'bd
ADD C13, A3, C13 # ad'+'cb
# LD A1, 0 * SIZE(A) # load alpha_r
LD A4, 152($sp) # load alpha_r
LD A2, 160($sp) # load alpha_i
# LD A2, 0 * SIZE(A) # load alpha_i
SUB C12, C12, A5
ADD C14, A7, C14
MUL B1, C11, A4 # A4 = alpha_r
MUL B2, C13, A4
NMSUB B1, B1, C13, A2 # A2 = alpha_i
MADD B2, B2, C11, A2
MUL B5, C12, A4
ST B1, 0 * SIZE(CO1)
MUL B6, C14, A4
ST B2, 1 * SIZE(CO1)
NMSUB B5, B5, C14, A2
MADD B6, B6, C12, A2
ST B5, 0 * SIZE(CO2)
ST B6, 1 * SIZE(CO2)
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
/* (a + bi) * (c - di) */
ADD C11, A1, C11 # ac'+'bd
SUB C13, A3, C13 # ad'+'cb
# LD A1, 0 * SIZE(A) # load alpha_r
LD A4, 152($sp) # load alpha_r
LD A2, 160($sp) # load alpha_i
# LD A2, 0 * SIZE(A) # load alpha_r
ADD C12, A5, C12
SUB C14, A7, C14
MUL B1, C11, A4 # A4 = alpha_r
MUL B2, C13, A4
NMSUB B1, B1, C13, A2 # A2 = alpha_i
MADD B2, B2, C11, A2
MUL B5, C12, A4
ST B1, 0 * SIZE(CO1)
MUL B6, C14, A4
ST B2, 1 * SIZE(CO1)
NMSUB B5, B5, C14, A2
MADD B6, B6, C12, A2
ST B5, 0 * SIZE(CO2)
ST B6, 1 * SIZE(CO2)
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
/* (a - bi) * (c + di) */
ADD C11, A1, C11 # ac'+'bd
SUB C13, C13, A3 # ad'+'cb
# LD A1, 0 * SIZE(A) # load alpha_r
LD A4, 152($sp) # load alpha_r
# LD A2, 0 * SIZE(A) # load alpha_r
LD A2, 160($sp) # load alpha_i
ADD C12, A5, C12
SUB C14, C14, A7
MUL B1, C11, A4 # A4 = alpha_r
MUL B2, C13, A4
NMSUB B1, B1, C13, A2 # A2 = alpha_i
MADD B2, B2, C11, A2
MUL B5, C12, A4
ST B1, 0 * SIZE(CO1)
MUL B6, C14, A4
ST B2, 1 * SIZE(CO1)
NMSUB B5, B5, C14, A2
MADD B6, B6, C12, A2
ST B5, 0 * SIZE(CO2)
ST B6, 1 * SIZE(CO2)
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
/* (a - bi) * (c - di) */
SUB C11, C11, A1 # ac'+'bd
ADD C13, A3, C13 # ad'+'cb
LD A4, 152($sp) # load alpha_r
# LD A1, 0 * SIZE(A) # load alpha_r
LD A2, 160($sp) # load alpha_i
# LD A2, 0 * SIZE(A) # load alpha_i
SUB C12, C12, A5
ADD C14, A7, C14
NEG C13, C13 # imaginary part is negated in this case
NEG C14, C14
MUL B1, C11, A4 # A4 = alpha_r
MUL B2, C13, A4
NMSUB B1, B1, C13, A2 # A2 = alpha_i
MADD B2, B2, C11, A2
MUL B5, C12, A4
ST B1, 0 * SIZE(CO1)
MUL B6, C14, A4
ST B2, 1 * SIZE(CO1)
NMSUB B5, B5, C14, A2
MADD B6, B6, C12, A2
ST B5, 0 * SIZE(CO2)
ST B6, 1 * SIZE(CO2)
#endif
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, K, KK
#ifdef LEFT
daddiu TEMP, TEMP, -1 # K - KK - 1 k-steps were not consumed (one row)
#else
daddiu TEMP, TEMP, -2 # K - KK - 2 k-steps were not consumed (two columns)
#endif
dsll L, TEMP, ZBASE_SHIFT # byte length of those k-steps in A
dsll TEMP, TEMP, 1 + ZBASE_SHIFT # byte length of those k-steps in B
daddu AO, AO, L
daddu BO, BO, TEMP
#endif
#ifdef LEFT
daddiu KK, KK, 1
#endif
#endif
daddiu CO1, CO1, 2 * SIZE # step both C pointers past the element just written
daddiu CO2, CO2, 2 * SIZE
.align 4
/*
 * .L20: end of one two-column panel. Decrements the panel counter J, makes
 * B point at the position BO has reached, bumps KK by 2 for the TRMM
 * right-side case, and loops back to .L24 while panels remain.
 */
.L20:
daddiu J, J, -1
move B, BO
#if defined(TRMMKERNEL) && !defined(LEFT)
daddiu KK, KK, 2
#endif
bgtz J, .L24
NOP
.align 4
/* .L1: if N is odd, one single column remains; otherwise leave via .L999. */
.L1:
andi J, N, 1 # J = N & 1
blez J, .L999
NOP
/*
 * .L14: start of the single-column pass. I counts the 4-row tiles, AO is
 * rewound to the start of A, CO1 takes the current column of C and C itself
 * is advanced by LDC in the slot after the branch.
 */
.L14:
dsra I, M, 2 # I = M / 4 (4-row tiles, see .L141)
move AO, A # Reset A
#if defined(TRMMKERNEL) && defined(LEFT)
move KK, OFFSET
#endif
move CO1, C
blez I, .L12
daddu C, CO1, LDC # executed whether or not the branch is taken
.align 4
/*
 * .L141: set up a 4x1 tile (four complex rows of A against one column of B).
 * Clears the eight accumulators C11..C41 and C13..C43, preloads A1..A4 from
 * AO ($12) and B1/B2 from BO ($13), builds the PLU copies B3/B4, touches the
 * destination column through FETCH, and leaves in L the trip count of the
 * loop at .L1410.
 */
.L141:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move BO, B
#else
dsll L, KK, 2 + ZBASE_SHIFT # byte offset of KK k-steps in A (4 rows)
dsll TEMP, KK, ZBASE_SHIFT # byte offset of KK k-steps in B (1 column)
daddu AO, AO, L
daddu BO, B, TEMP
#endif
MTC $0, C11 # clear result registers
MOV C21, C11
gsLQC1(R13, F9, F8, 0) # B1 B2
gsLQC1(R12, F1, F0, 0) # A1 A2
MOV C31, C11
MOV C41, C11
gsLQC1(R12, F3, F2, 1) # A3 A4
MOV C13, C11
MOV C23, C11
FETCH $0, 0 * SIZE(CO1)
MOV C33, C11
MOV C43, C11
FETCH $0, 8 * SIZE(CO1)
PLU B3, B1, B1
PLU B4, B2, B2
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, K, KK
#elif defined(LEFT)
daddiu TEMP, KK, 4 # define Mr=4
#else
daddiu TEMP, KK, 1 # define NR=1
#endif
dsra L, TEMP, 2 # L = TEMP / 4
blez L, .L142
NOP
#else
move BO, B # Reset B
dsra L, K, 2 # L = K / 4
MTC $0, C11 # clear result registers
MOV C21, C11
gsLQC1(R13, F9, F8, 0) # B1 B2
gsLQC1(R12, F1, F0, 0) # A1 A2
MOV C31, C11
MOV C41, C11
gsLQC1(R12, F3, F2, 1) # A3 A4
MOV C13, C11
MOV C23, C11
FETCH $0, 0 * SIZE(CO1)
MOV C33, C11
MOV C43, C11
FETCH $0, 8 * SIZE(CO1)
PLU B3, B1, B1
blez L, .L142
PLU B4, B2, B2 # executed in the slot after the branch on both paths
#endif
/*
 * .L1410: main K loop of the 4x1 tile. Each pass runs four groups of eight
 * MADPS, using B1, B2, B5, B6 in turn (with their PLU copies B3, B4, B7, B8)
 * against alternating A1..A4 / A5..A8 register sets, then advances AO and BO.
 * Loads are interleaved with the arithmetic; do not reorder casually.
 */
.L1410:
daddiu L, L, -1
MADPS C11, C11, A1, B1
MADPS C21, C21, A2, B1
gsLQC1(R12, F5, F4, 2) # A5 A6
gsLQC1(R12, F7, F6, 3) # A7 A8
MADPS C31, C31, A3, B1
MADPS C41, C41, A4, B1
MADPS C13, C13, A1, B3
MADPS C23, C23, A2, B3
MADPS C33, C33, A3, B3
MADPS C43, C43, A4, B3
gsLQC1(R13, F13, F12, 1) # B5 B6 (the macro writes $f12/$f13)
gsLQC1(R12, F1, F0, 4) # A1 A2
MADPS C11, C11, A5, B2
MADPS C21, C21, A6, B2
gsLQC1(R12, F3, F2, 5) # A3 A4
MADPS C31, C31, A7, B2
MADPS C41, C41, A8, B2
daddiu BO, BO, 2 * 4 * SIZE # 4 k-steps of B, 2 * SIZE each
MADPS C13, C13, A5, B4
MADPS C23, C23, A6, B4
MADPS C33, C33, A7, B4
MADPS C43, C43, A8, B4
PLU B7, B5, B5
PLU B8, B6, B6
MADPS C11, C11, A1, B5
MADPS C21, C21, A2, B5
gsLQC1(R12, F5, F4, 6) # A5 A6
gsLQC1(R12, F7, F6, 7) # A7 A8
MADPS C31, C31, A3, B5
MADPS C41, C41, A4, B5
daddiu AO, AO, 8 * 4 * SIZE # 4 k-steps of A, 8 * SIZE each
MADPS C13, C13, A1, B7
MADPS C23, C23, A2, B7
MADPS C33, C33, A3, B7
MADPS C43, C43, A4, B7
gsLQC1(R13, F9, F8, 0) # B1 B2 for the next pass (BO already advanced)
gsLQC1(R12, F1, F0, 0) # A1 A2 for the next pass (AO already advanced)
MADPS C11, C11, A5, B6
MADPS C21, C21, A6, B6
gsLQC1(R12, F3, F2, 1) # A3 A4
MADPS C31, C31, A7, B6
MADPS C41, C41, A8, B6
MADPS C13, C13, A5, B8
MADPS C23, C23, A6, B8
MADPS C33, C33, A7, B8
MADPS C43, C43, A8, B8
PLU B3, B1, B1
bgtz L, .L1410
PLU B4, B2, B2 # placed after the branch; runs whether or not it is taken
.align 4
/*
 * .L142: K & 2 remainder of the 4x1 tile - two k-steps using B1 and B2
 * (B3/B4 are their PLU copies from the preceding code), then A1..A4, B1 and
 * B3 are refreshed for the K & 1 step at .L147.
 * In the TRMM build the remainder is taken from TEMP instead of K.
 */
.L142:
#ifndef TRMMKERNEL
andi L, K, 2
#else
andi L, TEMP, 2
#endif
blez L, .L147
NOP
MADPS C11, C11, A1, B1
MADPS C21, C21, A2, B1
gsLQC1(R12, F5, F4, 2) # A5 A6
gsLQC1(R12, F7, F6, 3) # A7 A8
MADPS C31, C31, A3, B1
MADPS C41, C41, A4, B1
daddiu AO, AO, 4 * 4 * SIZE # 2 k-steps of A, 8 * SIZE each
MADPS C13, C13, A1, B3
MADPS C23, C23, A2, B3
MADPS C33, C33, A3, B3
MADPS C43, C43, A4, B3
gsLQC1(R13, F13, F8, 1) # low register is $f8 (B1) for .L147; the other half goes to $f13 so B2 ($f9) stays intact for the MADPS below
gsLQC1(R12, F1, F0, 0) # A1 A2
MADPS C11, C11, A5, B2
MADPS C21, C21, A6, B2
gsLQC1(R12, F3, F2, 1) # A3 A4
MADPS C31, C31, A7, B2
MADPS C41, C41, A8, B2
daddiu BO, BO, 4 * SIZE # 2 k-steps of B, 2 * SIZE each
MADPS C13, C13, A5, B4
MADPS C23, C23, A6, B4
MADPS C33, C33, A7, B4
MADPS C43, C43, A8, B4
PLU B3, B1, B1
.align 4
/*
 * .L147: K & 1 remainder of the 4x1 tile - a single k-step with the A1..A4,
 * B1 and B3 values left in registers by the preceding code.
 */
.L147:
#ifndef TRMMKERNEL
andi L, K, 1
#else
andi L, TEMP, 1
#endif
blez L, .L140
NOP
MADPS C11, C11, A1, B1
MADPS C21, C21, A2, B1
daddiu BO, BO, 2 * SIZE # one k-step of B
MADPS C31, C31, A3, B1
MADPS C41, C41, A4, B1
daddiu AO, AO, 2 * 4 * SIZE # one k-step of A (4 rows)
MADPS C13, C13, A1, B3
MADPS C23, C23, A2, B3
MADPS C33, C33, A3, B3
MADPS C43, C43, A4, B3
.align 4
/*
 * .L140: write back the 4x1 tile and loop to the next one.
 * C11..C41 hold the A*B1-style sums and C13..C43 the A*B3-style sums for the
 * four rows. CVTU copies one half of each accumulator into A1..A8
 * (presumably cvt.s.pu, the upper half - TODO confirm in common.h) so that
 * scalar ADD/SUB can form the real and imaginary parts for the conjugation
 * case selected at build time. alpha_r and alpha_i are reloaded from
 * 152($sp) and 160($sp) into A1 and A2 once their CVTU values are consumed.
 *   without TRMMKERNEL: C is loaded, updated with alpha * sum, stored back
 *   with TRMMKERNEL   : alpha * sum is stored directly, then AO/BO are moved
 *                       past the k-steps the loops did not consume and KK is
 *                       bumped by 4 when LEFT is defined.
 * The single-column loops (.L1410/.L142/.L147) never touch C12..C44, and
 * B1..B8 are rewritten by LD or MUL before they are read, so this block does
 * not extract anything from those second-column accumulators.
 */
.L140: # Write Back
#ifndef TRMMKERNEL
daddiu I, I, -1
CVTU A1, C11
CVTU A2, C21
CVTU A3, C31
CVTU A4, C41
CVTU A5, C13
CVTU A6, C23
CVTU A7, C33
CVTU A8, C43
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
/* (a + bi) * (c + di) */
SUB C11, C11, A1 # ac'+'bd
SUB C21, C21, A2
# LD A1, 0 * SIZE(A) # load alpha_r
SUB C31, C31, A3
LD A1, 152($sp) # load alpha_r
SUB C41, C41, A4
LD A2, 160($sp) # load alpha_i
# LD A2, 0 * SIZE(A) # load alpha_i
ADD C13, A5, C13 # ad'+'cb
ADD C23, A6, C23
ADD C33, A7, C33
ADD C43, A8, C43
LD B1, 0 * SIZE(CO1)
LD B3, 2 * SIZE(CO1)
LD B5, 4 * SIZE(CO1)
LD B7, 6 * SIZE(CO1)
LD B2, 1 * SIZE(CO1)
LD B4, 3 * SIZE(CO1)
LD B6, 5 * SIZE(CO1)
LD B8, 7 * SIZE(CO1)
MADD B1, B1, C11, A1 # A1 = alpha_r
MADD B3, B3, C21, A1
MADD B5, B5, C31, A1
MADD B7, B7, C41, A1
MADD B2, B2, C13, A1
MADD B4, B4, C23, A1
MADD B6, B6, C33, A1
MADD B8, B8, C43, A1
NMSUB B1, B1, C13, A2 # A2 = alpha_i
NMSUB B3, B3, C23, A2
NMSUB B5, B5, C33, A2
NMSUB B7, B7, C43, A2
MADD B2, B2, C11, A2
MADD B4, B4, C21, A2
MADD B6, B6, C31, A2
MADD B8, B8, C41, A2
ST B1, 0 * SIZE(CO1)
ST B3, 2 * SIZE(CO1)
ST B5, 4 * SIZE(CO1)
ST B7, 6 * SIZE(CO1)
ST B2, 1 * SIZE(CO1)
ST B4, 3 * SIZE(CO1)
ST B6, 5 * SIZE(CO1)
ST B8, 7 * SIZE(CO1)
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
/* (a + bi) * (c - di) */
ADD C11, A1, C11 # ac'+'bd
ADD C21, A2, C21
# LD A1, 0 * SIZE(A) # load alpha_r
ADD C31, A3, C31
LD A1, 152($sp) # load alpha_r
ADD C41, A4, C41
LD A2, 160($sp) # load alpha_i
# LD A2, 0 * SIZE(A) # load alpha_r
SUB C13, A5, C13 # ad'+'cb
SUB C23, A6, C23
SUB C33, A7, C33
SUB C43, A8, C43
LD B1, 0 * SIZE(CO1)
LD B3, 2 * SIZE(CO1)
LD B5, 4 * SIZE(CO1)
LD B7, 6 * SIZE(CO1)
LD B2, 1 * SIZE(CO1)
LD B4, 3 * SIZE(CO1)
LD B6, 5 * SIZE(CO1)
LD B8, 7 * SIZE(CO1)
MADD B1, B1, C11, A1 # A1 = alpha_r
MADD B3, B3, C21, A1
MADD B5, B5, C31, A1
MADD B7, B7, C41, A1
MADD B2, B2, C13, A1
MADD B4, B4, C23, A1
MADD B6, B6, C33, A1
MADD B8, B8, C43, A1
NMSUB B1, B1, C13, A2 # A2 = alpha_i
NMSUB B3, B3, C23, A2
NMSUB B5, B5, C33, A2
NMSUB B7, B7, C43, A2
MADD B2, B2, C11, A2
MADD B4, B4, C21, A2
MADD B6, B6, C31, A2
MADD B8, B8, C41, A2
ST B1, 0 * SIZE(CO1)
ST B3, 2 * SIZE(CO1)
ST B5, 4 * SIZE(CO1)
ST B7, 6 * SIZE(CO1)
ST B2, 1 * SIZE(CO1)
ST B4, 3 * SIZE(CO1)
ST B6, 5 * SIZE(CO1)
ST B8, 7 * SIZE(CO1)
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
/* (a - bi) * (c + di) */
ADD C11, A1, C11 # ac'+'bd
ADD C21, A2, C21
# LD A1, 0 * SIZE(A) # load alpha_r
ADD C31, A3, C31
LD A1, 152($sp) # load alpha_r
# LD A2, 0 * SIZE(A) # load alpha_r
ADD C41, A4, C41
LD A2, 160($sp) # load alpha_i
SUB C13, C13, A5 # ad'+'cb
SUB C23, C23, A6
SUB C33, C33, A7
SUB C43, C43, A8
LD B1, 0 * SIZE(CO1)
LD B3, 2 * SIZE(CO1)
LD B5, 4 * SIZE(CO1)
LD B7, 6 * SIZE(CO1)
LD B2, 1 * SIZE(CO1)
LD B4, 3 * SIZE(CO1)
LD B6, 5 * SIZE(CO1)
LD B8, 7 * SIZE(CO1)
MADD B1, B1, C11, A1 # A1 = alpha_r
MADD B3, B3, C21, A1
MADD B5, B5, C31, A1
MADD B7, B7, C41, A1
MADD B2, B2, C13, A1
MADD B4, B4, C23, A1
MADD B6, B6, C33, A1
MADD B8, B8, C43, A1
NMSUB B1, B1, C13, A2 # A2 = alpha_i
NMSUB B3, B3, C23, A2
NMSUB B5, B5, C33, A2
NMSUB B7, B7, C43, A2
MADD B2, B2, C11, A2
MADD B4, B4, C21, A2
MADD B6, B6, C31, A2
MADD B8, B8, C41, A2
ST B1, 0 * SIZE(CO1)
ST B3, 2 * SIZE(CO1)
ST B5, 4 * SIZE(CO1)
ST B7, 6 * SIZE(CO1)
ST B2, 1 * SIZE(CO1)
ST B4, 3 * SIZE(CO1)
ST B6, 5 * SIZE(CO1)
ST B8, 7 * SIZE(CO1)
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
/* (a - bi) * (c - di) */
SUB C11, C11, A1 # AC'+'BD
SUB C21, C21, A2
SUB C31, C31, A3
LD A1, 152($sp) # LOAD ALPHA_R
# LD A1, 0 * SIZE(A) # LOAD ALPHA_R
SUB C41, C41, A4
LD A2, 160($sp) # LOAD ALPHA_I
# LD A2, 0 * SIZE(A) # LOAD ALPHA_I
ADD C13, A5, C13 # AD'+'CB
ADD C23, A6, C23
ADD C33, A7, C33
ADD C43, A8, C43
NEG C13, C13 # imaginary part is negated in this case
NEG C23, C23
NEG C33, C33
NEG C43, C43
LD B1, 0 * SIZE(CO1)
LD B3, 2 * SIZE(CO1)
LD B5, 4 * SIZE(CO1)
LD B7, 6 * SIZE(CO1)
LD B2, 1 * SIZE(CO1)
LD B4, 3 * SIZE(CO1)
LD B6, 5 * SIZE(CO1)
LD B8, 7 * SIZE(CO1)
MADD B1, B1, C11, A1 # A1 = ALPHA_R
MADD B3, B3, C21, A1
MADD B5, B5, C31, A1
MADD B7, B7, C41, A1
MADD B2, B2, C13, A1
MADD B4, B4, C23, A1
MADD B6, B6, C33, A1
MADD B8, B8, C43, A1
NMSUB B1, B1, C13, A2 # A2 = ALPHA_I
NMSUB B3, B3, C23, A2
NMSUB B5, B5, C33, A2
NMSUB B7, B7, C43, A2
MADD B2, B2, C11, A2
MADD B4, B4, C21, A2
MADD B6, B6, C31, A2
MADD B8, B8, C41, A2
ST B1, 0 * SIZE(CO1)
ST B3, 2 * SIZE(CO1)
ST B5, 4 * SIZE(CO1)
ST B7, 6 * SIZE(CO1)
ST B2, 1 * SIZE(CO1)
ST B4, 3 * SIZE(CO1)
ST B6, 5 * SIZE(CO1)
ST B8, 7 * SIZE(CO1)
#endif
#else
daddiu I, I, -1
CVTU A1, C11
CVTU A2, C21
CVTU A3, C31
CVTU A4, C41
CVTU A5, C13
CVTU A6, C23
CVTU A7, C33
CVTU A8, C43
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
/* (a + bi) * (c + di) */
SUB C11, C11, A1 # ac'+'bd
SUB C21, C21, A2
# LD A1, 0 * SIZE(A) # load alpha_r
SUB C31, C31, A3
LD A1, 152($sp) # load alpha_r
SUB C41, C41, A4
LD A2, 160($sp) # load alpha_i
# LD A2, 0 * SIZE(A) # load alpha_i
ADD C13, A5, C13 # ad'+'cb
ADD C23, A6, C23
ADD C33, A7, C33
ADD C43, A8, C43
MUL B1, C11, A1 # A1 = alpha_r
MUL B3, C21, A1
MUL B5, C31, A1
MUL B7, C41, A1
MUL B2, C13, A1
MUL B4, C23, A1
MUL B6, C33, A1
MUL B8, C43, A1
NMSUB B1, B1, C13, A2 # A2 = alpha_i
NMSUB B3, B3, C23, A2
NMSUB B5, B5, C33, A2
NMSUB B7, B7, C43, A2
MADD B2, B2, C11, A2
MADD B4, B4, C21, A2
MADD B6, B6, C31, A2
MADD B8, B8, C41, A2
ST B1, 0 * SIZE(CO1)
ST B3, 2 * SIZE(CO1)
ST B5, 4 * SIZE(CO1)
ST B7, 6 * SIZE(CO1)
ST B2, 1 * SIZE(CO1)
ST B4, 3 * SIZE(CO1)
ST B6, 5 * SIZE(CO1)
ST B8, 7 * SIZE(CO1)
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
/* (a + bi) * (c - di) */
ADD C11, A1, C11 # ac'+'bd
ADD C21, A2, C21
# LD A1, 0 * SIZE(A) # load alpha_r
ADD C31, A3, C31
LD A1, 152($sp) # load alpha_r
ADD C41, A4, C41
LD A2, 160($sp) # load alpha_i
# LD A2, 0 * SIZE(A) # load alpha_r
SUB C13, A5, C13 # ad'+'cb
SUB C23, A6, C23
SUB C33, A7, C33
SUB C43, A8, C43
MUL B1, C11, A1 # A1 = alpha_r
MUL B3, C21, A1
MUL B5, C31, A1
MUL B7, C41, A1
MUL B2, C13, A1
MUL B4, C23, A1
MUL B6, C33, A1
MUL B8, C43, A1
NMSUB B1, B1, C13, A2 # A2 = alpha_i
NMSUB B3, B3, C23, A2
NMSUB B5, B5, C33, A2
NMSUB B7, B7, C43, A2
MADD B2, B2, C11, A2
MADD B4, B4, C21, A2
MADD B6, B6, C31, A2
MADD B8, B8, C41, A2
ST B1, 0 * SIZE(CO1)
ST B3, 2 * SIZE(CO1)
ST B5, 4 * SIZE(CO1)
ST B7, 6 * SIZE(CO1)
ST B2, 1 * SIZE(CO1)
ST B4, 3 * SIZE(CO1)
ST B6, 5 * SIZE(CO1)
ST B8, 7 * SIZE(CO1)
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
/* (a - bi) * (c + di) */
ADD C11, A1, C11 # ac'+'bd
ADD C21, A2, C21
# LD A1, 0 * SIZE(A) # load alpha_r
ADD C31, A3, C31
LD A1, 152($sp) # load alpha_r
# LD A2, 0 * SIZE(A) # load alpha_r
ADD C41, A4, C41
LD A2, 160($sp) # load alpha_i
SUB C13, C13, A5 # ad'+'cb
SUB C23, C23, A6
SUB C33, C33, A7
SUB C43, C43, A8
MUL B1, C11, A1 # A1 = alpha_r
MUL B3, C21, A1
MUL B5, C31, A1
MUL B7, C41, A1
MUL B2, C13, A1
MUL B4, C23, A1
MUL B6, C33, A1
MUL B8, C43, A1
NMSUB B1, B1, C13, A2 # A2 = alpha_i
NMSUB B3, B3, C23, A2
NMSUB B5, B5, C33, A2
NMSUB B7, B7, C43, A2
MADD B2, B2, C11, A2
MADD B4, B4, C21, A2
MADD B6, B6, C31, A2
MADD B8, B8, C41, A2
ST B1, 0 * SIZE(CO1)
ST B3, 2 * SIZE(CO1)
ST B5, 4 * SIZE(CO1)
ST B7, 6 * SIZE(CO1)
ST B2, 1 * SIZE(CO1)
ST B4, 3 * SIZE(CO1)
ST B6, 5 * SIZE(CO1)
ST B8, 7 * SIZE(CO1)
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
/* (a - bi) * (c - di) */
SUB C11, C11, A1 # AC'+'BD
SUB C21, C21, A2
SUB C31, C31, A3
LD A1, 152($sp) # LOAD ALPHA_R
# LD A1, 0 * SIZE(A) # LOAD ALPHA_R
SUB C41, C41, A4
LD A2, 160($sp) # LOAD ALPHA_I
# LD A2, 0 * SIZE(A) # LOAD ALPHA_I
ADD C13, A5, C13 # AD'+'CB
ADD C23, A6, C23
ADD C33, A7, C33
ADD C43, A8, C43
NEG C13, C13 # imaginary part is negated in this case
NEG C23, C23
NEG C33, C33
NEG C43, C43
MUL B1, C11, A1 # A1 = ALPHA_R
MUL B3, C21, A1
MUL B5, C31, A1
MUL B7, C41, A1
MUL B2, C13, A1
MUL B4, C23, A1
MUL B6, C33, A1
MUL B8, C43, A1
NMSUB B1, B1, C13, A2 # A2 = ALPHA_I
NMSUB B3, B3, C23, A2
NMSUB B5, B5, C33, A2
NMSUB B7, B7, C43, A2
MADD B2, B2, C11, A2
MADD B4, B4, C21, A2
MADD B6, B6, C31, A2
MADD B8, B8, C41, A2
ST B1, 0 * SIZE(CO1)
ST B3, 2 * SIZE(CO1)
ST B5, 4 * SIZE(CO1)
ST B7, 6 * SIZE(CO1)
ST B2, 1 * SIZE(CO1)
ST B4, 3 * SIZE(CO1)
ST B6, 5 * SIZE(CO1)
ST B8, 7 * SIZE(CO1)
#endif
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, K, KK
#ifdef LEFT
daddiu TEMP, TEMP, -4 # K - KK - 4 k-steps were not consumed (four rows)
#else
daddiu TEMP, TEMP, -1 # K - KK - 1 k-steps were not consumed (one column)
#endif
dsll L, TEMP, 2 + ZBASE_SHIFT # byte length of those k-steps in A
dsll TEMP, TEMP, ZBASE_SHIFT # byte length of those k-steps in B
daddu AO, AO, L
daddu BO, BO, TEMP
#endif
#ifdef LEFT
daddiu KK, KK, 4
#endif
#endif
bgtz I, .L141
daddiu CO1, CO1, 8 * SIZE # runs after the branch either way: next 4 rows of C
.align 4
/* .L12: M & 2 part of the single-column pass - one 2-row tile if bit 1 of M is set. */
.L12:
andi I, M, 2 # I = M & 2 (2-row tile, see .L121)
blez I, .L11
NOP
.align 4
/*
 * .L121: set up a 2x1 tile (two complex rows of A against one column of B).
 * Clears C11/C21/C13/C23, preloads A1/A2 from AO ($12) and B1/B2 from BO
 * ($13), builds the PLU copies B3/B4, touches the destination through FETCH,
 * and leaves in L the trip count of the loop at .L1210.
 */
.L121:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move BO, B
#else
dsll L, KK, 1 + ZBASE_SHIFT # byte offset of KK k-steps in A (2 rows)
dsll TEMP, KK, ZBASE_SHIFT # byte offset of KK k-steps in B (1 column)
daddu AO, AO, L
daddu BO, B, TEMP
#endif
MTC $0, C11 # clear result registers
MOV C21, C11
gsLQC1(R13, F9, F8, 0) # B1 B2
gsLQC1(R12, F1, F0, 0) # A1 A2
MOV C13, C11
MOV C23, C11
FETCH $0, 0 * SIZE(CO1)
FETCH $0, 8 * SIZE(CO1)
PLU B3, B1, B1
PLU B4, B2, B2
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, K, KK
#elif defined(LEFT)
daddiu TEMP, KK, 2 # 2 rows in this tile
#else
daddiu TEMP, KK, 1 # 1 column in this tile
#endif
dsra L, TEMP, 2 # L = TEMP / 4
blez L, .L122
NOP
#else
move BO, B # Reset B
dsra L, K, 2 # L = K / 4
MTC $0, C11 # clear result registers
MOV C21, C11
gsLQC1(R13, F9, F8, 0) # B1 B2
gsLQC1(R12, F1, F0, 0) # A1 A2
MOV C13, C11
MOV C23, C11
FETCH $0, 0 * SIZE(CO1)
FETCH $0, 8 * SIZE(CO1)
PLU B3, B1, B1
blez L, .L122
PLU B4, B2, B2 # executed in the slot after the branch on both paths
#endif
/*
 * .L1210: main K loop of the 2x1 tile. Each pass runs four groups of four
 * MADPS, pairing (A1,A2) with B1, (A3,A4) with B2, (A5,A6) with B5 and
 * (A7,A8) with B6, plus the PLU copies B3, B4, B7, B8, then advances AO/BO.
 * Loads are interleaved with the arithmetic; do not reorder casually.
 */
.L1210:
daddiu L, L, -1
gsLQC1(R13, F13, F12, 1) # B5 B6 (the macro writes $f12/$f13)
MADPS C11, C11, A1, B1
MADPS C21, C21, A2, B1
gsLQC1(R12, F3, F2, 1) # A3 A4
MADPS C13, C13, A1, B3
MADPS C23, C23, A2, B3
gsLQC1(R12, F5, F4, 2) # A5 A6
PLU B7, B5, B5
PLU B8, B6, B6
daddiu BO, BO, 2 * 4 * SIZE # 4 k-steps of B, 2 * SIZE each
MADPS C11, C11, A3, B2
MADPS C21, C21, A4, B2
gsLQC1(R12, F7, F6, 3) # A7 A8
MADPS C13, C13, A3, B4
MADPS C23, C23, A4, B4
MADPS C11, C11, A5, B5
MADPS C21, C21, A6, B5
daddiu AO, AO, 4 * 4 * SIZE # 4 k-steps of A, 4 * SIZE each
gsLQC1(R13, F9, F8, 0) # B1 B2 for the next pass (BO already advanced)
MADPS C13, C13, A5, B7
MADPS C23, C23, A6, B7
gsLQC1(R12, F1, F0, 0) # A1 A2 for the next pass (AO already advanced)
MADPS C11, C11, A7, B6
MADPS C21, C21, A8, B6
MADPS C13, C13, A7, B8
MADPS C23, C23, A8, B8
PLU B3, B1, B1
bgtz L, .L1210
PLU B4, B2, B2 # placed after the branch; runs whether or not it is taken
.align 4
/*
 * .L122: K & 2 remainder of the 2x1 tile - two k-steps, (A1,A2) with B1/B3
 * and (A3,A4) with B2/B4, then A1/A2, B1/B2 and B3 are refreshed for the
 * K & 1 step at .L127.
 * In the TRMM build the remainder is taken from TEMP instead of K.
 * B5 is not loaded on this path and B7 is not read before the next loop
 * rewrites it, so no PLU of B5 is performed here.
 */
.L122:
#ifndef TRMMKERNEL
andi L, K, 2
#else
andi L, TEMP, 2
#endif
blez L, .L127
NOP
MADPS C11, C11, A1, B1
MADPS C21, C21, A2, B1
gsLQC1(R12, F3, F2, 1) # A3 A4
MADPS C13, C13, A1, B3
MADPS C23, C23, A2, B3
daddiu BO, BO, 1 * 4 * SIZE # 2 k-steps of B, 2 * SIZE each
daddiu AO, AO, 2 * 4 * SIZE # 2 k-steps of A, 4 * SIZE each
MADPS C11, C11, A3, B2
MADPS C21, C21, A4, B2
MADPS C13, C13, A3, B4
MADPS C23, C23, A4, B4
gsLQC1(R13, F9, F8, 0) # B1 B2 (reload after BO was advanced)
gsLQC1(R12, F1, F0, 0) # A1 A2 (reload after AO was advanced)
PLU B3, B1, B1
.align 4
/*
 * .L127: K & 1 remainder of the 2x1 tile - a single k-step with the A1/A2,
 * B1 and B3 values left in registers by the preceding code.
 */
.L127:
#ifndef TRMMKERNEL
andi L, K, 1
#else
andi L, TEMP, 1
#endif
blez L, .L120
NOP
MADPS C11, C11, A1, B1
MADPS C21, C21, A2, B1
daddiu BO, BO, 2 * SIZE # one k-step of B
daddiu AO, AO, 4 * SIZE # one k-step of A (2 rows)
MADPS C13, C13, A1, B3
MADPS C23, C23, A2, B3
.align 4
/*
 * .L120: write back the 2x1 tile.
 * C11/C21 hold the A*B1-style sums and C13/C23 the A*B3-style sums for the
 * two rows. CVTU copies one half of each accumulator into A1..A4 (presumably
 * cvt.s.pu, the upper half - TODO confirm in common.h) so that scalar
 * ADD/SUB can form the real and imaginary parts for the conjugation case
 * selected at build time. alpha_r and alpha_i are then reloaded from
 * 152($sp) and 160($sp) into A1 and A2.
 *   without TRMMKERNEL: C is loaded, updated with alpha * sum, stored back
 *   with TRMMKERNEL   : alpha * sum is stored directly, then AO/BO are moved
 *                       past the k-steps the loops did not consume and KK is
 *                       bumped by 2 when LEFT is defined.
 */
.L120: # Write Back
#ifndef TRMMKERNEL
daddiu I, I, -1
CVTU A1, C11
CVTU A2, C21
CVTU A3, C13
CVTU A4, C23
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
/* (a + bi) * (c + di) */
SUB C11, C11, A1 # ac'+'bd
SUB C21, C21, A2
ADD C13, A3, C13 # ad'+'cb
ADD C23, A4, C23
# LD A1, 0 * SIZE(A) # load alpha_r
LD A1, 152($sp) # load alpha_r
LD A2, 160($sp) # load alpha_i
# LD A2, 0 * SIZE(A) # load alpha_i
LD B1, 0 * SIZE(CO1)
LD B3, 2 * SIZE(CO1)
LD B2, 1 * SIZE(CO1)
LD B4, 3 * SIZE(CO1)
MADD B1, B1, C11, A1 # A1 = alpha_r
MADD B3, B3, C21, A1
MADD B2, B2, C13, A1
MADD B4, B4, C23, A1
NMSUB B1, B1, C13, A2 # A2 = alpha_i
NMSUB B3, B3, C23, A2
MADD B2, B2, C11, A2
MADD B4, B4, C21, A2
ST B1, 0 * SIZE(CO1)
ST B3, 2 * SIZE(CO1)
ST B2, 1 * SIZE(CO1)
ST B4, 3 * SIZE(CO1)
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
/* (a + bi) * (c - di) */
ADD C11, A1, C11 # ac'+'bd
ADD C21, A2, C21
SUB C13, A3, C13 # ad'+'cb
SUB C23, A4, C23
# LD A1, 0 * SIZE(A) # load alpha_r
LD A1, 152($sp) # load alpha_r
LD A2, 160($sp) # load alpha_i
# LD A2, 0 * SIZE(A) # load alpha_r
LD B1, 0 * SIZE(CO1)
LD B3, 2 * SIZE(CO1)
LD B2, 1 * SIZE(CO1)
LD B4, 3 * SIZE(CO1)
MADD B1, B1, C11, A1 # A1 = alpha_r
MADD B3, B3, C21, A1
MADD B2, B2, C13, A1
MADD B4, B4, C23, A1
NMSUB B1, B1, C13, A2 # A2 = alpha_i
NMSUB B3, B3, C23, A2
MADD B2, B2, C11, A2
MADD B4, B4, C21, A2
ST B1, 0 * SIZE(CO1)
ST B3, 2 * SIZE(CO1)
ST B2, 1 * SIZE(CO1)
ST B4, 3 * SIZE(CO1)
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
/* (a - bi) * (c + di) */
ADD C11, A1, C11 # ac'+'bd
ADD C21, A2, C21
SUB C13, C13, A3 # ad'+'cb
SUB C23, C23, A4
# LD A1, 0 * SIZE(A) # load alpha_r
LD A1, 152($sp) # load alpha_r
# LD A2, 0 * SIZE(A) # load alpha_r
LD A2, 160($sp) # load alpha_i
LD B1, 0 * SIZE(CO1)
LD B3, 2 * SIZE(CO1)
LD B2, 1 * SIZE(CO1)
LD B4, 3 * SIZE(CO1)
MADD B1, B1, C11, A1 # A1 = alpha_r
MADD B3, B3, C21, A1
MADD B2, B2, C13, A1
MADD B4, B4, C23, A1
NMSUB B1, B1, C13, A2 # A2 = alpha_i
NMSUB B3, B3, C23, A2
MADD B2, B2, C11, A2
MADD B4, B4, C21, A2
ST B1, 0 * SIZE(CO1)
ST B3, 2 * SIZE(CO1)
ST B2, 1 * SIZE(CO1)
ST B4, 3 * SIZE(CO1)
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
/* (a - bi) * (c - di) */
SUB C11, C11, A1 # ac'+'bd
SUB C21, C21, A2
ADD C13, A3, C13 # ad'+'cb
ADD C23, A4, C23
LD A1, 152($sp) # load alpha_r
# LD A1, 0 * SIZE(A) # load alpha_r
LD A2, 160($sp) # load alpha_i
# LD A2, 0 * SIZE(A) # load alpha_i
NEG C13, C13 # imaginary part is negated in this case
NEG C23, C23
LD B1, 0 * SIZE(CO1)
LD B3, 2 * SIZE(CO1)
LD B2, 1 * SIZE(CO1)
LD B4, 3 * SIZE(CO1)
MADD B1, B1, C11, A1 # A1 = alpha_r
MADD B3, B3, C21, A1
MADD B2, B2, C13, A1
MADD B4, B4, C23, A1
NMSUB B1, B1, C13, A2 # A2 = alpha_i
NMSUB B3, B3, C23, A2
MADD B2, B2, C11, A2
MADD B4, B4, C21, A2
ST B1, 0 * SIZE(CO1)
ST B3, 2 * SIZE(CO1)
ST B2, 1 * SIZE(CO1)
ST B4, 3 * SIZE(CO1)
#endif
#else
daddiu I, I, -1
CVTU A1, C11
CVTU A2, C21
CVTU A3, C13
CVTU A4, C23
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
/* (a + bi) * (c + di) */
SUB C11, C11, A1 # ac'+'bd
SUB C21, C21, A2
ADD C13, A3, C13 # ad'+'cb
ADD C23, A4, C23
# LD A1, 0 * SIZE(A) # load alpha_r
LD A1, 152($sp) # load alpha_r
LD A2, 160($sp) # load alpha_i
# LD A2, 0 * SIZE(A) # load alpha_i
MUL B1, C11, A1 # A1 = alpha_r
MUL B3, C21, A1
MUL B2, C13, A1
MUL B4, C23, A1
NMSUB B1, B1, C13, A2 # A2 = alpha_i
NMSUB B3, B3, C23, A2
MADD B2, B2, C11, A2
MADD B4, B4, C21, A2
ST B1, 0 * SIZE(CO1)
ST B3, 2 * SIZE(CO1)
ST B2, 1 * SIZE(CO1)
ST B4, 3 * SIZE(CO1)
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
/* (a + bi) * (c - di) */
ADD C11, A1, C11 # ac'+'bd
ADD C21, A2, C21
SUB C13, A3, C13 # ad'+'cb
SUB C23, A4, C23
# LD A1, 0 * SIZE(A) # load alpha_r
LD A1, 152($sp) # load alpha_r
LD A2, 160($sp) # load alpha_i
# LD A2, 0 * SIZE(A) # load alpha_r
MUL B1, C11, A1 # A1 = alpha_r
MUL B3, C21, A1
MUL B2, C13, A1
MUL B4, C23, A1
NMSUB B1, B1, C13, A2 # A2 = alpha_i
NMSUB B3, B3, C23, A2
MADD B2, B2, C11, A2
MADD B4, B4, C21, A2
ST B1, 0 * SIZE(CO1)
ST B3, 2 * SIZE(CO1)
ST B2, 1 * SIZE(CO1)
ST B4, 3 * SIZE(CO1)
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
/* (a - bi) * (c + di) */
ADD C11, A1, C11 # ac'+'bd
ADD C21, A2, C21
SUB C13, C13, A3 # ad'+'cb
SUB C23, C23, A4
# LD A1, 0 * SIZE(A) # load alpha_r
LD A1, 152($sp) # load alpha_r
# LD A2, 0 * SIZE(A) # load alpha_r
LD A2, 160($sp) # load alpha_i
MUL B1, C11, A1 # A1 = alpha_r
MUL B3, C21, A1
MUL B2, C13, A1
MUL B4, C23, A1
NMSUB B1, B1, C13, A2 # A2 = alpha_i
NMSUB B3, B3, C23, A2
MADD B2, B2, C11, A2
MADD B4, B4, C21, A2
ST B1, 0 * SIZE(CO1)
ST B3, 2 * SIZE(CO1)
ST B2, 1 * SIZE(CO1)
ST B4, 3 * SIZE(CO1)
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
/* (a - bi) * (c - di) */
SUB C11, C11, A1 # ac'+'bd
SUB C21, C21, A2
ADD C13, A3, C13 # ad'+'cb
ADD C23, A4, C23
LD A1, 152($sp) # load alpha_r
# LD A1, 0 * SIZE(A) # load alpha_r
LD A2, 160($sp) # load alpha_i
# LD A2, 0 * SIZE(A) # load alpha_i
NEG C13, C13 # imaginary part is negated in this case
NEG C23, C23
MUL B1, C11, A1 # A1 = alpha_r
MUL B3, C21, A1
MUL B2, C13, A1
MUL B4, C23, A1
NMSUB B1, B1, C13, A2 # A2 = alpha_i
NMSUB B3, B3, C23, A2
MADD B2, B2, C11, A2
MADD B4, B4, C21, A2
ST B1, 0 * SIZE(CO1)
ST B3, 2 * SIZE(CO1)
ST B2, 1 * SIZE(CO1)
ST B4, 3 * SIZE(CO1)
#endif
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, K, KK
#ifdef LEFT
daddiu TEMP, TEMP, -2 # K - KK - 2 k-steps were not consumed (two rows)
#else
daddiu TEMP, TEMP, -1 # K - KK - 1 k-steps were not consumed (one column)
#endif
dsll L, TEMP, 1 + ZBASE_SHIFT # byte length of those k-steps in A
dsll TEMP, TEMP, ZBASE_SHIFT # byte length of those k-steps in B
daddu AO, AO, L
daddu BO, BO, TEMP
#endif
#ifdef LEFT
daddiu KK, KK, 2
#endif
#endif
daddiu CO1, CO1, 4 * SIZE # step past the two elements just written
daddiu CO2, CO2, 4 * SIZE # NOTE(review): CO2 is not used by the single-column code visible here - confirm whether this is needed
.align 4
/* .L11: M & 1 part of the single-column pass - one last row if M is odd. */
.L11:
andi I, M, 1 # I = M & 1
blez I, .L10
NOP
.align 4
/*
 * .L111: set up the 1x1 tile (one complex row of A against one column of B).
 * Clears C11/C13, preloads A1/A2 from AO ($12) and B1/B2 from BO ($13),
 * builds the PLU copies B3/B4, touches the destination through FETCH, and
 * leaves in L the trip count of the loop at .L1110.
 */
.L111:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move BO, B
#else
dsll TEMP, KK, ZBASE_SHIFT # same byte offset for A and B (1 row, 1 column)
daddu AO, AO, TEMP
daddu BO, B, TEMP
#endif
MTC $0, C11 # clear result registers
gsLQC1(R13, F9, F8, 0) # B1 B2
gsLQC1(R12, F1, F0, 0) # A1 A2
MOV C13, C11
FETCH $0, 0 * SIZE(CO1)
PLU B3, B1, B1
PLU B4, B2, B2
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, K, KK
#elif defined(LEFT)
daddiu TEMP, KK, 1
#else
daddiu TEMP, KK, 1
#endif
dsra L, TEMP, 2 # L = TEMP / 4
blez L, .L112
NOP
#else
move BO, B # Reset B
dsra L, K, 2 # L = K / 4
MTC $0, C11 # clear result registers
gsLQC1(R13, F9, F8, 0) # B1 B2
gsLQC1(R12, F1, F0, 0) # A1 A2
MOV C13, C11
FETCH $0, 0 * SIZE(CO1)
PLU B3, B1, B1
blez L, .L112
PLU B4, B2, B2 # executed in the slot after the branch on both paths
#endif
/*
 * .L1110: main K loop, executed L times.
 *
 * On entry A1/A2, B1/B2 and the PLU copies B3/B4 are already loaded.
 * Each pass:
 *   - loads A3/A4 and B5/B6 through AO/BO with offset field 1,
 *   - builds B7/B8 from B5/B6 with PLU,
 *   - issues four MADPS into C11 (A1*B1, A2*B2, A3*B5, A4*B6) and four
 *     into C13 (A1*B3, A2*B4, A3*B7, A4*B8),
 *   - advances AO and BO by 8 * SIZE each,
 *   - reloads A1/A2, B1/B2 and rebuilds B3/B4 from the advanced pointers,
 *     ready for the next pass or for the remainder code at .L112.
 * The instruction order interleaves loads and arithmetic on purpose; the
 * last PLU sits in the slot after the loop branch.
 * MADPS is assumed to be the paired-single multiply-add that accumulates
 * into its first operand -- confirm in common.h.
 */
.L1110:
daddiu L, L, -1 # one pass consumed
gsLQC1(R13, F13, F12, 1) # B5 B6 (offset field 1)
MADPS C11, C11, A1, B1
gsLQC1(R12, F3, F2, 1) # A3 A4 (offset field 1)
MADPS C13, C13, A1, B3
daddiu BO, BO, 2 * 4 * SIZE # BO += 8 * SIZE
PLU B7, B5, B5
PLU B8, B6, B6
daddiu AO, AO, 2 * 4 * SIZE # AO += 8 * SIZE
MADPS C11, C11, A2, B2
MADPS C13, C13, A2, B4
MADPS C11, C11, A3, B5
MADPS C13, C13, A3, B7
gsLQC1(R13, F9, F8, 0) # B1 B2 from the advanced BO
MADPS C11, C11, A4, B6
gsLQC1(R12, F1, F0, 0) # A1 A2 from the advanced AO
MADPS C13, C13, A4, B8
PLU B3, B1, B1
bgtz L, .L1110
PLU B4, B2, B2
.align 4
/*
 * .L112: K remainder, bit 1.
 * Tests K & 2 (TEMP & 2 in the TRMM build).  When the bit is clear,
 * control goes to .L117.  Otherwise two more multiply-add steps are
 * issued (A1 with B1/B3, then A2 with B2/B4), AO and BO advance by
 * 4 * SIZE, and A1/A2, B1/B2 and B3 are reloaded for .L117.
 * B4 is not rebuilt here: .L117 reads only A1, B1 and B3.
 */
.L112:
#ifndef TRMMKERNEL
andi L, K, 2
#else
andi L, TEMP, 2
#endif
blez L, .L117
NOP
MADPS C11, C11, A1, B1
MADPS C13, C13, A1, B3
daddiu BO, BO, 4 * SIZE
daddiu AO, AO, 4 * SIZE
MADPS C11, C11, A2, B2
MADPS C13, C13, A2, B4
gsLQC1(R13, F9, F8, 0) # B1 B2 from the advanced BO
gsLQC1(R12, F1, F0, 0) # A1 A2 from the advanced AO
PLU B3, B1, B1
.align 4
/*
 * .L117: K remainder, bit 0.
 * Tests K & 1 (TEMP & 1 in the TRMM build).  When the bit is clear,
 * control goes straight to the write-back at .L110.  Otherwise AO and
 * BO advance by 2 * SIZE and one final multiply-add step is issued with
 * the A1, B1 and B3 values already held in registers; nothing is
 * reloaded afterwards.
 */
.L117:
#ifndef TRMMKERNEL
andi L, K, 1
#else
andi L, TEMP, 1
#endif
blez L, .L110
NOP
daddiu BO, BO, 2 * SIZE
daddiu AO, AO, 2 * SIZE
MADPS C11, C11, A1, B1
MADPS C13, C13, A1, B3
.align 4
/*
 * .L110: write back the result of the M & 1 remainder block.
 *
 * Common shape of every variant below:
 *   1. CVTU copies one half of each paired accumulator into a scalar
 *      register: A1 from C11, A3 from C13 (presumably the upper half;
 *      confirm against the CVTU definition in common.h).
 *   2. One group, selected by the NN..CC macros, combines C11 with A1 and
 *      C13 with A3 by ADD/SUB (plus NEG in the last group) so that C11 and
 *      C13 end up holding the two components named in the group comment.
 *   3. alpha_r and alpha_i are read from the stack frame at 152($sp) and
 *      160($sp) into A4 and A2.
 *   4. The scaled result goes to 0 * SIZE(CO1) and 1 * SIZE(CO1):
 *        B1 = C11 * alpha_r - C13 * alpha_i
 *        B2 = C13 * alpha_r + C11 * alpha_i
 *      In the non-TRMM build B1/B2 are first loaded from CO1 and the
 *      products are accumulated onto them; in the TRMM build they are
 *      overwritten (MUL instead of the first MADD).
 *      This reading assumes MADD d,r,s,t = r + s*t and
 *      NMSUB d,r,s,t = r - s*t -- confirm in common.h.
 *
 * I is decremented but not tested again on this path.
 * The TRMM build additionally fixes up AO/BO and KK before leaving.
 * Finally CO1 and CO2 both advance by 2 * SIZE.
 */
.L110: # Write Back
#ifndef TRMMKERNEL
daddiu I, I, -1
CVTU A1, C11
CVTU A3, C13
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
/* (a + bi) * (c + di) */
SUB C11, C11, A1 # C11 = C11 - A1
ADD C13, A3, C13 # C13 = A3 + C13
LD A4, 152($sp) # load alpha_r
LD A2, 160($sp) # load alpha_i
LD B1, 0 * SIZE(CO1)
LD B2, 1 * SIZE(CO1)
MADD B1, B1, C11, A4 # B1 += C11 * alpha_r
MADD B2, B2, C13, A4 # B2 += C13 * alpha_r
NMSUB B1, B1, C13, A2 # B1 -= C13 * alpha_i
MADD B2, B2, C11, A2 # B2 += C11 * alpha_i
ST B1, 0 * SIZE(CO1)
ST B2, 1 * SIZE(CO1)
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
/* (a + bi) * (c - di) */
ADD C11, A1, C11 # C11 = A1 + C11
SUB C13, A3, C13 # C13 = A3 - C13
LD A4, 152($sp) # load alpha_r
LD A2, 160($sp) # load alpha_i
LD B1, 0 * SIZE(CO1)
LD B2, 1 * SIZE(CO1)
MADD B1, B1, C11, A4 # B1 += C11 * alpha_r
MADD B2, B2, C13, A4 # B2 += C13 * alpha_r
NMSUB B1, B1, C13, A2 # B1 -= C13 * alpha_i
MADD B2, B2, C11, A2 # B2 += C11 * alpha_i
ST B1, 0 * SIZE(CO1)
ST B2, 1 * SIZE(CO1)
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
/* (a - bi) * (c + di) */
ADD C11, A1, C11 # C11 = A1 + C11
SUB C13, C13, A3 # C13 = C13 - A3
LD A4, 152($sp) # load alpha_r
LD A2, 160($sp) # load alpha_i
LD B1, 0 * SIZE(CO1)
LD B2, 1 * SIZE(CO1)
MADD B1, B1, C11, A4 # B1 += C11 * alpha_r
MADD B2, B2, C13, A4 # B2 += C13 * alpha_r
NMSUB B1, B1, C13, A2 # B1 -= C13 * alpha_i
MADD B2, B2, C11, A2 # B2 += C11 * alpha_i
ST B1, 0 * SIZE(CO1)
ST B2, 1 * SIZE(CO1)
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
/* (a - bi) * (c - di) */
SUB C11, C11, A1 # C11 = C11 - A1
ADD C13, A3, C13 # C13 = A3 + C13
NEG C13, C13 # C13 = -C13
LD A4, 152($sp) # load alpha_r
LD A2, 160($sp) # load alpha_i
LD B1, 0 * SIZE(CO1)
LD B2, 1 * SIZE(CO1)
MADD B1, B1, C11, A4 # B1 += C11 * alpha_r
MADD B2, B2, C13, A4 # B2 += C13 * alpha_r
NMSUB B1, B1, C13, A2 # B1 -= C13 * alpha_i
MADD B2, B2, C11, A2 # B2 += C11 * alpha_i
ST B1, 0 * SIZE(CO1)
ST B2, 1 * SIZE(CO1)
#endif
#else
/* TRMM build: same combinations, but CO1 is overwritten, not accumulated. */
daddiu I, I, -1
CVTU A1, C11
CVTU A3, C13
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
/* (a + bi) * (c + di) */
SUB C11, C11, A1 # C11 = C11 - A1
ADD C13, A3, C13 # C13 = A3 + C13
LD A4, 152($sp) # load alpha_r
LD A2, 160($sp) # load alpha_i
MUL B1, C11, A4 # B1 = C11 * alpha_r
MUL B2, C13, A4 # B2 = C13 * alpha_r
NMSUB B1, B1, C13, A2 # B1 -= C13 * alpha_i
MADD B2, B2, C11, A2 # B2 += C11 * alpha_i
ST B1, 0 * SIZE(CO1)
ST B2, 1 * SIZE(CO1)
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
/* (a + bi) * (c - di) */
ADD C11, A1, C11 # C11 = A1 + C11
SUB C13, A3, C13 # C13 = A3 - C13
LD A4, 152($sp) # load alpha_r
LD A2, 160($sp) # load alpha_i
MUL B1, C11, A4 # B1 = C11 * alpha_r
MUL B2, C13, A4 # B2 = C13 * alpha_r
NMSUB B1, B1, C13, A2 # B1 -= C13 * alpha_i
MADD B2, B2, C11, A2 # B2 += C11 * alpha_i
ST B1, 0 * SIZE(CO1)
ST B2, 1 * SIZE(CO1)
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
/* (a - bi) * (c + di) */
ADD C11, A1, C11 # C11 = A1 + C11
SUB C13, C13, A3 # C13 = C13 - A3
LD A4, 152($sp) # load alpha_r
LD A2, 160($sp) # load alpha_i
MUL B1, C11, A4 # B1 = C11 * alpha_r
MUL B2, C13, A4 # B2 = C13 * alpha_r
NMSUB B1, B1, C13, A2 # B1 -= C13 * alpha_i
MADD B2, B2, C11, A2 # B2 += C11 * alpha_i
ST B1, 0 * SIZE(CO1)
ST B2, 1 * SIZE(CO1)
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
/* (a - bi) * (c - di) */
SUB C11, C11, A1 # C11 = C11 - A1
ADD C13, A3, C13 # C13 = A3 + C13
NEG C13, C13 # C13 = -C13
LD A4, 152($sp) # load alpha_r
LD A2, 160($sp) # load alpha_i
MUL B1, C11, A4 # B1 = C11 * alpha_r
MUL B2, C13, A4 # B2 = C13 * alpha_r
NMSUB B1, B1, C13, A2 # B1 -= C13 * alpha_i
MADD B2, B2, C11, A2 # B2 += C11 * alpha_i
ST B1, 0 * SIZE(CO1)
ST B2, 1 * SIZE(CO1)
#endif
/*
 * TRMM pointer fix-up: in these two cases AO and BO are both advanced by
 * (K - KK - 1) << ZBASE_SHIFT.  The LEFT and non-LEFT arms subtract the
 * same constant.
 */
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, K, KK
#ifdef LEFT
daddiu TEMP, TEMP, -1
#else
daddiu TEMP, TEMP, -1
#endif
dsll TEMP, TEMP, ZBASE_SHIFT
daddu AO, AO, TEMP
daddu BO, BO, TEMP
#endif
#ifdef LEFT
daddiu KK, KK, 1
#endif
#endif
/* Step both output pointers past the two values just stored. */
daddiu CO1, CO1, 2 * SIZE
daddiu CO2, CO2, 2 * SIZE
.align 4
/*
 * .L10: end of this pass over M.
 * B is set to the current BO.  In the TRMM build without LEFT, KK is
 * advanced by one.  Execution then falls through into the exit code at
 * .L999.
 */
.L10:
move B, BO
#if defined(TRMMKERNEL) && !defined(LEFT)
daddiu KK, KK, 1
#endif
/*
 * .L999: function exit.
 * Reloads the saved registers from the stack frame, releases the
 * STACKSIZE-byte frame and returns through $31.  The nop fills the slot
 * after the jump.
 *
 * Frame offsets used here:
 *     0 ..  48   $16 - $22
 *    56 ..  88   $f24 - $f28
 *    96 .. 112   $23 - $25     (TRMM build only: OFFSET, KK, TEMP)
 *   120 .. 144   $f20 - $f23   (only when __64BIT__ is not defined)
 * The offsets for $16-$22 and $f24-$f26 match the stores visible at the
 * top of the file.
 *
 * NOTE(review): the register map at the top of the file also names
 * $f29-$f31 (C34, C43, C44), which are not restored here -- confirm they
 * are either unused or do not need preserving under the target ABI.
 * NOTE(review): the access width of LD/ST comes from common.h; since the
 * kernel writes these registers with paired-single operations, confirm
 * the save and restore cover the full register width.
 */
.L999:
ld $16, 0($sp)
ld $17, 8($sp)
ld $18, 16($sp)
ld $19, 24($sp)
ld $20, 32($sp)
ld $21, 40($sp)
ld $22, 48($sp)
LD $f24, 56($sp)
LD $f25, 64($sp)
LD $f26, 72($sp)
LD $f27, 80($sp)
LD $f28, 88($sp)
#if defined(TRMMKERNEL)
ld $23, 96($sp)
ld $24, 104($sp)
ld $25, 112($sp)
#endif
#ifndef __64BIT__
LD $f20,120($sp)
LD $f21,128($sp)
LD $f22,136($sp)
LD $f23,144($sp)
#endif
daddiu $sp,$sp,STACKSIZE
j $31
nop
EPILOGUE