#define REALNAME ASMNAME
#define ASSEMBLER
#include "common.h"
/*
 * FETCH is an ordinary doubleword load.  Every use visible in this file
 * names $0 as the destination, so the loaded value is discarded; the access
 * serves only to touch the addressed cache line ahead of time.
 */
#define FETCH ld
/*
 * gsLQC1 / gsSQC1 emit a hand-encoded instruction word instead of a mnemonic.
 *   base   : NUMBER of the integer register holding the address (R8, R9, ...)
 *   fq, ft : NUMBERS of two floating-point registers (F0 .. F31)
 *   offset : small index stored from bit 6 upward
 * At the gsLQC1 call sites in this file the macro fills a register pair from
 * two consecutive elements: ft gets the first element and fq the second
 * (e.g. gsLQC1(R8,F1,F0,0) is annotated "a0,a1"), and offset n addresses the
 * pair that starts 2*n elements past base.
 * gsSQC1 differs only in the major opcode field (0x3A instead of 0x32);
 * presumably it is the matching paired store - it is not used in the code
 * visible here.
 * The arguments are pasted into the shifts without parentheses, so pass
 * plain numeric constants only.
 */
#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
/*
 * Integer register roles.
 *
 * M, N, K, A, B, C and LDC are read at entry before anything writes them,
 * i.e. they are the incoming arguments.  M counts elements along the
 * contiguous direction of C, N counts LDC-strided columns, K is the inner
 * (accumulation) length.  After their values are copied to MCO / NCO / KCO
 * the registers M, N and K are reused as loop counters.
 */
#define M $4
#define N $5
#define K $6
/* A and B are the running read pointers into the two packed operands. */
#define A $8
#define B $9
/* C points at the start of the next column panel; LDC is scaled to bytes at entry. */
#define C $10
#define LDC $11
/* AO / BO hold the base of the current A / B panel so A and B can be reset. */
#define AO $12
#define BO $13
/* CO1..CO4 are the write pointers for up to four consecutive columns of C. */
#define CO1 $14
#define CO2 $15
#define CO3 $16
#define CO4 $17
/* Saved copies of the original K, M and N. */
#define KCO $18
#define MCO $19
#define NCO $20
/* PREB = BO + SPANB and PREA = AO + SPANA are the addresses touched by FETCH. */
#define SPANB $21
#define PREB $23
#define PREA $24
#define SPANA $25
/*
 * Scalar multiplier.  $f15 is also b7, so the value is spilled to 152($sp)
 * at entry and reloaded before every write-back.
 */
#define ALPHA $f15
#if defined(TRMMKERNEL)
/*
 * TRMM only: OFFSET is loaded from the stack at entry, KK is the running
 * offset derived from it, TEMP is scratch for length / pointer arithmetic.
 */
#define OFFSET $2
#define KK $3
#define TEMP $7
#endif
/*
 * Plain integer register numbers for the 'base' field of gsLQC1 / gsSQC1,
 * which need a number rather than a $-prefixed name.
 * R8 = $8 (A), R9 = $9 (B), R14..R17 = $14..$17 (CO1..CO4).
 * Only R8 and R9 are used in the code visible here.
 */
#define R8 8
#define R9 9
#define R14 14
#define R15 15
#define R16 16
#define R17 17
/*
 * Accumulators ($f16 - $f31).  t<r><c> collects the products of A operand
 * number r-1 with B operand number c-1 (t21 += a1*b0, t12 += a0*b1, ...)
 * and is finally stored at (r-1)*SIZE(CO<c>), i.e. r is the position inside
 * a column of C and c selects the column.
 */
#define t11 $f30
#define t21 $f31
#define t31 $f28
#define t41 $f29
#define t12 $f26
#define t22 $f27
#define t32 $f24
#define t42 $f25
#define t13 $f22
#define t23 $f23
#define t33 $f20
#define t43 $f21
#define t14 $f18
#define t24 $f19
#define t34 $f16
#define t44 $f17
/*
 * Registers that receive the existing contents of C in the non-TRMM
 * write-back: c<r><c> is loaded from (r-1)*SIZE(CO<c>).
 * They occupy $f0 - $f14, the same registers as a0-a7 / b0-b6, which are no
 * longer needed once the k loops have finished.
 * Only fifteen registers are available ($f15 is ALPHA), so c44 shares $f0
 * with c11.  This works because the 4x4 write-back consumes c11 before it
 * loads c44.
 */
#define c11 $f0
#define c21 $f1
#define c31 $f2
#define c41 $f3
#define c12 $f4
#define c22 $f5
#define c32 $f6
#define c42 $f7
#define c13 $f8
#define c23 $f9
#define c33 $f10
#define c43 $f11
#define c14 $f12
#define c24 $f13
#define c34 $f14
#define c44 $f0
/*
 * Operand registers for the k loops.
 * a0-a7 ($f0 - $f7) hold elements of A, b0-b7 ($f8 - $f15) elements of B.
 * The unrolled loops alternate between two register sets: one set feeds the
 * multiply-adds of the current k-step while the other is being loaded for
 * the next one.
 * b7 is $f15, the register that carries ALPHA on entry; that is why ALPHA
 * is kept in the stack frame while the k loops run.
 */
#define a0 $f0
#define a1 $f1
#define a2 $f2
#define a3 $f3
#define a4 $f4
#define a5 $f5
#define a6 $f6
#define a7 $f7
#define b0 $f8
#define b1 $f9
#define b2 $f10
#define b3 $f11
#define b4 $f12
#define b5 $f13
#define b6 $f14
#define b7 $f15
/*
 * Plain floating-point register numbers (Fn = n) for the 'fq' and 'ft'
 * fields of gsLQC1 / gsSQC1, which need a number rather than a $f name.
 */
#define F31 31
#define F30 30
#define F29 29
#define F28 28
#define F27 27
#define F26 26
#define F25 25
#define F24 24
#define F23 23
#define F22 22
#define F21 21
#define F20 20
#define F19 19
#define F18 18
#define F17 17
#define F16 16
#define F15 15
#define F14 14
#define F13 13
#define F12 12
#define F11 11
#define F10 10
#define F9 9
#define F8 8
#define F7 7
#define F6 6
#define F5 5
#define F4 4
#define F3 3
#define F2 2
#define F1 1
#define F0 0
/*
 * Function entry: reserve a 160-byte stack frame and save registers.
 * Frame layout (byte offsets from the new $sp):
 *     0 ..  48   $16 - $22
 *    56 ..  88   $f24 - $f28
 *    96 .. 112   $23 - $25
 *   120 .. 144   $f20 - $f23
 *   152          spill slot for ALPHA (written right after this block)
 * NOTE(review): the kernel also writes $f29, $f30 and $f31 (t41, t11, t21),
 * which are not saved here - confirm against the callee-saved set of the
 * target ABI and against the restore sequence at the end of the routine.
 * $22 is saved although no register alias visible in this file maps to it.
 */
PROLOGUE
daddiu $sp, $sp, -160
sd $16, 0($sp)
sd $17, 8($sp)
sd $18, 16($sp)
sd $19, 24($sp)
sd $20, 32($sp)
sd $21, 40($sp)
sd $22, 48($sp)
ST $f24, 56($sp)
ST $f25, 64($sp)
ST $f26, 72($sp)
ST $f27, 80($sp)
ST $f28, 88($sp)
sd $23, 96($sp)
sd $24, 104($sp)
sd $25, 112($sp)
ST $f20,120($sp)
ST $f21,128($sp)
ST $f22,136($sp)
ST $f23,144($sp)
/*
 * One-time setup: spill ALPHA, keep copies of the original M / N / K / A / B,
 * convert LDC to a byte stride and derive the prefetch distances.
 * Falls through into the 4-column panel loop, or branches to the 2-column
 * remainder when there is no full 4-column panel.
 */
.align 5
.L0_N4: # setup, then loop over 4-column panels
ST ALPHA,152($sp) # spill ALPHA, its register ($f15) is reused as b7 in the k loops
move MCO,M # keep the original M, M becomes a loop counter
move NCO,N # keep the original N, N becomes a loop counter
move KCO,K # keep the original K, K becomes a loop counter
move AO,A # AO = base of packed A, used to reset A for every column panel
dsra N,NCO,2 # N = NCO/4, the number of full 4-column panels
dsll LDC,LDC,BASE_SHIFT # scale LDC by 2^BASE_SHIFT (element count to byte stride)
dsll SPANB,KCO,2+BASE_SHIFT # SPANB = KCO*4 elements, scaled to bytes (one 4-column B panel)
#if defined(TRMMKERNEL)
LDARG OFFSET,160($sp) # OFFSET is a stack argument located just above this 160-byte frame
#endif
#if defined(TRMMKERNEL) && !defined(LEFT)
neg KK,OFFSET # not LEFT: KK starts at -OFFSET and grows by the panel width per panel
#endif
move BO,B # BO = base of the current B panel
beq N,$0,.L0_N2 # N == 0 means NCO < 4, go to the 2-column remainder
dsll SPANA,KCO,1+BASE_SHIFT # branch delay slot, always executed: SPANA = KCO*2 elements, scaled to bytes
/*
 * Start of one 4-column panel: point CO1..CO4 at four consecutive columns
 * of C, rewind A to the start of packed A, set up both prefetch pointers and
 * compute the number of full 4-row tiles.
 */
.L0_N4_Lb: # mr=4,nr=4
move CO1,C # CO1 = first column of this panel
dsra M,MCO,2 # M = MCO/4, the number of full 4-row tiles
move A,AO # Reset A
daddu CO2,C,LDC # CO2..CO4 = following columns, one LDC stride apart
daddu PREB,BO,SPANB # PREB = BO + SPANB, one panel length past the current B panel
daddu CO3,CO2,LDC
daddu PREA,AO,SPANA # PREA = AO + SPANA
daddu CO4,CO3,LDC
#if defined(TRMMKERNEL) && defined(LEFT)
move KK,OFFSET # LEFT: KK restarts from OFFSET for every column panel
#endif
beqz M,.L14_M2 # no full 4-row tile, go to the 2-row remainder
daddu C,CO4,LDC # branch delay slot, always executed: C = first column of the next panel
/*
 * Start of one 4x4 tile: position A and B, clear the sixteen accumulators
 * (t11 is zeroed from integer $0, the others are copied from it), preload
 * a0-a3 / b0-b3 for the first k-step and compute K = number of 4-step
 * passes of the unrolled loop.  Branches to .L15 when there is no full pass.
 * In the TRMM build TEMP holds the effective inner length for this tile and
 * is reused by the remainder tests that follow the main loop.
 */
.L10:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B,BO # (SIDE=L and UPLO=L) or (SIDE=R and UPLO=U): start at the beginning of the B panel
#else
dsll K,KK,2 + BASE_SHIFT # K = KK*4 elements, scaled to bytes: amount to skip in A
dsll TEMP,KK,2 + BASE_SHIFT # TEMP = KK*4 elements, scaled to bytes: amount to skip in B
daddu A,A,K # advance A and B to the data part
daddu B,BO,TEMP
#endif
MTC $0,t11
MOV t21,t11
gsLQC1(R8,F1,F0,0) # a0,a1
MOV t31,t11
MOV t41,t11
gsLQC1(R9,F9,F8,0) # b0,b1
MOV t12,t11
MOV t22,t11
gsLQC1(R8,F3,F2,1) # a2,a3
MOV t32,t11
MOV t42,t11
gsLQC1(R9,F11,F10,1) # b2,b3
MOV t13,t11
MOV t23,t11
MOV t33,t11
MOV t43,t11
MOV t14,t11
MOV t24,t11
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP,KCO,KK # TEMP = KCO - KK, the length of the data part
#elif defined(LEFT)
daddiu TEMP, KK, 4 # S=L,U=L: TEMP = KK + 4 (tile height)
#else
daddiu TEMP, KK, 4 # S=R,U=U: TEMP = KK + 4 (panel width)
#endif
dsra K,TEMP,2 # K = TEMP/4, the number of 4-step passes
MOV t34,t11
beqz K,.L15
MOV t44,t11 # branch delay slot, always executed: clears the last accumulator
#else
move B,BO # Reset B
MTC $0,t11 # GEMM part NR=4,MR=4
gsLQC1(R8,F1,F0,0) # a0,a1
MOV t21,t11
MOV t31,t11
gsLQC1(R9,F9,F8,0) # b0,b1
MOV t41,t11
MOV t12,t11
gsLQC1(R8,F3,F2,1) # a2,a3
MOV t22,t11
MOV t32,t11
gsLQC1(R9,F11,F10,1) # b2,b3
MOV t42,t11
dsra K,KCO,2 # K = KCO/4, the number of 4-step passes
MOV t13,t11
MOV t23,t11
MOV t33,t11
MOV t43,t11
MOV t14,t11
MOV t24,t11
MOV t34,t11
beqz K,.L15
MOV t44,t11 # branch delay slot, always executed: all 16 accumulators are now clear
#endif
/*
 * 4x4 tile, main loop: four k-steps per pass, K passes in total.
 * Each of .L11 / .L12 / .L13 / .L14 performs the sixteen multiply-adds of
 * one k-step (MADD d,c,x,y is used here as d = c + x*y) while the paired
 * loads interleaved with them fetch the operands of the NEXT k-step into
 * the other register set:
 *   .L11 and .L13 compute with a0-a3 / b0-b3 and load a4-a7 / b4-b7,
 *   .L12 and .L14 compute with a4-a7 / b4-b7 and load a0-a3 / b0-b3.
 * A and B advance by 16 elements in .L13, so the loads in .L14 use offsets
 * 0 and 1 relative to the already advanced pointers.  On exit a0-a3 / b0-b3
 * therefore hold the operands of the first remaining k-step.
 * FETCH touches memory at PREA / PREB, which advance by 16 elements per pass.
 */
.align 5
.L11: # kr=4, k-step 1 of 4
gsLQC1(R8,F5,F4,2) # a4,a5 for k-step 2
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
gsLQC1(R9,F13,F12,2) # b4,b5 for k-step 2
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
gsLQC1(R8,F7,F6,3) # a6,a7 for k-step 2
MADD t31,t31,a2,b0
MADD t41,t41,a3,b0
gsLQC1(R9,F15,F14,3) # b6,b7 for k-step 2 (b7 overwrites the register that held ALPHA)
MADD t32,t32,a2,b1
MADD t42,t42,a3,b1
FETCH $0,(PREB)
MADD t13,t13,a0,b2
MADD t23,t23,a1,b2
MADD t14,t14,a0,b3
MADD t24,t24,a1,b3
FETCH $0,(PREA)
MADD t33,t33,a2,b2
MADD t43,t43,a3,b2
MADD t34,t34,a2,b3
MADD t44,t44,a3,b3
.L12: # k-step 2 of 4
gsLQC1(R8,F1,F0,4) # a0,a1 for k-step 3
MADD t11,t11,a4,b4
MADD t21,t21,a5,b4
gsLQC1(R9,F9,F8,4) # b0,b1 for k-step 3
MADD t12,t12,a4,b5
MADD t22,t22,a5,b5
gsLQC1(R8,F3,F2,5) # a2,a3 for k-step 3
MADD t31,t31,a6,b4
MADD t41,t41,a7,b4
gsLQC1(R9,F11,F10,5) # b2,b3 for k-step 3
MADD t32,t32,a6,b5
MADD t42,t42,a7,b5
FETCH $0,4*SIZE(PREB)
MADD t13,t13,a4,b6
MADD t23,t23,a5,b6
MADD t14,t14,a4,b7
MADD t24,t24,a5,b7
FETCH $0,4*SIZE(PREA)
MADD t33,t33,a6,b6
MADD t43,t43,a7,b6
MADD t34,t34,a6,b7
MADD t44,t44,a7,b7
.L13: # k-step 3 of 4
gsLQC1(R8,F5,F4,6) # a4,a5 for k-step 4
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
gsLQC1(R9,F13,F12,6) # b4,b5 for k-step 4
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
gsLQC1(R8,F7,F6,7) # a6,a7 for k-step 4
MADD t31,t31,a2,b0
MADD t41,t41,a3,b0
gsLQC1(R9,F15,F14,7) # b6,b7 for k-step 4
MADD t32,t32,a2,b1
MADD t42,t42,a3,b1
daddu A,A,16*SIZE # 4mr*4kr
FETCH $0,8*SIZE(PREB)
MADD t13,t13,a0,b2
MADD t23,t23,a1,b2
daddu B,B,16*SIZE # 4nr*4kr
MADD t14,t14,a0,b3
MADD t24,t24,a1,b3
FETCH $0,8*SIZE(PREA)
MADD t33,t33,a2,b2
MADD t43,t43,a3,b2
MADD t34,t34,a2,b3
MADD t44,t44,a3,b3
.L14: # k-step 4 of 4
gsLQC1(R8,F1,F0,0) # a0,a1 for the next pass or the remainder code (A already advanced)
MADD t11,t11,a4,b4
MADD t21,t21,a5,b4
gsLQC1(R9,F9,F8,0) # b0,b1 likewise (B already advanced)
MADD t12,t12,a4,b5
MADD t22,t22,a5,b5
gsLQC1(R8,F3,F2,1) # a2,a3
MADD t31,t31,a6,b4
MADD t41,t41,a7,b4
daddiu K,K,-1 # one pass done
gsLQC1(R9,F11,F10,1) # b2,b3
MADD t32,t32,a6,b5
MADD t42,t42,a7,b5
FETCH $0,12*SIZE(PREB)
MADD t13,t13,a4,b6
MADD t23,t23,a5,b6
FETCH $0,12*SIZE(PREA)
MADD t14,t14,a4,b7
MADD t24,t24,a5,b7
MADD t33,t33,a6,b6
MADD t43,t43,a7,b6
daddu PREB,PREB,16*SIZE
MADD t34,t34,a6,b7
MADD t44,t44,a7,b7
bnez K,.L11
daddu PREA,PREA,16*SIZE # branch delay slot, always executed
.L15: # kr=2
#ifndef TRMMKERNEL
andi K,KCO,2
#else
andi K,TEMP, 2
#endif
beqz K,.L18
nop
.L16:
gsLQC1(R8,F5,F4,2)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
gsLQC1(R9,F13,F12,2)
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
gsLQC1(R8,F7,F6,3)
MADD t31,t31,a2,b0
MADD t41,t41,a3,b0
gsLQC1(R9,F15,F14,3)
MADD t32,t32,a2,b1
MADD t42,t42,a3,b1
daddu A,A,8*SIZE # 4mr*2kr
FETCH $0,0(PREB)
MADD t13,t13,a0,b2
MADD t23,t23,a1,b2
daddu B,B,8*SIZE # 4nr*2kr
FETCH $0,0(PREA)
MADD t14,t14,a0,b3
MADD t24,t24,a1,b3
MADD t33,t33,a2,b2
MADD t43,t43,a3,b2
MADD t34,t34,a2,b3
MADD t44,t44,a3,b3
.L17:
gsLQC1(R8,F1,F0,0)
MADD t11,t11,a4,b4
MADD t21,t21,a5,b4
gsLQC1(R9,F9,F8,0)
MADD t12,t12,a4,b5
MADD t22,t22,a5,b5
gsLQC1(R8,F3,F2,1)
MADD t31,t31,a6,b4
MADD t41,t41,a7,b4
gsLQC1(R9,F11,F10,1)
MADD t32,t32,a6,b5
MADD t42,t42,a7,b5
FETCH $0,4*SIZE(PREB)
MADD t13,t13,a4,b6
MADD t23,t23,a5,b6
FETCH $0,4*SIZE(PREA)
MADD t14,t14,a4,b7
MADD t24,t24,a5,b7
daddu PREB,PREB,8*SIZE
MADD t33,t33,a6,b6
MADD t43,t43,a7,b6
daddu PREA,PREA,8*SIZE
MADD t34,t34,a6,b7
MADD t44,t44,a7,b7
.L18: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
beqz K,.L19
LD ALPHA,152($sp) # Get ALPHA
FETCH $0,0(PREB)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
daddu A,A,4*SIZE # 4mr*kr
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
daddu B,B,4*SIZE # 4nr*kr
FETCH $0,0(PREA)
MADD t31,t31,a2,b0
MADD t41,t41,a3,b0
daddu PREB,PREB,4*SIZE
MADD t32,t32,a2,b1
MADD t42,t42,a3,b1
daddu PREA,PREA,4*SIZE
MADD t13,t13,a0,b2
MADD t23,t23,a1,b2
MADD t14,t14,a0,b3
MADD t24,t24,a1,b3
MADD t33,t33,a2,b2
MADD t43,t43,a3,b2
MADD t34,t34,a2,b3
MADD t44,t44,a3,b3
.L19: # Write Back to C
#ifndef TRMMKERNEL
LD c11,0(CO1) # GEMM write part
LD c21,1*SIZE(CO1) # get 16 C
LD c31,2*SIZE(CO1)
LD c41,3*SIZE(CO1)
LD c12,0(CO2)
MADD t11,c11,t11,ALPHA
LD c22,1*SIZE(CO2)
MADD t21,c21,t21,ALPHA
LD c32,2*SIZE(CO2)
MADD t31,c31,t31,ALPHA
LD c42,3*SIZE(CO2)
MADD t41,c41,t41,ALPHA
LD c13,0(CO3)
MADD t12,c12,t12,ALPHA
LD c23,1*SIZE(CO3)
MADD t22,c22,t22,ALPHA
LD c33,2*SIZE(CO3)
MADD t32,c32,t32,ALPHA
LD c43,3*SIZE(CO3)
MADD t42,c42,t42,ALPHA
LD c14,0(CO4)
MADD t13,c13,t13,ALPHA
LD c24,1*SIZE(CO4)
MADD t23,c23,t23,ALPHA
LD c34,2*SIZE(CO4)
MADD t33,c33,t33,ALPHA
LD c44,3*SIZE(CO4)
MADD t43,c43,t43,ALPHA
ST t11,0(CO1)
MADD t14,c14,t14,ALPHA
ST t21,1*SIZE(CO1)
MADD t24,c24,t24,ALPHA
ST t31,2*SIZE(CO1)
MADD t34,c34,t34,ALPHA
ST t41,3*SIZE(CO1)
MADD t44,c44,t44,ALPHA
daddiu M,M,-1 # M--
ST t12,0(CO2)
ST t22,1*SIZE(CO2)
ST t32,2*SIZE(CO2)
ST t42,3*SIZE(CO2)
ST t13,0(CO3)
ST t23,1*SIZE(CO3)
ST t33,2*SIZE(CO3)
ST t43,3*SIZE(CO3)
FETCH $0,4*SIZE(CO1)
FETCH $0,4*SIZE(CO2)
FETCH $0,4*SIZE(CO3)
FETCH $0,4*SIZE(CO4)
FETCH $0,8*SIZE(CO1)
FETCH $0,8*SIZE(CO2)
FETCH $0,8*SIZE(CO3)
FETCH $0,8*SIZE(CO4)
ST t14,0(CO4)
daddu CO1,CO1,4*SIZE # COi += 4
ST t24,1*SIZE(CO4)
daddu CO2,CO2,4*SIZE
ST t34,2*SIZE(CO4)
daddu CO3,CO3,4*SIZE
ST t44,3*SIZE(CO4)
daddu PREB,BO,SPANB
bnez M,.L10
daddu CO4,CO4,4*SIZE
#else
MUL t11, ALPHA, t11 # TRMM write back part
MUL t21, ALPHA, t21
MUL t31, ALPHA, t31
MUL t41, ALPHA, t41
ST t11, 0 * SIZE(CO1)
MUL t12, ALPHA, t12
ST t21, 1 * SIZE(CO1)
MUL t22, ALPHA, t22
ST t31, 2 * SIZE(CO1)
MUL t32, ALPHA, t32
ST t41, 3 * SIZE(CO1)
MUL t42, ALPHA, t42
ST t12, 0 * SIZE(CO2)
MUL t13, ALPHA, t13
ST t22, 1 * SIZE(CO2)
MUL t23, ALPHA, t23
ST t32, 2 * SIZE(CO2)
MUL t33, ALPHA, t33
ST t42, 3 * SIZE(CO2)
MUL t43, ALPHA, t43
ST t13, 0 * SIZE(CO3)
MUL t14, ALPHA, t14
ST t23, 1 * SIZE(CO3)
MUL t24, ALPHA, t24
ST t33, 2 * SIZE(CO3)
MUL t34, ALPHA, t34
ST t43, 3 * SIZE(CO3)
MUL t44, ALPHA, t44
ST t14, 0 * SIZE(CO4)
daddiu M,M,-1 # M--
ST t24, 1 * SIZE(CO4)
ST t34, 2 * SIZE(CO4)
ST t44, 3 * SIZE(CO4)
daddiu CO1,CO1, 4 * SIZE
daddiu CO2,CO2, 4 * SIZE
daddiu CO3,CO3, 4 * SIZE
daddiu CO4,CO4, 4 * SIZE
FETCH $0,4*SIZE(CO1)
FETCH $0,4*SIZE(CO2)
FETCH $0,4*SIZE(CO3)
FETCH $0,4*SIZE(CO4)
FETCH $0,0(CO1)
FETCH $0,0(CO2)
FETCH $0,0(CO3)
FETCH $0,0(CO4)
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
dsubu TEMP,KCO,KK
#ifdef LEFT
daddiu TEMP,TEMP, -4
#else
daddiu TEMP,TEMP, -4
#endif
dsll K,TEMP,2 + BASE_SHIFT
dsll TEMP,TEMP,2 + BASE_SHIFT
daddu A,A,K # mov A to the end of panel Ai
daddu B,B,TEMP # mov B to the end of panel Bj
#endif
#ifdef LEFT
daddiu KK, KK,4
#endif
bnez M,.L10
nop
#endif
.align 3
.L14_M2:
andi M, MCO, 2 # nr=4,mr=2
beqz M,.L14_M1
nop
.L20:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B,BO # Reset B
#else
dsll K,KK,1 + BASE_SHIFT # mr=2
dsll TEMP,KK,2 + BASE_SHIFT # nr=4
daddu A,A,K
daddu B,BO,TEMP
#endif
MTC $0,t11
MOV t21,t11
gsLQC1(R8,F1,F0,0) # a0,a1
MOV t12,t11
MOV t22,t11
gsLQC1(R9,F9,F8,0) # b0,b1
MOV t13,t11
MOV t23,t11
gsLQC1(R9,F11,F10,1) # b2,b3
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP,KCO,KK
#elif defined(LEFT)
daddiu TEMP,KK,2 # left part,controlled by mr, mr=2
#else
daddiu TEMP,KK,4 # right part,controlled by nr,nr=4
#endif
dsra K,TEMP,2
MOV t14,t11
beqz K,.L25
MOV t24,t11 # clear 2*4=8 results registers
#else
move B,BO # Reset B
MTC $0,t11
gsLQC1(R8,F1,F0,0)
MOV t21,t11
MOV t12,t11
gsLQC1(R9,F9,F8,0)
MOV t22,t11
dsra K,KCO,2
gsLQC1(R9,F11,F10,1)
MOV t13,t11
MOV t23,t11
MOV t14,t11
beqz K,.L25
MOV t24,t11
#endif
.L21: # nr=4,mr=2,kr=4
gsLQC1(R8,F5,F4,1)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
gsLQC1(R9,F13,F12,2)
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
gsLQC1(R9,F15,F14,3)
MADD t13,t13,a0,b2
MADD t23,t23,a1,b2
MADD t14,t14,a0,b3
MADD t24,t24,a1,b3
gsLQC1(R8,F3,F2,2)
MADD t11,t11,a4,b4
MADD t21,t21,a5,b4
gsLQC1(R9,F9,F8,4)
MADD t12,t12,a4,b5
MADD t22,t22,a5,b5
gsLQC1(R9,F11,F10,5)
MADD t13,t13,a4,b6
MADD t23,t23,a5,b6
MADD t14,t14,a4,b7
MADD t24,t24,a5,b7
daddiu K,K,-1
gsLQC1(R8,F7,F6,3)
MADD t11,t11,a2,b0
MADD t21,t21,a3,b0
gsLQC1(R9,F13,F12,6)
MADD t12,t12,a2,b1
MADD t22,t22,a3,b1
gsLQC1(R9,F15,F14,7)
MADD t13,t13,a2,b2
MADD t23,t23,a3,b2
daddu A,A,8*SIZE # 2mr*4kr
MADD t14,t14,a2,b3
MADD t24,t24,a3,b3
daddu B,B,16*SIZE # 4nr*4kr
gsLQC1(R8,F1,F0,0)
MADD t11,t11,a6,b4
MADD t21,t21,a7,b4
gsLQC1(R9,F9,F8,0)
MADD t12,t12,a6,b5
MADD t22,t22,a7,b5
gsLQC1(R9,F11,F10,1)
MADD t13,t13,a6,b6
MADD t23,t23,a7,b6
MADD t14,t14,a6,b7
bnez K,.L21
MADD t24,t24,a7,b7
.L25:
#ifndef TRMMKERNEL
andi K,KCO,2 # kr=2
#else
andi K,TEMP,2
#endif
beqz K,.L28
nop
.L26:
gsLQC1(R8,F5,F4,1)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
gsLQC1(R9,F13,F12,2)
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
gsLQC1(R9,F15,F14,3)
MADD t13,t13,a0,b2
MADD t23,t23,a1,b2
daddu A,A,4*SIZE # 2mr*2kr
MADD t14,t14,a0,b3
MADD t24,t24,a1,b3
daddu B,B,8*SIZE # 4nr*2kr
.L27:
gsLQC1(R8,F1,F0,0)
MADD t11,t11,a4,b4
MADD t21,t21,a5,b4
gsLQC1(R9,F9,F8,0)
MADD t12,t12,a4,b5
MADD t22,t22,a5,b5
gsLQC1(R9,F11,F10,1)
MADD t13,t13,a4,b6
MADD t23,t23,a5,b6
MADD t14,t14,a4,b7
MADD t24,t24,a5,b7
.L28: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
beqz K,.L29
LD ALPHA,152($sp) # Get ALPHA
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
daddu A,A,2*SIZE # 2mr*kr
daddu B,B,4*SIZE # 4nr*kr
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
MADD t13,t13,a0,b2
MADD t23,t23,a1,b2
MADD t14,t14,a0,b3
MADD t24,t24,a1,b3
.L29: # Write Back to C
#ifndef TRMMKERNEL
LD c11,0(CO1) # GEMM write back part
LD c21,1*SIZE(CO1)
LD c12,0(CO2)
LD c22,1*SIZE(CO2)
LD c13,0(CO3)
MADD t11,c11,t11,ALPHA
LD c23,1*SIZE(CO3)
MADD t21,c21,t21,ALPHA
LD c14,0(CO4)
MADD t12,c12,t12,ALPHA
LD c24,1*SIZE(CO4)
MADD t22,c22,t22,ALPHA
ST t11,0(CO1)
MADD t13,c13,t13,ALPHA
ST t21,1*SIZE(CO1)
MADD t23,c23,t23,ALPHA
ST t12,0(CO2)
MADD t14,c14,t14,ALPHA
ST t22,1*SIZE(CO2)
MADD t24,c24,t24,ALPHA
ST t13,0(CO3)
daddu CO1,CO1,2*SIZE # COi += 2
ST t23,1*SIZE(CO3)
daddu CO2,CO2,2*SIZE
ST t14,0(CO4)
daddu CO3,CO3,2*SIZE
ST t24,1*SIZE(CO4)
daddu CO4,CO4,2*SIZE
FETCH $0,0(CO1)
FETCH $0,0(CO2)
FETCH $0,0(CO3)
FETCH $0,0(CO4)
#else
MUL t11, ALPHA, t11 # TRMM write back part
MUL t21, ALPHA, t21
ST t11, 0 * SIZE(CO1)
MUL t12, ALPHA, t12
ST t21, 1 * SIZE(CO1)
MUL t22, ALPHA, t22
ST t12, 0 * SIZE(CO2)
MUL t13, ALPHA, t13
ST t22, 1 * SIZE(CO2)
MUL t23, ALPHA, t23
ST t13, 0 * SIZE(CO3)
MUL t14, ALPHA, t14
ST t23, 1 * SIZE(CO3)
MUL t24, ALPHA, t24
ST t14, 0 * SIZE(CO4)
ST t24, 1 * SIZE(CO4)
daddiu CO1,CO1, 2 * SIZE
daddiu CO2,CO2, 2 * SIZE
daddiu CO3,CO3, 2 * SIZE
daddiu CO4,CO4, 2 * SIZE
FETCH $0,0(CO1)
FETCH $0,0(CO2)
FETCH $0,0(CO3)
FETCH $0,0(CO4)
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
dsubu TEMP,KCO,KK
#ifdef LEFT
daddiu TEMP,TEMP,-2
#else
daddiu TEMP,TEMP,-4
#endif
dsll K,TEMP,1 + BASE_SHIFT
dsll TEMP,TEMP,2 + BASE_SHIFT
daddu A,A,K # move A to next panel Ai
daddu B,B,TEMP # move B to next panel Bj
#endif
#ifdef LEFT
daddiu KK, KK, 2
#endif
#endif
.align 3
.L14_M1:
andi M,MCO,1 # mr=1
beqz M,.L0_N4_Loop # M = 0, finishing one panel Bj
nop
.L30:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B,BO # Reset B
#else
dsll K,KK, 0 + BASE_SHIFT
dsll TEMP,KK,2 + BASE_SHIFT
daddu A,A,K
daddu B,BO,TEMP
#endif
MTC $0,t11
MOV t12,t11
LD a0, 0 * SIZE(A) # a0
MOV t13,t11
gsLQC1(R9,F9,F8,0) # b0,b1
MOV t14,t11 # clear result registers
gsLQC1(R9,F11,F10,1) # b2,b3
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, KCO, KK
#elif defined(LEFT)
daddiu TEMP, KK, 1
#else
daddiu TEMP, KK, 4
#endif
dsra K,TEMP, 2
nop
beqz K,.L35
nop
#else
move B,BO # Reset B, GEMM part
dsra K,KCO,2 # K=KCO/2
LD a0, 0 * SIZE(A) # a0
MTC $0,t11
MOV t12,t11
gsLQC1(R9,F9,F8,0) # b0,b1
MOV t13,t11
MOV t14,t11
gsLQC1(R9,F11,F10,1) # b2,b3
beqz K,.L35
nop
#endif
.L31: # nr=4,mr=1,kr=4
LD a1, 1*SIZE(A) # load a1
MADD t11,t11,a0,b0
gsLQC1(R9,F13,F12,2) # b4,b5
MADD t12,t12,a0,b1
gsLQC1(R9,F15,F14,3) # b6,b7
MADD t13,t13,a0,b2
MADD t14,t14,a0,b3
LD a2, 2*SIZE(A) # a2
MADD t11,t11,a1,b4
gsLQC1(R9,F9,F8,4)
MADD t12,t12,a1,b5
gsLQC1(R9,F11,F10,5)
MADD t13,t13,a1,b6
MADD t14,t14,a1,b7
daddiu K,K,-1
LD a3, 3*SIZE(A) # a3
MADD t11,t11,a2,b0
gsLQC1(R9,F13,F12,6)
MADD t12,t12,a2,b1
daddu A,A,4*SIZE # 1mr*4kr
gsLQC1(R9,F15,F14,7)
MADD t13,t13,a2,b2
MADD t14,t14,a2,b3
daddu B,B,16*SIZE # 4nr*4kr
LD a0, 0*SIZE(A) # a0
MADD t11,t11,a3,b4
gsLQC1(R9,F9,F8,0)
MADD t12,t12,a3,b5
gsLQC1(R9,F11,F10,1)
MADD t13,t13,a3,b6
bnez K,.L31
MADD t14,t14,a3,b7
.L35: # kr=2
#ifndef TRMMKERNEL
andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L38
nop
.L36:
LD a1,1*SIZE(A) # load a1
MADD t11,t11,a0,b0
gsLQC1(R9,F13,F12,2)
MADD t12,t12,a0,b1
daddu A,A,2*SIZE # mr*2kr
gsLQC1(R9,F15,F14,3)
MADD t13,t13,a0,b2
MADD t14,t14,a0,b3
daddu B,B,8*SIZE # 4nr*2kr
.L37:
LD a0,0(A)
MADD t11,t11,a1,b4
gsLQC1(R9,F9,F8,0)
MADD t12,t12,a1,b5
gsLQC1(R9,F11,F10,1)
MADD t13,t13,a1,b6
MADD t14,t14,a1,b7
.L38: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
beqz K,.L39
LD ALPHA,152($sp) # Get ALPHA
MADD t11,t11,a0,b0
MADD t12,t12,a0,b1
daddu A,A,1*SIZE
daddu B,B,4*SIZE
MADD t13,t13,a0,b2
MADD t14,t14,a0,b3
.L39: # Write Back
#ifndef TRMMKERNEL
LD c11,0(CO1)
LD c12,0(CO2)
LD c13,0(CO3)
LD c14,0(CO4)
MADD t11,c11,t11,ALPHA
MADD t12,c12,t12,ALPHA
MADD t13,c13,t13,ALPHA
MADD t14,c14,t14,ALPHA
ST t11,0(CO1)
ST t12,0(CO2)
ST t13,0(CO3)
ST t14,0(CO4)
#else
MUL t11, ALPHA, t11
MUL t12, ALPHA, t12
MUL t13, ALPHA, t13
MUL t14, ALPHA, t14
ST t11, 0 * SIZE(CO1)
ST t12, 0 * SIZE(CO2)
ST t13, 0 * SIZE(CO3)
ST t14, 0 * SIZE(CO4)
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, KCO, KK
#ifdef LEFT
daddiu TEMP, TEMP, -1
#else
daddiu TEMP, TEMP, -4
#endif
dsll K,TEMP, 0 + BASE_SHIFT
dsll TEMP,TEMP, 2 + BASE_SHIFT
daddu A,A,K
daddu B,B,TEMP
#endif
#ifdef LEFT
daddiu KK, KK, 1
#endif
#endif
.align 3
.L0_N4_Loop: # mc finished
daddiu N,N,-1 # N--
#if defined(TRMMKERNEL) && !defined(LEFT)
daddiu KK, KK,4
#endif
bnez N,.L0_N4_Lb
move BO,B # Set BO point to next panel Bj
.align 5
.L0_N2:
andi N,NCO,2 # nr = 2
beqz N,.L0_N1
nop
.L0_N2_Lb:
move CO1,C
daddu CO2,C,LDC
dsra M,MCO,2
move A,AO # Reset A
daddu PREA,AO,SPANA
daddu C,CO2,LDC
#if defined(TRMMKERNEL) && defined(LEFT)
move KK, OFFSET
#endif
beqz M,.L12_M2
nop
.L40:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B,BO # Reset B
#else
dsll K,KK, 2 + BASE_SHIFT
dsll TEMP, KK,1 + BASE_SHIFT
daddu A,A,K
daddu B,BO,TEMP
#endif
MTC $0,t11
MOV t21,t11
gsLQC1(R8,F1,F0,0) # a0,a1
MOV t31,t11
MOV t41,t11
gsLQC1(R9,F9,F8,0) # b0,b1
MOV t12,t11
MOV t22,t11
gsLQC1(R8,F3,F2,1) # a2,a3
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP,KCO,KK
#elif defined(LEFT)
daddiu TEMP, KK, 4
#else
daddiu TEMP, KK, 2
#endif
dsra K,TEMP,2
MOV t32,t11
beqz K,.L45
MOV t42,t11
#else
move B,BO # Reset B
MTC $0,t11 # gemm part
gsLQC1(R8,F1,F0,0) # a0,a1
MOV t21,t11
MOV t31,t11
gsLQC1(R9,F9,F8,0) # b0,b1
MOV t41,t11
dsra K,KCO,2 # K=KCO/2
gsLQC1(R8,F3,F2,1) # a2,a3
MOV t12,t11
MOV t22,t11
MOV t32,t11
beqz K,.L45
MOV t42,t11
#endif
.L41: # nr=2,mr=kr=4
gsLQC1(R8,F5,F4,2)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
gsLQC1(R9,F13,F12,1)
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
gsLQC1(R8,F7,F6,3)
MADD t31,t31,a2,b0
MADD t41,t41,a3,b0
FETCH $0,(PREA)
MADD t32,t32,a2,b1
MADD t42,t42,a3,b1
.L42:
gsLQC1(R8,F1,F0,4)
MADD t11,t11,a4,b4
MADD t21,t21,a5,b4
gsLQC1(R9,F11,F10,2)
MADD t12,t12,a4,b5
MADD t22,t22,a5,b5
gsLQC1(R8,F3,F2,5)
MADD t31,t31,a6,b4
MADD t41,t41,a7,b4
FETCH $0,4*SIZE(PREA)
MADD t32,t32,a6,b5
MADD t42,t42,a7,b5
.L43:
gsLQC1(R8,F5,F4,6)
MADD t11,t11,a0,b2
MADD t21,t21,a1,b2
gsLQC1(R9,F15,F14,3)
MADD t12,t12,a0,b3
MADD t22,t22,a1,b3
gsLQC1(R8,F7,F6,7)
MADD t31,t31,a2,b2
MADD t41,t41,a3,b2
daddu B,B,8*SIZE # 2nr*4kr
FETCH $0,8*SIZE(PREA)
MADD t32,t32,a2,b3
MADD t42,t42,a3,b3
daddu A,A,16*SIZE # 4mr*4kr
.L44:
gsLQC1(R8,F1,F0,0)
MADD t11,t11,a4,b6
MADD t21,t21,a5,b6
daddiu K,K,-1
gsLQC1(R9,F9,F8,0)
MADD t12,t12,a4,b7
MADD t22,t22,a5,b7
daddu PREA,PREA,16*SIZE
gsLQC1(R8,F3,F2,1)
MADD t31,t31,a6,b6
MADD t41,t41,a7,b6
FETCH $0,-4*SIZE(PREA)
MADD t32,t32,a6,b7
bnez K,.L41
MADD t42,t42,a7,b7
.L45: # kr=2
#ifndef TRMMKERNEL
andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L48
nop
.L46:
gsLQC1(R8,F5,F4,2)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
gsLQC1(R9,F13,F12,1)
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
gsLQC1(R8,F7,F6,3)
MADD t31,t31,a2,b0
MADD t41,t41,a3,b0
daddu B,B,4*SIZE # B+=2(nr)*2(kr)*8Byte=32
FETCH $0,0(PREA)
MADD t32,t32,a2,b1
MADD t42,t42,a3,b1
daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE
.L47:
gsLQC1(R8,F1,F0,0)
MADD t11,t11,a4,b4
MADD t21,t21,a5,b4
gsLQC1(R9,F9,F8,0)
MADD t12,t12,a4,b5
MADD t22,t22,a5,b5
gsLQC1(R8,F3,F2,1)
MADD t31,t31,a6,b4
MADD t41,t41,a7,b4
FETCH $0,4*SIZE(PREA)
MADD t32,t32,a6,b5
MADD t42,t42,a7,b5
daddu PREA,PREA,8*SIZE
.L48: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
beqz K,.L49
LD ALPHA,152($sp) # Get ALPHA
FETCH $0,0(PREA)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
daddu B,B,2*SIZE
daddu PREA,PREA,4*SIZE
MADD t31,t31,a2,b0
MADD t41,t41,a3,b0
MADD t32,t32,a2,b1
MADD t42,t42,a3,b1
.L49: # Write Back
#ifndef TRMMKERNEL
LD c11,0(CO1) # gemm write back part Fetch 16 C
LD c21,1*SIZE(CO1)
LD c31,2*SIZE(CO1)
LD c41,3*SIZE(CO1)
LD c12,0(CO2)
MADD t11,c11,t11,ALPHA
LD c22,1*SIZE(CO2)
MADD t21,c21,t21,ALPHA
LD c32,2*SIZE(CO2)
MADD t31,c31,t31,ALPHA
LD c42,3*SIZE(CO2)
MADD t41,c41,t41,ALPHA
ST t11,0(CO1)
MADD t12,c12,t12,ALPHA
ST t21,1*SIZE(CO1)
MADD t22,c22,t22,ALPHA
ST t31,2*SIZE(CO1)
MADD t32,c32,t32,ALPHA
ST t41,3*SIZE(CO1)
MADD t42,c42,t42,ALPHA
daddiu M,M,-1
ST t12,0(CO2)
ST t22,1*SIZE(CO2)
ST t32,2*SIZE(CO2)
ST t42,3*SIZE(CO2)
FETCH $0,4*SIZE(CO1)
FETCH $0,4*SIZE(CO2)
FETCH $0,8*SIZE(CO1)
FETCH $0,8*SIZE(CO2)
daddu CO1,CO1,4*SIZE
bnez M,.L40
daddu CO2,CO2,4*SIZE
#else
MUL t11, ALPHA, t11
MUL t21, ALPHA, t21
MUL t31, ALPHA, t31
MUL t41, ALPHA, t41
MUL t12, ALPHA, t12
ST t11, 0 * SIZE(CO1)
MUL t22, ALPHA, t22
ST t21, 1 * SIZE(CO1)
MUL t32, ALPHA, t32
ST t31, 2 * SIZE(CO1)
MUL t42, ALPHA, t42
ST t41, 3 * SIZE(CO1)
ST t12, 0 * SIZE(CO2)
daddiu M,M,-1
ST t22, 1 * SIZE(CO2)
ST t32, 2 * SIZE(CO2)
ST t42, 3 * SIZE(CO2)
/*
 * TRMM write-back of the 4x2 tile, tail part: step both column pointers to
 * the next 4-row tile and touch the C lines that the following tiles will
 * write.  All prefetch offsets are expressed in elements through SIZE, the
 * same way as in the 4x4 TRMM write-back; a bare byte offset of 4 would stay
 * inside the line already touched by the 0(COx) access and would not be
 * aligned for a doubleword load.
 */
daddiu CO1,CO1, 4*SIZE # CO1 += 4 elements
daddiu CO2,CO2, 4*SIZE # CO2 += 4 elements
FETCH $0,0(CO1) # next tile, column 1
FETCH $0,0(CO2) # next tile, column 2
FETCH $0,4*SIZE(CO1) # tile after next, column 1
FETCH $0,4*SIZE(CO2) # tile after next, column 2
#if ( defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, KCO, KK
#ifdef LEFT
daddiu TEMP, TEMP, -4
#else
daddiu TEMP, TEMP, -2
#endif
dsll K,TEMP, 2 + BASE_SHIFT
dsll TEMP, TEMP, 1 + BASE_SHIFT
daddu A,A,K
daddu B,B,TEMP
#endif
#ifdef LEFT
daddiu KK, KK, 4
#endif
bnez M,.L40
nop
#endif
.align 3
.L12_M2:
andi M,MCO,2 # mr = 2
beqz M,.L12_M1
nop
.L50:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B,BO
#else
dsll K, KK, 1 + BASE_SHIFT #mr=2
dsll TEMP, KK, 1 + BASE_SHIFT #nr=2
daddu A, A, K
daddu B, BO, TEMP
#endif
MTC $0,t11
gsLQC1(R8,F1,F0,0) #a0,a1
MOV t21,t11
gsLQC1(R9,F9,F8,0) #b0,b1
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, KCO, KK
#elif defined(LEFT)
daddiu TEMP, KK, 2
#else
daddiu TEMP, KK, 2
#endif
dsra K,TEMP,2
MOV t12,t11
beqz K,.L55
MOV t22,t11
#else
move B,BO
dsra K,KCO,2 # K=KCO/2
gsLQC1(R8,F1,F0,0) #a0,a1
MTC $0,t11
MOV t21,t11
gsLQC1(R9,F9,F8,0) #b0,b1
MOV t12,t11
beqz K,.L55
MOV t22,t11
#endif
.L51: # nr=2 mr=2,kr=4
gsLQC1(R8,F5,F4,1)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
gsLQC1(R9,F13,F12,1)
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
gsLQC1(R8,F3,F2,2)
MADD t11,t11,a4,b4
MADD t21,t21,a5,b4
gsLQC1(R9,F11,F10,2)
MADD t12,t12,a4,b5
MADD t22,t22,a5,b5
daddiu K,K,-1
gsLQC1(R8,F7,F6,3)
MADD t11,t11,a2,b2
MADD t21,t21,a3,b2
daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE
gsLQC1(R9,F15,F14,3)
MADD t12,t12,a2,b3
MADD t22,t22,a3,b3
daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=16*SIZE
gsLQC1(R8,F1,F0,0)
MADD t11,t11,a6,b6
MADD t21,t21,a7,b6
gsLQC1(R9,F9,F8,0)
MADD t12,t12,a6,b7
bnez K,.L51
MADD t22,t22,a7,b7
.L55: # kr=2
#ifndef TRMMKERNEL
andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L58
nop
.L56:
gsLQC1(R8,F5,F4,1)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32
gsLQC1(R9,F13,F12,1)
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
daddu B,B,4*SIZE # 2nr*2kr
.L57:
gsLQC1(R8,F1,F0,0)
MADD t11,t11,a4,b4
MADD t21,t21,a5,b4
gsLQC1(R9,F9,F8,0)
MADD t12,t12,a4,b5
MADD t22,t22,a5,b5
.L58: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP, 1
#endif
beqz K,.L59
LD ALPHA,152($sp) # Get ALPHA
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16
daddu B,B,2*SIZE # 2nr*kr
MADD t12,t12,a0,b1
MADD t22,t22,a1,b1
.L59: # Write Back
#ifndef TRMMKERNEL
LD c11,0(CO1) # write gemm part back Fetch 16 C
LD c21,1*SIZE(CO1)
LD c12,0(CO2)
LD c22,1*SIZE(CO2)
MADD t11,c11,t11,ALPHA
MADD t21,c21,t21,ALPHA
MADD t12,c12,t12,ALPHA
MADD t22,c22,t22,ALPHA
ST t11,0(CO1)
ST t21,1*SIZE(CO1)
ST t12,0(CO2)
ST t22,1*SIZE(CO2)
daddu CO1,CO1,2*SIZE
daddu CO2,CO2,2*SIZE
FETCH $0,0(CO1)
FETCH $0,0(CO2)
#else
daddiu M, M, -1
daddiu CO1,CO1, 2 * SIZE
daddiu CO2,CO2, 2 * SIZE
MUL t11, ALPHA, t11
MUL t21, ALPHA, t21
MUL t12, ALPHA, t12
MUL t22, ALPHA, t22
ST t11, -2 * SIZE(CO1)
ST t21, -1 * SIZE(CO1)
ST t12, -2 * SIZE(CO2)
ST t22, -1 * SIZE(CO2)
FETCH $0,0(CO1)
FETCH $0,0(CO2)
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, KCO, KK
#ifdef LEFT
daddiu TEMP, TEMP, -2
#else
daddiu TEMP, TEMP, -2
#endif
dsll K, TEMP, 1 + BASE_SHIFT
dsll TEMP, TEMP, 1 + BASE_SHIFT
daddu A, A, K
daddu B, B, TEMP
#endif
#ifdef LEFT
daddiu KK, KK, 2
#endif
#endif
.align 3
.L12_M1:
andi M,MCO,1 # mr = 1
beqz M,.L0_N2_Loop
nop
.L60:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B,BO # Reset B
#else
dsll K, KK, 0 + BASE_SHIFT
dsll TEMP, KK, 1 + BASE_SHIFT
daddu A, A, K
daddu B, BO, TEMP
#endif
MTC $0,t11
LD a0, 0*SIZE(A) # a0
MOV t21,t11
gsLQC1(R9,F9,F8,0) # b0,b1
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, KCO, KK
#elif defined(LEFT)
daddiu TEMP, KK, 1
#else
daddiu TEMP, KK, 2
#endif
dsra K,TEMP,2
MOV t12,t11
beqz K,.L65
MOV t22,t11
#else
dsra K,KCO,2
move B,BO # Reset B
LD a0,0*SIZE(A)
MTC $0,t11
MOV t21,t11
gsLQC1(R9,F9,F8,0)
MOV t12,t11
beqz K,.L65
MOV t22,t11
#endif
.L61: # nr=2,mr=1,kr=4
LD a4, 1*SIZE(A) # a2
MADD t11,t11,a0,b0
gsLQC1(R9,F13,F12,1)
MADD t12,t12,a0,b1
LD a2, 2*SIZE(A) # a3
MADD t11,t11,a4,b4
gsLQC1(R9,F11,F10,2)
MADD t12,t12,a4,b5
LD a6, 3*SIZE(A) # a4
MADD t11,t11,a2,b2
daddiu K,K,-1
gsLQC1(R9,F15,F14,3)
MADD t12,t12,a2,b3
daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32
LD a0, 0*SIZE(A)
MADD t11,t11,a6,b6
daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE
gsLQC1(R9,F9,F8,0) # a0
bnez K,.L61
MADD t12,t12,a6,b7
.L65: # kr=2
#ifndef TRMMKERNEL
andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L68
nop
.L66:
LD a4, 1*SIZE(A) # a1
MADD t11,t11,a0,b0
daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16
gsLQC1(R9,F13,F12,1)
MADD t12,t12,a0,b1
daddu B,B,4*SIZE
.L67:
LD a0,0(A) # a0
MADD t11,t11,a4,b4
gsLQC1(R9,F9,F8,0)
MADD t12,t12,a4,b5
.L68: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
beqz K,.L69
LD ALPHA,152($sp) # Get ALPHA
MADD t11,t11,a0,b0
MADD t12,t12,a0,b1
daddu A,A,1*SIZE # A+=1(mr)*1(kr)*8Byte=16
daddu B,B,2*SIZE
.L69: # Write Back
#ifndef TRMMKERNEL
LD c11,0(CO1) # Fetch 16 C
LD c12,0(CO2)
MADD t11,c11,t11,ALPHA
MADD t12,c12,t12,ALPHA
ST t11,0(CO1)
ST t12,0(CO2)
daddu CO1,CO1,1*SIZE
daddu CO2,CO2,1*SIZE
#else
MUL t11, ALPHA, t11
MUL t12, ALPHA, t12
ST t11, 0 * SIZE(CO1)
ST t12, 0 * SIZE(CO2)
daddu CO1,CO1,1*SIZE
daddu CO2,CO2,1*SIZE
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, KCO, KK
#ifdef LEFT
daddiu TEMP, TEMP, -1
#else
daddiu TEMP, TEMP, -2
#endif
dsll K, TEMP, 0 + BASE_SHIFT
dsll TEMP, TEMP, 1 + BASE_SHIFT
daddu A, A, K
daddu B, B, TEMP
#endif
#ifdef LEFT
daddiu KK, KK, 1
#endif
#endif
.L0_N2_Loop:
#if defined(TRMMKERNEL) && !defined(LEFT)
daddiu KK, KK, 2
#endif
move BO, B
.align 5
.L0_N1:
andi N,NCO,1 # nr = 1
beqz N,.L999
nop
move CO1,C
dsra M,MCO,2
move A,AO # Reset A
daddu PREA,AO,SPANA
#if defined(TRMMKERNEL) && defined(LEFT)
move KK, OFFSET
#endif
beqz M,.L11_M2
daddu C,CO1,LDC
.L70:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B, BO # Reset B
#else
dsll K, KK, 2 + BASE_SHIFT
dsll TEMP, KK, 0 + BASE_SHIFT
daddu A, A, K
daddu B, BO, TEMP
#endif
MTC $0,t11
LD b0, 0*SIZE(B)
MOV t21,t11
gsLQC1(R8,F1,F0,0) #a0,a1
MOV t31,t11
gsLQC1(R8,F3,F2,1) #a2,a3
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, KCO, KK
#elif defined(LEFT)
daddiu TEMP, KK, 4
#else
daddiu TEMP, KK, 1
#endif
dsra K,TEMP,2
MOV t41,t11
beqz K,.L75
nop
#else
move B, BO # Reset B
dsra K,KCO,2
LD b0, 0*SIZE(B)
MTC $0,t11
MOV t21,t11
gsLQC1(R8,F1,F0,0) #a0,a1
MOV t31,t11
MOV t41,t11
gsLQC1(R8,F3,F2,1) #a2,a3
beqz K,.L75
nop
#endif
.L71: # nr=1,mr=kr=4
LD b4, 1*SIZE(B) # b1
MADD t11,t11,a0,b0
gsLQC1(R8,F5,F4,2)
MADD t21,t21,a1,b0
gsLQC1(R8,F7,F6,3)
FETCH $0,(PREA)
MADD t31,t31,a2,b0
MADD t41,t41,a3,b0
.L72:
LD b2, 2*SIZE(B) # b2
MADD t11,t11,a4,b4
gsLQC1(R8,F1,F0,4)
MADD t21,t21,a5,b4
gsLQC1(R8,F3,F2,5)
FETCH $0,4*SIZE(PREA)
MADD t31,t31,a6,b4
MADD t41,t41,a7,b4
.L73:
LD b6, 3*SIZE(B)
MADD t11,t11,a0,b2
daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
gsLQC1(R8,F5,F4,6)
MADD t21,t21,a1,b2
FETCH $0,8*SIZE(PREA)
gsLQC1(R8,F7,F6,7)
MADD t31,t31,a2,b2
MADD t41,t41,a3,b2
daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE
.L74:
LD b0, 0*SIZE(B)
MADD t11,t11,a4,b6
daddu PREA,PREA,16*SIZE
gsLQC1(R8,F1,F0,0)
MADD t21,t21,a5,b6
daddiu K,K,-1
FETCH $0,-32(PREA)
gsLQC1(R8,F3,F2,1)
MADD t31,t31,a6,b6
bnez K,.L71
MADD t41,t41,a7,b6
.L75: # kr=2
#ifndef TRMMKERNEL
andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L78
nop
.L76:
LD b4, 1*SIZE(B)
MADD t11,t11,a0,b0
daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=32
gsLQC1(R8,F5,F4,2)
MADD t21,t21,a1,b0
FETCH $0,0(PREA)
gsLQC1(R8,F7,F6,3)
MADD t31,t31,a2,b0
MADD t41,t41,a3,b0
daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE
.L77:
LD b0,0(B)
MADD t11,t11,a4,b4
gsLQC1(R8,F1,F0,0)
MADD t21,t21,a5,b4
FETCH $0,4*SIZE(PREA)
gsLQC1(R8,F3,F2,1)
MADD t31,t31,a6,b4
MADD t41,t41,a7,b4
daddu PREA,PREA,8*SIZE
.L78: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
beqz K,.L79
LD ALPHA,152($sp) # Get ALPHA
FETCH $0,0(PREA)
MADD t11,t11,a0,b0
MADD t21,t21,a1,b0
daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32
MADD t31,t31,a2,b0
MADD t41,t41,a3,b0
daddu B,B,1*SIZE
daddu PREA,PREA,4*SIZE
.L79: # Write Back
#ifndef TRMMKERNEL
LD c11,0(CO1) # Fetch 16 C
LD c21,1*SIZE(CO1)
LD c31,2*SIZE(CO1)
LD c41,3*SIZE(CO1)
MADD t11,c11,t11,ALPHA
MADD t21,c21,t21,ALPHA
MADD t31,c31,t31,ALPHA
MADD t41,c41,t41,ALPHA
ST t11,0(CO1)
ST t21,1*SIZE(CO1)
ST t31,2*SIZE(CO1)
ST t41,3*SIZE(CO1)
daddiu M,M,-1 # M--
FETCH $0,4*SIZE(CO1)
FETCH $0,8*SIZE(CO1)
bnez M,.L70 # M!=0
daddu CO1,CO1,4*SIZE # COx += 4*8Byte
#else
daddiu M,M,-1 # M--
MUL t11, ALPHA, t11
MUL t21, ALPHA, t21
MUL t31, ALPHA, t31
MUL t41, ALPHA, t41
ST t11,0(CO1)
ST t21,1*SIZE(CO1)
ST t31,2*SIZE(CO1)
ST t41,3*SIZE(CO1)
FETCH $0,4*SIZE(CO1)
FETCH $0,8*SIZE(CO1)
daddu CO1,CO1,4*SIZE
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, KCO, KK
#ifdef LEFT
daddiu TEMP, TEMP, -4
#else
daddiu TEMP, TEMP, -1
#endif
dsll K, TEMP, 2 + BASE_SHIFT
dsll TEMP, TEMP, 0 + BASE_SHIFT
daddu A, A,K
daddu B, B, TEMP
#endif
#ifdef LEFT
daddiu KK, KK, 4
#endif
bnez M,.L70
nop
#endif
# mr=2, nr=1 tile: entered when bit 1 of MCO is set, otherwise skip to .L11_M1.
# Setup points B at the panel start, preloads b0 and the pair a0,a1, zeroes
# the accumulators t11,t21 and derives the kr=4 trip count in K.
.align 3
.L11_M2:
andi M,MCO,2 # mr = 2
beqz M,.L11_M1
nop
.L80:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B, BO
#else
# Skip the first KK k-steps: 2 elements per step in A, 1 per step in B
dsll K, KK, 1 + BASE_SHIFT
dsll TEMP, KK, 0 + BASE_SHIFT
daddu A, A, K
daddu B, BO, TEMP
#endif
LD b0, 0*SIZE(B)
MTC $0,t11 # t11 = 0
gsLQC1(R8,F1,F0,0) #a0,a1
MOV t21,t11 # t21 = 0
# TEMP = number of k-steps this tile runs
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, KCO, KK
#elif defined(LEFT)
daddiu TEMP, KK, 2
#else
daddiu TEMP, KK, 1
#endif
dsra K,TEMP,2 # K=TEMP/4 (trip count of the kr=4 loop)
beqz K,.L85
nop
#else
move B, BO
dsra K,KCO,2 # K=KCO/4 (trip count of the kr=4 loop)
LD b0, 0*SIZE(B)
MTC $0,t11 # t11 = 0
MOV t21,t11 # t21 = 0
gsLQC1(R8,F1,F0,0) #a0,a1
beqz K,.L85
nop
#endif
# Main loop of the 2x1 tile, unrolled over four k-steps per iteration.
# a0,a1 and b0 are live on entry and are reloaded for the next iteration
# before the final pair of MADDs, so loads overlap the arithmetic.
# Each gsLQC1 with base R8 fills one FP register pair from the A pointer.
.L81: # nr=1,mr=2,kr=4
LD b4, 1*SIZE(B)
gsLQC1(R8,F5,F4,1) # a4,a5 (k-step 1)
MADD t11,t11,a0,b0 # k-step 0
MADD t21,t21,a1,b0
LD b2, 2*SIZE(B)
gsLQC1(R8,F3,F2,2) # a2,a3 (k-step 2)
MADD t11,t11,a4,b4 # k-step 1
MADD t21,t21,a5,b4
LD b6, 3*SIZE(B)
gsLQC1(R8,F7,F6,3) # a6,a7 (k-step 3)
MADD t11,t11,a2,b2 # k-step 2
MADD t21,t21,a3,b2
daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE
daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
LD b0, 0*SIZE(B) # operands for the next iteration or the remainder code
gsLQC1(R8,F1,F0,0) # a0,a1
MADD t11,t11,a6,b6 # k-step 3
MADD t21,t21,a7,b6
daddiu K,K,-1
bnez K,.L81
nop
# mr=2, nr=1, kr=2 remainder: runs when bit 1 of the k count is set
# (KCO for the plain kernel, TEMP for the TRMM kernel).
.L85: # kr=2
#ifndef TRMMKERNEL
andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L88
nop
.L86:
gsLQC1(R8,F5,F4,1) # a4,a5 (second k-step)
LD b4, 1*SIZE(B)
MADD t11,t11,a0,b0 # first k-step
MADD t21,t21,a1,b0
daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32
daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16
gsLQC1(R8,F1,F0,0) # a0,a1 reloaded for a possible kr=1 step
LD b0,0(B)
MADD t11,t11,a4,b4 # second k-step
MADD t21,t21,a5,b4
# mr=2, nr=1, kr=1 remainder: one last k-step when bit 0 of the k count is set.
# ALPHA is loaded in the slot right after the branch and is used at .L89 on
# both paths (assumes noreorder mode is selected outside this chunk).
.L88: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
beqz K,.L89
LD ALPHA,152($sp) # Get ALPHA (delay slot)
MADD t11,t11,a0,b0 # t11 += a0*b0
MADD t21,t21,a1,b0 # t21 += a1*b0
daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16
daddu B,B,1*SIZE # B+=1(nr)*1(kr)*8Byte=8
# Write-back of the 2x1 tile held in t11,t21 to CO1[0..1], then CO1 += 2*SIZE.
# There is no loop branch here: execution falls through to the mr=1 code.
.L89: # Write Back
#ifndef TRMMKERNEL
# Plain kernel: C = C + ALPHA * acc
LD c11,0(CO1) # Fetch 2 C
LD c21,1*SIZE(CO1)
MADD t11,c11,t11,ALPHA # t11 = c11 + t11*ALPHA
MADD t21,c21,t21,ALPHA
ST t11,0(CO1)
ST t21,1*SIZE(CO1)
FETCH $0,2*SIZE(CO1) # load into $0, result discarded
daddu CO1,CO1,2*SIZE # COx += 2*8Byte
#else
# TRMM kernel: C = ALPHA * acc; CO1 is advanced first, so the stores use
# negative offsets to reach the two entries of this tile
daddu CO1,CO1,2*SIZE # COx += 2*8Byte
MUL t11, ALPHA, t11
MUL t21, ALPHA, t21
FETCH $0,0(CO1)
ST t11, -2 * SIZE(CO1)
ST t21, -1 * SIZE(CO1)
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
# Step A and B over the k range this tile did not visit:
# TEMP = KCO - KK - 2 (LEFT) or KCO - KK - 1 (otherwise)
dsubu TEMP, KCO, KK
#ifdef LEFT
daddiu TEMP, TEMP, -2
#else
daddiu TEMP, TEMP, -1
#endif
dsll K, TEMP, 1 + BASE_SHIFT # K = TEMP * 2 elements, shifted to bytes by BASE_SHIFT
dsll TEMP, TEMP, 0 + BASE_SHIFT # TEMP = TEMP * 1 element, shifted to bytes
daddu A, A, K
daddu B, B, TEMP
#endif
#ifdef LEFT
daddiu KK, KK, 2 # KK advances by mr=2
#endif
#endif
# mr=1, nr=1 tile: entered when bit 0 of MCO is set, otherwise jump to .L999.
# Setup points B at the panel start, preloads a0 and b0, zeroes the single
# accumulator t11 and derives the kr=4 trip count in K.
.align 3
.L11_M1:
andi M,MCO,1 # mr = 1
beqz M,.L999
nop
.L90:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move B, BO
#else
# Skip the first KK k-steps: 1 element per step in both A and B
dsll K, KK, 0 + BASE_SHIFT
dsll TEMP, KK, 0 + BASE_SHIFT
daddu A, A, K
daddu B, BO, TEMP
#endif
LD a0, 0*SIZE(A)
LD b0, 0*SIZE(B)
MTC $0,t11 # t11 = 0
# TEMP = number of k-steps this tile runs; with mr = nr = 1 the LEFT and
# non-LEFT cases both add 1
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, KCO, KK
#elif defined(LEFT)
daddiu TEMP, KK, 1
#else
daddiu TEMP, KK, 1
#endif
dsra K, TEMP, 2 # K=TEMP/4 (trip count of the kr=4 loop)
beqz K,.L95
nop
#else
move B, BO
LD a0, 0*SIZE(A)
LD b0, 0*SIZE(B)
dsra K,KCO,2 # K=KCO/4 (trip count of the kr=4 loop)
beqz K,.L95
MTC $0,t11 # t11 = 0, placed in the branch delay slot so it runs on both paths
#endif
# Main loop of the 1x1 tile: a dot product unrolled over four k-steps.
# a0 and b0 are live on entry and are reloaded from the advanced pointers
# before the last MADD, ready for the next iteration or the remainder code.
.L91: # nr=mr=1,kr=4
LD a4, 1*SIZE(A)
LD b4, 1*SIZE(B)
MADD t11,t11,a0,b0 # k-step 0
LD a2, 2*SIZE(A)
LD b2, 2*SIZE(B)
MADD t11,t11,a4,b4 # k-step 1
LD a6, 3*SIZE(A)
LD b6, 3*SIZE(B)
MADD t11,t11,a2,b2 # k-step 2
daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32
daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
LD a0, 0*SIZE(A)
LD b0, 0*SIZE(B)
MADD t11,t11,a6,b6 # k-step 3
daddiu K,K,-1
bnez K,.L91
nop
# mr=1, nr=1, kr=2 remainder: runs when bit 1 of the k count is set
# (KCO for the plain kernel, TEMP for the TRMM kernel).
.L95: # kr=2
#ifndef TRMMKERNEL
andi K,KCO,2
#else
andi K,TEMP,2
#endif
beqz K,.L98
nop
.L96:
LD a4, 1*SIZE(A)
LD b4, 1*SIZE(B)
MADD t11,t11,a0,b0 # first k-step
daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16
daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16
LD b0,0(B) # reload b0 and a0 for a possible kr=1 step
LD a0,0(A)
MADD t11,t11,a4,b4 # second k-step
# mr=1, nr=1, kr=1 remainder: one last multiply-add when bit 0 of the k count
# is set. A and B are not advanced afterwards in this block.
# ALPHA is loaded in the slot right after the branch and is used at .L99 on
# both paths (assumes noreorder mode is selected outside this chunk).
.L98: # kr=1
#ifndef TRMMKERNEL
andi K,KCO,1
#else
andi K,TEMP,1
#endif
beqz K,.L99
LD ALPHA,152($sp) # Get ALPHA (delay slot)
MADD t11,t11,a0,b0 # t11 += a0*b0
# Write-back of the 1x1 tile held in t11 to CO1[0]; falls through to .L999.
.L99: # Write Back
#ifndef TRMMKERNEL
LD c11,0(CO1) # Fetch 1 C
MADD t11,c11,t11,ALPHA # t11 = c11 + t11*ALPHA (reading MADD d,c,x,y as d = c + x*y)
ST t11,0(CO1)
#else
MUL t11, ALPHA, t11 # TRMM kernel: the old C value is not read
ST t11, 0 * SIZE(CO1)
#endif
# Common exit: reload the registers kept in the 160-byte stack frame, then
# return through $31. Slot 152($sp) is not restored here; the remainder code
# above reads ALPHA from it. The matching stores are outside this chunk.
# NOTE(review): the tile code above writes $f29, $f30 and $f31 (t41, t11,
# t21) but they are not reloaded here -- confirm against the target ABI and
# the function entry code whether they need to be preserved.
.L999: # End
ld $16, 0($sp) # integer registers $16..$22
ld $17, 8($sp)
ld $18, 16($sp)
ld $19, 24($sp)
ld $20, 32($sp)
ld $21, 40($sp)
ld $22, 48($sp)
LD $f24, 56($sp) # FP registers $f24..$f28
LD $f25, 64($sp)
LD $f26, 72($sp)
LD $f27, 80($sp)
LD $f28, 88($sp)
ld $23, 96($sp) # integer registers $23..$25 (PREB, PREA, SPANA)
ld $24, 104($sp)
ld $25, 112($sp)
LD $f20,120($sp) # FP registers $f20..$f23
LD $f21,128($sp)
LD $f22,136($sp)
LD $f23,144($sp)
j $31 # return to caller
daddiu $sp, $sp, 160 # release the frame (sits in the jump delay slot)
EPILOGUE