/*
 * Register and macro definitions for the Loongson-3A 4x4 GEMM/TRMM kernel.
 * ASMNAME and everything else not defined here is expected to come from
 * common.h or the build system.
 */
#define REALNAME ASMNAME
#define ASSEMBLER
#include "common.h"

/* Prefetch hint: the kernel issues "FETCH $0,addr", i.e. a load whose
 * destination is the zero register. */
#define FETCH ld

/* Hand-encoded instruction words taking a base GPR number, two FPR numbers
 * and an offset.  The kernel uses gsLQC1 to load a pair of FP registers
 * (fq receives the second element, ft the first, per the call-site comments);
 * each offset step advances by one register pair.
 * gsSQC1 is presumably the matching store; it is not used in this file. */
#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)

/* Incoming arguments (integer registers). */
#define M $4
#define N $5
#define K $6
#define A $8
#define B $9
#define C $10
#define LDC $11

/* Working integer registers. */
#define AO $12		/* saved start of A */
#define BO $13		/* start of the current B panel */
#define CO1 $14		/* column pointers into C */
#define CO2 $15
#define CO3 $16
#define CO4 $17
#define KCO $18		/* saved K */
#define MCO $19		/* saved M */
#define NCO $20		/* saved N */
#define SPANB $21	/* byte span used to derive PREB */
#define PREB $23	/* prefetch pointer for B */
#define PREA $24	/* prefetch pointer for A */
#define SPANA $25	/* byte span used to derive PREA */

/* Scalar alpha arrives in $f15; note that b7 below aliases the same register,
 * so the kernel spills alpha to the stack and reloads it before write-back. */
#define ALPHA $f15

#if defined(TRMMKERNEL)
#define OFFSET $2
#define KK $3
#define TEMP $7
#endif

/* Bare GPR numbers for use as the "base" operand of gsLQC1/gsSQC1.
 * R8 is A ($8) and R9 is B ($9); R14-R17 match CO1-CO4. */
#define R8 8
#define R9 9
#define R14 14
#define R15 15
#define R16 16
#define R17 17

/* Accumulators: tij holds the partial sum for row i, column j of the
 * 4x4 result tile. */
#define t11 $f30
#define t21 $f31
#define t31 $f28
#define t41 $f29
#define t12 $f26
#define t22 $f27
#define t32 $f24
#define t42 $f25
#define t13 $f22
#define t23 $f23
#define t33 $f20
#define t43 $f21
#define t14 $f18
#define t24 $f19
#define t34 $f16
#define t44 $f17

/* Temporaries for elements of C during write-back.  They overlap the a/b
 * operand registers below, and c44 deliberately reuses $f0 (same as c11)
 * because $f15 is occupied by ALPHA. */
#define c11 $f0
#define c21 $f1
#define c31 $f2
#define c41 $f3
#define c12 $f4
#define c22 $f5
#define c32 $f6
#define c42 $f7
#define c13 $f8
#define c23 $f9
#define c33 $f10
#define c43 $f11
#define c14 $f12
#define c24 $f13
#define c34 $f14
#define c44 $f0

/* Operand registers for elements of A (a0-a7) and B (b0-b7). */
#define a0 $f0
#define a1 $f1
#define a2 $f2
#define a3 $f3
#define a4 $f4
#define a5 $f5
#define a6 $f6
#define a7 $f7
#define b0 $f8
#define b1 $f9
#define b2 $f10
#define b3 $f11
#define b4 $f12
#define b5 $f13
#define b6 $f14
#define b7 $f15

/* Bare FPR numbers for use as the fq/ft operands of gsLQC1/gsSQC1. */
#define F31 31
#define F30 30
#define F29 29
#define F28 28
#define F27 27
#define F26 26
#define F25 25
#define F24 24
#define F23 23
#define F22 22
#define F21 21
#define F20 20
#define F19 19
#define F18 18
#define F17 17
#define F16 16
#define F15 15
#define F14 14
#define F13 13
#define F12 12
#define F11 11
#define F10 10
#define F9 9
#define F8 8
#define F7 7
#define F6 6
#define F5 5
#define F4 4
#define F3 3
#define F2 2
#define F1 1
#define F0 0
/*
 * GEMM / TRMM micro-kernel, register-blocked 4x4 (mr = nr = 4), with
 * remainder paths for mr = 2, 1 and nr = 2, 1 and for K % 4.
 *
 * Inputs (see the register #defines at the top of the file):
 *   M ($4), N ($5), K ($6)  - problem sizes
 *   ALPHA ($f15)            - scalar multiplier
 *   A ($8), B ($9)          - packed panels of A and B
 *   C ($10), LDC ($11)      - output matrix and its leading dimension
 *   TRMMKERNEL only: OFFSET - first stack argument of the caller
 *
 * GEMM build : C += ALPHA * (A * B)
 * TRMM build : C  = ALPHA * (A * B), restricted to the triangular part
 *              selected by LEFT / TRANSA and tracked through KK.
 *
 * Stack frame (192 bytes):
 *     0.. 48  $16-$22          96..112  $23-$25
 *    56.. 88  $f24-$f28       120..144  $f20-$f23
 *   152       spilled ALPHA   160..176  $f29-$f31
 *   184       padding (keeps the frame a multiple of 16)
 *
 * $f29/$f30/$f31 hold accumulators t41/t11/t21 and are callee-saved FP
 * registers under the n64 ABI ($f30 also under n32), so they are saved
 * and restored here together with $f20-$f28.
 *
 * NOTE(review): the code relies on MIPS branch delay slots (the
 * instruction after every branch always executes); presumably PROLOGUE
 * or common.h selects ".set noreorder" - confirm.
 * NOTE(review): MADD d,c,x,y is used as d = c + x*y, MUL d,x,y as
 * d = x*y; both macros come from common.h.
 */
	PROLOGUE

	daddiu $sp, $sp, -192
	sd $16, 0($sp)
	sd $17, 8($sp)
	sd $18, 16($sp)
	sd $19, 24($sp)
	sd $20, 32($sp)
	sd $21, 40($sp)
	sd $22, 48($sp)
	ST $f24, 56($sp)
	ST $f25, 64($sp)
	ST $f26, 72($sp)
	ST $f27, 80($sp)
	ST $f28, 88($sp)
	sd $23, 96($sp)
	sd $24, 104($sp)
	sd $25, 112($sp)
	ST $f20,120($sp)
	ST $f21,128($sp)
	ST $f22,136($sp)
	ST $f23,144($sp)
	ST $f29,160($sp)			# t41, callee-saved
	ST $f30,168($sp)			# t11, callee-saved
	ST $f31,176($sp)			# t21, callee-saved

	.align 5
.L0_N4:						# Loop N
	ST ALPHA,152($sp)			# Backup ALPHA ($f15 is reused as b7)
	move MCO,M				# Backup M
	move NCO,N				# Backup N
	move KCO,K				# Backup K
	move AO,A				# Backup A_addr
	dsra N,NCO,2				# N = NCO / 4 (count of 4-column panels)
	dsll LDC,LDC,BASE_SHIFT			# LDC in bytes
	dsll SPANB,KCO,2+BASE_SHIFT		# SPANB = KCO * 4(nr) elements, in bytes
#if defined(TRMMKERNEL)
	LDARG OFFSET,192($sp)			# first stack argument, just above our 192-byte frame
#endif
#if defined(TRMMKERNEL) && !defined(LEFT)
	neg KK,OFFSET
#endif
	move BO,B				# Backup B_addr
	beq N,$0,.L0_N2				# N = 0, NCO < 4
	dsll SPANA,KCO,1+BASE_SHIFT		# (delay slot) SPANA = KCO * 2 elements, in bytes

/* ------------------------------------------------------------------ */
/* nr = 4 : one 4-column panel of B per iteration                      */
/* ------------------------------------------------------------------ */
.L0_N4_Lb:					# mr=4,nr=4
	move CO1,C
	dsra M,MCO,2				# M = MCO / 4
	move A,AO				# Reset A
	daddu CO2,C,LDC
	daddu PREB,BO,SPANB			# PreB point next panelB
	daddu CO3,CO2,LDC
	daddu PREA,AO,SPANA
	daddu CO4,CO3,LDC
#if defined(TRMMKERNEL) && defined(LEFT)
	move KK,OFFSET
#endif
	beqz M,.L14_M2
	daddu C,CO4,LDC				# (delay slot) move C to next panel Cj

.L10:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move B,BO				# (SIDE=L and UPLO=L) or (SIZE=R and UPLO=U)
#else
	dsll K,KK,2 + BASE_SHIFT		# KK is the length that needs to span to the data part
	dsll TEMP,KK,2 + BASE_SHIFT
	daddu A,A,K				# move A B to data part
	daddu B,BO,TEMP
#endif
	MTC $0,t11
	MOV t21,t11
	gsLQC1(R8,F1,F0,0)			# a0,a1
	MOV t31,t11
	MOV t41,t11
	gsLQC1(R9,F9,F8,0)			# b0,b1
	MOV t12,t11
	MOV t22,t11
	gsLQC1(R8,F3,F2,1)			# a2,a3
	MOV t32,t11
	MOV t42,t11
	gsLQC1(R9,F11,F10,1)			# b2,b3
	MOV t13,t11
	MOV t23,t11
	MOV t33,t11
	MOV t43,t11
	MOV t14,t11
	MOV t24,t11
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu TEMP,KCO,KK			# temp is the length of the data part
#elif defined(LEFT)
	daddiu TEMP, KK, 4			# S=L,U=L
#else
	daddiu TEMP, KK, 4			# S=R,U=U,for this two situation KK is the length of the data part
#endif
	dsra K,TEMP,2				# K = TEMP / 4
	MOV t34,t11
	beqz K,.L15
	MOV t44,t11				# (delay slot)
#else
	move B,BO				# Reset B
	MTC $0,t11				# GEMM part NR=4,MR=4
	gsLQC1(R8,F1,F0,0)			# a0,a1
	MOV t21,t11
	MOV t31,t11
	gsLQC1(R9,F9,F8,0)			# b0,b1
	MOV t41,t11
	MOV t12,t11
	gsLQC1(R8,F3,F2,1)			# a2,a3
	MOV t22,t11
	MOV t32,t11
	gsLQC1(R9,F11,F10,1)			# b2,b3
	MOV t42,t11
	dsra K,KCO,2				# K = KCO / 4
	MOV t13,t11
	MOV t23,t11
	MOV t33,t11
	MOV t43,t11
	MOV t14,t11
	MOV t24,t11
	MOV t34,t11
	beqz K,.L15
	MOV t44,t11				# (delay slot) clear 16 results registers
#endif

/* Main loop, unrolled 4x over k.  Each step consumes one register set
 * (a0-a3/b0-b3 or a4-a7/b4-b7) while loading the other. */
	.align 5
.L11:						# kr=4
	gsLQC1(R8,F5,F4,2)
	MADD t11,t11,a0,b0
	MADD t21,t21,a1,b0
	gsLQC1(R9,F13,F12,2)
	MADD t12,t12,a0,b1
	MADD t22,t22,a1,b1
	gsLQC1(R8,F7,F6,3)
	MADD t31,t31,a2,b0
	MADD t41,t41,a3,b0
	gsLQC1(R9,F15,F14,3)
	MADD t32,t32,a2,b1
	MADD t42,t42,a3,b1
	FETCH $0,(PREB)
	MADD t13,t13,a0,b2
	MADD t23,t23,a1,b2
	MADD t14,t14,a0,b3
	MADD t24,t24,a1,b3
	FETCH $0,(PREA)
	MADD t33,t33,a2,b2
	MADD t43,t43,a3,b2
	MADD t34,t34,a2,b3
	MADD t44,t44,a3,b3

.L12:
	gsLQC1(R8,F1,F0,4)
	MADD t11,t11,a4,b4
	MADD t21,t21,a5,b4
	gsLQC1(R9,F9,F8,4)
	MADD t12,t12,a4,b5
	MADD t22,t22,a5,b5
	gsLQC1(R8,F3,F2,5)
	MADD t31,t31,a6,b4
	MADD t41,t41,a7,b4
	gsLQC1(R9,F11,F10,5)
	MADD t32,t32,a6,b5
	MADD t42,t42,a7,b5
	FETCH $0,4*SIZE(PREB)
	MADD t13,t13,a4,b6
	MADD t23,t23,a5,b6
	MADD t14,t14,a4,b7
	MADD t24,t24,a5,b7
	FETCH $0,4*SIZE(PREA)
	MADD t33,t33,a6,b6
	MADD t43,t43,a7,b6
	MADD t34,t34,a6,b7
	MADD t44,t44,a7,b7

.L13:
	gsLQC1(R8,F5,F4,6)
	MADD t11,t11,a0,b0
	MADD t21,t21,a1,b0
	gsLQC1(R9,F13,F12,6)
	MADD t12,t12,a0,b1
	MADD t22,t22,a1,b1
	gsLQC1(R8,F7,F6,7)
	MADD t31,t31,a2,b0
	MADD t41,t41,a3,b0
	gsLQC1(R9,F15,F14,7)
	MADD t32,t32,a2,b1
	MADD t42,t42,a3,b1
	daddu A,A,16*SIZE			# 4mr*4kr
	FETCH $0,8*SIZE(PREB)
	MADD t13,t13,a0,b2
	MADD t23,t23,a1,b2
	daddu B,B,16*SIZE			# 4nr*4kr
	MADD t14,t14,a0,b3
	MADD t24,t24,a1,b3
	FETCH $0,8*SIZE(PREA)
	MADD t33,t33,a2,b2
	MADD t43,t43,a3,b2
	MADD t34,t34,a2,b3
	MADD t44,t44,a3,b3

.L14:
	gsLQC1(R8,F1,F0,0)
	MADD t11,t11,a4,b4
	MADD t21,t21,a5,b4
	gsLQC1(R9,F9,F8,0)
	MADD t12,t12,a4,b5
	MADD t22,t22,a5,b5
	gsLQC1(R8,F3,F2,1)
	MADD t31,t31,a6,b4
	MADD t41,t41,a7,b4
	daddiu K,K,-1
	gsLQC1(R9,F11,F10,1)
	MADD t32,t32,a6,b5
	MADD t42,t42,a7,b5
	FETCH $0,12*SIZE(PREB)
	MADD t13,t13,a4,b6
	MADD t23,t23,a5,b6
	FETCH $0,12*SIZE(PREA)
	MADD t14,t14,a4,b7
	MADD t24,t24,a5,b7
	MADD t33,t33,a6,b6
	MADD t43,t43,a7,b6
	daddu PREB,PREB,16*SIZE
	MADD t34,t34,a6,b7
	MADD t44,t44,a7,b7
	bnez K,.L11
	daddu PREA,PREA,16*SIZE			# (delay slot)

.L15:						# kr=2
#ifndef TRMMKERNEL
	andi K,KCO,2
#else
	andi K,TEMP, 2
#endif
	beqz K,.L18
	nop

.L16:
	gsLQC1(R8,F5,F4,2)
	MADD t11,t11,a0,b0
	MADD t21,t21,a1,b0
	gsLQC1(R9,F13,F12,2)
	MADD t12,t12,a0,b1
	MADD t22,t22,a1,b1
	gsLQC1(R8,F7,F6,3)
	MADD t31,t31,a2,b0
	MADD t41,t41,a3,b0
	gsLQC1(R9,F15,F14,3)
	MADD t32,t32,a2,b1
	MADD t42,t42,a3,b1
	daddu A,A,8*SIZE			# 4mr*2kr
	FETCH $0,0(PREB)
	MADD t13,t13,a0,b2
	MADD t23,t23,a1,b2
	daddu B,B,8*SIZE			# 4nr*2kr
	FETCH $0,0(PREA)
	MADD t14,t14,a0,b3
	MADD t24,t24,a1,b3
	MADD t33,t33,a2,b2
	MADD t43,t43,a3,b2
	MADD t34,t34,a2,b3
	MADD t44,t44,a3,b3

.L17:
	gsLQC1(R8,F1,F0,0)
	MADD t11,t11,a4,b4
	MADD t21,t21,a5,b4
	gsLQC1(R9,F9,F8,0)
	MADD t12,t12,a4,b5
	MADD t22,t22,a5,b5
	gsLQC1(R8,F3,F2,1)
	MADD t31,t31,a6,b4
	MADD t41,t41,a7,b4
	gsLQC1(R9,F11,F10,1)
	MADD t32,t32,a6,b5
	MADD t42,t42,a7,b5
	FETCH $0,4*SIZE(PREB)
	MADD t13,t13,a4,b6
	MADD t23,t23,a5,b6
	FETCH $0,4*SIZE(PREA)
	MADD t14,t14,a4,b7
	MADD t24,t24,a5,b7
	daddu PREB,PREB,8*SIZE
	MADD t33,t33,a6,b6
	MADD t43,t43,a7,b6
	daddu PREA,PREA,8*SIZE
	MADD t34,t34,a6,b7
	MADD t44,t44,a7,b7

.L18:						# kr=1
#ifndef TRMMKERNEL
	andi K,KCO,1
#else
	andi K,TEMP,1
#endif
	beqz K,.L19
	LD ALPHA,152($sp)			# (delay slot) Get ALPHA; b7 is dead from here on
	FETCH $0,0(PREB)
	MADD t11,t11,a0,b0
	MADD t21,t21,a1,b0
	daddu A,A,4*SIZE			# 4mr*kr
	MADD t12,t12,a0,b1
	MADD t22,t22,a1,b1
	daddu B,B,4*SIZE			# 4nr*kr
	FETCH $0,0(PREA)
	MADD t31,t31,a2,b0
	MADD t41,t41,a3,b0
	daddu PREB,PREB,4*SIZE
	MADD t32,t32,a2,b1
	MADD t42,t42,a3,b1
	daddu PREA,PREA,4*SIZE
	MADD t13,t13,a0,b2
	MADD t23,t23,a1,b2
	MADD t14,t14,a0,b3
	MADD t24,t24,a1,b3
	MADD t33,t33,a2,b2
	MADD t43,t43,a3,b2
	MADD t34,t34,a2,b3
	MADD t44,t44,a3,b3

.L19:						# Write Back to C
#ifndef TRMMKERNEL
	LD c11,0(CO1)				# GEMM write part
	LD c21,1*SIZE(CO1)			# get 16 C
	LD c31,2*SIZE(CO1)
	LD c41,3*SIZE(CO1)
	LD c12,0(CO2)
	MADD t11,c11,t11,ALPHA
	LD c22,1*SIZE(CO2)
	MADD t21,c21,t21,ALPHA
	LD c32,2*SIZE(CO2)
	MADD t31,c31,t31,ALPHA
	LD c42,3*SIZE(CO2)
	MADD t41,c41,t41,ALPHA
	LD c13,0(CO3)
	MADD t12,c12,t12,ALPHA
	LD c23,1*SIZE(CO3)
	MADD t22,c22,t22,ALPHA
	LD c33,2*SIZE(CO3)
	MADD t32,c32,t32,ALPHA
	LD c43,3*SIZE(CO3)
	MADD t42,c42,t42,ALPHA
	LD c14,0(CO4)
	MADD t13,c13,t13,ALPHA
	LD c24,1*SIZE(CO4)
	MADD t23,c23,t23,ALPHA
	LD c34,2*SIZE(CO4)
	MADD t33,c33,t33,ALPHA
	LD c44,3*SIZE(CO4)			# c44 reuses $f0; c11 was consumed above
	MADD t43,c43,t43,ALPHA
	ST t11,0(CO1)
	MADD t14,c14,t14,ALPHA
	ST t21,1*SIZE(CO1)
	MADD t24,c24,t24,ALPHA
	ST t31,2*SIZE(CO1)
	MADD t34,c34,t34,ALPHA
	ST t41,3*SIZE(CO1)
	MADD t44,c44,t44,ALPHA
	daddiu M,M,-1				# M--
	ST t12,0(CO2)
	ST t22,1*SIZE(CO2)
	ST t32,2*SIZE(CO2)
	ST t42,3*SIZE(CO2)
	ST t13,0(CO3)
	ST t23,1*SIZE(CO3)
	ST t33,2*SIZE(CO3)
	ST t43,3*SIZE(CO3)
	FETCH $0,4*SIZE(CO1)
	FETCH $0,4*SIZE(CO2)
	FETCH $0,4*SIZE(CO3)
	FETCH $0,4*SIZE(CO4)
	FETCH $0,8*SIZE(CO1)
	FETCH $0,8*SIZE(CO2)
	FETCH $0,8*SIZE(CO3)
	FETCH $0,8*SIZE(CO4)
	ST t14,0(CO4)
	daddu CO1,CO1,4*SIZE			# COi += 4
	ST t24,1*SIZE(CO4)
	daddu CO2,CO2,4*SIZE
	ST t34,2*SIZE(CO4)
	daddu CO3,CO3,4*SIZE
	ST t44,3*SIZE(CO4)
	daddu PREB,BO,SPANB
	bnez M,.L10
	daddu CO4,CO4,4*SIZE			# (delay slot)
#else
	MUL t11, ALPHA, t11			# TRMM write back part
	MUL t21, ALPHA, t21
	MUL t31, ALPHA, t31
	MUL t41, ALPHA, t41
	ST t11, 0 * SIZE(CO1)
	MUL t12, ALPHA, t12
	ST t21, 1 * SIZE(CO1)
	MUL t22, ALPHA, t22
	ST t31, 2 * SIZE(CO1)
	MUL t32, ALPHA, t32
	ST t41, 3 * SIZE(CO1)
	MUL t42, ALPHA, t42
	ST t12, 0 * SIZE(CO2)
	MUL t13, ALPHA, t13
	ST t22, 1 * SIZE(CO2)
	MUL t23, ALPHA, t23
	ST t32, 2 * SIZE(CO2)
	MUL t33, ALPHA, t33
	ST t42, 3 * SIZE(CO2)
	MUL t43, ALPHA, t43
	ST t13, 0 * SIZE(CO3)
	MUL t14, ALPHA, t14
	ST t23, 1 * SIZE(CO3)
	MUL t24, ALPHA, t24
	ST t33, 2 * SIZE(CO3)
	MUL t34, ALPHA, t34
	ST t43, 3 * SIZE(CO3)
	MUL t44, ALPHA, t44
	ST t14, 0 * SIZE(CO4)
	daddiu M,M,-1				# M--
	ST t24, 1 * SIZE(CO4)
	ST t34, 2 * SIZE(CO4)
	ST t44, 3 * SIZE(CO4)
	daddiu CO1,CO1, 4 * SIZE
	daddiu CO2,CO2, 4 * SIZE
	daddiu CO3,CO3, 4 * SIZE
	daddiu CO4,CO4, 4 * SIZE
	FETCH $0,4*SIZE(CO1)
	FETCH $0,4*SIZE(CO2)
	FETCH $0,4*SIZE(CO3)
	FETCH $0,4*SIZE(CO4)
	FETCH $0,0(CO1)
	FETCH $0,0(CO2)
	FETCH $0,0(CO3)
	FETCH $0,0(CO4)
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	dsubu TEMP,KCO,KK
#ifdef LEFT
	daddiu TEMP,TEMP, -4
#else
	daddiu TEMP,TEMP, -4
#endif
	dsll K,TEMP,2 + BASE_SHIFT
	dsll TEMP,TEMP,2 + BASE_SHIFT
	daddu A,A,K				# mov A to the end of panel Ai
	daddu B,B,TEMP				# mov B to the end of panel Bj
#endif
#ifdef LEFT
	daddiu KK, KK,4
#endif
	bnez M,.L10
	nop
#endif

/* ---- nr = 4, mr = 2 remainder ---- */
	.align 3
.L14_M2:
	andi M, MCO, 2				# nr=4,mr=2
	beqz M,.L14_M1
	nop

.L20:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move B,BO				# Reset B
#else
	dsll K,KK,1 + BASE_SHIFT		# mr=2
	dsll TEMP,KK,2 + BASE_SHIFT		# nr=4
	daddu A,A,K
	daddu B,BO,TEMP
#endif
	MTC $0,t11
	MOV t21,t11
	gsLQC1(R8,F1,F0,0)			# a0,a1
	MOV t12,t11
	MOV t22,t11
	gsLQC1(R9,F9,F8,0)			# b0,b1
	MOV t13,t11
	MOV t23,t11
	gsLQC1(R9,F11,F10,1)			# b2,b3
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu TEMP,KCO,KK
#elif defined(LEFT)
	daddiu TEMP,KK,2			# left part,controlled by mr, mr=2
#else
	daddiu TEMP,KK,4			# right part,controlled by nr,nr=4
#endif
	dsra K,TEMP,2
	MOV t14,t11
	beqz K,.L25
	MOV t24,t11				# (delay slot) clear 2*4=8 results registers
#else
	move B,BO				# Reset B
	MTC $0,t11
	gsLQC1(R8,F1,F0,0)
	MOV t21,t11
	MOV t12,t11
	gsLQC1(R9,F9,F8,0)
	MOV t22,t11
	dsra K,KCO,2
	gsLQC1(R9,F11,F10,1)
	MOV t13,t11
	MOV t23,t11
	MOV t14,t11
	beqz K,.L25
	MOV t24,t11				# (delay slot)
#endif

.L21:						# nr=4,mr=2,kr=4
	gsLQC1(R8,F5,F4,1)
	MADD t11,t11,a0,b0
	MADD t21,t21,a1,b0
	gsLQC1(R9,F13,F12,2)
	MADD t12,t12,a0,b1
	MADD t22,t22,a1,b1
	gsLQC1(R9,F15,F14,3)
	MADD t13,t13,a0,b2
	MADD t23,t23,a1,b2
	MADD t14,t14,a0,b3
	MADD t24,t24,a1,b3
	gsLQC1(R8,F3,F2,2)
	MADD t11,t11,a4,b4
	MADD t21,t21,a5,b4
	gsLQC1(R9,F9,F8,4)
	MADD t12,t12,a4,b5
	MADD t22,t22,a5,b5
	gsLQC1(R9,F11,F10,5)
	MADD t13,t13,a4,b6
	MADD t23,t23,a5,b6
	MADD t14,t14,a4,b7
	MADD t24,t24,a5,b7
	daddiu K,K,-1
	gsLQC1(R8,F7,F6,3)
	MADD t11,t11,a2,b0
	MADD t21,t21,a3,b0
	gsLQC1(R9,F13,F12,6)
	MADD t12,t12,a2,b1
	MADD t22,t22,a3,b1
	gsLQC1(R9,F15,F14,7)
	MADD t13,t13,a2,b2
	MADD t23,t23,a3,b2
	daddu A,A,8*SIZE			# 2mr*4kr
	MADD t14,t14,a2,b3
	MADD t24,t24,a3,b3
	daddu B,B,16*SIZE			# 4nr*4kr
	gsLQC1(R8,F1,F0,0)
	MADD t11,t11,a6,b4
	MADD t21,t21,a7,b4
	gsLQC1(R9,F9,F8,0)
	MADD t12,t12,a6,b5
	MADD t22,t22,a7,b5
	gsLQC1(R9,F11,F10,1)
	MADD t13,t13,a6,b6
	MADD t23,t23,a7,b6
	MADD t14,t14,a6,b7
	bnez K,.L21
	MADD t24,t24,a7,b7			# (delay slot)

.L25:
#ifndef TRMMKERNEL
	andi K,KCO,2				# kr=2
#else
	andi K,TEMP,2
#endif
	beqz K,.L28
	nop

.L26:
	gsLQC1(R8,F5,F4,1)
	MADD t11,t11,a0,b0
	MADD t21,t21,a1,b0
	gsLQC1(R9,F13,F12,2)
	MADD t12,t12,a0,b1
	MADD t22,t22,a1,b1
	gsLQC1(R9,F15,F14,3)
	MADD t13,t13,a0,b2
	MADD t23,t23,a1,b2
	daddu A,A,4*SIZE			# 2mr*2kr
	MADD t14,t14,a0,b3
	MADD t24,t24,a1,b3
	daddu B,B,8*SIZE			# 4nr*2kr

.L27:
	gsLQC1(R8,F1,F0,0)
	MADD t11,t11,a4,b4
	MADD t21,t21,a5,b4
	gsLQC1(R9,F9,F8,0)
	MADD t12,t12,a4,b5
	MADD t22,t22,a5,b5
	gsLQC1(R9,F11,F10,1)
	MADD t13,t13,a4,b6
	MADD t23,t23,a5,b6
	MADD t14,t14,a4,b7
	MADD t24,t24,a5,b7

.L28:						# kr=1
#ifndef TRMMKERNEL
	andi K,KCO,1
#else
	andi K,TEMP,1
#endif
	beqz K,.L29
	LD ALPHA,152($sp)			# (delay slot) Get ALPHA
	MADD t11,t11,a0,b0
	MADD t21,t21,a1,b0
	daddu A,A,2*SIZE			# 2mr*kr
	daddu B,B,4*SIZE			# 4nr*kr
	MADD t12,t12,a0,b1
	MADD t22,t22,a1,b1
	MADD t13,t13,a0,b2
	MADD t23,t23,a1,b2
	MADD t14,t14,a0,b3
	MADD t24,t24,a1,b3

.L29:						# Write Back to C
#ifndef TRMMKERNEL
	LD c11,0(CO1)				# GEMM write back part
	LD c21,1*SIZE(CO1)
	LD c12,0(CO2)
	LD c22,1*SIZE(CO2)
	LD c13,0(CO3)
	MADD t11,c11,t11,ALPHA
	LD c23,1*SIZE(CO3)
	MADD t21,c21,t21,ALPHA
	LD c14,0(CO4)
	MADD t12,c12,t12,ALPHA
	LD c24,1*SIZE(CO4)
	MADD t22,c22,t22,ALPHA
	ST t11,0(CO1)
	MADD t13,c13,t13,ALPHA
	ST t21,1*SIZE(CO1)
	MADD t23,c23,t23,ALPHA
	ST t12,0(CO2)
	MADD t14,c14,t14,ALPHA
	ST t22,1*SIZE(CO2)
	MADD t24,c24,t24,ALPHA
	ST t13,0(CO3)
	daddu CO1,CO1,2*SIZE			# COi += 2
	ST t23,1*SIZE(CO3)
	daddu CO2,CO2,2*SIZE
	ST t14,0(CO4)
	daddu CO3,CO3,2*SIZE
	ST t24,1*SIZE(CO4)
	daddu CO4,CO4,2*SIZE
	FETCH $0,0(CO1)
	FETCH $0,0(CO2)
	FETCH $0,0(CO3)
	FETCH $0,0(CO4)
#else
	MUL t11, ALPHA, t11			# TRMM write back part
	MUL t21, ALPHA, t21
	ST t11, 0 * SIZE(CO1)
	MUL t12, ALPHA, t12
	ST t21, 1 * SIZE(CO1)
	MUL t22, ALPHA, t22
	ST t12, 0 * SIZE(CO2)
	MUL t13, ALPHA, t13
	ST t22, 1 * SIZE(CO2)
	MUL t23, ALPHA, t23
	ST t13, 0 * SIZE(CO3)
	MUL t14, ALPHA, t14
	ST t23, 1 * SIZE(CO3)
	MUL t24, ALPHA, t24
	ST t14, 0 * SIZE(CO4)
	ST t24, 1 * SIZE(CO4)
	daddiu CO1,CO1, 2 * SIZE
	daddiu CO2,CO2, 2 * SIZE
	daddiu CO3,CO3, 2 * SIZE
	daddiu CO4,CO4, 2 * SIZE
	FETCH $0,0(CO1)
	FETCH $0,0(CO2)
	FETCH $0,0(CO3)
	FETCH $0,0(CO4)
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	dsubu TEMP,KCO,KK
#ifdef LEFT
	daddiu TEMP,TEMP,-2
#else
	daddiu TEMP,TEMP,-4
#endif
	dsll K,TEMP,1 + BASE_SHIFT
	dsll TEMP,TEMP,2 + BASE_SHIFT
	daddu A,A,K				# move A to next panel Ai
	daddu B,B,TEMP				# move B to next panel Bj
#endif
#ifdef LEFT
	daddiu KK, KK, 2
#endif
#endif

/* ---- nr = 4, mr = 1 remainder ---- */
	.align 3
.L14_M1:
	andi M,MCO,1				# mr=1
	beqz M,.L0_N4_Loop			# M = 0, finishing one panel Bj
	nop

.L30:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move B,BO				# Reset B
#else
	dsll K,KK, 0 + BASE_SHIFT
	dsll TEMP,KK,2 + BASE_SHIFT
	daddu A,A,K
	daddu B,BO,TEMP
#endif
	MTC $0,t11
	MOV t12,t11
	LD a0, 0 * SIZE(A)			# a0
	MOV t13,t11
	gsLQC1(R9,F9,F8,0)			# b0,b1
	MOV t14,t11				# clear result registers
	gsLQC1(R9,F11,F10,1)			# b2,b3
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu TEMP, KCO, KK
#elif defined(LEFT)
	daddiu TEMP, KK, 1
#else
	daddiu TEMP, KK, 4
#endif
	dsra K,TEMP, 2
	nop
	beqz K,.L35
	nop
#else
	move B,BO				# Reset B, GEMM part
	dsra K,KCO,2				# K = KCO / 4
	LD a0, 0 * SIZE(A)			# a0
	MTC $0,t11
	MOV t12,t11
	gsLQC1(R9,F9,F8,0)			# b0,b1
	MOV t13,t11
	MOV t14,t11
	gsLQC1(R9,F11,F10,1)			# b2,b3
	beqz K,.L35
	nop
#endif

.L31:						# nr=4,mr=1,kr=4
	LD a1, 1*SIZE(A)			# load a1
	MADD t11,t11,a0,b0
	gsLQC1(R9,F13,F12,2)			# b4,b5
	MADD t12,t12,a0,b1
	gsLQC1(R9,F15,F14,3)			# b6,b7
	MADD t13,t13,a0,b2
	MADD t14,t14,a0,b3
	LD a2, 2*SIZE(A)			# a2
	MADD t11,t11,a1,b4
	gsLQC1(R9,F9,F8,4)
	MADD t12,t12,a1,b5
	gsLQC1(R9,F11,F10,5)
	MADD t13,t13,a1,b6
	MADD t14,t14,a1,b7
	daddiu K,K,-1
	LD a3, 3*SIZE(A)			# a3
	MADD t11,t11,a2,b0
	gsLQC1(R9,F13,F12,6)
	MADD t12,t12,a2,b1
	daddu A,A,4*SIZE			# 1mr*4kr
	gsLQC1(R9,F15,F14,7)
	MADD t13,t13,a2,b2
	MADD t14,t14,a2,b3
	daddu B,B,16*SIZE			# 4nr*4kr
	LD a0, 0*SIZE(A)			# a0
	MADD t11,t11,a3,b4
	gsLQC1(R9,F9,F8,0)
	MADD t12,t12,a3,b5
	gsLQC1(R9,F11,F10,1)
	MADD t13,t13,a3,b6
	bnez K,.L31
	MADD t14,t14,a3,b7			# (delay slot)

.L35:						# kr=2
#ifndef TRMMKERNEL
	andi K,KCO,2
#else
	andi K,TEMP,2
#endif
	beqz K,.L38
	nop

.L36:
	LD a1,1*SIZE(A)				# load a1
	MADD t11,t11,a0,b0
	gsLQC1(R9,F13,F12,2)
	MADD t12,t12,a0,b1
	daddu A,A,2*SIZE			# mr*2kr
	gsLQC1(R9,F15,F14,3)
	MADD t13,t13,a0,b2
	MADD t14,t14,a0,b3
	daddu B,B,8*SIZE			# 4nr*2kr

.L37:
	LD a0,0(A)
	MADD t11,t11,a1,b4
	gsLQC1(R9,F9,F8,0)
	MADD t12,t12,a1,b5
	gsLQC1(R9,F11,F10,1)
	MADD t13,t13,a1,b6
	MADD t14,t14,a1,b7

.L38:						# kr=1
#ifndef TRMMKERNEL
	andi K,KCO,1
#else
	andi K,TEMP,1
#endif
	beqz K,.L39
	LD ALPHA,152($sp)			# (delay slot) Get ALPHA
	MADD t11,t11,a0,b0
	MADD t12,t12,a0,b1
	daddu A,A,1*SIZE
	daddu B,B,4*SIZE
	MADD t13,t13,a0,b2
	MADD t14,t14,a0,b3

.L39:						# Write Back
#ifndef TRMMKERNEL
	LD c11,0(CO1)
	LD c12,0(CO2)
	LD c13,0(CO3)
	LD c14,0(CO4)
	MADD t11,c11,t11,ALPHA
	MADD t12,c12,t12,ALPHA
	MADD t13,c13,t13,ALPHA
	MADD t14,c14,t14,ALPHA
	ST t11,0(CO1)
	ST t12,0(CO2)
	ST t13,0(CO3)
	ST t14,0(CO4)
#else
	MUL t11, ALPHA, t11
	MUL t12, ALPHA, t12
	MUL t13, ALPHA, t13
	MUL t14, ALPHA, t14
	ST t11, 0 * SIZE(CO1)
	ST t12, 0 * SIZE(CO2)
	ST t13, 0 * SIZE(CO3)
	ST t14, 0 * SIZE(CO4)
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	dsubu TEMP, KCO, KK
#ifdef LEFT
	daddiu TEMP, TEMP, -1
#else
	daddiu TEMP, TEMP, -4
#endif
	dsll K,TEMP, 0 + BASE_SHIFT
	dsll TEMP,TEMP, 2 + BASE_SHIFT
	daddu A,A,K
	daddu B,B,TEMP
#endif
#ifdef LEFT
	daddiu KK, KK, 1
#endif
#endif

	.align 3
.L0_N4_Loop:					# mc finished
	daddiu N,N,-1				# N--
#if defined(TRMMKERNEL) && !defined(LEFT)
	daddiu KK, KK,4
#endif
	bnez N,.L0_N4_Lb
	move BO,B				# (delay slot) Set BO point to next panel Bj

/* ------------------------------------------------------------------ */
/* nr = 2 remainder of N                                               */
/* ------------------------------------------------------------------ */
	.align 5
.L0_N2:
	andi N,NCO,2				# nr = 2
	beqz N,.L0_N1
	nop

.L0_N2_Lb:
	move CO1,C
	daddu CO2,C,LDC
	dsra M,MCO,2
	move A,AO				# Reset A
	daddu PREA,AO,SPANA
	daddu C,CO2,LDC
#if defined(TRMMKERNEL) && defined(LEFT)
	move KK, OFFSET
#endif
	beqz M,.L12_M2
	nop

.L40:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move B,BO				# Reset B
#else
	dsll K,KK, 2 + BASE_SHIFT
	dsll TEMP, KK,1 + BASE_SHIFT
	daddu A,A,K
	daddu B,BO,TEMP
#endif
	MTC $0,t11
	MOV t21,t11
	gsLQC1(R8,F1,F0,0)			# a0,a1
	MOV t31,t11
	MOV t41,t11
	gsLQC1(R9,F9,F8,0)			# b0,b1
	MOV t12,t11
	MOV t22,t11
	gsLQC1(R8,F3,F2,1)			# a2,a3
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu TEMP,KCO,KK
#elif defined(LEFT)
	daddiu TEMP, KK, 4
#else
	daddiu TEMP, KK, 2
#endif
	dsra K,TEMP,2
	MOV t32,t11
	beqz K,.L45
	MOV t42,t11				# (delay slot)
#else
	move B,BO				# Reset B
	MTC $0,t11				# gemm part
	gsLQC1(R8,F1,F0,0)			# a0,a1
	MOV t21,t11
	MOV t31,t11
	gsLQC1(R9,F9,F8,0)			# b0,b1
	MOV t41,t11
	dsra K,KCO,2				# K = KCO / 4
	gsLQC1(R8,F3,F2,1)			# a2,a3
	MOV t12,t11
	MOV t22,t11
	MOV t32,t11
	beqz K,.L45
	MOV t42,t11				# (delay slot)
#endif

.L41:						# nr=2,mr=kr=4
	gsLQC1(R8,F5,F4,2)
	MADD t11,t11,a0,b0
	MADD t21,t21,a1,b0
	gsLQC1(R9,F13,F12,1)
	MADD t12,t12,a0,b1
	MADD t22,t22,a1,b1
	gsLQC1(R8,F7,F6,3)
	MADD t31,t31,a2,b0
	MADD t41,t41,a3,b0
	FETCH $0,(PREA)
	MADD t32,t32,a2,b1
	MADD t42,t42,a3,b1

.L42:
	gsLQC1(R8,F1,F0,4)
	MADD t11,t11,a4,b4
	MADD t21,t21,a5,b4
	gsLQC1(R9,F11,F10,2)
	MADD t12,t12,a4,b5
	MADD t22,t22,a5,b5
	gsLQC1(R8,F3,F2,5)
	MADD t31,t31,a6,b4
	MADD t41,t41,a7,b4
	FETCH $0,4*SIZE(PREA)
	MADD t32,t32,a6,b5
	MADD t42,t42,a7,b5

.L43:
	gsLQC1(R8,F5,F4,6)
	MADD t11,t11,a0,b2
	MADD t21,t21,a1,b2
	gsLQC1(R9,F15,F14,3)
	MADD t12,t12,a0,b3
	MADD t22,t22,a1,b3
	gsLQC1(R8,F7,F6,7)
	MADD t31,t31,a2,b2
	MADD t41,t41,a3,b2
	daddu B,B,8*SIZE			# 2nr*4kr
	FETCH $0,8*SIZE(PREA)
	MADD t32,t32,a2,b3
	MADD t42,t42,a3,b3
	daddu A,A,16*SIZE			# 4mr*4kr

.L44:
	gsLQC1(R8,F1,F0,0)
	MADD t11,t11,a4,b6
	MADD t21,t21,a5,b6
	daddiu K,K,-1
	gsLQC1(R9,F9,F8,0)
	MADD t12,t12,a4,b7
	MADD t22,t22,a5,b7
	daddu PREA,PREA,16*SIZE
	gsLQC1(R8,F3,F2,1)
	MADD t31,t31,a6,b6
	MADD t41,t41,a7,b6
	FETCH $0,-4*SIZE(PREA)
	MADD t32,t32,a6,b7
	bnez K,.L41
	MADD t42,t42,a7,b7			# (delay slot)

.L45:						# kr=2
#ifndef TRMMKERNEL
	andi K,KCO,2
#else
	andi K,TEMP,2
#endif
	beqz K,.L48
	nop

.L46:
	gsLQC1(R8,F5,F4,2)
	MADD t11,t11,a0,b0
	MADD t21,t21,a1,b0
	gsLQC1(R9,F13,F12,1)
	MADD t12,t12,a0,b1
	MADD t22,t22,a1,b1
	gsLQC1(R8,F7,F6,3)
	MADD t31,t31,a2,b0
	MADD t41,t41,a3,b0
	daddu B,B,4*SIZE			# B += 2(nr)*2(kr) elements
	FETCH $0,0(PREA)
	MADD t32,t32,a2,b1
	MADD t42,t42,a3,b1
	daddu A,A,8*SIZE			# A += 4(mr)*2(kr) elements

.L47:
	gsLQC1(R8,F1,F0,0)
	MADD t11,t11,a4,b4
	MADD t21,t21,a5,b4
	gsLQC1(R9,F9,F8,0)
	MADD t12,t12,a4,b5
	MADD t22,t22,a5,b5
	gsLQC1(R8,F3,F2,1)
	MADD t31,t31,a6,b4
	MADD t41,t41,a7,b4
	FETCH $0,4*SIZE(PREA)
	MADD t32,t32,a6,b5
	MADD t42,t42,a7,b5
	daddu PREA,PREA,8*SIZE

.L48:						# kr=1
#ifndef TRMMKERNEL
	andi K,KCO,1
#else
	andi K,TEMP,1
#endif
	beqz K,.L49
	LD ALPHA,152($sp)			# (delay slot) Get ALPHA
	FETCH $0,0(PREA)
	MADD t11,t11,a0,b0
	MADD t21,t21,a1,b0
	daddu A,A,4*SIZE			# A += 4(mr)*1(kr) elements
	MADD t12,t12,a0,b1
	MADD t22,t22,a1,b1
	daddu B,B,2*SIZE
	daddu PREA,PREA,4*SIZE
	MADD t31,t31,a2,b0
	MADD t41,t41,a3,b0
	MADD t32,t32,a2,b1
	MADD t42,t42,a3,b1

.L49:						# Write Back
#ifndef TRMMKERNEL
	LD c11,0(CO1)				# gemm write back part, fetch 8 C
	LD c21,1*SIZE(CO1)
	LD c31,2*SIZE(CO1)
	LD c41,3*SIZE(CO1)
	LD c12,0(CO2)
	MADD t11,c11,t11,ALPHA
	LD c22,1*SIZE(CO2)
	MADD t21,c21,t21,ALPHA
	LD c32,2*SIZE(CO2)
	MADD t31,c31,t31,ALPHA
	LD c42,3*SIZE(CO2)
	MADD t41,c41,t41,ALPHA
	ST t11,0(CO1)
	MADD t12,c12,t12,ALPHA
	ST t21,1*SIZE(CO1)
	MADD t22,c22,t22,ALPHA
	ST t31,2*SIZE(CO1)
	MADD t32,c32,t32,ALPHA
	ST t41,3*SIZE(CO1)
	MADD t42,c42,t42,ALPHA
	daddiu M,M,-1
	ST t12,0(CO2)
	ST t22,1*SIZE(CO2)
	ST t32,2*SIZE(CO2)
	ST t42,3*SIZE(CO2)
	FETCH $0,4*SIZE(CO1)
	FETCH $0,4*SIZE(CO2)
	FETCH $0,8*SIZE(CO1)
	FETCH $0,8*SIZE(CO2)
	daddu CO1,CO1,4*SIZE
	bnez M,.L40
	daddu CO2,CO2,4*SIZE			# (delay slot)
#else
	MUL t11, ALPHA, t11
	MUL t21, ALPHA, t21
	MUL t31, ALPHA, t31
	MUL t41, ALPHA, t41
	MUL t12, ALPHA, t12
	ST t11, 0 * SIZE(CO1)
	MUL t22, ALPHA, t22
	ST t21, 1 * SIZE(CO1)
	MUL t32, ALPHA, t32
	ST t31, 2 * SIZE(CO1)
	MUL t42, ALPHA, t42
	ST t41, 3 * SIZE(CO1)
	ST t12, 0 * SIZE(CO2)
	daddiu M,M,-1
	ST t22, 1 * SIZE(CO2)
	ST t32, 2 * SIZE(CO2)
	ST t42, 3 * SIZE(CO2)
	daddiu CO1,CO1, 4*SIZE
	daddiu CO2,CO2, 4*SIZE
	FETCH $0,0(CO1)
	FETCH $0,0(CO2)
	FETCH $0,4(CO1)
	FETCH $0,4(CO2)
#if ( defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
	dsubu TEMP, KCO, KK
#ifdef LEFT
	daddiu TEMP, TEMP, -4
#else
	daddiu TEMP, TEMP, -2
#endif
	dsll K,TEMP, 2 + BASE_SHIFT
	dsll TEMP, TEMP, 1 + BASE_SHIFT
	daddu A,A,K
	daddu B,B,TEMP
#endif
#ifdef LEFT
	daddiu KK, KK, 4
#endif
	bnez M,.L40
	nop
#endif

/* ---- nr = 2, mr = 2 remainder ---- */
	.align 3
.L12_M2:
	andi M,MCO,2				# mr = 2
	beqz M,.L12_M1
	nop

.L50:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move B,BO
#else
	dsll K, KK, 1 + BASE_SHIFT		#mr=2
	dsll TEMP, KK, 1 + BASE_SHIFT		#nr=2
	daddu A, A, K
	daddu B, BO, TEMP
#endif
	MTC $0,t11
	gsLQC1(R8,F1,F0,0)			#a0,a1
	MOV t21,t11
	gsLQC1(R9,F9,F8,0)			#b0,b1
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu TEMP, KCO, KK
#elif defined(LEFT)
	daddiu TEMP, KK, 2
#else
	daddiu TEMP, KK, 2
#endif
	dsra K,TEMP,2
	MOV t12,t11
	beqz K,.L55
	MOV t22,t11				# (delay slot)
#else
	move B,BO
	dsra K,KCO,2				# K = KCO / 4
	gsLQC1(R8,F1,F0,0)			#a0,a1
	MTC $0,t11
	MOV t21,t11
	gsLQC1(R9,F9,F8,0)			#b0,b1
	MOV t12,t11
	beqz K,.L55
	MOV t22,t11				# (delay slot)
#endif

.L51:						# nr=2 mr=2,kr=4
	gsLQC1(R8,F5,F4,1)
	MADD t11,t11,a0,b0
	MADD t21,t21,a1,b0
	gsLQC1(R9,F13,F12,1)
	MADD t12,t12,a0,b1
	MADD t22,t22,a1,b1
	gsLQC1(R8,F3,F2,2)
	MADD t11,t11,a4,b4
	MADD t21,t21,a5,b4
	gsLQC1(R9,F11,F10,2)
	MADD t12,t12,a4,b5
	MADD t22,t22,a5,b5
	daddiu K,K,-1
	gsLQC1(R8,F7,F6,3)
	MADD t11,t11,a2,b2
	MADD t21,t21,a3,b2
	daddu A,A,8*SIZE			# A += 2(mr)*4(kr) elements
	gsLQC1(R9,F15,F14,3)
	MADD t12,t12,a2,b3
	MADD t22,t22,a3,b3
	daddu B,B,8*SIZE			# B += 2(nr)*4(kr) elements
	gsLQC1(R8,F1,F0,0)
	MADD t11,t11,a6,b6
	MADD t21,t21,a7,b6
	gsLQC1(R9,F9,F8,0)
	MADD t12,t12,a6,b7
	bnez K,.L51
	MADD t22,t22,a7,b7			# (delay slot)

.L55:						# kr=2
#ifndef TRMMKERNEL
	andi K,KCO,2
#else
	andi K,TEMP,2
#endif
	beqz K,.L58
	nop

.L56:
	gsLQC1(R8,F5,F4,1)
	MADD t11,t11,a0,b0
	MADD t21,t21,a1,b0
	daddu A,A,4*SIZE			# A += 2(mr)*2(kr) elements
	gsLQC1(R9,F13,F12,1)
	MADD t12,t12,a0,b1
	MADD t22,t22,a1,b1
	daddu B,B,4*SIZE			# 2nr*2kr

.L57:
	gsLQC1(R8,F1,F0,0)
	MADD t11,t11,a4,b4
	MADD t21,t21,a5,b4
	gsLQC1(R9,F9,F8,0)
	MADD t12,t12,a4,b5
	MADD t22,t22,a5,b5

.L58:						# kr=1
#ifndef TRMMKERNEL
	andi K,KCO,1
#else
	andi K,TEMP, 1
#endif
	beqz K,.L59
	LD ALPHA,152($sp)			# (delay slot) Get ALPHA
	MADD t11,t11,a0,b0
	MADD t21,t21,a1,b0
	daddu A,A,2*SIZE			# A += 2(mr)*1(kr) elements
	daddu B,B,2*SIZE			# 2nr*kr
	MADD t12,t12,a0,b1
	MADD t22,t22,a1,b1

.L59:						# Write Back
#ifndef TRMMKERNEL
	LD c11,0(CO1)				# write gemm part back, fetch 4 C
	LD c21,1*SIZE(CO1)
	LD c12,0(CO2)
	LD c22,1*SIZE(CO2)
	MADD t11,c11,t11,ALPHA
	MADD t21,c21,t21,ALPHA
	MADD t12,c12,t12,ALPHA
	MADD t22,c22,t22,ALPHA
	ST t11,0(CO1)
	ST t21,1*SIZE(CO1)
	ST t12,0(CO2)
	ST t22,1*SIZE(CO2)
	daddu CO1,CO1,2*SIZE
	daddu CO2,CO2,2*SIZE
	FETCH $0,0(CO1)
	FETCH $0,0(CO2)
#else
	daddiu M, M, -1
	daddiu CO1,CO1, 2 * SIZE
	daddiu CO2,CO2, 2 * SIZE
	MUL t11, ALPHA, t11
	MUL t21, ALPHA, t21
	MUL t12, ALPHA, t12
	MUL t22, ALPHA, t22
	ST t11, -2 * SIZE(CO1)
	ST t21, -1 * SIZE(CO1)
	ST t12, -2 * SIZE(CO2)
	ST t22, -1 * SIZE(CO2)
	FETCH $0,0(CO1)
	FETCH $0,0(CO2)
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	dsubu TEMP, KCO, KK
#ifdef LEFT
	daddiu TEMP, TEMP, -2
#else
	daddiu TEMP, TEMP, -2
#endif
	dsll K, TEMP, 1 + BASE_SHIFT
	dsll TEMP, TEMP, 1 + BASE_SHIFT
	daddu A, A, K
	daddu B, B, TEMP
#endif
#ifdef LEFT
	daddiu KK, KK, 2
#endif
#endif

/* ---- nr = 2, mr = 1 remainder ---- */
	.align 3
.L12_M1:
	andi M,MCO,1				# mr = 1
	beqz M,.L0_N2_Loop
	nop

.L60:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move B,BO				# Reset B
#else
	dsll K, KK, 0 + BASE_SHIFT
	dsll TEMP, KK, 1 + BASE_SHIFT
	daddu A, A, K
	daddu B, BO, TEMP
#endif
	MTC $0,t11
	LD a0, 0*SIZE(A)			# a0
	MOV t21,t11
	gsLQC1(R9,F9,F8,0)			# b0,b1
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu TEMP, KCO, KK
#elif defined(LEFT)
	daddiu TEMP, KK, 1
#else
	daddiu TEMP, KK, 2
#endif
	dsra K,TEMP,2
	MOV t12,t11
	beqz K,.L65
	MOV t22,t11				# (delay slot)
#else
	dsra K,KCO,2
	move B,BO				# Reset B
	LD a0,0*SIZE(A)
	MTC $0,t11
	MOV t21,t11
	gsLQC1(R9,F9,F8,0)
	MOV t12,t11
	beqz K,.L65
	MOV t22,t11				# (delay slot)
#endif

/* In this path the A elements for the four k steps live in
 * a0, a4, a2, a6 (in that order). */
.L61:						# nr=2,mr=1,kr=4
	LD a4, 1*SIZE(A)			# A element for k+1
	MADD t11,t11,a0,b0
	gsLQC1(R9,F13,F12,1)
	MADD t12,t12,a0,b1
	LD a2, 2*SIZE(A)			# A element for k+2
	MADD t11,t11,a4,b4
	gsLQC1(R9,F11,F10,2)
	MADD t12,t12,a4,b5
	LD a6, 3*SIZE(A)			# A element for k+3
	MADD t11,t11,a2,b2
	daddiu K,K,-1
	gsLQC1(R9,F15,F14,3)
	MADD t12,t12,a2,b3
	daddu A,A,4*SIZE			# A += 1(mr)*4(kr) elements
	LD a0, 0*SIZE(A)
	MADD t11,t11,a6,b6
	daddu B,B,8*SIZE			# B += 2(nr)*4(kr) elements
	gsLQC1(R9,F9,F8,0)			# b0,b1
	bnez K,.L61
	MADD t12,t12,a6,b7			# (delay slot)

.L65:						# kr=2
#ifndef TRMMKERNEL
	andi K,KCO,2
#else
	andi K,TEMP,2
#endif
	beqz K,.L68
	nop

.L66:
	LD a4, 1*SIZE(A)			# A element for k+1
	MADD t11,t11,a0,b0
	daddu A,A,2*SIZE			# A += 1(mr)*2(kr) elements
	gsLQC1(R9,F13,F12,1)
	MADD t12,t12,a0,b1
	daddu B,B,4*SIZE

.L67:
	LD a0,0(A)				# a0
	MADD t11,t11,a4,b4
	gsLQC1(R9,F9,F8,0)
	MADD t12,t12,a4,b5

.L68:						# kr=1
#ifndef TRMMKERNEL
	andi K,KCO,1
#else
	andi K,TEMP,1
#endif
	beqz K,.L69
	LD ALPHA,152($sp)			# (delay slot) Get ALPHA
	MADD t11,t11,a0,b0
	MADD t12,t12,a0,b1
	daddu A,A,1*SIZE			# A += 1(mr)*1(kr) element
	daddu B,B,2*SIZE

.L69:						# Write Back
#ifndef TRMMKERNEL
	LD c11,0(CO1)				# fetch 2 C
	LD c12,0(CO2)
	MADD t11,c11,t11,ALPHA
	MADD t12,c12,t12,ALPHA
	ST t11,0(CO1)
	ST t12,0(CO2)
	daddu CO1,CO1,1*SIZE
	daddu CO2,CO2,1*SIZE
#else
	MUL t11, ALPHA, t11
	MUL t12, ALPHA, t12
	ST t11, 0 * SIZE(CO1)
	ST t12, 0 * SIZE(CO2)
	daddu CO1,CO1,1*SIZE
	daddu CO2,CO2,1*SIZE
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	dsubu TEMP, KCO, KK
#ifdef LEFT
	daddiu TEMP, TEMP, -1
#else
	daddiu TEMP, TEMP, -2
#endif
	dsll K, TEMP, 0 + BASE_SHIFT
	dsll TEMP, TEMP, 1 + BASE_SHIFT
	daddu A, A, K
	daddu B, B, TEMP
#endif
#ifdef LEFT
	daddiu KK, KK, 1
#endif
#endif

.L0_N2_Loop:
#if defined(TRMMKERNEL) && !defined(LEFT)
	daddiu KK, KK, 2
#endif
	move BO, B

/* ------------------------------------------------------------------ */
/* nr = 1 remainder of N                                               */
/* ------------------------------------------------------------------ */
	.align 5
.L0_N1:
	andi N,NCO,1				# nr = 1
	beqz N,.L999
	nop
	move CO1,C
	dsra M,MCO,2
	move A,AO				# Reset A
	daddu PREA,AO,SPANA
#if defined(TRMMKERNEL) && defined(LEFT)
	move KK, OFFSET
#endif
	beqz M,.L11_M2
	daddu C,CO1,LDC				# (delay slot)

.L70:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move B, BO				# Reset B
#else
	dsll K, KK, 2 + BASE_SHIFT
	dsll TEMP, KK, 0 + BASE_SHIFT
	daddu A, A, K
	daddu B, BO, TEMP
#endif
	MTC $0,t11
	LD b0, 0*SIZE(B)
	MOV t21,t11
	gsLQC1(R8,F1,F0,0)			#a0,a1
	MOV t31,t11
	gsLQC1(R8,F3,F2,1)			#a2,a3
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu TEMP, KCO, KK
#elif defined(LEFT)
	daddiu TEMP, KK, 4
#else
	daddiu TEMP, KK, 1
#endif
	dsra K,TEMP,2
	MOV t41,t11
	beqz K,.L75
	nop
#else
	move B, BO				# Reset B
	dsra K,KCO,2
	LD b0, 0*SIZE(B)
	MTC $0,t11
	MOV t21,t11
	gsLQC1(R8,F1,F0,0)			#a0,a1
	MOV t31,t11
	MOV t41,t11
	gsLQC1(R8,F3,F2,1)			#a2,a3
	beqz K,.L75
	nop
#endif

/* In the nr = 1 paths the B elements for the four k steps live in
 * b0, b4, b2, b6 (in that order). */
.L71:						# nr=1,mr=kr=4
	LD b4, 1*SIZE(B)			# B element for k+1
	MADD t11,t11,a0,b0
	gsLQC1(R8,F5,F4,2)
	MADD t21,t21,a1,b0
	gsLQC1(R8,F7,F6,3)
	FETCH $0,(PREA)
	MADD t31,t31,a2,b0
	MADD t41,t41,a3,b0

.L72:
	LD b2, 2*SIZE(B)			# B element for k+2
	MADD t11,t11,a4,b4
	gsLQC1(R8,F1,F0,4)
	MADD t21,t21,a5,b4
	gsLQC1(R8,F3,F2,5)
	FETCH $0,4*SIZE(PREA)
	MADD t31,t31,a6,b4
	MADD t41,t41,a7,b4

.L73:
	LD b6, 3*SIZE(B)
	MADD t11,t11,a0,b2
	daddu B,B,4*SIZE			# B += 1(nr)*4(kr) elements
	gsLQC1(R8,F5,F4,6)
	MADD t21,t21,a1,b2
	FETCH $0,8*SIZE(PREA)
	gsLQC1(R8,F7,F6,7)
	MADD t31,t31,a2,b2
	MADD t41,t41,a3,b2
	daddu A,A,16*SIZE			# A += 4(mr)*4(kr) elements

.L74:
	LD b0, 0*SIZE(B)
	MADD t11,t11,a4,b6
	daddu PREA,PREA,16*SIZE
	gsLQC1(R8,F1,F0,0)
	MADD t21,t21,a5,b6
	daddiu K,K,-1
	FETCH $0,-32(PREA)
	gsLQC1(R8,F3,F2,1)
	MADD t31,t31,a6,b6
	bnez K,.L71
	MADD t41,t41,a7,b6			# (delay slot)

.L75:						# kr=2
#ifndef TRMMKERNEL
	andi K,KCO,2
#else
	andi K,TEMP,2
#endif
	beqz K,.L78
	nop

.L76:
	LD b4, 1*SIZE(B)
	MADD t11,t11,a0,b0
	daddu B,B,2*SIZE			# B += 1(nr)*2(kr) elements
	gsLQC1(R8,F5,F4,2)
	MADD t21,t21,a1,b0
	FETCH $0,0(PREA)
	gsLQC1(R8,F7,F6,3)
	MADD t31,t31,a2,b0
	MADD t41,t41,a3,b0
	daddu A,A,8*SIZE			# A += 4(mr)*2(kr) elements

.L77:
	LD b0,0(B)
	MADD t11,t11,a4,b4
	gsLQC1(R8,F1,F0,0)
	MADD t21,t21,a5,b4
	FETCH $0,4*SIZE(PREA)
	gsLQC1(R8,F3,F2,1)
	MADD t31,t31,a6,b4
	MADD t41,t41,a7,b4
	daddu PREA,PREA,8*SIZE

.L78:						# kr=1
#ifndef TRMMKERNEL
	andi K,KCO,1
#else
	andi K,TEMP,1
#endif
	beqz K,.L79
	LD ALPHA,152($sp)			# (delay slot) Get ALPHA
	FETCH $0,0(PREA)
	MADD t11,t11,a0,b0
	MADD t21,t21,a1,b0
	daddu A,A,4*SIZE			# A += 4(mr)*1(kr) elements
	MADD t31,t31,a2,b0
	MADD t41,t41,a3,b0
	daddu B,B,1*SIZE
	daddu PREA,PREA,4*SIZE

.L79:						# Write Back
#ifndef TRMMKERNEL
	LD c11,0(CO1)				# fetch 4 C
	LD c21,1*SIZE(CO1)
	LD c31,2*SIZE(CO1)
	LD c41,3*SIZE(CO1)
	MADD t11,c11,t11,ALPHA
	MADD t21,c21,t21,ALPHA
	MADD t31,c31,t31,ALPHA
	MADD t41,c41,t41,ALPHA
	ST t11,0(CO1)
	ST t21,1*SIZE(CO1)
	ST t31,2*SIZE(CO1)
	ST t41,3*SIZE(CO1)
	daddiu M,M,-1				# M--
	FETCH $0,4*SIZE(CO1)
	FETCH $0,8*SIZE(CO1)
	bnez M,.L70				# M!=0
	daddu CO1,CO1,4*SIZE			# (delay slot) COx += 4 elements
#else
	daddiu M,M,-1				# M--
	MUL t11, ALPHA, t11
	MUL t21, ALPHA, t21
	MUL t31, ALPHA, t31
	MUL t41, ALPHA, t41
	ST t11,0(CO1)
	ST t21,1*SIZE(CO1)
	ST t31,2*SIZE(CO1)
	ST t41,3*SIZE(CO1)
	FETCH $0,4*SIZE(CO1)
	FETCH $0,8*SIZE(CO1)
	daddu CO1,CO1,4*SIZE
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	dsubu TEMP, KCO, KK
#ifdef LEFT
	daddiu TEMP, TEMP, -4
#else
	daddiu TEMP, TEMP, -1
#endif
	dsll K, TEMP, 2 + BASE_SHIFT
	dsll TEMP, TEMP, 0 + BASE_SHIFT
	daddu A, A,K
	daddu B, B, TEMP
#endif
#ifdef LEFT
	daddiu KK, KK, 4
#endif
	bnez M,.L70
	nop
#endif

/* ---- nr = 1, mr = 2 remainder ---- */
	.align 3
.L11_M2:
	andi M,MCO,2				# mr = 2
	beqz M,.L11_M1
	nop

.L80:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move B, BO
#else
	dsll K, KK, 1 + BASE_SHIFT
	dsll TEMP, KK, 0 + BASE_SHIFT
	daddu A, A, K
	daddu B, BO, TEMP
#endif
	LD b0, 0*SIZE(B)
	MTC $0,t11
	gsLQC1(R8,F1,F0,0)			#a0,a1
	MOV t21,t11
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu TEMP, KCO, KK
#elif defined(LEFT)
	daddiu TEMP, KK, 2
#else
	daddiu TEMP, KK, 1
#endif
	dsra K,TEMP,2				# K = TEMP / 4
	beqz K,.L85
	nop
#else
	move B, BO
	dsra K,KCO,2
	LD b0, 0*SIZE(B)
	MTC $0,t11
	MOV t21,t11
	gsLQC1(R8,F1,F0,0)			#a0,a1
	beqz K,.L85
	nop
#endif

.L81:						# nr=1,mr=2,kr=4
	LD b4, 1*SIZE(B)
	gsLQC1(R8,F5,F4,1)
	MADD t11,t11,a0,b0
	MADD t21,t21,a1,b0
	LD b2, 2*SIZE(B)
	gsLQC1(R8,F3,F2,2)
	MADD t11,t11,a4,b4
	MADD t21,t21,a5,b4
	LD b6, 3*SIZE(B)
	gsLQC1(R8,F7,F6,3)
	MADD t11,t11,a2,b2
	MADD t21,t21,a3,b2
	daddu A,A,8*SIZE			# A += 2(mr)*4(kr) elements
	daddu B,B,4*SIZE			# B += 1(nr)*4(kr) elements
	LD b0, 0*SIZE(B)
	gsLQC1(R8,F1,F0,0)
	MADD t11,t11,a6,b6
	MADD t21,t21,a7,b6
	daddiu K,K,-1
	bnez K,.L81
	nop

.L85:						# kr=2
#ifndef TRMMKERNEL
	andi K,KCO,2
#else
	andi K,TEMP,2
#endif
	beqz K,.L88
	nop

.L86:
	gsLQC1(R8,F5,F4,1)
	LD b4, 1*SIZE(B)
	MADD t11,t11,a0,b0
	MADD t21,t21,a1,b0
	daddu A,A,4*SIZE			# A += 2(mr)*2(kr) elements
	daddu B,B,2*SIZE			# B += 1(nr)*2(kr) elements
	gsLQC1(R8,F1,F0,0)
	LD b0,0(B)
	MADD t11,t11,a4,b4
	MADD t21,t21,a5,b4

.L88:						# kr=1
#ifndef TRMMKERNEL
	andi K,KCO,1
#else
	andi K,TEMP,1
#endif
	beqz K,.L89
	LD ALPHA,152($sp)			# (delay slot) Get ALPHA
	MADD t11,t11,a0,b0
	MADD t21,t21,a1,b0
	daddu A,A,2*SIZE			# A += 2(mr)*1(kr) elements
	daddu B,B,1*SIZE

.L89:						# Write Back
#ifndef TRMMKERNEL
	LD c11,0(CO1)				# fetch 2 C
	LD c21,1*SIZE(CO1)
	MADD t11,c11,t11,ALPHA
	MADD t21,c21,t21,ALPHA
	ST t11,0(CO1)
	ST t21,1*SIZE(CO1)
	FETCH $0,2*SIZE(CO1)
	daddu CO1,CO1,2*SIZE			# COx += 2 elements
#else
	daddu CO1,CO1,2*SIZE			# COx += 2 elements
	MUL t11, ALPHA, t11
	MUL t21, ALPHA, t21
	FETCH $0,0(CO1)
	ST t11, -2 * SIZE(CO1)
	ST t21, -1 * SIZE(CO1)
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	dsubu TEMP, KCO, KK
#ifdef LEFT
	daddiu TEMP, TEMP, -2
#else
	daddiu TEMP, TEMP, -1
#endif
	dsll K, TEMP, 1 + BASE_SHIFT
	dsll TEMP, TEMP, 0 + BASE_SHIFT
	daddu A, A, K
	daddu B, B, TEMP
#endif
#ifdef LEFT
	daddiu KK, KK, 2
#endif
#endif

/* ---- nr = 1, mr = 1 remainder ---- */
	.align 3
.L11_M1:
	andi M,MCO,1				# mr = 1
	beqz M,.L999
	nop

.L90:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move B, BO
#else
	dsll K, KK, 0 + BASE_SHIFT
	dsll TEMP, KK, 0 + BASE_SHIFT
	daddu A, A, K
	daddu B, BO, TEMP
#endif
	LD a0, 0*SIZE(A)
	LD b0, 0*SIZE(B)
	MTC $0,t11
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu TEMP, KCO, KK
#elif defined(LEFT)
	daddiu TEMP, KK, 1
#else
	daddiu TEMP, KK, 1
#endif
	dsra K, TEMP, 2
	beqz K,.L95
	nop
#else
	move B, BO
	LD a0, 0*SIZE(A)
	LD b0, 0*SIZE(B)
	dsra K,KCO,2
	beqz K,.L95
	MTC $0,t11				# (delay slot) clear the single accumulator
#endif

.L91:						# nr=mr=1,kr=4
	LD a4, 1*SIZE(A)
	LD b4, 1*SIZE(B)
	MADD t11,t11,a0,b0
	LD a2, 2*SIZE(A)
	LD b2, 2*SIZE(B)
	MADD t11,t11,a4,b4
	LD a6, 3*SIZE(A)
	LD b6, 3*SIZE(B)
	MADD t11,t11,a2,b2
	daddu A,A,4*SIZE			# A += 1(mr)*4(kr) elements
	daddu B,B,4*SIZE			# B += 1(nr)*4(kr) elements
	LD a0, 0*SIZE(A)
	LD b0, 0*SIZE(B)
	MADD t11,t11,a6,b6
	daddiu K,K,-1
	bnez K,.L91
	nop

.L95:						# kr=2
#ifndef TRMMKERNEL
	andi K,KCO,2
#else
	andi K,TEMP,2
#endif
	beqz K,.L98
	nop

.L96:
	LD a4, 1*SIZE(A)
	LD b4, 1*SIZE(B)
	MADD t11,t11,a0,b0
	daddu B,B,2*SIZE			# B += 1(nr)*2(kr) elements
	daddu A,A,2*SIZE			# A += 1(mr)*2(kr) elements
	LD b0,0(B)
	LD a0,0(A)
	MADD t11,t11,a4,b4

.L98:						# kr=1
#ifndef TRMMKERNEL
	andi K,KCO,1
#else
	andi K,TEMP,1
#endif
	beqz K,.L99
	LD ALPHA,152($sp)			# (delay slot) Get ALPHA
	MADD t11,t11,a0,b0

.L99:						# Write Back
#ifndef TRMMKERNEL
	LD c11,0(CO1)				# fetch 1 C
	MADD t11,c11,t11,ALPHA
	ST t11,0(CO1)
#else
	MUL t11, ALPHA, t11
	ST t11, 0 * SIZE(CO1)
#endif

/* Restore every callee-saved register spilled in the prologue. */
.L999:						# End
	ld $16, 0($sp)
	ld $17, 8($sp)
	ld $18, 16($sp)
	ld $19, 24($sp)
	ld $20, 32($sp)
	ld $21, 40($sp)
	ld $22, 48($sp)
	LD $f24, 56($sp)
	LD $f25, 64($sp)
	LD $f26, 72($sp)
	LD $f27, 80($sp)
	LD $f28, 88($sp)
	ld $23, 96($sp)
	ld $24, 104($sp)
	ld $25, 112($sp)
	LD $f20,120($sp)
	LD $f21,128($sp)
	LD $f22,136($sp)
	LD $f23,144($sp)
	LD $f29,160($sp)
	LD $f30,168($sp)
	LD $f31,176($sp)
	j $31
	daddiu $sp, $sp, 192			# (delay slot) release the frame

	EPILOGUE