Blame thirdparty/openblas/xianyi-OpenBLAS-e6e87a2/kernel/mips64/sgemm_kernel_loongson3b_4x4.S

kusano 2b45e8
#define REALNAME ASMNAME
kusano 2b45e8
#define ASSEMBLER
kusano 2b45e8
#include "common.h"
kusano 2b45e8
kusano 2b45e8
/*
 * FETCH is a plain doubleword load.  The kernel always issues it with $0 as
 * the destination (e.g. "FETCH $0,(PREB)"), so the loaded value is discarded;
 * presumably it is used purely to pull data toward the core ahead of use.
 * Because it is a real load rather than a prefetch hint, the address it
 * touches is actually accessed.
 */
#define FETCH	ld
kusano 2b45e8
/*
 * gsLQC1 / gsSQC1 hand-assemble one instruction word with .word:
 *   bits 31..26 = 0x32 (gsLQC1) or 0x3A (gsSQC1)
 *   base << 21, ft << 16, bit 15 set, offset << 6, bit 5 set, fq in the low bits.
 * The arguments are shifted and OR-ed without parentheses, so they must be
 * simple integer literals (see the numeric R and F register-index defines),
 * not "$n" register names or compound expressions.
 * NOTE(review): presumably the Loongson quad-word FP load/store pair - confirm
 * the encoding against the Loongson instruction manual.
 */
#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
kusano 2b45e8
#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
kusano 2b45e8
kusano 2b45e8
/*
 * Integer register roles.
 *
 * M, N, K hold the incoming dimensions on entry; the kernel copies them to
 * MCO/NCO/KCO and then reuses M, N, K as down-counters for the current loop.
 * A, B, C, LDC are the incoming matrix pointers and the C stride; A and B are
 * advanced while a tile is computed, AO/BO keep the panel base addresses.
 */
#define M	$4
kusano 2b45e8
#define	N	$5
kusano 2b45e8
#define	K	$6
kusano 2b45e8
#define A	$8
kusano 2b45e8
#define B	$9
kusano 2b45e8
#define C	$10
kusano 2b45e8
#define LDC	$11
kusano 2b45e8
kusano 2b45e8
/* AO / BO: saved base of the current A / B panel (A and B are reset from them). */
#define AO	$12
kusano 2b45e8
#define BO	$13
kusano 2b45e8
kusano 2b45e8
/* CO1..CO4: write pointers into four consecutive columns of C (CO1 = C, each next = previous + LDC). */
#define CO1	$14
kusano 2b45e8
#define CO2	$15
kusano 2b45e8
#define CO3	$16
kusano 2b45e8
#define CO4	$17
kusano 2b45e8
kusano 2b45e8
/* KCO / MCO / NCO: unmodified copies of the incoming K / M / N. */
#define KCO	$18
kusano 2b45e8
#define MCO	$19
kusano 2b45e8
#define NCO	$20
kusano 2b45e8
kusano 2b45e8
/*
 * SPANB / SPANA: byte distances derived from KCO that are added to BO / AO
 * to form PREB / PREA, the addresses touched by the FETCH loads.
 */
#define SPANB	$21
kusano 2b45e8
#define PREB	$23
kusano 2b45e8
#define PREA	$24
kusano 2b45e8
#define SPANA	$25
kusano 2b45e8
kusano 2b45e8
/* ALPHA shares $f15 with b7, so it is spilled to 152($sp) and reloaded before write-back. */
#define ALPHA	$f15
kusano 2b45e8
kusano 2b45e8
/*
 * TRMM-only scratch: OFFSET is loaded from the stack, KK is the running
 * offset derived from it, TEMP holds the k-length of the current tile.
 */
#if defined(TRMMKERNEL)
kusano 2b45e8
#define	OFFSET	$2
kusano 2b45e8
#define	KK	$3
kusano 2b45e8
#define	TEMP	$7
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
/*
 * Bare integer-register numbers.  R8/R9 match A ($8) / B ($9) and R14..R17
 * match CO1..CO4 ($14..$17).  Presumably these exist so the registers can be
 * passed as the numeric "base" field of gsLQC1/gsSQC1; they are not used in
 * the part of the file visible here.
 */
#define R8	8
kusano 2b45e8
#define	R9	9
kusano 2b45e8
#define R14	14
kusano 2b45e8
#define R15	15
kusano 2b45e8
#define R16	16
kusano 2b45e8
#define R17 17
kusano 2b45e8
kusano 2b45e8
/*
 * Accumulators for the 4x4 result tile, $f16..$f31.
 * In tIJ the first digit follows the A operand (a0..a3 -> 1..4) and the second
 * digit follows the B operand (b0..b3 -> 1..4), e.g. "MADD t21,t21,a1,b0".
 * On write-back tIJ goes to column pointer COJ at element offset I-1.
 */
#define	t11	$f30
kusano 2b45e8
#define	t21	$f31
kusano 2b45e8
#define	t31	$f28
kusano 2b45e8
#define	t41	$f29
kusano 2b45e8
kusano 2b45e8
#define	t12	$f26
kusano 2b45e8
#define	t22	$f27
kusano 2b45e8
#define	t32	$f24
kusano 2b45e8
#define	t42	$f25
kusano 2b45e8
kusano 2b45e8
#define	t13	$f22
kusano 2b45e8
#define	t23	$f23
kusano 2b45e8
#define	t33	$f20
kusano 2b45e8
#define	t43	$f21
kusano 2b45e8
kusano 2b45e8
#define	t14	$f18
kusano 2b45e8
#define	t24	$f19
kusano 2b45e8
#define	t34	$f16
kusano 2b45e8
#define	t44	$f17
kusano 2b45e8
kusano 2b45e8
/*
 * Registers that receive the existing C values during the GEMM write-back
 * (cIJ is loaded from COJ at element offset I-1 and combined with tIJ).
 * They occupy $f0..$f14, the same registers as a0..a7 / b0..b6, so they are
 * only usable once the k loop no longer needs its A/B operands.
 *
 * c44 deliberately reuses $f0 (the same register as c11) because $f15 is
 * taken by ALPHA.  The 4x4 write-back relies on ordering: c11 is consumed by
 * its MADD before c44 is loaded.  Keep that order if the write-back is ever
 * rescheduled.
 */
#define	c11	$f0
kusano 2b45e8
#define	c21	$f1
kusano 2b45e8
#define	c31	$f2
kusano 2b45e8
#define	c41	$f3
kusano 2b45e8
kusano 2b45e8
#define	c12	$f4
kusano 2b45e8
#define	c22	$f5
kusano 2b45e8
#define	c32	$f6
kusano 2b45e8
#define	c42	$f7
kusano 2b45e8
kusano 2b45e8
#define	c13	$f8
kusano 2b45e8
#define	c23	$f9
kusano 2b45e8
#define	c33	$f10
kusano 2b45e8
#define c43	$f11
kusano 2b45e8
kusano 2b45e8
#define	c14	$f12
kusano 2b45e8
#define	c24	$f13
kusano 2b45e8
#define	c34	$f14
kusano 2b45e8
#define	c44	$f0
kusano 2b45e8
kusano 2b45e8
/*
 * Operand registers for the k loop.  Two sets are used alternately:
 * a0..a3 / b0..b3 feed one step while a4..a7 / b4..b7 are being loaded for
 * the next step, and vice versa.
 * b7 is $f15, the same register as ALPHA: loading b7 destroys ALPHA, which is
 * why ALPHA is kept at 152($sp) and reloaded before the write-back.
 */
#define	a0	$f0
kusano 2b45e8
#define	a1	$f1
kusano 2b45e8
#define	a2	$f2
kusano 2b45e8
#define	a3	$f3
kusano 2b45e8
#define	a4	$f4
kusano 2b45e8
#define	a5	$f5
kusano 2b45e8
#define	a6	$f6
kusano 2b45e8
#define	a7	$f7
kusano 2b45e8
#define	b0	$f8
kusano 2b45e8
#define	b1	$f9
kusano 2b45e8
#define	b2	$f10
kusano 2b45e8
#define b3	$f11
kusano 2b45e8
#define	b4	$f12
kusano 2b45e8
#define	b5	$f13
kusano 2b45e8
#define	b6	$f14
kusano 2b45e8
#define	b7	$f15
kusano 2b45e8
kusano 2b45e8
/*
 * Bare floating-point register numbers, F<n> = n.  Presumably intended for
 * the numeric fq/ft fields of gsLQC1/gsSQC1, which cannot take "$f<n>" names;
 * they are not used in the part of the file visible here.
 */
#define F31 31
kusano 2b45e8
#define F30 30
kusano 2b45e8
#define F29 29
kusano 2b45e8
#define F28 28
kusano 2b45e8
#define F27 27
kusano 2b45e8
#define F26 26
kusano 2b45e8
#define F25 25
kusano 2b45e8
#define F24 24 
kusano 2b45e8
#define F23 23
kusano 2b45e8
#define F22 22
kusano 2b45e8
#define F21 21
kusano 2b45e8
#define F20 20
kusano 2b45e8
#define F19 19
kusano 2b45e8
#define F18 18
kusano 2b45e8
#define F17 17
kusano 2b45e8
#define F16 16 
kusano 2b45e8
#define F15 15
kusano 2b45e8
#define F14 14
kusano 2b45e8
#define F13 13
kusano 2b45e8
#define F12 12
kusano 2b45e8
#define F11 11
kusano 2b45e8
#define F10 10
kusano 2b45e8
#define F9 9
kusano 2b45e8
#define F8 8
kusano 2b45e8
#define F7 7
kusano 2b45e8
#define F6 6
kusano 2b45e8
#define F5 5
kusano 2b45e8
#define F4 4 
kusano 2b45e8
#define F3 3 
kusano 2b45e8
#define F2 2 
kusano 2b45e8
#define F1 1 
kusano 2b45e8
#define F0 0
kusano 2b45e8
kusano 2b45e8
	/*
	 * Function entry: allocate a 160-byte frame and save registers.
	 *
	 * Frame layout (byte offsets from the new $sp):
	 *     0.. 48   $16..$22
	 *    56.. 88   $f24..$f28
	 *    96..112   $23..$25
	 *   120..144   $f20..$f23
	 *   152        spill slot for ALPHA (written right after this sequence)
	 *
	 * NOTE(review): the kernel also writes $f29, $f30 and $f31 (t41, t11, t21)
	 * but they are not saved here.  If the target ABI treats them as
	 * callee-saved this clobbers caller state - confirm against the ABI and
	 * the epilogue, which is outside the visible part of the file.
	 * NOTE(review): ST comes from common.h; if it is a single-precision store
	 * in this build, only the low 32 bits of each FP register are preserved -
	 * TODO confirm.
	 */
	PROLOGUE
kusano 2b45e8
	
kusano 2b45e8
	daddiu	$sp, $sp, -160
kusano 2b45e8
	sd	$16,   0($sp)
kusano 2b45e8
	sd	$17,   8($sp)
kusano 2b45e8
	sd	$18,  16($sp)
kusano 2b45e8
	sd	$19,  24($sp)
kusano 2b45e8
	sd	$20,  32($sp)
kusano 2b45e8
	sd	$21,  40($sp)
kusano 2b45e8
	sd	$22,  48($sp)
kusano 2b45e8
	ST	$f24, 56($sp)
kusano 2b45e8
	ST	$f25, 64($sp)
kusano 2b45e8
	ST	$f26, 72($sp)
kusano 2b45e8
	ST	$f27, 80($sp)
kusano 2b45e8
	ST	$f28, 88($sp)
kusano 2b45e8
	sd	$23,  96($sp)
kusano 2b45e8
	sd	$24, 104($sp)
kusano 2b45e8
	sd	$25, 112($sp)
kusano 2b45e8
	ST	$f20,120($sp)
kusano 2b45e8
	ST	$f21,128($sp)
kusano 2b45e8
	ST	$f22,136($sp)
kusano 2b45e8
	ST	$f23,144($sp)
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align	5					
kusano 2b45e8
/*
 * One-time setup: spill ALPHA, copy the incoming arguments to their backup
 * registers, and derive the panel count and the byte strides used below.
 * All shifts by 2 here are divisions/multiplications by 4 (the tile width).
 */
.L0_N4:									#  Entry setup (runs once; not itself a loop body)
kusano 2b45e8
	ST	ALPHA,152($sp)					#  Spill ALPHA: its register ($f15) is also b7
kusano 2b45e8
	move	MCO,M						#  MCO = incoming M
kusano 2b45e8
kusano 2b45e8
	move	NCO,N						#  NCO = incoming N
kusano 2b45e8
	move	KCO,K						#  KCO = incoming K
kusano 2b45e8
kusano 2b45e8
	move	AO,A						#  AO = base of A
kusano 2b45e8
	dsra	N,NCO,2						#  N = NCO >> 2: number of 4-column panels
kusano 2b45e8
	
kusano 2b45e8
	dsll	LDC,LDC,BASE_SHIFT			#  LDC <<= BASE_SHIFT (presumably element count -> byte stride)
kusano 2b45e8
	dsll	SPANB,KCO,2+BASE_SHIFT		#  SPANB = KCO << (2+BASE_SHIFT); added to BO to form PREB
kusano 2b45e8
	
kusano 2b45e8
#if defined(TRMMKERNEL)
kusano 2b45e8
	LDARG	OFFSET,160($sp)				#	OFFSET = value just above this 160-byte frame (at the entry $sp)
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
#if defined(TRMMKERNEL) && !defined(LEFT)
kusano 2b45e8
	neg		KK,OFFSET				#  KK = -OFFSET
kusano 2b45e8
#endif
kusano 2b45e8
	
kusano 2b45e8
	move	BO,B						#  BO = base of B
kusano 2b45e8
	beq		N,$0,.L0_N2					#  no full 4-column panel (NCO < 4): go to .L0_N2
kusano 2b45e8
	dsll	SPANA,KCO,1+BASE_SHIFT		#  SPANA = KCO << (1+BASE_SHIFT); sits right after the branch (its delay slot, assuming noreorder)
kusano 2b45e8
kusano 2b45e8
/*
 * Start of one 4-column panel: point CO1..CO4 at its four columns of C,
 * rewind A to the panel base, set the FETCH pointers, and count 4-row tiles.
 */
.L0_N4_Lb:								#	mr=4,nr=4
kusano 2b45e8
	move	CO1,C							#  CO1 = first column of this panel
kusano 2b45e8
	dsra	M,MCO,2						#  M = MCO >> 2: number of 4-row tiles
kusano 2b45e8
	
kusano 2b45e8
	move	A,AO						#  Reset A
kusano 2b45e8
	daddu	CO2,C,LDC
kusano 2b45e8
kusano 2b45e8
	daddu	PREB,BO,SPANB				#  PREB = BO + SPANB (address used by the FETCH loads on B)
kusano 2b45e8
	daddu	CO3,CO2,LDC
kusano 2b45e8
kusano 2b45e8
	daddu	PREA,AO,SPANA				#  PREA = AO + SPANA (address used by the FETCH loads on A)
kusano 2b45e8
	daddu	CO4,CO3,LDC
kusano 2b45e8
kusano 2b45e8
#if defined(TRMMKERNEL) && defined(LEFT)
kusano 2b45e8
	move	KK,OFFSET					#  KK restarts from OFFSET for every panel
kusano 2b45e8
#endif
kusano 2b45e8
	beqz	M,.L14_M2					#  no 4-row tile: go handle the mr=2 / mr=1 leftovers
kusano 2b45e8
	daddu	C,CO4,LDC					#	C -> next panel Cj; sits right after the branch (its delay slot, assuming noreorder)
kusano 2b45e8
kusano 2b45e8
/*
 * Per-tile initialisation for a 4x4 tile: position A and B, clear the 16
 * accumulators t11..t44, preload the first operands a0..a3 / b0..b3, and set
 * K to the number of passes through the 4x-unrolled k loop at .L11.
 * The accumulator clears are interleaved with the loads; the instruction
 * written after each "beqz K,.L15" is the final clear (t44) and is expected
 * to execute on both paths (branch delay slot, assuming noreorder assembly).
 */
.L10:
kusano 2b45e8
#if defined(TRMMKERNEL)
kusano 2b45e8
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	move	B,BO						#	(SIDE=L and UPLO=L) or (SIDE=R and UPLO=U)
kusano 2b45e8
#else
kusano 2b45e8
	dsll	K,KK,2 + BASE_SHIFT			#  K = KK << (2+BASE_SHIFT): byte offset of 4*KK elements (mr=4)
kusano 2b45e8
	dsll	TEMP,KK,2 + BASE_SHIFT		#  same offset for B (nr=4)
kusano 2b45e8
kusano 2b45e8
	daddu	A,A,K						#  skip A and B ahead to the data part
kusano 2b45e8
	daddu	B,BO,TEMP
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
	MTC		$0,t11						# 	t11 = 0 (from $0); the other 15 accumulators are copied from it
kusano 2b45e8
	LD	a0,0(A)
kusano 2b45e8
kusano 2b45e8
	MOV	t21,t11
kusano 2b45e8
	MOV	t31,t11
kusano 2b45e8
	LD	a1,1*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MOV	t41,t11
kusano 2b45e8
	MOV	t12,t11
kusano 2b45e8
	LD	b0,0(B)
kusano 2b45e8
	
kusano 2b45e8
	MOV	t22,t11
kusano 2b45e8
	MOV	t32,t11
kusano 2b45e8
	LD	b1,1*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MOV	t42,t11
kusano 2b45e8
	LD	a2,2*SIZE(A)
kusano 2b45e8
	
kusano 2b45e8
	MOV	t13,t11
kusano 2b45e8
	MOV	t23,t11
kusano 2b45e8
	LD	b2,2*SIZE(B)
kusano 2b45e8
	
kusano 2b45e8
	MOV	t33,t11
kusano 2b45e8
	MOV	t43,t11
kusano 2b45e8
	LD	a3,3*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MOV	t14,t11
kusano 2b45e8
	MOV	t24,t11
kusano 2b45e8
	LD	b3,3*SIZE(B)
kusano 2b45e8
kusano 2b45e8
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
kusano 2b45e8
	dsubu	TEMP,KCO,KK					#  TEMP = KCO - KK: length of the data part
kusano 2b45e8
#elif defined(LEFT)
kusano 2b45e8
	daddiu	TEMP, KK, 4					#	S=L,U=L: TEMP = KK + 4 (mr=4)
kusano 2b45e8
#else
kusano 2b45e8
	daddiu	TEMP, KK, 4					#	S=R,U=U: TEMP = KK + 4 (nr=4); in these two cases KK gives the length of the data part
kusano 2b45e8
#endif
kusano 2b45e8
	dsra	K,TEMP,2					#  K = TEMP >> 2: passes through the 4x-unrolled k loop
kusano 2b45e8
	MOV	t34,t11
kusano 2b45e8
	beqz	K,.L15						#  fewer than 4 k-steps: go straight to the remainder handling
kusano 2b45e8
	MOV	t44,t11							#  last accumulator clear; sits right after the branch
kusano 2b45e8
kusano 2b45e8
#else							
kusano 2b45e8
	move	B,BO						#	Reset B
kusano 2b45e8
	MTC		$0,t11						# 	t11 = 0 (from $0); the other 15 accumulators are copied from it
kusano 2b45e8
	LD	a0,0(A)
kusano 2b45e8
kusano 2b45e8
	MOV	t21,t11
kusano 2b45e8
	MOV	t31,t11
kusano 2b45e8
	LD	a1,1*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MOV	t41,t11
kusano 2b45e8
	MOV	t12,t11
kusano 2b45e8
	LD	b0,0(B)
kusano 2b45e8
	
kusano 2b45e8
	MOV	t22,t11
kusano 2b45e8
	MOV	t32,t11
kusano 2b45e8
	LD	b1,1*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MOV	t42,t11
kusano 2b45e8
	dsra	K,KCO,2						#  K = KCO >> 2: passes through the 4x-unrolled k loop
kusano 2b45e8
	LD	a2,2*SIZE(A)
kusano 2b45e8
	
kusano 2b45e8
	MOV	t13,t11
kusano 2b45e8
	MOV	t23,t11
kusano 2b45e8
	LD	b2,2*SIZE(B)
kusano 2b45e8
	
kusano 2b45e8
	MOV	t33,t11
kusano 2b45e8
	MOV	t43,t11
kusano 2b45e8
	LD	a3,3*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MOV	t14,t11
kusano 2b45e8
	MOV	t24,t11
kusano 2b45e8
	LD	b3,3*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MOV	t34,t11
kusano 2b45e8
	beqz	K,.L15						#  fewer than 4 k-steps: go straight to the remainder handling
kusano 2b45e8
	MOV	t44,t11							#	last of the 16 accumulator clears; sits right after the branch
kusano 2b45e8
#endif
kusano 2b45e8
	
kusano 2b45e8
	.align	5
kusano 2b45e8
.L11:									#  kr=4
kusano 2b45e8
	MADD	t11,t11,a0,b0
kusano 2b45e8
	MADD	t21,t21,a1,b0
kusano 2b45e8
	LD	a4,4*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MADD	t12,t12,a0,b1
kusano 2b45e8
	MADD	t22,t22,a1,b1
kusano 2b45e8
	LD	a5,5*SIZE(A)
kusano 2b45e8
	
kusano 2b45e8
	MADD	t31,t31,a2,b0
kusano 2b45e8
	MADD	t41,t41,a3,b0
kusano 2b45e8
	LD	b4,4*SIZE(B)
kusano 2b45e8
	
kusano 2b45e8
	MADD	t32,t32,a2,b1
kusano 2b45e8
	MADD	t42,t42,a3,b1
kusano 2b45e8
	LD	b5,5*SIZE(B)
kusano 2b45e8
	FETCH		$0,(PREB)
kusano 2b45e8
	
kusano 2b45e8
	MADD	t13,t13,a0,b2
kusano 2b45e8
	MADD	t23,t23,a1,b2
kusano 2b45e8
	LD	a6,6*SIZE(A)
kusano 2b45e8
	
kusano 2b45e8
	MADD	t14,t14,a0,b3
kusano 2b45e8
	MADD	t24,t24,a1,b3
kusano 2b45e8
	LD	b6,6*SIZE(B)
kusano 2b45e8
	FETCH		$0,(PREA)
kusano 2b45e8
	
kusano 2b45e8
	MADD	t33,t33,a2,b2
kusano 2b45e8
	MADD	t43,t43,a3,b2
kusano 2b45e8
	LD	a7,7*SIZE(A)
kusano 2b45e8
	
kusano 2b45e8
	MADD	t34,t34,a2,b3
kusano 2b45e8
	MADD	t44,t44,a3,b3
kusano 2b45e8
	LD	b7,7*SIZE(B)
kusano 2b45e8
kusano 2b45e8
.L12:
kusano 2b45e8
	MADD	t11,t11,a4,b4
kusano 2b45e8
	MADD	t21,t21,a5,b4
kusano 2b45e8
	LD	a0,8*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MADD	t12,t12,a4,b5
kusano 2b45e8
	MADD	t22,t22,a5,b5
kusano 2b45e8
	LD	a1,9*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MADD	t31,t31,a6,b4
kusano 2b45e8
	MADD	t41,t41,a7,b4
kusano 2b45e8
	LD	b0,8*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MADD	t32,t32,a6,b5
kusano 2b45e8
	MADD	t42,t42,a7,b5
kusano 2b45e8
	LD	b1,9*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	FETCH		$0,4*SIZE(PREB)
kusano 2b45e8
	MADD	t13,t13,a4,b6
kusano 2b45e8
	MADD	t23,t23,a5,b6
kusano 2b45e8
	LD	a2,10*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MADD	t14,t14,a4,b7
kusano 2b45e8
	MADD	t24,t24,a5,b7
kusano 2b45e8
	LD	b2,10*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	FETCH		$0,4*SIZE(PREA)
kusano 2b45e8
	MADD	t33,t33,a6,b6
kusano 2b45e8
	MADD	t43,t43,a7,b6
kusano 2b45e8
	LD	a3,11*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MADD	t34,t34,a6,b7
kusano 2b45e8
	MADD	t44,t44,a7,b7
kusano 2b45e8
	LD	b3,11*SIZE(B)
kusano 2b45e8
kusano 2b45e8
.L13:
kusano 2b45e8
	MADD	t11,t11,a0,b0
kusano 2b45e8
	MADD	t21,t21,a1,b0
kusano 2b45e8
	LD	a4,12*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MADD	t12,t12,a0,b1
kusano 2b45e8
	MADD	t22,t22,a1,b1
kusano 2b45e8
	LD	a5,13*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MADD	t31,t31,a2,b0
kusano 2b45e8
	MADD	t41,t41,a3,b0
kusano 2b45e8
	LD	b4,12*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	FETCH		$0,8*SIZE(PREA)
kusano 2b45e8
	MADD	t32,t32,a2,b1
kusano 2b45e8
	MADD	t42,t42,a3,b1
kusano 2b45e8
	LD	b5,13*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	FETCH		$0,8*SIZE(PREB)
kusano 2b45e8
	MADD	t13,t13,a0,b2
kusano 2b45e8
	MADD	t23,t23,a1,b2
kusano 2b45e8
	LD	a6,14*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MADD	t14,t14,a0,b3
kusano 2b45e8
	MADD	t24,t24,a1,b3
kusano 2b45e8
	daddu	A,A,16*SIZE					#  4mr*4kr
kusano 2b45e8
	LD	b6,14*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MADD	t33,t33,a2,b2
kusano 2b45e8
	MADD	t43,t43,a3,b2
kusano 2b45e8
	daddu	B,B,16*SIZE					#	4nr*4kr
kusano 2b45e8
	LD	a7,-1*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MADD	t34,t34,a2,b3
kusano 2b45e8
	MADD	t44,t44,a3,b3
kusano 2b45e8
	LD	b7,-1*SIZE(B)
kusano 2b45e8
kusano 2b45e8
.L14:
kusano 2b45e8
	MADD	t11,t11,a4,b4
kusano 2b45e8
	MADD	t21,t21,a5,b4
kusano 2b45e8
	LD	a0,0(A)
kusano 2b45e8
kusano 2b45e8
	MADD	t12,t12,a4,b5
kusano 2b45e8
	MADD	t22,t22,a5,b5
kusano 2b45e8
	LD	a1,1*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MADD	t31,t31,a6,b4
kusano 2b45e8
	MADD	t41,t41,a7,b4
kusano 2b45e8
	daddiu	K,K,-1
kusano 2b45e8
	LD	b0,0(B)
kusano 2b45e8
kusano 2b45e8
	MADD	t32,t32,a6,b5
kusano 2b45e8
	MADD	t42,t42,a7,b5
kusano 2b45e8
	daddu	PREA,PREA,16*SIZE
kusano 2b45e8
	LD	b1,1*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	FETCH		$0,12*SIZE(PREB)
kusano 2b45e8
	MADD	t13,t13,a4,b6
kusano 2b45e8
	MADD	t23,t23,a5,b6
kusano 2b45e8
	LD	a2,2*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	FETCH		$0,-4*SIZE(PREA)
kusano 2b45e8
	MADD	t14,t14,a4,b7
kusano 2b45e8
	MADD	t24,t24,a5,b7
kusano 2b45e8
	LD	b2,2*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MADD	t33,t33,a6,b6
kusano 2b45e8
	MADD	t43,t43,a7,b6
kusano 2b45e8
	daddu	PREB,PREB,16*SIZE
kusano 2b45e8
	LD	a3,3*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MADD	t34,t34,a6,b7
kusano 2b45e8
	MADD	t44,t44,a7,b7
kusano 2b45e8
	bnez 	K,.L11
kusano 2b45e8
	LD	b3,3*SIZE(B)
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L15:									#  kr=2
kusano 2b45e8
#ifndef TRMMKERNEL
kusano 2b45e8
	andi	K,KCO,2				
kusano 2b45e8
#else
kusano 2b45e8
	andi	K,TEMP, 2
kusano 2b45e8
#endif
kusano 2b45e8
	beqz	K,.L18
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
.L16:			
kusano 2b45e8
	MADD	t11,t11,a0,b0
kusano 2b45e8
	MADD	t21,t21,a1,b0
kusano 2b45e8
	LD	a4,4*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MADD	t12,t12,a0,b1
kusano 2b45e8
	MADD	t22,t22,a1,b1
kusano 2b45e8
	LD	a5,5*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MADD	t31,t31,a2,b0
kusano 2b45e8
	MADD	t41,t41,a3,b0
kusano 2b45e8
	LD	b4,4*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	FETCH		$0,0(PREA)
kusano 2b45e8
	MADD	t32,t32,a2,b1
kusano 2b45e8
	MADD	t42,t42,a3,b1
kusano 2b45e8
	LD	b5,5*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	FETCH		$0,0(PREB)
kusano 2b45e8
	MADD	t13,t13,a0,b2
kusano 2b45e8
	MADD	t23,t23,a1,b2
kusano 2b45e8
	LD	a6,6*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MADD	t14,t14,a0,b3
kusano 2b45e8
	MADD	t24,t24,a1,b3
kusano 2b45e8
	daddu	A,A,8*SIZE					#	4mr*2kr
kusano 2b45e8
	LD	b6,6*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MADD	t33,t33,a2,b2
kusano 2b45e8
	MADD	t43,t43,a3,b2
kusano 2b45e8
	daddu	B,B,8*SIZE					#	4nr*2kr
kusano 2b45e8
	LD	a7,-1*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MADD	t34,t34,a2,b3
kusano 2b45e8
	MADD	t44,t44,a3,b3
kusano 2b45e8
	LD	b7,-1*SIZE(B)
kusano 2b45e8
kusano 2b45e8
.L17:
kusano 2b45e8
	MADD	t11,t11,a4,b4
kusano 2b45e8
	MADD	t21,t21,a5,b4
kusano 2b45e8
	LD	a0,0*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MADD	t12,t12,a4,b5
kusano 2b45e8
	MADD	t22,t22,a5,b5
kusano 2b45e8
	LD	a1,1*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MADD	t31,t31,a6,b4
kusano 2b45e8
	MADD	t41,t41,a7,b4
kusano 2b45e8
	LD	b0,0*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MADD	t32,t32,a6,b5
kusano 2b45e8
	MADD	t42,t42,a7,b5
kusano 2b45e8
	LD	b1,1*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	FETCH		$0,4*SIZE(PREB)
kusano 2b45e8
	MADD	t13,t13,a4,b6
kusano 2b45e8
	MADD	t23,t23,a5,b6
kusano 2b45e8
	LD	a2,2*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	FETCH		$0,4*SIZE(PREA)
kusano 2b45e8
	MADD	t14,t14,a4,b7
kusano 2b45e8
	MADD	t24,t24,a5,b7
kusano 2b45e8
	LD	b2,2*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MADD	t33,t33,a6,b6
kusano 2b45e8
	MADD	t43,t43,a7,b6
kusano 2b45e8
	daddu	PREA,PREA,8*SIZE
kusano 2b45e8
	LD	a3,3*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MADD	t34,t34,a6,b7
kusano 2b45e8
	MADD	t44,t44,a7,b7
kusano 2b45e8
	daddu	PREB,PREB,8*SIZE
kusano 2b45e8
	LD	b3,3*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	
kusano 2b45e8
/*
 * Final odd k-step for the 4x4 tile (bit 0 of the k length), using the
 * operands a0..a3 / b0..b3 that were already loaded.
 * ALPHA is reloaded here from its spill slot because $f15 was reused as b7
 * inside the k loop.  The reload is written directly after the branch to
 * .L19, so that it is performed whether or not this step is skipped
 * (branch delay slot, assuming noreorder assembly); the write-back at .L19
 * depends on it.
 */
.L18:									#	kr=1
kusano 2b45e8
#ifndef TRMMKERNEL
kusano 2b45e8
	andi	K,KCO,1
kusano 2b45e8
#else
kusano 2b45e8
	andi	K,TEMP,1
kusano 2b45e8
#endif
kusano 2b45e8
	beqz	K,.L19				  
kusano 2b45e8
	LD	ALPHA,152($sp)					#  Reload ALPHA from its spill slot (needed on both paths)
kusano 2b45e8
	
kusano 2b45e8
	FETCH		$0,0(PREB)
kusano 2b45e8
	MADD	t11,t11,a0,b0
kusano 2b45e8
	MADD	t21,t21,a1,b0
kusano 2b45e8
	daddu	A,A,4*SIZE					#  	A += 4 elements (4mr * 1kr)
kusano 2b45e8
kusano 2b45e8
	MADD	t12,t12,a0,b1
kusano 2b45e8
	MADD	t22,t22,a1,b1
kusano 2b45e8
	daddu	B,B,4*SIZE					#	B += 4 elements (4nr * 1kr)
kusano 2b45e8
kusano 2b45e8
	FETCH		$0,0(PREA)
kusano 2b45e8
	MADD	t31,t31,a2,b0
kusano 2b45e8
	MADD	t41,t41,a3,b0
kusano 2b45e8
	daddu	PREB,PREB,4*SIZE
kusano 2b45e8
kusano 2b45e8
	MADD	t32,t32,a2,b1
kusano 2b45e8
	MADD	t42,t42,a3,b1
kusano 2b45e8
	daddu	PREA,PREA,4*SIZE
kusano 2b45e8
kusano 2b45e8
	MADD	t13,t13,a0,b2
kusano 2b45e8
	MADD	t23,t23,a1,b2
kusano 2b45e8
kusano 2b45e8
	MADD	t14,t14,a0,b3
kusano 2b45e8
	MADD	t24,t24,a1,b3
kusano 2b45e8
kusano 2b45e8
	MADD	t33,t33,a2,b2
kusano 2b45e8
	MADD	t43,t43,a3,b2
kusano 2b45e8
kusano 2b45e8
	MADD	t34,t34,a2,b3
kusano 2b45e8
	MADD	t44,t44,a3,b3
kusano 2b45e8
kusano 2b45e8
.L19:									#  Write Back to C
kusano 2b45e8
#ifndef TRMMKERNEL				
kusano 2b45e8
	LD	c11,0(CO1)						#  GEMM write part 
kusano 2b45e8
	LD	c21,1*SIZE(CO1)					#  get 16 C
kusano 2b45e8
	LD	c31,2*SIZE(CO1)
kusano 2b45e8
	LD	c41,3*SIZE(CO1)
kusano 2b45e8
kusano 2b45e8
	LD	c12,0(CO2)
kusano 2b45e8
	MADD	t11,c11,t11,ALPHA
kusano 2b45e8
	LD	c22,1*SIZE(CO2)
kusano 2b45e8
	MADD	t21,c21,t21,ALPHA
kusano 2b45e8
	LD	c32,2*SIZE(CO2)
kusano 2b45e8
	MADD	t31,c31,t31,ALPHA
kusano 2b45e8
	LD	c42,3*SIZE(CO2)
kusano 2b45e8
	MADD	t41,c41,t41,ALPHA
kusano 2b45e8
kusano 2b45e8
	LD	c13,0(CO3)
kusano 2b45e8
	MADD	t12,c12,t12,ALPHA
kusano 2b45e8
	LD	c23,1*SIZE(CO3)
kusano 2b45e8
	MADD	t22,c22,t22,ALPHA
kusano 2b45e8
	LD	c33,2*SIZE(CO3)
kusano 2b45e8
	MADD	t32,c32,t32,ALPHA
kusano 2b45e8
	LD	c43,3*SIZE(CO3)
kusano 2b45e8
	MADD	t42,c42,t42,ALPHA
kusano 2b45e8
kusano 2b45e8
	LD	c14,0(CO4)
kusano 2b45e8
	MADD	t13,c13,t13,ALPHA
kusano 2b45e8
	LD	c24,1*SIZE(CO4)
kusano 2b45e8
	MADD	t23,c23,t23,ALPHA
kusano 2b45e8
	LD	c34,2*SIZE(CO4)
kusano 2b45e8
	MADD	t33,c33,t33,ALPHA
kusano 2b45e8
	LD	c44,3*SIZE(CO4)
kusano 2b45e8
	MADD	t43,c43,t43,ALPHA
kusano 2b45e8
kusano 2b45e8
	ST	t11,0(CO1)
kusano 2b45e8
	MADD	t14,c14,t14,ALPHA
kusano 2b45e8
	ST	t21,1*SIZE(CO1)
kusano 2b45e8
	MADD	t24,c24,t24,ALPHA
kusano 2b45e8
	ST	t31,2*SIZE(CO1)
kusano 2b45e8
	MADD	t34,c34,t34,ALPHA
kusano 2b45e8
	ST	t41,3*SIZE(CO1)
kusano 2b45e8
	MADD	t44,c44,t44,ALPHA
kusano 2b45e8
	daddiu	M,M,-1						#  M--
kusano 2b45e8
kusano 2b45e8
	ST	t12,0(CO2)
kusano 2b45e8
	ST	t22,1*SIZE(CO2)
kusano 2b45e8
	ST	t32,2*SIZE(CO2)
kusano 2b45e8
	ST	t42,3*SIZE(CO2)
kusano 2b45e8
kusano 2b45e8
	ST	t13,0(CO3)
kusano 2b45e8
	ST	t23,1*SIZE(CO3)
kusano 2b45e8
	ST	t33,2*SIZE(CO3)
kusano 2b45e8
	ST	t43,3*SIZE(CO3)
kusano 2b45e8
kusano 2b45e8
	FETCH	$0,4*SIZE(CO1)
kusano 2b45e8
	FETCH	$0,4*SIZE(CO2)
kusano 2b45e8
	FETCH	$0,4*SIZE(CO3)
kusano 2b45e8
	FETCH	$0,4*SIZE(CO4)
kusano 2b45e8
kusano 2b45e8
	FETCH	$0,8*SIZE(CO1)
kusano 2b45e8
	FETCH	$0,8*SIZE(CO2)
kusano 2b45e8
	FETCH	$0,8*SIZE(CO3)
kusano 2b45e8
	FETCH	$0,8*SIZE(CO4)
kusano 2b45e8
kusano 2b45e8
	ST	t14,0(CO4)
kusano 2b45e8
	daddu	CO1,CO1,4*SIZE				#  COi += 4
kusano 2b45e8
	ST	t24,1*SIZE(CO4)
kusano 2b45e8
	daddu	CO2,CO2,4*SIZE
kusano 2b45e8
	ST	t34,2*SIZE(CO4)
kusano 2b45e8
	daddu	CO3,CO3,4*SIZE
kusano 2b45e8
	ST	t44,3*SIZE(CO4)
kusano 2b45e8
	daddu	PREB,BO,SPANB
kusano 2b45e8
	
kusano 2b45e8
	bnez	M,.L10				
kusano 2b45e8
	daddu	CO4,CO4,4*SIZE
kusano 2b45e8
kusano 2b45e8
#else							
kusano 2b45e8
	MUL	t11, ALPHA, t11					#	TRMM write back part
kusano 2b45e8
	MUL	t21, ALPHA, t21
kusano 2b45e8
	MUL	t31, ALPHA, t31
kusano 2b45e8
	MUL	t41, ALPHA, t41
kusano 2b45e8
kusano 2b45e8
	ST	t11, 0 * SIZE(CO1)
kusano 2b45e8
	MUL	t12, ALPHA, t12
kusano 2b45e8
	ST	t21, 1 * SIZE(CO1)
kusano 2b45e8
	MUL	t22, ALPHA, t22
kusano 2b45e8
	ST	t31, 2 * SIZE(CO1)
kusano 2b45e8
	MUL	t32, ALPHA, t32
kusano 2b45e8
	ST	t41, 3 * SIZE(CO1)
kusano 2b45e8
	MUL	t42, ALPHA, t42
kusano 2b45e8
kusano 2b45e8
	ST	t12, 0 * SIZE(CO2)
kusano 2b45e8
	MUL	t13, ALPHA, t13
kusano 2b45e8
	ST	t22, 1 * SIZE(CO2)
kusano 2b45e8
	MUL	t23, ALPHA, t23
kusano 2b45e8
	ST	t32, 2 * SIZE(CO2)
kusano 2b45e8
	MUL	t33, ALPHA, t33
kusano 2b45e8
	ST	t42, 3 * SIZE(CO2)
kusano 2b45e8
	MUL	t43, ALPHA, t43
kusano 2b45e8
kusano 2b45e8
	ST	t13, 0 * SIZE(CO3)
kusano 2b45e8
	MUL	t14, ALPHA, t14
kusano 2b45e8
	ST	t23, 1 * SIZE(CO3)
kusano 2b45e8
	MUL	t24, ALPHA, t24
kusano 2b45e8
	ST	t33, 2 * SIZE(CO3)
kusano 2b45e8
	MUL	t34, ALPHA, t34
kusano 2b45e8
	ST	t43, 3 * SIZE(CO3)
kusano 2b45e8
	MUL	t44, ALPHA, t44
kusano 2b45e8
kusano 2b45e8
	ST	t14, 0 * SIZE(CO4)
kusano 2b45e8
	daddiu	M,M,-1						#  M--
kusano 2b45e8
	ST	t24, 1 * SIZE(CO4)
kusano 2b45e8
	ST	t34, 2 * SIZE(CO4)
kusano 2b45e8
	ST	t44, 3 * SIZE(CO4)
kusano 2b45e8
	daddiu	CO1,CO1, 4 * SIZE
kusano 2b45e8
	daddiu	CO2,CO2, 4 * SIZE
kusano 2b45e8
	daddiu	CO3,CO3, 4 * SIZE
kusano 2b45e8
	daddiu	CO4,CO4, 4 * SIZE	
kusano 2b45e8
kusano 2b45e8
	FETCH	$0,4*SIZE(CO1)
kusano 2b45e8
	FETCH	$0,4*SIZE(CO2)
kusano 2b45e8
	FETCH	$0,4*SIZE(CO3)
kusano 2b45e8
	FETCH	$0,4*SIZE(CO4)
kusano 2b45e8
kusano 2b45e8
	FETCH	$0,0(CO1)
kusano 2b45e8
	FETCH	$0,0(CO2)
kusano 2b45e8
	FETCH	$0,0(CO3)
kusano 2b45e8
	FETCH	$0,0(CO4)
kusano 2b45e8
kusano 2b45e8
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	dsubu	TEMP,KCO,KK								
kusano 2b45e8
#ifdef LEFT
kusano 2b45e8
	daddiu	TEMP,TEMP, -4
kusano 2b45e8
#else
kusano 2b45e8
	daddiu	TEMP,TEMP, -4
kusano 2b45e8
#endif
kusano 2b45e8
	dsll	K,TEMP,2 + BASE_SHIFT
kusano 2b45e8
	dsll	TEMP,TEMP,2 + BASE_SHIFT
kusano 2b45e8
	daddu	A,A,K						# 	mov A to the end of panel Ai
kusano 2b45e8
	daddu	B,B,TEMP					# 	mov B to the end of panel Bj
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
#ifdef LEFT										
kusano 2b45e8
	daddiu	KK, KK,4
kusano 2b45e8
#endif
kusano 2b45e8
	bnez	M,.L10					
kusano 2b45e8
	nop
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L14_M2:
kusano 2b45e8
	andi	M, MCO, 2					#	nr=4,mr=2
kusano 2b45e8
	beqz	M,.L14_M1			
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
.L20:
kusano 2b45e8
#if defined(TRMMKERNEL)
kusano 2b45e8
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	move	B,BO						#	Reset B
kusano 2b45e8
#else
kusano 2b45e8
	dsll	K,KK,1 + BASE_SHIFT			#	mr=2	
kusano 2b45e8
	dsll	TEMP,KK,2 + BASE_SHIFT		#	nr=4
kusano 2b45e8
	daddu	A,A,K
kusano 2b45e8
	daddu	B,BO,TEMP
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
	LD	a0,0*SIZE(A)
kusano 2b45e8
	MTC		$0,t11
kusano 2b45e8
	LD	a1,1*SIZE(A)
kusano 2b45e8
	
kusano 2b45e8
	MOV	t21,t11
kusano 2b45e8
	LD	b0,0*SIZE(B)
kusano 2b45e8
	MOV	t12,t11
kusano 2b45e8
	LD	b1,1*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MOV	t22,t11
kusano 2b45e8
	LD	b2,2*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MOV	t13,t11
kusano 2b45e8
	MOV	t23,t11
kusano 2b45e8
	LD	b3,3*SIZE(B)
kusano 2b45e8
kusano 2b45e8
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
kusano 2b45e8
	dsubu	TEMP,KCO,KK
kusano 2b45e8
#elif defined(LEFT)
kusano 2b45e8
	daddiu	TEMP,KK,2					#	left part,controlled by mr, mr=2
kusano 2b45e8
#else
kusano 2b45e8
	daddiu	TEMP,KK,4					#  	right part,controlled by nr,nr=4
kusano 2b45e8
#endif
kusano 2b45e8
	dsra	K,TEMP,2
kusano 2b45e8
	MOV	t14,t11
kusano 2b45e8
	beqz	K,.L25
kusano 2b45e8
	MOV	t24,t11							#	clear 2*4=8 results registers
kusano 2b45e8
kusano 2b45e8
#else
kusano 2b45e8
	move	B,BO						#	Reset B 
kusano 2b45e8
	LD	a0,0*SIZE(A)
kusano 2b45e8
	MTC		$0,t11
kusano 2b45e8
	LD	a1,1*SIZE(A)
kusano 2b45e8
	
kusano 2b45e8
	MOV	t21,t11
kusano 2b45e8
	LD	b0,0*SIZE(B)
kusano 2b45e8
	MOV	t12,t11
kusano 2b45e8
	LD	b1,1*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MOV	t22,t11
kusano 2b45e8
	dsra	K,KCO,2				
kusano 2b45e8
	LD	b2,2*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MOV	t13,t11
kusano 2b45e8
	MOV	t23,t11
kusano 2b45e8
	LD	b3,3*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MOV	t14,t11
kusano 2b45e8
	beqz	K,.L25
kusano 2b45e8
	MOV	t24,t11
kusano 2b45e8
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
.L21:									#  nr=4,mr=2,kr=4
kusano 2b45e8
	MADD	t11,t11,a0,b0
kusano 2b45e8
	LD	a4,2*SIZE(A)
kusano 2b45e8
	MADD	t21,t21,a1,b0
kusano 2b45e8
	LD	a5,3*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MADD	t12,t12,a0,b1
kusano 2b45e8
	LD	b4,4*SIZE(B)
kusano 2b45e8
	MADD	t22,t22,a1,b1
kusano 2b45e8
	LD	b5,5*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MADD	t13,t13,a0,b2
kusano 2b45e8
	LD	b6,6*SIZE(B)
kusano 2b45e8
	MADD	t23,t23,a1,b2
kusano 2b45e8
	LD	b7,7*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MADD	t14,t14,a0,b3
kusano 2b45e8
	MADD	t24,t24,a1,b3
kusano 2b45e8
	
kusano 2b45e8
	MADD	t11,t11,a4,b4
kusano 2b45e8
	LD	a2,4*SIZE(A)
kusano 2b45e8
	MADD	t21,t21,a5,b4
kusano 2b45e8
	LD	a3,5*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MADD	t12,t12,a4,b5
kusano 2b45e8
	LD	b0,8*SIZE(B)
kusano 2b45e8
	MADD	t22,t22,a5,b5
kusano 2b45e8
	LD	b1,9*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MADD	t13,t13,a4,b6
kusano 2b45e8
	LD	b2,10*SIZE(B)
kusano 2b45e8
	MADD	t23,t23,a5,b6
kusano 2b45e8
	LD	b3,11*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MADD	t14,t14,a4,b7
kusano 2b45e8
	MADD	t24,t24,a5,b7
kusano 2b45e8
	daddiu	K,K,-1
kusano 2b45e8
kusano 2b45e8
	MADD	t11,t11,a2,b0
kusano 2b45e8
	LD	a6,6*SIZE(A)
kusano 2b45e8
	MADD	t21,t21,a3,b0
kusano 2b45e8
	LD	a7,7*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MADD	t12,t12,a2,b1
kusano 2b45e8
	LD	b4,12*SIZE(B)
kusano 2b45e8
	MADD	t22,t22,a3,b1
kusano 2b45e8
	LD	b5,13*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MADD	t13,t13,a2,b2
kusano 2b45e8
	LD	b6,14*SIZE(B)
kusano 2b45e8
	MADD	t23,t23,a3,b2
kusano 2b45e8
	LD	b7,15*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MADD	t14,t14,a2,b3
kusano 2b45e8
	MADD	t24,t24,a3,b3
kusano 2b45e8
	daddu	A,A,8*SIZE					#  2mr*4kr
kusano 2b45e8
	daddu	B,B,16*SIZE					#	4nr*4kr
kusano 2b45e8
kusano 2b45e8
	MADD	t11,t11,a6,b4
kusano 2b45e8
	LD	a0,0*SIZE(A)
kusano 2b45e8
	MADD	t21,t21,a7,b4
kusano 2b45e8
	LD	a1,1*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MADD	t12,t12,a6,b5
kusano 2b45e8
	LD	b0,0*SIZE(B)
kusano 2b45e8
	MADD	t22,t22,a7,b5
kusano 2b45e8
	LD	b1,1*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MADD	t13,t13,a6,b6
kusano 2b45e8
	LD	b2,2*SIZE(B)
kusano 2b45e8
	MADD	t23,t23,a7,b6
kusano 2b45e8
	LD	b3,3*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MADD	t14,t14,a6,b7
kusano 2b45e8
	bnez 	K,.L21
kusano 2b45e8
	MADD	t24,t24,a7,b7
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L25:										
kusano 2b45e8
#ifndef TRMMKERNEL
kusano 2b45e8
	andi	K,KCO,2						#	kr=2
kusano 2b45e8
#else
kusano 2b45e8
	andi	K,TEMP,2
kusano 2b45e8
#endif
kusano 2b45e8
	beqz	K,.L28
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
.L26:			
kusano 2b45e8
	MADD	t11,t11,a0,b0
kusano 2b45e8
	LD	a4,2*SIZE(A)
kusano 2b45e8
	MADD	t21,t21,a1,b0
kusano 2b45e8
	LD	a5,3*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MADD	t12,t12,a0,b1
kusano 2b45e8
	LD	b4,4*SIZE(B)
kusano 2b45e8
	MADD	t22,t22,a1,b1
kusano 2b45e8
	LD	b5,5*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MADD	t13,t13,a0,b2
kusano 2b45e8
	LD	b6,6*SIZE(B)
kusano 2b45e8
	MADD	t23,t23,a1,b2
kusano 2b45e8
	LD	b7,7*SIZE(B)
kusano 2b45e8
	
kusano 2b45e8
	MADD	t14,t14,a0,b3
kusano 2b45e8
	MADD	t24,t24,a1,b3
kusano 2b45e8
	daddu	A,A,4*SIZE					#  	2mr*2kr
kusano 2b45e8
	daddu	B,B,8*SIZE					#	4nr*2kr
kusano 2b45e8
kusano 2b45e8
.L27:
kusano 2b45e8
	MADD	t11,t11,a4,b4
kusano 2b45e8
	LD	a0,0*SIZE(A)
kusano 2b45e8
	MADD	t21,t21,a5,b4
kusano 2b45e8
	LD	a1,1*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MADD	t12,t12,a4,b5
kusano 2b45e8
	LD	b0,0*SIZE(B)
kusano 2b45e8
	MADD	t22,t22,a5,b5
kusano 2b45e8
	LD	b1,1*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MADD	t13,t13,a4,b6
kusano 2b45e8
	LD	b2,2*SIZE(B)
kusano 2b45e8
	MADD	t23,t23,a5,b6
kusano 2b45e8
	LD	b3,3*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MADD	t14,t14,a4,b7
kusano 2b45e8
	MADD	t24,t24,a5,b7
kusano 2b45e8
kusano 2b45e8
	
kusano 2b45e8
.L28:									#	kr=1	
kusano 2b45e8
#ifndef TRMMKERNEL
kusano 2b45e8
	andi	K,KCO,1
kusano 2b45e8
#else
kusano 2b45e8
	andi	K,TEMP,1
kusano 2b45e8
#endif
kusano 2b45e8
	beqz	K,.L29				  
kusano 2b45e8
	LD	ALPHA,152($sp)					#  Get ALPHA
kusano 2b45e8
	
kusano 2b45e8
	MADD	t11,t11,a0,b0
kusano 2b45e8
	MADD	t21,t21,a1,b0
kusano 2b45e8
	daddu	A,A,2*SIZE					#  2mr*kr
kusano 2b45e8
	daddu	B,B,4*SIZE					#  4nr*kr
kusano 2b45e8
kusano 2b45e8
	MADD	t12,t12,a0,b1
kusano 2b45e8
	MADD	t22,t22,a1,b1
kusano 2b45e8
kusano 2b45e8
	MADD	t13,t13,a0,b2
kusano 2b45e8
	MADD	t23,t23,a1,b2
kusano 2b45e8
kusano 2b45e8
	MADD	t14,t14,a0,b3
kusano 2b45e8
	MADD	t24,t24,a1,b3
kusano 2b45e8
kusano 2b45e8
.L29:									#  Write Back to C
kusano 2b45e8
#ifndef TRMMKERNEL
kusano 2b45e8
	LD	c11,0(CO1)						#	GEMM write back part
kusano 2b45e8
	LD	c21,1*SIZE(CO1)			
kusano 2b45e8
kusano 2b45e8
	LD	c12,0(CO2)
kusano 2b45e8
	LD	c22,1*SIZE(CO2)
kusano 2b45e8
	
kusano 2b45e8
	LD	c13,0(CO3)
kusano 2b45e8
	MADD	t11,c11,t11,ALPHA
kusano 2b45e8
	LD	c23,1*SIZE(CO3)
kusano 2b45e8
	MADD	t21,c21,t21,ALPHA
kusano 2b45e8
kusano 2b45e8
	LD	c14,0(CO4)
kusano 2b45e8
	MADD	t12,c12,t12,ALPHA
kusano 2b45e8
	LD	c24,1*SIZE(CO4)
kusano 2b45e8
	MADD	t22,c22,t22,ALPHA
kusano 2b45e8
kusano 2b45e8
	ST	t11,0(CO1)
kusano 2b45e8
	MADD	t13,c13,t13,ALPHA
kusano 2b45e8
	ST	t21,1*SIZE(CO1)
kusano 2b45e8
	MADD	t23,c23,t23,ALPHA
kusano 2b45e8
kusano 2b45e8
	ST	t12,0(CO2)
kusano 2b45e8
	MADD	t14,c14,t14,ALPHA
kusano 2b45e8
	ST	t22,1*SIZE(CO2)
kusano 2b45e8
	MADD	t24,c24,t24,ALPHA
kusano 2b45e8
kusano 2b45e8
	ST	t13,0(CO3)
kusano 2b45e8
	daddu	CO1,CO1,2*SIZE				#  COi += 2
kusano 2b45e8
	ST	t23,1*SIZE(CO3)
kusano 2b45e8
	daddu	CO2,CO2,2*SIZE
kusano 2b45e8
kusano 2b45e8
	ST	t14,0(CO4)
kusano 2b45e8
	daddu	CO3,CO3,2*SIZE
kusano 2b45e8
	ST	t24,1*SIZE(CO4)
kusano 2b45e8
	daddu	CO4,CO4,2*SIZE
kusano 2b45e8
kusano 2b45e8
	FETCH	$0,0(CO1)
kusano 2b45e8
	FETCH	$0,0(CO2)
kusano 2b45e8
	FETCH	$0,0(CO3)
kusano 2b45e8
	FETCH	$0,0(CO4)
kusano 2b45e8
kusano 2b45e8
#else
kusano 2b45e8
	MUL	t11, ALPHA, t11					#	TRMM write back part
kusano 2b45e8
	MUL	t21, ALPHA, t21
kusano 2b45e8
	
kusano 2b45e8
	ST	t11, 0 * SIZE(CO1)
kusano 2b45e8
	MUL	t12, ALPHA, t12
kusano 2b45e8
	ST	t21, 1 * SIZE(CO1)
kusano 2b45e8
	MUL	t22, ALPHA, t22
kusano 2b45e8
	
kusano 2b45e8
	ST	t12, 0 * SIZE(CO2)
kusano 2b45e8
	MUL	t13, ALPHA, t13
kusano 2b45e8
	ST	t22, 1 * SIZE(CO2)
kusano 2b45e8
	MUL	t23, ALPHA, t23
kusano 2b45e8
	
kusano 2b45e8
	ST	t13, 0 * SIZE(CO3)
kusano 2b45e8
	MUL	t14, ALPHA, t14
kusano 2b45e8
	ST	t23, 1 * SIZE(CO3)
kusano 2b45e8
	MUL	t24, ALPHA, t24
kusano 2b45e8
	
kusano 2b45e8
	ST	t14, 0 * SIZE(CO4)
kusano 2b45e8
	ST	t24, 1 * SIZE(CO4)
kusano 2b45e8
	
kusano 2b45e8
	daddiu	CO1,CO1, 2 * SIZE
kusano 2b45e8
	daddiu	CO2,CO2, 2 * SIZE
kusano 2b45e8
	daddiu	CO3,CO3, 2 * SIZE
kusano 2b45e8
	daddiu	CO4,CO4, 2 * SIZE
kusano 2b45e8
kusano 2b45e8
	FETCH	$0,0(CO1)
kusano 2b45e8
	FETCH	$0,0(CO2)
kusano 2b45e8
	FETCH	$0,0(CO3)
kusano 2b45e8
	FETCH	$0,0(CO4)
kusano 2b45e8
kusano 2b45e8
#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	dsubu	TEMP,KCO,KK
kusano 2b45e8
#ifdef LEFT
kusano 2b45e8
	daddiu	TEMP,TEMP,-2
kusano 2b45e8
#else
kusano 2b45e8
	daddiu	TEMP,TEMP,-4
kusano 2b45e8
#endif
kusano 2b45e8
	dsll	K,TEMP,1 + BASE_SHIFT
kusano 2b45e8
	dsll	TEMP,TEMP,2 + BASE_SHIFT
kusano 2b45e8
kusano 2b45e8
	daddu	A,A,K						#	move A to next panel Ai
kusano 2b45e8
	daddu	B,B,TEMP					#	move B to next panel Bj
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
#ifdef LEFT
kusano 2b45e8
	daddiu	KK, KK, 2
kusano 2b45e8
#endif
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L14_M1:
kusano 2b45e8
	andi	M,MCO,1						#	mr=1	
kusano 2b45e8
	beqz	M,.L0_N4_Loop				#  	M = 0, finishing one panel Bj
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
.L30:
kusano 2b45e8
#if defined(TRMMKERNEL)
kusano 2b45e8
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	move	B,BO						#	Reset B
kusano 2b45e8
#else
kusano 2b45e8
	dsll	K,KK, BASE_SHIFT
kusano 2b45e8
	dsll	TEMP,KK,2 + BASE_SHIFT
kusano 2b45e8
kusano 2b45e8
	daddu	A,A,K
kusano 2b45e8
	daddu	B,BO,TEMP
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
	LD	a0, 0 * SIZE(A)					#	a0
kusano 2b45e8
kusano 2b45e8
	MTC		$0,t11
kusano 2b45e8
	LD	b0,0*SIZE(B)
kusano 2b45e8
	
kusano 2b45e8
	MOV	t12,t11
kusano 2b45e8
	LD	b1,1*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MOV	t13,t11
kusano 2b45e8
	LD	b2,2*SIZE(B)
kusano 2b45e8
	
kusano 2b45e8
	MOV	t14,t11
kusano 2b45e8
	LD	b3,3*SIZE(B)
kusano 2b45e8
kusano 2b45e8
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
kusano 2b45e8
	dsubu	TEMP, KCO, KK
kusano 2b45e8
#elif defined(LEFT)
kusano 2b45e8
	daddiu	TEMP, KK, 1
kusano 2b45e8
#else
kusano 2b45e8
	daddiu	TEMP, KK, 4
kusano 2b45e8
#endif
kusano 2b45e8
	dsra	K,TEMP, 2
kusano 2b45e8
	nop
kusano 2b45e8
	beqz	K,.L35
kusano 2b45e8
	nop
kusano 2b45e8
								
kusano 2b45e8
#else							
kusano 2b45e8
	move	B,BO						#  GEMM path: rewind B to the start of the current panel (BO)
kusano 2b45e8
	dsra	K,KCO,2						#  K = KCO >> 2, trip count of the 4-step loop .L31
kusano 2b45e8
	LD	a0, 0 * SIZE(A)					#  preload a0 for the first k step
kusano 2b45e8
kusano 2b45e8
	MTC		$0,t11						#  presumably t11 = 0 (MTC macro comes from common.h)
kusano 2b45e8
	LD	b0,0*SIZE(B)					#  preload b0..b3, interleaved with the accumulator clears
kusano 2b45e8
	
kusano 2b45e8
	MOV	t12,t11						#  t12 = t11
kusano 2b45e8
	LD	b1,1*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MOV	t13,t11						#  t13 = t11
kusano 2b45e8
	LD	b2,2*SIZE(B)
kusano 2b45e8
	
kusano 2b45e8
	MOV	t14,t11						#  t14 = t11
kusano 2b45e8
	beqz	K,.L35						#  fewer than 4 k steps: skip .L31 and go to the tails
kusano 2b45e8
	LD	b3,3*SIZE(B)					#  delay slot: b3 is needed on both paths
kusano 2b45e8
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
.L31:									#	nr=4,mr=1,kr=4	
kusano 2b45e8
	LD	a1,	1*SIZE(A)					#	load a1
kusano 2b45e8
	MADD	t11,t11,a0,b0
kusano 2b45e8
	
kusano 2b45e8
	LD	b4,4*SIZE(B)
kusano 2b45e8
	LD	b5,5*SIZE(B)
kusano 2b45e8
	MADD	t12,t12,a0,b1
kusano 2b45e8
	
kusano 2b45e8
	LD	b6,6*SIZE(B)
kusano 2b45e8
	LD	b7,7*SIZE(B)
kusano 2b45e8
	MADD	t13,t13,a0,b2
kusano 2b45e8
	MADD	t14,t14,a0,b3
kusano 2b45e8
kusano 2b45e8
	LD	a2,	2*SIZE(A)					#	a2
kusano 2b45e8
	MADD	t11,t11,a1,b4
kusano 2b45e8
	
kusano 2b45e8
	LD	b0,8*SIZE(B)
kusano 2b45e8
	LD	b1,9*SIZE(B)
kusano 2b45e8
	MADD	t12,t12,a1,b5
kusano 2b45e8
	
kusano 2b45e8
	LD	b2,10*SIZE(B)
kusano 2b45e8
	LD	b3,11*SIZE(B)
kusano 2b45e8
	MADD	t13,t13,a1,b6
kusano 2b45e8
	MADD	t14,t14,a1,b7
kusano 2b45e8
kusano 2b45e8
	LD	a3,	3*SIZE(A)					#	a3
kusano 2b45e8
	MADD	t11,t11,a2,b0
kusano 2b45e8
	daddiu	K,K,-1
kusano 2b45e8
	
kusano 2b45e8
	LD	b4,12*SIZE(B)
kusano 2b45e8
	LD	b5,13*SIZE(B)
kusano 2b45e8
	MADD	t12,t12,a2,b1
kusano 2b45e8
	daddu	A,A,4*SIZE					#	1mr*4kr
kusano 2b45e8
	
kusano 2b45e8
	LD	b6,14*SIZE(B)
kusano 2b45e8
	LD	b7,15*SIZE(B)
kusano 2b45e8
	MADD	t13,t13,a2,b2
kusano 2b45e8
	MADD	t14,t14,a2,b3
kusano 2b45e8
kusano 2b45e8
	LD	a0,	0*SIZE(A)					#	a0
kusano 2b45e8
	daddu	B,B,16*SIZE					#	4nr*4kr
kusano 2b45e8
	MADD	t11,t11,a3,b4
kusano 2b45e8
	
kusano 2b45e8
	LD	b0,0*SIZE(B)
kusano 2b45e8
	MADD	t12,t12,a3,b5
kusano 2b45e8
	LD	b1,1*SIZE(B)
kusano 2b45e8
	MADD	t13,t13,a3,b6
kusano 2b45e8
kusano 2b45e8
	LD	b2,2*SIZE(B)
kusano 2b45e8
	MADD	t14,t14,a3,b7
kusano 2b45e8
	bnez 	K,.L31
kusano 2b45e8
	LD	b3,3*SIZE(B)
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L35:									#  kr=2 tail check for the mr=1, nr=4 tile
kusano 2b45e8
#ifndef TRMMKERNEL
kusano 2b45e8
	andi	K,KCO,2						#  GEMM: bit 1 of the full depth KCO
kusano 2b45e8
#else
kusano 2b45e8
	andi	K,TEMP,2					#  TRMM: bit 1 of the trip count TEMP computed at .L30
kusano 2b45e8
#endif
kusano 2b45e8
	beqz	K,.L38						#  no pair of k steps left: go to the kr=1 check
kusano 2b45e8
	nop									#  delay slot filler
kusano 2b45e8
kusano 2b45e8
.L36:			
kusano 2b45e8
	LD	a1,1*SIZE(A)					#	load a1
kusano 2b45e8
	MADD	t11,t11,a0,b0
kusano 2b45e8
kusano 2b45e8
	LD	b4,4*SIZE(B)
kusano 2b45e8
	LD	b5,5*SIZE(B)
kusano 2b45e8
	MADD	t12,t12,a0,b1
kusano 2b45e8
	daddu	A,A,2*SIZE					#  	mr*2kr
kusano 2b45e8
	
kusano 2b45e8
	LD	b6,6*SIZE(B)
kusano 2b45e8
	MADD	t13,t13,a0,b2
kusano 2b45e8
	
kusano 2b45e8
	LD	b7,7*SIZE(B)
kusano 2b45e8
	MADD	t14,t14,a0,b3
kusano 2b45e8
	daddu	B,B,8*SIZE					#	4nr*2kr
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L37:
kusano 2b45e8
	LD	a0,0(A)
kusano 2b45e8
	MADD	t11,t11,a1,b4
kusano 2b45e8
	
kusano 2b45e8
	LD	b0,0*SIZE(B)
kusano 2b45e8
	LD	b1,1*SIZE(B)
kusano 2b45e8
	MADD	t12,t12,a1,b5
kusano 2b45e8
	
kusano 2b45e8
	LD	b2,2*SIZE(B)
kusano 2b45e8
	LD	b3,3*SIZE(B)
kusano 2b45e8
	MADD	t13,t13,a1,b6
kusano 2b45e8
	MADD	t14,t14,a1,b7
kusano 2b45e8
	
kusano 2b45e8
	
kusano 2b45e8
.L38:									#  kr=1 tail for the mr=1, nr=4 tile
kusano 2b45e8
#ifndef TRMMKERNEL
kusano 2b45e8
	andi	K,KCO,1						#  GEMM: low bit of the full depth KCO
kusano 2b45e8
#else
kusano 2b45e8
	andi	K,TEMP,1					#  TRMM: low bit of the trip count TEMP computed at .L30
kusano 2b45e8
#endif
kusano 2b45e8
	beqz	K,.L39						#  no odd k step: go to write back
kusano 2b45e8
	LD	ALPHA,152($sp)					#  delay slot: ALPHA is reloaded from the stack on both paths
kusano 2b45e8
	
kusano 2b45e8
	MADD	t11,t11,a0,b0				#  presumably t11 += a0 * b0 (MADD macro comes from common.h)
kusano 2b45e8
	MADD	t12,t12,a0,b1
kusano 2b45e8
	daddu	A,A,1*SIZE					#  A += 1(mr) * 1(kr) elements
kusano 2b45e8
	daddu	B,B,4*SIZE					#  B += 4(nr) * 1(kr) elements
kusano 2b45e8
	
kusano 2b45e8
	MADD	t13,t13,a0,b2
kusano 2b45e8
	MADD	t14,t14,a0,b3
kusano 2b45e8
kusano 2b45e8
.L39:									#  Write Back
kusano 2b45e8
#ifndef TRMMKERNEL
kusano 2b45e8
	LD	c11,0(CO1)			
kusano 2b45e8
	LD	c12,0(CO2)
kusano 2b45e8
	LD	c13,0(CO3)
kusano 2b45e8
	LD	c14,0(CO4)
kusano 2b45e8
	
kusano 2b45e8
	MADD	t11,c11,t11,ALPHA
kusano 2b45e8
	MADD	t12,c12,t12,ALPHA
kusano 2b45e8
	MADD	t13,c13,t13,ALPHA
kusano 2b45e8
	MADD	t14,c14,t14,ALPHA
kusano 2b45e8
kusano 2b45e8
	ST	t11,0(CO1)
kusano 2b45e8
	ST	t12,0(CO2)
kusano 2b45e8
	ST	t13,0(CO3)
kusano 2b45e8
	ST	t14,0(CO4)
kusano 2b45e8
#else
kusano 2b45e8
	MUL	t11, ALPHA, t11
kusano 2b45e8
	MUL	t12, ALPHA, t12
kusano 2b45e8
	MUL	t13, ALPHA, t13
kusano 2b45e8
	MUL	t14, ALPHA, t14
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(CO1)
kusano 2b45e8
	ST	t12,  0 * SIZE(CO2)
kusano 2b45e8
	ST	t13,  0 * SIZE(CO3)
kusano 2b45e8
	ST	t14,  0 * SIZE(CO4)
kusano 2b45e8
kusano 2b45e8
#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	dsubu	TEMP, KCO, KK
kusano 2b45e8
#ifdef LEFT
kusano 2b45e8
	daddiu	TEMP, TEMP, -1
kusano 2b45e8
#else
kusano 2b45e8
	daddiu	TEMP, TEMP, -4
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
	dsll	K,TEMP, BASE_SHIFT
kusano 2b45e8
	dsll	TEMP,TEMP, 2 + BASE_SHIFT
kusano 2b45e8
kusano 2b45e8
	daddu	A,A,K
kusano 2b45e8
	daddu	B,B,TEMP
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
#ifdef LEFT
kusano 2b45e8
	daddiu	KK, KK, 1
kusano 2b45e8
#endif
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align	3
kusano 2b45e8
.L0_N4_Loop:								#  all rows of the current nr=4 panel are done
kusano 2b45e8
	daddiu	N,N,-1							#  N--: one fewer nr=4 panel to process
kusano 2b45e8
#if defined(TRMMKERNEL) && !defined(LEFT)
kusano 2b45e8
	daddiu	KK, KK,4 						#  TRMM, right side: advance KK by the 4 columns just done
kusano 2b45e8
#endif
kusano 2b45e8
	bnez	N,.L0_N4_Lb						#  more nr=4 panels: loop back
kusano 2b45e8
	move	BO,B							#  delay slot: BO = B, start of the next panel Bj (runs on both paths)
kusano 2b45e8
kusano 2b45e8
	.align	5					
kusano 2b45e8
.L0_N2:										#  leftover panel of two columns
kusano 2b45e8
	andi	N,NCO,2							#  nr = 2: bit 1 of the column count NCO
kusano 2b45e8
	beqz	N,.L0_N1						#  no 2-column panel: try the single column case
kusano 2b45e8
	nop										#  delay slot filler
kusano 2b45e8
kusano 2b45e8
.L0_N2_Lb:
kusano 2b45e8
	move	CO1,C							#  CO1 = first column of this C panel
kusano 2b45e8
	daddu	CO2,C,LDC						#  CO2 = second column, CO1 + LDC
kusano 2b45e8
kusano 2b45e8
	dsra	M,MCO,2							#  M = MCO >> 2: number of 4-row tiles
kusano 2b45e8
	move	A,AO							#  Reset A
kusano 2b45e8
kusano 2b45e8
	daddu	PREA,AO,SPANA					#  PREA = AO + SPANA, address used by the FETCH of A
kusano 2b45e8
	daddu	C,CO2,LDC						#  C now points past this 2-column panel
kusano 2b45e8
kusano 2b45e8
#if defined(TRMMKERNEL) &&  defined(LEFT)
kusano 2b45e8
	move	KK, OFFSET						#  TRMM, left side: restart KK from OFFSET for this panel
kusano 2b45e8
#endif
kusano 2b45e8
	beqz	M,.L12_M2						#  no 4-row tile: go to the mr=2 remainder
kusano 2b45e8
	nop										#  delay slot filler
kusano 2b45e8
kusano 2b45e8
.L40:						
kusano 2b45e8
#if defined(TRMMKERNEL)
kusano 2b45e8
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	move	B,BO							#	Reset B
kusano 2b45e8
#else
kusano 2b45e8
	dsll	K,KK, 2 + BASE_SHIFT
kusano 2b45e8
	dsll	TEMP, KK,1 + BASE_SHIFT	
kusano 2b45e8
kusano 2b45e8
	daddu	A,A,K
kusano 2b45e8
	daddu	B,BO,TEMP
kusano 2b45e8
#endif
kusano 2b45e8
	LD	a0,0*SIZE(A)
kusano 2b45e8
	MTC		$0,t11							#  	gemm part
kusano 2b45e8
	LD	a1,1*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MOV	t21,t11
kusano 2b45e8
	LD	b0,0*SIZE(B)
kusano 2b45e8
	MOV	t31,t11
kusano 2b45e8
	LD	b1,1*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MOV	t41,t11
kusano 2b45e8
	LD	a2,2*SIZE(A)
kusano 2b45e8
	LD	a3,3*SIZE(A)
kusano 2b45e8
	
kusano 2b45e8
	MOV	t12,t11
kusano 2b45e8
	MOV	t22,t11
kusano 2b45e8
	
kusano 2b45e8
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
kusano 2b45e8
	dsubu	TEMP,KCO,KK
kusano 2b45e8
#elif defined(LEFT)
kusano 2b45e8
	daddiu	TEMP, KK, 4
kusano 2b45e8
#else
kusano 2b45e8
	daddiu	TEMP, KK, 2
kusano 2b45e8
#endif
kusano 2b45e8
	dsra	K,TEMP,2				
kusano 2b45e8
	MOV	t32,t11
kusano 2b45e8
	beqz	K,.L45
kusano 2b45e8
	MOV	t42,t11
kusano 2b45e8
kusano 2b45e8
#else
kusano 2b45e8
	move	B,BO							#  GEMM path: rewind B to the start of the current panel (BO)
kusano 2b45e8
	LD	a0,0*SIZE(A)						#  preload a0..a3 and b0..b1 for the first k step
kusano 2b45e8
	MTC		$0,t11							#  presumably t11 = 0 (MTC macro comes from common.h)
kusano 2b45e8
	LD	a1,1*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MOV	t21,t11							#  copy t11 into the remaining seven accumulators
kusano 2b45e8
	LD	b0,0*SIZE(B)
kusano 2b45e8
	MOV	t31,t11
kusano 2b45e8
	LD	b1,1*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MOV	t41,t11
kusano 2b45e8
	LD	a2,2*SIZE(A)
kusano 2b45e8
	dsra	K,KCO,2							#  K = KCO >> 2, trip count of the 4-step loop .L41
kusano 2b45e8
	LD	a3,3*SIZE(A)
kusano 2b45e8
	
kusano 2b45e8
	MOV	t12,t11
kusano 2b45e8
	MOV	t22,t11
kusano 2b45e8
	
kusano 2b45e8
	MOV	t32,t11
kusano 2b45e8
	beqz	K,.L45							#  fewer than 4 k steps: skip .L41 and go to the tails
kusano 2b45e8
	MOV	t42,t11							#  delay slot: t42 is cleared on both paths
kusano 2b45e8
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
.L41:										#  	nr=2,mr=kr=4
kusano 2b45e8
	MADD	t11,t11,a0,b0
kusano 2b45e8
	LD	a4,4*SIZE(A)
kusano 2b45e8
	MADD	t21,t21,a1,b0
kusano 2b45e8
	LD	a5,5*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MADD	t12,t12,a0,b1
kusano 2b45e8
	LD	b4,2*SIZE(B)
kusano 2b45e8
	MADD	t22,t22,a1,b1
kusano 2b45e8
	LD	b5,3*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MADD	t31,t31,a2,b0
kusano 2b45e8
	LD	a6,6*SIZE(A)
kusano 2b45e8
	MADD	t41,t41,a3,b0
kusano 2b45e8
	LD	a7,7*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	FETCH		$0,(PREA)
kusano 2b45e8
	MADD	t32,t32,a2,b1
kusano 2b45e8
	MADD	t42,t42,a3,b1
kusano 2b45e8
kusano 2b45e8
.L42:
kusano 2b45e8
	MADD	t11,t11,a4,b4
kusano 2b45e8
	LD	a0,8*SIZE(A)
kusano 2b45e8
	MADD	t21,t21,a5,b4
kusano 2b45e8
	LD	a1,9*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MADD	t12,t12,a4,b5
kusano 2b45e8
	LD	b2,4*SIZE(B)
kusano 2b45e8
	MADD	t22,t22,a5,b5
kusano 2b45e8
	LD	b3,5*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MADD	t31,t31,a6,b4
kusano 2b45e8
	LD	a2,10*SIZE(A)
kusano 2b45e8
	MADD	t41,t41,a7,b4
kusano 2b45e8
	LD	a3,11*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	FETCH		$0,4*SIZE(PREA)
kusano 2b45e8
	MADD	t32,t32,a6,b5
kusano 2b45e8
	MADD	t42,t42,a7,b5
kusano 2b45e8
kusano 2b45e8
.L43:
kusano 2b45e8
	MADD	t11,t11,a0,b2
kusano 2b45e8
	LD	a4,12*SIZE(A)
kusano 2b45e8
	MADD	t21,t21,a1,b2
kusano 2b45e8
	LD	a5,13*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MADD	t12,t12,a0,b3
kusano 2b45e8
	LD	b6,6*SIZE(B)
kusano 2b45e8
	MADD	t22,t22,a1,b3
kusano 2b45e8
	LD	b7,7*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MADD	t31,t31,a2,b2
kusano 2b45e8
	LD	a6,14*SIZE(A)
kusano 2b45e8
	MADD	t41,t41,a3,b2
kusano 2b45e8
	LD	a7,15*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	FETCH		$0,8*SIZE(PREA)
kusano 2b45e8
	MADD	t32,t32,a2,b3
kusano 2b45e8
	MADD	t42,t42,a3,b3
kusano 2b45e8
	
kusano 2b45e8
	daddu	A,A,16*SIZE						#	4mr*4kr
kusano 2b45e8
	daddu	B,B,8*SIZE						#	2nr*4kr	
kusano 2b45e8
kusano 2b45e8
.L44:
kusano 2b45e8
	MADD	t11,t11,a4,b6
kusano 2b45e8
	LD	a0,0*SIZE(A)
kusano 2b45e8
	MADD	t21,t21,a5,b6
kusano 2b45e8
	LD	a1,1*SIZE(A)
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	MADD	t12,t12,a4,b7
kusano 2b45e8
	LD	b0,0*SIZE(B)
kusano 2b45e8
	MADD	t22,t22,a5,b7
kusano 2b45e8
	LD	b1,1*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	daddiu	K,K,-1
kusano 2b45e8
	daddu	PREA,PREA,16*SIZE
kusano 2b45e8
kusano 2b45e8
	MADD	t31,t31,a6,b6
kusano 2b45e8
	LD	a2,2*SIZE(A)
kusano 2b45e8
	MADD	t41,t41,a7,b6
kusano 2b45e8
	LD	a3,3*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	FETCH		$0,-4*SIZE(PREA)
kusano 2b45e8
	MADD	t32,t32,a6,b7
kusano 2b45e8
	bnez 	K,.L41
kusano 2b45e8
	MADD	t42,t42,a7,b7
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L45:										#  	kr=2
kusano 2b45e8
#ifndef TRMMKERNEL
kusano 2b45e8
	andi	K,KCO,2				
kusano 2b45e8
#else
kusano 2b45e8
	andi	K,TEMP,2
kusano 2b45e8
#endif
kusano 2b45e8
	beqz	K,.L48
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
.L46:			
kusano 2b45e8
	MADD	t11,t11,a0,b0
kusano 2b45e8
	LD	a4,4*SIZE(A)
kusano 2b45e8
	MADD	t21,t21,a1,b0
kusano 2b45e8
	LD	a5,5*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MADD	t12,t12,a0,b1
kusano 2b45e8
	LD	b4,2*SIZE(B)
kusano 2b45e8
	MADD	t22,t22,a1,b1
kusano 2b45e8
	LD	b5,3*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MADD	t31,t31,a2,b0
kusano 2b45e8
	LD	a6,6*SIZE(A)
kusano 2b45e8
	MADD	t41,t41,a3,b0
kusano 2b45e8
	LD	a7,7*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	FETCH		$0,0(PREA)
kusano 2b45e8
	MADD	t32,t32,a2,b1
kusano 2b45e8
	daddu	B,B,4*SIZE						#  B+=2(nr)*2(kr)*8Byte=32
kusano 2b45e8
	
kusano 2b45e8
	MADD	t42,t42,a3,b1
kusano 2b45e8
	daddu	A,A,8*SIZE						#  A+=4(mr)*2(kr)*8Byte=8*SIZE
kusano 2b45e8
kusano 2b45e8
.L47:
kusano 2b45e8
	MADD	t11,t11,a4,b4
kusano 2b45e8
	LD	a0,0*SIZE(A)
kusano 2b45e8
	MADD	t21,t21,a5,b4
kusano 2b45e8
	LD	a1,1*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MADD	t12,t12,a4,b5
kusano 2b45e8
	LD	b0,0*SIZE(B)
kusano 2b45e8
	MADD	t22,t22,a5,b5
kusano 2b45e8
	LD	b1,1*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MADD	t31,t31,a6,b4
kusano 2b45e8
	LD	a2,2*SIZE(A)
kusano 2b45e8
	MADD	t41,t41,a7,b4
kusano 2b45e8
	LD	a3,3*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	FETCH		$0,4*SIZE(PREA)
kusano 2b45e8
	MADD	t32,t32,a6,b5
kusano 2b45e8
	MADD	t42,t42,a7,b5
kusano 2b45e8
	daddu	PREA,PREA,8*SIZE
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	
kusano 2b45e8
.L48:										#  kr=1 tail for the mr=4, nr=2 tile
kusano 2b45e8
#ifndef TRMMKERNEL
kusano 2b45e8
	andi	K,KCO,1							#  GEMM: low bit of the full depth KCO
kusano 2b45e8
#else
kusano 2b45e8
	andi	K,TEMP,1						#  TRMM: low bit of the trip count TEMP computed at .L40
kusano 2b45e8
#endif
kusano 2b45e8
	beqz	K,.L49							#  no odd k step: go to write back
kusano 2b45e8
	LD	ALPHA,152($sp)						#  delay slot: ALPHA is reloaded from the stack on both paths
kusano 2b45e8
	
kusano 2b45e8
	FETCH		$0,0(PREA)					#  FETCH is ld into $0, so the loaded value is discarded
kusano 2b45e8
	MADD	t11,t11,a0,b0					#  presumably t11 += a0 * b0 (MADD macro comes from common.h)
kusano 2b45e8
	MADD	t21,t21,a1,b0
kusano 2b45e8
	daddu	A,A,4*SIZE						#  A += 4(mr) * 1(kr) elements
kusano 2b45e8
kusano 2b45e8
	MADD	t12,t12,a0,b1
kusano 2b45e8
	MADD	t22,t22,a1,b1
kusano 2b45e8
	daddu	B,B,2*SIZE						#  B += 2(nr) * 1(kr) elements
kusano 2b45e8
	daddu	PREA,PREA,4*SIZE				#  keep PREA in step with A
kusano 2b45e8
kusano 2b45e8
	MADD	t31,t31,a2,b0
kusano 2b45e8
	MADD	t41,t41,a3,b0
kusano 2b45e8
kusano 2b45e8
	MADD	t32,t32,a2,b1
kusano 2b45e8
	MADD	t42,t42,a3,b1
kusano 2b45e8
kusano 2b45e8
.L49:										#  Write Back
kusano 2b45e8
#ifndef TRMMKERNEL
kusano 2b45e8
	LD	c11,0(CO1)							#  gemm write back part Fetch 16 C
kusano 2b45e8
	LD	c21,1*SIZE(CO1)			
kusano 2b45e8
	LD	c31,2*SIZE(CO1)
kusano 2b45e8
	LD	c41,3*SIZE(CO1)
kusano 2b45e8
kusano 2b45e8
	LD	c12,0(CO2)
kusano 2b45e8
	MADD	t11,c11,t11,ALPHA
kusano 2b45e8
	LD	c22,1*SIZE(CO2)
kusano 2b45e8
	MADD	t21,c21,t21,ALPHA
kusano 2b45e8
	LD	c32,2*SIZE(CO2)
kusano 2b45e8
	MADD	t31,c31,t31,ALPHA
kusano 2b45e8
	LD	c42,3*SIZE(CO2)
kusano 2b45e8
	MADD	t41,c41,t41,ALPHA
kusano 2b45e8
kusano 2b45e8
	ST	t11,0(CO1)
kusano 2b45e8
	MADD	t12,c12,t12,ALPHA
kusano 2b45e8
	ST	t21,1*SIZE(CO1)
kusano 2b45e8
	MADD	t22,c22,t22,ALPHA
kusano 2b45e8
	ST	t31,2*SIZE(CO1)
kusano 2b45e8
	MADD	t32,c32,t32,ALPHA
kusano 2b45e8
	ST	t41,3*SIZE(CO1)
kusano 2b45e8
	MADD	t42,c42,t42,ALPHA
kusano 2b45e8
	daddiu	M,M,-1				
kusano 2b45e8
kusano 2b45e8
	ST	t12,0(CO2)
kusano 2b45e8
	ST	t22,1*SIZE(CO2)
kusano 2b45e8
	ST	t32,2*SIZE(CO2)
kusano 2b45e8
	ST	t42,3*SIZE(CO2)
kusano 2b45e8
kusano 2b45e8
	FETCH	$0,4*SIZE(CO1)
kusano 2b45e8
	FETCH	$0,4*SIZE(CO2)
kusano 2b45e8
	FETCH	$0,8*SIZE(CO1)
kusano 2b45e8
	FETCH	$0,8*SIZE(CO2)
kusano 2b45e8
kusano 2b45e8
	daddu	CO1,CO1,4*SIZE			
kusano 2b45e8
	bnez	M,.L40				
kusano 2b45e8
	daddu	CO2,CO2,4*SIZE
kusano 2b45e8
kusano 2b45e8
#else
kusano 2b45e8
	MUL	t11, ALPHA, t11
kusano 2b45e8
	MUL	t21, ALPHA, t21
kusano 2b45e8
	MUL	t31, ALPHA, t31
kusano 2b45e8
	MUL	t41, ALPHA, t41
kusano 2b45e8
	
kusano 2b45e8
	MUL	t12, ALPHA, t12
kusano 2b45e8
	ST	t11, 0 * SIZE(CO1)
kusano 2b45e8
	MUL	t22, ALPHA, t22
kusano 2b45e8
	ST	t21, 1 * SIZE(CO1)
kusano 2b45e8
	MUL	t32, ALPHA, t32
kusano 2b45e8
	ST	t31, 2 * SIZE(CO1)
kusano 2b45e8
	MUL	t42, ALPHA, t42
kusano 2b45e8
	ST	t41, 3 * SIZE(CO1)
kusano 2b45e8
	
kusano 2b45e8
	ST	t12, 0 * SIZE(CO2)
kusano 2b45e8
	daddiu	M,M,-1
kusano 2b45e8
	ST	t22, 1 * SIZE(CO2)
kusano 2b45e8
	ST	t32, 2 * SIZE(CO2)
kusano 2b45e8
	ST	t42, 3 * SIZE(CO2)
kusano 2b45e8
	
kusano 2b45e8
	daddiu	CO1,CO1, 4*SIZE
kusano 2b45e8
	daddiu	CO2,CO2, 4*SIZE
kusano 2b45e8
kusano 2b45e8
	FETCH	$0,0(CO1)						#  touch the next 4-row tile of both C columns (FETCH is ld into $0)
kusano 2b45e8
	FETCH	$0,0(CO2)
kusano 2b45e8
	FETCH	$0,4*SIZE(CO1)					#  offset counted in elements, matching the GEMM write back path
kusano 2b45e8
	FETCH	$0,4*SIZE(CO2)					#  (a bare 4 was a byte offset into the middle of the tile)
kusano 2b45e8
kusano 2b45e8
#if ( defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	dsubu	TEMP, KCO, KK
kusano 2b45e8
#ifdef LEFT
kusano 2b45e8
	daddiu	TEMP, TEMP, -4
kusano 2b45e8
#else
kusano 2b45e8
	daddiu	TEMP, TEMP, -2
kusano 2b45e8
#endif
kusano 2b45e8
	dsll	K,TEMP, 2 + BASE_SHIFT
kusano 2b45e8
	dsll	TEMP, TEMP, 1 + BASE_SHIFT
kusano 2b45e8
kusano 2b45e8
	daddu	A,A,K
kusano 2b45e8
	daddu	B,B,TEMP
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
#ifdef LEFT
kusano 2b45e8
	daddiu	KK, KK, 4
kusano 2b45e8
#endif
kusano 2b45e8
	bnez	M,.L40
kusano 2b45e8
	nop
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L12_M2:								#  nr=2 panel: handle a leftover pair of rows
kusano 2b45e8
	andi	M,MCO,2						#  mr = 2: bit 1 of the row count MCO
kusano 2b45e8
	beqz	M,.L12_M1					#  no 2-row tile: go to the mr=1 check
kusano 2b45e8
	nop									#  delay slot filler
kusano 2b45e8
kusano 2b45e8
.L50:
kusano 2b45e8
#if defined(TRMMKERNEL)
kusano 2b45e8
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	move	B,BO
kusano 2b45e8
#else
kusano 2b45e8
	dsll	K,    KK, 1 + BASE_SHIFT	#mr=2
kusano 2b45e8
	dsll	TEMP, KK, 1 + BASE_SHIFT	#nr=2
kusano 2b45e8
kusano 2b45e8
	daddu	A, A, K
kusano 2b45e8
	daddu	B, BO,  TEMP
kusano 2b45e8
#endif
kusano 2b45e8
	LD	a0,0*SIZE(A)
kusano 2b45e8
	LD	a1,1*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MTC		$0,t11
kusano 2b45e8
	LD	b0,0*SIZE(B)
kusano 2b45e8
	MOV	t21,t11
kusano 2b45e8
	LD	b1,1*SIZE(B)
kusano 2b45e8
	
kusano 2b45e8
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
kusano 2b45e8
	dsubu	TEMP, KCO, KK
kusano 2b45e8
#elif defined(LEFT)
kusano 2b45e8
	daddiu	TEMP, KK, 2
kusano 2b45e8
#else
kusano 2b45e8
	daddiu	TEMP, KK, 2
kusano 2b45e8
#endif
kusano 2b45e8
	dsra	K,TEMP,2			
kusano 2b45e8
	MOV	t12,t11
kusano 2b45e8
	beqz	K,.L55
kusano 2b45e8
	MOV	t22,t11
kusano 2b45e8
kusano 2b45e8
#else
kusano 2b45e8
	move	B,BO						#  GEMM path: rewind B to the start of the current panel (BO)
kusano 2b45e8
	LD	a0,0*SIZE(A)					#  preload a0, a1, b0, b1 for the first k step
kusano 2b45e8
	dsra	K,KCO,2						#  K = KCO >> 2, trip count of the 4-step loop .L51
kusano 2b45e8
	LD	a1,1*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MTC		$0,t11						#  presumably t11 = 0 (MTC macro comes from common.h)
kusano 2b45e8
	LD	b0,0*SIZE(B)
kusano 2b45e8
	MOV	t21,t11						#  copy t11 into the other three accumulators
kusano 2b45e8
	LD	b1,1*SIZE(B)
kusano 2b45e8
	
kusano 2b45e8
	MOV	t12,t11
kusano 2b45e8
	beqz	K,.L55						#  fewer than 4 k steps: skip .L51 and go to the tails
kusano 2b45e8
	MOV	t22,t11						#  delay slot: t22 is cleared on both paths
kusano 2b45e8
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
.L51:									#  nr=2 mr=2,kr=4
kusano 2b45e8
	MADD	t11,t11,a0,b0
kusano 2b45e8
	LD	a4,2*SIZE(A)
kusano 2b45e8
	MADD	t21,t21,a1,b0
kusano 2b45e8
	LD	b4,2*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MADD	t12,t12,a0,b1
kusano 2b45e8
	LD	a5,3*SIZE(A)
kusano 2b45e8
	MADD	t22,t22,a1,b1
kusano 2b45e8
	LD	b5,3*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MADD	t11,t11,a4,b4
kusano 2b45e8
	LD	a2,4*SIZE(A)
kusano 2b45e8
	MADD	t21,t21,a5,b4
kusano 2b45e8
	LD	b2,4*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MADD	t12,t12,a4,b5
kusano 2b45e8
	LD	a3,5*SIZE(A)
kusano 2b45e8
	MADD	t22,t22,a5,b5
kusano 2b45e8
	daddiu	K,K,-1
kusano 2b45e8
	LD	b3,5*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MADD	t11,t11,a2,b2
kusano 2b45e8
	LD	a6,6*SIZE(A)
kusano 2b45e8
	MADD	t21,t21,a3,b2
kusano 2b45e8
	daddu	A,A,8*SIZE					#  A+=2(mr)*4(kr)*8Byte=8*SIZE
kusano 2b45e8
	LD	b6,6*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MADD	t12,t12,a2,b3
kusano 2b45e8
	daddu	B,B,8*SIZE					#  B+=2(nr)*4(kr)*8Byte=16*SIZE
kusano 2b45e8
	LD	a7,-1*SIZE(A)
kusano 2b45e8
	MADD	t22,t22,a3,b3
kusano 2b45e8
	LD	b7,-1*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MADD	t11,t11,a6,b6
kusano 2b45e8
	LD	a0,0*SIZE(A)
kusano 2b45e8
	MADD	t21,t21,a7,b6
kusano 2b45e8
	LD	b0,0*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MADD	t12,t12,a6,b7
kusano 2b45e8
	LD	a1,1*SIZE(A)
kusano 2b45e8
kusano 2b45e8
	MADD	t22,t22,a7,b7
kusano 2b45e8
	bnez 	K,.L51
kusano 2b45e8
	LD	b1,1*SIZE(B)
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L55:									#  	kr=2
kusano 2b45e8
#ifndef TRMMKERNEL
kusano 2b45e8
	andi	K,KCO,2				
kusano 2b45e8
#else
kusano 2b45e8
	andi	K,TEMP,2
kusano 2b45e8
#endif
kusano 2b45e8
	beqz	K,.L58
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
.L56:			
kusano 2b45e8
	MADD	t11,t11,a0,b0
kusano 2b45e8
	LD	a4,2*SIZE(A)
kusano 2b45e8
	MADD	t21,t21,a1,b0
kusano 2b45e8
	daddu	A,A,4*SIZE					#  A+=2(mr)*2(kr)*8Byte=32
kusano 2b45e8
	LD	b4,2*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MADD	t12,t12,a0,b1
kusano 2b45e8
	daddu	B,B,4*SIZE					#	2nr*2kr
kusano 2b45e8
	LD	a5,-1*SIZE(A)
kusano 2b45e8
	MADD	t22,t22,a1,b1
kusano 2b45e8
	LD	b5,-1*SIZE(B)
kusano 2b45e8
kusano 2b45e8
.L57:
kusano 2b45e8
	MADD	t11,t11,a4,b4
kusano 2b45e8
	LD	a0,0*SIZE(A)
kusano 2b45e8
	MADD	t21,t21,a5,b4
kusano 2b45e8
	LD	b0,0*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MADD	t12,t12,a4,b5
kusano 2b45e8
	LD	a1,1*SIZE(A)
kusano 2b45e8
	MADD	t22,t22,a5,b5
kusano 2b45e8
	LD	b1,1*SIZE(B)
kusano 2b45e8
kusano 2b45e8
.L58:									#  kr=1
kusano 2b45e8
#ifndef TRMMKERNEL
kusano 2b45e8
	andi	K,KCO,1
kusano 2b45e8
#else
kusano 2b45e8
	andi	K,TEMP, 1
kusano 2b45e8
#endif
kusano 2b45e8
	beqz	K,.L59				
kusano 2b45e8
	LD	ALPHA,152($sp)					#  Get ALPHA
kusano 2b45e8
	
kusano 2b45e8
	MADD	t11,t11,a0,b0
kusano 2b45e8
	MADD	t21,t21,a1,b0
kusano 2b45e8
	daddu	A,A,2*SIZE					#  	A+=2(mr)*1(kr)*8Byte=16
kusano 2b45e8
	daddu	B,B,2*SIZE					#	2nr*kr
kusano 2b45e8
kusano 2b45e8
	MADD	t12,t12,a0,b1
kusano 2b45e8
	MADD	t22,t22,a1,b1
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L59:									#  Write Back
kusano 2b45e8
#ifndef TRMMKERNEL
kusano 2b45e8
	LD	c11,0(CO1)						#  write gemm part back Fetch 16 C
kusano 2b45e8
	LD	c21,1*SIZE(CO1)			
kusano 2b45e8
	LD	c12,0(CO2)
kusano 2b45e8
	LD	c22,1*SIZE(CO2)
kusano 2b45e8
	
kusano 2b45e8
	MADD	t11,c11,t11,ALPHA
kusano 2b45e8
	MADD	t21,c21,t21,ALPHA
kusano 2b45e8
	MADD	t12,c12,t12,ALPHA
kusano 2b45e8
	MADD	t22,c22,t22,ALPHA
kusano 2b45e8
kusano 2b45e8
	ST	t11,0(CO1)
kusano 2b45e8
	ST	t21,1*SIZE(CO1)
kusano 2b45e8
	ST	t12,0(CO2)
kusano 2b45e8
	ST	t22,1*SIZE(CO2)
kusano 2b45e8
kusano 2b45e8
	daddu	CO1,CO1,2*SIZE			
kusano 2b45e8
	daddu	CO2,CO2,2*SIZE
kusano 2b45e8
kusano 2b45e8
	FETCH	$0,0(CO1)
kusano 2b45e8
	FETCH	$0,0(CO2)
kusano 2b45e8
#else
kusano 2b45e8
	daddiu	CO1,CO1, 2 * SIZE			#  TRMM write back of the 2x2 tile: advance both C column
kusano 2b45e8
	daddiu	CO2,CO2, 2 * SIZE			#  pointers first, so the stores below use negative offsets
kusano 2b45e8
	MUL	t11, ALPHA, t11					#  presumably t = ALPHA * t; C is not read on this path
kusano 2b45e8
	MUL	t21, ALPHA, t21
kusano 2b45e8
	MUL	t12, ALPHA, t12
kusano 2b45e8
	MUL	t22, ALPHA, t22
kusano 2b45e8
kusano 2b45e8
	ST	t11, -2 * SIZE(CO1)				#  rows 0 and 1 of column 1
kusano 2b45e8
	ST	t21, -1 * SIZE(CO1)
kusano 2b45e8
	ST	t12, -2 * SIZE(CO2)				#  rows 0 and 1 of column 2
kusano 2b45e8
	ST	t22, -1 * SIZE(CO2)
kusano 2b45e8
kusano 2b45e8
	FETCH	$0,0(CO1)
kusano 2b45e8
	FETCH	$0,0(CO2)
kusano 2b45e8
kusano 2b45e8
#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	dsubu	TEMP, KCO, KK
kusano 2b45e8
#ifdef LEFT
kusano 2b45e8
	daddiu	TEMP, TEMP, -2
kusano 2b45e8
#else
kusano 2b45e8
	daddiu	TEMP, TEMP, -2
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
	dsll	K,    TEMP, 1 + BASE_SHIFT
kusano 2b45e8
	dsll	TEMP, TEMP, 1 + BASE_SHIFT
kusano 2b45e8
kusano 2b45e8
	daddu	A, A, K
kusano 2b45e8
	daddu	B, B, TEMP
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
#ifdef LEFT
kusano 2b45e8
	daddiu	KK, KK, 2
kusano 2b45e8
#endif
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L12_M1:							#  nr=2 panel: handle a leftover single row
kusano 2b45e8
	andi	M,MCO,1					#  mr = 1: low bit of the row count MCO
kusano 2b45e8
	beqz	M,.L0_N2_Loop			#  no row left: the 2-column panel is finished
kusano 2b45e8
	nop								#  delay slot filler
kusano 2b45e8
kusano 2b45e8
.L60:
kusano 2b45e8
#if defined(TRMMKERNEL)
kusano 2b45e8
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	move	B,BO					#	Reset B
kusano 2b45e8
#else
kusano 2b45e8
	dsll	K,    KK, 0 + BASE_SHIFT
kusano 2b45e8
	dsll	TEMP, KK, 1 + BASE_SHIFT
kusano 2b45e8
kusano 2b45e8
	daddu	A, A, K
kusano 2b45e8
	daddu	B, BO,  TEMP
kusano 2b45e8
#endif
kusano 2b45e8
	LD	a0,0*SIZE(A)
kusano 2b45e8
	
kusano 2b45e8
	MTC		$0,t11
kusano 2b45e8
	MOV	t21,t11
kusano 2b45e8
	LD	b0,0*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MOV	t12,t11
kusano 2b45e8
	LD	b1,1*SIZE(B)
kusano 2b45e8
kusano 2b45e8
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
kusano 2b45e8
	dsubu	TEMP, KCO, KK
kusano 2b45e8
#elif defined(LEFT)
kusano 2b45e8
	daddiu	TEMP, KK, 1
kusano 2b45e8
#else
kusano 2b45e8
	daddiu	TEMP, KK, 2
kusano 2b45e8
#endif
kusano 2b45e8
	dsra	K,TEMP,2				
kusano 2b45e8
	MOV	t22,t11
kusano 2b45e8
	beqz	K,.L65
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
#else
kusano 2b45e8
	dsra	K,KCO,2				
kusano 2b45e8
	move	B,BO					#  Reset B
kusano 2b45e8
	LD	a0,0*SIZE(A)
kusano 2b45e8
	
kusano 2b45e8
	MTC		$0,t11
kusano 2b45e8
	MOV	t21,t11
kusano 2b45e8
	LD	b0,0*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MOV	t12,t11
kusano 2b45e8
	LD	b1,1*SIZE(B)
kusano 2b45e8
	beqz	K,.L65
kusano 2b45e8
	MOV	t22,t11
kusano 2b45e8
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
.L61:								#	nr=2,mr=1,kr=4	
kusano 2b45e8
	LD	a4,	1*SIZE(A)				#	a2
kusano 2b45e8
	LD	b4, 2*SIZE(B)
kusano 2b45e8
	MADD	t11,t11,a0,b0
kusano 2b45e8
	
kusano 2b45e8
	LD	b5,3*SIZE(B)
kusano 2b45e8
	MADD	t12,t12,a0,b1
kusano 2b45e8
kusano 2b45e8
	LD	a2,	2*SIZE(A)				#	a3
kusano 2b45e8
	LD	b2,4*SIZE(B)
kusano 2b45e8
	MADD	t11,t11,a4,b4
kusano 2b45e8
	
kusano 2b45e8
	LD	b3,5*SIZE(B)
kusano 2b45e8
	MADD	t12,t12,a4,b5
kusano 2b45e8
kusano 2b45e8
	LD	a6,	3*SIZE(A)				#	a4
kusano 2b45e8
	daddiu	K,K,-1
kusano 2b45e8
	LD	b6,6*SIZE(B)
kusano 2b45e8
	MADD	t11,t11,a2,b2
kusano 2b45e8
	
kusano 2b45e8
	LD	b7,7*SIZE(B)
kusano 2b45e8
	MADD	t12,t12,a2,b3
kusano 2b45e8
	daddu	A,A,4*SIZE				#  A+=1(mr)*4(kr)*8Byte=32
kusano 2b45e8
kusano 2b45e8
	LD	a0,	0*SIZE(A)
kusano 2b45e8
	daddu	B,B,8*SIZE				#  B+=2(nr)*4(kr)*8Byte=8*SIZE
kusano 2b45e8
	
kusano 2b45e8
	LD	b0,0*SIZE(B)	
kusano 2b45e8
	MADD	t11,t11,a6,b6
kusano 2b45e8
	
kusano 2b45e8
	LD	b1,1*SIZE(B)
kusano 2b45e8
	bnez 	K,.L61
kusano 2b45e8
	MADD	t12,t12,a6,b7
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L65:								#  kr=2
kusano 2b45e8
#ifndef TRMMKERNEL
kusano 2b45e8
	andi	K,KCO,2				
kusano 2b45e8
#else
kusano 2b45e8
	andi	K,TEMP,2
kusano 2b45e8
#endif
kusano 2b45e8
	beqz	K,.L68
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
.L66:			
kusano 2b45e8
	LD	a4,	1*SIZE(A)				#	a1
kusano 2b45e8
	MADD	t11,t11,a0,b0
kusano 2b45e8
	LD	b4,2*SIZE(B)
kusano 2b45e8
	daddu	A,A,2*SIZE				#  A+=1(mr)*2(kr)*8Byte=16
kusano 2b45e8
	
kusano 2b45e8
	LD	b5,3*SIZE(B)
kusano 2b45e8
	MADD	t12,t12,a0,b1
kusano 2b45e8
	daddu	B,B,4*SIZE
kusano 2b45e8
kusano 2b45e8
.L67:
kusano 2b45e8
	LD	a0,0(A)						#	a0
kusano 2b45e8
	LD	b0,0*SIZE(B)
kusano 2b45e8
	MADD	t11,t11,a4,b4
kusano 2b45e8
	
kusano 2b45e8
	LD	b1,1*SIZE(B)
kusano 2b45e8
	MADD	t12,t12,a4,b5
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L68:								#  kr=1 tail for the mr=1, nr=2 tile
kusano 2b45e8
#ifndef TRMMKERNEL
kusano 2b45e8
	andi	K,KCO,1					#  GEMM: low bit of the full depth KCO
kusano 2b45e8
#else
kusano 2b45e8
	andi	K,TEMP,1				#  TRMM: low bit of the trip count TEMP computed at .L60
kusano 2b45e8
#endif
kusano 2b45e8
	beqz	K,.L69					#  no odd k step: go to write back
kusano 2b45e8
	LD	ALPHA,152($sp)				#  delay slot: ALPHA is reloaded from the stack on both paths
kusano 2b45e8
	
kusano 2b45e8
	MADD	t11,t11,a0,b0			#  presumably t11 += a0 * b0 (MADD macro comes from common.h)
kusano 2b45e8
	MADD	t12,t12,a0,b1
kusano 2b45e8
	daddu	A,A,1*SIZE				#  A += 1(mr) * 1(kr) elements
kusano 2b45e8
	daddu	B,B,2*SIZE				#  B += 2(nr) * 1(kr) elements
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L69:								#  Write Back
kusano 2b45e8
#ifndef TRMMKERNEL
kusano 2b45e8
	LD	c11,0(CO1)					#  Fetch 16 C
kusano 2b45e8
	LD	c12,0(CO2)
kusano 2b45e8
	
kusano 2b45e8
	MADD	t11,c11,t11,ALPHA
kusano 2b45e8
	MADD	t12,c12,t12,ALPHA
kusano 2b45e8
kusano 2b45e8
	ST	t11,0(CO1)
kusano 2b45e8
	ST	t12,0(CO2)
kusano 2b45e8
kusano 2b45e8
	daddu	CO1,CO1,1*SIZE		
kusano 2b45e8
	daddu	CO2,CO2,1*SIZE
kusano 2b45e8
kusano 2b45e8
#else
kusano 2b45e8
	MUL	t11, ALPHA, t11
kusano 2b45e8
	MUL	t12, ALPHA, t12
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(CO1)
kusano 2b45e8
	ST	t12,  0 * SIZE(CO2)
kusano 2b45e8
kusano 2b45e8
	daddu	CO1,CO1,1*SIZE			
kusano 2b45e8
	daddu	CO2,CO2,1*SIZE
kusano 2b45e8
kusano 2b45e8
#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	dsubu	TEMP, KCO, KK
kusano 2b45e8
#ifdef LEFT
kusano 2b45e8
	daddiu	TEMP, TEMP, -1
kusano 2b45e8
#else
kusano 2b45e8
	daddiu	TEMP, TEMP, -2
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
	dsll	K,    TEMP, 0 + BASE_SHIFT
kusano 2b45e8
	dsll	TEMP, TEMP, 1 + BASE_SHIFT
kusano 2b45e8
kusano 2b45e8
	daddu	A, A, K
kusano 2b45e8
	daddu	B, B, TEMP
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
#ifdef LEFT
kusano 2b45e8
	daddiu	KK, KK, 1
kusano 2b45e8
#endif
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
.L0_N2_Loop:						#  the nr=2 panel is finished (no loop back: it runs at most once)
kusano 2b45e8
#if defined(TRMMKERNEL) && !defined(LEFT)
kusano 2b45e8
	daddiu	KK, KK, 2				#  TRMM, right side: advance KK by the 2 columns just done
kusano 2b45e8
#endif
kusano 2b45e8
	move	BO, B					#  BO = B, start of the next panel of B
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align	5					
kusano 2b45e8
.L0_N1:								#  leftover single column of B and C
kusano 2b45e8
	andi	N,NCO,1					#  nr = 1: low bit of the column count NCO
kusano 2b45e8
	beqz	N,.L999					#  no column left: branch to .L999
kusano 2b45e8
	nop								#  delay slot filler
kusano 2b45e8
kusano 2b45e8
	move	CO1,C					#  CO1 = the only C column of this panel
kusano 2b45e8
	dsra	M,MCO,2					#  M = MCO >> 2: number of 4-row tiles
kusano 2b45e8
	
kusano 2b45e8
	move	A,AO					#  Reset A
kusano 2b45e8
	daddu	PREA,AO,SPANA			#  PREA = AO + SPANA, address used by the FETCH of A
kusano 2b45e8
#if defined(TRMMKERNEL) &&  defined(LEFT)
kusano 2b45e8
	move	KK, OFFSET				#  TRMM, left side: restart KK from OFFSET for this panel
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
	beqz	M,.L11_M2				#  no 4-row tile: go to the mr=2 remainder
kusano 2b45e8
	daddu	C,CO1,LDC				#  delay slot: C = CO1 + LDC on both paths
kusano 2b45e8
kusano 2b45e8
.L70:						
kusano 2b45e8
#if defined(TRMMKERNEL)
kusano 2b45e8
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	move	B, BO					#	Reset B
kusano 2b45e8
#else
kusano 2b45e8
	dsll	K,    KK, 2 + BASE_SHIFT
kusano 2b45e8
	dsll	TEMP, KK, 0 + BASE_SHIFT
kusano 2b45e8
kusano 2b45e8
	daddu	A, A, K
kusano 2b45e8
	daddu	B, BO,  TEMP
kusano 2b45e8
#endif
kusano 2b45e8
	LD	b0,	0*SIZE(B)
kusano 2b45e8
	
kusano 2b45e8
	MTC		$0,t11
kusano 2b45e8
	LD	a0,0*SIZE(A)
kusano 2b45e8
	MOV	t21,t11
kusano 2b45e8
	LD	a1,1*SIZE(A)
kusano 2b45e8
	
kusano 2b45e8
	MOV	t31,t11
kusano 2b45e8
	LD	a2,2*SIZE(A)
kusano 2b45e8
	MOV	t41,t11
kusano 2b45e8
	LD	a3,3*SIZE(A)
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
kusano 2b45e8
	dsubu	TEMP, KCO, KK
kusano 2b45e8
#elif defined(LEFT)
kusano 2b45e8
	daddiu	TEMP, KK, 4
kusano 2b45e8
#else
kusano 2b45e8
	daddiu	TEMP, KK, 1
kusano 2b45e8
#endif
kusano 2b45e8
	dsra	K,TEMP,2		
kusano 2b45e8
	beqz	K,.L75
kusano 2b45e8
	nop
kusano 2b45e8
#else
kusano 2b45e8
	move	B, BO					#	Reset B
kusano 2b45e8
	dsra	K,KCO,2			
kusano 2b45e8
	LD	b0,	0*SIZE(B)
kusano 2b45e8
	
kusano 2b45e8
	MTC		$0,t11
kusano 2b45e8
	LD	a0,0*SIZE(A)
kusano 2b45e8
	MOV	t21,t11
kusano 2b45e8
	LD	a1,1*SIZE(A)
kusano 2b45e8
	
kusano 2b45e8
	MOV	t31,t11
kusano 2b45e8
	LD	a2,2*SIZE(A)
kusano 2b45e8
	MOV	t41,t11
kusano 2b45e8
	beqz	K,.L75
kusano 2b45e8
	LD	a3,3*SIZE(A)
kusano 2b45e8
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
.L71:								#  nr=1,mr=kr=4
kusano 2b45e8
	LD	b4,	1*SIZE(B)				#	b1
kusano 2b45e8
	MADD	t11,t11,a0,b0
kusano 2b45e8
	
kusano 2b45e8
	LD	a4,	4*SIZE(A)
kusano 2b45e8
	MADD	t21,t21,a1,b0
kusano 2b45e8
kusano 2b45e8
	LD	a5,	5*SIZE(A)
kusano 2b45e8
	FETCH		$0,(PREA)
kusano 2b45e8
kusano 2b45e8
	LD	a6,6*SIZE(A)
kusano 2b45e8
	MADD	t31,t31,a2,b0
kusano 2b45e8
kusano 2b45e8
	LD	a7,7*SIZE(A)
kusano 2b45e8
	MADD	t41,t41,a3,b0
kusano 2b45e8
kusano 2b45e8
.L72:
kusano 2b45e8
	LD	b2,	2*SIZE(B)				#	b2
kusano 2b45e8
	MADD	t11,t11,a4,b4
kusano 2b45e8
	
kusano 2b45e8
	LD	a0,8*SIZE(A)
kusano 2b45e8
	MADD	t21,t21,a5,b4
kusano 2b45e8
kusano 2b45e8
	LD	a1,9*SIZE(A)
kusano 2b45e8
	FETCH		$0,4*SIZE(PREA)
kusano 2b45e8
kusano 2b45e8
	LD	a2,10*SIZE(A)
kusano 2b45e8
	MADD	t31,t31,a6,b4
kusano 2b45e8
	
kusano 2b45e8
	LD	a3,11*SIZE(A)
kusano 2b45e8
	MADD	t41,t41,a7,b4
kusano 2b45e8
kusano 2b45e8
.L73:
kusano 2b45e8
	LD	b6,	3*SIZE(B)
kusano 2b45e8
	MADD	t11,t11,a0,b2
kusano 2b45e8
	
kusano 2b45e8
	LD	a4,12*SIZE(A)
kusano 2b45e8
	daddu	B,B,4*SIZE				#  B+=1(nr)*4(kr)*8Byte=32
kusano 2b45e8
	
kusano 2b45e8
	LD	a5,13*SIZE(A)
kusano 2b45e8
	MADD	t21,t21,a1,b2
kusano 2b45e8
kusano 2b45e8
	LD	a6,14*SIZE(A)
kusano 2b45e8
	FETCH		$0,8*SIZE(PREA)
kusano 2b45e8
	MADD	t31,t31,a2,b2
kusano 2b45e8
kusano 2b45e8
	LD	a7,15*SIZE(A)
kusano 2b45e8
	MADD	t41,t41,a3,b2
kusano 2b45e8
	daddu	A,A,16*SIZE				#  A+=4(mr)*4(kr)*8Byte=16*SIZE
kusano 2b45e8
kusano 2b45e8
.L74:								#  4th k step of the nr=1, mr=4 loop; also preloads the next iteration
kusano 2b45e8
	LD	b0,	0*SIZE(B)				#  B and A were already advanced in .L73, so offsets restart at 0
kusano 2b45e8
	MADD	t11,t11,a4,b6			#  presumably t11 += a4 * b6 (MADD macro comes from common.h)
kusano 2b45e8
	
kusano 2b45e8
	LD	a0,0*SIZE(A)
kusano 2b45e8
	daddu	PREA,PREA,16*SIZE		#  PREA advances by the same 16 elements as A
kusano 2b45e8
kusano 2b45e8
	LD	a1,1*SIZE(A)
kusano 2b45e8
	MADD	t21,t21,a5,b6
kusano 2b45e8
kusano 2b45e8
	LD	a2,2*SIZE(A)
kusano 2b45e8
	daddiu	K,K,-1					#  one 4-step iteration done
kusano 2b45e8
	MADD	t31,t31,a6,b6
kusano 2b45e8
kusano 2b45e8
	LD	a3,3*SIZE(A)
kusano 2b45e8
	MADD	t41,t41,a7,b6
kusano 2b45e8
	bnez 	K,.L71					#  more iterations: loop back
kusano 2b45e8
	FETCH		$0,-4*SIZE(PREA)	#  delay slot: 12 elements past the old PREA, scaled by SIZE as in .L44
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L75:								#  kr=2: handle two leftover k-steps for the mr=4,nr=1 tile
kusano 2b45e8
#ifndef TRMMKERNEL
kusano 2b45e8
	andi	K,KCO,2					#  K = bit 1 of the k count (KCO)
kusano 2b45e8
#else
kusano 2b45e8
	andi	K,TEMP,2				#  K = bit 1 of the k count (TEMP in the TRMM build)
kusano 2b45e8
#endif
kusano 2b45e8
	beqz	K,.L78					#  bit clear: go to the single-step remainder
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
.L76:								#  first of the two steps: uses a0-a3 and b0, loads a4-a7 and b4
kusano 2b45e8
	LD	b4,	1*SIZE(B)
kusano 2b45e8
	MADD	t11,t11,a0,b0
kusano 2b45e8
	
kusano 2b45e8
	LD	a4,4*SIZE(A)
kusano 2b45e8
	daddu	B,B,2*SIZE				#  B += 1(nr)*2(kr) elements = 2*SIZE bytes
kusano 2b45e8
	
kusano 2b45e8
	LD	a5,5*SIZE(A)
kusano 2b45e8
	MADD	t21,t21,a1,b0
kusano 2b45e8
	FETCH		$0,0(PREA)				#  FETCH is ld into $0: touches memory at PREA, result discarded
kusano 2b45e8
kusano 2b45e8
	LD	a6,6*SIZE(A)
kusano 2b45e8
	MADD	t31,t31,a2,b0
kusano 2b45e8
kusano 2b45e8
	LD	a7,7*SIZE(A)
kusano 2b45e8
	MADD	t41,t41,a3,b0
kusano 2b45e8
	daddu	A,A,8*SIZE				#  A += 4(mr)*2(kr) elements = 8*SIZE bytes
kusano 2b45e8
kusano 2b45e8
.L77:								#  second step: uses a4-a7 and b4; reloads a0-a3 and b0 from the advanced A and B for .L78
kusano 2b45e8
	LD	b0,0(B)
kusano 2b45e8
	MADD	t11,t11,a4,b4
kusano 2b45e8
kusano 2b45e8
	LD	a0,0*SIZE(A)
kusano 2b45e8
	MADD	t21,t21,a5,b4
kusano 2b45e8
	FETCH		$0,4*SIZE(PREA)
kusano 2b45e8
kusano 2b45e8
	LD	a1,1*SIZE(A)
kusano 2b45e8
	MADD	t31,t31,a6,b4
kusano 2b45e8
kusano 2b45e8
	LD	a2,2*SIZE(A)
kusano 2b45e8
	MADD	t41,t41,a7,b4
kusano 2b45e8
kusano 2b45e8
	LD	a3,3*SIZE(A)
kusano 2b45e8
	daddu	PREA,PREA,8*SIZE			#  advance the touch pointer by the same amount as A
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	
kusano 2b45e8
.L78:								#   kr=1: handle one leftover k-step for the mr=4,nr=1 tile
kusano 2b45e8
#ifndef TRMMKERNEL
kusano 2b45e8
	andi	K,KCO,1					#  K = bit 0 of the k count (KCO)
kusano 2b45e8
#else
kusano 2b45e8
	andi	K,TEMP,1				#  K = bit 0 of the k count (TEMP in the TRMM build)
kusano 2b45e8
#endif
kusano 2b45e8
	beqz	K,.L79					#  bit clear: nothing left, go to write back
kusano 2b45e8
	LD	ALPHA,152($sp)				#  Get ALPHA from the stack frame; sits right after the branch so both paths get it (delay slot, assuming noreorder)
kusano 2b45e8
	
kusano 2b45e8
	FETCH		$0,0(PREA)				#  FETCH is ld into $0: touches memory at PREA, result discarded
kusano 2b45e8
	MADD	t11,t11,a0,b0				#  final step uses the already loaded a0-a3 and b0
kusano 2b45e8
	MADD	t21,t21,a1,b0
kusano 2b45e8
	daddu	A,A,4*SIZE				#  A += 4(mr)*1(kr) elements = 4*SIZE bytes
kusano 2b45e8
kusano 2b45e8
	MADD	t31,t31,a2,b0
kusano 2b45e8
	MADD	t41,t41,a3,b0
kusano 2b45e8
	daddu	B,B,1*SIZE				#  B += 1(nr)*1(kr) element = SIZE bytes
kusano 2b45e8
	daddu	PREA,PREA,4*SIZE
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L79:								#  Write Back of the mr=4,nr=1 tile held in t11,t21,t31,t41
kusano 2b45e8
#ifndef TRMMKERNEL
kusano 2b45e8
	LD	c11,0(CO1)					#  Fetch 4 consecutive C values at CO1
kusano 2b45e8
	LD	c21,1*SIZE(CO1)			
kusano 2b45e8
	LD	c31,2*SIZE(CO1)
kusano 2b45e8
	LD	c41,3*SIZE(CO1)
kusano 2b45e8
kusano 2b45e8
	MADD	t11,c11,t11,ALPHA			#  t11 = c11 + t11*ALPHA (assuming MADD d,c,x,y gives c + x*y; macro defined outside this chunk)
kusano 2b45e8
	MADD	t21,c21,t21,ALPHA
kusano 2b45e8
	MADD	t31,c31,t31,ALPHA
kusano 2b45e8
	MADD	t41,c41,t41,ALPHA
kusano 2b45e8
kusano 2b45e8
	ST	t11,0(CO1)
kusano 2b45e8
	ST	t21,1*SIZE(CO1)
kusano 2b45e8
	ST	t31,2*SIZE(CO1)
kusano 2b45e8
	ST	t41,3*SIZE(CO1)
kusano 2b45e8
	daddiu	M,M,-1					#  M--
kusano 2b45e8
kusano 2b45e8
	FETCH	$0,4*SIZE(CO1)				#  touch the C locations following this tile (ld into $0, value discarded)
kusano 2b45e8
	FETCH	$0,8*SIZE(CO1)
kusano 2b45e8
kusano 2b45e8
	bnez	M,.L70					#  M!=0: another 4-row tile (.L70 is before this chunk)
kusano 2b45e8
	daddu	CO1,CO1,4*SIZE			#  CO1 += 4 elements (4*SIZE bytes); follows the branch so it runs on both paths (delay slot, assuming noreorder)
kusano 2b45e8
#else
kusano 2b45e8
	daddiu	M,M,-1					#  M--
kusano 2b45e8
	MUL	t11, ALPHA, t11				#  TRMM build: scale by ALPHA only, C is not read
kusano 2b45e8
	MUL	t21, ALPHA, t21
kusano 2b45e8
	MUL	t31, ALPHA, t31
kusano 2b45e8
	MUL	t41, ALPHA, t41
kusano 2b45e8
kusano 2b45e8
	ST	t11,0(CO1)
kusano 2b45e8
	ST	t21,1*SIZE(CO1)
kusano 2b45e8
	ST	t31,2*SIZE(CO1)
kusano 2b45e8
	ST	t41,3*SIZE(CO1)
kusano 2b45e8
kusano 2b45e8
	FETCH	$0,4*SIZE(CO1)
kusano 2b45e8
	FETCH	$0,8*SIZE(CO1)
kusano 2b45e8
kusano 2b45e8
	daddu	CO1,CO1,4*SIZE				#  CO1 += 4 elements (4*SIZE bytes)
kusano 2b45e8
#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	dsubu	TEMP, KCO, KK				#  TEMP = KCO - KK, then minus 4 (LEFT) or 1 (otherwise) below
kusano 2b45e8
#ifdef LEFT
kusano 2b45e8
	daddiu	TEMP, TEMP, -4
kusano 2b45e8
#else
kusano 2b45e8
	daddiu	TEMP, TEMP, -1
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
	dsll	K,    TEMP, 2 + BASE_SHIFT		#  K = TEMP * 4 elements, in bytes (K is only a scratch register here)
kusano 2b45e8
	dsll	TEMP, TEMP, 0 + BASE_SHIFT		#  TEMP = TEMP * 1 element, in bytes
kusano 2b45e8
kusano 2b45e8
	daddu	A, A,K					#  advance A and B by those byte counts
kusano 2b45e8
	daddu	B, B, TEMP
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
#ifdef LEFT
kusano 2b45e8
	daddiu	KK, KK, 4				#  KK advances by the 4 rows just finished
kusano 2b45e8
#endif
kusano 2b45e8
	bnez	M,.L70					#  M!=0: another 4-row tile (.L70 is before this chunk)
kusano 2b45e8
	nop
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L11_M2:							#  nr=1: optional 2-row tile
kusano 2b45e8
	andi	M,MCO,2					#  mr = 2: test bit 1 of the row count MCO
kusano 2b45e8
	beqz	M,.L11_M1				#  bit clear: no 2-row tile, try the 1-row tile
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
.L80:								#  set up B, accumulators t11/t21, first operands and the pass count K
kusano 2b45e8
#if defined(TRMMKERNEL)
kusano 2b45e8
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	move	B, BO
kusano 2b45e8
#else
kusano 2b45e8
	dsll	K,    KK, 1 + BASE_SHIFT		#  K = KK * 2 elements, in bytes (scratch use of K)
kusano 2b45e8
	dsll	TEMP, KK, 0 + BASE_SHIFT		#  TEMP = KK * 1 element, in bytes
kusano 2b45e8
kusano 2b45e8
	daddu	A, A, K					#  start A and B at offset KK within their panels
kusano 2b45e8
	daddu	B, BO,  TEMP
kusano 2b45e8
#endif
kusano 2b45e8
	LD	b0,	0*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MTC		$0,t11				#  t11 = 0, taken from integer register $0
kusano 2b45e8
	MOV		t21,t11				#  t21 = 0
kusano 2b45e8
	LD	a0,0*SIZE(A)
kusano 2b45e8
	LD	a1,1*SIZE(A)
kusano 2b45e8
kusano 2b45e8
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
kusano 2b45e8
	dsubu	TEMP, KCO, KK				#  TEMP = k count for this tile
kusano 2b45e8
#elif defined(LEFT)
kusano 2b45e8
	daddiu	TEMP, KK, 2
kusano 2b45e8
#else
kusano 2b45e8
	daddiu	TEMP, KK, 1
kusano 2b45e8
#endif
kusano 2b45e8
	dsra	K,TEMP,2				#  K = TEMP/4: number of 4-step passes of .L81
kusano 2b45e8
	beqz	K,.L85					#  fewer than 4 k-steps: go straight to the remainders
kusano 2b45e8
	nop
kusano 2b45e8
#else
kusano 2b45e8
	move	B, BO
kusano 2b45e8
	dsra	K,KCO,2					#  K = KCO/4: number of 4-step passes of .L81
kusano 2b45e8
	LD	b0,	0*SIZE(B)
kusano 2b45e8
kusano 2b45e8
	MTC		$0,t11				#  t11 = 0, taken from integer register $0
kusano 2b45e8
	MOV		t21,t11				#  t21 = 0
kusano 2b45e8
	LD	a0,0*SIZE(A)
kusano 2b45e8
	
kusano 2b45e8
	beqz	K,.L85					#  fewer than 4 k-steps: go straight to the remainders
kusano 2b45e8
	LD	a1,1*SIZE(A)				#  follows the branch so it runs on both paths (delay slot, assuming noreorder)
kusano 2b45e8
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
.L81:								#  nr=1,mr=2,kr=4: four k-steps per pass, accumulating into t11 and t21
kusano 2b45e8
	LD	b4,	1*SIZE(B)
kusano 2b45e8
	LD	a4,2*SIZE(A)
kusano 2b45e8
	MADD	t11,t11,a0,b0				#  step 1: t11 += a0*b0 (assuming MADD d,c,x,y gives c + x*y; macro defined outside this chunk)
kusano 2b45e8
	LD	a5,3*SIZE(A)
kusano 2b45e8
	MADD	t21,t21,a1,b0
kusano 2b45e8
kusano 2b45e8
	LD	b2,	2*SIZE(B)
kusano 2b45e8
	LD	a2,4*SIZE(A)
kusano 2b45e8
	MADD	t11,t11,a4,b4				#  step 2
kusano 2b45e8
	LD	a3,5*SIZE(A)
kusano 2b45e8
	MADD	t21,t21,a5,b4
kusano 2b45e8
	
kusano 2b45e8
	LD	b6,	3*SIZE(B)
kusano 2b45e8
	LD	a6,6*SIZE(A)
kusano 2b45e8
	MADD	t11,t11,a2,b2				#  step 3
kusano 2b45e8
	LD	a7,7*SIZE(A)
kusano 2b45e8
	MADD	t21,t21,a3,b2
kusano 2b45e8
kusano 2b45e8
	daddu	A,A,8*SIZE				#  A += 2(mr)*4(kr) elements = 8*SIZE bytes
kusano 2b45e8
	daddu	B,B,4*SIZE				#  B += 1(nr)*4(kr) elements = 4*SIZE bytes
kusano 2b45e8
kusano 2b45e8
	LD	b0,	0*SIZE(B)				#  reload b0,a0,a1 from the advanced pointers for the next pass
kusano 2b45e8
	daddiu	K,K,-1					#  one fewer pass remaining
kusano 2b45e8
kusano 2b45e8
	LD	a0,0*SIZE(A)
kusano 2b45e8
	MADD	t11,t11,a6,b6				#  step 4
kusano 2b45e8
kusano 2b45e8
	LD	a1,1*SIZE(A)
kusano 2b45e8
	bnez 	K,.L81					#  repeat while passes remain
kusano 2b45e8
	MADD	t21,t21,a7,b6				#  step 4, second row; follows the branch so it runs on both paths (delay slot, assuming noreorder)
kusano 2b45e8
kusano 2b45e8
.L85:								#  kr=2: handle two leftover k-steps for the mr=2,nr=1 tile
kusano 2b45e8
#ifndef TRMMKERNEL
kusano 2b45e8
	andi	K,KCO,2					#  K = bit 1 of the k count (KCO)
kusano 2b45e8
#else
kusano 2b45e8
	andi	K,TEMP,2				#  K = bit 1 of the k count (TEMP in the TRMM build)
kusano 2b45e8
#endif
kusano 2b45e8
	beqz	K,.L88					#  bit clear: go to the single-step remainder
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
.L86:								#  two k-steps: (a0,a1)*b0 then (a4,a5)*b4
kusano 2b45e8
	LD	b4,	1*SIZE(B)
kusano 2b45e8
	LD	a4,2*SIZE(A)
kusano 2b45e8
	MADD	t11,t11,a0,b0
kusano 2b45e8
	LD	a5,3*SIZE(A)
kusano 2b45e8
	MADD	t21,t21,a1,b0
kusano 2b45e8
	
kusano 2b45e8
	daddu	A,A,4*SIZE				#  A += 2(mr)*2(kr) elements = 4*SIZE bytes
kusano 2b45e8
	daddu	B,B,2*SIZE				#  B += 1(nr)*2(kr) elements = 2*SIZE bytes
kusano 2b45e8
	
kusano 2b45e8
	LD	b0,0(B)					#  reload b0,a0,a1 from the advanced pointers for .L88
kusano 2b45e8
	LD	a0,0*SIZE(A)
kusano 2b45e8
	MADD	t11,t11,a4,b4
kusano 2b45e8
	LD	a1,1*SIZE(A)
kusano 2b45e8
	MADD	t21,t21,a5,b4
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	
kusano 2b45e8
.L88:								#  kr=1: handle one leftover k-step for the mr=2,nr=1 tile
kusano 2b45e8
#ifndef TRMMKERNEL
kusano 2b45e8
	andi	K,KCO,1					#  K = bit 0 of the k count (KCO)
kusano 2b45e8
#else
kusano 2b45e8
	andi	K,TEMP,1				#  K = bit 0 of the k count (TEMP in the TRMM build)
kusano 2b45e8
#endif
kusano 2b45e8
	beqz	K,.L89					#  bit clear: nothing left, go to write back
kusano 2b45e8
	LD	ALPHA,152($sp)				#  Get ALPHA from the stack frame; follows the branch so both paths get it (delay slot, assuming noreorder)
kusano 2b45e8
	
kusano 2b45e8
	MADD	t11,t11,a0,b0				#  final step uses the already loaded a0,a1 and b0
kusano 2b45e8
	MADD	t21,t21,a1,b0
kusano 2b45e8
	daddu	A,A,2*SIZE				#  A += 2(mr)*1(kr) elements = 2*SIZE bytes
kusano 2b45e8
	daddu	B,B,1*SIZE				#  B += 1(nr)*1(kr) element = SIZE bytes
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L89:								#  Write Back of the mr=2,nr=1 tile held in t11,t21
kusano 2b45e8
#ifndef TRMMKERNEL
kusano 2b45e8
	LD	c11,0(CO1)					#  Fetch 2 consecutive C values at CO1
kusano 2b45e8
	LD	c21,1*SIZE(CO1)			
kusano 2b45e8
kusano 2b45e8
	MADD	t11,c11,t11,ALPHA			#  t11 = c11 + t11*ALPHA (assuming MADD d,c,x,y gives c + x*y; macro defined outside this chunk)
kusano 2b45e8
	MADD	t21,c21,t21,ALPHA
kusano 2b45e8
kusano 2b45e8
	ST	t11,0(CO1)
kusano 2b45e8
	ST	t21,1*SIZE(CO1)
kusano 2b45e8
kusano 2b45e8
	FETCH	$0,2*SIZE(CO1)				#  touch the C location after this tile (ld into $0, value discarded)
kusano 2b45e8
	
kusano 2b45e8
	daddu	CO1,CO1,2*SIZE			#  CO1 += 2 elements (2*SIZE bytes)
kusano 2b45e8
kusano 2b45e8
#else
kusano 2b45e8
	daddu	CO1,CO1,2*SIZE			#  CO1 += 2 elements (2*SIZE bytes); the stores below use negative offsets to compensate
kusano 2b45e8
	MUL	t11, ALPHA, t11				#  TRMM build: scale by ALPHA only, C is not read
kusano 2b45e8
	MUL	t21, ALPHA, t21
kusano 2b45e8
kusano 2b45e8
	FETCH	$0,0(CO1)
kusano 2b45e8
	ST	t11, -2 * SIZE(CO1)
kusano 2b45e8
	ST	t21, -1 * SIZE(CO1)
kusano 2b45e8
#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	dsubu	TEMP, KCO, KK				#  TEMP = KCO - KK, then minus 2 (LEFT) or 1 (otherwise) below
kusano 2b45e8
#ifdef LEFT
kusano 2b45e8
	daddiu	TEMP, TEMP, -2
kusano 2b45e8
#else
kusano 2b45e8
	daddiu	TEMP, TEMP, -1
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
	dsll	K,    TEMP, 1 + BASE_SHIFT		#  K = TEMP * 2 elements, in bytes (scratch use of K)
kusano 2b45e8
	dsll	TEMP, TEMP, 0 + BASE_SHIFT		#  TEMP = TEMP * 1 element, in bytes
kusano 2b45e8
kusano 2b45e8
	daddu	A, A, K					#  advance A and B by those byte counts
kusano 2b45e8
	daddu	B, B, TEMP
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
#ifdef LEFT
kusano 2b45e8
	daddiu	KK, KK, 2				#  KK advances by the 2 rows just finished
kusano 2b45e8
#endif
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 3
kusano 2b45e8
.L11_M1:							#  nr=1: optional 1-row tile
kusano 2b45e8
	andi		M,MCO,1				#   mr = 1: test bit 0 of the row count MCO
kusano 2b45e8
	beqz	M,.L999					#  bit clear: no 1-row tile, go to the epilogue
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
.L90:								#  set up B, accumulator t11, first operands and the pass count K
kusano 2b45e8
#if defined(TRMMKERNEL)
kusano 2b45e8
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	move	B,  BO
kusano 2b45e8
#else
kusano 2b45e8
	dsll	K,    KK, 0 + BASE_SHIFT		#  K = KK * 1 element, in bytes (scratch use of K)
kusano 2b45e8
	dsll	TEMP, KK, 0 + BASE_SHIFT		#  TEMP = same byte count, used for B
kusano 2b45e8
kusano 2b45e8
	daddu	A, A, K					#  start A and B at offset KK within their panels
kusano 2b45e8
	daddu	B, BO,  TEMP
kusano 2b45e8
#endif
kusano 2b45e8
	LD	a0,	0*SIZE(A)
kusano 2b45e8
	LD	b0,	0*SIZE(B)
kusano 2b45e8
	MTC		$0,t11				#  t11 = 0, taken from integer register $0
kusano 2b45e8
kusano 2b45e8
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
kusano 2b45e8
	dsubu	TEMP, KCO, KK				#  TEMP = k count for this tile
kusano 2b45e8
#elif defined(LEFT)
kusano 2b45e8
	daddiu	TEMP, KK, 1
kusano 2b45e8
#else
kusano 2b45e8
	daddiu	TEMP, KK, 1
kusano 2b45e8
#endif
kusano 2b45e8
	dsra	K,  TEMP, 2				#  K = TEMP/4: number of 4-step passes of .L91
kusano 2b45e8
	beqz	K,.L95					#  fewer than 4 k-steps: go straight to the remainders
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
#else
kusano 2b45e8
	move	B,  BO
kusano 2b45e8
	LD	a0,	0*SIZE(A)
kusano 2b45e8
	LD	b0,	0*SIZE(B)
kusano 2b45e8
	dsra	K,KCO,2					#  K = KCO/4: number of 4-step passes of .L91
kusano 2b45e8
	beqz	K,.L95					#  fewer than 4 k-steps: go straight to the remainders
kusano 2b45e8
	MTC		$0,t11				#  t11 = 0; follows the branch so it runs on both paths (delay slot, assuming noreorder)
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
.L91:								#  nr=mr=1,kr=4: four k-steps per pass, accumulating into t11
kusano 2b45e8
	LD	a4,	1*SIZE(A)
kusano 2b45e8
	LD	b4,	1*SIZE(B)
kusano 2b45e8
	MADD	t11,t11,a0,b0				#  step 1: t11 += a0*b0 (assuming MADD d,c,x,y gives c + x*y; macro defined outside this chunk)
kusano 2b45e8
	
kusano 2b45e8
	LD	a2,	2*SIZE(A)
kusano 2b45e8
	LD	b2,	2*SIZE(B)
kusano 2b45e8
	MADD	t11,t11,a4,b4				#  step 2
kusano 2b45e8
kusano 2b45e8
	LD	a6,	3*SIZE(A)
kusano 2b45e8
	LD	b6,	3*SIZE(B)
kusano 2b45e8
	MADD	t11,t11,a2,b2				#  step 3
kusano 2b45e8
	
kusano 2b45e8
	daddu	A,A,4*SIZE				#  A += 1(mr)*4(kr) elements = 4*SIZE bytes
kusano 2b45e8
	daddu	B,B,4*SIZE				#  B += 1(nr)*4(kr) elements = 4*SIZE bytes
kusano 2b45e8
kusano 2b45e8
	LD	a0,	0*SIZE(A)				#  reload a0,b0 from the advanced pointers for the next pass
kusano 2b45e8
	LD	b0,	0*SIZE(B)
kusano 2b45e8
	MADD	t11,t11,a6,b6				#  step 4
kusano 2b45e8
	
kusano 2b45e8
	daddiu	K,K,-1					#  one fewer pass remaining
kusano 2b45e8
	bnez 	K,.L91					#  repeat while passes remain
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
.L95:								#  kr=2: handle two leftover k-steps for the mr=1,nr=1 tile
kusano 2b45e8
#ifndef TRMMKERNEL
kusano 2b45e8
	andi	K,KCO,2					#  K = bit 1 of the k count (KCO)
kusano 2b45e8
#else
kusano 2b45e8
	andi	K,TEMP,2				#  K = bit 1 of the k count (TEMP in the TRMM build)
kusano 2b45e8
#endif
kusano 2b45e8
	beqz	K,.L98					#  bit clear: go to the single-step remainder
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
.L96:								#  two k-steps: a0*b0 then a4*b4
kusano 2b45e8
	LD	a4,	1*SIZE(A)
kusano 2b45e8
	LD	b4,	1*SIZE(B)
kusano 2b45e8
	MADD	t11,t11,a0,b0
kusano 2b45e8
	daddu	B,B,2*SIZE				#  B += 1(nr)*2(kr) elements = 2*SIZE bytes
kusano 2b45e8
	daddu	A,A,2*SIZE				#  A += 1(mr)*2(kr) elements = 2*SIZE bytes
kusano 2b45e8
kusano 2b45e8
	LD	b0,0(B)					#  reload b0,a0 from the advanced pointers for .L98
kusano 2b45e8
	LD	a0,0(A)
kusano 2b45e8
	MADD	t11,t11,a4,b4
kusano 2b45e8
	
kusano 2b45e8
.L98:								#  kr=1: handle one leftover k-step for the mr=1,nr=1 tile
kusano 2b45e8
#ifndef TRMMKERNEL
kusano 2b45e8
	andi		K,KCO,1				#  K = bit 0 of the k count (KCO)
kusano 2b45e8
#else
kusano 2b45e8
	andi	K,TEMP,1				#  K = bit 0 of the k count (TEMP in the TRMM build)
kusano 2b45e8
#endif
kusano 2b45e8
	beqz	K,.L99					#  bit clear: nothing left, go to write back
kusano 2b45e8
	LD	ALPHA,152($sp)				#  Get ALPHA from the stack frame; follows the branch so both paths get it (delay slot, assuming noreorder)
kusano 2b45e8
kusano 2b45e8
	MADD	t11,t11,a0,b0				#  final step; A and B are not advanced afterwards in this path
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L99:								#  Write Back of the single mr=1,nr=1 result held in t11
kusano 2b45e8
#ifndef TRMMKERNEL
kusano 2b45e8
	LD	c11,0(CO1)					#  Fetch 1 C value at CO1
kusano 2b45e8
	MADD	t11,c11,t11,ALPHA			#  t11 = c11 + t11*ALPHA (assuming MADD d,c,x,y gives c + x*y; macro defined outside this chunk)
kusano 2b45e8
	ST	t11,0(CO1)
kusano 2b45e8
kusano 2b45e8
#else
kusano 2b45e8
	MUL	t11, ALPHA, t11				#  TRMM build: scale by ALPHA only, C is not read
kusano 2b45e8
kusano 2b45e8
	ST	t11,  0 * SIZE(CO1)
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L999:							#  End: reload saved registers from the 160-byte stack frame and return
kusano 2b45e8
	ld	$16,   0($sp)				#  integer registers $16-$22 from offsets 0..48
kusano 2b45e8
	ld	$17,   8($sp)
kusano 2b45e8
	ld	$18,  16($sp)
kusano 2b45e8
	ld	$19,  24($sp)
kusano 2b45e8
	ld	$20,  32($sp)
kusano 2b45e8
	ld	$21,  40($sp)
kusano 2b45e8
	ld	$22,  48($sp)
kusano 2b45e8
	LD	$f24, 56($sp)				#  FP registers $f24-$f28 from offsets 56..88; load width is whatever the LD macro expands to (defined outside this chunk)
kusano 2b45e8
	LD	$f25, 64($sp)
kusano 2b45e8
	LD	$f26, 72($sp)
kusano 2b45e8
	LD	$f27, 80($sp)
kusano 2b45e8
	LD	$f28, 88($sp)
kusano 2b45e8
	ld	$23,  96($sp)				#  integer registers $23-$25 from offsets 96..112
kusano 2b45e8
	ld	$24, 104($sp)
kusano 2b45e8
	ld	$25, 112($sp)
kusano 2b45e8
	LD	$f20,120($sp)				#  FP registers $f20-$f23 from offsets 120..144
kusano 2b45e8
	LD	$f21,128($sp)
kusano 2b45e8
	LD	$f22,136($sp)
kusano 2b45e8
	LD	$f23,144($sp)				#  NOTE(review): $f29-$f31 (t41,t11,t21) are written by this kernel but not reloaded here - confirm against the prologue and the ABI in use
kusano 2b45e8
kusano 2b45e8
	j	$31					#  return to the caller
kusano 2b45e8
	daddiu	$sp, $sp, 160				#  release the 160-byte frame; follows the jump (delay slot, assuming noreorder)
kusano 2b45e8
kusano 2b45e8
	EPILOGUE