/* Stray "Blob Blame Raw" web-viewer text was here; it is not part of the assembly source. */
#define REALNAME ASMNAME
#define ASSEMBLER
#include "common.h"

/* FETCH is a plain doubleword load. Every use visible in this file targets $0,
   so the loaded value is discarded (presumably a software prefetch -- confirm
   for the target core). */
#define FETCH	ld
/* Emit a raw instruction word assembled from numeric register fields.
   NOTE(review): presumably the Loongson 128-bit FP load/store encodings; the
   field layout is not verified here and neither macro is used in the visible
   part of this file. */
#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)

/* Integer register roles. M, N and K are reused as loop counters once their
   incoming values have been copied to MCO, NCO and KCO. */
#define M	$4
#define	N	$5
#define	K	$6
#define A	$8
#define B	$9
#define C	$10
#define LDC	$11

/* AO/BO hold the base of the current A and B panels so A/B can be reset. */
#define AO	$12
#define BO	$13

/* CO1..CO4 point into up to four consecutive columns of C, LDC bytes apart. */
#define CO1	$14
#define CO2	$15
#define CO3	$16
#define CO4	$17

/* Backups of the incoming K, M and N. */
#define KCO	$18
#define MCO	$19
#define NCO	$20

/* SPANA/SPANB are byte spans added to AO/BO to form the PREA/PREB pointers
   used by the FETCH instructions. */
#define SPANB	$21
#define PREB	$23
#define PREA	$24
#define SPANA	$25

/* ALPHA shares $f15 with operand register b7, which is why the code spills it
   to 152($sp) and reloads it before each write-back. */
#define ALPHA	$f15

/* TRMM-only registers: OFFSET is loaded from 160($sp), KK is derived from it,
   TEMP is scratch for the per-tile length computations. */
#if defined(TRMMKERNEL)
#define	OFFSET	$2
#define	KK	$3
#define	TEMP	$7
#endif

/* Bare integer register numbers.
   NOTE(review): presumably intended as the base field of gsLQC1/gsSQC1; they
   are not referenced in the visible part of this file. */
#define R8	8
#define	R9	9
#define R14	14
#define R15	15
#define R16	16
#define R17 17

/* t<i><j>: accumulators for the result tile, held in $f16-$f31.
   The first digit indexes the element within a column of C (offset i-1 from
   COj), the second digit selects the column pointer COj. */
#define	t11	$f30
#define	t21	$f31
#define	t31	$f28
#define	t41	$f29

#define	t12	$f26
#define	t22	$f27
#define	t32	$f24
#define	t42	$f25

#define	t13	$f22
#define	t23	$f23
#define	t33	$f20
#define	t43	$f21

#define	t14	$f18
#define	t24	$f19
#define	t34	$f16
#define	t44	$f17

/* c<i><j>: values loaded from C in the GEMM write-back. They reuse $f0-$f14,
   the same registers as the a/b operands below, which are dead by then.
   c44 shares $f0 with c11; in the 4x4 write-back c11 is consumed before c44
   is loaded. */
#define	c11	$f0
#define	c21	$f1
#define	c31	$f2
#define	c41	$f3

#define	c12	$f4
#define	c22	$f5
#define	c32	$f6
#define	c42	$f7

#define	c13	$f8
#define	c23	$f9
#define	c33	$f10
#define c43	$f11

#define	c14	$f12
#define	c24	$f13
#define	c34	$f14
#define	c44	$f0

/* a0-a7: operands loaded from A; b0-b7: operands loaded from B.
   b7 is $f15, the same register as ALPHA. */
#define	a0	$f0
#define	a1	$f1
#define	a2	$f2
#define	a3	$f3
#define	a4	$f4
#define	a5	$f5
#define	a6	$f6
#define	a7	$f7
#define	b0	$f8
#define	b1	$f9
#define	b2	$f10
#define b3	$f11
#define	b4	$f12
#define	b5	$f13
#define	b6	$f14
#define	b7	$f15

/* Bare FP register numbers.
   NOTE(review): presumably intended as the fq/ft fields of gsLQC1/gsSQC1; they
   are not referenced in the visible part of this file. */
#define F31 31
#define F30 30
#define F29 29
#define F28 28
#define F27 27
#define F26 26
#define F25 25
#define F24 24 
#define F23 23
#define F22 22
#define F21 21
#define F20 20
#define F19 19
#define F18 18
#define F17 17
#define F16 16 
#define F15 15
#define F14 14
#define F13 13
#define F12 12
#define F11 11
#define F10 10
#define F9 9
#define F8 8
#define F7 7
#define F6 6
#define F5 5
#define F4 4 
#define F3 3 
#define F2 2 
#define F1 1 
#define F0 0

	PROLOGUE
	
	/* Reserve a 160-byte frame and spill $16-$25 and $f20-$f28 into it.
	   Slot 152 is used further down to hold ALPHA.
	   NOTE(review): $f29-$f31 are used as accumulators (t41, t11, t21) but are
	   not spilled here -- confirm against the target ABI and the epilogue,
	   which is outside this view. */
	daddiu	$sp, $sp, -160
	sd	$16,   0($sp)
	sd	$17,   8($sp)
	sd	$18,  16($sp)
	sd	$19,  24($sp)
	sd	$20,  32($sp)
	sd	$21,  40($sp)
	sd	$22,  48($sp)
	ST	$f24, 56($sp)
	ST	$f25, 64($sp)
	ST	$f26, 72($sp)
	ST	$f27, 80($sp)
	ST	$f28, 88($sp)
	sd	$23,  96($sp)
	sd	$24, 104($sp)
	sd	$25, 112($sp)
	ST	$f20,120($sp)
	ST	$f21,128($sp)
	ST	$f22,136($sp)
	ST	$f23,144($sp)


	.align	5					
.L0_N4:									#  Loop N
	ST	ALPHA,152($sp)					#  Backup	ALPHA
	move	MCO,M						#  Backup	M

	move	NCO,N						#  Backup	N
	move	KCO,K						#  Backup	K

	move	AO,A						#  Backup	A_addr
	dsra	N,NCO,2						#  N=NCO/4, number of 4-column panels
	
	dsll	LDC,LDC,BASE_SHIFT			#  LDC*8Byte
	dsll	SPANB,KCO,2+BASE_SHIFT		#  SPANB=KC*4nr*8Byte=KC*2^5
	
#if defined(TRMMKERNEL)
	LDARG	OFFSET,160($sp)				#	OFFSET is relative to the data part		
#endif

#if defined(TRMMKERNEL) && !defined(LEFT)
	neg		KK,OFFSET				
#endif
	
	move	BO,B						#  Backup	B_addr
	beq		N,$0,.L0_N2					#  N=0,NCO<4
	dsll	SPANA,KCO,1+BASE_SHIFT		#  SPANA = KCO*2mr*8Byte, follows the branch so it is set on both paths

.L0_N4_Lb:								#	mr=4,nr=4
	/* Start of one 4-column panel: set CO1..CO4 to four consecutive columns
	   of C, reset A, and point PREA/PREB one span past AO/BO. */
	move	CO1,C							
	dsra	M,MCO,2						#  M=MCO/4, number of 4-row blocks
	
	move	A,AO						#  Reset A
	daddu	CO2,C,LDC

	daddu	PREB,BO,SPANB				#  PreB point next panelB
	daddu	CO3,CO2,LDC

	daddu	PREA,AO,SPANA
	daddu	CO4,CO3,LDC

#if defined(TRMMKERNEL) && defined(LEFT)
	move	KK,OFFSET					
#endif
	beqz	M,.L14_M2
	daddu	C,CO4,LDC					#	move C to next panel Cj

.L10:
/* 4x4 tile setup: position A/B, clear the 16 accumulators t11..t44, preload
   a0-a3 and b0-b3, and compute K = number of 4-step iterations of the k loop.
   The instruction after each branch is written as a delay-slot instruction
   (assumes noreorder mode is set up outside this view). */
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	B,BO						#	(SIDE=L and UPLO=L) or (SIDE=R and UPLO=U)
#else
	dsll	K,KK,2 + BASE_SHIFT			#  KK is the length that needs to span to the data part
	dsll	TEMP,KK,2 + BASE_SHIFT

	daddu	A,A,K						#  move A B to data part
	daddu	B,BO,TEMP
#endif

	MTC		$0,t11						# 	GEMM part	NR=4,MR=4
	LD	a0,0(A)

	MOV	t21,t11
	MOV	t31,t11
	LD	a1,1*SIZE(A)

	MOV	t41,t11
	MOV	t12,t11
	LD	b0,0(B)
	
	MOV	t22,t11
	MOV	t32,t11
	LD	b1,1*SIZE(B)

	MOV	t42,t11
	LD	a2,2*SIZE(A)
	
	MOV	t13,t11
	MOV	t23,t11
	LD	b2,2*SIZE(B)
	
	MOV	t33,t11
	MOV	t43,t11
	LD	a3,3*SIZE(A)

	MOV	t14,t11
	MOV	t24,t11
	LD	b3,3*SIZE(B)

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP,KCO,KK					#  temp is the length of the data part
#elif defined(LEFT)
	daddiu	TEMP, KK, 4					#	S=L,U=L	
#else
	daddiu	TEMP, KK, 4					#	S=R,U=U,for this two situation KK is the length of the data part
#endif
	dsra	K,TEMP,2					#  K=TEMP/4
	MOV	t34,t11
	beqz	K,.L15
	MOV	t44,t11

#else							
	move	B,BO						#	Reset B
	MTC		$0,t11						# 	GEMM part	NR=4,MR=4
	LD	a0,0(A)

	MOV	t21,t11
	MOV	t31,t11
	LD	a1,1*SIZE(A)

	MOV	t41,t11
	MOV	t12,t11
	LD	b0,0(B)
	
	MOV	t22,t11
	MOV	t32,t11
	LD	b1,1*SIZE(B)

	MOV	t42,t11
	dsra	K,KCO,2						#  K=KCO/4
	LD	a2,2*SIZE(A)
	
	MOV	t13,t11
	MOV	t23,t11
	LD	b2,2*SIZE(B)
	
	MOV	t33,t11
	MOV	t43,t11
	LD	a3,3*SIZE(A)

	MOV	t14,t11
	MOV	t24,t11
	LD	b3,3*SIZE(B)

	MOV	t34,t11
	beqz	K,.L15
	MOV	t44,t11							#	clear 16 results registers
#endif
	
	.align	5
.L11:									#  kr=4
	/* Main k loop of the 4x4 tile, unrolled four times (.L11-.L14).
	   Each step accumulates a 4x4 outer product into t11..t44 while loading
	   the operands of the next step, alternating between the register sets
	   a0-a3/b0-b3 and a4-a7/b4-b7. A and B advance by 16*SIZE per iteration.
	   b7 is the same register as ALPHA, so ALPHA is not valid inside the loop. */
	MADD	t11,t11,a0,b0
	MADD	t21,t21,a1,b0
	LD	a4,4*SIZE(A)

	MADD	t12,t12,a0,b1
	MADD	t22,t22,a1,b1
	LD	a5,5*SIZE(A)
	
	MADD	t31,t31,a2,b0
	MADD	t41,t41,a3,b0
	LD	b4,4*SIZE(B)
	
	MADD	t32,t32,a2,b1
	MADD	t42,t42,a3,b1
	LD	b5,5*SIZE(B)
	FETCH		$0,(PREB)
	
	MADD	t13,t13,a0,b2
	MADD	t23,t23,a1,b2
	LD	a6,6*SIZE(A)
	
	MADD	t14,t14,a0,b3
	MADD	t24,t24,a1,b3
	LD	b6,6*SIZE(B)
	FETCH		$0,(PREA)
	
	MADD	t33,t33,a2,b2
	MADD	t43,t43,a3,b2
	LD	a7,7*SIZE(A)
	
	MADD	t34,t34,a2,b3
	MADD	t44,t44,a3,b3
	LD	b7,7*SIZE(B)

.L12:
	MADD	t11,t11,a4,b4
	MADD	t21,t21,a5,b4
	LD	a0,8*SIZE(A)

	MADD	t12,t12,a4,b5
	MADD	t22,t22,a5,b5
	LD	a1,9*SIZE(A)

	MADD	t31,t31,a6,b4
	MADD	t41,t41,a7,b4
	LD	b0,8*SIZE(B)

	MADD	t32,t32,a6,b5
	MADD	t42,t42,a7,b5
	LD	b1,9*SIZE(B)

	FETCH		$0,4*SIZE(PREB)
	MADD	t13,t13,a4,b6
	MADD	t23,t23,a5,b6
	LD	a2,10*SIZE(A)

	MADD	t14,t14,a4,b7
	MADD	t24,t24,a5,b7
	LD	b2,10*SIZE(B)

	FETCH		$0,4*SIZE(PREA)
	MADD	t33,t33,a6,b6
	MADD	t43,t43,a7,b6
	LD	a3,11*SIZE(A)

	MADD	t34,t34,a6,b7
	MADD	t44,t44,a7,b7
	LD	b3,11*SIZE(B)

.L13:
	MADD	t11,t11,a0,b0
	MADD	t21,t21,a1,b0
	LD	a4,12*SIZE(A)

	MADD	t12,t12,a0,b1
	MADD	t22,t22,a1,b1
	LD	a5,13*SIZE(A)

	MADD	t31,t31,a2,b0
	MADD	t41,t41,a3,b0
	LD	b4,12*SIZE(B)

	FETCH		$0,8*SIZE(PREA)
	MADD	t32,t32,a2,b1
	MADD	t42,t42,a3,b1
	LD	b5,13*SIZE(B)

	FETCH		$0,8*SIZE(PREB)
	MADD	t13,t13,a0,b2
	MADD	t23,t23,a1,b2
	LD	a6,14*SIZE(A)

	MADD	t14,t14,a0,b3
	MADD	t24,t24,a1,b3
	daddu	A,A,16*SIZE					#  4mr*4kr
	LD	b6,14*SIZE(B)

	MADD	t33,t33,a2,b2
	MADD	t43,t43,a3,b2
	daddu	B,B,16*SIZE					#	4nr*4kr
	LD	a7,-1*SIZE(A)					#  A already advanced: this is element 15 of the old block

	MADD	t34,t34,a2,b3
	MADD	t44,t44,a3,b3
	LD	b7,-1*SIZE(B)					#  B already advanced: this is element 15 of the old block

.L14:
	MADD	t11,t11,a4,b4
	MADD	t21,t21,a5,b4
	LD	a0,0(A)

	MADD	t12,t12,a4,b5
	MADD	t22,t22,a5,b5
	LD	a1,1*SIZE(A)

	MADD	t31,t31,a6,b4
	MADD	t41,t41,a7,b4
	daddiu	K,K,-1
	LD	b0,0(B)

	MADD	t32,t32,a6,b5
	MADD	t42,t42,a7,b5
	daddu	PREA,PREA,16*SIZE
	LD	b1,1*SIZE(B)

	FETCH		$0,12*SIZE(PREB)
	MADD	t13,t13,a4,b6
	MADD	t23,t23,a5,b6
	LD	a2,2*SIZE(A)

	FETCH		$0,-4*SIZE(PREA)
	MADD	t14,t14,a4,b7
	MADD	t24,t24,a5,b7
	LD	b2,2*SIZE(B)

	MADD	t33,t33,a6,b6
	MADD	t43,t43,a7,b6
	daddu	PREB,PREB,16*SIZE
	LD	a3,3*SIZE(A)

	MADD	t34,t34,a6,b7
	MADD	t44,t44,a7,b7
	bnez 	K,.L11
	LD	b3,3*SIZE(B)					#  follows the branch: must be loaded on both paths


.L15:									#  kr=2
/* k remainder of the 4x4 tile: .L16/.L17 handle two leftover k steps when bit 1
   of the k length is set, .L18 handles one leftover step when bit 0 is set.
   The k length is KCO for GEMM and TEMP for TRMM. */
#ifndef TRMMKERNEL
	andi	K,KCO,2				
#else
	andi	K,TEMP, 2
#endif
	beqz	K,.L18
	nop

.L16:			
	MADD	t11,t11,a0,b0
	MADD	t21,t21,a1,b0
	LD	a4,4*SIZE(A)

	MADD	t12,t12,a0,b1
	MADD	t22,t22,a1,b1
	LD	a5,5*SIZE(A)

	MADD	t31,t31,a2,b0
	MADD	t41,t41,a3,b0
	LD	b4,4*SIZE(B)

	FETCH		$0,0(PREA)
	MADD	t32,t32,a2,b1
	MADD	t42,t42,a3,b1
	LD	b5,5*SIZE(B)

	FETCH		$0,0(PREB)
	MADD	t13,t13,a0,b2
	MADD	t23,t23,a1,b2
	LD	a6,6*SIZE(A)

	MADD	t14,t14,a0,b3
	MADD	t24,t24,a1,b3
	daddu	A,A,8*SIZE					#	4mr*2kr
	LD	b6,6*SIZE(B)

	MADD	t33,t33,a2,b2
	MADD	t43,t43,a3,b2
	daddu	B,B,8*SIZE					#	4nr*2kr
	LD	a7,-1*SIZE(A)

	MADD	t34,t34,a2,b3
	MADD	t44,t44,a3,b3
	LD	b7,-1*SIZE(B)

.L17:
	MADD	t11,t11,a4,b4
	MADD	t21,t21,a5,b4
	LD	a0,0*SIZE(A)

	MADD	t12,t12,a4,b5
	MADD	t22,t22,a5,b5
	LD	a1,1*SIZE(A)

	MADD	t31,t31,a6,b4
	MADD	t41,t41,a7,b4
	LD	b0,0*SIZE(B)

	MADD	t32,t32,a6,b5
	MADD	t42,t42,a7,b5
	LD	b1,1*SIZE(B)

	FETCH		$0,4*SIZE(PREB)
	MADD	t13,t13,a4,b6
	MADD	t23,t23,a5,b6
	LD	a2,2*SIZE(A)

	FETCH		$0,4*SIZE(PREA)
	MADD	t14,t14,a4,b7
	MADD	t24,t24,a5,b7
	LD	b2,2*SIZE(B)

	MADD	t33,t33,a6,b6
	MADD	t43,t43,a7,b6
	daddu	PREA,PREA,8*SIZE
	LD	a3,3*SIZE(A)

	MADD	t34,t34,a6,b7
	MADD	t44,t44,a7,b7
	daddu	PREB,PREB,8*SIZE
	LD	b3,3*SIZE(B)

	
.L18:									#	kr=1
#ifndef TRMMKERNEL
	andi	K,KCO,1
#else
	andi	K,TEMP,1
#endif
	beqz	K,.L19				  
	LD	ALPHA,152($sp)					#  Get ALPHA, reloaded on both paths; b7 is dead from here on
	
	FETCH		$0,0(PREB)
	MADD	t11,t11,a0,b0
	MADD	t21,t21,a1,b0
	daddu	A,A,4*SIZE					#  	4mr*kr

	MADD	t12,t12,a0,b1
	MADD	t22,t22,a1,b1
	daddu	B,B,4*SIZE					#	4nr*kr

	FETCH		$0,0(PREA)
	MADD	t31,t31,a2,b0
	MADD	t41,t41,a3,b0
	daddu	PREB,PREB,4*SIZE

	MADD	t32,t32,a2,b1
	MADD	t42,t42,a3,b1
	daddu	PREA,PREA,4*SIZE

	MADD	t13,t13,a0,b2
	MADD	t23,t23,a1,b2

	MADD	t14,t14,a0,b3
	MADD	t24,t24,a1,b3

	MADD	t33,t33,a2,b2
	MADD	t43,t43,a3,b2

	MADD	t34,t34,a2,b3
	MADD	t44,t44,a3,b3

.L19:									#  Write Back to C
/* Write-back of the 4x4 tile.
   GEMM: load the 16 C values, combine each with its accumulator and ALPHA via
   MADD (presumably C + ALPHA*t -- MADD is defined outside this file), store,
   then advance CO1..CO4 by 4 elements and loop to .L10 while M != 0.
   TRMM: store ALPHA*t without reading C, advance A/B past the untouched rest
   of the panel where required, and bump KK by 4 when LEFT is defined. */
#ifndef TRMMKERNEL				
	LD	c11,0(CO1)						#  GEMM write part 
	LD	c21,1*SIZE(CO1)					#  get 16 C
	LD	c31,2*SIZE(CO1)
	LD	c41,3*SIZE(CO1)

	LD	c12,0(CO2)
	MADD	t11,c11,t11,ALPHA
	LD	c22,1*SIZE(CO2)
	MADD	t21,c21,t21,ALPHA
	LD	c32,2*SIZE(CO2)
	MADD	t31,c31,t31,ALPHA
	LD	c42,3*SIZE(CO2)
	MADD	t41,c41,t41,ALPHA

	LD	c13,0(CO3)
	MADD	t12,c12,t12,ALPHA
	LD	c23,1*SIZE(CO3)
	MADD	t22,c22,t22,ALPHA
	LD	c33,2*SIZE(CO3)
	MADD	t32,c32,t32,ALPHA
	LD	c43,3*SIZE(CO3)
	MADD	t42,c42,t42,ALPHA

	LD	c14,0(CO4)
	MADD	t13,c13,t13,ALPHA
	LD	c24,1*SIZE(CO4)
	MADD	t23,c23,t23,ALPHA
	LD	c34,2*SIZE(CO4)
	MADD	t33,c33,t33,ALPHA
	LD	c44,3*SIZE(CO4)					#  c44 reuses the register of c11, already consumed above
	MADD	t43,c43,t43,ALPHA

	ST	t11,0(CO1)
	MADD	t14,c14,t14,ALPHA
	ST	t21,1*SIZE(CO1)
	MADD	t24,c24,t24,ALPHA
	ST	t31,2*SIZE(CO1)
	MADD	t34,c34,t34,ALPHA
	ST	t41,3*SIZE(CO1)
	MADD	t44,c44,t44,ALPHA
	daddiu	M,M,-1						#  M--

	ST	t12,0(CO2)
	ST	t22,1*SIZE(CO2)
	ST	t32,2*SIZE(CO2)
	ST	t42,3*SIZE(CO2)

	ST	t13,0(CO3)
	ST	t23,1*SIZE(CO3)
	ST	t33,2*SIZE(CO3)
	ST	t43,3*SIZE(CO3)

	FETCH	$0,4*SIZE(CO1)
	FETCH	$0,4*SIZE(CO2)
	FETCH	$0,4*SIZE(CO3)
	FETCH	$0,4*SIZE(CO4)

	FETCH	$0,8*SIZE(CO1)
	FETCH	$0,8*SIZE(CO2)
	FETCH	$0,8*SIZE(CO3)
	FETCH	$0,8*SIZE(CO4)

	ST	t14,0(CO4)
	daddu	CO1,CO1,4*SIZE				#  COi += 4
	ST	t24,1*SIZE(CO4)
	daddu	CO2,CO2,4*SIZE
	ST	t34,2*SIZE(CO4)
	daddu	CO3,CO3,4*SIZE
	ST	t44,3*SIZE(CO4)
	daddu	PREB,BO,SPANB
	
	bnez	M,.L10				
	daddu	CO4,CO4,4*SIZE

#else							
	MUL	t11, ALPHA, t11					#	TRMM write back part
	MUL	t21, ALPHA, t21
	MUL	t31, ALPHA, t31
	MUL	t41, ALPHA, t41

	ST	t11, 0 * SIZE(CO1)
	MUL	t12, ALPHA, t12
	ST	t21, 1 * SIZE(CO1)
	MUL	t22, ALPHA, t22
	ST	t31, 2 * SIZE(CO1)
	MUL	t32, ALPHA, t32
	ST	t41, 3 * SIZE(CO1)
	MUL	t42, ALPHA, t42

	ST	t12, 0 * SIZE(CO2)
	MUL	t13, ALPHA, t13
	ST	t22, 1 * SIZE(CO2)
	MUL	t23, ALPHA, t23
	ST	t32, 2 * SIZE(CO2)
	MUL	t33, ALPHA, t33
	ST	t42, 3 * SIZE(CO2)
	MUL	t43, ALPHA, t43

	ST	t13, 0 * SIZE(CO3)
	MUL	t14, ALPHA, t14
	ST	t23, 1 * SIZE(CO3)
	MUL	t24, ALPHA, t24
	ST	t33, 2 * SIZE(CO3)
	MUL	t34, ALPHA, t34
	ST	t43, 3 * SIZE(CO3)
	MUL	t44, ALPHA, t44

	ST	t14, 0 * SIZE(CO4)
	daddiu	M,M,-1						#  M--
	ST	t24, 1 * SIZE(CO4)
	ST	t34, 2 * SIZE(CO4)
	ST	t44, 3 * SIZE(CO4)
	daddiu	CO1,CO1, 4 * SIZE
	daddiu	CO2,CO2, 4 * SIZE
	daddiu	CO3,CO3, 4 * SIZE
	daddiu	CO4,CO4, 4 * SIZE	

	FETCH	$0,4*SIZE(CO1)
	FETCH	$0,4*SIZE(CO2)
	FETCH	$0,4*SIZE(CO3)
	FETCH	$0,4*SIZE(CO4)

	FETCH	$0,0(CO1)
	FETCH	$0,0(CO2)
	FETCH	$0,0(CO3)
	FETCH	$0,0(CO4)

#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP,KCO,KK								
#ifdef LEFT
	daddiu	TEMP,TEMP, -4
#else
	daddiu	TEMP,TEMP, -4
#endif
	dsll	K,TEMP,2 + BASE_SHIFT
	dsll	TEMP,TEMP,2 + BASE_SHIFT
	daddu	A,A,K						# 	mov A to the end of panel Ai
	daddu	B,B,TEMP					# 	mov B to the end of panel Bj
#endif

#ifdef LEFT										
	daddiu	KK, KK,4
#endif
	bnez	M,.L10					
	nop
#endif


	.align 3
.L14_M2:
	/* 2-row remainder of the 4-column panel: taken when bit 1 of MCO is set.
	   .L20 positions A/B, clears the 8 accumulators t11..t24, preloads a0-a1
	   and b0-b3, and computes K = number of 4-step iterations of the k loop. */
	andi	M, MCO, 2					#	nr=4,mr=2
	beqz	M,.L14_M1			
	nop

.L20:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	B,BO						#	Reset B
#else
	dsll	K,KK,1 + BASE_SHIFT			#	mr=2	
	dsll	TEMP,KK,2 + BASE_SHIFT		#	nr=4
	daddu	A,A,K
	daddu	B,BO,TEMP
#endif

	LD	a0,0*SIZE(A)
	MTC		$0,t11
	LD	a1,1*SIZE(A)
	
	MOV	t21,t11
	LD	b0,0*SIZE(B)
	MOV	t12,t11
	LD	b1,1*SIZE(B)

	MOV	t22,t11
	LD	b2,2*SIZE(B)

	MOV	t13,t11
	MOV	t23,t11
	LD	b3,3*SIZE(B)

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP,KCO,KK
#elif defined(LEFT)
	daddiu	TEMP,KK,2					#	left part,controlled by mr, mr=2
#else
	daddiu	TEMP,KK,4					#  	right part,controlled by nr,nr=4
#endif
	dsra	K,TEMP,2					#  K=TEMP/4
	MOV	t14,t11
	beqz	K,.L25
	MOV	t24,t11							#	clear 2*4=8 results registers

#else
	move	B,BO						#	Reset B 
	LD	a0,0*SIZE(A)
	MTC		$0,t11
	LD	a1,1*SIZE(A)
	
	MOV	t21,t11
	LD	b0,0*SIZE(B)
	MOV	t12,t11
	LD	b1,1*SIZE(B)

	MOV	t22,t11
	dsra	K,KCO,2						#  K=KCO/4
	LD	b2,2*SIZE(B)

	MOV	t13,t11
	MOV	t23,t11
	LD	b3,3*SIZE(B)

	MOV	t14,t11
	beqz	K,.L25
	MOV	t24,t11

#endif

.L21:									#  nr=4,mr=2,kr=4
	/* Main k loop of the 2x4 tile, four k steps per iteration.
	   A advances by 8*SIZE and B by 16*SIZE per iteration; the last MADD sits
	   after the loop branch and runs on both paths. */
	MADD	t11,t11,a0,b0
	LD	a4,2*SIZE(A)
	MADD	t21,t21,a1,b0
	LD	a5,3*SIZE(A)

	MADD	t12,t12,a0,b1
	LD	b4,4*SIZE(B)
	MADD	t22,t22,a1,b1
	LD	b5,5*SIZE(B)

	MADD	t13,t13,a0,b2
	LD	b6,6*SIZE(B)
	MADD	t23,t23,a1,b2
	LD	b7,7*SIZE(B)

	MADD	t14,t14,a0,b3
	MADD	t24,t24,a1,b3
	
	MADD	t11,t11,a4,b4
	LD	a2,4*SIZE(A)
	MADD	t21,t21,a5,b4
	LD	a3,5*SIZE(A)

	MADD	t12,t12,a4,b5
	LD	b0,8*SIZE(B)
	MADD	t22,t22,a5,b5
	LD	b1,9*SIZE(B)

	MADD	t13,t13,a4,b6
	LD	b2,10*SIZE(B)
	MADD	t23,t23,a5,b6
	LD	b3,11*SIZE(B)

	MADD	t14,t14,a4,b7
	MADD	t24,t24,a5,b7
	daddiu	K,K,-1

	MADD	t11,t11,a2,b0
	LD	a6,6*SIZE(A)
	MADD	t21,t21,a3,b0
	LD	a7,7*SIZE(A)

	MADD	t12,t12,a2,b1
	LD	b4,12*SIZE(B)
	MADD	t22,t22,a3,b1
	LD	b5,13*SIZE(B)

	MADD	t13,t13,a2,b2
	LD	b6,14*SIZE(B)
	MADD	t23,t23,a3,b2
	LD	b7,15*SIZE(B)

	MADD	t14,t14,a2,b3
	MADD	t24,t24,a3,b3
	daddu	A,A,8*SIZE					#  2mr*4kr
	daddu	B,B,16*SIZE					#	4nr*4kr

	MADD	t11,t11,a6,b4
	LD	a0,0*SIZE(A)
	MADD	t21,t21,a7,b4
	LD	a1,1*SIZE(A)

	MADD	t12,t12,a6,b5
	LD	b0,0*SIZE(B)
	MADD	t22,t22,a7,b5
	LD	b1,1*SIZE(B)

	MADD	t13,t13,a6,b6
	LD	b2,2*SIZE(B)
	MADD	t23,t23,a7,b6
	LD	b3,3*SIZE(B)

	MADD	t14,t14,a6,b7
	bnez 	K,.L21
	MADD	t24,t24,a7,b7


.L25:										
/* k remainder of the 2x4 tile: .L26/.L27 handle two leftover k steps,
   .L28 handles one. The k length is KCO for GEMM and TEMP for TRMM. */
#ifndef TRMMKERNEL
	andi	K,KCO,2						#	kr=2
#else
	andi	K,TEMP,2
#endif
	beqz	K,.L28
	nop

.L26:			
	MADD	t11,t11,a0,b0
	LD	a4,2*SIZE(A)
	MADD	t21,t21,a1,b0
	LD	a5,3*SIZE(A)

	MADD	t12,t12,a0,b1
	LD	b4,4*SIZE(B)
	MADD	t22,t22,a1,b1
	LD	b5,5*SIZE(B)

	MADD	t13,t13,a0,b2
	LD	b6,6*SIZE(B)
	MADD	t23,t23,a1,b2
	LD	b7,7*SIZE(B)
	
	MADD	t14,t14,a0,b3
	MADD	t24,t24,a1,b3
	daddu	A,A,4*SIZE					#  	2mr*2kr
	daddu	B,B,8*SIZE					#	4nr*2kr

.L27:
	MADD	t11,t11,a4,b4
	LD	a0,0*SIZE(A)
	MADD	t21,t21,a5,b4
	LD	a1,1*SIZE(A)

	MADD	t12,t12,a4,b5
	LD	b0,0*SIZE(B)
	MADD	t22,t22,a5,b5
	LD	b1,1*SIZE(B)

	MADD	t13,t13,a4,b6
	LD	b2,2*SIZE(B)
	MADD	t23,t23,a5,b6
	LD	b3,3*SIZE(B)

	MADD	t14,t14,a4,b7
	MADD	t24,t24,a5,b7

	
.L28:									#	kr=1	
#ifndef TRMMKERNEL
	andi	K,KCO,1
#else
	andi	K,TEMP,1
#endif
	beqz	K,.L29				  
	LD	ALPHA,152($sp)					#  Get ALPHA, reloaded on both paths
	
	MADD	t11,t11,a0,b0
	MADD	t21,t21,a1,b0
	daddu	A,A,2*SIZE					#  2mr*kr
	daddu	B,B,4*SIZE					#  4nr*kr

	MADD	t12,t12,a0,b1
	MADD	t22,t22,a1,b1

	MADD	t13,t13,a0,b2
	MADD	t23,t23,a1,b2

	MADD	t14,t14,a0,b3
	MADD	t24,t24,a1,b3

.L29:									#  Write Back to C
/* Write-back of the 2x4 tile (two elements in each of CO1..CO4).
   GEMM combines the loaded C values with the accumulators and ALPHA via MADD;
   TRMM stores ALPHA*t directly, then adjusts A/B and KK. Control falls through
   to .L14_M1 in both variants. */
#ifndef TRMMKERNEL
	LD	c11,0(CO1)						#	GEMM write back part
	LD	c21,1*SIZE(CO1)			

	LD	c12,0(CO2)
	LD	c22,1*SIZE(CO2)
	
	LD	c13,0(CO3)
	MADD	t11,c11,t11,ALPHA
	LD	c23,1*SIZE(CO3)
	MADD	t21,c21,t21,ALPHA

	LD	c14,0(CO4)
	MADD	t12,c12,t12,ALPHA
	LD	c24,1*SIZE(CO4)
	MADD	t22,c22,t22,ALPHA

	ST	t11,0(CO1)
	MADD	t13,c13,t13,ALPHA
	ST	t21,1*SIZE(CO1)
	MADD	t23,c23,t23,ALPHA

	ST	t12,0(CO2)
	MADD	t14,c14,t14,ALPHA
	ST	t22,1*SIZE(CO2)
	MADD	t24,c24,t24,ALPHA

	ST	t13,0(CO3)
	daddu	CO1,CO1,2*SIZE				#  COi += 2
	ST	t23,1*SIZE(CO3)
	daddu	CO2,CO2,2*SIZE

	ST	t14,0(CO4)
	daddu	CO3,CO3,2*SIZE
	ST	t24,1*SIZE(CO4)
	daddu	CO4,CO4,2*SIZE

	FETCH	$0,0(CO1)
	FETCH	$0,0(CO2)
	FETCH	$0,0(CO3)
	FETCH	$0,0(CO4)

#else
	MUL	t11, ALPHA, t11					#	TRMM write back part
	MUL	t21, ALPHA, t21
	
	ST	t11, 0 * SIZE(CO1)
	MUL	t12, ALPHA, t12
	ST	t21, 1 * SIZE(CO1)
	MUL	t22, ALPHA, t22
	
	ST	t12, 0 * SIZE(CO2)
	MUL	t13, ALPHA, t13
	ST	t22, 1 * SIZE(CO2)
	MUL	t23, ALPHA, t23
	
	ST	t13, 0 * SIZE(CO3)
	MUL	t14, ALPHA, t14
	ST	t23, 1 * SIZE(CO3)
	MUL	t24, ALPHA, t24
	
	ST	t14, 0 * SIZE(CO4)
	ST	t24, 1 * SIZE(CO4)
	
	daddiu	CO1,CO1, 2 * SIZE
	daddiu	CO2,CO2, 2 * SIZE
	daddiu	CO3,CO3, 2 * SIZE
	daddiu	CO4,CO4, 2 * SIZE

	FETCH	$0,0(CO1)
	FETCH	$0,0(CO2)
	FETCH	$0,0(CO3)
	FETCH	$0,0(CO4)

#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP,KCO,KK
#ifdef LEFT
	daddiu	TEMP,TEMP,-2
#else
	daddiu	TEMP,TEMP,-4
#endif
	dsll	K,TEMP,1 + BASE_SHIFT
	dsll	TEMP,TEMP,2 + BASE_SHIFT

	daddu	A,A,K						#	move A to next panel Ai
	daddu	B,B,TEMP					#	move B to next panel Bj
#endif

#ifdef LEFT
	daddiu	KK, KK, 2
#endif
#endif


	.align 3
.L14_M1:
	/* 1-row remainder of the 4-column panel: taken when bit 0 of MCO is set.
	   .L30 positions A/B, clears t11..t14, preloads a0 and b0-b3, and computes
	   K = number of 4-step iterations of the k loop. */
	andi	M,MCO,1						#	mr=1	
	beqz	M,.L0_N4_Loop				#  	M = 0, finishing one panel Bj
	nop

.L30:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	B,BO						#	Reset B
#else
	dsll	K,KK, BASE_SHIFT
	dsll	TEMP,KK,2 + BASE_SHIFT

	daddu	A,A,K
	daddu	B,BO,TEMP
#endif

	LD	a0, 0 * SIZE(A)					#	a0

	MTC		$0,t11
	LD	b0,0*SIZE(B)
	
	MOV	t12,t11
	LD	b1,1*SIZE(B)

	MOV	t13,t11
	LD	b2,2*SIZE(B)
	
	MOV	t14,t11
	LD	b3,3*SIZE(B)

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, KCO, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 1
#else
	daddiu	TEMP, KK, 4
#endif
	dsra	K,TEMP, 2					#  K=TEMP/4
	nop
	beqz	K,.L35
	nop
								
#else							
	move	B,BO						#	Reset B, GEMM part
	dsra	K,KCO,2						#  	K=KCO/4
	LD	a0, 0 * SIZE(A)					#	a0

	MTC		$0,t11
	LD	b0,0*SIZE(B)
	
	MOV	t12,t11
	LD	b1,1*SIZE(B)

	MOV	t13,t11
	LD	b2,2*SIZE(B)
	
	MOV	t14,t11
	beqz	K,.L35
	LD	b3,3*SIZE(B)

#endif

.L31:									#	nr=4,mr=1,kr=4	
	/* Main k loop of the 1x4 tile, four k steps per iteration.
	   A advances by 4*SIZE and B by 16*SIZE per iteration; the load of b3 sits
	   after the loop branch and runs on both paths. */
	LD	a1,	1*SIZE(A)					#	load a1
	MADD	t11,t11,a0,b0
	
	LD	b4,4*SIZE(B)
	LD	b5,5*SIZE(B)
	MADD	t12,t12,a0,b1
	
	LD	b6,6*SIZE(B)
	LD	b7,7*SIZE(B)
	MADD	t13,t13,a0,b2
	MADD	t14,t14,a0,b3

	LD	a2,	2*SIZE(A)					#	a2
	MADD	t11,t11,a1,b4
	
	LD	b0,8*SIZE(B)
	LD	b1,9*SIZE(B)
	MADD	t12,t12,a1,b5
	
	LD	b2,10*SIZE(B)
	LD	b3,11*SIZE(B)
	MADD	t13,t13,a1,b6
	MADD	t14,t14,a1,b7

	LD	a3,	3*SIZE(A)					#	a3
	MADD	t11,t11,a2,b0
	daddiu	K,K,-1
	
	LD	b4,12*SIZE(B)
	LD	b5,13*SIZE(B)
	MADD	t12,t12,a2,b1
	daddu	A,A,4*SIZE					#	1mr*4kr
	
	LD	b6,14*SIZE(B)
	LD	b7,15*SIZE(B)
	MADD	t13,t13,a2,b2
	MADD	t14,t14,a2,b3

	LD	a0,	0*SIZE(A)					#	a0
	daddu	B,B,16*SIZE					#	4nr*4kr
	MADD	t11,t11,a3,b4
	
	LD	b0,0*SIZE(B)
	MADD	t12,t12,a3,b5
	LD	b1,1*SIZE(B)
	MADD	t13,t13,a3,b6

	LD	b2,2*SIZE(B)
	MADD	t14,t14,a3,b7
	bnez 	K,.L31
	LD	b3,3*SIZE(B)


.L35:									#  kr=2
/* k remainder of the 1x4 tile: .L36/.L37 handle two leftover k steps,
   .L38 handles one. The k length is KCO for GEMM and TEMP for TRMM. */
#ifndef TRMMKERNEL
	andi	K,KCO,2			
#else
	andi	K,TEMP,2
#endif
	beqz	K,.L38
	nop

.L36:			
	LD	a1,1*SIZE(A)					#	load a1
	MADD	t11,t11,a0,b0

	LD	b4,4*SIZE(B)
	LD	b5,5*SIZE(B)
	MADD	t12,t12,a0,b1
	daddu	A,A,2*SIZE					#  	mr*2kr
	
	LD	b6,6*SIZE(B)
	MADD	t13,t13,a0,b2
	
	LD	b7,7*SIZE(B)
	MADD	t14,t14,a0,b3
	daddu	B,B,8*SIZE					#	4nr*2kr


.L37:
	LD	a0,0(A)
	MADD	t11,t11,a1,b4
	
	LD	b0,0*SIZE(B)
	LD	b1,1*SIZE(B)
	MADD	t12,t12,a1,b5
	
	LD	b2,2*SIZE(B)
	LD	b3,3*SIZE(B)
	MADD	t13,t13,a1,b6
	MADD	t14,t14,a1,b7
	
	
.L38:									#  	kr=1
#ifndef TRMMKERNEL
	andi	K,KCO,1
#else
	andi	K,TEMP,1
#endif
	beqz	K,.L39				
	LD	ALPHA,152($sp)					#  Get ALPHA, reloaded on both paths
	
	MADD	t11,t11,a0,b0
	MADD	t12,t12,a0,b1
	daddu	A,A,1*SIZE				
	daddu	B,B,4*SIZE
	
	MADD	t13,t13,a0,b2
	MADD	t14,t14,a0,b3

.L39:									#  Write Back
/* Write-back of the 1x4 tile (one element in each of CO1..CO4).
   GEMM combines the loaded C values with the accumulators and ALPHA via MADD;
   TRMM stores ALPHA*t directly, then adjusts A/B and KK. The CO pointers are
   not advanced here; control falls through to .L0_N4_Loop. */
#ifndef TRMMKERNEL
	LD	c11,0(CO1)			
	LD	c12,0(CO2)
	LD	c13,0(CO3)
	LD	c14,0(CO4)
	
	MADD	t11,c11,t11,ALPHA
	MADD	t12,c12,t12,ALPHA
	MADD	t13,c13,t13,ALPHA
	MADD	t14,c14,t14,ALPHA

	ST	t11,0(CO1)
	ST	t12,0(CO2)
	ST	t13,0(CO3)
	ST	t14,0(CO4)
#else
	MUL	t11, ALPHA, t11
	MUL	t12, ALPHA, t12
	MUL	t13, ALPHA, t13
	MUL	t14, ALPHA, t14

	ST	t11,  0 * SIZE(CO1)
	ST	t12,  0 * SIZE(CO2)
	ST	t13,  0 * SIZE(CO3)
	ST	t14,  0 * SIZE(CO4)

#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, KCO, KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -1
#else
	daddiu	TEMP, TEMP, -4
#endif

	dsll	K,TEMP, BASE_SHIFT
	dsll	TEMP,TEMP, 2 + BASE_SHIFT

	daddu	A,A,K
	daddu	B,B,TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 1
#endif
#endif


	.align	3
.L0_N4_Loop:								#	mc finished
	/* End of one 4-column panel: decrement the panel counter and loop while
	   panels remain. BO is updated from B after the branch, on both paths. */
	daddiu	N,N,-1							#  N--
#if defined(TRMMKERNEL) && !defined(LEFT)
	daddiu	KK, KK,4 
#endif
	bnez	N,.L0_N4_Lb			
	move	BO,B							#  Set BO point to next panel Bj

	.align	5					
.L0_N2:
	/* 2-column remainder: taken when bit 1 of NCO is set. Sets CO1/CO2 to two
	   consecutive columns of C, resets A, and computes M = MCO/4. */
	andi	N,NCO,2							#  	nr = 2
	beqz	N,.L0_N1		
	nop

.L0_N2_Lb:
	move	CO1,C					
	daddu	CO2,C,LDC

	dsra	M,MCO,2							#  M=MCO/4
	move	A,AO							#  Reset A

	daddu	PREA,AO,SPANA
	daddu	C,CO2,LDC

#if defined(TRMMKERNEL) &&  defined(LEFT)
	move	KK, OFFSET
#endif
	beqz	M,.L12_M2
	nop

.L40:						
/* 4x2 tile setup: position A/B, clear the 8 accumulators t11..t42, preload
   a0-a3 and b0-b1, and compute K = number of 4-step iterations of the k loop. */
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	B,BO							#	Reset B
#else
	dsll	K,KK, 2 + BASE_SHIFT
	dsll	TEMP, KK,1 + BASE_SHIFT	

	daddu	A,A,K
	daddu	B,BO,TEMP
#endif
	LD	a0,0*SIZE(A)
	MTC		$0,t11							#  	gemm part
	LD	a1,1*SIZE(A)

	MOV	t21,t11
	LD	b0,0*SIZE(B)
	MOV	t31,t11
	LD	b1,1*SIZE(B)

	MOV	t41,t11
	LD	a2,2*SIZE(A)
	LD	a3,3*SIZE(A)
	
	MOV	t12,t11
	MOV	t22,t11
	
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP,KCO,KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 4
#else
	daddiu	TEMP, KK, 2
#endif
	dsra	K,TEMP,2						#  K=TEMP/4
	MOV	t32,t11
	beqz	K,.L45
	MOV	t42,t11

#else
	move	B,BO							#	Reset B
	LD	a0,0*SIZE(A)
	MTC		$0,t11							#  	gemm part
	LD	a1,1*SIZE(A)

	MOV	t21,t11
	LD	b0,0*SIZE(B)
	MOV	t31,t11
	LD	b1,1*SIZE(B)

	MOV	t41,t11
	LD	a2,2*SIZE(A)
	dsra	K,KCO,2							#	K=KCO/4
	LD	a3,3*SIZE(A)
	
	MOV	t12,t11
	MOV	t22,t11
	
	MOV	t32,t11
	beqz	K,.L45
	MOV	t42,t11

#endif

.L41:										#  	nr=2,mr=kr=4
	/* Main k loop of the 4x2 tile, unrolled four times (.L41-.L44).
	   A advances by 16*SIZE and B by 8*SIZE per iteration. b7 shares its
	   register with ALPHA, so ALPHA is not valid inside the loop. */
	MADD	t11,t11,a0,b0
	LD	a4,4*SIZE(A)
	MADD	t21,t21,a1,b0
	LD	a5,5*SIZE(A)

	MADD	t12,t12,a0,b1
	LD	b4,2*SIZE(B)
	MADD	t22,t22,a1,b1
	LD	b5,3*SIZE(B)

	MADD	t31,t31,a2,b0
	LD	a6,6*SIZE(A)
	MADD	t41,t41,a3,b0
	LD	a7,7*SIZE(A)

	FETCH		$0,(PREA)
	MADD	t32,t32,a2,b1
	MADD	t42,t42,a3,b1

.L42:
	MADD	t11,t11,a4,b4
	LD	a0,8*SIZE(A)
	MADD	t21,t21,a5,b4
	LD	a1,9*SIZE(A)

	MADD	t12,t12,a4,b5
	LD	b2,4*SIZE(B)
	MADD	t22,t22,a5,b5
	LD	b3,5*SIZE(B)

	MADD	t31,t31,a6,b4
	LD	a2,10*SIZE(A)
	MADD	t41,t41,a7,b4
	LD	a3,11*SIZE(A)

	FETCH		$0,4*SIZE(PREA)
	MADD	t32,t32,a6,b5
	MADD	t42,t42,a7,b5

.L43:
	MADD	t11,t11,a0,b2
	LD	a4,12*SIZE(A)
	MADD	t21,t21,a1,b2
	LD	a5,13*SIZE(A)

	MADD	t12,t12,a0,b3
	LD	b6,6*SIZE(B)
	MADD	t22,t22,a1,b3
	LD	b7,7*SIZE(B)

	MADD	t31,t31,a2,b2
	LD	a6,14*SIZE(A)
	MADD	t41,t41,a3,b2
	LD	a7,15*SIZE(A)

	FETCH		$0,8*SIZE(PREA)
	MADD	t32,t32,a2,b3
	MADD	t42,t42,a3,b3
	
	daddu	A,A,16*SIZE						#	4mr*4kr
	daddu	B,B,8*SIZE						#	2nr*4kr	

.L44:
	MADD	t11,t11,a4,b6
	LD	a0,0*SIZE(A)
	MADD	t21,t21,a5,b6
	LD	a1,1*SIZE(A)


	MADD	t12,t12,a4,b7
	LD	b0,0*SIZE(B)
	MADD	t22,t22,a5,b7
	LD	b1,1*SIZE(B)

	daddiu	K,K,-1
	daddu	PREA,PREA,16*SIZE

	MADD	t31,t31,a6,b6
	LD	a2,2*SIZE(A)
	MADD	t41,t41,a7,b6
	LD	a3,3*SIZE(A)

	FETCH		$0,-4*SIZE(PREA)
	MADD	t32,t32,a6,b7
	bnez 	K,.L41
	MADD	t42,t42,a7,b7


.L45:										#  	kr=2
/* k remainder of the 4x2 tile: .L46/.L47 handle two leftover k steps,
   .L48 handles one. The k length is KCO for GEMM and TEMP for TRMM. */
#ifndef TRMMKERNEL
	andi	K,KCO,2				
#else
	andi	K,TEMP,2
#endif
	beqz	K,.L48
	nop

.L46:			
	MADD	t11,t11,a0,b0
	LD	a4,4*SIZE(A)
	MADD	t21,t21,a1,b0
	LD	a5,5*SIZE(A)

	MADD	t12,t12,a0,b1
	LD	b4,2*SIZE(B)
	MADD	t22,t22,a1,b1
	LD	b5,3*SIZE(B)

	MADD	t31,t31,a2,b0
	LD	a6,6*SIZE(A)
	MADD	t41,t41,a3,b0
	LD	a7,7*SIZE(A)

	FETCH		$0,0(PREA)
	MADD	t32,t32,a2,b1
	daddu	B,B,4*SIZE						#  B+=2(nr)*2(kr)*8Byte=32
	
	MADD	t42,t42,a3,b1
	daddu	A,A,8*SIZE						#  A+=4(mr)*2(kr)*8Byte=8*SIZE

.L47:
	MADD	t11,t11,a4,b4
	LD	a0,0*SIZE(A)
	MADD	t21,t21,a5,b4
	LD	a1,1*SIZE(A)

	MADD	t12,t12,a4,b5
	LD	b0,0*SIZE(B)
	MADD	t22,t22,a5,b5
	LD	b1,1*SIZE(B)

	MADD	t31,t31,a6,b4
	LD	a2,2*SIZE(A)
	MADD	t41,t41,a7,b4
	LD	a3,3*SIZE(A)

	FETCH		$0,4*SIZE(PREA)
	MADD	t32,t32,a6,b5
	MADD	t42,t42,a7,b5
	daddu	PREA,PREA,8*SIZE


	
.L48:										#	 kr=1
#ifndef TRMMKERNEL
	andi	K,KCO,1
#else
	andi	K,TEMP,1
#endif
	beqz	K,.L49				 
	LD	ALPHA,152($sp)						#  Get ALPHA, reloaded on both paths
	
	FETCH		$0,0(PREA)
	MADD	t11,t11,a0,b0
	MADD	t21,t21,a1,b0
	daddu	A,A,4*SIZE						#  A+=4(mr)*1(kr)*8Byte=32

	MADD	t12,t12,a0,b1
	MADD	t22,t22,a1,b1
	daddu	B,B,2*SIZE
	daddu	PREA,PREA,4*SIZE

	MADD	t31,t31,a2,b0
	MADD	t41,t41,a3,b0

	MADD	t32,t32,a2,b1
	MADD	t42,t42,a3,b1

.L49:										#  Write Back
/* Write-back of the 4x2 tile (four elements in each of CO1 and CO2).
   GEMM: load the 8 C values, combine each with its accumulator and ALPHA via
   MADD (presumably C + ALPHA*t -- MADD is defined outside this file), store,
   advance CO1/CO2 by 4 elements and loop to .L40 while M != 0.
   TRMM: store ALPHA*t without reading C, advance A/B past the untouched rest
   of the panel where required, and bump KK by 4 when LEFT is defined.
   The TRMM look-ahead fetches use element offsets (4*SIZE) like every other
   write-back in this file; a bare byte offset of 4 would make FETCH, which is
   a doubleword load, touch an address that is not element-aligned. */
#ifndef TRMMKERNEL
	LD	c11,0(CO1)							#  gemm write back part Fetch 8 C
	LD	c21,1*SIZE(CO1)			
	LD	c31,2*SIZE(CO1)
	LD	c41,3*SIZE(CO1)

	LD	c12,0(CO2)
	MADD	t11,c11,t11,ALPHA
	LD	c22,1*SIZE(CO2)
	MADD	t21,c21,t21,ALPHA
	LD	c32,2*SIZE(CO2)
	MADD	t31,c31,t31,ALPHA
	LD	c42,3*SIZE(CO2)
	MADD	t41,c41,t41,ALPHA

	ST	t11,0(CO1)
	MADD	t12,c12,t12,ALPHA
	ST	t21,1*SIZE(CO1)
	MADD	t22,c22,t22,ALPHA
	ST	t31,2*SIZE(CO1)
	MADD	t32,c32,t32,ALPHA
	ST	t41,3*SIZE(CO1)
	MADD	t42,c42,t42,ALPHA
	daddiu	M,M,-1				

	ST	t12,0(CO2)
	ST	t22,1*SIZE(CO2)
	ST	t32,2*SIZE(CO2)
	ST	t42,3*SIZE(CO2)

	FETCH	$0,4*SIZE(CO1)
	FETCH	$0,4*SIZE(CO2)
	FETCH	$0,8*SIZE(CO1)
	FETCH	$0,8*SIZE(CO2)

	daddu	CO1,CO1,4*SIZE			
	bnez	M,.L40				
	daddu	CO2,CO2,4*SIZE

#else
	MUL	t11, ALPHA, t11
	MUL	t21, ALPHA, t21
	MUL	t31, ALPHA, t31
	MUL	t41, ALPHA, t41
	
	MUL	t12, ALPHA, t12
	ST	t11, 0 * SIZE(CO1)
	MUL	t22, ALPHA, t22
	ST	t21, 1 * SIZE(CO1)
	MUL	t32, ALPHA, t32
	ST	t31, 2 * SIZE(CO1)
	MUL	t42, ALPHA, t42
	ST	t41, 3 * SIZE(CO1)
	
	ST	t12, 0 * SIZE(CO2)
	daddiu	M,M,-1
	ST	t22, 1 * SIZE(CO2)
	ST	t32, 2 * SIZE(CO2)
	ST	t42, 3 * SIZE(CO2)
	
	daddiu	CO1,CO1, 4*SIZE
	daddiu	CO2,CO2, 4*SIZE

	FETCH	$0,0(CO1)
	FETCH	$0,0(CO2)
	FETCH	$0,4*SIZE(CO1)					#  element offset, not a raw byte offset
	FETCH	$0,4*SIZE(CO2)

#if ( defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, KCO, KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -4
#else
	daddiu	TEMP, TEMP, -2
#endif
	dsll	K,TEMP, 2 + BASE_SHIFT
	dsll	TEMP, TEMP, 1 + BASE_SHIFT

	daddu	A,A,K
	daddu	B,B,TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 4
#endif
	bnez	M,.L40
	nop
#endif


	.align 3
.L12_M2:
	/* 2-row remainder of the 2-column panel: taken when bit 1 of MCO is set.
	   .L50 positions A/B, clears t11, t21, t12, t22, preloads a0-a1 and b0-b1,
	   and computes K = number of 4-step iterations of the k loop. */
	andi	M,MCO,2						#  	mr = 2
	beqz	M,.L12_M1			
	nop

.L50:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	B,BO
#else
	dsll	K,    KK, 1 + BASE_SHIFT	#mr=2
	dsll	TEMP, KK, 1 + BASE_SHIFT	#nr=2

	daddu	A, A, K
	daddu	B, BO,  TEMP
#endif
	LD	a0,0*SIZE(A)
	LD	a1,1*SIZE(A)

	MTC		$0,t11
	LD	b0,0*SIZE(B)
	MOV	t21,t11
	LD	b1,1*SIZE(B)
	
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, KCO, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 2
#else
	daddiu	TEMP, KK, 2
#endif
	dsra	K,TEMP,2					#  K=TEMP/4
	MOV	t12,t11
	beqz	K,.L55
	MOV	t22,t11

#else
	move	B,BO
	LD	a0,0*SIZE(A)
	dsra	K,KCO,2						#  K=KCO/4
	LD	a1,1*SIZE(A)

	MTC		$0,t11
	LD	b0,0*SIZE(B)
	MOV	t21,t11
	LD	b1,1*SIZE(B)
	
	MOV	t12,t11
	beqz	K,.L55
	MOV	t22,t11

#endif

.L51:									#  nr=2 mr=2,kr=4
	/* Main k loop of the 2x2 tile, four k steps per iteration.
	   A and B both advance by 8*SIZE per iteration; the load of b1 sits after
	   the loop branch and runs on both paths. */
	MADD	t11,t11,a0,b0
	LD	a4,2*SIZE(A)
	MADD	t21,t21,a1,b0
	LD	b4,2*SIZE(B)

	MADD	t12,t12,a0,b1
	LD	a5,3*SIZE(A)
	MADD	t22,t22,a1,b1
	LD	b5,3*SIZE(B)

	MADD	t11,t11,a4,b4
	LD	a2,4*SIZE(A)
	MADD	t21,t21,a5,b4
	LD	b2,4*SIZE(B)

	MADD	t12,t12,a4,b5
	LD	a3,5*SIZE(A)
	MADD	t22,t22,a5,b5
	daddiu	K,K,-1
	LD	b3,5*SIZE(B)

	MADD	t11,t11,a2,b2
	LD	a6,6*SIZE(A)
	MADD	t21,t21,a3,b2
	daddu	A,A,8*SIZE					#  A+=2(mr)*4(kr)*8Byte=8*SIZE
	LD	b6,6*SIZE(B)

	MADD	t12,t12,a2,b3
	daddu	B,B,8*SIZE					#  B+=2(nr)*4(kr)*8Byte=8*SIZE
	LD	a7,-1*SIZE(A)
	MADD	t22,t22,a3,b3
	LD	b7,-1*SIZE(B)

	MADD	t11,t11,a6,b6
	LD	a0,0*SIZE(A)
	MADD	t21,t21,a7,b6
	LD	b0,0*SIZE(B)

	MADD	t12,t12,a6,b7
	LD	a1,1*SIZE(A)

	MADD	t22,t22,a7,b7
	bnez 	K,.L51
	LD	b1,1*SIZE(B)


.L55:									#  	kr=2
/* k remainder of the 2x2 tile: .L56/.L57 handle two leftover k steps,
   .L58 handles one. The k length is KCO for GEMM and TEMP for TRMM. */
#ifndef TRMMKERNEL
	andi	K,KCO,2				
#else
	andi	K,TEMP,2
#endif
	beqz	K,.L58
	nop

.L56:			
	MADD	t11,t11,a0,b0
	LD	a4,2*SIZE(A)
	MADD	t21,t21,a1,b0
	daddu	A,A,4*SIZE					#  A+=2(mr)*2(kr)*8Byte=32
	LD	b4,2*SIZE(B)

	MADD	t12,t12,a0,b1
	daddu	B,B,4*SIZE					#	2nr*2kr
	LD	a5,-1*SIZE(A)
	MADD	t22,t22,a1,b1
	LD	b5,-1*SIZE(B)

.L57:
	MADD	t11,t11,a4,b4
	LD	a0,0*SIZE(A)
	MADD	t21,t21,a5,b4
	LD	b0,0*SIZE(B)

	MADD	t12,t12,a4,b5
	LD	a1,1*SIZE(A)
	MADD	t22,t22,a5,b5
	LD	b1,1*SIZE(B)

.L58:									#  kr=1
#ifndef TRMMKERNEL
	andi	K,KCO,1
#else
	andi	K,TEMP, 1
#endif
	beqz	K,.L59				
	LD	ALPHA,152($sp)					#  Get ALPHA, reloaded on both paths
	
	MADD	t11,t11,a0,b0
	MADD	t21,t21,a1,b0
	daddu	A,A,2*SIZE					#  	A+=2(mr)*1(kr)*8Byte=16
	daddu	B,B,2*SIZE					#	2nr*kr

	MADD	t12,t12,a0,b1
	MADD	t22,t22,a1,b1


.L59:									#  Write Back
/* Write-back of the 2x2 tile (two elements in each of CO1 and CO2).
   GEMM combines the loaded C values with the accumulators and ALPHA via MADD;
   TRMM advances CO1/CO2 first and stores ALPHA*t at negative offsets, then
   adjusts A/B and KK. Control falls through to .L12_M1 in both variants. */
#ifndef TRMMKERNEL
	LD	c11,0(CO1)						#  write gemm part back Fetch 4 C
	LD	c21,1*SIZE(CO1)			
	LD	c12,0(CO2)
	LD	c22,1*SIZE(CO2)
	
	MADD	t11,c11,t11,ALPHA
	MADD	t21,c21,t21,ALPHA
	MADD	t12,c12,t12,ALPHA
	MADD	t22,c22,t22,ALPHA

	ST	t11,0(CO1)
	ST	t21,1*SIZE(CO1)
	ST	t12,0(CO2)
	ST	t22,1*SIZE(CO2)

	daddu	CO1,CO1,2*SIZE			
	daddu	CO2,CO2,2*SIZE

	FETCH	$0,0(CO1)
	FETCH	$0,0(CO2)
#else
	daddiu	M, M, -1
	daddiu	CO1,CO1, 2 * SIZE
	daddiu	CO2,CO2, 2 * SIZE
	MUL	t11, ALPHA, t11
	MUL	t21, ALPHA, t21
	MUL	t12, ALPHA, t12
	MUL	t22, ALPHA, t22

	ST	t11, -2 * SIZE(CO1)
	ST	t21, -1 * SIZE(CO1)
	ST	t12, -2 * SIZE(CO2)
	ST	t22, -1 * SIZE(CO2)

	FETCH	$0,0(CO1)
	FETCH	$0,0(CO2)

#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, KCO, KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -2
#else
	daddiu	TEMP, TEMP, -2
#endif

	dsll	K,    TEMP, 1 + BASE_SHIFT
	dsll	TEMP, TEMP, 1 + BASE_SHIFT

	daddu	A, A, K
	daddu	B, B, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 2
#endif
#endif


	.align 3
.L12_M1:
	/* 1-row remainder of the 2-column panel: taken when bit 0 of MCO is set.
	   .L60 positions A/B, clears t11, t21, t12, t22, preloads a0 and b0-b1,
	   and computes K = number of 4-step iterations of the k loop. */
	andi	M,MCO,1					#  	mr = 1
	beqz	M,.L0_N2_Loop		
	nop

.L60:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	B,BO					#	Reset B
#else
	dsll	K,    KK, 0 + BASE_SHIFT
	dsll	TEMP, KK, 1 + BASE_SHIFT

	daddu	A, A, K
	daddu	B, BO,  TEMP
#endif
	LD	a0,0*SIZE(A)
	
	MTC		$0,t11
	MOV	t21,t11
	LD	b0,0*SIZE(B)

	MOV	t12,t11
	LD	b1,1*SIZE(B)

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, KCO, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 1
#else
	daddiu	TEMP, KK, 2
#endif
	dsra	K,TEMP,2				#  K=TEMP/4
	MOV	t22,t11
	beqz	K,.L65
	nop

#else
	dsra	K,KCO,2					#  K=KCO/4
	move	B,BO					#  Reset B
	LD	a0,0*SIZE(A)
	
	MTC		$0,t11
	MOV	t21,t11
	LD	b0,0*SIZE(B)

	MOV	t12,t11
	LD	b1,1*SIZE(B)
	beqz	K,.L65
	MOV	t22,t11

#endif

.L61:								#	nr=2,mr=1,kr=4	
	/* Main k loop of the 1x2 tile, four k steps per iteration.
	   A advances by 4*SIZE and B by 8*SIZE per iteration; the last MADD sits
	   after the loop branch and runs on both paths. */
	LD	a4,	1*SIZE(A)				#	a4 = A element for k step 1
	LD	b4, 2*SIZE(B)
	MADD	t11,t11,a0,b0
	
	LD	b5,3*SIZE(B)
	MADD	t12,t12,a0,b1

	LD	a2,	2*SIZE(A)				#	a2 = A element for k step 2
	LD	b2,4*SIZE(B)
	MADD	t11,t11,a4,b4
	
	LD	b3,5*SIZE(B)
	MADD	t12,t12,a4,b5

	LD	a6,	3*SIZE(A)				#	a6 = A element for k step 3
	daddiu	K,K,-1
	LD	b6,6*SIZE(B)
	MADD	t11,t11,a2,b2
	
	LD	b7,7*SIZE(B)
	MADD	t12,t12,a2,b3
	daddu	A,A,4*SIZE				#  A+=1(mr)*4(kr)*8Byte=32

	LD	a0,	0*SIZE(A)
	daddu	B,B,8*SIZE				#  B+=2(nr)*4(kr)*8Byte=8*SIZE
	
	LD	b0,0*SIZE(B)	
	MADD	t11,t11,a6,b6
	
	LD	b1,1*SIZE(B)
	bnez 	K,.L61
	MADD	t12,t12,a6,b7



.L65:								#  kr=2
/* k remainder of the 1x2 tile: .L66/.L67 handle two leftover k steps,
   .L68 handles one. The k length is KCO for GEMM and TEMP for TRMM. */
#ifndef TRMMKERNEL
	andi	K,KCO,2				
#else
	andi	K,TEMP,2
#endif
	beqz	K,.L68
	nop

.L66:			
	LD	a4,	1*SIZE(A)				#	a4 = A element for the second k step
	MADD	t11,t11,a0,b0
	LD	b4,2*SIZE(B)
	daddu	A,A,2*SIZE				#  A+=1(mr)*2(kr)*8Byte=16
	
	LD	b5,3*SIZE(B)
	MADD	t12,t12,a0,b1
	daddu	B,B,4*SIZE

.L67:
	LD	a0,0(A)						#	a0
	LD	b0,0*SIZE(B)
	MADD	t11,t11,a4,b4
	
	LD	b1,1*SIZE(B)
	MADD	t12,t12,a4,b5


.L68:								#   kr=1
#ifndef TRMMKERNEL
	andi	K,KCO,1
#else
	andi	K,TEMP,1
#endif
	beqz	K,.L69				  
	LD	ALPHA,152($sp)				#  Get ALPHA, reloaded on both paths
	
	MADD	t11,t11,a0,b0
	MADD	t12,t12,a0,b1
	daddu	A,A,1*SIZE				#  A+=1(mr)*1(kr)*8Byte=8
	daddu	B,B,2*SIZE


.L69:								#  Write Back
/* Write-back of the 1x2 tile (one element in each of CO1 and CO2).
   GEMM combines the loaded C values with the accumulators and ALPHA via MADD;
   TRMM stores ALPHA*t directly, then adjusts A/B and KK. Control falls through
   to .L0_N2_Loop in both variants. */
#ifndef TRMMKERNEL
	LD	c11,0(CO1)					#  Fetch 2 C
	LD	c12,0(CO2)
	
	MADD	t11,c11,t11,ALPHA
	MADD	t12,c12,t12,ALPHA

	ST	t11,0(CO1)
	ST	t12,0(CO2)

	daddu	CO1,CO1,1*SIZE		
	daddu	CO2,CO2,1*SIZE

#else
	MUL	t11, ALPHA, t11
	MUL	t12, ALPHA, t12

	ST	t11,  0 * SIZE(CO1)
	ST	t12,  0 * SIZE(CO2)

	daddu	CO1,CO1,1*SIZE			
	daddu	CO2,CO2,1*SIZE

#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, KCO, KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -1
#else
	daddiu	TEMP, TEMP, -2
#endif

	dsll	K,    TEMP, 0 + BASE_SHIFT
	dsll	TEMP, TEMP, 1 + BASE_SHIFT

	daddu	A, A, K
	daddu	B, B, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 1
#endif
#endif

/*
 * .L0_N2_Loop: tail of the nr=2 pass. For TRMM without LEFT, KK is bumped
 * by 2; BO is set to the current B pointer.
 * .L0_N1: nr=1 pass, taken only when bit 0 of NCO is set. Sets up CO1, the
 * mr=4 trip count M = MCO>>2, resets A to AO and sets PREA = AO + SPANA.
 */
.L0_N2_Loop:
#if defined(TRMMKERNEL) && !defined(LEFT)
	daddiu	KK, KK, 2
#endif
	move	BO, B


	.align	5					
.L0_N1:
	andi	N,NCO,1					#  nr = 1
	beqz	N,.L999					#  no odd column left: go to the exit
	nop

	move	CO1,C				
	dsra	M,MCO,2					#  M = MCO/4: number of mr=4 tiles
	
	move	A,AO					#  Reset A
	daddu	PREA,AO,SPANA
#if defined(TRMMKERNEL) &&  defined(LEFT)
	move	KK, OFFSET
#endif

	beqz	M,.L11_M2				#  no full mr=4 tile: try mr=2
	daddu	C,CO1,LDC				#  placed after the branch (delay slot): C = CO1 + LDC on both paths

/*
 * Setup for one nr=1, mr=4 tile: point B at the start of the packed column,
 * clear the four accumulators t11/t21/t31/t41, preload b0 and a0..a3, and
 * compute the unrolled trip count K = (K count)/4.
 * TRMM build: depending on LEFT/TRANSA, A and B are first advanced by KK
 * k-steps, and the effective K count is kept in TEMP for the remainder code.
 * If K is zero the main loop is skipped (.L75).
 */
.L70:						
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	B, BO					#	Reset B
#else
	dsll	K,    KK, 2 + BASE_SHIFT		#  byte offset in A: KK * 4(mr) elements
	dsll	TEMP, KK, 0 + BASE_SHIFT		#  byte offset in B: KK * 1(nr) elements

	daddu	A, A, K
	daddu	B, BO,  TEMP
#endif
	LD	b0,	0*SIZE(B)
	
	MTC		$0,t11				#  t11 = 0, then copied into the other accumulators
	LD	a0,0*SIZE(A)
	MOV	t21,t11
	LD	a1,1*SIZE(A)
	
	MOV	t31,t11
	LD	a2,2*SIZE(A)
	MOV	t41,t11
	LD	a3,3*SIZE(A)


#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, KCO, KK				#  TEMP = effective K count for this tile
#elif defined(LEFT)
	daddiu	TEMP, KK, 4
#else
	daddiu	TEMP, KK, 1
#endif
	dsra	K,TEMP,2				#  K = TEMP/4 unrolled iterations
	beqz	K,.L75
	nop
#else
	move	B, BO					#	Reset B
	dsra	K,KCO,2					#  K = KCO/4 unrolled iterations
	LD	b0,	0*SIZE(B)
	
	MTC		$0,t11				#  t11 = 0, then copied into the other accumulators
	LD	a0,0*SIZE(A)
	MOV	t21,t11
	LD	a1,1*SIZE(A)
	
	MOV	t31,t11
	LD	a2,2*SIZE(A)
	MOV	t41,t11
	beqz	K,.L75
	LD	a3,3*SIZE(A)				#  placed after the branch (delay slot)

#endif

/*
 * nr=1, mr=4 inner product, K loop unrolled by 4 (.L71 .. .L74 are the four
 * k-steps). Each step multiplies one B value by four A values and
 * accumulates into t11/t21/t31/t41, while the loads for the following step
 * are issued in between. a0..a3 and b0 must be preloaded on entry and are
 * reloaded in .L74 for the next iteration.
 * FETCH is defined as ld in this file; with $0 as the destination the loaded
 * value is discarded (presumably used as a prefetch of the A stream).
 * NOTE(review): MADD comes from common.h (not shown); usage suggests
 * MADD d,c,x,y computes d = c + x*y -- confirm.
 */
.L71:								#  nr=1,mr=kr=4
	LD	b4,	1*SIZE(B)				#	B[1] (k+1)
	MADD	t11,t11,a0,b0
	
	LD	a4,	4*SIZE(A)
	MADD	t21,t21,a1,b0

	LD	a5,	5*SIZE(A)
	FETCH		$0,(PREA)

	LD	a6,6*SIZE(A)
	MADD	t31,t31,a2,b0

	LD	a7,7*SIZE(A)
	MADD	t41,t41,a3,b0

.L72:
	LD	b2,	2*SIZE(B)				#	B[2] (k+2)
	MADD	t11,t11,a4,b4
	
	LD	a0,8*SIZE(A)
	MADD	t21,t21,a5,b4

	LD	a1,9*SIZE(A)
	FETCH		$0,4*SIZE(PREA)

	LD	a2,10*SIZE(A)
	MADD	t31,t31,a6,b4
	
	LD	a3,11*SIZE(A)
	MADD	t41,t41,a7,b4

.L73:
	LD	b6,	3*SIZE(B)				#	B[3] (k+3)
	MADD	t11,t11,a0,b2
	
	LD	a4,12*SIZE(A)
	daddu	B,B,4*SIZE				#  B+=1(nr)*4(kr)=4*SIZE
	
	LD	a5,13*SIZE(A)
	MADD	t21,t21,a1,b2

	LD	a6,14*SIZE(A)
	FETCH		$0,8*SIZE(PREA)
	MADD	t31,t31,a2,b2

	LD	a7,15*SIZE(A)
	MADD	t41,t41,a3,b2
	daddu	A,A,16*SIZE				#  A+=4(mr)*4(kr)=16*SIZE

.L74:
	LD	b0,	0*SIZE(B)				#  preload for next iteration / remainder
	MADD	t11,t11,a4,b6
	
	LD	a0,0*SIZE(A)
	daddu	PREA,PREA,16*SIZE			#  PREA advances in step with A

	LD	a1,1*SIZE(A)
	MADD	t21,t21,a5,b6

	LD	a2,2*SIZE(A)
	daddiu	K,K,-1					#  one unrolled iteration consumed
	MADD	t31,t31,a6,b6

	LD	a3,3*SIZE(A)
	MADD	t41,t41,a7,b6
	bnez 	K,.L71
	FETCH		$0,-32(PREA)			#  placed after the branch (delay slot); byte offset -32 from the advanced PREA


/*
 * K remainder for the nr=1, mr=4 tile.
 * Bit 1 of the K count (KCO, or TEMP for TRMM) selects a 2-step pass
 * (.L76/.L77); bit 0 selects a final 1-step pass (after .L78).
 * a0..a3 and b0 hold the preloaded operands of the next k-step on entry.
 * FETCH is ld with $0 as destination, so the loaded value is discarded.
 */
.L75:								#  kr=2
#ifndef TRMMKERNEL
	andi	K,KCO,2				
#else
	andi	K,TEMP,2
#endif
	beqz	K,.L78
	nop

.L76:			
	LD	b4,	1*SIZE(B)
	MADD	t11,t11,a0,b0
	
	LD	a4,4*SIZE(A)
	daddu	B,B,2*SIZE				#  B+=1(nr)*2(kr)=2*SIZE
	
	LD	a5,5*SIZE(A)
	MADD	t21,t21,a1,b0
	FETCH		$0,0(PREA)

	LD	a6,6*SIZE(A)
	MADD	t31,t31,a2,b0

	LD	a7,7*SIZE(A)
	MADD	t41,t41,a3,b0
	daddu	A,A,8*SIZE				#  A+=4(mr)*2(kr)=8*SIZE

.L77:
	LD	b0,0(B)					#  preload for the kr=1 step
	MADD	t11,t11,a4,b4

	LD	a0,0*SIZE(A)
	MADD	t21,t21,a5,b4
	FETCH		$0,4*SIZE(PREA)

	LD	a1,1*SIZE(A)
	MADD	t31,t31,a6,b4

	LD	a2,2*SIZE(A)
	MADD	t41,t41,a7,b4

	LD	a3,3*SIZE(A)
	daddu	PREA,PREA,8*SIZE			#  PREA advances in step with A


	
.L78:								#   kr=1
#ifndef TRMMKERNEL
	andi	K,KCO,1
#else
	andi	K,TEMP,1
#endif
	beqz	K,.L79				 
	LD	ALPHA,152($sp)				#  Get ALPHA; sits after the branch so it is loaded on both paths
	
	FETCH		$0,0(PREA)
	MADD	t11,t11,a0,b0
	MADD	t21,t21,a1,b0
	daddu	A,A,4*SIZE				#  A+=4(mr)*1(kr)=4*SIZE

	MADD	t31,t31,a2,b0
	MADD	t41,t41,a3,b0
	daddu	B,B,1*SIZE				#  B+=1(nr)*1(kr)=1*SIZE
	daddu	PREA,PREA,4*SIZE


/*
 * Write-back for the nr=1, mr=4 tile: four consecutive values at CO1.
 * Non-TRMM build: old C values are loaded and combined with the
 * accumulators and ALPHA (presumably C = C + ALPHA*t; MADD is defined in
 * common.h, not shown).
 * TRMM build: C is not read; accumulators are scaled by ALPHA and stored,
 * then A, B and KK are adjusted for the next tile.
 * Both variants decrement M and loop back to .L70 while M is non-zero.
 * FETCH is ld with $0 as destination (value discarded); here it touches the
 * C locations just past the stored tile.
 */
.L79:								#  Write Back
#ifndef TRMMKERNEL
	LD	c11,0(CO1)					#  Fetch 4 C values
	LD	c21,1*SIZE(CO1)			
	LD	c31,2*SIZE(CO1)
	LD	c41,3*SIZE(CO1)

	MADD	t11,c11,t11,ALPHA
	MADD	t21,c21,t21,ALPHA
	MADD	t31,c31,t31,ALPHA
	MADD	t41,c41,t41,ALPHA

	ST	t11,0(CO1)
	ST	t21,1*SIZE(CO1)
	ST	t31,2*SIZE(CO1)
	ST	t41,3*SIZE(CO1)
	daddiu	M,M,-1					#  M--

	FETCH	$0,4*SIZE(CO1)
	FETCH	$0,8*SIZE(CO1)

	bnez	M,.L70					#  M!=0
	daddu	CO1,CO1,4*SIZE			#  CO1 += 4*SIZE; placed after the branch (delay slot)
#else
	daddiu	M,M,-1					#  M--
	MUL	t11, ALPHA, t11
	MUL	t21, ALPHA, t21
	MUL	t31, ALPHA, t31
	MUL	t41, ALPHA, t41

	ST	t11,0(CO1)
	ST	t21,1*SIZE(CO1)
	ST	t31,2*SIZE(CO1)
	ST	t41,3*SIZE(CO1)

	FETCH	$0,4*SIZE(CO1)
	FETCH	$0,8*SIZE(CO1)

	daddu	CO1,CO1,4*SIZE				#  CO1 += 4*SIZE
#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, KCO, KK				#  TEMP = KCO - KK - (4 if LEFT, else 1)
#ifdef LEFT
	daddiu	TEMP, TEMP, -4
#else
	daddiu	TEMP, TEMP, -1
#endif

	dsll	K,    TEMP, 2 + BASE_SHIFT		#  byte offset in A: TEMP * 4(mr) elements
	dsll	TEMP, TEMP, 0 + BASE_SHIFT		#  byte offset in B: TEMP * 1(nr) elements

	daddu	A, A,K
	daddu	B, B, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 4				#  KK += mr
#endif
	bnez	M,.L70				
	nop
#endif


	.align 3					#  align the mr=2 entry point
/*
 * nr=1, mr=2 tile, taken only when bit 1 of MCO is set (otherwise jump to
 * .L11_M1). Setup mirrors the mr=4 case: point B at the packed column,
 * clear t11/t21, preload b0/a0/a1, and compute K = (K count)/4.
 * TRMM build: A and B may first be advanced by KK k-steps and the effective
 * K count is kept in TEMP for the remainder code.
 */
.L11_M2:
	andi	M,MCO,2					#  mr = 2
	beqz	M,.L11_M1			
	nop

.L80:						
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	B, BO
#else
	dsll	K,    KK, 1 + BASE_SHIFT		#  byte offset in A: KK * 2(mr) elements
	dsll	TEMP, KK, 0 + BASE_SHIFT		#  byte offset in B: KK * 1(nr) elements

	daddu	A, A, K
	daddu	B, BO,  TEMP
#endif
	LD	b0,	0*SIZE(B)

	MTC		$0,t11				#  t11 = 0, then copied into t21
	MOV		t21,t11
	LD	a0,0*SIZE(A)
	LD	a1,1*SIZE(A)

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, KCO, KK				#  TEMP = effective K count for this tile
#elif defined(LEFT)
	daddiu	TEMP, KK, 2
#else
	daddiu	TEMP, KK, 1
#endif
	dsra	K,TEMP,2				#  K=TEMP/4 (shift by 2) unrolled iterations
	beqz	K,.L85
	nop
#else
	move	B, BO
	dsra	K,KCO,2					#  K=KCO/4 unrolled iterations
	LD	b0,	0*SIZE(B)

	MTC		$0,t11				#  t11 = 0, then copied into t21
	MOV		t21,t11
	LD	a0,0*SIZE(A)
	
	beqz	K,.L85
	LD	a1,1*SIZE(A)				#  placed after the branch (delay slot)

#endif

/*
 * nr=1, mr=2 inner product, K loop unrolled by 4 (K holds the trip count).
 * Each iteration reads 4 values of B and 8 of A and accumulates into
 *   t11 using A offsets 0,2,4,6 (a0,a4,a2,a6)
 *   t21 using A offsets 1,3,5,7 (a1,a5,a3,a7)
 * a0/a1/b0 must be preloaded on entry; they are reloaded before the branch.
 * NOTE(review): MADD comes from common.h (not shown); usage suggests
 * MADD d,c,x,y computes d = c + x*y -- confirm.
 */
.L81:								#  nr=1,mr=2,kr=4
	LD	b4,	1*SIZE(B)
	LD	a4,2*SIZE(A)
	MADD	t11,t11,a0,b0
	LD	a5,3*SIZE(A)
	MADD	t21,t21,a1,b0

	LD	b2,	2*SIZE(B)
	LD	a2,4*SIZE(A)
	MADD	t11,t11,a4,b4
	LD	a3,5*SIZE(A)
	MADD	t21,t21,a5,b4
	
	LD	b6,	3*SIZE(B)
	LD	a6,6*SIZE(A)
	MADD	t11,t11,a2,b2
	LD	a7,7*SIZE(A)
	MADD	t21,t21,a3,b2

	daddu	A,A,8*SIZE				#  A+=2(mr)*4(kr)=8*SIZE
	daddu	B,B,4*SIZE				#  B+=1(nr)*4(kr)=4*SIZE

	LD	b0,	0*SIZE(B)				#  preload for next iteration / remainder
	daddiu	K,K,-1					#  one unrolled iteration consumed

	LD	a0,0*SIZE(A)
	MADD	t11,t11,a6,b6

	LD	a1,1*SIZE(A)
	bnez 	K,.L81
	MADD	t21,t21,a7,b6				#  placed after the branch (delay slot)

/*
 * K remainder for the nr=1, mr=2 tile.
 * Bit 1 of the K count (KCO, or TEMP for TRMM) selects a 2-step pass
 * (.L86); bit 0 selects a final 1-step pass (after .L88).
 * a0/a1/b0 hold the preloaded operands of the next k-step on entry.
 */
.L85:								#  kr=2
#ifndef TRMMKERNEL
	andi	K,KCO,2				
#else
	andi	K,TEMP,2
#endif
	beqz	K,.L88
	nop

.L86:			
	LD	b4,	1*SIZE(B)
	LD	a4,2*SIZE(A)
	MADD	t11,t11,a0,b0
	LD	a5,3*SIZE(A)
	MADD	t21,t21,a1,b0
	
	daddu	A,A,4*SIZE				#  A+=2(mr)*2(kr)=4*SIZE
	daddu	B,B,2*SIZE				#  B+=1(nr)*2(kr)=2*SIZE
	
	LD	b0,0(B)					#  preload for the kr=1 step
	LD	a0,0*SIZE(A)
	MADD	t11,t11,a4,b4
	LD	a1,1*SIZE(A)
	MADD	t21,t21,a5,b4


	
.L88:								#  kr=1
#ifndef TRMMKERNEL
	andi	K,KCO,1
#else
	andi	K,TEMP,1
#endif
	beqz	K,.L89				
	LD	ALPHA,152($sp)				#  Get ALPHA; sits after the branch so it is loaded on both paths
	
	MADD	t11,t11,a0,b0
	MADD	t21,t21,a1,b0
	daddu	A,A,2*SIZE				#  A+=2(mr)*1(kr)=2*SIZE
	daddu	B,B,1*SIZE				#  B+=1(nr)*1(kr)=1*SIZE


/*
 * Write-back for the nr=1, mr=2 tile: two consecutive values at CO1.
 * Non-TRMM build: old C values are loaded and combined with the
 * accumulators and ALPHA (presumably C = C + ALPHA*t; MADD is defined in
 * common.h, not shown).
 * TRMM build: C is not read; CO1 is advanced first, so the stores use
 * negative offsets; then A, B and KK are adjusted for the next tile.
 * FETCH is ld with $0 as destination (value discarded).
 */
.L89:								#  Write Back
#ifndef TRMMKERNEL
	LD	c11,0(CO1)					#  Fetch 2 C values
	LD	c21,1*SIZE(CO1)			

	MADD	t11,c11,t11,ALPHA
	MADD	t21,c21,t21,ALPHA

	ST	t11,0(CO1)
	ST	t21,1*SIZE(CO1)

	FETCH	$0,2*SIZE(CO1)
	
	daddu	CO1,CO1,2*SIZE			#  CO1 += 2*SIZE

#else
	daddu	CO1,CO1,2*SIZE			#  CO1 += 2*SIZE (before the stores)
	MUL	t11, ALPHA, t11
	MUL	t21, ALPHA, t21

	FETCH	$0,0(CO1)
	ST	t11, -2 * SIZE(CO1)			#  negative offsets: CO1 was already advanced
	ST	t21, -1 * SIZE(CO1)
#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, KCO, KK				#  TEMP = KCO - KK - (2 if LEFT, else 1)
#ifdef LEFT
	daddiu	TEMP, TEMP, -2
#else
	daddiu	TEMP, TEMP, -1
#endif

	dsll	K,    TEMP, 1 + BASE_SHIFT		#  byte offset in A: TEMP * 2(mr) elements
	dsll	TEMP, TEMP, 0 + BASE_SHIFT		#  byte offset in B: TEMP * 1(nr) elements

	daddu	A, A, K
	daddu	B, B, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 2				#  KK += mr
#endif
#endif


	.align 3					#  align the mr=1 entry point
/*
 * nr=1, mr=1 tile, taken only when bit 0 of MCO is set (otherwise jump to
 * the exit at .L999). Points B at the packed column, preloads a0/b0, clears
 * t11 and computes K = (K count)/4.
 * TRMM build: A and B may first be advanced by KK elements each, and the
 * effective K count is kept in TEMP for the remainder code.
 */
.L11_M1:
	andi		M,MCO,1				#   mr = 1
	beqz	M,.L999			
	nop

.L90:			
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	B,  BO
#else
	dsll	K,    KK, 0 + BASE_SHIFT		#  byte offset in A: KK * 1(mr) elements
	dsll	TEMP, KK, 0 + BASE_SHIFT		#  byte offset in B: KK * 1(nr) elements

	daddu	A, A, K
	daddu	B, BO,  TEMP
#endif
	LD	a0,	0*SIZE(A)
	LD	b0,	0*SIZE(B)
	MTC		$0,t11				#  t11 = 0

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, KCO, KK				#  TEMP = effective K count for this tile
#elif defined(LEFT)
	daddiu	TEMP, KK, 1				#  mr = 1
#else
	daddiu	TEMP, KK, 1				#  nr = 1 (same value as the LEFT branch)
#endif
	dsra	K,  TEMP, 2				#  K = TEMP/4 unrolled iterations
	beqz	K,.L95
	nop

#else
	move	B,  BO
	LD	a0,	0*SIZE(A)
	LD	b0,	0*SIZE(B)
	dsra	K,KCO,2					#  K = KCO/4 unrolled iterations
	beqz	K,.L95
	MTC		$0,t11				#  t11 = 0; placed after the branch (delay slot)
#endif

/*
 * nr=1, mr=1 inner product (dot product), K loop unrolled by 4.
 * Each iteration reads 4 values of A and 4 of B and accumulates the four
 * products into t11. a0/b0 must be preloaded on entry and are reloaded for
 * the next iteration before the branch.
 * NOTE(review): MADD comes from common.h (not shown); usage suggests
 * MADD d,c,x,y computes d = c + x*y -- confirm.
 */
.L91:								#  nr=mr=1,kr=4
	LD	a4,	1*SIZE(A)
	LD	b4,	1*SIZE(B)
	MADD	t11,t11,a0,b0
	
	LD	a2,	2*SIZE(A)
	LD	b2,	2*SIZE(B)
	MADD	t11,t11,a4,b4

	LD	a6,	3*SIZE(A)
	LD	b6,	3*SIZE(B)
	MADD	t11,t11,a2,b2
	
	daddu	A,A,4*SIZE				#  A+=1(mr)*4(kr)=4*SIZE
	daddu	B,B,4*SIZE				#  B+=1(nr)*4(kr)=4*SIZE

	LD	a0,	0*SIZE(A)				#  preload for next iteration / remainder
	LD	b0,	0*SIZE(B)
	MADD	t11,t11,a6,b6
	
	daddiu	K,K,-1					#  one unrolled iteration consumed
	bnez 	K,.L91
	nop

/*
 * K remainder for the nr=1, mr=1 tile.
 * Bit 1 of the K count (KCO, or TEMP for TRMM) selects a 2-step pass
 * (.L96); bit 0 selects a final 1-step pass (after .L98).
 * a0/b0 hold the preloaded operands of the next k-step on entry.
 * The final 1-step pass does not advance A or B.
 */
.L95:								#  kr=2
#ifndef TRMMKERNEL
	andi	K,KCO,2			
#else
	andi	K,TEMP,2
#endif
	beqz	K,.L98
	nop

.L96:			
	LD	a4,	1*SIZE(A)
	LD	b4,	1*SIZE(B)
	MADD	t11,t11,a0,b0
	daddu	B,B,2*SIZE				#  B+=1(nr)*2(kr)=2*SIZE
	daddu	A,A,2*SIZE				#  A+=1(mr)*2(kr)=2*SIZE

	LD	b0,0(B)					#  preload for the kr=1 step
	LD	a0,0(A)
	MADD	t11,t11,a4,b4
	
.L98:								#  kr=1
#ifndef TRMMKERNEL
	andi		K,KCO,1
#else
	andi	K,TEMP,1
#endif
	beqz	K,.L99				
	LD	ALPHA,152($sp)				#  Get ALPHA; sits after the branch so it is loaded on both paths

	MADD	t11,t11,a0,b0


/*
 * Write-back for the nr=1, mr=1 tile: a single value at CO1.
 * Non-TRMM build: the old C value is loaded and combined with t11 and ALPHA
 * (presumably C = C + ALPHA*t11; MADD is defined in common.h, not shown).
 * TRMM build: C is not read; t11 is scaled by ALPHA and stored.
 * Execution then falls through to the exit at .L999.
 */
.L99:								#  Write Back
#ifndef TRMMKERNEL
	LD	c11,0(CO1)					#  Fetch 1 C value
	MADD	t11,c11,t11,ALPHA
	ST	t11,0(CO1)

#else
	MUL	t11, ALPHA, t11

	ST	t11,  0 * SIZE(CO1)
#endif


/*
 * Common exit path: reload the registers kept in the 160-byte stack frame,
 * release the frame and return through $31.
 * Frame slots read here (byte offsets from $sp):
 *     0..48   $16..$22
 *    56..88   $f24..$f28
 *    96..112  $23..$25
 *   120..144  $f20..$f23
 * (offset 152 is where the kernel body reads ALPHA from.)
 * The reloads are independent of each other, so they are grouped by
 * register file: integer registers first, floating-point registers second.
 * NOTE(review): the kernel body also writes $f29, $f30 and $f31 (t41, t11,
 * t21), which are not reloaded here -- confirm against the prologue and the
 * target ABI whether they need to be preserved.
 */
.L999:							#  End
	/* integer registers */
	ld	$16,   0($sp)
	ld	$17,   8($sp)
	ld	$18,  16($sp)
	ld	$19,  24($sp)
	ld	$20,  32($sp)
	ld	$21,  40($sp)
	ld	$22,  48($sp)
	ld	$23,  96($sp)
	ld	$24, 104($sp)
	ld	$25, 112($sp)

	/* floating-point registers */
	LD	$f20,120($sp)
	LD	$f21,128($sp)
	LD	$f22,136($sp)
	LD	$f23,144($sp)
	LD	$f24, 56($sp)
	LD	$f25, 64($sp)
	LD	$f26, 72($sp)
	LD	$f27, 80($sp)
	LD	$f28, 88($sp)

	j	$31					#  return to caller
	daddiu	$sp, $sp, 160				#  placed after the jump (delay slot): pop the frame

	EPILOGUE