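/* Web-viewer residue ("Blob Blame Raw") was captured on this line; it is not assembler input and is kept only as a comment. */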
#define REALNAME ASMNAME
#define ASSEMBLER
#include "common.h"
#define FETCH	ld
#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)

#define M	$4
#define	N	$5
#define	K	$6
#define A	$8
#define B	$9
#define C	$10
#define LDC	$11

#define AO	$12
#define BO	$13

#define CO1	$14
#define CO2	$15
#define CO3	$16
#define CO4	$17

#define KCO	$18
#define MCO	$19
#define NCO	$20

#define SPANB	$21
#define PREB	$23
#define PREA	$24
#define SPANA	$25

#define ALPHA	$f15

#if defined(TRMMKERNEL)
#define	OFFSET	$2
#define	KK	$3
#define	TEMP	$7
#endif

#define R8	8
#define	R9	9
#define R14	14
#define R15	15
#define R16	16
#define R17 17

#define	t11	$f30
#define	t21	$f31
#define	t31	$f28
#define	t41	$f29

#define	t12	$f26
#define	t22	$f27
#define	t32	$f24
#define	t42	$f25

#define	t13	$f22
#define	t23	$f23
#define	t33	$f20
#define	t43	$f21

#define	t14	$f18
#define	t24	$f19
#define	t34	$f16
#define	t44	$f17

#define	c11	$f0
#define	c21	$f1
#define	c31	$f2
#define	c41	$f3

#define	c12	$f4
#define	c22	$f5
#define	c32	$f6
#define	c42	$f7

#define	c13	$f8
#define	c23	$f9
#define	c33	$f10
#define c43	$f11

#define	c14	$f12
#define	c24	$f13
#define	c34	$f14
#define	c44	$f0

#define	a0	$f0
#define	a1	$f1
#define	a2	$f2
#define	a3	$f3
#define	a4	$f4
#define	a5	$f5
#define	a6	$f6
#define	a7	$f7
#define	b0	$f8
#define	b1	$f9
#define	b2	$f10
#define b3	$f11
#define	b4	$f12
#define	b5	$f13
#define	b6	$f14
#define	b7	$f15

#define F31 31
#define F30 30
#define F29 29
#define F28 28
#define F27 27
#define F26 26
#define F25 25
#define F24 24 
#define F23 23
#define F22 22
#define F21 21
#define F20 20
#define F19 19
#define F18 18
#define F17 17
#define F16 16 
#define F15 15
#define F14 14
#define F13 13
#define F12 12
#define F11 11
#define F10 10
#define F9 9
#define F8 8
#define F7 7
#define F6 6
#define F5 5
#define F4 4 
#define F3 3 
#define F2 2 
#define F1 1 
#define F0 0

	/*
	 * Function entry: allocate a 160-byte stack frame and spill the
	 * integer and floating-point registers listed below.
	 * Slot 152($sp) is not written here; it is used at .L0_N4 to keep ALPHA.
	 *
	 * NOTE(review): t41/t11/t21 are mapped to $f29/$f30/$f31 and are
	 * overwritten by the kernel, but no save slot for them exists in this
	 * frame. If the target ABI treats $f24-$f31 as callee-saved this is a
	 * clobber; confirm against the ABI and the epilogue (not visible here).
	 */
	PROLOGUE
	
	daddiu	$sp, $sp, -160				#  reserve the 160-byte frame
	sd	$16,   0($sp)					#  $16 is used as CO3
	sd	$17,   8($sp)					#  $17 is used as CO4
	sd	$18,  16($sp)					#  $18 is used as KCO
	sd	$19,  24($sp)					#  $19 is used as MCO
	sd	$20,  32($sp)					#  $20 is used as NCO
	sd	$21,  40($sp)					#  $21 is used as SPANB
	sd	$22,  48($sp)
	ST	$f24, 56($sp)					#  $f24 is used as t32
	ST	$f25, 64($sp)					#  $f25 is used as t42
	ST	$f26, 72($sp)					#  $f26 is used as t12
	ST	$f27, 80($sp)					#  $f27 is used as t22
	ST	$f28, 88($sp)					#  $f28 is used as t31
	sd	$23,  96($sp)					#  $23 is used as PREB
	sd	$24, 104($sp)					#  $24 is used as PREA
	sd	$25, 112($sp)					#  $25 is used as SPANA
	ST	$f20,120($sp)					#  $f20 is used as t33
	ST	$f21,128($sp)					#  $f21 is used as t43
	ST	$f22,136($sp)					#  $f22 is used as t13
	ST	$f23,144($sp)					#  $f23 is used as t23


	.align	5					
.L0_N4:									#  One-time setup before the loop over 4-column panels
	ST	ALPHA,152($sp)					#  Spill ALPHA: its register is also b7 and gets overwritten in the loops
	move	MCO,M						#  Backup	M

	move	NCO,N						#  Backup	N
	move	KCO,K						#  Backup	K

	move	AO,A						#  Backup	A_addr
	dsra	N,NCO,2						#  N = NCO/4, number of 4-column panels
	
	dsll	LDC,LDC,BASE_SHIFT			#  LDC in bytes (elements << BASE_SHIFT)
	dsll	SPANB,KCO,2+BASE_SHIFT		#  SPANB = KCO*4 elements in bytes, added to BO to form PREB
	
#if defined(TRMMKERNEL)
	LDARG	OFFSET,160($sp)				#	OFFSET is read from just above the 160-byte frame; it is relative to the data part
#endif

#if defined(TRMMKERNEL) && !defined(LEFT)
	neg		KK,OFFSET				#  KK = -OFFSET when LEFT is not defined
#endif
	
	move	BO,B						#  Backup	B_addr
	beq		N,$0,.L0_N2					#  No 4-column panel (NCO<4): go to the nr=2 part
	dsll	SPANA,KCO,1+BASE_SHIFT		#  Delay slot, always executed: SPANA = KCO*2 elements in bytes

.L0_N4_Lb:								#	Start of one 4-column panel: set up the four C column pointers (mr=4,nr=4)
	move	CO1,C						#  CO1 = column 0 of this panel
	dsra	M,MCO,2						#  M = MCO/4, number of 4-row blocks
	
	move	A,AO						#  Reset A
	daddu	CO2,C,LDC					#  CO2 = column 1

	daddu	PREB,BO,SPANB				#  PREB = BO + SPANB, address used for prefetching B
	daddu	CO3,CO2,LDC					#  CO3 = column 2

	daddu	PREA,AO,SPANA				#  PREA = AO + SPANA, address used for prefetching A
	daddu	CO4,CO3,LDC					#  CO4 = column 3

#if defined(TRMMKERNEL) && defined(LEFT)
	move	KK,OFFSET					#  KK restarts from OFFSET for every panel when LEFT is defined
#endif
	beqz	M,.L14_M2					#  No 4-row block: go to the mr=2 remainder
	daddu	C,CO4,LDC					#	Delay slot, always executed: move C to next panel Cj

.L10:									#  Per 4x4 tile: position A/B, zero the 16 accumulators, preload a0-a3 and b0-b3, compute K
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	B,BO						#	(SIDE=L and UPLO=L) or (SIDE=R and UPLO=U)
#else
	dsll	K,KK,2 + BASE_SHIFT			#  KK is the length that needs to span to the data part (KK*4 elements in bytes)
	dsll	TEMP,KK,2 + BASE_SHIFT

	daddu	A,A,K						#  move A B to data part
	daddu	B,BO,TEMP
#endif

	MTC		$0,t11						#  t11 = 0, then copied into the other accumulators
	MOV	t21,t11
	gsLQC1(R8,F1,F0,0)					#	a0,a1

	MOV	t31,t11
	MOV	t41,t11
	gsLQC1(R9,F9,F8,0)					#	b0,b1

	MOV	t12,t11
	MOV	t22,t11
	gsLQC1(R8,F3,F2,1)					#	a2,a3
	
	MOV	t32,t11
	MOV	t42,t11
	gsLQC1(R9,F11,F10,1)				#	b2,b3

	MOV	t13,t11
	MOV	t23,t11
	
	MOV	t33,t11
	MOV	t43,t11

	MOV	t14,t11
	MOV	t24,t11
	

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP,KCO,KK					#  temp is the length of the data part
#elif defined(LEFT)
	daddiu	TEMP, KK, 4					#	S=L,U=L	
#else
	daddiu	TEMP, KK, 4					#	S=R,U=U,for this two situation KK is the length of the data part
#endif
	dsra	K,TEMP,2					#  K = TEMP/4, trip count of the 4x unrolled loop .L11
	MOV	t34,t11
	beqz	K,.L15						#  Fewer than 4 steps: skip the unrolled loop
	MOV	t44,t11							#  Delay slot, always executed: last accumulator cleared

#else							
	move	B,BO						#	Reset B
	MTC		$0,t11						# 	GEMM part	NR=4,MR=4
	gsLQC1(R8,F1,F0,0)					#	a0,a1

	MOV	t21,t11
	MOV	t31,t11
	gsLQC1(R9,F9,F8,0)					#	b0,b1

	MOV	t41,t11
	MOV	t12,t11
	gsLQC1(R8,F3,F2,1)					#	a2,a3
	
	MOV	t22,t11
	MOV	t32,t11
	gsLQC1(R9,F11,F10,1)				#	b2,b3

	MOV	t42,t11
	dsra	K,KCO,2						#  K = KCO/4, trip count of the 4x unrolled loop .L11
	
	MOV	t13,t11
	MOV	t23,t11
	
	MOV	t33,t11
	MOV	t43,t11

	MOV	t14,t11
	MOV	t24,t11
	
	MOV	t34,t11
	beqz	K,.L15						#  Fewer than 4 steps: skip the unrolled loop
	MOV	t44,t11							#	Delay slot, always executed: clear 16 results registers
#endif
	
	.align	5
.L11:									#  kr=4
	gsLQC1(R8,F5,F4,2)			
	MADD	t11,t11,a0,b0
	MADD	t21,t21,a1,b0

	gsLQC1(R9,F13,F12,2)		
	MADD	t12,t12,a0,b1
	MADD	t22,t22,a1,b1

	gsLQC1(R8,F7,F6,3)
	MADD	t31,t31,a2,b0
	MADD	t41,t41,a3,b0
	
	gsLQC1(R9,F15,F14,3)
	MADD	t32,t32,a2,b1
	MADD	t42,t42,a3,b1

	FETCH		$0,(PREB)
	MADD	t13,t13,a0,b2
	MADD	t23,t23,a1,b2
	
	MADD	t14,t14,a0,b3
	MADD	t24,t24,a1,b3
	
	FETCH		$0,(PREA)
	MADD	t33,t33,a2,b2
	MADD	t43,t43,a3,b2

	MADD	t34,t34,a2,b3
	MADD	t44,t44,a3,b3
								
.L12:
	gsLQC1(R8,F1,F0,4)
	MADD	t11,t11,a4,b4
	MADD	t21,t21,a5,b4

	gsLQC1(R9,F9,F8,4)
	MADD	t12,t12,a4,b5
	MADD	t22,t22,a5,b5

	gsLQC1(R8,F3,F2,5)
	MADD	t31,t31,a6,b4
	MADD	t41,t41,a7,b4

	gsLQC1(R9,F11,F10,5)
	MADD	t32,t32,a6,b5
	MADD	t42,t42,a7,b5

	FETCH		$0,4*SIZE(PREB)
	MADD	t13,t13,a4,b6
	MADD	t23,t23,a5,b6

	MADD	t14,t14,a4,b7
	MADD	t24,t24,a5,b7

	FETCH		$0,4*SIZE(PREA)
	MADD	t33,t33,a6,b6
	MADD	t43,t43,a7,b6
	
	MADD	t34,t34,a6,b7
	MADD	t44,t44,a7,b7

.L13:
	gsLQC1(R8,F5,F4,6)	
	MADD	t11,t11,a0,b0
	MADD	t21,t21,a1,b0

	gsLQC1(R9,F13,F12,6)
	MADD	t12,t12,a0,b1
	MADD	t22,t22,a1,b1

	gsLQC1(R8,F7,F6,7)
	MADD	t31,t31,a2,b0
	MADD	t41,t41,a3,b0

	gsLQC1(R9,F15,F14,7)
	MADD	t32,t32,a2,b1
	MADD	t42,t42,a3,b1
	daddu	A,A,16*SIZE					#  4mr*4kr

	FETCH		$0,8*SIZE(PREB)
	MADD	t13,t13,a0,b2
	MADD	t23,t23,a1,b2
	daddu	B,B,16*SIZE					#	4nr*4kr

	MADD	t14,t14,a0,b3
	MADD	t24,t24,a1,b3

	FETCH		$0,8*SIZE(PREA)
	MADD	t33,t33,a2,b2
	MADD	t43,t43,a3,b2

	MADD	t34,t34,a2,b3
	MADD	t44,t44,a3,b3
	
.L14:
	gsLQC1(R8,F1,F0,0)						
	MADD	t11,t11,a4,b4
	MADD	t21,t21,a5,b4

	gsLQC1(R9,F9,F8,0)
	MADD	t12,t12,a4,b5
	MADD	t22,t22,a5,b5

	gsLQC1(R8,F3,F2,1)
	MADD	t31,t31,a6,b4
	MADD	t41,t41,a7,b4
	daddiu	K,K,-1

	gsLQC1(R9,F11,F10,1)
	MADD	t32,t32,a6,b5
	MADD	t42,t42,a7,b5

	FETCH		$0,12*SIZE(PREB)
	MADD	t13,t13,a4,b6
	MADD	t23,t23,a5,b6

	FETCH		$0,12*SIZE(PREA)
	MADD	t14,t14,a4,b7
	MADD	t24,t24,a5,b7

	MADD	t33,t33,a6,b6
	MADD	t43,t43,a7,b6
	daddu	PREB,PREB,16*SIZE
	
	MADD	t34,t34,a6,b7
	MADD	t44,t44,a7,b7
	bnez 	K,.L11
	daddu	PREA,PREA,16*SIZE

.L15:									#  kr=2
#ifndef TRMMKERNEL
	andi	K,KCO,2				
#else
	andi	K,TEMP, 2
#endif
	beqz	K,.L18
	nop

.L16:			
	gsLQC1(R8,F5,F4,2)			
	MADD	t11,t11,a0,b0
	MADD	t21,t21,a1,b0

	gsLQC1(R9,F13,F12,2)		
	MADD	t12,t12,a0,b1
	MADD	t22,t22,a1,b1

	gsLQC1(R8,F7,F6,3)
	MADD	t31,t31,a2,b0
	MADD	t41,t41,a3,b0

	gsLQC1(R9,F15,F14,3)
	MADD	t32,t32,a2,b1
	MADD	t42,t42,a3,b1
	daddu	A,A,8*SIZE					#	4mr*2kr

	FETCH		$0,0(PREB)
	MADD	t13,t13,a0,b2
	MADD	t23,t23,a1,b2
	daddu	B,B,8*SIZE					#	4nr*2kr

	FETCH		$0,0(PREA)
	MADD	t14,t14,a0,b3
	MADD	t24,t24,a1,b3

	MADD	t33,t33,a2,b2
	MADD	t43,t43,a3,b2

	MADD	t34,t34,a2,b3
	MADD	t44,t44,a3,b3
								
.L17:
	gsLQC1(R8,F1,F0,0)
	MADD	t11,t11,a4,b4
	MADD	t21,t21,a5,b4

	gsLQC1(R9,F9,F8,0)
	MADD	t12,t12,a4,b5
	MADD	t22,t22,a5,b5

	gsLQC1(R8,F3,F2,1)
	MADD	t31,t31,a6,b4
	MADD	t41,t41,a7,b4

	gsLQC1(R9,F11,F10,1)
	MADD	t32,t32,a6,b5
	MADD	t42,t42,a7,b5

	FETCH		$0,4*SIZE(PREB)
	MADD	t13,t13,a4,b6
	MADD	t23,t23,a5,b6

	FETCH		$0,4*SIZE(PREA)
	MADD	t14,t14,a4,b7
	MADD	t24,t24,a5,b7
	daddu	PREB,PREB,8*SIZE

	MADD	t33,t33,a6,b6
	MADD	t43,t43,a7,b6
	daddu	PREA,PREA,8*SIZE
	
	MADD	t34,t34,a6,b7
	MADD	t44,t44,a7,b7
	
.L18:									#	kr=1
#ifndef TRMMKERNEL
	andi	K,KCO,1
#else
	andi	K,TEMP,1
#endif
	beqz	K,.L19				  
	LD	ALPHA,152($sp)					#  Get ALPHA
	
	FETCH		$0,0(PREB)
	MADD	t11,t11,a0,b0
	MADD	t21,t21,a1,b0
	daddu	A,A,4*SIZE					#  	4mr*kr

	MADD	t12,t12,a0,b1
	MADD	t22,t22,a1,b1
	daddu	B,B,4*SIZE					#	4nr*kr

	FETCH		$0,0(PREA)
	MADD	t31,t31,a2,b0
	MADD	t41,t41,a3,b0
	daddu	PREB,PREB,4*SIZE

	MADD	t32,t32,a2,b1
	MADD	t42,t42,a3,b1
	daddu	PREA,PREA,4*SIZE

	MADD	t13,t13,a0,b2
	MADD	t23,t23,a1,b2

	MADD	t14,t14,a0,b3
	MADD	t24,t24,a1,b3

	MADD	t33,t33,a2,b2
	MADD	t43,t43,a3,b2

	MADD	t34,t34,a2,b3
	MADD	t44,t44,a3,b3

.L19:									#  Write Back to C
#ifndef TRMMKERNEL				
	LD	c11,0(CO1)						#  GEMM write part 
	LD	c21,1*SIZE(CO1)					#  get 16 C
	LD	c31,2*SIZE(CO1)
	LD	c41,3*SIZE(CO1)

	LD	c12,0(CO2)
	MADD	t11,c11,t11,ALPHA
	LD	c22,1*SIZE(CO2)
	MADD	t21,c21,t21,ALPHA
	LD	c32,2*SIZE(CO2)
	MADD	t31,c31,t31,ALPHA
	LD	c42,3*SIZE(CO2)
	MADD	t41,c41,t41,ALPHA

	LD	c13,0(CO3)
	MADD	t12,c12,t12,ALPHA
	LD	c23,1*SIZE(CO3)
	MADD	t22,c22,t22,ALPHA
	LD	c33,2*SIZE(CO3)
	MADD	t32,c32,t32,ALPHA
	LD	c43,3*SIZE(CO3)
	MADD	t42,c42,t42,ALPHA

	LD	c14,0(CO4)
	MADD	t13,c13,t13,ALPHA
	LD	c24,1*SIZE(CO4)
	MADD	t23,c23,t23,ALPHA
	LD	c34,2*SIZE(CO4)
	MADD	t33,c33,t33,ALPHA
	LD	c44,3*SIZE(CO4)
	MADD	t43,c43,t43,ALPHA

	ST	t11,0(CO1)
	MADD	t14,c14,t14,ALPHA
	ST	t21,1*SIZE(CO1)
	MADD	t24,c24,t24,ALPHA
	ST	t31,2*SIZE(CO1)
	MADD	t34,c34,t34,ALPHA
	ST	t41,3*SIZE(CO1)
	MADD	t44,c44,t44,ALPHA
	daddiu	M,M,-1						#  M--

	ST	t12,0(CO2)
	ST	t22,1*SIZE(CO2)
	ST	t32,2*SIZE(CO2)
	ST	t42,3*SIZE(CO2)

	ST	t13,0(CO3)
	ST	t23,1*SIZE(CO3)
	ST	t33,2*SIZE(CO3)
	ST	t43,3*SIZE(CO3)

	FETCH	$0,4*SIZE(CO1)
	FETCH	$0,4*SIZE(CO2)
	FETCH	$0,4*SIZE(CO3)
	FETCH	$0,4*SIZE(CO4)

	FETCH	$0,8*SIZE(CO1)
	FETCH	$0,8*SIZE(CO2)
	FETCH	$0,8*SIZE(CO3)
	FETCH	$0,8*SIZE(CO4)

	ST	t14,0(CO4)
	daddu	CO1,CO1,4*SIZE				#  COi += 4
	ST	t24,1*SIZE(CO4)
	daddu	CO2,CO2,4*SIZE
	ST	t34,2*SIZE(CO4)
	daddu	CO3,CO3,4*SIZE
	ST	t44,3*SIZE(CO4)
	daddu	PREB,BO,SPANB
	
	bnez	M,.L10				
	daddu	CO4,CO4,4*SIZE

#else							
	MUL	t11, ALPHA, t11					#	TRMM write back part
	MUL	t21, ALPHA, t21
	MUL	t31, ALPHA, t31
	MUL	t41, ALPHA, t41

	ST	t11, 0 * SIZE(CO1)
	MUL	t12, ALPHA, t12
	ST	t21, 1 * SIZE(CO1)
	MUL	t22, ALPHA, t22
	ST	t31, 2 * SIZE(CO1)
	MUL	t32, ALPHA, t32
	ST	t41, 3 * SIZE(CO1)
	MUL	t42, ALPHA, t42

	ST	t12, 0 * SIZE(CO2)
	MUL	t13, ALPHA, t13
	ST	t22, 1 * SIZE(CO2)
	MUL	t23, ALPHA, t23
	ST	t32, 2 * SIZE(CO2)
	MUL	t33, ALPHA, t33
	ST	t42, 3 * SIZE(CO2)
	MUL	t43, ALPHA, t43

	ST	t13, 0 * SIZE(CO3)
	MUL	t14, ALPHA, t14
	ST	t23, 1 * SIZE(CO3)
	MUL	t24, ALPHA, t24
	ST	t33, 2 * SIZE(CO3)
	MUL	t34, ALPHA, t34
	ST	t43, 3 * SIZE(CO3)
	MUL	t44, ALPHA, t44

	ST	t14, 0 * SIZE(CO4)
	daddiu	M,M,-1						#  M--
	ST	t24, 1 * SIZE(CO4)
	ST	t34, 2 * SIZE(CO4)
	ST	t44, 3 * SIZE(CO4)
	daddiu	CO1,CO1, 4 * SIZE
	daddiu	CO2,CO2, 4 * SIZE
	daddiu	CO3,CO3, 4 * SIZE
	daddiu	CO4,CO4, 4 * SIZE	

	FETCH	$0,4*SIZE(CO1)
	FETCH	$0,4*SIZE(CO2)
	FETCH	$0,4*SIZE(CO3)
	FETCH	$0,4*SIZE(CO4)

	FETCH	$0,0(CO1)
	FETCH	$0,0(CO2)
	FETCH	$0,0(CO3)
	FETCH	$0,0(CO4)

#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP,KCO,KK								
#ifdef LEFT
	daddiu	TEMP,TEMP, -4
#else
	daddiu	TEMP,TEMP, -4
#endif
	dsll	K,TEMP,2 + BASE_SHIFT
	dsll	TEMP,TEMP,2 + BASE_SHIFT
	daddu	A,A,K						# 	mov A to the end of panel Ai
	daddu	B,B,TEMP					# 	mov B to the end of panel Bj
#endif

#ifdef LEFT										
	daddiu	KK, KK,4
#endif
	bnez	M,.L10					
	nop
#endif


	.align 3
.L14_M2:
	andi	M, MCO, 2					#	nr=4,mr=2
	beqz	M,.L14_M1			
	nop

.L20:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	B,BO						#	Reset B
#else
	dsll	K,KK,1 + BASE_SHIFT			#	mr=2	
	dsll	TEMP,KK,2 + BASE_SHIFT		#	nr=4
	daddu	A,A,K
	daddu	B,BO,TEMP
#endif

	MTC		$0,t11
	MOV	t21,t11
	gsLQC1(R8,F1,F0,0)					#	a0,a1

	MOV	t12,t11
	MOV	t22,t11
	gsLQC1(R9,F9,F8,0)					#	b0,b1
	
	MOV	t13,t11
	MOV	t23,t11
	gsLQC1(R9,F11,F10,1)				#	b2,b3
	

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP,KCO,KK
#elif defined(LEFT)
	daddiu	TEMP,KK,2					#	left part,controlled by mr, mr=2
#else
	daddiu	TEMP,KK,4					#  	right part,controlled by nr,nr=4
#endif
	dsra	K,TEMP,2
	MOV	t14,t11
	beqz	K,.L25
	MOV	t24,t11							#	clear 2*4=8 results registers

#else
	move	B,BO						#	Reset B 
	MTC		$0,t11
	gsLQC1(R8,F1,F0,0)		
	
	MOV	t21,t11
	MOV	t12,t11
	gsLQC1(R9,F9,F8,0)			

	MOV	t22,t11
	dsra	K,KCO,2				
	gsLQC1(R9,F11,F10,1)		
	
	MOV	t13,t11
	MOV	t23,t11
	
	MOV	t14,t11
	beqz	K,.L25
	MOV	t24,t11
#endif

.L21:									#  nr=4,mr=2,kr=4
	gsLQC1(R8,F5,F4,1)			
	MADD	t11,t11,a0,b0
	MADD	t21,t21,a1,b0

	gsLQC1(R9,F13,F12,2)		
	MADD	t12,t12,a0,b1
	MADD	t22,t22,a1,b1

	gsLQC1(R9,F15,F14,3)
	MADD	t13,t13,a0,b2
	MADD	t23,t23,a1,b2
	
	MADD	t14,t14,a0,b3
	MADD	t24,t24,a1,b3
	
	gsLQC1(R8,F3,F2,2)
	MADD	t11,t11,a4,b4
	MADD	t21,t21,a5,b4

	gsLQC1(R9,F9,F8,4)
	MADD	t12,t12,a4,b5
	MADD	t22,t22,a5,b5

	gsLQC1(R9,F11,F10,5)
	MADD	t13,t13,a4,b6
	MADD	t23,t23,a5,b6

	MADD	t14,t14,a4,b7
	MADD	t24,t24,a5,b7
	daddiu	K,K,-1

	gsLQC1(R8,F7,F6,3)	
	MADD	t11,t11,a2,b0
	MADD	t21,t21,a3,b0

	gsLQC1(R9,F13,F12,6)
	MADD	t12,t12,a2,b1
	MADD	t22,t22,a3,b1

	gsLQC1(R9,F15,F14,7)
	MADD	t13,t13,a2,b2
	MADD	t23,t23,a3,b2
	daddu	A,A,8*SIZE					#  2mr*4kr

	MADD	t14,t14,a2,b3
	MADD	t24,t24,a3,b3
	daddu	B,B,16*SIZE					#	4nr*4kr

	gsLQC1(R8,F1,F0,0)
	MADD	t11,t11,a6,b4
	MADD	t21,t21,a7,b4

	gsLQC1(R9,F9,F8,0)
	MADD	t12,t12,a6,b5
	MADD	t22,t22,a7,b5

	gsLQC1(R9,F11,F10,1)
	MADD	t13,t13,a6,b6
	MADD	t23,t23,a7,b6

	MADD	t14,t14,a6,b7
	bnez 	K,.L21
	MADD	t24,t24,a7,b7

.L25:										
#ifndef TRMMKERNEL
	andi	K,KCO,2						#	kr=2
#else
	andi	K,TEMP,2
#endif
	beqz	K,.L28
	nop

.L26:			
	gsLQC1(R8,F5,F4,1)			
	MADD	t11,t11,a0,b0
	MADD	t21,t21,a1,b0

	gsLQC1(R9,F13,F12,2)		
	MADD	t12,t12,a0,b1
	MADD	t22,t22,a1,b1

	gsLQC1(R9,F15,F14,3)
	MADD	t13,t13,a0,b2
	MADD	t23,t23,a1,b2
	daddu	A,A,4*SIZE					#  	2mr*2kr
	
	MADD	t14,t14,a0,b3
	MADD	t24,t24,a1,b3
	daddu	B,B,8*SIZE					#	4nr*2kr

.L27:
	gsLQC1(R8,F1,F0,0)
	MADD	t11,t11,a4,b4
	MADD	t21,t21,a5,b4

	gsLQC1(R9,F9,F8,0)
	MADD	t12,t12,a4,b5
	MADD	t22,t22,a5,b5

	gsLQC1(R9,F11,F10,1)
	MADD	t13,t13,a4,b6
	MADD	t23,t23,a5,b6

	MADD	t14,t14,a4,b7
	MADD	t24,t24,a5,b7
	
.L28:									#	kr=1	
#ifndef TRMMKERNEL
	andi	K,KCO,1
#else
	andi	K,TEMP,1
#endif
	beqz	K,.L29				  
	LD	ALPHA,152($sp)					#  Get ALPHA
	
	MADD	t11,t11,a0,b0
	MADD	t21,t21,a1,b0
	daddu	A,A,2*SIZE					#  2mr*kr
	daddu	B,B,4*SIZE					#  4nr*kr

	MADD	t12,t12,a0,b1
	MADD	t22,t22,a1,b1

	MADD	t13,t13,a0,b2
	MADD	t23,t23,a1,b2

	MADD	t14,t14,a0,b3
	MADD	t24,t24,a1,b3

.L29:									#  Write Back to C
#ifndef TRMMKERNEL
	LD	c11,0(CO1)						#	GEMM write back part
	LD	c21,1*SIZE(CO1)			

	LD	c12,0(CO2)
	LD	c22,1*SIZE(CO2)
	
	LD	c13,0(CO3)
	MADD	t11,c11,t11,ALPHA
	LD	c23,1*SIZE(CO3)
	MADD	t21,c21,t21,ALPHA

	LD	c14,0(CO4)
	MADD	t12,c12,t12,ALPHA
	LD	c24,1*SIZE(CO4)
	MADD	t22,c22,t22,ALPHA

	ST	t11,0(CO1)
	MADD	t13,c13,t13,ALPHA
	ST	t21,1*SIZE(CO1)
	MADD	t23,c23,t23,ALPHA

	ST	t12,0(CO2)
	MADD	t14,c14,t14,ALPHA
	ST	t22,1*SIZE(CO2)
	MADD	t24,c24,t24,ALPHA

	ST	t13,0(CO3)
	daddu	CO1,CO1,2*SIZE				#  COi += 2
	ST	t23,1*SIZE(CO3)
	daddu	CO2,CO2,2*SIZE

	ST	t14,0(CO4)
	daddu	CO3,CO3,2*SIZE
	ST	t24,1*SIZE(CO4)
	daddu	CO4,CO4,2*SIZE

	FETCH	$0,0(CO1)
	FETCH	$0,0(CO2)
	FETCH	$0,0(CO3)
	FETCH	$0,0(CO4)

#else
	MUL	t11, ALPHA, t11					#	TRMM write back part
	MUL	t21, ALPHA, t21
	
	ST	t11, 0 * SIZE(CO1)
	MUL	t12, ALPHA, t12
	ST	t21, 1 * SIZE(CO1)
	MUL	t22, ALPHA, t22
	
	ST	t12, 0 * SIZE(CO2)
	MUL	t13, ALPHA, t13
	ST	t22, 1 * SIZE(CO2)
	MUL	t23, ALPHA, t23
	
	ST	t13, 0 * SIZE(CO3)
	MUL	t14, ALPHA, t14
	ST	t23, 1 * SIZE(CO3)
	MUL	t24, ALPHA, t24
	
	ST	t14, 0 * SIZE(CO4)
	ST	t24, 1 * SIZE(CO4)
	
	daddiu	CO1,CO1, 2 * SIZE
	daddiu	CO2,CO2, 2 * SIZE
	daddiu	CO3,CO3, 2 * SIZE
	daddiu	CO4,CO4, 2 * SIZE

	FETCH	$0,0(CO1)
	FETCH	$0,0(CO2)
	FETCH	$0,0(CO3)
	FETCH	$0,0(CO4)

#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP,KCO,KK
#ifdef LEFT
	daddiu	TEMP,TEMP,-2
#else
	daddiu	TEMP,TEMP,-4
#endif
	dsll	K,TEMP,1 + BASE_SHIFT
	dsll	TEMP,TEMP,2 + BASE_SHIFT

	daddu	A,A,K						#	move A to next panel Ai
	daddu	B,B,TEMP					#	move B to next panel Bj
#endif

#ifdef LEFT
	daddiu	KK, KK, 2
#endif
#endif


	.align 3
.L14_M1:
	andi	M,MCO,1						#	mr=1	
	beqz	M,.L0_N4_Loop				#  	M = 0, finishing one panel Bj
	nop

.L30:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	B,BO						#	Reset B
#else
	dsll	K,KK, 0 + BASE_SHIFT
	dsll	TEMP,KK,2 + BASE_SHIFT

	daddu	A,A,K
	daddu	B,BO,TEMP
#endif
	MTC		$0,t11
	MOV	t12,t11
	LD	a0,	0 * SIZE(A)					#	a0

	MOV	t13,t11
	gsLQC1(R9,F9,F8,0)					#	b0,b1

	MOV	t14,t11							#	clear result registers
	gsLQC1(R9,F11,F10,1)				#	b2,b3

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, KCO, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 1
#else
	daddiu	TEMP, KK, 4
#endif
	dsra	K,TEMP, 2
	nop
	beqz	K,.L35
	nop
								
#else							
	move	B,BO						#	Reset B, GEMM part
	dsra	K,KCO,2						#  	K=KCO/2
	LD	a0, 0 * SIZE(A)					#	a0

	MTC		$0,t11
	MOV	t12,t11
	gsLQC1(R9,F9,F8,0)					#	b0,b1

	MOV	t13,t11
	MOV	t14,t11
	gsLQC1(R9,F11,F10,1)				#	b2,b3
	
	beqz	K,.L35
	nop
#endif

.L31:									#	nr=4,mr=1,kr=4	
	LD	a1,	1*SIZE(A)					#	load a1
	MADD	t11,t11,a0,b0
	
	gsLQC1(R9,F13,F12,2)				#	b4,b5
	MADD	t12,t12,a0,b1
	
	gsLQC1(R9,F15,F14,3)				#	b6,b7
	MADD	t13,t13,a0,b2
	MADD	t14,t14,a0,b3

	LD	a2,	2*SIZE(A)					#	a2
	MADD	t11,t11,a1,b4
	
	gsLQC1(R9,F9,F8,4)
	MADD	t12,t12,a1,b5
	
	gsLQC1(R9,F11,F10,5)
	MADD	t13,t13,a1,b6
	MADD	t14,t14,a1,b7
	daddiu	K,K,-1

	LD	a3,	3*SIZE(A)					#	a3
	MADD	t11,t11,a2,b0
	
	gsLQC1(R9,F13,F12,6)
	MADD	t12,t12,a2,b1
	daddu	A,A,4*SIZE					#	1mr*4kr
	
	gsLQC1(R9,F15,F14,7)
	MADD	t13,t13,a2,b2
	MADD	t14,t14,a2,b3
	daddu	B,B,16*SIZE					#	4nr*4kr

	LD	a0,	0*SIZE(A)					#	a0
	MADD	t11,t11,a3,b4
	
	gsLQC1(R9,F9,F8,0)
	MADD	t12,t12,a3,b5
	
	gsLQC1(R9,F11,F10,1)
	MADD	t13,t13,a3,b6
	bnez 	K,.L31
	MADD	t14,t14,a3,b7

.L35:									#  kr=2
#ifndef TRMMKERNEL
	andi	K,KCO,2			
#else
	andi	K,TEMP,2
#endif
	beqz	K,.L38
	nop

.L36:			
	LD	a1,1*SIZE(A)					#	load a1
	MADD	t11,t11,a0,b0
	
	gsLQC1(R9,F13,F12,2)				
	MADD	t12,t12,a0,b1
	daddu	A,A,2*SIZE					#  	mr*2kr
	
	gsLQC1(R9,F15,F14,3)
	MADD	t13,t13,a0,b2
	MADD	t14,t14,a0,b3
	daddu	B,B,8*SIZE					#	4nr*2kr


.L37:
	LD	a0,0(A)
	MADD	t11,t11,a1,b4
	
	gsLQC1(R9,F9,F8,0)
	MADD	t12,t12,a1,b5
	
	gsLQC1(R9,F11,F10,1)
	MADD	t13,t13,a1,b6
	MADD	t14,t14,a1,b7
	
.L38:									#  	kr=1
#ifndef TRMMKERNEL
	andi	K,KCO,1
#else
	andi	K,TEMP,1
#endif
	beqz	K,.L39				
	LD	ALPHA,152($sp)					#  Get ALPHA
	
	MADD	t11,t11,a0,b0
	MADD	t12,t12,a0,b1
	daddu	A,A,1*SIZE				
	daddu	B,B,4*SIZE
	
	MADD	t13,t13,a0,b2
	MADD	t14,t14,a0,b3

.L39:									#  Write Back
#ifndef TRMMKERNEL
	LD	c11,0(CO1)			
	LD	c12,0(CO2)
	LD	c13,0(CO3)
	LD	c14,0(CO4)
	
	MADD	t11,c11,t11,ALPHA
	MADD	t12,c12,t12,ALPHA
	MADD	t13,c13,t13,ALPHA
	MADD	t14,c14,t14,ALPHA

	ST	t11,0(CO1)
	ST	t12,0(CO2)
	ST	t13,0(CO3)
	ST	t14,0(CO4)
#else
	MUL	t11, ALPHA, t11
	MUL	t12, ALPHA, t12
	MUL	t13, ALPHA, t13
	MUL	t14, ALPHA, t14

	ST	t11,  0 * SIZE(CO1)
	ST	t12,  0 * SIZE(CO2)
	ST	t13,  0 * SIZE(CO3)
	ST	t14,  0 * SIZE(CO4)

#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, KCO, KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -1
#else
	daddiu	TEMP, TEMP, -4
#endif

	dsll	K,TEMP, 0 + BASE_SHIFT
	dsll	TEMP,TEMP, 2 + BASE_SHIFT

	daddu	A,A,K
	daddu	B,B,TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 1
#endif
#endif


	.align	3
.L0_N4_Loop:								#	mc finished
	daddiu	N,N,-1							#  N--
#if defined(TRMMKERNEL) && !defined(LEFT)
	daddiu	KK, KK,4 
#endif
	bnez	N,.L0_N4_Lb			
	move	BO,B							#  Set BO point to next panel Bj

	.align	5					
.L0_N2:
	andi	N,NCO,2							#  	nr = 2
	beqz	N,.L0_N1		
	nop

.L0_N2_Lb:
	move	CO1,C					
	daddu	CO2,C,LDC

	dsra	M,MCO,2				
	move	A,AO							#  Reset A

	daddu	PREA,AO,SPANA
	daddu	C,CO2,LDC

#if defined(TRMMKERNEL) &&  defined(LEFT)
	move	KK, OFFSET
#endif
	beqz	M,.L12_M2
	nop

.L40:						
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	B,BO							#	Reset B
#else
	dsll	K,KK, 2 + BASE_SHIFT
	dsll	TEMP, KK,1 + BASE_SHIFT	

	daddu	A,A,K
	daddu	B,BO,TEMP
#endif
	MTC		$0,t11
	MOV	t21,t11
	gsLQC1(R8,F1,F0,0)						#	a0,a1

	MOV	t31,t11
	MOV	t41,t11
	gsLQC1(R9,F9,F8,0)						#	b0,b1

	MOV	t12,t11
	MOV	t22,t11
	gsLQC1(R8,F3,F2,1)						#	a2,a3
	
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP,KCO,KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 4
#else
	daddiu	TEMP, KK, 2
#endif
	dsra	K,TEMP,2				
	MOV	t32,t11
	beqz	K,.L45
	MOV	t42,t11

#else
	move	B,BO							#	Reset B
	MTC		$0,t11							#  	gemm part
	gsLQC1(R8,F1,F0,0)						#	a0,a1

	MOV	t21,t11
	MOV	t31,t11
	gsLQC1(R9,F9,F8,0)						#	b0,b1

	MOV	t41,t11
	dsra	K,KCO,2							#	K=KCO/2
	gsLQC1(R8,F3,F2,1)						#	a2,a3
	
	MOV	t12,t11
	MOV	t22,t11
	
	MOV	t32,t11
	beqz	K,.L45
	MOV	t42,t11
#endif

.L41:										#  	nr=2,mr=kr=4
	gsLQC1(R8,F5,F4,2)			
	MADD	t11,t11,a0,b0
	MADD	t21,t21,a1,b0

	gsLQC1(R9,F13,F12,1)	
	MADD	t12,t12,a0,b1
	MADD	t22,t22,a1,b1

	gsLQC1(R8,F7,F6,3)
	MADD	t31,t31,a2,b0
	MADD	t41,t41,a3,b0
	
	FETCH		$0,(PREA)
	MADD	t32,t32,a2,b1
	MADD	t42,t42,a3,b1

.L42:
	gsLQC1(R8,F1,F0,4)
	MADD	t11,t11,a4,b4
	MADD	t21,t21,a5,b4

	gsLQC1(R9,F11,F10,2)
	MADD	t12,t12,a4,b5
	MADD	t22,t22,a5,b5

	gsLQC1(R8,F3,F2,5)
	MADD	t31,t31,a6,b4
	MADD	t41,t41,a7,b4

	FETCH		$0,4*SIZE(PREA)
	MADD	t32,t32,a6,b5
	MADD	t42,t42,a7,b5

.L43:
	gsLQC1(R8,F5,F4,6)	
	MADD	t11,t11,a0,b2
	MADD	t21,t21,a1,b2

	gsLQC1(R9,F15,F14,3)
	MADD	t12,t12,a0,b3
	MADD	t22,t22,a1,b3

	gsLQC1(R8,F7,F6,7)
	MADD	t31,t31,a2,b2
	MADD	t41,t41,a3,b2
	daddu	B,B,8*SIZE						#	2nr*4kr	

	FETCH		$0,8*SIZE(PREA)
	MADD	t32,t32,a2,b3
	MADD	t42,t42,a3,b3
	daddu	A,A,16*SIZE						#	4mr*4kr

.L44:
	gsLQC1(R8,F1,F0,0)
	MADD	t11,t11,a4,b6
	MADD	t21,t21,a5,b6
	daddiu	K,K,-1

	gsLQC1(R9,F9,F8,0)
	MADD	t12,t12,a4,b7
	MADD	t22,t22,a5,b7
	daddu	PREA,PREA,16*SIZE

	gsLQC1(R8,F3,F2,1)
	MADD	t31,t31,a6,b6
	MADD	t41,t41,a7,b6

	FETCH		$0,-4*SIZE(PREA)
	MADD	t32,t32,a6,b7
	bnez 	K,.L41
	MADD	t42,t42,a7,b7


.L45:										#  	kr=2
#ifndef TRMMKERNEL
	andi	K,KCO,2				
#else
	andi	K,TEMP,2
#endif
	beqz	K,.L48
	nop

.L46:			
	gsLQC1(R8,F5,F4,2)			
	MADD	t11,t11,a0,b0
	MADD	t21,t21,a1,b0

	gsLQC1(R9,F13,F12,1)		
	MADD	t12,t12,a0,b1
	MADD	t22,t22,a1,b1

	gsLQC1(R8,F7,F6,3)
	MADD	t31,t31,a2,b0
	MADD	t41,t41,a3,b0
	daddu	B,B,4*SIZE						#  B+=2(nr)*2(kr)*8Byte=32

	FETCH		$0,0(PREA)
	MADD	t32,t32,a2,b1
	MADD	t42,t42,a3,b1
	daddu	A,A,8*SIZE						#  A+=4(mr)*2(kr)*8Byte=8*SIZE

.L47:
	gsLQC1(R8,F1,F0,0)
	MADD	t11,t11,a4,b4
	MADD	t21,t21,a5,b4

	gsLQC1(R9,F9,F8,0)
	MADD	t12,t12,a4,b5
	MADD	t22,t22,a5,b5

	gsLQC1(R8,F3,F2,1)
	MADD	t31,t31,a6,b4
	MADD	t41,t41,a7,b4

	FETCH		$0,4*SIZE(PREA)
	MADD	t32,t32,a6,b5
	MADD	t42,t42,a7,b5
	daddu	PREA,PREA,8*SIZE

	
.L48:										#	 kr=1
#ifndef TRMMKERNEL
	andi	K,KCO,1
#else
	andi	K,TEMP,1
#endif
	beqz	K,.L49				 
	LD	ALPHA,152($sp)						#  Get ALPHA
	
	FETCH		$0,0(PREA)
	MADD	t11,t11,a0,b0
	MADD	t21,t21,a1,b0
	daddu	A,A,4*SIZE						#  A+=4(mr)*1(kr)*8Byte=32

	MADD	t12,t12,a0,b1
	MADD	t22,t22,a1,b1
	daddu	B,B,2*SIZE
	daddu	PREA,PREA,4*SIZE

	MADD	t31,t31,a2,b0
	MADD	t41,t41,a3,b0

	MADD	t32,t32,a2,b1
	MADD	t42,t42,a3,b1

.L49:										#  Write back the 4x2 tile (mr=4,nr=2) and loop to .L40 while M != 0
#ifndef TRMMKERNEL
	/*
	 * GEMM path: load the 8 existing C values (4 rows in each of CO1 and
	 * CO2), combine each with its accumulator and ALPHA through MADD, and
	 * store the results. Loads, MADDs and stores are interleaved on purpose.
	 */
	LD	c11,0(CO1)							#  gemm write back part, load 8 C
	LD	c21,1*SIZE(CO1)			
	LD	c31,2*SIZE(CO1)
	LD	c41,3*SIZE(CO1)

	LD	c12,0(CO2)
	MADD	t11,c11,t11,ALPHA
	LD	c22,1*SIZE(CO2)
	MADD	t21,c21,t21,ALPHA
	LD	c32,2*SIZE(CO2)
	MADD	t31,c31,t31,ALPHA
	LD	c42,3*SIZE(CO2)
	MADD	t41,c41,t41,ALPHA

	ST	t11,0(CO1)
	MADD	t12,c12,t12,ALPHA
	ST	t21,1*SIZE(CO1)
	MADD	t22,c22,t22,ALPHA
	ST	t31,2*SIZE(CO1)
	MADD	t32,c32,t32,ALPHA
	ST	t41,3*SIZE(CO1)
	MADD	t42,c42,t42,ALPHA
	daddiu	M,M,-1							#  M--

	ST	t12,0(CO2)
	ST	t22,1*SIZE(CO2)
	ST	t32,2*SIZE(CO2)
	ST	t42,3*SIZE(CO2)

	FETCH	$0,4*SIZE(CO1)					#  touch the next two 4-element groups of each column
	FETCH	$0,4*SIZE(CO2)
	FETCH	$0,8*SIZE(CO1)
	FETCH	$0,8*SIZE(CO2)

	daddu	CO1,CO1,4*SIZE					#  COi += 4
	bnez	M,.L40				
	daddu	CO2,CO2,4*SIZE					#  Delay slot, always executed

#else
	/*
	 * TRMM path: C is not read; each accumulator is scaled by ALPHA
	 * and stored.
	 */
	MUL	t11, ALPHA, t11
	MUL	t21, ALPHA, t21
	MUL	t31, ALPHA, t31
	MUL	t41, ALPHA, t41
	
	MUL	t12, ALPHA, t12
	ST	t11, 0 * SIZE(CO1)
	MUL	t22, ALPHA, t22
	ST	t21, 1 * SIZE(CO1)
	MUL	t32, ALPHA, t32
	ST	t31, 2 * SIZE(CO1)
	MUL	t42, ALPHA, t42
	ST	t41, 3 * SIZE(CO1)
	
	ST	t12, 0 * SIZE(CO2)
	daddiu	M,M,-1							#  M--
	ST	t22, 1 * SIZE(CO2)
	ST	t32, 2 * SIZE(CO2)
	ST	t42, 3 * SIZE(CO2)
	
	daddiu	CO1,CO1, 4*SIZE
	daddiu	CO2,CO2, 4*SIZE

	/*
	 * CO1/CO2 already point at the next tile here, so offsets 0 and
	 * 4*SIZE touch the same addresses as the 4*SIZE / 8*SIZE prefetches
	 * of the GEMM path above. The second pair previously used a raw
	 * 4-byte offset, which is misaligned for a doubleword load and lies
	 * in the same line as offset 0.
	 */
	FETCH	$0,0(CO1)
	FETCH	$0,0(CO2)
	FETCH	$0,4*SIZE(CO1)
	FETCH	$0,4*SIZE(CO2)

#if ( defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, KCO, KK					#  TEMP = KCO - KK - (4 if LEFT else 2): steps not consumed by this tile
#ifdef LEFT
	daddiu	TEMP, TEMP, -4
#else
	daddiu	TEMP, TEMP, -2
#endif
	dsll	K,TEMP, 2 + BASE_SHIFT			#  mr=4: TEMP*4 elements in bytes
	dsll	TEMP, TEMP, 1 + BASE_SHIFT		#  nr=2: TEMP*2 elements in bytes

	daddu	A,A,K							#  skip A past the unused part
	daddu	B,B,TEMP						#  skip B past the unused part
#endif

#ifdef LEFT
	daddiu	KK, KK, 4						#  KK advances by mr=4 per tile
#endif
	bnez	M,.L40
	nop
#endif


	.align 3
.L12_M2:
	andi	M,MCO,2						#  	mr = 2
	beqz	M,.L12_M1			
	nop

.L50:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	B,BO
#else
	dsll	K,    KK, 1 + BASE_SHIFT	#mr=2
	dsll	TEMP, KK, 1 + BASE_SHIFT	#nr=2

	daddu	A, A, K
	daddu	B, BO,  TEMP
#endif
	MTC		$0,t11
	gsLQC1(R8,F1,F0,0)					#a0,a1

	MOV	t21,t11
	gsLQC1(R9,F9,F8,0)					#b0,b1
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, KCO, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 2
#else
	daddiu	TEMP, KK, 2
#endif
	dsra	K,TEMP,2			
	MOV	t12,t11
	beqz	K,.L55
	MOV	t22,t11

#else
	move	B,BO
	dsra	K,KCO,2						#  K=KCO/2
	gsLQC1(R8,F1,F0,0)					#a0,a1

	MTC		$0,t11
	MOV	t21,t11
	gsLQC1(R9,F9,F8,0)					#b0,b1
	
	MOV	t12,t11
	beqz	K,.L55
	MOV	t22,t11
#endif

.L51:									#  nr=2 mr=2,kr=4: four k-steps per iteration, accumulating into t11,t21,t12,t22
	gsLQC1(R8,F5,F4,1)					#  a4,a5 for the second k-step
	MADD	t11,t11,a0,b0
	MADD	t21,t21,a1,b0

	gsLQC1(R9,F13,F12,1)				#  b4,b5 for the second k-step
	MADD	t12,t12,a0,b1
	MADD	t22,t22,a1,b1

	gsLQC1(R8,F3,F2,2)					#  a2,a3 for the third k-step
	MADD	t11,t11,a4,b4
	MADD	t21,t21,a5,b4

	gsLQC1(R9,F11,F10,2)				#  b2,b3 for the third k-step
	MADD	t12,t12,a4,b5
	MADD	t22,t22,a5,b5
	daddiu	K,K,-1						#  K--

	gsLQC1(R8,F7,F6,3)					#  a6,a7 for the fourth k-step
	MADD	t11,t11,a2,b2
	MADD	t21,t21,a3,b2
	daddu	A,A,8*SIZE					#  A+=2(mr)*4(kr)=8*SIZE

	gsLQC1(R9,F15,F14,3)				#  b6,b7 for the fourth k-step; b7 shares its register with ALPHA
	MADD	t12,t12,a2,b3
	MADD	t22,t22,a3,b3
	daddu	B,B,8*SIZE					#  B+=2(nr)*4(kr)=8*SIZE

	gsLQC1(R8,F1,F0,0)					#  a0,a1 for the next iteration, read through the advanced A
	MADD	t11,t11,a6,b6
	MADD	t21,t21,a7,b6

	gsLQC1(R9,F9,F8,0)					#  b0,b1 for the next iteration, read through the advanced B
	MADD	t12,t12,a6,b7
	bnez 	K,.L51
	MADD	t22,t22,a7,b7				#  Delay slot, always executed

.L55:									#  	kr=2
#ifndef TRMMKERNEL
	andi	K,KCO,2				
#else
	andi	K,TEMP,2
#endif
	beqz	K,.L58
	nop

.L56:			
	gsLQC1(R8,F5,F4,1)			
	MADD	t11,t11,a0,b0
	MADD	t21,t21,a1,b0
	daddu	A,A,4*SIZE					#  A+=2(mr)*2(kr)*8Byte=32

	gsLQC1(R9,F13,F12,1)		
	MADD	t12,t12,a0,b1
	MADD	t22,t22,a1,b1
	daddu	B,B,4*SIZE					#	2nr*2kr

.L57:
	gsLQC1(R8,F1,F0,0)
	MADD	t11,t11,a4,b4
	MADD	t21,t21,a5,b4

	gsLQC1(R9,F9,F8,0)
	MADD	t12,t12,a4,b5
	MADD	t22,t22,a5,b5

	
.L58:									#  kr=1
#ifndef TRMMKERNEL
	andi	K,KCO,1
#else
	andi	K,TEMP, 1
#endif
	beqz	K,.L59				
	LD	ALPHA,152($sp)					#  Get ALPHA
	
	MADD	t11,t11,a0,b0
	MADD	t21,t21,a1,b0
	daddu	A,A,2*SIZE					#  	A+=2(mr)*1(kr)*8Byte=16
	daddu	B,B,2*SIZE					#	2nr*kr

	MADD	t12,t12,a0,b1
	MADD	t22,t22,a1,b1


.L59:									#  Write Back
#ifndef TRMMKERNEL
	LD	c11,0(CO1)						#  write gemm part back Fetch 16 C
	LD	c21,1*SIZE(CO1)			
	LD	c12,0(CO2)
	LD	c22,1*SIZE(CO2)
	
	MADD	t11,c11,t11,ALPHA
	MADD	t21,c21,t21,ALPHA
	MADD	t12,c12,t12,ALPHA
	MADD	t22,c22,t22,ALPHA

	ST	t11,0(CO1)
	ST	t21,1*SIZE(CO1)
	ST	t12,0(CO2)
	ST	t22,1*SIZE(CO2)

	daddu	CO1,CO1,2*SIZE			
	daddu	CO2,CO2,2*SIZE

	FETCH	$0,0(CO1)
	FETCH	$0,0(CO2)
#else
	daddiu	M, M, -1
	daddiu	CO1,CO1, 2 * SIZE
	daddiu	CO2,CO2, 2 * SIZE
	MUL	t11, ALPHA, t11
	MUL	t21, ALPHA, t21
	MUL	t12, ALPHA, t12
	MUL	t22, ALPHA, t22

	ST	t11, -2 * SIZE(CO1)
	ST	t21, -1 * SIZE(CO1)
	ST	t12, -2 * SIZE(CO2)
	ST	t22, -1 * SIZE(CO2)

	FETCH	$0,0(CO1)
	FETCH	$0,0(CO2)

#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, KCO, KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -2
#else
	daddiu	TEMP, TEMP, -2
#endif

	dsll	K,    TEMP, 1 + BASE_SHIFT
	dsll	TEMP, TEMP, 1 + BASE_SHIFT

	daddu	A, A, K
	daddu	B, B, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 2
#endif
#endif


	.align 3
.L12_M1:
	andi	M,MCO,1					#  	mr = 1
	beqz	M,.L0_N2_Loop		
	nop

.L60:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	B,BO					#	Reset B
#else
	dsll	K,    KK, 0 + BASE_SHIFT
	dsll	TEMP, KK, 1 + BASE_SHIFT

	daddu	A, A, K
	daddu	B, BO,  TEMP
#endif
	MTC		$0,t11
	LD	a0, 0*SIZE(A)				#	a0
	
	MOV	t21,t11
	gsLQC1(R9,F9,F8,0)				#	b0,b1
	
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, KCO, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 1
#else
	daddiu	TEMP, KK, 2
#endif
	dsra	K,TEMP,2				
	MOV	t12,t11
	beqz	K,.L65
	MOV	t22,t11

#else
	dsra	K,KCO,2				
	move	B,BO					#  Reset B
	LD	a0,0*SIZE(A)
	
	MTC		$0,t11
	MOV	t21,t11
	gsLQC1(R9,F9,F8,0)		

	MOV	t12,t11
	beqz	K,.L65
	MOV	t22,t11
#endif

.L61:								#	nr=2,mr=1,kr=4: four k-steps per iteration, accumulating into t11 and t12
	LD	a4,	1*SIZE(A)				#	A element of the second k-step, kept in a4
	MADD	t11,t11,a0,b0
	
	gsLQC1(R9,F13,F12,1)			#	b4,b5 for the second k-step
	MADD	t12,t12,a0,b1

	LD	a2,	2*SIZE(A)				#	A element of the third k-step, kept in a2
	MADD	t11,t11,a4,b4
	
	gsLQC1(R9,F11,F10,2)			#	b2,b3 for the third k-step
	MADD	t12,t12,a4,b5

	LD	a6,	3*SIZE(A)				#	A element of the fourth k-step, kept in a6
	MADD	t11,t11,a2,b2
	daddiu	K,K,-1					#  K--
	
	gsLQC1(R9,F15,F14,3)			#	b6,b7 for the fourth k-step; b7 shares its register with ALPHA
	MADD	t12,t12,a2,b3
	daddu	A,A,4*SIZE				#  A+=1(mr)*4(kr)*8Byte=32

	LD	a0,	0*SIZE(A)				#	a0 for the next iteration, read through the advanced A
	MADD	t11,t11,a6,b6
	daddu	B,B,8*SIZE				#  B+=2(nr)*4(kr)=8*SIZE
	
	gsLQC1(R9,F9,F8,0)				#	b0,b1 for the next iteration, read through the advanced B
	bnez 	K,.L61
	MADD	t12,t12,a6,b7			#  Delay slot, always executed

.L65:								#  kr=2
#ifndef TRMMKERNEL
	andi	K,KCO,2				
#else
	andi	K,TEMP,2
#endif
	beqz	K,.L68
	nop

.L66:			
	LD	a4,	1*SIZE(A)				#	a1
	MADD	t11,t11,a0,b0
	daddu	A,A,2*SIZE				#  A+=1(mr)*2(kr)*8Byte=16
	
	gsLQC1(R9,F13,F12,1)	
	MADD	t12,t12,a0,b1
	daddu	B,B,4*SIZE

.L67:
	LD	a0,0(A)						#	a0
	MADD	t11,t11,a4,b4
	
	gsLQC1(R9,F9,F8,0)
	MADD	t12,t12,a4,b5

	
.L68:								#   kr=1: handle the last single k-step for the mr=1,nr=2 tile
#ifndef TRMMKERNEL
	andi	K,KCO,1
#else
	andi	K,TEMP,1
#endif
	beqz	K,.L69					#  Even count: nothing left, go to write back
	LD	ALPHA,152($sp)				#  Delay slot, always executed: reload ALPHA from the slot written at .L0_N4
	
	MADD	t11,t11,a0,b0
	MADD	t12,t12,a0,b1
	daddu	A,A,1*SIZE				#  A+=1(mr)*1(kr)*8Byte=8
	daddu	B,B,2*SIZE				#  B+=2(nr)*1(kr)=2*SIZE


.L69:								#  Write Back
#ifndef TRMMKERNEL
	LD	c11,0(CO1)					#  Fetch 16 C
	LD	c12,0(CO2)
	
	MADD	t11,c11,t11,ALPHA
	MADD	t12,c12,t12,ALPHA

	ST	t11,0(CO1)
	ST	t12,0(CO2)

	daddu	CO1,CO1,1*SIZE		
	daddu	CO2,CO2,1*SIZE

#else
	MUL	t11, ALPHA, t11
	MUL	t12, ALPHA, t12

	ST	t11,  0 * SIZE(CO1)
	ST	t12,  0 * SIZE(CO2)

	daddu	CO1,CO1,1*SIZE			
	daddu	CO2,CO2,1*SIZE

#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, KCO, KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -1
#else
	daddiu	TEMP, TEMP, -2
#endif

	dsll	K,    TEMP, 0 + BASE_SHIFT
	dsll	TEMP, TEMP, 1 + BASE_SHIFT

	daddu	A, A, K
	daddu	B, B, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 1
#endif
#endif

.L0_N2_Loop:
#if defined(TRMMKERNEL) && !defined(LEFT)
	daddiu	KK, KK, 2
#endif
	move	BO, B


	.align	5					
.L0_N1:
	andi	N,NCO,1					#  nr = 1
	beqz	N,.L999					
	nop

	move	CO1,C				
	dsra	M,MCO,2				
	
	move	A,AO					#  Reset A
	daddu	PREA,AO,SPANA
#if defined(TRMMKERNEL) &&  defined(LEFT)
	move	KK, OFFSET
#endif

	beqz	M,.L11_M2
	daddu	C,CO1,LDC

.L70:								#  mr=4,nr=1 tile: set pointers, clear accumulators, preload
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	B, BO					#	Reset B
#else
	dsll	K,    KK, 2 + BASE_SHIFT		#  K    = KK scaled for 4 elements per k step of A
	dsll	TEMP, KK, 0 + BASE_SHIFT		#  TEMP = KK scaled for 1 element per k step of B

	daddu	A, A, K					#  skip the first KK k steps of A
	daddu	B, BO,  TEMP				#  B = BO plus the first KK k steps
#endif

	MTC		$0,t11				#  t11 = 0
	LD	b0,	0*SIZE(B)			#  b0 = first B element
	
	MOV	t21,t11					#  t21 = 0
	gsLQC1(R8,F1,F0,0)				#a0,a1

	MOV	t31,t11					#  t31 = 0
	gsLQC1(R8,F3,F2,1)				#a2,a3
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, KCO, KK				#  TEMP = KCO - KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 4				#  TEMP = KK + mr
#else
	daddiu	TEMP, KK, 1				#  TEMP = KK + nr
#endif
	dsra	K,TEMP,2				#  K = TEMP/4, trip count of the kr=4 loop
	MOV	t41,t11					#  t41 = 0
	beqz	K,.L75					#  fewer than 4 k steps: go to the remainders
	nop
#else
	move	B, BO					#	Reset B
	dsra	K,KCO,2					#  K = KCO/4, trip count of the kr=4 loop
	LD	b0,	0*SIZE(B)			#  b0 = first B element
	
	MTC		$0,t11				#  t11 = 0
	MOV	t21,t11					#  t21 = 0
	gsLQC1(R8,F1,F0,0)				#a0,a1
	
	MOV	t31,t11					#  t31 = 0
	MOV	t41,t11					#  t41 = 0
	gsLQC1(R8,F3,F2,1)				#a2,a3
	
	beqz	K,.L75					#  fewer than 4 k steps: go to the remainders
	nop
#endif

.L71:								#  nr=1,mr=kr=4: main loop, k step 0 of 4
	LD	b4,	1*SIZE(B)				#	b4 = B element of k step 1
	MADD	t11,t11,a0,b0				#  t11 += a0*b0
	
	gsLQC1(R8,F5,F4,2)				#  a4,a5 for k step 1
	MADD	t21,t21,a1,b0				#  t21 += a1*b0

	gsLQC1(R8,F7,F6,3)				#  a6,a7 for k step 1
	FETCH		$0,(PREA)			#  load into $0: result discarded, touches the line at PREA
	MADD	t31,t31,a2,b0				#  t31 += a2*b0
	MADD	t41,t41,a3,b0				#  t41 += a3*b0

.L72:								#  k step 1 of 4
	LD	b2,	2*SIZE(B)				#	b2 = B element of k step 2
	MADD	t11,t11,a4,b4
	gsLQC1(R8,F1,F0,4)				#  a0,a1 for k step 2
	MADD	t21,t21,a5,b4

	gsLQC1(R8,F3,F2,5)				#  a2,a3 for k step 2
	FETCH		$0,4*SIZE(PREA)
	MADD	t31,t31,a6,b4
	MADD	t41,t41,a7,b4

.L73:								#  k step 2 of 4
	LD	b6,	3*SIZE(B)			#  b6 = B element of k step 3
	MADD	t11,t11,a0,b2
	daddu	B,B,4*SIZE				#  B+=1(nr)*4(kr)*8Byte=32
	
	gsLQC1(R8,F5,F4,6)				#  a4,a5 for k step 3
	MADD	t21,t21,a1,b2
	FETCH		$0,8*SIZE(PREA)

	gsLQC1(R8,F7,F6,7)				#  a6,a7 for k step 3
	MADD	t31,t31,a2,b2
	MADD	t41,t41,a3,b2
	daddu	A,A,16*SIZE				#  A+=4(mr)*4(kr)*8Byte=16*SIZE

.L74:								#  k step 3 of 4, and preload for the next iteration
	LD	b0,	0*SIZE(B)			#  b0 from the advanced B
	MADD	t11,t11,a4,b6
	daddu	PREA,PREA,16*SIZE			#  PREA += 16 elements
	
	gsLQC1(R8,F1,F0,0)				#  a0,a1 from the advanced A
	MADD	t21,t21,a5,b6
	daddiu	K,K,-1					#  one kr=4 block done
	FETCH		$0,-32(PREA)			#  32 bytes below the advanced PREA
                                                        
	gsLQC1(R8,F3,F2,1)				#  a2,a3 from the advanced A
	MADD	t31,t31,a6,b6
	bnez 	K,.L71					#  loop while kr=4 blocks remain
	MADD	t41,t41,a7,b6				#  follows the branch: last MADD of the block, executed on both paths


.L75:								#  kr=2 remainder of the 4x1 tile
#ifndef TRMMKERNEL
	andi	K,KCO,2					#  K = KCO & 2
#else
	andi	K,TEMP,2				#  K = TEMP & 2
#endif
	beqz	K,.L78					#  no pair of k steps left
	nop

.L76:								#  first k step of the pair
	LD	b4,	1*SIZE(B)			#  b4 = B element of the second k step
	MADD	t11,t11,a0,b0
	daddu	B,B,2*SIZE				#  B+=1(nr)*2(kr)*8Byte=16
	
	gsLQC1(R8,F5,F4,2)				#  a4,a5 for the second k step
	MADD	t21,t21,a1,b0
	FETCH		$0,0(PREA)

	gsLQC1(R8,F7,F6,3)				#  a6,a7 for the second k step
	MADD	t31,t31,a2,b0
	MADD	t41,t41,a3,b0
	daddu	A,A,8*SIZE				#  A+=4(mr)*2(kr)*8Byte=8*SIZE

.L77:								#  second k step of the pair, and preload for .L78
	LD	b0,0(B)					#  b0 from the advanced B
	MADD	t11,t11,a4,b4
	
	gsLQC1(R8,F1,F0,0)				#  a0,a1 from the advanced A
	MADD	t21,t21,a5,b4
	FETCH		$0,4*SIZE(PREA)

	gsLQC1(R8,F3,F2,1)				#  a2,a3 from the advanced A
	MADD	t31,t31,a6,b4
	MADD	t41,t41,a7,b4
	daddu	PREA,PREA,8*SIZE			#  PREA += 8 elements

	
.L78:								#   kr=1 remainder of the 4x1 tile
#ifndef TRMMKERNEL
	andi	K,KCO,1					#  K = KCO & 1
#else
	andi	K,TEMP,1				#  K = TEMP & 1
#endif
	beqz	K,.L79				 		#  no odd k step left: go to write back
	LD	ALPHA,152($sp)				#  Get ALPHA (placed after the branch; needed at .L79 on both paths)
	
	FETCH		$0,0(PREA)
	MADD	t11,t11,a0,b0				#  uses a0..a3 and b0 preloaded by the preceding section
	MADD	t21,t21,a1,b0
	daddu	A,A,4*SIZE				#  A+=4(mr)*1(kr)*8Byte=32

	MADD	t31,t31,a2,b0
	MADD	t41,t41,a3,b0
	daddu	B,B,1*SIZE				#  B+=1(nr)*1(kr)*8Byte=8
	daddu	PREA,PREA,4*SIZE			#  PREA += 4 elements


.L79:								#  Write Back of the 4x1 tile
#ifndef TRMMKERNEL
	LD	c11,0(CO1)					#  Fetch 4 C
	LD	c21,1*SIZE(CO1)			
	LD	c31,2*SIZE(CO1)
	LD	c41,3*SIZE(CO1)

	MADD	t11,c11,t11,ALPHA			#  tX1 = cX1 + tX1*ALPHA
	MADD	t21,c21,t21,ALPHA
	MADD	t31,c31,t31,ALPHA
	MADD	t41,c41,t41,ALPHA

	ST	t11,0(CO1)
	ST	t21,1*SIZE(CO1)
	ST	t31,2*SIZE(CO1)
	ST	t41,3*SIZE(CO1)
	daddiu	M,M,-1					#  M--

	FETCH	$0,4*SIZE(CO1)				#  touch the C elements that follow this tile
	FETCH	$0,8*SIZE(CO1)

	bnez	M,.L70					#  M!=0
	daddu	CO1,CO1,4*SIZE			#  COx += 4*8Byte (follows the branch; executed on both paths)
#else
	daddiu	M,M,-1					#  M--
	MUL	t11, ALPHA, t11				#  TRMM: result is ALPHA*tX1, the old C value is not read
	MUL	t21, ALPHA, t21
	MUL	t31, ALPHA, t31
	MUL	t41, ALPHA, t41

	ST	t11,0(CO1)
	ST	t21,1*SIZE(CO1)
	ST	t31,2*SIZE(CO1)
	ST	t41,3*SIZE(CO1)

	FETCH	$0,4*SIZE(CO1)				#  touch the C elements that follow this tile
	FETCH	$0,8*SIZE(CO1)

	daddu	CO1,CO1,4*SIZE				#  COx += 4 elements
#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, KCO, KK				#  TEMP = KCO - KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -4				#  minus mr=4
#else
	daddiu	TEMP, TEMP, -1				#  minus nr=1
#endif

	dsll	K,    TEMP, 2 + BASE_SHIFT		#  K    = TEMP scaled for 4 elements per k step of A
	dsll	TEMP, TEMP, 0 + BASE_SHIFT		#  TEMP = TEMP scaled for 1 element per k step of B

	daddu	A, A,K					#  advance A and B past the k steps this tile did not consume
	daddu	B, B, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 4				#  KK += mr (4)
#endif
	bnez	M,.L70					#  more 4-row tiles to do
	nop
#endif


	.align 3
.L11_M2:							#  nr=1: two leftover rows
	andi	M,MCO,2					#  mr = 2
	beqz	M,.L11_M1				#  bit 1 of MCO clear: try mr=1
	nop

.L80:								#  mr=2,nr=1 tile: set pointers, clear accumulators, preload
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	B, BO					#  Reset B
#else
	dsll	K,    KK, 1 + BASE_SHIFT		#  K    = KK scaled for 2 elements per k step of A
	dsll	TEMP, KK, 0 + BASE_SHIFT		#  TEMP = KK scaled for 1 element per k step of B

	daddu	A, A, K					#  skip the first KK k steps of A
	daddu	B, BO,  TEMP				#  B = BO plus the first KK k steps
#endif

	LD	b0,	0*SIZE(B)			#  b0 = first B element
	MTC		$0,t11				#  t11 = 0
	
	gsLQC1(R8,F1,F0,0)				#a0,a1
	MOV		t21,t11				#  t21 = 0
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, KCO, KK				#  TEMP = KCO - KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 2				#  TEMP = KK + mr
#else
	daddiu	TEMP, KK, 1				#  TEMP = KK + nr
#endif
	dsra	K,TEMP,2				#  K=TEMP/4, trip count of the kr=4 loop
	beqz	K,.L85					#  fewer than 4 k steps: go to the remainders
	nop
#else
	move	B, BO					#  Reset B
	dsra	K,KCO,2					#  K = KCO/4, trip count of the kr=4 loop
	LD	b0,	0*SIZE(B)			#  b0 = first B element

	MTC		$0,t11				#  t11 = 0
	MOV		t21,t11				#  t21 = 0
	gsLQC1(R8,F1,F0,0)				#a0,a1
	
	beqz	K,.L85					#  fewer than 4 k steps: go to the remainders
	nop
#endif

.L81:								#  nr=1,mr=2,kr=4: main loop
	LD	b4,	1*SIZE(B)			#  b4 = B element of k step 1
	gsLQC1(R8,F5,F4,1)				#  a4,a5 for k step 1
	MADD	t11,t11,a0,b0				#  k step 0
	MADD	t21,t21,a1,b0

	LD	b2,	2*SIZE(B)			#  b2 = B element of k step 2
	gsLQC1(R8,F3,F2,2)				#  a2,a3 for k step 2
	MADD	t11,t11,a4,b4				#  k step 1
	MADD	t21,t21,a5,b4
	
	LD	b6,	3*SIZE(B)			#  b6 = B element of k step 3
	gsLQC1(R8,F7,F6,3)				#  a6,a7 for k step 3
	MADD	t11,t11,a2,b2				#  k step 2
	MADD	t21,t21,a3,b2

	daddu	A,A,8*SIZE				#  A+=2(mr)*4(kr)*8Byte=8*SIZE
	daddu	B,B,4*SIZE				#  B+=1(nr)*4(kr)*8Byte=32

	LD	b0,	0*SIZE(B)			#  preload b0 and a0,a1 from the advanced pointers
	gsLQC1(R8,F1,F0,0)
	MADD	t11,t11,a6,b6				#  k step 3
	MADD	t21,t21,a7,b6
	
	daddiu	K,K,-1					#  one kr=4 block done
	bnez 	K,.L81					#  loop while kr=4 blocks remain
	nop

.L85:								#  kr=2 remainder of the 2x1 tile
#ifndef TRMMKERNEL
	andi	K,KCO,2					#  K = KCO & 2
#else
	andi	K,TEMP,2				#  K = TEMP & 2
#endif
	beqz	K,.L88					#  no pair of k steps left
	nop

.L86:			
	gsLQC1(R8,F5,F4,1)				#  a4,a5 for the second k step
	LD	b4,	1*SIZE(B)			#  b4 = B element of the second k step
	MADD	t11,t11,a0,b0				#  first k step
	MADD	t21,t21,a1,b0
	
	daddu	A,A,4*SIZE				#  A+=2(mr)*2(kr)*8Byte=32
	daddu	B,B,2*SIZE				#  B+=1(nr)*2(kr)*8Byte=16
	
	gsLQC1(R8,F1,F0,0)				#  preload a0,a1 and b0 from the advanced pointers for .L88
	LD	b0,0(B)
	MADD	t11,t11,a4,b4				#  second k step
	MADD	t21,t21,a5,b4

	
.L88:								#  kr=1 remainder of the 2x1 tile
#ifndef TRMMKERNEL
	andi	K,KCO,1					#  K = KCO & 1
#else
	andi	K,TEMP,1				#  K = TEMP & 1
#endif
	beqz	K,.L89					#  no odd k step left: go to write back
	LD	ALPHA,152($sp)				#  Get ALPHA (placed after the branch; needed at .L89 on both paths)
	
	MADD	t11,t11,a0,b0				#  uses a0,a1 and b0 preloaded by the preceding section
	MADD	t21,t21,a1,b0
	daddu	A,A,2*SIZE				#  A+=2(mr)*1(kr)*8Byte=16
	daddu	B,B,1*SIZE				#  B+=1(nr)*1(kr)*8Byte=8


.L89:								#  Write Back of the 2x1 tile
#ifndef TRMMKERNEL
	LD	c11,0(CO1)					#  Fetch 2 C
	LD	c21,1*SIZE(CO1)			

	MADD	t11,c11,t11,ALPHA			#  tX1 = cX1 + tX1*ALPHA
	MADD	t21,c21,t21,ALPHA

	ST	t11,0(CO1)
	ST	t21,1*SIZE(CO1)

	FETCH	$0,2*SIZE(CO1)				#  touch the C element that follows this tile
	
	daddu	CO1,CO1,2*SIZE			#  COx += 2*8Byte

#else
	daddu	CO1,CO1,2*SIZE			#  COx += 2*8Byte (done first; the stores below use negative offsets)
	MUL	t11, ALPHA, t11				#  TRMM: result is ALPHA*tX1, the old C value is not read
	MUL	t21, ALPHA, t21

	FETCH	$0,0(CO1)				#  touch the C element that follows this tile
	ST	t11, -2 * SIZE(CO1)
	ST	t21, -1 * SIZE(CO1)
#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, KCO, KK				#  TEMP = KCO - KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -2				#  minus mr=2
#else
	daddiu	TEMP, TEMP, -1				#  minus nr=1
#endif

	dsll	K,    TEMP, 1 + BASE_SHIFT		#  K    = TEMP scaled for 2 elements per k step of A
	dsll	TEMP, TEMP, 0 + BASE_SHIFT		#  TEMP = TEMP scaled for 1 element per k step of B

	daddu	A, A, K					#  advance A and B past the k steps this tile did not consume
	daddu	B, B, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 2				#  KK += mr (2)
#endif
#endif


	.align 3
.L11_M1:							#  nr=1: one leftover row
	andi		M,MCO,1				#   mr = 1
	beqz	M,.L999					#  MCO is even: nothing left, go to the exit
	nop

.L90:								#  mr=1,nr=1 tile: set pointers, clear accumulator, preload
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	B,  BO					#  Reset B
#else
	dsll	K,    KK, 0 + BASE_SHIFT		#  K    = KK scaled for 1 element per k step of A
	dsll	TEMP, KK, 0 + BASE_SHIFT		#  TEMP = KK scaled for 1 element per k step of B

	daddu	A, A, K					#  skip the first KK k steps of A
	daddu	B, BO,  TEMP				#  B = BO plus the first KK k steps
#endif
	LD	a0,	0*SIZE(A)			#  preload the first A and B elements
	LD	b0,	0*SIZE(B)
	MTC		$0,t11				#  t11 = 0
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, KCO, KK				#  TEMP = KCO - KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 1				#  TEMP = KK + mr
#else
	daddiu	TEMP, KK, 1				#  TEMP = KK + nr
#endif
	dsra	K,  TEMP, 2				#  K = TEMP/4, trip count of the kr=4 loop
	beqz	K,.L95					#  fewer than 4 k steps: go to the remainders
	nop

#else
	move	B,  BO					#  Reset B
	LD	a0,	0*SIZE(A)			#  preload the first A and B elements
	LD	b0,	0*SIZE(B)
	dsra	K,KCO,2					#  K = KCO/4, trip count of the kr=4 loop
	beqz	K,.L95					#  fewer than 4 k steps: go to the remainders
	MTC		$0,t11				#  t11 = 0 (follows the branch; must take effect on both paths)
#endif

.L91:								#  nr=mr=1,kr=4: main loop (dot product, 4 k steps per pass)
	LD	a4,	1*SIZE(A)			#  operands of k step 1
	LD	b4,	1*SIZE(B)
	MADD	t11,t11,a0,b0				#  k step 0
	
	LD	a2,	2*SIZE(A)			#  operands of k step 2
	LD	b2,	2*SIZE(B)
	MADD	t11,t11,a4,b4				#  k step 1

	LD	a6,	3*SIZE(A)			#  operands of k step 3
	LD	b6,	3*SIZE(B)
	MADD	t11,t11,a2,b2				#  k step 2
	
	daddu	A,A,4*SIZE				#  A+=1(mr)*4(kr)*8Byte=32
	daddu	B,B,4*SIZE				#  B+=1(nr)*4(kr)*8Byte=32

	LD	a0,	0*SIZE(A)			#  preload a0,b0 from the advanced pointers
	LD	b0,	0*SIZE(B)
	MADD	t11,t11,a6,b6				#  k step 3
	
	daddiu	K,K,-1					#  one kr=4 block done
	bnez 	K,.L91					#  loop while kr=4 blocks remain
	nop

.L95:								#  kr=2 remainder of the 1x1 tile
#ifndef TRMMKERNEL
	andi	K,KCO,2					#  K = KCO & 2
#else
	andi	K,TEMP,2				#  K = TEMP & 2
#endif
	beqz	K,.L98					#  no pair of k steps left
	nop

.L96:			
	LD	a4,	1*SIZE(A)			#  operands of the second k step
	LD	b4,	1*SIZE(B)
	MADD	t11,t11,a0,b0				#  first k step
	daddu	B,B,2*SIZE				#  B+=1(nr)*2(kr)*8Byte=16
	daddu	A,A,2*SIZE				#  A+=1(mr)*2(kr)*8Byte=16

	LD	b0,0(B)					#  preload b0,a0 from the advanced pointers for .L98
	LD	a0,0(A)
	MADD	t11,t11,a4,b4				#  second k step
	
.L98:								#  kr=1 remainder of the 1x1 tile
#ifndef TRMMKERNEL
	andi		K,KCO,1				#  K = KCO & 1
#else
	andi	K,TEMP,1				#  K = TEMP & 1
#endif
	beqz	K,.L99					#  no odd k step left: go to write back
	LD	ALPHA,152($sp)				#  Get ALPHA (placed after the branch; needed at .L99 on both paths)

	MADD	t11,t11,a0,b0				#  last k step; A and B are not advanced here


.L99:								#  Write Back of the 1x1 tile
#ifndef TRMMKERNEL
	LD	c11,0(CO1)					#  Fetch 1 C
	MADD	t11,c11,t11,ALPHA			#  t11 = c11 + t11*ALPHA
	ST	t11,0(CO1)

#else
	MUL	t11, ALPHA, t11				#  TRMM: result is ALPHA*t11, the old C value is not read

	ST	t11,  0 * SIZE(CO1)
#endif


.L999:							#  Common exit: reload saved registers from the frame, then return
	ld	$25, 112($sp)				#  integer registers $16..$25
	ld	$24, 104($sp)
	ld	$23,  96($sp)
	ld	$22,  48($sp)
	ld	$21,  40($sp)
	ld	$20,  32($sp)
	ld	$19,  24($sp)
	ld	$18,  16($sp)
	ld	$17,   8($sp)
	ld	$16,   0($sp)

	LD	$f20,120($sp)				#  floating-point registers $f20..$f28
	LD	$f21,128($sp)
	LD	$f22,136($sp)
	LD	$f23,144($sp)
	LD	$f24, 56($sp)
	LD	$f25, 64($sp)
	LD	$f26, 72($sp)
	LD	$f27, 80($sp)
	LD	$f28, 88($sp)				#  NOTE(review): $f29..$f31 (t41,t11,t21) are written above but not reloaded here; confirm against the prologue and the target ABI

	j	$31					#  return to the caller
	daddiu	$sp, $sp, 160				#  follows the jump: release the 160-byte stack frame

	EPILOGUE