# Blob Blame Raw -- web viewer navigation text captured with the file, not part of the kernel source
#define REALNAME ASMNAME
#define ASSEMBLER
#include "common.h"

# FETCH is a plain doubleword load. The uses visible in this file all target
# register $0, so the loaded value is discarded and the access only touches
# the addressed cache line.
#define FETCH	ld
# Size in bytes of the stack frame reserved by the prologue.
#define	STACKSIZE	160
# Hand-encoded 32-bit instruction words, presumably the Loongson gslqc1 and
# gssqc1 quad-word FP load and store -- confirm against the Loongson manual.
# Field layout, as built by the expressions below:
#   bits 31-26  opcode, 0x32 for the load and 0x3A for the store
#   bits 25-21  base, a general register number, see the Rnn constants
#   bits 20-16  ft, an FP register number, see the Fnn constants
#   bit  15     always set
#   bits 14-6   offset, a nine bit field. Callers in this file advance it
#               by one for every four SIZE-sized elements.
#   bit  5      always set
#   bits 4-0    fq, an FP register number
# Every argument is parenthesised and masked to its field width so that an
# expression argument or a negative offset cannot spill into a neighbouring
# field. For in-range arguments the emitted word is unchanged.
#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|((base)&0x1F)<<21|((ft)&0x1F)<<16|0x1<<15|((offset)&0x1FF)<<6|0x1<<5|((fq)&0x1F))
#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|((base)&0x1F)<<21|((ft)&0x1F)<<16|0x1<<15|((offset)&0x1FF)<<6|0x1<<5|((fq)&0x1F))


#####	Parameter registers	####

# Integer registers holding the kernel arguments. The loop setup further
# down splits M into blocks of eight rows and N into blocks of four columns.
# K is the length of the accumulation loop. A, B and C are base pointers.
# LDC is the stride between columns of C and is scaled to bytes at .L4.
#define M	$4
#define	N	$5
#define	K	$6
#define A	$8
#define B	$9
#define C	$10
#define LDC	$11

####	Pointer A, B, C	####
# Running read pointers into the A and B panels.
#define AO	$12
#define BO	$13

# Pointers to four consecutive columns of C, each LDC bytes apart.
#define CO1	$14
#define CO2	$15
#define CO3	$16
#define CO4	$17

# Addresses used only by the FETCH touches that run ahead of AO and BO.
#define PREA	$18
#define PREB	$19

####	Used registers	####
# Operands loaded from the A panel.
#define A1	$f0
#define A2	$f1
#define A3	$f2
#define A4	$f3
#define A5	$f4
#define	A6	$f5
#define	A7	$f6
#define	A8	$f7

# Operands loaded from the B panel, plus the PLU results derived from them.
#define B1	$f8
#define B2	$f9
#define B3	$f10
#define B4	$f11
#define B5	$f12
#define	B6	$f13
#define	B7	$f14
#define	B8	$f15

# Sixteen accumulators. They are cleared at .L481 and updated by MADPS.
#define C11	$f16
#define C12	$f17
#define C21	$f18
#define C22	$f19
#define C31	$f20
#define C32	$f21
#define C41	$f22
#define C42	$f23
#define C13	$f24
#define C14	$f25
#define C23	$f26
#define C24	$f27
#define C33	$f28
#define C34	$f29
#define C43	$f30
#define C44	$f31

# Loop counters. I counts blocks of eight rows, J counts blocks of four
# columns, L counts passes of the unrolled inner loop.
#define I	$2
#define J	$3
#define L	$7

####	Alpha register	####
# ALPHA shares $f15 with B8. The setup code at .L4 stores ALPHA to the stack
# before the kernel starts overwriting B8.
#define ALPHA	$f15

# Bare register numbers for the gsLQC1 and gsSQC1 encoders. Those macros
# build the instruction word arithmetically, so they need plain integers
# instead of dollar-prefixed register names. Fnn stands for FP register nn.
#define F31 31
#define F30 30
#define F29 29
#define F28 28
#define F27 27
#define F26 26
#define F25 25
#define F24 24 
#define F23 23
#define F22 22
#define F21 21
#define F20 20
#define F19 19
#define F18 18
#define F17 17
#define F16 16 
#define F15 15
#define F14 14
#define F13 13
#define F12 12
#define F11 11
#define F10 10
#define F9 9
#define F8 8
#define F7 7
#define F6 6
#define F5 5
#define F4 4 
#define F3 3 
#define F2 2 
#define F1 1 
#define F0 0

# General register numbers. R12 is AO and R13 is BO.
#define	R12	12
#define	R13	13

# R14 to R17 are CO1 to CO4.
#define R14	14
#define R15	15
#define	R16	16
#define	R17	17

# Extra integer registers used only by the TRMM build. OFFSET is loaded from
# the stack in the prologue, KK is derived from OFFSET, TEMP is scratch.
#if defined(TRMMKERNEL)
#define	OFFSET	$23
#define	KK		$24
#define TEMP	$25
#endif

#	.text
#	.align	2
##	.globl	gemm
#	.set	nomips16
#	.ent	gemm
#	.type	gemm, @function
#gemm:
#	.frame	$sp,STACKSIZE,$31		# vars= 48, regs= 1/0, args= 0, gp= 0
#	.mask	0x40000000,-8
#	.fmask	0x00000000,0
#	.set	noreorder
#	.set	nomacro
	

	# Function entry. PROLOGUE is a macro defined outside this file view.
	PROLOGUE

	# Reserve the frame. All slot offsets below are relative to the new $sp.
	daddiu	$sp,$sp,-STACKSIZE

	# Save $16-$22 in slots 0 to 48. $16-$19 serve as CO3, CO4, PREA and PREB.
	sd	$16,   0($sp)
	sd	$17,   8($sp)
	sd	$18,  16($sp)
	sd	$19,  24($sp)
	sd	$20,  32($sp)
	sd	$21,  40($sp)
	sd	$22,  48($sp)

	# Save $f24-$f28, which the kernel uses as C13, C14, C23, C24 and C33.
	# NOTE(review): C34, C43 and C44 are $f29-$f31 and are also written by the
	# kernel, yet they are not saved here. Confirm whether the target ABI
	# requires preserving them and what the epilogue restores.
	ST	$f24, 56($sp)
	ST	$f25, 64($sp)
	ST	$f26, 72($sp)
	ST	$f27, 80($sp)
	ST	$f28, 88($sp)

#if defined(TRMMKERNEL)
	# TRMM build: save the three extra registers OFFSET, KK and TEMP.
	sd	$23,  96($sp)
	sd	$24, 104($sp)
	sd	$25, 112($sp)

	# OFFSET is read from the first stack slot above this frame, which is the
	# slot the incoming $sp pointed at. The displacement is spelled STACKSIZE
	# so that it stays correct when the frame size is changed.
	LDARG	OFFSET, STACKSIZE($sp)
#endif

#ifndef __64BIT__
	# Builds without __64BIT__ also save $f20-$f23, used as C31, C32, C41, C42.
	ST	$f20,120($sp)
	ST	$f21,128($sp)
	ST	$f22,136($sp)
	ST	$f23,144($sp)
#endif

	# Outer loop setup over N, taken four columns of C at a time.
	.align	4
.L4:
	dsra	J, N, 2				#	NR=4
	dsll	LDC, LDC, BASE_SHIFT#	LDC*SIZE

#if	defined(TRMMKERNEL) && !defined(LEFT)
	neg		KK, OFFSET
#endif

	# With no complete four-column block, control goes to .L2, which lies
	# outside this view. The ST that follows the branch presumably executes in
	# its delay slot. That relies on noreorder being in effect, which is not
	# visible here -- confirm in PROLOGUE.
	# ALPHA is spilled to slot 152 because $f15 is reused as B8 in the kernel.
	blez	J, .L2
	ST		ALPHA, 152($sp)

	# Per-panel setup: I is the number of eight-row blocks, AO restarts at A,
	# CO1 to CO4 address four consecutive columns of C, and PREA becomes
	# A plus K shifted left by BASE_SHIFT.
.L48:
	dsra	I, M, 3				#	MR=8
	dsll	PREA, K, BASE_SHIFT

	move	AO, A				#	Reset A
	move	CO1, C

	daddu	CO2, C,   LDC
	daddu	CO3, CO2, LDC

	daddu	CO4, CO3, LDC
	daddu	PREA, A, PREA

#if defined(TRMMKERNEL) && defined(LEFT)
	move	KK, OFFSET
#endif

	# With no complete eight-row block, control goes to .L44, outside this
	# view. The daddu after the branch advances C past the four columns just
	# set up. It presumably sits in the delay slot and so runs on both paths
	# -- confirm noreorder.
	blez	I, .L44
	daddu	C,   CO4, LDC

	# Setup for one 8x4 block of C: pick the A and B start addresses, clear
	# the sixteen accumulators C11 to C44, preload the first A and B operands,
	# touch the C block through CO1 to CO4, and compute the pass count L.
	# MTC, MOV, PLU and MADPS are macros defined outside this file view.
	.align	4
.L481:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) ||\
	(!defined(LEFT) && !defined(TRANSA))
	move	BO, B
#else
	# Skip the first KK steps of both panels: eight elements per step in A,
	# four elements per step in B.
	dsll	L, KK, 3 + BASE_SHIFT	# kk*8mr*datasize
	dsll	TEMP, KK, 2 + BASE_SHIFT

	daddu	AO, AO, L			#	AO point to the data addr
	daddu	BO, B,  TEMP
#endif
	# C11 is written from integer register $0 and then copied into the other
	# fifteen accumulators. The copies are interleaved with the first loads.
	MTC		$0, C11				#	CLEAR RESULTS REGISTERS
	MOV		C12, C11		

	dsll	PREB, K, BASE_SHIFT
	MOV		C21, C11
	MOV		C22, C11
	
	MOV		C31, C11
	MOV		C32, C11
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	MOV		C41, C11
	MOV		C42, C11
	gsLQC1(R12, F1, F0, 0)		#	A1 A2

	MOV		C13, C11
	MOV		C14, C11
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	# Touch the C block, two FETCH loads per column pointer.
	MOV		C23, C11
	FETCH	$0, 0 * SIZE(CO1)
	MOV		C24, C11
	FETCH	$0, 4 * SIZE(CO1)
	
	MOV		C33, C11
	FETCH	$0, 0 * SIZE(CO2)
	MOV		C34, C11
	FETCH	$0, 4 * SIZE(CO2)
	
	daddu	PREB, B, PREB 
	MOV		C43, C11
	FETCH	$0, 0 * SIZE(CO3)

	MOV		C44, C11
	FETCH	$0, 4 * SIZE(CO3)

	# B3 and B4 are derived from B1 and B2 through PLU, presumably a lane
	# shuffle of the register pair -- confirm against the PLU definition.
	PLU		B3,	B1, B1
	FETCH	$0, 0 * SIZE(CO4)

	PLU		B4, B2, B2
	FETCH	$0, 4 * SIZE(CO4)

#if (defined(LEFT) && !defined(TRANSA)) ||\
		 (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK			#	TEMP is the length of the data part
#elif defined(LEFT)
	daddiu	TEMP, KK, 8
#else
	daddiu	TEMP, KK, 4
#endif
	# L is TEMP divided by 64. A zero count skips the unrolled loop.
	dsra	L, TEMP, 6
	blez	L, .L482
	NOP
#else
								#	GEMM PART
	move	BO, B				#	Reset	B
	dsra	L, K, 6				#	UnRoll	K=64

	# Same clearing, preloading and touching sequence as the TRMM variant.
	MTC		$0, C11				#	CLEAR RESULTS REGISTERS
	MOV		C12, C11		

	dsll	PREB, K, BASE_SHIFT
	MOV		C21, C11
	MOV		C22, C11
	
	MOV		C31, C11
	MOV		C32, C11
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	MOV		C41, C11
	MOV		C42, C11
	gsLQC1(R12, F1, F0, 0)		#	A1 A2

	MOV		C13, C11
	MOV		C14, C11
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MOV		C23, C11
	FETCH	$0, 0 * SIZE(CO1)
	MOV		C24, C11
	FETCH	$0, 4 * SIZE(CO1)
	
	MOV		C33, C11
	FETCH	$0, 0 * SIZE(CO2)
	MOV		C34, C11
	FETCH	$0, 4 * SIZE(CO2)
	
	daddu	PREB, B, PREB 
	MOV		C43, C11
	FETCH	$0, 0 * SIZE(CO3)

	MOV		C44, C11
	FETCH	$0, 4 * SIZE(CO3)

	PLU		B3,	B1, B1
	FETCH	$0, 0 * SIZE(CO4)

	# Here the last FETCH follows the branch and presumably fills its delay
	# slot, where the TRMM variant uses an explicit NOP -- confirm noreorder.
	PLU		B4, B2, B2
	blez	L, .L482
	FETCH	$0, 4 * SIZE(CO4)
#endif

.L4810:
	daddiu	L, L, -1
	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 1)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 3)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 0 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 0 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 4 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 2)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 4)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 5)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 4 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 8 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 12 * SIZE(PREA)

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 3)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 6)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 7)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 8 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 16 * SIZE	#	4KR*4NR
	MADPS	C23, C23, A2, B3
	daddiu	AO, AO, 32 * SIZE 	#	4KR*8MR

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 16 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 20 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 0)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 12 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7
	daddiu	PREB, PREB, 16 * SIZE	

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 24 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 28 * SIZE(PREA)
	daddiu	PREA, PREA, 32 * SIZE

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 1)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 3)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 0 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 0 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 4 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 2)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 4)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 5)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 4 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 8 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 12 * SIZE(PREA)

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 3)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 6)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 7)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 8 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 16 * SIZE	#	4KR*4NR
	MADPS	C23, C23, A2, B3
	daddiu	AO, AO, 32 * SIZE 	#	4KR*8MR

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 16 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 20 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 0)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 12 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7
	daddiu	PREB, PREB, 16 * SIZE	

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 24 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 28 * SIZE(PREA)
	daddiu	PREA, PREA, 32 * SIZE

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 1)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 3)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 0 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 0 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 4 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 2)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 4)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 5)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 4 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 8 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 12 * SIZE(PREA)

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 3)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 6)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 7)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 8 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 16 * SIZE	#	4KR*4NR
	MADPS	C23, C23, A2, B3
	daddiu	AO, AO, 32 * SIZE 	#	4KR*8MR

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 16 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 20 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 0)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 12 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7
	daddiu	PREB, PREB, 16 * SIZE	

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 24 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 28 * SIZE(PREA)
	daddiu	PREA, PREA, 32 * SIZE

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 1)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 3)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 0 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 0 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 4 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 2)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 4)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 5)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 4 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 8 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 12 * SIZE(PREA)

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 3)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 6)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 7)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 8 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 16 * SIZE	#	4KR*4NR
	MADPS	C23, C23, A2, B3
	daddiu	AO, AO, 32 * SIZE 	#	4KR*8MR

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 16 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 20 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 0)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 12 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7
	daddiu	PREB, PREB, 16 * SIZE	

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 24 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 28 * SIZE(PREA)
	daddiu	PREA, PREA, 32 * SIZE

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 1)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 3)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 0 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 0 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 4 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 2)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 4)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 5)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 4 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 8 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 12 * SIZE(PREA)

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 3)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 6)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 7)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 8 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 16 * SIZE	#	4KR*4NR
	MADPS	C23, C23, A2, B3
	daddiu	AO, AO, 32 * SIZE 	#	4KR*8MR

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 16 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 20 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 0)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 12 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7
	daddiu	PREB, PREB, 16 * SIZE	

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 24 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 28 * SIZE(PREA)
	daddiu	PREA, PREA, 32 * SIZE

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 1)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 3)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 0 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 0 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 4 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 2)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 4)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 5)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 4 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 8 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 12 * SIZE(PREA)

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 3)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 6)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 7)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 8 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 16 * SIZE	#	4KR*4NR
	MADPS	C23, C23, A2, B3
	daddiu	AO, AO, 32 * SIZE 	#	4KR*8MR

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 16 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 20 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 0)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 12 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7
	daddiu	PREB, PREB, 16 * SIZE	

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 24 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 28 * SIZE(PREA)
	daddiu	PREA, PREA, 32 * SIZE

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 1)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 3)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 0 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 0 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 4 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 2)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 4)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 5)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 4 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 8 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 12 * SIZE(PREA)

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 3)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 6)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 7)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 8 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 16 * SIZE	#	4KR*4NR
	MADPS	C23, C23, A2, B3
	daddiu	AO, AO, 32 * SIZE 	#	4KR*8MR

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 16 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 20 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 0)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 12 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7
	daddiu	PREB, PREB, 16 * SIZE	

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 24 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 28 * SIZE(PREA)
	daddiu	PREA, PREA, 32 * SIZE

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 1)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 3)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 0 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 0 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 4 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 2)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 4)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 5)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 4 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 8 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 12 * SIZE(PREA)

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 3)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 6)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 7)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 8 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 16 * SIZE	#	4KR*4NR
	MADPS	C23, C23, A2, B3
	daddiu	AO, AO, 32 * SIZE 	#	4KR*8MR

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 16 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 20 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 0)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 12 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7
	daddiu	PREB, PREB, 16 * SIZE	

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 24 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 28 * SIZE(PREA)
	daddiu	PREA, PREA, 32 * SIZE

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 1)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 3)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 0 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 0 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 4 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 2)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 4)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 5)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 4 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 8 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 12 * SIZE(PREA)

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 3)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 6)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 7)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 8 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 16 * SIZE	#	4KR*4NR
	MADPS	C23, C23, A2, B3
	daddiu	AO, AO, 32 * SIZE 	#	4KR*8MR

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 16 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 20 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 0)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 12 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7
	daddiu	PREB, PREB, 16 * SIZE	

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 24 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 28 * SIZE(PREA)
	daddiu	PREA, PREA, 32 * SIZE

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 1)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 3)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 0 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 0 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 4 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 2)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 4)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 5)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 4 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 8 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 12 * SIZE(PREA)

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 3)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 6)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 7)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 8 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 16 * SIZE	#	4KR*4NR
	MADPS	C23, C23, A2, B3
	daddiu	AO, AO, 32 * SIZE 	#	4KR*8MR

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 16 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 20 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 0)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 12 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7
	daddiu	PREB, PREB, 16 * SIZE	

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 24 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 28 * SIZE(PREA)
	daddiu	PREA, PREA, 32 * SIZE

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 1)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 3)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 0 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 0 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 4 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 2)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 4)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 5)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 4 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 8 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 12 * SIZE(PREA)

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 3)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 6)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 7)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 8 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 16 * SIZE	#	4KR*4NR
	MADPS	C23, C23, A2, B3
	daddiu	AO, AO, 32 * SIZE 	#	4KR*8MR

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 16 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 20 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 0)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 12 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7
	daddiu	PREB, PREB, 16 * SIZE	

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 24 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 28 * SIZE(PREA)
	daddiu	PREA, PREA, 32 * SIZE

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 1)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 3)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 0 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 0 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 4 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 2)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 4)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 5)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 4 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 8 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 12 * SIZE(PREA)

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 3)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 6)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 7)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 8 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 16 * SIZE	#	4KR*4NR
	MADPS	C23, C23, A2, B3
	daddiu	AO, AO, 32 * SIZE 	#	4KR*8MR

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 16 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 20 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 0)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 12 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7
	daddiu	PREB, PREB, 16 * SIZE	

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 24 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 28 * SIZE(PREA)
	daddiu	PREA, PREA, 32 * SIZE

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 1)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 3)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 0 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 0 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 4 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 2)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 4)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 5)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 4 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 8 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 12 * SIZE(PREA)

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 3)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 6)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 7)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 8 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 16 * SIZE	#	4KR*4NR
	MADPS	C23, C23, A2, B3
	daddiu	AO, AO, 32 * SIZE 	#	4KR*8MR

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 16 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 20 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 0)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 12 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7
	daddiu	PREB, PREB, 16 * SIZE	

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 24 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 28 * SIZE(PREA)
	daddiu	PREA, PREA, 32 * SIZE

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 1)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 3)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 0 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 0 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 4 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 2)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 4)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 5)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 4 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 8 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 12 * SIZE(PREA)

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 3)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 6)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 7)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 8 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 16 * SIZE	#	4KR*4NR
	MADPS	C23, C23, A2, B3
	daddiu	AO, AO, 32 * SIZE 	#	4KR*8MR

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 16 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 20 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 0)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 12 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7
	daddiu	PREB, PREB, 16 * SIZE	

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 24 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 28 * SIZE(PREA)
	daddiu	PREA, PREA, 32 * SIZE

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 1)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 3)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 0 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 0 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 4 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 2)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 4)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 5)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 4 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 8 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 12 * SIZE(PREA)

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 3)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 6)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 7)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 8 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 16 * SIZE	#	4KR*4NR
	MADPS	C23, C23, A2, B3
	daddiu	AO, AO, 32 * SIZE 	#	4KR*8MR

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 16 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 20 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 0)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 12 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7
	daddiu	PREB, PREB, 16 * SIZE	

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 24 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 28 * SIZE(PREA)
	daddiu	PREA, PREA, 32 * SIZE

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 1)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 3)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 0 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 0 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 4 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 2)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 4)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 5)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 4 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 8 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 12 * SIZE(PREA)

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 3)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 6)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 7)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 8 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 16 * SIZE	#	4KR*4NR
	MADPS	C23, C23, A2, B3
	daddiu	AO, AO, 32 * SIZE 	#	4KR*8MR

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 16 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 20 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 0)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 12 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7
	daddiu	PREB, PREB, 16 * SIZE	

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 24 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 28 * SIZE(PREA)
	daddiu	PREA, PREA, 32 * SIZE

	MADPS	C34, C34, A7, B8
	bgtz	L, .L4810
	MADPS	C44, C44, A8, B8

	.align	4
.L482:
##	K-remainder block, taken when the value-32 bit of the K count is set
##	(the count is read from TEMP instead of K in TRMMKERNEL builds);
##	otherwise control jumps straight to .L483.
##
##	The body is straight-line code made of 8 identical groups.  Every group
##	advances BO by 16 * SIZE and AO by 32 * SIZE (annotated 4KR*4NR and
##	4KR*8MR below), so the whole block consumes 8 x 4 = 32 steps along K.
##
##	On entry A1-A4 and B1-B4 must already hold valid operands: the first
##	multiply-adds read them before anything is loaded here.  On exit the
##	last stage has reloaded A1-A4, B1/B2 and rebuilt B3/B4, and C11..C44
##	keep the running sums.
##
##	R12 / R13 are the plain numbers 12 / 13, i.e. the register numbers of
##	AO ($12) and BO ($13); the quad-load macro encodes them into a raw .word.
##	NOTE(review): presumably that word is the Loongson quad-word load that
##	fills two FP registers, with its offset counted in 16-byte units -
##	confirm against the Loongson-3A manual.
##	NOTE(review): MADPS and PLU come from common.h; presumably paired-single
##	multiply-add and pair-lower-upper - confirm there.
##
##	FETCH is ld (see the top of this file) with $0 as destination, so the
##	loaded value is discarded; the accesses only touch the PREA / PREB
##	streams, presumably as software prefetch.
##
##	The existing B3 B4 notes sit on loads whose targets are F12 / F13, which
##	are the B5 / B6 register macros; presumably those notes name the data
##	order in the packed panel rather than the register macros.
#ifndef TRMMKERNEL
	andi	L, K, 32
#else
	andi	L, TEMP, 32
#endif
	## L is now either 0 or 32; skip the block when the bit is clear.
	## The NOP below presumably fills the branch delay slot.
	blez	L, .L483
	NOP

	## group 1 of 8, stage 1: accumulate A1-A4 x B1-B4 while reloading A5-A8
	## and B5/B6; the PLU pair rebuilds B7/B8 from the fresh B5/B6
	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 1)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 3)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 0 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 0 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 4 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	## stage 2: accumulate A5-A8 x B5-B8 while reloading A1-A4 and B1/B2;
	## the PLU pair rebuilds B3/B4 from the fresh B1/B2
	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 2)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 4)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 5)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 4 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 8 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 12 * SIZE(PREA)

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	## stage 3: same shape as stage 1 with the next load offsets; AO and BO
	## are advanced here, after the last loads that use the old base
	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 3)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 6)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 7)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 8 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 16 * SIZE	#	4KR*4NR
	MADPS	C23, C23, A2, B3
	daddiu	AO, AO, 32 * SIZE 	#	4KR*8MR

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 16 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 20 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	## stage 4: same shape as stage 2; the load offsets restart at 0 because
	## they are relative to the already advanced AO / BO.  PREB and PREA are
	## advanced here, after their last touches of this group.
	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 0)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 12 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7
	daddiu	PREB, PREB, 16 * SIZE	

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 24 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 28 * SIZE(PREA)
	daddiu	PREA, PREA, 32 * SIZE

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	## group 2 of 8: same four stages as group 1
	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 1)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 3)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 0 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 0 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 4 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 2)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 4)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 5)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 4 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 8 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 12 * SIZE(PREA)

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 3)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 6)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 7)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 8 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 16 * SIZE	#	4KR*4NR
	MADPS	C23, C23, A2, B3
	daddiu	AO, AO, 32 * SIZE 	#	4KR*8MR

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 16 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 20 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 0)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 12 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7
	daddiu	PREB, PREB, 16 * SIZE	

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 24 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 28 * SIZE(PREA)
	daddiu	PREA, PREA, 32 * SIZE

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	## group 3 of 8: same four stages as group 1
	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 1)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 3)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 0 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 0 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 4 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 2)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 4)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 5)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 4 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 8 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 12 * SIZE(PREA)

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 3)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 6)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 7)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 8 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 16 * SIZE	#	4KR*4NR
	MADPS	C23, C23, A2, B3
	daddiu	AO, AO, 32 * SIZE 	#	4KR*8MR

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 16 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 20 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 0)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 12 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7
	daddiu	PREB, PREB, 16 * SIZE	

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 24 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 28 * SIZE(PREA)
	daddiu	PREA, PREA, 32 * SIZE

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	## group 4 of 8: same four stages as group 1
	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 1)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 3)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 0 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 0 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 4 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 2)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 4)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 5)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 4 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 8 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 12 * SIZE(PREA)

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 3)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 6)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 7)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 8 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 16 * SIZE	#	4KR*4NR
	MADPS	C23, C23, A2, B3
	daddiu	AO, AO, 32 * SIZE 	#	4KR*8MR

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 16 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 20 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 0)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 12 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7
	daddiu	PREB, PREB, 16 * SIZE	

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 24 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 28 * SIZE(PREA)
	daddiu	PREA, PREA, 32 * SIZE

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	## group 5 of 8: same four stages as group 1
	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 1)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 3)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 0 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 0 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 4 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 2)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 4)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 5)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 4 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 8 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 12 * SIZE(PREA)

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 3)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 6)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 7)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 8 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 16 * SIZE	#	4KR*4NR
	MADPS	C23, C23, A2, B3
	daddiu	AO, AO, 32 * SIZE 	#	4KR*8MR

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 16 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 20 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 0)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 12 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7
	daddiu	PREB, PREB, 16 * SIZE	

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 24 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 28 * SIZE(PREA)
	daddiu	PREA, PREA, 32 * SIZE

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	## group 6 of 8: same four stages as group 1
	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 1)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 3)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 0 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 0 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 4 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 2)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 4)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 5)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 4 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 8 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 12 * SIZE(PREA)

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 3)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 6)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 7)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 8 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 16 * SIZE	#	4KR*4NR
	MADPS	C23, C23, A2, B3
	daddiu	AO, AO, 32 * SIZE 	#	4KR*8MR

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 16 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 20 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 0)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 12 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7
	daddiu	PREB, PREB, 16 * SIZE	

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 24 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 28 * SIZE(PREA)
	daddiu	PREA, PREA, 32 * SIZE

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	## group 7 of 8: same four stages as group 1
	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 1)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 3)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 0 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 0 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 4 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 2)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 4)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 5)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 4 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 8 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 12 * SIZE(PREA)

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 3)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 6)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 7)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 8 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 16 * SIZE	#	4KR*4NR
	MADPS	C23, C23, A2, B3
	daddiu	AO, AO, 32 * SIZE 	#	4KR*8MR

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 16 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 20 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 0)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 12 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7
	daddiu	PREB, PREB, 16 * SIZE	

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 24 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 28 * SIZE(PREA)
	daddiu	PREA, PREA, 32 * SIZE

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	## group 8 of 8: same four stages as group 1; its last stage leaves
	## A1-A4 and B1-B4 loaded for whichever block runs next
	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 1)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 3)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 0 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 0 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 4 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 2)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 4)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 5)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 4 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 8 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 12 * SIZE(PREA)

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 3)	#	B3 B4

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 6)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 7)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 8 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 16 * SIZE	#	4KR*4NR
	MADPS	C23, C23, A2, B3
	daddiu	AO, AO, 32 * SIZE 	#	4KR*8MR

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 16 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 20 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 0)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 12 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7
	daddiu	PREB, PREB, 16 * SIZE	

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 24 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 28 * SIZE(PREA)
	daddiu	PREA, PREA, 32 * SIZE

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8


	.align	4
.L483:
#	Remainder section of the 8x4 tile: taken when bit 4 of the K count is set
#	(K for plain GEMM, TEMP for TRMM); otherwise control skips to .L484.
#	The body is 16 fully unrolled K steps written as four identical groups of
#	four steps.  Each group advances AO by 32 * SIZE and BO by 16 * SIZE, and
#	PREA / PREB by the same amounts.  On entry A1-A4 and B1-B4 are consumed
#	before anything is loaded here, so they must already hold the operands of
#	the first step; the last loads of every group refill them for what follows.
#	FETCH is ld into $0, i.e. a load whose result is discarded.
#	NOTE(review): PLU with identical sources presumably yields the pair with
#	its two halves swapped - the macro is defined outside this chunk.
#ifndef TRMMKERNEL
	andi	L, K, 16
#else
	andi	L, TEMP, 16
#endif
	blez	L, .L484
	NOP

#	Group 1 of 4 (K steps 1-4)
	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 1)	#	B5 B6 (F12 / F13)

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 3)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 0 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 0 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 4 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 2)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 4)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 5)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 4 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 8 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 12 * SIZE(PREA)

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 3)	#	B5 B6 (F12 / F13)

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 6)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 7)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 8 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 16 * SIZE	#	4KR*4NR
	MADPS	C23, C23, A2, B3
	daddiu	AO, AO, 32 * SIZE 	#	4KR*8MR

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 16 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 20 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2 (BO already advanced)

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 0)		#	A1 A2 (AO already advanced)

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 12 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7
	daddiu	PREB, PREB, 16 * SIZE	

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 24 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 28 * SIZE(PREA)
	daddiu	PREA, PREA, 32 * SIZE

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

#	Group 2 of 4 (K steps 5-8)
	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 1)	#	B5 B6 (F12 / F13)

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 3)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 0 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 0 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 4 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 2)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 4)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 5)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 4 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 8 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 12 * SIZE(PREA)

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 3)	#	B5 B6 (F12 / F13)

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 6)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 7)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 8 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 16 * SIZE	#	4KR*4NR
	MADPS	C23, C23, A2, B3
	daddiu	AO, AO, 32 * SIZE 	#	4KR*8MR

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 16 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 20 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2 (BO already advanced)

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 0)		#	A1 A2 (AO already advanced)

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 12 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7
	daddiu	PREB, PREB, 16 * SIZE	

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 24 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 28 * SIZE(PREA)
	daddiu	PREA, PREA, 32 * SIZE

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

#	Group 3 of 4 (K steps 9-12)
	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 1)	#	B5 B6 (F12 / F13)

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 3)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 0 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 0 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 4 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 2)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 4)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 5)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 4 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 8 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 12 * SIZE(PREA)

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 3)	#	B5 B6 (F12 / F13)

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 6)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 7)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 8 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 16 * SIZE	#	4KR*4NR
	MADPS	C23, C23, A2, B3
	daddiu	AO, AO, 32 * SIZE 	#	4KR*8MR

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 16 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 20 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2 (BO already advanced)

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 0)		#	A1 A2 (AO already advanced)

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 12 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7
	daddiu	PREB, PREB, 16 * SIZE	

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 24 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 28 * SIZE(PREA)
	daddiu	PREA, PREA, 32 * SIZE

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

#	Group 4 of 4 (K steps 13-16)
	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 1)	#	B5 B6 (F12 / F13)

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 3)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 0 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 0 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 4 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 2)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 4)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 5)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 4 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 8 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 12 * SIZE(PREA)

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 3)	#	B5 B6 (F12 / F13)

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 6)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 7)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 8 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 16 * SIZE	#	4KR*4NR
	MADPS	C23, C23, A2, B3
	daddiu	AO, AO, 32 * SIZE 	#	4KR*8MR

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 16 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 20 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2 (BO already advanced)

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 0)		#	A1 A2 (AO already advanced)

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 12 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7
	daddiu	PREB, PREB, 16 * SIZE	

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 24 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 28 * SIZE(PREA)
	daddiu	PREA, PREA, 32 * SIZE

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8


	.align	4
.L484:				
#	Remainder section of the 8x4 tile: taken when bit 3 of the K count is set
#	(K for plain GEMM, TEMP for TRMM); otherwise control skips to .L485.
#	The body is 8 fully unrolled K steps written as two identical groups of
#	four steps; each group advances AO / PREA by 32 * SIZE and BO / PREB by
#	16 * SIZE.  A1-A4 and B1-B4 must hold the first step operands on entry and
#	are refilled by the last loads of each group.
#ifndef TRMMKERNEL
	andi	L, K, 8
#else
	andi	L, TEMP, 8
#endif
	blez	L, .L485
	NOP

#	Group 1 of 2 (K steps 1-4)
	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 1)	#	B5 B6 (F12 / F13)

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 3)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 0 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 0 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 4 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 2)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 4)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 5)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 4 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 8 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 12 * SIZE(PREA)

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 3)	#	B5 B6 (F12 / F13)

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 6)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 7)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 8 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 16 * SIZE	#	4KR*4NR
	MADPS	C23, C23, A2, B3
	daddiu	AO, AO, 32 * SIZE 	#	4KR*8MR

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 16 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 20 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2 (BO already advanced)

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 0)		#	A1 A2 (AO already advanced)

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 12 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7
	daddiu	PREB, PREB, 16 * SIZE	

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 24 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 28 * SIZE(PREA)
	daddiu	PREA, PREA, 32 * SIZE

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

#	Group 2 of 2 (K steps 5-8)
	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 1)	#	B5 B6 (F12 / F13)

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 3)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 0 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 0 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 4 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 2)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 4)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 5)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 4 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 8 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 12 * SIZE(PREA)

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 3)	#	B5 B6 (F12 / F13)

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 6)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 7)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 8 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 16 * SIZE	#	4KR*4NR
	MADPS	C23, C23, A2, B3
	daddiu	AO, AO, 32 * SIZE 	#	4KR*8MR

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 16 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 20 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2 (BO already advanced)

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 0)		#	A1 A2 (AO already advanced)

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 12 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7
	daddiu	PREB, PREB, 16 * SIZE	

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 24 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 28 * SIZE(PREA)
	daddiu	PREA, PREA, 32 * SIZE

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8


	.align	4
.L485:
#	Remainder section of the 8x4 tile: taken when bit 2 of the K count is set
#	(K for plain GEMM, TEMP for TRMM); otherwise control skips to .L486.
#	The body is 4 unrolled K steps.  AO / PREA advance by 32 * SIZE and
#	BO / PREB by 16 * SIZE.  A1-A4 and B1-B4 must hold the first step operands
#	on entry; the final loads and PLU pair refill them for the next section.
#ifndef TRMMKERNEL
	andi	L, K, 4
#else
	andi	L, TEMP, 4
#endif
	blez	L, .L486
	NOP

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 1)	#	B5 B6 (F12 / F13)

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 3)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 0 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 0 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 4 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 2)		#	B1 B2

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 4)		#	A1 A2

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 5)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 4 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 8 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 12 * SIZE(PREA)

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 3)	#	B5 B6 (F12 / F13)

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 6)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 7)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 8 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 16 * SIZE	#	4KR*4NR
	MADPS	C23, C23, A2, B3
	daddiu	AO, AO, 32 * SIZE 	#	4KR*8MR

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 16 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 20 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2 (BO already advanced)

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 0)		#	A1 A2 (AO already advanced)

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 12 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7
	daddiu	PREB, PREB, 16 * SIZE	

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 24 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 28 * SIZE(PREA)
	daddiu	PREA, PREA, 32 * SIZE

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8


	.align	4
.L486:
#	Remainder section of the 8x4 tile: taken when bit 1 of the K count is set
#	(K for plain GEMM, TEMP for TRMM); otherwise control skips to .L487.
#	The body is 2 unrolled K steps.  AO / PREA advance by 16 * SIZE and
#	BO / PREB by 8 * SIZE.  The first step uses A1-A4 with B1-B4, the second
#	uses A5-A8 with B5-B8; the trailing loads and PLU pair refill A1-A4 and
#	B1-B4 for the section that follows.
#ifndef TRMMKERNEL
	andi	L, K, 2
#else
	andi	L, TEMP, 2
#endif
	blez	L, .L487
	NOP

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1
	gsLQC1(R13, F13, F12, 1)	#	B5 B6 (F12 / F13)

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1
	gsLQC1(R12, F7, F6, 3)		#	A7 A8

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2
	FETCH	$0, 0 * SIZE(PREB)

	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 8 * SIZE	#	2KR*4NR
	MADPS	C23, C23, A2, B3
	daddiu	AO, AO, 16 * SIZE 	#	2KR*8MR

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	PLU		B7,	B5, B5
	FETCH	$0, 0 * SIZE(PREA)

	MADPS	C24, C24, A2, B4
	PLU		B8, B6, B6
	FETCH	$0, 4 * SIZE(PREA)

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4

	MADPS	C11, C11, A5, B5
	MADPS	C21, C21, A6, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2 (BO already advanced)

	MADPS	C12, C12, A5, B6
	MADPS	C22, C22, A6, B6
	gsLQC1(R12, F1, F0, 0)		#	A1 A2 (AO already advanced)

	MADPS	C31, C31, A7, B5
	MADPS	C41, C41, A8, B5
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MADPS	C32, C32, A7, B6
	MADPS	C42, C42, A8, B6
	FETCH	$0, 4 * SIZE(PREB)

	MADPS	C13, C13, A5, B7
	MADPS	C23, C23, A6, B7
	daddiu	PREB, PREB, 8 * SIZE	

	MADPS	C33, C33, A7, B7
	MADPS	C43, C43, A8, B7

	MADPS	C14, C14, A5, B8
	PLU		B3,	B1, B1
	FETCH	$0, 8 * SIZE(PREA)

	MADPS	C24, C24, A6, B8
	PLU		B4, B2, B2
	FETCH	$0, 12 * SIZE(PREA)

	MADPS	C34, C34, A7, B8
	MADPS	C44, C44, A8, B8
	daddiu	PREA, PREA, 16 * SIZE


	.align	4
.L487:
#	Last remainder section of the 8x4 tile: taken when bit 0 of the K count is
#	set (K for plain GEMM, TEMP for TRMM); otherwise control goes straight to
#	the write-back at .L480.  One K step is accumulated from the operands
#	already held in A1-A4 and B1-B4; nothing is loaded from AO or BO here.
#ifndef TRMMKERNEL
	andi	L, K, 1
#else
	andi	L, TEMP, 1
#endif
	blez	L, .L480
	LD		ALPHA, 152($sp)		#	Occupies the slot after the branch, where other branches carry a NOP
#	NOTE(review): presumably a branch delay slot, so ALPHA is loaded on both
#	paths.  ALPHA is $f15, the same register as B8, which is not read below.

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1

	MADPS	C12, C12, A1, B2
	MADPS	C22, C22, A2, B2

	MADPS	C31, C31, A3, B1
	MADPS	C41, C41, A4, B1

	MADPS	C32, C32, A3, B2
	MADPS	C42, C42, A4, B2

	MADPS	C13, C13, A1, B3
	daddiu	BO, BO, 4 * SIZE	#	1KR*4NR
	MADPS	C23, C23, A2, B3
	daddiu	AO, AO, 8 * SIZE 	#	1KR*8MR

	MADPS	C33, C33, A3, B3
	MADPS	C43, C43, A4, B3

	MADPS	C14, C14, A1, B4
	MADPS	C24, C24, A2, B4

	MADPS	C34, C34, A3, B4
	MADPS	C44, C44, A4, B4



	.align	4
.L480:							#	Write Back
#	Write-back of the finished 8x4 tile.  The sixteen accumulators C11..C44 are
#	scaled by ALPHA and stored as eight consecutive elements in each of the
#	four columns CO1..CO4.  CVTU extracts the upper half of a pair (odd element
#	offsets); the accumulator register itself supplies the even offsets.
#	Element tags cXY below mean: column pointer COX, element Y (1-based), as
#	fixed by the offset of the matching ST.
#	Plain GEMM: the old value of C is loaded and ALPHA * acc is added to it.
#	NOTE(review): assumes MADD fd, fr, fs, ft computes fr + fs * ft - the macro
#	is defined outside this chunk.
#	TRMM: C is not read, the result is ALPHA * acc only; afterwards AO, BO and
#	KK are adjusted for the next tile.
#	Both variants decrement I, step CO1..CO4 by 8 * SIZE and loop back to
#	.L481 while I is still positive.
#ifndef TRMMKERNEL
	daddiu	I, I, -1
	CVTU	A1, C13				#	A1=C13.upper=c12
	CVTU	A2, C11				#	A2=C11.upper=c22

	CVTU	A3, C23				#	A3=C23.upper=c14
	LD		B1, 1 * SIZE(CO1)

	CVTU	A4, C21				#	A4=C21.upper=c24
	LD		B2, 1 * SIZE(CO2)

	CVTU	A5, C33				#	A5=C33.upper=c16
	LD		B3, 3 * SIZE(CO1)

	CVTU	A6, C31				#	A6=C31.upper=c26
	LD		B4, 3 * SIZE(CO2)

	CVTU	A7, C43				#	A7=C43.upper=c18
	LD		B5, 5 * SIZE(CO1)

	CVTU	A8, C41				#	A8=C41.upper=c28
	LD		B6, 5 * SIZE(CO2)

	MADD	A1, B1, A1, ALPHA 		#	c12		
	LD		B7, 7 * SIZE(CO1)

	MADD	A2, B2, A2, ALPHA 		#	c22
	LD		B1, 7 * SIZE(CO2)		#	B1 is free again after the c12 update

	MADD	A3, B3, A3, ALPHA 		#	c14
	LD		B2, 0 * SIZE(CO1)

	MADD	A4, B4, A4, ALPHA 		#	c24
	LD		B3, 0 * SIZE(CO2)
	
	MADD	A5, B5, A5, ALPHA 		#	c16
	LD		B4, 2 * SIZE(CO1)

	MADD	A6, B6, A6, ALPHA 		#	c26
	LD		B5, 2 * SIZE(CO2)

	MADD	A7, B7, A7, ALPHA 		#	c18
	LD		B6, 4 * SIZE(CO1)

	MADD	A8, B1, A8, ALPHA 		#	c28
	ST		A1, 1 * SIZE(CO1)

	MADD	C11, B2, C11, ALPHA 		#	c11
	LD		B7, 4 * SIZE(CO2)

	MADD	C13, B3, C13, ALPHA 		#	c21
	ST		A2, 1 * SIZE(CO2)

	MADD	C21, B4, C21, ALPHA 		#	c13
	LD		A1, 6 * SIZE(CO1)		#	A1 reused as a temporary after its store

	MADD	C23, B5, C23, ALPHA 		#	c23
	ST		A3, 3 * SIZE(CO1)

	MADD	C31, B6, C31, ALPHA 		#	c15
	LD		A2, 6 * SIZE(CO2)		#	A2 reused as a temporary after its store
	
	MADD	C33, B7, C33, ALPHA 		#	c25
	ST		A4, 3 * SIZE(CO2)

	ST		A5, 5 * SIZE(CO1)
	ST		A6, 5 * SIZE(CO2)
	ST		A7, 7 * SIZE(CO1)
	ST		A8, 7 * SIZE(CO2)

	MADD	C41, A1, C41, ALPHA 		#	c17
	ST		C11, 0 * SIZE(CO1)

	MADD	C43, A2, C43, ALPHA 		#	c27
	ST		C13, 0 * SIZE(CO2)
	
	ST		C21, 2 * SIZE(CO1)
	ST		C23, 2 * SIZE(CO2)
	ST		C31, 4 * SIZE(CO1)
	ST		C33, 4 * SIZE(CO2)
	ST		C41, 6 * SIZE(CO1)

	CVTU	A1, C14				#	A1=C14.upper=c32
	ST		C43, 6 * SIZE(CO2)

	CVTU	A2, C12				#	A2=C12.upper=c42
	LD		B1, 1 * SIZE(CO3)

	CVTU	A3, C24				#	A3=C24.upper=c34
	LD		B2, 1 * SIZE(CO4)

	CVTU	A4, C22				#	A4=C22.upper=c44
	LD		B3, 3 * SIZE(CO3)

	CVTU	A5, C34				#	A5=C34.upper=c36
	LD		B4, 3 * SIZE(CO4)

	CVTU	A6, C32				#	A6=C32.upper=c46
	LD		B5, 5 * SIZE(CO3)

	CVTU	A7, C44				#	A7=C44.upper=c38
	LD		B6, 5 * SIZE(CO4)

	CVTU	A8, C42				#	A8=C42.upper=c48
	LD		B7, 7 * SIZE(CO3)

	MADD	A1, B1, A1, ALPHA 		#	c32
	LD		C11, 7 * SIZE(CO4)		#	C11.. already stored, reused as temporaries

	MADD	A2, B2, A2, ALPHA 		#	c42
	LD		C13, 0 * SIZE(CO3)
	
	MADD	A3, B3, A3, ALPHA 		#	c34
	LD		C21, 0 * SIZE(CO4)
	
	MADD	A4, B4, A4, ALPHA 		#	c44
	LD		C23, 2 * SIZE(CO3)

	MADD	A5, B5, A5, ALPHA 		#	c36
	LD		C31, 2 * SIZE(CO4)
	
	MADD	A6, B6, A6, ALPHA 		#	c46
	LD		C33, 4 * SIZE(CO3)
	
	MADD	A7, B7, A7, ALPHA 		#	c38
	LD		C41, 4 * SIZE(CO4)
	
	MADD	A8, C11, A8, ALPHA 		#	c48
	ST		A1, 1 * SIZE(CO3)

	MADD	C12, C13, C12, ALPHA 		#	c31
	LD		C43, 6 * SIZE(CO3)
	
	MADD	C14, C21, C14, ALPHA 		#	c41
	ST		A2, 1 * SIZE(CO4)

	MADD	C22, C23, C22, ALPHA 		#	c33
	LD		B1, 6 * SIZE(CO4)
	
	MADD	C24, C31, C24, ALPHA 		#	c43
	ST		A3, 3 * SIZE(CO3)

	MADD	C32, C33, C32, ALPHA 		#	c35
	ST		A4, 3 * SIZE(CO4)

	MADD	C34, C41, C34, ALPHA 		#	c45
	ST		A5, 5 * SIZE(CO3)

	MADD	C42, C43, C42, ALPHA 		#	c37
	ST		A6, 5 * SIZE(CO4)

	ST		A7, 7 * SIZE(CO3)
	NOP

	MADD	C44, B1, C44, ALPHA 		#	c47
	ST		A8, 7 * SIZE(CO4)

	ST		C12, 0 * SIZE(CO3)
	ST		C14, 0 * SIZE(CO4)
	ST		C22, 2 * SIZE(CO3)
	ST		C24, 2 * SIZE(CO4)
	ST		C32, 4 * SIZE(CO3)
	ST		C34, 4 * SIZE(CO4)
	ST		C42, 6 * SIZE(CO3)
	ST		C44, 6 * SIZE(CO4)

	daddiu	CO1, CO1, 8 * SIZE
	daddiu	CO2, CO2, 8 * SIZE
	daddiu	CO3, CO3, 8 * SIZE
	bgtz	I, .L481
	daddiu	CO4, CO4, 8 * SIZE		#	Placed in the slot after the branch
#else
	daddiu	I, I, -1
	CVTU	A1, C13				#	A1=C13.upper=c12
	CVTU	A2, C11				#	A2=C11.upper=c22
	CVTU	A3, C23				#	A3=C23.upper=c14
	CVTU	A4, C21				#	A4=C21.upper=c24
	CVTU	A5, C33				#	A5=C33.upper=c16
	CVTU	A6, C31				#	A6=C31.upper=c26
	CVTU	A7, C43				#	A7=C43.upper=c18
	CVTU	A8, C41				#	A8=C41.upper=c28

	MUL		A1, A1, ALPHA 		#	c12		
	MUL		A2, A2, ALPHA 		#	c22
	MUL		A3, A3, ALPHA 		#	c14
	MUL		A4, A4, ALPHA 		#	c24
	MUL		A5, A5, ALPHA 		#	c16
	MUL		A6, A6, ALPHA 		#	c26
	MUL		A7, A7, ALPHA 		#	c18
	MUL		A8, A8, ALPHA 		#	c28

	MUL	C11, C11, ALPHA 		#	c11
	ST	A1, 1 * SIZE(CO1)

	MUL	C13, C13, ALPHA 		#	c21
	ST	A2, 1 * SIZE(CO2)

	MUL	C21, C21, ALPHA 		#	c13
	ST	A3, 3 * SIZE(CO1)

	MUL	C23, C23, ALPHA 		#	c23
	ST	A4, 3 * SIZE(CO2)

	MUL	C31, C31, ALPHA 		#	c15
	ST	A5, 5 * SIZE(CO1)
	
	MUL	C33, C33, ALPHA 		#	c25
	ST	A6, 5 * SIZE(CO2)

	MUL	C41, C41, ALPHA 		#	c17
	ST	A7, 7 * SIZE(CO1)

	MUL	C43, C43, ALPHA 		#	c27
	ST	A8, 7 * SIZE(CO2)

	CVTU	A1, C14				#	A1=C14.upper=c32
	ST		C11, 0 * SIZE(CO1)

	CVTU	A2, C12				#	A2=C12.upper=c42
	ST		C13, 0 * SIZE(CO2)

	CVTU	A3, C24				#	A3=C24.upper=c34
	ST		C21, 2 * SIZE(CO1)
	
	CVTU	A4, C22				#	A4=C22.upper=c44
	ST		C23, 2 * SIZE(CO2)

	CVTU	A5, C34				#	A5=C34.upper=c36
	ST		C31, 4 * SIZE(CO1)

	CVTU	A6, C32				#	A6=C32.upper=c46
	ST		C33, 4 * SIZE(CO2)

	CVTU	A7, C44				#	A7=C44.upper=c38
	ST		C41, 6 * SIZE(CO1)

	CVTU	A8, C42				#	A8=C42.upper=c48
	ST		C43, 6 * SIZE(CO2)

	MUL		A1, A1, ALPHA 		#	c32
	MUL		A2, A2, ALPHA 		#	c42
	MUL		A3, A3, ALPHA 		#	c34
	MUL		A4, A4, ALPHA 		#	c44
	MUL		A5, A5, ALPHA 		#	c36
	MUL		A6, A6, ALPHA 		#	c46
	MUL		A7, A7, ALPHA 		#	c38
	MUL		A8, A8, ALPHA 		#	c48

	MUL	C12, C12, ALPHA 		#	c31
	ST	A1, 1 * SIZE(CO3)

	MUL	C14, C14, ALPHA 		#	c41
	ST	A2, 1 * SIZE(CO4)

	MUL	C22, C22, ALPHA 		#	c33
	ST	A3, 3 * SIZE(CO3)

	MUL	C24, C24, ALPHA 		#	c43
	ST	A4, 3 * SIZE(CO4)

	MUL	C32, C32, ALPHA 		#	c35
	ST	A5, 5 * SIZE(CO3)

	MUL	C34, C34, ALPHA 		#	c45
	ST	A6, 5 * SIZE(CO4)

	MUL	C42, C42, ALPHA 		#	c37
	ST	A7, 7 * SIZE(CO3)

	MUL	C44, C44, ALPHA 		#	c47
	ST	A8, 7 * SIZE(CO4)

	ST		C12, 0 * SIZE(CO3)
	ST		C14, 0 * SIZE(CO4)
	ST		C22, 2 * SIZE(CO3)
	ST		C24, 2 * SIZE(CO4)
	ST		C32, 4 * SIZE(CO3)
	ST		C34, 4 * SIZE(CO4)
	ST		C42, 6 * SIZE(CO3)
	ST		C44, 6 * SIZE(CO4)

	daddiu	CO1, CO1, 8 * SIZE
	daddiu	CO2, CO2, 8 * SIZE
	daddiu	CO3, CO3, 8 * SIZE
	daddiu	CO4, CO4, 8 * SIZE

#if ( defined(LEFT) && defined(TRANSA)) ||\
		(!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, K, KK			#	TEMP = K - KK
#ifdef	LEFT
	daddiu	TEMP, TEMP, -8			#	minus the 8 rows of this tile
#else
	daddiu	TEMP, TEMP, -4			#	minus the 4 columns of this tile
#endif
	dsll	L, TEMP, 3 + BASE_SHIFT		#	byte offset for AO: 8 elements per K step
	dsll	TEMP, TEMP, 2 + BASE_SHIFT	#	byte offset for BO: 4 elements per K step
	daddu	AO, AO, L
	daddu	BO, BO, TEMP
#endif

#ifdef	LEFT
	daddiu	KK, KK, 8
#endif

	bgtz	I, .L481
	NOP
#endif

	.align	4
.L44:
#	Guard for the 4-row tile: when bit 2 of M is clear, skip to .L42.
	andi	I, M, 4				#	MR=4
	blez	I, .L42
	NOP

	.align	4
.L441:
#	Setup for one 4x4 tile.  Points BO at the B panel (with a KK based offset
#	on AO and BO in the TRMM variants that need it), clears the accumulators
#	C11..C44 by copying a zeroed C11, preloads B1 B2 from BO and A1 A2 from AO,
#	builds B3 B4 from them with PLU, touches the first element of each of
#	CO1..CO4 with FETCH, and sets PREB = B + (K << BASE_SHIFT).
#	L receives the number of 4-step loop iterations (count >> 2); when it is
#	not positive the main loop .L4410 is skipped and control goes to .L442.
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) ||\
	(!defined(LEFT) && !defined(TRANSA))
	move	BO, B				#	Reset B
#else
	dsll	L, KK, 2 + BASE_SHIFT		#	byte offset of KK steps, 4 elements each
	dsll	TEMP, KK, 2 + BASE_SHIFT
	daddu	AO, AO, L
	daddu	BO, B,  TEMP
#endif
	MTC		$0, C11				#	CLEAR RESULT REGISTERS
	MOV		C12, C11		

	dsll	PREB, K, BASE_SHIFT
	MOV		C21, C11
	MOV		C22, C11
	
	MOV		C31, C11
	MOV		C32, C11
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	MOV		C41, C11
	MOV		C42, C11
	gsLQC1(R12, F1, F0, 0)		#	A1 A2

	MOV		C13, C11
	MOV		C14, C11

	MOV		C23, C11
	FETCH	$0, 0 * SIZE(CO1)
	MOV		C24, C11
	
	MOV		C33, C11
	FETCH	$0, 0 * SIZE(CO2)
	MOV		C34, C11
	
	daddu	PREB, B, PREB 
	MOV		C43, C11
	FETCH	$0, 0 * SIZE(CO3)

	MOV		C44, C11
	PLU		B3,	B1, B1

	FETCH	$0, 0 * SIZE(CO4)
	PLU		B4, B2, B2

#if (defined(LEFT) && !defined(TRANSA)) ||\
		(!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddu	TEMP, KK, 4			#	Both remaining cases add 4 (tile is 4x4)
#else
	daddu	TEMP, KK, 4
#endif
	dsra	L, TEMP, 2			#	Loop count: TEMP / 4
	blez	L, .L442
	NOP

#else
	move	BO, B				#	Reset	B
	dsra	L, K, 2				#	UnRoll	K=4

	MTC		$0, C11				#	CLEAR RESULT REGISTERS
	MOV		C12, C11		

	dsll	PREB, K, BASE_SHIFT
	MOV		C21, C11
	MOV		C22, C11
	
	MOV		C31, C11
	MOV		C32, C11
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	MOV		C41, C11
	MOV		C42, C11
	gsLQC1(R12, F1, F0, 0)		#	A1 A2

	MOV		C13, C11
	MOV		C14, C11

	MOV		C23, C11
	FETCH	$0, 0 * SIZE(CO1)
	MOV		C24, C11
	
	MOV		C33, C11
	FETCH	$0, 0 * SIZE(CO2)
	MOV		C34, C11
	
	daddu	PREB, B, PREB 
	MOV		C43, C11
	FETCH	$0, 0 * SIZE(CO3)

	MOV		C44, C11
	PLU		B3,	B1, B1

	FETCH	$0, 0 * SIZE(CO4)
	blez	L, .L442
	PLU		B4, B2, B2			#	Placed in the slot after the branch
#endif

.L4410:							#	Main loop of the 4x4 tile, 4 K steps per pass
#	Each pass decrements L, accumulates four K steps into C11..C24 and advances
#	AO, BO, PREA and PREB by 16 * SIZE.  The steps use, in order, A1 A2 with
#	B1-B4, A3 A4 with B5-B8, A5 A6 with B1-B4 and A7 A8 with B5-B8; operands
#	for a later step are loaded while an earlier one is being accumulated.
#	The loop repeats while L is still positive.
	daddiu	L, L, -1
	MADPS	C11, C11, A1, B1
	gsLQC1(R13, F13, F12, 1)	#	B5 B6 (F12 / F13)

	MADPS	C21, C21, A2, B1
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MADPS	C12, C12, A1, B2
	FETCH	$0, 0 * SIZE(PREB)

	MADPS	C22, C22, A2, B2
	FETCH	$0, 0 * SIZE(PREA)

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C14, C14, A1, B4
	MADPS	C24, C24, A2, B4

	PLU		B7,	B5, B5
	PLU		B8, B6, B6

	MADPS	C11, C11, A3, B5
	gsLQC1(R13, F9, F8, 2)		#	B1 B2

	MADPS	C21, C21, A4, B5
	gsLQC1(R12, F5, F4, 2)		#	A5 A6

	MADPS	C12, C12, A3, B6
	FETCH	$0, 4 * SIZE(PREB)

	MADPS	C22, C22, A4, B6
	FETCH	$0, 4 * SIZE(PREA)

	MADPS	C13, C13, A3, B7
	MADPS	C23, C23, A4, B7

	MADPS	C14, C14, A3, B8
	MADPS	C24, C24, A4, B8

	PLU		B3,	B1, B1
	PLU		B4, B2, B2

	MADPS	C11, C11, A5, B1
	gsLQC1(R13, F13, F12, 3)	#	B5 B6 (F12 / F13)

	MADPS	C21, C21, A6, B1
	gsLQC1(R12, F7, F6, 3)		#	A7 A8

	MADPS	C12, C12, A5, B2
	FETCH	$0, 8 * SIZE(PREB)
	daddiu	BO, BO, 16 * SIZE	#	4KR*4NR

	MADPS	C22, C22, A6, B2
	FETCH	$0, 8 * SIZE(PREA)
	daddiu	AO, AO, 16 * SIZE 	#	4KR*4MR

	MADPS	C13, C13, A5, B3
	MADPS	C23, C23, A6, B3

	MADPS	C14, C14, A5, B4
	MADPS	C24, C24, A6, B4

	PLU		B7,	B5, B5
	PLU		B8, B6, B6

	MADPS	C11, C11, A7, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2 (BO already advanced)

	MADPS	C21, C21, A8, B5
	gsLQC1(R12, F1, F0, 0)		#	A1 A2 (AO already advanced)

	MADPS	C12, C12, A7, B6
	FETCH	$0, 12 * SIZE(PREB)

	MADPS	C22, C22, A8, B6
	FETCH	$0, 12 * SIZE(PREA)

	MADPS	C13, C13, A7, B7
	daddiu	PREA, PREA, 16 * SIZE
	MADPS	C23, C23, A8, B7
	daddiu	PREB, PREB, 16 * SIZE	

	MADPS	C14, C14, A7, B8
	MADPS	C24, C24, A8, B8

	PLU		B3,	B1, B1
	bgtz	L, .L4410
	PLU		B4, B2, B2			#	Placed in the slot after the branch

	.align	4
.L442:
#	Remainder section of the 4x4 tile: taken when bit 1 of the K count is set
#	(K for plain GEMM, TEMP for TRMM); otherwise control skips to .L443.
#	Two K steps are accumulated into C11..C24: the first uses A1 A2 with
#	B1-B4, the second A3 A4 with B5-B8.  AO and BO advance by 8 * SIZE, and so
#	do the prefetch pointers PREA and PREB (4 * SIZE per K step, the same rate
#	as the main loop .L4410).  The trailing loads and PLU pair refill A1 A2
#	and B1-B4 for the section that follows.
#ifndef TRMMKERNEL
	andi	L, K, 2
#else
	andi	L, TEMP, 2
#endif
	blez	L, .L443
	NOP

	MADPS	C11, C11, A1, B1
	gsLQC1(R13, F13, F12, 1)	#	B5 B6 (F12 / F13)

	MADPS	C21, C21, A2, B1
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	MADPS	C12, C12, A1, B2
	FETCH	$0, 0 * SIZE(PREB)
	daddiu	BO, BO, 8 * SIZE	#	2KR*4NR

	MADPS	C22, C22, A2, B2
	FETCH	$0, 0 * SIZE(PREA)
	daddiu	AO, AO, 8 * SIZE	#	2KR*4MR

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C14, C14, A1, B4
	MADPS	C24, C24, A2, B4

	PLU		B7,	B5, B5
	PLU		B8, B6, B6

	MADPS	C11, C11, A3, B5
	gsLQC1(R13, F9, F8, 0)		#	B1 B2 (BO already advanced)

	MADPS	C21, C21, A4, B5
	gsLQC1(R12, F1, F0, 0)		#	A1 A2 (AO already advanced)

	MADPS	C12, C12, A3, B6
	FETCH	$0, 4 * SIZE(PREB)

	MADPS	C22, C22, A4, B6
	FETCH	$0, 4 * SIZE(PREA)

	MADPS	C13, C13, A3, B7
	daddiu	PREB, PREB, 8 * SIZE	#	2KR*4NR, counted in elements rather than bytes
	MADPS	C23, C23, A4, B7
	daddiu	PREA, PREA, 8 * SIZE	#	2KR*4MR, counted in elements rather than bytes

	MADPS	C14, C14, A3, B8
	MADPS	C24, C24, A4, B8

	PLU		B3,	B1, B1
	PLU		B4, B2, B2


	.align	4
.L443:
#	Last remainder section of the 4x4 tile: taken when bit 0 of the K count is
#	set (K for plain GEMM, TEMP for TRMM); otherwise control goes straight to
#	the write-back at .L440.  One K step is accumulated from the operands
#	already held in A1 A2 and B1-B4; nothing is loaded from AO or BO here.
#ifndef TRMMKERNEL
	andi	L, K, 1
#else
	andi	L, TEMP, 1
#endif
	blez	L, .L440
	LD		ALPHA, 152($sp)		#	Occupies the slot after the branch, where other branches carry a NOP
#	NOTE(review): presumably a branch delay slot, so ALPHA is loaded on both
#	paths.  ALPHA is $f15, the same register as B8, which is not read below.

	MADPS	C11, C11, A1, B1
	MADPS	C21, C21, A2, B1

	MADPS	C12, C12, A1, B2
	daddiu	BO, BO, 4 * SIZE	#	1KR*4NR
	MADPS	C22, C22, A2, B2
	daddiu	AO, AO, 4 * SIZE	#	1KR*4MR

	MADPS	C13, C13, A1, B3
	MADPS	C23, C23, A2, B3

	MADPS	C14, C14, A1, B4
	MADPS	C24, C24, A2, B4


	.align	4
.L440:
#	Write-back of the finished 4x4 tile.  The eight accumulators C11..C24 are
#	scaled by ALPHA and stored as four consecutive elements in each of the
#	four columns CO1..CO4.  CVTU extracts the upper half of a pair (odd element
#	offsets); the accumulator register itself supplies the even offsets.
#	Element tags cXY below mean: column pointer COX, element Y (1-based), as
#	fixed by the offset of the matching ST.
#	Plain GEMM: the old value of C is loaded and ALPHA * acc is added to it.
#	NOTE(review): assumes MADD fd, fr, fs, ft computes fr + fs * ft - the macro
#	is defined outside this chunk.
#	TRMM: C is not read, the result is ALPHA * acc only; afterwards AO, BO and
#	KK are adjusted.  There is no loop here: control falls through to .L42.
#ifndef TRMMKERNEL
	CVTU	A1, C13				#	A1=C13.upper=c12
	LD		B1, 1 * SIZE(CO1)

	CVTU	A2, C11				#	A2=C11.upper=c22
	LD		B2, 1 * SIZE(CO2)

	CVTU	A3, C23				#	A3=C23.upper=c14
	LD		B3, 3 * SIZE(CO1)

	CVTU	A4, C21				#	A4=C21.upper=c24
	LD		B4, 3 * SIZE(CO2)


	MADD	A1, B1, A1, ALPHA 		#	c12		
	LD		B5, 0 * SIZE(CO1)

	MADD	A2, B2, A2, ALPHA 		#	c22
	LD		B6, 0 * SIZE(CO2)

	MADD	A3, B3, A3, ALPHA 		#	c14
	LD		B7, 2 * SIZE(CO1)

	MADD	A4, B4, A4, ALPHA 		#	c24
	LD		B1, 2 * SIZE(CO2)		#	B1 is free again after the c12 update
	
	MADD	C11, B5, C11, ALPHA 		#	c11
	ST		A1, 1 * SIZE(CO1)

	MADD	C13, B6, C13, ALPHA 		#	c21
	ST		A2, 1 * SIZE(CO2)

	MADD	C21, B7, C21, ALPHA 		#	c13
	ST		A3, 3 * SIZE(CO1)

	MADD	C23, B1, C23, ALPHA 		#	c23
	ST		A4, 3 * SIZE(CO2)

	ST		C11, 0 * SIZE(CO1)
	ST		C13, 0 * SIZE(CO2)
	ST		C21, 2 * SIZE(CO1)
	ST		C23, 2 * SIZE(CO2)

	CVTU	A1, C14				#	A1=C14.upper=c32
	LD		B1, 1 * SIZE(CO3)

	CVTU	A2, C12				#	A2=C12.upper=c42
	LD		B2, 1 * SIZE(CO4)

	CVTU	A3, C24				#	A3=C24.upper=c34
	LD		B3, 3 * SIZE(CO3)

	CVTU	A4, C22				#	A4=C22.upper=c44
	LD		B4, 3 * SIZE(CO4)

	MADD	A1, B1, A1, ALPHA 		#	c32
	LD		A5, 0 * SIZE(CO3)		#	A5-A8 serve as temporaries for the old C values

	MADD	A2, B2, A2, ALPHA 		#	c42
	LD		A6, 0 * SIZE(CO4)
	
	MADD	A3, B3, A3, ALPHA 		#	c34
	LD		A7, 2 * SIZE(CO3)
	
	MADD	A4, B4, A4, ALPHA 		#	c44
	LD		A8, 2 * SIZE(CO4)

	MADD	C12, A5, C12, ALPHA 		#	c31
	ST		A1, 1 * SIZE(CO3)

	MADD	C14, A6, C14, ALPHA 		#	c41
	ST		A2, 1 * SIZE(CO4)

	MADD	C22, A7, C22, ALPHA 		#	c33
	ST		A3, 3 * SIZE(CO3)
	
	MADD	C24, A8, C24, ALPHA 		#	c43
	ST		A4, 3 * SIZE(CO4)

	ST		C12, 0 * SIZE(CO3)
	ST		C14, 0 * SIZE(CO4)
	ST		C22, 2 * SIZE(CO3)
	ST		C24, 2 * SIZE(CO4)

	daddiu	CO1, CO1, 4 * SIZE
	daddiu	CO2, CO2, 4 * SIZE
	daddiu	CO3, CO3, 4 * SIZE
	daddiu	CO4, CO4, 4 * SIZE

#else
	CVTU	A1, C13				#	A1=C13.upper=c12
	CVTU	A2, C11				#	A2=C11.upper=c22
	CVTU	A3, C23				#	A3=C23.upper=c14
	CVTU	A4, C21				#	A4=C21.upper=c24

	MUL		A1, A1, ALPHA 		#	c12		
	MUL		A2, A2, ALPHA 		#	c22
	MUL		A3, A3, ALPHA 		#	c14
	MUL		A4, A4, ALPHA 		#	c24
	
	MUL	C11, C11, ALPHA 		#	c11
	ST	A1, 1 * SIZE(CO1)

	MUL	C13, C13, ALPHA 		#	c21
	ST	A2, 1 * SIZE(CO2)

	MUL	C21, C21, ALPHA 		#	c13
	ST	A3, 3 * SIZE(CO1)

	MUL	C23, C23, ALPHA 		#	c23
	ST	A4, 3 * SIZE(CO2)

	CVTU	A5, C14				#	A5=C14.upper=c32
	ST		C11, 0 * SIZE(CO1)

	CVTU	A6, C12				#	A6=C12.upper=c42
	ST		C13, 0 * SIZE(CO2)

	CVTU	A7, C24				#	A7=C24.upper=c34
	ST		C21, 2 * SIZE(CO1)
	
	CVTU	A8, C22				#	A8=C22.upper=c44
	ST		C23, 2 * SIZE(CO2)

	MUL		A5, A5, ALPHA 		#	c32
	MUL		A6, A6, ALPHA 		#	c42
	MUL		A7, A7, ALPHA 		#	c34
	MUL		A8, A8, ALPHA 		#	c44

	MUL	C12, C12, ALPHA 		#	c31
	ST	A5, 1 * SIZE(CO3)

	MUL	C14, C14, ALPHA 		#	c41
	ST	A6, 1 * SIZE(CO4)

	MUL	C22, C22, ALPHA 		#	c33
	ST	A7, 3 * SIZE(CO3)
	
	MUL	C24, C24, ALPHA 		#	c43
	ST	A8, 3 * SIZE(CO4)

	ST		C12, 0 * SIZE(CO3)
	ST		C14, 0 * SIZE(CO4)
	ST		C22, 2 * SIZE(CO3)
	ST		C24, 2 * SIZE(CO4)

	daddiu	CO1, CO1, 4 * SIZE
	daddiu	CO2, CO2, 4 * SIZE
	daddiu	CO3, CO3, 4 * SIZE
	daddiu	CO4, CO4, 4 * SIZE

#if ( defined(LEFT) && defined(TRANSA))||\
		(!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, K, KK			#	TEMP = K - KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -4			#	Both cases subtract 4 (tile is 4x4)
#else
	daddiu	TEMP, TEMP, -4
#endif
	dsll	L, TEMP, 2 + BASE_SHIFT		#	byte offset for AO: 4 elements per K step
	dsll	TEMP, TEMP, 2 + BASE_SHIFT	#	byte offset for BO: 4 elements per K step
	daddu	AO, AO, L
	daddu	BO, BO, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 4
#endif
#endif

	.align	4
##	.L42: run the 2 x 4 tile only when bit 1 of M is set.
.L42:
	andi	I, M, 2
	blez	I, .L41
	NOP

	.align	4
##	.L421: set up the 2 x 4 tile.
##	BO is pointed at B (TRMM: optionally offset by KK, together with AO),
##	all sixteen accumulators are zeroed, the first A and B quadwords are
##	loaded with gsLQC1 (R12 is the AO register, R13 the BO register), and
##	L receives the trip count of the 4-step K loop (K >> 2, or TEMP >> 2
##	for TRMM). FETCH is defined as ld; its loads into $0 discard the data
##	and presumably only warm the cache lines of C.
.L421:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) ||\
	(!defined(LEFT) && !defined(TRANSA))
	move	BO, B
#else
	dsll	L, KK, 1 + BASE_SHIFT		#	KK * 2 elements of A
	dsll	TEMP, KK, 2 + BASE_SHIFT	#	KK * 4 elements of B
	daddu	AO, AO, L
	daddu	BO, B, TEMP
#endif
	MTC		$0, C11				#	CLEAR RESULTS REGISTERS
	MOV		C12, C11		

	MOV		C21, C11
	MOV		C22, C11
	
	MOV		C31, C11
	MOV		C32, C11
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	MOV		C41, C11
	MOV		C42, C11
	gsLQC1(R12, F1, F0, 0)		#	A1 A2

	MOV		C13, C11
	MOV		C14, C11

	MOV		C23, C11
	FETCH	$0, 0 * SIZE(CO1)
	MOV		C24, C11
	
	MOV		C33, C11
	FETCH	$0, 0 * SIZE(CO2)
	MOV		C34, C11
	
	MOV		C43, C11
	FETCH	$0, 0 * SIZE(CO3)

	MOV		C44, C11
	PLU		B3,	B1, B1			#	B3 built from the two halves of B1

	FETCH	$0, 0 * SIZE(CO4)
	PLU		B4, B2, B2			#	B4 built from the two halves of B2
#if (defined(LEFT) && !defined(TRANSA)) ||\
		(!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 2
#else
	daddiu	TEMP, KK, 4
#endif
	dsra	L, TEMP, 2			#	trip count of the 4-step K loop
	blez	L, .L422
	NOP

#else
	move	BO, B				#	Reset	B
	dsra	L, K, 2				#	UnRoll	K=4

	MTC		$0, C11				#	CLEAR RESULTS REGISTERS
	MOV		C12, C11		

	MOV		C21, C11
	MOV		C22, C11
	
	MOV		C31, C11
	MOV		C32, C11
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	MOV		C41, C11
	MOV		C42, C11
	gsLQC1(R12, F1, F0, 0)		#	A1 A2

	MOV		C13, C11
	MOV		C14, C11

	MOV		C23, C11
	FETCH	$0, 0 * SIZE(CO1)
	MOV		C24, C11
	
	MOV		C33, C11
	FETCH	$0, 0 * SIZE(CO2)
	MOV		C34, C11
	
	MOV		C43, C11
	FETCH	$0, 0 * SIZE(CO3)

	MOV		C44, C11
	PLU		B3,	B1, B1

	FETCH	$0, 0 * SIZE(CO4)
	blez	L, .L422
	PLU		B4, B2, B2			#	sits in the branch delay slot
#endif

##	.L4210: main loop of the 2 x 4 tile, four K steps per iteration.
##	Each pass consumes 8 elements of A (AO += 8 * SIZE) and 16 elements
##	of B (BO += 16 * SIZE). A1..A4 hold one paired A value per K step;
##	B1/B2 and B5/B6 alternate as the B operands and B3/B4, B7/B8 are
##	their PLU-rearranged copies. Loads for the next step are interleaved
##	with the multiply-adds of the current one.
.L4210:
	daddiu	L, L, -1
	MADPS	C11, C11, A1, B1
	MADPS	C12, C12, A1, B2
	gsLQC1(R13, F13, F12, 1)	#	B5 B6

	MADPS	C13, C13, A1, B3
	MADPS	C14, C14, A1, B4
	gsLQC1(R12, F3, F2, 1)		#	A3 A4

	PLU		B7,	B5, B5
	PLU		B8, B6, B6

	MADPS	C11, C11, A2, B5
	MADPS	C12, C12, A2, B6
	daddiu	AO, AO, 8 * SIZE	#	4KR*2MR
	gsLQC1(R13, F9, F8, 2)		#	B1 B2

	MADPS	C13, C13, A2, B7
	MADPS	C14, C14, A2, B8

	PLU		B3,	B1, B1
	PLU		B4, B2, B2

	MADPS	C11, C11, A3, B1
	gsLQC1(R12, F1, F0, 0)		#	A1 A2 for the next iteration (AO already advanced)

	MADPS	C12, C12, A3, B2
	gsLQC1(R13, F13, F12, 3)	#	B5 B6

	daddiu	BO, BO, 16 * SIZE	#	4KR*4NR
	MADPS	C13, C13, A3, B3
	MADPS	C14, C14, A3, B4

	PLU		B7,	B5, B5
	PLU		B8, B6, B6

	MADPS	C11, C11, A4, B5
	MADPS	C12, C12, A4, B6
	gsLQC1(R13, F9, F8, 0)		#	B1 B2 for the next iteration (BO already advanced)

	MADPS	C13, C13, A4, B7
	MADPS	C14, C14, A4, B8

	PLU		B3,	B1, B1
	bgtz	L, .L4210
	PLU		B4, B2, B2			#	sits in the branch delay slot

	.align	4
##	.L422: 2 x 4 tile, handle two leftover K steps when bit 1 of the
##	K count (K for GEMM, TEMP for TRMM) is set. Consumes 4 elements of A
##	and 8 elements of B, then reloads A1/A2 and B1/B2 and rebuilds B3/B4
##	for the code that follows.
.L422:
#ifndef TRMMKERNEL
	andi	L, K, 2
#else
	andi	L, TEMP, 2
#endif
	blez	L, .L423
	NOP

	daddiu	AO, AO, 4 * SIZE	#	2KR*2MR
	MADPS	C11, C11, A1, B1
	MADPS	C12, C12, A1, B2
	gsLQC1(R13, F13, F12, 1)	#	B5 B6

	MADPS	C13, C13, A1, B3
	MADPS	C14, C14, A1, B4
	daddiu	BO, BO, 8 * SIZE	#	2KR*4NR

	PLU		B7,	B5, B5
	PLU		B8, B6, B6

	MADPS	C11, C11, A2, B5
	MADPS	C12, C12, A2, B6
	gsLQC1(R13, F9, F8, 0)		#	B1 B2

	MADPS	C13, C13, A2, B7
	MADPS	C14, C14, A2, B8
	gsLQC1(R12, F1, F0, 0)		#	A1 A2

	PLU		B3,	B1, B1
	PLU		B4, B2, B2

##	.L423: 2 x 4 tile, handle the last single K step when bit 0 of the
##	K count is set. ALPHA is loaded from 152($sp) in the delay slot of
##	the branch, so it is available on both paths into .L420.
.L423:
#ifndef TRMMKERNEL
	andi	L, K, 1
#else
	andi	L, TEMP, 1
#endif
	blez	L, .L420
	LD		ALPHA, 152($sp)

	MADPS	C11, C11, A1, B1
	MADPS	C12, C12, A1, B2
	daddiu	BO, BO, 4 * SIZE	#	1KR*4NR
	daddiu	AO, AO, 2 * SIZE	#	1KR*2MR

	MADPS	C13, C13, A1, B3
	MADPS	C14, C14, A1, B4

	.align	4
##	.L420: write back the 2 x 4 tile (two elements in each of CO1..CO4).
##	CVTU copies the upper half of a paired accumulator into a scalar
##	register; the lower half is used straight from the accumulator.
##	GEMM path: load C, combine with MADD and ALPHA, store.
##	TRMM path: scale with MUL by ALPHA and store, then advance AO, BO
##	and KK for the next tile.
.L420:
#ifndef TRMMKERNEL
	CVTU	A1, C13				#	A1=C13.upper, stored at 1 * SIZE(CO1)
	LD		B1, 1 * SIZE(CO1)

	CVTU	A2, C11				#	A2=C11.upper, stored at 1 * SIZE(CO2)
	LD		B2, 1 * SIZE(CO2)

	MADD	A1, B1, A1, ALPHA 		#	element 1 of CO1
	LD		B5, 0 * SIZE(CO1)

	MADD	A2, B2, A2, ALPHA 		#	element 1 of CO2
	LD		B6, 0 * SIZE(CO2)

	MADD	C11, B5, C11, ALPHA 		#	element 0 of CO1
	ST		A1, 1 * SIZE(CO1)

	MADD	C13, B6, C13, ALPHA 		#	element 0 of CO2
	ST		A2, 1 * SIZE(CO2)

	ST		C11, 0 * SIZE(CO1)
	ST		C13, 0 * SIZE(CO2)

	CVTU	A1, C14				#	A1=C14.upper, stored at 1 * SIZE(CO3)
	LD		B1, 1 * SIZE(CO3)

	CVTU	A2, C12				#	A2=C12.upper, stored at 1 * SIZE(CO4)
	LD		B2, 1 * SIZE(CO4)

	MADD	A1, B1, A1, ALPHA 		#	element 1 of CO3
	LD		A5, 0 * SIZE(CO3)

	MADD	A2, B2, A2, ALPHA 
	LD		A6, 0 * SIZE(CO4)
	
	MADD	C12, A5, C12, ALPHA 
	ST		A1, 1 * SIZE(CO3)

	MADD	C14, A6, C14, ALPHA 
	ST		A2, 1 * SIZE(CO4)

	ST		C12, 0 * SIZE(CO3)
	ST		C14, 0 * SIZE(CO4)

	daddiu	CO1, CO1, 2 * SIZE
	daddiu	CO2, CO2, 2 * SIZE
	daddiu	CO3, CO3, 2 * SIZE
	daddiu	CO4, CO4, 2 * SIZE
#else
	CVTU	A1, C13				#	A1=C13.upper, stored at 1 * SIZE(CO1)
	CVTU	A2, C11				#	A2=C11.upper, stored at 1 * SIZE(CO2)

	MUL	A1, A1, ALPHA 		#	element 1 of CO1
	MUL	A2, A2, ALPHA 		#	element 1 of CO2

	MUL	C11, C11, ALPHA 		#	element 0 of CO1
	MUL	C13, C13, ALPHA 		#	element 0 of CO2

	CVTU	A3, C14				#	A3=C14.upper, stored at 1 * SIZE(CO3)
	CVTU	A4, C12				#	A4=C12.upper, stored at 1 * SIZE(CO4)

	MUL	A3, A3, ALPHA 		#	element 1 of CO3
	ST	A1, 1 * SIZE(CO1)

	MUL	A4, A4, ALPHA 
	ST	A2, 1 * SIZE(CO2)

	MUL	C12, C12, ALPHA 
	ST	C11, 0 * SIZE(CO1)
	
	MUL	C14, C14, ALPHA 
	ST	C13, 0 * SIZE(CO2)

	ST	A3, 1 * SIZE(CO3)
	ST	A4, 1 * SIZE(CO4)

	ST	C12, 0 * SIZE(CO3)
	ST	C14, 0 * SIZE(CO4)

	daddiu	CO1, CO1, 2 * SIZE
	daddiu	CO2, CO2, 2 * SIZE
	daddiu	CO3, CO3, 2 * SIZE
	daddiu	CO4, CO4, 2 * SIZE
#if ( defined(LEFT) &&  defined(TRANSA))||\
		(!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, K, KK			#	TEMP = K - KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -2
#else
	daddiu	TEMP, TEMP, -4
#endif
	dsll	L, TEMP, 1 + BASE_SHIFT		#	TEMP * 2 elements of A
	dsll	TEMP, TEMP, 2 + BASE_SHIFT	#	TEMP * 4 elements of B

	daddu	AO, AO, L
	daddu	BO, BO, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 2
#endif
#endif


	.align	4
##	.L41: run the 1 x 4 tile only when bit 0 of M is set.
.L41:
	andi	I, M, 1
	blez	I, .L40
	NOP

	.align	4
##	.L411: set up the 1 x 4 tile.
##	BO is pointed at B (TRMM: optionally offset by KK, together with AO),
##	the accumulators are zeroed, A1 and B1..B4 are preloaded with scalar
##	loads, and L receives the trip count of the 4-step K loop
##	(K >> 2, or TEMP >> 2 for TRMM).
.L411:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) ||\
	(!defined(LEFT) && !defined(TRANSA))
	move	BO, B
#else
	dsll	L, KK, BASE_SHIFT		#	KK * 1 element of A
	dsll	TEMP, KK, 2 + BASE_SHIFT	#	KK * 4 elements of B
	daddu	AO, AO, L
	daddu	BO, B, TEMP
#endif
	MTC		$0, C11				#	CLEAR RESULTS REGISTERS
	MOV		C12, C11		
	LD		B1, 0 * SIZE(BO)

	MOV		C21, C11
	MOV		C22, C11
	LD		A1, 0 * SIZE(AO)

	MOV		C31, C11
	MOV		C32, C11
	LD		B2, 1 * SIZE(BO)

	MOV		C41, C11
	MOV		C42, C11
	LD		B3, 2 * SIZE(BO)

	MOV		C13, C11
	MOV		C14, C11
	LD		B4, 3 * SIZE(BO)

	MOV		C23, C11
	MOV		C24, C11
	
	MOV		C33, C11
	MOV		C34, C11
	
	MOV		C43, C11
	MOV		C44, C11
#if (defined(LEFT) && !defined(TRANSA))||\
		(!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 1
#else
	daddiu	TEMP, KK, 4
#endif
	dsra	L, TEMP, 2			#	trip count of the 4-step K loop
	blez	L, .L412
	NOP					#	explicit delay-slot filler, as in the sibling TRMM
						#	setup blocks; without it the first instruction of
						#	.L4110 would land in the slot

#else
	move	BO, B				#	Reset	B
	dsra	L, K, 2				#	UnRoll	K=4

	MTC		$0, C11				#	CLEAR RESULTS REGISTERS
	MOV		C12, C11		
	LD		B1, 0 * SIZE(BO)

	MOV		C21, C11
	MOV		C22, C11
	LD		A1, 0 * SIZE(AO)

	MOV		C31, C11
	MOV		C32, C11
	LD		B2, 1 * SIZE(BO)

	MOV		C41, C11
	MOV		C42, C11
	LD		B3, 2 * SIZE(BO)

	MOV		C13, C11
	MOV		C14, C11
	LD		B4, 3 * SIZE(BO)

	MOV		C23, C11
	MOV		C24, C11
	
	MOV		C33, C11
	MOV		C34, C11
	
	MOV		C43, C11
	blez	L, .L412
	MOV		C44, C11			#	sits in the branch delay slot
#endif

##	.L4110: main loop of the 1 x 4 tile, four K steps per iteration.
##	Scalar MADD accumulates into C11..C14 (one per column of the tile).
##	Each pass consumes 4 elements of A (AO += 4 * SIZE) and 16 elements
##	of B (BO += 16 * SIZE); B1..B4 and B5..B8 alternate as the B operands
##	so the next step can be loaded while the current one is computed.
.L4110:
	daddiu	L, L, -1
	LD		A2, 1 * SIZE(AO)
	
	MADD	C11, C11, A1, B1
	LD		B5,	4 * SIZE(BO)

	MADD	C12, C12, A1, B2
	LD		B6, 5 * SIZE(BO)

	MADD	C13, C13, A1, B3
	LD		B7, 6 * SIZE(BO)

	MADD	C14, C14, A1, B4
	LD		B8, 7 * SIZE(BO)

	LD		A3, 2 * SIZE(AO)
	NOP
	
	MADD	C11, C11, A2, B5
	LD		B1,	 8 * SIZE(BO)

	MADD	C12, C12, A2, B6
	LD		B2,  9 * SIZE(BO)

	MADD	C13, C13, A2, B7
	LD		B3, 10 * SIZE(BO)

	MADD	C14, C14, A2, B8
	LD		B4, 11 * SIZE(BO)

	LD		A4, 3 * SIZE(AO)
	daddiu	AO, AO, 4 * SIZE
	
	MADD	C11, C11, A3, B1
	LD		B5,	12 * SIZE(BO)

	MADD	C12, C12, A3, B2
	LD		B6, 13 * SIZE(BO)

	MADD	C13, C13, A3, B3
	LD		B7, 14 * SIZE(BO)

	MADD	C14, C14, A3, B4
	LD		B8, 15 * SIZE(BO)

	LD		A1, 0 * SIZE(AO)		#	first A value of the next iteration
	daddiu 	BO, BO, 16 * SIZE

	MADD	C11, C11, A4, B5
	LD		B1,	0 * SIZE(BO)		#	B values of the next iteration

	MADD	C12, C12, A4, B6
	LD		B2, 1 * SIZE(BO)

	MADD	C13, C13, A4, B7
	LD		B3, 2 * SIZE(BO)

	MADD	C14, C14, A4, B8
	bgtz	L, .L4110
	LD		B4, 3 * SIZE(BO)		#	sits in the branch delay slot

##	.L412: 1 x 4 tile, handle two leftover K steps when bit 1 of the
##	K count (K for GEMM, TEMP for TRMM) is set. Consumes 2 elements of A
##	and 8 elements of B and reloads A1 and B1..B4 for the code that
##	follows.
.L412:
#ifndef TRMMKERNEL
	andi	L, K, 2
#else
	andi	L, TEMP, 2
#endif
	blez	L, .L413
	NOP

	LD		A2, 1 * SIZE(AO)
	daddiu	AO, AO, 2 * SIZE
	
	MADD	C11, C11, A1, B1
	LD		B5,	4 * SIZE(BO)

	MADD	C12, C12, A1, B2
	LD		B6, 5 * SIZE(BO)

	MADD	C13, C13, A1, B3
	LD		B7, 6 * SIZE(BO)

	MADD	C14, C14, A1, B4
	LD		B8, 7 * SIZE(BO)

	LD		A1, 0 * SIZE(AO)
	daddiu	BO, BO, 8 * SIZE
	
	MADD	C11, C11, A2, B5
	LD		B1,	0 * SIZE(BO)

	MADD	C12, C12, A2, B6
	LD		B2, 1 * SIZE(BO)

	MADD	C13, C13, A2, B7
	LD		B3, 2 * SIZE(BO)

	MADD	C14, C14, A2, B8
	LD		B4, 3 * SIZE(BO)

##	.L413: 1 x 4 tile, handle the last single K step when bit 0 of the
##	K count is set. ALPHA is loaded from 152($sp) in the delay slot of
##	the branch, so it is available on both paths into .L410.
.L413:
#ifndef TRMMKERNEL
	andi	L, K, 1
#else
	andi	L, TEMP, 1
#endif
	blez	L, .L410
	LD		ALPHA, 152($sp)

	MADD	C11, C11, A1, B1
	MADD	C12, C12, A1, B2
	daddiu	AO, AO, 1 * SIZE
	MADD	C13, C13, A1, B3
	MADD	C14, C14, A1, B4
	daddiu	BO, BO, 4 * SIZE

	.align	4
##	.L410: write back the 1 x 4 tile (one element in each of CO1..CO4)
##	from C11..C14.
##	GEMM path: load C, combine with MADD and ALPHA, store.
##	TRMM path: scale with MUL by ALPHA and store, then advance AO, BO
##	and KK for the next tile.
.L410:
#ifndef TRMMKERNEL
	LD		A5, 0 * SIZE(CO1)
	LD		A6, 0 * SIZE(CO2)
	LD		A7, 0 * SIZE(CO3)
	LD		A8, 0 * SIZE(CO4)

	MADD		A5, A5, C11, ALPHA
	MADD		A6, A6, C12, ALPHA
	MADD		A7, A7, C13, ALPHA
	MADD		A8, A8, C14, ALPHA

	ST		A5, 0 * SIZE(CO1)
	ST		A6, 0 * SIZE(CO2)
	ST		A7, 0 * SIZE(CO3)
	ST		A8, 0 * SIZE(CO4)

	daddiu	CO1, CO1, 1 * SIZE
	daddiu	CO2, CO2, 1 * SIZE
	daddiu	CO3, CO3, 1 * SIZE
	daddiu	CO4, CO4, 1 * SIZE
#else
	MUL		A5, C11, ALPHA
	MUL		A6, C12, ALPHA
	MUL		A7, C13, ALPHA
	MUL		A8, C14, ALPHA

	ST		A5, 0 * SIZE(CO1)
	ST		A6, 0 * SIZE(CO2)
	ST		A7, 0 * SIZE(CO3)
	ST		A8, 0 * SIZE(CO4)

	daddiu	CO1, CO1, 1 * SIZE
	daddiu	CO2, CO2, 1 * SIZE
	daddiu	CO3, CO3, 1 * SIZE
	daddiu	CO4, CO4, 1 * SIZE

#if ( defined(LEFT) &&  defined(TRANSA))||\
		(!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, K, KK			#	TEMP = K - KK
#ifdef	LEFT
	daddiu	TEMP, TEMP, -1
#else
	daddiu	TEMP, TEMP, -4
#endif

	dsll	L, TEMP, BASE_SHIFT		#	TEMP * 1 element of A
	dsll	TEMP, TEMP, 2 + BASE_SHIFT	#	TEMP * 4 elements of B

	daddu	AO, AO, L
	daddu	BO, BO, TEMP
#endif
#ifdef	LEFT
	daddiu	KK, KK, 1
#endif
#endif

	.align	4
##	.L40: end of one pass over the four-column panel. For a right-side
##	TRMM build KK advances by 4. J is decremented, B is moved up to the
##	position BO has reached, and control returns to .L48 (defined above
##	this excerpt) while J is still positive.
.L40:
#if defined(TRMMKERNEL) && !defined(LEFT)
	daddiu	KK, KK, 4
#endif
	daddiu	J, J, -1
	move	B, BO
	bgtz	J, .L48
	NOP



	.align	4
##	.L2: two-column panel, entered only when bit 1 of N is set.
.L2:							#	Nr=2
	andi	J, N, 2			
	blez	J, .L1
	NOP

##	.L28: panel setup. I = M >> 3 counts the 8-row tiles, AO restarts at
##	A, CO1/CO2 address the two columns of C, and C itself is advanced by
##	two columns (2 * LDC) in the branch delay slot.
.L28:
	dsra	I, M, 3				#	MR=8

	move	AO, A				#	Reset A
	move	CO1, C

#if defined(TRMMKERNEL) &&  defined(LEFT)
	move	KK, OFFSET
#endif
	daddu	CO2, C,   LDC
	blez	I, .L24
	daddu	C,   CO2, LDC			#	delay slot: C moves past both columns

	.align	4
##	.L281: set up one 8 x 2 tile.
##	BO is pointed at B (TRMM: optionally offset by KK, together with AO),
##	the sixteen scalar accumulators are zeroed, A1..A8 and B1/B2 are
##	preloaded, and L receives the trip count of the 2-step K loop
##	(K >> 1, or TEMP >> 1 for TRMM).
.L281:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	BO, B
#else
	dsll	L, KK, 3 + BASE_SHIFT		#	KK * 8 elements of A
	dsll	TEMP, KK, 1 + BASE_SHIFT	#	KK * 2 elements of B

	daddu	AO, AO, L
	daddu	BO, B, TEMP
#endif
	MTC		$0, C11				#	CLEAR RESULTS REGISTERS
	LD		A1, 0 * SIZE(AO)

	MOV		C12, C11		
	LD		A2, 1 * SIZE(AO)

	MOV		C21, C11
	LD		A3, 2 * SIZE(AO)

	MOV		C22, C11
	LD		A4, 3 * SIZE(AO)

	MOV		C31, C11
	LD		A5, 4 * SIZE(AO)

	MOV		C32, C11
	LD		A6, 5 * SIZE(AO)

	MOV		C41, C11
	LD		B1, 0 * SIZE(BO)

	MOV		C42, C11
	LD		B2, 1 * SIZE(BO)

	MOV		C13, C11
	LD		A7, 6 * SIZE(AO)

	MOV		C14, C11
	LD		A8, 7 * SIZE(AO)

	MOV		C23, C11
	MOV		C24, C11
	
	MOV		C33, C11
	MOV		C34, C11
	
	MOV		C43, C11
	MOV		C44, C11
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 8
#else
	daddiu	TEMP, KK, 2
#endif
	dsra	L, TEMP, 1			#	trip count of the 2-step K loop
	blez	L, .L282
	NOP

#else
	move	BO, B				#	Reset	B
	dsra	L, K, 1				#	UnRoll	K=2

	MTC		$0, C11				#	CLEAR RESULTS REGISTERS
	LD		A1, 0 * SIZE(AO)

	MOV		C12, C11		
	LD		A2, 1 * SIZE(AO)

	MOV		C21, C11
	LD		A3, 2 * SIZE(AO)

	MOV		C22, C11
	LD		A4, 3 * SIZE(AO)

	MOV		C31, C11
	LD		A5, 4 * SIZE(AO)

	MOV		C32, C11
	LD		A6, 5 * SIZE(AO)

	MOV		C41, C11
	LD		B1, 0 * SIZE(BO)

	MOV		C42, C11
	LD		B2, 1 * SIZE(BO)

	MOV		C13, C11
	LD		A7, 6 * SIZE(AO)

	MOV		C14, C11
	LD		A8, 7 * SIZE(AO)

	MOV		C23, C11
	MOV		C24, C11
	
	MOV		C33, C11
	MOV		C34, C11
	
	MOV		C43, C11
	blez	L, .L282
	MOV		C44, C11			#	sits in the branch delay slot
#endif

	.align	4
##	.L2810: main loop of the 8 x 2 tile, two K steps per iteration.
##	Accumulator layout (matches the write-back in .L280): C11..C41 and
##	C13..C43 are rows 0..3 and 4..7 of the first column, C12..C42 and
##	C14..C44 the same rows of the second column.
##	Each pass consumes 16 elements of A and 4 elements of B. Registers
##	B5..B8 are borrowed to hold A elements 8..11 (rows 0..3 of the second
##	K step) while A1..A4 are reloaded with A elements 12..15.
.L2810:
	daddiu	L, L, -1
	MADD	C11, C11, A1, B1
	LD		B5, 8 * SIZE(AO)

	MADD	C21, C21, A2, B1
	LD		B6, 9 * SIZE(AO)

	MADD	C31, C31, A3, B1
	LD		B7, 10 * SIZE(AO)

	MADD	C41, C41, A4, B1
	LD		B8, 11 * SIZE(AO)

	MADD	C12, C12, A1, B2
	MADD	C22, C22, A2, B2
	LD		B3, 2 * SIZE(BO)

	MADD	C32, C32, A3, B2
	MADD	C42, C42, A4, B2
	LD		B4, 3 * SIZE(BO)
	daddiu	BO, BO, 4 * SIZE

	MADD	C13, C13, A5, B1
	MADD	C23, C23, A6, B1
	LD		A1, 12 * SIZE(AO)

	MADD	C33, C33, A7, B1
	MADD	C43, C43, A8, B1
	LD		A2, 13 * SIZE(AO)

	MADD	C14, C14, A5, B2
	MADD	C24, C24, A6, B2
	LD		A3, 14 * SIZE(AO)

	MADD	C34, C34, A7, B2
	MADD	C44, C44, A8, B2
	LD		A4, 15 * SIZE(AO)
	daddiu	AO, AO, 16 * SIZE

	MADD	C11, C11, B5, B3		#	second K step starts here
	LD		A5, 4 * SIZE(AO)

	MADD	C21, C21, B6, B3
	LD		A6, 5 * SIZE(AO)

	MADD	C13, C13, A1, B3
	MADD	C23, C23, A2, B3
	LD		A7, 6 * SIZE(AO)

	MADD	C33, C33, A3, B3
	MADD	C43, C43, A4, B3
	LD		A8, 7 * SIZE(AO)

	MADD	C14, C14, A1, B4
	MADD	C24, C24, A2, B4
	LD		B1, 0 * SIZE(BO)

	MADD	C34, C34, A3, B4
	MADD	C44, C44, A4, B4
	LD		B2, 1 * SIZE(BO)

	MADD	C31, C31, B7, B3
	MADD	C41, C41, B8, B3
	LD		A1, 0 * SIZE(AO)

	MADD	C12, C12, B5, B4
	LD		A2, 1 * SIZE(AO)

	MADD	C22, C22, B6, B4
	LD		A3, 2 * SIZE(AO)

	LD		A4, 3 * SIZE(AO)
	MADD	C32, C32, B7, B4
	bgtz	L, .L2810
	MADD	C42, C42, B8, B4		#	sits in the branch delay slot

	.align	4
##	.L282: 8 x 2 tile, handle the last single K step when bit 0 of the
##	K count (K for GEMM, TEMP for TRMM) is set. ALPHA is loaded from
##	152($sp) in the delay slot of the branch, so it is available on both
##	paths into .L280.
.L282:
#ifndef TRMMKERNEL
	andi	L, K, 1
#else
	andi	L, TEMP, 1
#endif
	blez	L, .L280
	LD		ALPHA, 152($sp)

	MADD	C13, C13, A5, B1
	MADD	C23, C23, A6, B1
	MADD	C33, C33, A7, B1
	MADD	C43, C43, A8, B1
	MADD	C14, C14, A5, B2
	MADD	C24, C24, A6, B2
	MADD	C34, C34, A7, B2
	MADD	C44, C44, A8, B2
	daddiu	AO, AO, 8 * SIZE

	MADD	C11, C11, A1, B1
	MADD	C21, C21, A2, B1
	MADD	C31, C31, A3, B1
	MADD	C41, C41, A4, B1
	MADD	C12, C12, A1, B2
	MADD	C22, C22, A2, B2
	MADD	C32, C32, A3, B2
	MADD	C42, C42, A4, B2
	daddiu	BO, BO, 2 * SIZE


	.align	4
##	.L280: write back the 8 x 2 tile to CO1 and CO2, decrement the tile
##	counter I and loop to .L281 while it is positive.
##	GEMM path: load C, combine with MADD and ALPHA, store. C11 is reused
##	as the eighth temporary for CO2 because B8 is the same register as
##	ALPHA ($f15).
##	TRMM path: scale with MUL by ALPHA and store, then advance AO, BO
##	and KK for the next tile.
.L280:							#	Write Back
#ifndef TRMMKERNEL
	daddiu	I, I, -1

	LD		A1, 0 * SIZE(CO1)
	LD		A2, 1 * SIZE(CO1)
	LD		A3, 2 * SIZE(CO1)
	LD		A4, 3 * SIZE(CO1)
	LD		A5, 4 * SIZE(CO1)
	LD		A6, 5 * SIZE(CO1)
	LD		A7, 6 * SIZE(CO1)
	LD		A8, 7 * SIZE(CO1)

	MADD		A1, A1, C11, ALPHA
	LD		B1, 0 * SIZE(CO2) 
	
	MADD		A2, A2, C21, ALPHA
	LD		B2, 1 * SIZE(CO2)

	MADD		A3, A3, C31, ALPHA
	LD		B3, 2 * SIZE(CO2)

	MADD		A4, A4, C41, ALPHA
	LD		B4, 3 * SIZE(CO2)

	MADD		A5, A5, C13, ALPHA
	LD		B5, 4 * SIZE(CO2)

	MADD		A6, A6, C23, ALPHA
	LD		B6, 5 * SIZE(CO2)

	MADD		A7, A7, C33, ALPHA
	LD		B7, 6 * SIZE(CO2)

	MADD		A8, A8, C43, ALPHA
	LD		C11, 7 * SIZE(CO2)		#	C11 was consumed above and is free again

	MADD		B1, B1, C12, ALPHA
	ST		A1, 0 * SIZE(CO1)

	MADD		B2, B2, C22, ALPHA
	ST		A2, 1 * SIZE(CO1)

	MADD		B3, B3, C32, ALPHA
	ST		A3, 2 * SIZE(CO1)

	MADD		B4, B4, C42, ALPHA
	ST		A4, 3 * SIZE(CO1)

	MADD		B5, B5, C14, ALPHA
	ST		A5, 4 * SIZE(CO1)

	MADD		B6, B6, C24, ALPHA
	ST		A6, 5 * SIZE(CO1)

	MADD		B7, B7, C34, ALPHA
	ST		A7, 6 * SIZE(CO1)

	MADD		C11, C11, C44, ALPHA
	ST		A8, 7 * SIZE(CO1)

	ST		B1, 0 * SIZE(CO2)
	ST		B2, 1 * SIZE(CO2)
	ST		B3, 2 * SIZE(CO2)
	ST		B4, 3 * SIZE(CO2)
	ST		B5, 4 * SIZE(CO2)
	ST		B6, 5 * SIZE(CO2)
	ST		B7, 6 * SIZE(CO2)
	ST		C11, 7 * SIZE(CO2)

	daddiu	CO1, CO1, 8 * SIZE
	bgtz	I, .L281
	daddiu	CO2, CO2, 8 * SIZE		#	sits in the branch delay slot
#else
	daddiu	I, I, -1

	MUL		A1, C11, ALPHA
	MUL		A2, C21, ALPHA
	MUL		A3, C31, ALPHA
	MUL		A4, C41, ALPHA
	MUL		A5, C13, ALPHA
	MUL		A6, C23, ALPHA
	MUL		A7, C33, ALPHA
	MUL		A8, C43, ALPHA

	MUL		B1, C12, ALPHA
	ST		A1, 0 * SIZE(CO1)

	MUL		B2, C22, ALPHA
	ST		A2, 1 * SIZE(CO1)

	MUL		B3, C32, ALPHA
	ST		A3, 2 * SIZE(CO1)

	MUL		B4, C42, ALPHA
	ST		A4, 3 * SIZE(CO1)

	MUL		B5, C14, ALPHA
	ST		A5, 4 * SIZE(CO1)

	MUL		B6, C24, ALPHA
	ST		A6, 5 * SIZE(CO1)

	MUL		B7, C34, ALPHA
	ST		A7, 6 * SIZE(CO1)

	MUL		C11, C44, ALPHA
	ST		A8, 7 * SIZE(CO1)

	ST		B1, 0 * SIZE(CO2)
	ST		B2, 1 * SIZE(CO2)
	ST		B3, 2 * SIZE(CO2)
	ST		B4, 3 * SIZE(CO2)
	ST		B5, 4 * SIZE(CO2)
	ST		B6, 5 * SIZE(CO2)
	ST		B7, 6 * SIZE(CO2)
	ST		C11, 7 * SIZE(CO2)

#if ( defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, K, KK			#	TEMP = K - KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -8
#else
	daddiu	TEMP, TEMP, -2
#endif
	dsll	L, TEMP, 3 + BASE_SHIFT		#	TEMP * 8 elements of A
	dsll	TEMP, TEMP, 1 + BASE_SHIFT	#	TEMP * 2 elements of B

	daddu	AO, AO, L
	daddu	BO, BO, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 8
#endif
	daddiu	CO1, CO1, 8 * SIZE
	bgtz	I, .L281
	daddiu	CO2, CO2, 8 * SIZE		#	sits in the branch delay slot
#endif


	.align	4
##	.L24: run the 4 x 2 tile only when bit 2 of M is set.
.L24:
	andi	I, M, 4				#	MR=4
	blez	I, .L22
	NOP

	.align	4
##	.L241: set up the 4 x 2 tile.
##	BO is pointed at B (TRMM: optionally offset by KK, together with AO),
##	the accumulators are zeroed, A1..A4 and B1/B2 are preloaded, and L
##	receives the trip count of the 2-step K loop (K >> 1, or TEMP >> 1
##	for TRMM).
.L241:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	BO, B
#else
	dsll	L, KK, 2 + BASE_SHIFT		#	KK * 4 elements of A
	dsll	TEMP, KK, 1 + BASE_SHIFT	#	KK * 2 elements of B

	daddu	AO, AO, L
	daddu	BO, B, TEMP
#endif
	MTC		$0, C11				#	CLEAR RESULTS REGISTERS
	MOV		C12, C11		
	LD		A1, 0 * SIZE(AO)

	MOV		C21, C11
	MOV		C22, C11
	LD		A2, 1 * SIZE(AO)

	MOV		C31, C11
	MOV		C32, C11
	LD		A3, 2 * SIZE(AO)

	MOV		C41, C11
	MOV		C42, C11
	LD		A4, 3 * SIZE(AO)

	MOV		C13, C11
	MOV		C14, C11
	LD		B1, 0 * SIZE(BO)

	MOV		C23, C11
	MOV		C24, C11
	LD		B2, 1 * SIZE(BO)

	MOV		C33, C11
	MOV		C34, C11
	
	MOV		C43, C11
	MOV		C44, C11
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 4
#else
	daddiu	TEMP, KK, 2
#endif
	dsra	L, TEMP, 1			#	trip count of the 2-step K loop
	blez	L, .L242
	NOP

#else
	move	BO, B				#	Reset	B
	dsra	L, K, 1				#	UnRoll	K=2

	MTC		$0, C11				#	CLEAR RESULTS REGISTERS
	MOV		C12, C11		
	LD		A1, 0 * SIZE(AO)

	MOV		C21, C11
	MOV		C22, C11
	LD		A2, 1 * SIZE(AO)

	MOV		C31, C11
	MOV		C32, C11
	LD		A3, 2 * SIZE(AO)

	MOV		C41, C11
	MOV		C42, C11
	LD		A4, 3 * SIZE(AO)

	MOV		C13, C11
	MOV		C14, C11
	LD		B1, 0 * SIZE(BO)

	MOV		C23, C11
	MOV		C24, C11
	LD		B2, 1 * SIZE(BO)

	MOV		C33, C11
	MOV		C34, C11
	
	MOV		C43, C11
	blez	L, .L242
	MOV		C44, C11			#	sits in the branch delay slot
#endif

	.align	4
##	.L2410: main loop of the 4 x 2 tile, two K steps per iteration.
##	C11..C41 accumulate the first column and C12..C42 the second.
##	Each pass consumes 8 elements of A and 4 elements of B; A5..A8 and
##	B3/B4 carry the second K step while A1..A4 and B1/B2 are reloaded
##	for the next pass.
.L2410:
	daddiu	L, L, -1
	MADD	C11, C11, A1, B1
	LD		A5, 4 * SIZE(AO)

	MADD	C21, C21, A2, B1
	LD		B3, 2 * SIZE(BO)

	MADD	C31, C31, A3, B1
	LD		B4, 3 * SIZE(BO)

	MADD	C41, C41, A4, B1
	LD		A6, 5 * SIZE(AO)
	daddiu	BO, BO, 4 * SIZE

	MADD	C12, C12, A1, B2
	LD		A7, 6 * SIZE(AO)

	MADD	C22, C22, A2, B2
	LD		A8, 7 * SIZE(AO)
	daddiu	AO, AO, 8 * SIZE

	MADD	C32, C32, A3, B2
	MADD	C42, C42, A4, B2

	MADD	C11, C11, A5, B3		#	second K step starts here
	LD		A1, 0 * SIZE(AO)

	MADD	C21, C21, A6, B3
	LD		B1, 0 * SIZE(BO)

	MADD	C31, C31, A7, B3
	LD		B2, 1 * SIZE(BO)

	MADD	C41, C41, A8, B3
	LD		A2, 1 * SIZE(AO)

	MADD	C12, C12, A5, B4
	LD		A3, 2 * SIZE(AO)

	MADD	C22, C22, A6, B4
	LD		A4, 3 * SIZE(AO)

	MADD	C32, C32, A7, B4
	bgtz	L, .L2410
	MADD	C42, C42, A8, B4		#	sits in the branch delay slot

	.align	4
##	.L242: 4 x 2 tile, handle the last single K step when bit 0 of the
##	K count (K for GEMM, TEMP for TRMM) is set. ALPHA is loaded from
##	152($sp) in the delay slot of the branch, so it is available on both
##	paths into .L240.
.L242:
#ifndef TRMMKERNEL
	andi	L, K, 1
#else
	andi	L, TEMP, 1
#endif
	blez	L, .L240
	LD		ALPHA, 152($sp)

	MADD	C11, C11, A1, B1
	MADD	C21, C21, A2, B1
	MADD	C31, C31, A3, B1
	MADD	C41, C41, A4, B1
	MADD	C12, C12, A1, B2
	MADD	C22, C22, A2, B2
	MADD	C32, C32, A3, B2
	MADD	C42, C42, A4, B2
	daddiu	AO, AO, 4 * SIZE
	daddiu	BO, BO, 2 * SIZE


	.align	4
##	.L240: write back the 4 x 2 tile to CO1 and CO2.
##	GEMM path: load C, combine with MADD and ALPHA, store.
##	TRMM path: scale with MUL by ALPHA and store, then advance AO, BO
##	and KK for the next tile.
.L240:							#	Write Back
#ifndef TRMMKERNEL
	LD		A1, 0 * SIZE(CO1)
	LD		A2, 1 * SIZE(CO1)
	LD		A3, 2 * SIZE(CO1)
	LD		A4, 3 * SIZE(CO1)

	MADD		A1, A1, C11, ALPHA
	LD		B1, 0 * SIZE(CO2) 
	
	MADD		A2, A2, C21, ALPHA
	LD		B2, 1 * SIZE(CO2)

	MADD		A3, A3, C31, ALPHA
	LD		B3, 2 * SIZE(CO2)

	MADD		A4, A4, C41, ALPHA
	LD		B4, 3 * SIZE(CO2)

	MADD		B1, B1, C12, ALPHA
	ST		A1, 0 * SIZE(CO1)

	MADD		B2, B2, C22, ALPHA
	ST		A2, 1 * SIZE(CO1)

	MADD		B3, B3, C32, ALPHA
	ST		A3, 2 * SIZE(CO1)

	MADD		B4, B4, C42, ALPHA
	ST		A4, 3 * SIZE(CO1)

	ST		B1, 0 * SIZE(CO2)
	ST		B2, 1 * SIZE(CO2)
	ST		B3, 2 * SIZE(CO2)
	ST		B4, 3 * SIZE(CO2)

	daddiu	CO1, CO1, 4 * SIZE
	daddiu	CO2, CO2, 4 * SIZE
#else

	MUL		A1, C11, ALPHA
	MUL		A2, C21, ALPHA
	MUL		A3, C31, ALPHA
	MUL		A4, C41, ALPHA

	MUL		B1, C12, ALPHA
	ST		A1, 0 * SIZE(CO1)

	MUL		B2, C22, ALPHA
	ST		A2, 1 * SIZE(CO1)

	MUL		B3, C32, ALPHA
	ST		A3, 2 * SIZE(CO1)

	MUL		B4, C42, ALPHA
	ST		A4, 3 * SIZE(CO1)

	ST		B1, 0 * SIZE(CO2)
	ST		B2, 1 * SIZE(CO2)
	ST		B3, 2 * SIZE(CO2)
	ST		B4, 3 * SIZE(CO2)

	daddiu	CO1, CO1, 4 * SIZE
	daddiu	CO2, CO2, 4 * SIZE
#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, K, KK			#	TEMP = K - KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -4
#else
	daddiu	TEMP, TEMP, -2
#endif
	dsll	L, TEMP, 2 + BASE_SHIFT		#	TEMP * 4 elements of A
	dsll	TEMP, TEMP, 1 + BASE_SHIFT	#	TEMP * 2 elements of B

	daddu	AO, AO, L
	daddu	BO, BO, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 4
#endif
#endif

	.align	4
##	.L22: run the 2 x 2 tile only when bit 1 of M is set.
.L22:
	andi	I, M, 2
	blez	I, .L21
	NOP

	.align	4
##	.L221: set up the 2 x 2 tile.
##	BO is pointed at B (TRMM: optionally offset by KK, together with AO),
##	accumulators are zeroed, A1/A2 and B1/B2 are preloaded, and L
##	receives the trip count of the 2-step K loop (K >> 1, or TEMP >> 1
##	for TRMM).
.L221:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	BO, B
#else
	dsll	L, KK, 1 + BASE_SHIFT		#	KK * 2 elements of A
	dsll	TEMP, KK, 1 + BASE_SHIFT	#	KK * 2 elements of B

	daddu	AO, AO, L
	daddu	BO, B, TEMP
#endif
	MTC		$0, C11				#	CLEAR RESULTS REGISTERS
	MOV		C12, C11		
	LD		A1, 0 * SIZE(AO)

	MOV		C21, C11
	MOV		C22, C11
	LD		A2, 1 * SIZE(AO)

	MOV		C31, C11
	MOV		C32, C11
	LD		B1, 0 * SIZE(BO)

	MOV		C41, C11
	MOV		C42, C11
	LD		B2, 1 * SIZE(BO)

	MOV		C43, C11
	MOV		C44, C11
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 2
#else
	daddiu	TEMP, KK, 2
#endif
	dsra	L, TEMP, 1			#	trip count of the 2-step K loop
	blez	L, .L222
	NOP

#else
	move	BO, B				#	Reset	B
	dsra	L, K, 1				#	UnRoll	K=2

	MTC		$0, C11				#	CLEAR RESULTS REGISTERS
	MOV		C12, C11		
	LD		A1, 0 * SIZE(AO)

	MOV		C21, C11
	MOV		C22, C11
	LD		A2, 1 * SIZE(AO)

	MOV		C31, C11
	MOV		C32, C11
	LD		B1, 0 * SIZE(BO)

	MOV		C41, C11
	MOV		C42, C11
	LD		B2, 1 * SIZE(BO)

	MOV		C43, C11
	blez	L, .L222
	MOV		C44, C11			#	sits in the branch delay slot
#endif


	.align	4
##	.L2210: main loop of the 2 x 2 tile, two K steps per iteration.
##	C11/C21 accumulate the first column, C12/C22 the second. Each pass
##	consumes 4 elements of A and 4 elements of B.
.L2210:
	daddiu	L, L, -1
	MADD	C11, C11, A1, B1
	LD		A3, 2 * SIZE(AO)

	MADD	C21, C21, A2, B1
	LD		B3, 2 * SIZE(BO)

	MADD	C12, C12, A1, B2
	LD		A4, 3 * SIZE(AO)
	daddiu	AO, AO, 4 * SIZE

	MADD	C22, C22, A2, B2
	LD		B4, 3 * SIZE(BO)
	daddiu	BO, BO, 4 * SIZE

	MADD	C11, C11, A3, B3		#	second K step starts here
	LD		A1, 0 * SIZE(AO)

	MADD	C21, C21, A4, B3
	LD		B1, 0 * SIZE(BO)

	MADD	C12, C12, A3, B4
	LD		B2, 1 * SIZE(BO)

	MADD	C22, C22, A4, B4
	bgtz	L, .L2210
	LD		A2, 1 * SIZE(AO)		#	sits in the branch delay slot


	.align	4
##	.L222: 2 x 2 tile, handle the last single K step when bit 0 of the
##	K count (K for GEMM, TEMP for TRMM) is set. ALPHA is loaded from
##	152($sp) in the delay slot of the branch, so it is available on both
##	paths into .L220.
.L222:
#ifndef TRMMKERNEL
	andi	L, K, 1
#else
	andi	L, TEMP, 1
#endif
	blez	L, .L220
	LD		ALPHA, 152($sp)

	MADD	C11, C11, A1, B1
	MADD	C21, C21, A2, B1
	MADD	C12, C12, A1, B2
	MADD	C22, C22, A2, B2
	daddiu	AO, AO, 2 * SIZE
	daddiu	BO, BO, 2 * SIZE


	.align	4
##	.L220: write back the 2 x 2 tile to CO1 and CO2.
##	GEMM path: load C, combine with MADD and ALPHA, store.
##	TRMM path: scale with MUL by ALPHA and store, then advance AO, BO
##	and KK for the next tile.
.L220:							#	Write Back
#ifndef TRMMKERNEL
	LD		A1, 0 * SIZE(CO1)
	LD		A2, 1 * SIZE(CO1)

	MADD		A1, A1, C11, ALPHA
	LD		B1, 0 * SIZE(CO2) 
	
	MADD		A2, A2, C21, ALPHA
	LD		B2, 1 * SIZE(CO2)

	MADD		B1, B1, C12, ALPHA
	ST		A1, 0 * SIZE(CO1)

	MADD		B2, B2, C22, ALPHA
	ST		A2, 1 * SIZE(CO1)

	ST		B1, 0 * SIZE(CO2)
	ST		B2, 1 * SIZE(CO2)

	daddiu	CO1, CO1, 2 * SIZE
	daddiu	CO2, CO2, 2 * SIZE
#else

	MUL		A1, C11, ALPHA
	MUL		A2, C21, ALPHA
	MUL		B1, C12, ALPHA
	MUL		B2, C22, ALPHA

	ST		A1, 0 * SIZE(CO1)
	ST		A2, 1 * SIZE(CO1)
	ST		B1, 0 * SIZE(CO2)
	ST		B2, 1 * SIZE(CO2)

	daddiu	CO1, CO1, 2 * SIZE
	daddiu	CO2, CO2, 2 * SIZE

#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, K, KK			#	TEMP = K - KK
#ifdef LEFT
	daddiu 	TEMP, TEMP, -2
#else
	daddiu	TEMP, TEMP, -2
#endif
	dsll	L, TEMP, 1 + BASE_SHIFT		#	TEMP * 2 elements of A
	dsll	TEMP, TEMP, 1 + BASE_SHIFT	#	TEMP * 2 elements of B

	daddu	AO, AO, L
	daddu	BO, BO, TEMP
#endif

#ifdef	LEFT
	daddiu	KK, KK, 2			#	immediate form, matching every other KK update
#endif
#endif

	.align	4
##	.L21: run the 1 x 2 tile only when bit 0 of M is set.
.L21:
	andi	I, M, 1
	blez	I, .L20
	NOP

	.align	4
##	.L211: set up the 1 x 2 tile.
##	BO is pointed at B (TRMM: optionally offset by KK, together with AO),
##	accumulators are zeroed, A1 and B1/B2 are preloaded, and L receives
##	the trip count of the 2-step K loop (K >> 1, or TEMP >> 1 for TRMM).
.L211:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	BO, B				#	Reset	B
#else
	dsll	L, KK, BASE_SHIFT		#	KK * 1 element of A
	dsll	TEMP, KK, 1 + BASE_SHIFT	#	KK * 2 elements of B

	daddu	AO, AO, L
	daddu	BO, B, TEMP
#endif

	MTC		$0, C11				#	CLEAR RESULTS REGISTERS
	MOV		C12, C11		
	LD		A1, 0 * SIZE(AO)

	MOV		C21, C11
	MOV		C22, C11

	MOV		C31, C11
	MOV		C32, C11
	LD		B1, 0 * SIZE(BO)

	MOV		C41, C11
	MOV		C42, C11
	LD		B2, 1 * SIZE(BO)

	MOV		C43, C11
	MOV		C44, C11
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 1
#else
	daddiu	TEMP, KK, 2
#endif
	dsra	L, TEMP, 1			#	trip count of the 2-step K loop
	blez	L, .L212
	NOP

#else
	move	BO, B				#	Reset	B
	dsra	L, K, 1				#	UnRoll	K=2

	MTC		$0, C11				#	CLEAR RESULTS REGISTERS
	MOV		C12, C11		
	LD		A1, 0 * SIZE(AO)

	MOV		C21, C11
	MOV		C22, C11

	MOV		C31, C11
	MOV		C32, C11
	LD		B1, 0 * SIZE(BO)

	MOV		C41, C11
	MOV		C42, C11
	LD		B2, 1 * SIZE(BO)

	MOV		C43, C11
	blez	L, .L212
	MOV		C44, C11			#	sits in the branch delay slot
#endif

	.align	4
##	.L2110: main loop of the 1 x 2 tile, two K steps per iteration.
##	C11 accumulates the first column and C12 the second. Each pass
##	consumes 2 elements of A and 4 elements of B.
.L2110:
	daddiu	L, L, -1
	MADD	C11, C11, A1, B1
	LD		A2, 1 * SIZE(AO)

	MADD	C12, C12, A1, B2
	LD		B3, 2 * SIZE(BO)

	LD		B4, 3 * SIZE(BO)
	daddiu	AO, AO, 2 * SIZE
	daddiu	BO, BO, 4 * SIZE

	MADD	C11, C11, A2, B3		#	second K step starts here
	LD		A1, 0 * SIZE(AO)

	MADD	C12, C12, A2, B4
	LD		B1, 0 * SIZE(BO)

	bgtz	L, .L2110
	LD		B2, 1 * SIZE(BO)		#	sits in the branch delay slot


	.align	4
##	.L212: 1 x 2 tile, handle the last single K step when bit 0 of the
##	K count (K for GEMM, TEMP for TRMM) is set. ALPHA is loaded from
##	152($sp) in the delay slot of the branch, so it is available on both
##	paths into .L210.
.L212:
#ifndef TRMMKERNEL
	andi	L, K, 1
#else
	andi	L, TEMP, 1
#endif
	blez	L, .L210
	LD		ALPHA, 152($sp)

	MADD	C11, C11, A1, B1
	MADD	C12, C12, A1, B2
	daddiu	AO, AO, 1 * SIZE
	daddiu	BO, BO, 2 * SIZE


	.align	4
##	.L210: write back the 1 x 2 tile (one element in CO1 and CO2).
##	GEMM path: load C, combine with MADD and ALPHA, store.
##	TRMM path: scale with MUL by ALPHA and store, then advance AO, BO
##	and KK for the next tile.
.L210:							#	Write Back
#ifndef TRMMKERNEL
	LD		A1, 0 * SIZE(CO1)

	MADD		A1, A1, C11, ALPHA
	LD		B1, 0 * SIZE(CO2) 
	
	MADD		B1, B1, C12, ALPHA
	ST		A1, 0 * SIZE(CO1)

	ST		B1, 0 * SIZE(CO2)

	daddiu	CO1, CO1, 1 * SIZE
	daddiu	CO2, CO2, 1 * SIZE
#else

	MUL		A1, C11, ALPHA
	MUL		B1, C12, ALPHA

	ST		A1, 0 * SIZE(CO1)
	ST		B1, 0 * SIZE(CO2)

	daddiu	CO1, CO1, 1 * SIZE
	daddiu	CO2, CO2, 1 * SIZE
#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, K, KK			#	TEMP = K - KK
#ifdef	LEFT
	daddiu	TEMP, TEMP, -1
#else
	daddiu	TEMP, TEMP, -2
#endif
	dsll	L, TEMP, BASE_SHIFT		#	TEMP * 1 element of A
	dsll	TEMP, TEMP, 1 + BASE_SHIFT	#	TEMP * 2 elements of B

	daddu	AO, AO, L
	daddu	BO, BO, TEMP
#endif

#ifdef	LEFT
	daddiu	KK, KK, 1
#endif
#endif


	.align	4
##	.L20: end of the two-column panel. For a right-side TRMM build KK
##	advances by 2; B is moved up to the position BO has reached. There is
##	no loop here because the panel runs at most once (N & 2).
.L20:
#if defined(TRMMKERNEL) && !defined(LEFT)
	daddiu  KK, KK, 2
#endif
	move	B, BO



	.align	4
##	.L1: single-column panel, entered only when bit 0 of N is set;
##	otherwise control goes to .L999 (defined outside this excerpt).
.L1:
	andi	J, N, 1
	blez	J, .L999
	NOP

##	.L18: panel setup. I = M >> 3 counts the 8-row tiles and AO restarts
##	at A. This panel addresses C through the C register directly.
.L18:
	dsra	I, M, 3				#	MR=8
	move	AO, A				#	Reset A

#if defined(TRMMKERNEL) &&  defined(LEFT)
	move	KK, OFFSET
#endif
	blez	I, .L14
	NOP


	.align	4
##	.L181: set up one 8 x 1 tile.
##	BO is pointed at B (TRMM: optionally offset by KK, together with AO),
##	accumulators are zeroed, A1..A8 and B1 are preloaded, and L receives
##	the trip count of the 2-step K loop (K >> 1, or TEMP >> 1 for TRMM).
.L181:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	BO, B				#	Reset	B
#else
	dsll	L, KK, 3 + BASE_SHIFT		#	KK * 8 elements of A
	dsll	TEMP, KK, BASE_SHIFT		#	KK * 1 element of B

	daddu	AO, AO, L
	daddu	BO, B, TEMP
#endif

	MTC		$0, C11				#	CLEAR RESULTS REGISTERS
	LD		A1, 0 * SIZE(AO)

	MOV		C12, C11		
	LD		A2, 1 * SIZE(AO)

	MOV		C21, C11
	LD		A3, 2 * SIZE(AO)

	MOV		C22, C11
	LD		A4, 3 * SIZE(AO)

	MOV		C31, C11
	LD		A5, 4 * SIZE(AO)

	MOV		C32, C11
	LD		A6, 5 * SIZE(AO)

	MOV		C41, C11
	LD		B1, 0 * SIZE(BO)

	MOV		C42, C11
	LD		A7, 6 * SIZE(AO)

	MOV		C13, C11
	LD		A8, 7 * SIZE(AO)

	MOV		C14, C11

	MOV		C23, C11
	MOV		C24, C11
	
	MOV		C33, C11
	MOV		C34, C11
	
	MOV		C43, C11
	MOV		C44, C11
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 8
#else
	daddiu	TEMP, KK, 1
#endif
	dsra	L, TEMP, 1			#	trip count of the 2-step K loop
	blez	L, .L182
	NOP

#else
	move	BO, B				#	Reset	B
	dsra	L, K, 1				#	UnRoll	K=2

	MTC		$0, C11				#	CLEAR RESULTS REGISTERS
	LD		A1, 0 * SIZE(AO)

	MOV		C12, C11		
	LD		A2, 1 * SIZE(AO)

	MOV		C21, C11
	LD		A3, 2 * SIZE(AO)

	MOV		C22, C11
	LD		A4, 3 * SIZE(AO)

	MOV		C31, C11
	LD		A5, 4 * SIZE(AO)

	MOV		C32, C11
	LD		A6, 5 * SIZE(AO)

	MOV		C41, C11
	LD		B1, 0 * SIZE(BO)

	MOV		C42, C11
	LD		A7, 6 * SIZE(AO)

	MOV		C13, C11
	LD		A8, 7 * SIZE(AO)

	MOV		C14, C11

	MOV		C23, C11
	MOV		C24, C11
	
	MOV		C33, C11
	MOV		C34, C11
	
	MOV		C43, C11
	blez	L, .L182
	MOV		C44, C11			#	sits in the branch delay slot
#endif


	.align	4
##	.L1810: main loop of the 8 x 1 tile, two K steps per iteration.
##	C11..C41 accumulate rows 0..3 and C13..C43 rows 4..7 (matches the
##	write-back in .L180). Each pass consumes 16 elements of A and 2
##	elements of B. Registers B5..B8 are borrowed to hold A elements
##	8..11 (rows 0..3 of the second K step) while A1..A4 are reloaded
##	with A elements 12..15.
.L1810:
	daddiu	L, L, -1
	MADD	C11, C11, A1, B1
	LD		B5, 8 * SIZE(AO)

	MADD	C21, C21, A2, B1
	LD		B6, 9 * SIZE(AO)

	MADD	C31, C31, A3, B1
	LD		B7, 10 * SIZE(AO)

	MADD	C41, C41, A4, B1
	LD		B8, 11 * SIZE(AO)

	MADD	C13, C13, A5, B1
	LD		B2, 1 * SIZE(BO)
	daddiu	BO, BO, 2 * SIZE

	MADD	C23, C23, A6, B1
	LD		A1, 12 * SIZE(AO)

	MADD	C33, C33, A7, B1
	LD		A2, 13 * SIZE(AO)

	MADD	C43, C43, A8, B1
	LD		A3, 14 * SIZE(AO)

	LD		A4, 15 * SIZE(AO)
	daddiu	AO, AO, 16 * SIZE

	MADD	C11, C11, B5, B2		#	second K step starts here
	LD		A5, 4 * SIZE(AO)

	MADD	C21, C21, B6, B2
	LD		A6, 5 * SIZE(AO)

	MADD	C13, C13, A1, B2
	LD		A7, 6 * SIZE(AO)

	MADD	C23, C23, A2, B2
	LD		A8, 7 * SIZE(AO)

	MADD	C33, C33, A3, B2
	LD		B1, 0 * SIZE(BO)

	MADD	C43, C43, A4, B2
	LD		A1, 0 * SIZE(AO)

	MADD	C31, C31, B7, B2
	LD		A2, 1 * SIZE(AO)

	MADD	C41, C41, B8, B2
	LD		A3, 2 * SIZE(AO)

	bgtz	L, .L1810
	LD		A4, 3 * SIZE(AO)		#	sits in the branch delay slot

	.align	4
##	.L182: 8 x 1 tile, handle the last single K step when bit 0 of the
##	K count (K for GEMM, TEMP for TRMM) is set. ALPHA is loaded from
##	152($sp) in the delay slot of the branch, so it is available on both
##	paths into .L180.
.L182:
#ifndef TRMMKERNEL
	andi	L, K, 1
#else
	andi	L, TEMP, 1
#endif
	blez	L, .L180
	LD		ALPHA, 152($sp)

	MADD	C13, C13, A5, B1
	MADD	C23, C23, A6, B1
	MADD	C33, C33, A7, B1
	MADD	C43, C43, A8, B1
	daddiu	AO, AO, 8 * SIZE

	MADD	C11, C11, A1, B1
	MADD	C21, C21, A2, B1
	MADD	C31, C31, A3, B1
	MADD	C41, C41, A4, B1
	daddiu	BO, BO, 1 * SIZE


	.align	4
##	.L180: write back the 8 x 1 tile through the C pointer, decrement the
##	tile counter I and loop to .L181 while it is positive.
##	GEMM path: load C, combine with MADD and ALPHA, store.
##	TRMM path: scale with MUL by ALPHA and store, then advance AO, BO
##	and KK for the next tile.
.L180:							#	Write Back
#ifndef TRMMKERNEL
	daddiu	I, I, -1

	LD		A1, 0 * SIZE(C)
	LD		A2, 1 * SIZE(C)
	LD		A3, 2 * SIZE(C)
	LD		A4, 3 * SIZE(C)
	LD		A5, 4 * SIZE(C)
	LD		A6, 5 * SIZE(C)
	LD		A7, 6 * SIZE(C)
	LD		A8, 7 * SIZE(C)

	MADD		A1, A1, C11, ALPHA
	MADD		A2, A2, C21, ALPHA
	MADD		A3, A3, C31, ALPHA
	MADD		A4, A4, C41, ALPHA
	MADD		A5, A5, C13, ALPHA
	MADD		A6, A6, C23, ALPHA
	MADD		A7, A7, C33, ALPHA
	MADD		A8, A8, C43, ALPHA

	ST		A1, 0 * SIZE(C)
	ST		A2, 1 * SIZE(C)
	ST		A3, 2 * SIZE(C)
	ST		A4, 3 * SIZE(C)
	ST		A5, 4 * SIZE(C)
	ST		A6, 5 * SIZE(C)
	ST		A7, 6 * SIZE(C)
	ST		A8, 7 * SIZE(C)

	daddiu	C, C, 8 * SIZE
	bgtz	I, .L181
	NOP
#else
	daddiu	I, I, -1

	MUL		A1, C11, ALPHA
	MUL		A2, C21, ALPHA
	MUL		A3, C31, ALPHA
	MUL		A4, C41, ALPHA
	MUL		A5, C13, ALPHA
	MUL		A6, C23, ALPHA
	MUL		A7, C33, ALPHA
	MUL		A8, C43, ALPHA

	ST		A1, 0 * SIZE(C)
	ST		A2, 1 * SIZE(C)
	ST		A3, 2 * SIZE(C)
	ST		A4, 3 * SIZE(C)
	ST		A5, 4 * SIZE(C)
	ST		A6, 5 * SIZE(C)
	ST		A7, 6 * SIZE(C)
	ST		A8, 7 * SIZE(C)

#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, K, KK			#	TEMP = K - KK

#ifdef LEFT
	daddiu	TEMP, TEMP, -8
#else
	daddiu	TEMP, TEMP, -1
#endif

	dsll	L, TEMP, 3 + BASE_SHIFT		#	TEMP * 8 elements of A
	dsll	TEMP, TEMP, BASE_SHIFT		#	TEMP * 1 element of B

	daddu	AO, AO, L
	daddu	BO, BO, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 8
#endif

	daddiu	C, C, 8 * SIZE
	bgtz	I, .L181
	NOP
#endif

	.align	4
##	.L14: run the 4 x 1 tile only when bit 2 of M is set.
.L14:
	andi	I, M, 4				#	MR=4
	blez	I, .L12
	NOP

	.align	4
##	.L141: set up the 4 x 1 tile.
##	BO is pointed at B (TRMM: optionally offset by KK, together with AO),
##	accumulators are zeroed, A1..A4 and B1 are preloaded, and L receives
##	the trip count of the 2-step K loop (K >> 1, or TEMP >> 1 for TRMM).
.L141:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move 	BO, B
#else
	dsll	L, KK, 2 + BASE_SHIFT		#	KK * 4 elements of A
	dsll	TEMP, KK, BASE_SHIFT		#	KK * 1 element of B

	daddu	AO, AO, L
	daddu	BO, B, TEMP
#endif
	MTC		$0, C11				#	CLEAR RESULTS REGISTERS
	MOV		C12, C11		
	LD		A1, 0 * SIZE(AO)

	MOV		C21, C11
	MOV		C22, C11
	LD		A2, 1 * SIZE(AO)

	MOV		C31, C11
	MOV		C32, C11
	LD		A3, 2 * SIZE(AO)

	MOV		C41, C11
	MOV		C42, C11
	LD		A4, 3 * SIZE(AO)

	MOV		C13, C11
	MOV		C14, C11
	LD		B1, 0 * SIZE(BO)

	MOV		C23, C11
	MOV		C24, C11

	MOV		C33, C11
	MOV		C34, C11
	
	MOV		C43, C11
	MOV		C44, C11
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif	defined(LEFT)
	daddiu	TEMP, KK, 4
#else
	daddiu	TEMP, KK, 1
#endif
	dsra	L, TEMP, 1			#	trip count of the 2-step K loop
	blez	L, .L142
	NOP

#else
	move	BO, B				#	Reset	B
	dsra	L, K, 1				#	UnRoll	K=2

	MTC		$0, C11				#	CLEAR RESULTS REGISTERS
	MOV		C12, C11		
	LD		A1, 0 * SIZE(AO)

	MOV		C21, C11
	MOV		C22, C11
	LD		A2, 1 * SIZE(AO)

	MOV		C31, C11
	MOV		C32, C11
	LD		A3, 2 * SIZE(AO)

	MOV		C41, C11
	MOV		C42, C11
	LD		A4, 3 * SIZE(AO)

	MOV		C13, C11
	MOV		C14, C11
	LD		B1, 0 * SIZE(BO)

	MOV		C23, C11
	MOV		C24, C11

	MOV		C33, C11
	MOV		C34, C11
	
	MOV		C43, C11
	blez	L, .L142
	MOV		C44, C11			#	sits in the branch delay slot
#endif

	.align	4
.L1410:
	daddiu	L, L, -1
	MADD	C11, C11, A1, B1
	LD		A5, 4 * SIZE(AO)

	MADD	C21, C21, A2, B1
	LD		B3, 1 * SIZE(BO)

	MADD	C31, C31, A3, B1
	LD		A6, 5 * SIZE(AO)
	daddiu	BO, BO, 2 * SIZE

	MADD	C41, C41, A4, B1
	LD		A7, 6 * SIZE(AO)

	LD		A8, 7 * SIZE(AO)
	daddiu	AO, AO, 8 * SIZE


	MADD	C11, C11, A5, B3
	LD		A1, 0 * SIZE(AO)

	MADD	C21, C21, A6, B3
	LD		B1, 0 * SIZE(BO)

	MADD	C31, C31, A7, B3
	LD		A2, 1 * SIZE(AO)

	MADD	C41, C41, A8, B3
	LD		A3, 2 * SIZE(AO)

	bgtz	L, .L1410
	LD		A4, 3 * SIZE(AO)

	.align	4
.L142:
#ifndef TRMMKERNEL
	andi	L, K, 1
#else
	andi	L, TEMP, 1
#endif
	blez	L, .L140
	LD		ALPHA, 152($sp)

	MADD	C11, C11, A1, B1
	MADD	C21, C21, A2, B1
	MADD	C31, C31, A3, B1
	MADD	C41, C41, A4, B1
	daddiu	AO, AO, 4 * SIZE
	daddiu	BO, BO, 1 * SIZE


	.align	4
.L140:							#	Write Back
#ifndef TRMMKERNEL
	LD		A1, 0 * SIZE(C)
	LD		A2, 1 * SIZE(C)
	LD		A3, 2 * SIZE(C)
	LD		A4, 3 * SIZE(C)

	MADD		A1, A1, C11, ALPHA
	MADD		A2, A2, C21, ALPHA
	MADD		A3, A3, C31, ALPHA
	MADD		A4, A4, C41, ALPHA

	ST		A1, 0 * SIZE(C)
	ST		A2, 1 * SIZE(C)
	ST		A3, 2 * SIZE(C)
	ST		A4, 3 * SIZE(C)
	daddiu	C, C, 4 * SIZE
#else
	MUL		A1, C11, ALPHA
	MUL		A2, C21, ALPHA
	MUL		A3, C31, ALPHA
	MUL		A4, C41, ALPHA

	ST		A1, 0 * SIZE(C)
	ST		A2, 1 * SIZE(C)
	ST		A3, 2 * SIZE(C)
	ST		A4, 3 * SIZE(C)
	daddiu	C, C, 4 * SIZE

#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, K, KK
#ifdef	LEFT
	daddiu	TEMP, TEMP, -4
#else
	daddiu	TEMP, TEMP, -1
#endif

	dsll	L, TEMP, 2 + BASE_SHIFT
	dsll	TEMP, TEMP, BASE_SHIFT
	daddu	AO, AO, L
	daddu	BO, BO, TEMP
#endif

#ifdef	LEFT
	daddiu	KK, KK, 4
#endif
#endif

	.align	4
##	Two-row remainder strip.  It runs only when bit 1 of the row count
##	is set; otherwise control goes to .L11.  Per K step it multiplies
##	two consecutive values read through AO by one value read through
##	BO and accumulates into C11 and C21.
.L12:
	andi	I, M, 2
	blez	I, .L11
	NOP

	.align	4
.L121:
#if defined(TRMMKERNEL)
##	TRMM build: place AO and BO according to the KK offset.
#if (defined(LEFT) &&  defined(TRANSA)) ||\
	(!defined(LEFT) && !defined(TRANSA))
	move	BO, B				#	Reset	B
#else
##	Skip KK steps: two elements per step through AO, one through BO.
	dsll	L, KK, 1 + BASE_SHIFT
	dsll	TEMP, KK, BASE_SHIFT

	daddu	AO, AO, L
	daddu	BO, B, TEMP
#endif

##	Zero the accumulators and preload the first operands.  Only C11
##	and C21 are read later in this strip.
	MTC		$0, C11				#	Clear result registers
	MOV		C12, C11		
	LD		A1, 0 * SIZE(AO)

	MOV		C21, C11
	MOV		C22, C11
	LD		A2, 1 * SIZE(AO)

	MOV		C31, C11
	MOV		C32, C11
	LD		B1, 0 * SIZE(BO)

	MOV		C41, C11
	MOV		C42, C11

	MOV		C43, C11
	MOV		C44, C11
##	TEMP = number of K steps this strip has to run in the TRMM build.
#if (defined(LEFT) && !defined(TRANSA)) ||\
		(!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 2
#else
	daddiu	TEMP, KK, 1
#endif
##	Loop count is TEMP / 2 because every pass of .L1210 covers two steps.
	dsra	L, TEMP, 1
	blez	L, .L122
	NOP

#else
##	Plain GEMM build: start again at the head of the packed B panel.
	move	BO, B				#	Reset	B
	dsra	L, K, 1				#	K loop unrolled by 2

	MTC		$0, C11				#	Clear result registers
	MOV		C12, C11		
	LD		A1, 0 * SIZE(AO)

	MOV		C21, C11
	MOV		C22, C11
	LD		A2, 1 * SIZE(AO)

	MOV		C31, C11
	MOV		C32, C11
	LD		B1, 0 * SIZE(BO)

	MOV		C41, C11
	MOV		C42, C11

	MOV		C43, C11
##	NOTE(review): the MOV after this branch is presumably its delay
##	slot (noreorder mode assumed from the explicit NOPs) -- confirm.
	blez	L, .L122
	MOV		C44, C11
#endif

	.align	4
##	Main loop, two K steps per pass.  The first step uses A1, A2 with
##	B1; the second uses A3, A4 with B3.  AO advances four elements
##	and BO two elements per pass, and the operands for the next pass
##	are reloaded before the branch completes.
.L1210:
	daddiu	L, L, -1
	MADD	C11, C11, A1, B1
	LD		B3, 1 * SIZE(BO)

	MADD	C21, C21, A2, B1
	daddiu	BO, BO, 2 * SIZE

	LD		A3, 2 * SIZE(AO)
	LD		A4, 3 * SIZE(AO)
	daddiu	AO, AO, 4 * SIZE

	MADD	C11, C11, A3, B3
	LD		B1, 0 * SIZE(BO)

	MADD	C21, C21, A4, B3
	LD		A1, 0 * SIZE(AO)
	bgtz	L, .L1210
	LD		A2, 1 * SIZE(AO)


	.align	4
##	Leftover single K step, selected by the low bit of the step count.
.L122:
#ifndef TRMMKERNEL
	andi	L, K, 1
#else
	andi	L, TEMP, 1
#endif
##	ALPHA is read from stack offset 152; presumably the prologue
##	stored it there (prologue is outside this view) -- TODO confirm.
	blez	L, .L120
	LD		ALPHA, 152($sp)

	MADD	C11, C11, A1, B1
	MADD	C21, C21, A2, B1
	daddiu	AO, AO, 2 * SIZE
	daddiu	BO, BO, 1 * SIZE


	.align	4
.L120:							#	Write Back
#ifndef TRMMKERNEL
##	GEMM: read two values at the output pointer, add the scaled
##	accumulators, store them back.
	LD		A1, 0 * SIZE(C)
	LD		A2, 1 * SIZE(C)

	MADD		A1, A1, C11, ALPHA
	MADD		A2, A2, C21, ALPHA

	ST		A1, 0 * SIZE(C)
	ST		A2, 1 * SIZE(C)

	daddiu	C, C, 2 * SIZE
#else
##	TRMM: the output is overwritten with the scaled accumulators.
	MUL		A1, C11, ALPHA
	MUL		A2, C21, ALPHA

	ST		A1, 0 * SIZE(C)
	ST		A2, 1 * SIZE(C)

	daddiu	C, C, 2 * SIZE
#if ( defined(LEFT) &&  defined(TRANSA))||\
		(!defined(LEFT) && !defined(TRANSA))
##	Advance AO and BO over the K steps that this strip did not run:
##	TEMP = K - KK - 2 with LEFT, K - KK - 1 otherwise.
	dsubu	TEMP, K, KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -2
#else
	daddiu	TEMP, TEMP, -1
#endif
	dsll	L, TEMP, 1 + BASE_SHIFT
	dsll	TEMP, TEMP, BASE_SHIFT

	daddu	AO, AO, L
	daddu	BO, BO, TEMP
#endif

#ifdef	LEFT
##	Two more rows done: move the KK offset forward by two.
	daddiu	KK, KK, 2
#endif
#endif

	.align	4
##	Single-row remainder strip.  It runs only when bit 0 of the row
##	count is set; otherwise control goes to .L10.  Per K step it
##	multiplies one value read through AO by one value read through BO
##	and accumulates into C11.
.L11:
	andi	I, M, 1
	blez	I, .L10
	NOP

	.align	4
.L111:
#if defined(TRMMKERNEL)
##	TRMM build: place AO and BO according to the KK offset.
#if (defined(LEFT) &&  defined(TRANSA))||\
	(!defined(LEFT) && !defined(TRANSA))
	move	BO, B
#else
##	Skip KK steps; both panels advance one element per step here, so
##	the same byte offset is applied to AO and to BO.
	dsll	L, KK, BASE_SHIFT
	daddu	AO, AO, L
	daddu	BO, B, L
#endif
##	Zero the accumulators and preload the first operands.  Only C11
##	is read later in this strip.
	MTC		$0, C11				#	Clear result registers
	MOV		C12, C11		
	LD		A1, 0 * SIZE(AO)

	MOV		C21, C11
	MOV		C22, C11
	LD		B1, 0 * SIZE(BO)

	MOV		C31, C11
	MOV		C32, C11
##	TEMP = number of K steps this strip has to run in the TRMM build.
##	The elif and else arms below emit the same instruction.
#if (defined(LEFT) && !defined(TRANSA))||\
		(!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 1
#else
	daddiu	TEMP, KK, 1
#endif
##	Loop count is TEMP / 2 because every pass of .L1110 covers two steps.
	dsra	L, TEMP, 1
	blez	L, .L112
	NOP

#else
##	Plain GEMM build: start again at the head of the packed B panel.
	move	BO, B				#	Reset	B
	dsra	L, K, 1				#	K loop unrolled by 2

	MTC		$0, C11				#	Clear result registers
	MOV		C12, C11		
	LD		A1, 0 * SIZE(AO)

	MOV		C21, C11
	MOV		C22, C11
	LD		B1, 0 * SIZE(BO)

	MOV		C31, C11
##	NOTE(review): the MOV after this branch is presumably its delay
##	slot (noreorder mode assumed from the explicit NOPs) -- confirm.
	blez	L, .L112
	MOV		C32, C11
#endif


	.align	4
##	Main loop, two K steps per pass: A1 with B1, then A2 with B2.
##	AO and BO both advance two elements per pass and the operands
##	for the next pass are reloaded before the loop branch.
.L1110:
	daddiu	L, L, -1
	MADD	C11, C11, A1, B1

	LD		A2, 1 * SIZE(AO)
	LD		B2, 1 * SIZE(BO)

	daddiu	AO, AO, 2 * SIZE
	daddiu	BO, BO, 2 * SIZE

	MADD	C11, C11, A2, B2
	LD		A1, 0 * SIZE(AO)
	LD		B1, 0 * SIZE(BO)

	bgtz	L, .L1110
	NOP


	.align	4
##	Leftover single K step, selected by the low bit of the step count.
.L112:
#ifndef TRMMKERNEL
	andi	L, K, 1
#else
	andi	L, TEMP, 1
#endif
##	ALPHA is read from stack offset 152; presumably the prologue
##	stored it there (prologue is outside this view) -- TODO confirm.
	blez	L, .L110
	LD		ALPHA, 152($sp)

	MADD	C11, C11, A1, B1
	daddiu	AO, AO, 1 * SIZE
	daddiu	BO, BO, 1 * SIZE


	.align	4
.L110:							#	Write Back
#ifndef TRMMKERNEL
##	GEMM: read one value at the output pointer, add the scaled
##	accumulator, store it back.
	LD		A1, 0 * SIZE(C)

	MADD		A1, A1, C11, ALPHA
	
	ST		A1, 0 * SIZE(C)

	daddiu	C, C, 1 * SIZE
#else
##	TRMM: the output is overwritten with the scaled accumulator.
##	Unlike the wider strips, no AO / BO / KK adjustment follows the
##	store here; the code below goes straight to the exit path.
	MUL		A1, C11, ALPHA
	
	ST		A1, 0 * SIZE(C)

	daddiu	C, C, 1 * SIZE

#endif

	.align	4
##	All row strips are finished: copy the working pointer BO back into B.
.L10:
	move	B, BO
	NOP

##	Common exit.  Reload every register kept in the stack frame (the
##	matching stores are presumably in the prologue, outside this view),
##	release the frame of STACKSIZE bytes and return through $31.
##	The reloads are independent of one another, so they are listed from
##	the highest frame offset down to the lowest.
.L999:
#if !defined(__64BIT__)
##	Extra floating-point slots used only when __64BIT__ is not defined.
	LD	$f23, 144($sp)
	LD	$f22, 136($sp)
	LD	$f21, 128($sp)
	LD	$f20, 120($sp)
#endif

#ifdef TRMMKERNEL
##	Integer registers used as OFFSET, KK and TEMP in the TRMM build.
	ld	$25, 112($sp)
	ld	$24, 104($sp)
	ld	$23,  96($sp)
#endif

##	Floating-point registers $f24..$f28.
	LD	$f28, 88($sp)
	LD	$f27, 80($sp)
	LD	$f26, 72($sp)
	LD	$f25, 64($sp)
	LD	$f24, 56($sp)

##	Integer registers $16..$22.
	ld	$22, 48($sp)
	ld	$21, 40($sp)
	ld	$20, 32($sp)
	ld	$19, 24($sp)
	ld	$18, 16($sp)
	ld	$17,  8($sp)
	ld	$16,  0($sp)

##	Drop the frame and return; the nop follows the jump.
	daddiu	$sp, $sp, STACKSIZE
	j	$31
	nop

	EPILOGUE
#	.set	macro
#	.set	reorder
#	.end	gemm
#	.size	gemm, .-gemm
#	.ident	"GCC: (Debian 4.4.6-6) 4.4.6"