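/* Complex matrix-multiply micro-kernel, 2x2 register block, MIPS64 assembly (also built as a TRMM kernel when TRMMKERNEL is defined). */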
#define ASSEMBLER
#include "common.h"

/* FETCH is only ever used below with $0 as the destination, so the loaded
 * value is discarded; the access just touches the addressed line.          */
#define FETCH	ld

/* Hand-encoded paired FP load (gsLQC1) and store (gsSQC1), emitted as raw
 * .word values.
 *   base   : NUMBER of the integer base register (see R12 / R13 below)
 *   fq, ft : NUMBERS of the two FP registers (see F0..F31 below); at the
 *            call sites ft receives the first (real) value and fq the
 *            second (imaginary) value
 *   offset : index of the register pair relative to base, not a byte count
 *            (the unrolled loops use 0..7 while advancing by 16 * SIZE)
 * Every argument is parenthesized so that an expression argument cannot
 * change the encoding through operator precedence.                         */
#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|(base)<<21|(ft)<<16|0x1<<15|(offset)<<6|0x1<<5|(fq))
#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|(base)<<21|(ft)<<16|0x1<<15|(offset)<<6|0x1<<5|(fq))

/* Stack frame size in bytes; see the save area layout in the prologue.     */
#define STACKSIZE	160

/* Kernel arguments that arrive in integer registers.                       */
#define M	$4
#define	N	$5
#define	K	$6
#define A	$9
#define B	$10
#define C	$11
#define LDC	$8			/* loaded from the caller stack in the prologue */

/* Running pointers into the A and B operands.                              */
#define AO	$12
#define BO	$13

/* Register NUMBERS of AO and BO for the hand-encoded gsLQC1 loads.  These
 * must stay in sync with the AO / BO definitions above.                    */
#define	R12	12
#define	R13	13

/* Loop counters: I walks M, J walks N, L walks K.                          */
#define I	$2
#define J	$3
#define L	$7

/* Output pointers for the two C columns, and the prefetch pointers.        */
#define CO1	$14
#define CO2	$15
#define PREA	$16
#define PREB	$17

#if defined(TRMMKERNEL)
#define OFFSET	$18
#define KK	$19
#define TEMP	$20
#endif

/* FP operand registers.  a1..a4 / b1..b4 and a5..a8 / b5..b8 form two
 * register sets that the unrolled loops alternate between.                 */
#define a1	$f0
#define a2	$f1
#define a3	$f2
#define a4	$f3

#define b1	$f4
#define b2	$f5
#define b3	$f6
#define b4	$f7

#define a5	$f8
#define a6	$f9
#define a7	$f10
#define a8	$f11

/* NOTE: b7 / b8 share $f15 / $f16 with ALPHA_R / ALPHA_I (defined below).
 * The kernel therefore spills alpha to the stack before the inner loops
 * and reloads it before the write-back.                                    */
#define b5	$f12
#define b6	$f13
#define b7	$f15
#define b8	$f16

/* Accumulators.  cXY: X selects the A-row / B-column combination, Y the
 * partial product (1: a*c, 2: b*c, 3: a*d, 4: b*d, matching MADD1..MADD4). */
#define c11	$f14
#define c12	$f17
#define c13	$f18
#define c14	$f19
#define c21	$f20
#define c22	$f21
#define c23	$f22
#define c24	$f23
#define c31	$f24
#define c32	$f25
#define c33	$f26
#define c34	$f27
#define c41	$f28
#define c42	$f29
#define c43	$f30
#define c44	$f31

/* FP register NUMBERS for the hand-encoded gsLQC1 / gsSQC1 macros.         */
#define F0	0
#define F1	1
#define F2	2
#define F3	3
#define F4	4
#define F5	5
#define	F6	6
#define	F7	7
#define	F8	8
#define	F9	9
#define	F10	10
#define	F11	11
#define	F12	12
#define	F13	13
#define	F14	14
#define	F15	15
#define	F16	16
#define	F17	17
#define	F18	18
#define	F19	19
#define	F20	20
#define	F21	21
#define	F22	22
#define	F23	23
#define	F24	24
#define	F25	25
#define	F26	26
#define	F27	27
#define	F28	28
#define	F29	29
#define	F30	30
#define	F31	31

/* Complex scale factor as received on entry (aliases b7 / b8, see above).  */
#define ALPHA_R	$f15
#define ALPHA_I	$f16

/*
 * Partial products of (a + b*i) * (c + d*i), where a/b come from the A
 * operand and c/d from the B operand:
 *	MADD1	a*c
 *	MADD2	b*c
 *	MADD3	a*d
 *	MADD4	b*d
 * The variant macro (NN ... CC) picks add or subtract for each product.
 */
#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define MADD1	  MADD
#define MADD2	  MADD
#define MADD3	  MADD
#define MADD4	  NMSUB
#endif

#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define MADD1	  MADD
#define MADD2	  MADD
#define MADD3	  NMSUB
#define MADD4	  MADD
#endif

#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define MADD1	  MADD
#define MADD2	  NMSUB
#define MADD3	  MADD
#define MADD4	  MADD
#endif

#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
#define MADD1	  MADD
#define MADD2	  NMSUB
#define MADD3	  NMSUB
#define MADD4	  NMSUB
#endif

/* Fail early with a readable message instead of an assembler error on the
 * first MADDn use when no variant was selected on the command line.        */
#if !defined(MADD1) || !defined(MADD2) || !defined(MADD3) || !defined(MADD4)
#error "no variant macro (NN .. CC) selected: MADD1..MADD4 are undefined"
#endif

	PROLOGUE
	
	LDARG	LDC,   0($sp)
	daddiu	$sp, $sp, -STACKSIZE

	SDARG	$16,   0($sp)
	SDARG	$17,   8($sp)
	sdc1	$f24, 16($sp)
	sdc1	$f25, 24($sp)
	sdc1	$f26, 32($sp)
	sdc1	$f27, 40($sp)
	sdc1	$f28, 48($sp)
	sdc1	$f29, 56($sp)

#if defined(TRMMKERNEL)
	SDARG	$18,  64($sp)
	SDARG	$19,  72($sp)
	SDARG	$20,  80($sp)

	LDARG	OFFSET, STACKSIZE + 8($sp)
#endif

#ifndef __64BIT__
	sdc1	$f20, 88($sp)
	sdc1	$f21, 96($sp)
	sdc1	$f22,104($sp)
	sdc1	$f23,112($sp)
#endif

	dsra	J,  N, 1							#	J=N/2
	ST	ALPHA_R, 128($sp)						#	store alpha_r & alpha_i
#if defined(TRMMKERNEL) && !defined(LEFT)
	neg	KK, OFFSET
#endif

	dsll	LDC, LDC, ZBASE_SHIFT				#	LDC*SIZE*COMPSIZE
	blez	J, .L20
	ST	ALPHA_I, 136($sp)


	.align	5
.L10:
#if defined(TRMMKERNEL) &&  defined(LEFT)
	move	KK, OFFSET
#endif

	daddiu	J, J, -1							
	dsra	I,  M, 1							#	I=M/2

	dsll	PREB, K, 1 + ZBASE_SHIFT				#	PREA=K*2*2^4
	dsll	PREA, K, 1 + ZBASE_SHIFT				#	PREA=K*2*2^4

	move	CO1, C								#	Fix pointer Cx
	daddu	CO2, C,   LDC

	move	AO, A								#	Reset AO
	blez	I, .L30
	daddu	PREA, PREA, A						#	PREA=A+panel size

.L11:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	BO,  B
#else
	dsll	L,    KK, 1 + ZBASE_SHIFT			#	MR=NR=2
	dsll	TEMP, KK, 1 + ZBASE_SHIFT

	daddu	AO, AO, L
	daddu	BO, B,  TEMP
#endif
	MTC	$0,  c11								#	Clear results regs
	MOV	c12, c11
	gsLQC1(R12, F1, F0, 0)						#	R:a1	I:a2

	MOV	c13, c11
	MOV	c14, c11
	gsLQC1(R13, F5, F4, 0)						#	R:b1	I:b2

	MOV	c21, c11
	MOV	c22, c11
	gsLQC1(R12, F3, F2, 1)						#	R:a3	I:a4
	
	MOV	c23, c11
	MOV	c24, c11
	gsLQC1(R13, F7, F6, 1)						#	R:b2	I:b3

	FETCH	$0, 0 * SIZE(CO2)
	MOV	c31, c11
	MOV	c32, c11

	FETCH	$0, 0 * SIZE(CO1)
	MOV	c33, c11
	MOV	c34, c11
	
	FETCH	$0, 4 * SIZE(CO2)
	MOV	c41, c11
	MOV	c42, c11

	FETCH	$0, 4 * SIZE(CO1)
	MOV	c43, c11
	MOV	c44, c11

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 2
#else
	daddiu	TEMP, KK, 2
#endif
	dsra	L,  TEMP, 2
	daddu	PREB, PREB, B						#	PREA=A+panel size
	blez	L, .L15
	NOP

#else

	dsra	L,  K, 2							#	Unroll K 4 times	
	move	BO,  B

	MTC	$0,  c11								#	Clear results regs
	MOV	c12, c11
	gsLQC1(R12, F1, F0, 0)						#	R:a1	I:a2

	MOV	c13, c11
	MOV	c14, c11
	gsLQC1(R13, F5, F4, 0)						#	R:b1	I:b2

	MOV	c21, c11
	MOV	c22, c11
	gsLQC1(R12, F3, F2, 1)						#	R:a3	I:a4
	
	MOV	c23, c11
	MOV	c24, c11
	gsLQC1(R13, F7, F6, 1)						#	R:b2	I:b3

	FETCH	$0, 0 * SIZE(CO2)
	MOV	c31, c11
	MOV	c32, c11

	FETCH	$0, 0 * SIZE(CO1)
	MOV	c33, c11
	MOV	c34, c11
	
	FETCH	$0, 4 * SIZE(CO2)
	MOV	c41, c11
	MOV	c42, c11

	FETCH	$0, 4 * SIZE(CO1)
	MOV	c43, c11

	daddu	PREB, PREB, B						#	PREA=A+panel size
	blez	L, .L15
	MOV	c44, c11
#endif

	.align	5

.L12:
	gsLQC1(R12, F9, F8, 2)						#	Unroll K=1	
	gsLQC1(R13, F13, F12, 2)				
	MADD1	c11, c11, a1, b1 					#	axc		A1xB1	
	MADD3	c13, c13, a1, b2					#	axd

	gsLQC1(R12, F11, F10, 3)					
	gsLQC1(R13, F16, F15, 3)			
	MADD2	c12, c12, a2, b1					#	bxc
	MADD4	c14, c14, a2, b2					#	bxd

	MADD1	c21, c21, a3, b1 					#	A2xB1
	MADD3	c23, c23, a3, b2

	MADD2	c22, c22, a4, b1
	MADD4	c24, c24, a4, b2

	FETCH	$0, 4 * SIZE(PREA)
	FETCH	$0, 4 * SIZE(PREB)
	MADD1	c31, c31, a1, b3 					#	A1xB2
	MADD3	c33, c33, a1, b4

	MADD2	c32, c32, a2, b3
	MADD4	c34, c34, a2, b4

	MADD1	c41, c41, a3, b3 					#	A2xB2
	MADD3	c43, c43, a3, b4
	MADD2	c42, c42, a4, b3
	MADD4	c44, c44, a4, b4

	gsLQC1(R12, F1, F0, 4)						#	unroll	k=2	
	gsLQC1(R13, F5, F4, 4)					
	MADD1	c11, c11, a5, b5 					#	axc		A1xB1	
	MADD3	c13, c13, a5, b6					#	axd

	MADD2	c12, c12, a6, b5					#	bxc
	MADD4	c14, c14, a6, b6					#	bxd

	gsLQC1(R12, F3, F2, 5)						
	gsLQC1(R13, F7, F6, 5)				
	MADD1	c21, c21, a7, b5 					#	A2xB1
	MADD3	c23, c23, a7, b6

	MADD2	c22, c22, a8, b5
	MADD4	c24, c24, a8, b6

	FETCH	$0, 8 * SIZE(PREA)
	FETCH	$0, 8 * SIZE(PREB)
	MADD1	c31, c31, a5, b7 					#	A1xB2
	MADD3	c33, c33, a5, b8

	MADD2	c32, c32, a6, b7
	MADD4	c34, c34, a6, b8

	MADD1	c41, c41, a7, b7 					#	A2xB2
	MADD3	c43, c43, a7, b8
	MADD2	c42, c42, a8, b7
	MADD4	c44, c44, a8, b8

	gsLQC1(R12, F9, F8, 6)						#	Unroll	K=3
	gsLQC1(R13, F13, F12, 6)				
	MADD1	c11, c11, a1, b1 					#	axc		A1xB1	
	MADD3	c13, c13, a1, b2					#	axd

	gsLQC1(R13, F16, F15, 7)			
	gsLQC1(R12, F11, F10, 7)					
	MADD2	c12, c12, a2, b1					#	bxc
	MADD4	c14, c14, a2, b2					#	bxd

	MADD1	c21, c21, a3, b1 					#	A2xB1
	MADD3	c23, c23, a3, b2
	daddiu	AO, AO, 16 * SIZE 					#	2mr*4kr*cmpx
	daddiu	BO, BO, 16 * SIZE  					#	2nr*4kr*cmpx

	MADD2	c22, c22, a4, b1
	MADD4	c24, c24, a4, b2

	FETCH	$0, 12 * SIZE(PREA)
	MADD1	c31, c31, a1, b3 					#	A1xB2
	MADD3	c33, c33, a1, b4
	daddiu	L, L, -1

	FETCH	$0, 12 * SIZE(PREB)
	MADD2	c32, c32, a2, b3
	MADD4	c34, c34, a2, b4

	MADD1	c41, c41, a3, b3 					#	A2xB2
	MADD3	c43, c43, a3, b4
	daddu	PREA, PREA, 16 * SIZE
	daddu	PREB, PREB, 16 * SIZE

	MADD2	c42, c42, a4, b3
	MADD4	c44, c44, a4, b4

	gsLQC1(R12, F1, F0, 0)						#	R:a1	I:a2	Unroll K=4
	gsLQC1(R13, F5, F4, 0)						#	R:b1	I:b2
	MADD1	c11, c11, a5, b5 					#	axc		A1xB1	
	MADD3	c13, c13, a5, b6					#	axd

	gsLQC1(R12, F3, F2, 1)						#	R:a3	I:a4
	gsLQC1(R13, F7, F6, 1)						#	R:b2	I:b3
	MADD2	c12, c12, a6, b5					#	bxc
	MADD4	c14, c14, a6, b6					#	bxd

	MADD1	c21, c21, a7, b5 					#	A2xB1
	MADD3	c23, c23, a7, b6

	MADD2	c22, c22, a8, b5
	MADD4	c24, c24, a8, b6

	FETCH	$0, 0 * SIZE(PREA)
	FETCH	$0, 0 * SIZE(PREB)
	MADD1	c31, c31, a5, b7 					#	A1xB2
	MADD3	c33, c33, a5, b8

	MADD2	c32, c32, a6, b7
	MADD4	c34, c34, a6, b8

	MADD1	c41, c41, a7, b7 					#	A2xB2
	MADD3	c43, c43, a7, b8

	MADD2	c42, c42, a8, b7
	bgtz	L, .L12
	MADD4	c44, c44, a8, b8

	.align	5

.L15:
#ifndef	TRMMKERNEL
	andi	L,  K, 3
	LD	ALPHA_R, 128($sp)
#else
	andi	L,  TEMP, 3
	LD	ALPHA_R, 128($sp)
#endif
	blez	L, .L18
	LD	ALPHA_I, 136($sp)

	.align	5

.L16:
	daddiu	BO, BO, 4 * SIZE					#	2nr*1kr*cmpx
	daddiu	AO, AO, 4 * SIZE					#	2mr*1kr*cmpx
	MADD1	c11, c11, a1, b1 					#	axc		A1xB1	
	MADD3	c13, c13, a1, b2					#	axd

	daddiu	PREA, PREA, 4 * SIZE			
	daddiu	PREB, PREB, 4 * SIZE			
	MADD2	c12, c12, a2, b1					#	bxc
	MADD4	c14, c14, a2, b2					#	bxd

	MADD1	c21, c21, a3, b1 					#	A2xB1
	MADD3	c23, c23, a3, b2
	
	MADD2	c22, c22, a4, b1
	MADD4	c24, c24, a4, b2

	FETCH	$0, 0 * SIZE(PREA)
	MADD1	c31, c31, a1, b3 					#	A1xB2
	MADD3	c33, c33, a1, b4
	daddiu	L, L, -1

	MADD2	c32, c32, a2, b3
	MADD4	c34, c34, a2, b4

	FETCH	$0, 0 * SIZE(PREB)
	MADD1	c41, c41, a3, b3 					#	A2xB2
	MADD3	c43, c43, a3, b4

	MADD2	c42, c42, a4, b3
	MADD4	c44, c44, a4, b4

	gsLQC1(R12, F1, F0, 0)						#	R:a1	I:a2	Unroll K=4
	gsLQC1(R13, F5, F4, 0)						#	R:b1	I:b2
	gsLQC1(R12, F3, F2, 1)						#	R:a3	I:a4
	gsLQC1(R13, F7, F6, 1)						#	R:b2	I:b3
	bgtz	L, .L16
	NOP

.L18:
#ifndef TRMMKERNEL
 	ADD	c11, c14, c11
	LD	a1,  0 * SIZE(CO1)
	ADD	c12, c13, c12
	LD	a2,  1 * SIZE(CO1)
	ADD	c21, c24, c21
	LD	b1,  2 * SIZE(CO1)
	ADD	c22, c23, c22
	LD	b2,  3 * SIZE(CO1)

	ADD	c31, c34, c31
	LD	a3,  0 * SIZE(CO2)
	ADD	c32, c33, c32
	LD	a4,  1 * SIZE(CO2)
	ADD	c41, c44, c41
	LD	b3,  2 * SIZE(CO2)
	ADD	c42, c43, c42
	LD	b4,  3 * SIZE(CO2)

	daddiu	I, I, -1
	MADD	a1, a1, ALPHA_R, c11
	MADD	a2, a2, ALPHA_R, c12
	MADD	b1, b1, ALPHA_R, c21
	MADD	b2, b2, ALPHA_R, c22

	NMSUB	a1, a1, ALPHA_I, c12
	MADD	a2, a2, ALPHA_I, c11
	NMSUB	b1, b1, ALPHA_I, c22
	MADD	b2, b2, ALPHA_I, c21

	MADD	a3, a3, ALPHA_R, c31
	MADD	a4, a4, ALPHA_R, c32
	ST	a1, 0 * SIZE(CO1)
	MADD	b3, b3, ALPHA_R, c41
	MADD	b4, b4, ALPHA_R, c42
	ST	a2, 1 * SIZE(CO1)

	NMSUB	a3, a3, ALPHA_I, c32
	MADD	a4, a4, ALPHA_I, c31
	ST	b1, 2 * SIZE(CO1)

	NMSUB	b3, b3, ALPHA_I, c42
	MADD	b4, b4, ALPHA_I, c41
	ST	b2, 3 * SIZE(CO1)

	ST	a3, 0 * SIZE(CO2)
	ST	a4, 1 * SIZE(CO2)
	ST	b3, 2 * SIZE(CO2)
	ST	b4, 3 * SIZE(CO2)

#else
 	ADD	c11, c14, c11
	ADD	c12, c13, c12
	ADD	c21, c24, c21
	ADD	c22, c23, c22

	ADD	c31, c34, c31
	ADD	c32, c33, c32
	ADD	c41, c44, c41
	ADD	c42, c43, c42

	daddiu	I, I, -1
	MUL a1, ALPHA_R, c11
	MUL a2, ALPHA_R, c12
	MUL	b1, ALPHA_R, c21
	MUL	b2, ALPHA_R, c22

	NMSUB	a1, a1, ALPHA_I, c12
	MADD	a2, a2, ALPHA_I, c11
	NMSUB	b1, b1, ALPHA_I, c22
	MADD	b2, b2, ALPHA_I, c21

	MUL	a3, ALPHA_R, c31
	MUL	a4, ALPHA_R, c32
	MUL	b3, ALPHA_R, c41
	MUL	b4, ALPHA_R, c42

	NMSUB	a3, a3, ALPHA_I, c32
	MADD	a4, a4, ALPHA_I, c31
	NMSUB	b3, b3, ALPHA_I, c42
	MADD	b4, b4, ALPHA_I, c41

	ST	a1, 0 * SIZE(CO1)
	ST	a2, 1 * SIZE(CO1)
	ST	b1, 2 * SIZE(CO1)
	ST	b2, 3 * SIZE(CO1)

	ST	a3, 0 * SIZE(CO2)
	ST	a4, 1 * SIZE(CO2)
	ST	b3, 2 * SIZE(CO2)
	ST	b4, 3 * SIZE(CO2)

#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, K, KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -2
#else
	daddiu	TEMP, TEMP, -2
#endif

	dsll	L,    TEMP, 1 + ZBASE_SHIFT
	dsll	TEMP, TEMP, 1 + ZBASE_SHIFT

	daddu	AO, AO, L
	daddu	BO, BO, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 2
#endif
#endif

	dsll	PREB, K, 1 + ZBASE_SHIFT				#	PREA=K*2*2^4
	daddiu	CO1,CO1, 4 * SIZE 
	bgtz	I, .L11
	daddiu	CO2,CO2, 4 * SIZE 

	.align	5
.L30:
	andi	I, M, 1
	daddu	C,   C, LDC						#	Change C to next panel

	daddu	PREB, PREB, B						#	PREA=A+panel size
	blez	I, .L19
	daddu	C,   C, LDC						#	Change C to next panel

#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	BO,  B
#else
	dsll	L,    KK, ZBASE_SHIFT			#	MR=1
	dsll	TEMP, KK, 1 + ZBASE_SHIFT		#	NR=2

	daddu	AO, AO, L
	daddu	BO, B,  TEMP
#endif

	gsLQC1(R12, F1, F0, 0)						#	R:a1	I:a2
	MTC	$0,  c11								#	Clear results regs
	MOV	c12, c11
	
	gsLQC1(R13, F5, F4, 0)						#	R:b1	I:b2
	MOV	c13, c11
	MOV	c14, c11

	gsLQC1(R13, F7, F6, 1)						#	R:b2	I:b3
	MOV	c31, c11
	MOV	c32, c11

	FETCH	$0, 0 * SIZE(PREB)
	MOV	c33, c11
	MOV	c34, c11

	FETCH	$0, 0 * SIZE(CO1)
	FETCH	$0, 0 * SIZE(CO2)
	FETCH	$0, 4 * SIZE(CO1)
	FETCH	$0, 4 * SIZE(CO2)

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 1							#	MR=1
#else	
	daddiu	TEMP, KK, 2							#	NR=2
#endif
	dsra	L,  TEMP, 2
	blez	L, .L35
	NOP

#else

	gsLQC1(R12, F1, F0, 0)						#	R:a1	I:a2
	dsra	L,  K, 2							#	Unroll K 4 times	
	move	BO,  B

	gsLQC1(R13, F5, F4, 0)						#	R:b1	I:b2
	MTC	$0,  c11								#	Clear results regs
	MOV	c12, c11
	
	gsLQC1(R13, F7, F6, 1)						#	R:b2	I:b3
	MOV	c13, c11
	MOV	c14, c11

	FETCH	$0, 0 * SIZE(PREB)
	MOV	c31, c11
	MOV	c32, c11

	FETCH	$0, 0 * SIZE(CO1)
	FETCH	$0, 0 * SIZE(CO2)
	FETCH	$0, 4 * SIZE(CO1)
	FETCH	$0, 4 * SIZE(CO2)

	MOV	c33, c11
	blez	L, .L35
	MOV	c34, c11
#endif

	.align	5

.L32:
	gsLQC1(R12, F3, F2, 1)						#	R:a3	I:a4
	gsLQC1(R13, F13, F12, 2)				
	MADD1	c11, c11, a1, b1 					#	axc		A1xB1	
	MADD3	c13, c13, a1, b2					#	axd

	gsLQC1(R13, F16, F15, 3)			
	MADD2	c12, c12, a2, b1					#	bxc
	MADD4	c14, c14, a2, b2					#	bxd
	NOP

	MADD1	c31, c31, a1, b3 					#	A1xB2
	MADD3	c33, c33, a1, b4
	
	FETCH	$0, 4 * SIZE(PREB)
	MADD2	c32, c32, a2, b3
	MADD4	c34, c34, a2, b4
	NOP

	gsLQC1(R12, F9, F8, 2)						#	Unroll K=1	
	gsLQC1(R13, F5, F4, 4)					
	MADD1	c11, c11, a3, b5 					#	axc		A1xB1	
	MADD3	c13, c13, a3, b6					#	axd

	gsLQC1(R13, F7, F6, 5)				
	MADD2	c12, c12, a4, b5					#	bxc
	MADD4	c14, c14, a4, b6					#	bxd
	NOP

	MADD1	c31, c31, a3, b7 					#	A1xB2
	MADD3	c33, c33, a3, b8

	FETCH	$0, 8 * SIZE(PREB)
	MADD2	c32, c32, a4, b7
	MADD4	c34, c34, a4, b8
	daddiu	L, L, -1

	gsLQC1(R12, F11, F10, 3)					
	gsLQC1(R13, F13, F12, 6)				
	MADD1	c11, c11, a5, b1 					#	axc		A1xB1	
	MADD3	c13, c13, a5, b2					#	axd

	gsLQC1(R13, F16, F15, 7)			
	MADD2	c12, c12, a6, b1					#	bxc
	MADD4	c14, c14, a6, b2					#	bxd
	daddiu	AO, AO, 8 * SIZE 					#	2mr*4kr*cmpx

	MADD1	c31, c31, a5, b3 					#	A1xB2
	MADD3	c33, c33, a5, b4

	FETCH	$0, 12 * SIZE(PREB)
	MADD2	c32, c32, a6, b3
	MADD4	c34, c34, a6, b4
	daddiu	BO, BO, 16 * SIZE  					#	2nr*4kr*cmpx

	gsLQC1(R12, F1, F0, 0)						#	R:a1	I:a2	Unroll K=4
	gsLQC1(R13, F5, F4, 0)						#	R:b1	I:b2
	MADD1	c11, c11, a7, b5 					#	axc		A1xB1	
	MADD3	c13, c13, a7, b6					#	axd

	gsLQC1(R13, F7, F6, 1)						#	R:b2	I:b3
	MADD2	c12, c12, a8, b5					#	bxc
	MADD4	c14, c14, a8, b6					#	bxd
	daddiu	PREB, PREB, 16 * SIZE

	MADD1	c31, c31, a7, b7 					#	A1xB2
	MADD3	c33, c33, a7, b8

	FETCH	$0, 0 * SIZE(PREB)
	MADD2	c32, c32, a8, b7
	bgtz	L, .L32
	MADD4	c34, c34, a8, b8


.L35:
#ifndef TRMMKERNEL
	andi	L,  K, 3
	LD	ALPHA_R, 128($sp)
#else
	andi	L,  TEMP, 3
	LD	ALPHA_R, 128($sp)
#endif
	blez	L, .L38
	LD	ALPHA_I, 136($sp)
	.align	5

.L36:
	daddiu	L, L, -1
	MADD1	c11, c11, a1, b1 					#	axc		A1xB1	
	MADD3	c13, c13, a1, b2					#	axd

	daddiu	BO, BO, 4 * SIZE 					#	2nr*1kr*cmpx
	MADD2	c12, c12, a2, b1					#	bxc
	MADD4	c14, c14, a2, b2					#	bxd

	daddiu	AO, AO, 2 * SIZE					#	2mr*1kr*cmpx
	MADD1	c31, c31, a1, b3 					#	A1xB2
	MADD3	c33, c33, a1, b4
	
	daddiu	PREB, PREB, 4 * SIZE 
	MADD2	c32, c32, a2, b3
	MADD4	c34, c34, a2, b4
	gsLQC1(R12, F1, F0, 0)						#	R:a1	I:a2	Unroll K=4

	gsLQC1(R13, F5, F4, 0)						#	R:b1	I:b2
	NOP

	bgtz	L, .L36
	gsLQC1(R13, F7, F6, 1)						#	R:b2	I:b3

.L38:
#ifndef	TRMMKERNEL
 	ADD	c11, c14, c11
	LD	a1,  0 * SIZE(CO1)
	ADD	c12, c13, c12
	LD	a2,  1 * SIZE(CO1)

	ADD	c31, c34, c31
	LD	a3,  0 * SIZE(CO2)
	ADD	c32, c33, c32
	LD	a4,  1 * SIZE(CO2)

	MADD	a1, a1, ALPHA_R, c11
	MADD	a2, a2, ALPHA_R, c12

	MADD	a3, a3, ALPHA_R, c31
	MADD	a4, a4, ALPHA_R, c32

	NMSUB	a1, a1, ALPHA_I, c12
	MADD	a2, a2, ALPHA_I, c11

	NMSUB	a3, a3, ALPHA_I, c32
	MADD	a4, a4, ALPHA_I, c31

	ST	a1, 0 * SIZE(CO1)
	ST	a2, 1 * SIZE(CO1)

	ST	a3, 0 * SIZE(CO2)
	ST	a4, 1 * SIZE(CO2)

	daddiu	CO1,CO1, 2 * SIZE 
	daddiu	CO2,CO2, 2 * SIZE 
#else
 	ADD	c11, c14, c11
	ADD	c12, c13, c12

	ADD	c31, c34, c31
	ADD	c32, c33, c32

	MUL	a1, ALPHA_R, c11
	MUL	a2, ALPHA_R, c12
	MUL	a3, ALPHA_R, c31
	MUL	a4, ALPHA_R, c32

	NMSUB	a1, a1, ALPHA_I, c12
	MADD	a2, a2, ALPHA_I, c11

	NMSUB	a3, a3, ALPHA_I, c32
	MADD	a4, a4, ALPHA_I, c31

	ST	a1, 0 * SIZE(CO1)
	ST	a2, 1 * SIZE(CO1)

	ST	a3, 0 * SIZE(CO2)
	ST	a4, 1 * SIZE(CO2)

	daddiu	CO1,CO1, 2 * SIZE 
	daddiu	CO2,CO2, 2 * SIZE 

#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, K, KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -1
#else
	daddiu	TEMP, TEMP, -2
#endif
	dsll	L,    TEMP, ZBASE_SHIFT
	dsll	TEMP, TEMP, 1 + ZBASE_SHIFT

	daddu	AO, AO, L
	daddu	BO, BO, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 1
#endif
#endif

	.align 5

.L19:
#if defined(TRMMKERNEL) && !defined(LEFT)
	daddiu	KK, KK, 2
#endif

	bgtz	J, .L10
	move	B, BO

	.align 5
	
.L20:
	andi	J,  N, 1
	blez	J, .L999
	dsll	PREA, K, 1+ZBASE_SHIFT				#	PREA=K*2*2^4

	dsra	I,  M, 1							#	I=M/2
	move	CO1, C

#if defined(TRMMKERNEL) &&  defined(LEFT)
	move	KK, OFFSET
#endif

	move	AO, A								#	Reset AO
	blez	I, .L29
	daddu	PREA, PREA, A

.L21:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	BO,  B
#else
	dsll	L,    KK, 1 + ZBASE_SHIFT
	dsll	TEMP, KK, ZBASE_SHIFT

	daddu	AO, AO, L
	daddu	BO, B,  TEMP
#endif
	gsLQC1(R12, F1, F0, 0)						#	R:a1	I:a2
	MTC	$0,  c11								#	Clear results regs
	MOV	c12, c11

	gsLQC1(R13, F5, F4, 0)						#	R:b1	I:b2
	MOV	c13, c11
	MOV	c14, c11

	gsLQC1(R12, F3, F2, 1)						#	R:a3	I:a4
	MOV	c21, c11
	MOV	c22, c11

	FETCH	$0, 0 * SIZE(PREA)
	MOV	c23, c11
	MOV	c24, c11

	FETCH	$0, 0 * SIZE(CO1)
	FETCH	$0, 4 * SIZE(CO1)

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 2						#	define Mr=2
#else
	daddiu	TEMP, KK, 1						#	define	NR=1
#endif
	dsra	L,  TEMP, 2
	blez	L, .L25
	NOP

#else
	dsra	L,  K, 2							#	Unroll K 4 times	
	move	BO,  B

	gsLQC1(R12, F1, F0, 0)						#	R:a1	I:a2
	MTC	$0,  c11								#	Clear results regs
	MOV	c12, c11

	gsLQC1(R13, F5, F4, 0)						#	R:b1	I:b2
	MOV	c13, c11
	MOV	c14, c11

	gsLQC1(R12, F3, F2, 1)						#	R:a3	I:a4
	MOV	c21, c11
	MOV	c22, c11

	FETCH	$0, 0 * SIZE(PREA)
	MOV	c23, c11
	MOV	c24, c11

	FETCH	$0, 0 * SIZE(CO1)
	FETCH	$0, 4 * SIZE(CO1)

	blez	L, .L25
	NOP
#endif

	.align	5

.L22:
	gsLQC1(R12, F9, F8, 2)						#	Unroll K=1	
	MADD1	c11, c11, a1, b1 					#	axc		A1xB1	
	MADD3	c13, c13, a1, b2					#	axd

	gsLQC1(R13, F7, F6, 1)						#	R:b2	I:b3
	MADD2	c12, c12, a2, b1					#	bxc
	MADD4	c14, c14, a2, b2					#	bxd

	gsLQC1(R12, F11, F10, 3)					
	MADD1	c21, c21, a3, b1 					#	A2xB1
	MADD3	c23, c23, a3, b2
	
	FETCH	$0, 4 * SIZE(PREA)
	MADD2	c22, c22, a4, b1
	MADD4	c24, c24, a4, b2

	gsLQC1(R12, F1, F0, 4)						#	Unroll	K=2	
	MADD1	c11, c11, a5, b3 					#	axc		A1xB1	
	MADD3	c13, c13, a5, b4					#	axd

	gsLQC1(R13, F13, F12, 2)				
	MADD2	c12, c12, a6, b3					#	bxc
	MADD4	c14, c14, a6, b4					#	bxd

	gsLQC1(R12, F3, F2, 5)						
	MADD1	c21, c21, a7, b3 					#	A2xB1
	MADD3	c23, c23, a7, b4

	FETCH	$0, 8 * SIZE(PREA)
	MADD2	c22, c22, a8, b3
	MADD4	c24, c24, a8, b4
	daddiu	L, L, -1

	gsLQC1(R12, F9, F8, 6)						#	Unroll	K=3
	MADD1	c11, c11, a1, b5 					#	axc		A1xB1	
	MADD3	c13, c13, a1, b6					#	axd

	gsLQC1(R13, F16, F15, 3)			
	MADD2	c12, c12, a2, b5					#	bxc
	MADD4	c14, c14, a2, b6					#	bxd

	gsLQC1(R12, F11, F10, 7)					
	MADD1	c21, c21, a3, b5 					#	A2xB1
	MADD3	c23, c23, a3, b6
	daddiu	BO, BO,  8 * SIZE  					#	1nr*4kr*cmpx

	FETCH	$0, 12 * SIZE(PREA)
	MADD2	c22, c22, a4, b5
	MADD4	c24, c24, a4, b6
	daddiu	AO, AO, 16 * SIZE 					#	2mr*4kr*cmpx

	gsLQC1(R12, F1, F0, 0)						#	R:a1	I:a2	Unroll K=4
	MADD1	c11, c11, a5, b7 					#	axc		A1xB1	
	MADD3	c13, c13, a5, b8					#	axd
	daddiu	PREA, PREA, 16 * SIZE 		

	gsLQC1(R13, F5, F4, 0)						#	R:b1	I:b2
	MADD2	c12, c12, a6, b7					#	bxc
	MADD4	c14, c14, a6, b8					#	bxd

	gsLQC1(R12, F3, F2, 1)						#	R:a3	I:a4
	MADD1	c21, c21, a7, b7 					#	A2xB1
	MADD3	c23, c23, a7, b8

	FETCH	$0, 0 * SIZE(PREA)
	MADD2	c22, c22, a8, b7
	bgtz	L, .L22
	MADD4	c24, c24, a8, b8


.L25:
#ifndef TRMMKERNEL
	andi	L,  K, 3
	LD	ALPHA_R, 128($sp)
#else
	andi	L, TEMP, 3
	LD	ALPHA_R, 128($sp)
#endif
	blez	L, .L28
	LD	ALPHA_I, 136($sp)
	.align	3

.L26:
	daddiu	L, L, -1
	MADD1	c11, c11, a1, b1 					#	axc		A1xB1	
	MADD3	c13, c13, a1, b2					#	axd

	daddiu	BO, BO, 2 * SIZE 					#	2nr*1kr*cmpx
	MADD2	c12, c12, a2, b1					#	bxc
	MADD4	c14, c14, a2, b2					#	bxd

	daddiu	AO, AO, 4 * SIZE 					#	2mr*1kr*cmpx
	MADD1	c21, c21, a3, b1 					#	A2xB1
	MADD3	c23, c23, a3, b2

	daddiu	PREA, PREA, 4 * SIZE 				#	2mr*1kr*cmpx
	MADD2	c22, c22, a4, b1
	MADD4	c24, c24, a4, b2

	gsLQC1(R12, F1, F0, 0)						#	R:a1	I:a2	Unroll K=4
	gsLQC1(R12, F3, F2, 1)						#	R:a3	I:a4
	gsLQC1(R13, F5, F4, 0)						#	R:b1	I:b2

	bgtz	L, .L26
	FETCH	$0, 0 * SIZE(PREA)

.L28:
#ifndef	TRMMKERNEL
 	ADD	c11, c14, c11
	LD	a1,  0 * SIZE(CO1)
	ADD	c12, c13, c12
	LD	a2,  1 * SIZE(CO1)
	ADD	c21, c24, c21
	LD	b1,  2 * SIZE(CO1)
	ADD	c22, c23, c22
	LD	b2,  3 * SIZE(CO1)

	daddiu	I, I, -1
	MADD	a1, a1, ALPHA_R, c11
	MADD	a2, a2, ALPHA_R, c12
	MADD	b1, b1, ALPHA_R, c21
	MADD	b2, b2, ALPHA_R, c22

	NMSUB	a1, a1, ALPHA_I, c12
	MADD	a2, a2, ALPHA_I, c11
	NMSUB	b1, b1, ALPHA_I, c22
	MADD	b2, b2, ALPHA_I, c21

	ST	a1, 0 * SIZE(CO1)
	ST	a2, 1 * SIZE(CO1)
	ST	b1, 2 * SIZE(CO1)
	ST	b2, 3 * SIZE(CO1)

#else
 	ADD	c11, c14, c11
	ADD	c12, c13, c12
	ADD	c21, c24, c21
	ADD	c22, c23, c22

	daddiu	I, I, -1
	MUL	a1, ALPHA_R, c11
	MUL	a2, ALPHA_R, c12
	MUL	b1, ALPHA_R, c21
	MUL	b2, ALPHA_R, c22

	NMSUB	a1, a1, ALPHA_I, c12
	MADD	a2, a2, ALPHA_I, c11
	NMSUB	b1, b1, ALPHA_I, c22
	MADD	b2, b2, ALPHA_I, c21

	ST	a1, 0 * SIZE(CO1)
	ST	a2, 1 * SIZE(CO1)
	ST	b1, 2 * SIZE(CO1)
	ST	b2, 3 * SIZE(CO1)

#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, K, KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -2
#else
	daddiu	TEMP, TEMP, -1
#endif

	dsll	L,    TEMP, 1 + ZBASE_SHIFT
	dsll	TEMP, TEMP, ZBASE_SHIFT

	daddu	AO, AO, L
	daddu	BO, BO, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 2
#endif
#endif
	daddiu	CO1,CO1, 4 * SIZE 
	bgtz	I, .L21
	NOP

.L29:
	andi	I, M, 1
	blez	I, .L999
	NOP

#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	BO,  B
#else
	dsll	TEMP, KK,  ZBASE_SHIFT

	daddu	AO, AO, TEMP
	daddu	BO, B,  TEMP
#endif

	gsLQC1(R12, F1, F0, 0)						#	R:a1	I:a2
	MTC	$0,  c11								#	Clear results regs
	MOV	c12, c11

	gsLQC1(R13, F5, F4, 0)						#	R:b1	I:b2
	MOV	c13, c11
	MOV	c14, c11

	FETCH	$0, 0 * SIZE(PREA)
	FETCH	$0, 4 * SIZE(PREA)

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 1
#else
	daddiu	TEMP, KK, 1
#endif
	dsra	L,  TEMP, 2
	blez	L, .L45
	NOP

#else
	dsra	L,  K, 2							#	Unroll K 4 times	
	move	BO,  B

	gsLQC1(R12, F1, F0, 0)						#	R:a1	I:a2
	MTC	$0,  c11								#	Clear results regs
	MOV	c12, c11

	gsLQC1(R13, F5, F4, 0)						#	R:b1	I:b2
	MOV	c13, c11
	MOV	c14, c11

	FETCH	$0, 0 * SIZE(PREA)
	FETCH	$0, 4 * SIZE(PREA)
	blez	L, .L45
	NOP
#endif

	.align	3

.L42:
	gsLQC1(R12, F3, F2, 1)						#	R:a3	I:a4
	MADD1	c11, c11, a1, b1 					#	axc		A1xB1	
	MADD3	c13, c13, a1, b2					#	axd

	gsLQC1(R13, F7, F6, 1)						#	R:b2	I:b3
	MADD2	c12, c12, a2, b1					#	bxc
	MADD4	c14, c14, a2, b2					#	bxd

	gsLQC1(R12, F9, F8, 2)						#	Unroll K=1	
	MADD1	c11, c11, a3, b3 					#	axc		A1xB1	
	MADD3	c13, c13, a3, b4					#	axd

	gsLQC1(R13, F13, F12, 2)				
	MADD2	c12, c12, a4, b3					#	bxc
	MADD4	c14, c14, a4, b4					#	bxd
	daddiu	L, L, -1

	gsLQC1(R12, F11, F10, 3)					
	MADD1	c11, c11, a5, b5 					#	axc		A1xB1	
	MADD3	c13, c13, a5, b6					#	axd
	daddiu	AO, AO, 8 * SIZE 					#	2mr*4kr*cmpx

	gsLQC1(R13, F16, F15, 3)			
	MADD2	c12, c12, a6, b5					#	bxc
	MADD4	c14, c14, a6, b6					#	bxd
	daddiu	BO, BO, 8 * SIZE  					#	2nr*4kr*cmpx

	gsLQC1(R12, F1, F0, 0)						#	R:a1	I:a2	Unroll K=4
	MADD1	c11, c11, a7, b7 					#	axc		A1xB1	
	MADD3	c13, c13, a7, b8					#	axd

	gsLQC1(R13, F5, F4, 0)						#	R:b1	I:b2
	MADD2	c12, c12, a8, b7					#	bxc
	bgtz	L, .L42
	MADD4	c14, c14, a8, b8					#	bxd


	.align 5

.L45:
#ifndef	TRMMKERNEL
	andi	L,  K, 3
	LD	ALPHA_R, 128($sp)
#else
	andi	L,  TEMP, 3
	LD	ALPHA_R, 128($sp)
#endif
	blez	L, .L48
	LD	ALPHA_I, 136($sp)

.L46:
	daddiu	L, L, -1
	daddiu	BO, BO, 1 * SIZE * COMPSIZE			#	2nr*1kr*cmpx
	daddiu	AO, AO, 1 * SIZE * COMPSIZE			#	2mr*1kr*cmpx

	MADD1	c11, c11, a1, b1 					#	axc		A1xB1	
	MADD3	c13, c13, a1, b2					#	axd
	MADD2	c12, c12, a2, b1					#	bxc
	MADD4	c14, c14, a2, b2					#	bxd

	gsLQC1(R12, F1, F0, 0)						#	R:a1	I:a2	Unroll K=4
	gsLQC1(R13, F5, F4, 0)						#	R:b1	I:b2

	bgtz	L, .L46
	NOP

.L48:
#ifndef	TRMMKERNEL
 	ADD	c11, c14, c11
	ADD	c12, c13, c12

	LD	a1,  0 * SIZE(CO1)
	LD	a2,  1 * SIZE(CO1)

	MADD	a1, a1, ALPHA_R, c11
	MADD	a2, a2, ALPHA_R, c12

	NMSUB	a1, a1, ALPHA_I, c12
	MADD	a2, a2, ALPHA_I, c11

	ST	a1, 0 * SIZE(CO1)
	ST	a2, 1 * SIZE(CO1)

#else
	ADD	c11, c14, c11
	ADD	c12, c13, c12

	MUL	a1, ALPHA_R, c11
	MUL	a2, ALPHA_R, c12

	NMSUB	a1, a1, ALPHA_I, c12
	MADD	a2, a2, ALPHA_I, c11

	ST	a1, 0 * SIZE(CO1)
	ST	a2, 1 * SIZE(CO1)

#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, K, KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -1
#else
	daddiu	TEMP, TEMP, -1
#endif

	dsll	TEMP, TEMP, ZBASE_SHIFT

	daddu	AO, AO, TEMP
	daddu	BO, BO, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 1
#endif

	daddiu	CO1,CO1, 2 * SIZE 
#endif



	.align 5

.L999:
	LDARG	$16,   0($sp)
	LDARG	$17,   8($sp)
	ldc1	$f24, 16($sp)
	ldc1	$f25, 24($sp)
	ldc1	$f26, 32($sp)
	ldc1	$f27, 40($sp)
	ldc1	$f28, 48($sp)
	ldc1	$f29, 56($sp)

#if defined(TRMMKERNEL)
	LDARG	$18,  64($sp)
	LDARG	$19,  72($sp)
	LDARG	$20,  80($sp)
#endif

#ifndef __64BIT__
	ldc1	$f20, 88($sp)
	ldc1	$f21, 96($sp)
	ldc1	$f22,104($sp)
	ldc1	$f23,112($sp)
#endif

	j	$31
	daddiu	$sp, $sp, STACKSIZE

	EPILOGUE