Blame thirdparty/openblas/xianyi-OpenBLAS-e6e87a2/kernel/mips64/cgemm_kernel_loongson3a_2x2.S

kusano 2b45e8
#define ASSEMBLER
kusano 2b45e8
#include "common.h"
kusano 2b45e8
kusano 2b45e8
##	FETCH expands to ld.  Every use visible in this file targets $0
##	(for example  FETCH $0, 0 * SIZE(CO1)),  so the loaded value is
##	discarded; presumably the load only serves to pull the line into
##	the cache (prefetch) -- TODO confirm against the Loongson 3A manual.
##
##	gsLQC1 / gsSQC1 emit one raw instruction word through .word, built
##	from numeric register fields (base, fq, ft) and an offset field.
##	The two differ only in the value shifted into bits 31..26
##	(0x32 versus 0x3A).  Neither is used in the reviewed part of the file.
##	NOTE(review): the arguments are substituted without parentheses, so
##	pass plain numbers (such as R12 or F0) and not expressions.
#define FETCH	ld
kusano 2b45e8
#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
kusano 2b45e8
#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
##	Stack frame size and integer register roles.
##	  STACKSIZE    bytes the prologue subtracts from $sp
##	  M, N, K      sizes that bound the I (M/2), J (N/2) and L (K/4) loops
##	  A, B, C      base pointers of the two inputs and of the result
##	  LDC          stride between CO1 and CO2 (CO2 = C + LDC); loaded from
##	               0($sp) on entry and shifted left by ZBASE_SHIFT
##	  AO, BO       running pointers into A and B
##	  R12, R13     the register numbers of AO and BO as bare integers
##	  I, J, L      loop counters
##	  CO1, CO2     output pointers for the two result columns being written
##	  PREA, PREB   addresses read by the FETCH loads inside the K loops;
##	               they live in $16 / $17, which the prologue saves
##	  OFFSET, KK, TEMP   only defined (and only saved) under TRMMKERNEL
#define STACKSIZE	160
kusano 2b45e8
#define M	$4
kusano 2b45e8
#define	N	$5
kusano 2b45e8
#define	K	$6
kusano 2b45e8
#define A	$9
kusano 2b45e8
#define B	$10
kusano 2b45e8
#define C	$11
kusano 2b45e8
#define LDC	$8
kusano 2b45e8
kusano 2b45e8
#define AO	$12
kusano 2b45e8
#define BO	$13
kusano 2b45e8
kusano 2b45e8
#define	R12	12
kusano 2b45e8
#define	R13	13
kusano 2b45e8
kusano 2b45e8
#define I	$2
kusano 2b45e8
#define J	$3
kusano 2b45e8
#define L	$7
kusano 2b45e8
kusano 2b45e8
#define CO1	$14
kusano 2b45e8
#define CO2	$15
kusano 2b45e8
#define PREA	$16
kusano 2b45e8
#define PREB	$17
kusano 2b45e8
kusano 2b45e8
#if defined(TRMMKERNEL)
kusano 2b45e8
#define OFFSET	$18
kusano 2b45e8
#define KK	$19
kusano 2b45e8
#define TEMP	$20
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
##	Floating-point register roles.
##	  a1-a4, b1-b4   operands of the current K step (a* from AO, b* from BO)
##	  a5-a8, b5-b8   operands loaded ahead for the following K step inside
##	                 the unrolled loops
##	  cXY            accumulators.  Each complex product keeps four partial
##	                 sums that are added pairwise after the K loop
##	                 (c11 + c14, c12 + c13, and so on).
##	The numbering is not contiguous: c11 sits in $f14 between b6 ($f13)
##	and b7 ($f15), and b7 / b8 use $f15 / $f16.
#define a1	$f0
kusano 2b45e8
#define a2	$f1
kusano 2b45e8
#define a3	$f2
kusano 2b45e8
#define a4	$f3
kusano 2b45e8
kusano 2b45e8
#define b1	$f4
kusano 2b45e8
#define b2	$f5
kusano 2b45e8
#define b3	$f6
kusano 2b45e8
#define b4	$f7
kusano 2b45e8
kusano 2b45e8
#define a5	$f8
kusano 2b45e8
#define a6	$f9
kusano 2b45e8
#define a7	$f10
kusano 2b45e8
#define a8	$f11
kusano 2b45e8
kusano 2b45e8
#define b5	$f12
kusano 2b45e8
#define b6	$f13
kusano 2b45e8
#define b7	$f15
kusano 2b45e8
#define b8	$f16
kusano 2b45e8
kusano 2b45e8
#define c11	$f14
kusano 2b45e8
#define c12	$f17
kusano 2b45e8
#define c13	$f18
kusano 2b45e8
#define c14	$f19
kusano 2b45e8
#define c21	$f20
kusano 2b45e8
#define c22	$f21
kusano 2b45e8
#define c23	$f22
kusano 2b45e8
#define c24	$f23
kusano 2b45e8
#define c31	$f24
kusano 2b45e8
#define c32	$f25
kusano 2b45e8
#define c33	$f26
kusano 2b45e8
#define c34	$f27
kusano 2b45e8
#define c41	$f28
kusano 2b45e8
#define c42	$f29
kusano 2b45e8
#define c43	$f30
kusano 2b45e8
#define c44	$f31
kusano 2b45e8
kusano 2b45e8
##	Bare floating-point register numbers, 0 through 31, i.e. the index
##	without the $f prefix.  Nothing in the reviewed part of the file uses
##	them; presumably they exist as numeric operands for the gsLQC1 / gsSQC1
##	word encoders -- TODO confirm.
#define F0	0
kusano 2b45e8
#define F1	1
kusano 2b45e8
#define F2	2
kusano 2b45e8
#define F3	3
kusano 2b45e8
#define F4	4
kusano 2b45e8
#define F5	5
kusano 2b45e8
#define	F6	6
kusano 2b45e8
#define	F7	7
kusano 2b45e8
#define	F8	8
kusano 2b45e8
#define	F9	9
kusano 2b45e8
#define	F10	10
kusano 2b45e8
#define	F11	11
kusano 2b45e8
#define	F12	12
kusano 2b45e8
#define	F13	13
kusano 2b45e8
#define	F14	14
kusano 2b45e8
#define	F15	15
kusano 2b45e8
#define	F16	16
kusano 2b45e8
#define	F17	17
kusano 2b45e8
#define	F18	18
kusano 2b45e8
#define	F19	19
kusano 2b45e8
#define	F20	20
kusano 2b45e8
#define	F21	21
kusano 2b45e8
#define	F22	22
kusano 2b45e8
#define	F23	23
kusano 2b45e8
#define	F24	24
kusano 2b45e8
#define	F25	25
kusano 2b45e8
#define	F26	26
kusano 2b45e8
#define	F27	27
kusano 2b45e8
#define	F28	28
kusano 2b45e8
#define	F29	29
kusano 2b45e8
#define	F30	30
kusano 2b45e8
#define	F31	31
kusano 2b45e8
kusano 2b45e8
##	Real and imaginary part of the complex scale factor applied to the
##	accumulated products before they are written to C.
##	These are the same two registers as the seventh and eighth B operand
##	($f15 and $f16), which the unrolled K loops overwrite.  The prologue
##	therefore stores both values to 128($sp) and 136($sp), and they are
##	loaded back after each K loop, before C is updated.
#define ALPHA_R	$f15
kusano 2b45e8
#define ALPHA_I	$f16
kusano 2b45e8
kusano 2b45e8
#################################
kusano 2b45e8
##	MADD1	a*c
kusano 2b45e8
##	MADD2	b*c
kusano 2b45e8
##	MADD3	a*d
kusano 2b45e8
##	MADD4	d*b
kusano 2b45e8
##################################
kusano 2b45e8
##	Choose, per build variant, whether each partial product is added
##	(MADD) or subtracted (NMSUB) from its accumulator:
##	  group 1 (NN NT TN TT):  only MADD4 (b*d) is NMSUB
##	  group 2 (NR NC TR TC):  only MADD3 (a*d) is NMSUB
##	  group 3 (RN RT CN CT):  only MADD2 (b*c) is NMSUB
##	  group 4 (RR RC CR CC):  MADD2, MADD3 and MADD4 are NMSUB
##	MADD1 (a*c) is MADD in every group.
##	NOTE(review): the two-letter symbols presumably encode the transpose
##	and conjugate mode of A and B and are set by the build system --
##	confirm in common.h.  If none of them is defined, MADD1..MADD4 stay
##	undefined.
#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
kusano 2b45e8
#define MADD1	  MADD
kusano 2b45e8
#define MADD2	  MADD
kusano 2b45e8
#define MADD3	  MADD
kusano 2b45e8
#define MADD4	  NMSUB
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
kusano 2b45e8
#define MADD1	  MADD
kusano 2b45e8
#define MADD2	  MADD
kusano 2b45e8
#define MADD3	  NMSUB
kusano 2b45e8
#define MADD4	  MADD
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
kusano 2b45e8
#define MADD1	  MADD
kusano 2b45e8
#define MADD2	  NMSUB
kusano 2b45e8
#define MADD3	  MADD
kusano 2b45e8
#define MADD4	  MADD
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
kusano 2b45e8
#define MADD1	  MADD
kusano 2b45e8
#define MADD2	  NMSUB
kusano 2b45e8
#define MADD3	  NMSUB
kusano 2b45e8
#define MADD4	  NMSUB
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
	PROLOGUE
kusano 2b45e8
	
kusano 2b45e8
	LDARG	LDC,   0($sp)
kusano 2b45e8
	daddiu	$sp, $sp, -STACKSIZE
kusano 2b45e8
kusano 2b45e8
	SDARG	$16,   0($sp)
kusano 2b45e8
	SDARG	$17,   8($sp)
kusano 2b45e8
	sdc1	$f24, 16($sp)
kusano 2b45e8
	sdc1	$f25, 24($sp)
kusano 2b45e8
	sdc1	$f26, 32($sp)
kusano 2b45e8
	sdc1	$f27, 40($sp)
kusano 2b45e8
	sdc1	$f28, 48($sp)
kusano 2b45e8
	sdc1	$f29, 56($sp)
kusano 2b45e8
kusano 2b45e8
#if defined(TRMMKERNEL)
kusano 2b45e8
	SDARG	$18,  64($sp)
kusano 2b45e8
	SDARG	$19,  72($sp)
kusano 2b45e8
	SDARG	$20,  80($sp)
kusano 2b45e8
kusano 2b45e8
	LDARG	OFFSET, STACKSIZE + 8($sp)
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
#ifndef __64BIT__
kusano 2b45e8
	sdc1	$f20, 88($sp)
kusano 2b45e8
	sdc1	$f21, 96($sp)
kusano 2b45e8
	sdc1	$f22,104($sp)
kusano 2b45e8
	sdc1	$f23,112($sp)
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
	dsra	J,  N, 1							#	J=N/2
kusano 2b45e8
	ST	ALPHA_R, 128($sp)						#	store alpha_r & alpha_i
kusano 2b45e8
kusano 2b45e8
#if defined(TRMMKERNEL) && !defined(LEFT)
kusano 2b45e8
	neg	KK, OFFSET
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
	dsll	LDC, LDC, ZBASE_SHIFT				#	LDC*SIZE*COMPSIZE
kusano 2b45e8
	blez	J, .L20
kusano 2b45e8
	ST	ALPHA_I, 136($sp)
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align	5
kusano 2b45e8
.L10:
kusano 2b45e8
#if defined(TRMMKERNEL) &&  defined(LEFT)
kusano 2b45e8
	move	KK, OFFSET
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
	daddiu	J, J, -1							
kusano 2b45e8
	dsra	I,  M, 1							#	I=M/2
kusano 2b45e8
kusano 2b45e8
	dsll	PREB, K, 1 + ZBASE_SHIFT				#	PREB=K*2*2^ZBASE_SHIFT
kusano 2b45e8
	dsll	PREA, K, 1 + ZBASE_SHIFT				#	PREA=K*2*2^ZBASE_SHIFT
kusano 2b45e8
kusano 2b45e8
	move	CO1, C								#	Fix pointer Cx
kusano 2b45e8
	daddu	CO2, C,   LDC
kusano 2b45e8
kusano 2b45e8
	move	AO, A								#	Reset AO
kusano 2b45e8
	blez	I, .L30
kusano 2b45e8
	daddu	PREA, PREA, A						#	PREA=A+panel size
kusano 2b45e8
kusano 2b45e8
.L11:
kusano 2b45e8
#if defined(TRMMKERNEL)
kusano 2b45e8
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	move	BO,  B
kusano 2b45e8
#else
kusano 2b45e8
	dsll	TEMP, KK, 1 + ZBASE_SHIFT
kusano 2b45e8
kusano 2b45e8
	daddu	AO, AO, TEMP
kusano 2b45e8
	daddu	BO, B,  TEMP
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
	MTC	$0,  c11								#	Clear results regs
kusano 2b45e8
	LD	a1,	0 * SIZE(AO)
kusano 2b45e8
	MOV	c12, c11
kusano 2b45e8
	LD	a2,	1 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	MOV	c13, c11
kusano 2b45e8
	LD	b1, 0 * SIZE(BO)
kusano 2b45e8
	MOV	c14, c11
kusano 2b45e8
	LD	b2, 1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MOV	c21, c11
kusano 2b45e8
	LD	a3,	2 * SIZE(AO)
kusano 2b45e8
	MOV	c22, c11
kusano 2b45e8
	LD	a4, 3 * SIZE(AO)
kusano 2b45e8
	
kusano 2b45e8
	MOV	c23, c11
kusano 2b45e8
	LD	b3, 2 * SIZE(BO)
kusano 2b45e8
	MOV	c24, c11
kusano 2b45e8
	LD	b4,	3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	FETCH	$0, 0 * SIZE(CO2)
kusano 2b45e8
	MOV	c31, c11
kusano 2b45e8
	MOV	c32, c11
kusano 2b45e8
kusano 2b45e8
	FETCH	$0, 0 * SIZE(CO1)
kusano 2b45e8
	MOV	c33, c11
kusano 2b45e8
	MOV	c34, c11
kusano 2b45e8
	
kusano 2b45e8
	FETCH	$0, 4 * SIZE(CO2)
kusano 2b45e8
	MOV	c41, c11
kusano 2b45e8
	MOV	c42, c11
kusano 2b45e8
kusano 2b45e8
	FETCH	$0, 4 * SIZE(CO1)
kusano 2b45e8
	MOV	c43, c11
kusano 2b45e8
	MOV	c44, c11
kusano 2b45e8
kusano 2b45e8
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
kusano 2b45e8
	dsubu	TEMP, K, KK
kusano 2b45e8
#elif defined(LEFT)
kusano 2b45e8
	daddiu	TEMP, KK, 2
kusano 2b45e8
#else
kusano 2b45e8
	daddiu	TEMP, KK, 2
kusano 2b45e8
#endif
kusano 2b45e8
	dsra	L,  TEMP, 2
kusano 2b45e8
	daddu	PREB, PREB, B						#	PREB=B+panel size
kusano 2b45e8
	blez	L, .L15
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
#else
kusano 2b45e8
kusano 2b45e8
	dsra	L,  K, 2							#	Unroll K 4 times	
kusano 2b45e8
	move	BO,  B
kusano 2b45e8
kusano 2b45e8
	MTC	$0,  c11								#	Clear results regs
kusano 2b45e8
	LD	a1,	0 * SIZE(AO)
kusano 2b45e8
	MOV	c12, c11
kusano 2b45e8
	LD	a2,	1 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	MOV	c13, c11
kusano 2b45e8
	LD	b1,	0 * SIZE(BO)
kusano 2b45e8
	MOV	c14, c11
kusano 2b45e8
	LD	b2,	1 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MOV	c21, c11
kusano 2b45e8
	LD	a3,	2 * SIZE(AO)
kusano 2b45e8
	MOV	c22, c11
kusano 2b45e8
	LD	a4, 3 * SIZE(AO)
kusano 2b45e8
	
kusano 2b45e8
	MOV	c23, c11
kusano 2b45e8
	LD	b3, 2 * SIZE(BO)
kusano 2b45e8
	MOV	c24, c11
kusano 2b45e8
	LD	b4,	3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	MOV	c31, c11
kusano 2b45e8
	MOV	c32, c11
kusano 2b45e8
	FETCH	$0, 0 * SIZE(CO2)
kusano 2b45e8
kusano 2b45e8
	MOV	c33, c11
kusano 2b45e8
	MOV	c34, c11
kusano 2b45e8
	FETCH	$0, 0 * SIZE(CO1)
kusano 2b45e8
	
kusano 2b45e8
	MOV	c41, c11
kusano 2b45e8
	MOV	c42, c11
kusano 2b45e8
	FETCH	$0, 4 * SIZE(CO2)
kusano 2b45e8
kusano 2b45e8
	MOV	c43, c11
kusano 2b45e8
	NOP
kusano 2b45e8
	FETCH	$0, 4 * SIZE(CO1)
kusano 2b45e8
kusano 2b45e8
	daddu	PREB, PREB, B						#	PREB=B+panel size
kusano 2b45e8
	blez	L, .L15
kusano 2b45e8
	MOV	c44, c11
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
	.align	5
kusano 2b45e8
kusano 2b45e8
.L12:
kusano 2b45e8
	LD	a5,	4 * SIZE(AO)
kusano 2b45e8
	LD	a6,	5 * SIZE(AO)
kusano 2b45e8
	MADD1	c11, c11, a1, b1 					#	axc		A1xB1	
kusano 2b45e8
	MADD3	c13, c13, a1, b2					#	axd
kusano 2b45e8
kusano 2b45e8
	LD	b5,	4 * SIZE(BO)
kusano 2b45e8
	LD	b6,	5 * SIZE(BO)
kusano 2b45e8
	MADD2	c12, c12, a2, b1					#	bxc
kusano 2b45e8
	MADD4	c14, c14, a2, b2					#	bxd
kusano 2b45e8
kusano 2b45e8
	LD	a7,	6 * SIZE(AO)
kusano 2b45e8
	LD	a8,	7 * SIZE(AO)
kusano 2b45e8
	MADD1	c21, c21, a3, b1 					#	A2xB1
kusano 2b45e8
	MADD3	c23, c23, a3, b2
kusano 2b45e8
kusano 2b45e8
	LD	b7,	6 * SIZE(BO)
kusano 2b45e8
	LD	b8, 7 * SIZE(BO)
kusano 2b45e8
	MADD2	c22, c22, a4, b1
kusano 2b45e8
	MADD4	c24, c24, a4, b2
kusano 2b45e8
kusano 2b45e8
	FETCH	$0, 4 * SIZE(PREA)
kusano 2b45e8
	FETCH	$0, 4 * SIZE(PREB)
kusano 2b45e8
	MADD1	c31, c31, a1, b3 					#	A1xB2
kusano 2b45e8
	MADD3	c33, c33, a1, b4
kusano 2b45e8
kusano 2b45e8
	MADD2	c32, c32, a2, b3
kusano 2b45e8
	MADD4	c34, c34, a2, b4
kusano 2b45e8
kusano 2b45e8
	MADD1	c41, c41, a3, b3 					#	A2xB2
kusano 2b45e8
	MADD3	c43, c43, a3, b4
kusano 2b45e8
	MADD2	c42, c42, a4, b3
kusano 2b45e8
	MADD4	c44, c44, a4, b4
kusano 2b45e8
kusano 2b45e8
	LD	a1,	8 * SIZE(AO)
kusano 2b45e8
	LD	a2, 9 * SIZE(AO)
kusano 2b45e8
	MADD1	c11, c11, a5, b5 					#	axc		A1xB1	
kusano 2b45e8
	MADD3	c13, c13, a5, b6					#	axd
kusano 2b45e8
kusano 2b45e8
	LD	b1,	8 * SIZE(BO)
kusano 2b45e8
	LD	b2, 9 * SIZE(BO)
kusano 2b45e8
	MADD2	c12, c12, a6, b5					#	bxc
kusano 2b45e8
	MADD4	c14, c14, a6, b6					#	bxd
kusano 2b45e8
kusano 2b45e8
	LD	a3,	10 * SIZE(AO)
kusano 2b45e8
	LD	a4,	11 * SIZE(AO)	
kusano 2b45e8
	MADD1	c21, c21, a7, b5 					#	A2xB1
kusano 2b45e8
	MADD3	c23, c23, a7, b6
kusano 2b45e8
kusano 2b45e8
	LD	b3,	10 * SIZE(BO)
kusano 2b45e8
	LD	b4,	11 * SIZE(BO)
kusano 2b45e8
	MADD2	c22, c22, a8, b5
kusano 2b45e8
	MADD4	c24, c24, a8, b6
kusano 2b45e8
kusano 2b45e8
	FETCH	$0, 8 * SIZE(PREA)
kusano 2b45e8
	FETCH	$0, 8 * SIZE(PREB)
kusano 2b45e8
	MADD1	c31, c31, a5, b7 					#	A1xB2
kusano 2b45e8
	MADD3	c33, c33, a5, b8
kusano 2b45e8
kusano 2b45e8
	MADD2	c32, c32, a6, b7
kusano 2b45e8
	MADD4	c34, c34, a6, b8
kusano 2b45e8
kusano 2b45e8
	MADD1	c41, c41, a7, b7 					#	A2xB2
kusano 2b45e8
	MADD3	c43, c43, a7, b8
kusano 2b45e8
	MADD2	c42, c42, a8, b7
kusano 2b45e8
	MADD4	c44, c44, a8, b8
kusano 2b45e8
kusano 2b45e8
	LD	a5,	12 * SIZE(AO)
kusano 2b45e8
	LD	a6,	13 * SIZE(AO)
kusano 2b45e8
	MADD1	c11, c11, a1, b1 					#	axc		A1xB1	
kusano 2b45e8
	MADD3	c13, c13, a1, b2					#	axd
kusano 2b45e8
kusano 2b45e8
	LD	b5,	12 * SIZE(BO)
kusano 2b45e8
	LD	b6,	13 * SIZE(BO)
kusano 2b45e8
	MADD2	c12, c12, a2, b1					#	bxc
kusano 2b45e8
	MADD4	c14, c14, a2, b2					#	bxd
kusano 2b45e8
kusano 2b45e8
	LD	a7,	14 * SIZE(AO)
kusano 2b45e8
	LD	a8,	15 * SIZE(AO)
kusano 2b45e8
	MADD1	c21, c21, a3, b1 					#	A2xB1
kusano 2b45e8
	MADD3	c23, c23, a3, b2
kusano 2b45e8
kusano 2b45e8
	LD	b7, 14 * SIZE(BO)
kusano 2b45e8
	LD	b8,	15 * SIZE(BO)
kusano 2b45e8
	MADD2	c22, c22, a4, b1
kusano 2b45e8
	MADD4	c24, c24, a4, b2
kusano 2b45e8
kusano 2b45e8
	FETCH	$0, 12 * SIZE(PREA)
kusano 2b45e8
	MADD1	c31, c31, a1, b3 					#	A1xB2
kusano 2b45e8
	MADD3	c33, c33, a1, b4
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
kusano 2b45e8
	FETCH	$0, 12 * SIZE(PREB)
kusano 2b45e8
	MADD2	c32, c32, a2, b3
kusano 2b45e8
	MADD4	c34, c34, a2, b4
kusano 2b45e8
	daddiu	AO, AO, 16 * SIZE
kusano 2b45e8
kusano 2b45e8
	daddiu	BO, BO, 16 * SIZE  					#	2nr*4kr*cmpx
kusano 2b45e8
	MADD1	c41, c41, a3, b3 					#	A2xB2
kusano 2b45e8
	MADD3	c43, c43, a3, b4
kusano 2b45e8
	daddu	PREA, PREA, 16 * SIZE
kusano 2b45e8
kusano 2b45e8
	MADD2	c42, c42, a4, b3
kusano 2b45e8
	MADD4	c44, c44, a4, b4
kusano 2b45e8
	daddu	PREB, PREB, 16 * SIZE
kusano 2b45e8
kusano 2b45e8
	LD	a1,	0 * SIZE(AO)
kusano 2b45e8
	LD	a2,	1 * SIZE(AO)
kusano 2b45e8
	MADD1	c11, c11, a5, b5 					#	axc		A1xB1	
kusano 2b45e8
	MADD3	c13, c13, a5, b6					#	axd
kusano 2b45e8
kusano 2b45e8
	LD	b1, 0 * SIZE(BO)
kusano 2b45e8
	LD	b2,	1 * SIZE(BO)
kusano 2b45e8
	MADD2	c12, c12, a6, b5					#	bxc
kusano 2b45e8
	MADD4	c14, c14, a6, b6					#	bxd
kusano 2b45e8
kusano 2b45e8
	LD	a3,	2 * SIZE(AO)
kusano 2b45e8
	LD	a4,	3 * SIZE(AO)
kusano 2b45e8
	MADD1	c21, c21, a7, b5 					#	A2xB1
kusano 2b45e8
	MADD3	c23, c23, a7, b6
kusano 2b45e8
kusano 2b45e8
	LD	b3, 2 * SIZE(BO)
kusano 2b45e8
	LD	b4,	3 * SIZE(BO)
kusano 2b45e8
	MADD2	c22, c22, a8, b5
kusano 2b45e8
	MADD4	c24, c24, a8, b6
kusano 2b45e8
kusano 2b45e8
	FETCH	$0, 0 * SIZE(PREA)
kusano 2b45e8
	FETCH	$0, 0 * SIZE(PREB)
kusano 2b45e8
	MADD1	c31, c31, a5, b7 					#	A1xB2
kusano 2b45e8
	MADD3	c33, c33, a5, b8
kusano 2b45e8
kusano 2b45e8
	MADD2	c32, c32, a6, b7
kusano 2b45e8
	MADD4	c34, c34, a6, b8
kusano 2b45e8
kusano 2b45e8
	MADD1	c41, c41, a7, b7 					#	A2xB2
kusano 2b45e8
	MADD3	c43, c43, a7, b8
kusano 2b45e8
kusano 2b45e8
	MADD2	c42, c42, a8, b7
kusano 2b45e8
	bgtz	L, .L12
kusano 2b45e8
	MADD4	c44, c44, a8, b8
kusano 2b45e8
kusano 2b45e8
	.align	5
kusano 2b45e8
kusano 2b45e8
.L15:
kusano 2b45e8
#ifndef	TRMMKERNEL
kusano 2b45e8
	andi	L,  K, 3
kusano 2b45e8
	LD	ALPHA_R, 128($sp)
kusano 2b45e8
#else
kusano 2b45e8
	andi	L,  TEMP, 3
kusano 2b45e8
	LD	ALPHA_R, 128($sp)
kusano 2b45e8
#endif
kusano 2b45e8
	blez	L, .L18
kusano 2b45e8
	LD	ALPHA_I, 136($sp)
kusano 2b45e8
kusano 2b45e8
	.align	5
kusano 2b45e8
kusano 2b45e8
.L16:
kusano 2b45e8
	daddiu	BO, BO, 4 * SIZE					#	2nr*1kr*cmpx
kusano 2b45e8
	daddiu	AO, AO, 4 * SIZE					#	2mr*1kr*cmpx
kusano 2b45e8
	MADD1	c11, c11, a1, b1 					#	axc		A1xB1	
kusano 2b45e8
	MADD3	c13, c13, a1, b2					#	axd
kusano 2b45e8
kusano 2b45e8
	daddiu	PREA, PREA, 4 * SIZE			
kusano 2b45e8
	daddiu	PREB, PREB, 4 * SIZE			
kusano 2b45e8
	MADD2	c12, c12, a2, b1					#	bxc
kusano 2b45e8
	MADD4	c14, c14, a2, b2					#	bxd
kusano 2b45e8
kusano 2b45e8
	MADD1	c21, c21, a3, b1 					#	A2xB1
kusano 2b45e8
	MADD3	c23, c23, a3, b2
kusano 2b45e8
	
kusano 2b45e8
	MADD2	c22, c22, a4, b1
kusano 2b45e8
	MADD4	c24, c24, a4, b2
kusano 2b45e8
kusano 2b45e8
	FETCH	$0, 0 * SIZE(PREA)
kusano 2b45e8
	MADD1	c31, c31, a1, b3 					#	A1xB2
kusano 2b45e8
	MADD3	c33, c33, a1, b4
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
kusano 2b45e8
	MADD2	c32, c32, a2, b3
kusano 2b45e8
	MADD4	c34, c34, a2, b4
kusano 2b45e8
kusano 2b45e8
	FETCH	$0, 0 * SIZE(PREB)
kusano 2b45e8
	MADD1	c41, c41, a3, b3 					#	A2xB2
kusano 2b45e8
	MADD3	c43, c43, a3, b4
kusano 2b45e8
kusano 2b45e8
	MADD2	c42, c42, a4, b3
kusano 2b45e8
	MADD4	c44, c44, a4, b4
kusano 2b45e8
kusano 2b45e8
	LD	a1,	0 * SIZE(AO)
kusano 2b45e8
	LD	a2, 1 * SIZE(AO)
kusano 2b45e8
	LD	b1, 0 * SIZE(BO)
kusano 2b45e8
	LD	b2, 1 * SIZE(BO)
kusano 2b45e8
	LD	a3, 2 * SIZE(AO)
kusano 2b45e8
	LD	a4, 3 * SIZE(AO)
kusano 2b45e8
	LD	b3, 2 * SIZE(BO)
kusano 2b45e8
	LD	b4, 3 * SIZE(BO)
kusano 2b45e8
	bgtz	L, .L16
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
.L18:
kusano 2b45e8
#ifndef TRMMKERNEL
kusano 2b45e8
 	ADD	c11, c14, c11
kusano 2b45e8
	LD	a1,  0 * SIZE(CO1)
kusano 2b45e8
	ADD	c12, c13, c12
kusano 2b45e8
	LD	a2,  1 * SIZE(CO1)
kusano 2b45e8
	ADD	c21, c24, c21
kusano 2b45e8
	LD	b1,  2 * SIZE(CO1)
kusano 2b45e8
	ADD	c22, c23, c22
kusano 2b45e8
	LD	b2,  3 * SIZE(CO1)
kusano 2b45e8
kusano 2b45e8
	ADD	c31, c34, c31
kusano 2b45e8
	LD	a3,  0 * SIZE(CO2)
kusano 2b45e8
	ADD	c32, c33, c32
kusano 2b45e8
	LD	a4,  1 * SIZE(CO2)
kusano 2b45e8
	ADD	c41, c44, c41
kusano 2b45e8
	LD	b3,  2 * SIZE(CO2)
kusano 2b45e8
	ADD	c42, c43, c42
kusano 2b45e8
	LD	b4,  3 * SIZE(CO2)
kusano 2b45e8
kusano 2b45e8
	daddiu	I, I, -1
kusano 2b45e8
	MADD	a1, a1, ALPHA_R, c11
kusano 2b45e8
	MADD	a2, a2, ALPHA_R, c12
kusano 2b45e8
	MADD	b1, b1, ALPHA_R, c21
kusano 2b45e8
	MADD	b2, b2, ALPHA_R, c22
kusano 2b45e8
kusano 2b45e8
	NMSUB	a1, a1, ALPHA_I, c12
kusano 2b45e8
	MADD	a2, a2, ALPHA_I, c11
kusano 2b45e8
	NMSUB	b1, b1, ALPHA_I, c22
kusano 2b45e8
	MADD	b2, b2, ALPHA_I, c21
kusano 2b45e8
kusano 2b45e8
	MADD	a3, a3, ALPHA_R, c31
kusano 2b45e8
	MADD	a4, a4, ALPHA_R, c32
kusano 2b45e8
	ST	a1, 0 * SIZE(CO1)
kusano 2b45e8
	MADD	b3, b3, ALPHA_R, c41
kusano 2b45e8
	MADD	b4, b4, ALPHA_R, c42
kusano 2b45e8
	ST	a2, 1 * SIZE(CO1)
kusano 2b45e8
kusano 2b45e8
	NMSUB	a3, a3, ALPHA_I, c32
kusano 2b45e8
	MADD	a4, a4, ALPHA_I, c31
kusano 2b45e8
	ST	b1, 2 * SIZE(CO1)
kusano 2b45e8
kusano 2b45e8
	NMSUB	b3, b3, ALPHA_I, c42
kusano 2b45e8
	MADD	b4, b4, ALPHA_I, c41
kusano 2b45e8
	ST	b2, 3 * SIZE(CO1)
kusano 2b45e8
kusano 2b45e8
	ST	a3, 0 * SIZE(CO2)
kusano 2b45e8
	ST	a4, 1 * SIZE(CO2)
kusano 2b45e8
	ST	b3, 2 * SIZE(CO2)
kusano 2b45e8
	ST	b4, 3 * SIZE(CO2)
kusano 2b45e8
kusano 2b45e8
#else
kusano 2b45e8
 	ADD	c11, c14, c11
kusano 2b45e8
	ADD	c12, c13, c12
kusano 2b45e8
	ADD	c21, c24, c21
kusano 2b45e8
	ADD	c22, c23, c22
kusano 2b45e8
kusano 2b45e8
	ADD	c31, c34, c31
kusano 2b45e8
	ADD	c32, c33, c32
kusano 2b45e8
	ADD	c41, c44, c41
kusano 2b45e8
	ADD	c42, c43, c42
kusano 2b45e8
kusano 2b45e8
	daddiu	I, I, -1
kusano 2b45e8
	MUL	a1, ALPHA_R, c11
kusano 2b45e8
	MUL	a2, ALPHA_R, c12
kusano 2b45e8
	MUL	b1, ALPHA_R, c21
kusano 2b45e8
	MUL	b2, ALPHA_R, c22
kusano 2b45e8
kusano 2b45e8
	NMSUB	a1, a1, ALPHA_I, c12
kusano 2b45e8
	MADD	a2, a2, ALPHA_I, c11
kusano 2b45e8
	NMSUB	b1, b1, ALPHA_I, c22
kusano 2b45e8
	MADD	b2, b2, ALPHA_I, c21
kusano 2b45e8
kusano 2b45e8
	MUL	a3, ALPHA_R, c31
kusano 2b45e8
	MUL	a4, ALPHA_R, c32
kusano 2b45e8
	MUL	b3, ALPHA_R, c41
kusano 2b45e8
	MUL	b4, ALPHA_R, c42
kusano 2b45e8
kusano 2b45e8
	NMSUB	a3, a3, ALPHA_I, c32
kusano 2b45e8
	MADD	a4, a4, ALPHA_I, c31
kusano 2b45e8
	NMSUB	b3, b3, ALPHA_I, c42
kusano 2b45e8
	MADD	b4, b4, ALPHA_I, c41
kusano 2b45e8
kusano 2b45e8
	ST	a1, 0 * SIZE(CO1)
kusano 2b45e8
	ST	a2, 1 * SIZE(CO1)
kusano 2b45e8
	ST	b1, 2 * SIZE(CO1)
kusano 2b45e8
	ST	b2, 3 * SIZE(CO1)
kusano 2b45e8
kusano 2b45e8
	ST	a3, 0 * SIZE(CO2)
kusano 2b45e8
	ST	a4, 1 * SIZE(CO2)
kusano 2b45e8
	ST	b3, 2 * SIZE(CO2)
kusano 2b45e8
	ST	b4, 3 * SIZE(CO2)
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#if ( defined(LEFT) &&  defined(TRANSA)) || \
kusano 2b45e8
    (!defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	dsubu	TEMP, K, KK
kusano 2b45e8
#ifdef LEFT
kusano 2b45e8
	daddiu	TEMP, TEMP, -2
kusano 2b45e8
#else
kusano 2b45e8
	daddiu	TEMP, TEMP, -2
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
	dsll	TEMP, TEMP, 1 + ZBASE_SHIFT
kusano 2b45e8
kusano 2b45e8
	daddu	AO, AO, TEMP
kusano 2b45e8
	daddu	BO, BO, TEMP
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
#ifdef LEFT
kusano 2b45e8
	daddiu	KK, KK, 2
kusano 2b45e8
#endif
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
	dsll	PREB, K, 1 + ZBASE_SHIFT				#	PREB=K*2*2^ZBASE_SHIFT
kusano 2b45e8
	daddiu	CO1,CO1, 4 * SIZE 
kusano 2b45e8
	bgtz	I, .L11
kusano 2b45e8
	daddiu	CO2,CO2, 4 * SIZE 
kusano 2b45e8
kusano 2b45e8
	.align	5
kusano 2b45e8
.L30:
kusano 2b45e8
	andi	I, M, 1
kusano 2b45e8
	daddu	C,   C, LDC						#	Change C to next panel
kusano 2b45e8
kusano 2b45e8
	daddu	PREB, PREB, B						#	PREB=B+panel size
kusano 2b45e8
	blez	I, .L19
kusano 2b45e8
	daddu	C,   C, LDC						#	Change C to next panel
kusano 2b45e8
kusano 2b45e8
#if defined(TRMMKERNEL)
kusano 2b45e8
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	move	BO,  B
kusano 2b45e8
#else
kusano 2b45e8
	dsll	L,    KK, ZBASE_SHIFT			#	MR=1
kusano 2b45e8
	dsll	TEMP, KK, 1 + ZBASE_SHIFT		#	NR=2
kusano 2b45e8
kusano 2b45e8
	daddu	AO, AO, L
kusano 2b45e8
	daddu	BO, B,  TEMP
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
	LD	a1, 0 * SIZE(AO)
kusano 2b45e8
	LD	a2,	1 * SIZE(AO)
kusano 2b45e8
	MTC	$0,  c11								#	Clear results regs
kusano 2b45e8
	MOV	c12, c11
kusano 2b45e8
	
kusano 2b45e8
	LD	b1, 0 * SIZE(BO)
kusano 2b45e8
	LD	b2, 1 * SIZE(BO)
kusano 2b45e8
	MOV	c13, c11
kusano 2b45e8
	MOV	c14, c11
kusano 2b45e8
kusano 2b45e8
	LD	b3, 2 * SIZE(BO)
kusano 2b45e8
	LD	b4, 3 * SIZE(BO)
kusano 2b45e8
	MOV	c31, c11
kusano 2b45e8
	MOV	c32, c11
kusano 2b45e8
kusano 2b45e8
	FETCH	$0, 0 * SIZE(PREB)
kusano 2b45e8
	MOV	c33, c11
kusano 2b45e8
	MOV	c34, c11
kusano 2b45e8
kusano 2b45e8
	FETCH	$0, 0 * SIZE(CO1)
kusano 2b45e8
	FETCH	$0, 0 * SIZE(CO2)
kusano 2b45e8
	FETCH	$0, 4 * SIZE(CO1)
kusano 2b45e8
	FETCH	$0, 4 * SIZE(CO2)
kusano 2b45e8
kusano 2b45e8
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
kusano 2b45e8
	dsubu	TEMP, K, KK
kusano 2b45e8
#elif defined(LEFT)
kusano 2b45e8
	daddiu	TEMP, KK, 1							#	MR=1
kusano 2b45e8
#else	
kusano 2b45e8
	daddiu	TEMP, KK, 2							#	NR=2
kusano 2b45e8
#endif
kusano 2b45e8
	dsra	L,  TEMP, 2
kusano 2b45e8
	blez	L, .L35
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
#else
kusano 2b45e8
kusano 2b45e8
	LD	a1, 0 * SIZE(AO)
kusano 2b45e8
	LD	a2,	1 * SIZE(AO)
kusano 2b45e8
	dsra	L,  K, 2							#	Unroll K 4 times	
kusano 2b45e8
	move	BO,  B
kusano 2b45e8
kusano 2b45e8
	LD	b1, 0 * SIZE(BO)
kusano 2b45e8
	LD	b2, 1 * SIZE(BO)
kusano 2b45e8
	MTC	$0,  c11								#	Clear results regs
kusano 2b45e8
	MOV	c12, c11
kusano 2b45e8
	
kusano 2b45e8
	LD	b3, 2 * SIZE(BO)
kusano 2b45e8
	LD	b4, 3 * SIZE(BO)
kusano 2b45e8
	MOV	c13, c11
kusano 2b45e8
	MOV	c14, c11
kusano 2b45e8
kusano 2b45e8
	FETCH	$0, 0 * SIZE(PREB)
kusano 2b45e8
	MOV	c31, c11
kusano 2b45e8
	MOV	c32, c11
kusano 2b45e8
kusano 2b45e8
	FETCH	$0, 0 * SIZE(CO1)
kusano 2b45e8
	FETCH	$0, 0 * SIZE(CO2)
kusano 2b45e8
	FETCH	$0, 4 * SIZE(CO1)
kusano 2b45e8
	FETCH	$0, 4 * SIZE(CO2)
kusano 2b45e8
kusano 2b45e8
	MOV	c33, c11
kusano 2b45e8
	blez	L, .L35
kusano 2b45e8
	MOV	c34, c11
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
	.align	5
kusano 2b45e8
kusano 2b45e8
.L32:
kusano 2b45e8
	LD	a3,	2 * SIZE(AO)
kusano 2b45e8
	LD	a4, 3 * SIZE(AO)
kusano 2b45e8
	MADD1	c11, c11, a1, b1 					#	axc		A1xB1	
kusano 2b45e8
	MADD3	c13, c13, a1, b2					#	axd
kusano 2b45e8
kusano 2b45e8
	LD	b5, 4 * SIZE(BO)
kusano 2b45e8
	LD	b6, 5 * SIZE(BO)
kusano 2b45e8
	MADD2	c12, c12, a2, b1					#	bxc
kusano 2b45e8
	MADD4	c14, c14, a2, b2					#	bxd
kusano 2b45e8
	
kusano 2b45e8
	LD	b7, 6 * SIZE(BO)
kusano 2b45e8
	LD	b8, 7 * SIZE(BO)
kusano 2b45e8
	MADD1	c31, c31, a1, b3 					#	A1xB2
kusano 2b45e8
	MADD3	c33, c33, a1, b4
kusano 2b45e8
	
kusano 2b45e8
	FETCH	$0, 4 * SIZE(PREB)
kusano 2b45e8
	MADD2	c32, c32, a2, b3
kusano 2b45e8
	MADD4	c34, c34, a2, b4
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
	LD	a5,	4 * SIZE(AO)
kusano 2b45e8
	LD	a6, 5 * SIZE(AO)
kusano 2b45e8
	MADD1	c11, c11, a3, b5 					#	axc		A1xB1	
kusano 2b45e8
	MADD3	c13, c13, a3, b6					#	axd
kusano 2b45e8
kusano 2b45e8
	LD	b1, 8 * SIZE(BO)
kusano 2b45e8
	LD	b2, 9 * SIZE(BO)
kusano 2b45e8
	MADD2	c12, c12, a4, b5					#	bxc
kusano 2b45e8
	MADD4	c14, c14, a4, b6					#	bxd
kusano 2b45e8
	
kusano 2b45e8
	LD	b3, 10 * SIZE(BO)
kusano 2b45e8
	LD	b4, 11 * SIZE(BO)
kusano 2b45e8
	MADD1	c31, c31, a3, b7 					#	A1xB2
kusano 2b45e8
	MADD3	c33, c33, a3, b8
kusano 2b45e8
kusano 2b45e8
	FETCH	$0, 8 * SIZE(PREB)
kusano 2b45e8
	MADD2	c32, c32, a4, b7
kusano 2b45e8
	MADD4	c34, c34, a4, b8
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
kusano 2b45e8
	LD	a7, 6 * SIZE(AO)
kusano 2b45e8
	LD	a8, 7 * SIZE(AO)
kusano 2b45e8
	MADD1	c11, c11, a5, b1 					#	axc		A1xB1	
kusano 2b45e8
	MADD3	c13, c13, a5, b2					#	axd
kusano 2b45e8
kusano 2b45e8
	LD	b5, 12 * SIZE(BO)
kusano 2b45e8
	LD	b6, 13 * SIZE(BO)
kusano 2b45e8
	MADD2	c12, c12, a6, b1					#	bxc
kusano 2b45e8
	MADD4	c14, c14, a6, b2					#	bxd
kusano 2b45e8
kusano 2b45e8
	LD	b7, 14 * SIZE(BO)
kusano 2b45e8
	LD	b8, 15 * SIZE(BO)
kusano 2b45e8
	MADD1	c31, c31, a5, b3 					#	A1xB2
kusano 2b45e8
	MADD3	c33, c33, a5, b4
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO, 8 * SIZE 					#	1mr*4kr*cmpx
kusano 2b45e8
	daddiu	BO, BO, 16 * SIZE  					#	2nr*4kr*cmpx
kusano 2b45e8
kusano 2b45e8
	FETCH	$0, 12 * SIZE(PREB)
kusano 2b45e8
	MADD2	c32, c32, a6, b3
kusano 2b45e8
	MADD4	c34, c34, a6, b4
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
	LD	a1, 0 * SIZE(AO)
kusano 2b45e8
	LD	a2, 1 * SIZE(AO)
kusano 2b45e8
	MADD1	c11, c11, a7, b5 					#	axc		A1xB1	
kusano 2b45e8
	MADD3	c13, c13, a7, b6					#	axd
kusano 2b45e8
kusano 2b45e8
	LD	b1,	0 * SIZE(BO)
kusano 2b45e8
	LD	b2, 1 * SIZE(BO)
kusano 2b45e8
	MADD2	c12, c12, a8, b5					#	bxc
kusano 2b45e8
	MADD4	c14, c14, a8, b6					#	bxd
kusano 2b45e8
kusano 2b45e8
	LD	b3, 2 * SIZE(BO)
kusano 2b45e8
	LD	b4, 3 * SIZE(BO)
kusano 2b45e8
	MADD1	c31, c31, a7, b7 					#	A1xB2
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
	MADD3	c33, c33, a7, b8
kusano 2b45e8
	daddiu	PREB, PREB, 16 * SIZE
kusano 2b45e8
kusano 2b45e8
	FETCH	$0, 0 * SIZE(PREB)
kusano 2b45e8
	MADD2	c32, c32, a8, b7
kusano 2b45e8
	bgtz	L, .L32
kusano 2b45e8
	MADD4	c34, c34, a8, b8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
.L35:
kusano 2b45e8
#ifndef TRMMKERNEL
kusano 2b45e8
	andi	L,  K, 3
kusano 2b45e8
	LD	ALPHA_R, 128($sp)
kusano 2b45e8
#else
kusano 2b45e8
	andi	L,  TEMP, 3
kusano 2b45e8
	LD	ALPHA_R, 128($sp)
kusano 2b45e8
#endif
kusano 2b45e8
	blez	L, .L38
kusano 2b45e8
	LD	ALPHA_I, 136($sp)
kusano 2b45e8
	.align	5
kusano 2b45e8
kusano 2b45e8
.L36:
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	MADD1	c11, c11, a1, b1 					#	axc		A1xB1	
kusano 2b45e8
	MADD3	c13, c13, a1, b2					#	axd
kusano 2b45e8
kusano 2b45e8
	daddiu	BO, BO, 4 * SIZE 					#	2nr*1kr*cmpx
kusano 2b45e8
	MADD2	c12, c12, a2, b1					#	bxc
kusano 2b45e8
	MADD4	c14, c14, a2, b2					#	bxd
kusano 2b45e8
kusano 2b45e8
	daddiu	AO, AO, 2 * SIZE					#	1mr*1kr*cmpx
kusano 2b45e8
	MADD1	c31, c31, a1, b3 					#	A1xB2
kusano 2b45e8
	MADD3	c33, c33, a1, b4
kusano 2b45e8
	
kusano 2b45e8
	daddiu	PREB, PREB, 4 * SIZE 
kusano 2b45e8
	MADD2	c32, c32, a2, b3
kusano 2b45e8
	MADD4	c34, c34, a2, b4
kusano 2b45e8
kusano 2b45e8
	LD	a1, 0 * SIZE(AO)
kusano 2b45e8
	LD	a2, 1 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LD	b1, 0 * SIZE(BO)
kusano 2b45e8
	LD	b2, 1 * SIZE(BO)
kusano 2b45e8
	LD	b3, 2 * SIZE(BO)
kusano 2b45e8
	LD	b4, 3 * SIZE(BO)
kusano 2b45e8
	bgtz	L, .L36
kusano 2b45e8
	NOP
kusano 2b45e8
kusano 2b45e8
.L38:
kusano 2b45e8
#ifndef	TRMMKERNEL
kusano 2b45e8
 	ADD	c11, c14, c11
kusano 2b45e8
	LD	a1,  0 * SIZE(CO1)
kusano 2b45e8
	ADD	c12, c13, c12
kusano 2b45e8
	LD	a2,  1 * SIZE(CO1)
kusano 2b45e8
kusano 2b45e8
	ADD	c31, c34, c31
kusano 2b45e8
	LD	a3,  0 * SIZE(CO2)
kusano 2b45e8
	ADD	c32, c33, c32
kusano 2b45e8
	LD	a4,  1 * SIZE(CO2)
kusano 2b45e8
kusano 2b45e8
	MADD	a1, a1, ALPHA_R, c11
kusano 2b45e8
	MADD	a2, a2, ALPHA_R, c12
kusano 2b45e8
kusano 2b45e8
	MADD	a3, a3, ALPHA_R, c31
kusano 2b45e8
	MADD	a4, a4, ALPHA_R, c32
kusano 2b45e8
kusano 2b45e8
	NMSUB	a1, a1, ALPHA_I, c12
kusano 2b45e8
	MADD	a2, a2, ALPHA_I, c11
kusano 2b45e8
kusano 2b45e8
	NMSUB	a3, a3, ALPHA_I, c32
kusano 2b45e8
	MADD	a4, a4, ALPHA_I, c31
kusano 2b45e8
kusano 2b45e8
	ST	a1, 0 * SIZE(CO1)
kusano 2b45e8
	ST	a2, 1 * SIZE(CO1)
kusano 2b45e8
kusano 2b45e8
	ST	a3, 0 * SIZE(CO2)
kusano 2b45e8
	ST	a4, 1 * SIZE(CO2)
kusano 2b45e8
kusano 2b45e8
	daddiu	CO1,CO1, 2 * SIZE 
kusano 2b45e8
	daddiu	CO2,CO2, 2 * SIZE 
kusano 2b45e8
kusano 2b45e8
#else
kusano 2b45e8
	ADD	c11, c14, c11
kusano 2b45e8
	ADD	c12, c13, c12
kusano 2b45e8
kusano 2b45e8
	ADD	c31, c34, c31
kusano 2b45e8
	ADD	c32, c33, c32
kusano 2b45e8
kusano 2b45e8
	MUL	a1, ALPHA_R, c11
kusano 2b45e8
	MUL	a2, ALPHA_R, c12
kusano 2b45e8
kusano 2b45e8
	MUL	a3, ALPHA_R, c31
kusano 2b45e8
	MUL	a4, ALPHA_R, c32
kusano 2b45e8
kusano 2b45e8
	NMSUB	a1, a1, ALPHA_I, c12
kusano 2b45e8
	MADD	a2, a2, ALPHA_I, c11
kusano 2b45e8
kusano 2b45e8
	NMSUB	a3, a3, ALPHA_I, c32
kusano 2b45e8
	MADD	a4, a4, ALPHA_I, c31
kusano 2b45e8
kusano 2b45e8
	ST	a1, 0 * SIZE(CO1)
kusano 2b45e8
	ST	a2, 1 * SIZE(CO1)
kusano 2b45e8
kusano 2b45e8
	ST	a3, 0 * SIZE(CO2)
kusano 2b45e8
	ST	a4, 1 * SIZE(CO2)
kusano 2b45e8
kusano 2b45e8
	daddiu	CO1,CO1, 2 * SIZE 
kusano 2b45e8
	daddiu	CO2,CO2, 2 * SIZE 
kusano 2b45e8
kusano 2b45e8
#if ( defined(LEFT) &&  defined(TRANSA)) || \
kusano 2b45e8
    (!defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	dsubu	TEMP, K, KK
kusano 2b45e8
#ifdef LEFT
kusano 2b45e8
	daddiu	TEMP, TEMP, -1
kusano 2b45e8
#else
kusano 2b45e8
	daddiu	TEMP, TEMP, -2
kusano 2b45e8
#endif
kusano 2b45e8
	dsll	L,    TEMP, ZBASE_SHIFT
kusano 2b45e8
	dsll	TEMP, TEMP, 1 + ZBASE_SHIFT
kusano 2b45e8
kusano 2b45e8
	daddu	AO, AO, L
kusano 2b45e8
	daddu	BO, BO, TEMP
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
#ifdef LEFT
kusano 2b45e8
	daddiu	KK, KK, 1
kusano 2b45e8
#endif
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
	.align 5
kusano 2b45e8
kusano 2b45e8
.L19:
kusano 2b45e8
#if defined(TRMMKERNEL) && !defined(LEFT)
kusano 2b45e8
	daddiu	KK, KK, 2
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
	bgtz	J, .L10
kusano 2b45e8
	move	B, BO
kusano 2b45e8
kusano 2b45e8
	.align 5
kusano 2b45e8
	
kusano 2b45e8
.L20:
kusano 2b45e8
	andi	J,  N, 1
kusano 2b45e8
	blez	J, .L999
kusano 2b45e8
	dsll	PREA, K, 1+ZBASE_SHIFT				#	PREA=K*2*2^4
kusano 2b45e8
kusano 2b45e8
	dsra	I,  M, 1							#	I=M/2
kusano 2b45e8
	move	CO1, C
kusano 2b45e8
kusano 2b45e8
#if defined(TRMMKERNEL) &&  defined(LEFT)
kusano 2b45e8
	move	KK, OFFSET
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
	move	AO, A								#	Reset AO
kusano 2b45e8
	blez	I, .L29
kusano 2b45e8
	daddu	PREA, PREA, A
kusano 2b45e8

#	Set-up for one 2x1 block (two rows of A against the single remaining
#	column of B).  Both preprocessor variants do the same work:
#	  - point BO (and, for TRMM, AO) at the first element to multiply,
#	  - preload a1-a4 and b1-b2 for the first K step,
#	  - zero the eight accumulators c11-c14 and c21-c24,
#	  - touch PREA and the C block through FETCH (defined as ld into $0),
#	  - put the number of 4-step passes in L and skip to .L25 when it is 0.
#	The TRMM variant derives the trip count from KK and keeps it in TEMP;
#	the plain GEMM variant uses K directly.
.L21:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	BO,  B
#else
	dsll	L,    KK, 1 + ZBASE_SHIFT			#	L    = KK * 2 * 2^ZBASE_SHIFT (A offset)
	dsll	TEMP, KK, ZBASE_SHIFT				#	TEMP = KK * 2^ZBASE_SHIFT (B offset)

	daddu	AO, AO, L
	daddu	BO, B,  TEMP
#endif

	LD	a1, 0 * SIZE(AO)
	LD	a2, 1 * SIZE(AO)
	MTC	$0,  c11								#	Clear results regs
	MOV	c12, c11

	LD	b1, 0 * SIZE(BO)
	LD	b2, 1 * SIZE(BO)
	MOV	c13, c11
	MOV	c14, c11

	LD	a3, 2 * SIZE(AO)
	LD	a4, 3 * SIZE(AO)
	MOV	c21, c11
	MOV	c22, c11

	FETCH	$0, 0 * SIZE(PREA)
	MOV	c23, c11
	MOV	c24, c11

	FETCH	$0, 0 * SIZE(CO1)
	FETCH	$0, 4 * SIZE(CO1)

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 2						#	define Mr=2
#else
	daddiu	TEMP, KK, 1						#	define	NR=1
#endif
	dsra	L,  TEMP, 2							#	L = TEMP / 4 passes of the unrolled loop
	blez	L, .L25
	NOP

#else
	dsra	L,  K, 2							#	Unroll K 4 times
	move	BO,  B

	LD	a1, 0 * SIZE(AO)
	LD	a2, 1 * SIZE(AO)
	MTC	$0,  c11								#	Clear results regs
	MOV	c12, c11

	LD	b1, 0 * SIZE(BO)
	LD	b2, 1 * SIZE(BO)
	MOV	c13, c11
	MOV	c14, c11

	LD	a3, 2 * SIZE(AO)
	LD	a4, 3 * SIZE(AO)
	MOV	c21, c11
	MOV	c22, c11

	FETCH	$0, 0 * SIZE(PREA)
	MOV	c23, c11
	MOV	c24, c11

	FETCH	$0, 0 * SIZE(CO1)
	FETCH	$0, 4 * SIZE(CO1)

	blez	L, .L25
	NOP
#endif

	.align	5

#	Main K loop of the 2x1 block, unrolled four times.
#	On entry a1-a4 and b1-b2 already hold the operands of the first step.
#	Each step multiplies one register set while the loads for the next
#	step are issued, alternating a1-a4 with a5-a8 and b1/b2, b3/b4,
#	b5/b6, b7/b8.  Per pass AO advances 16*SIZE, BO 8*SIZE and PREA
#	16*SIZE.  The last MADD4 sits in the delay slot of the loop branch,
#	so the instruction order below must not be changed.
#	The sign conventions behind MADD1-MADD4 are chosen by macros that are
#	defined outside this section.
.L22:
	LD	a5, 4 * SIZE(AO)
	LD	a6, 5 * SIZE(AO)
	MADD1	c11, c11, a1, b1					#	axc		A1xB1
	MADD3	c13, c13, a1, b2					#	axd

	LD	b3, 2 * SIZE(BO)
	LD	b4, 3 * SIZE(BO)
	MADD2	c12, c12, a2, b1					#	bxc
	MADD4	c14, c14, a2, b2					#	bxd

	LD	a7, 6 * SIZE(AO)
	LD	a8, 7 * SIZE(AO)
	MADD1	c21, c21, a3, b1					#	A2xB1
	MADD3	c23, c23, a3, b2

	FETCH	$0, 4 * SIZE(PREA)
	MADD2	c22, c22, a4, b1
	MADD4	c24, c24, a4, b2

	LD	a1, 8 * SIZE(AO)
	LD	a2, 9 * SIZE(AO)
	MADD1	c11, c11, a5, b3					#	axc		A1xB1
	MADD3	c13, c13, a5, b4					#	axd

	LD	b5, 4 * SIZE(BO)
	LD	b6, 5 * SIZE(BO)
	MADD2	c12, c12, a6, b3					#	bxc
	MADD4	c14, c14, a6, b4					#	bxd

	LD	a3, 10 * SIZE(AO)
	LD	a4, 11 * SIZE(AO)
	MADD1	c21, c21, a7, b3					#	A2xB1
	MADD3	c23, c23, a7, b4

	FETCH	$0, 8 * SIZE(PREA)
	MADD2	c22, c22, a8, b3
	MADD4	c24, c24, a8, b4
	daddiu	L, L, -1

	LD	a5, 12 * SIZE(AO)
	LD	a6, 13 * SIZE(AO)
	MADD1	c11, c11, a1, b5					#	axc		A1xB1
	MADD3	c13, c13, a1, b6					#	axd

	LD	b7, 6 * SIZE(BO)
	LD	b8, 7 * SIZE(BO)
	MADD2	c12, c12, a2, b5					#	bxc
	MADD4	c14, c14, a2, b6					#	bxd

	LD	a7, 14 * SIZE(AO)
	LD	a8, 15 * SIZE(AO)
	MADD1	c21, c21, a3, b5					#	A2xB1
	MADD3	c23, c23, a3, b6

	daddiu	BO, BO,  8 * SIZE					#	1nr*4kr*cmpx
	daddiu	AO, AO, 16 * SIZE					#	2mr*4kr*cmpx

	FETCH	$0, 12 * SIZE(PREA)
	MADD2	c22, c22, a4, b5
	MADD4	c24, c24, a4, b6
	daddiu	PREA, PREA, 16 * SIZE

	LD	a1, 0 * SIZE(AO)					#	preload step 1 of the next pass
	LD	a2, 1 * SIZE(AO)
	MADD1	c11, c11, a5, b7					#	axc		A1xB1
	MADD3	c13, c13, a5, b8					#	axd

	LD	b1, 0 * SIZE(BO)
	LD	b2, 1 * SIZE(BO)
	MADD2	c12, c12, a6, b7					#	bxc
	MADD4	c14, c14, a6, b8					#	bxd

	LD	a3, 2 * SIZE(AO)
	LD	a4, 3 * SIZE(AO)
	MADD1	c21, c21, a7, b7					#	A2xB1
	MADD3	c23, c23, a7, b8

	FETCH	$0, 0 * SIZE(PREA)
	MADD2	c22, c22, a8, b7
	bgtz	L, .L22
	MADD4	c24, c24, a8, b8					#	delay slot: last product of step 4


#	K remainder of the 2x1 block.
#	L receives the low two bits of the trip count (K for GEMM, TEMP for
#	TRMM), i.e. the steps left over after the 4-way unrolled loop.
#	ALPHA_R and ALPHA_I are reloaded from the stack slots at 128($sp) and
#	136($sp); the ALPHA_I load fills the delay slot of the branch, so it
#	is executed whether or not .L26 is skipped.
.L25:
#ifndef TRMMKERNEL
	andi	L,  K, 3
	LD	ALPHA_R, 128($sp)
#else
	andi	L, TEMP, 3
	LD	ALPHA_R, 128($sp)
#endif
	blez	L, .L28
	LD	ALPHA_I, 136($sp)
	.align	3

#	One K step per pass: a1-a4 and b1-b2 are consumed, the pointers are
#	advanced, and the same registers are reloaded for the next pass.
.L26:
	daddiu	L, L, -1
	MADD1	c11, c11, a1, b1					#	axc		A1xB1
	MADD3	c13, c13, a1, b2					#	axd

	daddiu	BO, BO, 2 * SIZE					#	1nr*1kr*cmpx
	MADD2	c12, c12, a2, b1					#	bxc
	MADD4	c14, c14, a2, b2					#	bxd

	daddiu	AO, AO, 4 * SIZE					#	2mr*1kr*cmpx
	MADD1	c21, c21, a3, b1					#	A2xB1
	MADD3	c23, c23, a3, b2

	daddiu	PREA, PREA, 4 * SIZE				#	2mr*1kr*cmpx
	MADD2	c22, c22, a4, b1
	MADD4	c24, c24, a4, b2

#	gsLQC1(R12, F1, F0, 0)						#	R:a1	I:a2	(disabled quad load)
#	gsLQC1(R12, F3, F2, 1)						#	R:a3	I:a4
#	gsLQC1(R13, F5, F4, 0)						#	R:b1	I:b2
	LD	a1, 0 * SIZE(AO)
	LD	a2, 1 * SIZE(AO)
	LD	b1, 0 * SIZE(BO)
	LD	b2, 1 * SIZE(BO)
	LD	a3, 2 * SIZE(AO)
	LD	a4, 3 * SIZE(AO)

	bgtz	L, .L26
	FETCH	$0, 0 * SIZE(PREA)					#	delay slot

#	Write-back of the 2x1 block.
#	The partial sums are first folded pairwise (c11+c14, c12+c13, c21+c24,
#	c22+c23), giving one pair of values per C element, which is then
#	scaled by ALPHA_R and ALPHA_I.
#	GEMM variant: the four values at CO1 are loaded, updated and stored.
#	TRMM variant: C is not read; the scaled result is stored directly,
#	then AO/BO are moved past the part of the panel that was not
#	multiplied and KK is advanced by 2 when LEFT is defined.
#	NOTE(review): the per-line formulas below assume MADD d,r,s,t gives
#	r + s*t and NMSUB d,r,s,t gives r - s*t; the macros are defined
#	outside this section, confirm there.
.L28:
#ifndef	TRMMKERNEL
	ADD	c11, c14, c11
	LD	a1,  0 * SIZE(CO1)
	ADD	c12, c13, c12
	LD	a2,  1 * SIZE(CO1)
	ADD	c21, c24, c21
	LD	b1,  2 * SIZE(CO1)
	ADD	c22, c23, c22
	LD	b2,  3 * SIZE(CO1)

	daddiu	I, I, -1
	MADD	a1, a1, ALPHA_R, c11				#	a1 += ALPHA_R * c11
	MADD	a2, a2, ALPHA_R, c12				#	a2 += ALPHA_R * c12
	MADD	b1, b1, ALPHA_R, c21				#	b1 += ALPHA_R * c21
	MADD	b2, b2, ALPHA_R, c22				#	b2 += ALPHA_R * c22

	NMSUB	a1, a1, ALPHA_I, c12				#	a1 -= ALPHA_I * c12
	MADD	a2, a2, ALPHA_I, c11				#	a2 += ALPHA_I * c11
	NMSUB	b1, b1, ALPHA_I, c22				#	b1 -= ALPHA_I * c22
	MADD	b2, b2, ALPHA_I, c21				#	b2 += ALPHA_I * c21

	ST	a1, 0 * SIZE(CO1)
	ST	a2, 1 * SIZE(CO1)
	ST	b1, 2 * SIZE(CO1)
	ST	b2, 3 * SIZE(CO1)

#else
	ADD	c11, c14, c11
	ADD	c12, c13, c12
	ADD	c21, c24, c21
	ADD	c22, c23, c22

	daddiu	I, I, -1
	MUL	a1, ALPHA_R, c11
	MUL	a2, ALPHA_R, c12
	MUL	b1, ALPHA_R, c21
	MUL	b2, ALPHA_R, c22

	NMSUB	a1, a1, ALPHA_I, c12
	MADD	a2, a2, ALPHA_I, c11
	NMSUB	b1, b1, ALPHA_I, c22
	MADD	b2, b2, ALPHA_I, c21

	ST	a1, 0 * SIZE(CO1)
	ST	a2, 1 * SIZE(CO1)
	ST	b1, 2 * SIZE(CO1)
	ST	b2, 3 * SIZE(CO1)

#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, K, KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -2
#else
	daddiu	TEMP, TEMP, -1
#endif

	dsll	L,    TEMP, 1 + ZBASE_SHIFT			#	bytes to skip in the 2-row A panel
	dsll	TEMP, TEMP, ZBASE_SHIFT				#	bytes to skip in the 1-column B panel

	daddu	AO, AO, L
	daddu	BO, BO, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 2
#endif
#endif
	daddiu	CO1,CO1, 4 * SIZE					#	next pair of C elements
	bgtz	I, .L21
	NOP

#	Single leftover row against the single leftover column (1x1 block).
#	Runs only when M is odd.  Set-up mirrors the 2x1 case but with one
#	complex element of A: a1/a2 and b1/b2 are preloaded, c11-c14 are
#	zeroed, PREA is touched twice through FETCH, and L receives the
#	number of 4-step passes (from TEMP for TRMM, from K for GEMM).
.L29:
	andi	I, M, 1
	blez	I, .L999
	NOP

#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	BO,  B
#else
	dsll	TEMP, KK,  ZBASE_SHIFT				#	same byte offset for the A and B panels

	daddu	AO, AO, TEMP
	daddu	BO, B,  TEMP
#endif

#	gsLQC1(R12, F1, F0, 0)						#	R:a1	I:a2
	LD	a1, 0 * SIZE(AO)
	LD	a2, 1 * SIZE(AO)
	MTC	$0,  c11								#	Clear results regs
	MOV	c12, c11

#	gsLQC1(R13, F5, F4, 0)						#	R:b1	I:b2
	LD	b1, 0 * SIZE(BO)
	LD	b2, 1 * SIZE(BO)
	MOV	c13, c11
	MOV	c14, c11

	FETCH	$0, 0 * SIZE(PREA)
	FETCH	$0, 4 * SIZE(PREA)

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#else
	daddiu	TEMP, KK, 1						#	MR=NR=1: same value with or without LEFT
#endif
	dsra	L,  TEMP, 2
	blez	L, .L45
	NOP

#else
	dsra	L,  K, 2							#	Unroll K 4 times
	move	BO,  B

#	gsLQC1(R12, F1, F0, 0)						#	R:a1	I:a2
	LD	a1, 0 * SIZE(AO)
	LD	a2, 1 * SIZE(AO)
	MTC	$0,  c11								#	Clear results regs
	MOV	c12, c11

#	gsLQC1(R13, F5, F4, 0)						#	R:b1	I:b2
	LD	b1, 0 * SIZE(BO)
	LD	b2, 1 * SIZE(BO)
	MOV	c13, c11
	MOV	c14, c11

	FETCH	$0, 0 * SIZE(PREA)
	FETCH	$0, 4 * SIZE(PREA)
	blez	L, .L45
	NOP
#endif

	.align	3

#	Main K loop of the 1x1 block, unrolled four times.
#	On entry a1/a2 and b1/b2 hold the first step.  Steps two to four use
#	a3/a4 with b3/b4, a5/a6 with b5/b6 and a7/a8 with b7/b8; each pair is
#	loaded one step ahead of its use.  Per pass AO and BO both advance
#	8*SIZE, and a1/a2, b1/b2 are reloaded for the next pass before the
#	loop branch.  The gsLQC1 lines are disabled quad-load alternatives to
#	the LD pairs that follow them.
.L42:
#	gsLQC1(R12, F3, F2, 1)						#	R:a3	I:a4
	LD	a3, 2 * SIZE(AO)
	LD	a4, 3 * SIZE(AO)
	MADD1	c11, c11, a1, b1					#	axc		A1xB1
	MADD3	c13, c13, a1, b2					#	axd

#	gsLQC1(R13, F7, F6, 1)						#	R:b3	I:b4
	LD	b3, 2 * SIZE(BO)
	LD	b4, 3 * SIZE(BO)
	MADD2	c12, c12, a2, b1					#	bxc
	MADD4	c14, c14, a2, b2					#	bxd

#	gsLQC1(R12, F9, F8, 2)						#	R:a5	I:a6
	LD	a5, 4 * SIZE(AO)
	LD	a6, 5 * SIZE(AO)
	MADD1	c11, c11, a3, b3					#	axc		A1xB1
	MADD3	c13, c13, a3, b4					#	axd

#	gsLQC1(R13, F13, F12, 2)					#	R:b5	I:b6
	LD	b5, 4 * SIZE(BO)
	LD	b6, 5 * SIZE(BO)
	MADD2	c12, c12, a4, b3					#	bxc
	MADD4	c14, c14, a4, b4					#	bxd

#	gsLQC1(R12, F11, F10, 3)					#	R:a7	I:a8
	LD	a7, 6 * SIZE(AO)
	LD	a8, 7 * SIZE(AO)
	MADD1	c11, c11, a5, b5					#	axc		A1xB1
	MADD3	c13, c13, a5, b6					#	axd

	daddiu	L, L, -1

#	gsLQC1(R13, F16, F15, 3)					#	R:b7	I:b8
	LD	b7, 6 * SIZE(BO)
	LD	b8, 7 * SIZE(BO)
	MADD2	c12, c12, a6, b5					#	bxc
	MADD4	c14, c14, a6, b6					#	bxd

	daddiu	AO, AO, 8 * SIZE					#	1mr*4kr*cmpx
	daddiu	BO, BO, 8 * SIZE					#	1nr*4kr*cmpx

#	gsLQC1(R12, F1, F0, 0)						#	R:a1	I:a2	first step of the next pass
	LD	a1, 0 * SIZE(AO)
	LD	a2, 1 * SIZE(AO)
	MADD1	c11, c11, a7, b7					#	axc		A1xB1
	MADD3	c13, c13, a7, b8					#	axd

#	gsLQC1(R13, F5, F4, 0)						#	R:b1	I:b2
	LD	b1, 0 * SIZE(BO)
	LD	b2, 1 * SIZE(BO)
	MADD2	c12, c12, a8, b7					#	bxc
	MADD4	c14, c14, a8, b8					#	bxd

	bgtz	L, .L42
	NOP
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 5
kusano 2b45e8
kusano 2b45e8
.L45:
kusano 2b45e8
#ifndef	TRMMKERNEL
kusano 2b45e8
	andi	L,  K, 3
kusano 2b45e8
	LD	ALPHA_R, 128($sp)
kusano 2b45e8
#else
kusano 2b45e8
	andi	L,  TEMP, 3
kusano 2b45e8
	LD	ALPHA_R, 128($sp)
kusano 2b45e8
#endif
kusano 2b45e8
	blez	L, .L48
kusano 2b45e8
	LD	ALPHA_I, 136($sp)
kusano 2b45e8
kusano 2b45e8
.L46:
kusano 2b45e8
	daddiu	L, L, -1
kusano 2b45e8
	daddiu	BO, BO, 1 * SIZE * COMPSIZE			#	2nr*1kr*cmpx
kusano 2b45e8
	daddiu	AO, AO, 1 * SIZE * COMPSIZE			#	2mr*1kr*cmpx
kusano 2b45e8
kusano 2b45e8
	MADD1	c11, c11, a1, b1 					#	axc		A1xB1	
kusano 2b45e8
	MADD3	c13, c13, a1, b2					#	axd
kusano 2b45e8
	MADD2	c12, c12, a2, b1					#	bxc
kusano 2b45e8
	MADD4	c14, c14, a2, b2					#	bxd
kusano 2b45e8
kusano 2b45e8
#	gsLQC1(R12, F1, F0, 0)						#	R:a1	I:a2	Unroll K=4
kusano 2b45e8
#	gsLQC1(R13, F5, F4, 0)						#	R:b1	I:b2
kusano 2b45e8
	LD	a1, 0 * SIZE(AO)
kusano 2b45e8
	LD	a2, 1 * SIZE(AO)
kusano 2b45e8
	LD	b1, 0 * SIZE(BO)
kusano 2b45e8
	LD	b2, 1 * SIZE(BO)
kusano 2b45e8
	bgtz	L, .L46
kusano 2b45e8
	NOP

#	Write-back of the 1x1 block.
#	The partial sums are folded (c11+c14, c12+c13) and scaled by ALPHA_R
#	and ALPHA_I.
#	GEMM variant: the two values at CO1 are loaded, updated and stored.
#	CO1 is not advanced because control falls straight into the epilogue.
#	TRMM variant: C is not read; the scaled result is stored directly,
#	then AO, BO, KK and CO1 are advanced.
.L48:
#ifndef	TRMMKERNEL
	ADD	c11, c14, c11
	ADD	c12, c13, c12

	LD	a1,  0 * SIZE(CO1)
	LD	a2,  1 * SIZE(CO1)

	MADD	a1, a1, ALPHA_R, c11
	MADD	a2, a2, ALPHA_R, c12

	NMSUB	a1, a1, ALPHA_I, c12
	MADD	a2, a2, ALPHA_I, c11

	ST	a1, 0 * SIZE(CO1)
	ST	a2, 1 * SIZE(CO1)

#else
	ADD	c11, c14, c11
	ADD	c12, c13, c12

	MUL	a1, ALPHA_R, c11
	MUL	a2, ALPHA_R, c12

	NMSUB	a1, a1, ALPHA_I, c12
	MADD	a2, a2, ALPHA_I, c11

	ST	a1, 0 * SIZE(CO1)
	ST	a2, 1 * SIZE(CO1)

#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, K, KK
	daddiu	TEMP, TEMP, -1						#	MR=NR=1: same value with or without LEFT

	dsll	TEMP, TEMP, ZBASE_SHIFT

	daddu	AO, AO, TEMP
	daddu	BO, BO, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 1
#endif

	daddiu	CO1,CO1, 2 * SIZE
#endif
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	.align 5
kusano 2b45e8
kusano 2b45e8
.L999:
kusano 2b45e8
	LDARG	$16,   0($sp)
kusano 2b45e8
	LDARG	$17,   8($sp)
kusano 2b45e8
	ldc1	$f24, 16($sp)
kusano 2b45e8
	ldc1	$f25, 24($sp)
kusano 2b45e8
	ldc1	$f26, 32($sp)
kusano 2b45e8
	ldc1	$f27, 40($sp)
kusano 2b45e8
	ldc1	$f28, 48($sp)
kusano 2b45e8
	ldc1	$f29, 56($sp)
kusano 2b45e8
kusano 2b45e8
#if defined(TRMMKERNEL)
kusano 2b45e8
	LDARG	$18,  64($sp)
kusano 2b45e8
	LDARG	$19,  72($sp)
kusano 2b45e8
	LDARG	$20,  80($sp)
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
#ifndef __64BIT__
kusano 2b45e8
	ldc1	$f20, 88($sp)
kusano 2b45e8
	ldc1	$f21, 96($sp)
kusano 2b45e8
	ldc1	$f22,104($sp)
kusano 2b45e8
	ldc1	$f23,112($sp)
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
	j	$31
kusano 2b45e8
	daddiu	$sp, $sp, STACKSIZE
kusano 2b45e8
kusano 2b45e8
	EPILOGUE