/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* Integer-register aliases for the kernel arguments: problem sizes    */
/* (M, N, K), operand base pointers (A, B, C) and LDC, which the       */
/* prologue shifts left by BASE_SHIFT and then uses as the stride      */
/* between consecutive C column pointers.                              */
#define M	$4
#define	N	$5
#define	K	$6
#define A	$8
#define B	$9
#define C	$10
#define LDC	$11

/* Running read pointers into A and B for the tile being computed. */
#define AO	$12
#define BO	$13

/* Loop counters: I counts row blocks of M, J counts column panels of  */
/* N, L counts steps along K.                                          */
#define I	$2
#define J	$3
#define L	$7

/* Not referenced in the portion of the file visible to this review. */
#define PREFETCHSIZE (4 * 10)
	
/* CO1..CO8: one write pointer per column of C in the current panel   */
/* (8, 4 or 2 of them are used depending on the panel width).          */
#define CO1	$14
#define CO2	$15
#define CO3	$16
#define CO4	$17
#define CO5	$18
#define CO6	$19
#define CO7	$20
#define CO8	$21

/* BB: look-ahead address inside B, used only as a prefetch target. */
#define BB	$22

#if defined(TRMMKERNEL)
/* TRMM-only state: OFFSET is read from the stack in the prologue, KK  */
/* is the running offset derived from it, TEMP is scratch.             */
#define OFFSET	$23
#define KK	$24
#define TEMP	$25
#endif

/* Floating-point registers holding values loaded from A. */
#define a1	$f0
#define a2	$f1
#define a3	$f27
#define a4	$f28

/* Floating-point registers holding values loaded from B. */
#define b1	$f2
#define b2	$f3
#define b3	$f4
#define b4	$f5
#define b5	$f6
#define b6	$f7
#define b7	$f8
#define b8	$f9

/* a5 reuses b8's register as a fifth A value (2-column panel loop). */
#define a5	b8

/* Accumulators cXY: X selects the C column pointer CO<X>, Y the row   */
/* (1 or 2) inside the two-row tile.  $f15 is skipped between c31 and  */
/* c32 because it is ALPHA.                                            */
#define c11	$f10
#define c12	$f11
#define c21	$f12
#define c22	$f13
#define c31	$f14
#define c32	$f16
#define c41	$f17
#define c42	$f18
#define c51	$f19
#define c52	$f20
#define c61	$f21
#define c62	$f22
#define c71	$f23
#define c72	$f24
#define c81	$f25
#define c82	$f26

/* Scaling factor.  It is never loaded in the visible code, so it is   */
/* presumably passed in this register by the caller -- TODO confirm.   */
#define ALPHA	$f15

	PROLOGUE
	
	/* Open a 160-byte frame used as the register save area below. */
	daddiu	$sp, $sp, -160

	/* Save $16-$22, which this kernel uses as CO3..CO8 and BB. */
	SDARG	$16,   0($sp)
	SDARG	$17,   8($sp)
	SDARG	$18,  16($sp)
	SDARG	$19,  24($sp)
	SDARG	$20,  32($sp)
	SDARG	$21,  40($sp)
	SDARG	$22,  48($sp)

	/* Save $f24-$f28 (aliased as c72, c81, c82, a3, a4). */
	sdc1	$f24, 56($sp)
	sdc1	$f25, 64($sp)
	sdc1	$f26, 72($sp)
	sdc1	$f27, 80($sp)
	sdc1	$f28, 88($sp)

#if defined(TRMMKERNEL)
	/* TRMM build also uses $23-$25 (OFFSET, KK, TEMP). */
	SDARG	$23,  96($sp)
	SDARG	$24, 104($sp)
	SDARG	$25, 112($sp)

	/* 160($sp) is the slot just above the new frame, i.e. the */
	/* caller's 0($sp) before the frame was opened.            */
	LDARG	OFFSET, 160($sp)
#endif

#ifndef __64BIT__
	/* Without __64BIT__, $f20-$f23 (c52, c61, c62, c71) are saved too. */
	sdc1	$f20,120($sp)
	sdc1	$f21,128($sp)
	sdc1	$f22,136($sp)
	sdc1	$f23,144($sp)
#endif

	/* Scale LDC by BASE_SHIFT so it can be added directly to the */
	/* column pointers (presumably elements -> bytes; BASE_SHIFT  */
	/* comes from common.h).                                      */
	dsll	LDC, LDC, BASE_SHIFT

#if defined(TRMMKERNEL) && !defined(LEFT)
	/* Right-side TRMM starts the running offset at -OFFSET. */
	neg	KK, OFFSET
#endif

	/* J = N >> 3: number of full 8-column panels; skip to the */
	/* narrower panels at .L30 when there is none.             */
	dsra	J,  N, 3
	blez	J, .L30
	nop

/* .L10: set-up for one 8-column panel of C.                           */
/*   CO1..CO8 = C, C+LDC, ..., C+7*LDC;  C then moves past the panel.  */
/*   AO is rewound to A, J is decremented, I = M >> 1 (row pairs).     */
/*   c11 is zeroed from integer $0 and copied into c21..c61; the rest  */
/*   of the accumulators are cleared at .L11.  Integer and FP          */
/*   instructions are interleaved on purpose; do not reorder.          */
.L10:
	move	CO1, C
	MTC	$0,  c11
	daddu	CO2, C,   LDC
	move	AO, A
	daddu	CO3, CO2, LDC
	daddiu	J, J, -1
	daddu	CO4, CO3, LDC
	MOV	c21, c11
	daddu	CO5, CO4, LDC
	MOV	c31, c11
	daddu	CO6, CO5, LDC
	MOV	c41, c11
	daddu	CO7, CO6, LDC
	MOV	c51, c11
	daddu	CO8, CO7, LDC
	dsra	I,  M, 1
	daddu	C,   CO8, LDC

	/* BB = B + (K << (2 + BASE_SHIFT)): prefetch cursor used at .L18. */
	dsll	BB, K, 2 + BASE_SHIFT
	daddu	BB, B, BB

#if defined(TRMMKERNEL) &&  defined(LEFT)
	/* Left-side TRMM restarts the running offset for every panel. */
	move	KK, OFFSET
#endif

	/* No row pair at all: go to the single-row tail.  The MOV that   */
	/* follows the branch executes either way (branch delay slot,     */
	/* assuming noreorder mode as the explicit nops elsewhere imply). */
	blez	I, .L20
	MOV	c61, c11

/* .L11: start of one 2-row x 8-column tile.                           */
/* Preloads a1 = AO[0], a3 = AO[4], b1..b5 = BO[0..4], b6 = BO[8],     */
/* b7 = BO[12], clears the accumulators not cleared yet, and sets      */
/* L = (k count) >> 2 for the 4x-unrolled loop at .L12.  MADD, LD,     */
/* MOV and MTC are macros from common.h; MADD d, c, x, y is presumably */
/* d = c + x * y -- TODO confirm against common.h.                     */
.L11:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	BO,  B
#else
	/* Skip the first KK k-steps: AO += KK << (1 + BASE_SHIFT),  */
	/* BO = B + (KK << (3 + BASE_SHIFT)).                        */
	dsll	L,    KK, 1 + BASE_SHIFT
	dsll	TEMP, KK, 3 + BASE_SHIFT

	daddu	AO, AO, L
	daddu	BO, B,  TEMP
#endif

	LD	a1,  0 * SIZE(AO)
	MOV	c71, c11
	LD	b1,  0 * SIZE(BO)
	MOV	c81, c11

	LD	a3,  4 * SIZE(AO)
	MOV	c12, c11
	LD	b2,  1 * SIZE(BO)
	MOV	c22, c11

	MOV	c32, c11
	LD	b3,  2 * SIZE(BO)
	MOV	c42, c11

	LD	b4,  3 * SIZE(BO)
	MOV	c52, c11
	LD	b5,  4 * SIZE(BO)
	MOV	c62, c11

	LD	b6,  8 * SIZE(BO)
	MOV	c72, c11
	LD	b7, 12 * SIZE(BO)
	MOV	c82, c11

	/* TEMP = number of k-steps this tile has to process. */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 2
#else
	daddiu	TEMP, KK, 8
#endif
	dsra	L,  TEMP, 2

	blez	L, .L15
	NOP
#else
	/* Plain GEMM: read straight from B; BO is only set in the */
	/* delay slot of the branch at the end of this path.       */
	LD	a1,  0 * SIZE(AO)
	MOV	c71, c11
	LD	b1,  0 * SIZE(B)
	MOV	c81, c11

	/* Prefetch the C rows that will be updated at .L18. */
	pref	1, 3 * SIZE(CO1)
	pref	1, 3 * SIZE(CO2)

	LD	a3,  4 * SIZE(AO)
	MOV	c12, c11
	LD	b2,  1 * SIZE(B)
	MOV	c22, c11

	dsra	L,  K, 2
	MOV	c32, c11
	LD	b3,  2 * SIZE(B)
	MOV	c42, c11

	LD	b4,  3 * SIZE(B)
	MOV	c52, c11
	LD	b5,  4 * SIZE(B)
	MOV	c62, c11

	LD	b6,  8 * SIZE(B)
	MOV	c72, c11
	LD	b7, 12 * SIZE(B)
	MOV	c82, c11

	blez	L, .L15
	move	BO,  B
#endif

	/* Software-pipeline prime: issue the row-1 / columns-1..4 MADDs */
	/* of the first k-step here; .L12 and .L13 start with row 2.     */
	MADD	c11, c11, a1, b1
	LD	a2,  1 * SIZE(AO)
	MADD	c21, c21, a1, b2
	daddiu	L, L, -1
	MADD	c31, c31, a1, b3
	/* Only one unrolled pass left: take the non-looping copy at .L13. */
	blez	L, .L13
	MADD	c41, c41, a1, b4
	/* NOTE(review): offset is 2 * SIZE here while every other C-row */
	/* prefetch in this file uses 3 * SIZE -- confirm intended.      */
	pref	1, 2 * SIZE(CO3)
	.align	3

/* .L12: steady-state body of the 2x8 tile, four k-steps per pass.     */
/* Each pass consumes AO[0..7] (2 values per k-step) and BO[0..31]     */
/* (8 values per k-step).  b2/b3/b4 are recycled for columns 2-4 and   */
/* 6-8 and reloaded after every use; b1/b5/b6/b7 carry the column 1    */
/* and column 5 values and are loaded further ahead.  The last four    */
/* MADDs already belong to k+0 of the NEXT pass, matching the prime    */
/* sequence that precedes the loop.                                    */
.L12:
	/* k+0: row 2 of columns 1-4, then columns 5-8 */
	MADD	c12, c12, a2, b1
	LD	b1, 16 * SIZE(BO)
	MADD	c22, c22, a2, b2
	LD	b2,  5 * SIZE(BO)
	MADD	c32, c32, a2, b3
	LD	b3,  6 * SIZE(BO)
	MADD	c42, c42, a2, b4
	LD	b4,  7 * SIZE(BO)

	MADD	c51, c51, a1, b5
	LD	a4,  2 * SIZE(AO)
	MADD	c61, c61, a1, b2
	NOP
	MADD	c71, c71, a1, b3
	NOP
	MADD	c81, c81, a1, b4
	LD	a1,  8 * SIZE(AO)

	MADD	c52, c52, a2, b5
	LD	b5, 20 * SIZE(BO)
	MADD	c62, c62, a2, b2
	LD	b2,  9 * SIZE(BO)
	MADD	c72, c72, a2, b3
	LD	b3, 10 * SIZE(BO)
	MADD	c82, c82, a2, b4
	LD	b4, 11 * SIZE(BO)

	/* k+1: a4 = AO[2], a2 = AO[3], B values from BO[8..15] */
	MADD	c11, c11, a4, b6
	LD	a2,  3 * SIZE(AO)
	MADD	c21, c21, a4, b2
	NOP
	MADD	c31, c31, a4, b3
	NOP
	MADD	c41, c41, a4, b4
	NOP

	MADD	c12, c12, a2, b6
	LD	b6, 24 * SIZE(BO)
	MADD	c22, c22, a2, b2
	LD	b2, 13 * SIZE(BO)
	MADD	c32, c32, a2, b3
	LD	b3, 14 * SIZE(BO)
	MADD	c42, c42, a2, b4
	LD	b4, 15 * SIZE(BO)

	MADD	c51, c51, a4, b7
	NOP
	MADD	c61, c61, a4, b2
	NOP
	MADD	c71, c71, a4, b3
	NOP
	MADD	c81, c81, a4, b4
	NOP

	MADD	c52, c52, a2, b7
	LD	b7, 28 * SIZE(BO)
	MADD	c62, c62, a2, b2
	LD	b2, 17 * SIZE(BO)
	MADD	c72, c72, a2, b3
	LD	b3, 18 * SIZE(BO)
	MADD	c82, c82, a2, b4
	LD	b4, 19 * SIZE(BO)

	/* k+2: a3 = AO[4], a2 = AO[5], B values from BO[16..23] */
	MADD	c11, c11, a3, b1
	LD	a2,  5 * SIZE(AO)
	MADD	c21, c21, a3, b2
	NOP
	MADD	c31, c31, a3, b3
	NOP
	MADD	c41, c41, a3, b4
	NOP

	MADD	c12, c12, a2, b1
	LD	b1, 32 * SIZE(BO)
	MADD	c22, c22, a2, b2
	LD	b2, 21 * SIZE(BO)
	MADD	c32, c32, a2, b3
	LD	b3, 22 * SIZE(BO)
	MADD	c42, c42, a2, b4
	LD	b4, 23 * SIZE(BO)

	MADD	c51, c51, a3, b5
	LD	a4,  6 * SIZE(AO)
	MADD	c61, c61, a3, b2
	NOP
	MADD	c71, c71, a3, b3
	NOP
	MADD	c81, c81, a3, b4
	LD	a3, 12 * SIZE(AO)

	MADD	c52, c52, a2, b5
	LD	b5, 36 * SIZE(BO)
	MADD	c62, c62, a2, b2
	LD	b2, 25 * SIZE(BO)
	MADD	c72, c72, a2, b3
	LD	b3, 26 * SIZE(BO)
	MADD	c82, c82, a2, b4
	LD	b4, 27 * SIZE(BO)

	/* k+3: a4 = AO[6], a2 = AO[7], B values from BO[24..31] */
	MADD	c11, c11, a4, b6
	LD	a2,  7 * SIZE(AO)
	MADD	c21, c21, a4, b2
	NOP
	MADD	c31, c31, a4, b3
	NOP
	MADD	c41, c41, a4, b4
	daddiu	L, L, -1

	MADD	c12, c12, a2, b6
	LD	b6, 40 * SIZE(BO)
	MADD	c22, c22, a2, b2
	LD	b2, 29 * SIZE(BO)
	MADD	c32, c32, a2, b3
	LD	b3, 30 * SIZE(BO)
	MADD	c42, c42, a2, b4
	LD	b4, 31 * SIZE(BO)

	/* Pointers advance here; every offset below is relative to */
	/* the new AO / BO.                                          */
	MADD	c51, c51, a4, b7
	daddiu	BO, BO, 32 * SIZE
	MADD	c61, c61, a4, b2
	daddiu	AO, AO,  8 * SIZE
	MADD	c71, c71, a4, b3
	NOP
	MADD	c81, c81, a4, b4
	NOP

	MADD	c52, c52, a2, b7
	LD	b7, 12 * SIZE(BO)
	MADD	c62, c62, a2, b2
	LD	b2,  1 * SIZE(BO)
	MADD	c72, c72, a2, b3
	LD	b3,  2 * SIZE(BO)
	MADD	c82, c82, a2, b4
	LD	b4,  3 * SIZE(BO)

	/* Row 1, columns 1-4 of the next pass (pipeline overlap); the */
	/* last MADD sits after the branch and runs either way.        */
	MADD	c11, c11, a1, b1
	LD	a2,  1 * SIZE(AO)
	MADD	c21, c21, a1, b2
	NOP
	MADD	c31, c31, a1, b3
	bgtz	L, .L12
	MADD	c41, c41, a1, b4
	NOP
	.align 3

/* .L13: final four-k-step pass of the 2x8 tile.  Same arithmetic as   */
/* .L12, but it does not branch back and does not pre-issue the next   */
/* pass; instead the idle slots prefetch the C rows CO4..CO7 that      */
/* .L18 is about to update.  On exit a1 and b1..b7 hold the values at  */
/* the new AO / BO, as .L16 expects.                                   */
.L13:
	/* k+0: row 2 of columns 1-4, then columns 5-8 */
	MADD	c12, c12, a2, b1
	LD	b1, 16 * SIZE(BO)
	MADD	c22, c22, a2, b2
	LD	b2,  5 * SIZE(BO)
	MADD	c32, c32, a2, b3
	LD	b3,  6 * SIZE(BO)
	MADD	c42, c42, a2, b4
	LD	b4,  7 * SIZE(BO)

	MADD	c51, c51, a1, b5
	NOP
	MADD	c61, c61, a1, b2
	LD	a4,  2 * SIZE(AO)
	MADD	c71, c71, a1, b3
	NOP
	MADD	c81, c81, a1, b4
	LD	a1,  8 * SIZE(AO)

	MADD	c52, c52, a2, b5
	LD	b5, 20 * SIZE(BO)
	MADD	c62, c62, a2, b2
	LD	b2,  9 * SIZE(BO)
	MADD	c72, c72, a2, b3
	LD	b3, 10 * SIZE(BO)
	MADD	c82, c82, a2, b4
	LD	b4, 11 * SIZE(BO)

	/* k+1 */
	MADD	c11, c11, a4, b6
	LD	a2,  3 * SIZE(AO)
	MADD	c21, c21, a4, b2
	NOP
	MADD	c31, c31, a4, b3
	pref	1, 3 * SIZE(CO4)
	MADD	c41, c41, a4, b4
	NOP

	MADD	c12, c12, a2, b6
	LD	b6, 24 * SIZE(BO)
	MADD	c22, c22, a2, b2
	LD	b2, 13 * SIZE(BO)
	MADD	c32, c32, a2, b3
	LD	b3, 14 * SIZE(BO)
	MADD	c42, c42, a2, b4
	LD	b4, 15 * SIZE(BO)

	MADD	c51, c51, a4, b7
	pref	1, 3 * SIZE(CO5)
	MADD	c61, c61, a4, b2
	NOP
	MADD	c71, c71, a4, b3
	pref	1, 3 * SIZE(CO6)
	MADD	c81, c81, a4, b4
	NOP

	MADD	c52, c52, a2, b7
	LD	b7, 28 * SIZE(BO)
	MADD	c62, c62, a2, b2
	LD	b2, 17 * SIZE(BO)
	MADD	c72, c72, a2, b3
	LD	b3, 18 * SIZE(BO)
	MADD	c82, c82, a2, b4
	LD	b4, 19 * SIZE(BO)

	/* k+2 */
	MADD	c11, c11, a3, b1
	LD	a2,  5 * SIZE(AO)
	MADD	c21, c21, a3, b2
	NOP
	MADD	c31, c31, a3, b3
	pref	1, 3 * SIZE(CO7)
	MADD	c41, c41, a3, b4
	NOP

	MADD	c12, c12, a2, b1
	LD	b1, 32 * SIZE(BO)
	MADD	c22, c22, a2, b2
	LD	b2, 21 * SIZE(BO)
	MADD	c32, c32, a2, b3
	LD	b3, 22 * SIZE(BO)
	MADD	c42, c42, a2, b4
	LD	b4, 23 * SIZE(BO)

	/* Unlike .L12, a3 is not reloaded in this group; .L16 does not */
	/* read it and the tile set-up code loads it again.             */
	MADD	c51, c51, a3, b5
	NOP
	MADD	c61, c61, a3, b2
	LD	a4,  6 * SIZE(AO)
	MADD	c71, c71, a3, b3
	NOP
	MADD	c81, c81, a3, b4
	NOP

	MADD	c52, c52, a2, b5
	LD	b5, 36 * SIZE(BO)
	MADD	c62, c62, a2, b2
	LD	b2, 25 * SIZE(BO)
	MADD	c72, c72, a2, b3
	LD	b3, 26 * SIZE(BO)
	MADD	c82, c82, a2, b4
	LD	b4, 27 * SIZE(BO)

	/* k+3 */
	MADD	c11, c11, a4, b6
	LD	a2,  7 * SIZE(AO)
	MADD	c21, c21, a4, b2
	NOP
	MADD	c31, c31, a4, b3
	NOP
	MADD	c41, c41, a4, b4
	NOP

	MADD	c12, c12, a2, b6
	LD	b6, 40 * SIZE(BO)
	MADD	c22, c22, a2, b2
	LD	b2, 29 * SIZE(BO)
	MADD	c32, c32, a2, b3
	LD	b3, 30 * SIZE(BO)
	MADD	c42, c42, a2, b4
	LD	b4, 31 * SIZE(BO)

	/* Pointers advance here; later offsets are relative to the */
	/* new AO / BO.                                              */
	MADD	c51, c51, a4, b7
	daddiu	BO, BO, 32 * SIZE
	MADD	c61, c61, a4, b2
	daddiu	AO, AO,  8 * SIZE
	MADD	c71, c71, a4, b3
	NOP
	MADD	c81, c81, a4, b4
	NOP

	MADD	c52, c52, a2, b7
	LD	b7, 12 * SIZE(BO)
	MADD	c62, c62, a2, b2
	LD	b2,  1 * SIZE(BO)
	MADD	c72, c72, a2, b3
	LD	b3,  2 * SIZE(BO)
	MADD	c82, c82, a2, b4
	LD	b4,  3 * SIZE(BO)
	.align 3

/* .L15: K remainder for the 2x8 tile.  L = (k count) & 3, taken from  */
/* K for GEMM and from TEMP for TRMM.  The pref after the branch runs  */
/* whether or not the branch is taken (delay slot).                    */
.L15:
#ifndef TRMMKERNEL
	andi	L,  K, 3
#else
	andi	L,  TEMP, 3
#endif
	NOP
	blez	L, .L18
	pref	1, 3 * SIZE(CO8)
	.align	3

/* .L16: one k-step per pass: 2 values of A x 8 values of B into all   */
/* sixteen accumulators.  AO advances by 2 * SIZE and BO by 8 * SIZE   */
/* in mid-body, so the loads after the advance use new-base offsets.   */
.L16:
	MADD	c11, c11, a1, b1
	LD	a2,  1 * SIZE(AO)
	MADD	c21, c21, a1, b2
	NOP
	MADD	c31, c31, a1, b3
	NOP
	MADD	c41, c41, a1, b4
	NOP

	MADD	c12, c12, a2, b1
	LD	b1,  8 * SIZE(BO)
	MADD	c22, c22, a2, b2
	LD	b2,  5 * SIZE(BO)
	MADD	c32, c32, a2, b3
	LD	b3,  6 * SIZE(BO)
	MADD	c42, c42, a2, b4
	LD	b4,  7 * SIZE(BO)

	MADD	c51, c51, a1, b5
	daddiu	L, L, -1
	MADD	c61, c61, a1, b2
	daddiu	AO, AO,  2 * SIZE
	MADD	c71, c71, a1, b3
	daddiu	BO, BO,  8 * SIZE
	MADD	c81, c81, a1, b4
	LD	a1,  0 * SIZE(AO)

	MADD	c52, c52, a2, b5
	LD	b5,  4 * SIZE(BO)
	MADD	c62, c62, a2, b2
	LD	b2,  1 * SIZE(BO)
	MADD	c72, c72, a2, b3
	LD	b3,  2 * SIZE(BO)
	MADD	c82, c82, a2, b4
	bgtz	L, .L16
	LD	b4,  3 * SIZE(BO)

/* .L18: write back the finished 2x8 tile and loop to the next row     */
/* pair.  GEMM path: each accumulator is combined with the existing C  */
/* value through MADD c, Cold, ALPHA, c (presumably Cold + ALPHA * c). */
/* TRMM path: C is overwritten with MUL c, ALPHA, c and AO/BO/KK are   */
/* adjusted for the next tile.  Both paths advance CO1..CO8 by two     */
/* elements, step BB by 16 * SIZE after prefetching from it, and       */
/* re-zero c11..c61 for the next tile.                                 */
.L18:
#ifndef TRMMKERNEL
	/* $f0-$f7 are free now (they held a1, a2, b1..b6) and are   */
	/* reused for the old C values.  Column pointers are bumped  */
	/* between the loads, hence the negative offsets afterwards. */
	LD	$f0, 0 * SIZE(CO1)
	daddiu	CO3,CO3, 2 * SIZE
	LD	$f1, 1 * SIZE(CO1)
	daddiu	CO1,CO1, 2 * SIZE
	LD	$f2, 0 * SIZE(CO2)
	daddiu	CO4,CO4, 2 * SIZE
	LD	$f3, 1 * SIZE(CO2)
	daddiu	CO2,CO2, 2 * SIZE

	LD	$f4, -2 * SIZE(CO3)
	daddiu	CO5,CO5, 2 * SIZE
	LD	$f5, -1 * SIZE(CO3)
	daddiu	CO6,CO6, 2 * SIZE
	LD	$f6, -2 * SIZE(CO4)
	daddiu	CO7,CO7, 2 * SIZE
	LD	$f7, -1 * SIZE(CO4)
	daddiu	I, I, -1

	MADD	c11, $f0, ALPHA, c11
	LD	$f0,-2 * SIZE(CO5)
	MADD	c12, $f1, ALPHA, c12
	LD	$f1,-1 * SIZE(CO5)
	MADD	c21, $f2, ALPHA, c21
	LD	$f2,-2 * SIZE(CO6)
	MADD	c22, $f3, ALPHA, c22
	LD	$f3,-1 * SIZE(CO6)

	/* CO8 has not been advanced yet, so its offsets are still 0 / 1. */
	MADD	c31, $f4, ALPHA, c31
	LD	$f4,-2 * SIZE(CO7)
	MADD	c32, $f5, ALPHA, c32
	LD	$f5,-1 * SIZE(CO7)
	MADD	c41, $f6, ALPHA, c41
	LD	$f6, 0 * SIZE(CO8)
	MADD	c42, $f7, ALPHA, c42
	LD	$f7, 1 * SIZE(CO8)

	pref	0, 0 * SIZE(BB)
	pref	0, 8 * SIZE(BB)

	/* Store results; c11 is zeroed right after its store and is */
	/* then the source for clearing c21..c61.                    */
	ST	c11, -2 * SIZE(CO1)
	MTC	$0,  c11
	ST	c12, -1 * SIZE(CO1)
	daddiu	CO8,CO8, 2 * SIZE
	ST	c21, -2 * SIZE(CO2)
	MOV	c21, c11
	ST	c22, -1 * SIZE(CO2)
	daddiu	BB, BB, 16 * SIZE

	MADD	c51, $f0, ALPHA, c51
	ST	c31, -2 * SIZE(CO3)
	MADD	c52, $f1, ALPHA, c52
	ST	c32, -1 * SIZE(CO3)
	MADD	c61, $f2, ALPHA, c61
	ST	c41, -2 * SIZE(CO4)
	MADD	c62, $f3, ALPHA, c62
	ST	c42, -1 * SIZE(CO4)

	MADD	c71, $f4, ALPHA, c71
	ST	c51, -2 * SIZE(CO5)
	MADD	c72, $f5, ALPHA, c72
	ST	c52, -1 * SIZE(CO5)
	MADD	c81, $f6, ALPHA, c81
	ST	c61, -2 * SIZE(CO6)
	MADD	c82, $f7, ALPHA, c82
	ST	c62, -1 * SIZE(CO6)

	ST	c71, -2 * SIZE(CO7)
	MOV	c31, c11
	ST	c72, -1 * SIZE(CO7)
	MOV	c41, c11

	ST	c81, -2 * SIZE(CO8)
	MOV	c51, c11
	ST	c82, -1 * SIZE(CO8)
	/* Next row pair; the MOV after the branch runs either way. */
	bgtz	I, .L11
	MOV	c61, c11
#else
	daddiu	CO4,CO4, 2 * SIZE
	daddiu	CO5,CO5, 2 * SIZE
	daddiu	CO6,CO6, 2 * SIZE
	daddiu	CO7,CO7, 2 * SIZE

	pref	0, 0 * SIZE(BB)
	pref	0, 8 * SIZE(BB)

	/* a1 is zeroed here and used below as the source for */
	/* clearing c11..c61.                                 */
	MUL	c11, ALPHA, c11
	daddiu	CO1,CO1, 2 * SIZE
	MUL	c12, ALPHA, c12
	MTC	$0,  a1
	MUL	c21, ALPHA, c21
	daddiu	CO2,CO2, 2 * SIZE
	MUL	c22, ALPHA, c22
	daddiu	CO3,CO3, 2 * SIZE

	ST	c11, -2 * SIZE(CO1)
	MUL	c31, ALPHA, c31
	ST	c12, -1 * SIZE(CO1)
	MUL	c32, ALPHA, c32
	ST	c21, -2 * SIZE(CO2)
	MUL	c41, ALPHA, c41
	ST	c22, -1 * SIZE(CO2)
	MUL	c42, ALPHA, c42

	ST	c31, -2 * SIZE(CO3)
	MUL	c51, ALPHA, c51
	ST	c32, -1 * SIZE(CO3)
	MUL	c52, ALPHA, c52
	ST	c41, -2 * SIZE(CO4)
	MUL	c61, ALPHA, c61
	ST	c42, -1 * SIZE(CO4)
	MUL	c62, ALPHA, c62

	ST	c51, -2 * SIZE(CO5)
	MUL	c71, ALPHA, c71
	ST	c52, -1 * SIZE(CO5)
	MUL	c72, ALPHA, c72
	ST	c61, -2 * SIZE(CO6)
	MUL	c81, ALPHA, c81
	ST	c62, -1 * SIZE(CO6)
	MUL	c82, ALPHA, c82

	ST	c71, -2 * SIZE(CO7)
	MOV	c11, a1
	ST	c72, -1 * SIZE(CO7)
	MOV	c21, a1

	daddiu	CO8,CO8, 2 * SIZE
	daddiu	BB, BB, 16 * SIZE

	ST	c81, -2 * SIZE(CO8)
	MOV	c31, a1
	ST	c82, -1 * SIZE(CO8)
	MOV	c41, a1

	daddiu	I, I, -1
	MOV	c51, a1

#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	/* Skip the k-steps this tile did not use: TEMP = K - KK - 2  */
	/* (LEFT) or K - KK - 8; AO += TEMP << (1 + BASE_SHIFT),      */
	/* BO += TEMP << (3 + BASE_SHIFT).                            */
	dsubu	TEMP, K, KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -2
#else
	daddiu	TEMP, TEMP, -8
#endif

	dsll	L,    TEMP, 1 + BASE_SHIFT
	dsll	TEMP, TEMP, 3 + BASE_SHIFT

	daddu	AO, AO, L
	daddu	BO, BO, TEMP
#endif

#ifdef LEFT
	/* Two more rows done. */
	daddiu	KK, KK, 2
#endif

	bgtz	I, .L11
	MOV	c61, a1
#endif
	.align 3

/* .L20: odd-row tail of the 8-column panel (M & 1).  Sets up a 1x8    */
/* tile using c11..c81 only: preloads a1..a4 = AO[0..3], b1..b5 =      */
/* BO[0..4], b6 = BO[8], b7 = BO[12], and L = (k count) >> 2.  c61,    */
/* c71 and c81 are copied from c11 here; c11..c51 are expected to have */
/* been cleared already by .L10 or .L18.                               */
.L20:
	andi	I,  M, 1
	MOV	c61, c11
	blez	I, .L29
	MOV	c71, c11

#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	BO,  B
#else
	/* Skip the first KK k-steps: AO += KK << BASE_SHIFT, */
	/* BO = B + (KK << (3 + BASE_SHIFT)).                 */
	dsll	L,    KK, 0 + BASE_SHIFT
	dsll	TEMP, KK, 3 + BASE_SHIFT

	daddu	AO, AO, L
	daddu	BO, B,  TEMP
#endif

	LD	a1,  0 * SIZE(AO)
	LD	a2,  1 * SIZE(AO)
	LD	a3,  2 * SIZE(AO)
	LD	a4,  3 * SIZE(AO)

	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)
	LD	b5,  4 * SIZE(BO)
	LD	b6,  8 * SIZE(BO)
	LD	b7, 12 * SIZE(BO)

	/* TEMP = number of k-steps this tile has to process. */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 1
#else
	daddiu	TEMP, KK, 8
#endif
	dsra	L,  TEMP, 2

	blez	L, .L25
	MOV	c81, c11
#else
	LD	a1,  0 * SIZE(AO)
	LD	a2,  1 * SIZE(AO)
	LD	a3,  2 * SIZE(AO)
	LD	a4,  3 * SIZE(AO)

	LD	b1,  0 * SIZE(B)
	LD	b2,  1 * SIZE(B)
	LD	b3,  2 * SIZE(B)
	LD	b4,  3 * SIZE(B)
	LD	b5,  4 * SIZE(B)
	LD	b6,  8 * SIZE(B)
	LD	b7, 12 * SIZE(B)

	dsra	L,  K, 2
	MOV	c81, c11

	/* BO is set in the delay slot, so it is valid on both paths. */
	blez	L, .L25
	move	BO,  B
#endif
	.align	3

/* .L22: 1x8 tile, four k-steps per pass.  a1..a4 hold the four A      */
/* values of the pass; each is multiplied with the 8 B values of its   */
/* k-step.  Per pass AO advances by 4 * SIZE and BO by 32 * SIZE; both */
/* advances happen in mid-body, so offsets after them are relative to  */
/* the new base (e.g. the negative BO offsets in the last group).      */
.L22:
	/* k+0: a1 with BO[0..7] */
	MADD	c11, c11, a1, b1
	LD	b1, 16 * SIZE(BO)
	MADD	c21, c21, a1, b2
	LD	b2,  5 * SIZE(BO)
	MADD	c31, c31, a1, b3
	LD	b3,  6 * SIZE(BO)
	MADD	c41, c41, a1, b4
	LD	b4,  7 * SIZE(BO)

	MADD	c51, c51, a1, b5
	LD	b5, 20 * SIZE(BO)
	MADD	c61, c61, a1, b2
	LD	b2,  9 * SIZE(BO)
	MADD	c71, c71, a1, b3
	LD	b3, 10 * SIZE(BO)
	MADD	c81, c81, a1, b4
	LD	b4, 11 * SIZE(BO)

	LD	a1,  4 * SIZE(AO)
	daddiu	L, L, -1

	/* k+1: a2 with BO[8..15] */
	MADD	c11, c11, a2, b6
	LD	b6, 24 * SIZE(BO)
	MADD	c21, c21, a2, b2
	LD	b2, 13 * SIZE(BO)
	MADD	c31, c31, a2, b3
	LD	b3, 14 * SIZE(BO)
	MADD	c41, c41, a2, b4
	LD	b4, 15 * SIZE(BO)

	MADD	c51, c51, a2, b7
	LD	b7, 28 * SIZE(BO)
	MADD	c61, c61, a2, b2
	LD	b2, 17 * SIZE(BO)
	MADD	c71, c71, a2, b3
	LD	b3, 18 * SIZE(BO)
	MADD	c81, c81, a2, b4
	LD	b4, 19 * SIZE(BO)

	LD	a2,  5 * SIZE(AO)
	daddiu	AO, AO,  4 * SIZE

	/* k+2: a3 with BO[16..23] */
	MADD	c11, c11, a3, b1
	LD	b1, 32 * SIZE(BO)
	MADD	c21, c21, a3, b2
	LD	b2, 21 * SIZE(BO)
	MADD	c31, c31, a3, b3
	LD	b3, 22 * SIZE(BO)
	MADD	c41, c41, a3, b4
	LD	b4, 23 * SIZE(BO)

	MADD	c51, c51, a3, b5
	LD	b5, 36 * SIZE(BO)
	MADD	c61, c61, a3, b2
	LD	b2, 25 * SIZE(BO)
	MADD	c71, c71, a3, b3
	LD	b3, 26 * SIZE(BO)
	MADD	c81, c81, a3, b4
	LD	b4, 27 * SIZE(BO)

	LD	a3,  2 * SIZE(AO)
	daddiu	BO, BO, 32 * SIZE

	/* k+3: a4 with old BO[24..31]; BO already points at the next pass */
	MADD	c11, c11, a4, b6
	LD	b6,  8 * SIZE(BO)
	MADD	c21, c21, a4, b2
	LD	b2, -3 * SIZE(BO)
	MADD	c31, c31, a4, b3
	LD	b3, -2 * SIZE(BO)
	MADD	c41, c41, a4, b4
	LD	b4, -1 * SIZE(BO)

	MADD	c51, c51, a4, b7
	LD	b7, 12 * SIZE(BO)
	MADD	c61, c61, a4, b2
	LD	b2,  1 * SIZE(BO)
	MADD	c71, c71, a4, b3
	LD	b3,  2 * SIZE(BO)
	MADD	c81, c81, a4, b4
	LD	b4,  3 * SIZE(BO)
	bgtz	L, .L22
	LD	a4,  3 * SIZE(AO)
	.align 3

/* .L25: K remainder for the 1x8 tile, L = (k count) & 3. */
.L25:
#ifndef TRMMKERNEL
	andi	L,  K, 3
#else
	andi	L,  TEMP, 3
#endif
	NOP
	blez	L, .L28
	NOP
	.align	3

/* .L26: one k-step per pass: a1 times 8 values of B.  AO advances by  */
/* 1 * SIZE and BO by 8 * SIZE in mid-body.                            */
.L26:
	MADD	c11, c11, a1, b1
	LD	b1,  8 * SIZE(BO)
	MADD	c21, c21, a1, b2
	LD	b2,  5 * SIZE(BO)
	MADD	c31, c31, a1, b3
	LD	b3,  6 * SIZE(BO)
	MADD	c41, c41, a1, b4
	LD	b4,  7 * SIZE(BO)

	daddiu	L, L, -1
	/* Self-move: leaves a2 unchanged (looks like a scheduling filler). */
	MOV	a2, a2
	daddiu	AO, AO,  1 * SIZE
	daddiu	BO, BO,  8 * SIZE

	MADD	c51, c51, a1, b5
	LD	b5,  4 * SIZE(BO)
	MADD	c61, c61, a1, b2
	LD	b2,  1 * SIZE(BO)
	MADD	c71, c71, a1, b3
	LD	b3,  2 * SIZE(BO)
	MADD	c81, c81, a1, b4
	LD	a1,  0 * SIZE(AO)

	bgtz	L, .L26
	LD	b4,  3 * SIZE(BO)

/* .L28: write back the 1x8 tile (one element in each of the eight     */
/* columns).  GEMM combines with the old C value through               */
/* MADD c, Cold, ALPHA, c; TRMM overwrites C with MUL c, ALPHA, c and  */
/* then adjusts AO / BO / KK.  The CO pointers are not advanced here.  */
.L28:
#ifndef TRMMKERNEL
	LD	$f0, 0 * SIZE(CO1)
	LD	$f1, 0 * SIZE(CO2)
	LD	$f2, 0 * SIZE(CO3)
	LD	$f3, 0 * SIZE(CO4)
	MADD	c11, $f0, ALPHA, c11
	LD	$f4, 0 * SIZE(CO5)
	MADD	c21, $f1, ALPHA, c21
	LD	$f5, 0 * SIZE(CO6)
	MADD	c31, $f2, ALPHA, c31
	LD	$f6, 0 * SIZE(CO7)
	MADD	c41, $f3, ALPHA, c41
	LD	$f7, 0 * SIZE(CO8)
	MADD	c51, $f4, ALPHA, c51
	ST	c11,  0 * SIZE(CO1)
	MADD	c61, $f5, ALPHA, c61
	ST	c21,  0 * SIZE(CO2)
	MADD	c71, $f6, ALPHA, c71
	ST	c31,  0 * SIZE(CO3)
	MADD	c81, $f7, ALPHA, c81
	ST	c41,  0 * SIZE(CO4)
	ST	c51,  0 * SIZE(CO5)
	ST	c61,  0 * SIZE(CO6)
	ST	c71,  0 * SIZE(CO7)
	ST	c81,  0 * SIZE(CO8)
#else
	MUL	c11, ALPHA, c11
	MUL	c21, ALPHA, c21
	MUL	c31, ALPHA, c31
	MUL	c41, ALPHA, c41

	ST	c11,  0 * SIZE(CO1)
	MUL	c51, ALPHA, c51
	ST	c21,  0 * SIZE(CO2)
	MUL	c61, ALPHA, c61
	ST	c31,  0 * SIZE(CO3)
	MUL	c71, ALPHA, c71
	ST	c41,  0 * SIZE(CO4)
	MUL	c81, ALPHA, c81

	ST	c51,  0 * SIZE(CO5)
	ST	c61,  0 * SIZE(CO6)
	ST	c71,  0 * SIZE(CO7)
	ST	c81,  0 * SIZE(CO8)

#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	/* Skip the unused k-steps: TEMP = K - KK - 1 (LEFT) or       */
	/* K - KK - 8; AO += TEMP << BASE_SHIFT,                      */
	/* BO += TEMP << (3 + BASE_SHIFT).                            */
	dsubu	TEMP, K, KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -1
#else
	daddiu	TEMP, TEMP, -8
#endif

	dsll	L,    TEMP, 0 + BASE_SHIFT
	dsll	TEMP, TEMP, 3 + BASE_SHIFT

	daddu	AO, AO, L
	daddu	BO, BO, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 1
#endif
#endif
	.align 3

/* .L29: end of one 8-column panel.  Right-side TRMM moves the running */
/* offset on by the 8 columns just finished.  Loops to .L10 while J is */
/* positive; "move B, BO" sits after the branch and runs either way,   */
/* making B continue from where BO stopped.                            */
.L29:
#if defined(TRMMKERNEL) && !defined(LEFT)
	daddiu	KK, KK, 8
#endif

	bgtz	J, .L10
	move	B, BO
	.align 3
	
/* .L30: 4-column panel, taken when bit 2 of N is set (N & 4).         */
/* CO1..CO4 = C, C+LDC, C+2*LDC, C+3*LDC; C moves past the panel; AO   */
/* is rewound to A (in the branch delay slot, so also on the skip      */
/* path); c11..c41 are zeroed; I = M >> 1 row pairs.                   */
.L30:
	andi	J,  N, 4
	blez	J, .L50
	move	AO, A

	move	CO1, C
	MTC	$0,  c11
	daddu	CO2, C,   LDC
	daddu	CO3, CO2, LDC
	daddu	CO4, CO3, LDC
	MOV	c21, c11
	daddu	C,   CO4, LDC
	MOV	c31, c11

#if defined(TRMMKERNEL) &&  defined(LEFT)
	/* Left-side TRMM restarts the running offset for this panel. */
	move	KK, OFFSET
#endif

	dsra	I,  M, 1
	/* No row pair: go to the single-row tail at .L40. */
	blez	I, .L40
	MOV	c41, c11

/* .L31: start of one 2-row x 4-column tile.  Preloads a1 = AO[0],     */
/* a3 = AO[4], b1..b5 = BO[0..4], b6 = BO[8], b7 = BO[12]; clears the  */
/* row-2 accumulators c12..c42 from c11; sets L = (k count) >> 2.      */
.L31:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	BO,  B
#else
	/* Skip the first KK k-steps: AO += KK << (1 + BASE_SHIFT), */
	/* BO = B + (KK << (2 + BASE_SHIFT)).                       */
	dsll	L,    KK, 1 + BASE_SHIFT
	dsll	TEMP, KK, 2 + BASE_SHIFT

	daddu	AO, AO, L
	daddu	BO, B,  TEMP
#endif

	LD	a1,  0 * SIZE(AO)
	LD	a3,  4 * SIZE(AO)

	LD	b1,  0 * SIZE(BO)
	MOV	c12, c11
	LD	b2,  1 * SIZE(BO)
	MOV	c22, c11
	LD	b3,  2 * SIZE(BO)
	MOV	c32, c11
	LD	b4,  3 * SIZE(BO)
	MOV	c42, c11

	LD	b5,  4 * SIZE(BO)
	LD	b6,  8 * SIZE(BO)
	LD	b7, 12 * SIZE(BO)

	/* TEMP = number of k-steps this tile has to process. */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 2
#else
	daddiu	TEMP, KK, 4
#endif
	dsra	L,  TEMP, 2
	blez	L, .L35
	NOP
#else
	LD	a1,  0 * SIZE(AO)
	LD	a3,  4 * SIZE(AO)

	LD	b1,  0 * SIZE(B)
	MOV	c12, c11
	LD	b2,  1 * SIZE(B)
	MOV	c22, c11
	LD	b3,  2 * SIZE(B)
	MOV	c32, c11
	LD	b4,  3 * SIZE(B)
	MOV	c42, c11

	LD	b5,  4 * SIZE(B)
	dsra	L,  K, 2
	LD	b6,  8 * SIZE(B)
	LD	b7, 12 * SIZE(B)

	/* BO is set in the delay slot, so it is valid on both paths. */
	blez	L, .L35
	move	BO,  B
#endif
	.align	3

/* .L32: 2x4 tile, four k-steps per pass.  Each k-step uses two A      */
/* values (row 1 in a1 or a3, row 2 in a2) and four B values.  Per     */
/* pass AO advances by 8 * SIZE and BO by 16 * SIZE; both advances     */
/* happen inside the last k-step, so the loads after them use offsets  */
/* relative to the new base.                                           */
.L32:
	/* k+0: AO[0..1] with BO[0..3] */
	MADD	c11, c11, a1, b1
	LD	a2,  1 * SIZE(AO)
	MADD	c21, c21, a1, b2
	daddiu	L, L, -1
	MADD	c31, c31, a1, b3
	NOP
	MADD	c41, c41, a1, b4
	LD	a1,  2 * SIZE(AO)

	MADD	c12, c12, a2, b1
	LD	b1, 16 * SIZE(BO)
	MADD	c22, c22, a2, b2
	LD	b2,  5 * SIZE(BO)
	MADD	c32, c32, a2, b3
	LD	b3,  6 * SIZE(BO)
	MADD	c42, c42, a2, b4
	LD	b4,  7 * SIZE(BO)

	/* k+1: AO[2..3] with BO[4..7] */
	MADD	c11, c11, a1, b5
	LD	a2,  3 * SIZE(AO)
	MADD	c21, c21, a1, b2
	NOP
	MADD	c31, c31, a1, b3
	NOP
	MADD	c41, c41, a1, b4
	LD	a1,  8 * SIZE(AO)

	MADD	c12, c12, a2, b5
	LD	b5, 20 * SIZE(BO)
	MADD	c22, c22, a2, b2
	LD	b2,  9 * SIZE(BO)
	MADD	c32, c32, a2, b3
	LD	b3, 10 * SIZE(BO)
	MADD	c42, c42, a2, b4
	LD	b4, 11 * SIZE(BO)

	/* k+2: AO[4..5] with BO[8..11] */
	MADD	c11, c11, a3, b6
	LD	a2,  5 * SIZE(AO)
	MADD	c21, c21, a3, b2
	NOP
	MADD	c31, c31, a3, b3
	NOP
	MADD	c41, c41, a3, b4
	LD	a3,  6 * SIZE(AO)

	MADD	c12, c12, a2, b6
	LD	b6, 24 * SIZE(BO)
	MADD	c22, c22, a2, b2
	LD	b2, 13 * SIZE(BO)
	MADD	c32, c32, a2, b3
	LD	b3, 14 * SIZE(BO)
	MADD	c42, c42, a2, b4
	LD	b4, 15 * SIZE(BO)

	/* k+3: AO[6..7] with BO[12..15]; pointers advance here */
	MADD	c11, c11, a3, b7
	LD	a2,  7 * SIZE(AO)
	MADD	c21, c21, a3, b2
	daddiu	AO, AO,  8 * SIZE
	MADD	c31, c31, a3, b3
	daddiu	BO, BO, 16 * SIZE
	MADD	c41, c41, a3, b4
	LD	a3,  4 * SIZE(AO)

	MADD	c12, c12, a2, b7
	LD	b7, 12 * SIZE(BO)
	MADD	c22, c22, a2, b2
	LD	b2,  1 * SIZE(BO)
	MADD	c32, c32, a2, b3
	LD	b3,  2 * SIZE(BO)
	MADD	c42, c42, a2, b4
	NOP

	bgtz	L, .L32
	LD	b4,  3 * SIZE(BO)
	.align 3

/* .L35: K remainder for the 2x4 tile, L = (k count) & 3. */
.L35:
#ifndef TRMMKERNEL
	andi	L,  K, 3
#else
	andi	L,  TEMP, 3
#endif
	NOP
	blez	L, .L38
	NOP
	.align	3

/* .L36: one k-step per pass: two A values times four B values.  AO    */
/* advances by 2 * SIZE in mid-body; BO advances by 4 * SIZE in the    */
/* branch delay slot, after the B loads for the next pass were issued  */
/* with old-base offsets 4..7.                                         */
.L36:
	MADD	c11, c11, a1, b1
	LD	a2,  1 * SIZE(AO)
	MADD	c21, c21, a1, b2
	daddiu	L, L, -1
	MADD	c31, c31, a1, b3
	daddiu	AO, AO,  2 * SIZE
	MADD	c41, c41, a1, b4
	LD	a1,  0 * SIZE(AO)

	MADD	c12, c12, a2, b1
	LD	b1,  4 * SIZE(BO)
	MADD	c22, c22, a2, b2
	LD	b2,  5 * SIZE(BO)
	MADD	c32, c32, a2, b3
	LD	b3,  6 * SIZE(BO)
	MADD	c42, c42, a2, b4
	LD	b4,  7 * SIZE(BO)

	bgtz	L, .L36
	daddiu	BO, BO,  4 * SIZE

/* .L38: write back the 2x4 tile and loop to the next row pair.  GEMM  */
/* combines with the old C values through MADD c, Cold, ALPHA, c; TRMM */
/* overwrites with MUL c, ALPHA, c and adjusts AO / BO / KK.  Both     */
/* paths advance CO1..CO4 by two elements (stores therefore use -2 and */
/* -1 offsets) and re-zero c11, c21, c31; c41 is cleared in the delay  */
/* slot of the closing branch.                                         */
.L38:
#ifndef TRMMKERNEL
	LD	$f0, 0 * SIZE(CO1)
	daddiu	CO3,CO3, 2 * SIZE
	LD	$f1, 1 * SIZE(CO1)
	daddiu	CO1,CO1, 2 * SIZE
	LD	$f2, 0 * SIZE(CO2)
	daddiu	CO4,CO4, 2 * SIZE
	LD	$f3, 1 * SIZE(CO2)
	daddiu	CO2,CO2, 2 * SIZE

	LD	$f4, -2 * SIZE(CO3)
	MADD	c11, $f0, ALPHA, c11
	LD	$f5, -1 * SIZE(CO3)
	MADD	c12, $f1, ALPHA, c12
	LD	$f6, -2 * SIZE(CO4)
	MADD	c21, $f2, ALPHA, c21
	LD	$f7, -1 * SIZE(CO4)
	MADD	c22, $f3, ALPHA, c22

	MADD	c31, $f4, ALPHA, c31
	ST	c11, -2 * SIZE(CO1)
	MADD	c32, $f5, ALPHA, c32
	ST	c12, -1 * SIZE(CO1)
	MADD	c41, $f6, ALPHA, c41
	ST	c21, -2 * SIZE(CO2)
	MADD	c42, $f7, ALPHA, c42
	ST	c22, -1 * SIZE(CO2)

	ST	c31, -2 * SIZE(CO3)
	MTC	$0,  c11
	ST	c32, -1 * SIZE(CO3)
	daddiu	I, I, -1
	ST	c41, -2 * SIZE(CO4)
	MOV	c21, c11
	ST	c42, -1 * SIZE(CO4)
	MOV	c31, c11
#else
	MUL	c11, ALPHA, c11
	daddiu	CO3,CO3, 2 * SIZE
	MUL	c12, ALPHA, c12
	daddiu	CO1,CO1, 2 * SIZE
	MUL	c21, ALPHA, c21
	daddiu	CO4,CO4, 2 * SIZE
	MUL	c22, ALPHA, c22
	daddiu	CO2,CO2, 2 * SIZE

	ST	c11, -2 * SIZE(CO1)
	MUL	c31, ALPHA, c31
	ST	c12, -1 * SIZE(CO1)
	MUL	c32, ALPHA, c32
	ST	c21, -2 * SIZE(CO2)
	MUL	c41, ALPHA, c41
	ST	c22, -1 * SIZE(CO2)
	MUL	c42, ALPHA, c42

	ST	c31, -2 * SIZE(CO3)
	MTC	$0,  c11
	ST	c32, -1 * SIZE(CO3)
	daddiu	I, I, -1
	ST	c41, -2 * SIZE(CO4)
	MOV	c21, c11
	ST	c42, -1 * SIZE(CO4)
	MOV	c31, c11

#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	/* Skip the unused k-steps: TEMP = K - KK - 2 (LEFT) or       */
	/* K - KK - 4; AO += TEMP << (1 + BASE_SHIFT),                */
	/* BO += TEMP << (2 + BASE_SHIFT).                            */
	dsubu	TEMP, K, KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -2
#else
	daddiu	TEMP, TEMP, -4
#endif

	dsll	L,    TEMP, 1 + BASE_SHIFT
	dsll	TEMP, TEMP, 2 + BASE_SHIFT

	daddu	AO, AO, L
	daddu	BO, BO, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 2
#endif
#endif

	/* Shared by both paths: next row pair, c41 cleared either way. */
	bgtz	I, .L31
	MOV	c41, c11
	.align 3

/* .L40: odd-row tail of the 4-column panel (M & 1).  Sets up a 1x4    */
/* tile: preloads a1 = AO[0], a2 = AO[1], b1..b5 = BO[0..4],           */
/* b6 = BO[8], b7 = BO[12], and L = (k count) >> 2.  c61/c71/c81 are   */
/* also copied from c11 here although the 1x4 loops below only         */
/* accumulate into c11..c41.                                           */
.L40:
	andi	I,  M, 1
	blez	I, .L49
	MOV	c61, c11

#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	BO,  B
#else
	/* Skip the first KK k-steps: AO += KK << BASE_SHIFT, */
	/* BO = B + (KK << (2 + BASE_SHIFT)).                 */
	dsll	L,    KK, 0 + BASE_SHIFT
	dsll	TEMP, KK, 2 + BASE_SHIFT

	daddu	AO, AO, L
	daddu	BO, B,  TEMP
#endif

	LD	a1,  0 * SIZE(AO)
	MOV	c71, c11
	LD	a2,  1 * SIZE(AO)
	MOV	c81, c11

	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)
	LD	b5,  4 * SIZE(BO)
	LD	b6,  8 * SIZE(BO)
	LD	b7, 12 * SIZE(BO)

	/* TEMP = number of k-steps this tile has to process. */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 1
#else
	daddiu	TEMP, KK, 4
#endif
	dsra	L,  TEMP, 2

	blez	L, .L45
	NOP
#else
	LD	a1,  0 * SIZE(AO)
	MOV	c71, c11
	LD	a2,  1 * SIZE(AO)
	MOV	c81, c11

	LD	b1,  0 * SIZE(B)
	LD	b2,  1 * SIZE(B)
	LD	b3,  2 * SIZE(B)
	LD	b4,  3 * SIZE(B)
	LD	b5,  4 * SIZE(B)
	LD	b6,  8 * SIZE(B)
	LD	b7, 12 * SIZE(B)

	dsra	L,  K, 2

	/* BO is set in the delay slot, so it is valid on both paths. */
	blez	L, .L45
	move	BO,  B
#endif
	.align	3

/* .L42: 1x4 tile, four k-steps per pass.  a1 serves k+0; a2 is        */
/* reloaded and reused for k+1, k+2 and k+3.  Per pass AO advances by  */
/* 4 * SIZE and BO by 16 * SIZE, both in mid-body, so the later loads  */
/* use new-base offsets (e.g. -1 * SIZE(AO) is the old AO[3]).         */
.L42:
	/* k+0: a1 with BO[0..3] */
	MADD	c11, c11, a1, b1
	LD	b1, 16 * SIZE(BO)
	MADD	c21, c21, a1, b2
	LD	b2,  5 * SIZE(BO)
	MADD	c31, c31, a1, b3
	LD	b3,  6 * SIZE(BO)
	MADD	c41, c41, a1, b4
	LD	b4,  7 * SIZE(BO)

	LD	a1,  4 * SIZE(AO)
	daddiu	L, L, -1

	/* k+1: a2 = AO[1] with BO[4..7] */
	MADD	c11, c11, a2, b5
	LD	b5, 20 * SIZE(BO)
	MADD	c21, c21, a2, b2
	LD	b2,  9 * SIZE(BO)
	MADD	c31, c31, a2, b3
	LD	b3, 10 * SIZE(BO)
	MADD	c41, c41, a2, b4
	LD	b4, 11 * SIZE(BO)

	LD	a2,  2 * SIZE(AO)
	daddiu	AO, AO,  4 * SIZE

	/* k+2: a2 = old AO[2] with BO[8..11] */
	MADD	c11, c11, a2, b6
	LD	b6, 24 * SIZE(BO)
	MADD	c21, c21, a2, b2
	LD	b2, 13 * SIZE(BO)
	MADD	c31, c31, a2, b3
	LD	b3, 14 * SIZE(BO)
	MADD	c41, c41, a2, b4
	LD	b4, 15 * SIZE(BO)

	LD	a2, -1 * SIZE(AO)
	daddiu	BO, BO, 16 * SIZE

	/* k+3: a2 = old AO[3] with old BO[12..15] */
	MADD	c11, c11, a2, b7
	LD	b7, 12 * SIZE(BO)
	MADD	c21, c21, a2, b2
	LD	b2,  1 * SIZE(BO)
	MADD	c31, c31, a2, b3
	LD	b3,  2 * SIZE(BO)
	MADD	c41, c41, a2, b4
	LD	b4,  3 * SIZE(BO)

	bgtz	L, .L42
	LD	a2,  1 * SIZE(AO)
	.align 3

/* .L45: K remainder for the 1x4 tile, L = (k count) & 3. */
.L45:
#ifndef TRMMKERNEL
	andi	L,  K, 3
#else
	andi	L,  TEMP, 3
#endif
	NOP
	blez	L, .L48
	NOP
	.align	3

/* .L46: one k-step per pass: a1 times four B values.  The next a1 and */
/* B values are loaded with old-base offsets before AO (+1 * SIZE) and */
/* BO (+4 * SIZE, in the branch delay slot) advance.                   */
.L46:
	MADD	c11, c11, a1, b1
	LD	b1,  4 * SIZE(BO)
	MADD	c21, c21, a1, b2
	LD	b2,  5 * SIZE(BO)
	MADD	c31, c31, a1, b3
	LD	b3,  6 * SIZE(BO)
	MADD	c41, c41, a1, b4
	LD	a1,  1 * SIZE(AO)

	LD	b4,  7 * SIZE(BO)
	daddiu	L, L, -1

	daddiu	AO, AO,  1 * SIZE
	/* Self-move: leaves a2 unchanged (looks like a scheduling filler). */
	MOV	a2, a2
	bgtz	L, .L46
	daddiu	BO, BO,  4 * SIZE


/* .L48: write back the 1x4 tile (one element in each of CO1..CO4).    */
/* GEMM combines with the old C value through MADD c, Cold, ALPHA, c;  */
/* TRMM overwrites with MUL c, ALPHA, c and adjusts AO / BO / KK.      */
.L48:
#ifndef TRMMKERNEL
	LD	$f0, 0 * SIZE(CO1)
	LD	$f1, 0 * SIZE(CO2)
	LD	$f2, 0 * SIZE(CO3)
	LD	$f3, 0 * SIZE(CO4)

	MADD	c11, $f0, ALPHA, c11
	MADD	c21, $f1, ALPHA, c21
	MADD	c31, $f2, ALPHA, c31
	MADD	c41, $f3, ALPHA, c41

	ST	c11,  0 * SIZE(CO1)
	ST	c21,  0 * SIZE(CO2)
	ST	c31,  0 * SIZE(CO3)
	ST	c41,  0 * SIZE(CO4)
#else
	MUL	c11, ALPHA, c11
	MUL	c21, ALPHA, c21
	MUL	c31, ALPHA, c31
	MUL	c41, ALPHA, c41

	ST	c11,  0 * SIZE(CO1)
	ST	c21,  0 * SIZE(CO2)
	ST	c31,  0 * SIZE(CO3)
	ST	c41,  0 * SIZE(CO4)

#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	/* Skip the unused k-steps: TEMP = K - KK - 1 (LEFT) or       */
	/* K - KK - 4; AO += TEMP << BASE_SHIFT,                      */
	/* BO += TEMP << (2 + BASE_SHIFT).                            */
	dsubu	TEMP, K, KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -1
#else
	daddiu	TEMP, TEMP, -4
#endif

	dsll	L,    TEMP, 0 + BASE_SHIFT
	dsll	TEMP, TEMP, 2 + BASE_SHIFT

	daddu	AO, AO, L
	daddu	BO, BO, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 1
#endif
#endif
	.align 3

/* .L49: end of the 4-column panel.  Right-side TRMM moves the running */
/* offset on by 4 columns; B continues from where BO stopped.          */
.L49:
#if defined(TRMMKERNEL) && !defined(LEFT)
	daddiu	KK, KK, 4
#endif
	move	B, BO
	.align 3

/* .L50: 2-column panel, taken when bit 1 of N is set (N & 2).         */
/* CO1 = C, CO2 = C + LDC, I = M >> 1 row pairs.  There is no nop      */
/* after the first branch, so "move AO, A" occupies its delay slot and */
/* runs on both paths (assuming noreorder mode); likewise the update   */
/* of C after the second branch.                                       */
.L50:
	andi	J,  N, 2
	blez	J, .L70

	move	AO, A
	move	CO1, C
	daddu	CO2, C,   LDC

#if defined(TRMMKERNEL) &&  defined(LEFT)
	/* Left-side TRMM restarts the running offset for this panel. */
	move	KK, OFFSET
#endif

	dsra	I,  M, 1
	blez	I, .L60
	daddu	C,   CO2, LDC

/* .L51: start of one 2-row x 2-column tile.  Zeroes c11, c21, c12,    */
/* c22; preloads a1 = AO[0], a2 = AO[1], a5 = AO[4] (a5 shares b8's    */
/* register), b1..b3 = BO[0..2], b5 = BO[4], b6 = BO[8], b7 = BO[12];  */
/* sets L = (k count) >> 2.                                            */
.L51:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	BO,  B
#else
	/* Skip the first KK k-steps: both AO and BO move by */
	/* KK << (1 + BASE_SHIFT).                           */
	dsll	L,    KK, 1 + BASE_SHIFT
	dsll	TEMP, KK, 1 + BASE_SHIFT

	daddu	AO, AO, L
	daddu	BO, B,  TEMP
#endif

	LD	a1,  0 * SIZE(AO)
	MTC	$0,  c11
	LD	a2,  1 * SIZE(AO)
	MOV	c21, c11
	LD	a5,  4 * SIZE(AO)

	LD	b1,  0 * SIZE(BO)
	MOV	c12, c11
	LD	b2,  1 * SIZE(BO)
	MOV	c22, c11
	LD	b3,  2 * SIZE(BO)
	LD	b5,  4 * SIZE(BO)
	LD	b6,  8 * SIZE(BO)
	LD	b7, 12 * SIZE(BO)

	/* TEMP = number of k-steps this tile has to process (the */
	/* LEFT and non-LEFT increments are both 2 for a 2x2).    */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 2
#else
	daddiu	TEMP, KK, 2
#endif
	dsra	L,  TEMP, 2
	blez	L, .L55
	NOP
#else
	LD	a1,  0 * SIZE(AO)
	MTC	$0,  c11
	LD	a2,  1 * SIZE(AO)
	MOV	c21, c11
	LD	a5,  4 * SIZE(AO)

	LD	b1,  0 * SIZE(B)
	MOV	c12, c11
	LD	b2,  1 * SIZE(B)
	MOV	c22, c11
	LD	b3,  2 * SIZE(B)
	LD	b5,  4 * SIZE(B)
	dsra	L,  K, 2
	LD	b6,  8 * SIZE(B)
	LD	b7, 12 * SIZE(B)

	/* BO is set in the delay slot, so it is valid on both paths. */
	blez	L, .L55
	move	BO,  B
#endif
	.align	3

/* .L52: 2x2 tile, four k-steps per pass.  Each k-step multiplies two  */
/* A values (rows) with two B values (columns) into c11, c21, c12,     */
/* c22.  AO and BO both advance by 8 * SIZE at the end of the pass, so */
/* the loads with offsets 8..12 fetch the operands of the next pass.   */
.L52:
	/* k+0: AO[0..1] with BO[0..1] */
	MADD	c11, c11, a1, b1
	LD	a3,  2 * SIZE(AO)
	MADD	c21, c21, a1, b2
	LD	b4,  3 * SIZE(BO)
	MADD	c12, c12, a2, b1
	LD	a4,  3 * SIZE(AO)
	MADD	c22, c22, a2, b2
	LD	b1,  8 * SIZE(BO)

	/* k+1: AO[2..3] with BO[2..3] */
	MADD	c11, c11, a3, b3
	LD	a1,  8 * SIZE(AO)
	MADD	c21, c21, a3, b4
	LD	b2,  5 * SIZE(BO)
	MADD	c12, c12, a4, b3
	LD	a2,  5 * SIZE(AO)
	MADD	c22, c22, a4, b4
	LD	b3,  6 * SIZE(BO)

	/* k+2: AO[4..5] (a5, a2) with BO[4..5] (b5, b2) */
	MADD	c11, c11, a5, b5
	LD	a3,  6 * SIZE(AO)
	MADD	c21, c21, a5, b2
	LD	b4,  7 * SIZE(BO)
	MADD	c12, c12, a2, b5
	LD	a4,  7 * SIZE(AO)
	MADD	c22, c22, a2, b2
	LD	b5, 12 * SIZE(BO)

	/* k+3: AO[6..7] with BO[6..7] */
	MADD	c11, c11, a3, b3
	LD	a5, 12 * SIZE(AO)
	MADD	c21, c21, a3, b4
	LD	b2,  9 * SIZE(BO)
	MADD	c12, c12, a4, b3
	LD	a2,  9 * SIZE(AO)
	MADD	c22, c22, a4, b4
	LD	b3, 10 * SIZE(BO)

	daddiu	AO, AO,  8 * SIZE
	daddiu	L, L, -1
	bgtz	L, .L52
	daddiu	BO, BO,  8 * SIZE
	.align 3

/* .L55: K remainder for the 2x2 tile, L = (k count) & 3. */
.L55:
#ifndef TRMMKERNEL
	andi	L,  K, 3
#else
	andi	L,  TEMP, 3
#endif
	NOP
	blez	L, .L58
	NOP
	.align	3

/* .L56: one k-step per pass: two A values times two B values.  The    */
/* operands for the next pass are loaded with old-base offsets before  */
/* AO and BO each advance by 2 * SIZE.                                 */
.L56:
	MADD	c11, c11, a1, b1
	LD	a2,  1 * SIZE(AO)
	MADD	c21, c21, a1, b2
	LD	a1,  2 * SIZE(AO)

	MADD	c12, c12, a2, b1
	LD	b1,  2 * SIZE(BO)
	MADD	c22, c22, a2, b2
	LD	b2,  3 * SIZE(BO)

	daddiu	L, L, -1
	daddiu	AO, AO,  2 * SIZE
	bgtz	L, .L56
	daddiu	BO, BO,  2 * SIZE

/* .L58: write back the 2x2 tile and loop to the next row pair.  GEMM  */
/* combines with the old C values through MADD c, Cold, ALPHA, c; TRMM */
/* overwrites with MUL c, ALPHA, c and adjusts AO / BO / KK.  CO1 and  */
/* CO2 advance by two elements before the stores, hence the -2 / -1    */
/* offsets.  The accumulators are re-zeroed at .L51, not here.         */
.L58:
#ifndef TRMMKERNEL
	LD	$f0, 0 * SIZE(CO1)
	daddiu	I, I, -1
	LD	$f1, 1 * SIZE(CO1)
	daddiu	CO1,CO1, 2 * SIZE
	LD	$f2, 0 * SIZE(CO2)
	NOP
	LD	$f3, 1 * SIZE(CO2)
	daddiu	CO2,CO2, 2 * SIZE

	MADD	c11, $f0, ALPHA, c11
	MADD	c12, $f1, ALPHA, c12
	MADD	c21, $f2, ALPHA, c21
	MADD	c22, $f3, ALPHA, c22

	ST	c11, -2 * SIZE(CO1)
	ST	c12, -1 * SIZE(CO1)
	ST	c21, -2 * SIZE(CO2)
	NOP
	/* The last store sits in the branch delay slot. */
	bgtz	I, .L51
	ST	c22, -1 * SIZE(CO2)
#else
	daddiu	I, I, -1

	daddiu	CO1,CO1, 2 * SIZE
	daddiu	CO2,CO2, 2 * SIZE

	MUL	c11, ALPHA, c11
	MUL	c12, ALPHA, c12
	MUL	c21, ALPHA, c21
	MUL	c22, ALPHA, c22

	ST	c11, -2 * SIZE(CO1)
	ST	c12, -1 * SIZE(CO1)
	ST	c21, -2 * SIZE(CO2)
	ST	c22, -1 * SIZE(CO2)

#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	/* Skip the unused k-steps: TEMP = K - KK - 2 on either side; */
	/* AO and BO both move by TEMP << (1 + BASE_SHIFT).           */
	dsubu	TEMP, K, KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -2
#else
	daddiu	TEMP, TEMP, -2
#endif

	dsll	L,    TEMP, 1 + BASE_SHIFT
	dsll	TEMP, TEMP, 1 + BASE_SHIFT

	daddu	AO, AO, L
	daddu	BO, BO, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 2
#endif

	bgtz	I, .L51
	NOP
#endif
	.align 3

/* .L60: odd-row tail of the 2-column panel (M & 1).  Sets up a 1x2    */
/* tile: zeroes c11, c21, c31, c41; preloads a1..a4 = AO[0..3],        */
/* b1..b5 = BO[0..4], b6 = BO[8], b7 = BO[12]; sets                    */
/* L = (k count) >> 2.  c31 / c41 serve as a second pair of partial    */
/* sums in .L62 and are added into c11 / c21 at .L68.                  */
.L60:
	andi	I,  M, 1
	blez	I, .L69
	NOP

#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	BO,  B
#else
	/* Skip the first KK k-steps: AO += KK << BASE_SHIFT, */
	/* BO = B + (KK << (1 + BASE_SHIFT)).                 */
	dsll	L,    KK, 0 + BASE_SHIFT
	dsll	TEMP, KK, 1 + BASE_SHIFT

	daddu	AO, AO, L
	daddu	BO, B,  TEMP
#endif

	LD	a1,  0 * SIZE(AO)
	MTC	$0,  c11
	LD	a2,  1 * SIZE(AO)
	MOV	c21, c11
	LD	a3,  2 * SIZE(AO)
	MOV	c31, c11
	LD	a4,  3 * SIZE(AO)
	MOV	c41, c11

	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)
	LD	b5,  4 * SIZE(BO)
	LD	b6,  8 * SIZE(BO)
	LD	b7, 12 * SIZE(BO)

	/* TEMP = number of k-steps this tile has to process. */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 1
#else
	daddiu	TEMP, KK, 2
#endif
	dsra	L,  TEMP, 2
	blez	L, .L65
	NOP
#else
	dsra	L,  K, 2
	LD	a1,  0 * SIZE(AO)
	MTC	$0,  c11
	LD	a2,  1 * SIZE(AO)
	MOV	c21, c11
	LD	a3,  2 * SIZE(AO)
	MOV	c31, c11
	LD	a4,  3 * SIZE(AO)
	MOV	c41, c11

	LD	b1,  0 * SIZE(B)
	LD	b2,  1 * SIZE(B)
	LD	b3,  2 * SIZE(B)
	LD	b4,  3 * SIZE(B)
	LD	b5,  4 * SIZE(B)
	LD	b6,  8 * SIZE(B)
	LD	b7, 12 * SIZE(B)

	/* BO is set in the delay slot, so it is valid on both paths. */
	blez	L, .L65
	move	BO,  B
#endif
	.align	3

/*
 * .L62: main loop for the 1x2 tile, four k-steps per pass.
 * a1..a4 hold four consecutive AO values and are each paired with two BO
 * values: a1 and a3 feed c11/c21, a2 and a4 feed c31/c41.  The two
 * accumulator pairs are added together later at .L68.
 * Operands are reloaded ahead of use: b1..b4 twice per pass, a1..a4 once
 * for the next pass.
 * NOTE(review): MADD d, c, a, b is read as d = c + a * b; macro defined
 * outside this chunk - confirm.
 */
.L62:
	MADD	c11, c11, a1, b1
	LD	b1,  4 * SIZE(BO)
	MADD	c21, c21, a1, b2
	LD	b2,  5 * SIZE(BO)
	MADD	c31, c31, a2, b3
	LD	b3,  6 * SIZE(BO)
	MADD	c41, c41, a2, b4
	LD	b4,  7 * SIZE(BO)

	/* a1/a2 are done for this pass; fetch the values for the next pass. */
	LD	a1,  4 * SIZE(AO)
	LD	a2,  5 * SIZE(AO)

	MADD	c11, c11, a3, b1
	LD	b1,  8 * SIZE(BO)
	MADD	c21, c21, a3, b2
	LD	b2,  9 * SIZE(BO)
	MADD	c31, c31, a4, b3
	LD	b3, 10 * SIZE(BO)
	MADD	c41, c41, a4, b4
	LD	b4, 11 * SIZE(BO)

	LD	a3,  6 * SIZE(AO)
	LD	a4,  7 * SIZE(AO)

	/* Four k-steps consumed: 4 elements of AO, 8 of BO. */
	daddiu	L, L, -1
	daddiu	AO, AO,  4 * SIZE

	bgtz	L, .L62
	/* Follows the branch directly; presumably its delay slot (.set noreorder assumed - confirm). */
	daddiu	BO, BO,  8 * SIZE
	.align 3

/*
 * .L65 / .L66: K-remainder pass for the 1x2 tile.
 * L = (K, or TEMP in the TRMM build) & 3 single k-steps remain; when none
 * remain control skips to the write-back at .L68.
 */
.L65:
#ifndef TRMMKERNEL
	andi	L,  K, 3
#else
	andi	L,  TEMP, 3
#endif
	NOP
	blez	L, .L68
	NOP
	.align	3

/*
 * One k-step per pass: a1 is multiplied with b1 and b2 into c11 and c21.
 * a1, b1 and b2 are used before being loaded, so they must be valid on
 * entry; each pass reloads them for the following pass.
 */
.L66:
	MADD	c11, c11, a1, b1
	LD	b1,  2 * SIZE(BO)
	MADD	c21, c21, a1, b2
	LD	b2,  3 * SIZE(BO)

	LD	a1,  1 * SIZE(AO)
	daddiu	L, L, -1

	/* One element of AO and two of BO per k-step. */
	daddiu	AO, AO,  1 * SIZE
	bgtz	L, .L66
	/* Follows the branch directly; presumably its delay slot (.set noreorder assumed - confirm). */
	daddiu	BO, BO,  2 * SIZE


/*
 * .L68: write back the 1x2 tile (one element at CO1, one at CO2).
 * The partial sums are first folded together (c11 += c31, c21 += c41).
 * Plain GEMM build: combines the old C values with ALPHA and the sums via
 * MADD.  TRMM build: stores ALPHA * sum and then adjusts AO/BO/KK.
 * Falls through to .L69.
 * NOTE(review): MADD d, c, a, b is read as d = c + a * b; macro defined
 * outside this chunk - confirm.
 */
.L68:
#ifndef TRMMKERNEL
	LD	$f0, 0 * SIZE(CO1)
	LD	$f1, 0 * SIZE(CO2)

	ADD	c11, c11, c31
	ADD	c21, c21, c41

	MADD	c11, $f0, ALPHA, c11
	MADD	c21, $f1, ALPHA, c21

	ST	c11,  0 * SIZE(CO1)
	ST	c21,  0 * SIZE(CO2)
#else
	ADD	c11, c11, c31
	ADD	c21, c21, c41

	/* TRMM: C is overwritten, not accumulated into. */
	MUL	c11, ALPHA, c11
	MUL	c21, ALPHA, c21

	ST	c11,  0 * SIZE(CO1)
	ST	c21,  0 * SIZE(CO2)

#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	/* TEMP = K - KK - 1 (LEFT) or K - KK - 2 (otherwise). */
	dsubu	TEMP, K, KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -1
#else
	daddiu	TEMP, TEMP, -2
#endif

	/*
	 * Advance AO by TEMP << BASE_SHIFT bytes and BO by twice that, i.e.
	 * TEMP and 2 * TEMP elements if BASE_SHIFT is log2(SIZE) - confirm.
	 */
	dsll	L,    TEMP, 0 + BASE_SHIFT
	dsll	TEMP, TEMP, 1 + BASE_SHIFT

	daddu	AO, AO, L
	daddu	BO, BO, TEMP
#endif

#ifdef LEFT
	/* LEFT build: KK grows by the single row just handled. */
	daddiu	KK, KK, 1
#endif
#endif
	.align 3

/*
 * .L69: end of the two-column panel.
 * In the TRMM build without LEFT, KK is advanced by 2.  B is then set to
 * the current BO so that the following code reads B from where this panel
 * stopped (presumably the start of the next packed panel - confirm).
 */
.L69:
#if defined(TRMMKERNEL) && !defined(LEFT)
	daddiu	KK, KK, 2
#endif
	move	B, BO
	.align 3

/*
 * .L70: handle the last single column when N is odd (N & 1); otherwise
 * jump to the exit at .L999.
 * Resets AO to A and CO1 to C, reloads KK from OFFSET in the TRMM LEFT
 * build, and sets I = M / 2, the number of 2x1 tiles.  When I is zero
 * control goes to the odd-row handling at .L80.
 */
.L70:
	andi	J,  N, 1
	blez	J, .L999

	/*
	 * This is the instruction directly after the branch; with
	 * .set noreorder (assumed - confirm) it runs in the delay slot, which
	 * is harmless on the exit path.
	 */
	move	AO, A
	move	CO1, C

#if defined(TRMMKERNEL) &&  defined(LEFT)
	move	KK, OFFSET
#endif

	dsra	I,  M, 1
	blez	I, .L80
	/* Follows the branch directly: C is advanced by LDC past this column whether or not the branch is taken (delay slot assumed - confirm). */
	daddu	C,   CO1, LDC

/*
 * .L71: set-up for one 2x1 tile (two elements at CO1).
 * Clears c11/c21/c12/c22, positions BO (and AO in some TRMM variants),
 * issues preloads and computes L = number of 4-step passes for .L72.
 * NOTE(review): the loops for this tile visible in this chunk (.L72,
 * .L76) reload a1, a2 and b1 themselves before using them, and c21/c22
 * are never updated before being added at .L78; a5, b2, b3 and b5..b7
 * are not referenced again by name - confirm whether the preloads are
 * still needed.
 */
.L71:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	BO,  B
#else
	/*
	 * Offset the panels by KK k-steps: two elements per step in AO, one
	 * per step in BO (shift amounts 1 and 0 on top of BASE_SHIFT).
	 */
	dsll	L,    KK, 1 + BASE_SHIFT
	dsll	TEMP, KK, 0 + BASE_SHIFT

	daddu	AO, AO, L
	daddu	BO, B,  TEMP
#endif

	/* MTC $0 puts integer register zero into c11; the MOVs copy it to c21, c12 and c22. */
	LD	a1,  0 * SIZE(AO)
	MTC	$0,  c11
	LD	a2,  1 * SIZE(AO)
	MOV	c21, c11
	LD	a5,  4 * SIZE(AO)

	LD	b1,  0 * SIZE(BO)
	MOV	c12, c11
	LD	b2,  1 * SIZE(BO)
	MOV	c22, c11
	LD	b3,  2 * SIZE(BO)
	LD	b5,  4 * SIZE(BO)
	LD	b6,  8 * SIZE(BO)
	LD	b7, 12 * SIZE(BO)

	/* TEMP = effective k-length for this tile: K - KK, KK + 2 (LEFT) or KK + 1 (otherwise). */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 2
#else
	daddiu	TEMP, KK, 1
#endif
	/* L = TEMP / 4: number of passes through the 4-step loop. */
	dsra	L,  TEMP, 2
	blez	L, .L75
	NOP
#else
	/* Plain GEMM: same preloads taken directly from B, L = K / 4. */
	LD	a1,  0 * SIZE(AO)
	MTC	$0,  c11
	LD	a2,  1 * SIZE(AO)
	MOV	c21, c11
	LD	a5,  4 * SIZE(AO)

	LD	b1,  0 * SIZE(B)
	MOV	c12, c11
	LD	b2,  1 * SIZE(B)
	MOV	c22, c11
	LD	b3,  2 * SIZE(B)
	LD	b5,  4 * SIZE(B)
	dsra	L,  K, 2
	LD	b6,  8 * SIZE(B)
	LD	b7, 12 * SIZE(B)

	blez	L, .L75
	/*
	 * BO is read at .L76 even when the branch is taken, so this move is
	 * evidently meant to run in the branch delay slot (.set noreorder
	 * assumed - confirm).
	 */
	move	BO,  B
#endif
	.align	3

/*
 * .L72: main loop for the 2x1 tile, four k-steps per pass.
 * Every k-step loads two AO values and one BO value and accumulates
 * a1 * b1 into c11 and a2 * b1 into c12.  Unlike the wider tiles, the
 * operands are loaded immediately before use rather than one step ahead.
 * NOTE(review): MADD d, c, a, b is read as d = c + a * b; macro defined
 * outside this chunk - confirm.
 */
.L72:
	/* k-step 0 */
	LD	a1,  0 * SIZE(AO)
	LD	a2,  1 * SIZE(AO)
	LD	b1,  0 * SIZE(BO)

	MADD	c11, c11, a1, b1
	MADD	c12, c12, a2, b1

	/* k-step 1 */
	LD	a1,  2 * SIZE(AO)
	LD	a2,  3 * SIZE(AO)
	LD	b1,  1 * SIZE(BO)

	MADD	c11, c11, a1, b1
	MADD	c12, c12, a2, b1

	/* k-step 2 */
	LD	a1,  4 * SIZE(AO)
	LD	a2,  5 * SIZE(AO)
	LD	b1,  2 * SIZE(BO)

	MADD	c11, c11, a1, b1
	MADD	c12, c12, a2, b1

	/* k-step 3 */
	LD	a1,  6 * SIZE(AO)
	LD	a2,  7 * SIZE(AO)
	LD	b1,  3 * SIZE(BO)

	MADD	c11, c11, a1, b1
	MADD	c12, c12, a2, b1

	/* Four k-steps consumed: 8 elements of AO, 4 of BO. */
	daddiu	L, L, -1
	daddiu	AO, AO,  8 * SIZE
	bgtz	L, .L72
	/* Follows the branch directly; presumably its delay slot (.set noreorder assumed - confirm). */
	daddiu	BO, BO,  4 * SIZE
	.align 3

/*
 * .L75 / .L76: K-remainder pass for the 2x1 tile.
 * L = (K, or TEMP in the TRMM build) & 3 single k-steps remain; when none
 * remain control skips to the write-back at .L78.
 */
.L75:
#ifndef TRMMKERNEL
	andi	L,  K, 3
#else
	andi	L,  TEMP, 3
#endif
	NOP
	blez	L, .L78
	NOP
	.align	3

/* One k-step per pass: load two AO values and one BO value, update c11 and c12. */
.L76:
	LD	a1,  0 * SIZE(AO)
	LD	a2,  1 * SIZE(AO)
	LD	b1,  0 * SIZE(BO)

	MADD	c11, c11, a1, b1
	MADD	c12, c12, a2, b1

	daddiu	L, L, -1
	daddiu	AO, AO,  2 * SIZE
	bgtz	L, .L76
	/* Follows the branch directly; presumably its delay slot (.set noreorder assumed - confirm). */
	daddiu	BO, BO,  1 * SIZE

/*
 * .L78: write back the 2x1 tile (two elements at CO1) and loop to the
 * next tile at .L71 while I > 0.
 * c21 and c22 are added into c11 and c12 first; in the code visible in
 * this chunk they still hold the zero written at .L71.
 * Plain GEMM build: combines the old C values with ALPHA and the sums via
 * MADD.  TRMM build: stores ALPHA * sum and then adjusts AO/BO/KK.
 * NOTE(review): MADD d, c, a, b is read as d = c + a * b; macro defined
 * outside this chunk - confirm.
 */
.L78:
#ifndef TRMMKERNEL
	LD	$f0, 0 * SIZE(CO1)
	daddiu	I, I, -1
	LD	$f1, 1 * SIZE(CO1)
	daddiu	CO1,CO1, 2 * SIZE

	ADD	c11, c11, c21
	ADD	c12, c12, c22

	MADD	c11, $f0, ALPHA, c11
	MADD	c12, $f1, ALPHA, c12

	/* CO1 was already advanced by two elements, hence the negative offsets. */
	ST	c11, -2 * SIZE(CO1)
	bgtz	I, .L71
	/* Last store sits right after the branch; presumably its delay slot (.set noreorder assumed - confirm). */
	ST	c12, -1 * SIZE(CO1)
#else
	ADD	c11, c11, c21
	daddiu	I, I, -1
	ADD	c12, c12, c22
	daddiu	CO1,CO1, 2 * SIZE

	/* TRMM: C is overwritten, not accumulated into. */
	MUL	c11, ALPHA, c11
	MUL	c12, ALPHA, c12

	ST	c11, -2 * SIZE(CO1)
	ST	c12, -1 * SIZE(CO1)

#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	/* TEMP = K - KK - 2 (LEFT) or K - KK - 1 (otherwise). */
	dsubu	TEMP, K, KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -2
#else
	daddiu	TEMP, TEMP, -1
#endif

	/*
	 * Advance AO by TEMP << (1 + BASE_SHIFT) bytes and BO by
	 * TEMP << BASE_SHIFT bytes, i.e. 2 * TEMP and TEMP elements if
	 * BASE_SHIFT is log2(SIZE) - confirm.
	 */
	dsll	L,    TEMP, 1 + BASE_SHIFT
	dsll	TEMP, TEMP, 0 + BASE_SHIFT

	daddu	AO, AO, L
	daddu	BO, BO, TEMP
#endif

#ifdef LEFT
	/* LEFT build: KK grows by the two rows just handled. */
	daddiu	KK, KK, 2
#endif

	bgtz	I, .L71
	NOP
#endif
	.align 3

/*
 * .L80: set-up for the final 1x1 tile (odd M within the odd-N column);
 * skipped (to .L89) when M is even.
 * Clears c11 and c21, positions BO (and AO in some TRMM variants), issues
 * preloads and computes L = number of 4-step passes for .L82.
 * NOTE(review): the loops for this tile visible in this chunk (.L82,
 * .L86) reload a1 and b1 themselves before use; a2..a4 and b2..b7 are not
 * referenced again by name - confirm whether the preloads are still
 * needed.
 */
.L80:
	andi	I,  M, 1
	blez	I, .L89
	NOP

#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	BO,  B
#else
	/* Offset both panels by KK k-steps, one element per step each. */
	dsll	L,    KK, 0 + BASE_SHIFT
	dsll	TEMP, KK, 0 + BASE_SHIFT

	daddu	AO, AO, L
	daddu	BO, B,  TEMP
#endif

	/* MTC $0 puts integer register zero into c11; the MOV copies it to c21. */
	LD	a1,  0 * SIZE(AO)
	MTC	$0,  c11
	LD	a2,  1 * SIZE(AO)
	MOV	c21, c11
	LD	a3,  2 * SIZE(AO)
	LD	a4,  3 * SIZE(AO)

	LD	b1,  0 * SIZE(BO)
	LD	b2,  1 * SIZE(BO)
	LD	b3,  2 * SIZE(BO)
	LD	b4,  3 * SIZE(BO)
	LD	b5,  4 * SIZE(BO)
	LD	b6,  8 * SIZE(BO)
	LD	b7, 12 * SIZE(BO)

	/* TEMP = effective k-length: K - KK, or KK + 1 in both remaining variants. */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 1
#else
	daddiu	TEMP, KK, 1
#endif
	/* L = TEMP / 4: number of passes through the 4-step loop. */
	dsra	L,  TEMP, 2
	blez	L, .L85
	NOP
#else
	/* Plain GEMM: same preloads taken directly from B, L = K / 4. */
	LD	a1,  0 * SIZE(AO)
	MTC	$0,  c11
	LD	a2,  1 * SIZE(AO)
	MOV	c21, c11
	LD	a3,  2 * SIZE(AO)
	LD	a4,  3 * SIZE(AO)

	LD	b1,  0 * SIZE(B)
	LD	b2,  1 * SIZE(B)
	LD	b3,  2 * SIZE(B)
	LD	b4,  3 * SIZE(B)
	LD	b5,  4 * SIZE(B)
	LD	b6,  8 * SIZE(B)
	LD	b7, 12 * SIZE(B)

	dsra	L,  K, 2
	blez	L, .L85
	/*
	 * BO is read at .L86 even when the branch is taken, so this move is
	 * evidently meant to run in the branch delay slot (.set noreorder
	 * assumed - confirm).
	 */
	move	BO,  B
#endif
	.align	3

/*
 * .L82: main loop for the 1x1 tile, four k-steps per pass.
 * Each k-step loads one AO and one BO value and accumulates their
 * product, alternating between c11 (steps 0 and 2) and c21 (steps 1 and
 * 3); the two partial sums are added together at .L88.
 * NOTE(review): MADD d, c, a, b is read as d = c + a * b; macro defined
 * outside this chunk - confirm.
 */
.L82:
	/* k-step 0 -> c11 */
	LD	a1,  0 * SIZE(AO)
	LD	b1,  0 * SIZE(BO)

	MADD	c11, c11, a1, b1

	/* k-step 1 -> c21 */
	LD	a1,  1 * SIZE(AO)
	LD	b1,  1 * SIZE(BO)

	MADD	c21, c21, a1, b1

	/* k-step 2 -> c11 */
	LD	a1,  2 * SIZE(AO)
	LD	b1,  2 * SIZE(BO)

	MADD	c11, c11, a1, b1

	/* k-step 3 -> c21 */
	LD	a1,  3 * SIZE(AO)
	LD	b1,  3 * SIZE(BO)

	MADD	c21, c21, a1, b1

	/* Four k-steps consumed: 4 elements of AO and 4 of BO. */
	daddiu	L, L, -1
	daddiu	AO, AO,  4 * SIZE
	bgtz	L, .L82
	/* Follows the branch directly; presumably its delay slot (.set noreorder assumed - confirm). */
	daddiu	BO, BO,  4 * SIZE
	.align 3

/*
 * .L85 / .L86: K-remainder pass for the 1x1 tile.
 * L = (K, or TEMP in the TRMM build) & 3 single k-steps remain; when none
 * remain control skips to the write-back at .L88.
 */
.L85:
#ifndef TRMMKERNEL
	andi	L,  K, 3
#else
	andi	L,  TEMP, 3
#endif
	NOP
	blez	L, .L88
	NOP
	.align	3

/* One k-step per pass: load one AO and one BO value and accumulate into c11. */
.L86:
	LD	a1,  0 * SIZE(AO)
	LD	b1,  0 * SIZE(BO)

	MADD	c11, c11, a1, b1

	daddiu	L, L, -1
	daddiu	AO, AO,  1 * SIZE
	bgtz	L, .L86
	/* Follows the branch directly; presumably its delay slot (.set noreorder assumed - confirm). */
	daddiu	BO, BO,  1 * SIZE


/*
 * .L88: write back the 1x1 tile at CO1.
 * The two partial sums are merged (c11 += c21).  Plain GEMM build:
 * combines the old C value with ALPHA and the sum via MADD; TRMM build:
 * stores ALPHA * sum.  No pointer or KK fix-up follows here; control
 * falls through to .L89.
 * NOTE(review): MADD d, c, a, b is read as d = c + a * b; macro defined
 * outside this chunk - confirm.
 */
.L88:
#ifndef TRMMKERNEL
	LD	$f0, 0 * SIZE(CO1)

	ADD	c11, c11, c21
	MADD	c11, $f0, ALPHA, c11

	ST	c11,  0 * SIZE(CO1)
#else
	ADD	c11, c11, c21
	/* TRMM: C is overwritten, not accumulated into. */
	MUL	c11, ALPHA, c11

	ST	c11,  0 * SIZE(CO1)
#endif
	.align 3

/*
 * .L89: end of the single-column panel.
 * In the TRMM build without LEFT, KK is advanced by 1; B is set to the
 * current BO.  Control then falls through to the exit sequence at .L999.
 */
.L89:
#if defined(TRMMKERNEL) && !defined(LEFT)
	daddiu	KK, KK, 1
#endif
	move	B, BO
	.align 3


/*
 * .L999: common exit.  Reloads the registers saved in the 160-byte stack
 * frame, releases the frame and returns through $31.
 * Frame layout as read back here:
 *     0.. 48  integer registers $16..$22
 *    56.. 88  FP registers $f24..$f28
 *    96..112  integer registers $23..$25 (TRMM build only)
 *   120..144  FP registers $f20..$f23 (only when __64BIT__ is undefined)
 * NOTE(review): the matching stores are in the prologue above this chunk;
 * LDARG is a macro defined outside this chunk (presumably a
 * register-width load) - confirm offsets against the prologue.
 */
.L999:
	LDARG	$16,   0($sp)
	LDARG	$17,   8($sp)
	LDARG	$18,  16($sp)
	LDARG	$19,  24($sp)
	LDARG	$20,  32($sp)
	LDARG	$21,  40($sp)
	LDARG	$22,  48($sp)

	ldc1	$f24, 56($sp)
	ldc1	$f25, 64($sp)
	ldc1	$f26, 72($sp)
	ldc1	$f27, 80($sp)
	ldc1	$f28, 88($sp)

#if defined(TRMMKERNEL)
	LDARG	$23,  96($sp)
	LDARG	$24, 104($sp)
	LDARG	$25, 112($sp)
#endif

#ifndef __64BIT__
	ldc1	$f20,120($sp)
	ldc1	$f21,128($sp)
	ldc1	$f22,136($sp)
	ldc1	$f23,144($sp)
#endif

	j	$31
	/* Stack pop sits right after the jump; presumably its delay slot (.set noreorder assumed - confirm). */
	daddiu	$sp, $sp, 160

	EPILOGUE