/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"
#include "version.h"

#if !defined(EV4) && !defined(EV5) && !defined(EV6)
#error "Architecture is not specified."
#endif

#ifdef EV6
#define PREFETCHSIZE 56
#define UNOP unop
#endif

#ifdef EV5
#define PREFETCHSIZE 48
#define UNOP
#endif

#ifdef EV4
#define UNOP
#endif
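
/*
 * PREFETCHSIZE is the distance (in elements) used by the ldl/ldq-to-$31
 * prefetch hints in the main loops; EV4 has no such hints, so those slots
 * stay plain unops.  UNOP expands to a real unop only on EV6, where the
 * extra padding seems to help instruction scheduling.
 */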

	.set	noat
	.set	noreorder
	.arch ev6

.text
	.align	5
	.globl	CNAME
	.ent	CNAME

#define STACKSIZE 80

#define M	$16
#define N	$17
#define K	$18
#define A	$21
#define	B	$22
#define C	$20
#define	LDC	$23

#define C1	$19
#define C2	$24

#define AO	$at
#define	BO	$5
#define I	$6
#define J	$7
#define L	$8

#define a1	$f16
#define a2	$f17
#define a3	$f18
#define a4	$f19

#define b1	$f20
#define b2	$f21
#define b3	$f22
#define b4	$f23

#define t1	$f24
#define t2	$f25
#define t3	$f26
#define t4	$f27

#define a5	$f28
#define a6	$f30
#define b5	$f29

#define alpha_i	$f29
#define alpha_r	$f30

#define c01	$f0
#define c02	$f1
#define c03	$f2
#define c04	$f3

#define c05	$f4
#define c06	$f5
#define c07	$f6
#define c08	$f7

#define c09	$f8
#define c10	$f9
#define c11	$f10
#define c12	$f11

#define c13	$f12
#define c14	$f13
#define c15	$f14
#define c16	$f15

#define TMP1	$0
#define TMP2	$1
#define KK	$2
#define	BB	$3
#define OFFSET  $4

#define ALPHA_R	64($sp)
#define ALPHA_I	72($sp)
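
/*
 * Register roles: M/N/K are the problem dimensions, A/B/C/LDC the matrix
 * arguments, C1/C2 the two output column pointers, AO/BO the current
 * positions in the A and B panels, and J/I/L the n-, m- and k-loop
 * counters.  a1-a6/b1-b5 hold loaded operands, t1-t4 the in-flight
 * products, c01-c16 the accumulators of one 2x2 complex block, BB a
 * prefetch pointer into B, and KK/OFFSET/TMP1/TMP2 the TRMM bookkeeping.
 * alpha is spilled to the ALPHA_R/ALPHA_I stack slots.
 */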

#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define ADD1	  ADD
#define ADD2	  SUB
#define ADD3	  ADD
#define ADD4	  ADD
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define ADD1	  ADD
#define ADD2	  ADD
#define ADD3	  SUB
#define ADD4	  ADD
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define ADD1	  ADD
#define ADD2	  ADD
#define ADD3	  ADD
#define ADD4	  SUB
#else
#define ADD1	  ADD
#define ADD2	  SUB
#define ADD3	  SUB
#define ADD4	  SUB
#endif
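
/*
 * ADD1..ADD4 select add or subtract for the four partial products of a
 * complex multiply (roughly ADD1 for ar*br, ADD2 for ai*bi, ADD3 for
 * ai*br and ADD4 for ar*bi), so the same loop body serves the plain and
 * conjugated variants.  For the NN case this yields the usual
 * (ar*br - ai*bi) + i*(ai*br + ar*bi).
 */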

CNAME:
	.frame	$sp, STACKSIZE, $26, 0

#ifdef PROFILE
	ldgp	$gp, 0($27)
	lda	$at, _mcount
	jsr	$at, ($at), _mcount
#endif

#ifndef PROFILE
	.prologue 0
#else
	.prologue 1
#endif

	lda	$sp, -STACKSIZE($sp)

	ldq	B,        0 + STACKSIZE($sp)
	ldq	C,        8 + STACKSIZE($sp)
	ldq	LDC,     16 + STACKSIZE($sp)
#ifdef TRMMKERNEL
	ldq	OFFSET,  24 + STACKSIZE($sp)
#endif

	sll	LDC, ZBASE_SHIFT, LDC

	stt	$f2,   0($sp)
	stt	$f3,   8($sp)
	stt	$f4,  16($sp)
	stt	$f5,  24($sp)
	stt	$f6,  32($sp)
	stt	$f7,  40($sp)
	stt	$f8,  48($sp)
	stt	$f9,  56($sp)
	stt	$f19, ALPHA_R
	stt	$f20, ALPHA_I

	cmple	M, 0, $0
	cmple	N, 0, $1
	cmple	K, 0, $2

	or	$0, $1, $0
	or	$0, $2, $0
	bne	$0, $L999

#if defined(TRMMKERNEL) && !defined(LEFT)
	subq	$31, OFFSET, KK
#endif
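
/* Outer loop: J counts column pairs of C (N/2); a leftover odd column is
   handled at $L30. */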

	sra	N, 1, J
	ble	J, $L30
	.align 4
	
$L01:
	mov	C,  C1
	addq	C,  LDC, C2
	mov	A, AO
	s4addq	K, 0, BB


#if defined(TRMMKERNEL) &&  defined(LEFT)
	mov	OFFSET, KK
#endif

	SXADDQ	BB, B, BB
	addq	C2, LDC, C
	unop

	sra	M,  1, I
	fclr	t1
	fclr	t2
	fclr	t3
	fclr	t4

	fclr	c01
	fclr	c05

	ble	I, $L20
	.align 4
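
/* $L11: one 2x2 block of C per iteration of I.  Load the first A/B
   operands, clear the sixteen accumulators, prefetch C1/C2, and set up
   the k-loop count; one TRMM variant first offsets AO/BO by KK. */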

$L11:
#ifndef EV4
	ldl	$31,   0 * SIZE(BB)
	ldl	$31,   8 * SIZE(BB)
	unop
	lda	BB,   16 * SIZE(BB)
#endif

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

#ifdef TRMMKERNEL
#ifdef LEFT
	addq	KK, 2, TMP1
#else
	addq	KK, 2, TMP1
#endif
#endif

	LD	a1,  0 * SIZE(AO)
 	fclr	c09
	LD	a2,  1 * SIZE(AO)
	fclr	c13

	LD	a3,  2 * SIZE(AO)
	fclr	c02
	LD	a4,  3 * SIZE(AO)
	fclr	c06

	LD	b1,  0 * SIZE(B)
	fclr	c10
	LD	b2,  1 * SIZE(B)
	fclr	c14

	LD	b3,  2 * SIZE(B)
	fclr	c03
	LD	b4,  3 * SIZE(B)
	fclr	c07

	lda	BO,  4 * SIZE(B)
	fclr	c11
	lda	AO,  4 * SIZE(AO)
	fclr	c15

 	lds	$f31,  4 * SIZE(C1)
	fclr	c04
#ifndef TRMMKERNEL
	lda	L,        -2(K)
#else
	lda	L,        -2(TMP1)
#endif
	fclr	c08

	lds	$f31,  4 * SIZE(C2)
	fclr	c12
	fclr	c16
	ble	L, $L15
#else
	sll	KK, ZBASE_SHIFT + 1, TMP1
	addq	AO, TMP1, AO
	addq	B,  TMP1, BO
	subq	K, KK, TMP1

	LD	a1,  0 * SIZE(AO)
 	fclr	c09
	LD	a2,  1 * SIZE(AO)
	fclr	c13

	LD	a3,  2 * SIZE(AO)
	fclr	c02
	LD	a4,  3 * SIZE(AO)
	fclr	c06

	LD	b1,  0 * SIZE(BO)
	fclr	c10
	LD	b2,  1 * SIZE(BO)
	fclr	c14

	LD	b3,  2 * SIZE(BO)
	fclr	c03
	LD	b4,  3 * SIZE(BO)
	fclr	c07

	lda	BO,  4 * SIZE(BO)
	fclr	c11
	lda	AO,  4 * SIZE(AO)
	fclr	c15

 	lds	$f31,  4 * SIZE(C1)
	fclr	c04
	lda	L,        -2(TMP1)
	fclr	c08

	lds	$f31,  4 * SIZE(C2)
	fclr	c12
	fclr	c16
	ble	L, $L15
#endif
	.align	5
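
/* Main k loop, unrolled by two k-steps (the eight numbered groups).
   The adds and multiplies are software pipelined: each ADDx retires a
   product started by an earlier MUL, and A/B are prefetched
   PREFETCHSIZE elements ahead (EV5/EV6 only). */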

$L12:
/*  1 */
	ADD1	c11,  t1, c11
#ifndef EV4
	ldq	$31,  PREFETCHSIZE * SIZE(AO)
#else
	unop
#endif
	MUL	b1, a1, t1
#ifndef EV4
	ldl	$31,  PREFETCHSIZE * SIZE(BO)
#else
	unop
#endif

	ADD3	c12,  t2, c12
	unop
	MUL	b1, a2, t2
	unop

	ADD2	c16,  t3, c16
	unop
	MUL	b2, a2, t3
	LD	a5,   0 * SIZE(AO)

	ADD4	c15, t4, c15
	unop
	MUL	b2, a1, t4
	LD	b5,   0 * SIZE(BO)

/*  2 */
	ADD1	c01, t1, c01
	UNOP
	MUL	b1, a3, t1
	UNOP

	ADD3	c02, t2, c02
	UNOP
	MUL	b1, a4, t2
	UNOP

	ADD2	c06,  t3, c06
	unop
	MUL	b2, a4, t3
	unop

	ADD4	c05, t4, c05
	unop
	MUL	b4, a1, t4
	unop

/*  3 */
	ADD1	c03, t1, c03
	unop
	MUL	b3, a1, t1
	unop

	ADD3	c04, t2, c04
	unop
	MUL	b3, a2, t2
	unop

	ADD2	c08,  t3, c08
	unop
	MUL	b4, a2, t3
	LD	a2,  1 * SIZE(AO)

	ADD4	c13, t4, c13
	unop
	MUL	b2, a3, t4
	LD	b2,  1 * SIZE(BO)

/*  4 */
	ADD1	c09,  t1, c09
	unop
	MUL	b3, a3, t1
	LD	a6,  2 * SIZE(AO)

	ADD3	c10,  t2, c10
	unop
	MUL	b3, a4, t2
	LD	b3,  2 * SIZE(BO)

	ADD2	c14, t3, c14
	unop
	MUL	b4, a4, t3
	LD	a4,  3 * SIZE(AO)

	ADD4	c07,  t4, c07
	unop
	MUL	b4, a3, t4
	LD	b4,  3 * SIZE(BO)

/*  5 */
	ADD1	c11,  t1, c11
	unop
	MUL	b5,  a5,  t1
	LD	a1,  4 * SIZE(AO)

	ADD3	c12,  t2, c12
	lda	L,        -2(L)
	MUL	b5,  a2, t2
	LD	b1,  4 * SIZE(BO)

	ADD2	c16,  t3, c16
	unop
	MUL	b2, a2, t3
	unop

	ADD4	c15, t4, c15
	unop
	MUL	b2, a5,  t4
	unop

/*  6 */
	ADD1	c01, t1, c01
	unop
	MUL	b5,  a6, t1
	unop

	ADD3	c02, t2, c02
	unop
	MUL	b5,  a4, t2
	unop

	ADD2	c06,  t3, c06
	unop
	MUL	b2, a4, t3
	unop

	ADD4	c05, t4, c05
	unop
	MUL	b4, a5,  t4
	unop

/*  7 */
	ADD1	c03, t1, c03
	lda	AO,    8 * SIZE(AO)
	MUL	b3, a5,  t1
	unop

	ADD3	c04, t2, c04
	lda	BO,    8 * SIZE(BO)
	MUL	b3, a2, t2
	unop

	ADD2	c08,  t3, c08
	unop
	MUL	b4, a2, t3
	LD	a2, -3 * SIZE(AO)

	ADD4	c13, t4, c13
	unop
	MUL	b2, a6, t4
	LD	b2, -3 * SIZE(BO)

/*  8 */
	ADD1	c09,  t1, c09
	unop
	MUL	b3, a6, t1
	LD	a3, -2 * SIZE(AO)

	ADD3	c10,  t2, c10
	unop
	MUL	b3, a4, t2
	LD	b3, -2 * SIZE(BO)

	ADD2	c14, t3, c14
	unop
	MUL	b4, a4, t3
	LD	a4, -1 * SIZE(AO)

	ADD4	c07,  t4, c07
	MUL	b4, a6, t4
	LD	b4, -1 * SIZE(BO)
	bgt	L,  $L12
	.align 4
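
/* k-loop drain: finish the last one or two pipelined k-steps before the
   write-back at $L18. */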

$L15:
	ADD1	c11,  t1, c11
	ldt	alpha_r, ALPHA_R
	MUL	b1, a1, t1
#ifndef TRMMKERNEL
	blbs	K, $L18
#else
	blbs	TMP1, $L18
#endif
	.align 4

	ADD3	c12,  t2, c12
	MUL	b1, a2, t2
	ADD2	c16,  t3, c16
	MUL	b2, a2, t3

	ADD4	c15, t4, c15
	MUL	b2, a1, t4
	ADD1	c01, t1, c01
	MUL	b1, a3, t1

	ADD3	c02, t2, c02
	unop
	MUL	b1, a4, t2
	LD	b1,  0 * SIZE(BO)

	ADD2	c06,  t3, c06
	MUL	b2, a4, t3
	ADD4	c05, t4, c05
	MUL	b4, a1, t4

	ADD1	c03, t1, c03
	unop
	MUL	b3, a1, t1
	LD	a1,  0 * SIZE(AO)

	ADD3	c04, t2, c04
	unop
	MUL	b3, a2, t2
	unop

	ADD2	c08,  t3, c08
	unop
	MUL	b4, a2, t3
	LD	a2,  1 * SIZE(AO)

	ADD4	c13, t4, c13
	unop
	MUL	b2, a3, t4
	LD	b2,  1 * SIZE(BO)

	ADD1	c09,  t1, c09
	unop
	MUL	b3, a3, t1
	lda	AO,  4 * SIZE(AO)

	ADD3	c10,  t2, c10
	unop
	MUL	b3, a4, t2
 	LD	b3,  2 * SIZE(BO)

	ADD2	c14, t3, c14
	unop
	MUL	b4, a4, t3
	LD	a4, -1 * SIZE(AO)

	ADD4	c07,  t4, c07
	unop
	MUL	b4, a3, t4
	LD	a3, -2 * SIZE(AO)

	ADD1	c11,  t1, c11
	LD	b4,  3 * SIZE(BO)
	MUL	b1, a1, t1
	lda	BO,  4 * SIZE(BO)
	.align 4
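
/* Write-back: fold the partial sums into real/imaginary parts, scale by
   alpha, add the existing C values (GEMM) or start from zero (TRMM), and
   store the 2x2 block, advancing C1/C2. */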

$L18:
	ADD3	c12,  t2, c12
	unop
	MUL	b1, a2, t2
	ldt	alpha_i, ALPHA_I

	ADD2	c16,  t3, c16
	unop
	MUL	b2, a2, t3
#ifndef TRMMKERNEL
	LD	a5, 0 * SIZE(C1)
#else
	unop
#endif

	ADD4	c15, t4, c15
	MUL	b2, a1, t4
	ADD1	c01, t1, c01
	MUL	b1, a3, t1

	ADD3	c02, t2, c02
	unop
	MUL	b1, a4, t2
#ifndef TRMMKERNEL
	LD	b1, 1 * SIZE(C1)
#else
	unop
#endif

	ADD2	c06,  t3, c06
	MUL	b2, a4, t3
	ADD4	c05, t4, c05
	MUL	b4, a1, t4

	ADD1	c03, t1, c03
	unop
	MUL	b3, a1, t1
#ifndef TRMMKERNEL
	LD	a1, 2 * SIZE(C1)
#else
	unop
#endif

	ADD3	c04, t2, c04
	unop
	MUL	b3, a2, t2
	unop

	ADD2	c08,  t3, c08
	unop
	MUL	b4, a2, t3
#ifndef TRMMKERNEL
	LD	a2, 3 * SIZE(C1)
#else
	unop
#endif

	ADD4	c13, t4, c13
	unop
	MUL	b2, a3, t4
#ifndef TRMMKERNEL
	LD	b2, 0 * SIZE(C2)
#else
	unop
#endif

	ADD1	c09,  t1, c09
	lda	I,        -1(I)
	MUL	b3, a3, t1
	unop

	ADD3	c10,  t2, c10
	unop
	MUL	b3, a4, t2
#ifndef TRMMKERNEL
	LD	b3, 1 * SIZE(C2)
#else
	unop
#endif

	ADD2	c14, t3, c14
	unop
	MUL	b4, a4, t3
#ifndef TRMMKERNEL
  	LD	a4, 2 * SIZE(C2)
#else
	unop
#endif

	ADD4	c07,  t4, c07
	unop
	MUL	b4, a3, t4
#ifndef TRMMKERNEL
	LD	a3, 3 * SIZE(C2)
#else
	unop
#endif

	ADD1	c11,  t1, c11
	ADD3	c12,  t2, c12
	ADD2	c16,  t3, c16
	ADD4	c15,  t4, c15

	ADD	c01, c06, c01
	ADD	c02, c05, c02
	ADD	c03, c08, c03
	ADD	c04, c07, c04

	ADD	c09, c14, c09
	MUL	  alpha_r, c01, t1
	ADD	c10, c13, c10
	MUL	  alpha_r, c02, t2

	ADD	c11, c16, c11
	MUL	  alpha_r, c03, t3
	ADD	c12, c15, c12
	MUL	  alpha_r, c04, t4

#ifndef TRMMKERNEL
	ADD	  a5,  t1,  a5
	MUL	  alpha_i, c02, t1
	ADD	  b1,  t2,  b1
	MUL	  alpha_i, c01, t2

	ADD	  a1,  t3,  a1
	MUL	  alpha_i, c04, t3
	ADD	  a2,  t4,  a2
	MUL	  alpha_i, c03, t4
#else
	ADD	  $f31,  t1,  a5
	MUL	  alpha_i, c02, t1
	ADD	  $f31,  t2,  b1
	MUL	  alpha_i, c01, t2

	ADD	  $f31,  t3,  a1
	MUL	  alpha_i, c04, t3
	ADD	  $f31,  t4,  a2
	MUL	  alpha_i, c03, t4
#endif

	SUB	  a5,  t1,  a5
	MUL	  alpha_r, c09, t1
	ADD	  b1,  t2,  b1
	MUL	  alpha_r, c10, t2

	SUB	  a1,  t3,  a1
	MUL	  alpha_r, c11, t3
	ADD	  a2,  t4,  a2
	MUL	  alpha_r, c12, t4

#ifndef TRMMKERNEL
	ADD	  b2,  t1,  b2
	MUL	  alpha_i, c10, t1
	ADD	  b3,  t2,  b3
	MUL	  alpha_i, c09, t2

	ADD	  a4,  t3,  a4
	MUL	  alpha_i, c12, t3
	ADD	  a3,  t4,  a3
	MUL	  alpha_i, c11, t4
#else
	ADD	  $f31,  t1,  b2
	MUL	  alpha_i, c10, t1
	ADD	  $f31,  t2,  b3
	MUL	  alpha_i, c09, t2

	ADD	  $f31,  t3,  a4
	MUL	  alpha_i, c12, t3
	ADD	  $f31,  t4,  a3
	MUL	  alpha_i, c11, t4
#endif

	SUB	  b2,  t1,  b2
	ST	a5,  0 * SIZE(C1)
	fclr	t1
	unop

	ADD	  b3,  t2,  b3
	ST	b1,  1 * SIZE(C1)
	fclr	t2
	unop

	SUB	  a4,  t3,  a4
	ST	a1,  2 * SIZE(C1)
	fclr	t3
	unop

	ADD	  a3,  t4,  a3
	ST	a2,  3 * SIZE(C1)
	fclr	t4
	unop

	ST	b2,  0 * SIZE(C2)
	fclr	c01
 	ST	b3,  1 * SIZE(C2)
	fclr	c05

	ST	a4,  2 * SIZE(C2)
	lda	C1,  4 * SIZE(C1)
	ST	a3,  3 * SIZE(C2)
	lda	C2,  4 * SIZE(C2)
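
/* TRMM only: advance AO/BO past the untouched remainder of this block's
   panels and, in the LEFT case, grow KK for the next block of rows. */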

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	subq	K, KK, TMP1
#ifdef LEFT
	subq	TMP1, 2, TMP1
#else
	subq	TMP1, 2, TMP1
#endif
	sll	TMP1, ZBASE_SHIFT + 1, TMP1
	addq	AO, TMP1, AO
	addq	BO, TMP1, BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	KK, 2, KK
#endif
	bgt	I, $L11
	.align 4
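
/* Odd M: one remaining row (a 1x2 block) for the current column pair. */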

$L20:
	and	M,  1, I
	ble	I, $L29

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

#ifdef TRMMKERNEL
#ifdef LEFT
	addq	KK, 1, TMP1
#else
	addq	KK, 2, TMP1
#endif
#endif

	LD	a1,  0 * SIZE(AO)
 	fclr	c09
	LD	a2,  1 * SIZE(AO)
	fclr	c13

	LD	a3,  2 * SIZE(AO)
	fclr	c02
	LD	a4,  3 * SIZE(AO)
	fclr	c06

	LD	b1,  0 * SIZE(B)
	fclr	c10
	LD	b2,  1 * SIZE(B)
	fclr	c14

	LD	b3,  2 * SIZE(B)
	lda	AO,  2 * SIZE(AO)
	LD	b4,  3 * SIZE(B)
	lda	BO,  4 * SIZE(B)

#ifndef TRMMKERNEL
	lda	L,        -2(K)
#else
	lda	L,        -2(TMP1)
#endif
	ble	L, $L25
#else
	sll	KK, ZBASE_SHIFT + 0, TMP1
	addq	AO, TMP1, AO
	sll	KK, ZBASE_SHIFT + 1, TMP1
	addq	B,  TMP1, BO
	subq	K, KK, TMP1

	LD	a1,  0 * SIZE(AO)
 	fclr	c09
	LD	a2,  1 * SIZE(AO)
	fclr	c13

	LD	a3,  2 * SIZE(AO)
	fclr	c02
	LD	a4,  3 * SIZE(AO)
	fclr	c06

	LD	b1,  0 * SIZE(BO)
	fclr	c10
	LD	b2,  1 * SIZE(BO)
	fclr	c14

	LD	b3,  2 * SIZE(BO)
	lda	AO,  2 * SIZE(AO)
	LD	b4,  3 * SIZE(BO)
	lda	BO,  4 * SIZE(BO)

	lda	L,        -2(TMP1)
	ble	L, $L25
#endif
	.align	5

$L22:
	ADD1	c09, t1, c09
	unop
	MUL	a1, b1, t1
	unop

	ADD3	c10, t2, c10
	unop
	MUL	a2, b1, t2
	LD	b1,  0 * SIZE(BO)

	ADD4	c13, t3, c13
	unop
	MUL	a1, b2, t3
	lda	BO,    8 * SIZE(BO)

	ADD2	c14, t4, c14
	unop
	MUL	a2, b2, t4
	LD	b2, -7 * SIZE(BO)

	ADD1	c01, t1, c01
	unop
	MUL	a1, b3, t1
	unop

	ADD3	c02, t2, c02
	unop
	MUL	a2, b3, t2
	LD	b3, -6 * SIZE(BO)

	ADD4	c05, t3, c05
	unop
	MUL	a1, b4, t3
	LD	a1,  2 * SIZE(AO)

	ADD2	c06, t4, c06
	MUL	a2, b4, t4
	LD	b5, -5 * SIZE(BO)

	ADD1	c09, t1, c09
	unop
	MUL	a3, b1, t1
	LD	a2,  3 * SIZE(AO)

	ADD3	c10, t2, c10
	unop
	MUL	a4, b1, t2
	LD	b1, -4 * SIZE(BO)

	ADD4	c13, t3, c13
	unop
	MUL	a3, b2, t3
	lda	AO,    4 * SIZE(AO)

	ADD2	c14, t4, c14
	MUL	a4, b2, t4
	LD	b2, -3 * SIZE(BO)

	ADD1	c01, t1, c01
	lda	L,        -2(L)
	MUL	a3, b3, t1
	LD	b4, -1 * SIZE(BO)

	ADD3	c02, t2, c02
	unop
	MUL	a4, b3, t2
	LD	b3, -2 * SIZE(BO)

	ADD4	c05, t3, c05
	unop
	MUL	a3, b5, t3
	LD	a3,  0 * SIZE(AO)

	ADD2	c06, t4, c06
	MUL	a4, b5, t4
	LD	a4,  1 * SIZE(AO)
	bgt	L,  $L22
	.align 4

$L25:
	ADD1	c09, t1, c09
	ldt	alpha_r, ALPHA_R
	MUL	a1, b1, t1
#ifndef TRMMKERNEL
	blbs	K, $L28
#else
	blbs	TMP1, $L28
#endif
	.align 4

	ADD3	c10, t2, c10
	unop
	MUL	a2, b1, t2
	LD	b1,  0 * SIZE(BO)

	ADD4	c13, t3, c13
	unop
	MUL	a1, b2, t3
	unop

	ADD2	c14, t4, c14
	unop
	MUL	a2, b2, t4
	LD	b2,  1 * SIZE(BO)

	ADD1	c01, t1, c01
	unop
	MUL	a1, b3, t1
	lda	AO,  2 * SIZE(AO)

	ADD3	c02, t2, c02
	unop
	MUL	a2, b3, t2
	LD	b3,  2 * SIZE(BO)

	ADD4	c05, t3, c05
	unop
	MUL	a1, b4, t3
	LD	a1, -2 * SIZE(AO)

	ADD2	c06, t4, c06
	unop
	MUL	a2, b4, t4
	LD	a2, -1 * SIZE(AO)

	ADD1	c09, t1, c09
	LD	b4,  3 * SIZE(BO)
	MUL	a1, b1, t1
	lda	BO,  4 * SIZE(BO)
	.align 4

$L28:
	ADD3	c10, t2, c10
	unop
	MUL	a2, b1, t2
	ldt	alpha_i, ALPHA_I

	ADD4	c13, t3, c13
	unop
	MUL	a1, b2, t3
#ifndef TRMMKERNEL
	LD	c03, 0 * SIZE(C1)
#else
	unop
#endif

	ADD2	c14, t4, c14
	unop
	MUL	a2, b2, t4
#ifndef TRMMKERNEL
	LD	c04, 1 * SIZE(C1)
#else
	unop
#endif

	ADD1	c01, t1, c01
	unop
	MUL	a1, b3, t1
#ifndef TRMMKERNEL
	LD	c11, 0 * SIZE(C2)
#else
	unop
#endif

	ADD3	c02, t2, c02
	unop
	MUL	a2, b3, t2
#ifndef TRMMKERNEL
	LD	c12, 1 * SIZE(C2)
#else
	unop
#endif

	ADD4	c05, t3, c05
	MUL	a1, b4, t3
	ADD2	c06, t4, c06
	MUL	a2, b4, t4

	ADD1	c09, t1, c09
	ADD3	c10, t2, c10
	ADD4	c13, t3, c13
	ADD2	c14, t4, c14

	ADD	c01, c06, c01
	ADD	c02, c05, c02
	ADD	c09, c14, c09
	ADD	c10, c13, c10

	MUL	  alpha_r, c01, t1
	MUL	  alpha_r, c02, t2
	MUL	  alpha_r, c09, t3
	MUL	  alpha_r, c10, t4

#ifndef TRMMKERNEL
	ADD	  c03,  t1,  c03
	MUL	  alpha_i, c02, t1
	ADD	  c04,  t2,  c04
	MUL	  alpha_i, c01, t2

	ADD	  c11,  t3,  c11
	MUL	  alpha_i, c10, t3
	ADD	  c12,  t4,  c12
	MUL	  alpha_i, c09, t4
#else
	ADD	  $f31,  t1,  c03
	MUL	  alpha_i, c02, t1
	ADD	  $f31,  t2,  c04
	MUL	  alpha_i, c01, t2

	ADD	  $f31,  t3,  c11
	MUL	  alpha_i, c10, t3
	ADD	  $f31,  t4,  c12
	MUL	  alpha_i, c09, t4
#endif

	SUB	  c03,  t1,  c03
	ADD	  c04,  t2,  c04
	SUB	  c11,  t3,  c11
	ADD	  c12,  t4,  c12

	ST	c03,  0 * SIZE(C1)
	ST	c04,  1 * SIZE(C1)
	ST	c11,  0 * SIZE(C2)
 	ST	c12,  1 * SIZE(C2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	subq	K, KK, TMP1
#ifdef LEFT
	subq	TMP1, 1, TMP1
#else
	subq	TMP1, 2, TMP1
#endif
	sll	TMP1, ZBASE_SHIFT + 0, TMP2
	addq	AO, TMP2, AO
	sll	TMP1, ZBASE_SHIFT + 1, TMP2
	addq	BO, TMP2, BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	KK, 1, KK
#endif
	.align 4

$L29:
 	mov	BO, B
	lda	J,        -1(J)
#if defined(TRMMKERNEL) && !defined(LEFT)
	addq	KK, 2, KK
#else
	unop
#endif
	bgt	J, $L01
	.align 4
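
/* Odd N: the last single column of C, processed below as 2x1 and 1x1
   blocks. */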

$L30:
	and	N, 1, J
	ble	J, $L999

	mov	C,  C1
	mov	A, AO

#if defined(TRMMKERNEL) &&  defined(LEFT)
	mov	OFFSET, KK
#endif

	sra	M,  1, I
	ble	I, $L50
	.align 4
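
/* $L41: 2x1 blocks, two rows of C against the remaining column. */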

$L41:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

#ifdef TRMMKERNEL
#ifdef LEFT
	addq	KK, 2, TMP1
#else
	addq	KK, 1, TMP1
#endif
#endif

	LD	a1,  0 * SIZE(AO)
	fclr	t1
	LD	a2,  1 * SIZE(AO)
	fclr	t2
	LD	a3,  2 * SIZE(AO)
	fclr	t3
	LD	a4,  3 * SIZE(AO)
	fclr	t4

	LD	b1,  0 * SIZE(B)
	fclr	c01
	LD	b2,  1 * SIZE(B)
	fclr	c05
	LD	b3,  2 * SIZE(B)
 	fclr	c02
	LD	b4,  3 * SIZE(B)
	fclr	c06

	lda	BO,  2 * SIZE(B)
	fclr	c03
	lda	AO,  4 * SIZE(AO)
	fclr	c07

#ifndef TRMMKERNEL
	lda	L,        -2(K)
#else
	lda	L,        -2(TMP1)
#endif
	fclr	c04
	fclr	c08
	ble	L, $L45
#else
	sll	KK, ZBASE_SHIFT + 1, TMP1
	addq	AO, TMP1, AO
	sll	KK, ZBASE_SHIFT + 0, TMP1
	addq	B,  TMP1, BO
	subq	K, KK, TMP1

	LD	a1,  0 * SIZE(AO)
	fclr	t1
	LD	a2,  1 * SIZE(AO)
	fclr	t2
	LD	a3,  2 * SIZE(AO)
	fclr	t3
	LD	a4,  3 * SIZE(AO)
	fclr	t4

	LD	b1,  0 * SIZE(BO)
	fclr	c01
	LD	b2,  1 * SIZE(BO)
	fclr	c05
	LD	b3,  2 * SIZE(BO)
 	fclr	c02
	LD	b4,  3 * SIZE(BO)
	fclr	c06

	lda	BO,  2 * SIZE(BO)
	fclr	c03
	lda	AO,  4 * SIZE(AO)
	fclr	c07

	lda	L,        -2(TMP1)
	fclr	c04
	fclr	c08
	ble	L, $L45
#endif
	.align	5

$L42:
	ADD4	c05, t1, c05
	unop
	MUL	a1, b1, t1
	unop

	ADD2	c06, t2, c06
	lda	L,   -2(L)
	MUL	a2, b1, t2
	unop

	ADD4	c07, t3, c07
	unop
	MUL	a3, b1, t3
	unop

	ADD2	c08, t4, c08
	unop
	MUL	a4, b1, t4
	LD	b1,  2 * SIZE(BO)

	ADD1	c01, t1, c01
	unop
	MUL	a1, b2, t1
	LD	a1,  0 * SIZE(AO)

	ADD3	c02, t2, c02
	lda	BO,  4 * SIZE(BO)
	MUL	a2, b2, t2
	LD	a2,  1 * SIZE(AO)

	ADD1	c03, t3, c03
	unop
	MUL	a3, b2, t3
	LD	a3,  2 * SIZE(AO)

	ADD3	c04, t4, c04
	unop
	MUL	a4, b2, t4
	LD	a5,  3 * SIZE(AO)

	ADD4	c05, t1, c05
	unop
	MUL	a1, b3, t1
	LD	b2, -1 * SIZE(BO)

	ADD2	c06, t2, c06
	unop
	MUL	a2, b3, t2
	unop

	ADD4	c07, t3, c07
	unop
	MUL	a3, b3, t3
	lda	AO,  8 * SIZE(AO)

	ADD2	c08, t4, c08
	unop
	MUL	a5, b3, t4
	LD	b3,  0 * SIZE(BO)

	ADD1	c01, t1, c01
	unop
	MUL	a1, b4, t1
	LD	a1, -4 * SIZE(AO)

	ADD3	c02, t2, c02
	unop
	MUL	a2, b4, t2
	LD	a2, -3 * SIZE(AO)

	ADD1	c03, t3, c03
	LD	a4, -1 * SIZE(AO)
	MUL	a3, b4, t3
	LD	a3, -2 * SIZE(AO)

	ADD3	c04, t4, c04
	MUL	a5, b4, t4
	LD	b4,  1 * SIZE(BO)
	bgt	L,  $L42
	.align 4

$L45:
	ADD4	c05, t1, c05
	ldt	alpha_r, ALPHA_R
	MUL	b1, a1, t1
#ifndef TRMMKERNEL
	blbs	K, $L48
#else
	blbs	TMP1, $L48
#endif
	.align 4

	ADD2	c06, t2, c06
	MUL	a2, b1, t2
	ADD4	c07, t3, c07
	MUL	a3, b1, t3

	ADD2	c08, t4, c08
	unop
	MUL	a4, b1, t4
	LD	b1,  0 * SIZE(BO)

	ADD1	c01, t1, c01
	unop
	MUL	a1, b2, t1
	LD	a1,  0 * SIZE(AO)

	ADD3	c02, t2, c02
	unop
	MUL	a2, b2, t2
	LD	a2,  1 * SIZE(AO)

	ADD1	c03, t3, c03
	unop
	MUL	a3, b2, t3
	LD	a3,  2 * SIZE(AO)

	ADD3	c04, t4, c04
	MUL	a4, b2, t4
	LD	a4,  3 * SIZE(AO)
	lda	AO,  4 * SIZE(AO)

	ADD4	c05, t1, c05
	LD	b2,  1 * SIZE(BO)
	MUL	a1, b1, t1
	lda	BO,  2 * SIZE(BO)
	.align 4

$L48:
	ADD2	c06, t2, c06
	unop
	MUL	a2, b1, t2
	ldt	alpha_i, ALPHA_I

	ADD4	c07, t3, c07
	lda	I,        -1(I)
	MUL	a3, b1, t3
#ifndef TRMMKERNEL
	LD	c09, 0 * SIZE(C1)
#else
	unop
#endif

	ADD2	c08, t4, c08
	unop
	MUL	a4, b1, t4
#ifndef TRMMKERNEL
	LD	c10, 1 * SIZE(C1)
#else
	unop
#endif

	ADD1	c01, t1, c01
	unop
	MUL	a1, b2, t1
#ifndef TRMMKERNEL
	LD	c11, 2 * SIZE(C1)
#else
	unop
#endif

	ADD3	c02, t2, c02
	unop
	MUL	a2, b2, t2
#ifndef TRMMKERNEL
	LD	c12, 3 * SIZE(C1)
#else
	unop
#endif

	ADD1	c03, t3, c03
	MUL	a3, b2, t3
	ADD3	c04, t4, c04
	MUL	a4, b2, t4

	ADD4	c05, t1, c05
	ADD2	c06, t2, c06
	ADD4	c07, t3, c07
	ADD2	c08, t4, c08

	ADD	c01, c06, c01
	ADD	c02, c05, c02
	ADD	c03, c08, c03
	ADD	c04, c07, c04

	MUL	  alpha_r, c01, t1
	MUL	  alpha_r, c02, t2
	MUL	  alpha_r, c03, t3
	MUL	  alpha_r, c04, t4

#ifndef TRMMKERNEL
	ADD	  c09,  t1,  c09
	MUL	  alpha_i, c02, t1
	ADD	  c10,  t2,  c10
	MUL	  alpha_i, c01, t2

	ADD	  c11,  t3,  c11
	MUL	  alpha_i, c04, t3
	ADD	  c12,  t4,  c12
	MUL	  alpha_i, c03, t4
#else
	ADD	  $f31,  t1,  c09
	MUL	  alpha_i, c02, t1
	ADD	  $f31,  t2,  c10
	MUL	  alpha_i, c01, t2

	ADD	  $f31,  t3,  c11
	MUL	  alpha_i, c04, t3
	ADD	  $f31,  t4,  c12
	MUL	  alpha_i, c03, t4
#endif

	SUB	  c09,  t1,  c09
	ADD	  c10,  t2,  c10
	SUB	  c11,  t3,  c11
	ADD	  c12,  t4,  c12

	ST	c09,  0 * SIZE(C1)
	ST	c10,  1 * SIZE(C1)
	ST	c11,  2 * SIZE(C1)
	ST	c12,  3 * SIZE(C1)

	lda	C1,   4 * SIZE(C1)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	subq	K, KK, TMP1
#ifdef LEFT
	subq	TMP1, 2, TMP1
#else
	subq	TMP1, 1, TMP1
#endif
	sll	TMP1, ZBASE_SHIFT + 1, TMP2
	addq	AO, TMP2, AO
	sll	TMP1, ZBASE_SHIFT + 0, TMP2
	addq	BO, TMP2, BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	KK, 2, KK
#endif

	bgt	I, $L41
	.align 4
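
/* 1x1 block: the final element when both M and N are odd. */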

$L50:
	and	M,  1, I
	ble	I, $L999

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

#ifdef TRMMKERNEL
#ifdef LEFT
	addq	KK, 1, TMP1
#else
	addq	KK, 1, TMP1
#endif
#endif

	LD	a1,  0 * SIZE(AO)
	fclr	t1
	LD	a2,  1 * SIZE(AO)
	fclr	t2
	LD	a3,  2 * SIZE(AO)
	fclr	t3
	LD	a4,  3 * SIZE(AO)
	fclr	t4

	LD	b1,  0 * SIZE(B)
	fclr	c01
	LD	b2,  1 * SIZE(B)
	fclr	c05

	LD	b3,  2 * SIZE(B)
 	fclr	c02
	LD	b4,  3 * SIZE(B)
	fclr	c06

	lda	AO,  2 * SIZE(AO)
	lda	BO,  2 * SIZE(B)

#ifndef TRMMKERNEL
	lda	L,        -2(K)
#else
	lda	L,        -2(TMP1)
#endif
	ble	L, $L55
#else
	sll	KK, ZBASE_SHIFT + 0, TMP1
	addq	AO, TMP1, AO
	addq	B,  TMP1, BO
	subq	K, KK, TMP1

	LD	a1,  0 * SIZE(AO)
	fclr	t1
	LD	a2,  1 * SIZE(AO)
	fclr	t2
	LD	a3,  2 * SIZE(AO)
	fclr	t3
	LD	a4,  3 * SIZE(AO)
	fclr	t4

	LD	b1,  0 * SIZE(BO)
	fclr	c01
	LD	b2,  1 * SIZE(BO)
	fclr	c05

	LD	b3,  2 * SIZE(BO)
 	fclr	c02
	LD	b4,  3 * SIZE(BO)
	fclr	c06

	lda	AO,  2 * SIZE(AO)
	lda	BO,  2 * SIZE(BO)

	lda	L,        -2(TMP1)
	ble	L, $L55
#endif
	.align	5

$L52:
	ADD1	c01, t1, c01
	unop
	MUL	a1, b1, t1
	unop

	ADD3	c02, t2, c02
	lda	AO,  4 * SIZE(AO)
	MUL	a2, b1, t2
	LD	b1,  2 * SIZE(BO)

	ADD4	c05, t3, c05
	lda	L,        -2(L)
	MUL	a1, b2, t3
	LD	a1, -2 * SIZE(AO)

	ADD2	c06, t4, c06
	unop
	MUL	a2, b2, t4
	LD	a2, -1 * SIZE(AO)

	ADD1	c01, t1, c01
	LD	b2,  3 * SIZE(BO)
	MUL	a3, b3, t1
	lda	BO,    4 * SIZE(BO)

	ADD3	c02, t2, c02
	unop
	MUL	a4, b3, t2
	LD	b3,  0 * SIZE(BO)

	ADD4	c05, t3, c05
	unop
	MUL	a3, b4, t3
	LD	a3,  0 * SIZE(AO)

	ADD2	c06, t4, c06
	MUL	a4, b4, t4
	LD	b4,  1 * SIZE(BO)
	unop

	LD	a4,  1 * SIZE(AO)
	unop
	unop
	bgt	L,  $L52
	.align 4

$L55:
	ADD1	c01, t1, c01
	ldt	alpha_r, ALPHA_R
	MUL	a1, b1, t1
#ifndef TRMMKERNEL
	blbs	K, $L58
#else
	blbs	TMP1, $L58
#endif
	.align 4

	ADD3	c02, t2, c02
	unop
	MUL	a2, b1, t2
	LD	b1,  0 * SIZE(BO)

	ADD4	c05, t3, c05
	lda	BO,  2 * SIZE(BO)
	MUL	a1, b2, t3
	LD	a1,  0 * SIZE(AO)

	ADD2	c06, t4, c06
	unop
	MUL	a2, b2, t4
	LD	a2,  1 * SIZE(AO)

	ADD1	c01, t1, c01
	LD	b2, -1 * SIZE(BO)
	MUL	a1, b1, t1
	lda	AO,  2 * SIZE(AO)
	.align 4

$L58:
	ADD3	c02, t2, c02
	unop
	MUL	a2, b1, t2
	ldt	alpha_i, ALPHA_I

	ADD4	c05, t3, c05
	unop
	MUL	a1, b2, t3
#ifndef TRMMKERNEL
	LD	c03, 0 * SIZE(C1)
#else
	unop
#endif

	ADD2	c06, t4, c06
	unop
	MUL	a2, b2, t4
#ifndef TRMMKERNEL
	LD	c04, 1 * SIZE(C1)
#else
	unop
#endif

	ADD1	c01, t1, c01
	ADD3	c02, t2, c02
	ADD4	c05, t3, c05
	ADD2	c06, t4, c06

	ADD	c01, c06, c01
	ADD	c02, c05, c02

	MUL	  alpha_r, c01, t1
	MUL	  alpha_r, c02, t2
	MUL	  alpha_i, c02, t3
	MUL	  alpha_i, c01, t4

#ifndef TRMMKERNEL
	ADD	  c03,  t1,  c03
	ADD	  c04,  t2,  c04
#else
	ADD	  $f31,  t1,  c03
	ADD	  $f31,  t2,  c04
#endif

	SUB	  c03,  t3,  c03
	ADD	  c04,  t4,  c04

	ST	c03,  0 * SIZE(C1)
	ST	c04,  1 * SIZE(C1)
	.align 4
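
/* Epilogue: restore the callee-saved floating-point registers and
   return 0. */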

$L999:
	ldt	$f2,   0($sp)
	ldt	$f3,   8($sp)
	ldt	$f4,  16($sp)
	ldt	$f5,  24($sp)
	ldt	$f6,  32($sp)
	ldt	$f7,  40($sp)
	ldt	$f8,  48($sp)
	ldt	$f9,  56($sp)
	clr	$0
	lda	$sp, STACKSIZE($sp)
	ret
	.ident	VERSION
	.end	CNAME