/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
#include "version.h"
/* Exactly one Alpha implementation (EV4/EV5/EV6) must be selected by the build. */
#if !defined(EV4) && !defined(EV5) && !defined(EV6)
#error "Architecture is not specified."
#endif
/* EV6: software prefetch (ldq/ldl $31) is profitable; pad dual-issue slots with unop. */
#ifdef EV6
#define PREFETCHSIZE 56
#define UNOP unop
#endif
/* EV5: same prefetch distance, but UNOP expands to nothing (no padding needed). */
#ifdef EV5
#define PREFETCHSIZE 56
#define UNOP
#endif
/* EV4: no software prefetch at all (PREFETCHSIZE deliberately undefined). */
#ifdef EV4
#define UNOP
#endif
/* Bytes reserved on the stack for callee-saved FP regs ($f2-$f9) and ALPHA. */
#define STACKSIZE 80
/* ---- Integer register roles (Alpha calling convention: $16-$21 are args) ---- */
#define M $16
#define N $17
#define K $18
#define A $20
#define B $21
#define C $22
#define LDC $23
/* C1..C4: pointers to the four output columns of the current 4-wide panel. */
#define C1 $19
#define C2 $24
#define C3 $25
#define C4 $27
/* AO/BO: running pointers into the packed A and B panels. */
#define AO $at
#define BO $5
/* I: M-direction counter, J: N-direction counter, L: K-direction counter. */
#define I $6
#define J $7
#define L $8
/* ---- FP register roles: a*/b* hold streamed A/B elements, t* are the
   software-pipeline products in flight. ---- */
#define a1 $f16
#define a2 $f17
#define a3 $f18
#define a4 $f19
#define b1 $f20
#define b2 $f21
#define b3 $f22
#define b4 $f23
#define t1 $f24
#define t2 $f25
#define t3 $f26
#define t4 $f27
#define a5 $f28
#define a6 $f30
#define b5 $f29
/* NOTE(review): alpha aliases a6 on $f30. This looks intentional — a6 is only
   live inside the multiply loops, and alpha is loaded (ldt alpha, ALPHA) only
   after each loop body has finished — but confirm before touching either. */
#define alpha $f30
/* c01..c16: the 4x4 block of accumulators ($f0-$f15). */
#define c01 $f0
#define c02 $f1
#define c03 $f2
#define c04 $f3
#define c05 $f4
#define c06 $f5
#define c07 $f6
#define c08 $f7
#define c09 $f8
#define c10 $f9
#define c11 $f10
#define c12 $f11
#define c13 $f12
#define c14 $f13
#define c15 $f14
#define c16 $f15
/* Scratch/bookkeeping for the TRMM (triangular) variant. */
#define TMP1 $0
#define TMP2 $1
#define KK $2
#define BB $3
#define OFFSET $4
/* Stack slot holding the scalar alpha (spilled from $f19 in the prologue). */
#define ALPHA 64($sp)
PROLOGUE
PROFCODE
.frame $sp, STACKSIZE, $26, 0
lda $sp, -STACKSIZE($sp)
ldq C, 0 + STACKSIZE($sp)
ldq LDC, 8 + STACKSIZE($sp)
#ifdef TRMMKERNEL
ldq OFFSET, 16 + STACKSIZE($sp)
#endif
SXADDQ LDC, 0, LDC
stt $f2, 0($sp)
stt $f3, 8($sp)
stt $f4, 16($sp)
stt $f5, 24($sp)
stt $f6, 32($sp)
stt $f7, 40($sp)
stt $f8, 48($sp)
stt $f9, 56($sp)
stt $f19, ALPHA
cmple M, 0, $0
cmple N, 0, $1
cmple K, 0, $2
or $0, $1, $0
or $0, $2, $0
bne $0, $L999
#if defined(TRMMKERNEL) && !defined(LEFT)
subq $31, OFFSET, KK
#endif
sra N, 2, J
ble J, $L40
.align 4
$L01:
mov C, C1
addq C, LDC, C2
mov A, AO
s4addq K, 0, BB
#if defined(TRMMKERNEL) && defined(LEFT)
mov OFFSET, KK
#endif
addq C2, LDC, C3
s4addq LDC, C, C
SXADDQ BB, B, BB
fclr t1
addq C3, LDC, C4
fclr t2
sra M, 2, I
fclr t3
fclr t4
ble I, $L20
.align 4
$L11:
#if defined(EV5) || defined(EV6)
ldl $31, 0 * SIZE(BB)
ldl $31, 8 * SIZE(BB)
unop
lda BB, 16 * SIZE(BB)
#endif
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
#ifdef TRMMKERNEL
#ifdef LEFT
addq KK, 4, TMP1
#else
addq KK, 4, TMP1
#endif
#endif
LD a1, 0 * SIZE(AO)
fclr c11
LD a2, 1 * SIZE(AO)
fclr c12
LD a3, 2 * SIZE(AO)
fclr c16
LD a4, 3 * SIZE(AO)
fclr c15
LD b1, 0 * SIZE(B)
fclr c01
LD b2, 1 * SIZE(B)
fclr c02
LD b3, 2 * SIZE(B)
fclr c06
LD b4, 3 * SIZE(B)
fclr c05
lds $f31, 4 * SIZE(C1)
fclr c03
#ifndef TRMMKERNEL
lda L, -2(K)
#else
lda L, -2(TMP1)
#endif
fclr c04
lds $f31, 7 * SIZE(C2)
fclr c08
lda BO, 4 * SIZE(B)
fclr c13
lds $f31, 4 * SIZE(C3)
fclr c09
lda AO, 4 * SIZE(AO)
fclr c10
#else
sll KK, BASE_SHIFT + 2, TMP1
addq AO, TMP1, AO
addq B, TMP1, BO
subq K, KK, TMP1
LD a1, 0 * SIZE(AO)
fclr c11
LD a2, 1 * SIZE(AO)
fclr c12
LD a3, 2 * SIZE(AO)
fclr c16
LD a4, 3 * SIZE(AO)
fclr c15
LD b1, 0 * SIZE(BO)
fclr c01
LD b2, 1 * SIZE(BO)
fclr c02
LD b3, 2 * SIZE(BO)
fclr c06
LD b4, 3 * SIZE(BO)
fclr c05
lds $f31, 4 * SIZE(C1)
fclr c03
lda L, -2(TMP1)
fclr c04
lds $f31, 7 * SIZE(C2)
fclr c08
lda BO, 4 * SIZE(BO)
fclr c13
lds $f31, 4 * SIZE(C3)
fclr c09
lda AO, 4 * SIZE(AO)
fclr c10
#endif
lds $f31, 7 * SIZE(C4)
fclr c14
fclr c07
ble L, $L15
.align 5
$L12:
/* 1 */
ADD c11, t1, c11
#ifndef EV4
ldq $31, PREFETCHSIZE * SIZE(AO)
#else
unop
#endif
MUL b1, a1, t1
#ifndef EV4
ldl $31, PREFETCHSIZE * SIZE(BO)
#else
unop
#endif
ADD c12, t2, c12
unop
MUL b1, a2, t2
unop
ADD c16, t3, c16
unop
MUL b2, a2, t3
LD a5, 0 * SIZE(AO)
ADD c15, t4, c15
unop
MUL b2, a1, t4
LD b5, 0 * SIZE(BO)
/* 2 */
ADD c01, t1, c01
UNOP
MUL b1, a3, t1
UNOP
ADD c02, t2, c02
UNOP
MUL b1, a4, t2
UNOP
ADD c06, t3, c06
unop
MUL b2, a4, t3
unop
ADD c05, t4, c05
unop
MUL b4, a1, t4
unop
/* 3 */
ADD c03, t1, c03
unop
MUL b3, a1, t1
unop
ADD c04, t2, c04
unop
MUL b3, a2, t2
unop
ADD c08, t3, c08
unop
MUL b4, a2, t3
LD a2, 1 * SIZE(AO)
ADD c13, t4, c13
unop
MUL b2, a3, t4
LD b2, 1 * SIZE(BO)
/* 4 */
ADD c09, t1, c09
unop
MUL b3, a3, t1
LD a6, 2 * SIZE(AO)
ADD c10, t2, c10
unop
MUL b3, a4, t2
LD b3, 2 * SIZE(BO)
ADD c14, t3, c14
unop
MUL b4, a4, t3
LD a4, 3 * SIZE(AO)
ADD c07, t4, c07
unop
MUL b4, a3, t4
LD b4, 3 * SIZE(BO)
/* 5 */
ADD c11, t1, c11
unop
MUL b5, a5, t1
LD a1, 4 * SIZE(AO)
ADD c12, t2, c12
lda L, -2(L)
MUL b5, a2, t2
LD b1, 4 * SIZE(BO)
ADD c16, t3, c16
unop
MUL b2, a2, t3
unop
ADD c15, t4, c15
unop
MUL b2, a5, t4
unop
/* 6 */
ADD c01, t1, c01
unop
MUL b5, a6, t1
unop
ADD c02, t2, c02
unop
MUL b5, a4, t2
unop
ADD c06, t3, c06
unop
MUL b2, a4, t3
unop
ADD c05, t4, c05
unop
MUL b4, a5, t4
unop
/* 7 */
ADD c03, t1, c03
lda AO, 8 * SIZE(AO)
MUL b3, a5, t1
unop
ADD c04, t2, c04
lda BO, 8 * SIZE(BO)
MUL b3, a2, t2
unop
ADD c08, t3, c08
unop
MUL b4, a2, t3
LD a2, -3 * SIZE(AO)
ADD c13, t4, c13
unop
MUL b2, a6, t4
LD b2, -3 * SIZE(BO)
/* 8 */
ADD c09, t1, c09
unop
MUL b3, a6, t1
LD a3, -2 * SIZE(AO)
ADD c10, t2, c10
unop
MUL b3, a4, t2
LD b3, -2 * SIZE(BO)
ADD c14, t3, c14
unop
MUL b4, a4, t3
LD a4, -1 * SIZE(AO)
ADD c07, t4, c07
MUL b4, a6, t4
LD b4, -1 * SIZE(BO)
bgt L, $L12
.align 4
$L15:
ADD c11, t1, c11
ldt alpha, ALPHA
MUL b1, a1, t1
#ifndef TRMMKERNEL
blbs K, $L18
#else
blbs TMP1, $L18
#endif
.align 4
ADD c12, t2, c12
MUL b1, a2, t2
ADD c16, t3, c16
MUL b2, a2, t3
ADD c15, t4, c15
MUL b2, a1, t4
ADD c01, t1, c01
MUL b1, a3, t1
ADD c02, t2, c02
unop
MUL b1, a4, t2
LD b1, 0 * SIZE(BO)
ADD c06, t3, c06
MUL b2, a4, t3
ADD c05, t4, c05
MUL b4, a1, t4
ADD c03, t1, c03
unop
MUL b3, a1, t1
LD a1, 0 * SIZE(AO)
ADD c04, t2, c04
unop
MUL b3, a2, t2
unop
ADD c08, t3, c08
unop
MUL b4, a2, t3
LD a2, 1 * SIZE(AO)
ADD c13, t4, c13
unop
MUL b2, a3, t4
LD b2, 1 * SIZE(BO)
ADD c09, t1, c09
unop
MUL b3, a3, t1
lda AO, 4 * SIZE(AO)
ADD c10, t2, c10
unop
MUL b3, a4, t2
LD b3, 2 * SIZE(BO)
ADD c14, t3, c14
unop
MUL b4, a4, t3
LD a4, -1 * SIZE(AO)
ADD c07, t4, c07
unop
MUL b4, a3, t4
LD a3, -2 * SIZE(AO)
ADD c11, t1, c11
LD b4, 3 * SIZE(BO)
MUL b1, a1, t1
lda BO, 4 * SIZE(BO)
.align 4
$L18:
ADD c12, t2, c12
unop
MUL b1, a2, t2
#ifndef TRMMKERNEL
LD a5, 0 * SIZE(C1)
#else
unop
#endif
ADD c16, t3, c16
unop
MUL b2, a2, t3
unop
ADD c15, t4, c15
unop
MUL b2, a1, t4
#ifndef TRMMKERNEL
LD b5, 1 * SIZE(C1)
#else
unop
#endif
ADD c01, t1, c01
unop
MUL b1, a3, t1
unop
ADD c02, t2, c02
unop
MUL b1, a4, t2
#ifndef TRMMKERNEL
LD b1, 0 * SIZE(C2)
#else
unop
#endif
ADD c06, t3, c06
unop
MUL b2, a4, t3
unop
ADD c05, t4, c05
unop
MUL b4, a1, t4
unop
ADD c03, t1, c03
unop
MUL b3, a1, t1
unop
ADD c04, t2, c04
unop
MUL b3, a2, t2
#ifndef TRMMKERNEL
LD a1, 0 * SIZE(C3)
#else
unop
#endif
ADD c08, t3, c08
unop
MUL b4, a2, t3
#ifndef TRMMKERNEL
LD a2, 2 * SIZE(C1)
#else
unop
#endif
ADD c13, t4, c13
unop
MUL b2, a3, t4
#ifndef TRMMKERNEL
LD b2, 3 * SIZE(C1)
#else
unop
#endif
ADD c09, t1, c09
lda I, -1(I)
MUL b3, a3, t1
unop
ADD c10, t2, c10
unop
MUL b3, a4, t2
#ifndef TRMMKERNEL
LD b3, 0 * SIZE(C4)
#else
unop
#endif
ADD c14, t3, c14
unop
MUL b4, a4, t3
#ifndef TRMMKERNEL
LD a4, 1 * SIZE(C2)
#else
unop
#endif
ADD c07, t4, c07
unop
MUL b4, a3, t4
#ifndef TRMMKERNEL
LD a3, 2 * SIZE(C2)
#else
unop
#endif
ADD c11, t1, c11
unop
MUL alpha, c01, c01
#ifndef TRMMKERNEL
LD b4, 3 * SIZE(C2)
#else
unop
#endif
ADD c12, t2, c12
unop
MUL alpha, c02, c02
#ifndef TRMMKERNEL
LD t1, 1 * SIZE(C3)
#else
unop
#endif
ADD c16, t3, c16
unop
MUL alpha, c03, c03
#ifndef TRMMKERNEL
LD t2, 2 * SIZE(C3)
#else
unop
#endif
ADD c15, t4, c15
unop
MUL alpha, c04, c04
#ifndef TRMMKERNEL
LD t3, 3 * SIZE(C3)
#else
unop
#endif
MUL alpha, c05, c05
unop
#ifndef TRMMKERNEL
ADD c01, a5, c01
LD t4, 1 * SIZE(C4)
#else
unop
unop
#endif
MUL alpha, c06, c06
#ifndef TRMMKERNEL
unop
ADD c02, b5, c02
LD a5, 2 * SIZE(C4)
#endif
MUL alpha, c07, c07
#ifndef TRMMKERNEL
unop
ADD c03, a2, c03
LD b5, 3 * SIZE(C4)
#endif
MUL alpha, c08, c08
#ifndef TRMMKERNEL
unop
ADD c04, b2, c04
unop
#endif
MUL alpha, c09, c09
ST c01, 0 * SIZE(C1)
#ifndef TRMMKERNEL
ADD c05, b1, c05
unop
#endif
MUL alpha, c10, c10
ST c02, 1 * SIZE(C1)
#ifndef TRMMKERNEL
ADD c06, a4, c06
unop
#endif
MUL alpha, c11, c11
ST c03, 2 * SIZE(C1)
#ifndef TRMMKERNEL
ADD c07, a3, c07
unop
#endif
MUL alpha, c12, c12
ST c04, 3 * SIZE(C1)
#ifndef TRMMKERNEL
ADD c08, b4, c08
#else
unop
#endif
lda C1, 4 * SIZE(C1)
MUL alpha, c13, c13
ST c05, 0 * SIZE(C2)
#ifndef TRMMKERNEL
ADD c09, a1, c09
unop
#endif
MUL alpha, c14, c14
ST c06, 1 * SIZE(C2)
#ifndef TRMMKERNEL
ADD c10, t1, c10
unop
#endif
MUL alpha, c15, c15
ST c07, 2 * SIZE(C2)
#ifndef TRMMKERNEL
ADD c11, t2, c11
unop
#endif
MUL alpha, c16, c16
ST c08, 3 * SIZE(C2)
#ifndef TRMMKERNEL
ADD c12, t3, c12
#else
unop
#endif
lda C2, 4 * SIZE(C2)
#ifndef TRMMKERNEL
ADD c13, b3, c13
#else
unop
#endif
ST c09, 0 * SIZE(C3)
fclr t1
lda C4, 4 * SIZE(C4)
#ifndef TRMMKERNEL
ADD c14, t4, c14
#else
unop
#endif
ST c10, 1 * SIZE(C3)
fclr t2
unop
#ifndef TRMMKERNEL
ADD c15, a5, c15
#else
unop
#endif
ST c11, 2 * SIZE(C3)
fclr t3
unop
#ifndef TRMMKERNEL
ADD c16, b5, c16
#else
unop
#endif
ST c12, 3 * SIZE(C3)
fclr t4
lda C3, 4 * SIZE(C3)
ST c13, -4 * SIZE(C4)
ST c14, -3 * SIZE(C4)
ST c15, -2 * SIZE(C4)
ST c16, -1 * SIZE(C4)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
subq K, KK, TMP1
#ifdef LEFT
subq TMP1, 4, TMP1
#else
subq TMP1, 4, TMP1
#endif
sll TMP1, BASE_SHIFT + 2, TMP1
addq AO, TMP1, AO
addq BO, TMP1, BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq KK, 4, KK
#endif
bgt I, $L11
.align 4
$L20:
and M, 2, I
ble I, $L30
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
#ifdef TRMMKERNEL
#ifdef LEFT
addq KK, 2, TMP1
#else
addq KK, 4, TMP1
#endif
#endif
LD a1, 0 * SIZE(AO)
fclr c09
LD a2, 1 * SIZE(AO)
fclr c13
LD a3, 2 * SIZE(AO)
fclr c10
LD a4, 3 * SIZE(AO)
fclr c14
LD b1, 0 * SIZE(B)
#ifndef TRMMKERNEL
lda L, -2(K)
#else
lda L, -2(TMP1)
#endif
LD b2, 1 * SIZE(B)
lda AO, 2 * SIZE(AO)
LD b3, 2 * SIZE(B)
fclr c01
LD b4, 3 * SIZE(B)
fclr c05
lda BO, 4 * SIZE(B)
fclr c02
fclr c06
ble L, $L25
#else
sll KK, BASE_SHIFT + 1, TMP1
addq AO, TMP1, AO
sll KK, BASE_SHIFT + 2, TMP2
addq B, TMP2, BO
subq K, KK, TMP1
LD a1, 0 * SIZE(AO)
fclr c09
LD a2, 1 * SIZE(AO)
fclr c13
LD a3, 2 * SIZE(AO)
fclr c10
LD a4, 3 * SIZE(AO)
fclr c14
LD b1, 0 * SIZE(BO)
lda L, -2(TMP1)
LD b2, 1 * SIZE(BO)
lda AO, 2 * SIZE(AO)
LD b3, 2 * SIZE(BO)
fclr c01
LD b4, 3 * SIZE(BO)
fclr c05
lda BO, 4 * SIZE(BO)
fclr c02
fclr c06
ble L, $L25
#endif
.align 4
$L22:
ADD c09, t1, c09
unop
MUL a1, b1, t1
unop
ADD c10, t2, c10
unop
MUL a2, b1, t2
LD b1, 0 * SIZE(BO)
ADD c13, t3, c13
unop
MUL a1, b2, t3
lda BO, 8 * SIZE(BO)
ADD c14, t4, c14
unop
MUL a2, b2, t4
LD b2, -7 * SIZE(BO)
ADD c01, t1, c01
unop
MUL a1, b3, t1
unop
ADD c02, t2, c02
unop
MUL a2, b3, t2
LD b3, -6 * SIZE(BO)
ADD c05, t3, c05
unop
MUL a1, b4, t3
LD a1, 2 * SIZE(AO)
ADD c06, t4, c06
MUL a2, b4, t4
LD b5, -5 * SIZE(BO)
ADD c09, t1, c09
unop
MUL a3, b1, t1
LD a2, 3 * SIZE(AO)
ADD c10, t2, c10
unop
MUL a4, b1, t2
LD b1, -4 * SIZE(BO)
ADD c13, t3, c13
unop
MUL a3, b2, t3
lda AO, 4 * SIZE(AO)
ADD c14, t4, c14
MUL a4, b2, t4
LD b2, -3 * SIZE(BO)
ADD c01, t1, c01
lda L, -2(L)
MUL a3, b3, t1
LD b4, -1 * SIZE(BO)
ADD c02, t2, c02
unop
MUL a4, b3, t2
LD b3, -2 * SIZE(BO)
ADD c05, t3, c05
unop
MUL a3, b5, t3
LD a3, 0 * SIZE(AO)
ADD c06, t4, c06
MUL a4, b5, t4
LD a4, 1 * SIZE(AO)
bgt L, $L22
.align 4
$L25:
ADD c09, t1, c09
ldt alpha, ALPHA
MUL a1, b1, t1
#ifndef TRMMKERNEL
blbs K, $L28
#else
blbs TMP1, $L28
#endif
ADD c10, t2, c10
unop
MUL a2, b1, t2
LD b1, 0 * SIZE(BO)
ADD c13, t3, c13
unop
MUL a1, b2, t3
unop
ADD c14, t4, c14
unop
MUL a2, b2, t4
LD b2, 1 * SIZE(BO)
ADD c01, t1, c01
unop
MUL a1, b3, t1
lda AO, 2 * SIZE(AO)
ADD c02, t2, c02
unop
MUL a2, b3, t2
LD b3, 2 * SIZE(BO)
ADD c05, t3, c05
unop
MUL a1, b4, t3
LD a1, -2 * SIZE(AO)
ADD c06, t4, c06
unop
MUL a2, b4, t4
LD a2, -1 * SIZE(AO)
ADD c09, t1, c09
LD b4, 3 * SIZE(BO)
MUL a1, b1, t1
lda BO, 4 * SIZE(BO)
.align 4
$L28:
ADD c10, t2, c10
unop
MUL a2, b1, t2
#ifndef TRMMKERNEL
LD a3, 0 * SIZE(C1)
#else
unop
#endif
ADD c13, t3, c13
unop
MUL a1, b2, t3
#ifndef TRMMKERNEL
LD a4, 1 * SIZE(C1)
#else
unop
#endif
ADD c14, t4, c14
unop
MUL a2, b2, t4
#ifndef TRMMKERNEL
LD a5, 0 * SIZE(C2)
#else
unop
#endif
ADD c01, t1, c01
unop
MUL a1, b3, t1
#ifndef TRMMKERNEL
LD b5, 1 * SIZE(C2)
#else
unop
#endif
ADD c02, t2, c02
unop
MUL a2, b3, t2
#ifndef TRMMKERNEL
LD b1, 0 * SIZE(C3)
#else
unop
#endif
ADD c05, t3, c05
unop
MUL a1, b4, t3
#ifndef TRMMKERNEL
LD b2, 1 * SIZE(C3)
#else
unop
#endif
ADD c06, t4, c06
unop
MUL a2, b4, t4
#ifndef TRMMKERNEL
LD b3, 0 * SIZE(C4)
#else
unop
#endif
ADD c09, t1, c09
unop
MUL alpha, c01, c01
#ifndef TRMMKERNEL
LD b4, 1 * SIZE(C4)
#else
unop
#endif
ADD c10, t2, c10
unop
MUL alpha, c02, c02
unop
ADD c13, t3, c13
MUL alpha, c05, c05
ADD c14, t4, c14
MUL alpha, c06, c06
MUL alpha, c09, c09
#ifndef TRMMKERNEL
ADD c01, a3, c01
#endif
MUL alpha, c10, c10
#ifndef TRMMKERNEL
ADD c02, a4, c02
#endif
MUL alpha, c13, c13
#ifndef TRMMKERNEL
ADD c05, a5, c05
#endif
MUL alpha, c14, c14
#ifndef TRMMKERNEL
ADD c06, b5, c06
#endif
#ifndef TRMMKERNEL
ADD c09, b1, c09
unop
#endif
ST c01, 0 * SIZE(C1)
fclr t1
#ifndef TRMMKERNEL
ADD c10, b2, c10
unop
#endif
ST c02, 1 * SIZE(C1)
fclr t2
#ifndef TRMMKERNEL
ADD c13, b3, c13
unop
#endif
ST c05, 0 * SIZE(C2)
fclr t3
#ifndef TRMMKERNEL
ADD c14, b4, c14
unop
#endif
ST c06, 1 * SIZE(C2)
fclr t4
ST c09, 0 * SIZE(C3)
lda C1, 2 * SIZE(C1)
ST c10, 1 * SIZE(C3)
lda C2, 2 * SIZE(C2)
ST c13, 0 * SIZE(C4)
lda C3, 2 * SIZE(C3)
ST c14, 1 * SIZE(C4)
lda C4, 2 * SIZE(C4)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
subq K, KK, TMP1
#ifdef LEFT
subq TMP1, 2, TMP1
#else
subq TMP1, 4, TMP1
#endif
sll TMP1, BASE_SHIFT + 1, TMP2
addq AO, TMP2, AO
sll TMP1, BASE_SHIFT + 2, TMP2
addq BO, TMP2, BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq KK, 2, KK
#endif
.align 4
$L30:
and M, 1, I
ble I, $L39
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
#ifdef TRMMKERNEL
#ifdef LEFT
addq KK, 1, TMP1
#else
addq KK, 4, TMP1
#endif
#endif
LD a1, 0 * SIZE(AO)
fclr c01
LD a2, 1 * SIZE(AO)
fclr c05
LD b1, 0 * SIZE(B)
#ifndef TRMMKERNEL
lda L, -2(K)
#else
lda L, -2(TMP1)
#endif
LD b2, 1 * SIZE(B)
lda AO, 1 * SIZE(AO)
LD b3, 2 * SIZE(B)
fclr c09
LD b4, 3 * SIZE(B)
fclr c13
lda BO, 4 * SIZE(B)
ble L, $L35
#else
sll KK, BASE_SHIFT + 0, TMP1
addq AO, TMP1, AO
sll KK, BASE_SHIFT + 2, TMP2
addq B, TMP2, BO
subq K, KK, TMP1
LD a1, 0 * SIZE(AO)
fclr c01
LD a2, 1 * SIZE(AO)
fclr c05
LD b1, 0 * SIZE(BO)
lda L, -2(TMP1)
LD b2, 1 * SIZE(BO)
lda AO, 1 * SIZE(AO)
LD b3, 2 * SIZE(BO)
fclr c09
LD b4, 3 * SIZE(BO)
fclr c13
lda BO, 4 * SIZE(BO)
ble L, $L35
#endif
.align 4
$L32:
/* Inner K-loop for the M%4==1 remainder row, N=4 panel: one row of A times
   four columns of B, accumulated into c01/c05/c09/c13, unrolled 2x over K.
   t1..t4 are the products still in flight (software pipelining: each ADD
   retires the product issued one stage earlier). */
	ADD c01, t1, c01
	lda L, -2(L)
	MUL a1, b1, t1
	LD b1, 0 * SIZE(BO)
	ADD c05, t2, c05
	lda AO, 2 * SIZE(AO)
	MUL a1, b2, t2
	LD b2, 1 * SIZE(BO)
	ADD c09, t3, c09
/* b5 shadows BO[3] so b4 can be reloaded for the next iteration while the
   odd-k half below still multiplies by the old value. */
	LD b5, 3 * SIZE(BO)
	MUL a1, b3, t3
	LD b3, 2 * SIZE(BO)
	ADD c13, t4, c13
	MUL a1, b4, t4
	LD a1, -1 * SIZE(AO)
/* Second (odd-k) half of the unrolled iteration: same pattern with a2. */
	ADD c01, t1, c01
	MUL a2, b1, t1
	LD b1, 4 * SIZE(BO)
	lda BO, 8 * SIZE(BO)
	ADD c05, t2, c05
	MUL a2, b2, t2
	LD b2, -3 * SIZE(BO)
	ADD c09, t3, c09
	LD b4, -1 * SIZE(BO)
	MUL a2, b3, t3
	LD b3, -2 * SIZE(BO)
	ADD c13, t4, c13
	MUL a2, b5, t4
	LD a2, 0 * SIZE(AO)
	bgt L, $L32
.align 4
$L35:
ADD c01, t1, c01
ldt alpha, ALPHA
MUL a1, b1, t1
#ifndef TRMMKERNEL
blbs K, $L38
#else
blbs TMP1, $L38
#endif
.align 4
ADD c05, t2, c05
LD b1, 0 * SIZE(BO)
MUL a1, b2, t2
LD b2, 1 * SIZE(BO)
ADD c09, t3, c09
MUL a1, b3, t3
LD b3, 2 * SIZE(BO)
ADD c13, t4, c13
MUL a1, b4, t4
LD a1, 0 * SIZE(AO)
lda AO, 1 * SIZE(AO)
ADD c01, t1, c01
LD b4, 3 * SIZE(BO)
MUL a1, b1, t1
lda BO, 4 * SIZE(BO)
.align 4
$L38:
ADD c05, t2, c05
unop
MUL a1, b2, t2
#ifndef TRMMKERNEL
LD a5, 0 * SIZE(C1)
#else
unop
#endif
ADD c09, t3, c09
unop
MUL a1, b3, t3
#ifndef TRMMKERNEL
LD b5, 0 * SIZE(C2)
#else
unop
#endif
ADD c13, t4, c13
unop
MUL a1, b4, t4
#ifndef TRMMKERNEL
LD a2, 0 * SIZE(C3)
#else
unop
#endif
ADD c01, t1, c01
unop
MUL alpha, c01, c01
#ifndef TRMMKERNEL
LD a3, 0 * SIZE(C4)
#else
unop
#endif
ADD c05, t2, c05
unop
MUL alpha, c05, c05
unop
ADD c09, t3, c09
MUL alpha, c09, c09
ADD c13, t4, c13
MUL alpha, c13, c13
#ifndef TRMMKERNEL
ADD c01, a5, c01
ADD c05, b5, c05
ADD c09, a2, c09
ADD c13, a3, c13
#endif
ST c01, 0 * SIZE(C1)
ST c05, 0 * SIZE(C2)
ST c09, 0 * SIZE(C3)
ST c13, 0 * SIZE(C4)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
subq K, KK, TMP1
#ifdef LEFT
subq TMP1, 1, TMP1
#else
subq TMP1, 4, TMP1
#endif
sll TMP1, BASE_SHIFT + 0, TMP2
addq AO, TMP2, AO
sll TMP1, BASE_SHIFT + 2, TMP2
addq BO, TMP2, BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq KK, 1, KK
#endif
.align 4
$L39:
mov BO, B
lda J, -1(J)
#if defined(TRMMKERNEL) && !defined(LEFT)
addq KK, 4, KK
#else
unop
#endif
bgt J, $L01
.align 4
$L40:
and N, 2, J
ble J, $L80
mov C, C1
addq C, LDC, C2
mov A, AO
fclr t1
addq C2, LDC, C
fclr t2
#if defined(TRMMKERNEL) && defined(LEFT)
mov OFFSET, KK
#endif
sra M, 2, I
fclr t3
fclr t4
ble I, $L60
.align 4
$L51:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
#ifdef TRMMKERNEL
#ifdef LEFT
addq KK, 4, TMP1
#else
addq KK, 2, TMP1
#endif
#endif
LD a1, 0 * SIZE(AO)
fclr c03
LD a2, 1 * SIZE(AO)
fclr c07
LD a3, 2 * SIZE(AO)
fclr c04
LD a4, 3 * SIZE(AO)
fclr c08
LD b1, 0 * SIZE(B)
fclr c01
LD b2, 1 * SIZE(B)
fclr c05
LD b3, 2 * SIZE(B)
fclr c02
LD b4, 3 * SIZE(B)
fclr c06
#ifndef TRMMKERNEL
lda L, -2(K)
#else
lda L, -2(TMP1)
#endif
lda BO, 2 * SIZE(B)
lda AO, 4 * SIZE(AO)
ble L, $L55
#else
sll KK, BASE_SHIFT + 2, TMP1
addq AO, TMP1, AO
sll KK, BASE_SHIFT + 1, TMP2
addq B, TMP2, BO
subq K, KK, TMP1
LD a1, 0 * SIZE(AO)
fclr c03
LD a2, 1 * SIZE(AO)
fclr c07
LD a3, 2 * SIZE(AO)
fclr c04
LD a4, 3 * SIZE(AO)
fclr c08
LD b1, 0 * SIZE(BO)
fclr c01
LD b2, 1 * SIZE(BO)
fclr c05
LD b3, 2 * SIZE(BO)
fclr c02
LD b4, 3 * SIZE(BO)
fclr c06
lda L, -2(TMP1)
lda BO, 2 * SIZE(BO)
lda AO, 4 * SIZE(AO)
ble L, $L55
#endif
.align 4
$L52:
ADD c05, t1, c05
unop
MUL a1, b1, t1
unop
ADD c06, t2, c06
lda L, -2(L)
MUL a2, b1, t2
unop
ADD c07, t3, c07
unop
MUL a3, b1, t3
unop
ADD c08, t4, c08
unop
MUL a4, b1, t4
LD b1, 2 * SIZE(BO)
ADD c01, t1, c01
unop
MUL a1, b2, t1
LD a1, 0 * SIZE(AO)
ADD c02, t2, c02
lda BO, 4 * SIZE(BO)
MUL a2, b2, t2
LD a2, 1 * SIZE(AO)
ADD c03, t3, c03
unop
MUL a3, b2, t3
LD a3, 2 * SIZE(AO)
ADD c04, t4, c04
unop
MUL a4, b2, t4
LD a5, 3 * SIZE(AO)
ADD c05, t1, c05
unop
MUL a1, b3, t1
LD b2, -1 * SIZE(BO)
ADD c06, t2, c06
unop
MUL a2, b3, t2
unop
ADD c07, t3, c07
unop
MUL a3, b3, t3
lda AO, 8 * SIZE(AO)
ADD c08, t4, c08
unop
MUL a5, b3, t4
LD b3, 0 * SIZE(BO)
ADD c01, t1, c01
unop
MUL a1, b4, t1
LD a1, -4 * SIZE(AO)
ADD c02, t2, c02
unop
MUL a2, b4, t2
LD a2, -3 * SIZE(AO)
ADD c03, t3, c03
LD a4, -1 * SIZE(AO)
MUL a3, b4, t3
LD a3, -2 * SIZE(AO)
ADD c04, t4, c04
MUL a5, b4, t4
LD b4, 1 * SIZE(BO)
bgt L, $L52
.align 4
$L55:
ADD c05, t1, c05
ldt alpha, ALPHA
MUL a1, b1, t1
#ifndef TRMMKERNEL
blbs K, $L58
#else
blbs TMP1, $L58
#endif
.align 4
ADD c06, t2, c06
MUL a2, b1, t2
ADD c07, t3, c07
MUL a3, b1, t3
ADD c08, t4, c08
unop
MUL a4, b1, t4
LD b1, 0 * SIZE(BO)
ADD c01, t1, c01
unop
MUL a1, b2, t1
LD a1, 0 * SIZE(AO)
ADD c02, t2, c02
unop
MUL a2, b2, t2
LD a2, 1 * SIZE(AO)
ADD c03, t3, c03
unop
MUL a3, b2, t3
LD a3, 2 * SIZE(AO)
ADD c04, t4, c04
MUL a4, b2, t4
LD a4, 3 * SIZE(AO)
lda AO, 4 * SIZE(AO)
ADD c05, t1, c05
LD b2, 1 * SIZE(BO)
MUL a1, b1, t1
lda BO, 2 * SIZE(BO)
.align 4
$L58:
ADD c06, t2, c06
unop
MUL a2, b1, t2
#ifndef TRMMKERNEL
LD c09, 0 * SIZE(C1)
#else
unop
#endif
ADD c07, t3, c07
unop
MUL a3, b1, t3
#ifndef TRMMKERNEL
LD c10, 1 * SIZE(C1)
#else
unop
#endif
ADD c08, t4, c08
unop
MUL a4, b1, t4
#ifndef TRMMKERNEL
LD c11, 2 * SIZE(C1)
#else
unop
#endif
ADD c01, t1, c01
unop
MUL a1, b2, t1
#ifndef TRMMKERNEL
LD c12, 3 * SIZE(C1)
#else
unop
#endif
ADD c02, t2, c02
unop
MUL a2, b2, t2
#ifndef TRMMKERNEL
LD c13, 0 * SIZE(C2)
unop
#endif
ADD c03, t3, c03
unop
MUL a3, b2, t3
#ifndef TRMMKERNEL
LD c14, 1 * SIZE(C2)
#else
unop
#endif
ADD c04, t4, c04
unop
MUL a4, b2, t4
#ifndef TRMMKERNEL
LD c15, 2 * SIZE(C2)
#else
unop
#endif
ADD c05, t1, c05
unop
MUL alpha, c01, c01
#ifndef TRMMKERNEL
LD c16, 3 * SIZE(C2)
#else
unop
#endif
ADD c06, t2, c06
lda I, -1(I)
MUL alpha, c02, c02
unop
ADD c07, t3, c07
MUL alpha, c03, c03
ADD c08, t4, c08
MUL alpha, c04, c04
MUL alpha, c05, c05
#ifndef TRMMKERNEL
ADD c01, c09, c01
#endif
MUL alpha, c06, c06
#ifndef TRMMKERNEL
ADD c02, c10, c02
#endif
MUL alpha, c07, c07
#ifndef TRMMKERNEL
ADD c03, c11, c03
#endif
MUL alpha, c08, c08
#ifndef TRMMKERNEL
ADD c04, c12, c04
#endif
#ifndef TRMMKERNEL
ADD c05, c13, c05
#endif
ST c01, 0 * SIZE(C1)
#ifndef TRMMKERNEL
ADD c06, c14, c06
#endif
ST c02, 1 * SIZE(C1)
#ifndef TRMMKERNEL
ADD c07, c15, c07
#endif
ST c03, 2 * SIZE(C1)
#ifndef TRMMKERNEL
ADD c08, c16, c08
#endif
ST c04, 3 * SIZE(C1)
ST c05, 0 * SIZE(C2)
fclr t1
ST c06, 1 * SIZE(C2)
fclr t2
ST c07, 2 * SIZE(C2)
fclr t3
ST c08, 3 * SIZE(C2)
fclr t4
lda C1, 4 * SIZE(C1)
lda C2, 4 * SIZE(C2)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
subq K, KK, TMP1
#ifdef LEFT
subq TMP1, 4, TMP1
#else
subq TMP1, 2, TMP1
#endif
sll TMP1, BASE_SHIFT + 2, TMP2
addq AO, TMP2, AO
sll TMP1, BASE_SHIFT + 1, TMP2
addq BO, TMP2, BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq KK, 4, KK
#endif
bgt I, $L51
.align 4
$L60:
and M, 2, I
ble I, $L70
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
#ifdef TRMMKERNEL
#ifdef LEFT
addq KK, 2, TMP1
#else
addq KK, 2, TMP1
#endif
#endif
LD a1, 0 * SIZE(AO)
fclr c01
LD a2, 1 * SIZE(AO)
fclr c05
LD a3, 2 * SIZE(AO)
fclr c02
LD a4, 3 * SIZE(AO)
fclr c06
LD b1, 0 * SIZE(B)
#ifndef TRMMKERNEL
lda L, -2(K)
#else
lda L, -2(TMP1)
#endif
LD b2, 1 * SIZE(B)
lda AO, 2 * SIZE(AO)
LD b3, 2 * SIZE(B)
LD b4, 3 * SIZE(B)
lda BO, 2 * SIZE(B)
ble L, $L65
#else
sll KK, BASE_SHIFT + 1, TMP1
addq AO, TMP1, AO
sll KK, BASE_SHIFT + 1, TMP2
addq B, TMP2, BO
subq K, KK, TMP1
LD a1, 0 * SIZE(AO)
fclr c01
LD a2, 1 * SIZE(AO)
fclr c05
LD a3, 2 * SIZE(AO)
fclr c02
LD a4, 3 * SIZE(AO)
fclr c06
LD b1, 0 * SIZE(BO)
lda L, -2(TMP1)
LD b2, 1 * SIZE(BO)
lda AO, 2 * SIZE(AO)
LD b3, 2 * SIZE(BO)
LD b4, 3 * SIZE(BO)
lda BO, 2 * SIZE(BO)
ble L, $L65
#endif
.align 4
$L62:
ADD c01, t1, c01
unop
MUL a1, b1, t1
unop
ADD c02, t2, c02
lda AO, 4 * SIZE(AO)
MUL a2, b1, t2
LD b1, 2 * SIZE(BO)
ADD c05, t3, c05
lda L, -2(L)
MUL a1, b2, t3
LD a1, -2 * SIZE(AO)
ADD c06, t4, c06
unop
MUL a2, b2, t4
LD a2, -1 * SIZE(AO)
ADD c01, t1, c01
LD b2, 3 * SIZE(BO)
MUL a3, b3, t1
lda BO, 4 * SIZE(BO)
ADD c02, t2, c02
unop
MUL a4, b3, t2
LD b3, 0 * SIZE(BO)
ADD c05, t3, c05
unop
MUL a3, b4, t3
LD a3, 0 * SIZE(AO)
ADD c06, t4, c06
MUL a4, b4, t4
LD b4, 1 * SIZE(BO)
unop
LD a4, 1 * SIZE(AO)
unop
unop
bgt L, $L62
.align 4
$L65:
ADD c01, t1, c01
ldt alpha, ALPHA
MUL a1, b1, t1
#ifndef TRMMKERNEL
blbs K, $L68
#else
blbs TMP1, $L68
#endif
.align 4
ADD c02, t2, c02
unop
MUL a2, b1, t2
LD b1, 0 * SIZE(BO)
ADD c05, t3, c05
lda BO, 2 * SIZE(BO)
MUL a1, b2, t3
LD a1, 0 * SIZE(AO)
ADD c06, t4, c06
unop
MUL a2, b2, t4
LD a2, 1 * SIZE(AO)
ADD c01, t1, c01
LD b2, -1 * SIZE(BO)
MUL a1, b1, t1
lda AO, 2 * SIZE(AO)
.align 4
$L68:
ADD c02, t2, c02
unop
MUL a2, b1, t2
#ifndef TRMMKERNEL
LD c09, 0 * SIZE(C1)
#else
unop
#endif
ADD c05, t3, c05
unop
MUL a1, b2, t3
#ifndef TRMMKERNEL
LD c10, 1 * SIZE(C1)
#else
unop
#endif
ADD c06, t4, c06
unop
MUL a2, b2, t4
#ifndef TRMMKERNEL
LD c11, 0 * SIZE(C2)
#else
unop
#endif
ADD c01, t1, c01
unop
MUL alpha, c01, c01
#ifndef TRMMKERNEL
LD c12, 1 * SIZE(C2)
#else
unop
#endif
ADD c02, t2, c02
lda C1, 2 * SIZE(C1)
MUL alpha, c02, c02
lda C2, 2 * SIZE(C2)
ADD c05, t3, c05
MUL alpha, c05, c05
ADD c06, t4, c06
MUL alpha, c06, c06
#ifndef TRMMKERNEL
ADD c01, c09, c01
ADD c02, c10, c02
ADD c05, c11, c05
ADD c06, c12, c06
#endif
ST c01, -2 * SIZE(C1)
fclr t1
ST c02, -1 * SIZE(C1)
fclr t2
ST c05, -2 * SIZE(C2)
fclr t3
ST c06, -1 * SIZE(C2)
fclr t4
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
subq K, KK, TMP1
#ifdef LEFT
subq TMP1, 2, TMP1
#else
subq TMP1, 2, TMP1
#endif
sll TMP1, BASE_SHIFT + 1, TMP2
addq AO, TMP2, AO
sll TMP1, BASE_SHIFT + 1, TMP2
addq BO, TMP2, BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq KK, 2, KK
#endif
.align 4
$L70:
and M, 1, I
ble I, $L79
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
#ifdef TRMMKERNEL
#ifdef LEFT
addq KK, 1, TMP1
#else
addq KK, 2, TMP1
#endif
#endif
LD a1, 0 * SIZE(AO)
fclr c01
LD a2, 1 * SIZE(AO)
fclr c05
LD b1, 0 * SIZE(B)
fclr c02
LD b2, 1 * SIZE(B)
fclr c06
#ifndef TRMMKERNEL
lda L, -2(K)
#else
lda L, -2(TMP1)
#endif
LD b3, 2 * SIZE(B)
lda AO, 1 * SIZE(AO)
LD b4, 3 * SIZE(B)
lda BO, 2 * SIZE(B)
ble L, $L75
#else
sll KK, BASE_SHIFT + 0, TMP1
addq AO, TMP1, AO
sll KK, BASE_SHIFT + 1, TMP2
addq B, TMP2, BO
subq K, KK, TMP1
LD a1, 0 * SIZE(AO)
fclr c01
LD a2, 1 * SIZE(AO)
fclr c05
LD b1, 0 * SIZE(BO)
fclr c02
LD b2, 1 * SIZE(BO)
fclr c06
#ifndef TRMMKERNEL
lda L, -2(K)
#else
lda L, -2(TMP1)
#endif
LD b3, 2 * SIZE(BO)
lda AO, 1 * SIZE(AO)
LD b4, 3 * SIZE(BO)
lda BO, 2 * SIZE(BO)
ble L, $L75
#endif
.align 4
$L72:
/* Inner K-loop for the M%4==1 remainder row, N=2 panel, unrolled 2x over K.
   Even-k products accumulate into c01/c05, odd-k into c02/c06; the two pairs
   are folded together after the loop (ADD c01, c02, c01 at the $L78 tail). */
	ADD c01, t1, c01
	lda L, -2(L)
	MUL a1, b1, t1
	LD b1, 2 * SIZE(BO)
	ADD c05, t2, c05
	MUL a1, b2, t2
	LD a1, 1 * SIZE(AO)
	LD b2, 3 * SIZE(BO)
/* Odd-k half: a2 against the second pair of B values. */
	ADD c02, t3, c02
	lda AO, 2 * SIZE(AO)
	MUL a2, b3, t3
	LD b3, 4 * SIZE(BO)
	ADD c06, t4, c06
	MUL a2, b4, t4
	LD a2, 0 * SIZE(AO)
	LD b4, 5 * SIZE(BO)
	lda BO, 4 * SIZE(BO)
	unop
	unop
	bgt L, $L72
.align 4
$L75:
ADD c01, t1, c01
ldt alpha, ALPHA
MUL a1, b1, t1
#ifndef TRMMKERNEL
blbs K, $L78
#else
blbs TMP1, $L78
#endif
.align 4
ADD c05, t2, c05
MUL a1, b2, t2
LD a1, 0 * SIZE(AO)
LD b1, 0 * SIZE(BO)
ADD c01, t1, c01
LD b2, 1 * SIZE(BO)
lda AO, 1 * SIZE(AO)
MUL a1, b1, t1
lda BO, 2 * SIZE(BO)
.align 4
$L78:
ADD c05, t2, c05
MUL a1, b2, t2
#ifndef TRMMKERNEL
LD a5, 0 * SIZE(C1)
#else
unop
#endif
ADD c02, t3, c02
ADD c06, t4, c06
#ifndef TRMMKERNEL
LD b5, 0 * SIZE(C2)
#else
unop
#endif
ADD c01, c02, c01
ADD c05, c06, c05
ADD c01, t1, c01
ADD c05, t2, c05
MUL alpha, c01, c01
MUL alpha, c05, c05
#ifndef TRMMKERNEL
ADD c01, a5, c01
ADD c05, b5, c05
#endif
ST c01, 0 * SIZE(C1)
ST c05, 0 * SIZE(C2)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
subq K, KK, TMP1
#ifdef LEFT
subq TMP1, 1, TMP1
#else
subq TMP1, 2, TMP1
#endif
sll TMP1, BASE_SHIFT + 0, TMP2
addq AO, TMP2, AO
sll TMP1, BASE_SHIFT + 1, TMP2
addq BO, TMP2, BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq KK, 1, KK
#endif
.align 4
$L79:
mov BO, B
#if defined(TRMMKERNEL) && !defined(LEFT)
addq KK, 2, KK
#else
unop
#endif
unop
unop
.align 4
$L80:
and N, 1, J
ble J, $L999
mov C, C1
mov A, AO
#if defined(TRMMKERNEL) && defined(LEFT)
mov OFFSET, KK
#endif
sra M, 2, I
ble I, $L100
.align 4
$L91:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
#ifdef TRMMKERNEL
#ifdef LEFT
addq KK, 4, TMP1
#else
addq KK, 1, TMP1
#endif
#endif
LD a1, 0 * SIZE(AO)
fclr t1
LD a2, 1 * SIZE(AO)
fclr t2
LD a3, 2 * SIZE(AO)
fclr t3
LD a4, 3 * SIZE(AO)
fclr t4
LD b1, 0 * SIZE(B)
fclr c01
LD b2, 1 * SIZE(B)
fclr c02
LD b3, 2 * SIZE(B)
fclr c03
LD b4, 3 * SIZE(B)
fclr c04
#ifndef TRMMKERNEL
sra K, 2, L
#else
sra TMP1, 2, L
#endif
mov B, BO
unop
ble L, $L95
#else
sll KK, BASE_SHIFT + 2, TMP1
addq AO, TMP1, AO
sll KK, BASE_SHIFT + 0, TMP2
addq B, TMP2, BO
subq K, KK, TMP1
LD a1, 0 * SIZE(AO)
fclr t1
LD a2, 1 * SIZE(AO)
fclr t2
LD a3, 2 * SIZE(AO)
fclr t3
LD a4, 3 * SIZE(AO)
fclr t4
LD b1, 0 * SIZE(BO)
fclr c01
LD b2, 1 * SIZE(BO)
fclr c02
LD b3, 2 * SIZE(BO)
fclr c03
LD b4, 3 * SIZE(BO)
fclr c04
#ifndef TRMMKERNEL
sra K, 2, L
#else
sra TMP1, 2, L
#endif
unop
ble L, $L95
#endif
.align 5
$L92:
ADD c01, t1, c01
unop
MUL a1, b1, t1
LD a1, 4 * SIZE(AO)
ADD c02, t2, c02
lda L, -1(L)
MUL a2, b1, t2
LD a2, 5 * SIZE(AO)
ADD c03, t3, c03
unop
MUL a3, b1, t3
LD a3, 6 * SIZE(AO)
ADD c04, t4, c04
MUL a4, b1, t4
LD a4, 7 * SIZE(AO)
LD b1, 4 * SIZE(BO)
ADD c01, t1, c01
unop
MUL a1, b2, t1
LD a1, 8 * SIZE(AO)
ADD c02, t2, c02
unop
MUL a2, b2, t2
LD a2, 9 * SIZE(AO)
ADD c03, t3, c03
unop
MUL a3, b2, t3
LD a3, 10 * SIZE(AO)
ADD c04, t4, c04
MUL a4, b2, t4
LD a4, 11 * SIZE(AO)
LD b2, 5 * SIZE(BO)
ADD c01, t1, c01
unop
MUL a1, b3, t1
LD a1, 12 * SIZE(AO)
ADD c02, t2, c02
unop
MUL a2, b3, t2
LD a2, 13 * SIZE(AO)
ADD c03, t3, c03
unop
MUL a3, b3, t3
LD a3, 14 * SIZE(AO)
ADD c04, t4, c04
MUL a4, b3, t4
LD a5, 15 * SIZE(AO)
LD b3, 6 * SIZE(BO)
ADD c01, t1, c01
MUL a1, b4, t1
LD a1, 16 * SIZE(AO)
lda AO, 16 * SIZE(AO)
ADD c02, t2, c02
lda BO, 4 * SIZE(BO)
MUL a2, b4, t2
LD a2, 1 * SIZE(AO)
ADD c03, t3, c03
LD a4, 3 * SIZE(AO)
MUL a3, b4, t3
LD a3, 2 * SIZE(AO)
ADD c04, t4, c04
MUL a5, b4, t4
LD b4, 3 * SIZE(BO)
bgt L, $L92
.align 4
$L95:
#ifndef TRMMKERNEL
and K, 3, L
#else
and TMP1, 3, L
#endif
ldt alpha, ALPHA
unop
ble L, $L98
.align 4
$L96:
ADD c01, t1, c01
lda L, -1(L)
MUL a1, b1, t1
LD a1, 4 * SIZE(AO)
ADD c02, t2, c02
lda BO, 1 * SIZE(BO)
MUL a2, b1, t2
LD a2, 5 * SIZE(AO)
ADD c03, t3, c03
unop
MUL a3, b1, t3
LD a3, 6 * SIZE(AO)
ADD c04, t4, c04
MUL a4, b1, t4
LD a4, 7 * SIZE(AO)
LD b1, 0 * SIZE(BO)
lda AO, 4 * SIZE(AO)
bgt L, $L96
.align 4
$L98:
/* Drain the pipeline (fold the last t1..t4), scale by alpha, and store
   four results.  GEMM additionally reads and accumulates into the
   existing C values; TRMM overwrites C. */
#ifndef TRMMKERNEL
ADD c01, t1, c01
LD c05, 0 * SIZE(C1)
ADD c02, t2, c02
LD c06, 1 * SIZE(C1)
ADD c03, t3, c03
LD c07, 2 * SIZE(C1)
ADD c04, t4, c04
LD c08, 3 * SIZE(C1)
#else
ADD c01, t1, c01
ADD c02, t2, c02
ADD c03, t3, c03
ADD c04, t4, c04
#endif
MUL alpha, c01, c01
MUL alpha, c02, c02
MUL alpha, c03, c03
MUL alpha, c04, c04
#ifndef TRMMKERNEL
/* C += alpha * A*B (GEMM accumulate) */
ADD c01, c05, c01
ADD c02, c06, c02
ADD c03, c07, c03
ADD c04, c08, c04
#endif
ST c01, 0 * SIZE(C1)
ST c02, 1 * SIZE(C1)
ST c03, 2 * SIZE(C1)
ST c04, 3 * SIZE(C1)
lda C1, 4 * SIZE(C1)
/* TRMM bookkeeping: skip AO/BO past the K - KK - unroll part that this
   triangular case does not touch. */
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
subq K, KK, TMP1
#ifdef LEFT
subq TMP1, 4, TMP1
#else
subq TMP1, 1, TMP1
#endif
sll TMP1, BASE_SHIFT + 2, TMP2
addq AO, TMP2, AO
sll TMP1, BASE_SHIFT + 0, TMP2
addq BO, TMP2, BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
/* Advance the diagonal position by the M unroll factor (4). */
addq KK, 4, KK
#endif
/* Next group of 4 rows. */
lda I, -1(I)
bgt I, $L91
.align 4
$L100:
/* Handle two leftover rows if M has bit 1 set; otherwise skip to the
   single-row case. */
and M, 2, I
unop
unop
ble I, $L110
.align 4
/* Setup for the M = 2 tail: preload 4 A and 4 B values, clear the
   temporaries and accumulators, and compute the unrolled-by-4 k trip
   count.  First path: plain GEMM, or the TRMM cases that start at the
   beginning of the panel (BO = B, length KK + unroll).  Second path:
   the other TRMM cases, which start KK elements in (length K - KK). */
$L101:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
#ifdef TRMMKERNEL
/* TMP1 = KK + (LEFT ? M-unroll (2) : N-unroll (1)) */
#ifdef LEFT
addq KK, 2, TMP1
#else
addq KK, 1, TMP1
#endif
#endif
LD a1, 0 * SIZE(AO)
fclr t1
LD a2, 1 * SIZE(AO)
fclr t2
LD a3, 2 * SIZE(AO)
fclr t3
LD a4, 3 * SIZE(AO)
fclr t4
LD b1, 0 * SIZE(B)
fclr c01
LD b2, 1 * SIZE(B)
fclr c02
LD b3, 2 * SIZE(B)
fclr c03
LD b4, 3 * SIZE(B)
fclr c04
#ifndef TRMMKERNEL
sra K, 2, L
#else
sra TMP1, 2, L
#endif
mov B, BO
unop
ble L, $L105
#else
/* AO += KK * 2 elements (2 rows), BO = B + KK elements (1 column). */
sll KK, BASE_SHIFT + 1, TMP1
addq AO, TMP1, AO
sll KK, BASE_SHIFT + 0, TMP2
addq B, TMP2, BO
subq K, KK, TMP1
LD a1, 0 * SIZE(AO)
fclr t1
LD a2, 1 * SIZE(AO)
fclr t2
LD a3, 2 * SIZE(AO)
fclr t3
LD a4, 3 * SIZE(AO)
fclr t4
LD b1, 0 * SIZE(BO)
fclr c01
LD b2, 1 * SIZE(BO)
fclr c02
LD b3, 2 * SIZE(BO)
fclr c03
LD b4, 3 * SIZE(BO)
fclr c04
#ifndef TRMMKERNEL
sra K, 2, L
#else
sra TMP1, 2, L
#endif
unop
ble L, $L105
#endif
.align 5
/* M = 2 inner loop, 4 k steps per pass.  Two accumulator pairs are used
   to break the dependency chains: c01/c02 take the b1 and b3 products,
   c03/c04 take the b2 and b4 products; $L108 folds the pairs together. */
$L102:
/* k steps 0-1: (a1,a2)*b1 into c01/c02, (a3,a4)*b2 into c03/c04 */
ADD c01, t1, c01
lda L, -1(L)
MUL a1, b1, t1
LD a1, 4 * SIZE(AO)
ADD c02, t2, c02
MUL a2, b1, t2
LD a2, 5 * SIZE(AO)
LD b1, 4 * SIZE(BO)
ADD c03, t3, c03
lda BO, 4 * SIZE(BO)
MUL a3, b2, t3
LD a3, 6 * SIZE(AO)
ADD c04, t4, c04
MUL a4, b2, t4
/* a5 takes the 7*SIZE element so a4 can be reloaded early below. */
LD a5, 7 * SIZE(AO)
LD b2, 1 * SIZE(BO)
/* k steps 2-3: (a1,a2)*b3, (a3,a5)*b4; advance AO by 8 elements */
ADD c01, t1, c01
MUL a1, b3, t1
LD a1, 8 * SIZE(AO)
lda AO, 8 * SIZE(AO)
ADD c02, t2, c02
MUL a2, b3, t2
LD b3, 2 * SIZE(BO)
LD a2, 1 * SIZE(AO)
ADD c03, t3, c03
LD a4, 3 * SIZE(AO)
MUL a3, b4, t3
LD a3, 2 * SIZE(AO)
ADD c04, t4, c04
MUL a5, b4, t4
LD b4, 3 * SIZE(BO)
bgt L, $L102
.align 4
$L105:
/* Leftover k iterations (K mod 4, or (K - KK) mod 4 for TRMM). */
#ifndef TRMMKERNEL
and K, 3, L
#else
and TMP1, 3, L
#endif
ldt alpha, ALPHA
/* GEMM: preload the two existing C values into a3/a4 (their A values
   are no longer needed) for the accumulate in $L108. */
#ifndef TRMMKERNEL
LD a3, 0 * SIZE(C1)
LD a4, 1 * SIZE(C1)
#endif
ble L, $L108
.align 4
/* Remainder loop: one k step, two rows times b1. */
$L106:
ADD c01, t1, c01
lda L, -1(L)
MUL a1, b1, t1
LD a1, 2 * SIZE(AO)
ADD c02, t2, c02
MUL a2, b1, t2
LD a2, 3 * SIZE(AO)
LD b1, 1 * SIZE(BO)
lda AO, 2 * SIZE(AO)
unop
lda BO, 1 * SIZE(BO)
bgt L, $L106
.align 4
$L108:
/* Drain the pipeline, fold the split accumulator pairs (c01+=c03,
   c02+=c04), scale by alpha, optionally accumulate into C, and store
   the two results. */
ADD c01, t1, c01
fclr t1
ADD c02, t2, c02
fclr t2
ADD c03, t3, c03
fclr t3
ADD c04, t4, c04
fclr t4
ADD c01, c03, c01
ADD c02, c04, c02
MUL alpha, c01, c01
MUL alpha, c02, c02
#ifndef TRMMKERNEL
/* a3/a4 were loaded with the old C values at $L105. */
ADD c01, a3, c01
ADD c02, a4, c02
#endif
ST c01, 0 * SIZE(C1)
ST c02, 1 * SIZE(C1)
lda C1, 2 * SIZE(C1)
/* TRMM bookkeeping: skip past the untouched part of the panels. */
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
subq K, KK, TMP1
#ifdef LEFT
subq TMP1, 2, TMP1
#else
subq TMP1, 1, TMP1
#endif
sll TMP1, BASE_SHIFT + 1, TMP2
addq AO, TMP2, AO
sll TMP1, BASE_SHIFT + 0, TMP2
addq BO, TMP2, BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
/* Advance the diagonal position by the M unroll factor (2). */
addq KK, 2, KK
#endif
.align 4
$L110:
/* Handle the final leftover row if M is odd; otherwise we are done. */
and M, 1, I
ble I, $L999
.align 4
/* Setup for the M = 1 tail: preload 4 A and 4 B values, clear the
   temporaries and accumulators, and compute the unrolled-by-4 k trip
   count.  First path: plain GEMM, or the TRMM cases that start at the
   beginning of the panel (BO = B, length KK + 1).  Second path: the
   other TRMM cases, which start KK elements in (length K - KK). */
$L111:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
#ifdef TRMMKERNEL
/* TMP1 = KK + unroll.  The M unroll (LEFT) and N unroll (non-LEFT)
   are both 1 here, so the formerly duplicated #ifdef LEFT branches
   collapse to a single instruction. */
addq KK, 1, TMP1
#endif
LD a1, 0 * SIZE(AO)
fclr t1
LD a2, 1 * SIZE(AO)
fclr t2
LD a3, 2 * SIZE(AO)
fclr t3
LD a4, 3 * SIZE(AO)
fclr t4
LD b1, 0 * SIZE(B)
fclr c01
LD b2, 1 * SIZE(B)
fclr c02
LD b3, 2 * SIZE(B)
fclr c03
LD b4, 3 * SIZE(B)
fclr c04
#ifndef TRMMKERNEL
sra K, 2, L
#else
sra TMP1, 2, L
#endif
mov B, BO
unop
ble L, $L115
#else
/* AO += KK elements (1 row), BO = B + KK elements (1 column). */
sll KK, BASE_SHIFT + 0, TMP1
addq AO, TMP1, AO
sll KK, BASE_SHIFT + 0, TMP2
addq B, TMP2, BO
subq K, KK, TMP1
LD a1, 0 * SIZE(AO)
fclr t1
LD a2, 1 * SIZE(AO)
fclr t2
LD a3, 2 * SIZE(AO)
fclr t3
LD a4, 3 * SIZE(AO)
fclr t4
LD b1, 0 * SIZE(BO)
fclr c01
LD b2, 1 * SIZE(BO)
fclr c02
LD b3, 2 * SIZE(BO)
fclr c03
LD b4, 3 * SIZE(BO)
fclr c04
#ifndef TRMMKERNEL
sra K, 2, L
#else
sra TMP1, 2, L
#endif
unop
ble L, $L115
#endif
.align 4
/* M = 1 inner loop, 4 k steps per pass: a dot product of one A row with
   the B column, spread over four independent accumulators c01..c04
   (reduced in $L118) to hide the FP add latency. */
$L112:
ADD c01, t1, c01
MUL a1, b1, t1
LD a1, 4 * SIZE(AO)
LD b1, 4 * SIZE(BO)
ADD c02, t2, c02
MUL a2, b2, t2
LD a2, 5 * SIZE(AO)
LD b2, 5 * SIZE(BO)
ADD c03, t3, c03
MUL a3, b3, t3
LD a3, 6 * SIZE(AO)
LD b3, 6 * SIZE(BO)
ADD c04, t4, c04
MUL a4, b4, t4
LD a4, 7 * SIZE(AO)
LD b4, 7 * SIZE(BO)
lda L, -1(L)
lda AO, 4 * SIZE(AO)
lda BO, 4 * SIZE(BO)
bgt L, $L112
.align 4
$L115:
/* Leftover k iterations (K mod 4, or (K - KK) mod 4 for TRMM). */
#ifndef TRMMKERNEL
and K, 3, L
#else
and TMP1, 3, L
#endif
ldt alpha, ALPHA
/* GEMM: preload the existing C value into a2 (no longer needed as an
   A operand) for the accumulate in $L118. */
#ifndef TRMMKERNEL
LD a2, 0 * SIZE(C1)
#endif
ble L, $L118
.align 4
/* Remainder loop: one k step, a1 * b1 into the c01 chain. */
$L116:
ADD c01, t1, c01
MUL a1, b1, t1
LD a1, 1 * SIZE(AO)
LD b1, 1 * SIZE(BO)
lda L, -1(L)
lda AO, 1 * SIZE(AO)
lda BO, 1 * SIZE(BO)
bgt L, $L116
.align 4
$L118:
/* Drain the pipeline, reduce the four partial sums to one, scale by
   alpha, optionally accumulate the old C value, and store the result. */
ADD c01, t1, c01
ADD c02, t2, c02
ADD c03, t3, c03
ADD c04, t4, c04
ADD c01, c02, c01
ADD c03, c04, c03
ADD c01, c03, c01
MUL alpha, c01, c01
#ifndef TRMMKERNEL
/* a2 was loaded with the old C value at $L115. */
ADD c01, a2, c01
#endif
ST c01, 0 * SIZE(C1)
.align 4
$L999:
/* Epilogue: restore the callee-saved floating-point registers $f2..$f9
   spilled in the prologue (outside this excerpt), return 0 in $0, pop
   the stack frame, and return. */
ldt $f2, 0($sp)
ldt $f3, 8($sp)
ldt $f4, 16($sp)
ldt $f5, 24($sp)
ldt $f6, 32($sp)
ldt $f7, 40($sp)
ldt $f8, 48($sp)
ldt $f9, 56($sp)
clr $0
lda $sp, STACKSIZE($sp)
ret
EPILOGUE