/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
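/* Complex GEMM/TRMM (cgemm/zgemm) micro-kernel for Alpha EV4/EV5/EV6, unrolled
   2x2: each pass updates a 2x2 block of C with C += alpha * A * B, where A and
   B are (presumably) the packed panels produced by the level-3 driver and the
   NN/NT/.../CC macros below select the conjugation sign pattern.  When
   TRMMKERNEL is defined, the C block is written directly instead of being
   accumulated, and the AO/BO/KK bookkeeping trims the k range per block.  */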
#define ASSEMBLER
#include "common.h"
#include "version.h"
#if !defined(EV4) && !defined(EV5) && !defined(EV6)
#error "Architecture is not specified."
#endif
#ifdef EV6
#define PREFETCHSIZE 56
#define UNOP unop
#endif
#ifdef EV5
#define PREFETCHSIZE 48
#define UNOP
#endif
#ifdef EV4
#define UNOP
#endif
.set noat
.set noreorder
.arch ev6
.text
.align 5
.globl CNAME
.ent CNAME
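/* Register usage: M, N, K arrive in $16-$18 and A in $21 (Alpha argument
   registers); B, C, LDC and, for TRMM, OFFSET are read from the caller's
   stack once the frame is built.  AO is mapped onto $at, hence ".set noat"
   above.  alpha arrives in $f19/$f20, is spilled to ALPHA_R/ALPHA_I on the
   frame, and is reloaded into $f30/$f29 (aliases of a6/b5) only after the
   multiply loops no longer need those temporaries.  */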
#define STACKSIZE 80
#define M $16
#define N $17
#define K $18
#define A $21
#define B $22
#define C $20
#define LDC $23
#define C1 $19
#define C2 $24
#define AO $at
#define BO $5
#define I $6
#define J $7
#define L $8
#define a1 $f16
#define a2 $f17
#define a3 $f18
#define a4 $f19
#define b1 $f20
#define b2 $f21
#define b3 $f22
#define b4 $f23
#define t1 $f24
#define t2 $f25
#define t3 $f26
#define t4 $f27
#define a5 $f28
#define a6 $f30
#define b5 $f29
#define alpha_i $f29
#define alpha_r $f30
#define c01 $f0
#define c02 $f1
#define c03 $f2
#define c04 $f3
#define c05 $f4
#define c06 $f5
#define c07 $f6
#define c08 $f7
#define c09 $f8
#define c10 $f9
#define c11 $f10
#define c12 $f11
#define c13 $f12
#define c14 $f13
#define c15 $f14
#define c16 $f15
#define TMP1 $0
#define TMP2 $1
#define KK $2
#define BB $3
#define OFFSET $4
#define ALPHA_R 64($sp)
#define ALPHA_I 72($sp)
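/* ADD1..ADD4 choose the add/subtract pattern for the four partial products of
   a complex multiply (ar + i*ai)*(br + i*bi), one case per conjugation mode:
     plain:           re = ar*br - ai*bi    im = ar*bi + ai*br
     conj(A)*B:       re = ar*br + ai*bi    im = ar*bi - ai*br
     A*conj(B):       re = ar*br + ai*bi    im = ai*br - ar*bi
     conj(A)*conj(B): re = ar*br - ai*bi    im = -(ar*bi + ai*br)  */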
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define ADD1 ADD
#define ADD2 SUB
#define ADD3 ADD
#define ADD4 ADD
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define ADD1 ADD
#define ADD2 ADD
#define ADD3 SUB
#define ADD4 ADD
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define ADD1 ADD
#define ADD2 ADD
#define ADD3 ADD
#define ADD4 SUB
#else
#define ADD1 ADD
#define ADD2 SUB
#define ADD3 SUB
#define ADD4 SUB
#endif
CNAME:
.frame $sp, STACKSIZE, $26, 0
#ifdef PROFILE
ldgp $gp, 0($27)
lda $at, _mcount
jsr $at, ($at), _mcount
#endif
#ifndef PROFILE
.prologue 0
#else
.prologue 1
#endif
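/* Prologue: allocate the frame, load the stack-passed arguments (B, C, LDC
   and OFFSET), convert LDC to a byte stride, save callee-saved $f2-$f9,
   spill alpha, and return immediately if M, N or K is <= 0.  */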
lda $sp, -STACKSIZE($sp)
ldq B, 0 + STACKSIZE($sp)
ldq C, 8 + STACKSIZE($sp)
ldq LDC, 16 + STACKSIZE($sp)
#ifdef TRMMKERNEL
ldq OFFSET, 24 + STACKSIZE($sp)
#endif
sll LDC, ZBASE_SHIFT, LDC
stt $f2, 0($sp)
stt $f3, 8($sp)
stt $f4, 16($sp)
stt $f5, 24($sp)
stt $f6, 32($sp)
stt $f7, 40($sp)
stt $f8, 48($sp)
stt $f9, 56($sp)
stt $f19, ALPHA_R
stt $f20, ALPHA_I
cmple M, 0, $0
cmple N, 0, $1
cmple K, 0, $2
or $0, $1, $0
or $0, $2, $0
bne $0, $L999
#if defined(TRMMKERNEL) && !defined(LEFT)
subq $31, OFFSET, KK
#endif
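/* Outer loop ($L01): two columns of C per iteration (J = N >> 1).  C1/C2
   track the two output columns, and BB (B plus 4*K elements, i.e. the start
   of the next packed B panel) feeds the prefetch hints issued at the top of
   $L11 on EV5/EV6.  */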
sra N, 1, J
ble J, $L30
.align 4
$L01:
mov C, C1
addq C, LDC, C2
mov A, AO
s4addq K, 0, BB
#if defined(TRMMKERNEL) && defined(LEFT)
mov OFFSET, KK
#endif
SXADDQ BB, B, BB
addq C2, LDC, C
unop
sra M, 1, I
fclr t1
fclr t2
fclr t3
fclr t4
fclr c01
fclr c05
ble I, $L20
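/* $L11: 2x2 block of C.  Sixteen accumulators c01-c16 hold four partial
   products for each of the four complex results.  The TRMM variants adjust
   AO/BO and take the k trip count from TMP1 (according to LEFT/TRANSA)
   instead of K.  */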
.align 4
$L11:
#ifndef EV4
ldl $31, 0 * SIZE(BB)
ldl $31, 8 * SIZE(BB)
unop
lda BB, 16 * SIZE(BB)
#endif
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
#ifdef TRMMKERNEL
#ifdef LEFT
addq KK, 2, TMP1
#else
addq KK, 2, TMP1
#endif
#endif
LD a1, 0 * SIZE(AO)
fclr c09
LD a2, 1 * SIZE(AO)
fclr c13
LD a3, 2 * SIZE(AO)
fclr c02
LD a4, 3 * SIZE(AO)
fclr c06
LD b1, 0 * SIZE(B)
fclr c10
LD b2, 1 * SIZE(B)
fclr c14
LD b3, 2 * SIZE(B)
fclr c03
LD b4, 3 * SIZE(B)
fclr c07
lda BO, 4 * SIZE(B)
fclr c11
lda AO, 4 * SIZE(AO)
fclr c15
lds $f31, 4 * SIZE(C1)
fclr c04
#ifndef TRMMKERNEL
lda L, -2(K)
#else
lda L, -2(TMP1)
#endif
fclr c08
lds $f31, 4 * SIZE(C2)
fclr c12
fclr c16
ble L, $L15
#else
sll KK, ZBASE_SHIFT + 1, TMP1
addq AO, TMP1, AO
addq B, TMP1, BO
subq K, KK, TMP1
LD a1, 0 * SIZE(AO)
fclr c09
LD a2, 1 * SIZE(AO)
fclr c13
LD a3, 2 * SIZE(AO)
fclr c02
LD a4, 3 * SIZE(AO)
fclr c06
LD b1, 0 * SIZE(BO)
fclr c10
LD b2, 1 * SIZE(BO)
fclr c14
LD b3, 2 * SIZE(BO)
fclr c03
LD b4, 3 * SIZE(BO)
fclr c07
lda BO, 4 * SIZE(BO)
fclr c11
lda AO, 4 * SIZE(AO)
fclr c15
lds $f31, 4 * SIZE(C1)
fclr c04
lda L, -2(TMP1)
fclr c08
lds $f31, 4 * SIZE(C2)
fclr c12
fclr c16
ble L, $L15
#endif
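/* $L12: main k loop, two iterations per pass and software-pipelined: a5/a6/b5
   carry the loads for the next step while the current multiplies issue, and
   EV5/EV6 prefetch PREFETCHSIZE elements ahead of AO and BO.  */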
.align 5
$L12:
/* 1 */
ADD1 c11, t1, c11
#ifndef EV4
ldq $31, PREFETCHSIZE * SIZE(AO)
#else
unop
#endif
MUL b1, a1, t1
#ifndef EV4
ldl $31, PREFETCHSIZE * SIZE(BO)
#else
unop
#endif
ADD3 c12, t2, c12
unop
MUL b1, a2, t2
unop
ADD2 c16, t3, c16
unop
MUL b2, a2, t3
LD a5, 0 * SIZE(AO)
ADD4 c15, t4, c15
unop
MUL b2, a1, t4
LD b5, 0 * SIZE(BO)
/* 2 */
ADD1 c01, t1, c01
UNOP
MUL b1, a3, t1
UNOP
ADD3 c02, t2, c02
UNOP
MUL b1, a4, t2
UNOP
ADD2 c06, t3, c06
unop
MUL b2, a4, t3
unop
ADD4 c05, t4, c05
unop
MUL b4, a1, t4
unop
/* 3 */
ADD1 c03, t1, c03
unop
MUL b3, a1, t1
unop
ADD3 c04, t2, c04
unop
MUL b3, a2, t2
unop
ADD2 c08, t3, c08
unop
MUL b4, a2, t3
LD a2, 1 * SIZE(AO)
ADD4 c13, t4, c13
unop
MUL b2, a3, t4
LD b2, 1 * SIZE(BO)
/* 4 */
ADD1 c09, t1, c09
unop
MUL b3, a3, t1
LD a6, 2 * SIZE(AO)
ADD3 c10, t2, c10
unop
MUL b3, a4, t2
LD b3, 2 * SIZE(BO)
ADD2 c14, t3, c14
unop
MUL b4, a4, t3
LD a4, 3 * SIZE(AO)
ADD4 c07, t4, c07
unop
MUL b4, a3, t4
LD b4, 3 * SIZE(BO)
/* 5 */
ADD1 c11, t1, c11
unop
MUL b5, a5, t1
LD a1, 4 * SIZE(AO)
ADD3 c12, t2, c12
lda L, -2(L)
MUL b5, a2, t2
LD b1, 4 * SIZE(BO)
ADD2 c16, t3, c16
unop
MUL b2, a2, t3
unop
ADD4 c15, t4, c15
unop
MUL b2, a5, t4
unop
/* 6 */
ADD1 c01, t1, c01
unop
MUL b5, a6, t1
unop
ADD3 c02, t2, c02
unop
MUL b5, a4, t2
unop
ADD2 c06, t3, c06
unop
MUL b2, a4, t3
unop
ADD4 c05, t4, c05
unop
MUL b4, a5, t4
unop
/* 7 */
ADD1 c03, t1, c03
lda AO, 8 * SIZE(AO)
MUL b3, a5, t1
unop
ADD3 c04, t2, c04
lda BO, 8 * SIZE(BO)
MUL b3, a2, t2
unop
ADD2 c08, t3, c08
unop
MUL b4, a2, t3
LD a2, -3 * SIZE(AO)
ADD4 c13, t4, c13
unop
MUL b2, a6, t4
LD b2, -3 * SIZE(BO)
/* 8 */
ADD1 c09, t1, c09
unop
MUL b3, a6, t1
LD a3, -2 * SIZE(AO)
ADD3 c10, t2, c10
unop
MUL b3, a4, t2
LD b3, -2 * SIZE(BO)
ADD2 c14, t3, c14
unop
MUL b4, a4, t3
LD a4, -1 * SIZE(AO)
ADD4 c07, t4, c07
MUL b4, a6, t4
LD b4, -1 * SIZE(BO)
bgt L, $L12
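/* $L15: loop tail.  blbs tests the low bit of K (TMP1 for TRMM): when it is
   set the code jumps straight to $L18, otherwise one more pipelined k step
   runs first.  */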
.align 4
$L15:
ADD1 c11, t1, c11
ldt alpha_r, ALPHA_R
MUL b1, a1, t1
#ifndef TRMMKERNEL
blbs K, $L18
#else
blbs TMP1, $L18
#endif
.align 4
ADD3 c12, t2, c12
MUL b1, a2, t2
ADD2 c16, t3, c16
MUL b2, a2, t3
ADD4 c15, t4, c15
MUL b2, a1, t4
ADD1 c01, t1, c01
MUL b1, a3, t1
ADD3 c02, t2, c02
unop
MUL b1, a4, t2
LD b1, 0 * SIZE(BO)
ADD2 c06, t3, c06
MUL b2, a4, t3
ADD4 c05, t4, c05
MUL b4, a1, t4
ADD1 c03, t1, c03
unop
MUL b3, a1, t1
LD a1, 0 * SIZE(AO)
ADD3 c04, t2, c04
unop
MUL b3, a2, t2
unop
ADD2 c08, t3, c08
unop
MUL b4, a2, t3
LD a2, 1 * SIZE(AO)
ADD4 c13, t4, c13
unop
MUL b2, a3, t4
LD b2, 1 * SIZE(BO)
ADD1 c09, t1, c09
unop
MUL b3, a3, t1
lda AO, 4 * SIZE(AO)
ADD3 c10, t2, c10
unop
MUL b3, a4, t2
LD b3, 2 * SIZE(BO)
ADD2 c14, t3, c14
unop
MUL b4, a4, t3
LD a4, -1 * SIZE(AO)
ADD4 c07, t4, c07
unop
MUL b4, a3, t4
LD a3, -2 * SIZE(AO)
ADD1 c11, t1, c11
LD b4, 3 * SIZE(BO)
MUL b1, a1, t1
lda BO, 4 * SIZE(BO)
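/* $L18: finish the last k step, fold the partial sums (re = c01+c06, ...),
   scale by alpha as a complex value, accumulate into the loaded C entries
   (skipped for TRMM, which stores the scaled result directly), and write the
   2x2 block back; the TRMM path then advances AO/BO/KK past this block.  */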
.align 4
$L18:
ADD3 c12, t2, c12
unop
MUL b1, a2, t2
ldt alpha_i, ALPHA_I
ADD2 c16, t3, c16
unop
MUL b2, a2, t3
#ifndef TRMMKERNEL
LD a5, 0 * SIZE(C1)
#else
unop
#endif
ADD4 c15, t4, c15
MUL b2, a1, t4
ADD1 c01, t1, c01
MUL b1, a3, t1
ADD3 c02, t2, c02
unop
MUL b1, a4, t2
#ifndef TRMMKERNEL
LD b1, 1 * SIZE(C1)
#else
unop
#endif
ADD2 c06, t3, c06
MUL b2, a4, t3
ADD4 c05, t4, c05
MUL b4, a1, t4
ADD1 c03, t1, c03
unop
MUL b3, a1, t1
#ifndef TRMMKERNEL
LD a1, 2 * SIZE(C1)
#else
unop
#endif
ADD3 c04, t2, c04
unop
MUL b3, a2, t2
unop
ADD2 c08, t3, c08
unop
MUL b4, a2, t3
#ifndef TRMMKERNEL
LD a2, 3 * SIZE(C1)
#else
unop
#endif
ADD4 c13, t4, c13
unop
MUL b2, a3, t4
#ifndef TRMMKERNEL
LD b2, 0 * SIZE(C2)
#else
unop
#endif
ADD1 c09, t1, c09
lda I, -1(I)
MUL b3, a3, t1
unop
ADD3 c10, t2, c10
unop
MUL b3, a4, t2
#ifndef TRMMKERNEL
LD b3, 1 * SIZE(C2)
#else
unop
#endif
ADD2 c14, t3, c14
unop
MUL b4, a4, t3
#ifndef TRMMKERNEL
LD a4, 2 * SIZE(C2)
#else
unop
#endif
ADD4 c07, t4, c07
unop
MUL b4, a3, t4
#ifndef TRMMKERNEL
LD a3, 3 * SIZE(C2)
#else
unop
#endif
ADD1 c11, t1, c11
ADD3 c12, t2, c12
ADD2 c16, t3, c16
ADD4 c15, t4, c15
ADD c01, c06, c01
ADD c02, c05, c02
ADD c03, c08, c03
ADD c04, c07, c04
ADD c09, c14, c09
MUL alpha_r, c01, t1
ADD c10, c13, c10
MUL alpha_r, c02, t2
ADD c11, c16, c11
MUL alpha_r, c03, t3
ADD c12, c15, c12
MUL alpha_r, c04, t4
#ifndef TRMMKERNEL
ADD a5, t1, a5
MUL alpha_i, c02, t1
ADD b1, t2, b1
MUL alpha_i, c01, t2
ADD a1, t3, a1
MUL alpha_i, c04, t3
ADD a2, t4, a2
MUL alpha_i, c03, t4
#else
ADD $f31, t1, a5
MUL alpha_i, c02, t1
ADD $f31, t2, b1
MUL alpha_i, c01, t2
ADD $f31, t3, a1
MUL alpha_i, c04, t3
ADD $f31, t4, a2
MUL alpha_i, c03, t4
#endif
SUB a5, t1, a5
MUL alpha_r, c09, t1
ADD b1, t2, b1
MUL alpha_r, c10, t2
SUB a1, t3, a1
MUL alpha_r, c11, t3
ADD a2, t4, a2
MUL alpha_r, c12, t4
#ifndef TRMMKERNEL
ADD b2, t1, b2
MUL alpha_i, c10, t1
ADD b3, t2, b3
MUL alpha_i, c09, t2
ADD a4, t3, a4
MUL alpha_i, c12, t3
ADD a3, t4, a3
MUL alpha_i, c11, t4
#else
ADD $f31, t1, b2
MUL alpha_i, c10, t1
ADD $f31, t2, b3
MUL alpha_i, c09, t2
ADD $f31, t3, a4
MUL alpha_i, c12, t3
ADD $f31, t4, a3
MUL alpha_i, c11, t4
#endif
SUB b2, t1, b2
ST a5, 0 * SIZE(C1)
fclr t1
unop
ADD b3, t2, b3
ST b1, 1 * SIZE(C1)
fclr t2
unop
SUB a4, t3, a4
ST a1, 2 * SIZE(C1)
fclr t3
unop
ADD a3, t4, a3
ST a2, 3 * SIZE(C1)
fclr t4
unop
ST b2, 0 * SIZE(C2)
fclr c01
ST b3, 1 * SIZE(C2)
fclr c05
ST a4, 2 * SIZE(C2)
lda C1, 4 * SIZE(C1)
ST a3, 3 * SIZE(C2)
lda C2, 4 * SIZE(C2)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
subq K, KK, TMP1
#ifdef LEFT
subq TMP1, 2, TMP1
#else
subq TMP1, 2, TMP1
#endif
sll TMP1, ZBASE_SHIFT + 1, TMP1
addq AO, TMP1, AO
addq BO, TMP1, BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq KK, 2, KK
#endif
bgt I, $L11
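/* $L20: row remainder for this column pair - when M is odd, one final row of
   C is updated against both columns (1x2 block).  */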
.align 4
$L20:
and M, 1, I
ble I, $L29
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
#ifdef TRMMKERNEL
#ifdef LEFT
addq KK, 1, TMP1
#else
addq KK, 2, TMP1
#endif
#endif
LD a1, 0 * SIZE(AO)
fclr c09
LD a2, 1 * SIZE(AO)
fclr c13
LD a3, 2 * SIZE(AO)
fclr c02
LD a4, 3 * SIZE(AO)
fclr c06
LD b1, 0 * SIZE(B)
fclr c10
LD b2, 1 * SIZE(B)
fclr c14
LD b3, 2 * SIZE(B)
lda AO, 2 * SIZE(AO)
LD b4, 3 * SIZE(B)
lda BO, 4 * SIZE(B)
#ifndef TRMMKERNEL
lda L, -2(K)
#else
lda L, -2(TMP1)
#endif
ble L, $L25
#else
sll KK, ZBASE_SHIFT + 0, TMP1
addq AO, TMP1, AO
sll KK, ZBASE_SHIFT + 1, TMP1
addq B, TMP1, BO
subq K, KK, TMP1
LD a1, 0 * SIZE(AO)
fclr c09
LD a2, 1 * SIZE(AO)
fclr c13
LD a3, 2 * SIZE(AO)
fclr c02
LD a4, 3 * SIZE(AO)
fclr c06
LD b1, 0 * SIZE(BO)
fclr c10
LD b2, 1 * SIZE(BO)
fclr c14
LD b3, 2 * SIZE(BO)
lda AO, 2 * SIZE(AO)
LD b4, 3 * SIZE(BO)
lda BO, 4 * SIZE(BO)
lda L, -2(TMP1)
ble L, $L25
#endif
.align 5
$L22:
ADD1 c09, t1, c09
unop
MUL a1, b1, t1
unop
ADD3 c10, t2, c10
unop
MUL a2, b1, t2
LD b1, 0 * SIZE(BO)
ADD4 c13, t3, c13
unop
MUL a1, b2, t3
lda BO, 8 * SIZE(BO)
ADD2 c14, t4, c14
unop
MUL a2, b2, t4
LD b2, -7 * SIZE(BO)
ADD1 c01, t1, c01
unop
MUL a1, b3, t1
unop
ADD3 c02, t2, c02
unop
MUL a2, b3, t2
LD b3, -6 * SIZE(BO)
ADD4 c05, t3, c05
unop
MUL a1, b4, t3
LD a1, 2 * SIZE(AO)
ADD2 c06, t4, c06
MUL a2, b4, t4
LD b5, -5 * SIZE(BO)
ADD1 c09, t1, c09
unop
MUL a3, b1, t1
LD a2, 3 * SIZE(AO)
ADD3 c10, t2, c10
unop
MUL a4, b1, t2
LD b1, -4 * SIZE(BO)
ADD4 c13, t3, c13
unop
MUL a3, b2, t3
lda AO, 4 * SIZE(AO)
ADD2 c14, t4, c14
MUL a4, b2, t4
LD b2, -3 * SIZE(BO)
ADD1 c01, t1, c01
lda L, -2(L)
MUL a3, b3, t1
LD b4, -1 * SIZE(BO)
ADD3 c02, t2, c02
unop
MUL a4, b3, t2
LD b3, -2 * SIZE(BO)
ADD4 c05, t3, c05
unop
MUL a3, b5, t3
LD a3, 0 * SIZE(AO)
ADD2 c06, t4, c06
MUL a4, b5, t4
LD a4, 1 * SIZE(AO)
bgt L, $L22
.align 4
$L25:
ADD1 c09, t1, c09
ldt alpha_r, ALPHA_R
MUL a1, b1, t1
#ifndef TRMMKERNEL
blbs K, $L28
#else
blbs TMP1, $L28
#endif
.align 4
ADD3 c10, t2, c10
unop
MUL a2, b1, t2
LD b1, 0 * SIZE(BO)
ADD4 c13, t3, c13
unop
MUL a1, b2, t3
unop
ADD2 c14, t4, c14
unop
MUL a2, b2, t4
LD b2, 1 * SIZE(BO)
ADD1 c01, t1, c01
unop
MUL a1, b3, t1
lda AO, 2 * SIZE(AO)
ADD3 c02, t2, c02
unop
MUL a2, b3, t2
LD b3, 2 * SIZE(BO)
ADD4 c05, t3, c05
unop
MUL a1, b4, t3
LD a1, -2 * SIZE(AO)
ADD2 c06, t4, c06
unop
MUL a2, b4, t4
LD a2, -1 * SIZE(AO)
ADD1 c09, t1, c09
LD b4, 3 * SIZE(BO)
MUL a1, b1, t1
lda BO, 4 * SIZE(BO)
.align 4
$L28:
ADD3 c10, t2, c10
unop
MUL a2, b1, t2
ldt alpha_i, ALPHA_I
ADD4 c13, t3, c13
unop
MUL a1, b2, t3
#ifndef TRMMKERNEL
LD c03, 0 * SIZE(C1)
#else
unop
#endif
ADD2 c14, t4, c14
unop
MUL a2, b2, t4
#ifndef TRMMKERNEL
LD c04, 1 * SIZE(C1)
#else
unop
#endif
ADD1 c01, t1, c01
unop
MUL a1, b3, t1
#ifndef TRMMKERNEL
LD c11, 0 * SIZE(C2)
#else
unop
#endif
ADD3 c02, t2, c02
unop
MUL a2, b3, t2
#ifndef TRMMKERNEL
LD c12, 1 * SIZE(C2)
#else
unop
#endif
ADD4 c05, t3, c05
MUL a1, b4, t3
ADD2 c06, t4, c06
MUL a2, b4, t4
ADD1 c09, t1, c09
ADD3 c10, t2, c10
ADD4 c13, t3, c13
ADD2 c14, t4, c14
ADD c01, c06, c01
ADD c02, c05, c02
ADD c09, c14, c09
ADD c10, c13, c10
MUL alpha_r, c01, t1
MUL alpha_r, c02, t2
MUL alpha_r, c09, t3
MUL alpha_r, c10, t4
#ifndef TRMMKERNEL
ADD c03, t1, c03
MUL alpha_i, c02, t1
ADD c04, t2, c04
MUL alpha_i, c01, t2
ADD c11, t3, c11
MUL alpha_i, c10, t3
ADD c12, t4, c12
MUL alpha_i, c09, t4
#else
ADD $f31, t1, c03
MUL alpha_i, c02, t1
ADD $f31, t2, c04
MUL alpha_i, c01, t2
ADD $f31, t3, c11
MUL alpha_i, c10, t3
ADD $f31, t4, c12
MUL alpha_i, c09, t4
#endif
SUB c03, t1, c03
ADD c04, t2, c04
SUB c11, t3, c11
ADD c12, t4, c12
ST c03, 0 * SIZE(C1)
ST c04, 1 * SIZE(C1)
ST c11, 0 * SIZE(C2)
ST c12, 1 * SIZE(C2)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
subq K, KK, TMP1
#ifdef LEFT
subq TMP1, 1, TMP1
#else
subq TMP1, 2, TMP1
#endif
sll TMP1, ZBASE_SHIFT + 0, TMP2
addq AO, TMP2, AO
sll TMP1, ZBASE_SHIFT + 1, TMP2
addq BO, TMP2, BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq KK, 1, KK
#endif
.align 4
$L29:
mov BO, B
lda J, -1(J)
#if defined(TRMMKERNEL) && !defined(LEFT)
addq KK, 2, KK
#else
unop
#endif
bgt J, $L01
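/* $L30: column remainder - when N is odd, the last column of C is processed
   with a 2x1 kernel ($L41) and, if M is also odd, a closing 1x1 kernel
   ($L50).  */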
.align 4
$L30:
and N, 1, J
ble J, $L999
mov C, C1
mov A, AO
#if defined(TRMMKERNEL) && defined(LEFT)
mov OFFSET, KK
#endif
sra M, 1, I
ble I, $L50
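/* $L41: 2x1 block - two rows of C against the single remaining column of B.  */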
.align 4
$L41:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
#ifdef TRMMKERNEL
#ifdef LEFT
addq KK, 2, TMP1
#else
addq KK, 1, TMP1
#endif
#endif
LD a1, 0 * SIZE(AO)
fclr t1
LD a2, 1 * SIZE(AO)
fclr t2
LD a3, 2 * SIZE(AO)
fclr t3
LD a4, 3 * SIZE(AO)
fclr t4
LD b1, 0 * SIZE(B)
fclr c01
LD b2, 1 * SIZE(B)
fclr c05
LD b3, 2 * SIZE(B)
fclr c02
LD b4, 3 * SIZE(B)
fclr c06
lda BO, 2 * SIZE(B)
fclr c03
lda AO, 4 * SIZE(AO)
fclr c07
#ifndef TRMMKERNEL
lda L, -2(K)
#else
lda L, -2(TMP1)
#endif
fclr c04
fclr c08
ble L, $L45
#else
sll KK, ZBASE_SHIFT + 1, TMP1
addq AO, TMP1, AO
sll KK, ZBASE_SHIFT + 0, TMP1
addq B, TMP1, BO
subq K, KK, TMP1
LD a1, 0 * SIZE(AO)
fclr t1
LD a2, 1 * SIZE(AO)
fclr t2
LD a3, 2 * SIZE(AO)
fclr t3
LD a4, 3 * SIZE(AO)
fclr t4
LD b1, 0 * SIZE(BO)
fclr c01
LD b2, 1 * SIZE(BO)
fclr c05
LD b3, 2 * SIZE(BO)
fclr c02
LD b4, 3 * SIZE(BO)
fclr c06
lda BO, 2 * SIZE(BO)
fclr c03
lda AO, 4 * SIZE(AO)
fclr c07
lda L, -2(TMP1)
fclr c04
fclr c08
ble L, $L45
#endif
.align 5
$L42:
ADD4 c05, t1, c05
unop
MUL a1, b1, t1
unop
ADD2 c06, t2, c06
lda L, -2(L)
MUL a2, b1, t2
unop
ADD4 c07, t3, c07
unop
MUL a3, b1, t3
unop
ADD2 c08, t4, c08
unop
MUL a4, b1, t4
LD b1, 2 * SIZE(BO)
ADD1 c01, t1, c01
unop
MUL a1, b2, t1
LD a1, 0 * SIZE(AO)
ADD3 c02, t2, c02
lda BO, 4 * SIZE(BO)
MUL a2, b2, t2
LD a2, 1 * SIZE(AO)
ADD1 c03, t3, c03
unop
MUL a3, b2, t3
LD a3, 2 * SIZE(AO)
ADD3 c04, t4, c04
unop
MUL a4, b2, t4
LD a5, 3 * SIZE(AO)
ADD4 c05, t1, c05
unop
MUL a1, b3, t1
LD b2, -1 * SIZE(BO)
ADD2 c06, t2, c06
unop
MUL a2, b3, t2
unop
ADD4 c07, t3, c07
unop
MUL a3, b3, t3
lda AO, 8 * SIZE(AO)
ADD2 c08, t4, c08
unop
MUL a5, b3, t4
LD b3, 0 * SIZE(BO)
ADD1 c01, t1, c01
unop
MUL a1, b4, t1
LD a1, -4 * SIZE(AO)
ADD3 c02, t2, c02
unop
MUL a2, b4, t2
LD a2, -3 * SIZE(AO)
ADD1 c03, t3, c03
LD a4, -1 * SIZE(AO)
MUL a3, b4, t3
LD a3, -2 * SIZE(AO)
ADD3 c04, t4, c04
MUL a5, b4, t4
LD b4, 1 * SIZE(BO)
bgt L, $L42
.align 4
$L45:
ADD4 c05, t1, c05
ldt alpha_r, ALPHA_R
MUL b1, a1, t1
#ifndef TRMMKERNEL
blbs K, $L48
#else
blbs TMP1, $L48
#endif
.align 4
ADD2 c06, t2, c06
MUL a2, b1, t2
ADD4 c07, t3, c07
MUL a3, b1, t3
ADD2 c08, t4, c08
unop
MUL a4, b1, t4
LD b1, 0 * SIZE(BO)
ADD1 c01, t1, c01
unop
MUL a1, b2, t1
LD a1, 0 * SIZE(AO)
ADD3 c02, t2, c02
unop
MUL a2, b2, t2
LD a2, 1 * SIZE(AO)
ADD1 c03, t3, c03
unop
MUL a3, b2, t3
LD a3, 2 * SIZE(AO)
ADD3 c04, t4, c04
MUL a4, b2, t4
LD a4, 3 * SIZE(AO)
lda AO, 4 * SIZE(AO)
ADD4 c05, t1, c05
LD b2, 1 * SIZE(BO)
MUL a1, b1, t1
lda BO, 2 * SIZE(BO)
.align 4
$L48:
ADD2 c06, t2, c06
unop
MUL a2, b1, t2
ldt alpha_i, ALPHA_I
ADD4 c07, t3, c07
lda I, -1(I)
MUL a3, b1, t3
#ifndef TRMMKERNEL
LD c09, 0 * SIZE(C1)
#else
unop
#endif
ADD2 c08, t4, c08
unop
MUL a4, b1, t4
#ifndef TRMMKERNEL
LD c10, 1 * SIZE(C1)
#else
unop
#endif
ADD1 c01, t1, c01
unop
MUL a1, b2, t1
#ifndef TRMMKERNEL
LD c11, 2 * SIZE(C1)
#else
unop
#endif
ADD3 c02, t2, c02
unop
MUL a2, b2, t2
#ifndef TRMMKERNEL
LD c12, 3 * SIZE(C1)
#else
unop
#endif
ADD1 c03, t3, c03
MUL a3, b2, t3
ADD3 c04, t4, c04
MUL a4, b2, t4
ADD4 c05, t1, c05
ADD2 c06, t2, c06
ADD4 c07, t3, c07
ADD2 c08, t4, c08
ADD c01, c06, c01
ADD c02, c05, c02
ADD c03, c08, c03
ADD c04, c07, c04
MUL alpha_r, c01, t1
MUL alpha_r, c02, t2
MUL alpha_r, c03, t3
MUL alpha_r, c04, t4
#ifndef TRMMKERNEL
ADD c09, t1, c09
MUL alpha_i, c02, t1
ADD c10, t2, c10
MUL alpha_i, c01, t2
ADD c11, t3, c11
MUL alpha_i, c04, t3
ADD c12, t4, c12
MUL alpha_i, c03, t4
#else
ADD $f31, t1, c09
MUL alpha_i, c02, t1
ADD $f31, t2, c10
MUL alpha_i, c01, t2
ADD $f31, t3, c11
MUL alpha_i, c04, t3
ADD $f31, t4, c12
MUL alpha_i, c03, t4
#endif
SUB c09, t1, c09
ADD c10, t2, c10
SUB c11, t3, c11
ADD c12, t4, c12
ST c09, 0 * SIZE(C1)
ST c10, 1 * SIZE(C1)
ST c11, 2 * SIZE(C1)
ST c12, 3 * SIZE(C1)
lda C1, 4 * SIZE(C1)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
subq K, KK, TMP1
#ifdef LEFT
subq TMP1, 2, TMP1
#else
subq TMP1, 1, TMP1
#endif
sll TMP1, ZBASE_SHIFT + 1, TMP2
addq AO, TMP2, AO
sll TMP1, ZBASE_SHIFT + 0, TMP2
addq BO, TMP2, BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq KK, 2, KK
#endif
bgt I, $L41
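/* $L50: final 1x1 block - a single complex element of C.  */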
.align 4
$L50:
and M, 1, I
ble I, $L999
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
#ifdef TRMMKERNEL
#ifdef LEFT
addq KK, 1, TMP1
#else
addq KK, 1, TMP1
#endif
#endif
LD a1, 0 * SIZE(AO)
fclr t1
LD a2, 1 * SIZE(AO)
fclr t2
LD a3, 2 * SIZE(AO)
fclr t3
LD a4, 3 * SIZE(AO)
fclr t4
LD b1, 0 * SIZE(B)
fclr c01
LD b2, 1 * SIZE(B)
fclr c05
LD b3, 2 * SIZE(B)
fclr c02
LD b4, 3 * SIZE(B)
fclr c06
lda AO, 2 * SIZE(AO)
lda BO, 2 * SIZE(B)
#ifndef TRMMKERNEL
lda L, -2(K)
#else
lda L, -2(TMP1)
#endif
ble L, $L55
#else
sll KK, ZBASE_SHIFT + 0, TMP1
addq AO, TMP1, AO
addq B, TMP1, BO
subq K, KK, TMP1
LD a1, 0 * SIZE(AO)
fclr t1
LD a2, 1 * SIZE(AO)
fclr t2
LD a3, 2 * SIZE(AO)
fclr t3
LD a4, 3 * SIZE(AO)
fclr t4
LD b1, 0 * SIZE(BO)
fclr c01
LD b2, 1 * SIZE(BO)
fclr c05
LD b3, 2 * SIZE(BO)
fclr c02
LD b4, 3 * SIZE(BO)
fclr c06
lda AO, 2 * SIZE(AO)
lda BO, 2 * SIZE(BO)
lda L, -2(TMP1)
ble L, $L55
#endif
.align 5
$L52:
ADD1 c01, t1, c01
unop
MUL a1, b1, t1
unop
ADD3 c02, t2, c02
lda AO, 4 * SIZE(AO)
MUL a2, b1, t2
LD b1, 2 * SIZE(BO)
ADD4 c05, t3, c05
lda L, -2(L)
MUL a1, b2, t3
LD a1, -2 * SIZE(AO)
ADD2 c06, t4, c06
unop
MUL a2, b2, t4
LD a2, -1 * SIZE(AO)
ADD1 c01, t1, c01
LD b2, 3 * SIZE(BO)
MUL a3, b3, t1
lda BO, 4 * SIZE(BO)
ADD3 c02, t2, c02
unop
MUL a4, b3, t2
LD b3, 0 * SIZE(BO)
ADD4 c05, t3, c05
unop
MUL a3, b4, t3
LD a3, 0 * SIZE(AO)
ADD2 c06, t4, c06
MUL a4, b4, t4
LD b4, 1 * SIZE(BO)
unop
LD a4, 1 * SIZE(AO)
unop
unop
bgt L, $L52
.align 4
$L55:
ADD1 c01, t1, c01
ldt alpha_r, ALPHA_R
MUL a1, b1, t1
#ifndef TRMMKERNEL
blbs K, $L58
#else
blbs TMP1, $L58
#endif
.align 4
ADD3 c02, t2, c02
unop
MUL a2, b1, t2
LD b1, 0 * SIZE(BO)
ADD4 c05, t3, c05
lda BO, 2 * SIZE(BO)
MUL a1, b2, t3
LD a1, 0 * SIZE(AO)
ADD2 c06, t4, c06
unop
MUL a2, b2, t4
LD a2, 1 * SIZE(AO)
ADD1 c01, t1, c01
LD b2, -1 * SIZE(BO)
MUL a1, b1, t1
lda AO, 2 * SIZE(AO)
.align 4
$L58:
ADD3 c02, t2, c02
unop
MUL a2, b1, t2
ldt alpha_i, ALPHA_I
ADD4 c05, t3, c05
unop
MUL a1, b2, t3
#ifndef TRMMKERNEL
LD c03, 0 * SIZE(C1)
#else
unop
#endif
ADD2 c06, t4, c06
unop
MUL a2, b2, t4
#ifndef TRMMKERNEL
LD c04, 1 * SIZE(C1)
#else
unop
#endif
ADD1 c01, t1, c01
ADD3 c02, t2, c02
ADD4 c05, t3, c05
ADD2 c06, t4, c06
ADD c01, c06, c01
ADD c02, c05, c02
MUL alpha_r, c01, t1
MUL alpha_r, c02, t2
MUL alpha_i, c02, t3
MUL alpha_i, c01, t4
#ifndef TRMMKERNEL
ADD c03, t1, c03
ADD c04, t2, c04
#else
ADD $f31, t1, c03
ADD $f31, t2, c04
#endif
SUB c03, t3, c03
ADD c04, t4, c04
ST c03, 0 * SIZE(C1)
ST c04, 1 * SIZE(C1)
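/* $L999: epilogue - restore callee-saved $f2-$f9, clear the return value,
   release the frame and return.  */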
.align 4
$L999:
ldt $f2, 0($sp)
ldt $f3, 8($sp)
ldt $f4, 16($sp)
ldt $f5, 24($sp)
ldt $f6, 32($sp)
ldt $f7, 40($sp)
ldt $f8, 48($sp)
ldt $f9, 56($sp)
clr $0
lda $sp, STACKSIZE($sp)
ret
.ident VERSION
.end CNAME