/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
#include "version.h"
/* At least one Alpha core generation (EV4, EV5 or EV6) must be selected
   by the build; the macros below are tuned per generation. */
#if !defined(EV4) && !defined(EV5) && !defined(EV6)
#error "Architecture is not specified."
#endif
/* PREFETCHSIZE: element offset (scaled by SIZE) used by the loads into
   $31 inside the 4x4 inner loop; those loads are compiled only when EV4
   is not defined, which is why EV4 gets no PREFETCHSIZE.
   UNOP: expands to a real `unop` on EV6 and to nothing on EV5/EV4. */
#ifdef EV6
#define PREFETCHSIZE 56
#define UNOP unop
#endif
#ifdef EV5
#define PREFETCHSIZE 56
#define UNOP
#endif
#ifdef EV4
#define UNOP
#endif
/* Stack frame size in bytes.  The prologue stores $f2-$f9 in the first
   eight 8-byte slots (offsets 0..56). */
#define STACKSIZE 80
/* Problem dimensions. */
#define M $16
#define N $17
#define K $18
/* Operand base pointers.  C and LDC are not register arguments: the
   prologue loads them (and OFFSET) from just above this frame. */
#define A $20
#define B $21
#define C $22
#define LDC $23
/* Column pointers into C for the panel currently being processed
   (C1 = C, C2 = C1 + LDC, ... as set up at the top of each panel). */
#define C1 $19
#define C2 $24
#define C3 $25
#define C4 $27
/* Running pointers into A and B, and loop counters:
   I = row-tile counter, J = column-panel counter, L = inner (K) counter. */
#define AO $at
#define BO $5
#define I $6
#define J $7
#define L $8
/* Floating-point operand registers loaded from A (a*) and B (b*). */
#define a1 $f16
#define a2 $f17
#define a3 $f18
#define a4 $f19
#define b1 $f20
#define b2 $f21
#define b3 $f22
#define b4 $f23
/* Product temporaries: every inner loop does MUL -> tN, then ADD cXX, tN, cXX
   on the following step, so t1-t4 must be zero when a loop is entered. */
#define t1 $f24
#define t2 $f25
#define t3 $f26
#define t4 $f27
/* Extra operand registers used by the unrolled loops.
   NOTE: a6 and alpha both name $f30; alpha is not referenced in the
   visible part of this file. */
#define a5 $f28
#define a6 $f30
#define b5 $f29
#define alpha $f30
/* Accumulators for the (up to) 4x4 result tile.  c03-c10 live in
   $f2-$f9, which is why the prologue saves those registers. */
#define c01 $f0
#define c02 $f1
#define c03 $f2
#define c04 $f3
#define c05 $f4
#define c06 $f5
#define c07 $f6
#define c08 $f7
#define c09 $f8
#define c10 $f9
#define c11 $f10
#define c12 $f11
#define c13 $f12
#define c14 $f13
#define c15 $f14
#define c16 $f15
/* Integer scratch and TRSM bookkeeping.  $0-$2 are also used raw by the
   prologue's dimension check, before TMP1/TMP2/KK carry live values.
   KK    = current offset along K separating the already-solved part
           from the part still to be updated (see the KK updates after
           each tile),
   AORIG = base of the current A panel in the LN/RT variants,
   OFFSET= caller-supplied starting offset used to initialise KK. */
#define TMP1 $0
#define TMP2 $1
#define KK $2
#define AORIG $3
#define OFFSET $4
/* Routine entry.  PROLOGUE / PROFCODE are macros supplied by common.h
   (symbol declaration and optional profiling hook -- not visible here). */
PROLOGUE
PROFCODE
.frame $sp, STACKSIZE, $26, 0
/* Allocate the frame, then fetch the stack-passed arguments from just
   above it (i.e. relative to the caller's $sp). */
lda $sp, -STACKSIZE($sp)
ldq C, 0 + STACKSIZE($sp)
ldq LDC, 8 + STACKSIZE($sp)
ldq OFFSET, 16 + STACKSIZE($sp)
/* SXADDQ is a common.h macro; presumably a scaled add that converts LDC
   from elements to bytes (LDC = LDC * SIZE + 0) -- confirm in common.h. */
SXADDQ LDC, 0, LDC
/* Save $f2-$f9: they are used below as accumulators c03-c10. */
stt $f2, 0($sp)
stt $f3, 8($sp)
stt $f4, 16($sp)
stt $f5, 24($sp)
stt $f6, 32($sp)
stt $f7, 40($sp)
stt $f8, 48($sp)
stt $f9, 56($sp)
/* Nothing to do if any of M, N, K is <= 0: branch to $L999
   (defined past the end of this view). */
cmple M, 0, $0
cmple N, 0, $1
cmple K, 0, $2
or $0, $1, $0
or $0, $2, $0
bne $0, $L999
#ifdef LN
/* LN walks the rows backwards: start A at its end (A += M*K, scaled by
   SXADDQ) and C past its last row (C += M, scaled). */
mulq M, K, TMP1
SXADDQ TMP1, A, A
SXADDQ M, C, C
#endif
#ifdef RN
/* RN: KK starts at -OFFSET and is advanced per column panel. */
negq OFFSET, KK
#endif
#ifdef RT
/* RT walks the columns backwards: start B at its end (B += N*K, scaled),
   C past its last column (C += N*LDC bytes), and KK at N - OFFSET. */
mulq N, K, TMP1
SXADDQ TMP1, B, B
mulq N, LDC, TMP1
addq TMP1, C, C
subq N, OFFSET, KK
#endif
/* J = number of full 4-column panels; skip to the 2-column remainder
   ($L40) when there are none. */
sra N, 2, J
ble J, $L40
.align 4
/* ---- Top of the loop over 4-column panels (J iterations) ---- */
$L01:
#ifdef RT
/* RT: step B back by one 4-column panel (K << (2 + BASE_SHIFT) bytes)
   and C back by four columns (4 * LDC bytes). */
sll K, 2 + BASE_SHIFT, TMP1
subq B, TMP1, B
s4addq LDC, 0, TMP1
subq C, TMP1, C
#endif
/* C1..C4 = the four columns of C covered by this panel. */
mov C, C1
addq C, LDC, C2
addq C2, LDC, C3
#ifndef RT
/* Forward variants: advance C to the next panel now (C += 4 * LDC). */
s4addq LDC, C, C
#endif
/* t1-t4 must be zero on entry to the inner loops, which begin with
   ADD cXX, tN, cXX before the first product is formed. */
fclr t1
addq C3, LDC, C4
fclr t2
#ifdef LN
/* LN: KK starts at M + OFFSET and is decremented after each row tile. */
addq M, OFFSET, KK
#endif
#ifdef LT
/* LT: KK starts at OFFSET and is incremented after each row tile. */
mov OFFSET, KK
#endif
#if defined(LN) || defined(RT)
/* LN/RT recompute AO from AORIG and KK for every tile. */
mov A, AORIG
#else
/* LT/RN stream through A with a running pointer. */
mov A, AO
#endif
fclr t3
fclr t4
/* First row tile: a single remaining row (M odd); otherwise go to the
   2-row tile at $L20. */
and M, 1, I
ble I, $L20
#if defined(LT) || defined(RN)
LD a1, 0 * SIZE(AO)
fclr c01
LD a2, 1 * SIZE(AO)
fclr c05
LD b1, 0 * SIZE(B)
lda L, -2(KK)
LD b2, 1 * SIZE(B)
lda AO, 1 * SIZE(AO)
LD b3, 2 * SIZE(B)
fclr c09
LD b4, 3 * SIZE(B)
fclr c13
lda BO, 4 * SIZE(B)
ble KK, $L38
ble L, $L35
#else
#ifdef LN
sll K, BASE_SHIFT + 0, TMP1
subq AORIG, TMP1, AORIG
#endif
sll KK, BASE_SHIFT + 0, TMP1
addq AORIG, TMP1, AO
sll KK, BASE_SHIFT + 2, TMP2
addq B, TMP2, BO
subq K, KK, TMP1
LD a1, 0 * SIZE(AO)
fclr c01
LD a2, 1 * SIZE(AO)
fclr c05
LD b1, 0 * SIZE(BO)
lda L, -2(TMP1)
LD b2, 1 * SIZE(BO)
lda AO, 1 * SIZE(AO)
LD b3, 2 * SIZE(BO)
fclr c09
LD b4, 3 * SIZE(BO)
fclr c13
lda BO, 4 * SIZE(BO)
ble TMP1, $L38
ble L, $L35
#endif
.align 4
$L32:
ADD c01, t1, c01
lda L, -2(L)
MUL a1, b1, t1
LD b1, 0 * SIZE(BO)
ADD c05, t2, c05
lda AO, 2 * SIZE(AO)
MUL a1, b2, t2
LD b2, 1 * SIZE(BO)
ADD c09, t3, c09
LD b5, 3 * SIZE(BO)
MUL a1, b3, t3
LD b3, 2 * SIZE(BO)
ADD c13, t4, c13
MUL a1, b4, t4
LD a1, -1 * SIZE(AO)
ADD c01, t1, c01
MUL a2, b1, t1
LD b1, 4 * SIZE(BO)
lda BO, 8 * SIZE(BO)
ADD c05, t2, c05
MUL a2, b2, t2
LD b2, -3 * SIZE(BO)
ADD c09, t3, c09
LD b4, -1 * SIZE(BO)
MUL a2, b3, t3
LD b3, -2 * SIZE(BO)
ADD c13, t4, c13
MUL a2, b5, t4
LD a2, 0 * SIZE(AO)
bgt L, $L32
.align 4
$L35:
ADD c01, t1, c01
MUL a1, b1, t1
#if defined(LT) || defined(RN)
blbs KK, $L37
#else
blbs TMP1, $L37
#endif
.align 4
ADD c05, t2, c05
LD b1, 0 * SIZE(BO)
MUL a1, b2, t2
LD b2, 1 * SIZE(BO)
ADD c09, t3, c09
MUL a1, b3, t3
LD b3, 2 * SIZE(BO)
ADD c13, t4, c13
MUL a1, b4, t4
LD a1, 0 * SIZE(AO)
lda AO, 1 * SIZE(AO)
ADD c01, t1, c01
LD b4, 3 * SIZE(BO)
MUL a1, b1, t1
lda BO, 4 * SIZE(BO)
.align 4
$L37:
ADD c05, t2, c05
MUL a1, b2, t2
ADD c09, t3, c09
MUL a1, b3, t3
ADD c13, t4, c13
lda AO, 1 * SIZE(AO)
MUL a1, b4, t4
lda BO, 4 * SIZE(BO)
ADD c01, t1, c01
ADD c05, t2, c05
ADD c09, t3, c09
ADD c13, t4, c13
$L38:
#if defined(LN) || defined(RT)
#ifdef LN
subq KK, 1, TMP1
#else
subq KK, 4, TMP1
#endif
sll TMP1, BASE_SHIFT + 0, TMP2
addq AORIG, TMP2, AO
sll TMP1, BASE_SHIFT + 2, TMP2
addq B, TMP2, BO
#else
lda AO, -1 * SIZE(AO)
lda BO, -4 * SIZE(BO)
#endif
#if defined(LN) || defined(LT)
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
SUB a1, c01, c01
SUB a2, c05, c05
SUB a3, c09, c09
SUB a4, c13, c13
#else
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
SUB a1, c01, c01
SUB a2, c05, c05
SUB a3, c09, c09
SUB a4, c13, c13
#endif
#if defined(LN) || defined(LT)
LD a1, 0 * SIZE(AO)
MUL a1, c01, c01
MUL a1, c05, c05
MUL a1, c09, c09
MUL a1, c13, c13
#endif
#ifdef RN
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
MUL a1, c01, c01
MUL a2, c01, t1
SUB c05, t1, c05
MUL a3, c01, t1
SUB c09, t1, c09
MUL a4, c01, t1
SUB c13, t1, c13
LD b1, 5 * SIZE(BO)
LD b2, 6 * SIZE(BO)
LD b3, 7 * SIZE(BO)
MUL b1, c05, c05
MUL b2, c05, t1
SUB c09, t1, c09
MUL b3, c05, t1
SUB c13, t1, c13
LD a1, 10 * SIZE(BO)
LD a2, 11 * SIZE(BO)
LD a3, 15 * SIZE(BO)
MUL a1, c09, c09
MUL a2, c09, t1
SUB c13, t1, c13
MUL a3, c13, c13
#endif
#ifdef RT
LD a1, 15 * SIZE(BO)
LD a2, 14 * SIZE(BO)
LD a3, 13 * SIZE(BO)
LD a4, 12 * SIZE(BO)
MUL a1, c13, c13
MUL a2, c13, t1
SUB c09, t1, c09
MUL a3, c13, t1
SUB c05, t1, c05
MUL a4, c13, t1
SUB c01, t1, c01
LD b1, 10 * SIZE(BO)
LD b2, 9 * SIZE(BO)
LD b3, 8 * SIZE(BO)
MUL b1, c09, c09
MUL b2, c09, t1
SUB c05, t1, c05
MUL b3, c09, t1
SUB c01, t1, c01
LD a1, 5 * SIZE(BO)
LD a2, 4 * SIZE(BO)
LD a3, 0 * SIZE(BO)
MUL a1, c05, c05
MUL a2, c05, t1
SUB c01, t1, c01
MUL a3, c01, c01
#endif
#if defined(LN) || defined(LT)
ST c01, 0 * SIZE(BO)
ST c05, 1 * SIZE(BO)
ST c09, 2 * SIZE(BO)
ST c13, 3 * SIZE(BO)
#else
ST c01, 0 * SIZE(AO)
ST c05, 1 * SIZE(AO)
ST c09, 2 * SIZE(AO)
ST c13, 3 * SIZE(AO)
#endif
/* ---- Epilogue of the 1-row tile of a 4-column panel ----
   Writes the solved 1x4 tile (c01, c05, c09, c13) to C, then prepares
   the state needed by the following 2-row tile ($L20). */
#ifdef LN
/* LN walks rows backwards: step the column pointers back before storing. */
lda C1, -1 * SIZE(C1)
lda C2, -1 * SIZE(C2)
lda C3, -1 * SIZE(C3)
lda C4, -1 * SIZE(C4)
#endif
ST c01, 0 * SIZE(C1)
ST c05, 0 * SIZE(C2)
ST c09, 0 * SIZE(C3)
ST c13, 0 * SIZE(C4)
/* NOTE(review): unlike the 2-row and 4-row tiles, the non-LN builds do
   not advance C1-C4 here; confirm whether this file is ever assembled
   without LN before relying on those paths. */
/* Clear the product temporaries.  The next tile's loop starts with
   ADD cXX, tN, cXX on freshly cleared accumulators, so t1-t4 must be
   zero here.  Without this, whenever K - KK > 0 for this tile (or t1
   was used as scratch by the RN/RT solve) the last products of this
   tile would be added into the next tile's result.  The other tiles
   already clear t1-t4 at this point. */
fclr t1
fclr t2
fclr t3
fclr t4
#ifdef RT
/* RT: move AORIG forward past this 1-row slice of A (K elements). */
sll K, 0 + BASE_SHIFT, TMP1
addq AORIG, TMP1, AORIG
#endif
#if defined(LT) || defined(RN)
/* LT/RN: skip the remaining K - KK entries of this tile in A (1 per k)
   and in B (4 per k). */
subq K, KK, TMP1
sll TMP1, BASE_SHIFT + 0, TMP2
addq AO, TMP2, AO
sll TMP1, BASE_SHIFT + 2, TMP2
addq BO, TMP2, BO
#endif
#ifdef LT
addq KK, 1, KK
#endif
#ifdef LN
subq KK, 1, KK
#endif
.align 4
$L20:
and M, 2, I
ble I, $L30
#if defined(LT) || defined(RN)
LD a1, 0 * SIZE(AO)
fclr c09
LD a2, 1 * SIZE(AO)
fclr c13
LD a3, 2 * SIZE(AO)
fclr c10
LD a4, 3 * SIZE(AO)
fclr c14
LD b1, 0 * SIZE(B)
lda L, -2(KK)
LD b2, 1 * SIZE(B)
lda AO, 2 * SIZE(AO)
LD b3, 2 * SIZE(B)
fclr c01
LD b4, 3 * SIZE(B)
fclr c05
lda BO, 4 * SIZE(B)
fclr c02
fclr c06
ble KK, $L28
ble L, $L25
#else
#ifdef LN
sll K, BASE_SHIFT + 1, TMP1
subq AORIG, TMP1, AORIG
#endif
sll KK, BASE_SHIFT + 1, TMP1
addq AORIG, TMP1, AO
sll KK, BASE_SHIFT + 2, TMP2
addq B, TMP2, BO
subq K, KK, TMP1
LD a1, 0 * SIZE(AO)
fclr c09
LD a2, 1 * SIZE(AO)
fclr c13
LD a3, 2 * SIZE(AO)
fclr c10
LD a4, 3 * SIZE(AO)
fclr c14
LD b1, 0 * SIZE(BO)
lda L, -2(TMP1)
LD b2, 1 * SIZE(BO)
lda AO, 2 * SIZE(AO)
LD b3, 2 * SIZE(BO)
fclr c01
LD b4, 3 * SIZE(BO)
fclr c05
lda BO, 4 * SIZE(BO)
fclr c02
fclr c06
ble TMP1, $L28
ble L, $L25
#endif
.align 4
$L22:
ADD c09, t1, c09
unop
MUL a1, b1, t1
unop
ADD c10, t2, c10
unop
MUL a2, b1, t2
LD b1, 0 * SIZE(BO)
ADD c13, t3, c13
unop
MUL a1, b2, t3
lda BO, 8 * SIZE(BO)
ADD c14, t4, c14
unop
MUL a2, b2, t4
LD b2, -7 * SIZE(BO)
ADD c01, t1, c01
unop
MUL a1, b3, t1
unop
ADD c02, t2, c02
unop
MUL a2, b3, t2
LD b3, -6 * SIZE(BO)
ADD c05, t3, c05
unop
MUL a1, b4, t3
LD a1, 2 * SIZE(AO)
ADD c06, t4, c06
MUL a2, b4, t4
LD b5, -5 * SIZE(BO)
ADD c09, t1, c09
unop
MUL a3, b1, t1
LD a2, 3 * SIZE(AO)
ADD c10, t2, c10
unop
MUL a4, b1, t2
LD b1, -4 * SIZE(BO)
ADD c13, t3, c13
unop
MUL a3, b2, t3
lda AO, 4 * SIZE(AO)
ADD c14, t4, c14
MUL a4, b2, t4
LD b2, -3 * SIZE(BO)
ADD c01, t1, c01
lda L, -2(L)
MUL a3, b3, t1
LD b4, -1 * SIZE(BO)
ADD c02, t2, c02
unop
MUL a4, b3, t2
LD b3, -2 * SIZE(BO)
ADD c05, t3, c05
unop
MUL a3, b5, t3
LD a3, 0 * SIZE(AO)
ADD c06, t4, c06
MUL a4, b5, t4
LD a4, 1 * SIZE(AO)
bgt L, $L22
.align 4
$L25:
ADD c09, t1, c09
MUL a1, b1, t1
#if defined(LT) || defined(RN)
blbs KK, $L27
#else
blbs TMP1, $L27
#endif
ADD c10, t2, c10
unop
MUL a2, b1, t2
LD b1, 0 * SIZE(BO)
ADD c13, t3, c13
unop
MUL a1, b2, t3
unop
ADD c14, t4, c14
unop
MUL a2, b2, t4
LD b2, 1 * SIZE(BO)
ADD c01, t1, c01
unop
MUL a1, b3, t1
lda AO, 2 * SIZE(AO)
ADD c02, t2, c02
unop
MUL a2, b3, t2
LD b3, 2 * SIZE(BO)
ADD c05, t3, c05
unop
MUL a1, b4, t3
LD a1, -2 * SIZE(AO)
ADD c06, t4, c06
unop
MUL a2, b4, t4
LD a2, -1 * SIZE(AO)
ADD c09, t1, c09
LD b4, 3 * SIZE(BO)
MUL a1, b1, t1
lda BO, 4 * SIZE(BO)
.align 4
$L27:
ADD c10, t2, c10
MUL a2, b1, t2
ADD c13, t3, c13
MUL a1, b2, t3
ADD c14, t4, c14
MUL a2, b2, t4
ADD c01, t1, c01
MUL a1, b3, t1
ADD c02, t2, c02
MUL a2, b3, t2
ADD c05, t3, c05
MUL a1, b4, t3
ADD c06, t4, c06
lda AO, 2 * SIZE(AO)
MUL a2, b4, t4
lda BO, 4 * SIZE(BO)
ADD c09, t1, c09
ADD c10, t2, c10
ADD c13, t3, c13
ADD c14, t4, c14
.align 4
$L28:
#if defined(LN) || defined(RT)
#ifdef LN
subq KK, 2, TMP1
#else
subq KK, 4, TMP1
#endif
sll TMP1, BASE_SHIFT + 1, TMP2
addq AORIG, TMP2, AO
sll TMP1, BASE_SHIFT + 2, TMP2
addq B, TMP2, BO
#else
lda AO, -2 * SIZE(AO)
lda BO, -4 * SIZE(BO)
#endif
#if defined(LN) || defined(LT)
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
LD b1, 4 * SIZE(BO)
LD b2, 5 * SIZE(BO)
LD b3, 6 * SIZE(BO)
LD b4, 7 * SIZE(BO)
SUB a1, c01, c01
SUB a2, c05, c05
SUB a3, c09, c09
SUB a4, c13, c13
SUB b1, c02, c02
SUB b2, c06, c06
SUB b3, c10, c10
SUB b4, c14, c14
#else
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
LD b1, 4 * SIZE(AO)
LD b2, 5 * SIZE(AO)
LD b3, 6 * SIZE(AO)
LD b4, 7 * SIZE(AO)
SUB a1, c01, c01
SUB a2, c02, c02
SUB a3, c05, c05
SUB a4, c06, c06
SUB b1, c09, c09
SUB b2, c10, c10
SUB b3, c13, c13
SUB b4, c14, c14
#endif
#ifdef LN
LD a1, 3 * SIZE(AO)
LD a2, 2 * SIZE(AO)
LD a3, 0 * SIZE(AO)
MUL a1, c02, c02
MUL a1, c06, c06
MUL a1, c10, c10
MUL a1, c14, c14
MUL a2, c02, t1
MUL a2, c06, t2
MUL a2, c10, t3
MUL a2, c14, t4
SUB c01, t1, c01
SUB c05, t2, c05
SUB c09, t3, c09
SUB c13, t4, c13
MUL a3, c01, c01
MUL a3, c05, c05
MUL a3, c09, c09
MUL a3, c13, c13
#endif
#ifdef LT
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 3 * SIZE(AO)
MUL a1, c01, c01
MUL a1, c05, c05
MUL a1, c09, c09
MUL a1, c13, c13
MUL a2, c01, t1
MUL a2, c05, t2
MUL a2, c09, t3
MUL a2, c13, t4
SUB c02, t1, c02
SUB c06, t2, c06
SUB c10, t3, c10
SUB c14, t4, c14
MUL a3, c02, c02
MUL a3, c06, c06
MUL a3, c10, c10
MUL a3, c14, c14
#endif
#ifdef RN
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
MUL a1, c01, c01
MUL a1, c02, c02
MUL a2, c01, t1
MUL a2, c02, t2
SUB c05, t1, c05
SUB c06, t2, c06
MUL a3, c01, t1
MUL a3, c02, t2
SUB c09, t1, c09
SUB c10, t2, c10
MUL a4, c01, t1
MUL a4, c02, t2
SUB c13, t1, c13
SUB c14, t2, c14
LD b1, 5 * SIZE(BO)
LD b2, 6 * SIZE(BO)
LD b3, 7 * SIZE(BO)
MUL b1, c05, c05
MUL b1, c06, c06
MUL b2, c05, t1
MUL b2, c06, t2
SUB c09, t1, c09
SUB c10, t2, c10
MUL b3, c05, t1
MUL b3, c06, t2
SUB c13, t1, c13
SUB c14, t2, c14
LD a1, 10 * SIZE(BO)
LD a2, 11 * SIZE(BO)
LD a3, 15 * SIZE(BO)
MUL a1, c09, c09
MUL a1, c10, c10
MUL a2, c09, t1
MUL a2, c10, t2
SUB c13, t1, c13
SUB c14, t2, c14
MUL a3, c13, c13
MUL a3, c14, c14
#endif
#ifdef RT
LD a1, 15 * SIZE(BO)
LD a2, 14 * SIZE(BO)
LD a3, 13 * SIZE(BO)
LD a4, 12 * SIZE(BO)
MUL a1, c13, c13
MUL a1, c14, c14
MUL a2, c13, t1
MUL a2, c14, t2
SUB c09, t1, c09
SUB c10, t2, c10
MUL a3, c13, t1
MUL a3, c14, t2
SUB c05, t1, c05
SUB c06, t2, c06
MUL a4, c13, t1
MUL a4, c14, t2
SUB c01, t1, c01
SUB c02, t2, c02
LD b1, 10 * SIZE(BO)
LD b2, 9 * SIZE(BO)
LD b3, 8 * SIZE(BO)
MUL b1, c09, c09
MUL b1, c10, c10
MUL b2, c09, t1
MUL b2, c10, t2
SUB c05, t1, c05
SUB c06, t2, c06
MUL b3, c09, t1
MUL b3, c10, t2
SUB c01, t1, c01
SUB c02, t2, c02
LD a1, 5 * SIZE(BO)
LD a2, 4 * SIZE(BO)
LD a3, 0 * SIZE(BO)
MUL a1, c05, c05
MUL a1, c06, c06
MUL a2, c05, t1
MUL a2, c06, t2
SUB c01, t1, c01
SUB c02, t2, c02
MUL a3, c01, c01
MUL a3, c02, c02
#endif
#if defined(LN) || defined(LT)
ST c01, 0 * SIZE(BO)
ST c05, 1 * SIZE(BO)
ST c09, 2 * SIZE(BO)
ST c13, 3 * SIZE(BO)
ST c02, 4 * SIZE(BO)
ST c06, 5 * SIZE(BO)
ST c10, 6 * SIZE(BO)
ST c14, 7 * SIZE(BO)
#else
ST c01, 0 * SIZE(AO)
ST c02, 1 * SIZE(AO)
ST c05, 2 * SIZE(AO)
ST c06, 3 * SIZE(AO)
ST c09, 4 * SIZE(AO)
ST c10, 5 * SIZE(AO)
ST c13, 6 * SIZE(AO)
ST c14, 7 * SIZE(AO)
#endif
#ifdef LN
lda C1, -2 * SIZE(C1)
lda C2, -2 * SIZE(C2)
lda C3, -2 * SIZE(C3)
lda C4, -2 * SIZE(C4)
#endif
ST c01, 0 * SIZE(C1)
ST c02, 1 * SIZE(C1)
ST c05, 0 * SIZE(C2)
ST c06, 1 * SIZE(C2)
ST c09, 0 * SIZE(C3)
ST c10, 1 * SIZE(C3)
ST c13, 0 * SIZE(C4)
ST c14, 1 * SIZE(C4)
#ifndef LN
lda C1, 2 * SIZE(C1)
lda C2, 2 * SIZE(C2)
lda C3, 2 * SIZE(C3)
lda C4, 2 * SIZE(C4)
#endif
fclr t1
fclr t2
fclr t3
fclr t4
#ifdef RT
sll K, 1 + BASE_SHIFT, TMP1
addq AORIG, TMP1, AORIG
#endif
#if defined(LT) || defined(RN)
subq K, KK, TMP1
sll TMP1, BASE_SHIFT + 1, TMP2
addq AO, TMP2, AO
sll TMP1, BASE_SHIFT + 2, TMP2
addq BO, TMP2, BO
#endif
#ifdef LT
addq KK, 2, KK
#endif
#ifdef LN
subq KK, 2, KK
#endif
.align 4
$L30:
sra M, 2, I
ble I, $L39
.align 4
$L11:
#if defined(LT) || defined(RN)
LD a1, 0 * SIZE(AO)
fclr c11
LD a2, 1 * SIZE(AO)
fclr c12
LD a3, 2 * SIZE(AO)
fclr c16
LD a4, 3 * SIZE(AO)
fclr c15
LD b1, 0 * SIZE(B)
fclr c01
LD b2, 1 * SIZE(B)
fclr c02
LD b3, 2 * SIZE(B)
fclr c06
LD b4, 3 * SIZE(B)
fclr c05
lds $f31, 4 * SIZE(C1)
fclr c03
lda L, -2(KK)
fclr c04
lds $f31, 7 * SIZE(C2)
fclr c08
lda BO, 4 * SIZE(B)
fclr c13
lds $f31, 4 * SIZE(C3)
fclr c09
lda AO, 4 * SIZE(AO)
fclr c10
lds $f31, 7 * SIZE(C4)
fclr c14
fclr c07
ble KK, $L18
#else
#ifdef LN
sll K, BASE_SHIFT + 2, TMP1
subq AORIG, TMP1, AORIG
#endif
sll KK, BASE_SHIFT + 2, TMP1
addq AORIG, TMP1, AO
addq B, TMP1, BO
subq K, KK, TMP1
LD a1, 0 * SIZE(AO)
fclr c11
LD a2, 1 * SIZE(AO)
fclr c12
LD a3, 2 * SIZE(AO)
fclr c16
LD a4, 3 * SIZE(AO)
fclr c15
LD b1, 0 * SIZE(BO)
fclr c01
LD b2, 1 * SIZE(BO)
fclr c02
LD b3, 2 * SIZE(BO)
fclr c06
LD b4, 3 * SIZE(BO)
fclr c05
lds $f31, 4 * SIZE(C1)
fclr c03
lda L, -2(TMP1)
fclr c04
lds $f31, 7 * SIZE(C2)
fclr c08
lda BO, 4 * SIZE(BO)
fclr c13
lds $f31, 4 * SIZE(C3)
fclr c09
lda AO, 4 * SIZE(AO)
fclr c10
lds $f31, 7 * SIZE(C4)
fclr c14
fclr c07
ble TMP1, $L18
#endif
ble L, $L15
.align 5
$L12:
/* 1 */
ADD c11, t1, c11
#ifndef EV4
ldq $31, PREFETCHSIZE * SIZE(AO)
#else
unop
#endif
MUL b1, a1, t1
#ifndef EV4
ldl $31, PREFETCHSIZE * SIZE(BO)
#else
unop
#endif
ADD c12, t2, c12
unop
MUL b1, a2, t2
unop
ADD c16, t3, c16
unop
MUL b2, a2, t3
LD a5, 0 * SIZE(AO)
ADD c15, t4, c15
unop
MUL b2, a1, t4
LD b5, 0 * SIZE(BO)
/* 2 */
ADD c01, t1, c01
UNOP
MUL b1, a3, t1
UNOP
ADD c02, t2, c02
UNOP
MUL b1, a4, t2
UNOP
ADD c06, t3, c06
unop
MUL b2, a4, t3
unop
ADD c05, t4, c05
unop
MUL b4, a1, t4
unop
/* 3 */
ADD c03, t1, c03
unop
MUL b3, a1, t1
unop
ADD c04, t2, c04
unop
MUL b3, a2, t2
unop
ADD c08, t3, c08
unop
MUL b4, a2, t3
LD a2, 1 * SIZE(AO)
ADD c13, t4, c13
unop
MUL b2, a3, t4
LD b2, 1 * SIZE(BO)
/* 4 */
ADD c09, t1, c09
unop
MUL b3, a3, t1
LD a6, 2 * SIZE(AO)
ADD c10, t2, c10
unop
MUL b3, a4, t2
LD b3, 2 * SIZE(BO)
ADD c14, t3, c14
unop
MUL b4, a4, t3
LD a4, 3 * SIZE(AO)
ADD c07, t4, c07
unop
MUL b4, a3, t4
LD b4, 3 * SIZE(BO)
/* 5 */
ADD c11, t1, c11
unop
MUL b5, a5, t1
LD a1, 4 * SIZE(AO)
ADD c12, t2, c12
lda L, -2(L)
MUL b5, a2, t2
LD b1, 4 * SIZE(BO)
ADD c16, t3, c16
unop
MUL b2, a2, t3
unop
ADD c15, t4, c15
unop
MUL b2, a5, t4
unop
/* 6 */
ADD c01, t1, c01
unop
MUL b5, a6, t1
unop
ADD c02, t2, c02
unop
MUL b5, a4, t2
unop
ADD c06, t3, c06
unop
MUL b2, a4, t3
unop
ADD c05, t4, c05
unop
MUL b4, a5, t4
unop
/* 7 */
ADD c03, t1, c03
lda AO, 8 * SIZE(AO)
MUL b3, a5, t1
unop
ADD c04, t2, c04
lda BO, 8 * SIZE(BO)
MUL b3, a2, t2
unop
ADD c08, t3, c08
unop
MUL b4, a2, t3
LD a2, -3 * SIZE(AO)
ADD c13, t4, c13
unop
MUL b2, a6, t4
LD b2, -3 * SIZE(BO)
/* 8 */
ADD c09, t1, c09
unop
MUL b3, a6, t1
LD a3, -2 * SIZE(AO)
ADD c10, t2, c10
unop
MUL b3, a4, t2
LD b3, -2 * SIZE(BO)
ADD c14, t3, c14
unop
MUL b4, a4, t3
LD a4, -1 * SIZE(AO)
ADD c07, t4, c07
MUL b4, a6, t4
LD b4, -1 * SIZE(BO)
bgt L, $L12
.align 4
$L15:
ADD c11, t1, c11
MUL b1, a1, t1
#if defined(LT) || defined(RN)
blbs KK, $L17
#else
blbs TMP1, $L17
#endif
.align 4
ADD c12, t2, c12
MUL b1, a2, t2
ADD c16, t3, c16
MUL b2, a2, t3
ADD c15, t4, c15
MUL b2, a1, t4
ADD c01, t1, c01
MUL b1, a3, t1
ADD c02, t2, c02
unop
MUL b1, a4, t2
LD b1, 0 * SIZE(BO)
ADD c06, t3, c06
MUL b2, a4, t3
ADD c05, t4, c05
MUL b4, a1, t4
ADD c03, t1, c03
unop
MUL b3, a1, t1
LD a1, 0 * SIZE(AO)
ADD c04, t2, c04
unop
MUL b3, a2, t2
unop
ADD c08, t3, c08
unop
MUL b4, a2, t3
LD a2, 1 * SIZE(AO)
ADD c13, t4, c13
unop
MUL b2, a3, t4
LD b2, 1 * SIZE(BO)
ADD c09, t1, c09
unop
MUL b3, a3, t1
lda AO, 4 * SIZE(AO)
ADD c10, t2, c10
unop
MUL b3, a4, t2
LD b3, 2 * SIZE(BO)
ADD c14, t3, c14
unop
MUL b4, a4, t3
LD a4, -1 * SIZE(AO)
ADD c07, t4, c07
unop
MUL b4, a3, t4
LD a3, -2 * SIZE(AO)
ADD c11, t1, c11
LD b4, 3 * SIZE(BO)
MUL b1, a1, t1
lda BO, 4 * SIZE(BO)
.align 4
$L17:
ADD c12, t2, c12
MUL b1, a2, t2
ADD c16, t3, c16
MUL b2, a2, t3
ADD c15, t4, c15
MUL b2, a1, t4
ADD c01, t1, c01
MUL b1, a3, t1
ADD c02, t2, c02
MUL b1, a4, t2
ADD c06, t3, c06
MUL b2, a4, t3
ADD c05, t4, c05
MUL b4, a1, t4
ADD c03, t1, c03
MUL b3, a1, t1
ADD c04, t2, c04
MUL b3, a2, t2
ADD c08, t3, c08
MUL b4, a2, t3
ADD c13, t4, c13
MUL b2, a3, t4
ADD c09, t1, c09
MUL b3, a3, t1
ADD c10, t2, c10
MUL b3, a4, t2
ADD c14, t3, c14
MUL b4, a4, t3
ADD c07, t4, c07
lda AO, 4 * SIZE(AO)
MUL b4, a3, t4
lda BO, 4 * SIZE(BO)
ADD c11, t1, c11
ADD c12, t2, c12
ADD c16, t3, c16
ADD c15, t4, c15
.align 4
$L18:
#if defined(LN) || defined(RT)
#ifdef LN
subq KK, 4, TMP1
#else
subq KK, 4, TMP1
#endif
sll TMP1, BASE_SHIFT + 2, TMP2
addq AORIG, TMP2, AO
sll TMP1, BASE_SHIFT + 2, TMP2
addq B, TMP2, BO
#else
lda AO, -4 * SIZE(AO)
lda BO, -4 * SIZE(BO)
#endif
#if defined(LN) || defined(LT)
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
LD b1, 4 * SIZE(BO)
LD b2, 5 * SIZE(BO)
LD b3, 6 * SIZE(BO)
LD b4, 7 * SIZE(BO)
SUB a1, c01, c01
SUB a2, c05, c05
SUB a3, c09, c09
SUB a4, c13, c13
SUB b1, c02, c02
SUB b2, c06, c06
SUB b3, c10, c10
SUB b4, c14, c14
LD a1, 8 * SIZE(BO)
LD a2, 9 * SIZE(BO)
LD a3, 10 * SIZE(BO)
LD a4, 11 * SIZE(BO)
LD b1, 12 * SIZE(BO)
LD b2, 13 * SIZE(BO)
LD b3, 14 * SIZE(BO)
LD b4, 15 * SIZE(BO)
SUB a1, c03, c03
SUB a2, c07, c07
SUB a3, c11, c11
SUB a4, c15, c15
SUB b1, c04, c04
SUB b2, c08, c08
SUB b3, c12, c12
SUB b4, c16, c16
#else
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
LD b1, 4 * SIZE(AO)
LD b2, 5 * SIZE(AO)
LD b3, 6 * SIZE(AO)
LD b4, 7 * SIZE(AO)
SUB a1, c01, c01
SUB a2, c02, c02
SUB a3, c03, c03
SUB a4, c04, c04
SUB b1, c05, c05
SUB b2, c06, c06
SUB b3, c07, c07
SUB b4, c08, c08
LD a1, 8 * SIZE(AO)
LD a2, 9 * SIZE(AO)
LD a3, 10 * SIZE(AO)
LD a4, 11 * SIZE(AO)
LD b1, 12 * SIZE(AO)
LD b2, 13 * SIZE(AO)
LD b3, 14 * SIZE(AO)
LD b4, 15 * SIZE(AO)
SUB a1, c09, c09
SUB a2, c10, c10
SUB a3, c11, c11
SUB a4, c12, c12
SUB b1, c13, c13
SUB b2, c14, c14
SUB b3, c15, c15
SUB b4, c16, c16
#endif
#ifdef LN
LD a1, 15 * SIZE(AO)
LD a2, 14 * SIZE(AO)
LD a3, 13 * SIZE(AO)
LD a4, 12 * SIZE(AO)
MUL a1, c04, c04
MUL a1, c08, c08
MUL a1, c12, c12
MUL a1, c16, c16
MUL a2, c04, t1
MUL a2, c08, t2
MUL a2, c12, t3
MUL a2, c16, t4
SUB c03, t1, c03
SUB c07, t2, c07
SUB c11, t3, c11
SUB c15, t4, c15
MUL a3, c04, t1
MUL a3, c08, t2
MUL a3, c12, t3
MUL a3, c16, t4
SUB c02, t1, c02
SUB c06, t2, c06
SUB c10, t3, c10
SUB c14, t4, c14
MUL a4, c04, t1
MUL a4, c08, t2
MUL a4, c12, t3
MUL a4, c16, t4
SUB c01, t1, c01
SUB c05, t2, c05
SUB c09, t3, c09
SUB c13, t4, c13
LD b1, 10 * SIZE(AO)
LD b2, 9 * SIZE(AO)
LD b3, 8 * SIZE(AO)
MUL b1, c03, c03
MUL b1, c07, c07
MUL b1, c11, c11
MUL b1, c15, c15
MUL b2, c03, t1
MUL b2, c07, t2
MUL b2, c11, t3
MUL b2, c15, t4
SUB c02, t1, c02
SUB c06, t2, c06
SUB c10, t3, c10
SUB c14, t4, c14
MUL b3, c03, t1
MUL b3, c07, t2
MUL b3, c11, t3
MUL b3, c15, t4
SUB c01, t1, c01
SUB c05, t2, c05
SUB c09, t3, c09
SUB c13, t4, c13
LD a1, 5 * SIZE(AO)
LD a2, 4 * SIZE(AO)
LD a3, 0 * SIZE(AO)
MUL a1, c02, c02
MUL a1, c06, c06
MUL a1, c10, c10
MUL a1, c14, c14
MUL a2, c02, t1
MUL a2, c06, t2
MUL a2, c10, t3
MUL a2, c14, t4
SUB c01, t1, c01
SUB c05, t2, c05
SUB c09, t3, c09
SUB c13, t4, c13
MUL a3, c01, c01
MUL a3, c05, c05
MUL a3, c09, c09
MUL a3, c13, c13
#endif
#ifdef LT
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
MUL a1, c01, c01
MUL a1, c05, c05
MUL a1, c09, c09
MUL a1, c13, c13
MUL a2, c01, t1
MUL a2, c05, t2
MUL a2, c09, t3
MUL a2, c13, t4
SUB c02, t1, c02
SUB c06, t2, c06
SUB c10, t3, c10
SUB c14, t4, c14
MUL a3, c01, t1
MUL a3, c05, t2
MUL a3, c09, t3
MUL a3, c13, t4
SUB c03, t1, c03
SUB c07, t2, c07
SUB c11, t3, c11
SUB c15, t4, c15
MUL a4, c01, t1
MUL a4, c05, t2
MUL a4, c09, t3
MUL a4, c13, t4
SUB c04, t1, c04
SUB c08, t2, c08
SUB c12, t3, c12
SUB c16, t4, c16
LD b1, 5 * SIZE(AO)
LD b2, 6 * SIZE(AO)
LD b3, 7 * SIZE(AO)
MUL b1, c02, c02
MUL b1, c06, c06
MUL b1, c10, c10
MUL b1, c14, c14
MUL b2, c02, t1
MUL b2, c06, t2
MUL b2, c10, t3
MUL b2, c14, t4
SUB c03, t1, c03
SUB c07, t2, c07
SUB c11, t3, c11
SUB c15, t4, c15
MUL b3, c02, t1
MUL b3, c06, t2
MUL b3, c10, t3
MUL b3, c14, t4
SUB c04, t1, c04
SUB c08, t2, c08
SUB c12, t3, c12
SUB c16, t4, c16
LD a1, 10 * SIZE(AO)
LD a2, 11 * SIZE(AO)
LD a3, 15 * SIZE(AO)
MUL a1, c03, c03
MUL a1, c07, c07
MUL a1, c11, c11
MUL a1, c15, c15
MUL a2, c03, t1
MUL a2, c07, t2
MUL a2, c11, t3
MUL a2, c15, t4
SUB c04, t1, c04
SUB c08, t2, c08
SUB c12, t3, c12
SUB c16, t4, c16
MUL a3, c04, c04
MUL a3, c08, c08
MUL a3, c12, c12
MUL a3, c16, c16
#endif
#ifdef RN
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
MUL a1, c01, c01
MUL a1, c02, c02
MUL a1, c03, c03
MUL a1, c04, c04
MUL a2, c01, t1
MUL a2, c02, t2
MUL a2, c03, t3
MUL a2, c04, t4
SUB c05, t1, c05
SUB c06, t2, c06
SUB c07, t3, c07
SUB c08, t4, c08
MUL a3, c01, t1
MUL a3, c02, t2
MUL a3, c03, t3
MUL a3, c04, t4
SUB c09, t1, c09
SUB c10, t2, c10
SUB c11, t3, c11
SUB c12, t4, c12
MUL a4, c01, t1
MUL a4, c02, t2
MUL a4, c03, t3
MUL a4, c04, t4
SUB c13, t1, c13
SUB c14, t2, c14
SUB c15, t3, c15
SUB c16, t4, c16
LD b1, 5 * SIZE(BO)
LD b2, 6 * SIZE(BO)
LD b3, 7 * SIZE(BO)
MUL b1, c05, c05
MUL b1, c06, c06
MUL b1, c07, c07
MUL b1, c08, c08
MUL b2, c05, t1
MUL b2, c06, t2
MUL b2, c07, t3
MUL b2, c08, t4
SUB c09, t1, c09
SUB c10, t2, c10
SUB c11, t3, c11
SUB c12, t4, c12
MUL b3, c05, t1
MUL b3, c06, t2
MUL b3, c07, t3
MUL b3, c08, t4
SUB c13, t1, c13
SUB c14, t2, c14
SUB c15, t3, c15
SUB c16, t4, c16
LD a1, 10 * SIZE(BO)
LD a2, 11 * SIZE(BO)
LD a3, 15 * SIZE(BO)
MUL a1, c09, c09
MUL a1, c10, c10
MUL a1, c11, c11
MUL a1, c12, c12
MUL a2, c09, t1
MUL a2, c10, t2
MUL a2, c11, t3
MUL a2, c12, t4
SUB c13, t1, c13
SUB c14, t2, c14
SUB c15, t3, c15
SUB c16, t4, c16
MUL a3, c13, c13
MUL a3, c14, c14
MUL a3, c15, c15
MUL a3, c16, c16
#endif
#ifdef RT
LD a1, 15 * SIZE(BO)
LD a2, 14 * SIZE(BO)
LD a3, 13 * SIZE(BO)
LD a4, 12 * SIZE(BO)
MUL a1, c13, c13
MUL a1, c14, c14
MUL a1, c15, c15
MUL a1, c16, c16
MUL a2, c13, t1
MUL a2, c14, t2
MUL a2, c15, t3
MUL a2, c16, t4
SUB c09, t1, c09
SUB c10, t2, c10
SUB c11, t3, c11
SUB c12, t4, c12
MUL a3, c13, t1
MUL a3, c14, t2
MUL a3, c15, t3
MUL a3, c16, t4
SUB c05, t1, c05
SUB c06, t2, c06
SUB c07, t3, c07
SUB c08, t4, c08
MUL a4, c13, t1
MUL a4, c14, t2
MUL a4, c15, t3
MUL a4, c16, t4
SUB c01, t1, c01
SUB c02, t2, c02
SUB c03, t3, c03
SUB c04, t4, c04
LD b1, 10 * SIZE(BO)
LD b2, 9 * SIZE(BO)
LD b3, 8 * SIZE(BO)
MUL b1, c09, c09
MUL b1, c10, c10
MUL b1, c11, c11
MUL b1, c12, c12
MUL b2, c09, t1
MUL b2, c10, t2
MUL b2, c11, t3
MUL b2, c12, t4
SUB c05, t1, c05
SUB c06, t2, c06
SUB c07, t3, c07
SUB c08, t4, c08
MUL b3, c09, t1
MUL b3, c10, t2
MUL b3, c11, t3
MUL b3, c12, t4
SUB c01, t1, c01
SUB c02, t2, c02
SUB c03, t3, c03
SUB c04, t4, c04
LD a1, 5 * SIZE(BO)
LD a2, 4 * SIZE(BO)
LD a3, 0 * SIZE(BO)
MUL a1, c05, c05
MUL a1, c06, c06
MUL a1, c07, c07
MUL a1, c08, c08
MUL a2, c05, t1
MUL a2, c06, t2
MUL a2, c07, t3
MUL a2, c08, t4
SUB c01, t1, c01
SUB c02, t2, c02
SUB c03, t3, c03
SUB c04, t4, c04
MUL a3, c01, c01
MUL a3, c02, c02
MUL a3, c03, c03
MUL a3, c04, c04
#endif
#if defined(LN) || defined(LT)
ST c01, 0 * SIZE(BO)
ST c05, 1 * SIZE(BO)
ST c09, 2 * SIZE(BO)
ST c13, 3 * SIZE(BO)
ST c02, 4 * SIZE(BO)
ST c06, 5 * SIZE(BO)
ST c10, 6 * SIZE(BO)
ST c14, 7 * SIZE(BO)
ST c03, 8 * SIZE(BO)
ST c07, 9 * SIZE(BO)
ST c11, 10 * SIZE(BO)
ST c15, 11 * SIZE(BO)
ST c04, 12 * SIZE(BO)
ST c08, 13 * SIZE(BO)
ST c12, 14 * SIZE(BO)
ST c16, 15 * SIZE(BO)
#else
ST c01, 0 * SIZE(AO)
ST c02, 1 * SIZE(AO)
ST c03, 2 * SIZE(AO)
ST c04, 3 * SIZE(AO)
ST c05, 4 * SIZE(AO)
ST c06, 5 * SIZE(AO)
ST c07, 6 * SIZE(AO)
ST c08, 7 * SIZE(AO)
ST c09, 8 * SIZE(AO)
ST c10, 9 * SIZE(AO)
ST c11, 10 * SIZE(AO)
ST c12, 11 * SIZE(AO)
ST c13, 12 * SIZE(AO)
ST c14, 13 * SIZE(AO)
ST c15, 14 * SIZE(AO)
ST c16, 15 * SIZE(AO)
#endif
#ifdef LN
lda C1, -4 * SIZE(C1)
lda C2, -4 * SIZE(C2)
lda C3, -4 * SIZE(C3)
lda C4, -4 * SIZE(C4)
#endif
ST c01, 0 * SIZE(C1)
ST c02, 1 * SIZE(C1)
ST c03, 2 * SIZE(C1)
ST c04, 3 * SIZE(C1)
ST c05, 0 * SIZE(C2)
ST c06, 1 * SIZE(C2)
ST c07, 2 * SIZE(C2)
ST c08, 3 * SIZE(C2)
ST c09, 0 * SIZE(C3)
ST c10, 1 * SIZE(C3)
ST c11, 2 * SIZE(C3)
ST c12, 3 * SIZE(C3)
ST c13, 0 * SIZE(C4)
ST c14, 1 * SIZE(C4)
ST c15, 2 * SIZE(C4)
ST c16, 3 * SIZE(C4)
#ifndef LN
lda C1, 4 * SIZE(C1)
lda C2, 4 * SIZE(C2)
lda C3, 4 * SIZE(C3)
lda C4, 4 * SIZE(C4)
#endif
fclr t1
fclr t2
fclr t3
fclr t4
#ifdef RT
sll K, 2 + BASE_SHIFT, TMP1
addq AORIG, TMP1, AORIG
#endif
#if defined(LT) || defined(RN)
subq K, KK, TMP1
sll TMP1, BASE_SHIFT + 2, TMP1
addq AO, TMP1, AO
addq BO, TMP1, BO
#endif
#ifdef LT
addq KK, 4, KK
#endif
#ifdef LN
subq KK, 4, KK
#endif
lda I, -1(I)
bgt I, $L11
.align 4
/* ---- End of one 4-column panel: advance B / KK and loop on J ---- */
$L39:
#ifdef LN
/* LN: B is not moved by the tiles; step it past this panel
   (K << (2 + BASE_SHIFT) bytes, i.e. 4 values per k). */
sll K, 2 + BASE_SHIFT, TMP1
addq B, TMP1, B
#endif
#if defined(LT) || defined(RN)
/* LT/RN: BO already points just past this panel's data. */
mov BO, B
#endif
#ifdef RN
/* RN/RT track KK per column panel: four columns were just solved. */
addq KK, 4, KK
#endif
#ifdef RT
subq KK, 4, KK
#endif
lda J, -1(J)
bgt J, $L01
.align 4
/* ---- 2-column remainder panel (taken when bit 1 of N is set) ---- */
$L40:
and N, 2, J
ble J, $L80
#ifdef RT
/* RT: step B back by one 2-column panel (K << (1 + BASE_SHIFT) bytes)
   and C back by two columns (2 * LDC bytes). */
sll K, 1 + BASE_SHIFT, TMP1
subq B, TMP1, B
addq LDC, LDC, TMP1
subq C, TMP1, C
#endif
/* C1, C2 = the two columns of C covered by this panel. */
mov C, C1
addq C, LDC, C2
/* t1-t4 must be zero before the first inner loop of the panel. */
fclr t1
#ifndef RT
/* Forward variants: advance C to the next panel (C = C2 + LDC). */
addq C2, LDC, C
#endif
fclr t2
#ifdef LN
/* LN: KK starts at M + OFFSET and is decremented after each row tile. */
addq M, OFFSET, KK
#endif
#ifdef LT
/* LT: KK starts at OFFSET and is incremented after each row tile. */
mov OFFSET, KK
#endif
#if defined(LN) || defined(RT)
/* LN/RT recompute AO from AORIG and KK for every tile. */
mov A, AORIG
#else
/* LT/RN stream through A with a running pointer. */
mov A, AO
#endif
fclr t3
fclr t4
/* First row tile: a single remaining row (M odd); otherwise go to the
   2-row tile at $L60. */
and M, 1, I
ble I, $L60
#if defined(LT) || defined(RN)
LD a1, 0 * SIZE(AO)
fclr c01
LD a2, 1 * SIZE(AO)
fclr c05
LD b1, 0 * SIZE(B)
fclr c02
LD b2, 1 * SIZE(B)
fclr c06
lda L, -2(KK)
LD b3, 2 * SIZE(B)
lda AO, 1 * SIZE(AO)
LD b4, 3 * SIZE(B)
lda BO, 2 * SIZE(B)
ble KK, $L78
ble L, $L75
#else
#ifdef LN
sll K, BASE_SHIFT + 0, TMP1
subq AORIG, TMP1, AORIG
#endif
sll KK, BASE_SHIFT + 0, TMP1
addq AORIG, TMP1, AO
sll KK, BASE_SHIFT + 1, TMP1
addq B, TMP1, BO
subq K, KK, TMP1
LD a1, 0 * SIZE(AO)
fclr c01
LD a2, 1 * SIZE(AO)
fclr c05
LD b1, 0 * SIZE(BO)
fclr c02
LD b2, 1 * SIZE(BO)
fclr c06
lda L, -2(TMP1)
LD b3, 2 * SIZE(BO)
lda AO, 1 * SIZE(AO)
LD b4, 3 * SIZE(BO)
lda BO, 2 * SIZE(BO)
ble TMP1, $L78
ble L, $L75
#endif
.align 4
$L72:
ADD c01, t1, c01
lda L, -2(L)
MUL a1, b1, t1
LD b1, 2 * SIZE(BO)
ADD c05, t2, c05
MUL a1, b2, t2
LD a1, 1 * SIZE(AO)
LD b2, 3 * SIZE(BO)
ADD c02, t3, c02
lda AO, 2 * SIZE(AO)
MUL a2, b3, t3
LD b3, 4 * SIZE(BO)
ADD c06, t4, c06
MUL a2, b4, t4
LD a2, 0 * SIZE(AO)
LD b4, 5 * SIZE(BO)
lda BO, 4 * SIZE(BO)
unop
unop
bgt L, $L72
.align 4
$L75:
ADD c01, t1, c01
MUL a1, b1, t1
#if defined(LT) || defined(RN)
blbs KK, $L77
#else
blbs TMP1, $L77
#endif
.align 4
ADD c05, t2, c05
MUL a1, b2, t2
LD a1, 0 * SIZE(AO)
LD b1, 0 * SIZE(BO)
ADD c01, t1, c01
LD b2, 1 * SIZE(BO)
lda AO, 1 * SIZE(AO)
MUL a1, b1, t1
lda BO, 2 * SIZE(BO)
.align 4
$L77:
ADD c05, t2, c05
MUL a1, b2, t2
ADD c02, t3, c02
ADD c06, t4, c06
ADD c01, c02, c01
lda AO, 1 * SIZE(AO)
ADD c05, c06, c05
lda BO, 2 * SIZE(BO)
ADD c01, t1, c01
ADD c05, t2, c05
.align 4
$L78:
#if defined(LN) || defined(RT)
#ifdef LN
subq KK, 1, TMP1
#else
subq KK, 2, TMP1
#endif
sll TMP1, BASE_SHIFT + 0, TMP2
addq AORIG, TMP2, AO
sll TMP1, BASE_SHIFT + 1, TMP2
addq B, TMP2, BO
#else
lda AO, -1 * SIZE(AO)
lda BO, -2 * SIZE(BO)
#endif
#if defined(LN) || defined(LT)
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
SUB a1, c01, c01
SUB a2, c05, c05
#else
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
SUB a1, c01, c01
SUB a2, c05, c05
#endif
#if defined(LN) || defined(LT)
LD a1, 0 * SIZE(AO)
MUL a1, c01, c01
MUL a1, c05, c05
#endif
#ifdef RN
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
LD a3, 3 * SIZE(BO)
MUL a1, c01, c01
MUL a2, c01, t1
SUB c05, t1, c05
MUL a3, c05, c05
#endif
#ifdef RT
LD a1, 3 * SIZE(BO)
LD a2, 2 * SIZE(BO)
LD a3, 0 * SIZE(BO)
MUL a1, c05, c05
MUL a2, c05, t1
SUB c01, t1, c01
MUL a3, c01, c01
#endif
#if defined(LN) || defined(LT)
ST c01, 0 * SIZE(BO)
ST c05, 1 * SIZE(BO)
#else
ST c01, 0 * SIZE(AO)
ST c05, 1 * SIZE(AO)
#endif
#ifdef LN
lda C1, -1 * SIZE(C1)
lda C2, -1 * SIZE(C2)
#endif
ST c01, 0 * SIZE(C1)
ST c05, 0 * SIZE(C2)
fclr t1
fclr t2
fclr t3
fclr t4
#ifdef RT
sll K, 0 + BASE_SHIFT, TMP1
addq AORIG, TMP1, AORIG
#endif
#if defined(LT) || defined(RN)
subq K, KK, TMP1
sll TMP1, BASE_SHIFT + 0, TMP2
addq AO, TMP2, AO
sll TMP1, BASE_SHIFT + 1, TMP2
addq BO, TMP2, BO
#endif
#ifdef LT
addq KK, 1, KK
#endif
#ifdef LN
subq KK, 1, KK
#endif
.align 4
/* $L60: 2x2 tile (two rows of A against two columns of B).  Executed only
   when bit 1 of M is set; otherwise control goes to $L70. */
$L60:
and M, 2, I
ble I, $L70
#if defined(LT) || defined(RN)
/* Preload two k-steps of A (a1..a4) and B (b1..b4) and clear the four
   accumulators.  L = KK - 2 is the loop counter; AO and BO are advanced by
   one k-step (2 elements each).  KK <= 0 skips straight to the solve. */
LD a1, 0 * SIZE(AO)
fclr c01
LD a2, 1 * SIZE(AO)
fclr c05
LD a3, 2 * SIZE(AO)
fclr c02
LD a4, 3 * SIZE(AO)
fclr c06
LD b1, 0 * SIZE(B)
lda L, -2(KK)
LD b2, 1 * SIZE(B)
lda AO, 2 * SIZE(AO)
LD b3, 2 * SIZE(B)
LD b4, 3 * SIZE(B)
lda BO, 2 * SIZE(B)
ble KK, $L68
ble L, $L65
#else
#ifdef LN
/* LN: step AORIG back by 2*K elements before using it. */
sll K, BASE_SHIFT + 1, TMP1
subq AORIG, TMP1, AORIG
#endif
/* AO = AORIG + 2*KK elements, BO = B + 2*KK elements; the accumulation
   then runs over TMP1 = K - KK steps. */
sll KK, BASE_SHIFT + 1, TMP1
addq AORIG, TMP1, AO
sll KK, BASE_SHIFT + 1, TMP1
addq B, TMP1, BO
subq K, KK, TMP1
LD a1, 0 * SIZE(AO)
fclr c01
LD a2, 1 * SIZE(AO)
fclr c05
LD a3, 2 * SIZE(AO)
fclr c02
LD a4, 3 * SIZE(AO)
fclr c06
LD b1, 0 * SIZE(BO)
lda L, -2(TMP1)
LD b2, 1 * SIZE(BO)
lda AO, 2 * SIZE(AO)
LD b3, 2 * SIZE(BO)
LD b4, 3 * SIZE(BO)
lda BO, 2 * SIZE(BO)
ble TMP1, $L68
ble L, $L65
#endif
.align 4
/* $L62: main loop, two k-steps per iteration (L -= 2, AO and BO each move
   4 elements).  Each ADD consumes the product issued one step earlier, so
   the additions trail the multiplications.
   Accumulators: c01 = a1*b1, c02 = a2*b1, c05 = a1*b2, c06 = a2*b2. */
$L62:
ADD c01, t1, c01
unop
MUL a1, b1, t1
unop
ADD c02, t2, c02
lda AO, 4 * SIZE(AO)
MUL a2, b1, t2
LD b1, 2 * SIZE(BO)
ADD c05, t3, c05
lda L, -2(L)
MUL a1, b2, t3
LD a1, -2 * SIZE(AO)
ADD c06, t4, c06
unop
MUL a2, b2, t4
LD a2, -1 * SIZE(AO)
ADD c01, t1, c01
LD b2, 3 * SIZE(BO)
MUL a3, b3, t1
lda BO, 4 * SIZE(BO)
ADD c02, t2, c02
unop
MUL a4, b3, t2
LD b3, 0 * SIZE(BO)
ADD c05, t3, c05
unop
MUL a3, b4, t3
LD a3, 0 * SIZE(AO)
ADD c06, t4, c06
MUL a4, b4, t4
LD b4, 1 * SIZE(BO)
unop
LD a4, 1 * SIZE(AO)
unop
unop
bgt L, $L62
.align 4
/* $L65: loop tail.  blbs branches when the step count is odd, leaving one
   step for $L67; an even count falls through and performs one extra step
   here first. */
$L65:
ADD c01, t1, c01
MUL a1, b1, t1
#if defined(LT) || defined(RN)
blbs KK, $L67
#else
blbs TMP1, $L67
#endif
.align 4
ADD c02, t2, c02
unop
MUL a2, b1, t2
LD b1, 0 * SIZE(BO)
ADD c05, t3, c05
lda BO, 2 * SIZE(BO)
MUL a1, b2, t3
LD a1, 0 * SIZE(AO)
ADD c06, t4, c06
unop
MUL a2, b2, t4
LD a2, 1 * SIZE(AO)
ADD c01, t1, c01
LD b2, -1 * SIZE(BO)
MUL a1, b1, t1
lda AO, 2 * SIZE(AO)
.align 4
/* $L67: final k-step; issues the last three products and folds every
   outstanding product into the accumulators. */
$L67:
ADD c02, t2, c02
MUL a2, b1, t2
ADD c05, t3, c05
MUL a1, b2, t3
ADD c06, t4, c06
lda AO, 2 * SIZE(AO)
MUL a2, b2, t4
lda BO, 2 * SIZE(BO)
ADD c01, t1, c01
ADD c02, t2, c02
ADD c05, t3, c05
ADD c06, t4, c06
.align 4
/* $L68: solve and store the 2x2 tile. */
$L68:
#if defined(LN) || defined(RT)
/* Both preprocessor branches subtract 2: the tile is 2 wide in both
   directions.  AO and BO are rebuilt from AORIG and B. */
#ifdef LN
subq KK, 2, TMP1
#else
subq KK, 2, TMP1
#endif
sll TMP1, BASE_SHIFT + 1, TMP2
addq AORIG, TMP2, AO
sll TMP1, BASE_SHIFT + 1, TMP2
addq B, TMP2, BO
#else
/* Undo the one-step advance of AO/BO. */
lda AO, -2 * SIZE(AO)
lda BO, -2 * SIZE(BO)
#endif
/* c = rhs - c.  The left-side variants read the right-hand side from BO in
   the order c01, c05, c02, c06; the right-side variants read it from AO in
   the order c01, c02, c05, c06. */
#if defined(LN) || defined(LT)
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
SUB a1, c01, c01
SUB a2, c05, c05
SUB a3, c02, c02
SUB a4, c06, c06
#else
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
SUB a1, c01, c01
SUB a2, c02, c02
SUB a3, c05, c05
SUB a4, c06, c06
#endif
/* Triangular solve against a 2x2 block.  Diagonal entries are applied with
   MUL. NOTE(review): presumably they are stored pre-inverted - confirm
   against the packing routine. */
#ifdef LN
/* Backward substitution over A entries 3, 2, 0: row 2 first, then row 1. */
LD a1, 3 * SIZE(AO)
LD a2, 2 * SIZE(AO)
LD a3, 0 * SIZE(AO)
MUL a1, c02, c02
MUL a1, c06, c06
MUL a2, c02, t1
MUL a2, c06, t2
SUB c01, t1, c01
SUB c05, t2, c05
MUL a3, c01, c01
MUL a3, c05, c05
#endif
#ifdef LT
/* Forward substitution over A entries 0, 1, 3: row 1 first, then row 2. */
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 3 * SIZE(AO)
MUL a1, c01, c01
MUL a1, c05, c05
MUL a2, c01, t1
MUL a2, c05, t2
SUB c02, t1, c02
SUB c06, t2, c06
MUL a3, c02, c02
MUL a3, c06, c06
#endif
#ifdef RN
/* Forward substitution over B entries 0, 1, 3: column 1, then column 2. */
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
LD a3, 3 * SIZE(BO)
MUL a1, c01, c01
MUL a1, c02, c02
MUL a2, c01, t1
MUL a2, c02, t2
SUB c05, t1, c05
SUB c06, t2, c06
MUL a3, c05, c05
MUL a3, c06, c06
#endif
#ifdef RT
/* Backward substitution over B entries 3, 2, 0: column 2, then column 1. */
LD a1, 3 * SIZE(BO)
LD a2, 2 * SIZE(BO)
LD a3, 0 * SIZE(BO)
MUL a1, c05, c05
MUL a1, c06, c06
MUL a2, c05, t1
MUL a2, c06, t2
SUB c01, t1, c01
SUB c02, t2, c02
MUL a3, c01, c01
MUL a3, c02, c02
#endif
/* Write the solution back to the buffer the right-hand side came from,
   using the same element order as the loads above. */
#if defined(LN) || defined(LT)
ST c01, 0 * SIZE(BO)
ST c05, 1 * SIZE(BO)
ST c02, 2 * SIZE(BO)
ST c06, 3 * SIZE(BO)
#else
ST c01, 0 * SIZE(AO)
ST c02, 1 * SIZE(AO)
ST c05, 2 * SIZE(AO)
ST c06, 3 * SIZE(AO)
#endif
/* Store to C: LN pre-decrements the column pointers, every other variant
   post-increments them. */
#ifdef LN
lda C1, -2 * SIZE(C1)
lda C2, -2 * SIZE(C2)
#endif
ST c01, 0 * SIZE(C1)
ST c02, 1 * SIZE(C1)
ST c05, 0 * SIZE(C2)
ST c06, 1 * SIZE(C2)
#ifndef LN
lda C1, 2 * SIZE(C1)
lda C2, 2 * SIZE(C2)
#endif
/* Clear the product temporaries for the next tile. */
fclr t1
fclr t2
fclr t3
fclr t4
#ifdef RT
/* AORIG advances by 2*K elements. */
sll K, 1 + BASE_SHIFT, TMP1
addq AORIG, TMP1, AORIG
#endif
#if defined(LT) || defined(RN)
/* Skip the remaining K-KK steps, 2 elements per step in both A and B. */
subq K, KK, TMP1
sll TMP1, BASE_SHIFT + 1, TMP2
addq AO, TMP2, AO
sll TMP1, BASE_SHIFT + 1, TMP2
addq BO, TMP2, BO
#endif
#ifdef LT
addq KK, 2, KK
#endif
#ifdef LN
subq KK, 2, KK
#endif
.align 4
/* $L70: 4x2 tiles (four rows of A against two columns of B).  I = M >> 2 is
   the number of such tiles; zero skips to the panel epilogue at $L79. */
$L70:
sra M, 2, I
ble I, $L79
.align 4
/* $L51: top of the per-tile loop. */
$L51:
#if defined(LT) || defined(RN)
/* Preload one k-step of A (a1..a4) and two k-steps of B (b1..b4), clear the
   eight accumulators, set L = KK - 2, and advance AO/BO by one k-step
   (4 and 2 elements).  KK <= 0 skips straight to the solve. */
LD a1, 0 * SIZE(AO)
fclr c03
LD a2, 1 * SIZE(AO)
fclr c07
LD a3, 2 * SIZE(AO)
fclr c04
LD a4, 3 * SIZE(AO)
fclr c08
LD b1, 0 * SIZE(B)
fclr c01
LD b2, 1 * SIZE(B)
fclr c05
LD b3, 2 * SIZE(B)
fclr c02
LD b4, 3 * SIZE(B)
fclr c06
lda L, -2(KK)
lda BO, 2 * SIZE(B)
lda AO, 4 * SIZE(AO)
ble KK, $L58
ble L, $L55
#else
#ifdef LN
/* LN: step AORIG back by 4*K elements before using it. */
sll K, BASE_SHIFT + 2, TMP1
subq AORIG, TMP1, AORIG
#endif
/* AO = AORIG + 4*KK elements, BO = B + 2*KK elements; the accumulation
   then runs over TMP1 = K - KK steps. */
sll KK, BASE_SHIFT + 2, TMP1
addq AORIG, TMP1, AO
sll KK, BASE_SHIFT + 1, TMP1
addq B, TMP1, BO
subq K, KK, TMP1
LD a1, 0 * SIZE(AO)
fclr c03
LD a2, 1 * SIZE(AO)
fclr c07
LD a3, 2 * SIZE(AO)
fclr c04
LD a4, 3 * SIZE(AO)
fclr c08
LD b1, 0 * SIZE(BO)
fclr c01
LD b2, 1 * SIZE(BO)
fclr c05
LD b3, 2 * SIZE(BO)
fclr c02
LD b4, 3 * SIZE(BO)
fclr c06
lda L, -2(TMP1)
lda BO, 2 * SIZE(BO)
lda AO, 4 * SIZE(AO)
ble TMP1, $L58
ble L, $L55
#endif
.align 4
/* $L52: main loop, two k-steps per iteration (L -= 2, AO += 8, BO += 4).
   c01..c04 accumulate a1..a4 times the first B value of a step, c05..c08
   the same times the second B value; each ADD consumes a product issued
   one column earlier.  a5 temporarily carries the fourth A element of the
   second step so that a4 can be reloaded for the next iteration. */
$L52:
ADD c05, t1, c05
unop
MUL a1, b1, t1
unop
ADD c06, t2, c06
lda L, -2(L)
MUL a2, b1, t2
unop
ADD c07, t3, c07
unop
MUL a3, b1, t3
unop
ADD c08, t4, c08
unop
MUL a4, b1, t4
LD b1, 2 * SIZE(BO)
ADD c01, t1, c01
unop
MUL a1, b2, t1
LD a1, 0 * SIZE(AO)
ADD c02, t2, c02
lda BO, 4 * SIZE(BO)
MUL a2, b2, t2
LD a2, 1 * SIZE(AO)
ADD c03, t3, c03
unop
MUL a3, b2, t3
LD a3, 2 * SIZE(AO)
ADD c04, t4, c04
unop
MUL a4, b2, t4
LD a5, 3 * SIZE(AO)
ADD c05, t1, c05
unop
MUL a1, b3, t1
LD b2, -1 * SIZE(BO)
ADD c06, t2, c06
unop
MUL a2, b3, t2
unop
ADD c07, t3, c07
unop
MUL a3, b3, t3
lda AO, 8 * SIZE(AO)
ADD c08, t4, c08
unop
MUL a5, b3, t4
LD b3, 0 * SIZE(BO)
ADD c01, t1, c01
unop
MUL a1, b4, t1
LD a1, -4 * SIZE(AO)
ADD c02, t2, c02
unop
MUL a2, b4, t2
LD a2, -3 * SIZE(AO)
ADD c03, t3, c03
LD a4, -1 * SIZE(AO)
MUL a3, b4, t3
LD a3, -2 * SIZE(AO)
ADD c04, t4, c04
MUL a5, b4, t4
LD b4, 1 * SIZE(BO)
bgt L, $L52
.align 4
/* $L55: loop tail.  An odd step count branches to $L57 for the final step;
   an even count falls through and performs one extra step here first. */
$L55:
ADD c05, t1, c05
MUL a1, b1, t1
#if defined(LT) || defined(RN)
blbs KK, $L57
#else
blbs TMP1, $L57
#endif
.align 4
ADD c06, t2, c06
MUL a2, b1, t2
ADD c07, t3, c07
MUL a3, b1, t3
ADD c08, t4, c08
unop
MUL a4, b1, t4
LD b1, 0 * SIZE(BO)
ADD c01, t1, c01
unop
MUL a1, b2, t1
LD a1, 0 * SIZE(AO)
ADD c02, t2, c02
unop
MUL a2, b2, t2
LD a2, 1 * SIZE(AO)
ADD c03, t3, c03
unop
MUL a3, b2, t3
LD a3, 2 * SIZE(AO)
ADD c04, t4, c04
MUL a4, b2, t4
LD a4, 3 * SIZE(AO)
lda AO, 4 * SIZE(AO)
ADD c05, t1, c05
LD b2, 1 * SIZE(BO)
MUL a1, b1, t1
lda BO, 2 * SIZE(BO)
.align 4
/* $L57: final k-step; issues the remaining products and folds every
   outstanding product into the accumulators. */
$L57:
ADD c06, t2, c06
MUL a2, b1, t2
ADD c07, t3, c07
MUL a3, b1, t3
ADD c08, t4, c08
MUL a4, b1, t4
ADD c01, t1, c01
MUL a1, b2, t1
ADD c02, t2, c02
MUL a2, b2, t2
ADD c03, t3, c03
MUL a3, b2, t3
ADD c04, t4, c04
lda AO, 4 * SIZE(AO)
MUL a4, b2, t4
lda BO, 2 * SIZE(BO)
ADD c05, t1, c05
ADD c06, t2, c06
ADD c07, t3, c07
ADD c08, t4, c08
.align 4
/* $L58: solve and store the 4x2 tile. */
$L58:
#if defined(LN) || defined(RT)
/* Rebuild AO/BO from AORIG/B at offset KK minus the tile extent
   (4 for LN, 2 for RT); A is scaled by 4 elements, B by 2. */
#ifdef LN
subq KK, 4, TMP1
#else
subq KK, 2, TMP1
#endif
sll TMP1, BASE_SHIFT + 2, TMP2
addq AORIG, TMP2, AO
sll TMP1, BASE_SHIFT + 1, TMP2
addq B, TMP2, BO
#else
/* Undo the one-step advance of AO/BO. */
lda AO, -4 * SIZE(AO)
lda BO, -2 * SIZE(BO)
#endif
/* c = rhs - c.  The left-side variants read eight values from BO, with the
   two columns interleaved (c01, c05, c02, c06, ...); the right-side
   variants read them from AO column by column (c01..c04, c05..c08). */
#if defined(LN) || defined(LT)
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
LD b1, 4 * SIZE(BO)
LD b2, 5 * SIZE(BO)
LD b3, 6 * SIZE(BO)
LD b4, 7 * SIZE(BO)
SUB a1, c01, c01
SUB a2, c05, c05
SUB a3, c02, c02
SUB a4, c06, c06
SUB b1, c03, c03
SUB b2, c07, c07
SUB b3, c04, c04
SUB b4, c08, c08
#else
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
LD b1, 4 * SIZE(AO)
LD b2, 5 * SIZE(AO)
LD b3, 6 * SIZE(AO)
LD b4, 7 * SIZE(AO)
SUB a1, c01, c01
SUB a2, c02, c02
SUB a3, c03, c03
SUB a4, c04, c04
SUB b1, c05, c05
SUB b2, c06, c06
SUB b3, c07, c07
SUB b4, c08, c08
#endif
/* Triangular solve.  Diagonal entries are applied with MUL.
   NOTE(review): presumably they are stored pre-inverted - confirm against
   the packing routine. */
#ifdef LN
/* Backward substitution over the 4x4 block at AO, row 4 up to row 1,
   applied to both columns at once.  Row 4 uses entries 15, 14, 13, 12. */
LD a1, 15 * SIZE(AO)
LD a2, 14 * SIZE(AO)
LD a3, 13 * SIZE(AO)
LD a4, 12 * SIZE(AO)
MUL a1, c04, c04
MUL a1, c08, c08
MUL a2, c04, t1
MUL a2, c08, t2
SUB c03, t1, c03
SUB c07, t2, c07
MUL a3, c04, t1
MUL a3, c08, t2
SUB c02, t1, c02
SUB c06, t2, c06
MUL a4, c04, t1
MUL a4, c08, t2
SUB c01, t1, c01
SUB c05, t2, c05
/* Row 3: entries 10, 9, 8. */
LD b1, 10 * SIZE(AO)
LD b2, 9 * SIZE(AO)
LD b3, 8 * SIZE(AO)
MUL b1, c03, c03
MUL b1, c07, c07
MUL b2, c03, t1
MUL b2, c07, t2
SUB c02, t1, c02
SUB c06, t2, c06
MUL b3, c03, t1
MUL b3, c07, t2
SUB c01, t1, c01
SUB c05, t2, c05
/* Rows 2 and 1: entries 5, 4 and 0. */
LD a1, 5 * SIZE(AO)
LD a2, 4 * SIZE(AO)
LD a3, 0 * SIZE(AO)
MUL a1, c02, c02
MUL a1, c06, c06
MUL a2, c02, t1
MUL a2, c06, t2
SUB c01, t1, c01
SUB c05, t2, c05
MUL a3, c01, c01
MUL a3, c05, c05
#endif
#ifdef LT
/* Forward substitution over the 4x4 block at AO, row 1 down to row 4,
   applied to both columns at once.  Row 1 uses entries 0, 1, 2, 3. */
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
MUL a1, c01, c01
MUL a1, c05, c05
MUL a2, c01, t1
MUL a2, c05, t2
SUB c02, t1, c02
SUB c06, t2, c06
MUL a3, c01, t1
MUL a3, c05, t2
SUB c03, t1, c03
SUB c07, t2, c07
MUL a4, c01, t1
MUL a4, c05, t2
SUB c04, t1, c04
SUB c08, t2, c08
/* Row 2: entries 5, 6, 7. */
LD b1, 5 * SIZE(AO)
LD b2, 6 * SIZE(AO)
LD b3, 7 * SIZE(AO)
MUL b1, c02, c02
MUL b1, c06, c06
MUL b2, c02, t1
MUL b2, c06, t2
SUB c03, t1, c03
SUB c07, t2, c07
MUL b3, c02, t1
MUL b3, c06, t2
SUB c04, t1, c04
SUB c08, t2, c08
/* Rows 3 and 4: entries 10, 11 and 15. */
LD a1, 10 * SIZE(AO)
LD a2, 11 * SIZE(AO)
LD a3, 15 * SIZE(AO)
MUL a1, c03, c03
MUL a1, c07, c07
MUL a2, c03, t1
MUL a2, c07, t2
SUB c04, t1, c04
SUB c08, t2, c08
MUL a3, c04, c04
MUL a3, c08, c08
#endif
#ifdef RN
/* Forward substitution over the 2x2 block at BO (entries 0, 1, 3):
   column 1 (c01..c04) first, then column 2 (c05..c08). */
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
LD a3, 3 * SIZE(BO)
MUL a1, c01, c01
MUL a1, c02, c02
MUL a1, c03, c03
MUL a1, c04, c04
MUL a2, c01, t1
MUL a2, c02, t2
MUL a2, c03, t3
MUL a2, c04, t4
SUB c05, t1, c05
SUB c06, t2, c06
SUB c07, t3, c07
SUB c08, t4, c08
MUL a3, c05, c05
MUL a3, c06, c06
MUL a3, c07, c07
MUL a3, c08, c08
#endif
#ifdef RT
/* Backward substitution over the 2x2 block at BO (entries 3, 2, 0):
   column 2 (c05..c08) first, then column 1 (c01..c04). */
LD a1, 3 * SIZE(BO)
LD a2, 2 * SIZE(BO)
LD a3, 0 * SIZE(BO)
MUL a1, c05, c05
MUL a1, c06, c06
MUL a1, c07, c07
MUL a1, c08, c08
MUL a2, c05, t1
MUL a2, c06, t2
MUL a2, c07, t3
MUL a2, c08, t4
SUB c01, t1, c01
SUB c02, t2, c02
SUB c03, t3, c03
SUB c04, t4, c04
MUL a3, c01, c01
MUL a3, c02, c02
MUL a3, c03, c03
MUL a3, c04, c04
#endif
/* Write the solution back to the buffer the right-hand side came from,
   using the same element order as the loads above. */
#if defined(LN) || defined(LT)
ST c01, 0 * SIZE(BO)
ST c05, 1 * SIZE(BO)
ST c02, 2 * SIZE(BO)
ST c06, 3 * SIZE(BO)
ST c03, 4 * SIZE(BO)
ST c07, 5 * SIZE(BO)
ST c04, 6 * SIZE(BO)
ST c08, 7 * SIZE(BO)
#else
ST c01, 0 * SIZE(AO)
ST c02, 1 * SIZE(AO)
ST c03, 2 * SIZE(AO)
ST c04, 3 * SIZE(AO)
ST c05, 4 * SIZE(AO)
ST c06, 5 * SIZE(AO)
ST c07, 6 * SIZE(AO)
ST c08, 7 * SIZE(AO)
#endif
/* Store to C: LN pre-decrements the column pointers, every other variant
   post-increments them. */
#ifdef LN
lda C1, -4 * SIZE(C1)
lda C2, -4 * SIZE(C2)
#endif
ST c01, 0 * SIZE(C1)
ST c02, 1 * SIZE(C1)
ST c03, 2 * SIZE(C1)
ST c04, 3 * SIZE(C1)
ST c05, 0 * SIZE(C2)
ST c06, 1 * SIZE(C2)
ST c07, 2 * SIZE(C2)
ST c08, 3 * SIZE(C2)
#ifndef LN
lda C1, 4 * SIZE(C1)
lda C2, 4 * SIZE(C2)
#endif
/* Clear the product temporaries for the next tile. */
fclr t1
fclr t2
fclr t3
fclr t4
#ifdef RT
/* AORIG advances by 4*K elements. */
sll K, 2 + BASE_SHIFT, TMP1
addq AORIG, TMP1, AORIG
#endif
#if defined(LT) || defined(RN)
/* Skip the remaining K-KK steps: 4 elements per step in A, 2 in B. */
subq K, KK, TMP1
sll TMP1, BASE_SHIFT + 2, TMP2
addq AO, TMP2, AO
sll TMP1, BASE_SHIFT + 1, TMP2
addq BO, TMP2, BO
#endif
#ifdef LT
addq KK, 4, KK
#endif
#ifdef LN
subq KK, 4, KK
#endif
/* Next 4-row tile. */
lda I, -1(I)
bgt I, $L51
.align 4
/* $L79: end of the two-column panel.  Moves B on to the next panel and
   adjusts KK for the right-side variants. */
$L79:
#ifdef LN
/* B advances by 2*K elements. */
sll K, 1 + BASE_SHIFT, TMP1
addq B, TMP1, B
#endif
#if defined(LT) || defined(RN)
/* BO already points past the panel that was just consumed. */
mov BO, B
#endif
#ifdef RN
addq KK, 2, KK
#endif
#ifdef RT
subq KK, 2, KK
#endif
.align 4
/* $L80: single-column panel, executed only when N is odd; otherwise the
   function exits through $L999. */
$L80:
and N, 1, J
ble J, $L999
#ifdef RT
/* RT steps B back by K elements and C back by one column (LDC) before
   the panel is processed. */
sll K, BASE_SHIFT, TMP1
subq B, TMP1, B
subq C, LDC, C
#endif
/* C1 is the column pointer for this panel; the other variants advance C
   past the column afterwards. */
mov C, C1
#ifndef RT
addq C, LDC, C
#endif
/* Initial KK for the left-side variants. */
#ifdef LN
addq M, OFFSET, KK
#endif
#ifdef LT
mov OFFSET, KK
#endif
/* LN/RT keep the base of A in AORIG and derive AO per tile; LT/RN walk AO
   directly. */
#if defined(LN) || defined(RT)
mov A, AORIG
#else
mov A, AO
#endif
/* 1x1 tile of the single-column panel, executed only when M is odd.
   The dot product is split over four accumulators c01..c04. */
and M, 1, I
ble I, $L100
#if defined(LT) || defined(RN)
/* Preload four k-steps of A and B, clear the temporaries and accumulators;
   L = KK >> 2 is the number of unrolled iterations. */
LD a1, 0 * SIZE(AO)
fclr t1
LD a2, 1 * SIZE(AO)
fclr t2
LD a3, 2 * SIZE(AO)
fclr t3
LD a4, 3 * SIZE(AO)
fclr t4
LD b1, 0 * SIZE(B)
fclr c01
LD b2, 1 * SIZE(B)
fclr c02
LD b3, 2 * SIZE(B)
fclr c03
LD b4, 3 * SIZE(B)
fclr c04
sra KK, 2, L
mov B, BO
unop
ble L, $L115
#else
#ifdef LN
/* LN: step AORIG back by K elements before using it. */
sll K, BASE_SHIFT + 0, TMP1
subq AORIG, TMP1, AORIG
#endif
/* AO = AORIG + KK elements, BO = B + KK elements; the accumulation then
   runs over TMP1 = K - KK steps. */
sll KK, BASE_SHIFT + 0, TMP1
addq AORIG, TMP1, AO
sll KK, BASE_SHIFT + 0, TMP1
addq B, TMP1, BO
subq K, KK, TMP1
LD a1, 0 * SIZE(AO)
fclr t1
LD a2, 1 * SIZE(AO)
fclr t2
LD a3, 2 * SIZE(AO)
fclr t3
LD a4, 3 * SIZE(AO)
fclr t4
LD b1, 0 * SIZE(BO)
fclr c01
LD b2, 1 * SIZE(BO)
fclr c02
LD b3, 2 * SIZE(BO)
fclr c03
LD b4, 3 * SIZE(BO)
fclr c04
sra TMP1, 2, L
unop
ble L, $L115
#endif
.align 4
/* $L112: four k-steps per iteration, one accumulator per step. */
$L112:
ADD c01, t1, c01
MUL a1, b1, t1
LD a1, 4 * SIZE(AO)
LD b1, 4 * SIZE(BO)
ADD c02, t2, c02
MUL a2, b2, t2
LD a2, 5 * SIZE(AO)
LD b2, 5 * SIZE(BO)
ADD c03, t3, c03
MUL a3, b3, t3
LD a3, 6 * SIZE(AO)
LD b3, 6 * SIZE(BO)
ADD c04, t4, c04
MUL a4, b4, t4
LD a4, 7 * SIZE(AO)
LD b4, 7 * SIZE(BO)
lda L, -1(L)
lda AO, 4 * SIZE(AO)
lda BO, 4 * SIZE(BO)
bgt L, $L112
.align 4
/* $L115: remainder of the step count modulo 4. */
$L115:
#if defined(LT) || defined(RN)
and KK, 3, L
#else
and TMP1, 3, L
#endif
ble L, $L118
.align 4
/* $L116: one k-step per iteration, accumulated into c01 only. */
$L116:
ADD c01, t1, c01
MUL a1, b1, t1
LD a1, 1 * SIZE(AO)
LD b1, 1 * SIZE(BO)
lda L, -1(L)
lda AO, 1 * SIZE(AO)
lda BO, 1 * SIZE(BO)
bgt L, $L116
.align 4
/* $L118: fold the pending products and reduce the four partial sums into
   c01, then solve and store the single element. */
$L118:
ADD c01, t1, c01
ADD c02, t2, c02
ADD c03, t3, c03
ADD c04, t4, c04
ADD c01, c02, c01
ADD c03, c04, c03
ADD c01, c03, c01
#if defined(LN) || defined(RT)
/* Rebuild AO/BO at element offset KK - 1 from AORIG and B. */
subq KK, 1, TMP1
sll TMP1, BASE_SHIFT + 0, TMP2
addq AORIG, TMP2, AO
addq B, TMP2, BO
#endif
/* c01 = rhs - c01; rhs comes from BO (left side) or AO (right side). */
#if defined(LN) || defined(LT)
LD a1, 0 * SIZE(BO)
SUB a1, c01, c01
#else
LD a1, 0 * SIZE(AO)
SUB a1, c01, c01
#endif
/* Scale by the single triangular entry, taken from AO (left side) or BO
   (right side).  NOTE(review): applied with MUL, so presumably stored
   pre-inverted - confirm against the packing routine. */
#if defined(LN) || defined(LT)
LD a1, 0 * SIZE(AO)
MUL a1, c01, c01
#endif
#if defined(RN) || defined(RT)
LD a1, 0 * SIZE(BO)
MUL a1, c01, c01
#endif
/* Write the result back to the packed buffer and to C. */
#if defined(LN) || defined(LT)
ST c01, 0 * SIZE(BO)
#else
ST c01, 0 * SIZE(AO)
#endif
#ifdef LN
lda C1, -1 * SIZE(C1)
#endif
ST c01, 0 * SIZE(C1)
#ifndef LN
lda C1, 1 * SIZE(C1)
#endif
#ifdef RT
/* NOTE(review): SXADDQ is a macro defined outside this view; by analogy
   with the other tiles it presumably advances AORIG by K elements. */
SXADDQ K, AORIG, AORIG
#endif
#if defined(LT) || defined(RN)
/* Skip the remaining K-KK steps, one element per step in both buffers. */
subq K, KK, TMP1
sll TMP1, BASE_SHIFT + 0, TMP2
addq AO, TMP2, AO
addq BO, TMP2, BO
#endif
#ifdef LT
addq KK, 1, KK
#endif
#ifdef LN
subq KK, 1, KK
#endif
.align 4
/* $L100: 2x1 tile of the single-column panel, executed only when bit 1 of M
   is set; otherwise control goes to $L110. */
$L100:
and M, 2, I
ble I, $L110
#if defined(LT) || defined(RN)
/* Preload two k-steps of A (a1..a4) and four of B (b1..b4), clear the
   temporaries and accumulators; L = KK >> 2 unrolled iterations. */
LD a1, 0 * SIZE(AO)
fclr t1
LD a2, 1 * SIZE(AO)
fclr t2
LD a3, 2 * SIZE(AO)
fclr t3
LD a4, 3 * SIZE(AO)
fclr t4
LD b1, 0 * SIZE(B)
fclr c01
LD b2, 1 * SIZE(B)
fclr c02
LD b3, 2 * SIZE(B)
fclr c03
LD b4, 3 * SIZE(B)
fclr c04
sra KK, 2, L
mov B, BO
ble L, $L105
#else
#ifdef LN
/* LN: step AORIG back by 2*K elements before using it. */
sll K, BASE_SHIFT + 1, TMP1
subq AORIG, TMP1, AORIG
#endif
/* AO = AORIG + 2*KK elements, BO = B + KK elements; the accumulation then
   runs over TMP1 = K - KK steps. */
sll KK, BASE_SHIFT + 1, TMP1
addq AORIG, TMP1, AO
sll KK, BASE_SHIFT + 0, TMP1
addq B, TMP1, BO
subq K, KK, TMP1
LD a1, 0 * SIZE(AO)
fclr t1
LD a2, 1 * SIZE(AO)
fclr t2
LD a3, 2 * SIZE(AO)
fclr t3
LD a4, 3 * SIZE(AO)
fclr t4
LD b1, 0 * SIZE(BO)
fclr c01
LD b2, 1 * SIZE(BO)
fclr c02
LD b3, 2 * SIZE(BO)
fclr c03
LD b4, 3 * SIZE(BO)
fclr c04
sra TMP1, 2, L
ble L, $L105
#endif
.align 5
/* $L102: four k-steps per iteration (AO += 8, BO += 4).  Even steps
   accumulate into c01/c02, odd steps into c03/c04.  a5 carries the second
   row of the fourth step so that a4 can be reloaded for the next pass. */
$L102:
ADD c01, t1, c01
lda L, -1(L)
MUL a1, b1, t1
LD a1, 4 * SIZE(AO)
ADD c02, t2, c02
MUL a2, b1, t2
LD a2, 5 * SIZE(AO)
LD b1, 4 * SIZE(BO)
ADD c03, t3, c03
lda BO, 4 * SIZE(BO)
MUL a3, b2, t3
LD a3, 6 * SIZE(AO)
ADD c04, t4, c04
MUL a4, b2, t4
LD a5, 7 * SIZE(AO)
LD b2, 1 * SIZE(BO)
ADD c01, t1, c01
MUL a1, b3, t1
LD a1, 8 * SIZE(AO)
lda AO, 8 * SIZE(AO)
ADD c02, t2, c02
MUL a2, b3, t2
LD b3, 2 * SIZE(BO)
LD a2, 1 * SIZE(AO)
ADD c03, t3, c03
LD a4, 3 * SIZE(AO)
MUL a3, b4, t3
LD a3, 2 * SIZE(AO)
ADD c04, t4, c04
MUL a5, b4, t4
LD b4, 3 * SIZE(BO)
bgt L, $L102
.align 4
/* $L105: remainder of the step count modulo 4. */
$L105:
#if defined(LT) || defined(RN)
and KK, 3, L
#else
and TMP1, 3, L
#endif
ble L, $L108
.align 4
/* $L106: one k-step per iteration (AO += 2, BO += 1), into c01/c02. */
$L106:
ADD c01, t1, c01
lda L, -1(L)
MUL a1, b1, t1
LD a1, 2 * SIZE(AO)
ADD c02, t2, c02
MUL a2, b1, t2
LD a2, 3 * SIZE(AO)
LD b1, 1 * SIZE(BO)
lda AO, 2 * SIZE(AO)
unop
lda BO, 1 * SIZE(BO)
bgt L, $L106
.align 4
/* $L108: fold the pending products, merge c03 into c01 and c04 into c02,
   then solve and store the two elements. */
$L108:
ADD c01, t1, c01
ADD c02, t2, c02
ADD c03, t3, c03
ADD c04, t4, c04
ADD c01, c03, c01
ADD c02, c04, c02
#if defined(LN) || defined(RT)
/* Rebuild AO/BO at offset KK minus the tile extent (2 for LN, 1 for RT);
   A is scaled by 2 elements, B by 1. */
#ifdef LN
subq KK, 2, TMP1
#else
subq KK, 1, TMP1
#endif
sll TMP1, BASE_SHIFT + 1, TMP2
addq AORIG, TMP2, AO
sll TMP1, BASE_SHIFT + 0, TMP2
addq B, TMP2, BO
#endif
/* c = rhs - c; rhs comes from BO (left side) or AO (right side). */
#if defined(LN) || defined(LT)
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
SUB a1, c01, c01
SUB a2, c02, c02
#else
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
SUB a1, c01, c01
SUB a2, c02, c02
#endif
/* Triangular solve.  Diagonal entries are applied with MUL.
   NOTE(review): presumably stored pre-inverted - confirm against the
   packing routine. */
#ifdef LN
/* Backward substitution over A entries 3, 2, 0. */
LD a1, 3 * SIZE(AO)
LD a2, 2 * SIZE(AO)
LD a3, 0 * SIZE(AO)
MUL a1, c02, c02
MUL a2, c02, t1
SUB c01, t1, c01
MUL a3, c01, c01
#endif
#ifdef LT
/* Forward substitution over A entries 0, 1, 3. */
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 3 * SIZE(AO)
MUL a1, c01, c01
MUL a2, c01, t1
SUB c02, t1, c02
MUL a3, c02, c02
#endif
#if defined(RN) || defined(RT)
/* Right side: one column, so both rows are scaled by the entry at BO. */
LD a1, 0 * SIZE(BO)
MUL a1, c01, c01
MUL a1, c02, c02
#endif
/* Write the results back to the packed buffer and to C. */
#if defined(LN) || defined(LT)
ST c01, 0 * SIZE(BO)
ST c02, 1 * SIZE(BO)
#else
ST c01, 0 * SIZE(AO)
ST c02, 1 * SIZE(AO)
#endif
#ifdef LN
lda C1, -2 * SIZE(C1)
#endif
ST c01, 0 * SIZE(C1)
ST c02, 1 * SIZE(C1)
#ifndef LN
lda C1, 2 * SIZE(C1)
#endif
/* Clear the product temporaries for the next tile. */
fclr t1
fclr t2
fclr t3
fclr t4
#ifdef RT
/* AORIG advances by 2*K elements. */
sll K, 1 + BASE_SHIFT, TMP1
addq AORIG, TMP1, AORIG
#endif
#if defined(LT) || defined(RN)
/* Skip the remaining K-KK steps: 2 elements per step in A, 1 in B. */
subq K, KK, TMP1
sll TMP1, BASE_SHIFT + 1, TMP2
addq AO, TMP2, AO
sll TMP1, BASE_SHIFT + 0, TMP2
addq BO, TMP2, BO
#endif
#ifdef LT
addq KK, 2, KK
#endif
#ifdef LN
subq KK, 2, KK
#endif
.align 4
/* $L110: 4x1 tiles of the single-column panel.  I = M >> 2 is the number of
   such tiles; zero skips to the panel epilogue at $L119. */
$L110:
sra M, 2, I
ble I, $L119
.align 4
/* $L91: top of the per-tile loop. */
$L91:
#if defined(LT) || defined(RN)
/* Preload one k-step of A (a1..a4) and four of B (b1..b4), clear the
   temporaries and accumulators; L = KK >> 2 unrolled iterations. */
LD a1, 0 * SIZE(AO)
fclr t1
LD a2, 1 * SIZE(AO)
fclr t2
LD a3, 2 * SIZE(AO)
fclr t3
LD a4, 3 * SIZE(AO)
fclr t4
LD b1, 0 * SIZE(B)
fclr c01
LD b2, 1 * SIZE(B)
fclr c02
LD b3, 2 * SIZE(B)
fclr c03
LD b4, 3 * SIZE(B)
fclr c04
sra KK, 2, L
mov B, BO
ble L, $L95
#else
#ifdef LN
/* LN: step AORIG back by 4*K elements before using it. */
sll K, BASE_SHIFT + 2, TMP1
subq AORIG, TMP1, AORIG
#endif
/* AO = AORIG + 4*KK elements, BO = B + KK elements; the accumulation then
   runs over TMP1 = K - KK steps. */
sll KK, BASE_SHIFT + 2, TMP1
addq AORIG, TMP1, AO
sll KK, BASE_SHIFT + 0, TMP1
addq B, TMP1, BO
subq K, KK, TMP1
LD a1, 0 * SIZE(AO)
fclr t1
LD a2, 1 * SIZE(AO)
fclr t2
LD a3, 2 * SIZE(AO)
fclr t3
LD a4, 3 * SIZE(AO)
fclr t4
LD b1, 0 * SIZE(BO)
fclr c01
LD b2, 1 * SIZE(BO)
fclr c02
LD b3, 2 * SIZE(BO)
fclr c03
LD b4, 3 * SIZE(BO)
fclr c04
sra TMP1, 2, L
unop
ble L, $L95
#endif
.align 5
/* $L92: four k-steps per iteration (AO += 16, BO += 4).  c01..c04 each
   accumulate one row of the tile.  a5 carries the fourth row of the last
   step so that a4 can be reloaded for the next pass. */
$L92:
ADD c01, t1, c01
unop
MUL a1, b1, t1
LD a1, 4 * SIZE(AO)
ADD c02, t2, c02
lda L, -1(L)
MUL a2, b1, t2
LD a2, 5 * SIZE(AO)
ADD c03, t3, c03
unop
MUL a3, b1, t3
LD a3, 6 * SIZE(AO)
ADD c04, t4, c04
MUL a4, b1, t4
LD a4, 7 * SIZE(AO)
LD b1, 4 * SIZE(BO)
ADD c01, t1, c01
unop
MUL a1, b2, t1
LD a1, 8 * SIZE(AO)
ADD c02, t2, c02
unop
MUL a2, b2, t2
LD a2, 9 * SIZE(AO)
ADD c03, t3, c03
unop
MUL a3, b2, t3
LD a3, 10 * SIZE(AO)
ADD c04, t4, c04
MUL a4, b2, t4
LD a4, 11 * SIZE(AO)
LD b2, 5 * SIZE(BO)
ADD c01, t1, c01
unop
MUL a1, b3, t1
LD a1, 12 * SIZE(AO)
ADD c02, t2, c02
unop
MUL a2, b3, t2
LD a2, 13 * SIZE(AO)
ADD c03, t3, c03
unop
MUL a3, b3, t3
LD a3, 14 * SIZE(AO)
ADD c04, t4, c04
MUL a4, b3, t4
LD a5, 15 * SIZE(AO)
LD b3, 6 * SIZE(BO)
ADD c01, t1, c01
MUL a1, b4, t1
LD a1, 16 * SIZE(AO)
lda AO, 16 * SIZE(AO)
ADD c02, t2, c02
lda BO, 4 * SIZE(BO)
MUL a2, b4, t2
LD a2, 1 * SIZE(AO)
ADD c03, t3, c03
LD a4, 3 * SIZE(AO)
MUL a3, b4, t3
LD a3, 2 * SIZE(AO)
ADD c04, t4, c04
MUL a5, b4, t4
LD b4, 3 * SIZE(BO)
bgt L, $L92
.align 4
/* $L95: remainder of the step count modulo 4. */
$L95:
#if defined(LT) || defined(RN)
and KK, 3, L
#else
and TMP1, 3, L
#endif
unop
ble L, $L98
.align 4
/* $L96: one k-step per iteration (AO += 4, BO += 1). */
$L96:
ADD c01, t1, c01
lda L, -1(L)
MUL a1, b1, t1
LD a1, 4 * SIZE(AO)
ADD c02, t2, c02
lda BO, 1 * SIZE(BO)
MUL a2, b1, t2
LD a2, 5 * SIZE(AO)
ADD c03, t3, c03
unop
MUL a3, b1, t3
LD a3, 6 * SIZE(AO)
ADD c04, t4, c04
MUL a4, b1, t4
LD a4, 7 * SIZE(AO)
LD b1, 0 * SIZE(BO)
lda AO, 4 * SIZE(AO)
bgt L, $L96
.align 4
/* $L98: fold the pending products, then solve and store the four elements. */
$L98:
ADD c01, t1, c01
ADD c02, t2, c02
ADD c03, t3, c03
ADD c04, t4, c04
#if defined(LN) || defined(RT)
/* Rebuild AO/BO at offset KK minus the tile extent (4 for LN, 1 for RT);
   A is scaled by 4 elements, B by 1. */
#ifdef LN
subq KK, 4, TMP1
#else
subq KK, 1, TMP1
#endif
sll TMP1, BASE_SHIFT + 2, TMP2
addq AORIG, TMP2, AO
sll TMP1, BASE_SHIFT + 0, TMP2
addq B, TMP2, BO
#endif
/* c = rhs - c; rhs comes from BO (left side) or AO (right side). */
#if defined(LN) || defined(LT)
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
SUB a1, c01, c01
SUB a2, c02, c02
SUB a3, c03, c03
SUB a4, c04, c04
#else
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
SUB a1, c01, c01
SUB a2, c02, c02
SUB a3, c03, c03
SUB a4, c04, c04
#endif
/* Triangular solve.  Diagonal entries are applied with MUL.
   NOTE(review): presumably stored pre-inverted - confirm against the
   packing routine. */
#ifdef LN
/* Backward substitution over the 4x4 block at AO, row 4 up to row 1.
   Row 4 uses entries 15, 14, 13, 12. */
LD a1, 15 * SIZE(AO)
LD a2, 14 * SIZE(AO)
LD a3, 13 * SIZE(AO)
LD a4, 12 * SIZE(AO)
MUL a1, c04, c04
MUL a2, c04, t1
SUB c03, t1, c03
MUL a3, c04, t1
SUB c02, t1, c02
MUL a4, c04, t1
SUB c01, t1, c01
/* Row 3: entries 10, 9, 8. */
LD b1, 10 * SIZE(AO)
LD b2, 9 * SIZE(AO)
LD b3, 8 * SIZE(AO)
MUL b1, c03, c03
MUL b2, c03, t1
SUB c02, t1, c02
MUL b3, c03, t1
SUB c01, t1, c01
/* Rows 2 and 1: entries 5, 4 and 0. */
LD a1, 5 * SIZE(AO)
LD a2, 4 * SIZE(AO)
LD a3, 0 * SIZE(AO)
MUL a1, c02, c02
MUL a2, c02, t1
SUB c01, t1, c01
MUL a3, c01, c01
#endif
#ifdef LT
/* Forward substitution over the 4x4 block at AO, row 1 down to row 4.
   Row 1 uses entries 0, 1, 2, 3. */
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
MUL a1, c01, c01
MUL a2, c01, t1
SUB c02, t1, c02
MUL a3, c01, t1
SUB c03, t1, c03
MUL a4, c01, t1
SUB c04, t1, c04
/* Row 2: entries 5, 6, 7. */
LD b1, 5 * SIZE(AO)
LD b2, 6 * SIZE(AO)
LD b3, 7 * SIZE(AO)
MUL b1, c02, c02
MUL b2, c02, t1
SUB c03, t1, c03
MUL b3, c02, t1
SUB c04, t1, c04
/* Rows 3 and 4: entries 10, 11 and 15. */
LD a1, 10 * SIZE(AO)
LD a2, 11 * SIZE(AO)
LD a3, 15 * SIZE(AO)
MUL a1, c03, c03
MUL a2, c03, t1
SUB c04, t1, c04
MUL a3, c04, c04
#endif
#if defined(RN) || defined(RT)
/* Right side: one column, so all four rows are scaled by the entry at BO. */
LD a1, 0 * SIZE(BO)
MUL a1, c01, c01
MUL a1, c02, c02
MUL a1, c03, c03
MUL a1, c04, c04
#endif
/* Write the results back to the packed buffer and to C. */
#if defined(LN) || defined(LT)
ST c01, 0 * SIZE(BO)
ST c02, 1 * SIZE(BO)
ST c03, 2 * SIZE(BO)
ST c04, 3 * SIZE(BO)
#else
ST c01, 0 * SIZE(AO)
ST c02, 1 * SIZE(AO)
ST c03, 2 * SIZE(AO)
ST c04, 3 * SIZE(AO)
#endif
#ifdef LN
lda C1, -4 * SIZE(C1)
#endif
ST c01, 0 * SIZE(C1)
ST c02, 1 * SIZE(C1)
ST c03, 2 * SIZE(C1)
ST c04, 3 * SIZE(C1)
#ifndef LN
lda C1, 4 * SIZE(C1)
#endif
/* Clear the product temporaries for the next tile. */
fclr t1
fclr t2
fclr t3
fclr t4
#ifdef RT
/* AORIG advances by 4*K elements. */
sll K, 2 + BASE_SHIFT, TMP1
addq AORIG, TMP1, AORIG
#endif
#if defined(LT) || defined(RN)
/* Skip the remaining K-KK steps: 4 elements per step in A, 1 in B. */
subq K, KK, TMP1
sll TMP1, BASE_SHIFT + 2, TMP2
addq AO, TMP2, AO
sll TMP1, BASE_SHIFT + 0, TMP2
addq BO, TMP2, BO
#endif
#ifdef LT
addq KK, 4, KK
#endif
#ifdef LN
subq KK, 4, KK
#endif
/* Next 4-row tile. */
lda I, -1(I)
bgt I, $L91
.align 4
/* $L119: end of the single-column panel.  Moves B past the panel and
   adjusts KK for the right-side variants. */
$L119:
#ifdef LN
/* NOTE(review): SXADDQ is a macro defined outside this view; by analogy
   with the two-column epilogue it presumably advances B by K elements. */
SXADDQ K, B, B
#endif
#if defined(LT) || defined(RN)
/* BO already points past the panel that was just consumed. */
mov BO, B
#endif
#ifdef RN
addq KK, 1, KK
#endif
#ifdef RT
subq KK, 1, KK
#endif
.align 4
/* $L999: common exit.  Reloads floating-point registers $f2..$f9 from the
   eight quadword slots at the bottom of the stack frame, sets the integer
   return register $0 to zero, releases the STACKSIZE-byte frame and
   returns. */
$L999:
ldt $f2, 0($sp)
ldt $f3, 8($sp)
ldt $f4, 16($sp)
ldt $f5, 24($sp)
ldt $f6, 32($sp)
ldt $f7, 40($sp)
ldt $f8, 48($sp)
ldt $f9, 56($sp)
clr $0
lda $sp, STACKSIZE($sp)
ret
EPILOGUE