/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
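/* Complex GEMV kernel for DEC Alpha (apparently the non-transposed  */
/* variant, judging by the column-wise access): y += alpha * A * x.  */
/* When y is not contiguous, the sum is accumulated in BUFFER and    */
/* copied back into the strided y at $L990.                          */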
#define ASSEMBLER
#include "common.h"
#include "version.h"
#define STACKSIZE 64
#define PREFETCHSIZE 32
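/* Integer register aliases: BLAS arguments, loop counters, and the  */
/* running column/row pointers.                                      */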
#define M $16
#define N $17
#define A $21
#define LDA $18
#define X $19
#define INCX $20
#define Y $22
#define INCY $23
#define BUFFER $24
#define I $25
#define J $27
#define Y1 $4
#define A1 $5
#define A2 $6
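/* FP register aliases: alpha, the alpha-scaled x elements           */
/* (alpha1..alpha4), y accumulators, A elements, and temporaries.    */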
#define alpha_r $f19
#define alpha_i $f20
#define alpha1 $f0
#define alpha2 $f1
#define alpha3 $f10
#define alpha4 $f11
#define y0 $f12
#define y1 $f13
#define y2 $f14
#define y3 $f15
#define y4 $f16
#define y5 $f17
#define y6 $f18
#define y7 $f21
#define a0 $f22
#define a1 $f23
#define a2 $f24
#define a3 $f25
#define a4 $f26
#define a5 $f27
#define a6 $f28
#define a7 $f29
#define t0 $f2
#define t1 $f3
#define t2 $f4
#define t3 $f5
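/* ADD1..ADD4 select add or subtract so one multiply/accumulate      */
/* sequence covers all four CONJ/XCONJ sign combinations of the      */
/* complex products.                                                 */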
#if !defined(CONJ) && !defined(XCONJ)
#define ADD1 ADD
#define ADD2 ADD
#define ADD3 SUB
#define ADD4 ADD
#elif defined(CONJ) && !defined(XCONJ)
#define ADD1 ADD
#define ADD2 SUB
#define ADD3 ADD
#define ADD4 ADD
#elif !defined(CONJ) && defined(XCONJ)
#define ADD1 ADD
#define ADD2 ADD
#define ADD3 ADD
#define ADD4 SUB
#else
#define ADD1 ADD
#define ADD2 SUB
#define ADD3 SUB
#define ADD4 SUB
#endif
PROLOGUE
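/* Allocate the frame, load the stack-passed arguments (args 7..12   */
/* of the Alpha calling convention), and save callee-saved $f2-$f9.  */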
lda $sp, -STACKSIZE($sp)
ldq LDA, 0 + STACKSIZE($sp)
ldq X, 8 + STACKSIZE($sp)
ldq INCX, 16 + STACKSIZE($sp)
ldq Y, 24 + STACKSIZE($sp)
ldq INCY, 32 + STACKSIZE($sp)
ldq BUFFER, 40 + STACKSIZE($sp)
stt $f2, 0($sp)
stt $f3, 8($sp)
stt $f4, 16($sp)
stt $f5, 24($sp)
stt $f6, 32($sp)
stt $f7, 40($sp)
stt $f8, 48($sp)
stt $f9, 56($sp)
PROFCODE
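/* Quick return if M <= 0 or N <= 0; convert INCX/INCY (and LDA      */
/* below) from complex-element counts to byte offsets.               */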
cmple M, 0, $0
sll INCX, ZBASE_SHIFT, INCX
cmple N, 0, $1
sll INCY, ZBASE_SHIFT, INCY
or $0, $1, $0
bne $0, $L999
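/* If y does not have unit complex stride, swap Y and BUFFER so the  */
/* kernel accumulates into a contiguous buffer instead.              */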
cmpeq INCY, 2 * SIZE, $0
sll LDA, ZBASE_SHIFT, LDA
bne $0, $L10
mov BUFFER, Y1
mov Y, BUFFER
mov Y1, Y
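/* Clear the contiguous buffer, four complex elements per pass. */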
sra M, 2, I
ble I, $L05
.align 4
$L02:
ST $f31, 0 * SIZE(Y1)
ST $f31, 1 * SIZE(Y1)
ST $f31, 2 * SIZE(Y1)
ST $f31, 3 * SIZE(Y1)
ST $f31, 4 * SIZE(Y1)
ST $f31, 5 * SIZE(Y1)
ST $f31, 6 * SIZE(Y1)
ST $f31, 7 * SIZE(Y1)
lda Y1, 8 * SIZE(Y1)
lda I, -1(I)
bgt I, $L02
.align 4
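/* Clear the remaining M mod 4 complex elements. */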
$L05:
and M, 3, I
ble I, $L10
.align 4
$L06:
ST $f31, 0 * SIZE(Y1)
ST $f31, 1 * SIZE(Y1)
addq Y1, 2 * SIZE, Y1
lda I, -1(I)
bgt I, $L06
.align 4
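/* Main loop over column pairs of A (J = N / 2). */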
$L10:
sra N, 1, J
ble J, $L20
.align 4
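/* Load two elements of x and form alpha * x[j] and alpha * x[j+1];  */
/* alpha1/alpha2 and alpha3/alpha4 hold their real/imaginary parts   */
/* (the sign of the imaginary contribution flips under XCONJ).       */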
$L11:
LD alpha1, 0 * SIZE(X)
LD alpha2, 1 * SIZE(X)
addq X, INCX, X
LD alpha3, 0 * SIZE(X)
LD alpha4, 1 * SIZE(X)
addq X, INCX, X
MUL alpha_r, alpha1, y0
MUL alpha_r, alpha2, y1
MUL alpha_r, alpha3, y2
MUL alpha_r, alpha4, y3
MUL alpha_i, alpha2, t0
mov A, A1
MUL alpha_i, alpha1, t1
addq A, LDA, A2
MUL alpha_i, alpha4, t2
addq A2, LDA, A
MUL alpha_i, alpha3, t3
mov Y, Y1
#ifndef XCONJ
SUB y0, t0, alpha1
ADD y1, t1, alpha2
SUB y2, t2, alpha3
ADD y3, t3, alpha4
#else
ADD y0, t0, alpha1
SUB y1, t1, alpha2
ADD y2, t2, alpha3
SUB y3, t3, alpha4
#endif
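/* The load into $31 is a prefetch hint for x.  Then prime the       */
/* software pipeline: preload A1/A2 and y for the first              */
/* unrolled-by-4 iteration.                                          */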
ldl $31, 4 * SIZE(X)
sra M, 2, I
ble I, $L15
LD a0, 0 * SIZE(A1)
LD a1, 1 * SIZE(A1)
LD a2, 2 * SIZE(A1)
LD a3, 3 * SIZE(A1)
LD a4, 0 * SIZE(A2)
LD a5, 1 * SIZE(A2)
LD a6, 2 * SIZE(A2)
LD a7, 3 * SIZE(A2)
MUL alpha1, a0, t0
LD y0, 0 * SIZE(Y1)
MUL alpha1, a1, t1
LD y1, 1 * SIZE(Y1)
MUL alpha1, a2, t2
LD y2, 2 * SIZE(Y1)
MUL alpha1, a3, t3
LD y3, 3 * SIZE(Y1)
ADD1 y0, t0, y0
unop
MUL alpha3, a4, t0
LD y4, 4 * SIZE(Y1)
ADD2 y1, t1, y1
unop
MUL alpha3, a5, t1
LD y5, 5 * SIZE(Y1)
ADD1 y2, t2, y2
unop
MUL alpha3, a6, t2
LD y6, 6 * SIZE(Y1)
ADD2 y3, t3, y3
unop
MUL alpha3, a7, t3
LD y7, 7 * SIZE(Y1)
ADD1 y0, t0, y0
unop
MUL alpha2, a1, t0
LD a1, 5 * SIZE(A1)
ADD2 y1, t1, y1
unop
MUL alpha2, a0, t1
LD a0, 4 * SIZE(A1)
ADD1 y2, t2, y2
unop
MUL alpha2, a3, t2
LD a3, 7 * SIZE(A1)
ADD2 y3, t3, y3
unop
MUL alpha2, a2, t3
LD a2, 6 * SIZE(A1)
ADD3 y0, t0, y0
unop
MUL alpha4, a5, t0
LD a5, 5 * SIZE(A2)
ADD4 y1, t1, y1
unop
MUL alpha4, a4, t1
LD a4, 4 * SIZE(A2)
ADD3 y2, t2, y2
unop
MUL alpha4, a7, t2
LD a7, 7 * SIZE(A2)
ADD4 y3, t3, y3
unop
MUL alpha4, a6, t3
LD a6, 6 * SIZE(A2)
ADD3 y0, t0, y0
MUL alpha1, a0, t0
ADD4 y1, t1, y1
MUL alpha1, a1, t1
ADD3 y2, t2, y2
unop
MUL alpha1, a2, t2
unop
ADD4 y3, t3, y3
lda I, -1(I)
MUL alpha1, a3, t3
ble I, $L13
.align 4
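/* Unrolled inner loop: complex y[i..i+3] += alpha*x[j] * A1[i..i+3] */
/* + alpha*x[j+1] * A2[i..i+3], with the next iteration's loads and  */
/* prefetches (loads into $31/$f31) interleaved.                     */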
$L12:
ADD1 y4, t0, y4
ST y0, 0 * SIZE(Y1)
MUL alpha3, a4, t0
ldl $31, (PREFETCHSIZE + 0) * SIZE(A1)
ADD2 y5, t1, y5
ST y1, 1 * SIZE(Y1)
MUL alpha3, a5, t1
lda I, -1(I)
ADD1 y6, t2, y6
ST y2, 2 * SIZE(Y1)
MUL alpha3, a6, t2
unop
ADD2 y7, t3, y7
ST y3, 3 * SIZE(Y1)
MUL alpha3, a7, t3
unop
ADD1 y4, t0, y4
unop
MUL alpha2, a1, t0
LD a1, 9 * SIZE(A1)
ADD2 y5, t1, y5
unop
MUL alpha2, a0, t1
LD a0, 8 * SIZE(A1)
ADD1 y6, t2, y6
unop
MUL alpha2, a3, t2
LD a3, 11 * SIZE(A1)
ADD2 y7, t3, y7
unop
MUL alpha2, a2, t3
LD a2, 10 * SIZE(A1)
ADD3 y4, t0, y4
lds $f31, (PREFETCHSIZE + 0) * SIZE(Y1)
MUL alpha4, a5, t0
LD a5, 9 * SIZE(A2)
ADD4 y5, t1, y5
unop
MUL alpha4, a4, t1
LD a4, 8 * SIZE(A2)
ADD3 y6, t2, y6
unop
MUL alpha4, a7, t2
LD a7, 11 * SIZE(A2)
ADD4 y7, t3, y7
unop
MUL alpha4, a6, t3
LD a6, 10 * SIZE(A2)
ADD3 y4, t0, y4
unop
MUL alpha1, a0, t0
LD y0, 8 * SIZE(Y1)
ADD4 y5, t1, y5
unop
MUL alpha1, a1, t1
LD y1, 9 * SIZE(Y1)
ADD3 y6, t2, y6
unop
MUL alpha1, a2, t2
LD y2, 10 * SIZE(Y1)
ADD4 y7, t3, y7
unop
MUL alpha1, a3, t3
LD y3, 11 * SIZE(Y1)
ADD1 y0, t0, y0
ST y4, 4 * SIZE(Y1)
MUL alpha3, a4, t0
ldl $31, (PREFETCHSIZE + 0) * SIZE(A2)
ADD2 y1, t1, y1
ST y5, 5 * SIZE(Y1)
MUL alpha3, a5, t1
unop
ADD1 y2, t2, y2
ST y6, 6 * SIZE(Y1)
MUL alpha3, a6, t2
unop
ADD2 y3, t3, y3
ST y7, 7 * SIZE(Y1)
MUL alpha3, a7, t3
lda Y1, 8 * SIZE(Y1)
ADD1 y0, t0, y0
unop
MUL alpha2, a1, t0
LD a1, 13 * SIZE(A1)
ADD2 y1, t1, y1
unop
MUL alpha2, a0, t1
LD a0, 12 * SIZE(A1)
ADD1 y2, t2, y2
unop
MUL alpha2, a3, t2
LD a3, 15 * SIZE(A1)
ADD2 y3, t3, y3
unop
MUL alpha2, a2, t3
LD a2, 14 * SIZE(A1)
ADD3 y0, t0, y0
unop
MUL alpha4, a5, t0
LD a5, 13 * SIZE(A2)
ADD4 y1, t1, y1
unop
MUL alpha4, a4, t1
LD a4, 12 * SIZE(A2)
ADD3 y2, t2, y2
unop
MUL alpha4, a7, t2
LD a7, 15 * SIZE(A2)
ADD4 y3, t3, y3
unop
MUL alpha4, a6, t3
LD a6, 14 * SIZE(A2)
ADD3 y0, t0, y0
unop
MUL alpha1, a0, t0
LD y4, 4 * SIZE(Y1)
ADD4 y1, t1, y1
lda A2, 8 * SIZE(A2)
MUL alpha1, a1, t1
LD y5, 5 * SIZE(Y1)
ADD3 y2, t2, y2
lda A1, 8 * SIZE(A1)
MUL alpha1, a2, t2
LD y6, 6 * SIZE(Y1)
ADD4 y3, t3, y3
MUL alpha1, a3, t3
LD y7, 7 * SIZE(Y1)
bgt I, $L12
.align 4
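/* Drain the pipeline: finish the last four elements of this column pair. */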
$L13:
ADD1 y4, t0, y4
ST y0, 0 * SIZE(Y1)
MUL alpha3, a4, t0
unop
ADD2 y5, t1, y5
ST y1, 1 * SIZE(Y1)
MUL alpha3, a5, t1
unop
ADD1 y6, t2, y6
ST y2, 2 * SIZE(Y1)
MUL alpha3, a6, t2
unop
ADD2 y7, t3, y7
ST y3, 3 * SIZE(Y1)
MUL alpha3, a7, t3
unop
ADD1 y4, t0, y4
MUL alpha2, a1, t0
ADD2 y5, t1, y5
MUL alpha2, a0, t1
ADD1 y6, t2, y6
MUL alpha2, a3, t2
ADD2 y7, t3, y7
MUL alpha2, a2, t3
ADD3 y4, t0, y4
MUL alpha4, a5, t0
ADD4 y5, t1, y5
MUL alpha4, a4, t1
ADD3 y6, t2, y6
MUL alpha4, a7, t2
ADD4 y7, t3, y7
MUL alpha4, a6, t3
ADD3 y4, t0, y4
ADD4 y5, t1, y5
ADD3 y6, t2, y6
ADD4 y7, t3, y7
ST y4, 4 * SIZE(Y1)
lda A1, 8 * SIZE(A1)
ST y5, 5 * SIZE(Y1)
lda A2, 8 * SIZE(A2)
ST y6, 6 * SIZE(Y1)
unop
ST y7, 7 * SIZE(Y1)
lda Y1, 8 * SIZE(Y1)
.align 4
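/* Remainder: two complex elements when M & 2. */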
$L15:
and M, 2, I
ble I, $L17
LD a0, 0 * SIZE(A1)
LD a1, 1 * SIZE(A1)
LD a2, 2 * SIZE(A1)
LD a3, 3 * SIZE(A1)
LD a4, 0 * SIZE(A2)
LD a5, 1 * SIZE(A2)
LD a6, 2 * SIZE(A2)
LD a7, 3 * SIZE(A2)
MUL alpha1, a0, t0
LD y0, 0 * SIZE(Y1)
MUL alpha1, a1, t1
LD y1, 1 * SIZE(Y1)
MUL alpha1, a2, t2
LD y2, 2 * SIZE(Y1)
MUL alpha1, a3, t3
LD y3, 3 * SIZE(Y1)
ADD1 y0, t0, y0
MUL alpha3, a4, t0
ADD2 y1, t1, y1
MUL alpha3, a5, t1
ADD1 y2, t2, y2
MUL alpha3, a6, t2
ADD2 y3, t3, y3
MUL alpha3, a7, t3
ADD1 y0, t0, y0
MUL alpha2, a1, t0
ADD2 y1, t1, y1
MUL alpha2, a0, t1
ADD1 y2, t2, y2
MUL alpha2, a3, t2
ADD2 y3, t3, y3
MUL alpha2, a2, t3
ADD3 y0, t0, y0
MUL alpha4, a5, t0
ADD4 y1, t1, y1
MUL alpha4, a4, t1
ADD3 y2, t2, y2
MUL alpha4, a7, t2
ADD4 y3, t3, y3
MUL alpha4, a6, t3
ADD3 y0, t0, y0
ADD4 y1, t1, y1
ADD3 y2, t2, y2
ADD4 y3, t3, y3
ST y0, 0 * SIZE(Y1)
lda A1, 4 * SIZE(A1)
ST y1, 1 * SIZE(Y1)
lda A2, 4 * SIZE(A2)
ST y2, 2 * SIZE(Y1)
unop
ST y3, 3 * SIZE(Y1)
lda Y1, 4 * SIZE(Y1)
.align 4
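/* Remainder: one complex element when M is odd. */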
$L17:
blbc M, $L18
LD a0, 0 * SIZE(A1)
LD a1, 1 * SIZE(A1)
LD a2, 0 * SIZE(A2)
LD a3, 1 * SIZE(A2)
LD y0, 0 * SIZE(Y1)
LD y1, 1 * SIZE(Y1)
MUL alpha1, a0, t0
MUL alpha1, a1, t1
ADD1 y0, t0, y0
MUL alpha3, a2, t0
ADD2 y1, t1, y1
MUL alpha3, a3, t1
ADD1 y0, t0, y0
MUL alpha2, a1, t0
ADD2 y1, t1, y1
MUL alpha2, a0, t1
ADD3 y0, t0, y0
MUL alpha4, a3, t0
ADD4 y1, t1, y1
MUL alpha4, a2, t1
ADD3 y0, t0, y0
ADD4 y1, t1, y1
ST y0, 0 * SIZE(Y1)
ST y1, 1 * SIZE(Y1)
.align 4
$L18:
lda J, -1(J)
bgt J, $L11
.align 4
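/* If N is odd, process the final single column with the same scheme. */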
$L20:
blbc N, $L990
LD alpha1, 0 * SIZE(X)
LD alpha2, 1 * SIZE(X)
MUL alpha_r, alpha1, y0
MUL alpha_r, alpha2, y1
MUL alpha_i, alpha2, t0
mov A, A1
MUL alpha_i, alpha1, t1
mov Y, Y1
#ifndef XCONJ
SUB y0, t0, alpha1
ADD y1, t1, alpha2
#else
ADD y0, t0, alpha1
SUB y1, t1, alpha2
#endif
sra M, 2, I
ble I, $L25
LD a0, 0 * SIZE(A1)
LD a1, 1 * SIZE(A1)
LD a2, 2 * SIZE(A1)
LD a3, 3 * SIZE(A1)
LD y0, 0 * SIZE(Y1)
LD y1, 1 * SIZE(Y1)
LD y2, 2 * SIZE(Y1)
LD y3, 3 * SIZE(Y1)
MUL alpha1, a0, t0
LD a4, 4 * SIZE(A1)
MUL alpha1, a1, t1
LD a5, 5 * SIZE(A1)
MUL alpha1, a2, t2
LD a6, 6 * SIZE(A1)
MUL alpha1, a3, t3
LD a7, 7 * SIZE(A1)
ADD1 y0, t0, y0
unop
MUL alpha2, a1, t0
LD a1, 9 * SIZE(A1)
ADD2 y1, t1, y1
unop
MUL alpha2, a0, t1
LD a0, 8 * SIZE(A1)
ADD1 y2, t2, y2
unop
MUL alpha2, a3, t2
LD a3, 11 * SIZE(A1)
ADD2 y3, t3, y3
unop
MUL alpha2, a2, t3
LD a2, 10 * SIZE(A1)
ADD3 y0, t0, y0
unop
LD y4, 4 * SIZE(Y1)
MUL alpha1, a4, t0
ADD4 y1, t1, y1
unop
LD y5, 5 * SIZE(Y1)
MUL alpha1, a5, t1
ADD3 y2, t2, y2
LD y6, 6 * SIZE(Y1)
MUL alpha1, a6, t2
lda I, -1(I)
ADD4 y3, t3, y3
LD y7, 7 * SIZE(Y1)
MUL alpha1, a7, t3
ble I, $L23
.align 4
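/* Single-column unrolled-by-4 loop:                                 */
/* complex y[i..i+3] += alpha*x[j] * A1[i..i+3].                     */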
$L22:
ADD1 y4, t0, y4
ST y0, 0 * SIZE(Y1)
MUL alpha2, a5, t0
LD a5, 13 * SIZE(A1)
ADD2 y5, t1, y5
ST y1, 1 * SIZE(Y1)
MUL alpha2, a4, t1
LD a4, 12 * SIZE(A1)
ADD1 y6, t2, y6
ST y2, 2 * SIZE(Y1)
MUL alpha2, a7, t2
LD a7, 15 * SIZE(A1)
ADD2 y7, t3, y7
ST y3, 3 * SIZE(Y1)
MUL alpha2, a6, t3
LD a6, 14 * SIZE(A1)
ADD3 y4, t0, y4
LD y0, 8 * SIZE(Y1)
MUL alpha1, a0, t0
ldl $31, (PREFETCHSIZE + 0) * SIZE(A1)
ADD4 y5, t1, y5
LD y1, 9 * SIZE(Y1)
MUL alpha1, a1, t1
lda I, -1(I)
ADD3 y6, t2, y6
LD y2, 10 * SIZE(Y1)
MUL alpha1, a2, t2
unop
ADD4 y7, t3, y7
LD y3, 11 * SIZE(Y1)
MUL alpha1, a3, t3
unop
ADD1 y0, t0, y0
ST y4, 4 * SIZE(Y1)
MUL alpha2, a1, t0
LD a1, 17 * SIZE(A1)
ADD2 y1, t1, y1
ST y5, 5 * SIZE(Y1)
MUL alpha2, a0, t1
LD a0, 16 * SIZE(A1)
ADD1 y2, t2, y2
ST y6, 6 * SIZE(Y1)
MUL alpha2, a3, t2
LD a3, 19 * SIZE(A1)
ADD2 y3, t3, y3
ST y7, 7 * SIZE(Y1)
MUL alpha2, a2, t3
LD a2, 18 * SIZE(A1)
ADD3 y0, t0, y0
LD y4, 12 * SIZE(Y1)
MUL alpha1, a4, t0
ldl $31, (PREFETCHSIZE + 0) * SIZE(Y1)
ADD4 y1, t1, y1
LD y5, 13 * SIZE(Y1)
MUL alpha1, a5, t1
lda A1, 8 * SIZE(A1)
ADD3 y2, t2, y2
LD y6, 14 * SIZE(Y1)
MUL alpha1, a6, t2
lda Y1, 8 * SIZE(Y1)
ADD4 y3, t3, y3
LD y7, 7 * SIZE(Y1)
MUL alpha1, a7, t3
bgt I, $L22
.align 4
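/* Drain the single-column pipeline. */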
$L23:
ADD1 y4, t0, y4
ST y0, 0 * SIZE(Y1)
MUL alpha2, a5, t0
unop
ADD2 y5, t1, y5
ST y1, 1 * SIZE(Y1)
MUL alpha2, a4, t1
unop
ADD1 y6, t2, y6
ST y2, 2 * SIZE(Y1)
MUL alpha2, a7, t2
unop
ADD2 y7, t3, y7
ST y3, 3 * SIZE(Y1)
MUL alpha2, a6, t3
unop
ADD3 y4, t0, y4
ADD4 y5, t1, y5
ADD3 y6, t2, y6
ADD4 y7, t3, y7
ST y4, 4 * SIZE(Y1)
unop
ST y5, 5 * SIZE(Y1)
unop
ST y6, 6 * SIZE(Y1)
lda A1, 8 * SIZE(A1)
ST y7, 7 * SIZE(Y1)
lda Y1, 8 * SIZE(Y1)
.align 4
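/* Single-column remainder: two complex elements when M & 2. */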
$L25:
and M, 2, I
ble I, $L27
LD a0, 0 * SIZE(A1)
LD a1, 1 * SIZE(A1)
LD a2, 2 * SIZE(A1)
LD a3, 3 * SIZE(A1)
MUL alpha1, a0, t0
LD y0, 0 * SIZE(Y1)
MUL alpha1, a1, t1
LD y1, 1 * SIZE(Y1)
MUL alpha1, a2, t2
LD y2, 2 * SIZE(Y1)
MUL alpha1, a3, t3
LD y3, 3 * SIZE(Y1)
ADD1 y0, t0, y0
MUL alpha2, a1, t0
ADD2 y1, t1, y1
MUL alpha2, a0, t1
ADD1 y2, t2, y2
MUL alpha2, a3, t2
ADD2 y3, t3, y3
MUL alpha2, a2, t3
ADD3 y0, t0, y0
ADD4 y1, t1, y1
ADD3 y2, t2, y2
ADD4 y3, t3, y3
ST y0, 0 * SIZE(Y1)
ST y1, 1 * SIZE(Y1)
ST y2, 2 * SIZE(Y1)
lda A1, 4 * SIZE(A1)
ST y3, 3 * SIZE(Y1)
lda Y1, 4 * SIZE(Y1)
.align 4
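/* Single-column remainder: one complex element when M is odd. */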
$L27:
blbc M, $L990
LD a0, 0 * SIZE(A1)
LD a1, 1 * SIZE(A1)
MUL alpha1, a0, t0
LD y0, 0 * SIZE(Y1)
MUL alpha1, a1, t1
LD y1, 1 * SIZE(Y1)
ADD1 y0, t0, y0
MUL alpha2, a1, t0
ADD2 y1, t1, y1
MUL alpha2, a0, t1
ADD3 y0, t0, y0
ADD4 y1, t1, y1
ST y0, 0 * SIZE(Y1)
ST y1, 1 * SIZE(Y1)
.align 4
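/* Strided-y epilogue: after the pointer swap at the top, BUFFER and */
/* Y1 point at the original strided y while Y points at the          */
/* contiguous buffer; add the buffered result into y, four complex   */
/* elements per pass.                                                */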
$L990:
cmpeq INCY, 2 * SIZE, $0
bne $0, $L999
mov BUFFER, Y1
sra M, 2, I
ble I, $L995
.align 4
$L992:
LD a0, 0 * SIZE(BUFFER)
LD a1, 1 * SIZE(BUFFER)
addq BUFFER, INCY, BUFFER
LD a2, 0 * SIZE(BUFFER)
LD a3, 1 * SIZE(BUFFER)
addq BUFFER, INCY, BUFFER
LD y0, 0 * SIZE(Y)
LD y1, 1 * SIZE(Y)
LD y2, 2 * SIZE(Y)
LD y3, 3 * SIZE(Y)
LD a4, 0 * SIZE(BUFFER)
LD a5, 1 * SIZE(BUFFER)
addq BUFFER, INCY, BUFFER
LD a6, 0 * SIZE(BUFFER)
LD a7, 1 * SIZE(BUFFER)
addq BUFFER, INCY, BUFFER
LD y4, 4 * SIZE(Y)
LD y5, 5 * SIZE(Y)
LD y6, 6 * SIZE(Y)
LD y7, 7 * SIZE(Y)
ADD a0, y0, a0
ADD a1, y1, a1
ADD a2, y2, a2
ADD a3, y3, a3
ST a0, 0 * SIZE(Y1)
ADD a4, y4, a4
ST a1, 1 * SIZE(Y1)
ADD a5, y5, a5
addq Y1, INCY, Y1
ST a2, 0 * SIZE(Y1)
ADD a6, y6, a6
ST a3, 1 * SIZE(Y1)
ADD a7, y7, a7
addq Y1, INCY, Y1
ST a4, 0 * SIZE(Y1)
ST a5, 1 * SIZE(Y1)
addq Y1, INCY, Y1
ST a6, 0 * SIZE(Y1)
ST a7, 1 * SIZE(Y1)
addq Y1, INCY, Y1
lda I, -1(I)
lda Y, 8 * SIZE(Y)
bgt I, $L992
.align 4
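/* Copy-back remainder: M mod 4 complex elements. */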
$L995:
and M, 3, I
ble I, $L999
.align 4
$L996:
LD a0, 0 * SIZE(BUFFER)
LD a1, 1 * SIZE(BUFFER)
addq BUFFER, INCY, BUFFER
LD y0, 0 * SIZE(Y)
LD y1, 1 * SIZE(Y)
lda Y, 2 * SIZE(Y)
ADD a0, y0, a0
ADD a1, y1, a1
ST a0, 0 * SIZE(Y1)
ST a1, 1 * SIZE(Y1)
addq Y1, INCY, Y1
lda I, -1(I)
bgt I, $L996
.align 4
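/* Restore the callee-saved FP registers and return. */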
$L999:
ldt $f2, 0($sp)
ldt $f3, 8($sp)
ldt $f4, 16($sp)
ldt $f5, 24($sp)
ldt $f6, 32($sp)
ldt $f7, 40($sp)
ldt $f8, 48($sp)
ldt $f9, 56($sp)
lda $sp, STACKSIZE($sp)
ret
EPILOGUE