/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
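/* GER-style rank-1 update kernel for SPARC:                         */
/*                                                                   */
/*     A := alpha * x * y^T + A                                      */
/*                                                                   */
/* A is stored column by column with leading dimension LDA. When X   */
/* is strided it is first packed into a contiguous BUFFER so the     */
/* inner loop can run at unit stride.                                */
/*                                                                   */
/* For reference, a minimal C sketch of the same operation (the      */
/* parameter names here are illustrative, not the exact kernel       */
/* interface declared in common.h):                                  */
/*                                                                   */
/*     for (j = 0; j < n; j++) {                                     */
/*         FLOAT t = alpha * y[j * incy];                            */
/*         for (i = 0; i < m; i++)                                   */
/*             a[j * lda + i] += x[i * incx] * t;                    */
/*     }                                                             */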
#define ASSEMBLER
#include "common.h"
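/* Integer register names. The 32-bit build with a double alpha uses */
/* different argument slots than the other builds, so the pointer    */
/* and stride registers are assigned conditionally below.            */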
#define M %i0
#define N %i1
#if defined(DOUBLE) && !defined(__64BIT__)
#define X %i5
#define INCX %i2
#define Y %i3
#define INCY %i4
#else
#define X %i4
#define INCX %i5
#define Y %i2
#define INCY %i3
#endif
#define A %l0
#define LDA %l1
#define BUFFER %l2
#define I %l3
#define J %l4
#define A1 %o0
#define X1 %o2
#define XX %o3
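/* Floating-point register names: t1-t4 hold in-flight products,     */
/* x1-x8 the packed x elements, a1-a16 elements of the current       */
/* column of A, and y1 the alpha-scaled y element. In the DOUBLE     */
/* build each value occupies an even/odd %f register pair.           */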
#ifdef DOUBLE
#define t1 %f0
#define t2 %f2
#define t3 %f4
#define t4 %f6
#define x1 %f8
#define x2 %f10
#define x3 %f12
#define x4 %f14
#define x5 %f16
#define x6 %f18
#define x7 %f20
#define x8 %f22
#define a1 %f24
#define a2 %f26
#define a3 %f28
#define a4 %f30
#define a5 %f32
#define a6 %f34
#define a7 %f36
#define a8 %f38
#define a9 %f40
#define a10 %f42
#define a11 %f44
#define a12 %f46
#define a13 %f48
#define a14 %f50
#define a15 %f52
#define a16 %f54
#define y1 %f56
#define y2 %f58
#define ALPHA %f60
#else
#define t1 %f0
#define t2 %f1
#define t3 %f2
#define t4 %f3
#define x1 %f4
#define x2 %f5
#define x3 %f6
#define x4 %f7
#define x5 %f8
#define x6 %f9
#define x7 %f10
#define x8 %f11
#define a1 %f12
#define a2 %f13
#define a3 %f14
#define a4 %f15
#define a5 %f16
#define a6 %f17
#define a7 %f18
#define a8 %f19
#define a9 %f20
#define a10 %f21
#define a11 %f22
#define a12 %f23
#define a13 %f24
#define a14 %f25
#define a15 %f26
#define a16 %f27
#define y1 %f28
#define y2 %f29
#define ALPHA %f30
#endif
#define PREFETCHSIZE 60 /* prefetch distance, in elements */
PROLOGUE
SAVESP
nop
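/* The trailing arguments do not fit in registers; fetch Y, INCY, A, */
/* LDA and BUFFER from the caller's stack area. ALPHA is reloaded    */
/* from the stack on 32-bit builds and moved from its incoming       */
/* floating-point argument register on 64-bit builds.                */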
#ifndef __64BIT__
#ifdef DOUBLE
st %i3, [%sp + STACK_START + 16]
st %i4, [%sp + STACK_START + 20]
ld [%sp + STACK_START + 28], INCX
ld [%sp + STACK_START + 32], Y
ld [%sp + STACK_START + 36], INCY
ld [%sp + STACK_START + 40], A
ld [%sp + STACK_START + 44], LDA
ld [%sp + STACK_START + 48], BUFFER
#else
st %i3, [%sp + STACK_START + 16]
ld [%sp + STACK_START + 28], Y
ld [%sp + STACK_START + 32], INCY
ld [%sp + STACK_START + 36], A
ld [%sp + STACK_START + 40], LDA
ld [%sp + STACK_START + 44], BUFFER
#endif
LDF [%sp + STACK_START + 16], ALPHA
#else
ldx [%sp + STACK_START + 56], Y
ldx [%sp + STACK_START + 64], INCY
ldx [%sp + STACK_START + 72], A
ldx [%sp + STACK_START + 80], LDA
ldx [%sp + STACK_START + 88], BUFFER
#ifdef DOUBLE
FMOV %f6, ALPHA
#else
FMOV %f7, ALPHA
#endif
#endif
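/* Convert the strides from elements to bytes and exit early when    */
/* M <= 0 or N <= 0. The shifts are scheduled into the branch delay  */
/* slots.                                                            */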
sll LDA, BASE_SHIFT, LDA
cmp M, 0
ble %icc, .LL999
sll INCX, BASE_SHIFT, INCX
cmp N, 0
ble %icc, .LL999
sll INCY, BASE_SHIFT, INCY
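/* If X is already contiguous (INCX == SIZE) use it in place; the    */
/* delay-slot mov sets XX = X. Otherwise pack X into BUFFER first.   */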
cmp INCX, SIZE
be %icc, .LL10
mov X, XX
mov BUFFER, XX
mov BUFFER, X1
sra M, 3, J
cmp J, 0
ble,pn %icc, .LL05
nop
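/* Pack eight strided elements of X per iteration into the buffer.   */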
.LL01:
LDF [X], a1
add X, INCX, X
LDF [X], a2
add X, INCX, X
LDF [X], a3
add X, INCX, X
LDF [X], a4
add X, INCX, X
LDF [X], a5
add X, INCX, X
LDF [X], a6
add X, INCX, X
LDF [X], a7
add X, INCX, X
LDF [X], a8
add X, INCX, X
STF a1, [X1 + 0 * SIZE]
STF a2, [X1 + 1 * SIZE]
STF a3, [X1 + 2 * SIZE]
STF a4, [X1 + 3 * SIZE]
STF a5, [X1 + 4 * SIZE]
STF a6, [X1 + 5 * SIZE]
STF a7, [X1 + 6 * SIZE]
STF a8, [X1 + 7 * SIZE]
add X1, 8 * SIZE, X1
deccc J
bg,pn %icc, .LL01
nop
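/* Copy the remaining M mod 8 elements one at a time.                */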
.LL05:
andcc M, 7, J
ble,pn %icc, .LL10
nop
.LL06:
LDF [X], a1
add X, INCX, X
STF a1, [X1 + 0 * SIZE]
add X1, 1 * SIZE, X1
deccc J
bg,pn %icc, .LL06
nop
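/* Outer loop: one pass over X per column of A, N columns in total.  */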
.LL10:
mov N, J
cmp N, 0
ble,pn %icc, .LL999
nop
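/* Per-column setup: A1 points at column j, A advances by LDA, and   */
/* the current y element is scaled once (y1 = alpha * Y[j]).         */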
.LL11:
mov XX, X1
mov A, A1
add A, LDA, A
LDF [Y], y1
add Y, INCY, Y
FMUL ALPHA, y1, y1
sra M, 3, I
cmp I, 0
ble,pn %icc, .LL15
nop
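/* Warm up the software pipeline: preload the first eight x and A    */
/* elements and issue the first round of multiplies.                 */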
LDF [X1 + 0 * SIZE], x1
LDF [A1 + 0 * SIZE], a1
LDF [X1 + 1 * SIZE], x2
LDF [A1 + 1 * SIZE], a2
LDF [X1 + 2 * SIZE], x3
LDF [A1 + 2 * SIZE], a3
LDF [X1 + 3 * SIZE], x4
LDF [A1 + 3 * SIZE], a4
LDF [X1 + 4 * SIZE], x5
LDF [A1 + 4 * SIZE], a5
LDF [X1 + 5 * SIZE], x6
LDF [A1 + 5 * SIZE], a6
LDF [X1 + 6 * SIZE], x7
LDF [A1 + 6 * SIZE], a7
LDF [X1 + 7 * SIZE], x8
LDF [A1 + 7 * SIZE], a8
FMUL x1, y1, t1
FMUL x2, y1, t2
FMUL x3, y1, t3
FMUL x4, y1, t4
FADD a1, t1, a1
FMUL x5, y1, t1
FADD a2, t2, a2
FMUL x6, y1, t2
deccc I
ble,pn %icc, .LL13
nop
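/* Main loop: eight elements per iteration, with loads, multiplies,  */
/* adds and stores overlapped across iterations and a prefetch of    */
/* the A column PREFETCHSIZE elements ahead.                         */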
.LL12:
prefetch [A1 + PREFETCHSIZE * SIZE], 0
FADD a3, t3, a3
LDF [X1 + 8 * SIZE], x1
FMUL x7, y1, t3
LDF [X1 + 9 * SIZE], x2
FADD a4, t4, a4
LDF [X1 + 10 * SIZE], x3
FMUL x8, y1, t4
LDF [X1 + 11 * SIZE], x4
FADD a5, t1, a5
STF a1, [A1 + 0 * SIZE]
LDF [A1 + 8 * SIZE], a1
FMUL x1, y1, t1
STF a2, [A1 + 1 * SIZE]
LDF [A1 + 9 * SIZE], a2
FADD a6, t2, a6
STF a3, [A1 + 2 * SIZE]
LDF [A1 + 10 * SIZE], a3
FMUL x2, y1, t2
STF a4, [A1 + 3 * SIZE]
LDF [A1 + 11 * SIZE], a4
FADD a7, t3, a7
LDF [X1 + 12 * SIZE], x5
FMUL x3, y1, t3
LDF [X1 + 13 * SIZE], x6
FADD a8, t4, a8
LDF [X1 + 14 * SIZE], x7
FMUL x4, y1, t4
LDF [X1 + 15 * SIZE], x8
FADD a1, t1, a1
STF a5, [A1 + 4 * SIZE]
deccc I
LDF [A1 + 12 * SIZE], a5
FMUL x5, y1, t1
STF a6, [A1 + 5 * SIZE]
LDF [A1 + 13 * SIZE], a6
FADD a2, t2, a2
STF a7, [A1 + 6 * SIZE]
LDF [A1 + 14 * SIZE], a7
FMUL x6, y1, t2
STF a8, [A1 + 7 * SIZE]
LDF [A1 + 15 * SIZE], a8
add A1, 8 * SIZE, A1
bg,pn %icc, .LL12
add X1, 8 * SIZE, X1
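/* Drain the pipeline: finish the outstanding adds and store the     */
/* last eight results of the unrolled loop.                          */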
.LL13:
FADD a3, t3, a3
FMUL x7, y1, t3
FADD a4, t4, a4
FMUL x8, y1, t4
FADD a5, t1, a5
FADD a6, t2, a6
FADD a7, t3, a7
FADD a8, t4, a8
STF a1, [A1 + 0 * SIZE]
STF a2, [A1 + 1 * SIZE]
STF a3, [A1 + 2 * SIZE]
STF a4, [A1 + 3 * SIZE]
STF a5, [A1 + 4 * SIZE]
STF a6, [A1 + 5 * SIZE]
STF a7, [A1 + 6 * SIZE]
STF a8, [A1 + 7 * SIZE]
add A1, 8 * SIZE, A1
add X1, 8 * SIZE, X1
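/* Tail: handle the M mod 8 leftover rows in blocks of 4, 2 and 1.   */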
.LL15:
andcc M, 4, I
ble,pn %icc, .LL16
nop
LDF [X1 + 0 * SIZE], x1
LDF [A1 + 0 * SIZE], a1
LDF [X1 + 1 * SIZE], x2
LDF [A1 + 1 * SIZE], a2
LDF [X1 + 2 * SIZE], x3
LDF [A1 + 2 * SIZE], a3
LDF [X1 + 3 * SIZE], x4
LDF [A1 + 3 * SIZE], a4
FMUL x1, y1, t1
FMUL x2, y1, t2
FMUL x3, y1, t3
FMUL x4, y1, t4
FADD a1, t1, a1
FADD a2, t2, a2
FADD a3, t3, a3
FADD a4, t4, a4
STF a1, [A1 + 0 * SIZE]
STF a2, [A1 + 1 * SIZE]
STF a3, [A1 + 2 * SIZE]
add X1, 4 * SIZE, X1
STF a4, [A1 + 3 * SIZE]
add A1, 4 * SIZE, A1
.LL16:
andcc M, 2, I
ble,pn %icc, .LL17
nop
LDF [X1 + 0 * SIZE], x1
LDF [X1 + 1 * SIZE], x2
LDF [A1 + 0 * SIZE], a1
LDF [A1 + 1 * SIZE], a2
FMUL x1, y1, t1
FMUL x2, y1, t2
FADD a1, t1, a1
FADD a2, t2, a2
STF a1, [A1 + 0 * SIZE]
add X1, 2 * SIZE, X1
STF a2, [A1 + 1 * SIZE]
add A1, 2 * SIZE, A1
.LL17:
andcc M, 1, I
ble,pn %icc, .LL19
nop
LDF [X1 + 0 * SIZE], x1
add X1, 1 * SIZE, X1
LDF [A1 + 0 * SIZE], a1
FMUL x1, y1, t1
FADD a1, t1, a1
STF a1, [A1 + 0 * SIZE]
add A1, 1 * SIZE, A1
.LL19:
deccc J
bg %icc, .LL11
nop
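/* All columns done: return 0 (the delay-slot clr zeroes %o0).       */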
.LL999:
return %i7 + 8
clr %o0
EPILOGUE