/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
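/* Complex GEMV kernel, transposed case:  y := alpha * A^T * x + y. */
/* Four columns of A are processed per outer iteration; the CONJ and */
/* XCONJ macros select the conjugation variant of the multiply-adds. */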
#define ASSEMBLER
#include "common.h"
#define P 1024
#ifndef __64BIT__
#define STACKSIZE 224
#else
#define STACKSIZE 304
#endif
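/* Incoming argument registers.  The assignments differ between the */
/* Linux and AIX/Darwin ABIs and between 32- and 64-bit; arguments  */
/* that do not fit in registers are loaded from the stack below.    */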
#ifdef linux
#ifndef __64BIT__
#define M r3
#define N r4
#define A r6
#define LDA r7
#define X r8
#define INCX r9
#define Y r10
#define INCY r5
#else
#define M r3
#define N r4
#define A r8
#define LDA r9
#define X r10
#define INCX r5
#define Y r6
#define INCY r7
#endif
#endif
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define M r3
#define N r4
#define A r10
#define LDA r5
#define X r6
#define INCX r7
#define Y r8
#define INCY r9
#else
#define M r3
#define N r4
#define A r8
#define LDA r9
#define X r10
#define INCX r5
#define Y r6
#define INCY r7
#endif
#endif
#define BUFFER r11
#define XP r12
#define X1 r14
#define J r15
#define AO1 r16
#define AO2 r17
#define AO3 r18
#define AO4 r19
#define PREA r20
#define PREC r21
#define YY r22
#if defined(PPCG4)
#define PREFETCHSIZE_A (3 * 8)
#define PREFETCHSIZE_C 7
#endif
#if defined(POWER6)
#define PREFETCHSIZE_A (3 * 8)
#define PREFETCHSIZE_C 7
#endif
#if !(defined(CONJ) && defined(XCONJ))
#define FMADDR FMADD
#define FMSUBR FNMSUB
#else
#define FMADDR FNMSUB
#define FMSUBR FMADD
#endif
#ifndef NEEDPARAM
#ifndef __64BIT__
#define FZERO 200(SP)
#else
#define FZERO 256(SP)
#endif
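/* FZERO: stack slot holding 0.0, used to clear the accumulators. */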
PROLOGUE
PROFCODE
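/* Allocate the stack frame, save the non-volatile FPRs and GPRs used */
/* below, and store 0.0 at FZERO.                                     */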
addi SP, SP, -STACKSIZE
li r0, 0
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
#ifdef __64BIT__
std r14, 144(SP)
std r15, 152(SP)
std r16, 160(SP)
std r17, 168(SP)
std r18, 176(SP)
std r19, 184(SP)
std r20, 192(SP)
std r21, 200(SP)
std r22, 208(SP)
std r0, FZERO
#else
stw r14, 144(SP)
stw r15, 148(SP)
stw r16, 152(SP)
stw r17, 156(SP)
stw r18, 160(SP)
stw r19, 164(SP)
stw r20, 168(SP)
stw r21, 172(SP)
stw r22, 176(SP)
stw r0, FZERO
stw r0, 4 + FZERO
#endif
#ifdef linux
#ifndef __64BIT__
lwz INCY, 8 + STACKSIZE(SP)
lwz BUFFER, 12 + STACKSIZE(SP)
#else
ld INCX, 112 + STACKSIZE(SP)
ld Y, 120 + STACKSIZE(SP)
ld INCY, 128 + STACKSIZE(SP)
ld BUFFER, 136 + STACKSIZE(SP)
#endif
#endif
#if defined(_AIX) || defined(__APPLE__)
#ifndef __64BIT__
#ifdef DOUBLE
lwz LDA, 56 + STACKSIZE(SP)
lwz X, 60 + STACKSIZE(SP)
lwz INCX, 64 + STACKSIZE(SP)
lwz Y, 68 + STACKSIZE(SP)
lwz INCY, 72 + STACKSIZE(SP)
lwz BUFFER, 76 + STACKSIZE(SP)
#else
lwz INCX, 56 + STACKSIZE(SP)
lwz Y, 60 + STACKSIZE(SP)
lwz INCY, 64 + STACKSIZE(SP)
lwz BUFFER, 68 + STACKSIZE(SP)
#endif
#else
ld INCX, 112 + STACKSIZE(SP)
ld Y, 120 + STACKSIZE(SP)
ld INCY, 128 + STACKSIZE(SP)
ld BUFFER, 136 + STACKSIZE(SP)
#endif
#endif
#ifndef XCONJ
#ifndef CONJ
#define FMADD1 FMADD
#define FMADD2 FMADD
#define FMADD3 FNMSUB
#define FMADD4 FMADD
#else
#define FMADD1 FMADD
#define FMADD2 FMADD
#define FMADD3 FMADD
#define FMADD4 FNMSUB
#endif
#else
#ifndef CONJ
#define FMADD1 FMADD
#define FMADD2 FNMSUB
#define FMADD3 FMADD
#define FMADD4 FMADD
#else
#define FMADD1 FMADD
#define FMADD2 FMADD
#define FMADD3 FNMSUB
#define FMADD4 FMADD
#endif
#endif
#define y1 f0
#define y2 f1
#define y3 f2
#define y4 f3
#define y5 f4
#define y6 f5
#define y7 f6
#define y8 f7
#define a1 f8
#define a2 f9
#define a3 f10
#define a4 f11
#define a5 f12
#define a6 f13
#define a7 f14
#define a8 f15
#define b1 f16
#define b2 f17
#define b3 f18
#define b4 f19
#define b5 f20
#define b6 f21
#define b7 f22
#define b8 f23
#define alpha_r f24
#define alpha_i f25
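/* Copy alpha, convert LDA/INCX/INCY from complex-element counts to   */
/* byte strides, and bias A, X and Y one element back so the update-  */
/* form loads and stores (LFDU/LFDUX, STFDU/STFDUX) hit the right     */
/* addresses.                                                         */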
fmr alpha_r, f1
fmr alpha_i, f2
slwi LDA, LDA, ZBASE_SHIFT
slwi INCX, INCX, ZBASE_SHIFT
slwi INCY, INCY, ZBASE_SHIFT
li PREA, PREFETCHSIZE_A * SIZE
li PREC, PREFETCHSIZE_C * SIZE
addi A, A, -SIZE
addi INCX, INCX, -SIZE
addi INCY, INCY, -SIZE
sub X, X, INCX
sub Y, Y, INCY
mr YY, Y
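/* Quick return for an empty problem. */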
cmpwi cr0, M, 0
ble LL(999)
cmpwi cr0, N, 0
ble LL(999)
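/* Use x in place when it is contiguous; otherwise pack it into BUFFER */
/* so the inner loops can use unit-stride loads.                       */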
mr XP, X
cmpwi cr0, INCX, SIZE
beq LL(10)
addi XP, BUFFER, -SIZE
addi X1, BUFFER, -SIZE
srawi. r0, M, 2
mtspr CTR, r0
ble LL(05)
.align 4
LL(02):
LFDUX f0, X, INCX
LFDU f1, 1 * SIZE(X)
LFDUX f2, X, INCX
LFDU f3, 1 * SIZE(X)
LFDUX f4, X, INCX
LFDU f5, 1 * SIZE(X)
LFDUX f6, X, INCX
LFDU f7, 1 * SIZE(X)
STFDU f0, 1 * SIZE(X1)
STFDU f1, 1 * SIZE(X1)
STFDU f2, 1 * SIZE(X1)
STFDU f3, 1 * SIZE(X1)
STFDU f4, 1 * SIZE(X1)
STFDU f5, 1 * SIZE(X1)
STFDU f6, 1 * SIZE(X1)
STFDU f7, 1 * SIZE(X1)
bdnz LL(02)
.align 4
LL(05):
andi. r0, M, 3
mtspr CTR, r0
ble LL(10)
.align 4
LL(06):
LFDUX f0, X, INCX
LFDU f1, 1 * SIZE(X)
STFDU f0, 1 * SIZE(X1)
STFDU f1, 1 * SIZE(X1)
bdnz LL(06)
.align 4
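/* Outer loop: process the columns of A four at a time (N unrolled by 4). */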
LL(10):
srawi. J, N, 2
ble LL(20)
.align 4
LL(11):
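/* Clear the eight accumulators and point AO1..AO4 at the next four columns. */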
lfd y1, FZERO
mr AO1, A
fmr y2, y1
mr X1, XP
fmr y3, y1
add AO2, A, LDA
fmr y4, y1
add AO3, AO2, LDA
fmr y5, y1
add AO4, AO3, LDA
fmr y6, y1
add A, AO4, LDA
fmr y7, y1
dcbtst PREC, Y
fmr y8, y1
srawi. r0, M, 2
mtspr CTR, r0
ble LL(15)
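/* Software-pipelined prologue: preload the first elements of the four */
/* columns and of x; LL(13) drains the final unrolled group.           */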
LFDU a1, 1 * SIZE(AO1)
LFDU b1, 1 * SIZE(X1)
LFDU a2, 1 * SIZE(AO1)
LFDU b2, 1 * SIZE(X1)
LFDU a3, 1 * SIZE(AO2)
LFDU a4, 1 * SIZE(AO2)
LFDU a5, 1 * SIZE(AO3)
LFDU a6, 1 * SIZE(AO3)
LFDU a7, 1 * SIZE(AO4)
bdz LL(13)
.align 5
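/* Inner loop over M, unrolled by four complex elements, with loads and */
/* prefetches interleaved into the multiply-adds.                       */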
LL(12):
FMADD1 y1, a1, b1, y1
LFDU a8, 1 * SIZE(AO4)
FMADD2 y2, a1, b2, y2
LFDU b3, 1 * SIZE(X1)
FMADD1 y3, a3, b1, y3
LFDU b4, 1 * SIZE(X1)
FMADD2 y4, a3, b2, y4
#ifdef PPCG4
dcbt AO1, PREA
#endif
FMADD3 y1, a2, b2, y1
LFDU a1, 1 * SIZE(AO1)
FMADD4 y2, a2, b1, y2
LFDU a2, 1 * SIZE(AO1)
FMADD3 y3, a4, b2, y3
LFDU a3, 1 * SIZE(AO2)
FMADD4 y4, a4, b1, y4
LFDU a4, 1 * SIZE(AO2)
#ifdef PPCG4
dcbt X1, PREA
#endif
FMADD1 y5, a5, b1, y5
FMADD2 y6, a5, b2, y6
FMADD1 y7, a7, b1, y7
FMADD2 y8, a7, b2, y8
#ifdef PPCG4
dcbt AO2, PREA
#endif
FMADD3 y5, a6, b2, y5
LFDU a5, 1 * SIZE(AO3)
FMADD4 y6, a6, b1, y6
LFDU a6, 1 * SIZE(AO3)
FMADD3 y7, a8, b2, y7
LFDU a7, 1 * SIZE(AO4)
FMADD4 y8, a8, b1, y8
LFDU a8, 1 * SIZE(AO4)
FMADD1 y1, a1, b3, y1
LFDU b1, 1 * SIZE(X1)
FMADD2 y2, a1, b4, y2
LFDU b2, 1 * SIZE(X1)
FMADD1 y3, a3, b3, y3
FMADD2 y4, a3, b4, y4
#ifdef PPCG4
dcbt AO3, PREA
#endif
FMADD3 y1, a2, b4, y1
LFDU a1, 1 * SIZE(AO1)
FMADD4 y2, a2, b3, y2
LFDU a2, 1 * SIZE(AO1)
FMADD3 y3, a4, b4, y3
LFDU a3, 1 * SIZE(AO2)
FMADD4 y4, a4, b3, y4
LFDU a4, 1 * SIZE(AO2)
FMADD1 y5, a5, b3, y5
FMADD2 y6, a5, b4, y6
FMADD1 y7, a7, b3, y7
FMADD2 y8, a7, b4, y8
#ifdef PPCG4
dcbt AO4, PREA
#endif
FMADD3 y5, a6, b4, y5
LFDU a5, 1 * SIZE(AO3)
FMADD4 y6, a6, b3, y6
LFDU a6, 1 * SIZE(AO3)
FMADD3 y7, a8, b4, y7
LFDU a7, 1 * SIZE(AO4)
FMADD4 y8, a8, b3, y8
LFDU a8, 1 * SIZE(AO4)
FMADD1 y1, a1, b1, y1
LFDU b3, 1 * SIZE(X1)
FMADD2 y2, a1, b2, y2
LFDU b4, 1 * SIZE(X1)
FMADD1 y3, a3, b1, y3
FMADD2 y4, a3, b2, y4
#if defined(PPCG4) && defined(DOUBLE)
dcbt AO1, PREA
#endif
FMADD3 y1, a2, b2, y1
LFDU a1, 1 * SIZE(AO1)
FMADD4 y2, a2, b1, y2
LFDU a2, 1 * SIZE(AO1)
FMADD3 y3, a4, b2, y3
LFDU a3, 1 * SIZE(AO2)
FMADD4 y4, a4, b1, y4
LFDU a4, 1 * SIZE(AO2)
#if defined(PPCG4) && defined(DOUBLE)
dcbt X1, PREA
#endif
FMADD1 y5, a5, b1, y5
FMADD2 y6, a5, b2, y6
FMADD1 y7, a7, b1, y7
FMADD2 y8, a7, b2, y8
#if defined(PPCG4) && defined(DOUBLE)
dcbt AO2, PREA
#endif
FMADD3 y5, a6, b2, y5
LFDU a5, 1 * SIZE(AO3)
FMADD4 y6, a6, b1, y6
LFDU a6, 1 * SIZE(AO3)
FMADD3 y7, a8, b2, y7
LFDU a7, 1 * SIZE(AO4)
FMADD4 y8, a8, b1, y8
LFDU a8, 1 * SIZE(AO4)
FMADD1 y1, a1, b3, y1
FMADD2 y2, a1, b4, y2
FMADD1 y3, a3, b3, y3
FMADD2 y4, a3, b4, y4
#if defined(PPCG4) && defined(DOUBLE)
dcbt AO3, PREA
#endif
FMADD3 y1, a2, b4, y1
LFDU a1, 1 * SIZE(AO1)
FMADD4 y2, a2, b3, y2
LFDU a2, 1 * SIZE(AO1)
FMADD3 y3, a4, b4, y3
LFDU a3, 1 * SIZE(AO2)
FMADD4 y4, a4, b3, y4
LFDU a4, 1 * SIZE(AO2)
FMADD1 y5, a5, b3, y5
LFDU b1, 1 * SIZE(X1)
FMADD2 y6, a5, b4, y6
LFDU b2, 1 * SIZE(X1)
FMADD1 y7, a7, b3, y7
FMADD2 y8, a7, b4, y8
#if defined(PPCG4) && defined(DOUBLE)
dcbt AO4, PREA
#endif
FMADD3 y5, a6, b4, y5
LFDU a5, 1 * SIZE(AO3)
FMADD4 y6, a6, b3, y6
LFDU a6, 1 * SIZE(AO3)
FMADD3 y7, a8, b4, y7
LFDU a7, 1 * SIZE(AO4)
FMADD4 y8, a8, b3, y8
bdnz LL(12)
.align 4
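/* Drain of the pipelined loop: finish the last unrolled group without */
/* loading past the end of the columns.                                */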
LL(13):
FMADD1 y1, a1, b1, y1
LFDU a8, 1 * SIZE(AO4)
FMADD2 y2, a1, b2, y2
LFDU b3, 1 * SIZE(X1)
FMADD1 y3, a3, b1, y3
LFDU b4, 1 * SIZE(X1)
FMADD2 y4, a3, b2, y4
FMADD3 y1, a2, b2, y1
LFDU a1, 1 * SIZE(AO1)
FMADD4 y2, a2, b1, y2
LFDU a2, 1 * SIZE(AO1)
FMADD3 y3, a4, b2, y3
LFDU a3, 1 * SIZE(AO2)
FMADD4 y4, a4, b1, y4
LFDU a4, 1 * SIZE(AO2)
FMADD1 y5, a5, b1, y5
FMADD2 y6, a5, b2, y6
FMADD1 y7, a7, b1, y7
FMADD2 y8, a7, b2, y8
FMADD3 y5, a6, b2, y5
LFDU a5, 1 * SIZE(AO3)
FMADD4 y6, a6, b1, y6
LFDU a6, 1 * SIZE(AO3)
FMADD3 y7, a8, b2, y7
LFDU a7, 1 * SIZE(AO4)
FMADD4 y8, a8, b1, y8
LFDU a8, 1 * SIZE(AO4)
FMADD1 y1, a1, b3, y1
LFDU b1, 1 * SIZE(X1)
FMADD2 y2, a1, b4, y2
LFDU b2, 1 * SIZE(X1)
FMADD1 y3, a3, b3, y3
FMADD2 y4, a3, b4, y4
FMADD3 y1, a2, b4, y1
LFDU a1, 1 * SIZE(AO1)
FMADD4 y2, a2, b3, y2
LFDU a2, 1 * SIZE(AO1)
FMADD3 y3, a4, b4, y3
LFDU a3, 1 * SIZE(AO2)
FMADD4 y4, a4, b3, y4
LFDU a4, 1 * SIZE(AO2)
FMADD1 y5, a5, b3, y5
FMADD2 y6, a5, b4, y6
FMADD1 y7, a7, b3, y7
FMADD2 y8, a7, b4, y8
FMADD3 y5, a6, b4, y5
LFDU a5, 1 * SIZE(AO3)
FMADD4 y6, a6, b3, y6
LFDU a6, 1 * SIZE(AO3)
FMADD3 y7, a8, b4, y7
LFDU a7, 1 * SIZE(AO4)
FMADD4 y8, a8, b3, y8
LFDU a8, 1 * SIZE(AO4)
FMADD1 y1, a1, b1, y1
LFDU b3, 1 * SIZE(X1)
FMADD2 y2, a1, b2, y2
LFDU b4, 1 * SIZE(X1)
FMADD1 y3, a3, b1, y3
FMADD2 y4, a3, b2, y4
FMADD3 y1, a2, b2, y1
LFDU a1, 1 * SIZE(AO1)
FMADD4 y2, a2, b1, y2
LFDU a2, 1 * SIZE(AO1)
FMADD3 y3, a4, b2, y3
LFDU a3, 1 * SIZE(AO2)
FMADD4 y4, a4, b1, y4
LFDU a4, 1 * SIZE(AO2)
FMADD1 y5, a5, b1, y5
FMADD2 y6, a5, b2, y6
FMADD1 y7, a7, b1, y7
FMADD2 y8, a7, b2, y8
FMADD3 y5, a6, b2, y5
LFDU a5, 1 * SIZE(AO3)
FMADD4 y6, a6, b1, y6
LFDU a6, 1 * SIZE(AO3)
FMADD3 y7, a8, b2, y7
LFDU a7, 1 * SIZE(AO4)
FMADD4 y8, a8, b1, y8
LFDU a8, 1 * SIZE(AO4)
FMADD1 y1, a1, b3, y1
FMADD2 y2, a1, b4, y2
FMADD1 y3, a3, b3, y3
FMADD2 y4, a3, b4, y4
FMADD3 y1, a2, b4, y1
FMADD4 y2, a2, b3, y2
FMADD3 y3, a4, b4, y3
FMADD4 y4, a4, b3, y4
FMADD1 y5, a5, b3, y5
FMADD2 y6, a5, b4, y6
FMADD1 y7, a7, b3, y7
FMADD2 y8, a7, b4, y8
FMADD3 y5, a6, b4, y5
FMADD4 y6, a6, b3, y6
FMADD3 y7, a8, b4, y7
FMADD4 y8, a8, b3, y8
.align 4
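/* M % 4 >= 2: handle two more complex elements per column. */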
LL(15):
andi. r0, M, 2
ble LL(17)
LFDU a1, 1 * SIZE(AO1)
LFDU b1, 1 * SIZE(X1)
LFDU a2, 1 * SIZE(AO1)
LFDU b2, 1 * SIZE(X1)
LFDU a3, 1 * SIZE(AO2)
LFDU b3, 1 * SIZE(X1)
LFDU a4, 1 * SIZE(AO2)
LFDU b4, 1 * SIZE(X1)
FMADD1 y1, a1, b1, y1
LFDU a5, 1 * SIZE(AO3)
FMADD2 y2, a1, b2, y2
LFDU a6, 1 * SIZE(AO3)
FMADD1 y3, a3, b1, y3
LFDU a7, 1 * SIZE(AO4)
FMADD2 y4, a3, b2, y4
LFDU a8, 1 * SIZE(AO4)
FMADD3 y1, a2, b2, y1
LFDU a1, 1 * SIZE(AO1)
FMADD4 y2, a2, b1, y2
LFDU a2, 1 * SIZE(AO1)
FMADD3 y3, a4, b2, y3
LFDU a3, 1 * SIZE(AO2)
FMADD4 y4, a4, b1, y4
LFDU a4, 1 * SIZE(AO2)
FMADD1 y5, a5, b1, y5
FMADD2 y6, a5, b2, y6
FMADD1 y7, a7, b1, y7
FMADD2 y8, a7, b2, y8
FMADD3 y5, a6, b2, y5
LFDU a5, 1 * SIZE(AO3)
FMADD4 y6, a6, b1, y6
LFDU a6, 1 * SIZE(AO3)
FMADD3 y7, a8, b2, y7
LFDU a7, 1 * SIZE(AO4)
FMADD4 y8, a8, b1, y8
LFDU a8, 1 * SIZE(AO4)
FMADD1 y1, a1, b3, y1
FMADD2 y2, a1, b4, y2
FMADD1 y3, a3, b3, y3
FMADD2 y4, a3, b4, y4
FMADD3 y1, a2, b4, y1
FMADD4 y2, a2, b3, y2
FMADD3 y3, a4, b4, y3
FMADD4 y4, a4, b3, y4
FMADD1 y5, a5, b3, y5
FMADD2 y6, a5, b4, y6
FMADD1 y7, a7, b3, y7
FMADD2 y8, a7, b4, y8
FMADD3 y5, a6, b4, y5
FMADD4 y6, a6, b3, y6
FMADD3 y7, a8, b4, y7
FMADD4 y8, a8, b3, y8
.align 4
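/* M odd: handle the final complex element per column. */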
LL(17):
andi. r0, M, 1
ble LL(19)
LFDU a1, 1 * SIZE(AO1)
LFDU a2, 1 * SIZE(AO1)
LFDU a3, 1 * SIZE(AO2)
LFDU a4, 1 * SIZE(AO2)
LFDU a5, 1 * SIZE(AO3)
LFDU a6, 1 * SIZE(AO3)
LFDU a7, 1 * SIZE(AO4)
LFDU a8, 1 * SIZE(AO4)
LFDU b1, 1 * SIZE(X1)
LFDU b2, 1 * SIZE(X1)
FMADD1 y1, a1, b1, y1
FMADD2 y2, a1, b2, y2
FMADD1 y3, a3, b1, y3
FMADD2 y4, a3, b2, y4
FMADD3 y1, a2, b2, y1
FMADD4 y2, a2, b1, y2
FMADD3 y3, a4, b2, y3
FMADD4 y4, a4, b1, y4
FMADD1 y5, a5, b1, y5
FMADD2 y6, a5, b2, y6
FMADD1 y7, a7, b1, y7
FMADD2 y8, a7, b2, y8
FMADD3 y5, a6, b2, y5
FMADD4 y6, a6, b1, y6
FMADD3 y7, a8, b2, y7
FMADD4 y8, a8, b1, y8
.align 4
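/* Scale the four accumulated complex dot products by alpha and add */
/* them into y.                                                      */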
LL(19):
LFDUX b1, Y, INCY
LFDU b2, 1 * SIZE(Y)
LFDUX b3, Y, INCY
LFDU b4, 1 * SIZE(Y)
LFDUX b5, Y, INCY
LFDU b6, 1 * SIZE(Y)
LFDUX b7, Y, INCY
LFDU b8, 1 * SIZE(Y)
FMADD b1, alpha_r, y1, b1
FMADDR b2, alpha_r, y2, b2
FMADD b3, alpha_r, y3, b3
FMADDR b4, alpha_r, y4, b4
FMADD b5, alpha_r, y5, b5
FMADDR b6, alpha_r, y6, b6
FMADD b7, alpha_r, y7, b7
FMADDR b8, alpha_r, y8, b8
FMSUBR b1, alpha_i, y2, b1
FMADD b2, alpha_i, y1, b2
FMSUBR b3, alpha_i, y4, b3
FMADD b4, alpha_i, y3, b4
FMSUBR b5, alpha_i, y6, b5
FMADD b6, alpha_i, y5, b6
FMSUBR b7, alpha_i, y8, b7
FMADD b8, alpha_i, y7, b8
STFDUX b1, YY, INCY
STFDU b2, 1 * SIZE(YY)
STFDUX b3, YY, INCY
STFDU b4, 1 * SIZE(YY)
STFDUX b5, YY, INCY
STFDU b6, 1 * SIZE(YY)
STFDUX b7, YY, INCY
STFDU b8, 1 * SIZE(YY)
addi J, J, -1
cmpwi cr0, J, 0
bgt LL(11)
.align 4
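/* N % 4 >= 2: same computation for two remaining columns. */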
LL(20):
andi. J, N, 2
ble LL(30)
lfd y1, FZERO
mr AO1, A
fmr y2, y1
mr X1, XP
fmr y3, y1
add AO2, A, LDA
fmr y4, y1
add A, AO2, LDA
srawi. r0, M, 2
mtspr CTR, r0
ble LL(25)
LFDU a1, 1 * SIZE(AO1)
LFDU b1, 1 * SIZE(X1)
LFDU a2, 1 * SIZE(AO1)
LFDU b2, 1 * SIZE(X1)
LFDU a3, 1 * SIZE(AO2)
bdz LL(23)
.align 5
LL(22):
FMADD1 y1, a1, b1, y1
LFDU a4, 1 * SIZE(AO2)
FMADD2 y2, a1, b2, y2
LFDU b3, 1 * SIZE(X1)
FMADD1 y3, a3, b1, y3
LFDU b4, 1 * SIZE(X1)
FMADD2 y4, a3, b2, y4
#ifdef PPCG4
dcbt AO1, PREA
#endif
FMADD3 y1, a2, b2, y1
LFDU a1, 1 * SIZE(AO1)
FMADD4 y2, a2, b1, y2
LFDU a2, 1 * SIZE(AO1)
FMADD3 y3, a4, b2, y3
LFDU a3, 1 * SIZE(AO2)
FMADD4 y4, a4, b1, y4
LFDU a4, 1 * SIZE(AO2)
#ifdef PPCG4
dcbt AO2, PREA
#endif
FMADD1 y1, a1, b3, y1
LFDU b1, 1 * SIZE(X1)
FMADD2 y2, a1, b4, y2
LFDU b2, 1 * SIZE(X1)
FMADD1 y3, a3, b3, y3
FMADD2 y4, a3, b4, y4
#ifdef PPCG4
dcbt X1, PREA
#endif
FMADD3 y1, a2, b4, y1
LFDU a1, 1 * SIZE(AO1)
FMADD4 y2, a2, b3, y2
LFDU a2, 1 * SIZE(AO1)
FMADD3 y3, a4, b4, y3
LFDU a3, 1 * SIZE(AO2)
FMADD4 y4, a4, b3, y4
LFDU a4, 1 * SIZE(AO2)
FMADD1 y1, a1, b1, y1
LFDU b3, 1 * SIZE(X1)
FMADD2 y2, a1, b2, y2
LFDU b4, 1 * SIZE(X1)
FMADD1 y3, a3, b1, y3
FMADD2 y4, a3, b2, y4
#if defined(PPCG4) && defined(DOUBLE)
dcbt AO1, PREA
#endif
FMADD3 y1, a2, b2, y1
LFDU a1, 1 * SIZE(AO1)
FMADD4 y2, a2, b1, y2
LFDU a2, 1 * SIZE(AO1)
FMADD3 y3, a4, b2, y3
LFDU a3, 1 * SIZE(AO2)
FMADD4 y4, a4, b1, y4
LFDU a4, 1 * SIZE(AO2)
#if defined(PPCG4) && defined(DOUBLE)
dcbt AO2, PREA
#endif
FMADD1 y1, a1, b3, y1
LFDU b1, 1 * SIZE(X1)
FMADD2 y2, a1, b4, y2
LFDU b2, 1 * SIZE(X1)
FMADD1 y3, a3, b3, y3
FMADD2 y4, a3, b4, y4
#if defined(PPCG4) && defined(DOUBLE)
dcbt X1, PREA
#endif
FMADD3 y1, a2, b4, y1
LFDU a1, 1 * SIZE(AO1)
FMADD4 y2, a2, b3, y2
LFDU a2, 1 * SIZE(AO1)
FMADD3 y3, a4, b4, y3
LFDU a3, 1 * SIZE(AO2)
FMADD4 y4, a4, b3, y4
bdnz LL(22)
.align 4
LL(23):
FMADD1 y1, a1, b1, y1
LFDU a4, 1 * SIZE(AO2)
FMADD2 y2, a1, b2, y2
LFDU b3, 1 * SIZE(X1)
FMADD1 y3, a3, b1, y3
LFDU b4, 1 * SIZE(X1)
FMADD2 y4, a3, b2, y4
FMADD3 y1, a2, b2, y1
LFDU a1, 1 * SIZE(AO1)
FMADD4 y2, a2, b1, y2
LFDU a2, 1 * SIZE(AO1)
FMADD3 y3, a4, b2, y3
LFDU a3, 1 * SIZE(AO2)
FMADD4 y4, a4, b1, y4
LFDU a4, 1 * SIZE(AO2)
FMADD1 y1, a1, b3, y1
LFDU b1, 1 * SIZE(X1)
FMADD2 y2, a1, b4, y2
LFDU b2, 1 * SIZE(X1)
FMADD1 y3, a3, b3, y3
FMADD2 y4, a3, b4, y4
FMADD3 y1, a2, b4, y1
LFDU a1, 1 * SIZE(AO1)
FMADD4 y2, a2, b3, y2
LFDU a2, 1 * SIZE(AO1)
FMADD3 y3, a4, b4, y3
LFDU a3, 1 * SIZE(AO2)
FMADD4 y4, a4, b3, y4
LFDU a4, 1 * SIZE(AO2)
FMADD1 y1, a1, b1, y1
LFDU b3, 1 * SIZE(X1)
FMADD2 y2, a1, b2, y2
LFDU b4, 1 * SIZE(X1)
FMADD1 y3, a3, b1, y3
FMADD2 y4, a3, b2, y4
FMADD3 y1, a2, b2, y1
LFDU a1, 1 * SIZE(AO1)
FMADD4 y2, a2, b1, y2
LFDU a2, 1 * SIZE(AO1)
FMADD3 y3, a4, b2, y3
LFDU a3, 1 * SIZE(AO2)
FMADD4 y4, a4, b1, y4
LFDU a4, 1 * SIZE(AO2)
FMADD1 y1, a1, b3, y1
FMADD2 y2, a1, b4, y2
FMADD1 y3, a3, b3, y3
FMADD2 y4, a3, b4, y4
FMADD3 y1, a2, b4, y1
FMADD4 y2, a2, b3, y2
FMADD3 y3, a4, b4, y3
FMADD4 y4, a4, b3, y4
.align 4
LL(25):
andi. r0, M, 2
ble LL(27)
LFDU a1, 1 * SIZE(AO1)
LFDU b1, 1 * SIZE(X1)
LFDU a2, 1 * SIZE(AO1)
LFDU b2, 1 * SIZE(X1)
LFDU a3, 1 * SIZE(AO2)
LFDU b3, 1 * SIZE(X1)
LFDU a4, 1 * SIZE(AO2)
LFDU b4, 1 * SIZE(X1)
FMADD1 y1, a1, b1, y1
FMADD2 y2, a1, b2, y2
FMADD1 y3, a3, b1, y3
FMADD2 y4, a3, b2, y4
FMADD3 y1, a2, b2, y1
LFDU a1, 1 * SIZE(AO1)
FMADD4 y2, a2, b1, y2
LFDU a2, 1 * SIZE(AO1)
FMADD3 y3, a4, b2, y3
LFDU a3, 1 * SIZE(AO2)
FMADD4 y4, a4, b1, y4
LFDU a4, 1 * SIZE(AO2)
FMADD1 y1, a1, b3, y1
FMADD2 y2, a1, b4, y2
FMADD1 y3, a3, b3, y3
FMADD2 y4, a3, b4, y4
FMADD3 y1, a2, b4, y1
FMADD4 y2, a2, b3, y2
FMADD3 y3, a4, b4, y3
FMADD4 y4, a4, b3, y4
.align 4
LL(27):
andi. r0, M, 1
ble LL(29)
LFDU a1, 1 * SIZE(AO1)
LFDU a2, 1 * SIZE(AO1)
LFDU a3, 1 * SIZE(AO2)
LFDU a4, 1 * SIZE(AO2)
LFDU b1, 1 * SIZE(X1)
LFDU b2, 1 * SIZE(X1)
FMADD1 y1, a1, b1, y1
FMADD2 y2, a1, b2, y2
FMADD1 y3, a3, b1, y3
FMADD2 y4, a3, b2, y4
FMADD3 y1, a2, b2, y1
FMADD4 y2, a2, b1, y2
FMADD3 y3, a4, b2, y3
FMADD4 y4, a4, b1, y4
.align 4
LL(29):
LFDUX b1, Y, INCY
LFDU b2, 1 * SIZE(Y)
LFDUX b3, Y, INCY
LFDU b4, 1 * SIZE(Y)
FMADD b1, alpha_r, y1, b1
FMADDR b2, alpha_r, y2, b2
FMADD b3, alpha_r, y3, b3
FMADDR b4, alpha_r, y4, b4
FMSUBR b1, alpha_i, y2, b1
FMADD b2, alpha_i, y1, b2
FMSUBR b3, alpha_i, y4, b3
FMADD b4, alpha_i, y3, b4
STFDUX b1, YY, INCY
STFDU b2, 1 * SIZE(YY)
STFDUX b3, YY, INCY
STFDU b4, 1 * SIZE(YY)
.align 4
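/* N odd: last column; the two partial accumulator pairs are folded */
/* together at LL(39) before the y update.                          */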
LL(30):
andi. J, N, 1
ble LL(999)
lfd y1, FZERO
mr AO1, A
fmr y2, y1
mr X1, XP
fmr y3, y1
fmr y4, y1
add A, A, LDA
srawi. r0, M, 2
mtspr CTR, r0
ble LL(35)
LFDU a1, 1 * SIZE(AO1)
LFDU b1, 1 * SIZE(X1)
LFDU a2, 1 * SIZE(AO1)
LFDU b2, 1 * SIZE(X1)
bdz LL(33)
.align 5
LL(32):
FMADD1 y1, a1, b1, y1
LFDU b3, 1 * SIZE(X1)
FMADD2 y2, a1, b2, y2
LFDU b4, 1 * SIZE(X1)
#ifdef PPCG4
dcbt AO1, PREA
#endif
FMADD3 y3, a2, b2, y3
LFDU a1, 1 * SIZE(AO1)
FMADD4 y4, a2, b1, y4
LFDU a2, 1 * SIZE(AO1)
FMADD1 y1, a1, b3, y1
LFDU b1, 1 * SIZE(X1)
FMADD2 y2, a1, b4, y2
LFDU b2, 1 * SIZE(X1)
#ifdef PPCG4
dcbt X1, PREA
#endif
FMADD3 y3, a2, b4, y3
LFDU a1, 1 * SIZE(AO1)
FMADD4 y4, a2, b3, y4
LFDU a2, 1 * SIZE(AO1)
FMADD1 y1, a1, b1, y1
LFDU b3, 1 * SIZE(X1)
FMADD2 y2, a1, b2, y2
LFDU b4, 1 * SIZE(X1)
#if defined(PPCG4) && defined(DOUBLE)
dcbt AO1, PREA
#endif
FMADD3 y3, a2, b2, y3
LFDU a1, 1 * SIZE(AO1)
FMADD4 y4, a2, b1, y4
LFDU a2, 1 * SIZE(AO1)
FMADD1 y1, a1, b3, y1
LFDU b1, 1 * SIZE(X1)
FMADD2 y2, a1, b4, y2
LFDU b2, 1 * SIZE(X1)
#if defined(PPCG4) && defined(DOUBLE)
dcbt X1, PREA
#endif
FMADD3 y3, a2, b4, y3
LFDU a1, 1 * SIZE(AO1)
FMADD4 y4, a2, b3, y4
LFDU a2, 1 * SIZE(AO1)
bdnz LL(32)
.align 4
LL(33):
FMADD1 y1, a1, b1, y1
LFDU b3, 1 * SIZE(X1)
FMADD2 y2, a1, b2, y2
LFDU b4, 1 * SIZE(X1)
FMADD3 y3, a2, b2, y3
LFDU a1, 1 * SIZE(AO1)
FMADD4 y4, a2, b1, y4
LFDU a2, 1 * SIZE(AO1)
FMADD1 y1, a1, b3, y1
LFDU b1, 1 * SIZE(X1)
FMADD2 y2, a1, b4, y2
LFDU b2, 1 * SIZE(X1)
FMADD3 y3, a2, b4, y3
LFDU a1, 1 * SIZE(AO1)
FMADD4 y4, a2, b3, y4
LFDU a2, 1 * SIZE(AO1)
FMADD1 y1, a1, b1, y1
LFDU b3, 1 * SIZE(X1)
FMADD2 y2, a1, b2, y2
LFDU b4, 1 * SIZE(X1)
FMADD3 y3, a2, b2, y3
LFDU a1, 1 * SIZE(AO1)
FMADD4 y4, a2, b1, y4
LFDU a2, 1 * SIZE(AO1)
FMADD1 y1, a1, b3, y1
FMADD2 y2, a1, b4, y2
FMADD3 y3, a2, b4, y3
FMADD4 y4, a2, b3, y4
.align 4
LL(35):
andi. r0, M, 2
ble LL(37)
LFDU a1, 1 * SIZE(AO1)
LFDU b1, 1 * SIZE(X1)
LFDU a2, 1 * SIZE(AO1)
LFDU b2, 1 * SIZE(X1)
FMADD1 y1, a1, b1, y1
LFDU b3, 1 * SIZE(X1)
FMADD2 y2, a1, b2, y2
LFDU a3, 1 * SIZE(AO1)
FMADD3 y3, a2, b2, y3
LFDU b4, 1 * SIZE(X1)
FMADD4 y4, a2, b1, y4
LFDU a4, 1 * SIZE(AO1)
FMADD1 y1, a3, b3, y1
FMADD2 y2, a3, b4, y2
FMADD3 y3, a4, b4, y3
FMADD4 y4, a4, b3, y4
.align 4
LL(37):
andi. r0, M, 1
ble LL(39)
LFDU a1, 1 * SIZE(AO1)
LFDU b1, 1 * SIZE(X1)
LFDU a2, 1 * SIZE(AO1)
LFDU b2, 1 * SIZE(X1)
FMADD1 y1, a1, b1, y1
FMADD2 y2, a1, b2, y2
FMADD3 y3, a2, b2, y3
FMADD4 y4, a2, b1, y4
.align 4
LL(39):
LFDUX b1, Y, INCY
LFDU b2, 1 * SIZE(Y)
FADD y1, y1, y3
FADD y2, y2, y4
FMADD b1, alpha_r, y1, b1
FMADDR b2, alpha_r, y2, b2
FMSUBR b1, alpha_i, y2, b1
FMADD b2, alpha_i, y1, b2
STFDUX b1, YY, INCY
STFDU b2, 1 * SIZE(YY)
.align 4
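/* Restore the saved registers, release the stack frame, and return 0. */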
LL(999):
li r3, 0
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
#ifdef __64BIT__
ld r14, 144(SP)
ld r15, 152(SP)
ld r16, 160(SP)
ld r17, 168(SP)
ld r18, 176(SP)
ld r19, 184(SP)
ld r20, 192(SP)
ld r21, 200(SP)
ld r22, 208(SP)
#else
lwz r14, 144(SP)
lwz r15, 148(SP)
lwz r16, 152(SP)
lwz r17, 156(SP)
lwz r18, 160(SP)
lwz r19, 164(SP)
lwz r20, 168(SP)
lwz r21, 172(SP)
lwz r22, 176(SP)
#endif
addi SP, SP, STACKSIZE
blr
EPILOGUE
#endif