/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
/* ASSEMBLER is defined before including common.h; presumably this selects  */
/* the assembler-side definitions (PROLOGUE, LL(), SIZE, ...) -- confirm.   */
#define ASSEMBLER
#include "common.h"
/* Routine arguments, in integer registers r3..r7. */
#define N r3
#define X r4
#define INCX r5
#define Y r6
#define INCY r7
/* Twice the scaled strides (computed at entry as INCX+INCX / INCY+INCY);   */
/* used as the update step of the paired (two-element) loads and stores.    */
#define INCX2 r8
#define INCY2 r9
/* X2 / Y2 are not referenced in the code of this file; note that r10 is    */
/* used directly as a scratch offset register in the prologue and epilogue. */
#define X2 r10
#define Y2 r11
/* Data registers holding elements in flight between load and store. */
#define A1 f0
#define A2 f1
#define A3 f2
#define A4 f3
#define A5 f4
#define A6 f5
#define A7 f6
#define A8 f7
#define A9 f8
/* Temporaries used by the realigning paths to re-pair neighbouring        */
/* elements. T6 and T7 map to f14 and f15, the two registers the routine    */
/* saves on entry and restores at LL(999).                                  */
#define T1 f9
#define T2 f10
#define T3 f11
#define T4 f12
#define T5 f13
#define T6 f14
#define T7 f15
/* Entry point: copies N elements from X (stride INCX) to Y (stride INCY).  */
/* This section saves f14/f15, scales the strides, and dispatches to one of */
/* the specialised paths below.                                             */
PROLOGUE
PROFCODE
/* Save f14 and f15 below SP; each store-with-update moves SP down by 16.   */
li r10, -16
stfpdux f14, SP, r10
stfpdux f15, SP, r10
/* Scale the strides by BASE_SHIFT (presumably elements -> bytes; confirm   */
/* against common.h) and form the doubled strides for two-element steps.    */
slwi INCX, INCX, BASE_SHIFT
slwi INCY, INCY, BASE_SHIFT
add INCX2, INCX, INCX
add INCY2, INCY, INCY
/* Nothing to do for N <= 0. */
cmpwi cr0, N, 0
ble LL(999)
/* INCY != SIZE -> LL(60); INCY == SIZE but INCX != SIZE -> LL(50).         */
cmpwi cr0, INCY, SIZE
bne LL(60)
cmpwi cr0, INCX, SIZE
bne LL(50)
/* Both strides equal SIZE. Bias both pointers back by one paired step      */
/* because every access below is an update-form (pre-increment) access.     */
sub X, X, INCX2
sub Y, Y, INCY2
/* Select the path by the alignment of X and Y to 2*SIZE bytes:             */
/*   X aligned,   Y aligned   -> fall through to LL(10)                     */
/*   X aligned,   Y unaligned -> LL(20)                                     */
/*   X unaligned              -> LL(30), which re-tests Y (LL(40) if both)  */
andi. r0, X, 2 * SIZE - 1
bne LL(30)
andi. r0, Y, 2 * SIZE - 1
bne LL(20)
.align 4
LL(10): /* X : aligned Y : aligned */
/* Main loop count: CTR = N >> 4, i.e. 16 elements (8 pairs) per trip.      */
srawi. r0, N, 4
mtspr CTR, r0
beq- LL(15)
/* Pipeline prologue: preload 8 pairs for the first trip. */
LFPDUX A1, X, INCX2
LFPDUX A2, X, INCX2
LFPDUX A3, X, INCX2
LFPDUX A4, X, INCX2
LFPDUX A5, X, INCX2
LFPDUX A6, X, INCX2
LFPDUX A7, X, INCX2
LFPDUX A8, X, INCX2
/* Only one trip requested: skip the steady-state loop and just drain.      */
bdz LL(13)
.align 4
LL(12):
/* Steady state: store the pair loaded on the previous trip, then reload    */
/* the same register with the pair for the next trip.                       */
STFPDUX A1, Y, INCY2
LFPDUX A1, X, INCX2
STFPDUX A2, Y, INCY2
LFPDUX A2, X, INCX2
STFPDUX A3, Y, INCY2
LFPDUX A3, X, INCX2
STFPDUX A4, Y, INCY2
LFPDUX A4, X, INCX2
STFPDUX A5, Y, INCY2
LFPDUX A5, X, INCX2
STFPDUX A6, Y, INCY2
LFPDUX A6, X, INCX2
STFPDUX A7, Y, INCY2
LFPDUX A7, X, INCX2
STFPDUX A8, Y, INCY2
LFPDUX A8, X, INCX2
bdnz LL(12)
.align 4
LL(13):
/* Pipeline drain: store the last 8 preloaded pairs. */
STFPDUX A1, Y, INCY2
STFPDUX A2, Y, INCY2
STFPDUX A3, Y, INCY2
STFPDUX A4, Y, INCY2
STFPDUX A5, Y, INCY2
STFPDUX A6, Y, INCY2
STFPDUX A7, Y, INCY2
STFPDUX A8, Y, INCY2
.align 4
LL(15):
/* Remainder (N & 15), handled bit by bit: 8, 4, 2, then 1 element.         */
andi. r0, N, 15
beq LL(999)
andi. r0, N, 8
beq LL(16)
LFPDUX A1, X, INCX2
LFPDUX A2, X, INCX2
LFPDUX A3, X, INCX2
LFPDUX A4, X, INCX2
STFPDUX A1, Y, INCY2
STFPDUX A2, Y, INCY2
STFPDUX A3, Y, INCY2
STFPDUX A4, Y, INCY2
.align 4
LL(16):
andi. r0, N, 4
beq LL(17)
LFPDUX A1, X, INCX2
LFPDUX A2, X, INCX2
STFPDUX A1, Y, INCY2
STFPDUX A2, Y, INCY2
.align 4
LL(17):
andi. r0, N, 2
beq LL(18)
LFPDUX A1, X, INCX2
STFPDUX A1, Y, INCY2
.align 4
LL(18):
/* Final odd element: scalar load/store instead of a paired access.         */
andi. r0, N, 1
beq LL(999)
LFDUX A1, X, INCX2
STFDUX A1, Y, INCY2
.align 4
b LL(999)
.align 4
LL(20): /* X : aligned Y : unaligned */
/* Peel one element so that the remaining stores to Y can be paired.        */
/* A1 is cross-loaded from the first X pair; its secondary half is stored   */
/* at Y + INCY2 (the original Y, given the entry bias) and Y then advances  */
/* by one element. The primary half of A1 stays pending and is always the   */
/* first half of the next pair stored.                                      */
/* NOTE(review): relies on the cross load placing the lower-addressed       */
/* element in the secondary half -- confirm against the FP2 ISA reference.  */
LFXDUX A1, X, INCX2
addi N, N, -1
cmpwi cr0, N, 0
STFSDX A1, Y, INCY2
add Y, Y, INCY
ble LL(999)
.align 4
/* Main loop count: 16 of the remaining elements per trip. */
srawi. r0, N, 4
mtspr CTR, r0
beq- LL(25)
/* Pipeline prologue. Each fsmr copies the secondary half of the next       */
/* loaded register into the current one, so every register ends up holding  */
/* (pending element, following element) -- the pair shifted by one element  */
/* relative to X's own pairing.                                             */
LFXDUX T1, X, INCX2
LFXDUX T2, X, INCX2
LFXDUX T3, X, INCX2
LFXDUX T4, X, INCX2
LFPDUX A6, X, INCX2
fsmr A1, T1
LFPDUX A7, X, INCX2
fsmr T1, T2
LFPDUX A8, X, INCX2
fsmr T2, T3
LFPDUX A9, X, INCX2
fsmr T3, T4
bdz LL(23)
.align 4
LL(22):
/* First half of the trip: store the four pairs completed so far while      */
/* cross-moving (fxmr) the straight-loaded A6..A9 into T5..T7/A1, then      */
/* chain fsmr again to complete T4..T7.                                     */
STFPDUX A1, Y, INCY2
fxmr T5, A6
STFPDUX T1, Y, INCY2
fxmr T6, A7
STFPDUX T2, Y, INCY2
fxmr T7, A8
STFPDUX T3, Y, INCY2
fxmr A1, A9
fsmr T4, T5
LFPDUX A2, X, INCX2
fsmr T5, T6
LFPDUX A3, X, INCX2
fsmr T6, T7
LFPDUX A4, X, INCX2
fsmr T7, A1
LFPDUX A5, X, INCX2
/* Second half: store T4..T7 and rebuild A1/T1..T3 for the next trip from   */
/* the freshly loaded A2..A5 and A6..A9.                                    */
STFPDUX T4, Y, INCY2
fxmr T1, A2
STFPDUX T5, Y, INCY2
fxmr T2, A3
STFPDUX T6, Y, INCY2
fxmr T3, A4
STFPDUX T7, Y, INCY2
fxmr T4, A5
LFPDUX A6, X, INCX2
fsmr A1, T1
LFPDUX A7, X, INCX2
fsmr T1, T2
LFPDUX A8, X, INCX2
fsmr T2, T3
LFPDUX A9, X, INCX2
fsmr T3, T4
bdnz LL(22)
.align 4
LL(23):
/* Pipeline drain: same store/shift sequence as the first half of LL(22)    */
/* but without issuing further loads. A1 leaves here holding the pending    */
/* element (in its primary half) for the remainder code.                    */
STFPDUX A1, Y, INCY2
fxmr T5, A6
STFPDUX T1, Y, INCY2
fxmr T6, A7
STFPDUX T2, Y, INCY2
fxmr T7, A8
STFPDUX T3, Y, INCY2
fxmr A1, A9
fsmr T4, T5
fsmr T5, T6
fsmr T6, T7
fsmr T7, A1
STFPDUX T4, Y, INCY2
STFPDUX T5, Y, INCY2
STFPDUX T6, Y, INCY2
STFPDUX T7, Y, INCY2
.align 4
LL(25):
/* Remainder (N & 15) in chunks of 8, 4, 2, 1. After each chunk the last    */
/* loaded register is copied into A1 (fpmr) so A1 again carries the         */
/* pending element.                                                         */
andi. r0, N, 15
beq LL(999)
andi. r0, N, 8
beq LL(26)
LFXDUX A2, X, INCX2
LFXDUX A3, X, INCX2
LFXDUX A4, X, INCX2
LFXDUX A5, X, INCX2
fsmr A1, A2
fsmr A2, A3
fsmr A3, A4
fsmr A4, A5
STFPDUX A1, Y, INCY2
STFPDUX A2, Y, INCY2
STFPDUX A3, Y, INCY2
STFPDUX A4, Y, INCY2
fpmr A1, A5
.align 4
LL(26):
andi. r0, N, 4
beq LL(27)
LFXDUX A2, X, INCX2
LFXDUX A3, X, INCX2
fsmr A1, A2
fsmr A2, A3
STFPDUX A1, Y, INCY2
STFPDUX A2, Y, INCY2
fpmr A1, A3
.align 4
LL(27):
andi. r0, N, 2
beq LL(28)
LFXDUX A2, X, INCX2
fsmr A1, A2
STFPDUX A1, Y, INCY2
fpmr A1, A2
.align 4
LL(28):
/* Final odd element: it is already in A1, so only a scalar store is needed. */
andi. r0, N, 1
beq LL(999)
STFDUX A1, Y, INCY2
b LL(999)
.align 4
LL(30): /* X : unaligned Y : aligned */
/* X is unaligned; if Y is unaligned as well, use LL(40) instead.           */
andi. r0, Y, 2 * SIZE - 1
bne LL(40)
/* Load the first element on its own (scalar, no update) into A1 and move   */
/* X forward by one element. Nothing has been stored yet, so N is left      */
/* unchanged; A1 carries the pending first half of the next pair to store.  */
LFDX A1, X, INCX2
add X, X, INCX
/* Main loop count: 16 elements per trip. */
srawi. r0, N, 4
mtspr CTR, r0
beq- LL(35)
/* Pipeline prologue: same load / fsmr re-pairing scheme as the LL(20) path. */
LFXDUX T1, X, INCX2
LFXDUX T2, X, INCX2
LFXDUX T3, X, INCX2
LFXDUX T4, X, INCX2
LFPDUX A6, X, INCX2
fsmr A1, T1
LFPDUX A7, X, INCX2
fsmr T1, T2
LFPDUX A8, X, INCX2
fsmr T2, T3
LFPDUX A9, X, INCX2
fsmr T3, T4
bdz LL(33)
.align 4
LL(32):
/* First half: store the four completed pairs while cross-moving A6..A9     */
/* and completing T4..T7.                                                   */
fxmr T5, A6
STFPDUX A1, Y, INCY2
fxmr T6, A7
STFPDUX T1, Y, INCY2
fxmr T7, A8
STFPDUX T2, Y, INCY2
fxmr A1, A9
STFPDUX T3, Y, INCY2
fsmr T4, T5
LFPDUX A2, X, INCX2
fsmr T5, T6
LFPDUX A3, X, INCX2
fsmr T6, T7
LFPDUX A4, X, INCX2
fsmr T7, A1
LFPDUX A5, X, INCX2
/* Second half: store T4..T7 and rebuild A1/T1..T3 for the next trip.       */
STFPDUX T4, Y, INCY2
fxmr T1, A2
STFPDUX T5, Y, INCY2
fxmr T2, A3
STFPDUX T6, Y, INCY2
fxmr T3, A4
STFPDUX T7, Y, INCY2
fxmr T4, A5
LFPDUX A6, X, INCX2
fsmr A1, T1
LFPDUX A7, X, INCX2
fsmr T1, T2
LFPDUX A8, X, INCX2
fsmr T2, T3
LFPDUX A9, X, INCX2
fsmr T3, T4
bdnz LL(32)
.align 4
LL(33):
/* Pipeline drain: store the 8 pairs in flight without loading more. A1     */
/* leaves here holding the pending element for the remainder code.          */
STFPDUX A1, Y, INCY2
fxmr T5, A6
STFPDUX T1, Y, INCY2
fxmr T6, A7
STFPDUX T2, Y, INCY2
fxmr T7, A8
STFPDUX T3, Y, INCY2
fxmr A1, A9
fsmr T4, T5
fsmr T5, T6
fsmr T6, T7
fsmr T7, A1
STFPDUX T4, Y, INCY2
STFPDUX T5, Y, INCY2
STFPDUX T6, Y, INCY2
STFPDUX T7, Y, INCY2
.align 4
LL(35):
/* Remainder (N & 15) in chunks of 8, 4, 2, 1; fpmr re-seeds A1 with the    */
/* last loaded register after each chunk.                                   */
andi. r0, N, 15
beq LL(999)
andi. r0, N, 8
beq LL(36)
LFXDUX A2, X, INCX2
LFXDUX A3, X, INCX2
LFXDUX A4, X, INCX2
LFXDUX A5, X, INCX2
fsmr A1, A2
fsmr A2, A3
fsmr A3, A4
fsmr A4, A5
STFPDUX A1, Y, INCY2
STFPDUX A2, Y, INCY2
STFPDUX A3, Y, INCY2
STFPDUX A4, Y, INCY2
fpmr A1, A5
.align 4
LL(36):
andi. r0, N, 4
beq LL(37)
LFXDUX A2, X, INCX2
LFXDUX A3, X, INCX2
fsmr A1, A2
fsmr A2, A3
STFPDUX A1, Y, INCY2
STFPDUX A2, Y, INCY2
fpmr A1, A3
.align 4
LL(37):
andi. r0, N, 2
beq LL(38)
LFXDUX A2, X, INCX2
fsmr A1, A2
STFPDUX A1, Y, INCY2
fpmr A1, A2
.align 4
LL(38):
/* Final odd element: already held in A1, so only a scalar store is needed. */
andi. r0, N, 1
beq LL(999)
STFDUX A1, Y, INCY2
b LL(999)
.align 4
LL(40): /* X : unaligned Y : unaligned */
/* Copy one element with scalar accesses and advance both pointers by one   */
/* element; the rest is then copied with paired loads and stores exactly    */
/* as in the aligned/aligned path (presumably both pointers are 2*SIZE      */
/* aligned after this step -- this assumes SIZE-aligned inputs).            */
LFDX A1, X, INCX2
add X, X, INCX
addi N, N, -1
cmpwi cr0, N, 0
STFDX A1, Y, INCY2
add Y, Y, INCY
ble LL(999)
/* Main loop count: 16 of the remaining elements per trip. */
srawi. r0, N, 4
mtspr CTR, r0
beq- LL(45)
/* Pipeline prologue: preload 8 pairs. */
LFPDUX A1, X, INCX2
LFPDUX A2, X, INCX2
LFPDUX A3, X, INCX2
LFPDUX A4, X, INCX2
LFPDUX A5, X, INCX2
LFPDUX A6, X, INCX2
LFPDUX A7, X, INCX2
LFPDUX A8, X, INCX2
bdz LL(43)
.align 4
LL(42):
/* Steady state: store the previous trip's pair, reload the register.       */
STFPDUX A1, Y, INCY2
LFPDUX A1, X, INCX2
STFPDUX A2, Y, INCY2
LFPDUX A2, X, INCX2
STFPDUX A3, Y, INCY2
LFPDUX A3, X, INCX2
STFPDUX A4, Y, INCY2
LFPDUX A4, X, INCX2
STFPDUX A5, Y, INCY2
LFPDUX A5, X, INCX2
STFPDUX A6, Y, INCY2
LFPDUX A6, X, INCX2
STFPDUX A7, Y, INCY2
LFPDUX A7, X, INCX2
STFPDUX A8, Y, INCY2
LFPDUX A8, X, INCX2
bdnz LL(42)
.align 4
LL(43):
/* Pipeline drain: store the last 8 preloaded pairs. */
STFPDUX A1, Y, INCY2
STFPDUX A2, Y, INCY2
STFPDUX A3, Y, INCY2
STFPDUX A4, Y, INCY2
STFPDUX A5, Y, INCY2
STFPDUX A6, Y, INCY2
STFPDUX A7, Y, INCY2
STFPDUX A8, Y, INCY2
.align 4
LL(45):
/* Remainder (N & 15) in chunks of 8, 4, 2, then 1 element. */
andi. r0, N, 15
beq LL(999)
andi. r0, N, 8
beq LL(46)
LFPDUX A1, X, INCX2
LFPDUX A2, X, INCX2
LFPDUX A3, X, INCX2
LFPDUX A4, X, INCX2
STFPDUX A1, Y, INCY2
STFPDUX A2, Y, INCY2
STFPDUX A3, Y, INCY2
STFPDUX A4, Y, INCY2
.align 4
LL(46):
andi. r0, N, 4
beq LL(47)
LFPDUX A1, X, INCX2
LFPDUX A2, X, INCX2
STFPDUX A1, Y, INCY2
STFPDUX A2, Y, INCY2
.align 4
LL(47):
andi. r0, N, 2
beq LL(48)
LFPDUX A1, X, INCX2
STFPDUX A1, Y, INCY2
.align 4
LL(48):
/* Final odd element: scalar load/store. */
andi. r0, N, 1
beq LL(999)
LFDUX A1, X, INCX2
STFDUX A1, Y, INCY2
.align 4
b LL(999)
.align 4
# INCX != 1, INCY == 1
/* Reached from the entry dispatch with INCY == SIZE and INCX != SIZE; the  */
/* pointers have NOT been biased yet on this path. X is read one element    */
/* at a time, two elements are merged into one register, and Y is written   */
/* with paired stores.                                                      */
LL(50):
/* If Y is not 2*SIZE aligned, copy one element with scalar accesses first  */
/* so that the paired stores below start on an aligned address.             */
andi. r0, Y, 2 * SIZE - 1
beq LL(51)
LFD A1, 0 * SIZE(X)
add X, X, INCX
STFD A1, 0 * SIZE(Y)
add Y, Y, INCY
addi N, N, -1
cmpwi cr0, N, 0
ble LL(999)
.align 4
LL(51):
/* Bias the pointers for update-form accesses: X by one element stride,     */
/* Y by one paired step.                                                    */
sub X, X, INCX
sub Y, Y, INCY2
/* Main loop count: 16 elements per trip (not software pipelined). */
srawi. r0, N, 4
mtspr CTR, r0
beq- LL(55)
.align 4
LL(52):
/* Load 16 scalars; fsmfp d, s merges the element held in s into the        */
/* secondary half of d, forming the pairs (A1,A2) (A3,A4) (A5,A6) (A7,A8)   */
/* (A9,T1) (T2,T3) (T4,T5) (T6,T7), which are then stored as 8 pairs.       */
LFDUX A1, X, INCX
LFDUX A2, X, INCX
LFDUX A3, X, INCX
LFDUX A4, X, INCX
LFDUX A5, X, INCX
LFDUX A6, X, INCX
LFDUX A7, X, INCX
LFDUX A8, X, INCX
LFDUX A9, X, INCX
LFDUX T1, X, INCX
LFDUX T2, X, INCX
LFDUX T3, X, INCX
fsmfp A1, A2
LFDUX T4, X, INCX
fsmfp A3, A4
LFDUX T5, X, INCX
fsmfp A5, A6
LFDUX T6, X, INCX
fsmfp A7, A8
LFDUX T7, X, INCX
fsmfp A9, T1
STFPDUX A1, Y, INCY2
fsmfp T2, T3
STFPDUX A3, Y, INCY2
fsmfp T4, T5
STFPDUX A5, Y, INCY2
fsmfp T6, T7
STFPDUX A7, Y, INCY2
STFPDUX A9, Y, INCY2
STFPDUX T2, Y, INCY2
STFPDUX T4, Y, INCY2
STFPDUX T6, Y, INCY2
bdnz LL(52)
.align 4
LL(55):
/* Remainder (N & 15) in chunks of 8, 4, 2, then 1 element. */
andi. r0, N, 15
beq LL(999)
andi. r0, N, 8
beq LL(56)
LFDUX A1, X, INCX
LFDUX A2, X, INCX
LFDUX A3, X, INCX
LFDUX A4, X, INCX
LFDUX A5, X, INCX
LFDUX A6, X, INCX
LFDUX A7, X, INCX
LFDUX A8, X, INCX
fsmfp A1, A2
fsmfp A3, A4
fsmfp A5, A6
fsmfp A7, A8
STFPDUX A1, Y, INCY2
STFPDUX A3, Y, INCY2
STFPDUX A5, Y, INCY2
STFPDUX A7, Y, INCY2
.align 4
LL(56):
andi. r0, N, 4
beq LL(57)
LFDUX A1, X, INCX
LFDUX A2, X, INCX
LFDUX A3, X, INCX
LFDUX A4, X, INCX
fsmfp A1, A2
fsmfp A3, A4
STFPDUX A1, Y, INCY2
STFPDUX A3, Y, INCY2
.align 4
LL(57):
andi. r0, N, 2
beq LL(58)
LFDUX A1, X, INCX
LFDUX A2, X, INCX
fsmfp A1, A2
STFPDUX A1, Y, INCY2
.align 4
LL(58):
/* Final odd element: scalar load, scalar store at the next paired slot.    */
andi. r0, N, 1
beq LL(999)
LFDUX A1, X, INCX
STFDUX A1, Y, INCY2
b LL(999)
.align 4
# INCX == 1, INCY != 1
/* Reached from the entry dispatch whenever INCY != SIZE; the pointers have */
/* NOT been biased yet on this path. When X is contiguous it is read with   */
/* paired loads and each pair is written to Y as two scalar stores (primary */
/* half, then secondary half), one INCY step apart.                         */
LL(60):
/* INCY is already known to differ from SIZE here, so the stride that       */
/* decides between this fast path and the generic LL(100) loop is INCX.     */
/* (Testing INCY again would always branch to LL(100) and leave             */
/* LL(61)..LL(68) unreachable.)                                             */
cmpwi cr0, INCX, SIZE
bne LL(100)
/* If X is not 2*SIZE aligned, copy one element with scalar accesses first  */
/* so that the paired loads below start on an aligned address.              */
andi. r0, X, 2 * SIZE - 1
beq LL(61)
LFD A1, 0 * SIZE(X)
add X, X, INCX
STFD A1, 0 * SIZE(Y)
add Y, Y, INCY
addi N, N, -1
cmpwi cr0, N, 0
ble LL(999)
.align 4
LL(61):
/* Bias the pointers for update-form accesses: X by one paired step,        */
/* Y by one element stride.                                                 */
sub X, X, INCX2
sub Y, Y, INCY
/* Main loop count: 16 elements (8 pairs) per trip. */
srawi. r0, N, 4
mtspr CTR, r0
beq- LL(65)
/* Pipeline prologue: preload 8 pairs. */
LFPDUX A1, X, INCX2
LFPDUX A2, X, INCX2
LFPDUX A3, X, INCX2
LFPDUX A4, X, INCX2
LFPDUX A5, X, INCX2
LFPDUX A6, X, INCX2
LFPDUX A7, X, INCX2
LFPDUX A8, X, INCX2
bdz LL(63)
.align 4
LL(62):
/* Steady state: scatter both halves of the previous trip's pair to Y,      */
/* then reload the register with the pair for the next trip.                */
STFDUX A1, Y, INCY
STFSDUX A1, Y, INCY
LFPDUX A1, X, INCX2
STFDUX A2, Y, INCY
STFSDUX A2, Y, INCY
LFPDUX A2, X, INCX2
STFDUX A3, Y, INCY
STFSDUX A3, Y, INCY
LFPDUX A3, X, INCX2
STFDUX A4, Y, INCY
STFSDUX A4, Y, INCY
LFPDUX A4, X, INCX2
STFDUX A5, Y, INCY
STFSDUX A5, Y, INCY
LFPDUX A5, X, INCX2
STFDUX A6, Y, INCY
STFSDUX A6, Y, INCY
LFPDUX A6, X, INCX2
STFDUX A7, Y, INCY
STFSDUX A7, Y, INCY
LFPDUX A7, X, INCX2
STFDUX A8, Y, INCY
STFSDUX A8, Y, INCY
LFPDUX A8, X, INCX2
bdnz LL(62)
.align 4
LL(63):
/* Pipeline drain: scatter the last 8 preloaded pairs. */
STFDUX A1, Y, INCY
STFSDUX A1, Y, INCY
STFDUX A2, Y, INCY
STFSDUX A2, Y, INCY
STFDUX A3, Y, INCY
STFSDUX A3, Y, INCY
STFDUX A4, Y, INCY
STFSDUX A4, Y, INCY
STFDUX A5, Y, INCY
STFSDUX A5, Y, INCY
STFDUX A6, Y, INCY
STFSDUX A6, Y, INCY
STFDUX A7, Y, INCY
STFSDUX A7, Y, INCY
STFDUX A8, Y, INCY
STFSDUX A8, Y, INCY
.align 4
LL(65):
/* Remainder (N & 15) in chunks of 8, 4, 2, then 1 element. */
andi. r0, N, 15
beq LL(999)
andi. r0, N, 8
beq LL(66)
LFPDUX A1, X, INCX2
LFPDUX A2, X, INCX2
LFPDUX A3, X, INCX2
LFPDUX A4, X, INCX2
STFDUX A1, Y, INCY
STFSDUX A1, Y, INCY
STFDUX A2, Y, INCY
STFSDUX A2, Y, INCY
STFDUX A3, Y, INCY
STFSDUX A3, Y, INCY
STFDUX A4, Y, INCY
STFSDUX A4, Y, INCY
.align 4
LL(66):
andi. r0, N, 4
beq LL(67)
LFPDUX A1, X, INCX2
LFPDUX A2, X, INCX2
STFDUX A1, Y, INCY
STFSDUX A1, Y, INCY
STFDUX A2, Y, INCY
STFSDUX A2, Y, INCY
.align 4
LL(67):
andi. r0, N, 2
beq LL(68)
LFPDUX A1, X, INCX2
STFDUX A1, Y, INCY
STFSDUX A1, Y, INCY
.align 4
LL(68):
/* Final odd element: scalar load from the next paired slot, scalar store.  */
andi. r0, N, 1
beq LL(999)
LFDUX A1, X, INCX2
STFDUX A1, Y, INCY
b LL(999)
.align 4
/* Generic path for arbitrary strides: scalar load and scalar store for     */
/* every element, no alignment requirement.                                 */
LL(100):
/* Bias both pointers by one element stride for update-form accesses.       */
sub X, X, INCX
sub Y, Y, INCY
/* Main loop count: CTR = N >> 3, i.e. 8 elements per trip.                 */
srawi. r0, N, 3
mtspr CTR, r0
beq- LL(115)
/* Pipeline prologue: preload 8 elements. */
LFDUX A1, X, INCX
LFDUX A2, X, INCX
LFDUX A3, X, INCX
LFDUX A4, X, INCX
LFDUX A5, X, INCX
LFDUX A6, X, INCX
LFDUX A7, X, INCX
LFDUX A8, X, INCX
bdz LL(113)
.align 4
LL(112):
/* Steady state: store the previous trip's element, reload the register.    */
STFDUX A1, Y, INCY
LFDUX A1, X, INCX
STFDUX A2, Y, INCY
LFDUX A2, X, INCX
STFDUX A3, Y, INCY
LFDUX A3, X, INCX
STFDUX A4, Y, INCY
LFDUX A4, X, INCX
STFDUX A5, Y, INCY
LFDUX A5, X, INCX
STFDUX A6, Y, INCY
LFDUX A6, X, INCX
STFDUX A7, Y, INCY
LFDUX A7, X, INCX
STFDUX A8, Y, INCY
LFDUX A8, X, INCX
bdnz LL(112)
.align 4
LL(113):
/* Pipeline drain: store the last 8 preloaded elements. */
STFDUX A1, Y, INCY
STFDUX A2, Y, INCY
STFDUX A3, Y, INCY
STFDUX A4, Y, INCY
STFDUX A5, Y, INCY
STFDUX A6, Y, INCY
STFDUX A7, Y, INCY
STFDUX A8, Y, INCY
.align 4
LL(115):
/* Remainder (N & 7) in chunks of 4, 2, then 1 element. */
andi. r0, N, 7
beq LL(999)
andi. r0, N, 4
beq LL(117)
LFDUX A1, X, INCX
LFDUX A2, X, INCX
LFDUX A3, X, INCX
LFDUX A4, X, INCX
STFDUX A1, Y, INCY
STFDUX A2, Y, INCY
STFDUX A3, Y, INCY
STFDUX A4, Y, INCY
.align 4
LL(117):
andi. r0, N, 2
beq LL(118)
LFDUX A1, X, INCX
LFDUX A2, X, INCX
STFDUX A1, Y, INCY
STFDUX A2, Y, INCY
.align 4
LL(118):
/* Final odd element; execution then falls through to the common exit.      */
andi. r0, N, 1
beq LL(999)
LFDUX A1, X, INCX
STFDUX A1, Y, INCY
.align 4
/* Common exit: restore f15 and f14 in the reverse order of the entry       */
/* saves and put SP back to its value on entry.                             */
LL(999):
li r10, 16
/* Step SP back by 16 first because the update-form loads below add r10     */
/* to SP before accessing memory.                                           */
addi SP, SP, -16
lfpdux f15, SP, r10
lfpdux f14, SP, r10
/* SP now points at the f14 save slot, 16 bytes below the entry SP.         */
addi SP, SP, 16
blr
EPILOGUE