/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
/* Integer registers read by the routine without being set first      */
/* (its inputs): element count, source pointer and stride,            */
/* destination pointer and stride.                                    */
#define N r3
#define X r4
#define INCX r5
#define Y r6
#define INCY r7
/* Derived values: twice the scaled strides, i.e. the distance the    */
/* routine steps to move from one two-word element to the next.       */
#define INCX2 r8
#define INCY2 r9
/* Second-word pointers (X + SIZE, Y + SIZE) used only by the generic */
/* stride path. r10 is also used as a scratch index while saving and  */
/* restoring f14/f15.                                                 */
#define X2 r10
#define Y2 r11
/* Floating-point data registers holding elements in flight.          */
#define A1 f0
#define A2 f1
#define A3 f2
#define A4 f3
#define A5 f4
#define A6 f5
#define A7 f6
#define A8 f7
#define A9 f8
/* Floating-point temporaries used by the two mixed-alignment paths   */
/* to re-pair words. T6/T7 map to f14/f15, which the routine saves on */
/* entry and restores on exit.                                        */
#define T1 f9
#define T2 f10
#define T3 f11
#define T4 f12
#define T5 f13
#define T6 f14
#define T7 f15
/*
 * Copy kernel: copies N elements of two words each from X to Y.
 *
 * Inputs (see the register #defines earlier in this file):
 *   N     number of two-word elements; N <= 0 copies nothing
 *   X     source pointer,      INCX  source stride in elements
 *   Y     destination pointer, INCY  destination stride in elements
 *
 * The strides are shifted left by BASE_SHIFT and compared against SIZE;
 * both constants come from common.h and are presumably log2(word size)
 * and the word size in bytes -- TODO confirm.
 *
 * Code paths:
 *   LL(10)   both strides == SIZE, X and Y both pass the 2*SIZE mask test
 *   LL(20)   both strides == SIZE, X passes the mask test, Y does not
 *   LL(30)   both strides == SIZE, X fails the mask test, Y passes
 *   LL(40)   both strides == SIZE, X and Y both fail the mask test
 *   LL(100)  any other stride: word-at-a-time loads and stores
 *   LL(999)  common exit, restores f14/f15 and SP
 *
 * NOTE(review): comments below describe the paired-FPU mnemonics as
 * operating on a (primary, secondary) register pair: L/STFPDUX move both
 * halves, LFXDUX loads the two words swapped, fxmr swaps halves, fsmr
 * copies only the secondary half, fpmr copies both halves, STFSDX stores
 * the secondary half. Confirm against the double-FPU ISA reference.
 */
PROLOGUE
PROFCODE
/* Save f14 and f15 (T6/T7) in two 16-byte slots pushed below SP. */
li r10, -16
stfpdux f14, SP, r10
stfpdux f15, SP, r10
/* Scale the strides, then double them to get the per-element step. */
slwi INCX, INCX, BASE_SHIFT
slwi INCY, INCY, BASE_SHIFT
add INCX2, INCX, INCX
add INCY2, INCY, INCY
/* Nothing to copy for N <= 0. */
cmpwi cr0, N, 0
ble LL(999)
/* Bias both pointers back by one element: every access below is an */
/* indexed access at pointer + step, most of them updating forms. */
sub X, X, INCX2
sub Y, Y, INCY2
/* Any scaled stride other than SIZE selects the generic path. */
cmpwi cr0, INCX, SIZE
bne LL(100)
cmpwi cr0, INCY, SIZE
bne LL(100)
/* Contiguous case: dispatch on the low bits of X and Y. */
andi. r0, X, 2 * SIZE - 1
bne LL(30)
andi. r0, Y, 2 * SIZE - 1
bne LL(20)
.align 4
LL(10): /* X : aligned Y : aligned */
/* CTR = N / 8; each iteration moves eight elements with paired */
/* loads and stores. */
srawi. r0, N, 3
mtspr CTR, r0
beq- LL(15)
/* Preload the first group of eight elements. */
LFPDUX A1, X, INCX2
LFPDUX A2, X, INCX2
LFPDUX A3, X, INCX2
LFPDUX A4, X, INCX2
LFPDUX A5, X, INCX2
LFPDUX A6, X, INCX2
LFPDUX A7, X, INCX2
LFPDUX A8, X, INCX2
bdz LL(13)
.align 4
LL(12):
/* Steady state: store the group already in registers while loading */
/* the next group into the same registers. */
STFPDUX A1, Y, INCY2
LFPDUX A1, X, INCX2
STFPDUX A2, Y, INCY2
LFPDUX A2, X, INCX2
STFPDUX A3, Y, INCY2
LFPDUX A3, X, INCX2
STFPDUX A4, Y, INCY2
LFPDUX A4, X, INCX2
STFPDUX A5, Y, INCY2
LFPDUX A5, X, INCX2
STFPDUX A6, Y, INCY2
LFPDUX A6, X, INCX2
STFPDUX A7, Y, INCY2
LFPDUX A7, X, INCX2
STFPDUX A8, Y, INCY2
LFPDUX A8, X, INCX2
bdnz LL(12)
.align 4
LL(13):
/* Drain: store the last group that was loaded. */
STFPDUX A1, Y, INCY2
STFPDUX A2, Y, INCY2
STFPDUX A3, Y, INCY2
STFPDUX A4, Y, INCY2
STFPDUX A5, Y, INCY2
STFPDUX A6, Y, INCY2
STFPDUX A7, Y, INCY2
STFPDUX A8, Y, INCY2
.align 4
LL(15):
/* Remainder N & 7, handled as blocks of 4, 2 and 1 elements. */
andi. r0, N, 7
beq LL(999)
andi. r0, N, 4
beq LL(16)
LFPDUX A1, X, INCX2
LFPDUX A2, X, INCX2
LFPDUX A3, X, INCX2
LFPDUX A4, X, INCX2
STFPDUX A1, Y, INCY2
STFPDUX A2, Y, INCY2
STFPDUX A3, Y, INCY2
STFPDUX A4, Y, INCY2
.align 4
LL(16):
andi. r0, N, 2
beq LL(17)
LFPDUX A1, X, INCX2
LFPDUX A2, X, INCX2
STFPDUX A1, Y, INCY2
STFPDUX A2, Y, INCY2
.align 4
LL(17):
andi. r0, N, 1
beq LL(999)
LFPDUX A1, X, INCX2
STFPDUX A1, Y, INCY2
b LL(999)
.align 4
LL(20): /* X : aligned Y : unaligned */
/* Load element 0 swapped and store only its first word, then shift */
/* Y by one word (INCY == SIZE on this path) so that the paired */
/* stores that follow start one word later. A1 keeps the not yet */
/* stored second word of element 0. N becomes the number of elements */
/* still to be loaded. */
LFXDUX A1, X, INCX2
addi N, N, -1
cmpwi cr0, N, 0
STFSDX A1, Y, INCY2
add Y, Y, INCY
/* Uses cr0 from the cmpwi above: with no elements left, only the */
/* pending word in A1 remains to be stored. */
ble LL(29)
.align 4
srawi. r0, N, 3
mtspr CTR, r0
beq- LL(25)
/* Preload eight elements: four swapped, four straight. Each fsmr */
/* replaces the secondary half of a register with the first word of */
/* the following element, forming the pair to store. */
LFXDUX T1, X, INCX2
LFXDUX T2, X, INCX2
LFXDUX T3, X, INCX2
LFXDUX T4, X, INCX2
LFPDUX A6, X, INCX2
fsmr A1, T1
LFPDUX A7, X, INCX2
fsmr T1, T2
LFPDUX A8, X, INCX2
fsmr T2, T3
LFPDUX A9, X, INCX2
fsmr T3, T4
bdz LL(23)
.align 4
LL(22):
/* Store four re-paired registers while swapping the straight-loaded */
/* ones (fxmr) into the same layout as the swapped loads. */
STFPDUX A1, Y, INCY2
fxmr T5, A6
STFPDUX T1, Y, INCY2
fxmr T6, A7
STFPDUX T2, Y, INCY2
fxmr T7, A8
STFPDUX T3, Y, INCY2
fxmr A1, A9
/* Re-pair the second half of the group and load the next four. */
fsmr T4, T5
LFPDUX A2, X, INCX2
fsmr T5, T6
LFPDUX A3, X, INCX2
fsmr T6, T7
LFPDUX A4, X, INCX2
fsmr T7, A1
LFPDUX A5, X, INCX2
STFPDUX T4, Y, INCY2
fxmr T1, A2
STFPDUX T5, Y, INCY2
fxmr T2, A3
STFPDUX T6, Y, INCY2
fxmr T3, A4
STFPDUX T7, Y, INCY2
fxmr T4, A5
/* A1 still carries the last element's pending word; merge it with */
/* the first word of the next group and load four more elements. */
LFPDUX A6, X, INCX2
fsmr A1, T1
LFPDUX A7, X, INCX2
fsmr T1, T2
LFPDUX A8, X, INCX2
fsmr T2, T3
LFPDUX A9, X, INCX2
fsmr T3, T4
bdnz LL(22)
.align 4
LL(23):
/* Drain the final group of eight without issuing further loads. */
STFPDUX A1, Y, INCY2
fxmr T5, A6
STFPDUX T1, Y, INCY2
fxmr T6, A7
STFPDUX T2, Y, INCY2
fxmr T7, A8
STFPDUX T3, Y, INCY2
fxmr A1, A9
fsmr T4, T5
fsmr T5, T6
fsmr T6, T7
fsmr T7, A1
STFPDUX T4, Y, INCY2
STFPDUX T5, Y, INCY2
STFPDUX T6, Y, INCY2
STFPDUX T7, Y, INCY2
.align 4
LL(25):
/* Remainder (N & 7) in blocks of 4, 2 and 1. After each block the */
/* last loaded register is copied into A1 so that it keeps carrying */
/* the pending word. */
andi. r0, N, 7
beq LL(29)
andi. r0, N, 4
beq LL(26)
LFXDUX A2, X, INCX2
LFXDUX A3, X, INCX2
LFXDUX A4, X, INCX2
LFXDUX A5, X, INCX2
fsmr A1, A2
fsmr A2, A3
fsmr A3, A4
fsmr A4, A5
STFPDUX A1, Y, INCY2
STFPDUX A2, Y, INCY2
STFPDUX A3, Y, INCY2
STFPDUX A4, Y, INCY2
fpmr A1, A5
.align 4
LL(26):
andi. r0, N, 2
beq LL(27)
LFXDUX A2, X, INCX2
LFXDUX A3, X, INCX2
fsmr A1, A2
fsmr A2, A3
STFPDUX A1, Y, INCY2
STFPDUX A2, Y, INCY2
fpmr A1, A3
.align 4
LL(27):
andi. r0, N, 1
beq LL(29)
LFXDUX A2, X, INCX2
fsmr A1, A2
STFPDUX A1, Y, INCY2
fpmr A1, A2
.align 4
LL(29):
/* Store the pending second word of the last element. */
STFDUX A1, Y, INCY2
b LL(999)
.align 4
LL(30): /* X : unaligned Y : aligned */
/* Entered whenever X fails the mask test; Y is tested here, and if */
/* it fails too the both-unaligned path is taken instead. */
andi. r0, Y, 2 * SIZE - 1
bne LL(40)
/* Load the first word of element 0 alone into A1 and shift X by one */
/* word (INCX == SIZE on this path). Unlike LL(20), N is not */
/* decremented: every iteration still produces N full output pairs. */
LFDX A1, X, INCX2
add X, X, INCX
srawi. r0, N, 3
mtspr CTR, r0
beq- LL(35)
/* Preload eight shifted pairs. Each holds the second word of one */
/* element and the first word of the next; fsmr moves the second */
/* word next to the first word already held in the target register. */
LFXDUX T1, X, INCX2
LFXDUX T2, X, INCX2
LFXDUX T3, X, INCX2
LFXDUX T4, X, INCX2
LFPDUX A6, X, INCX2
fsmr A1, T1
LFPDUX A7, X, INCX2
fsmr T1, T2
LFPDUX A8, X, INCX2
fsmr T2, T3
LFPDUX A9, X, INCX2
fsmr T3, T4
bdz LL(33)
.align 4
LL(32):
/* Same pipeline shape as LL(22): store four, swap and re-pair four, */
/* load the next eight. */
fxmr T5, A6
STFPDUX A1, Y, INCY2
fxmr T6, A7
STFPDUX T1, Y, INCY2
fxmr T7, A8
STFPDUX T2, Y, INCY2
fxmr A1, A9
STFPDUX T3, Y, INCY2
LFPDUX A2, X, INCX2
fsmr T4, T5
LFPDUX A3, X, INCX2
fsmr T5, T6
LFPDUX A4, X, INCX2
fsmr T6, T7
LFPDUX A5, X, INCX2
fsmr T7, A1
fxmr T1, A2
STFPDUX T4, Y, INCY2
fxmr T2, A3
STFPDUX T5, Y, INCY2
fxmr T3, A4
STFPDUX T6, Y, INCY2
fxmr T4, A5
STFPDUX T7, Y, INCY2
fsmr A1, T1
LFPDUX A6, X, INCX2
fsmr T1, T2
LFPDUX A7, X, INCX2
fsmr T2, T3
LFPDUX A8, X, INCX2
fsmr T3, T4
LFPDUX A9, X, INCX2
bdnz LL(32)
.align 4
LL(33):
/* Drain the final group of eight without issuing further loads. */
STFPDUX A1, Y, INCY2
fxmr T5, A6
STFPDUX T1, Y, INCY2
fxmr T6, A7
STFPDUX T2, Y, INCY2
fxmr T7, A8
STFPDUX T3, Y, INCY2
fxmr A1, A9
fsmr T4, T5
fsmr T5, T6
fsmr T6, T7
fsmr T7, A1
STFPDUX T4, Y, INCY2
STFPDUX T5, Y, INCY2
STFPDUX T6, Y, INCY2
STFPDUX T7, Y, INCY2
.align 4
LL(35):
/* Remainder (N & 7) in blocks of 4, 2 and 1; A1 carries the first */
/* word of the next element between blocks. */
andi. r0, N, 7
beq LL(999)
andi. r0, N, 4
beq LL(36)
LFXDUX A2, X, INCX2
LFXDUX A3, X, INCX2
LFXDUX A4, X, INCX2
LFXDUX A5, X, INCX2
fsmr A1, A2
fsmr A2, A3
fsmr A3, A4
fsmr A4, A5
STFPDUX A1, Y, INCY2
STFPDUX A2, Y, INCY2
STFPDUX A3, Y, INCY2
STFPDUX A4, Y, INCY2
fpmr A1, A5
.align 4
LL(36):
andi. r0, N, 2
beq LL(37)
LFXDUX A2, X, INCX2
LFXDUX A3, X, INCX2
fsmr A1, A2
fsmr A2, A3
STFPDUX A1, Y, INCY2
STFPDUX A2, Y, INCY2
fpmr A1, A3
.align 4
LL(37):
andi. r0, N, 1
beq LL(999)
/* NOTE(review): this paired load also reads the word that follows */
/* the last element; only the last element's second word is used. */
LFXDUX A2, X, INCX2
fsmr A1, A2
STFPDUX A1, Y, INCY2
b LL(999)
.align 4
LL(40): /* X : unaligned Y : unaligned */
/* Copy the first word of element 0 by itself, shift both pointers */
/* by one word, then copy N - 1 pairs with paired accesses; the last */
/* word is copied on its own at LL(49). */
LFDX A1, X, INCX2
add X, X, INCX
addi N, N, -1
cmpwi cr0, N, 0
STFDX A1, Y, INCY2
add Y, Y, INCY
/* Uses cr0 from the cmpwi above. */
ble LL(49)
srawi. r0, N, 3
mtspr CTR, r0
beq- LL(45)
/* Preload eight pairs. */
LFPDUX A1, X, INCX2
LFPDUX A2, X, INCX2
LFPDUX A3, X, INCX2
LFPDUX A4, X, INCX2
LFPDUX A5, X, INCX2
LFPDUX A6, X, INCX2
LFPDUX A7, X, INCX2
LFPDUX A8, X, INCX2
bdz LL(43)
.align 4
LL(42):
/* Steady state: store the current eight pairs, load the next eight. */
STFPDUX A1, Y, INCY2
LFPDUX A1, X, INCX2
STFPDUX A2, Y, INCY2
LFPDUX A2, X, INCX2
STFPDUX A3, Y, INCY2
LFPDUX A3, X, INCX2
STFPDUX A4, Y, INCY2
LFPDUX A4, X, INCX2
STFPDUX A5, Y, INCY2
LFPDUX A5, X, INCX2
STFPDUX A6, Y, INCY2
LFPDUX A6, X, INCX2
STFPDUX A7, Y, INCY2
LFPDUX A7, X, INCX2
STFPDUX A8, Y, INCY2
LFPDUX A8, X, INCX2
bdnz LL(42)
.align 4
LL(43):
/* Drain: store the last eight pairs that were loaded. */
STFPDUX A1, Y, INCY2
STFPDUX A2, Y, INCY2
STFPDUX A3, Y, INCY2
STFPDUX A4, Y, INCY2
STFPDUX A5, Y, INCY2
STFPDUX A6, Y, INCY2
STFPDUX A7, Y, INCY2
STFPDUX A8, Y, INCY2
.align 4
LL(45):
/* Remainder (N & 7) pairs in blocks of 4, 2 and 1. */
andi. r0, N, 7
beq LL(49)
andi. r0, N, 4
beq LL(46)
LFPDUX A1, X, INCX2
LFPDUX A2, X, INCX2
LFPDUX A3, X, INCX2
LFPDUX A4, X, INCX2
STFPDUX A1, Y, INCY2
STFPDUX A2, Y, INCY2
STFPDUX A3, Y, INCY2
STFPDUX A4, Y, INCY2
.align 4
LL(46):
andi. r0, N, 2
beq LL(47)
LFPDUX A1, X, INCX2
LFPDUX A2, X, INCX2
STFPDUX A1, Y, INCY2
STFPDUX A2, Y, INCY2
.align 4
LL(47):
andi. r0, N, 1
beq LL(49)
LFPDUX A1, X, INCX2
STFPDUX A1, Y, INCY2
LL(49):
/* Copy the trailing single word (second word of the last element). */
LFDUX A1, X, INCX2
STFDUX A1, Y, INCY2
b LL(999)
.align 4
LL(100):
/* Generic stride path. X2/Y2 address the second word of each */
/* element; X/Y address the first. All four pointers step by the */
/* doubled stride. CTR = N / 4, four elements per iteration. */
addi X2, X, SIZE
addi Y2, Y, SIZE
srawi. r0, N, 2
mtspr CTR, r0
beq- LL(115)
/* Preload four elements, one word at a time. */
LFDUX A1, X, INCX2
LFDUX A2, X2, INCX2
LFDUX A3, X, INCX2
LFDUX A4, X2, INCX2
LFDUX A5, X, INCX2
LFDUX A6, X2, INCX2
LFDUX A7, X, INCX2
LFDUX A8, X2, INCX2
bdz LL(113)
.align 4
LL(112):
/* Steady state: store the current four elements, load the next four. */
STFDUX A1, Y, INCY2
LFDUX A1, X, INCX2
STFDUX A2, Y2, INCY2
LFDUX A2, X2, INCX2
STFDUX A3, Y, INCY2
LFDUX A3, X, INCX2
STFDUX A4, Y2, INCY2
LFDUX A4, X2, INCX2
STFDUX A5, Y, INCY2
LFDUX A5, X, INCX2
STFDUX A6, Y2, INCY2
LFDUX A6, X2, INCX2
STFDUX A7, Y, INCY2
LFDUX A7, X, INCX2
STFDUX A8, Y2, INCY2
LFDUX A8, X2, INCX2
bdnz LL(112)
.align 4
LL(113):
/* Drain: store the last four elements that were loaded. */
STFDUX A1, Y, INCY2
STFDUX A2, Y2, INCY2
STFDUX A3, Y, INCY2
STFDUX A4, Y2, INCY2
STFDUX A5, Y, INCY2
STFDUX A6, Y2, INCY2
STFDUX A7, Y, INCY2
STFDUX A8, Y2, INCY2
.align 4
LL(115):
/* Remainder (N & 3) in blocks of 2 and 1 elements. */
andi. r0, N, 3
beq LL(999)
andi. r0, N, 2
beq LL(117)
LFDUX A1, X, INCX2
LFDUX A2, X2, INCX2
LFDUX A3, X, INCX2
LFDUX A4, X2, INCX2
STFDUX A1, Y, INCY2
STFDUX A2, Y2, INCY2
STFDUX A3, Y, INCY2
STFDUX A4, Y2, INCY2
.align 4
LL(117):
andi. r0, N, 1
beq LL(999)
LFDUX A1, X, INCX2
LFDUX A2, X2, INCX2
STFDUX A1, Y, INCY2
STFDUX A2, Y2, INCY2
.align 4
LL(999):
/* Common exit. r10 is reloaded because the generic path reuses it */
/* as X2. SP is backed up by 16 so the two updating loads pop f15 */
/* and then f14 from the slots written on entry; the final addi */
/* returns SP to its entry value. */
li r10, 16
addi SP, SP, -16
lfpdux f15, SP, r10
lfpdux f14, SP, r10
addi SP, SP, 16
blr
EPILOGUE