/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
/* ASSEMBLER is defined before including common.h; presumably it makes */
/* that header expose its assembler-side macros (PROLOGUE, LFD, FADD, */
/* SIZE, LL, ...), none of which are defined in this file. */
#define ASSEMBLER
#include "common.h"
/* Incoming arguments: element count, vector base address, stride. */
/* Under F_INTERFACE, N and INCX hold addresses of the values instead. */
#define N r3
#define X r4
#define INCX r5
/* Holds the byte offset added to X by the dcbt prefetch in the loops. */
#define PREA r8
/* f0 is loaded with zero on entry and is then used as accumulator 0. */
#define FZERO f0
/* Bytes of stack reserved on entry; the first word is used as a */
/* zero-filled scratch slot from which FZERO is loaded. */
#define STACKSIZE 16
/*
 * Sum of absolute values of a vector: result = sum(|X[i * INCX]|) for
 * i = 0 .. N-1.  The exported symbol name comes from the PROLOGUE macro
 * (defined outside this file); presumably this is a BLAS ?asum kernel.
 *
 * In:   N (r3) element count, X (r4) base address, INCX (r5) stride in
 *       elements.  With F_INTERFACE, r3 and r5 are pointers to N and INCX.
 * Out:  f1 = the sum.  Zero is returned when N <= 0 or INCX <= 0.
 * Uses: r0, r8 (PREA), CTR, cr0, f0-f11, and STACKSIZE bytes of stack.
 *
 * Structure: four independent partial sums are kept in f0-f3 and combined
 * at LL(999).  There are two parallel code paths, one for unit stride
 * (LL(12)..LL(18), displacement addressing) and one for any other positive
 * stride (LL(22)..LL(28), update-indexed loads).  Each path handles
 * N / 16 blocks of 16 elements in a software-pipelined loop and then the
 * remaining N % 16 elements in chunks of 8, 4, 2 and 1.
 *
 * The nop instructions perform no work; presumably they are padding for
 * instruction scheduling on the target core.
 */
PROLOGUE
PROFCODE
/* Reserve scratch stack and store an integer zero word in it. */
addi SP, SP, -STACKSIZE
li r0, 0
stw r0, 0(SP)
#ifdef F_INTERFACE
/* Arguments were passed by reference: fetch the actual values. */
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
/* FZERO = 0.0, read back from the zero word stored above. */
lfs FZERO, 0(SP)
/* Scale the stride by BASE_SHIFT (presumably elements -> bytes, since it */
/* is compared against SIZE below). */
slwi INCX, INCX, BASE_SHIFT
/* Clear accumulators f1-f3, interleaved with the setup and range checks. */
fmr f1, FZERO
li PREA, 8 * 16 * SIZE
fmr f2, FZERO
cmpwi cr0, N, 0
fmr f3, FZERO
/* N <= 0: nothing to sum; all accumulators are zero at LL(999). */
ble- LL(999)
cmpwi cr0, INCX, 0
/* INCX <= 0: also return zero. */
ble- LL(999)
/* A scaled stride different from SIZE takes the strided path at LL(20). */
cmpwi cr0, INCX, SIZE
bne- cr0, LL(20)
/* ---- Unit-stride path ---- */
/* CTR = N / 16 full blocks; cr0 is set by srawi., so the beq- below */
/* skips to the tail code when there is no full block. */
srawi. r0, N, 4
mtspr CTR, r0
beq- cr0, LL(15)
.align 4
/* Pipeline prologue: f4-f7 = |x[0..3]| and f8 = x[4] (not yet fabs-ed). */
LFD f8, 0 * SIZE(X)
LFD f9, 1 * SIZE(X)
fabs f4, f8
LFD f10, 2 * SIZE(X)
fabs f5, f9
LFD f11, 3 * SIZE(X)
fabs f6, f10
LFD f8, 4 * SIZE(X)
fabs f7, f11
/* Decrement CTR; if this was the only full block go straight to the */
/* drain code at LL(13). */
bdz LL(13)
.align 4
/* Steady-state loop.  Invariant at the top: f4-f7 hold |x[0..3]| of the */
/* current block and f8 holds x[4].  Each pass accumulates the 16 elements */
/* of the current block and pre-loads elements 0..4 of the next block, */
/* which is known to exist because the loop only runs while CTR is */
/* non-zero after the decrement. */
LL(12):
FADD f0, f0, f4
/* Prefetch hint for the cache block at X + PREA. */
dcbt X, PREA
fabs f4, f8
LFD f9, 5 * SIZE(X)
FADD f1, f1, f5
nop
fabs f5, f9
LFD f10, 6 * SIZE(X)
FADD f2, f2, f6
nop
fabs f6, f10
LFD f11, 7 * SIZE(X)
FADD f3, f3, f7
nop
fabs f7, f11
LFD f8, 8 * SIZE(X)
FADD f0, f0, f4
nop
fabs f4, f8
LFD f9, 9 * SIZE(X)
FADD f1, f1, f5
nop
fabs f5, f9
LFD f10, 10 * SIZE(X)
FADD f2, f2, f6
nop
fabs f6, f10
LFD f11, 11 * SIZE(X)
FADD f3, f3, f7
nop
fabs f7, f11
LFD f8, 12 * SIZE(X)
FADD f0, f0, f4
nop
fabs f4, f8
LFD f9, 13 * SIZE(X)
FADD f1, f1, f5
nop
fabs f5, f9
LFD f10, 14 * SIZE(X)
FADD f2, f2, f6
nop
fabs f6, f10
LFD f11, 15 * SIZE(X)
FADD f3, f3, f7
nop
fabs f7, f11
/* From here on the loads fetch elements 0..4 of the next block. */
LFD f8, 16 * SIZE(X)
FADD f0, f0, f4
nop
fabs f4, f8
LFD f9, 17 * SIZE(X)
FADD f1, f1, f5
/* Advance X to the next block; the offsets below are relative to it. */
addi X, X, 16 * SIZE
fabs f5, f9
LFD f10, 2 * SIZE(X)
FADD f2, f2, f6
nop
fabs f6, f10
LFD f11, 3 * SIZE(X)
FADD f3, f3, f7
LFD f8, 4 * SIZE(X)
fabs f7, f11
bdnz LL(12)
.align 4
/* Pipeline drain: finish the last full block.  Same entry invariant as */
/* LL(12), but nothing beyond element 15 of this block is loaded. */
LL(13):
FADD f0, f0, f4
nop
fabs f4, f8
LFD f9, 5 * SIZE(X)
FADD f1, f1, f5
nop
fabs f5, f9
LFD f10, 6 * SIZE(X)
FADD f2, f2, f6
nop
fabs f6, f10
LFD f11, 7 * SIZE(X)
FADD f3, f3, f7
nop
fabs f7, f11
LFD f8, 8 * SIZE(X)
FADD f0, f0, f4
nop
fabs f4, f8
LFD f9, 9 * SIZE(X)
FADD f1, f1, f5
nop
fabs f5, f9
LFD f10, 10 * SIZE(X)
FADD f2, f2, f6
nop
fabs f6, f10
LFD f11, 11 * SIZE(X)
FADD f3, f3, f7
nop
fabs f7, f11
LFD f8, 12 * SIZE(X)
FADD f0, f0, f4
nop
fabs f4, f8
LFD f9, 13 * SIZE(X)
FADD f1, f1, f5
nop
fabs f5, f9
LFD f10, 14 * SIZE(X)
FADD f2, f2, f6
/* X now points past the block, so element 15 is at offset -1. */
addi X, X, 16 * SIZE
fabs f6, f10
LFD f11, -1 * SIZE(X)
FADD f3, f3, f7
fabs f7, f11
FADD f0, f0, f4
FADD f1, f1, f5
FADD f2, f2, f6
FADD f3, f3, f7
.align 4
/* Unit-stride tail: handle the remaining N % 16 elements by testing the */
/* 8, 4, 2 and 1 bits of N in turn. */
LL(15):
andi. r0, N, 15
beq LL(999)
andi. r0, N, 8
beq LL(16)
/* Eight elements. */
LFD f8, 0 * SIZE(X)
LFD f9, 1 * SIZE(X)
fabs f4, f8
LFD f10, 2 * SIZE(X)
fabs f5, f9
LFD f11, 3 * SIZE(X)
fabs f6, f10
LFD f8, 4 * SIZE(X)
fabs f7, f11
FADD f0, f0, f4
nop
fabs f4, f8
LFD f9, 5 * SIZE(X)
FADD f1, f1, f5
nop
fabs f5, f9
LFD f10, 6 * SIZE(X)
FADD f2, f2, f6
/* X is advanced before the last load, hence the -1 offset for element 7. */
addi X, X, 8 * SIZE
fabs f6, f10
LFD f11, -1 * SIZE(X)
FADD f3, f3, f7
fabs f7, f11
FADD f0, f0, f4
FADD f1, f1, f5
FADD f2, f2, f6
FADD f3, f3, f7
.align 4
LL(16):
andi. r0, N, 4
beq LL(17)
/* Four elements, one per accumulator. */
LFD f8, 0 * SIZE(X)
LFD f9, 1 * SIZE(X)
fabs f4, f8
LFD f10, 2 * SIZE(X)
fabs f5, f9
LFD f11, 3 * SIZE(X)
fabs f6, f10
addi X, X, 4 * SIZE
fabs f7, f11
nop
FADD f0, f0, f4
FADD f1, f1, f5
FADD f2, f2, f6
FADD f3, f3, f7
.align 4
LL(17):
andi. r0, N, 2
beq LL(18)
/* Two elements, added into f0 and f1. */
LFD f8, 0 * SIZE(X)
LFD f9, 1 * SIZE(X)
fabs f4, f8
fabs f5, f9
FADD f0, f0, f4
addi X, X, 2 * SIZE
FADD f1, f1, f5
nop
.align 4
LL(18):
andi. r0, N, 1
beq LL(999)
/* Final single element, added into f0. */
LFD f8, 0 * SIZE(X)
fabs f4, f8
FADD f0, f0, f4
b LL(999)
.align 4
/* ---- Strided path ---- */
/* X is biased back by one stride because every load below is LFDUX */
/* (defined outside this file; presumably a load-with-update-indexed that */
/* first adds INCX to X and then loads from the new X). */
LL(20):
sub X, X, INCX
/* CTR = N / 16 full blocks; skip to the tail when there are none. */
srawi. r0, N, 4
mtspr CTR, r0
beq- cr0, LL(25)
.align 4
/* Pipeline prologue: f4-f7 = |x[0..3]|, f8 = x[4] (not yet fabs-ed). */
LFDUX f8, X, INCX
LFDUX f9, X, INCX
fabs f4, f8
LFDUX f10, X, INCX
fabs f5, f9
LFDUX f11, X, INCX
fabs f6, f10
LFDUX f8, X, INCX
fabs f7, f11
/* Only one full block: go straight to the drain code at LL(23). */
bdz LL(23)
.align 4
/* Steady-state loop: 16 accumulations and 16 loads per pass, with the */
/* same register rotation as LL(12).  The last five loads of a pass fetch */
/* elements 0..4 of the next block. */
LL(22):
FADD f0, f0, f4
/* Prefetch hint at X + PREA; the offset is the same fixed byte count as */
/* in the unit-stride loop, independent of INCX. */
dcbt X, PREA
fabs f4, f8
LFDUX f9, X, INCX
FADD f1, f1, f5
nop
fabs f5, f9
LFDUX f10, X, INCX
FADD f2, f2, f6
nop
fabs f6, f10
LFDUX f11, X, INCX
FADD f3, f3, f7
nop
fabs f7, f11
LFDUX f8, X, INCX
FADD f0, f0, f4
nop
fabs f4, f8
LFDUX f9, X, INCX
FADD f1, f1, f5
nop
fabs f5, f9
LFDUX f10, X, INCX
FADD f2, f2, f6
nop
fabs f6, f10
LFDUX f11, X, INCX
FADD f3, f3, f7
nop
fabs f7, f11
LFDUX f8, X, INCX
FADD f0, f0, f4
nop
fabs f4, f8
LFDUX f9, X, INCX
FADD f1, f1, f5
nop
fabs f5, f9
LFDUX f10, X, INCX
FADD f2, f2, f6
nop
fabs f6, f10
LFDUX f11, X, INCX
FADD f3, f3, f7
nop
fabs f7, f11
LFDUX f8, X, INCX
FADD f0, f0, f4
nop
fabs f4, f8
LFDUX f9, X, INCX
FADD f1, f1, f5
nop
fabs f5, f9
LFDUX f10, X, INCX
FADD f2, f2, f6
nop
fabs f6, f10
LFDUX f11, X, INCX
FADD f3, f3, f7
LFDUX f8, X, INCX
fabs f7, f11
bdnz LL(22)
.align 4
/* Pipeline drain for the last full block: 11 more loads (elements 5..15) */
/* and the 16 accumulations of this block; nothing past it is loaded. */
LL(23):
FADD f0, f0, f4
nop
fabs f4, f8
LFDUX f9, X, INCX
FADD f1, f1, f5
nop
fabs f5, f9
LFDUX f10, X, INCX
FADD f2, f2, f6
nop
fabs f6, f10
LFDUX f11, X, INCX
FADD f3, f3, f7
nop
fabs f7, f11
LFDUX f8, X, INCX
FADD f0, f0, f4
nop
fabs f4, f8
LFDUX f9, X, INCX
FADD f1, f1, f5
nop
fabs f5, f9
LFDUX f10, X, INCX
FADD f2, f2, f6
nop
fabs f6, f10
LFDUX f11, X, INCX
FADD f3, f3, f7
nop
fabs f7, f11
LFDUX f8, X, INCX
FADD f0, f0, f4
nop
fabs f4, f8
LFDUX f9, X, INCX
FADD f1, f1, f5
nop
fabs f5, f9
LFDUX f10, X, INCX
FADD f2, f2, f6
nop
fabs f6, f10
LFDUX f11, X, INCX
FADD f3, f3, f7
fabs f7, f11
FADD f0, f0, f4
FADD f1, f1, f5
FADD f2, f2, f6
FADD f3, f3, f7
.align 4
/* Strided tail: remaining N % 16 elements in chunks of 8, 4, 2 and 1. */
LL(25):
andi. r0, N, 15
beq LL(999)
andi. r0, N, 8
beq LL(26)
/* Eight elements. */
LFDUX f8, X, INCX
LFDUX f9, X, INCX
fabs f4, f8
LFDUX f10, X, INCX
fabs f5, f9
LFDUX f11, X, INCX
fabs f6, f10
LFDUX f8, X, INCX
fabs f7, f11
FADD f0, f0, f4
nop
fabs f4, f8
LFDUX f9, X, INCX
FADD f1, f1, f5
nop
fabs f5, f9
LFDUX f10, X, INCX
FADD f2, f2, f6
fabs f6, f10
LFDUX f11, X, INCX
FADD f3, f3, f7
fabs f7, f11
FADD f0, f0, f4
FADD f1, f1, f5
FADD f2, f2, f6
FADD f3, f3, f7
.align 4
LL(26):
andi. r0, N, 4
beq LL(27)
/* Four elements, one per accumulator. */
LFDUX f8, X, INCX
LFDUX f9, X, INCX
fabs f4, f8
LFDUX f10, X, INCX
fabs f5, f9
LFDUX f11, X, INCX
fabs f6, f10
fabs f7, f11
FADD f0, f0, f4
FADD f1, f1, f5
FADD f2, f2, f6
FADD f3, f3, f7
.align 4
LL(27):
andi. r0, N, 2
beq LL(28)
/* Two elements, added into f0 and f1. */
LFDUX f8, X, INCX
LFDUX f9, X, INCX
fabs f4, f8
fabs f5, f9
FADD f0, f0, f4
FADD f1, f1, f5
.align 4
LL(28):
andi. r0, N, 1
beq LL(999)
/* Final single element, added into f0; falls through to LL(999). */
LFDUX f8, X, INCX
fabs f4, f8
FADD f0, f0, f4
.align 4
/* Common exit: reduce the four partial sums as (f0 + f1) + (f2 + f3), */
/* leave the result in f1, release the scratch stack and return. */
LL(999):
FADD f0, f0, f1
FADD f2, f2, f3
FADD f1, f0, f2
addi SP, SP, STACKSIZE
blr
EPILOGUE