/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
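/* Single-precision GEMM kernel for PowerPC with AltiVec (VMX).
   Computes C := alpha * A * B + C for an M x N block of C, where A and
   B are assumed to be the packed panels produced by the GEMM copy
   routines (the usual kernel calling convention) and C has leading
   dimension LDC.  The main path works on 16x4 tiles of C with vmaddfp;
   smaller M/N remainders fall back to narrower vector tiles and,
   finally, to the scalar FPU. */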
#define ASSEMBLER
#include "common.h"
#ifndef __64BIT__
#define LOAD lwz
#else
#define LOAD ld
#endif
#ifdef __64BIT__
#define STACKSIZE 360
#else
#define STACKSIZE 272
#endif
#define ALPHA 0
#define FZERO 16
#define M r3
#define N r4
#define K r5
#ifdef linux
#ifndef __64BIT__
#define A r6
#define B r7
#define C r8
#define LDC r9
#else
#define A r7
#define B r8
#define C r9
#define LDC r10
#endif
#endif
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define A r8
#define B r9
#define C r10
#define LDC r7
#else
#define A r7
#define B r8
#define C r9
#define LDC r10
#endif
#endif
#define STACK r11
#define I r21
#define J r22
#define AO r23
#define BO r24
#define CO1 r25
#define CO2 r26
#define CO3 r27
#define CO4 r28
#define PREA r29
#define PREB r29
#define PREC r30
#define VREG r31
#define LOAD_A lvx
#define LOAD_B lvx
#define OFFSET_0 0
#define OFFSET_1 r14
#define OFFSET_2 r15
#define OFFSET_3 r16
#define OFFSET_4 r17
#define OFFSET_5 r18
#define OFFSET_6 r19
#define OFFSET_7 r20
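/* OFFSET_0 is the literal 0; OFFSET_1..OFFSET_7 are registers holding
   n * 4 * SIZE bytes (one 4-float vector per step), used as the index
   operand of lvx/stvx. */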
#define c01 v0
#define c02 v1
#define c03 v2
#define c04 v3
#define c05 v4
#define c06 v5
#define c07 v6
#define c08 v7
#define c09 v8
#define c10 v9
#define c11 v10
#define c12 v11
#define c13 v12
#define c14 v13
#define c15 v14
#define c16 v15
#define a1 v16
#define a2 v17
#define a3 v18
#define a4 v19
#define a5 v20
#define a6 v21
#define a7 v22
#define a8 v23
#define b1 v24
#define b2 v25
#define bp1 v26
#define bp2 v27
#define C1 v16
#define C2 v17
#define C3 v18
#define C4 v19
#define C5 v20
#define C6 v21
#define C7 v22
#define C8 v23
#define C9 v24
#define c00 v25
#define PERMRSHIFT1 v26
#define PERMRSHIFT2 v27
#define PERMRSHIFT3 v28
#define PERMRSHIFT4 v29
#define VZERO v30
#define alpha v31
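/* Register aliasing: c01..c16 (v0-v15) are the accumulators.  The
   names a1..a8, b1..b2, bp1, bp2 and the names C1..C9, c00,
   PERMRSHIFT1..4, VZERO, alpha overlap in v16-v31; the first group is
   live while accumulating, the second only during the write-back of C. */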
#ifndef NEEDPARAM
PROLOGUE
PROFCODE
addi SP, SP, -STACKSIZE
mr STACK, SP
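/* Prologue: save the non-volatile vector registers v20-v31, then the
   non-volatile integer registers r14-r31, in the new stack frame. */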
li r0, 0 * 16
stvx v20, SP, r0
li r0, 1 * 16
stvx v21, SP, r0
li r0, 2 * 16
stvx v22, SP, r0
li r0, 3 * 16
stvx v23, SP, r0
li r0, 4 * 16
stvx v24, SP, r0
li r0, 5 * 16
stvx v25, SP, r0
li r0, 6 * 16
stvx v26, SP, r0
li r0, 7 * 16
stvx v27, SP, r0
li r0, 8 * 16
stvx v28, SP, r0
li r0, 9 * 16
stvx v29, SP, r0
li r0, 10 * 16
stvx v30, SP, r0
li r0, 11 * 16
stvx v31, SP, r0
#ifdef __64BIT__
std r31, 192(SP)
std r30, 200(SP)
std r29, 208(SP)
std r28, 216(SP)
std r27, 224(SP)
std r26, 232(SP)
std r25, 240(SP)
std r24, 248(SP)
std r23, 256(SP)
std r22, 264(SP)
std r21, 272(SP)
std r20, 280(SP)
std r19, 288(SP)
std r18, 296(SP)
std r17, 304(SP)
std r16, 312(SP)
std r15, 320(SP)
std r14, 328(SP)
#else
stw r31, 192(SP)
stw r30, 196(SP)
stw r29, 200(SP)
stw r28, 204(SP)
stw r27, 208(SP)
stw r26, 212(SP)
stw r25, 216(SP)
stw r24, 220(SP)
stw r23, 224(SP)
stw r22, 228(SP)
stw r21, 232(SP)
stw r20, 236(SP)
stw r19, 240(SP)
stw r18, 244(SP)
stw r17, 248(SP)
stw r16, 252(SP)
stw r15, 256(SP)
stw r14, 260(SP)
#endif
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
lwz LDC, 56 + STACKSIZE(SP)
#endif
#endif
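/* Mark every vector register as live in VRsave, then align SP down to
   a 128-byte boundary.  The scratch area at SP holds alpha splatted to
   four floats (ALPHA) and a single 0.0f (FZERO). */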
li r0, -1
mfspr VREG, VRsave
mtspr VRsave, r0
addi SP, SP, -128
li r0, -128
and SP, SP, r0
li OFFSET_1, 4 * SIZE
li OFFSET_2, 8 * SIZE
li OFFSET_3, 12 * SIZE
li OFFSET_4, 16 * SIZE
li OFFSET_5, 20 * SIZE
li OFFSET_6, 24 * SIZE
li OFFSET_7, 28 * SIZE
stfs f1, ALPHA + 0(SP)
stfs f1, ALPHA + 4(SP)
stfs f1, ALPHA + 8(SP)
stfs f1, ALPHA + 12(SP)
li r29, 0
stw r29, FZERO(SP)
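/* LDC is converted to bytes; PREC and PREB are the prefetch distances
   for C and B (a shorter B distance is used on Cell). */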
slwi LDC, LDC, BASE_SHIFT
li PREC, (15 * SIZE)
#ifdef CELL
li PREB, (3 * 32 * SIZE)
#else
li PREB, (5 * 32 * SIZE)
#endif
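/* Quick return if any of M, N or K is zero. */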
cmpwi cr0, M, 0
ble LL(999)
cmpwi cr0, N, 0
ble LL(999)
cmpwi cr0, K, 0
ble LL(999)
srawi. J, N, 2
ble LL(60)
.align 4
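/* J-loop: each iteration of LL(01) produces four columns of C
   (CO1..CO4); J = N >> 2. */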
LL(01):
mr CO1, C
add CO2, C, LDC
add CO3, CO2, LDC
add CO4, CO3, LDC
add C, CO4, LDC
mr AO, A
srawi. I, M, 4
ble LL(20)
.align 4
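/* I-loop, 16x4 tile: LL(11) accumulates a 16-row by 4-column block of
   A*B in c01..c16, prefetching the C destinations with dcbtst. */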
LL(11):
vxor c01, c01, c01
LOAD_B b1, OFFSET_0, B
vxor c02, c02, c02
LOAD_A a1, OFFSET_0, AO
vxor c03, c03, c03
LOAD_A a2, OFFSET_1, AO
vxor c04, c04, c04
LOAD_A a3, OFFSET_2, AO
vxor c05, c05, c05
LOAD_A a4, OFFSET_3, AO
vxor c06, c06, c06
LOAD_A a5, OFFSET_4, AO
vxor c07, c07, c07
nop
vxor c08, c08, c08
vxor c09, c09, c09
dcbtst CO1, PREC
vxor c10, c10, c10
dcbtst CO2, PREC
vxor c11, c11, c11
dcbtst CO3, PREC
vxor c12, c12, c12
dcbtst CO4, PREC
vxor c13, c13, c13
mr BO, B
vxor c14, c14, c14
srawi. r0, K, 2
vxor c15, c15, c15
mtspr CTR, r0
vxor c16, c16, c16
vspltw bp1, b1, 0
ble LL(13)
.align 4
#define NOP1 mr r3, r3
#define NOP2 mr r4, r4
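/* Main K-loop: CTR = K >> 2, so each pass of LL(12) performs four
   rank-1 updates of the 16x4 tile with the A/B loads interleaved.
   NOP1/NOP2 are register-move no-ops, presumably inserted to pad
   instruction groups for the dispatcher. */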
LL(12):
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
vmaddfp c02, a2, bp1, c02
DCBT(A, PREA)
vmaddfp c03, a3, bp1, c03
NOP1
vmaddfp c04, a4, bp1, c04
vspltw bp1, b1, 2
vmaddfp c05, a1, bp2, c05
DCBT(B, PREB)
vmaddfp c06, a2, bp2, c06
NOP2
vmaddfp c07, a3, bp2, c07
NOP1
vmaddfp c08, a4, bp2, c08
vspltw bp2, b1, 3
vmaddfp c09, a1, bp1, c09
NOP1
vmaddfp c10, a2, bp1, c10
LOAD_B b2, OFFSET_1, BO
vmaddfp c11, a3, bp1, c11
addi BO, BO, 8 * SIZE
vmaddfp c12, a4, bp1, c12
vspltw bp1, b2, 0
vmaddfp c13, a1, bp2, c13
NOP1
vmaddfp c14, a2, bp2, c14
LOAD_A a5, OFFSET_4, AO
vmaddfp c15, a3, bp2, c15
LOAD_A a6, OFFSET_5, AO
vmaddfp c16, a4, bp2, c16
vspltw bp2, b2, 1
vmaddfp c01, a5, bp1, c01
LOAD_A a7, OFFSET_6, AO
vmaddfp c02, a6, bp1, c02
LOAD_A a8, OFFSET_7, AO
vmaddfp c03, a7, bp1, c03
NOP1
vmaddfp c04, a8, bp1, c04
NOP2
vmaddfp c05, a5, bp2, c05
vspltw bp1, b2, 2
vmaddfp c06, a6, bp2, c06
addi AO, AO, 32 * SIZE
vmaddfp c07, a7, bp2, c07
LOAD_B b1, OFFSET_0, BO
vmaddfp c08, a8, bp2, c08
vspltw bp2, b2, 3
vmaddfp c09, a5, bp1, c09
NOP1
vmaddfp c10, a6, bp1, c10
NOP2
vmaddfp c11, a7, bp1, c11
NOP1
vmaddfp c12, a8, bp1, c12
vspltw bp1, b1, 0
vmaddfp c13, a5, bp2, c13
DCBT(A, PREA)
vmaddfp c14, a6, bp2, c14
LOAD_A a1, OFFSET_0, AO
vmaddfp c15, a7, bp2, c15
LOAD_A a2, OFFSET_1, AO
vmaddfp c16, a8, bp2, c16
vspltw bp2, b1, 1
vmaddfp c01, a1, bp1, c01
LOAD_A a3, OFFSET_2, AO
vmaddfp c02, a2, bp1, c02
LOAD_A a4, OFFSET_3, AO
vmaddfp c03, a3, bp1, c03
NOP1
vmaddfp c04, a4, bp1, c04
vspltw bp1, b1, 2
vmaddfp c05, a1, bp2, c05
NOP1
vmaddfp c06, a2, bp2, c06
NOP2
vmaddfp c07, a3, bp2, c07
NOP1
vmaddfp c08, a4, bp2, c08
vspltw bp2, b1, 3
vmaddfp c09, a1, bp1, c09
LOAD_B b2, OFFSET_1, BO
vmaddfp c10, a2, bp1, c10
NOP2
vmaddfp c11, a3, bp1, c11
NOP1
vmaddfp c12, a4, bp1, c12
addi BO, BO, 8 * SIZE
vmaddfp c13, a1, bp2, c13
vspltw bp1, b2, 0
vmaddfp c14, a2, bp2, c14
LOAD_A a5, OFFSET_4, AO
vmaddfp c15, a3, bp2, c15
LOAD_A a6, OFFSET_5, AO
vmaddfp c16, a4, bp2, c16
vspltw bp2, b2, 1
vmaddfp c01, a5, bp1, c01
LOAD_A a7, OFFSET_6, AO
vmaddfp c02, a6, bp1, c02
LOAD_A a8, OFFSET_7, AO
vmaddfp c03, a7, bp1, c03
addi AO, AO, 32 * SIZE
vmaddfp c04, a8, bp1, c04
NOP2
vmaddfp c05, a5, bp2, c05
vspltw bp1, b2, 2
vmaddfp c06, a6, bp2, c06
NOP2
vmaddfp c07, a7, bp2, c07
NOP1
vmaddfp c08, a8, bp2, c08
LOAD_B b1, OFFSET_0, BO
vmaddfp c09, a5, bp1, c09
vspltw bp2, b2, 3
vmaddfp c10, a6, bp1, c10
LOAD_A a1, OFFSET_0, AO
vmaddfp c11, a7, bp1, c11
LOAD_A a2, OFFSET_1, AO
vmaddfp c12, a8, bp1, c12
NOP2
vmaddfp c13, a5, bp2, c13
vspltw bp1, b1, 0
vmaddfp c14, a6, bp2, c14
LOAD_A a3, OFFSET_2, AO
vmaddfp c15, a7, bp2, c15
LOAD_A a4, OFFSET_3, AO
vmaddfp c16, a8, bp2, c16
bdnz+ LL(12)
.align 4
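/* K % 4 >= 2: two more rank-1 updates, without the pipelining of
   LL(12). */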
LL(13):
andi. r0, K, 2
nop
nop
ble+ LL(15)
.align 4
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
vmaddfp c02, a2, bp1, c02
NOP2
vmaddfp c03, a3, bp1, c03
NOP1
vmaddfp c04, a4, bp1, c04
NOP2
vmaddfp c05, a1, bp2, c05
vspltw bp1, b1, 2
vmaddfp c06, a2, bp2, c06
NOP2
vmaddfp c07, a3, bp2, c07
NOP1
vmaddfp c08, a4, bp2, c08
LOAD_B b2, OFFSET_1, BO
vmaddfp c09, a1, bp1, c09
vspltw bp2, b1, 3
vmaddfp c10, a2, bp1, c10
LOAD_A a5, OFFSET_4, AO
vmaddfp c11, a3, bp1, c11
LOAD_A a6, OFFSET_5, AO
vmaddfp c12, a4, bp1, c12
addi BO, BO, 8 * SIZE
vmaddfp c13, a1, bp2, c13
vspltw bp1, b2, 0
vmaddfp c14, a2, bp2, c14
LOAD_A a7, OFFSET_6, AO
vmaddfp c15, a3, bp2, c15
LOAD_A a8, OFFSET_7, AO
vmaddfp c16, a4, bp2, c16
addi AO, AO, 32 * SIZE
vmaddfp c01, a5, bp1, c01
vspltw bp2, b2, 1
vmaddfp c02, a6, bp1, c02
NOP2
vmaddfp c03, a7, bp1, c03
NOP1
vmaddfp c04, a8, bp1, c04
NOP2
vmaddfp c05, a5, bp2, c05
vspltw bp1, b2, 2
vmaddfp c06, a6, bp2, c06
NOP2
vmaddfp c07, a7, bp2, c07
NOP1
vmaddfp c08, a8, bp2, c08
LOAD_B b1, OFFSET_0, BO
vmaddfp c09, a5, bp1, c09
vspltw bp2, b2, 3
vmaddfp c10, a6, bp1, c10
LOAD_A a1, OFFSET_0, AO
vmaddfp c11, a7, bp1, c11
LOAD_A a2, OFFSET_1, AO
vmaddfp c12, a8, bp1, c12
NOP2
vmaddfp c13, a5, bp2, c13
vspltw bp1, b1, 0
vmaddfp c14, a6, bp2, c14
LOAD_A a3, OFFSET_2, AO
vmaddfp c15, a7, bp2, c15
LOAD_A a4, OFFSET_3, AO
vmaddfp c16, a8, bp2, c16
.align 4
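/* K odd: one final rank-1 update.  alpha and VZERO are also set up
   here for the write-back. */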
LL(15):
andi. r0, K, 1
lvx alpha, OFFSET_0, SP
vxor VZERO, VZERO, VZERO
ble+ LL(18)
.align 4
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
vmaddfp c02, a2, bp1, c02
nop
vmaddfp c03, a3, bp1, c03
nop
vmaddfp c04, a4, bp1, c04
nop
vmaddfp c05, a1, bp2, c05
vspltw bp1, b1, 2
vmaddfp c06, a2, bp2, c06
nop
vmaddfp c07, a3, bp2, c07
nop
vmaddfp c08, a4, bp2, c08
nop
vmaddfp c09, a1, bp1, c09
vspltw bp2, b1, 3
vmaddfp c10, a2, bp1, c10
addi AO, AO, 16 * SIZE
vmaddfp c11, a3, bp1, c11
addi BO, BO, 4 * SIZE
vmaddfp c12, a4, bp1, c12
nop
vmaddfp c13, a1, bp2, c13
vmaddfp c14, a2, bp2, c14
vmaddfp c15, a3, bp2, c15
vmaddfp c16, a4, bp2, c16
.align 4
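/* Write back the 16x4 tile.  C need not be 16-byte aligned, so
   lvsr/vperm shift each row of accumulators into C's alignment
   (merging the edge vectors with VZERO) and vmaddfp applies
   C += alpha * acc.  The compare against 32 * SIZE selects LL(19),
   an alternate ordering of the C loads and stores used when LDC is
   small, presumably so loads from the next column do not collide with
   stores still in flight. */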
LL(18):
lvx C1, OFFSET_0, CO1
cmpwi cr0, LDC, 32 * SIZE
lvx C2, OFFSET_1, CO1
lvsr PERMRSHIFT1, 0, CO1
lvx C3, OFFSET_2, CO1
lvsr PERMRSHIFT2, 0, CO2
lvx C4, OFFSET_3, CO1
lvsr PERMRSHIFT3, 0, CO3
lvx C5, OFFSET_4, CO1
lvsr PERMRSHIFT4, 0, CO4
ble LL(19)
vperm c00, VZERO, c01, PERMRSHIFT1
vperm c01, c01, c02, PERMRSHIFT1
vperm c02, c02, c03, PERMRSHIFT1
vperm c03, c03, c04, PERMRSHIFT1
vperm c04, c04, VZERO, PERMRSHIFT1
vmaddfp c00, alpha, c00, C1
lvx C1, OFFSET_0, CO2
vmaddfp c01, alpha, c01, C2
lvx C6, OFFSET_1, CO2
vmaddfp c02, alpha, c02, C3
lvx C7, OFFSET_2, CO2
vmaddfp c03, alpha, c03, C4
lvx C8, OFFSET_3, CO2
vmaddfp c04, alpha, c04, C5
lvx C9, OFFSET_4, CO2
stvx c00, OFFSET_0, CO1
vperm c00, VZERO, c05, PERMRSHIFT2
stvx c01, OFFSET_1, CO1
vperm c05, c05, c06, PERMRSHIFT2
stvx c02, OFFSET_2, CO1
vperm c06, c06, c07, PERMRSHIFT2
stvx c03, OFFSET_3, CO1
vperm c07, c07, c08, PERMRSHIFT2
stvx c04, OFFSET_4, CO1
vperm c08, c08, VZERO, PERMRSHIFT2
vmaddfp c00, alpha, c00, C1
lvx C1, OFFSET_0, CO3
vmaddfp c05, alpha, c05, C6
lvx C2, OFFSET_1, CO3
vmaddfp c06, alpha, c06, C7
lvx C3, OFFSET_2, CO3
vmaddfp c07, alpha, c07, C8
lvx C4, OFFSET_3, CO3
vmaddfp c08, alpha, c08, C9
lvx C5, OFFSET_4, CO3
stvx c00, OFFSET_0, CO2
vperm c00, VZERO, c09, PERMRSHIFT3
stvx c05, OFFSET_1, CO2
vperm c09, c09, c10, PERMRSHIFT3
stvx c06, OFFSET_2, CO2
vperm c10, c10, c11, PERMRSHIFT3
stvx c07, OFFSET_3, CO2
vperm c11, c11, c12, PERMRSHIFT3
stvx c08, OFFSET_4, CO2
vperm c12, c12, VZERO, PERMRSHIFT3
vmaddfp c00, alpha, c00, C1
lvx C9, OFFSET_4, CO4
vmaddfp c09, alpha, c09, C2
lvx C1, OFFSET_0, CO4
vmaddfp c10, alpha, c10, C3
lvx C6, OFFSET_1, CO4
vmaddfp c11, alpha, c11, C4
lvx C7, OFFSET_2, CO4
vmaddfp c12, alpha, c12, C5
lvx C8, OFFSET_3, CO4
stvx c00, OFFSET_0, CO3
vperm c00, VZERO, c13, PERMRSHIFT4
stvx c09, OFFSET_1, CO3
vperm c13, c13, c14, PERMRSHIFT4
stvx c10, OFFSET_2, CO3
vperm c14, c14, c15, PERMRSHIFT4
stvx c11, OFFSET_3, CO3
vperm c15, c15, c16, PERMRSHIFT4
stvx c12, OFFSET_4, CO3
vperm c16, c16, VZERO, PERMRSHIFT4
vmaddfp c00, alpha, c00, C1
vmaddfp c13, alpha, c13, C6
vmaddfp c14, alpha, c14, C7
vmaddfp c15, alpha, c15, C8
vmaddfp c16, alpha, c16, C9
stvx c00, OFFSET_0, CO4
stvx c13, OFFSET_1, CO4
stvx c14, OFFSET_2, CO4
stvx c15, OFFSET_3, CO4
stvx c16, OFFSET_4, CO4
addi CO1, CO1, 16 * SIZE
addi CO2, CO2, 16 * SIZE
addi CO3, CO3, 16 * SIZE
addi CO4, CO4, 16 * SIZE
addic. I, I, -1
bgt+ LL(11)
b LL(20)
.align 4
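/* Alternate 16x4 write-back, reached when LDC <= 32 * SIZE. */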
LL(19):
lvx C6, OFFSET_1, CO2
lvx C7, OFFSET_2, CO2
lvx C8, OFFSET_3, CO2
lvx C9, OFFSET_4, CO2
vperm c00, VZERO, c01, PERMRSHIFT1
vperm c01, c01, c02, PERMRSHIFT1
vperm c02, c02, c03, PERMRSHIFT1
vperm c03, c03, c04, PERMRSHIFT1
vperm c04, c04, VZERO, PERMRSHIFT1
vmaddfp c00, alpha, c00, C1
vmaddfp c01, alpha, c01, C2
lvx C2, OFFSET_1, CO3
vmaddfp c02, alpha, c02, C3
lvx C3, OFFSET_2, CO3
vmaddfp c03, alpha, c03, C4
lvx C4, OFFSET_3, CO3
vmaddfp c04, alpha, c04, C5
lvx C5, OFFSET_4, CO3
stvx c00, OFFSET_0, CO1
stvx c01, OFFSET_1, CO1
stvx c02, OFFSET_2, CO1
stvx c03, OFFSET_3, CO1
stvx c04, OFFSET_4, CO1
lvx C1, OFFSET_0, CO2
vperm c00, VZERO, c05, PERMRSHIFT2
vperm c05, c05, c06, PERMRSHIFT2
vperm c06, c06, c07, PERMRSHIFT2
vperm c07, c07, c08, PERMRSHIFT2
vperm c08, c08, VZERO, PERMRSHIFT2
vmaddfp c00, alpha, c00, C1
vmaddfp c05, alpha, c05, C6
lvx C6, OFFSET_1, CO4
vmaddfp c06, alpha, c06, C7
lvx C7, OFFSET_2, CO4
vmaddfp c07, alpha, c07, C8
lvx C8, OFFSET_3, CO4
vmaddfp c08, alpha, c08, C9
lvx C9, OFFSET_4, CO4
stvx c00, OFFSET_0, CO2
stvx c05, OFFSET_1, CO2
stvx c06, OFFSET_2, CO2
stvx c07, OFFSET_3, CO2
stvx c08, OFFSET_4, CO2
lvx C1, OFFSET_0, CO3
vperm c00, VZERO, c09, PERMRSHIFT3
vperm c09, c09, c10, PERMRSHIFT3
vperm c10, c10, c11, PERMRSHIFT3
vperm c11, c11, c12, PERMRSHIFT3
vperm c12, c12, VZERO, PERMRSHIFT3
vmaddfp c00, alpha, c00, C1
vmaddfp c09, alpha, c09, C2
vmaddfp c10, alpha, c10, C3
vmaddfp c11, alpha, c11, C4
vmaddfp c12, alpha, c12, C5
stvx c00, OFFSET_0, CO3
stvx c09, OFFSET_1, CO3
stvx c10, OFFSET_2, CO3
stvx c11, OFFSET_3, CO3
stvx c12, OFFSET_4, CO3
lvx C1, OFFSET_0, CO4
vperm c00, VZERO, c13, PERMRSHIFT4
vperm c13, c13, c14, PERMRSHIFT4
vperm c14, c14, c15, PERMRSHIFT4
vperm c15, c15, c16, PERMRSHIFT4
vperm c16, c16, VZERO, PERMRSHIFT4
vmaddfp c00, alpha, c00, C1
vmaddfp c13, alpha, c13, C6
vmaddfp c14, alpha, c14, C7
vmaddfp c15, alpha, c15, C8
vmaddfp c16, alpha, c16, C9
stvx c00, OFFSET_0, CO4
stvx c13, OFFSET_1, CO4
stvx c14, OFFSET_2, CO4
stvx c15, OFFSET_3, CO4
stvx c16, OFFSET_4, CO4
addi CO1, CO1, 16 * SIZE
addi CO2, CO2, 16 * SIZE
addi CO3, CO3, 16 * SIZE
addi CO4, CO4, 16 * SIZE
addic. I, I, -1
bgt+ LL(11)
.align 4
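/* M % 16 >= 8: 8x4 tile (accumulators c01, c02, c05, c06, c09, c10,
   c13, c14), K unrolled by 2. */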
LL(20):
andi. I, M, 8
ble LL(30)
vxor c01, c01, c01
LOAD_A a1, OFFSET_0, AO
vxor c02, c02, c02
LOAD_A a2, OFFSET_1, AO
vxor c05, c05, c05
LOAD_A a3, OFFSET_2, AO
vxor c06, c06, c06
LOAD_A a4, OFFSET_3, AO
vxor c09, c09, c09
LOAD_B b1, OFFSET_0, B
vxor c10, c10, c10
LOAD_B b2, OFFSET_1, B
vxor c13, c13, c13
vxor c14, c14, c14
mr BO, B
vspltw bp1, b1, 0
srawi. r0, K, 1
mtspr CTR, r0
ble LL(25)
.align 4
LL(22):
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
addi AO, AO, 16 * SIZE
vmaddfp c02, a2, bp1, c02
addi BO, BO, 8 * SIZE
vmaddfp c05, a1, bp2, c05
vspltw bp1, b1, 2
vmaddfp c06, a2, bp2, c06
vmaddfp c09, a1, bp1, c09
vspltw bp2, b1, 3
LOAD_B b1, OFFSET_0, BO
vmaddfp c10, a2, bp1, c10
vmaddfp c13, a1, bp2, c13
LOAD_A a1, OFFSET_0, AO
vspltw bp1, b2, 0
vmaddfp c14, a2, bp2, c14
LOAD_A a2, OFFSET_1, AO
vmaddfp c01, a3, bp1, c01
vspltw bp2, b2, 1
vmaddfp c02, a4, bp1, c02
vmaddfp c05, a3, bp2, c05
vspltw bp1, b2, 2
vmaddfp c06, a4, bp2, c06
vmaddfp c09, a3, bp1, c09
vspltw bp2, b2, 3
LOAD_B b2, OFFSET_1, BO
vmaddfp c10, a4, bp1, c10
vmaddfp c13, a3, bp2, c13
LOAD_A a3, OFFSET_2, AO
vmaddfp c14, a4, bp2, c14
LOAD_A a4, OFFSET_3, AO
vspltw bp1, b1, 0
bdnz LL(22)
.align 4
LL(25):
andi. r0, K, 1
lvx alpha, OFFSET_0, SP
vxor VZERO, VZERO, VZERO
ble+ LL(28)
.align 4
LL(26):
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
vmaddfp c02, a2, bp1, c02
nop
vmaddfp c05, a1, bp2, c05
vspltw bp1, b1, 2
vmaddfp c06, a2, bp2, c06
nop
vmaddfp c09, a1, bp1, c09
vspltw bp2, b1, 3
vmaddfp c10, a2, bp1, c10
addi AO, AO, 8 * SIZE
vmaddfp c13, a1, bp2, c13
addi BO, BO, 4 * SIZE
vmaddfp c14, a2, bp2, c14
nop
.align 4
LL(28):
lvx C1, OFFSET_0, CO1
lvx C2, OFFSET_1, CO1
lvx C3, OFFSET_2, CO1
lvsr PERMRSHIFT1, 0, CO1
lvsr PERMRSHIFT2, 0, CO2
lvsr PERMRSHIFT3, 0, CO3
lvsr PERMRSHIFT4, 0, CO4
vperm c00, VZERO, c01, PERMRSHIFT1
vperm c01, c01, c02, PERMRSHIFT1
vperm c02, c02, VZERO, PERMRSHIFT1
vmaddfp c00, alpha, c00, C1
vmaddfp c01, alpha, c01, C2
vmaddfp c02, alpha, c02, C3
stvx c00, OFFSET_0, CO1
stvx c01, OFFSET_1, CO1
stvx c02, OFFSET_2, CO1
lvx C1, OFFSET_0, CO2
lvx C2, OFFSET_1, CO2
lvx C3, OFFSET_2, CO2
vperm c00, VZERO, c05, PERMRSHIFT2
vperm c05, c05, c06, PERMRSHIFT2
vperm c06, c06, VZERO, PERMRSHIFT2
vmaddfp c00, alpha, c00, C1
vmaddfp c05, alpha, c05, C2
vmaddfp c06, alpha, c06, C3
stvx c00, OFFSET_0, CO2
stvx c05, OFFSET_1, CO2
stvx c06, OFFSET_2, CO2
lvx C1, OFFSET_0, CO3
lvx C2, OFFSET_1, CO3
lvx C3, OFFSET_2, CO3
vperm c00, VZERO, c09, PERMRSHIFT3
vperm c09, c09, c10, PERMRSHIFT3
vperm c10, c10, VZERO, PERMRSHIFT3
vmaddfp c00, alpha, c00, C1
vmaddfp c09, alpha, c09, C2
vmaddfp c10, alpha, c10, C3
stvx c00, OFFSET_0, CO3
stvx c09, OFFSET_1, CO3
stvx c10, OFFSET_2, CO3
lvx C1, OFFSET_0, CO4
lvx C2, OFFSET_1, CO4
lvx C3, OFFSET_2, CO4
vperm c00, VZERO, c13, PERMRSHIFT4
vperm c13, c13, c14, PERMRSHIFT4
vperm c14, c14, VZERO, PERMRSHIFT4
vmaddfp c00, alpha, c00, C1
vmaddfp c13, alpha, c13, C2
vmaddfp c14, alpha, c14, C3
stvx c00, OFFSET_0, CO4
stvx c13, OFFSET_1, CO4
stvx c14, OFFSET_2, CO4
addi CO1, CO1, 8 * SIZE
addi CO2, CO2, 8 * SIZE
addi CO3, CO3, 8 * SIZE
addi CO4, CO4, 8 * SIZE
.align 4
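/* M % 8 >= 4: 4x4 tile.  K is unrolled by 2 into two accumulator sets
   that are summed in LL(38) before the write-back. */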
LL(30):
andi. I, M, 4
ble LL(40)
vxor c01, c01, c01
LOAD_A a1, OFFSET_0, AO
vxor c02, c02, c02
LOAD_A a2, OFFSET_1, AO
vxor c05, c05, c05
LOAD_B b1, OFFSET_0, B
vxor c06, c06, c06
LOAD_B b2, OFFSET_1, B
vxor c09, c09, c09
vxor c10, c10, c10
vxor c13, c13, c13
vxor c14, c14, c14
vspltw bp1, b1, 0
mr BO, B
srawi. r0, K, 1
mtspr CTR, r0
ble LL(35)
.align 4
LL(32):
vmaddfp c01, a1, bp1, c01
addi AO, AO, 8 * SIZE
vspltw bp2, b1, 1
vmaddfp c05, a1, bp2, c05
addi BO, BO, 8 * SIZE
vspltw bp1, b1, 2
vmaddfp c09, a1, bp1, c09
vspltw bp2, b1, 3
vmaddfp c13, a1, bp2, c13
LOAD_A a1, OFFSET_0, AO
vspltw bp1, b2, 0
LOAD_B b1, OFFSET_0, BO
vmaddfp c02, a2, bp1, c02
vspltw bp2, b2, 1
vmaddfp c06, a2, bp2, c06
vspltw bp1, b2, 2
vmaddfp c10, a2, bp1, c10
vspltw bp2, b2, 3
LOAD_B b2, OFFSET_1, BO
vmaddfp c14, a2, bp2, c14
LOAD_A a2, OFFSET_1, AO
vspltw bp1, b1, 0
bdnz LL(32)
.align 4
LL(35):
andi. r0, K, 1
lvx alpha, OFFSET_0, SP
vxor VZERO, VZERO, VZERO
ble+ LL(38)
.align 4
LL(36):
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
vmaddfp c05, a1, bp2, c05
vspltw bp1, b1, 2
vmaddfp c09, a1, bp1, c09
vspltw bp2, b1, 3
vmaddfp c13, a1, bp2, c13
addi AO, AO, 4 * SIZE
addi BO, BO, 4 * SIZE
.align 4
LL(38):
vaddfp c01, c01, c02
vaddfp c05, c05, c06
vaddfp c09, c09, c10
vaddfp c13, c13, c14
lvx C1, OFFSET_0, CO1
lvx C2, OFFSET_1, CO1
lvsr PERMRSHIFT1, 0, CO1
lvsr PERMRSHIFT2, 0, CO2
lvsr PERMRSHIFT3, 0, CO3
lvsr PERMRSHIFT4, 0, CO4
vperm c00, VZERO, c01, PERMRSHIFT1
vperm c01, c01, VZERO, PERMRSHIFT1
vmaddfp c00, alpha, c00, C1
vmaddfp c01, alpha, c01, C2
stvx c00, OFFSET_0, CO1
stvx c01, OFFSET_1, CO1
lvx C1, OFFSET_0, CO2
lvx C2, OFFSET_1, CO2
vperm c00, VZERO, c05, PERMRSHIFT2
vperm c05, c05, VZERO, PERMRSHIFT2
vmaddfp c00, alpha, c00, C1
vmaddfp c05, alpha, c05, C2
stvx c00, OFFSET_0, CO2
stvx c05, OFFSET_1, CO2
lvx C1, OFFSET_0, CO3
lvx C2, OFFSET_1, CO3
vperm c00, VZERO, c09, PERMRSHIFT3
vperm c09, c09, VZERO, PERMRSHIFT3
vmaddfp c00, alpha, c00, C1
vmaddfp c09, alpha, c09, C2
stvx c00, OFFSET_0, CO3
stvx c09, OFFSET_1, CO3
lvx C1, OFFSET_0, CO4
lvx C2, OFFSET_1, CO4
vperm c00, VZERO, c13, PERMRSHIFT4
vperm c13, c13, VZERO, PERMRSHIFT4
vmaddfp c00, alpha, c00, C1
vmaddfp c13, alpha, c13, C2
stvx c00, OFFSET_0, CO4
stvx c13, OFFSET_1, CO4
addi CO1, CO1, 4 * SIZE
addi CO2, CO2, 4 * SIZE
addi CO3, CO3, 4 * SIZE
addi CO4, CO4, 4 * SIZE
.align 4
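/* M % 4 >= 2: 2x4 tile computed with the scalar FPU (f0..f7). */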
LL(40):
andi. I, M, 2
ble LL(50)
mr BO, B
LFD f8, 0 * SIZE(AO)
LFD f9, 1 * SIZE(AO)
LFD f10, 0 * SIZE(B)
LFD f11, 1 * SIZE(B)
LFD f12, 2 * SIZE(B)
LFD f13, 3 * SIZE(B)
lfs f0, FZERO(SP)
fmr f1, f0
fmr f2, f0
fmr f3, f0
fmr f4, f0
fmr f5, f0
fmr f6, f0
fmr f7, f0
srawi. r0, K, 1
mtspr CTR, r0
ble LL(45)
.align 4
LL(42):
FMADD f0, f8, f10, f0
FMADD f2, f8, f11, f2
FMADD f4, f8, f12, f4
FMADD f6, f8, f13, f6
FMADD f1, f9, f10, f1
FMADD f3, f9, f11, f3
FMADD f5, f9, f12, f5
FMADD f7, f9, f13, f7
LFD f8, 2 * SIZE(AO)
LFD f9, 3 * SIZE(AO)
LFD f10, 4 * SIZE(BO)
LFD f11, 5 * SIZE(BO)
LFD f12, 6 * SIZE(BO)
LFD f13, 7 * SIZE(BO)
FMADD f0, f8, f10, f0
FMADD f2, f8, f11, f2
FMADD f4, f8, f12, f4
FMADD f6, f8, f13, f6
FMADD f1, f9, f10, f1
FMADD f3, f9, f11, f3
FMADD f5, f9, f12, f5
FMADD f7, f9, f13, f7
LFD f8, 4 * SIZE(AO)
LFD f9, 5 * SIZE(AO)
LFD f10, 8 * SIZE(BO)
LFD f11, 9 * SIZE(BO)
LFD f12, 10 * SIZE(BO)
LFD f13, 11 * SIZE(BO)
addi AO, AO, 4 * SIZE
addi BO, BO, 8 * SIZE
bdnz LL(42)
.align 4
LL(45):
andi. r0, K, 1
ble LL(48)
.align 4
LL(46):
FMADD f0, f8, f10, f0
FMADD f2, f8, f11, f2
FMADD f4, f8, f12, f4
FMADD f6, f8, f13, f6
FMADD f1, f9, f10, f1
FMADD f3, f9, f11, f3
FMADD f5, f9, f12, f5
FMADD f7, f9, f13, f7
LFD f8, 2 * SIZE(AO)
LFD f9, 3 * SIZE(AO)
LFD f10, 4 * SIZE(BO)
LFD f11, 5 * SIZE(BO)
LFD f12, 6 * SIZE(BO)
LFD f13, 7 * SIZE(BO)
addi AO, AO, 2 * SIZE
addi BO, BO, 4 * SIZE
.align 4
LL(48):
lfs f13, ALPHA(SP)
LFD f8, 0 * SIZE(CO1)
LFD f9, 1 * SIZE(CO1)
LFD f10, 0 * SIZE(CO2)
LFD f11, 1 * SIZE(CO2)
FMADD f0, f0, f13, f8
FMADD f1, f1, f13, f9
FMADD f2, f2, f13, f10
FMADD f3, f3, f13, f11
LFD f8, 0 * SIZE(CO3)
LFD f9, 1 * SIZE(CO3)
LFD f10, 0 * SIZE(CO4)
LFD f11, 1 * SIZE(CO4)
FMADD f4, f4, f13, f8
FMADD f5, f5, f13, f9
FMADD f6, f6, f13, f10
FMADD f7, f7, f13, f11
STFD f0, 0 * SIZE(CO1)
STFD f1, 1 * SIZE(CO1)
STFD f2, 0 * SIZE(CO2)
STFD f3, 1 * SIZE(CO2)
STFD f4, 0 * SIZE(CO3)
STFD f5, 1 * SIZE(CO3)
STFD f6, 0 * SIZE(CO4)
STFD f7, 1 * SIZE(CO4)
addi CO1, CO1, 2 * SIZE
addi CO2, CO2, 2 * SIZE
addi CO3, CO3, 2 * SIZE
addi CO4, CO4, 2 * SIZE
.align 4
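/* M odd: 1x4 tile with the scalar FPU. */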
LL(50):
andi. I, M, 1
ble LL(59)
mr BO, B
LFD f8, 0 * SIZE(AO)
LFD f9, 1 * SIZE(AO)
LFD f10, 0 * SIZE(B)
LFD f11, 1 * SIZE(B)
LFD f12, 2 * SIZE(B)
LFD f13, 3 * SIZE(B)
lfs f0, FZERO(SP)
fmr f1, f0
fmr f2, f0
fmr f3, f0
srawi. r0, K, 1
mtspr CTR, r0
ble LL(55)
.align 4
LL(52):
FMADD f0, f8, f10, f0
FMADD f1, f8, f11, f1
FMADD f2, f8, f12, f2
FMADD f3, f8, f13, f3
LFD f8, 2 * SIZE(AO)
LFD f10, 4 * SIZE(BO)
LFD f11, 5 * SIZE(BO)
LFD f12, 6 * SIZE(BO)
LFD f13, 7 * SIZE(BO)
FMADD f0, f9, f10, f0
FMADD f1, f9, f11, f1
FMADD f2, f9, f12, f2
FMADD f3, f9, f13, f3
LFD f9, 3 * SIZE(AO)
LFD f10, 8 * SIZE(BO)
LFD f11, 9 * SIZE(BO)
LFD f12, 10 * SIZE(BO)
LFD f13, 11 * SIZE(BO)
addi AO, AO, 2 * SIZE
addi BO, BO, 8 * SIZE
bdnz LL(52)
.align 4
LL(55):
andi. r0, K, 1
ble LL(58)
.align 4
LL(56):
FMADD f0, f8, f10, f0
FMADD f1, f8, f11, f1
FMADD f2, f8, f12, f2
FMADD f3, f8, f13, f3
LFD f8, 2 * SIZE(AO)
LFD f10, 4 * SIZE(BO)
LFD f11, 5 * SIZE(BO)
LFD f12, 6 * SIZE(BO)
LFD f13, 7 * SIZE(BO)
addi AO, AO, 1 * SIZE
addi BO, BO, 4 * SIZE
.align 4
LL(58):
lfs f13, ALPHA(SP)
LFD f8, 0 * SIZE(CO1)
LFD f9, 0 * SIZE(CO2)
LFD f10, 0 * SIZE(CO3)
LFD f11, 0 * SIZE(CO4)
FMADD f0, f0, f13, f8
FMADD f1, f1, f13, f9
FMADD f2, f2, f13, f10
FMADD f3, f3, f13, f11
STFD f0, 0 * SIZE(CO1)
STFD f1, 0 * SIZE(CO2)
STFD f2, 0 * SIZE(CO3)
STFD f3, 0 * SIZE(CO4)
.align 4
LL(59):
mr B, BO
addic. J, J, -1
bgt LL(01)
.align 4
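/* N % 4 >= 2: two-column panel (CO1, CO2), same structure as the
   four-column case above. */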
LL(60):
andi. r0, N, 2
ble LL(120)
mr CO1, C
add CO2, C, LDC
add C, CO2, LDC
mr AO, A
srawi. I, M, 4
ble LL(80)
.align 4
LL(71):
vxor c01, c01, c01
LOAD_B b1, OFFSET_0, B
vxor c02, c02, c02
vxor c03, c03, c03
LOAD_A a1, OFFSET_0, AO
vxor c04, c04, c04
LOAD_A a2, OFFSET_1, AO
vxor c05, c05, c05
LOAD_A a3, OFFSET_2, AO
vxor c06, c06, c06
LOAD_A a4, OFFSET_3, AO
vxor c07, c07, c07
vxor c08, c08, c08
mr BO, B
dcbtst CO1, PREC
dcbtst CO2, PREC
vspltw bp1, b1, 0
srawi. r0, K, 1
mtspr CTR, r0
ble LL(75)
.align 4
LL(72):
LOAD_A a5, OFFSET_4, AO
LOAD_A a6, OFFSET_5, AO
LOAD_A a7, OFFSET_6, AO
LOAD_A a8, OFFSET_7, AO
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
vmaddfp c02, a2, bp1, c02
vmaddfp c03, a3, bp1, c03
vmaddfp c04, a4, bp1, c04
vmaddfp c05, a1, bp2, c05
vspltw bp1, b1, 2
vmaddfp c06, a2, bp2, c06
vmaddfp c07, a3, bp2, c07
vmaddfp c08, a4, bp2, c08
vmaddfp c01, a5, bp1, c01
vspltw bp2, b1, 3
vmaddfp c02, a6, bp1, c02
vmaddfp c03, a7, bp1, c03
vmaddfp c04, a8, bp1, c04
LOAD_B b1, OFFSET_1, BO
vspltw bp1, b1, 0
vmaddfp c05, a5, bp2, c05
vmaddfp c06, a6, bp2, c06
vmaddfp c07, a7, bp2, c07
vmaddfp c08, a8, bp2, c08
addi AO, AO, 32 * SIZE
addi BO, BO, 4 * SIZE
LOAD_A a1, OFFSET_0, AO
LOAD_A a2, OFFSET_1, AO
LOAD_A a3, OFFSET_2, AO
LOAD_A a4, OFFSET_3, AO
bdnz LL(72)
.align 4
LL(75):
andi. r0, K, 1
lvx alpha, OFFSET_0, SP
vxor VZERO, VZERO, VZERO
ble+ LL(78)
.align 4
LL(76):
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
vmaddfp c02, a2, bp1, c02
addi AO, AO, 16 * SIZE
vmaddfp c03, a3, bp1, c03
addi BO, BO, 2 * SIZE
vmaddfp c04, a4, bp1, c04
nop
vmaddfp c05, a1, bp2, c05
vmaddfp c06, a2, bp2, c06
vmaddfp c07, a3, bp2, c07
vmaddfp c08, a4, bp2, c08
.align 4
LL(78):
lvx C1, OFFSET_0, CO1
lvx C2, OFFSET_1, CO1
lvx C3, OFFSET_2, CO1
lvx C4, OFFSET_3, CO1
lvx C5, OFFSET_4, CO1
lvsr PERMRSHIFT1, 0, CO1
lvsr PERMRSHIFT2, 0, CO2
lvsr PERMRSHIFT3, 0, CO3
lvsr PERMRSHIFT4, 0, CO4
vperm c00, VZERO, c01, PERMRSHIFT1
vperm c01, c01, c02, PERMRSHIFT1
vperm c02, c02, c03, PERMRSHIFT1
vperm c03, c03, c04, PERMRSHIFT1
vperm c04, c04, VZERO, PERMRSHIFT1
vmaddfp c00, alpha, c00, C1
vmaddfp c01, alpha, c01, C2
vmaddfp c02, alpha, c02, C3
vmaddfp c03, alpha, c03, C4
vmaddfp c04, alpha, c04, C5
stvx c00, OFFSET_0, CO1
stvx c01, OFFSET_1, CO1
stvx c02, OFFSET_2, CO1
stvx c03, OFFSET_3, CO1
stvx c04, OFFSET_4, CO1
lvx C1, OFFSET_0, CO2
lvx C2, OFFSET_1, CO2
lvx C3, OFFSET_2, CO2
lvx C4, OFFSET_3, CO2
lvx C5, OFFSET_4, CO2
vperm c00, VZERO, c05, PERMRSHIFT2
vperm c05, c05, c06, PERMRSHIFT2
vperm c06, c06, c07, PERMRSHIFT2
vperm c07, c07, c08, PERMRSHIFT2
vperm c08, c08, VZERO, PERMRSHIFT2
vmaddfp c00, alpha, c00, C1
vmaddfp c05, alpha, c05, C2
vmaddfp c06, alpha, c06, C3
vmaddfp c07, alpha, c07, C4
vmaddfp c08, alpha, c08, C5
stvx c00, OFFSET_0, CO2
stvx c05, OFFSET_1, CO2
stvx c06, OFFSET_2, CO2
stvx c07, OFFSET_3, CO2
stvx c08, OFFSET_4, CO2
addi CO1, CO1, 16 * SIZE
addi CO2, CO2, 16 * SIZE
addic. I, I, -1
bgt+ LL(71)
.align 4
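/* M % 16 >= 8 for the two-column panel: 8x2 tile. */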
LL(80):
andi. I, M, 8
ble LL(90)
vxor c01, c01, c01
LOAD_B b1, OFFSET_0, B
vxor c02, c02, c02
vxor c03, c03, c03
LOAD_A a1, OFFSET_0, AO
vxor c04, c04, c04
LOAD_A a2, OFFSET_1, AO
vxor c05, c05, c05
LOAD_A a3, OFFSET_2, AO
vxor c06, c06, c06
LOAD_A a4, OFFSET_3, AO
vxor c07, c07, c07
vxor c08, c08, c08
mr BO, B
vspltw bp1, b1, 0
srawi. r0, K, 1
mtspr CTR, r0
ble LL(85)
.align 4
LL(82):
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
vmaddfp c02, a2, bp1, c02
vmaddfp c05, a1, bp2, c05
vspltw bp1, b1, 2
vmaddfp c06, a2, bp2, c06
vmaddfp c03, a3, bp1, c03
vspltw bp2, b1, 3
vmaddfp c04, a4, bp1, c04
LOAD_B b1, OFFSET_1, BO
vspltw bp1, b1, 0
vmaddfp c07, a3, bp2, c07
vmaddfp c08, a4, bp2, c08
addi AO, AO, 16 * SIZE
addi BO, BO, 4 * SIZE
LOAD_A a1, OFFSET_0, AO
LOAD_A a2, OFFSET_1, AO
LOAD_A a3, OFFSET_2, AO
LOAD_A a4, OFFSET_3, AO
bdnz LL(82)
.align 4
LL(85):
andi. r0, K, 1
lvx alpha, OFFSET_0, SP
vxor VZERO, VZERO, VZERO
ble+ LL(88)
.align 4
LL(86):
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
vmaddfp c02, a2, bp1, c02
addi AO, AO, 8 * SIZE
vmaddfp c05, a1, bp2, c05
addi BO, BO, 2 * SIZE
vmaddfp c06, a2, bp2, c06
.align 4
LL(88):
lvx C1, OFFSET_0, CO1
lvx C2, OFFSET_1, CO1
lvx C3, OFFSET_2, CO1
vaddfp c01, c01, c03
vaddfp c02, c02, c04
vaddfp c05, c05, c07
vaddfp c06, c06, c08
lvsr PERMRSHIFT1, 0, CO1
lvsr PERMRSHIFT2, 0, CO2
lvsr PERMRSHIFT3, 0, CO3
lvsr PERMRSHIFT4, 0, CO4
vperm c00, VZERO, c01, PERMRSHIFT1
vperm c01, c01, c02, PERMRSHIFT1
vperm c02, c02, VZERO, PERMRSHIFT1
vmaddfp c00, alpha, c00, C1
vmaddfp c01, alpha, c01, C2
vmaddfp c02, alpha, c02, C3
stvx c00, OFFSET_0, CO1
stvx c01, OFFSET_1, CO1
stvx c02, OFFSET_2, CO1
lvx C1, OFFSET_0, CO2
lvx C2, OFFSET_1, CO2
lvx C3, OFFSET_2, CO2
vperm c00, VZERO, c05, PERMRSHIFT2
vperm c05, c05, c06, PERMRSHIFT2
vperm c06, c06, VZERO, PERMRSHIFT2
vmaddfp c00, alpha, c00, C1
vmaddfp c05, alpha, c05, C2
vmaddfp c06, alpha, c06, C3
stvx c00, OFFSET_0, CO2
stvx c05, OFFSET_1, CO2
stvx c06, OFFSET_2, CO2
addi CO1, CO1, 8 * SIZE
addi CO2, CO2, 8 * SIZE
.align 4
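/* M % 8 >= 4: 4x2 tile. */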
LL(90):
andi. I, M, 4
ble LL(100)
vxor c01, c01, c01
LOAD_B b1, OFFSET_0, B
vxor c02, c02, c02
LOAD_A a1, OFFSET_0, AO
LOAD_A a2, OFFSET_1, AO
vxor c05, c05, c05
vxor c06, c06, c06
mr BO, B
vspltw bp1, b1, 0
srawi. r0, K, 1
mtspr CTR, r0
ble LL(95)
.align 4
LL(92):
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
vmaddfp c05, a1, bp2, c05
vspltw bp1, b1, 2
vmaddfp c02, a2, bp1, c02
vspltw bp2, b1, 3
LOAD_B b1, OFFSET_1, BO
vspltw bp1, b1, 0
vmaddfp c06, a2, bp2, c06
addi AO, AO, 8 * SIZE
addi BO, BO, 4 * SIZE
LOAD_A a1, OFFSET_0, AO
LOAD_A a2, OFFSET_1, AO
bdnz LL(92)
.align 4
LL(95):
andi. r0, K, 1
lvx alpha, OFFSET_0, SP
vxor VZERO, VZERO, VZERO
ble+ LL(98)
.align 4
LL(96):
vspltw bp2, b1, 1
vmaddfp c01, a1, bp1, c01
vmaddfp c05, a1, bp2, c05
addi AO, AO, 4 * SIZE
addi BO, BO, 2 * SIZE
.align 4
LL(98):
vaddfp c01, c01, c02
vaddfp c05, c05, c06
vaddfp c09, c09, c10
vaddfp c13, c13, c14
lvx C1, OFFSET_0, CO1
lvx C2, OFFSET_1, CO1
lvsr PERMRSHIFT1, 0, CO1
lvsr PERMRSHIFT2, 0, CO2
lvsr PERMRSHIFT3, 0, CO3
lvsr PERMRSHIFT4, 0, CO4
vperm c00, VZERO, c01, PERMRSHIFT1
vperm c01, c01, VZERO, PERMRSHIFT1
vmaddfp c00, alpha, c00, C1
vmaddfp c01, alpha, c01, C2
stvx c00, OFFSET_0, CO1
stvx c01, OFFSET_1, CO1
lvx C1, OFFSET_0, CO2
lvx C2, OFFSET_1, CO2
vperm c00, VZERO, c05, PERMRSHIFT2
vperm c05, c05, VZERO, PERMRSHIFT2
vmaddfp c00, alpha, c00, C1
vmaddfp c05, alpha, c05, C2
stvx c00, OFFSET_0, CO2
stvx c05, OFFSET_1, CO2
addi CO1, CO1, 4 * SIZE
addi CO2, CO2, 4 * SIZE
.align 4
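/* M % 4 >= 2: 2x2 tile with the scalar FPU. */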
LL(100):
andi. I, M, 2
ble LL(110)
mr BO, B
LFD f8, 0 * SIZE(AO)
LFD f9, 1 * SIZE(AO)
LFD f10, 0 * SIZE(B)
LFD f11, 1 * SIZE(B)
LFD f12, 2 * SIZE(B)
LFD f13, 3 * SIZE(B)
lfs f0, FZERO(SP)
fmr f1, f0
fmr f2, f0
fmr f3, f0
fmr f4, f0
fmr f5, f0
fmr f6, f0
fmr f7, f0
srawi. r0, K, 1
mtspr CTR, r0
ble LL(105)
.align 4
LL(102):
FMADD f0, f8, f10, f0
FMADD f1, f9, f10, f1
FMADD f2, f8, f11, f2
FMADD f3, f9, f11, f3
LFD f8, 2 * SIZE(AO)
LFD f9, 3 * SIZE(AO)
FMADD f4, f8, f12, f4
FMADD f5, f9, f12, f5
FMADD f6, f8, f13, f6
FMADD f7, f9, f13, f7
LFD f8, 4 * SIZE(AO)
LFD f9, 5 * SIZE(AO)
LFD f10, 4 * SIZE(BO)
LFD f11, 5 * SIZE(BO)
LFD f12, 6 * SIZE(BO)
LFD f13, 7 * SIZE(BO)
addi AO, AO, 4 * SIZE
addi BO, BO, 4 * SIZE
bdnz LL(102)
.align 4
LL(105):
andi. r0, K, 1
lfs f13, ALPHA(SP)
ble LL(108)
.align 4
LL(106):
FMADD f0, f8, f10, f0
FMADD f1, f9, f10, f1
FMADD f2, f8, f11, f2
FMADD f3, f9, f11, f3
LFD f8, 2 * SIZE(AO)
LFD f9, 3 * SIZE(AO)
LFD f10, 2 * SIZE(BO)
LFD f11, 3 * SIZE(BO)
addi AO, AO, 2 * SIZE
addi BO, BO, 2 * SIZE
.align 4
LL(108):
LFD f8, 0 * SIZE(CO1)
LFD f9, 1 * SIZE(CO1)
LFD f10, 0 * SIZE(CO2)
LFD f11, 1 * SIZE(CO2)
FADD f0, f0, f4
FADD f1, f1, f5
FADD f2, f2, f6
FADD f3, f3, f7
FMADD f0, f0, f13, f8
FMADD f1, f1, f13, f9
FMADD f2, f2, f13, f10
FMADD f3, f3, f13, f11
STFD f0, 0 * SIZE(CO1)
STFD f1, 1 * SIZE(CO1)
STFD f2, 0 * SIZE(CO2)
STFD f3, 1 * SIZE(CO2)
addi CO1, CO1, 2 * SIZE
addi CO2, CO2, 2 * SIZE
.align 4
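/* M odd: 1x2 tile with the scalar FPU. */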
LL(110):
andi. I, M, 1
ble LL(119)
mr BO, B
LFD f8, 0 * SIZE(AO)
LFD f9, 1 * SIZE(AO)
LFD f10, 0 * SIZE(B)
LFD f11, 1 * SIZE(B)
LFD f12, 2 * SIZE(B)
LFD f13, 3 * SIZE(B)
lfs f0, FZERO(SP)
fmr f1, f0
fmr f2, f0
fmr f3, f0
srawi. r0, K, 1
mtspr CTR, r0
ble LL(115)
.align 4
LL(112):
FMADD f0, f8, f10, f0
FMADD f1, f8, f11, f1
FMADD f2, f9, f12, f2
FMADD f3, f9, f13, f3
LFD f8, 2 * SIZE(AO)
LFD f9, 3 * SIZE(AO)
LFD f10, 4 * SIZE(BO)
LFD f11, 5 * SIZE(BO)
LFD f12, 6 * SIZE(BO)
LFD f13, 7 * SIZE(BO)
addi AO, AO, 2 * SIZE
addi BO, BO, 4 * SIZE
bdnz LL(112)
.align 4
LL(115):
andi. r0, K, 1
lfs f13, ALPHA(SP)
ble LL(118)
.align 4
LL(116):
FMADD f0, f8, f10, f0
FMADD f1, f8, f11, f1
LFD f8, 1 * SIZE(AO)
LFD f10, 2 * SIZE(BO)
LFD f11, 3 * SIZE(BO)
addi AO, AO, 1 * SIZE
addi BO, BO, 2 * SIZE
.align 4
LL(118):
LFD f8, 0 * SIZE(CO1)
LFD f9, 0 * SIZE(CO2)
FADD f0, f0, f2
FADD f1, f1, f3
FMADD f0, f0, f13, f8
FMADD f1, f1, f13, f9
STFD f0, 0 * SIZE(CO1)
STFD f1, 0 * SIZE(CO2)
.align 4
LL(119):
mr B, BO
.align 4
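/* N odd: last column of C.  B now advances by one float per K step,
   so the vector paths below first peel up to two K iterations whenever
   B is not 16-byte aligned (the "andi. r0, B, 15" checks) before
   switching to aligned lvx loads of B. */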
LL(120):
andi. r0, N, 1
ble LL(999)
mr CO1, C
mr AO, A
srawi. I, M, 4
ble LL(140)
.align 4
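/* 16x1 tile (c01..c04). */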
LL(130):
vxor c01, c01, c01
vxor c02, c02, c02
vxor c03, c03, c03
vxor c04, c04, c04
mr BO, B
dcbtst CO1, PREC
mr J, K
andi. r0, B, 15
ble+ LL(131)
LOAD_A a1, OFFSET_0, AO
LOAD_A a2, OFFSET_1, AO
LOAD_A a3, OFFSET_2, AO
LOAD_A a4, OFFSET_3, AO
LOAD_B b1, OFFSET_0, BO
vspltw bp1, b1, 2
vspltw bp2, b1, 3
addi AO, AO, 16 * SIZE
addi BO, BO, SIZE
vmaddfp c01, a1, bp1, c01
vmaddfp c02, a2, bp1, c02
vmaddfp c03, a3, bp1, c03
vmaddfp c04, a4, bp1, c04
subi J, J, 1
cmpwi cr0, J, 0
ble LL(138)
LOAD_A a1, OFFSET_0, AO
LOAD_A a2, OFFSET_1, AO
LOAD_A a3, OFFSET_2, AO
LOAD_A a4, OFFSET_3, AO
addi AO, AO, 16 * SIZE
addi BO, BO, SIZE
vmaddfp c01, a1, bp2, c01
vmaddfp c02, a2, bp2, c02
vmaddfp c03, a3, bp2, c03
vmaddfp c04, a4, bp2, c04
subi J, J, 1
cmpwi cr0, J, 0
ble LL(138)
.align 4
LL(131):
LOAD_A a1, OFFSET_0, AO
LOAD_A a2, OFFSET_1, AO
LOAD_A a3, OFFSET_2, AO
LOAD_A a4, OFFSET_3, AO
LOAD_A a5, OFFSET_4, AO
LOAD_A a6, OFFSET_5, AO
LOAD_A a7, OFFSET_6, AO
LOAD_A a8, OFFSET_7, AO
LOAD_B b1, OFFSET_0, BO
srawi. r0, J, 2
mtspr CTR, r0
ble LL(135)
.align 4
LL(133):
vspltw bp1, b1, 0
vmaddfp c01, a1, bp1, c01
vmaddfp c02, a2, bp1, c02
vmaddfp c03, a3, bp1, c03
vmaddfp c04, a4, bp1, c04
vspltw bp2, b1, 1
vmaddfp c01, a5, bp2, c01
vmaddfp c02, a6, bp2, c02
vmaddfp c03, a7, bp2, c03
vmaddfp c04, a8, bp2, c04
addi AO, AO, 32 * SIZE
LOAD_A a1, OFFSET_0, AO
LOAD_A a2, OFFSET_1, AO
LOAD_A a3, OFFSET_2, AO
LOAD_A a4, OFFSET_3, AO
vspltw bp1, b1, 2
vmaddfp c01, a1, bp1, c01
vmaddfp c02, a2, bp1, c02
vmaddfp c03, a3, bp1, c03
vmaddfp c04, a4, bp1, c04
LOAD_A a5, OFFSET_4, AO
LOAD_A a6, OFFSET_5, AO
LOAD_A a7, OFFSET_6, AO
LOAD_A a8, OFFSET_7, AO
vspltw bp2, b1, 3
vmaddfp c01, a5, bp2, c01
vmaddfp c02, a6, bp2, c02
vmaddfp c03, a7, bp2, c03
vmaddfp c04, a8, bp2, c04
addi AO, AO, 32 * SIZE
addi BO, BO, 4 * SIZE
LOAD_A a1, OFFSET_0, AO
LOAD_A a2, OFFSET_1, AO
LOAD_A a3, OFFSET_2, AO
LOAD_A a4, OFFSET_3, AO
LOAD_A a5, OFFSET_4, AO
LOAD_A a6, OFFSET_5, AO
LOAD_A a7, OFFSET_6, AO
LOAD_A a8, OFFSET_7, AO
LOAD_B b1, OFFSET_0, BO
bdnz LL(133)
.align 4
LL(135):
andi. r0, J, 3
ble+ LL(138)
cmpwi cr0, r0, 3
bne LL(136)
vspltw bp1, b1, 0
vmaddfp c01, a1, bp1, c01
vmaddfp c02, a2, bp1, c02
vmaddfp c03, a3, bp1, c03
vmaddfp c04, a4, bp1, c04
addi AO, AO, 16 * SIZE
LOAD_A a1, OFFSET_0, AO
LOAD_A a2, OFFSET_1, AO
LOAD_A a3, OFFSET_2, AO
LOAD_A a4, OFFSET_3, AO
vspltw bp2, b1, 1
vmaddfp c01, a1, bp2, c01
vmaddfp c02, a2, bp2, c02
vmaddfp c03, a3, bp2, c03
vmaddfp c04, a4, bp2, c04
addi AO, AO, 16 * SIZE
LOAD_A a1, OFFSET_0, AO
LOAD_A a2, OFFSET_1, AO
LOAD_A a3, OFFSET_2, AO
LOAD_A a4, OFFSET_3, AO
vspltw bp1, b1, 2
vmaddfp c01, a1, bp1, c01
vmaddfp c02, a2, bp1, c02
vmaddfp c03, a3, bp1, c03
vmaddfp c04, a4, bp1, c04
addi AO, AO, 16 * SIZE
addi BO, BO, 3 * SIZE
b LL(138)
.align 4
LL(136):
cmpwi cr0, r0, 2
bne LL(137)
vspltw bp1, b1, 0
vspltw bp2, b1, 1
vmaddfp c01, a1, bp1, c01
vmaddfp c02, a2, bp1, c02
vmaddfp c03, a3, bp1, c03
vmaddfp c04, a4, bp1, c04
LOAD_A a1, OFFSET_4, AO
LOAD_A a2, OFFSET_5, AO
LOAD_A a3, OFFSET_6, AO
LOAD_A a4, OFFSET_7, AO
vmaddfp c01, a1, bp2, c01
vmaddfp c02, a2, bp2, c02
vmaddfp c03, a3, bp2, c03
vmaddfp c04, a4, bp2, c04
addi AO, AO, 32 * SIZE
addi BO, BO, 2 * SIZE
b LL(138)
.align 4
LL(137):
cmpwi cr0, r0, 1
bne LL(138)
vspltw bp1, b1, 0
vmaddfp c01, a1, bp1, c01
vmaddfp c02, a2, bp1, c02
vmaddfp c03, a3, bp1, c03
vmaddfp c04, a4, bp1, c04
addi AO, AO, 16 * SIZE
addi BO, BO, 1 * SIZE
.align 4
LL(138):
lvx alpha, OFFSET_0, SP
vxor VZERO, VZERO, VZERO
lvx C1, OFFSET_0, CO1
lvx C2, OFFSET_1, CO1
lvx C3, OFFSET_2, CO1
lvx C4, OFFSET_3, CO1
lvx C5, OFFSET_4, CO1
lvsr PERMRSHIFT1, 0, CO1
vperm c00, VZERO, c01, PERMRSHIFT1
vperm c01, c01, c02, PERMRSHIFT1
vperm c02, c02, c03, PERMRSHIFT1
vperm c03, c03, c04, PERMRSHIFT1
vperm c04, c04, VZERO, PERMRSHIFT1
vmaddfp c00, alpha, c00, C1
vmaddfp c01, alpha, c01, C2
vmaddfp c02, alpha, c02, C3
vmaddfp c03, alpha, c03, C4
vmaddfp c04, alpha, c04, C5
stvx c00, OFFSET_0, CO1
stvx c01, OFFSET_1, CO1
stvx c02, OFFSET_2, CO1
stvx c03, OFFSET_3, CO1
stvx c04, OFFSET_4, CO1
addi CO1, CO1, 16 * SIZE
addic. I, I, -1
bgt+ LL(130)
.align 4
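/* M % 16 >= 8: 8x1 tile. */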
LL(140):
andi. I, M, 8
ble LL(150)
vxor c01, c01, c01
vxor c02, c02, c02
mr BO, B
mr J, K
andi. r0, B, 15
ble+ LL(141)
LOAD_A a1, OFFSET_0, AO
LOAD_A a2, OFFSET_1, AO
LOAD_B b1, OFFSET_0, BO
vspltw bp1, b1, 2
vspltw bp2, b1, 3
addi AO, AO, 8 * SIZE
addi BO, BO, SIZE
vmaddfp c01, a1, bp1, c01
vmaddfp c02, a2, bp1, c02
subi J, J, 1
cmpwi cr0, J, 0
ble LL(148)
LOAD_A a1, OFFSET_0, AO
LOAD_A a2, OFFSET_1, AO
addi AO, AO, 8 * SIZE
addi BO, BO, SIZE
vmaddfp c01, a1, bp2, c01
vmaddfp c02, a2, bp2, c02
subi J, J, 1
cmpwi cr0, J, 0
ble LL(148)
.align 4
LL(141):
LOAD_A a1, OFFSET_0, AO
LOAD_A a2, OFFSET_1, AO
LOAD_A a3, OFFSET_2, AO
LOAD_A a4, OFFSET_3, AO
LOAD_A a5, OFFSET_4, AO
LOAD_A a6, OFFSET_5, AO
LOAD_A a7, OFFSET_6, AO
LOAD_A a8, OFFSET_7, AO
LOAD_B b1, OFFSET_0, BO
srawi. r0, J, 2
mtspr CTR, r0
ble LL(145)
.align 4
LL(143):
vspltw bp1, b1, 0
vmaddfp c01, a1, bp1, c01
vmaddfp c02, a2, bp1, c02
vspltw bp2, b1, 1
vmaddfp c01, a3, bp2, c01
vmaddfp c02, a4, bp2, c02
vspltw bp1, b1, 2
vmaddfp c01, a5, bp1, c01
vmaddfp c02, a6, bp1, c02
vspltw bp2, b1, 3
vmaddfp c01, a7, bp2, c01
vmaddfp c02, a8, bp2, c02
addi AO, AO, 32 * SIZE
addi BO, BO, 4 * SIZE
LOAD_A a1, OFFSET_0, AO
LOAD_A a2, OFFSET_1, AO
LOAD_A a3, OFFSET_2, AO
LOAD_A a4, OFFSET_3, AO
LOAD_A a5, OFFSET_4, AO
LOAD_A a6, OFFSET_5, AO
LOAD_A a7, OFFSET_6, AO
LOAD_A a8, OFFSET_7, AO
LOAD_B b1, OFFSET_0, BO
bdnz LL(143)
.align 4
LL(145):
andi. r0, J, 3
ble+ LL(148)
cmpwi cr0, r0, 3
bne LL(146)
vspltw bp1, b1, 0
vmaddfp c01, a1, bp1, c01
vmaddfp c02, a2, bp1, c02
vspltw bp2, b1, 1
vmaddfp c01, a3, bp2, c01
vmaddfp c02, a4, bp2, c02
LOAD_A a1, OFFSET_4, AO
LOAD_A a2, OFFSET_5, AO
vspltw bp1, b1, 2
vmaddfp c01, a1, bp1, c01
vmaddfp c02, a2, bp1, c02
addi AO, AO, 24 * SIZE
addi BO, BO, 3 * SIZE
b LL(148)
.align 4
LL(146):
cmpwi cr0, r0, 2
bne LL(147)
vspltw bp1, b1, 0
vspltw bp2, b1, 1
vmaddfp c01, a1, bp1, c01
vmaddfp c02, a2, bp1, c02
vmaddfp c01, a3, bp2, c01
vmaddfp c02, a4, bp2, c02
addi AO, AO, 16 * SIZE
addi BO, BO, 2 * SIZE
b LL(148)
.align 4
LL(147):
cmpwi cr0, r0, 1
bne LL(148)
vspltw bp1, b1, 0
vmaddfp c01, a1, bp1, c01
vmaddfp c02, a2, bp1, c02
addi AO, AO, 8 * SIZE
addi BO, BO, 1 * SIZE
.align 4
LL(148):
lvx alpha, OFFSET_0, SP
vxor VZERO, VZERO, VZERO
lvx C1, OFFSET_0, CO1
lvx C2, OFFSET_1, CO1
lvx C3, OFFSET_2, CO1
lvsr PERMRSHIFT1, 0, CO1
vperm c00, VZERO, c01, PERMRSHIFT1
vperm c01, c01, c02, PERMRSHIFT1
vperm c02, c02, VZERO, PERMRSHIFT1
vmaddfp c00, alpha, c00, C1
vmaddfp c01, alpha, c01, C2
vmaddfp c02, alpha, c02, C3
stvx c00, OFFSET_0, CO1
stvx c01, OFFSET_1, CO1
stvx c02, OFFSET_2, CO1
addi CO1, CO1, 8 * SIZE
.align 4
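/* M % 8 >= 4: 4x1 tile. */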
LL(150):
andi. I, M, 4
ble LL(160)
vxor c01, c01, c01
mr BO, B
mr J, K
andi. r0, B, 15
ble+ LL(151)
LOAD_A a1, OFFSET_0, AO
LOAD_B b1, OFFSET_0, BO
vspltw bp1, b1, 2
vspltw bp2, b1, 3
addi AO, AO, 4 * SIZE
addi BO, BO, SIZE
vmaddfp c01, a1, bp1, c01
subi J, J, 1
cmpwi cr0, J, 0
ble LL(158)
LOAD_A a1, OFFSET_0, AO
addi AO, AO, 4 * SIZE
addi BO, BO, SIZE
vmaddfp c01, a1, bp2, c01
subi J, J, 1
cmpwi cr0, J, 0
ble LL(158)
.align 4
LL(151):
LOAD_A a1, OFFSET_0, AO
LOAD_A a2, OFFSET_1, AO
LOAD_A a3, OFFSET_2, AO
LOAD_A a4, OFFSET_3, AO
LOAD_B b1, OFFSET_0, BO
srawi. r0, J, 2
mtspr CTR, r0
ble LL(155)
.align 4
LL(153):
vspltw bp1, b1, 0
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
vmaddfp c01, a2, bp2, c01
vspltw bp1, b1, 2
vmaddfp c01, a3, bp1, c01
vspltw bp2, b1, 3
vmaddfp c01, a4, bp2, c01
addi AO, AO, 16 * SIZE
addi BO, BO, 4 * SIZE
LOAD_A a1, OFFSET_0, AO
LOAD_A a2, OFFSET_1, AO
LOAD_A a3, OFFSET_2, AO
LOAD_A a4, OFFSET_3, AO
LOAD_B b1, OFFSET_0, BO
bdnz LL(153)
.align 4
LL(155):
andi. r0, J, 3
ble+ LL(158)
cmpwi cr0, r0, 3
bne LL(156)
vspltw bp1, b1, 0
vmaddfp c01, a1, bp1, c01
vspltw bp2, b1, 1
vmaddfp c01, a2, bp2, c01
vspltw bp1, b1, 2
vmaddfp c01, a3, bp1, c01
addi AO, AO, 12 * SIZE
addi BO, BO, 3 * SIZE
b LL(158)
.align 4
LL(156):
cmpwi cr0, r0, 2
bne LL(157)
vspltw bp1, b1, 0
vspltw bp2, b1, 1
vmaddfp c01, a1, bp1, c01
vmaddfp c01, a2, bp2, c01
addi AO, AO, 8 * SIZE
addi BO, BO, 2 * SIZE
b LL(158)
.align 4
LL(157):
cmpwi cr0, r0, 1
bne LL(158)
vspltw bp1, b1, 0
vmaddfp c01, a1, bp1, c01
addi AO, AO, 4 * SIZE
addi BO, BO, 1 * SIZE
.align 4
LL(158):
lvx alpha, OFFSET_0, SP
vxor VZERO, VZERO, VZERO
lvx C1, OFFSET_0, CO1
lvx C2, OFFSET_1, CO1
lvsr PERMRSHIFT1, 0, CO1
vperm c00, VZERO, c01, PERMRSHIFT1
vperm c01, c01, VZERO, PERMRSHIFT1
vmaddfp c00, alpha, c00, C1
vmaddfp c01, alpha, c01, C2
stvx c00, OFFSET_0, CO1
stvx c01, OFFSET_1, CO1
addi CO1, CO1, 4 * SIZE
.align 4
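/* M % 4 >= 2: 2x1 with the scalar FPU. */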
LL(160):
andi. I, M, 2
ble LL(170)
mr BO, B
LFD f8, 0 * SIZE(AO)
LFD f9, 1 * SIZE(AO)
LFD f10, 2 * SIZE(AO)
LFD f11, 3 * SIZE(AO)
LFD f12, 0 * SIZE(B)
LFD f13, 1 * SIZE(B)
lfs f0, FZERO(SP)
fmr f1, f0
fmr f2, f0
fmr f3, f0
srawi. r0, K, 1
mtspr CTR, r0
ble LL(165)
.align 4
LL(162):
FMADD f0, f8, f12, f0
FMADD f1, f9, f12, f1
FMADD f2, f10, f13, f2
FMADD f3, f11, f13, f3
LFD f8, 4 * SIZE(AO)
LFD f9, 5 * SIZE(AO)
LFD f10, 6 * SIZE(AO)
LFD f11, 7 * SIZE(AO)
LFD f12, 2 * SIZE(BO)
LFD f13, 3 * SIZE(BO)
addi AO, AO, 4 * SIZE
addi BO, BO, 2 * SIZE
bdnz LL(162)
.align 4
LL(165):
andi. r0, K, 1
lfs f13, ALPHA(SP)
ble LL(168)
.align 4
LL(166):
FMADD f0, f8, f12, f0
FMADD f1, f9, f12, f1
addi AO, AO, 2 * SIZE
addi BO, BO, 1 * SIZE
.align 4
LL(168):
LFD f8, 0 * SIZE(CO1)
LFD f9, 1 * SIZE(CO1)
FADD f0, f0, f2
FADD f1, f1, f3
FMADD f0, f0, f13, f8
FMADD f1, f1, f13, f9
STFD f0, 0 * SIZE(CO1)
STFD f1, 1 * SIZE(CO1)
addi CO1, CO1, 2 * SIZE
.align 4
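/* M odd: final 1x1 element with the scalar FPU. */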
LL(170):
andi. I, M, 1
ble LL(999)
mr BO, B
LFD f8, 0 * SIZE(AO)
LFD f9, 1 * SIZE(AO)
LFD f10, 0 * SIZE(B)
LFD f11, 1 * SIZE(B)
lfs f0, FZERO(SP)
fmr f1, f0
srawi. r0, K, 1
mtspr CTR, r0
ble LL(175)
.align 4
LL(172):
FMADD f0, f8, f10, f0
FMADD f1, f9, f11, f1
LFD f8, 2 * SIZE(AO)
LFD f9, 3 * SIZE(AO)
LFD f10, 2 * SIZE(BO)
LFD f11, 3 * SIZE(BO)
addi AO, AO, 2 * SIZE
addi BO, BO, 2 * SIZE
bdnz LL(172)
.align 4
LL(175):
andi. r0, K, 1
lfs f13, ALPHA(SP)
ble LL(178)
.align 4
LL(176):
FMADD f0, f8, f10, f0
addi AO, AO, 1 * SIZE
addi BO, BO, 1 * SIZE
.align 4
LL(178):
LFD f8, 0 * SIZE(CO1)
FADD f0, f0, f1
FMADD f0, f0, f13, f8
STFD f0, 0 * SIZE(CO1)
.align 4
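/* Epilogue: restore v20-v31, VRsave, the non-volatile integer
   registers and the caller's stack pointer. */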
LL(999):
mr SP, STACK
li r0, 0 * 16
lvx v20, SP, r0
li r0, 1 * 16
lvx v21, SP, r0
li r0, 2 * 16
lvx v22, SP, r0
li r0, 3 * 16
lvx v23, SP, r0
li r0, 4 * 16
lvx v24, SP, r0
li r0, 5 * 16
lvx v25, SP, r0
li r0, 6 * 16
lvx v26, SP, r0
li r0, 7 * 16
lvx v27, SP, r0
li r0, 8 * 16
lvx v28, SP, r0
li r0, 9 * 16
lvx v29, SP, r0
li r0, 10 * 16
lvx v30, SP, r0
li r0, 11 * 16
lvx v31, SP, r0
mtspr VRsave, VREG
#ifdef __64BIT__
ld r31, 192(SP)
ld r30, 200(SP)
ld r29, 208(SP)
ld r28, 216(SP)
ld r27, 224(SP)
ld r26, 232(SP)
ld r25, 240(SP)
ld r24, 248(SP)
ld r23, 256(SP)
ld r22, 264(SP)
ld r21, 272(SP)
ld r20, 280(SP)
ld r19, 288(SP)
ld r18, 296(SP)
ld r17, 304(SP)
ld r16, 312(SP)
ld r15, 320(SP)
ld r14, 328(SP)
#else
lwz r31, 192(SP)
lwz r30, 196(SP)
lwz r29, 200(SP)
lwz r28, 204(SP)
lwz r27, 208(SP)
lwz r26, 212(SP)
lwz r25, 216(SP)
lwz r24, 220(SP)
lwz r23, 224(SP)
lwz r22, 228(SP)
lwz r21, 232(SP)
lwz r20, 236(SP)
lwz r19, 240(SP)
lwz r18, 244(SP)
lwz r17, 248(SP)
lwz r16, 252(SP)
lwz r15, 256(SP)
lwz r14, 260(SP)
#endif
addi SP, SP, STACKSIZE
blr
EPILOGUE
#endif