/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
/*
 * Integer-register aliases used by this kernel.
 *
 * M, N, K are read straight from %i0-%i2; the prologue never loads them.
 *   M drives the row-block counter I, N drives the column-block counter J,
 *   and K (together with KK) determines the inner-loop trip count L.
 */
#define M %i0
#define N %i1
#define K %i2
/*
 * A and B swap registers depending on the build:
 *   - 32-bit DOUBLE: A is %i5 and B is %i4; in this configuration the
 *     prologue loads B from the stack ([%sp + STACK_START + 28]).
 *   - every other build: A is %i4 and B is %i5, and neither is loaded by
 *     the prologue.
 * NOTE(review): presumably the swap exists because of how the preceding
 * floating-point argument is passed in the 32-bit DOUBLE case -- confirm
 * against the caller's prototype.
 */
#if defined(DOUBLE) && !defined(__64BIT__)
#define A %i5
#define B %i4
#else
#define A %i4
#define B %i5
#endif
/*
 * C and LDC are loaded from the stack by the prologue in every
 * configuration; LDC is then shifted left by BASE_SHIFT before use.
 */
#define C %o4
#define LDC %o5
/* AO / BO: working pointers that the LDF instructions read A and B data through. */
#define AO %l0
#define BO %l1
/* I: row-block counter (from M).  J: column-block counter (from N). */
#define I %l2
#define J %l3
/* L: trip count of the multiply loops (a k-extent >> 2, then its & 3 remainder). */
#define L %l4
/*
 * C1..C4: output column pointers, set to C, C + LDC, C + 2*LDC, C + 3*LDC
 * at the top of each four-column pass (the two-column pass sets only C1, C2).
 */
#define C1 %o0
#define C2 %o1
#define C3 %o2
#define C4 %o3
/*
 * OFFSET is loaded from the stack by the prologue.  KK is derived from it
 * per variant (LN: M + OFFSET, LT: OFFSET, RN: -OFFSET, RT: N - OFFSET) and
 * is stepped as blocks are completed.
 */
#define OFFSET %l5
#define KK %l6
/* TEMP1 / TEMP2: scratch registers for address and count arithmetic. */
#define TEMP1 %l7
#define TEMP2 %i3
/* AORIG: copy of A kept by the LN / RT variants, from which AO is recomputed. */
#define AORIG %g1
/*
 * Floating-point register aliases.
 *
 * Two parallel maps are provided: the DOUBLE build uses the even-numbered
 * registers %f0..%f62, the single-precision build uses %f0..%f31.
 *
 *   c01..c16  result accumulators; the 4x4 path stores c01-c04 through C1,
 *             c05-c08 through C2, c09-c12 through C3 and c13-c16 through C4.
 *   t1..t4    receive FMUL products that a later FADD folds into a cNN.
 *   a1..a5    operands loaded through AO in the multiply loops.
 *   b1..b5    operands loaded through BO in the multiply loops.
 *             (a* and b* are also reused as plain load targets in the
 *             solve / write-back stage.)
 *   FZERO     source of the zero copied into accumulators via FMOV.
 *   ALPHA     not referenced in the portion of the file reviewed here.
 *
 * Note that a5 sits out of sequence in both maps (%f58 / %f31).
 *
 * NOTE(review): the prologue issues FCLR(29) in both precisions while FZERO
 * is %f29 only in the single-precision map (%f60 for DOUBLE).  Presumably
 * FCLR, defined outside this file, maps index 29 onto FZERO for DOUBLE --
 * TODO confirm.
 */
#ifdef DOUBLE
/* Double precision: one alias per even-numbered register. */
#define c01 %f0
#define c02 %f2
#define c03 %f4
#define c04 %f6
#define c05 %f8
#define c06 %f10
#define c07 %f12
#define c08 %f14
#define c09 %f16
#define c10 %f18
#define c11 %f20
#define c12 %f22
#define c13 %f24
#define c14 %f26
#define c15 %f28
#define c16 %f30
#define t1 %f32
#define t2 %f34
#define t3 %f36
#define t4 %f38
#define a1 %f40
#define a2 %f42
#define a3 %f44
#define a4 %f46
#define a5 %f58
#define b1 %f48
#define b2 %f50
#define b3 %f52
#define b4 %f54
#define b5 %f56
#define FZERO %f60
#define ALPHA %f62
#else
/* Single precision: consecutive registers %f0..%f31. */
#define c01 %f0
#define c02 %f1
#define c03 %f2
#define c04 %f3
#define c05 %f4
#define c06 %f5
#define c07 %f6
#define c08 %f7
#define c09 %f8
#define c10 %f9
#define c11 %f10
#define c12 %f11
#define c13 %f12
#define c14 %f13
#define c15 %f14
#define c16 %f15
#define t1 %f16
#define t2 %f17
#define t3 %f18
#define t4 %f19
#define a1 %f20
#define a2 %f21
#define a3 %f22
#define a4 %f23
#define a5 %f31
#define b1 %f24
#define b2 %f25
#define b3 %f26
#define b4 %f27
#define b5 %f28
#define FZERO %f29
#define ALPHA %f30
#endif
/*
 * Prefetch tuning.
 *
 * xPREFETCHSIZE is the look-ahead distance in elements: the multiply loop
 * prefetches [AO + APREFETCHSIZE * SIZE] and [BO + BPREFETCHSIZE * SIZE].
 * xPREFETCH_CATEGORY is passed as the second operand of the prefetch
 * instruction.
 *
 * NOTE(review): the two-row loop (.LL52) passes a literal 0 instead of
 * APREFETCH_CATEGORY, so changing the category here does not affect it.
 */
#define APREFETCHSIZE 40
#define BPREFETCHSIZE 40
#define APREFETCH_CATEGORY 0
#define BPREFETCH_CATEGORY 0
PROLOGUE
SAVESP
nop
#ifndef __64BIT__
#ifdef DOUBLE
ld [%sp + STACK_START + 28], B
ld [%sp + STACK_START + 32], C
ld [%sp + STACK_START + 36], LDC
ld [%sp + STACK_START + 40], OFFSET
#else
ld [%sp + STACK_START + 28], C
ld [%sp + STACK_START + 32], LDC
ld [%sp + STACK_START + 36], OFFSET
#endif
#else
ldx [%sp+ STACK_START + 56], C
ldx [%sp+ STACK_START + 64], LDC
ldx [%sp+ STACK_START + 72], OFFSET
#endif
FCLR(29)
sll LDC, BASE_SHIFT, LDC
#ifdef LN
smul M, K, TEMP1
sll TEMP1, BASE_SHIFT, TEMP1
add A, TEMP1, A
sll M, BASE_SHIFT, TEMP1
add C, TEMP1, C
#endif
#ifdef RN
neg OFFSET, KK
#endif
#ifdef RT
smul N, K, TEMP1
sll TEMP1, BASE_SHIFT, TEMP1
add B, TEMP1, B
smul N, LDC, TEMP1
add C, TEMP1, C
sub N, OFFSET, KK
#endif
sra N, 2, J
cmp J, 0
ble,pn %icc, .LL100
nop
.LL11:
#ifdef RT
sll K, 2 + BASE_SHIFT, TEMP1
sub B, TEMP1, B
sll LDC, 2, TEMP1
sub C, TEMP1, C
#endif
mov C, C1
add C, LDC, C2
add C2, LDC, C3
add C3, LDC, C4
#ifdef LN
add M, OFFSET, KK
#endif
#ifdef LT
mov OFFSET, KK
#endif
#if defined(LN) || defined(RT)
mov A, AORIG
#else
mov A, AO
#endif
#ifndef RT
add C4, LDC, C
#endif
and M, 1, I
cmp I, 0
ble,pn %icc, .LL50
nop
#if defined(LT) || defined(RN)
sra KK, 2, L
mov B, BO
cmp L, 0
#else
#ifdef LN
sll K, 0 + BASE_SHIFT, TEMP1
sub AORIG, TEMP1, AORIG
#endif
sll KK, 0 + BASE_SHIFT, TEMP1
sll KK, 2 + BASE_SHIFT, TEMP2
add AORIG, TEMP1, AO
add B, TEMP2, BO
sub K, KK, TEMP1
sra TEMP1, 2, L
cmp L, 0
#endif
LDF [AO + 0 * SIZE], a1
FMOV FZERO, c01
LDF [BO + 0 * SIZE], b1
FMOV FZERO, t1
LDF [AO + 1 * SIZE], a2
FMOV FZERO, c02
LDF [BO + 1 * SIZE], b2
FMOV FZERO, t2
LDF [AO + 2 * SIZE], a3
FMOV FZERO, c03
LDF [BO + 2 * SIZE], b3
FMOV FZERO, t3
LDF [AO + 3 * SIZE], a4
FMOV FZERO, c04
LDF [BO + 3 * SIZE], b4
FMOV FZERO, t4
ble,pn %icc, .LL75
nop
.LL72:
FADD c01, t1, c01
add L, -1, L
FMUL a1, b1, t1
LDF [BO + 4 * SIZE], b1
FADD c02, t2, c02
cmp L, 0
FMUL a1, b2, t2
LDF [BO + 5 * SIZE], b2
FADD c03, t3, c03
FMUL a1, b3, t3
LDF [BO + 6 * SIZE], b3
FADD c04, t4, c04
FMUL a1, b4, t4
LDF [BO + 7 * SIZE], b4
LDF [AO + 4 * SIZE], a1
FADD c01, t1, c01
add AO, 4 * SIZE, AO
FMUL a2, b1, t1
LDF [BO + 8 * SIZE], b1
FADD c02, t2, c02
FMUL a2, b2, t2
LDF [BO + 9 * SIZE], b2
FADD c03, t3, c03
FMUL a2, b3, t3
LDF [BO + 10 * SIZE], b3
FADD c04, t4, c04
FMUL a2, b4, t4
LDF [BO + 11 * SIZE], b4
LDF [AO + 1 * SIZE], a2
FADD c01, t1, c01
FMUL a3, b1, t1
LDF [BO + 12 * SIZE], b1
FADD c02, t2, c02
FMUL a3, b2, t2
LDF [BO + 13 * SIZE], b2
FADD c03, t3, c03
FMUL a3, b3, t3
LDF [BO + 14 * SIZE], b3
FADD c04, t4, c04
FMUL a3, b4, t4
LDF [BO + 15 * SIZE], b4
LDF [AO + 2 * SIZE], a3
FADD c01, t1, c01
FMUL a4, b1, t1
LDF [BO + 16 * SIZE], b1
FADD c02, t2, c02
FMUL a4, b2, t2
LDF [BO + 17 * SIZE], b2
FADD c03, t3, c03
FMUL a4, b3, t3
LDF [BO + 18 * SIZE], b3
FADD c04, t4, c04
FMUL a4, b4, t4
LDF [BO + 19 * SIZE], b4
add BO, 16 * SIZE, BO
bg,pt %icc, .LL72
LDF [AO + 3 * SIZE], a4
.LL75:
#if defined(LT) || defined(RN)
and KK, 3, L
#else
and TEMP1, 3, L
#endif
cmp L, 0
ble,a,pn %icc, .LL79
nop
.LL76:
FADD c01, t1, c01
add AO, 1 * SIZE, AO
FMUL a1, b1, t1
LDF [BO + 4 * SIZE], b1
FADD c02, t2, c02
add L, -1, L
FMUL a1, b2, t2
LDF [BO + 5 * SIZE], b2
FADD c03, t3, c03
cmp L, 0
FMUL a1, b3, t3
LDF [BO + 6 * SIZE], b3
FADD c04, t4, c04
add BO, 4 * SIZE, BO
FMUL a1, b4, t4
LDF [AO + 0 * SIZE], a1
bg,pt %icc, .LL76
LDF [BO + 3 * SIZE], b4
.LL79:
FADD c01, t1, c01
FADD c02, t2, c02
FADD c03, t3, c03
FADD c04, t4, c04
#if defined(LN) || defined(RT)
#ifdef LN
sub KK, 1, TEMP1
#else
sub KK, 4, TEMP1
#endif
sll TEMP1, 0 + BASE_SHIFT, TEMP2
sll TEMP1, 2 + BASE_SHIFT, TEMP1
add AORIG, TEMP2, AO
add B, TEMP1, BO
#endif
#if defined(LN) || defined(LT)
LDF [BO + 0 * SIZE], a1
LDF [BO + 1 * SIZE], a2
LDF [BO + 2 * SIZE], a3
LDF [BO + 3 * SIZE], a4
FSUB a1, c01, c01
FSUB a2, c02, c02
FSUB a3, c03, c03
FSUB a4, c04, c04
#else
LDF [AO + 0 * SIZE], a1
LDF [AO + 1 * SIZE], a2
LDF [AO + 2 * SIZE], a3
LDF [AO + 3 * SIZE], a4
FSUB a1, c01, c01
FSUB a2, c02, c02
FSUB a3, c03, c03
FSUB a4, c04, c04
#endif
#ifdef LN
LDF [AO + 0 * SIZE], a1
FMUL a1, c01, c01
FMUL a1, c02, c02
FMUL a1, c03, c03
FMUL a1, c04, c04
#endif
#ifdef LT
LDF [AO + 0 * SIZE], a1
FMUL a1, c01, c01
FMUL a1, c02, c02
FMUL a1, c03, c03
FMUL a1, c04, c04
#endif
#ifdef RN
LDF [BO + 0 * SIZE], a1
LDF [BO + 1 * SIZE], a2
LDF [BO + 2 * SIZE], a3
LDF [BO + 3 * SIZE], a4
FMUL a1, c01, c01
FMUL a2, c01, t1
FSUB c02, t1, c02
FMUL a3, c01, t1
FSUB c03, t1, c03
FMUL a4, c01, t1
FSUB c04, t1, c04
LDF [BO + 5 * SIZE], a1
LDF [BO + 6 * SIZE], a2
LDF [BO + 7 * SIZE], a3
FMUL a1, c02, c02
FMUL a2, c02, t1
FSUB c03, t1, c03
FMUL a3, c02, t1
FSUB c04, t1, c04
LDF [BO + 10 * SIZE], a1
LDF [BO + 11 * SIZE], a2
FMUL a1, c03, c03
FMUL a2, c03, t1
FSUB c04, t1, c04
LDF [BO + 15 * SIZE], a1
FMUL a1, c04, c04
#endif
#ifdef RT
LDF [BO + 15 * SIZE], a1
LDF [BO + 14 * SIZE], a2
LDF [BO + 13 * SIZE], a3
LDF [BO + 12 * SIZE], a4
FMUL a1, c04, c04
FMUL a2, c04, t1
FSUB c03, t1, c03
FMUL a3, c04, t1
FSUB c02, t1, c02
FMUL a4, c04, t1
FSUB c01, t1, c01
LDF [BO + 10 * SIZE], a1
LDF [BO + 9 * SIZE], a2
LDF [BO + 8 * SIZE], a3
FMUL a1, c03, c03
FMUL a2, c03, t1
FSUB c02, t1, c02
FMUL a3, c03, t1
FSUB c01, t1, c01
LDF [BO + 5 * SIZE], a1
LDF [BO + 4 * SIZE], a2
FMUL a1, c02, c02
FMUL a2, c02, t1
FSUB c01, t1, c01
LDF [BO + 0 * SIZE], a1
FMUL a1, c01, c01
#endif
#ifdef LN
add C1, -1 * SIZE, C1
add C2, -1 * SIZE, C2
add C3, -1 * SIZE, C3
add C4, -1 * SIZE, C4
#endif
#if defined(LN) || defined(LT)
STF c01, [BO + 0 * SIZE]
STF c02, [BO + 1 * SIZE]
STF c03, [BO + 2 * SIZE]
STF c04, [BO + 3 * SIZE]
#else
STF c01, [AO + 0 * SIZE]
STF c02, [AO + 1 * SIZE]
STF c03, [AO + 2 * SIZE]
STF c04, [AO + 3 * SIZE]
#endif
STF c01, [C1 + 0 * SIZE]
STF c02, [C2 + 0 * SIZE]
STF c03, [C3 + 0 * SIZE]
STF c04, [C4 + 0 * SIZE]
FMOV FZERO, t1
FMOV FZERO, t2
FMOV FZERO, t3
FMOV FZERO, t4
#ifndef LN
add C1, 1 * SIZE, C1
add C2, 1 * SIZE, C2
add C3, 1 * SIZE, C3
add C4, 1 * SIZE, C4
#endif
#ifdef RT
sll K, 0 + BASE_SHIFT, TEMP1
add AORIG, TEMP1, AORIG
#endif
#if defined(LT) || defined(RN)
sub K, KK, TEMP1
sll TEMP1, 0 + BASE_SHIFT, TEMP2
sll TEMP1, 2 + BASE_SHIFT, TEMP1
add AO, TEMP2, AO
add BO, TEMP1, BO
#endif
#ifdef LT
add KK, 1, KK
#endif
#ifdef LN
sub KK, 1, KK
#endif
.LL50:
and M, 2, I
cmp I, 0
ble,pn %icc, .LL70
nop
#if defined(LT) || defined(RN)
sra KK, 2, L
mov B, BO
cmp L, 0
#else
#ifdef LN
sll K, 1 + BASE_SHIFT, TEMP1
sub AORIG, TEMP1, AORIG
#endif
sll KK, 1 + BASE_SHIFT, TEMP1
sll KK, 2 + BASE_SHIFT, TEMP2
add AORIG, TEMP1, AO
add B, TEMP2, BO
sub K, KK, TEMP1
sra TEMP1, 2, L
cmp L, 0
#endif
FMOV FZERO, c02
FMOV FZERO, t1
FMOV FZERO, c04
LDF [AO + 0 * SIZE], a1
FMOV FZERO, t2
LDF [BO + 0 * SIZE], b1
FMOV FZERO, c06
LDF [AO + 1 * SIZE], a2
FMOV FZERO, t3
LDF [BO + 1 * SIZE], b2
FMOV FZERO, c08
LDF [AO + 2 * SIZE], a3
FMOV FZERO, t4
LDF [BO + 2 * SIZE], b3
FMOV FZERO, c01
LDF [AO + 3 * SIZE], a4
FMOV FZERO, c03
LDF [BO + 3 * SIZE], b4
FMOV FZERO, c05
ble,pn %icc, .LL55
FMOV FZERO, c07
.LL52:
FADD c02, t1, c02
add AO, 8 * SIZE, AO
prefetch [AO + APREFETCHSIZE * SIZE], 0
FMUL a1, b1, t1
add BO, 16 * SIZE, BO
FADD c04, t2, c04
add L, -1, L
FMUL a1, b2, t2
FADD c06, t3, c06
cmp L, 0
FMUL a1, b3, t3
FADD c08, t4, c08
FMUL a1, b4, t4
LDF [AO - 4 * SIZE], a1
FADD c01, t1, c01
FMUL a2, b1, t1
LDF [BO - 12 * SIZE], b1
FADD c03, t2, c03
FMUL a2, b2, t2
LDF [BO - 11 * SIZE], b2
FADD c05, t3, c05
FMUL a2, b3, t3
LDF [BO - 10 * SIZE], b3
FADD c07, t4, c07
FMUL a2, b4, t4
LDF [BO - 9 * SIZE], b4
FADD c02, t1, c02
FMUL a3, b1, t1
LDF [AO - 3 * SIZE], a2
FADD c04, t2, c04
FMUL a3, b2, t2
FADD c06, t3, c06
FMUL a3, b3, t3
FADD c08, t4, c08
FMUL a3, b4, t4
LDF [AO - 2 * SIZE], a3
FADD c01, t1, c01
FMUL a4, b1, t1
LDF [BO - 8 * SIZE], b1
FADD c03, t2, c03
FMUL a4, b2, t2
LDF [BO - 7 * SIZE], b2
FADD c05, t3, c05
FMUL a4, b3, t3
LDF [BO - 6 * SIZE], b3
FADD c07, t4, c07
FMUL a4, b4, t4
LDF [BO - 5 * SIZE], b4
FADD c02, t1, c02
FMUL a1, b1, t1
LDF [AO - 1 * SIZE], a4
FADD c04, t2, c04
FMUL a1, b2, t2
FADD c06, t3, c06
FMUL a1, b3, t3
FADD c08, t4, c08
FMUL a1, b4, t4
LDF [AO + 0 * SIZE], a1
FADD c01, t1, c01
FMUL a2, b1, t1
LDF [BO - 4 * SIZE], b1
FADD c03, t2, c03
FMUL a2, b2, t2
LDF [BO - 3 * SIZE], b2
FADD c05, t3, c05
FMUL a2, b3, t3
LDF [BO - 2 * SIZE], b3
FADD c07, t4, c07
FMUL a2, b4, t4
LDF [BO - 1 * SIZE], b4
FADD c02, t1, c02
FMUL a3, b1, t1
LDF [AO + 1 * SIZE], a2
FADD c04, t2, c04
FMUL a3, b2, t2
FADD c06, t3, c06
FMUL a3, b3, t3
FADD c08, t4, c08
FMUL a3, b4, t4
LDF [AO + 2 * SIZE], a3
FADD c01, t1, c01
FMUL a4, b1, t1
LDF [BO + 0 * SIZE], b1
FADD c03, t2, c03
FMUL a4, b2, t2
LDF [BO + 1 * SIZE], b2
FADD c05, t3, c05
FMUL a4, b3, t3
LDF [BO + 2 * SIZE], b3
FADD c07, t4, c07
FMUL a4, b4, t4
LDF [BO + 3 * SIZE], b4
bg,pt %icc, .LL52
LDF [AO + 3 * SIZE], a4
.LL55:
#if defined(LT) || defined(RN)
and KK, 3, L
#else
and TEMP1, 3, L
#endif
cmp L, 0
ble,a,pn %icc, .LL59
nop
.LL56:
FADD c02, t1, c02
add AO, 2 * SIZE, AO
FMUL a1, b1, t1
add L, -1, L
add BO, 4 * SIZE, BO
FADD c04, t2, c04
cmp L, 0
FMUL a1, b2, t2
FADD c06, t3, c06
FMUL a1, b3, t3
FADD c08, t4, c08
FMUL a1, b4, t4
LDF [AO + 0 * SIZE], a1
FADD c01, t1, c01
FMUL a2, b1, t1
LDF [BO + 0 * SIZE], b1
FADD c03, t2, c03
FMUL a2, b2, t2
LDF [BO + 1 * SIZE], b2
FADD c05, t3, c05
FMUL a2, b3, t3
LDF [BO + 2 * SIZE], b3
FADD c07, t4, c07
FMUL a2, b4, t4
LDF [BO + 3 * SIZE], b4
bg,pt %icc, .LL56
LDF [AO + 1 * SIZE], a2
.LL59:
#if defined(LN) || defined(RT)
#ifdef LN
sub KK, 2, TEMP1
#else
sub KK, 4, TEMP1
#endif
sll TEMP1, 1 + BASE_SHIFT, TEMP2
sll TEMP1, 2 + BASE_SHIFT, TEMP1
add AORIG, TEMP2, AO
add B, TEMP1, BO
#endif
FADD c02, t1, c02
FADD c04, t2, c04
FADD c06, t3, c06
FADD c08, t4, c08
#if defined(LN) || defined(LT)
LDF [BO + 0 * SIZE], a1
LDF [BO + 1 * SIZE], a2
LDF [BO + 2 * SIZE], a3
LDF [BO + 3 * SIZE], a4
LDF [BO + 4 * SIZE], b1
LDF [BO + 5 * SIZE], b2
LDF [BO + 6 * SIZE], b3
LDF [BO + 7 * SIZE], b4
FSUB a1, c01, c01
FSUB a2, c03, c03
FSUB a3, c05, c05
FSUB a4, c07, c07
FSUB b1, c02, c02
FSUB b2, c04, c04
FSUB b3, c06, c06
FSUB b4, c08, c08
#else
LDF [AO + 0 * SIZE], a1
LDF [AO + 1 * SIZE], a2
LDF [AO + 2 * SIZE], a3
LDF [AO + 3 * SIZE], a4
LDF [AO + 4 * SIZE], b1
LDF [AO + 5 * SIZE], b2
LDF [AO + 6 * SIZE], b3
LDF [AO + 7 * SIZE], b4
FSUB a1, c01, c01
FSUB a2, c02, c02
FSUB a3, c03, c03
FSUB a4, c04, c04
FSUB b1, c05, c05
FSUB b2, c06, c06
FSUB b3, c07, c07
FSUB b4, c08, c08
#endif
#ifdef LN
LDF [AO + 3 * SIZE], a1
LDF [AO + 2 * SIZE], a2
LDF [AO + 0 * SIZE], a3
FMUL a1, c02, c02
FMUL a1, c04, c04
FMUL a1, c06, c06
FMUL a1, c08, c08
FMUL a2, c02, t1
FMUL a2, c04, t2
FMUL a2, c06, t3
FMUL a2, c08, t4
FSUB c01, t1, c01
FSUB c03, t2, c03
FSUB c05, t3, c05
FSUB c07, t4, c07
FMUL a3, c01, c01
FMUL a3, c03, c03
FMUL a3, c05, c05
FMUL a3, c07, c07
#endif
#ifdef LT
LDF [AO + 0 * SIZE], a1
LDF [AO + 1 * SIZE], a2
LDF [AO + 3 * SIZE], a3
FMUL a1, c01, c01
FMUL a1, c03, c03
FMUL a1, c05, c05
FMUL a1, c07, c07
FMUL a2, c01, t1
FMUL a2, c03, t2
FMUL a2, c05, t3
FMUL a2, c07, t4
FSUB c02, t1, c02
FSUB c04, t2, c04
FSUB c06, t3, c06
FSUB c08, t4, c08
FMUL a3, c02, c02
FMUL a3, c04, c04
FMUL a3, c06, c06
FMUL a3, c08, c08
#endif
#ifdef RN
LDF [BO + 0 * SIZE], a1
LDF [BO + 1 * SIZE], a2
LDF [BO + 2 * SIZE], a3
LDF [BO + 3 * SIZE], a4
FMUL a1, c01, c01
FMUL a1, c02, c02
FMUL a2, c01, t1
FMUL a2, c02, t2
FSUB c03, t1, c03
FSUB c04, t2, c04
FMUL a3, c01, t1
FMUL a3, c02, t2
FSUB c05, t1, c05
FSUB c06, t2, c06
FMUL a4, c01, t1
FMUL a4, c02, t2
FSUB c07, t1, c07
FSUB c08, t2, c08
LDF [BO + 5 * SIZE], a1
LDF [BO + 6 * SIZE], a2
LDF [BO + 7 * SIZE], a3
FMUL a1, c03, c03
FMUL a1, c04, c04
FMUL a2, c03, t1
FMUL a2, c04, t2
FSUB c05, t1, c05
FSUB c06, t2, c06
FMUL a3, c03, t1
FMUL a3, c04, t2
FSUB c07, t1, c07
FSUB c08, t2, c08
LDF [BO + 10 * SIZE], a1
LDF [BO + 11 * SIZE], a2
FMUL a1, c05, c05
FMUL a1, c06, c06
FMUL a2, c05, t1
FMUL a2, c06, t2
FSUB c07, t1, c07
FSUB c08, t2, c08
LDF [BO + 15 * SIZE], a1
FMUL a1, c07, c07
FMUL a1, c08, c08
#endif
#ifdef RT
LDF [BO + 15 * SIZE], a1
LDF [BO + 14 * SIZE], a2
LDF [BO + 13 * SIZE], a3
LDF [BO + 12 * SIZE], a4
FMUL a1, c07, c07
FMUL a1, c08, c08
FMUL a2, c07, t1
FMUL a2, c08, t2
FSUB c05, t1, c05
FSUB c06, t2, c06
FMUL a3, c07, t1
FMUL a3, c08, t2
FSUB c03, t1, c03
FSUB c04, t2, c04
FMUL a4, c07, t1
FMUL a4, c08, t2
FSUB c01, t1, c01
FSUB c02, t2, c02
LDF [BO + 10 * SIZE], a1
LDF [BO + 9 * SIZE], a2
LDF [BO + 8 * SIZE], a3
FMUL a1, c05, c05
FMUL a1, c06, c06
FMUL a2, c05, t1
FMUL a2, c06, t2
FSUB c03, t1, c03
FSUB c04, t2, c04
FMUL a3, c05, t1
FMUL a3, c06, t2
FSUB c01, t1, c01
FSUB c02, t2, c02
LDF [BO + 5 * SIZE], a1
LDF [BO + 4 * SIZE], a2
FMUL a1, c03, c03
FMUL a1, c04, c04
FMUL a2, c03, t1
FMUL a2, c04, t2
FSUB c01, t1, c01
FSUB c02, t2, c02
LDF [BO + 0 * SIZE], a1
FMUL a1, c01, c01
FMUL a1, c02, c02
#endif
#ifdef LN
add C1, -2 * SIZE, C1
add C2, -2 * SIZE, C2
add C3, -2 * SIZE, C3
add C4, -2 * SIZE, C4
#endif
#if defined(LN) || defined(LT)
STF c01, [BO + 0 * SIZE]
STF c03, [BO + 1 * SIZE]
STF c05, [BO + 2 * SIZE]
STF c07, [BO + 3 * SIZE]
STF c02, [BO + 4 * SIZE]
STF c04, [BO + 5 * SIZE]
STF c06, [BO + 6 * SIZE]
STF c08, [BO + 7 * SIZE]
#else
STF c01, [AO + 0 * SIZE]
STF c02, [AO + 1 * SIZE]
STF c03, [AO + 2 * SIZE]
STF c04, [AO + 3 * SIZE]
STF c05, [AO + 4 * SIZE]
STF c06, [AO + 5 * SIZE]
STF c07, [AO + 6 * SIZE]
STF c08, [AO + 7 * SIZE]
#endif
STF c01, [C1 + 0 * SIZE]
STF c02, [C1 + 1 * SIZE]
STF c03, [C2 + 0 * SIZE]
STF c04, [C2 + 1 * SIZE]
STF c05, [C3 + 0 * SIZE]
STF c06, [C3 + 1 * SIZE]
STF c07, [C4 + 0 * SIZE]
STF c08, [C4 + 1 * SIZE]
FMOV FZERO, t1
FMOV FZERO, t2
FMOV FZERO, t3
FMOV FZERO, t4
#ifndef LN
add C1, 2 * SIZE, C1
add C2, 2 * SIZE, C2
add C3, 2 * SIZE, C3
add C4, 2 * SIZE, C4
#endif
#ifdef RT
sll K, 1 + BASE_SHIFT, TEMP1
add AORIG, TEMP1, AORIG
#endif
#if defined(LT) || defined(RN)
sub K, KK, TEMP1
sll TEMP1, 1 + BASE_SHIFT, TEMP2
sll TEMP1, 2 + BASE_SHIFT, TEMP1
add AO, TEMP2, AO
add BO, TEMP1, BO
#endif
#ifdef LT
add KK, 2, KK
#endif
#ifdef LN
sub KK, 2, KK
#endif
.LL70:
sra M, 2, I
cmp I, 0
ble,pn %icc, .LL99
nop
.LL21:
FMOV FZERO, t1
FMOV FZERO, t2
FMOV FZERO, t3
FMOV FZERO, t4
FMOV FZERO, c01
FMOV FZERO, c02
FMOV FZERO, c03
#if defined(LT) || defined(RN)
sra KK, 2, L
mov B, BO
cmp L, 0
#else
#ifdef LN
sll K, 2 + BASE_SHIFT, TEMP1
sub AORIG, TEMP1, AORIG
#endif
sll KK, 2 + BASE_SHIFT, TEMP1
add AORIG, TEMP1, AO
add B, TEMP1, BO
sub K, KK, TEMP1
sra TEMP1, 2, L
cmp L, 0
#endif
LDF [AO + 0 * SIZE], a1
FMOV FZERO, c04
LDF [BO + 0 * SIZE], b1
FMOV FZERO, c05
LDF [AO + 1 * SIZE], a2
FMOV FZERO, c06
LDF [BO + 1 * SIZE], b2
FMOV FZERO, c07
LDF [AO + 2 * SIZE], a3
FMOV FZERO, c08
LDF [BO + 2 * SIZE], b3
FMOV FZERO, c09
LDF [AO + 3 * SIZE], a4
FMOV FZERO, c10
LDF [BO + 3 * SIZE], b4
FMOV FZERO, c11
LDF [BO + 4 * SIZE], b5 /* ***** */
LDF [AO + 4 * SIZE], a5 /* ***** */
#ifdef LN
prefetch [C1 + 3 * SIZE], 3
FMOV FZERO, c12
prefetch [C2 + 3 * SIZE], 3
FMOV FZERO, c13
prefetch [C3 + 3 * SIZE], 3
FMOV FZERO, c14
prefetch [C4 + 3 * SIZE], 3
FMOV FZERO, c15
#else
prefetch [C1 - 3 * SIZE], 3
FMOV FZERO, c12
prefetch [C2 - 3 * SIZE], 3
FMOV FZERO, c13
prefetch [C3 - 3 * SIZE], 3
FMOV FZERO, c14
prefetch [C4 - 3 * SIZE], 3
FMOV FZERO, c15
#endif
ble,pn %icc, .LL25
FMOV FZERO, c16
.LL22:
FADD c04, t1, c04
prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY
FMUL a1, b1, t1
nop
FADD c08, t2, c08
prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY
FMUL a1, b2, t2
add AO, 16 * SIZE, AO
FADD c12, t3, c12
LDF [AO - 13 * SIZE], a4
FMUL a1, b3, t3
add BO, 16 * SIZE, BO
FADD c16, t4, c16
nop
FMUL a1, b4, t4
LDF [AO - 8 * SIZE], a1
FADD c01, t1, c01
nop
FMUL a2, b1, t1
nop
FADD c05, t2, c05
nop
FMUL a2, b2, t2
nop
FADD c09, t3, c09
nop
FMUL a2, b3, t3
nop
FADD c13, t4, c13
add L, -1, L
FMUL a2, b4, t4
LDF [AO - 11 * SIZE], a2
FADD c02, t1, c02
nop
FMUL a3, b1, t1
nop
FADD c06, t2, c06
nop
FMUL a3, b2, t2
nop
FADD c10, t3, c10
nop
FMUL a3, b3, t3
nop
FADD c14, t4, c14
nop
FMUL a3, b4, t4
LDF [AO - 10 * SIZE], a3
FADD c03, t1, c03
nop
FMUL a4, b1, t1
LDF [BO - 8 * SIZE], b1
FADD c07, t2, c07
nop
FMUL a4, b2, t2
LDF [BO - 11 * SIZE], b2
FADD c11, t3, c11
nop
FMUL a4, b3, t3
LDF [BO - 10 * SIZE], b3
FADD c15, t4, c15
nop
FMUL a4, b4, t4
LDF [BO - 9 * SIZE], b4
FADD c04, t1, c04
nop
FMUL a5, b5, t1
LDF [AO - 9 * SIZE], a4
FADD c08, t2, c08
nop
FMUL a5, b2, t2
nop
FADD c12, t3, c12
nop
FMUL a5, b3, t3
nop
FADD c16, t4, c16
nop
FMUL a5, b4, t4
LDF [AO - 4 * SIZE], a5
FADD c01, t1, c01
nop
FMUL a2, b5, t1
nop
FADD c05, t2, c05
nop
FMUL a2, b2, t2
nop
FADD c09, t3, c09
nop
FMUL a2, b3, t3
nop
FADD c13, t4, c13
nop
FMUL a2, b4, t4
LDF [AO - 7 * SIZE], a2
FADD c02, t1, c02
nop
FMUL a3, b5, t1
nop
FADD c06, t2, c06
nop
FMUL a3, b2, t2
nop
FADD c10, t3, c10
nop
FMUL a3, b3, t3
nop
FADD c14, t4, c14
nop
FMUL a3, b4, t4
LDF [AO - 6 * SIZE], a3
FADD c03, t1, c03
nop
FMUL a4, b5, t1
LDF [BO - 4 * SIZE], b5
FADD c07, t2, c07
nop
FMUL a4, b2, t2
LDF [BO - 7 * SIZE], b2
FADD c11, t3, c11
nop
FMUL a4, b3, t3
LDF [BO - 6 * SIZE], b3
FADD c15, t4, c15
nop
FMUL a4, b4, t4
LDF [BO - 5 * SIZE], b4
FADD c04, t1, c04
nop
FMUL a1, b1, t1
LDF [AO - 5 * SIZE], a4
FADD c08, t2, c08
nop
FMUL a1, b2, t2
nop
FADD c12, t3, c12
nop
FMUL a1, b3, t3
nop
FADD c16, t4, c16
nop
FMUL a1, b4, t4
LDF [AO - 0 * SIZE], a1
FADD c01, t1, c01
nop
FMUL a2, b1, t1
nop
#ifdef DOUBLE
prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY
#else
nop
#endif
FADD c05, t2, c05
nop
FMUL a2, b2, t2
FADD c09, t3, c09
nop
FMUL a2, b3, t3
nop
FADD c13, t4, c13
nop
FMUL a2, b4, t4
nop
FADD c02, t1, c02
nop
FMUL a3, b1, t1
LDF [AO - 3 * SIZE], a2
FADD c06, t2, c06
#ifdef DOUBLE
prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY
#else
nop
#endif
FMUL a3, b2, t2
nop
FADD c10, t3, c10
nop
FMUL a3, b3, t3
nop
FADD c14, t4, c14
nop
FMUL a3, b4, t4
LDF [AO - 2 * SIZE], a3
FADD c03, t1, c03
nop
FMUL a4, b1, t1
LDF [BO - 0 * SIZE], b1
FADD c07, t2, c07
nop
FMUL a4, b2, t2
LDF [BO - 3 * SIZE], b2
FADD c11, t3, c11
nop
FMUL a4, b3, t3
LDF [BO - 2 * SIZE], b3
FADD c15, t4, c15
nop
FMUL a4, b4, t4
LDF [BO - 1 * SIZE], b4
FADD c04, t1, c04
nop
FMUL a5, b5, t1
LDF [AO - 1 * SIZE], a4
FADD c08, t2, c08
FMUL a5, b2, t2
FADD c12, t3, c12
FMUL a5, b3, t3
FADD c16, t4, c16
nop
FMUL a5, b4, t4
LDF [AO + 4 * SIZE], a5
FADD c01, t1, c01
nop
FMUL a2, b5, t1
nop
FADD c05, t2, c05
nop
FMUL a2, b2, t2
nop
FADD c09, t3, c09
nop
FMUL a2, b3, t3
nop
FADD c13, t4, c13
nop
FMUL a2, b4, t4
LDF [AO + 1 * SIZE], a2
FADD c02, t1, c02
nop
FMUL a3, b5, t1
nop
FADD c06, t2, c06
nop
FMUL a3, b2, t2
nop
FADD c10, t3, c10
nop
FMUL a3, b3, t3
nop
FADD c14, t4, c14
nop
FMUL a3, b4, t4
LDF [AO + 2 * SIZE], a3
FADD c03, t1, c03
cmp L, 0
FMUL a4, b5, t1
LDF [BO + 4 * SIZE], b5
FADD c07, t2, c07
nop
FMUL a4, b2, t2
LDF [BO + 1 * SIZE], b2
FADD c11, t3, c11
nop
FMUL a4, b3, t3
LDF [BO + 2 * SIZE], b3
FADD c15, t4, c15
FMUL a4, b4, t4
bg,pt %icc, .LL22
LDF [BO + 3 * SIZE], b4
.LL25:
#if defined(LT) || defined(RN)
and KK, 3, L
#else
and TEMP1, 3, L
#endif
cmp L, 0
ble,a,pn %icc, .LL29
nop
.LL26:
FADD c04, t1, c04
LDF [AO + 3 * SIZE], a4
FMUL a1, b1, t1
add AO, 4 * SIZE, AO
FADD c08, t2, c08
add BO, 4 * SIZE, BO
FMUL a1, b2, t2
add L, -1, L
FADD c12, t3, c12
nop
FMUL a1, b3, t3
cmp L, 0
FADD c16, t4, c16
nop
FMUL a1, b4, t4
LDF [AO + 0 * SIZE], a1
FADD c01, t1, c01
nop
FMUL a2, b1, t1
nop
FADD c05, t2, c05
nop
FMUL a2, b2, t2
nop
FADD c09, t3, c09
nop
FMUL a2, b3, t3
nop
FADD c13, t4, c13
nop
FMUL a2, b4, t4
LDF [AO + 1 * SIZE], a2
FADD c02, t1, c02
nop
FMUL a3, b1, t1
nop
FADD c06, t2, c06
nop
FMUL a3, b2, t2
nop
FADD c10, t3, c10
nop
FMUL a3, b3, t3
nop
FADD c14, t4, c14
nop
FMUL a3, b4, t4
LDF [AO + 2 * SIZE], a3
FADD c03, t1, c03
nop
FMUL a4, b1, t1
LDF [BO + 0 * SIZE], b1
FADD c07, t2, c07
nop
FMUL a4, b2, t2
LDF [BO + 1 * SIZE], b2
FADD c11, t3, c11
nop
FMUL a4, b3, t3
LDF [BO + 2 * SIZE], b3
FADD c15, t4, c15
FMUL a4, b4, t4
bg,pt %icc, .LL26
LDF [BO + 3 * SIZE], b4
.LL29:
#if defined(LN) || defined(RT)
sub KK, 4, TEMP1
sll TEMP1, 2 + BASE_SHIFT, TEMP1
add AORIG, TEMP1, AO
add B, TEMP1, BO
#endif
FADD c04, t1, c04
FADD c08, t2, c08
FADD c12, t3, c12
FADD c16, t4, c16
#if defined(LN) || defined(LT)
LDF [BO + 0 * SIZE], a1
LDF [BO + 1 * SIZE], a2
LDF [BO + 2 * SIZE], a3
LDF [BO + 3 * SIZE], a4
LDF [BO + 4 * SIZE], b1
LDF [BO + 5 * SIZE], b2
LDF [BO + 6 * SIZE], b3
LDF [BO + 7 * SIZE], b4
FSUB a1, c01, c01
FSUB a2, c05, c05
FSUB a3, c09, c09
FSUB a4, c13, c13
FSUB b1, c02, c02
FSUB b2, c06, c06
FSUB b3, c10, c10
FSUB b4, c14, c14
LDF [BO + 8 * SIZE], a1
LDF [BO + 9 * SIZE], a2
LDF [BO + 10 * SIZE], a3
LDF [BO + 11 * SIZE], a4
LDF [BO + 12 * SIZE], b1
LDF [BO + 13 * SIZE], b2
LDF [BO + 14 * SIZE], b3
LDF [BO + 15 * SIZE], b4
FSUB a1, c03, c03
FSUB a2, c07, c07
FSUB a3, c11, c11
FSUB a4, c15, c15
FSUB b1, c04, c04
FSUB b2, c08, c08
FSUB b3, c12, c12
FSUB b4, c16, c16
#else
LDF [AO + 0 * SIZE], a1
LDF [AO + 1 * SIZE], a2
LDF [AO + 2 * SIZE], a3
LDF [AO + 3 * SIZE], a4
LDF [AO + 4 * SIZE], b1
LDF [AO + 5 * SIZE], b2
LDF [AO + 6 * SIZE], b3
LDF [AO + 7 * SIZE], b4
FSUB a1, c01, c01
FSUB a2, c02, c02
FSUB a3, c03, c03
FSUB a4, c04, c04
FSUB b1, c05, c05
FSUB b2, c06, c06
FSUB b3, c07, c07
FSUB b4, c08, c08
LDF [AO + 8 * SIZE], a1
LDF [AO + 9 * SIZE], a2
LDF [AO + 10 * SIZE], a3
LDF [AO + 11 * SIZE], a4
LDF [AO + 12 * SIZE], b1
LDF [AO + 13 * SIZE], b2
LDF [AO + 14 * SIZE], b3
LDF [AO + 15 * SIZE], b4
FSUB a1, c09, c09
FSUB a2, c10, c10
FSUB a3, c11, c11
FSUB a4, c12, c12
FSUB b1, c13, c13
FSUB b2, c14, c14
FSUB b3, c15, c15
FSUB b4, c16, c16
#endif
#ifdef LN
LDF [AO + 15 * SIZE], a1
LDF [AO + 14 * SIZE], a2
LDF [AO + 13 * SIZE], a3
LDF [AO + 12 * SIZE], a4
FMUL a1, c04, c04
FMUL a1, c08, c08
FMUL a1, c12, c12
FMUL a1, c16, c16
FMUL a2, c04, t1
FMUL a2, c08, t2
FMUL a2, c12, t3
FMUL a2, c16, t4
FSUB c03, t1, c03
FSUB c07, t2, c07
FSUB c11, t3, c11
FSUB c15, t4, c15
FMUL a3, c04, t1
FMUL a3, c08, t2
FMUL a3, c12, t3
FMUL a3, c16, t4
FSUB c02, t1, c02
FSUB c06, t2, c06
FSUB c10, t3, c10
FSUB c14, t4, c14
FMUL a4, c04, t1
FMUL a4, c08, t2
FMUL a4, c12, t3
FMUL a4, c16, t4
FSUB c01, t1, c01
FSUB c05, t2, c05
FSUB c09, t3, c09
FSUB c13, t4, c13
LDF [AO + 10 * SIZE], a1
LDF [AO + 9 * SIZE], a2
LDF [AO + 8 * SIZE], a3
FMUL a1, c03, c03
FMUL a1, c07, c07
FMUL a1, c11, c11
FMUL a1, c15, c15
FMUL a2, c03, t1
FMUL a2, c07, t2
FMUL a2, c11, t3
FMUL a2, c15, t4
FSUB c02, t1, c02
FSUB c06, t2, c06
FSUB c10, t3, c10
FSUB c14, t4, c14
FMUL a3, c03, t1
FMUL a3, c07, t2
FMUL a3, c11, t3
FMUL a3, c15, t4
FSUB c01, t1, c01
FSUB c05, t2, c05
FSUB c09, t3, c09
FSUB c13, t4, c13
LDF [AO + 5 * SIZE], a1
LDF [AO + 4 * SIZE], a2
FMUL a1, c02, c02
FMUL a1, c06, c06
FMUL a1, c10, c10
FMUL a1, c14, c14
FMUL a2, c02, t1
FMUL a2, c06, t2
FMUL a2, c10, t3
FMUL a2, c14, t4
FSUB c01, t1, c01
FSUB c05, t2, c05
FSUB c09, t3, c09
FSUB c13, t4, c13
LDF [AO + 0 * SIZE], a1
FMUL a1, c01, c01
FMUL a1, c05, c05
FMUL a1, c09, c09
FMUL a1, c13, c13
#endif
#ifdef LT
LDF [AO + 0 * SIZE], a1
LDF [AO + 1 * SIZE], a2
LDF [AO + 2 * SIZE], a3
LDF [AO + 3 * SIZE], a4
FMUL a1, c01, c01
FMUL a1, c05, c05
FMUL a1, c09, c09
FMUL a1, c13, c13
FMUL a2, c01, t1
FMUL a2, c05, t2
FMUL a2, c09, t3
FMUL a2, c13, t4
FSUB c02, t1, c02
FSUB c06, t2, c06
FSUB c10, t3, c10
FSUB c14, t4, c14
FMUL a3, c01, t1
FMUL a3, c05, t2
FMUL a3, c09, t3
FMUL a3, c13, t4
FSUB c03, t1, c03
FSUB c07, t2, c07
FSUB c11, t3, c11
FSUB c15, t4, c15
FMUL a4, c01, t1
FMUL a4, c05, t2
FMUL a4, c09, t3
FMUL a4, c13, t4
FSUB c04, t1, c04
FSUB c08, t2, c08
FSUB c12, t3, c12
FSUB c16, t4, c16
LDF [AO + 5 * SIZE], a1
LDF [AO + 6 * SIZE], a2
LDF [AO + 7 * SIZE], a3
FMUL a1, c02, c02
FMUL a1, c06, c06
FMUL a1, c10, c10
FMUL a1, c14, c14
FMUL a2, c02, t1
FMUL a2, c06, t2
FMUL a2, c10, t3
FMUL a2, c14, t4
FSUB c03, t1, c03
FSUB c07, t2, c07
FSUB c11, t3, c11
FSUB c15, t4, c15
FMUL a3, c02, t1
FMUL a3, c06, t2
FMUL a3, c10, t3
FMUL a3, c14, t4
FSUB c04, t1, c04
FSUB c08, t2, c08
FSUB c12, t3, c12
FSUB c16, t4, c16
LDF [AO + 10 * SIZE], a1
LDF [AO + 11 * SIZE], a2
FMUL a1, c03, c03
FMUL a1, c07, c07
FMUL a1, c11, c11
FMUL a1, c15, c15
FMUL a2, c03, t1
FMUL a2, c07, t2
FMUL a2, c11, t3
FMUL a2, c15, t4
FSUB c04, t1, c04
FSUB c08, t2, c08
FSUB c12, t3, c12
FSUB c16, t4, c16
LDF [AO + 15 * SIZE], a1
FMUL a1, c04, c04
FMUL a1, c08, c08
FMUL a1, c12, c12
FMUL a1, c16, c16
#endif
#ifdef RN
LDF [BO + 0 * SIZE], a1
LDF [BO + 1 * SIZE], a2
LDF [BO + 2 * SIZE], a3
LDF [BO + 3 * SIZE], a4
FMUL a1, c01, c01
FMUL a1, c02, c02
FMUL a1, c03, c03
FMUL a1, c04, c04
FMUL a2, c01, t1
FMUL a2, c02, t2
FMUL a2, c03, t3
FMUL a2, c04, t4
FSUB c05, t1, c05
FSUB c06, t2, c06
FSUB c07, t3, c07
FSUB c08, t4, c08
FMUL a3, c01, t1
FMUL a3, c02, t2
FMUL a3, c03, t3
FMUL a3, c04, t4
FSUB c09, t1, c09
FSUB c10, t2, c10
FSUB c11, t3, c11
FSUB c12, t4, c12
FMUL a4, c01, t1
FMUL a4, c02, t2
FMUL a4, c03, t3
FMUL a4, c04, t4
FSUB c13, t1, c13
FSUB c14, t2, c14
FSUB c15, t3, c15
FSUB c16, t4, c16
LDF [BO + 5 * SIZE], a1
LDF [BO + 6 * SIZE], a2
LDF [BO + 7 * SIZE], a3
FMUL a1, c05, c05
FMUL a1, c06, c06
FMUL a1, c07, c07
FMUL a1, c08, c08
FMUL a2, c05, t1
FMUL a2, c06, t2
FMUL a2, c07, t3
FMUL a2, c08, t4
FSUB c09, t1, c09
FSUB c10, t2, c10
FSUB c11, t3, c11
FSUB c12, t4, c12
FMUL a3, c05, t1
FMUL a3, c06, t2
FMUL a3, c07, t3
FMUL a3, c08, t4
FSUB c13, t1, c13
FSUB c14, t2, c14
FSUB c15, t3, c15
FSUB c16, t4, c16
LDF [BO + 10 * SIZE], a1
LDF [BO + 11 * SIZE], a2
FMUL a1, c09, c09
FMUL a1, c10, c10
FMUL a1, c11, c11
FMUL a1, c12, c12
FMUL a2, c09, t1
FMUL a2, c10, t2
FMUL a2, c11, t3
FMUL a2, c12, t4
FSUB c13, t1, c13
FSUB c14, t2, c14
FSUB c15, t3, c15
FSUB c16, t4, c16
LDF [BO + 15 * SIZE], a1
FMUL a1, c13, c13
FMUL a1, c14, c14
FMUL a1, c15, c15
FMUL a1, c16, c16
#endif
#ifdef RT
LDF [BO + 15 * SIZE], a1
LDF [BO + 14 * SIZE], a2
LDF [BO + 13 * SIZE], a3
LDF [BO + 12 * SIZE], a4
FMUL a1, c13, c13
FMUL a1, c14, c14
FMUL a1, c15, c15
FMUL a1, c16, c16
FMUL a2, c13, t1
FMUL a2, c14, t2
FMUL a2, c15, t3
FMUL a2, c16, t4
FSUB c09, t1, c09
FSUB c10, t2, c10
FSUB c11, t3, c11
FSUB c12, t4, c12
FMUL a3, c13, t1
FMUL a3, c14, t2
FMUL a3, c15, t3
FMUL a3, c16, t4
FSUB c05, t1, c05
FSUB c06, t2, c06
FSUB c07, t3, c07
FSUB c08, t4, c08
FMUL a4, c13, t1
FMUL a4, c14, t2
FMUL a4, c15, t3
FMUL a4, c16, t4
FSUB c01, t1, c01
FSUB c02, t2, c02
FSUB c03, t3, c03
FSUB c04, t4, c04
LDF [BO + 10 * SIZE], a1
LDF [BO + 9 * SIZE], a2
LDF [BO + 8 * SIZE], a3
FMUL a1, c09, c09
FMUL a1, c10, c10
FMUL a1, c11, c11
FMUL a1, c12, c12
FMUL a2, c09, t1
FMUL a2, c10, t2
FMUL a2, c11, t3
FMUL a2, c12, t4
FSUB c05, t1, c05
FSUB c06, t2, c06
FSUB c07, t3, c07
FSUB c08, t4, c08
FMUL a3, c09, t1
FMUL a3, c10, t2
FMUL a3, c11, t3
FMUL a3, c12, t4
FSUB c01, t1, c01
FSUB c02, t2, c02
FSUB c03, t3, c03
FSUB c04, t4, c04
LDF [BO + 5 * SIZE], a1
LDF [BO + 4 * SIZE], a2
FMUL a1, c05, c05
FMUL a1, c06, c06
FMUL a1, c07, c07
FMUL a1, c08, c08
FMUL a2, c05, t1
FMUL a2, c06, t2
FMUL a2, c07, t3
FMUL a2, c08, t4
FSUB c01, t1, c01
FSUB c02, t2, c02
FSUB c03, t3, c03
FSUB c04, t4, c04
LDF [BO + 0 * SIZE], a1
FMUL a1, c01, c01
FMUL a1, c02, c02
FMUL a1, c03, c03
FMUL a1, c04, c04
#endif
#ifdef LN
add C1, -4 * SIZE, C1
add C2, -4 * SIZE, C2
add C3, -4 * SIZE, C3
add C4, -4 * SIZE, C4
#endif
#if defined(LN) || defined(LT)
STF c01, [BO + 0 * SIZE]
STF c05, [BO + 1 * SIZE]
STF c09, [BO + 2 * SIZE]
STF c13, [BO + 3 * SIZE]
STF c02, [BO + 4 * SIZE]
STF c06, [BO + 5 * SIZE]
STF c10, [BO + 6 * SIZE]
STF c14, [BO + 7 * SIZE]
STF c03, [BO + 8 * SIZE]
STF c07, [BO + 9 * SIZE]
STF c11, [BO + 10 * SIZE]
STF c15, [BO + 11 * SIZE]
STF c04, [BO + 12 * SIZE]
STF c08, [BO + 13 * SIZE]
STF c12, [BO + 14 * SIZE]
STF c16, [BO + 15 * SIZE]
#else
STF c01, [AO + 0 * SIZE]
STF c02, [AO + 1 * SIZE]
STF c03, [AO + 2 * SIZE]
STF c04, [AO + 3 * SIZE]
STF c05, [AO + 4 * SIZE]
STF c06, [AO + 5 * SIZE]
STF c07, [AO + 6 * SIZE]
STF c08, [AO + 7 * SIZE]
STF c09, [AO + 8 * SIZE]
STF c10, [AO + 9 * SIZE]
STF c11, [AO + 10 * SIZE]
STF c12, [AO + 11 * SIZE]
STF c13, [AO + 12 * SIZE]
STF c14, [AO + 13 * SIZE]
STF c15, [AO + 14 * SIZE]
STF c16, [AO + 15 * SIZE]
#endif
STF c01, [C1 + 0 * SIZE]
STF c02, [C1 + 1 * SIZE]
STF c03, [C1 + 2 * SIZE]
STF c04, [C1 + 3 * SIZE]
STF c05, [C2 + 0 * SIZE]
STF c06, [C2 + 1 * SIZE]
STF c07, [C2 + 2 * SIZE]
STF c08, [C2 + 3 * SIZE]
STF c09, [C3 + 0 * SIZE]
STF c10, [C3 + 1 * SIZE]
STF c11, [C3 + 2 * SIZE]
STF c12, [C3 + 3 * SIZE]
STF c13, [C4 + 0 * SIZE]
STF c14, [C4 + 1 * SIZE]
STF c15, [C4 + 2 * SIZE]
STF c16, [C4 + 3 * SIZE]
FMOV FZERO, t1
FMOV FZERO, t2
FMOV FZERO, t3
FMOV FZERO, t4
#ifndef LN
add C1, 4 * SIZE, C1
add C2, 4 * SIZE, C2
add C3, 4 * SIZE, C3
add C4, 4 * SIZE, C4
#endif
#ifdef RT
sll K, 2 + BASE_SHIFT, TEMP1
add AORIG, TEMP1, AORIG
#endif
#if defined(LT) || defined(RN)
sub K, KK, TEMP1
sll TEMP1, 2 + BASE_SHIFT, TEMP1
add AO, TEMP1, AO
add BO, TEMP1, BO
#endif
#ifdef LT
add KK, 4, KK
#endif
#ifdef LN
sub KK, 4, KK
#endif
add I, -1, I
cmp I, 0
sra K, 2, L
bg,pt %icc, .LL21
FMOV FZERO, c01
/* .LL99: bottom of one pass of the J loop (the loop head .LL11 and the
   initial value of J lie outside this view).  Adjust B and KK for the
   active build variant, then repeat while J > 0.
   NOTE(review): BASE_SHIFT is presumably log2(SIZE) - confirm in common.h. */
.LL99:
#ifdef LN
/* LN: B += K << (2 + BASE_SHIFT). */
sll K, 2 + BASE_SHIFT, TEMP1
add B, TEMP1, B
#endif
#if defined(LT) || defined(RN)
/* LT/RN: continue B from wherever the tile code left BO. */
mov BO, B
#endif
#ifdef RN
add KK, 4, KK
#endif
#ifdef RT
sub KK, 4, KK
#endif
/* J--; loop back to .LL11 while J > 0 (the nop fills the delay slot). */
add J, -1, J
cmp J, 0
bg,pt %icc, .LL11
nop
/* .LL100: handle a remaining pair of columns.  Entered only work-wise
   when bit 1 of N is set; otherwise control skips to .LL200.
   Sets up C1/C2, KK and AORIG/AO for the tiles that follow.
   NOTE(review): BASE_SHIFT is presumably log2(SIZE) - confirm in common.h. */
.LL100: /* n & 2 */
and N, 2, J
cmp J, 0
ble,pn %icc, .LL200
nop
#ifdef RT
/* RT: step B back by K << (1 + BASE_SHIFT) and C back by 2 * LDC
   before the column pointers are formed. */
sll K, 1 + BASE_SHIFT, TEMP1
sub B, TEMP1, B
sll LDC, 1, TEMP1
sub C, TEMP1, C
#endif
/* C1 = C, C2 = C + LDC: the two output columns of this pass. */
mov C, C1
add C, LDC, C2
#ifdef LN
/* LN: KK starts at M + OFFSET. */
add M, OFFSET, KK
#endif
#ifdef LT
/* LT: KK starts at OFFSET. */
mov OFFSET, KK
#endif
#if defined(LN) || defined(RT)
/* LN/RT: the tiles recompute AO from AORIG each time. */
mov A, AORIG
#else
/* LT/RN: AO walks forward from A. */
mov A, AO
#endif
#ifndef RT
/* Non-RT: advance C past the two columns (C = C2 + LDC). */
add C2, LDC, C
#endif
/* ---- 1 x 2 tile of the N & 2 pass ---------------------------------
   Runs only when bit 0 of M is set; otherwise skips to .LL150.
   Accumulates products of AO and BO elements into c01..c04, folds them
   into c01/c02, applies the variant-specific solve step, and stores the
   two results to the packed buffer and to C1/C2.
   NOTE(review): BASE_SHIFT is presumably log2(SIZE) - confirm in common.h. */
and M, 1, I
cmp I, 0
ble,pn %icc, .LL150
nop
#if defined(LT) || defined(RN)
/* LT/RN: BO = B, L = KK / 4 passes of the unrolled loop. */
sra KK, 2, L
mov B, BO
cmp L, 0
#else
#ifdef LN
/* LN: AORIG -= K << BASE_SHIFT. */
sll K, 0 + BASE_SHIFT, TEMP1
sub AORIG, TEMP1, AORIG
#endif
/* LN/RT: AO = AORIG + (KK << BASE_SHIFT),
          BO = B + (KK << (1 + BASE_SHIFT)),
          TEMP1 = K - KK, L = TEMP1 / 4.
   TEMP1 is kept live: .LL175 takes the remainder from it. */
sll KK, 0 + BASE_SHIFT, TEMP1
sll KK, 1 + BASE_SHIFT, TEMP2
add AORIG, TEMP1, AO
add B, TEMP2, BO
sub K, KK, TEMP1
sra TEMP1, 2, L
cmp L, 0
#endif
/* Preload four A and four B elements and zero the accumulators.
   The condition codes from "cmp L, 0" above are consumed by the ble
   below; the LDF/FMOV macros in between are presumed not to touch icc. */
LDF [AO + 0 * SIZE], a1
FMOV FZERO, c01
LDF [BO + 0 * SIZE], b1
FMOV FZERO, t1
LDF [AO + 1 * SIZE], a2
FMOV FZERO, c02
LDF [BO + 1 * SIZE], b2
FMOV FZERO, t2
LDF [AO + 2 * SIZE], a3
FMOV FZERO, c03
LDF [BO + 2 * SIZE], b3
FMOV FZERO, t3
LDF [AO + 3 * SIZE], a4
FMOV FZERO, c04
LDF [BO + 3 * SIZE], b4
FMOV FZERO, t4
ble,pn %icc, .LL175
nop
/* .LL172: four k-steps per pass (AO += 4 elements, BO += 8).
   Software pipelined: each FADD folds in the product left in tN by the
   previous FMUL.  c01/c03 collect the a*b1 and a*b3 products, c02/c04
   the a*b2 and a*b4 products; the pairs are merged at .LL179. */
.LL172:
FADD c01, t1, c01
add AO, 4 * SIZE, AO
FMUL a1, b1, t1
LDF [BO + 4 * SIZE], b1
FADD c02, t2, c02
FMUL a1, b2, t2
LDF [BO + 5 * SIZE], b2
add L, -1, L
LDF [AO + 0 * SIZE], a1
FADD c03, t3, c03
cmp L, 0
FMUL a2, b3, t3
LDF [BO + 6 * SIZE], b3
FADD c04, t4, c04
FMUL a2, b4, t4
LDF [BO + 7 * SIZE], b4
LDF [AO + 1 * SIZE], a2
FADD c01, t1, c01
FMUL a3, b1, t1
LDF [BO + 8 * SIZE], b1
FADD c02, t2, c02
FMUL a3, b2, t2
LDF [BO + 9 * SIZE], b2
LDF [AO + 2 * SIZE], a3
FADD c03, t3, c03
FMUL a4, b3, t3
LDF [BO + 10 * SIZE], b3
FADD c04, t4, c04
FMUL a4, b4, t4
LDF [BO + 11 * SIZE], b4
add BO, 8 * SIZE, BO
bg,pt %icc, .LL172
/* Delay slot: reload a4 for the next pass (runs on exit as well). */
LDF [AO + 3 * SIZE], a4
/* .LL175: remainder count L = (iteration count) & 3. */
.LL175:
#if defined(LT) || defined(RN)
and KK, 3, L
#else
and TEMP1, 3, L
#endif
cmp L, 0
ble,a,pn %icc, .LL179
nop
/* .LL176: one k-step per pass (AO += 1 element, BO += 2). */
.LL176:
FADD c01, t1, c01
add L, -1, L
FMUL a1, b1, t1
add AO, 1 * SIZE, AO
LDF [BO + 2 * SIZE], b1
FADD c02, t2, c02
cmp L, 0
FMUL a1, b2, t2
LDF [BO + 3 * SIZE], b2
add BO, 2 * SIZE, BO
bg,pt %icc, .LL176
LDF [AO + 0 * SIZE], a1
/* .LL179: drain the pending products and merge the split accumulators:
   c01 = c01 + c03, c02 = c02 + c04. */
.LL179:
FADD c01, t1, c01
FADD c02, t2, c02
FADD c03, t3, c03
FADD c04, t4, c04
FADD c01, c03, c01
FADD c02, c04, c02
#if defined(LN) || defined(RT)
/* LN/RT: re-point AO/BO at the block used by the solve step,
   indexed by KK - 1 (LN) or KK - 2 (RT). */
#ifdef LN
sub KK, 1, TEMP1
#else
sub KK, 2, TEMP1
#endif
sll TEMP1, 0 + BASE_SHIFT, TEMP2
sll TEMP1, 1 + BASE_SHIFT, TEMP1
add AORIG, TEMP2, AO
add B, TEMP1, BO
#endif
/* Form (stored value) - (accumulated sum); the stored values come from
   BO for LN/LT and from AO for RN/RT. */
#if defined(LN) || defined(LT)
LDF [BO + 0 * SIZE], a1
LDF [BO + 1 * SIZE], a2
FSUB a1, c01, c01
FSUB a2, c02, c02
#else
LDF [AO + 0 * SIZE], a1
LDF [AO + 1 * SIZE], a2
FSUB a1, c01, c01
FSUB a2, c02, c02
#endif
/* LN/LT: scale both results by the single coefficient at AO[0].
   NOTE(review): multiplying (not dividing) suggests the coefficient is
   stored pre-inverted - confirm against the packing routine. */
#ifdef LN
LDF [AO + 0 * SIZE], a1
FMUL a1, c01, c01
FMUL a1, c02, c02
#endif
#ifdef LT
LDF [AO + 0 * SIZE], a1
FMUL a1, c01, c01
FMUL a1, c02, c02
#endif
/* RN: 2x2 forward step using BO[0], BO[1], BO[3]:
   c01 *= BO[0]; c02 = (c02 - BO[1] * c01) * BO[3]. */
#ifdef RN
LDF [BO + 0 * SIZE], a1
LDF [BO + 1 * SIZE], a2
LDF [BO + 3 * SIZE], a3
FMUL a1, c01, c01
FMUL a2, c01, t1
FSUB c02, t1, c02
FMUL a3, c02, c02
#endif
/* RT: 2x2 backward step using BO[3], BO[2], BO[0]:
   c02 *= BO[3]; c01 = (c01 - BO[2] * c02) * BO[0]. */
#ifdef RT
LDF [BO + 3 * SIZE], a1
LDF [BO + 2 * SIZE], a2
LDF [BO + 0 * SIZE], a3
FMUL a1, c02, c02
FMUL a2, c02, t1
FSUB c01, t1, c01
FMUL a3, c01, c01
#endif
#ifdef LN
/* LN: move the output pointers back one element before storing. */
add C1, -1 * SIZE, C1
add C2, -1 * SIZE, C2
#endif
/* Write the results back into the packed buffer (BO for LN/LT, AO for
   RN/RT) and into the output columns C1/C2. */
#if defined(LN) || defined(LT)
STF c01, [BO + 0 * SIZE]
STF c02, [BO + 1 * SIZE]
#else
STF c01, [AO + 0 * SIZE]
STF c02, [AO + 1 * SIZE]
#endif
STF c01, [C1 + 0 * SIZE]
STF c02, [C2 + 0 * SIZE]
/* Clear the product temporaries for the next tile. */
FMOV FZERO, t1
FMOV FZERO, t2
FMOV FZERO, t3
FMOV FZERO, t4
#ifndef LN
add C1, 1 * SIZE, C1
add C2, 1 * SIZE, C2
#endif
#ifdef RT
/* RT: AORIG += K << BASE_SHIFT. */
sll K, 0 + BASE_SHIFT, TEMP1
add AORIG, TEMP1, AORIG
#endif
#if defined(LT) || defined(RN)
/* LT/RN: skip the remaining K - KK steps:
   AO += (K - KK) << BASE_SHIFT, BO += (K - KK) << (1 + BASE_SHIFT). */
sub K, KK, TEMP1
sll TEMP1, 0 + BASE_SHIFT, TEMP2
sll TEMP1, 1 + BASE_SHIFT, TEMP1
add AO, TEMP2, AO
add BO, TEMP1, BO
#endif
#ifdef LT
add KK, 1, KK
#endif
#ifdef LN
sub KK, 1, KK
#endif
/* ---- .LL150: 2 x 2 tile of the N & 2 pass --------------------------
   Runs only when bit 1 of M is set; otherwise skips to .LL170.
   Accumulators (as implied by the final stores to C1/C2):
     c01 -> C1[0], c03 -> C1[1], c02 -> C2[0], c04 -> C2[1].
   NOTE(review): BASE_SHIFT is presumably log2(SIZE) - confirm in common.h. */
.LL150:
and M, 2, I
cmp I, 0
ble,pn %icc, .LL170
nop
#if defined(LT) || defined(RN)
/* LT/RN: BO = B, L = KK / 4 passes of the unrolled loop. */
sra KK, 2, L
mov B, BO
cmp L, 0
#else
#ifdef LN
/* LN: AORIG -= K << (1 + BASE_SHIFT). */
sll K, 1 + BASE_SHIFT, TEMP1
sub AORIG, TEMP1, AORIG
#endif
/* LN/RT: AO = AORIG + (KK << (1 + BASE_SHIFT)), BO likewise from B;
   TEMP1 = K - KK stays live for the remainder count at .LL155. */
sll KK, 1 + BASE_SHIFT, TEMP1
sll KK, 1 + BASE_SHIFT, TEMP2
add AORIG, TEMP1, AO
add B, TEMP2, BO
sub K, KK, TEMP1
sra TEMP1, 2, L
cmp L, 0
#endif
/* Preload operands and zero accumulators.  The "cmp L, 0" inside this
   sequence repeats the compare already done above (same operands). */
LDF [AO + 0 * SIZE], a1
FMOV FZERO, c01
LDF [BO + 0 * SIZE], b1
FMOV FZERO, t1
LDF [AO + 1 * SIZE], a2
cmp L, 0
FMOV FZERO, c02
LDF [BO + 1 * SIZE], b2
FMOV FZERO, t2
LDF [AO + 2 * SIZE], a3
FMOV FZERO, c03
LDF [BO + 2 * SIZE], b3
FMOV FZERO, t3
LDF [AO + 3 * SIZE], a4
FMOV FZERO, c04
LDF [BO + 3 * SIZE], b4
FMOV FZERO, t4
ble,pn %icc, .LL155
nop
/* .LL152: four k-steps per pass (AO += 8 elements, BO += 8).
   BO is bumped early, so B loads in the first half use negative
   offsets; AO is bumped at the end.  Each FADD consumes the product
   left in tN by the preceding FMUL into that register.
   The nops are left untouched (presumably issue-slot padding). */
.LL152:
FADD c01, t1, c01
add L, -1, L
FMUL a1, b1, t1
prefetch [AO + APREFETCHSIZE * SIZE], 0
FADD c02, t2, c02
add BO, 8 * SIZE, BO
FMUL a1, b2, t2
LDF [AO + 4 * SIZE], a1
FADD c03, t3, c03
cmp L, 0
FMUL a2, b1, t3
LDF [BO - 4 * SIZE], b1
FADD c04, t4, c04
nop
FMUL a2, b2, t4
LDF [AO + 5 * SIZE], a2
FADD c01, t1, c01
nop
FMUL a3, b3, t1
LDF [BO - 3 * SIZE], b2
FADD c02, t2, c02
nop
FMUL a3, b4, t2
LDF [AO + 6 * SIZE], a3
FADD c03, t3, c03
nop
FMUL a4, b3, t3
LDF [BO - 2 * SIZE], b3
FADD c04, t4, c04
nop
FMUL a4, b4, t4
LDF [AO + 7 * SIZE], a4
FADD c01, t1, c01
nop
FMUL a1, b1, t1
LDF [BO - 1 * SIZE], b4
FADD c02, t2, c02
FMUL a1, b2, t2
LDF [AO + 8 * SIZE], a1
FADD c03, t3, c03
FMUL a2, b1, t3
LDF [BO + 0 * SIZE], b1
FADD c04, t4, c04
FMUL a2, b2, t4
LDF [AO + 9 * SIZE], a2
FADD c01, t1, c01
FMUL a3, b3, t1
LDF [BO + 1 * SIZE], b2
FADD c02, t2, c02
FMUL a3, b4, t2
LDF [AO + 10 * SIZE], a3
FADD c03, t3, c03
FMUL a4, b3, t3
LDF [BO + 2 * SIZE], b3
FADD c04, t4, c04
FMUL a4, b4, t4
LDF [AO + 11 * SIZE], a4
add AO, 8 * SIZE, AO
bg,pt %icc, .LL152
/* Delay slot: reload b4 for the next pass. */
LDF [BO + 3 * SIZE], b4
/* .LL155: remainder count L = (iteration count) & 3. */
.LL155:
#if defined(LT) || defined(RN)
and KK, 3, L
#else
and TEMP1, 3, L
#endif
cmp L, 0
ble,a,pn %icc, .LL159
nop
/* .LL156: one k-step per pass; unlike the unrolled loop this one
   reloads a1/a2/b1/b2 at the top of every pass (AO += 2, BO += 2). */
.LL156:
LDF [AO + 0 * SIZE], a1
LDF [AO + 1 * SIZE], a2
LDF [BO + 0 * SIZE], b1
LDF [BO + 1 * SIZE], b2
FADD c01, t1, c01
FADD c02, t2, c02
FADD c03, t3, c03
FADD c04, t4, c04
FMUL a1, b1, t1
FMUL a1, b2, t2
FMUL a2, b1, t3
FMUL a2, b2, t4
add AO, 2 * SIZE, AO
add BO, 2 * SIZE, BO
add L, -1, L
cmp L, 0
bg,pt %icc, .LL156
nop
/* .LL159: drain the last pending products. */
.LL159:
FADD c01, t1, c01
FADD c02, t2, c02
FADD c03, t3, c03
FADD c04, t4, c04
#if defined(LN) || defined(RT)
/* LN/RT: re-point AO/BO at block index KK - 2.  Both preprocessor
   arms are identical here because the tile is 2 wide in each
   dimension. */
#ifdef LN
sub KK, 2, TEMP1
#else
sub KK, 2, TEMP1
#endif
sll TEMP1, 1 + BASE_SHIFT, TEMP2
sll TEMP1, 1 + BASE_SHIFT, TEMP1
add AORIG, TEMP2, AO
add B, TEMP1, BO
#endif
/* Form (stored value) - (accumulated sum).  The BO copy is ordered
   c01, c02, c03, c04; the AO copy is ordered c01, c03, c02, c04. */
#if defined(LN) || defined(LT)
LDF [BO + 0 * SIZE], a1
LDF [BO + 1 * SIZE], a2
LDF [BO + 2 * SIZE], a3
LDF [BO + 3 * SIZE], a4
FSUB a1, c01, c01
FSUB a2, c02, c02
FSUB a3, c03, c03
FSUB a4, c04, c04
#else
LDF [AO + 0 * SIZE], a1
LDF [AO + 1 * SIZE], a2
LDF [AO + 2 * SIZE], a3
LDF [AO + 3 * SIZE], a4
FSUB a1, c01, c01
FSUB a2, c03, c03
FSUB a3, c02, c02
FSUB a4, c04, c04
#endif
/* LN: backward step with AO[3], AO[2], AO[0]:
   (c03,c04) *= AO[3]; (c01,c02) = ((c01,c02) - AO[2]*(c03,c04)) * AO[0].
   NOTE(review): diagonal coefficients are multiplied, not divided, so
   they are presumably stored pre-inverted - confirm. */
#ifdef LN
LDF [AO + 3 * SIZE], a1
LDF [AO + 2 * SIZE], a2
LDF [AO + 0 * SIZE], a3
FMUL a1, c03, c03
FMUL a1, c04, c04
FMUL a2, c03, t1
FMUL a2, c04, t2
FSUB c01, t1, c01
FSUB c02, t2, c02
FMUL a3, c01, c01
FMUL a3, c02, c02
#endif
/* LT: forward step with AO[0], AO[1], AO[3]. */
#ifdef LT
LDF [AO + 0 * SIZE], a1
LDF [AO + 1 * SIZE], a2
LDF [AO + 3 * SIZE], a3
FMUL a1, c01, c01
FMUL a1, c02, c02
FMUL a2, c01, t1
FMUL a2, c02, t2
FSUB c03, t1, c03
FSUB c04, t2, c04
FMUL a3, c03, c03
FMUL a3, c04, c04
#endif
/* RN: forward step over the column pair with BO[0], BO[1], BO[3]. */
#ifdef RN
LDF [BO + 0 * SIZE], a1
LDF [BO + 1 * SIZE], a2
LDF [BO + 3 * SIZE], a3
FMUL a1, c01, c01
FMUL a1, c03, c03
FMUL a2, c01, t1
FMUL a2, c03, t2
FSUB c02, t1, c02
FSUB c04, t2, c04
FMUL a3, c02, c02
FMUL a3, c04, c04
#endif
/* RT: backward step over the column pair with BO[3], BO[2], BO[0]. */
#ifdef RT
LDF [BO + 3 * SIZE], a1
LDF [BO + 2 * SIZE], a2
LDF [BO + 0 * SIZE], a3
FMUL a1, c02, c02
FMUL a1, c04, c04
FMUL a2, c02, t1
FMUL a2, c04, t2
FSUB c01, t1, c01
FSUB c03, t2, c03
FMUL a3, c01, c01
FMUL a3, c03, c03
#endif
#ifdef LN
/* LN: move the output pointers back two elements before storing. */
add C1, -2 * SIZE, C1
add C2, -2 * SIZE, C2
#endif
/* Write results back to the packed buffer (same ordering as the loads
   above) and to the two output columns. */
#if defined(LN) || defined(LT)
STF c01, [BO + 0 * SIZE]
STF c02, [BO + 1 * SIZE]
STF c03, [BO + 2 * SIZE]
STF c04, [BO + 3 * SIZE]
#else
STF c01, [AO + 0 * SIZE]
STF c03, [AO + 1 * SIZE]
STF c02, [AO + 2 * SIZE]
STF c04, [AO + 3 * SIZE]
#endif
STF c01, [C1 + 0 * SIZE]
STF c03, [C1 + 1 * SIZE]
STF c02, [C2 + 0 * SIZE]
STF c04, [C2 + 1 * SIZE]
/* Clear the product temporaries for the next tile. */
FMOV FZERO, t1
FMOV FZERO, t2
FMOV FZERO, t3
FMOV FZERO, t4
#ifndef LN
add C1, 2 * SIZE, C1
add C2, 2 * SIZE, C2
#endif
#ifdef RT
/* RT: AORIG += K << (1 + BASE_SHIFT). */
sll K, 1 + BASE_SHIFT, TEMP1
add AORIG, TEMP1, AORIG
#endif
#if defined(LT) || defined(RN)
/* LT/RN: skip the remaining K - KK steps in both AO and BO. */
sub K, KK, TEMP1
sll TEMP1, 1 + BASE_SHIFT, TEMP2
sll TEMP1, 1 + BASE_SHIFT, TEMP1
add AO, TEMP2, AO
add BO, TEMP1, BO
#endif
#ifdef LT
add KK, 2, KK
#endif
#ifdef LN
sub KK, 2, KK
#endif
/* ---- .LL170 / .LL121: 4 x 2 tiles of the N & 2 pass ----------------
   I = M / 4 tiles; skips to .LL199 when there are none.
   Accumulators (as implied by the final stores):
     c01..c04 -> C1[0..3], c05..c08 -> C2[0..3].
   c03 is zeroed in the delay slot of the branch that enters or repeats
   the loop; the other seven accumulators are zeroed inside the preload
   sequence.
   NOTE(review): BASE_SHIFT is presumably log2(SIZE) - confirm in common.h. */
.LL170:
sra M, 2, I
cmp I, 0
ble,pn %icc, .LL199
FMOV FZERO, c03
.LL121:
#if defined(LT) || defined(RN)
/* LT/RN: BO = B, L = KK / 4 passes of the unrolled loop. */
sra KK, 2, L
mov B, BO
cmp L, 0
#else
#ifdef LN
/* LN: AORIG -= K << (2 + BASE_SHIFT). */
sll K, 2 + BASE_SHIFT, TEMP1
sub AORIG, TEMP1, AORIG
#endif
/* LN/RT: AO = AORIG + (KK << (2 + BASE_SHIFT)),
          BO = B + (KK << (1 + BASE_SHIFT)),
          TEMP1 = K - KK (kept live for .LL125), L = TEMP1 / 4. */
sll KK, 2 + BASE_SHIFT, TEMP1
sll KK, 1 + BASE_SHIFT, TEMP2
add AORIG, TEMP1, AO
add B, TEMP2, BO
sub K, KK, TEMP1
sra TEMP1, 2, L
cmp L, 0
#endif
/* Preload four A and four B elements, zero temporaries/accumulators.
   The condition codes from "cmp L, 0" are consumed by the ble below. */
LDF [AO + 0 * SIZE], a1
FMOV FZERO, t1
LDF [BO + 0 * SIZE], b1
FMOV FZERO, c07
LDF [AO + 1 * SIZE], a2
FMOV FZERO, t2
LDF [BO + 1 * SIZE], b2
FMOV FZERO, c04
LDF [AO + 2 * SIZE], a3
FMOV FZERO, t3
LDF [BO + 2 * SIZE], b3
FMOV FZERO, c08
LDF [AO + 3 * SIZE], a4
FMOV FZERO, t4
LDF [BO + 3 * SIZE], b4
FMOV FZERO, c01
/* Prefetch the output locations: below C1/C2 for LN (which stores
   after moving the pointers back), above them otherwise. */
#ifdef LN
prefetch [C1 - 3 * SIZE], 2
FMOV FZERO, c05
prefetch [C2 - 3 * SIZE], 2
FMOV FZERO, c02
#else
prefetch [C1 + 3 * SIZE], 2
FMOV FZERO, c05
prefetch [C2 + 3 * SIZE], 2
FMOV FZERO, c02
#endif
ble,pn %icc, .LL125
/* Delay slot: zero the last accumulator. */
FMOV FZERO, c06
/* .LL122: four k-steps per pass (AO += 16 elements, BO += 8).
   Both pointers are bumped near the top, so most loads in the body use
   negative offsets.  Software pipelined: every FADD folds in the
   product that the previous FMUL left in the same tN register, e.g.
   "FMUL a1, b1, t1" is consumed by the following "FADD c01, t1, c01"
   and "FMUL a3, b1, t1" by the following "FADD c03, t1, c03".
   The nops are left untouched (presumably issue-slot padding). */
.LL122:
FADD c03, t1, c03
add L, -1, L
FMUL a1, b1, t1
prefetch [AO + APREFETCHSIZE * SIZE], 0
FADD c07, t2, c07
add BO, 8 * SIZE, BO
FMUL a1, b2, t2
LDF [AO + 4 * SIZE], a1
FADD c04, t3, c04
add AO, 16 * SIZE, AO
FMUL a2, b1, t3
cmp L, 0
FADD c08, t4, c08
nop
FMUL a2, b2, t4
LDF [AO - 11 * SIZE], a2
FADD c01, t1, c01
nop
FMUL a3, b1, t1
nop
FADD c05, t2, c05
nop
FMUL a3, b2, t2
LDF [AO - 10 * SIZE], a3
FADD c02, t3, c02
nop
FMUL a4, b1, t3
LDF [BO - 4 * SIZE], b1
FADD c06, t4, c06
nop
FMUL a4, b2, t4
LDF [BO - 3 * SIZE], b2
/* second k-step: uses b3/b4 */
FADD c03, t1, c03
nop
FMUL a1, b3, t1
LDF [AO - 9 * SIZE], a4
FADD c07, t2, c07
nop
FMUL a1, b4, t2
LDF [AO - 8 * SIZE], a1
FADD c04, t3, c04
nop
FMUL a2, b3, t3
nop
FADD c08, t4, c08
nop
FMUL a2, b4, t4
LDF [AO - 7 * SIZE], a2
FADD c01, t1, c01
nop
FMUL a3, b3, t1
nop
FADD c05, t2, c05
nop
FMUL a3, b4, t2
LDF [AO - 6 * SIZE], a3
FADD c02, t3, c02
nop
FMUL a4, b3, t3
LDF [BO - 2 * SIZE], b3
FADD c06, t4, c06
nop
FMUL a4, b4, t4
LDF [BO - 1 * SIZE], b4
/* third k-step: uses the reloaded b1/b2 */
FADD c03, t1, c03
nop
FMUL a1, b1, t1
LDF [AO - 5 * SIZE], a4
FADD c07, t2, c07
nop
FMUL a1, b2, t2
LDF [AO - 4 * SIZE], a1
FADD c04, t3, c04
nop
FMUL a2, b1, t3
nop
FADD c08, t4, c08
nop
FMUL a2, b2, t4
LDF [AO - 3 * SIZE], a2
FADD c01, t1, c01
nop
FMUL a3, b1, t1
nop
FADD c05, t2, c05
nop
FMUL a3, b2, t2
LDF [AO - 2 * SIZE], a3
FADD c02, t3, c02
nop
FMUL a4, b1, t3
LDF [BO + 0 * SIZE], b1
FADD c06, t4, c06
nop
FMUL a4, b2, t4
LDF [BO + 1 * SIZE], b2
/* fourth k-step: uses the reloaded b3/b4; loads prepare the next pass */
FADD c03, t1, c03
nop
FMUL a1, b3, t1
LDF [AO - 1 * SIZE], a4
FADD c07, t2, c07
nop
FMUL a1, b4, t2
LDF [AO + 0 * SIZE], a1
FADD c04, t3, c04
nop
FMUL a2, b3, t3
nop
FADD c08, t4, c08
nop
FMUL a2, b4, t4
LDF [AO + 1 * SIZE], a2
FADD c01, t1, c01
nop
FMUL a3, b3, t1
nop
FADD c05, t2, c05
nop
FMUL a3, b4, t2
LDF [AO + 2 * SIZE], a3
FADD c02, t3, c02
nop
FMUL a4, b3, t3
LDF [BO + 2 * SIZE], b3
FADD c06, t4, c06
FMUL a4, b4, t4
LDF [AO + 3 * SIZE], a4
bg,pt %icc, .LL122
/* Delay slot: reload b4 for the next pass. */
LDF [BO + 3 * SIZE], b4
/* .LL125: remainder count L = (iteration count) & 3. */
.LL125:
#if defined(LT) || defined(RN)
and KK, 3, L
#else
and TEMP1, 3, L
#endif
cmp L, 0
ble,a,pn %icc, .LL129
nop
/* .LL126: one k-step per pass (AO += 4 elements, BO += 2). */
.LL126:
FADD c03, t1, c03
add AO, 4 * SIZE, AO
FMUL a1, b1, t1
add BO, 2 * SIZE, BO
FADD c07, t2, c07
add L, -1, L
FMUL a1, b2, t2
LDF [AO + 0 * SIZE], a1
FADD c04, t3, c04
cmp L, 0
FMUL a2, b1, t3
FADD c08, t4, c08
FMUL a2, b2, t4
LDF [AO + 1 * SIZE], a2
FADD c01, t1, c01
FMUL a3, b1, t1
FADD c05, t2, c05
FMUL a3, b2, t2
LDF [AO + 2 * SIZE], a3
FADD c02, t3, c02
FMUL a4, b1, t3
LDF [BO + 0 * SIZE], b1
FADD c06, t4, c06
FMUL a4, b2, t4
LDF [BO + 1 * SIZE], b2
bg,pt %icc, .LL126
LDF [AO + 3 * SIZE], a4
/* .LL129: drain the four products still pending in t1..t4. */
.LL129:
FADD c03, t1, c03
FADD c07, t2, c07
FADD c04, t3, c04
FADD c08, t4, c08
#if defined(LN) || defined(RT)
/* LN/RT: re-point AO/BO at the block used by the solve step,
   indexed by KK - 4 (LN) or KK - 2 (RT). */
#ifdef LN
sub KK, 4, TEMP1
#else
sub KK, 2, TEMP1
#endif
sll TEMP1, 2 + BASE_SHIFT, TEMP2
sll TEMP1, 1 + BASE_SHIFT, TEMP1
add AORIG, TEMP2, AO
add B, TEMP1, BO
#endif
/* Form (stored value) - (accumulated sum).  The BO copy interleaves
   the two columns (c01, c05, c02, c06, ...); the AO copy is c01..c08
   in order. */
#if defined(LN) || defined(LT)
LDF [BO + 0 * SIZE], a1
LDF [BO + 1 * SIZE], a2
LDF [BO + 2 * SIZE], a3
LDF [BO + 3 * SIZE], a4
LDF [BO + 4 * SIZE], b1
LDF [BO + 5 * SIZE], b2
LDF [BO + 6 * SIZE], b3
LDF [BO + 7 * SIZE], b4
FSUB a1, c01, c01
FSUB a2, c05, c05
FSUB a3, c02, c02
FSUB a4, c06, c06
FSUB b1, c03, c03
FSUB b2, c07, c07
FSUB b3, c04, c04
FSUB b4, c08, c08
#else
LDF [AO + 0 * SIZE], a1
LDF [AO + 1 * SIZE], a2
LDF [AO + 2 * SIZE], a3
LDF [AO + 3 * SIZE], a4
LDF [AO + 4 * SIZE], b1
LDF [AO + 5 * SIZE], b2
LDF [AO + 6 * SIZE], b3
LDF [AO + 7 * SIZE], b4
FSUB a1, c01, c01
FSUB a2, c02, c02
FSUB a3, c03, c03
FSUB a4, c04, c04
FSUB b1, c05, c05
FSUB b2, c06, c06
FSUB b3, c07, c07
FSUB b4, c08, c08
#endif
/* LN: backward substitution through a 4x4 coefficient block in AO,
   starting from row 3 (AO[15..12]) and ending at AO[0]; both columns
   (c01..c04 and c05..c08) are processed in lock step.
   NOTE(review): diagonal coefficients are multiplied, not divided, so
   they are presumably stored pre-inverted - confirm. */
#ifdef LN
LDF [AO + 15 * SIZE], a1
LDF [AO + 14 * SIZE], a2
LDF [AO + 13 * SIZE], a3
LDF [AO + 12 * SIZE], a4
FMUL a1, c04, c04
FMUL a1, c08, c08
FMUL a2, c04, t1
FMUL a2, c08, t2
FSUB c03, t1, c03
FSUB c07, t2, c07
FMUL a3, c04, t1
FMUL a3, c08, t2
FSUB c02, t1, c02
FSUB c06, t2, c06
FMUL a4, c04, t1
FMUL a4, c08, t2
FSUB c01, t1, c01
FSUB c05, t2, c05
LDF [AO + 10 * SIZE], a1
LDF [AO + 9 * SIZE], a2
LDF [AO + 8 * SIZE], a3
FMUL a1, c03, c03
FMUL a1, c07, c07
FMUL a2, c03, t1
FMUL a2, c07, t2
FSUB c02, t1, c02
FSUB c06, t2, c06
FMUL a3, c03, t1
FMUL a3, c07, t2
FSUB c01, t1, c01
FSUB c05, t2, c05
LDF [AO + 5 * SIZE], a1
LDF [AO + 4 * SIZE], a2
FMUL a1, c02, c02
FMUL a1, c06, c06
FMUL a2, c02, t1
FMUL a2, c06, t2
FSUB c01, t1, c01
FSUB c05, t2, c05
LDF [AO + 0 * SIZE], a1
FMUL a1, c01, c01
FMUL a1, c05, c05
#endif
/* LT: forward substitution through the 4x4 block, AO[0..3] first and
   AO[15] last. */
#ifdef LT
LDF [AO + 0 * SIZE], a1
LDF [AO + 1 * SIZE], a2
LDF [AO + 2 * SIZE], a3
LDF [AO + 3 * SIZE], a4
FMUL a1, c01, c01
FMUL a1, c05, c05
FMUL a2, c01, t1
FMUL a2, c05, t2
FSUB c02, t1, c02
FSUB c06, t2, c06
FMUL a3, c01, t1
FMUL a3, c05, t2
FSUB c03, t1, c03
FSUB c07, t2, c07
FMUL a4, c01, t1
FMUL a4, c05, t2
FSUB c04, t1, c04
FSUB c08, t2, c08
LDF [AO + 5 * SIZE], a1
LDF [AO + 6 * SIZE], a2
LDF [AO + 7 * SIZE], a3
FMUL a1, c02, c02
FMUL a1, c06, c06
FMUL a2, c02, t1
FMUL a2, c06, t2
FSUB c03, t1, c03
FSUB c07, t2, c07
FMUL a3, c02, t1
FMUL a3, c06, t2
FSUB c04, t1, c04
FSUB c08, t2, c08
LDF [AO + 10 * SIZE], a1
LDF [AO + 11 * SIZE], a2
FMUL a1, c03, c03
FMUL a1, c07, c07
FMUL a2, c03, t1
FMUL a2, c07, t2
FSUB c04, t1, c04
FSUB c08, t2, c08
LDF [AO + 15 * SIZE], a1
FMUL a1, c04, c04
FMUL a1, c08, c08
#endif
/* RN: 2x2 forward step across the column pair with BO[0], BO[1], BO[3]:
   column 1 (c01..c04) is scaled first, then eliminated from column 2. */
#ifdef RN
LDF [BO + 0 * SIZE], a1
LDF [BO + 1 * SIZE], a2
LDF [BO + 3 * SIZE], a3
FMUL a1, c01, c01
FMUL a1, c02, c02
FMUL a1, c03, c03
FMUL a1, c04, c04
FMUL a2, c01, t1
FMUL a2, c02, t2
FMUL a2, c03, t3
FMUL a2, c04, t4
FSUB c05, t1, c05
FSUB c06, t2, c06
FSUB c07, t3, c07
FSUB c08, t4, c08
FMUL a3, c05, c05
FMUL a3, c06, c06
FMUL a3, c07, c07
FMUL a3, c08, c08
#endif
/* RT: 2x2 backward step with BO[3], BO[2], BO[0]: column 2 (c05..c08)
   is scaled first, then eliminated from column 1. */
#ifdef RT
LDF [BO + 3 * SIZE], a1
LDF [BO + 2 * SIZE], a2
LDF [BO + 0 * SIZE], a3
FMUL a1, c05, c05
FMUL a1, c06, c06
FMUL a1, c07, c07
FMUL a1, c08, c08
FMUL a2, c05, t1
FMUL a2, c06, t2
FMUL a2, c07, t3
FMUL a2, c08, t4
FSUB c01, t1, c01
FSUB c02, t2, c02
FSUB c03, t3, c03
FSUB c04, t4, c04
FMUL a3, c01, c01
FMUL a3, c02, c02
FMUL a3, c03, c03
FMUL a3, c04, c04
#endif
#ifdef LN
/* LN: move the output pointers back four elements before storing. */
add C1, -4 * SIZE, C1
add C2, -4 * SIZE, C2
#endif
/* Write results back to the packed buffer (same ordering as the loads
   above) and to the two output columns. */
#if defined(LN) || defined(LT)
STF c01, [BO + 0 * SIZE]
STF c05, [BO + 1 * SIZE]
STF c02, [BO + 2 * SIZE]
STF c06, [BO + 3 * SIZE]
STF c03, [BO + 4 * SIZE]
STF c07, [BO + 5 * SIZE]
STF c04, [BO + 6 * SIZE]
STF c08, [BO + 7 * SIZE]
#else
STF c01, [AO + 0 * SIZE]
STF c02, [AO + 1 * SIZE]
STF c03, [AO + 2 * SIZE]
STF c04, [AO + 3 * SIZE]
STF c05, [AO + 4 * SIZE]
STF c06, [AO + 5 * SIZE]
STF c07, [AO + 6 * SIZE]
STF c08, [AO + 7 * SIZE]
#endif
STF c01, [C1 + 0 * SIZE]
STF c02, [C1 + 1 * SIZE]
STF c03, [C1 + 2 * SIZE]
STF c04, [C1 + 3 * SIZE]
STF c05, [C2 + 0 * SIZE]
STF c06, [C2 + 1 * SIZE]
STF c07, [C2 + 2 * SIZE]
STF c08, [C2 + 3 * SIZE]
/* Clear the product temporaries for the next tile. */
FMOV FZERO, t1
FMOV FZERO, t2
FMOV FZERO, t3
FMOV FZERO, t4
#ifndef LN
add C1, 4 * SIZE, C1
add C2, 4 * SIZE, C2
#endif
#ifdef RT
/* RT: AORIG += K << (2 + BASE_SHIFT). */
sll K, 2 + BASE_SHIFT, TEMP1
add AORIG, TEMP1, AORIG
#endif
#if defined(LT) || defined(RN)
/* LT/RN: skip the remaining K - KK steps:
   AO += (K - KK) << (2 + BASE_SHIFT), BO += (K - KK) << (1 + BASE_SHIFT). */
sub K, KK, TEMP1
sll TEMP1, 2 + BASE_SHIFT, TEMP2
sll TEMP1, 1 + BASE_SHIFT, TEMP1
add AO, TEMP2, AO
add BO, TEMP1, BO
#endif
#ifdef LT
add KK, 4, KK
#endif
#ifdef LN
sub KK, 4, KK
#endif
/* I--; next 4 x 2 tile while I > 0.  The delay slot re-zeroes c03,
   which the preload sequence at .LL121 does not clear. */
add I, -1, I
cmp I, 0
bg,pt %icc, .LL121
FMOV FZERO, c03
/* .LL199: end of the N & 2 pass.  Adjust B and KK for the active
   variant before falling through to the N & 1 pass.
   NOTE(review): BASE_SHIFT is presumably log2(SIZE) - confirm in common.h. */
.LL199:
#ifdef LN
/* LN: B += K << (1 + BASE_SHIFT). */
sll K, 1 + BASE_SHIFT, TEMP1
add B, TEMP1, B
#endif
#if defined(LT) || defined(RN)
/* LT/RN: continue B from wherever the tile code left BO. */
mov BO, B
#endif
#ifdef RN
add KK, 2, KK
#endif
#ifdef RT
sub KK, 2, KK
#endif
/* .LL200: handle a final single column.  Does work only when bit 0 of
   N is set; otherwise control goes straight to the exit at .LL999.
   Sets up C1, KK and AORIG/AO for the tiles that follow.
   NOTE(review): BASE_SHIFT is presumably log2(SIZE) - confirm in common.h. */
.LL200:
and N, 1, J
cmp J, 0
ble,pn %icc, .LL999
nop
#ifdef RT
/* RT: step B back by K << BASE_SHIFT and C back by LDC first. */
sll K, 0 + BASE_SHIFT, TEMP1
sub B, TEMP1, B
sub C, LDC, C
#endif
/* C1 = C: the single output column of this pass. */
mov C, C1
#ifdef LN
/* LN: KK starts at M + OFFSET. */
add M, OFFSET, KK
#endif
#ifdef LT
/* LT: KK starts at OFFSET. */
mov OFFSET, KK
#endif
#if defined(LN) || defined(RT)
/* LN/RT: the tiles recompute AO from AORIG each time. */
mov A, AORIG
#else
/* LT/RN: AO walks forward from A. */
mov A, AO
#endif
#ifndef RT
/* Non-RT: advance C past this column. */
add C, LDC, C
#endif
/* ---- 1 x 1 tile of the N & 1 pass ----------------------------------
   Runs only when bit 0 of M is set; otherwise skips to .LL250.
   Accumulates a dot product of AO and BO into c01/c02 (merged into c01
   at .LL279), applies the solve step, and stores one result.
   NOTE(review): BASE_SHIFT is presumably log2(SIZE) - confirm in common.h. */
and M, 1, I
cmp I, 0
ble,pn %icc, .LL250
nop
#if defined(LT) || defined(RN)
/* LT/RN: BO = B, L = KK / 4 passes of the unrolled loop. */
sra KK, 2, L
mov B, BO
cmp L, 0
#else
#ifdef LN
/* LN: AORIG -= K << BASE_SHIFT. */
sll K, 0 + BASE_SHIFT, TEMP1
sub AORIG, TEMP1, AORIG
#endif
/* LN/RT: AO and BO are both offset by KK << BASE_SHIFT;
   TEMP1 = K - KK stays live for the remainder count at .LL275. */
sll KK, 0 + BASE_SHIFT, TEMP1
add AORIG, TEMP1, AO
add B, TEMP1, BO
sub K, KK, TEMP1
sra TEMP1, 2, L
cmp L, 0
#endif
/* Preload four A and four B elements, zero temporaries/accumulators.
   The last B load sits in the delay slot of the ble and therefore runs
   whether or not the branch is taken. */
LDF [AO + 0 * SIZE], a1
FMOV FZERO, t1
LDF [AO + 1 * SIZE], a2
FMOV FZERO, c01
LDF [AO + 2 * SIZE], a3
FMOV FZERO, t2
LDF [AO + 3 * SIZE], a4
FMOV FZERO, c02
LDF [BO + 0 * SIZE], b1
FMOV FZERO, t3
LDF [BO + 1 * SIZE], b2
FMOV FZERO, t4
LDF [BO + 2 * SIZE], b3
ble,pn %icc, .LL275
LDF [BO + 3 * SIZE], b4
/* .LL272: four k-steps per pass (AO += 4, BO += 4).  Products go
   through t1..t4; t1/t3 feed c01 and t2/t4 feed c02. */
.LL272:
FADD c01, t1, c01
add L, -1, L
add AO, 4 * SIZE, AO
FMUL a1, b1, t1
add BO, 4 * SIZE, BO
LDF [AO + 0 * SIZE], a1
FADD c02, t2, c02
cmp L, 0
LDF [BO + 0 * SIZE], b1
FMUL a2, b2, t2
LDF [AO + 1 * SIZE], a2
FADD c01, t3, c01
LDF [BO + 1 * SIZE], b2
FMUL a3, b3, t3
LDF [AO + 2 * SIZE], a3
FADD c02, t4, c02
LDF [BO + 2 * SIZE], b3
FMUL a4, b4, t4
LDF [AO + 3 * SIZE], a4
bg,pt %icc, .LL272
LDF [BO + 3 * SIZE], b4
/* .LL275: remainder count L = (iteration count) & 3. */
.LL275:
#if defined(LT) || defined(RN)
and KK, 3, L
#else
and TEMP1, 3, L
#endif
cmp L, 0
ble,a,pn %icc, .LL279
nop
/* .LL276: one k-step per pass; AO is advanced in the delay slot. */
.LL276:
FADD c01, t1, c01
add L, -1, L
FMUL a1, b1, t1
LDF [AO + 1 * SIZE], a1
LDF [BO + 1 * SIZE], b1
add BO, 1 * SIZE, BO
cmp L, 0
bg,pt %icc, .LL276
add AO, 1 * SIZE, AO
/* .LL279: drain pending products and merge: c01 = c01 + c02. */
.LL279:
FADD c01, t1, c01
FADD c02, t2, c02
FADD c01, t3, c01
FADD c02, t4, c02
FADD c01, c02, c01
#if defined(LN) || defined(RT)
/* LN/RT: re-point AO/BO at element index KK - 1. */
sub KK, 1, TEMP1
sll TEMP1, 0 + BASE_SHIFT, TEMP1
add AORIG, TEMP1, AO
add B, TEMP1, BO
#endif
/* Form (stored value) - (accumulated sum). */
#if defined(LN) || defined(LT)
LDF [BO + 0 * SIZE], a1
FSUB a1, c01, c01
#else
LDF [AO + 0 * SIZE], a1
FSUB a1, c01, c01
#endif
/* Scale by the single coefficient: AO[0] for LN/LT, BO[0] for RN/RT.
   NOTE(review): multiplying (not dividing) suggests the coefficient is
   stored pre-inverted - confirm against the packing routine. */
#ifdef LN
LDF [AO + 0 * SIZE], a1
FMUL a1, c01, c01
#endif
#ifdef LT
LDF [AO + 0 * SIZE], a1
FMUL a1, c01, c01
#endif
#ifdef RN
LDF [BO + 0 * SIZE], a1
FMUL a1, c01, c01
#endif
#ifdef RT
LDF [BO + 0 * SIZE], a1
FMUL a1, c01, c01
#endif
#ifdef LN
/* LN: move the output pointer back one element before storing. */
add C1, -1 * SIZE, C1
#endif
/* Write the result to the packed buffer and to the output column. */
#if defined(LN) || defined(LT)
STF c01, [BO + 0 * SIZE]
#else
STF c01, [AO + 0 * SIZE]
#endif
STF c01, [C1 + 0 * SIZE]
/* Clear the product temporaries for the next tile. */
FMOV FZERO, t1
FMOV FZERO, t2
FMOV FZERO, t3
FMOV FZERO, t4
#ifndef LN
add C1, 1 * SIZE, C1
#endif
#ifdef RT
/* RT: AORIG += K << BASE_SHIFT. */
sll K, 0 + BASE_SHIFT, TEMP1
add AORIG, TEMP1, AORIG
#endif
#if defined(LT) || defined(RN)
/* LT/RN: skip the remaining K - KK steps in both AO and BO. */
sub K, KK, TEMP1
sll TEMP1, 0 + BASE_SHIFT, TEMP1
add AO, TEMP1, AO
add BO, TEMP1, BO
#endif
#ifdef LT
add KK, 1, KK
#endif
#ifdef LN
sub KK, 1, KK
#endif
/* ---- .LL250: 2 x 1 tile of the N & 1 pass --------------------------
   Runs only when bit 1 of M is set; otherwise skips to .LL270.
   c01/c03 collect products for C1[0], c02/c04 for C1[1]; the pairs are
   merged at .LL259.
   NOTE(review): BASE_SHIFT is presumably log2(SIZE) - confirm in common.h. */
.LL250:
and M, 2, I
cmp I, 0
ble,pn %icc, .LL270
nop
#if defined(LT) || defined(RN)
/* LT/RN: BO = B, L = KK / 4 passes of the unrolled loop. */
sra KK, 2, L
mov B, BO
cmp L, 0
#else
#ifdef LN
/* LN: AORIG -= K << (1 + BASE_SHIFT). */
sll K, 1 + BASE_SHIFT, TEMP1
sub AORIG, TEMP1, AORIG
#endif
/* LN/RT: AO = AORIG + (KK << (1 + BASE_SHIFT)),
          BO = B + (KK << BASE_SHIFT),
          TEMP1 = K - KK (kept live for .LL255), L = TEMP1 / 4. */
sll KK, 1 + BASE_SHIFT, TEMP1
sll KK, 0 + BASE_SHIFT, TEMP2
add AORIG, TEMP1, AO
add B, TEMP2, BO
sub K, KK, TEMP1
sra TEMP1, 2, L
cmp L, 0
#endif
/* Preload four A and four B elements, zero temporaries/accumulators. */
LDF [AO + 0 * SIZE], a1
FMOV FZERO, c01
LDF [BO + 0 * SIZE], b1
FMOV FZERO, t1
LDF [AO + 1 * SIZE], a2
FMOV FZERO, c02
LDF [BO + 1 * SIZE], b2
FMOV FZERO, t2
LDF [AO + 2 * SIZE], a3
FMOV FZERO, c03
LDF [BO + 2 * SIZE], b3
FMOV FZERO, t3
LDF [AO + 3 * SIZE], a4
FMOV FZERO, c04
LDF [BO + 3 * SIZE], b4
FMOV FZERO, t4
ble,pn %icc, .LL255
nop
/* .LL252: four k-steps per pass (AO += 8 elements, BO += 4).
   a1/a2 pair with b1 and b3, a3/a4 with b2 and b4.  BO is advanced in
   the delay slot of the loop branch. */
.LL252:
FADD c01, t1, c01
add L, -1, L
FMUL a1, b1, t1
LDF [AO + 4 * SIZE], a1
FADD c02, t2, c02
FMUL a2, b1, t2
LDF [AO + 5 * SIZE], a2
LDF [BO + 4 * SIZE], b1
FADD c03, t3, c03
cmp L, 0
FMUL a3, b2, t3
LDF [AO + 6 * SIZE], a3
FADD c04, t4, c04
FMUL a4, b2, t4
LDF [AO + 7 * SIZE], a4
LDF [BO + 5 * SIZE], b2
FADD c01, t1, c01
FMUL a1, b3, t1
LDF [AO + 8 * SIZE], a1
FADD c02, t2, c02
FMUL a2, b3, t2
LDF [AO + 9 * SIZE], a2
LDF [BO + 6 * SIZE], b3
FADD c03, t3, c03
FMUL a3, b4, t3
LDF [AO + 10 * SIZE], a3
FADD c04, t4, c04
FMUL a4, b4, t4
LDF [AO + 11 * SIZE], a4
add AO, 8 * SIZE, AO
LDF [BO + 7 * SIZE], b4
bg,pt %icc, .LL252
add BO, 4 * SIZE, BO
/* .LL255: remainder count L = (iteration count) & 3. */
.LL255:
#if defined(LT) || defined(RN)
and KK, 3, L
#else
and TEMP1, 3, L
#endif
cmp L, 0
ble,a,pn %icc, .LL259
nop
/* .LL256: one k-step per pass (AO += 2 elements, BO += 1). */
.LL256:
FADD c01, t1, c01
add L, -1, L
FMUL a1, b1, t1
LDF [AO + 2 * SIZE], a1
FADD c02, t2, c02
cmp L, 0
FMUL a2, b1, t2
LDF [AO + 3 * SIZE], a2
LDF [BO + 1 * SIZE], b1
add AO, 2 * SIZE, AO
bg,pt %icc, .LL256
add BO, 1 * SIZE, BO
/* .LL259: drain pending products and merge the split accumulators:
   c01 = c01 + c03, c02 = c02 + c04. */
.LL259:
FADD c01, t1, c01
FADD c02, t2, c02
FADD c03, t3, c03
FADD c04, t4, c04
FADD c01, c03, c01
FADD c02, c04, c02
#if defined(LN) || defined(RT)
/* LN/RT: re-point AO/BO at the block used by the solve step,
   indexed by KK - 2 (LN) or KK - 1 (RT). */
#ifdef LN
sub KK, 2, TEMP1
#else
sub KK, 1, TEMP1
#endif
sll TEMP1, 1 + BASE_SHIFT, TEMP2
sll TEMP1, 0 + BASE_SHIFT, TEMP1
add AORIG, TEMP2, AO
add B, TEMP1, BO
#endif
/* Form (stored value) - (accumulated sum). */
#if defined(LN) || defined(LT)
LDF [BO + 0 * SIZE], a1
LDF [BO + 1 * SIZE], a2
FSUB a1, c01, c01
FSUB a2, c02, c02
#else
LDF [AO + 0 * SIZE], a1
LDF [AO + 1 * SIZE], a2
FSUB a1, c01, c01
FSUB a2, c02, c02
#endif
/* LN: backward 2x2 step with AO[3], AO[2], AO[0]:
   c02 *= AO[3]; c01 = (c01 - AO[2] * c02) * AO[0].
   NOTE(review): diagonal coefficients are multiplied, not divided, so
   they are presumably stored pre-inverted - confirm. */
#ifdef LN
LDF [AO + 3 * SIZE], a1
LDF [AO + 2 * SIZE], a2
LDF [AO + 0 * SIZE], a3
FMUL a1, c02, c02
FMUL a2, c02, t1
FSUB c01, t1, c01
FMUL a3, c01, c01
#endif
/* LT: forward 2x2 step with AO[0], AO[1], AO[3]. */
#ifdef LT
LDF [AO + 0 * SIZE], a1
LDF [AO + 1 * SIZE], a2
LDF [AO + 3 * SIZE], a3
FMUL a1, c01, c01
FMUL a2, c01, t1
FSUB c02, t1, c02
FMUL a3, c02, c02
#endif
/* RN/RT: a single column, so both results are scaled by BO[0]. */
#ifdef RN
LDF [BO + 0 * SIZE], a1
FMUL a1, c01, c01
FMUL a1, c02, c02
#endif
#ifdef RT
LDF [BO + 0 * SIZE], a1
FMUL a1, c01, c01
FMUL a1, c02, c02
#endif
#ifdef LN
/* LN: move the output pointer back two elements before storing. */
add C1, -2 * SIZE, C1
#endif
/* Write results to the packed buffer and to the output column. */
#if defined(LN) || defined(LT)
STF c01, [BO + 0 * SIZE]
STF c02, [BO + 1 * SIZE]
#else
STF c01, [AO + 0 * SIZE]
STF c02, [AO + 1 * SIZE]
#endif
STF c01, [C1 + 0 * SIZE]
STF c02, [C1 + 1 * SIZE]
/* Clear the product temporaries for the next tile. */
FMOV FZERO, t1
FMOV FZERO, t2
FMOV FZERO, t3
FMOV FZERO, t4
#ifndef LN
add C1, 2 * SIZE, C1
#endif
#ifdef RT
/* RT: AORIG += K << (1 + BASE_SHIFT). */
sll K, 1 + BASE_SHIFT, TEMP1
add AORIG, TEMP1, AORIG
#endif
#if defined(LT) || defined(RN)
/* LT/RN: skip the remaining K - KK steps:
   AO += (K - KK) << (1 + BASE_SHIFT), BO += (K - KK) << BASE_SHIFT. */
sub K, KK, TEMP1
sll TEMP1, 1 + BASE_SHIFT, TEMP2
sll TEMP1, 0 + BASE_SHIFT, TEMP1
add AO, TEMP2, AO
add BO, TEMP1, BO
#endif
#ifdef LT
add KK, 2, KK
#endif
#ifdef LN
sub KK, 2, KK
#endif
/* ---- .LL270 / .LL221: 4 x 1 tiles of the N & 1 pass ----------------
   I = M / 4 tiles; skips to .LL299 when there are none.
   c01..c04 accumulate the four results stored to C1[0..3].
   NOTE(review): BASE_SHIFT is presumably log2(SIZE) - confirm in common.h. */
.LL270:
sra M, 2, I
cmp I, 0
ble,pn %icc, .LL299
nop
.LL221:
#if defined(LT) || defined(RN)
/* LT/RN: BO = B, L = KK / 4 passes of the unrolled loop. */
sra KK, 2, L
mov B, BO
cmp L, 0
#else
#ifdef LN
/* LN: AORIG -= K << (2 + BASE_SHIFT). */
sll K, 2 + BASE_SHIFT, TEMP1
sub AORIG, TEMP1, AORIG
#endif
/* LN/RT: AO = AORIG + (KK << (2 + BASE_SHIFT)),
          BO = B + (KK << BASE_SHIFT),
          TEMP1 = K - KK (kept live for .LL225), L = TEMP1 / 4. */
sll KK, 2 + BASE_SHIFT, TEMP1
sll KK, 0 + BASE_SHIFT, TEMP2
add AORIG, TEMP1, AO
add B, TEMP2, BO
sub K, KK, TEMP1
sra TEMP1, 2, L
cmp L, 0
#endif
/* Preload four A and four B elements, zero temporaries/accumulators. */
LDF [AO + 0 * SIZE], a1
FMOV FZERO, c01
LDF [BO + 0 * SIZE], b1
FMOV FZERO, t1
LDF [AO + 1 * SIZE], a2
FMOV FZERO, c02
LDF [BO + 1 * SIZE], b2
FMOV FZERO, t2
LDF [AO + 2 * SIZE], a3
FMOV FZERO, c03
LDF [BO + 2 * SIZE], b3
FMOV FZERO, t3
LDF [AO + 3 * SIZE], a4
FMOV FZERO, c04
LDF [BO + 3 * SIZE], b4
FMOV FZERO, t4
/* Prefetch the output location: below C1 for LN, above it otherwise. */
#ifdef LN
prefetch [C1 - 3 * SIZE], 2
#else
prefetch [C1 + 3 * SIZE], 2
#endif
ble,pn %icc, .LL225
/* Delay slot: a second prefetch, issued for every variant whether or
   not the branch is taken. */
prefetch [C1 + 4 * SIZE], 2
/* .LL222: four k-steps per pass (AO += 16 elements, BO += 4).
   Each k-step multiplies a1..a4 by one B element (b1, b2, b3, b4 in
   turn).  BO is bumped at the top, AO at the bottom. */
.LL222:
FADD c01, t1, c01
add BO, 4 * SIZE, BO
FMUL a1, b1, t1
LDF [AO + 4 * SIZE], a1
FADD c02, t2, c02
FMUL a2, b1, t2
LDF [AO + 5 * SIZE], a2
FADD c03, t3, c03
add L, -1, L
FMUL a3, b1, t3
LDF [AO + 6 * SIZE], a3
FADD c04, t4, c04
FMUL a4, b1, t4
LDF [AO + 7 * SIZE], a4
LDF [BO + 0 * SIZE], b1
FADD c01, t1, c01
cmp L, 0
FMUL a1, b2, t1
LDF [AO + 8 * SIZE], a1
FADD c02, t2, c02
FMUL a2, b2, t2
LDF [AO + 9 * SIZE], a2
FADD c03, t3, c03
FMUL a3, b2, t3
LDF [AO + 10 * SIZE], a3
FADD c04, t4, c04
FMUL a4, b2, t4
LDF [AO + 11 * SIZE], a4
LDF [BO + 1 * SIZE], b2
FADD c01, t1, c01
FMUL a1, b3, t1
LDF [AO + 12 * SIZE], a1
FADD c02, t2, c02
FMUL a2, b3, t2
LDF [AO + 13 * SIZE], a2
FADD c03, t3, c03
FMUL a3, b3, t3
LDF [AO + 14 * SIZE], a3
FADD c04, t4, c04
FMUL a4, b3, t4
LDF [AO + 15 * SIZE], a4
LDF [BO + 2 * SIZE], b3
FADD c01, t1, c01
FMUL a1, b4, t1
LDF [AO + 16 * SIZE], a1
FADD c02, t2, c02
FMUL a2, b4, t2
LDF [AO + 17 * SIZE], a2
FADD c03, t3, c03
FMUL a3, b4, t3
LDF [AO + 18 * SIZE], a3
FADD c04, t4, c04
FMUL a4, b4, t4
LDF [AO + 19 * SIZE], a4
add AO, 16 * SIZE, AO
bg,pt %icc, .LL222
/* Delay slot: reload b4 for the next pass. */
LDF [BO + 3 * SIZE], b4
/* .LL225: remainder count L = (iteration count) & 3. */
.LL225:
#if defined(LT) || defined(RN)
and KK, 3, L
#else
and TEMP1, 3, L
#endif
cmp L, 0
ble,a,pn %icc, .LL229
nop
/* .LL226: one k-step per pass (AO += 4 elements, BO += 1). */
.LL226:
FADD c01, t1, c01
add BO, 1 * SIZE, BO
FMUL a1, b1, t1
LDF [AO + 4 * SIZE], a1
FADD c02, t2, c02
add L, -1, L
FMUL a2, b1, t2
LDF [AO + 5 * SIZE], a2
FADD c03, t3, c03
cmp L, 0
FMUL a3, b1, t3
LDF [AO + 6 * SIZE], a3
FADD c04, t4, c04
FMUL a4, b1, t4
LDF [AO + 7 * SIZE], a4
add AO, 4 * SIZE, AO
bg,pt %icc, .LL226
LDF [BO + 0 * SIZE], b1
/* .LL229: drain the four products still pending in t1..t4. */
.LL229:
FADD c01, t1, c01
FADD c02, t2, c02
FADD c03, t3, c03
FADD c04, t4, c04
#if defined(LN) || defined(RT)
/* LN/RT: re-point AO/BO at the block used by the solve step,
   indexed by KK - 4 (LN) or KK - 1 (RT). */
#ifdef LN
sub KK, 4, TEMP1
#else
sub KK, 1, TEMP1
#endif
sll TEMP1, 2 + BASE_SHIFT, TEMP2
sll TEMP1, 0 + BASE_SHIFT, TEMP1
add AORIG, TEMP2, AO
add B, TEMP1, BO
#endif
/* Form (stored value) - (accumulated sum). */
#if defined(LN) || defined(LT)
LDF [BO + 0 * SIZE], a1
LDF [BO + 1 * SIZE], a2
LDF [BO + 2 * SIZE], a3
LDF [BO + 3 * SIZE], a4
FSUB a1, c01, c01
FSUB a2, c02, c02
FSUB a3, c03, c03
FSUB a4, c04, c04
#else
LDF [AO + 0 * SIZE], a1
LDF [AO + 1 * SIZE], a2
LDF [AO + 2 * SIZE], a3
LDF [AO + 3 * SIZE], a4
FSUB a1, c01, c01
FSUB a2, c02, c02
FSUB a3, c03, c03
FSUB a4, c04, c04
#endif
/* LN: backward substitution through a 4x4 coefficient block in AO,
   from AO[15..12] down to AO[0].
   NOTE(review): diagonal coefficients are multiplied, not divided, so
   they are presumably stored pre-inverted - confirm. */
#ifdef LN
LDF [AO + 15 * SIZE], a1
LDF [AO + 14 * SIZE], a2
LDF [AO + 13 * SIZE], a3
LDF [AO + 12 * SIZE], a4
FMUL a1, c04, c04
FMUL a2, c04, t1
FSUB c03, t1, c03
FMUL a3, c04, t1
FSUB c02, t1, c02
FMUL a4, c04, t1
FSUB c01, t1, c01
LDF [AO + 10 * SIZE], a1
LDF [AO + 9 * SIZE], a2
LDF [AO + 8 * SIZE], a3
FMUL a1, c03, c03
FMUL a2, c03, t1
FSUB c02, t1, c02
FMUL a3, c03, t1
FSUB c01, t1, c01
LDF [AO + 5 * SIZE], a1
LDF [AO + 4 * SIZE], a2
FMUL a1, c02, c02
FMUL a2, c02, t1
FSUB c01, t1, c01
LDF [AO + 0 * SIZE], a1
FMUL a1, c01, c01
#endif
/* LT: forward substitution through the 4x4 block, AO[0..3] first and
   AO[15] last. */
#ifdef LT
LDF [AO + 0 * SIZE], a1
LDF [AO + 1 * SIZE], a2
LDF [AO + 2 * SIZE], a3
LDF [AO + 3 * SIZE], a4
FMUL a1, c01, c01
FMUL a2, c01, t1
FSUB c02, t1, c02
FMUL a3, c01, t1
FSUB c03, t1, c03
FMUL a4, c01, t1
FSUB c04, t1, c04
LDF [AO + 5 * SIZE], a1
LDF [AO + 6 * SIZE], a2
LDF [AO + 7 * SIZE], a3
FMUL a1, c02, c02
FMUL a2, c02, t1
FSUB c03, t1, c03
FMUL a3, c02, t1
FSUB c04, t1, c04
LDF [AO + 10 * SIZE], a1
LDF [AO + 11 * SIZE], a2
FMUL a1, c03, c03
FMUL a2, c03, t1
FSUB c04, t1, c04
LDF [AO + 15 * SIZE], a1
FMUL a1, c04, c04
#endif
/* RN/RT: a single column, so all four results are scaled by BO[0]. */
#ifdef RN
LDF [BO + 0 * SIZE], a1
FMUL a1, c01, c01
FMUL a1, c02, c02
FMUL a1, c03, c03
FMUL a1, c04, c04
#endif
#ifdef RT
LDF [BO + 0 * SIZE], a1
FMUL a1, c01, c01
FMUL a1, c02, c02
FMUL a1, c03, c03
FMUL a1, c04, c04
#endif
#ifdef LN
/* LN: move the output pointer back four elements before storing. */
add C1, -4 * SIZE, C1
#endif
/* Write results to the packed buffer and to the output column. */
#if defined(LN) || defined(LT)
STF c01, [BO + 0 * SIZE]
STF c02, [BO + 1 * SIZE]
STF c03, [BO + 2 * SIZE]
STF c04, [BO + 3 * SIZE]
#else
STF c01, [AO + 0 * SIZE]
STF c02, [AO + 1 * SIZE]
STF c03, [AO + 2 * SIZE]
STF c04, [AO + 3 * SIZE]
#endif
STF c01, [C1 + 0 * SIZE]
STF c02, [C1 + 1 * SIZE]
STF c03, [C1 + 2 * SIZE]
STF c04, [C1 + 3 * SIZE]
/* Clear the product temporaries for the next tile. */
FMOV FZERO, t1
FMOV FZERO, t2
FMOV FZERO, t3
FMOV FZERO, t4
#ifndef LN
add C1, 4 * SIZE, C1
#endif
#ifdef RT
/* RT: AORIG += K << (2 + BASE_SHIFT). */
sll K, 2 + BASE_SHIFT, TEMP1
add AORIG, TEMP1, AORIG
#endif
#if defined(LT) || defined(RN)
/* LT/RN: skip the remaining K - KK steps:
   AO += (K - KK) << (2 + BASE_SHIFT), BO += (K - KK) << BASE_SHIFT. */
sub K, KK, TEMP1
sll TEMP1, 2 + BASE_SHIFT, TEMP2
sll TEMP1, 0 + BASE_SHIFT, TEMP1
add AO, TEMP2, AO
add BO, TEMP1, BO
#endif
#ifdef LT
add KK, 4, KK
#endif
#ifdef LN
sub KK, 4, KK
#endif
/* I--; next 4 x 1 tile while I > 0. */
add I, -1, I
cmp I, 0
bg,pt %icc, .LL221
nop
/* .LL299: end of the N & 1 pass.  Final adjustment of B and KK for the
   active variant before falling through to the exit.
   NOTE(review): BASE_SHIFT is presumably log2(SIZE) - confirm in common.h. */
.LL299:
#ifdef LN
/* LN: B += K << BASE_SHIFT. */
sll K, 0 + BASE_SHIFT, TEMP1
add B, TEMP1, B
#endif
#if defined(LT) || defined(RN)
/* LT/RN: continue B from wherever the tile code left BO. */
mov BO, B
#endif
#ifdef RN
add KK, 1, KK
#endif
#ifdef RT
sub KK, 1, KK
#endif
/* .LL999: common exit, also the target when the N & 1 pass has nothing
   to do.  "return %i7 + 8" jumps back to the caller; the clr in its
   delay slot zeroes %o0 (presumably the routine result, since on
   SPARC V9 the delay slot of return runs in the restored window -
   confirm against the manual).  EPILOGUE is a macro defined outside
   this view. */
.LL999:
return %i7 + 8
clr %o0
EPILOGUE