/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
/* Integer register roles.  M, N and K are the bounds from which the I, J and
   L loop counters are derived (shifted or masked copies of M, N and K). */
#define M %i0
#define N %i1
#define K %i2
/* 32-bit DOUBLE builds keep A in %i5 and reload B from the stack into %i4,
   because the entry code spills %i3/%i4 to memory in order to load ALPHA. */
#if defined(DOUBLE) && !defined(__64BIT__)
#define A %i5
#define B %i4
#else
#define A %i4
#define B %i5
#endif
/* C and LDC are loaded from the stack by the entry code. */
#define C %o4
#define LDC %o5
/* AO / BO: moving read pointers, initialised from A and B. */
#define AO %l0
#define BO %l1
/* I, J, L: loop counters (I from M, J from N, L from K). */
#define I %l2
#define J %l3
#define L %l4
/* C1..C4: read/write pointers into C, spaced LDC apart. */
#define C1 %o0
#define C2 %o1
#define C3 %o2
#define C4 %o3
/* OFFSET and KK are only set up when TRMMKERNEL is defined; TEMP1/TEMP2 are
   scratch for the address arithmetic in those paths (TEMP2 reuses %i3). */
#define OFFSET %l5
#define KK %l6
#define TEMP1 %l7
#define TEMP2 %i3
/* Floating-point register roles:
     c01..c16  accumulators
     t1..t4    products waiting to be added into an accumulator
     a1..a5    values loaded through AO (also reused to stage C on write-back)
     b1..b5    values loaded through BO (likewise reused on write-back)
     FZERO     register copied from whenever a zero is needed
     ALPHA     factor every accumulator is multiplied by before being stored
   DOUBLE builds use even-numbered registers %f0..%f62, other builds use
   %f0..%f31. */
#ifdef DOUBLE
#define c01 %f0
#define c02 %f2
#define c03 %f4
#define c04 %f6
#define c05 %f8
#define c06 %f10
#define c07 %f12
#define c08 %f14
#define c09 %f16
#define c10 %f18
#define c11 %f20
#define c12 %f22
#define c13 %f24
#define c14 %f26
#define c15 %f28
#define c16 %f30
#define t1 %f32
#define t2 %f34
#define t3 %f36
#define t4 %f38
#define a1 %f40
#define a2 %f42
#define a3 %f44
#define a4 %f46
#define a5 %f58
#define b1 %f48
#define b2 %f50
#define b3 %f52
#define b4 %f54
#define b5 %f56
#define FZERO %f60
#define ALPHA %f62
#else
#define c01 %f0
#define c02 %f1
#define c03 %f2
#define c04 %f3
#define c05 %f4
#define c06 %f5
#define c07 %f6
#define c08 %f7
#define c09 %f8
#define c10 %f9
#define c11 %f10
#define c12 %f11
#define c13 %f12
#define c14 %f13
#define c15 %f14
#define c16 %f15
#define t1 %f16
#define t2 %f17
#define t3 %f18
#define t4 %f19
#define a1 %f20
#define a2 %f21
#define a3 %f22
#define a4 %f23
#define a5 %f31
#define b1 %f24
#define b2 %f25
#define b3 %f26
#define b4 %f27
#define b5 %f28
#define FZERO %f29
#define ALPHA %f30
#endif
/* Entry sequence.  PROLOGUE and SAVESP are macros defined outside this chunk
   (presumably via common.h; SAVESP presumably opens the register window --
   confirm there).  The code below fetches the arguments that are not already
   in their working registers, sets up ALPHA, and starts the N >> 2 loop. */
PROLOGUE
SAVESP
nop
#ifndef __64BIT__
#ifdef DOUBLE
/* 32-bit DOUBLE: %i3 and %i4 are written to adjacent stack words so the LDF
   further down can read them back as ALPHA; B, C, LDC (and OFFSET for TRMM)
   are loaded from the stack. */
st %i3, [%sp + STACK_START + 16] /* ALPHA */
st %i4, [%sp + STACK_START + 20]
ld [%sp + STACK_START + 28], B
ld [%sp + STACK_START + 32], C
ld [%sp + STACK_START + 36], LDC
#ifdef TRMMKERNEL
ld [%sp + STACK_START + 40], OFFSET
#endif
#else
/* 32-bit non-DOUBLE: only %i3 is spilled for ALPHA; A and B stay in their
   registers, C and LDC (and OFFSET for TRMM) come from the stack. */
st %i3, [%sp + STACK_START + 16] /* ALPHA */
ld [%sp + STACK_START + 28], C
ld [%sp + STACK_START + 32], LDC
#ifdef TRMMKERNEL
ld [%sp + STACK_START + 36], OFFSET
#endif
#endif
/* Reload the spilled word(s) into the floating-point ALPHA register. */
LDF [%sp + STACK_START + 16], ALPHA
#else
/* 64-bit: C, LDC (and OFFSET for TRMM) are loaded from the stack with ldx;
   ALPHA is copied from %f6 (DOUBLE) or %f7 (otherwise). */
ldx [%sp+ STACK_START + 56], C
ldx [%sp+ STACK_START + 64], LDC
#ifdef TRMMKERNEL
ldx [%sp+ STACK_START + 72], OFFSET
#endif
#ifdef DOUBLE
FMOV %f6, ALPHA
#else
FMOV %f7, ALPHA
#endif
#endif
/* NOTE(review): FCLR is defined elsewhere; presumably this clears the register
   aliased as FZERO (%f29, or %f60 whose 5-bit encoding would be 29 in DOUBLE
   builds) -- confirm against the macro definition. */
FCLR(29)
#if defined(TRMMKERNEL) && !defined(LEFT)
/* TRMM, !LEFT: KK starts at -OFFSET. */
neg OFFSET, KK
#endif
/* J = N >> 2.  If J <= 0 skip to .LL100. */
sra N, 2, J
cmp J, 0
ble,pn %icc, .LL100
/* Delay slot of a non-annulled branch, so it runs on both paths:
   LDC <<= BASE_SHIFT (presumably converting an element count to a byte
   stride -- confirm BASE_SHIFT in common.h). */
sll LDC, BASE_SHIFT, LDC
.LL11:
/* Top of the J loop (J = N >> 2 passes).  Sets
     C1 = C, C2 = C + LDC, C3 = C + 2*LDC, C4 = C + 3*LDC, C = C + 4*LDC,
     AO = A, I = M >> 2, L = K >> 2, t1..t4 = 0,
   and, for TRMM with LEFT, KK = OFFSET.  The integer and floating-point
   instructions are interleaved; the order is kept as written. */
add C, LDC, C2
FMOV FZERO, t1
nop
mov C, C1
add C2, LDC, C3
FMOV FZERO, t2
sra K, 2, L
mov A, AO
sra M, 2, I
add C3, LDC, C4
FMOV FZERO, t3
#if defined(TRMMKERNEL) && defined(LEFT)
mov OFFSET, KK
#endif
cmp I, 0
add C4, LDC, C
FMOV FZERO, t4
/* If I <= 0, skip the 4-wide M loop and go to the M & 2 case at .LL50. */
ble,pn %icc, .LL50
/* Delay slot (runs on both paths): clear c01. */
FMOV FZERO, c01
.LL21:
#if !defined(TRMMKERNEL)
FMOV FZERO, c02
mov B, BO
FMOV FZERO, c03
cmp L, 0
#else
FMOV FZERO, c02
FMOV FZERO, c03
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov B, BO
#else
sll KK, 2 + BASE_SHIFT, TEMP1
add AO, TEMP1, AO
add B, TEMP1, BO
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub K, KK, L
#elif defined(LEFT)
add KK, 4, L
#else
add KK, 4, L
#endif
sra L, 2, L
cmp L, 0
#endif
LDF [AO + 0 * SIZE], a1
FMOV FZERO, c04
LDF [BO + 0 * SIZE], b1
FMOV FZERO, c05
LDF [AO + 1 * SIZE], a2
FMOV FZERO, c06
LDF [BO + 1 * SIZE], b2
FMOV FZERO, c07
LDF [AO + 2 * SIZE], a3
FMOV FZERO, c08
LDF [BO + 2 * SIZE], b3
FMOV FZERO, c09
LDF [AO + 3 * SIZE], a4
FMOV FZERO, c10
LDF [BO + 3 * SIZE], b4
FMOV FZERO, c11
LDF [BO + 4 * SIZE], b5 /* ***** */
LDF [AO + 4 * SIZE], a5 /* ***** */
prefetch [C1 + 3 * SIZE], 3
FMOV FZERO, c12
prefetch [C2 + 3 * SIZE], 3
FMOV FZERO, c13
prefetch [C3 + 3 * SIZE], 3
FMOV FZERO, c14
prefetch [C4 + 3 * SIZE], 3
FMOV FZERO, c15
ble,pn %icc, .LL25
FMOV FZERO, c16
#define APREFETCHSIZE 40
#define BPREFETCHSIZE 40
#define APREFETCH_CATEGORY 0
#define BPREFETCH_CATEGORY 0
.LL22:
FADD c04, t1, c04
prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY
FMUL a1, b1, t1
nop
FADD c08, t2, c08
prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY
FMUL a1, b2, t2
add AO, 16 * SIZE, AO
FADD c12, t3, c12
LDF [AO - 13 * SIZE], a4
FMUL a1, b3, t3
add BO, 16 * SIZE, BO
FADD c16, t4, c16
nop
FMUL a1, b4, t4
LDF [AO - 8 * SIZE], a1
FADD c01, t1, c01
nop
FMUL a2, b1, t1
nop
FADD c05, t2, c05
nop
FMUL a2, b2, t2
nop
FADD c09, t3, c09
nop
FMUL a2, b3, t3
nop
FADD c13, t4, c13
add L, -1, L
FMUL a2, b4, t4
LDF [AO - 11 * SIZE], a2
FADD c02, t1, c02
nop
FMUL a3, b1, t1
nop
FADD c06, t2, c06
nop
FMUL a3, b2, t2
nop
FADD c10, t3, c10
nop
FMUL a3, b3, t3
nop
FADD c14, t4, c14
nop
FMUL a3, b4, t4
LDF [AO - 10 * SIZE], a3
FADD c03, t1, c03
nop
FMUL a4, b1, t1
LDF [BO - 8 * SIZE], b1
FADD c07, t2, c07
nop
FMUL a4, b2, t2
LDF [BO - 11 * SIZE], b2
FADD c11, t3, c11
nop
FMUL a4, b3, t3
LDF [BO - 10 * SIZE], b3
FADD c15, t4, c15
nop
FMUL a4, b4, t4
LDF [BO - 9 * SIZE], b4
FADD c04, t1, c04
nop
FMUL a5, b5, t1
LDF [AO - 9 * SIZE], a4
FADD c08, t2, c08
nop
FMUL a5, b2, t2
nop
FADD c12, t3, c12
nop
FMUL a5, b3, t3
nop
FADD c16, t4, c16
nop
FMUL a5, b4, t4
LDF [AO - 4 * SIZE], a5
FADD c01, t1, c01
nop
FMUL a2, b5, t1
nop
FADD c05, t2, c05
nop
FMUL a2, b2, t2
nop
FADD c09, t3, c09
nop
FMUL a2, b3, t3
nop
FADD c13, t4, c13
nop
FMUL a2, b4, t4
LDF [AO - 7 * SIZE], a2
FADD c02, t1, c02
nop
FMUL a3, b5, t1
nop
FADD c06, t2, c06
nop
FMUL a3, b2, t2
nop
FADD c10, t3, c10
nop
FMUL a3, b3, t3
nop
FADD c14, t4, c14
nop
FMUL a3, b4, t4
LDF [AO - 6 * SIZE], a3
FADD c03, t1, c03
nop
FMUL a4, b5, t1
LDF [BO - 4 * SIZE], b5
FADD c07, t2, c07
nop
FMUL a4, b2, t2
LDF [BO - 7 * SIZE], b2
FADD c11, t3, c11
nop
FMUL a4, b3, t3
LDF [BO - 6 * SIZE], b3
FADD c15, t4, c15
nop
FMUL a4, b4, t4
LDF [BO - 5 * SIZE], b4
FADD c04, t1, c04
nop
FMUL a1, b1, t1
LDF [AO - 5 * SIZE], a4
FADD c08, t2, c08
nop
FMUL a1, b2, t2
nop
FADD c12, t3, c12
nop
FMUL a1, b3, t3
nop
FADD c16, t4, c16
nop
FMUL a1, b4, t4
LDF [AO - 0 * SIZE], a1
FADD c01, t1, c01
nop
FMUL a2, b1, t1
nop
#ifdef DOUBLE
prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY
#else
nop
#endif
FADD c05, t2, c05
nop
FMUL a2, b2, t2
FADD c09, t3, c09
nop
FMUL a2, b3, t3
nop
FADD c13, t4, c13
nop
FMUL a2, b4, t4
nop
FADD c02, t1, c02
nop
FMUL a3, b1, t1
LDF [AO - 3 * SIZE], a2
FADD c06, t2, c06
#ifdef DOUBLE
prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY
#else
nop
#endif
FMUL a3, b2, t2
nop
FADD c10, t3, c10
nop
FMUL a3, b3, t3
nop
FADD c14, t4, c14
nop
FMUL a3, b4, t4
LDF [AO - 2 * SIZE], a3
FADD c03, t1, c03
nop
FMUL a4, b1, t1
LDF [BO - 0 * SIZE], b1
FADD c07, t2, c07
nop
FMUL a4, b2, t2
LDF [BO - 3 * SIZE], b2
FADD c11, t3, c11
nop
FMUL a4, b3, t3
LDF [BO - 2 * SIZE], b3
FADD c15, t4, c15
nop
FMUL a4, b4, t4
LDF [BO - 1 * SIZE], b4
FADD c04, t1, c04
nop
FMUL a5, b5, t1
LDF [AO - 1 * SIZE], a4
FADD c08, t2, c08
FMUL a5, b2, t2
FADD c12, t3, c12
FMUL a5, b3, t3
FADD c16, t4, c16
nop
FMUL a5, b4, t4
LDF [AO + 4 * SIZE], a5
FADD c01, t1, c01
nop
FMUL a2, b5, t1
nop
FADD c05, t2, c05
nop
FMUL a2, b2, t2
nop
FADD c09, t3, c09
nop
FMUL a2, b3, t3
nop
FADD c13, t4, c13
nop
FMUL a2, b4, t4
LDF [AO + 1 * SIZE], a2
FADD c02, t1, c02
nop
FMUL a3, b5, t1
nop
FADD c06, t2, c06
nop
FMUL a3, b2, t2
nop
FADD c10, t3, c10
nop
FMUL a3, b3, t3
nop
FADD c14, t4, c14
nop
FMUL a3, b4, t4
LDF [AO + 2 * SIZE], a3
FADD c03, t1, c03
cmp L, 0
FMUL a4, b5, t1
LDF [BO + 4 * SIZE], b5
FADD c07, t2, c07
nop
FMUL a4, b2, t2
LDF [BO + 1 * SIZE], b2
FADD c11, t3, c11
nop
FMUL a4, b3, t3
LDF [BO + 2 * SIZE], b3
FADD c15, t4, c15
FMUL a4, b4, t4
bg,pt %icc, .LL22
LDF [BO + 3 * SIZE], b4
.LL25:
/* Leftover iterations after the 4x-unrolled .LL22 loop: L = count & 3, where
   count is K in the non-TRMM build and either K - KK or KK + 4 in TRMM
   builds (the same selection used before the unrolled loop). */
#ifndef TRMMKERNEL
and K, 3, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub K, KK, L
#elif defined(LEFT)
add KK, 4, L
#else
add KK, 4, L
#endif
and L, 3, L
#endif
/* Nothing left over: go straight to the write-back at .LL29. */
cmp L, 0
ble,a,pn %icc, .LL29
nop
.LL26:
FADD c04, t1, c04
LDF [AO + 3 * SIZE], a4
FMUL a1, b1, t1
add AO, 4 * SIZE, AO
FADD c08, t2, c08
add BO, 4 * SIZE, BO
FMUL a1, b2, t2
add L, -1, L
FADD c12, t3, c12
nop
FMUL a1, b3, t3
cmp L, 0
FADD c16, t4, c16
nop
FMUL a1, b4, t4
LDF [AO + 0 * SIZE], a1
FADD c01, t1, c01
nop
FMUL a2, b1, t1
nop
FADD c05, t2, c05
nop
FMUL a2, b2, t2
nop
FADD c09, t3, c09
nop
FMUL a2, b3, t3
nop
FADD c13, t4, c13
nop
FMUL a2, b4, t4
LDF [AO + 1 * SIZE], a2
FADD c02, t1, c02
nop
FMUL a3, b1, t1
nop
FADD c06, t2, c06
nop
FMUL a3, b2, t2
nop
FADD c10, t3, c10
nop
FMUL a3, b3, t3
nop
FADD c14, t4, c14
nop
FMUL a3, b4, t4
LDF [AO + 2 * SIZE], a3
FADD c03, t1, c03
nop
FMUL a4, b1, t1
LDF [BO + 0 * SIZE], b1
FADD c07, t2, c07
nop
FMUL a4, b2, t2
LDF [BO + 1 * SIZE], b2
FADD c11, t3, c11
nop
FMUL a4, b3, t3
LDF [BO + 2 * SIZE], b3
FADD c15, t4, c15
FMUL a4, b4, t4
bg,pt %icc, .LL26
LDF [BO + 3 * SIZE], b4
.LL29:
#ifndef TRMMKERNEL
FADD c04, t1, c04
add I, -1, I
FMUL c01, ALPHA, c01
LDF [C1 + 0 * SIZE], a1
FADD c08, t2, c08
cmp I, 0
FMUL c02, ALPHA, c02
LDF [C1 + 1 * SIZE], a2
FADD c12, t3, c12
nop
FMUL c03, ALPHA, c03
LDF [C1 + 2 * SIZE], a3
FADD c16, t4, c16
nop
FMUL c04, ALPHA, c04
LDF [C1 + 3 * SIZE], a4
FMUL c05, ALPHA, c05
LDF [C2 + 0 * SIZE], b1
FMUL c06, ALPHA, c06
LDF [C2 + 1 * SIZE], b2
FMUL c07, ALPHA, c07
LDF [C2 + 2 * SIZE], b3
FMUL c08, ALPHA, c08
LDF [C2 + 3 * SIZE], b4
FMUL c09, ALPHA, c09
LDF [C3 + 0 * SIZE], t1
FMUL c10, ALPHA, c10
LDF [C3 + 1 * SIZE], t2
FMUL c11, ALPHA, c11
LDF [C3 + 2 * SIZE], t3
FMUL c12, ALPHA, c12
LDF [C3 + 3 * SIZE], t4
FMUL c13, ALPHA, c13
add C1, 4 * SIZE, C1
FADD c01, a1, c01
LDF [C4 + 0 * SIZE], a1
FMUL c14, ALPHA, c14
add C2, 4 * SIZE, C2
FADD c02, a2, c02
LDF [C4 + 1 * SIZE], a2
FMUL c15, ALPHA, c15
add C3, 4 * SIZE, C3
FADD c03, a3, c03
LDF [C4 + 2 * SIZE], a3
FMUL c16, ALPHA, c16
nop
FADD c04, a4, c04
LDF [C4 + 3 * SIZE], a4
STF c01, [C1 - 4 * SIZE]
FADD c05, b1, c05
STF c02, [C1 - 3 * SIZE]
FADD c06, b2, c06
STF c03, [C1 - 2 * SIZE]
FADD c07, b3, c07
STF c04, [C1 - 1 * SIZE]
FADD c08, b4, c08
STF c05, [C2 - 4 * SIZE]
FADD c09, t1, c09
STF c06, [C2 - 3 * SIZE]
FADD c10, t2, c10
STF c07, [C2 - 2 * SIZE]
FADD c11, t3, c11
STF c08, [C2 - 1 * SIZE]
FADD c12, t4, c12
STF c09, [C3 - 4 * SIZE]
FADD c13, a1, c13
STF c10, [C3 - 3 * SIZE]
FADD c14, a2, c14
STF c11, [C3 - 2 * SIZE]
FADD c15, a3, c15
STF c12, [C3 - 1 * SIZE]
FADD c16, a4, c16
STF c13, [C4 + 0 * SIZE]
FMOV FZERO, t1
STF c14, [C4 + 1 * SIZE]
FMOV FZERO, t2
STF c15, [C4 + 2 * SIZE]
FMOV FZERO, t3
STF c16, [C4 + 3 * SIZE]
FMOV FZERO, t4
add C4, 4 * SIZE, C4
#else
FADD c04, t1, c04
FMUL c01, ALPHA, c01
FADD c08, t2, c08
FMUL c02, ALPHA, c02
FADD c12, t3, c12
FMUL c03, ALPHA, c03
FADD c16, t4, c16
FMUL c04, ALPHA, c04
STF c01, [C1 + 0 * SIZE]
FMUL c05, ALPHA, c05
STF c02, [C1 + 1 * SIZE]
FMUL c06, ALPHA, c06
STF c03, [C1 + 2 * SIZE]
FMUL c07, ALPHA, c07
STF c04, [C1 + 3 * SIZE]
FMUL c08, ALPHA, c08
STF c05, [C2 + 0 * SIZE]
FMUL c09, ALPHA, c09
STF c06, [C2 + 1 * SIZE]
FMUL c10, ALPHA, c10
STF c07, [C2 + 2 * SIZE]
FMUL c11, ALPHA, c11
STF c08, [C2 + 3 * SIZE]
FMUL c12, ALPHA, c12
STF c09, [C3 + 0 * SIZE]
FMUL c13, ALPHA, c13
STF c10, [C3 + 1 * SIZE]
FMUL c14, ALPHA, c14
STF c11, [C3 + 2 * SIZE]
FMUL c15, ALPHA, c15
STF c12, [C3 + 3 * SIZE]
FMUL c16, ALPHA, c16
STF c13, [C4 + 0 * SIZE]
STF c14, [C4 + 1 * SIZE]
STF c15, [C4 + 2 * SIZE]
STF c16, [C4 + 3 * SIZE]
FMOV FZERO, t1
FMOV FZERO, t2
FMOV FZERO, t3
FMOV FZERO, t4
add C1, 4 * SIZE, C1
add C2, 4 * SIZE, C2
add C3, 4 * SIZE, C3
add C4, 4 * SIZE, C4
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
sub K, KK, TEMP1
#ifdef LEFT
add TEMP1, -4, TEMP1
#else
add TEMP1, -4, TEMP1
#endif
sll TEMP1, 2 + BASE_SHIFT, TEMP1
add AO, TEMP1, AO
add BO, TEMP1, BO
#endif
#ifdef LEFT
add KK, 4, KK
#endif
add I, -1, I
cmp I, 0
#endif
sra K, 2, L
bg,pt %icc, .LL21
FMOV FZERO, c01
.LL50:
and M, 2, I
FMOV FZERO, c02
cmp I, 0
FMOV FZERO, t1
ble,pn %icc, .LL70
FMOV FZERO, c04
#if !defined(TRMMKERNEL)
LDF [AO + 0 * SIZE], a1
sra K, 2, L
FMOV FZERO, t2
LDF [B + 0 * SIZE], b1
mov B, BO
FMOV FZERO, c06
LDF [AO + 1 * SIZE], a2
cmp L, 0
FMOV FZERO, t3
LDF [B + 1 * SIZE], b2
FMOV FZERO, c08
LDF [AO + 2 * SIZE], a3
FMOV FZERO, t4
LDF [B + 2 * SIZE], b3
FMOV FZERO, c01
LDF [AO + 3 * SIZE], a4
FMOV FZERO, c03
LDF [B + 3 * SIZE], b4
FMOV FZERO, c05
#else
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov B, BO
#else
sll KK, 1 + BASE_SHIFT, TEMP1
sll KK, 2 + BASE_SHIFT, TEMP2
add AO, TEMP1, AO
add B, TEMP2, BO
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub K, KK, L
#elif defined(LEFT)
add KK, 2, L
#else
add KK, 4, L
#endif
sra L, 2, L
cmp L, 0
LDF [AO + 0 * SIZE], a1
FMOV FZERO, t2
LDF [BO + 0 * SIZE], b1
FMOV FZERO, c06
LDF [AO + 1 * SIZE], a2
FMOV FZERO, t3
LDF [BO + 1 * SIZE], b2
FMOV FZERO, c08
LDF [AO + 2 * SIZE], a3
FMOV FZERO, t4
LDF [BO + 2 * SIZE], b3
FMOV FZERO, c01
LDF [AO + 3 * SIZE], a4
FMOV FZERO, c03
LDF [BO + 3 * SIZE], b4
FMOV FZERO, c05
#endif
ble,pn %icc, .LL55
FMOV FZERO, c07
.LL52:
FADD c02, t1, c02
add AO, 8 * SIZE, AO
prefetch [AO + APREFETCHSIZE * SIZE], 0
FMUL a1, b1, t1
add BO, 16 * SIZE, BO
FADD c04, t2, c04
add L, -1, L
FMUL a1, b2, t2
FADD c06, t3, c06
cmp L, 0
FMUL a1, b3, t3
FADD c08, t4, c08
FMUL a1, b4, t4
LDF [AO - 4 * SIZE], a1
FADD c01, t1, c01
FMUL a2, b1, t1
LDF [BO - 12 * SIZE], b1
FADD c03, t2, c03
FMUL a2, b2, t2
LDF [BO - 11 * SIZE], b2
FADD c05, t3, c05
FMUL a2, b3, t3
LDF [BO - 10 * SIZE], b3
FADD c07, t4, c07
FMUL a2, b4, t4
LDF [BO - 9 * SIZE], b4
FADD c02, t1, c02
FMUL a3, b1, t1
LDF [AO - 3 * SIZE], a2
FADD c04, t2, c04
FMUL a3, b2, t2
FADD c06, t3, c06
FMUL a3, b3, t3
FADD c08, t4, c08
FMUL a3, b4, t4
LDF [AO - 2 * SIZE], a3
FADD c01, t1, c01
FMUL a4, b1, t1
LDF [BO - 8 * SIZE], b1
FADD c03, t2, c03
FMUL a4, b2, t2
LDF [BO - 7 * SIZE], b2
FADD c05, t3, c05
FMUL a4, b3, t3
LDF [BO - 6 * SIZE], b3
FADD c07, t4, c07
FMUL a4, b4, t4
LDF [BO - 5 * SIZE], b4
FADD c02, t1, c02
FMUL a1, b1, t1
LDF [AO - 1 * SIZE], a4
FADD c04, t2, c04
FMUL a1, b2, t2
FADD c06, t3, c06
FMUL a1, b3, t3
FADD c08, t4, c08
FMUL a1, b4, t4
LDF [AO + 0 * SIZE], a1
FADD c01, t1, c01
FMUL a2, b1, t1
LDF [BO - 4 * SIZE], b1
FADD c03, t2, c03
FMUL a2, b2, t2
LDF [BO - 3 * SIZE], b2
FADD c05, t3, c05
FMUL a2, b3, t3
LDF [BO - 2 * SIZE], b3
FADD c07, t4, c07
FMUL a2, b4, t4
LDF [BO - 1 * SIZE], b4
FADD c02, t1, c02
FMUL a3, b1, t1
LDF [AO + 1 * SIZE], a2
FADD c04, t2, c04
FMUL a3, b2, t2
FADD c06, t3, c06
FMUL a3, b3, t3
FADD c08, t4, c08
FMUL a3, b4, t4
LDF [AO + 2 * SIZE], a3
FADD c01, t1, c01
FMUL a4, b1, t1
LDF [BO + 0 * SIZE], b1
FADD c03, t2, c03
FMUL a4, b2, t2
LDF [BO + 1 * SIZE], b2
FADD c05, t3, c05
FMUL a4, b3, t3
LDF [BO + 2 * SIZE], b3
FADD c07, t4, c07
FMUL a4, b4, t4
LDF [BO + 3 * SIZE], b4
bg,pt %icc, .LL52
LDF [AO + 3 * SIZE], a4
.LL55:
#ifndef TRMMKERNEL
and K, 3, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub K, KK, L
#elif defined(LEFT)
add KK, 2, L
#else
add KK, 4, L
#endif
and L, 3, L
#endif
cmp L, 0
ble,a,pn %icc, .LL59
nop
.LL56:
FADD c02, t1, c02
add AO, 2 * SIZE, AO
FMUL a1, b1, t1
add L, -1, L
add BO, 4 * SIZE, BO
FADD c04, t2, c04
cmp L, 0
FMUL a1, b2, t2
FADD c06, t3, c06
FMUL a1, b3, t3
FADD c08, t4, c08
FMUL a1, b4, t4
LDF [AO + 0 * SIZE], a1
FADD c01, t1, c01
FMUL a2, b1, t1
LDF [BO + 0 * SIZE], b1
FADD c03, t2, c03
FMUL a2, b2, t2
LDF [BO + 1 * SIZE], b2
FADD c05, t3, c05
FMUL a2, b3, t3
LDF [BO + 2 * SIZE], b3
FADD c07, t4, c07
FMUL a2, b4, t4
LDF [BO + 3 * SIZE], b4
bg,pt %icc, .LL56
LDF [AO + 1 * SIZE], a2
.LL59:
#ifndef TRMMKERNEL
FADD c02, t1, c02
FMUL c01, ALPHA, c01
LDF [C1 + 0 * SIZE], a1
FADD c04, t2, c04
FMUL c03, ALPHA, c03
LDF [C1 + 1 * SIZE], a2
FADD c06, t3, c06
FMUL c05, ALPHA, c05
LDF [C2 + 0 * SIZE], a3
FADD c08, t4, c08
FMUL c07, ALPHA, c07
LDF [C2 + 1 * SIZE], a4
FMUL c02, ALPHA, c02
FADD c01, a1, c01
LDF [C3 + 0 * SIZE], b1
FMUL c04, ALPHA, c04
FADD c02, a2, c02
LDF [C3 + 1 * SIZE], b2
FMUL c06, ALPHA, c06
FADD c03, a3, c03
LDF [C4 + 0 * SIZE], b3
FMUL c08, ALPHA, c08
FADD c04, a4, c04
LDF [C4 + 1 * SIZE], b4
STF c01, [C1 + 0 * SIZE]
FADD c05, b1, c05
STF c02, [C1 + 1 * SIZE]
FADD c06, b2, c06
add C1, 2 * SIZE, C1
STF c03, [C2 + 0 * SIZE]
FADD c07, b3, c07
STF c04, [C2 + 1 * SIZE]
FADD c08, b4, c08
add C2, 2 * SIZE, C2
STF c05, [C3 + 0 * SIZE]
STF c06, [C3 + 1 * SIZE]
add C3, 2 * SIZE, C3
STF c07, [C4 + 0 * SIZE]
STF c08, [C4 + 1 * SIZE]
add C4, 2 * SIZE, C4
#else
FADD c02, t1, c02
FADD c04, t2, c04
FADD c06, t3, c06
FADD c08, t4, c08
FMUL c01, ALPHA, c01
FMUL c03, ALPHA, c03
FMUL c05, ALPHA, c05
FMUL c07, ALPHA, c07
FMUL c02, ALPHA, c02
FMUL c04, ALPHA, c04
FMUL c06, ALPHA, c06
FMUL c08, ALPHA, c08
STF c01, [C1 + 0 * SIZE]
STF c02, [C1 + 1 * SIZE]
STF c03, [C2 + 0 * SIZE]
STF c04, [C2 + 1 * SIZE]
STF c05, [C3 + 0 * SIZE]
STF c06, [C3 + 1 * SIZE]
STF c07, [C4 + 0 * SIZE]
STF c08, [C4 + 1 * SIZE]
add C1, 2 * SIZE, C1
add C2, 2 * SIZE, C2
add C3, 2 * SIZE, C3
add C4, 2 * SIZE, C4
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
sub K, KK, TEMP1
#ifdef LEFT
add TEMP1, -2, TEMP1
#else
add TEMP1, -4, TEMP1
#endif
sll TEMP1, 1 + BASE_SHIFT, TEMP2
sll TEMP1, 2 + BASE_SHIFT, TEMP1
add AO, TEMP2, AO
add BO, TEMP1, BO
#endif
#ifdef LEFT
add KK, 2, KK
#endif
#endif
.LL70:
and M, 1, I
cmp I, 0
ble,pn %icc, .LL99
nop
.LL71:
#if !defined(TRMMKERNEL)
LDF [AO + 0 * SIZE], a1
sra K, 2, L
FMOV FZERO, c01
LDF [B + 0 * SIZE], b1
mov B, BO
FMOV FZERO, t1
LDF [AO + 1 * SIZE], a2
cmp L, 0
FMOV FZERO, c02
LDF [B + 1 * SIZE], b2
FMOV FZERO, t2
LDF [AO + 2 * SIZE], a3
FMOV FZERO, c03
LDF [B + 2 * SIZE], b3
FMOV FZERO, t3
LDF [AO + 3 * SIZE], a4
FMOV FZERO, c04
LDF [B + 3 * SIZE], b4
FMOV FZERO, t4
#else
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov B, BO
#else
sll KK, 0 + BASE_SHIFT, TEMP1
sll KK, 2 + BASE_SHIFT, TEMP2
add AO, TEMP1, AO
add B, TEMP2, BO
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub K, KK, L
#elif defined(LEFT)
add KK, 1, L
#else
add KK, 4, L
#endif
sra L, 2, L
cmp L, 0
LDF [AO + 0 * SIZE], a1
FMOV FZERO, c01
LDF [BO + 0 * SIZE], b1
FMOV FZERO, t1
LDF [AO + 1 * SIZE], a2
FMOV FZERO, c02
LDF [BO + 1 * SIZE], b2
FMOV FZERO, t2
LDF [AO + 2 * SIZE], a3
FMOV FZERO, c03
LDF [BO + 2 * SIZE], b3
FMOV FZERO, t3
LDF [AO + 3 * SIZE], a4
FMOV FZERO, c04
LDF [BO + 3 * SIZE], b4
FMOV FZERO, t4
#endif
ble,pn %icc, .LL75
nop
.LL72:
FADD c01, t1, c01
add L, -1, L
FMUL a1, b1, t1
LDF [BO + 4 * SIZE], b1
FADD c02, t2, c02
cmp L, 0
FMUL a1, b2, t2
LDF [BO + 5 * SIZE], b2
FADD c03, t3, c03
FMUL a1, b3, t3
LDF [BO + 6 * SIZE], b3
FADD c04, t4, c04
FMUL a1, b4, t4
LDF [BO + 7 * SIZE], b4
LDF [AO + 4 * SIZE], a1
FADD c01, t1, c01
add AO, 4 * SIZE, AO
FMUL a2, b1, t1
LDF [BO + 8 * SIZE], b1
FADD c02, t2, c02
FMUL a2, b2, t2
LDF [BO + 9 * SIZE], b2
FADD c03, t3, c03
FMUL a2, b3, t3
LDF [BO + 10 * SIZE], b3
FADD c04, t4, c04
FMUL a2, b4, t4
LDF [BO + 11 * SIZE], b4
LDF [AO + 1 * SIZE], a2
FADD c01, t1, c01
FMUL a3, b1, t1
LDF [BO + 12 * SIZE], b1
FADD c02, t2, c02
FMUL a3, b2, t2
LDF [BO + 13 * SIZE], b2
FADD c03, t3, c03
FMUL a3, b3, t3
LDF [BO + 14 * SIZE], b3
FADD c04, t4, c04
FMUL a3, b4, t4
LDF [BO + 15 * SIZE], b4
LDF [AO + 2 * SIZE], a3
FADD c01, t1, c01
FMUL a4, b1, t1
LDF [BO + 16 * SIZE], b1
FADD c02, t2, c02
FMUL a4, b2, t2
LDF [BO + 17 * SIZE], b2
FADD c03, t3, c03
FMUL a4, b3, t3
LDF [BO + 18 * SIZE], b3
FADD c04, t4, c04
FMUL a4, b4, t4
LDF [BO + 19 * SIZE], b4
add BO, 16 * SIZE, BO
bg,pt %icc, .LL72
LDF [AO + 3 * SIZE], a4
.LL75:
#ifndef TRMMKERNEL
and K, 3, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub K, KK, L
#elif defined(LEFT)
add KK, 1, L
#else
add KK, 4, L
#endif
and L, 3, L
#endif
cmp L, 0
ble,a,pn %icc, .LL79
nop
.LL76:
FADD c01, t1, c01
add AO, 1 * SIZE, AO
FMUL a1, b1, t1
LDF [BO + 4 * SIZE], b1
FADD c02, t2, c02
add L, -1, L
FMUL a1, b2, t2
LDF [BO + 5 * SIZE], b2
FADD c03, t3, c03
cmp L, 0
FMUL a1, b3, t3
LDF [BO + 6 * SIZE], b3
FADD c04, t4, c04
add BO, 4 * SIZE, BO
FMUL a1, b4, t4
LDF [AO + 0 * SIZE], a1
bg,pt %icc, .LL76
LDF [BO + 3 * SIZE], b4
.LL79:
#ifndef TRMMKERNEL
FADD c01, t1, c01
LDF [C1 + 0 * SIZE], a1
FADD c02, t2, c02
LDF [C2 + 0 * SIZE], a2
FADD c03, t3, c03
LDF [C3 + 0 * SIZE], a3
FADD c04, t4, c04
LDF [C4 + 0 * SIZE], a4
FMUL c01, ALPHA, c01
FMUL c02, ALPHA, c02
FMUL c03, ALPHA, c03
FMUL c04, ALPHA, c04
FADD c01, a1, c01
FADD c02, a2, c02
FADD c03, a3, c03
FADD c04, a4, c04
STF c01, [C1 + 0 * SIZE]
STF c02, [C2 + 0 * SIZE]
STF c03, [C3 + 0 * SIZE]
STF c04, [C4 + 0 * SIZE]
#else
FADD c01, t1, c01
FADD c02, t2, c02
FADD c03, t3, c03
FADD c04, t4, c04
FMUL c01, ALPHA, c01
FMUL c02, ALPHA, c02
FMUL c03, ALPHA, c03
FMUL c04, ALPHA, c04
STF c01, [C1 + 0 * SIZE]
STF c02, [C2 + 0 * SIZE]
STF c03, [C3 + 0 * SIZE]
STF c04, [C4 + 0 * SIZE]
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
sub K, KK, TEMP1
#ifdef LEFT
add TEMP1, -1, TEMP1
#else
add TEMP1, -4, TEMP1
#endif
sll TEMP1, 0 + BASE_SHIFT, TEMP2
sll TEMP1, 2 + BASE_SHIFT, TEMP1
add AO, TEMP2, AO
add BO, TEMP1, BO
#endif
#ifdef LEFT
add KK, 1, KK
#endif
#endif
.LL99:
/* Bottom of the J loop: J -= 1, B = BO (B continues from where BO stopped),
   and loop back to .LL11 while J > 0. */
add J, -1, J
mov BO, B
cmp J, 0
bg,pt %icc, .LL11
/* Delay slot (runs on both paths): KK += 4 for TRMM without LEFT, else nop. */
#if defined(TRMMKERNEL) && !defined(LEFT)
add KK, 4, KK
#else
nop
#endif
.LL100: /* n & 2 */
/* Entry to the N & 2 section.  I = M >> 2, J = N & 2, C2 = C + LDC.
   If J is zero, skip to .LL200. */
sra M, 2, I
and N, 2, J
cmp J, 0
add C, LDC, C2
ble,pn %icc, .LL200
/* Delay slot (runs on both paths): AO = A. */
mov A, AO
#if defined(TRMMKERNEL) && defined(LEFT)
mov OFFSET, KK
#endif
/* C1 = C, then C advances by 2*LDC (C = C2 + LDC). */
mov C, C1
add C2, LDC, C
/* If I <= 0, skip the 4-wide M loop and go to .LL150. */
cmp I, 0
ble,pn %icc, .LL150
/* Delay slot (runs on both paths): clear c03. */
FMOV FZERO, c03
.LL121:
#if !defined(TRMMKERNEL)
LDF [AO + 0 * SIZE], a1
sra K, 2, L
FMOV FZERO, t1
LDF [B + 0 * SIZE], b1
mov B, BO
FMOV FZERO, c07
LDF [AO + 1 * SIZE], a2
cmp L, 0
FMOV FZERO, t2
LDF [B + 1 * SIZE], b2
FMOV FZERO, c04
LDF [AO + 2 * SIZE], a3
FMOV FZERO, t3
LDF [B + 2 * SIZE], b3
FMOV FZERO, c08
LDF [AO + 3 * SIZE], a4
FMOV FZERO, t4
LDF [B + 3 * SIZE], b4
FMOV FZERO, c01
prefetch [C1 + 3 * SIZE], 2
FMOV FZERO, c05
prefetch [C2 + 3 * SIZE], 2
FMOV FZERO, c02
#else
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov B, BO
#else
sll KK, 2 + BASE_SHIFT, TEMP1
sll KK, 1 + BASE_SHIFT, TEMP2
add AO, TEMP1, AO
add B, TEMP2, BO
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub K, KK, L
#elif defined(LEFT)
add KK, 4, L
#else
add KK, 2, L
#endif
sra L, 2, L
cmp L, 0
LDF [AO + 0 * SIZE], a1
FMOV FZERO, t1
LDF [BO + 0 * SIZE], b1
FMOV FZERO, c07
LDF [AO + 1 * SIZE], a2
FMOV FZERO, t2
LDF [BO + 1 * SIZE], b2
FMOV FZERO, c04
LDF [AO + 2 * SIZE], a3
FMOV FZERO, t3
LDF [BO + 2 * SIZE], b3
FMOV FZERO, c08
LDF [AO + 3 * SIZE], a4
FMOV FZERO, t4
LDF [BO + 3 * SIZE], b4
FMOV FZERO, c01
prefetch [C1 + 3 * SIZE], 2
FMOV FZERO, c05
prefetch [C2 + 3 * SIZE], 2
FMOV FZERO, c02
#endif
ble,pn %icc, .LL125
FMOV FZERO, c06
.LL122:
FADD c03, t1, c03
add L, -1, L
FMUL a1, b1, t1
prefetch [AO + APREFETCHSIZE * SIZE], 0
FADD c07, t2, c07
add BO, 8 * SIZE, BO
FMUL a1, b2, t2
LDF [AO + 4 * SIZE], a1
FADD c04, t3, c04
add AO, 16 * SIZE, AO
FMUL a2, b1, t3
cmp L, 0
FADD c08, t4, c08
nop
FMUL a2, b2, t4
LDF [AO - 11 * SIZE], a2
FADD c01, t1, c01
nop
FMUL a3, b1, t1
nop
FADD c05, t2, c05
nop
FMUL a3, b2, t2
LDF [AO - 10 * SIZE], a3
FADD c02, t3, c02
nop
FMUL a4, b1, t3
LDF [BO - 4 * SIZE], b1
FADD c06, t4, c06
nop
FMUL a4, b2, t4
LDF [BO - 3 * SIZE], b2
FADD c03, t1, c03
nop
FMUL a1, b3, t1
LDF [AO - 9 * SIZE], a4
FADD c07, t2, c07
nop
FMUL a1, b4, t2
LDF [AO - 8 * SIZE], a1
FADD c04, t3, c04
nop
FMUL a2, b3, t3
nop
FADD c08, t4, c08
nop
FMUL a2, b4, t4
LDF [AO - 7 * SIZE], a2
FADD c01, t1, c01
nop
FMUL a3, b3, t1
nop
FADD c05, t2, c05
nop
FMUL a3, b4, t2
LDF [AO - 6 * SIZE], a3
FADD c02, t3, c02
nop
FMUL a4, b3, t3
LDF [BO - 2 * SIZE], b3
FADD c06, t4, c06
nop
FMUL a4, b4, t4
LDF [BO - 1 * SIZE], b4
FADD c03, t1, c03
nop
FMUL a1, b1, t1
LDF [AO - 5 * SIZE], a4
FADD c07, t2, c07
nop
FMUL a1, b2, t2
LDF [AO - 4 * SIZE], a1
FADD c04, t3, c04
nop
FMUL a2, b1, t3
nop
FADD c08, t4, c08
nop
FMUL a2, b2, t4
LDF [AO - 3 * SIZE], a2
FADD c01, t1, c01
nop
FMUL a3, b1, t1
nop
FADD c05, t2, c05
nop
FMUL a3, b2, t2
LDF [AO - 2 * SIZE], a3
FADD c02, t3, c02
nop
FMUL a4, b1, t3
LDF [BO + 0 * SIZE], b1
FADD c06, t4, c06
nop
FMUL a4, b2, t4
LDF [BO + 1 * SIZE], b2
FADD c03, t1, c03
nop
FMUL a1, b3, t1
LDF [AO - 1 * SIZE], a4
FADD c07, t2, c07
nop
FMUL a1, b4, t2
LDF [AO + 0 * SIZE], a1
FADD c04, t3, c04
nop
FMUL a2, b3, t3
nop
FADD c08, t4, c08
nop
FMUL a2, b4, t4
LDF [AO + 1 * SIZE], a2
FADD c01, t1, c01
nop
FMUL a3, b3, t1
nop
FADD c05, t2, c05
nop
FMUL a3, b4, t2
LDF [AO + 2 * SIZE], a3
FADD c02, t3, c02
nop
FMUL a4, b3, t3
LDF [BO + 2 * SIZE], b3
FADD c06, t4, c06
FMUL a4, b4, t4
LDF [AO + 3 * SIZE], a4
bg,pt %icc, .LL122
LDF [BO + 3 * SIZE], b4
.LL125:
#ifndef TRMMKERNEL
and K, 3, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub K, KK, L
#elif defined(LEFT)
add KK, 4, L
#else
add KK, 2, L
#endif
and L, 3, L
#endif
cmp L, 0
ble,a,pn %icc, .LL129
nop
.LL126:
FADD c03, t1, c03
add AO, 4 * SIZE, AO
FMUL a1, b1, t1
add BO, 2 * SIZE, BO
FADD c07, t2, c07
add L, -1, L
FMUL a1, b2, t2
LDF [AO + 0 * SIZE], a1
FADD c04, t3, c04
cmp L, 0
FMUL a2, b1, t3
FADD c08, t4, c08
FMUL a2, b2, t4
LDF [AO + 1 * SIZE], a2
FADD c01, t1, c01
FMUL a3, b1, t1
FADD c05, t2, c05
FMUL a3, b2, t2
LDF [AO + 2 * SIZE], a3
FADD c02, t3, c02
FMUL a4, b1, t3
LDF [BO + 0 * SIZE], b1
FADD c06, t4, c06
FMUL a4, b2, t4
LDF [BO + 1 * SIZE], b2
bg,pt %icc, .LL126
LDF [AO + 3 * SIZE], a4
.LL129:
#ifndef TRMMKERNEL
FADD c03, t1, c03
add I, -1, I
LDF [C1 + 0 * SIZE], a1
FADD c07, t2, c07
cmp I, 0
LDF [C1 + 1 * SIZE], a2
FADD c04, t3, c04
LDF [C1 + 2 * SIZE], a3
FADD c08, t4, c08
LDF [C1 + 3 * SIZE], a4
LDF [C2 + 0 * SIZE], b1
FMUL c01, ALPHA, c01
LDF [C2 + 1 * SIZE], b2
FMUL c02, ALPHA, c02
LDF [C2 + 2 * SIZE], b3
FMUL c03, ALPHA, c03
LDF [C2 + 3 * SIZE], b4
FMUL c04, ALPHA, c04
FMUL c05, ALPHA, c05
FADD c01, a1, c01
FMUL c06, ALPHA, c06
FADD c02, a2, c02
FMUL c07, ALPHA, c07
FADD c03, a3, c03
FMUL c08, ALPHA, c08
FADD c04, a4, c04
STF c01, [C1 + 0 * SIZE]
FADD c05, b1, c05
STF c02, [C1 + 1 * SIZE]
FADD c06, b2, c06
STF c03, [C1 + 2 * SIZE]
FADD c07, b3, c07
STF c04, [C1 + 3 * SIZE]
add C1, 4 * SIZE, C1
FADD c08, b4, c08
STF c05, [C2 + 0 * SIZE]
STF c06, [C2 + 1 * SIZE]
STF c07, [C2 + 2 * SIZE]
STF c08, [C2 + 3 * SIZE]
add C2, 4 * SIZE, C2
#else
FADD c03, t1, c03
FADD c07, t2, c07
FADD c04, t3, c04
FADD c08, t4, c08
FMUL c01, ALPHA, c01
FMUL c02, ALPHA, c02
FMUL c03, ALPHA, c03
FMUL c04, ALPHA, c04
FMUL c05, ALPHA, c05
FMUL c06, ALPHA, c06
FMUL c07, ALPHA, c07
FMUL c08, ALPHA, c08
STF c01, [C1 + 0 * SIZE]
STF c02, [C1 + 1 * SIZE]
STF c03, [C1 + 2 * SIZE]
STF c04, [C1 + 3 * SIZE]
STF c05, [C2 + 0 * SIZE]
STF c06, [C2 + 1 * SIZE]
STF c07, [C2 + 2 * SIZE]
STF c08, [C2 + 3 * SIZE]
add C1, 4 * SIZE, C1
add C2, 4 * SIZE, C2
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
sub K, KK, TEMP1
#ifdef LEFT
add TEMP1, -4, TEMP1
#else
add TEMP1, -2, TEMP1
#endif
sll TEMP1, 2 + BASE_SHIFT, TEMP2
sll TEMP1, 1 + BASE_SHIFT, TEMP1
add AO, TEMP2, AO
add BO, TEMP1, BO
#endif
#ifdef LEFT
add KK, 4, KK
#endif
add I, -1, I
cmp I, 0
#endif
bg,pt %icc, .LL121
FMOV FZERO, c03
.LL150:
and M, 2, I
cmp I, 0
ble,pn %icc, .LL170
nop
.LL151:
#if !defined(TRMMKERNEL)
LDF [AO + 0 * SIZE], a1
sra K, 2, L
FMOV FZERO, c01
LDF [B + 0 * SIZE], b1
mov B, BO
FMOV FZERO, t1
LDF [AO + 1 * SIZE], a2
cmp L, 0
FMOV FZERO, c02
LDF [B + 1 * SIZE], b2
FMOV FZERO, t2
LDF [AO + 2 * SIZE], a3
FMOV FZERO, c03
LDF [B + 2 * SIZE], b3
FMOV FZERO, t3
LDF [AO + 3 * SIZE], a4
FMOV FZERO, c04
LDF [B + 3 * SIZE], b4
FMOV FZERO, t4
#else
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov B, BO
#else
sll KK, 1 + BASE_SHIFT, TEMP1
sll KK, 1 + BASE_SHIFT, TEMP2
add AO, TEMP1, AO
add B, TEMP2, BO
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub K, KK, L
#elif defined(LEFT)
add KK, 2, L
#else
add KK, 2, L
#endif
sra L, 2, L
cmp L, 0
LDF [AO + 0 * SIZE], a1
FMOV FZERO, c01
LDF [BO + 0 * SIZE], b1
FMOV FZERO, t1
LDF [AO + 1 * SIZE], a2
FMOV FZERO, c02
LDF [BO + 1 * SIZE], b2
FMOV FZERO, t2
LDF [AO + 2 * SIZE], a3
FMOV FZERO, c03
LDF [BO + 2 * SIZE], b3
FMOV FZERO, t3
LDF [AO + 3 * SIZE], a4
FMOV FZERO, c04
LDF [BO + 3 * SIZE], b4
FMOV FZERO, t4
#endif
ble,pn %icc, .LL155
nop
.LL152:
FADD c01, t1, c01
add L, -1, L
FMUL a1, b1, t1
prefetch [AO + APREFETCHSIZE * SIZE], 0
FADD c02, t2, c02
add BO, 8 * SIZE, BO
FMUL a1, b2, t2
LDF [AO + 4 * SIZE], a1
FADD c03, t3, c03
cmp L, 0
FMUL a2, b1, t3
LDF [BO - 4 * SIZE], b1
FADD c04, t4, c04
nop
FMUL a2, b2, t4
LDF [AO + 5 * SIZE], a2
FADD c01, t1, c01
nop
FMUL a3, b3, t1
LDF [BO - 3 * SIZE], b2
FADD c02, t2, c02
nop
FMUL a3, b4, t2
LDF [AO + 6 * SIZE], a3
FADD c03, t3, c03
nop
FMUL a4, b3, t3
LDF [BO - 2 * SIZE], b3
FADD c04, t4, c04
nop
FMUL a4, b4, t4
LDF [AO + 7 * SIZE], a4
FADD c01, t1, c01
nop
FMUL a1, b1, t1
LDF [BO - 1 * SIZE], b4
FADD c02, t2, c02
FMUL a1, b2, t2
LDF [AO + 8 * SIZE], a1
FADD c03, t3, c03
FMUL a2, b1, t3
LDF [BO + 0 * SIZE], b1
FADD c04, t4, c04
FMUL a2, b2, t4
LDF [AO + 9 * SIZE], a2
FADD c01, t1, c01
FMUL a3, b3, t1
LDF [BO + 1 * SIZE], b2
FADD c02, t2, c02
FMUL a3, b4, t2
LDF [AO + 10 * SIZE], a3
FADD c03, t3, c03
FMUL a4, b3, t3
LDF [BO + 2 * SIZE], b3
FADD c04, t4, c04
FMUL a4, b4, t4
LDF [AO + 11 * SIZE], a4
add AO, 8 * SIZE, AO
bg,pt %icc, .LL152
LDF [BO + 3 * SIZE], b4
.LL155:
#ifndef TRMMKERNEL
and K, 3, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub K, KK, L
#elif defined(LEFT)
add KK, 2, L
#else
add KK, 2, L
#endif
and L, 3, L
#endif
cmp L, 0
ble,a,pn %icc, .LL159
nop
.LL156:
/* Remainder loop for the 2x2 tile, one step per iteration, L iterations.
   Load two values through AO and two through BO. */
LDF [AO + 0 * SIZE], a1
LDF [AO + 1 * SIZE], a2
LDF [BO + 0 * SIZE], b1
LDF [BO + 1 * SIZE], b2
/* Add the products left in t1..t4 by the previous step into c01..c04. */
FADD c01, t1, c01
FADD c02, t2, c02
FADD c03, t3, c03
FADD c04, t4, c04
/* Form this step's products; they are added on the next pass, or by the
   FADDs at the start of the write-back at .LL159. */
FMUL a1, b1, t1
FMUL a1, b2, t2
FMUL a2, b1, t3
FMUL a2, b2, t4
/* Advance both pointers by two elements and count down. */
add AO, 2 * SIZE, AO
add BO, 2 * SIZE, BO
add L, -1, L
cmp L, 0
bg,pt %icc, .LL156
nop
/*
 * .LL159: write-back of the four accumulators.
 * In both variants the products pending in t1..t4 are first folded into
 * c01..c04 and the sums are scaled by ALPHA.
 *   non-TRMM: the current values at C1[0], C2[0], C1[1], C2[1] are loaded
 *             (reusing a1..a4 as scratch) and added before storing.
 *   TRMM:     the scaled sums are stored without reading C1/C2.
 * Store mapping: c01 -> C1[0], c02 -> C2[0], c03 -> C1[1], c04 -> C2[1].
 * C1 and C2 each advance by 2 * SIZE afterwards.
 * NOTE(review): C1/C2 presumably point at two adjacent columns of C --
 * their setup is outside this chunk.
 */
.LL159:
#ifndef TRMMKERNEL
LDF [C1 + 0 * SIZE], a1
LDF [C2 + 0 * SIZE], a2
LDF [C1 + 1 * SIZE], a3
LDF [C2 + 1 * SIZE], a4
FADD c01, t1, c01
FADD c02, t2, c02
FADD c03, t3, c03
FADD c04, t4, c04
FMUL c01, ALPHA, c01
FMUL c02, ALPHA, c02
FMUL c03, ALPHA, c03
FMUL c04, ALPHA, c04
FADD c01, a1, c01
FADD c02, a2, c02
FADD c03, a3, c03
FADD c04, a4, c04
STF c01, [C1 + 0 * SIZE]
STF c02, [C2 + 0 * SIZE]
STF c03, [C1 + 1 * SIZE]
add C1, 2 * SIZE, C1
STF c04, [C2 + 1 * SIZE]
add C2, 2 * SIZE, C2
#else
FADD c01, t1, c01
FADD c02, t2, c02
FADD c03, t3, c03
FADD c04, t4, c04
FMUL c01, ALPHA, c01
FMUL c02, ALPHA, c02
FMUL c03, ALPHA, c03
FMUL c04, ALPHA, c04
STF c01, [C1 + 0 * SIZE]
STF c02, [C2 + 0 * SIZE]
STF c03, [C1 + 1 * SIZE]
STF c04, [C2 + 1 * SIZE]
add C1, 2 * SIZE, C1
add C2, 2 * SIZE, C2
/* TRMM only: for these LEFT/TRANSA combinations, move AO and BO forward by
 * (K - KK - 2) << (1 + BASE_SHIFT).  Both pointers use the same shift here
 * and both #ifdef LEFT arms subtract 2. */
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
sub K, KK, TEMP1
#ifdef LEFT
add TEMP1, -2, TEMP1
#else
add TEMP1, -2, TEMP1
#endif
sll TEMP1, 1 + BASE_SHIFT, TEMP2
sll TEMP1, 1 + BASE_SHIFT, TEMP1
add AO, TEMP2, AO
add BO, TEMP1, BO
#endif
/* LEFT variant: KK advances by 2 after this tile. */
#ifdef LEFT
add KK, 2, KK
#endif
#endif
/*
 * .LL170: handle the case where bit 0 of M is set (I = M & 1); otherwise
 * jump to .LL199.
 */
.LL170:
and M, 1, I
cmp I, 0
ble,pn %icc, .LL199
nop
/*
 * .LL171: set up the tile that produces one value for C1 and one for C2.
 *
 * Both variants zero c01..c04 and t1..t4 from FZERO and preload a1..a4 from
 * AO[0..3] and b1..b4 from the B panel, with L holding the unrolled trip
 * count (depth >> 2).  The compare of L against 0 is consumed by the ble at
 * the end of the block; the loads and FMOVs in between are not cc-setting
 * integer operations.
 */
.LL171:
#if !defined(TRMMKERNEL)
/* non-TRMM: L = K >> 2, BO = B. */
LDF [AO + 0 * SIZE], a1
sra K, 2, L
FMOV FZERO, c01
LDF [B + 0 * SIZE], b1
mov B, BO
FMOV FZERO, t1
LDF [AO + 1 * SIZE], a2
cmp L, 0
FMOV FZERO, c02
LDF [B + 1 * SIZE], b2
FMOV FZERO, t2
LDF [AO + 2 * SIZE], a3
FMOV FZERO, c03
LDF [B + 2 * SIZE], b3
FMOV FZERO, t3
LDF [AO + 3 * SIZE], a4
FMOV FZERO, c04
LDF [B + 3 * SIZE], b4
FMOV FZERO, t4
#else
/* TRMM: either start at the head of B, or offset both panels by KK:
 * AO += KK << BASE_SHIFT and BO = B + (KK << (1 + BASE_SHIFT)). */
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov B, BO
#else
sll KK, 0 + BASE_SHIFT, TEMP1
sll KK, 1 + BASE_SHIFT, TEMP2
add AO, TEMP1, AO
add B, TEMP2, BO
#endif
/* TRMM depth: K - KK, or KK + 1 (LEFT) / KK + 2 (otherwise). */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub K, KK, L
#elif defined(LEFT)
add KK, 1, L
#else
add KK, 2, L
#endif
sra L, 2, L
cmp L, 0
LDF [AO + 0 * SIZE], a1
FMOV FZERO, c01
LDF [BO + 0 * SIZE], b1
FMOV FZERO, t1
LDF [AO + 1 * SIZE], a2
FMOV FZERO, c02
LDF [BO + 1 * SIZE], b2
FMOV FZERO, t2
LDF [AO + 2 * SIZE], a3
FMOV FZERO, c03
LDF [BO + 2 * SIZE], b3
FMOV FZERO, t3
LDF [AO + 3 * SIZE], a4
FMOV FZERO, c04
LDF [BO + 3 * SIZE], b4
FMOV FZERO, t4
#endif
/* No unrolled trips: go directly to the remainder handling. */
ble,pn %icc, .LL175
nop
/*
 * .LL172: unrolled inner-product loop for the one-row, two-column tile.
 *
 * a1..a4 hold four consecutive A values; (b1,b2) and (b3,b4) hold B pairs.
 * Per trip: a1 x (b1,b2), a2 x (b3,b4), a3 x (b1,b2), a4 x (b3,b4), i.e.
 * eight FMULs.  AO advances by 4 * SIZE (at the top) and BO by 8 * SIZE
 * (at the bottom), which is why the B loads use offsets 4..11.
 *
 * c01/c03 and c02/c04 are independent partial sums; the write-back code
 * after the loops combines c01 with c03 and c02 with c04.
 * Software-pipelined like the other loops: each FADD folds in the product
 * from the previous FMUL into the same tN, so four products are pending in
 * t1..t4 when the loop exits.
 */
.LL172:
FADD c01, t1, c01
add AO, 4 * SIZE, AO
FMUL a1, b1, t1
LDF [BO + 4 * SIZE], b1
FADD c02, t2, c02
FMUL a1, b2, t2
LDF [BO + 5 * SIZE], b2
add L, -1, L
/* AO was already advanced, so offset 0 is the first value of the next
 * group of four. */
LDF [AO + 0 * SIZE], a1
FADD c03, t3, c03
/* Condition codes for the bg at the bottom. */
cmp L, 0
FMUL a2, b3, t3
LDF [BO + 6 * SIZE], b3
FADD c04, t4, c04
FMUL a2, b4, t4
LDF [BO + 7 * SIZE], b4
LDF [AO + 1 * SIZE], a2
FADD c01, t1, c01
FMUL a3, b1, t1
LDF [BO + 8 * SIZE], b1
FADD c02, t2, c02
FMUL a3, b2, t2
LDF [BO + 9 * SIZE], b2
LDF [AO + 2 * SIZE], a3
FADD c03, t3, c03
FMUL a4, b3, t3
LDF [BO + 10 * SIZE], b3
FADD c04, t4, c04
FMUL a4, b4, t4
LDF [BO + 11 * SIZE], b4
add BO, 8 * SIZE, BO
bg,pt %icc, .LL172
/* Delay slot (no annul bit): executed on every trip, including the exit. */
LDF [AO + 3 * SIZE], a4
/*
 * .LL175: remainder count for the one-row, two-column tile.
 *   non-TRMM: L = K & 3
 *   TRMM:     L = (K - KK) & 3, (KK + 1) & 3 (LEFT) or (KK + 2) & 3.
 * When L <= 0 control goes to the write-back at .LL179.
 */
.LL175:
#ifndef TRMMKERNEL
and K, 3, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub K, KK, L
#elif defined(LEFT)
add KK, 1, L
#else
add KK, 2, L
#endif
and L, 3, L
#endif
cmp L, 0
/* Annulled branch with a nop delay slot: no side effect on either path. */
ble,a,pn %icc, .LL179
nop
/*
 * .LL176: remainder loop, one step per trip.
 * Uses only a1 with (b1,b2) and only the c01/t1 and c02/t2 pairs; AO
 * advances by 1 * SIZE and BO by 2 * SIZE.  The next b1/b2 are fetched at
 * offsets 2 and 3 because BO is bumped after those loads.
 */
.LL176:
FADD c01, t1, c01
add L, -1, L
FMUL a1, b1, t1
add AO, 1 * SIZE, AO
LDF [BO + 2 * SIZE], b1
FADD c02, t2, c02
cmp L, 0
FMUL a1, b2, t2
LDF [BO + 3 * SIZE], b2
add BO, 2 * SIZE, BO
bg,pt %icc, .LL176
/* Delay slot (no annul bit): reloads a1 from the already advanced AO on
 * every trip, including the exit. */
LDF [AO + 0 * SIZE], a1
/*
 * .LL179: write-back for the one-row, two-column tile.
 * Pending products t1..t4 are folded into c01..c04, then the partial sums
 * are combined (c01 += c03, c02 += c04) and scaled by ALPHA.
 *   non-TRMM: the current C1[0] and C2[0] are loaded (a1/a2 as scratch)
 *             and added before the stores.
 *   TRMM:     the scaled sums are stored without reading C1/C2.
 * C1 and C2 are not advanced in this block.
 */
.LL179:
#ifndef TRMMKERNEL
FADD c01, t1, c01
LDF [C1 + 0 * SIZE], a1
FADD c02, t2, c02
LDF [C2 + 0 * SIZE], a2
FADD c03, t3, c03
FADD c04, t4, c04
FADD c01, c03, c01
FADD c02, c04, c02
FMUL c01, ALPHA, c01
FMUL c02, ALPHA, c02
FADD c01, a1, c01
FADD c02, a2, c02
STF c01, [C1 + 0 * SIZE]
STF c02, [C2 + 0 * SIZE]
#else
FADD c01, t1, c01
FADD c02, t2, c02
FADD c03, t3, c03
FADD c04, t4, c04
FADD c01, c03, c01
FADD c02, c04, c02
FMUL c01, ALPHA, c01
FMUL c02, ALPHA, c02
STF c01, [C1 + 0 * SIZE]
STF c02, [C2 + 0 * SIZE]
/* TRMM only: for these LEFT/TRANSA combinations, move the panel pointers
 * forward.  TEMP1 = K - KK - 1 (LEFT) or K - KK - 2; then
 * AO += TEMP1 << BASE_SHIFT and BO += TEMP1 << (1 + BASE_SHIFT). */
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
sub K, KK, TEMP1
#ifdef LEFT
add TEMP1, -1, TEMP1
#else
add TEMP1, -2, TEMP1
#endif
sll TEMP1, 0 + BASE_SHIFT, TEMP2
sll TEMP1, 1 + BASE_SHIFT, TEMP1
add AO, TEMP2, AO
add BO, TEMP1, BO
#endif
/* LEFT variant: KK advances by 1 after this tile. */
#ifdef LEFT
add KK, 1, KK
#endif
#endif
/*
 * .LL199: end of the two-column pass.  B is updated from BO, i.e. from
 * wherever the inner loops left the B-panel pointer.  In the TRMM non-LEFT
 * variant KK is advanced by 2; all other variants execute a nop instead.
 */
.LL199:
mov BO, B
#if defined(TRMMKERNEL) && !defined(LEFT)
add KK, 2, KK
#else
nop
#endif
/*
 * .LL200: entry of the single-column pass.
 * J = N & 1 decides whether there is a column to process; when J <= 0 the
 * routine returns via .LL999.  I = M >> 2 is the number of four-row tiles;
 * when I <= 0 control skips to .LL250.
 * Neither branch has the annul bit, so both delay-slot moves (AO = A and
 * C1 = C) execute whether or not the branch is taken.
 */
.LL200:
and N, 1, J
sra M, 2, I
cmp J, 0
ble,pn %icc, .LL999
mov A, AO
/* TRMM LEFT variant: restart KK from OFFSET for this pass. */
#if defined(TRMMKERNEL) && defined(LEFT)
mov OFFSET, KK
#endif
cmp I, 0
ble,pn %icc, .LL250
mov C, C1
/*
 * .LL221: set up one four-row, one-column tile (four values stored to
 * C1[0..3]); this is the loop head for the I = M >> 2 iterations.
 *
 * Both variants zero c01..c04 and t1..t4 from FZERO, preload a1..a4 from
 * AO[0..3] and b1..b4 from the B panel, and leave the unrolled trip count
 * (depth >> 2) in L.  The compare of L against 0 is consumed by the ble at
 * the end of the block.
 */
.LL221:
#if !defined(TRMMKERNEL)
/* non-TRMM: L = K >> 2, BO = B. */
LDF [AO + 0 * SIZE], a1
sra K, 2, L
FMOV FZERO, c01
LDF [B + 0 * SIZE], b1
mov B, BO
FMOV FZERO, t1
LDF [AO + 1 * SIZE], a2
cmp L, 0
FMOV FZERO, c02
LDF [B + 1 * SIZE], b2
FMOV FZERO, t2
LDF [AO + 2 * SIZE], a3
FMOV FZERO, c03
LDF [B + 2 * SIZE], b3
FMOV FZERO, t3
LDF [AO + 3 * SIZE], a4
FMOV FZERO, c04
LDF [B + 3 * SIZE], b4
FMOV FZERO, t4
#else
/* TRMM: either start at the head of B, or offset both panels by KK:
 * AO += KK << (2 + BASE_SHIFT) and BO = B + (KK << BASE_SHIFT). */
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov B, BO
#else
sll KK, 2 + BASE_SHIFT, TEMP1
sll KK, 0 + BASE_SHIFT, TEMP2
add AO, TEMP1, AO
add B, TEMP2, BO
#endif
/* TRMM depth: K - KK, or KK + 4 (LEFT) / KK + 1 (otherwise). */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub K, KK, L
#elif defined(LEFT)
add KK, 4, L
#else
add KK, 1, L
#endif
sra L, 2, L
cmp L, 0
LDF [AO + 0 * SIZE], a1
FMOV FZERO, c01
LDF [BO + 0 * SIZE], b1
FMOV FZERO, t1
LDF [AO + 1 * SIZE], a2
FMOV FZERO, c02
LDF [BO + 1 * SIZE], b2
FMOV FZERO, t2
LDF [AO + 2 * SIZE], a3
FMOV FZERO, c03
LDF [BO + 2 * SIZE], b3
FMOV FZERO, t3
LDF [AO + 3 * SIZE], a4
FMOV FZERO, c04
LDF [BO + 3 * SIZE], b4
FMOV FZERO, t4
#endif
/* No unrolled trips: go to the remainder handling.  The delay-slot
 * prefetch (function code 2, SPARC V9: prefetch for several writes) of the
 * output location executes on both paths because the annul bit is not set. */
ble,pn %icc, .LL225
prefetch [C1 + 4 * SIZE], 2
/*
 * .LL222: unrolled inner-product loop for the four-row, one-column tile.
 *
 * a1..a4 hold four A values for the current step and b1..b4 hold four
 * consecutive B values.  Each of the four groups below multiplies a1..a4 by
 * one bN (16 FMULs per trip), reloading a1..a4 from the next four A slots
 * and refilling the bN just consumed.  BO advances by 4 * SIZE at the top
 * and AO by 16 * SIZE at the bottom.
 *
 * Software-pipelined: each FADD folds in the product from the previous
 * FMUL into the same tN, so four products are pending in t1..t4 on exit.
 */
.LL222:
/* Group 1: a1..a4 x b1. */
FADD c01, t1, c01
add BO, 4 * SIZE, BO
FMUL a1, b1, t1
LDF [AO + 4 * SIZE], a1
FADD c02, t2, c02
FMUL a2, b1, t2
LDF [AO + 5 * SIZE], a2
FADD c03, t3, c03
add L, -1, L
FMUL a3, b1, t3
LDF [AO + 6 * SIZE], a3
FADD c04, t4, c04
FMUL a4, b1, t4
LDF [AO + 7 * SIZE], a4
LDF [BO + 0 * SIZE], b1
/* Group 2: a1..a4 x b2. */
FADD c01, t1, c01
/* Condition codes for the bg at the bottom. */
cmp L, 0
FMUL a1, b2, t1
LDF [AO + 8 * SIZE], a1
FADD c02, t2, c02
FMUL a2, b2, t2
LDF [AO + 9 * SIZE], a2
FADD c03, t3, c03
FMUL a3, b2, t3
LDF [AO + 10 * SIZE], a3
FADD c04, t4, c04
FMUL a4, b2, t4
LDF [AO + 11 * SIZE], a4
LDF [BO + 1 * SIZE], b2
/* Group 3: a1..a4 x b3. */
FADD c01, t1, c01
FMUL a1, b3, t1
LDF [AO + 12 * SIZE], a1
FADD c02, t2, c02
FMUL a2, b3, t2
LDF [AO + 13 * SIZE], a2
FADD c03, t3, c03
FMUL a3, b3, t3
LDF [AO + 14 * SIZE], a3
FADD c04, t4, c04
FMUL a4, b3, t4
LDF [AO + 15 * SIZE], a4
LDF [BO + 2 * SIZE], b3
/* Group 4: a1..a4 x b4; the A loads here fetch the next trip's operands. */
FADD c01, t1, c01
FMUL a1, b4, t1
LDF [AO + 16 * SIZE], a1
FADD c02, t2, c02
FMUL a2, b4, t2
LDF [AO + 17 * SIZE], a2
FADD c03, t3, c03
FMUL a3, b4, t3
LDF [AO + 18 * SIZE], a3
FADD c04, t4, c04
FMUL a4, b4, t4
LDF [AO + 19 * SIZE], a4
add AO, 16 * SIZE, AO
bg,pt %icc, .LL222
/* Delay slot (no annul bit): executed on every trip, including the exit. */
LDF [BO + 3 * SIZE], b4
/*
 * .LL225: remainder count for the four-row, one-column tile.
 *   non-TRMM: L = K & 3
 *   TRMM:     L = (K - KK) & 3, (KK + 4) & 3 (LEFT) or (KK + 1) & 3.
 * When L <= 0 control goes to the write-back at .LL229.
 */
.LL225:
#ifndef TRMMKERNEL
and K, 3, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub K, KK, L
#elif defined(LEFT)
add KK, 4, L
#else
add KK, 1, L
#endif
and L, 3, L
#endif
cmp L, 0
/* Annulled branch with a nop delay slot: no side effect on either path. */
ble,a,pn %icc, .LL229
nop
/*
 * .LL226: remainder loop, one step per trip: a1..a4 x b1, then reload
 * a1..a4 from the next four A slots.  AO advances by 4 * SIZE and BO by
 * 1 * SIZE (bumped at the top, so the delay-slot load uses offset 0).
 */
.LL226:
FADD c01, t1, c01
add BO, 1 * SIZE, BO
FMUL a1, b1, t1
LDF [AO + 4 * SIZE], a1
FADD c02, t2, c02
add L, -1, L
FMUL a2, b1, t2
LDF [AO + 5 * SIZE], a2
FADD c03, t3, c03
cmp L, 0
FMUL a3, b1, t3
LDF [AO + 6 * SIZE], a3
FADD c04, t4, c04
FMUL a4, b1, t4
LDF [AO + 7 * SIZE], a4
add AO, 4 * SIZE, AO
bg,pt %icc, .LL226
/* Delay slot (no annul bit): executed on every trip, including the exit. */
LDF [BO + 0 * SIZE], b1
/*
 * .LL229: write-back for the four-row, one-column tile and loop control
 * over I.
 * Pending products t1..t4 are folded into c01..c04 and the sums are scaled
 * by ALPHA.
 *   non-TRMM: C1[0..3] are loaded (a1..a4 as scratch) and added before the
 *             stores.  I is decremented and compared near the top; the
 *             instructions between that cmp and the final bg are FP ops,
 *             loads, stores and a plain add, none of which is a cc-setting
 *             form.
 *   TRMM:     the scaled sums are stored without reading C1, the panel
 *             pointers and KK are adjusted, and I is decremented and
 *             compared at the end.
 * C1 advances by 4 * SIZE.  The shared bg at the bottom loops back to
 * .LL221 while I > 0.
 */
.LL229:
#ifndef TRMMKERNEL
FADD c01, t1, c01
add I, -1, I
FADD c02, t2, c02
cmp I, 0
FADD c03, t3, c03
FADD c04, t4, c04
FMUL c01, ALPHA, c01
FMUL c02, ALPHA, c02
FMUL c03, ALPHA, c03
FMUL c04, ALPHA, c04
LDF [C1 + 0 * SIZE], a1
LDF [C1 + 1 * SIZE], a2
LDF [C1 + 2 * SIZE], a3
LDF [C1 + 3 * SIZE], a4
FADD c01, a1, c01
FADD c02, a2, c02
FADD c03, a3, c03
FADD c04, a4, c04
STF c01, [C1 + 0 * SIZE]
STF c02, [C1 + 1 * SIZE]
STF c03, [C1 + 2 * SIZE]
STF c04, [C1 + 3 * SIZE]
add C1, 4 * SIZE, C1
#else
FADD c01, t1, c01
FADD c02, t2, c02
FADD c03, t3, c03
FADD c04, t4, c04
FMUL c01, ALPHA, c01
FMUL c02, ALPHA, c02
FMUL c03, ALPHA, c03
FMUL c04, ALPHA, c04
STF c01, [C1 + 0 * SIZE]
STF c02, [C1 + 1 * SIZE]
STF c03, [C1 + 2 * SIZE]
STF c04, [C1 + 3 * SIZE]
add C1, 4 * SIZE, C1
/* TRMM only: for these LEFT/TRANSA combinations, move the panel pointers
 * forward.  TEMP1 = K - KK - 4 (LEFT) or K - KK - 1; then
 * AO += TEMP1 << (2 + BASE_SHIFT) and BO += TEMP1 << BASE_SHIFT. */
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
sub K, KK, TEMP1
#ifdef LEFT
add TEMP1, -4, TEMP1
#else
add TEMP1, -1, TEMP1
#endif
sll TEMP1, 2 + BASE_SHIFT, TEMP2
sll TEMP1, 0 + BASE_SHIFT, TEMP1
add AO, TEMP2, AO
add BO, TEMP1, BO
#endif
/* LEFT variant: KK advances by 4 after this tile. */
#ifdef LEFT
add KK, 4, KK
#endif
add I, -1, I
cmp I, 0
#endif
bg,pt %icc, .LL221
nop
/*
 * .LL250: handle the case where bit 1 of M is set (I = M & 2); otherwise
 * jump to .LL270.
 */
.LL250:
and M, 2, I
cmp I, 0
ble,pn %icc, .LL270
nop
/*
 * .LL251: set up the two-row, one-column tile (two values stored to
 * C1[0..1]).
 *
 * Both variants zero c01..c04 and t1..t4 from FZERO, preload a1..a4 from
 * AO[0..3] and b1..b4 from the B panel, and leave the unrolled trip count
 * (depth >> 2) in L.  The compare of L against 0 is consumed by the ble at
 * the end of the block.
 */
.LL251:
#if !defined(TRMMKERNEL)
/* non-TRMM: L = K >> 2, BO = B. */
LDF [AO + 0 * SIZE], a1
sra K, 2, L
FMOV FZERO, c01
LDF [B + 0 * SIZE], b1
mov B, BO
FMOV FZERO, t1
LDF [AO + 1 * SIZE], a2
cmp L, 0
FMOV FZERO, c02
LDF [B + 1 * SIZE], b2
FMOV FZERO, t2
LDF [AO + 2 * SIZE], a3
FMOV FZERO, c03
LDF [B + 2 * SIZE], b3
FMOV FZERO, t3
LDF [AO + 3 * SIZE], a4
FMOV FZERO, c04
LDF [B + 3 * SIZE], b4
FMOV FZERO, t4
#else
/* TRMM: either start at the head of B, or offset both panels by KK:
 * AO += KK << (1 + BASE_SHIFT) and BO = B + (KK << BASE_SHIFT). */
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov B, BO
#else
sll KK, 1 + BASE_SHIFT, TEMP1
sll KK, 0 + BASE_SHIFT, TEMP2
add AO, TEMP1, AO
add B, TEMP2, BO
#endif
/* TRMM depth: K - KK, or KK + 2 (LEFT) / KK + 1 (otherwise). */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub K, KK, L
#elif defined(LEFT)
add KK, 2, L
#else
add KK, 1, L
#endif
sra L, 2, L
cmp L, 0
LDF [AO + 0 * SIZE], a1
FMOV FZERO, c01
LDF [BO + 0 * SIZE], b1
FMOV FZERO, t1
LDF [AO + 1 * SIZE], a2
FMOV FZERO, c02
LDF [BO + 1 * SIZE], b2
FMOV FZERO, t2
LDF [AO + 2 * SIZE], a3
FMOV FZERO, c03
LDF [BO + 2 * SIZE], b3
FMOV FZERO, t3
LDF [AO + 3 * SIZE], a4
FMOV FZERO, c04
LDF [BO + 3 * SIZE], b4
FMOV FZERO, t4
#endif
/* No unrolled trips: go directly to the remainder handling. */
ble,pn %icc, .LL255
nop
/*
 * .LL252: unrolled inner-product loop for the two-row, one-column tile.
 *
 * (a1,a2) and (a3,a4) hold A pairs for alternating steps; b1..b4 hold four
 * consecutive B values.  Per trip: (a1,a2) x b1, (a3,a4) x b2,
 * (a1,a2) x b3, (a3,a4) x b4, i.e. eight FMULs.  AO advances by 8 * SIZE
 * and BO by 4 * SIZE (in the delay slot), which is why the B loads use
 * offsets 4..7.
 *
 * c01/c03 and c02/c04 are independent partial sums that the write-back
 * code combines.  Software-pipelined: each FADD folds in the product from
 * the previous FMUL into the same tN, so four products are pending in
 * t1..t4 when the loop exits.
 */
.LL252:
FADD c01, t1, c01
add L, -1, L
FMUL a1, b1, t1
LDF [AO + 4 * SIZE], a1
FADD c02, t2, c02
FMUL a2, b1, t2
LDF [AO + 5 * SIZE], a2
LDF [BO + 4 * SIZE], b1
FADD c03, t3, c03
/* Condition codes for the bg at the bottom. */
cmp L, 0
FMUL a3, b2, t3
LDF [AO + 6 * SIZE], a3
FADD c04, t4, c04
FMUL a4, b2, t4
LDF [AO + 7 * SIZE], a4
LDF [BO + 5 * SIZE], b2
FADD c01, t1, c01
FMUL a1, b3, t1
LDF [AO + 8 * SIZE], a1
FADD c02, t2, c02
FMUL a2, b3, t2
LDF [AO + 9 * SIZE], a2
LDF [BO + 6 * SIZE], b3
FADD c03, t3, c03
FMUL a3, b4, t3
LDF [AO + 10 * SIZE], a3
FADD c04, t4, c04
FMUL a4, b4, t4
LDF [AO + 11 * SIZE], a4
add AO, 8 * SIZE, AO
LDF [BO + 7 * SIZE], b4
bg,pt %icc, .LL252
/* Delay slot (no annul bit): BO is advanced on every trip, including the
 * exit. */
add BO, 4 * SIZE, BO
/*
 * .LL255: remainder count for the two-row, one-column tile.
 *   non-TRMM: L = K & 3
 *   TRMM:     L = (K - KK) & 3, (KK + 2) & 3 (LEFT) or (KK + 1) & 3.
 * When L <= 0 control goes to the write-back at .LL259.
 */
.LL255:
#ifndef TRMMKERNEL
and K, 3, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub K, KK, L
#elif defined(LEFT)
add KK, 2, L
#else
add KK, 1, L
#endif
and L, 3, L
#endif
cmp L, 0
/* Annulled branch with a nop delay slot: no side effect on either path. */
ble,a,pn %icc, .LL259
nop
/*
 * .LL256: remainder loop, one step per trip: (a1,a2) x b1 using only the
 * c01/t1 and c02/t2 pairs.  AO advances by 2 * SIZE and BO by 1 * SIZE.
 */
.LL256:
FADD c01, t1, c01
add L, -1, L
FMUL a1, b1, t1
LDF [AO + 2 * SIZE], a1
FADD c02, t2, c02
cmp L, 0
FMUL a2, b1, t2
LDF [AO + 3 * SIZE], a2
LDF [BO + 1 * SIZE], b1
add AO, 2 * SIZE, AO
bg,pt %icc, .LL256
/* Delay slot (no annul bit): BO is advanced on every trip, including the
 * exit. */
add BO, 1 * SIZE, BO
/*
 * .LL259: write-back for the two-row, one-column tile.
 * Pending products t1..t4 are folded into c01..c04, the partial sums are
 * combined (c01 += c03, c02 += c04) and scaled by ALPHA.
 *   non-TRMM: C1[0] and C1[1] are loaded (a1/a2 as scratch) and added
 *             before the stores.
 *   TRMM:     the scaled sums are stored without reading C1.
 * C1 advances by 2 * SIZE.
 */
.LL259:
#ifndef TRMMKERNEL
FADD c01, t1, c01
LDF [C1 + 0 * SIZE], a1
FADD c02, t2, c02
LDF [C1 + 1 * SIZE], a2
FADD c03, t3, c03
FADD c04, t4, c04
FADD c01, c03, c01
FADD c02, c04, c02
FMUL c01, ALPHA, c01
FMUL c02, ALPHA, c02
FADD c01, a1, c01
FADD c02, a2, c02
STF c01, [C1 + 0 * SIZE]
STF c02, [C1 + 1 * SIZE]
add C1, 2 * SIZE, C1
#else
FADD c01, t1, c01
FADD c02, t2, c02
FADD c03, t3, c03
FADD c04, t4, c04
FADD c01, c03, c01
FADD c02, c04, c02
FMUL c01, ALPHA, c01
FMUL c02, ALPHA, c02
STF c01, [C1 + 0 * SIZE]
STF c02, [C1 + 1 * SIZE]
add C1, 2 * SIZE, C1
/* TRMM only: for these LEFT/TRANSA combinations, move the panel pointers
 * forward.  TEMP1 = K - KK - 2 (LEFT) or K - KK - 1; then
 * AO += TEMP1 << (1 + BASE_SHIFT) and BO += TEMP1 << BASE_SHIFT. */
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
sub K, KK, TEMP1
#ifdef LEFT
add TEMP1, -2, TEMP1
#else
add TEMP1, -1, TEMP1
#endif
sll TEMP1, 1 + BASE_SHIFT, TEMP2
sll TEMP1, 0 + BASE_SHIFT, TEMP1
add AO, TEMP2, AO
add BO, TEMP1, BO
#endif
/* LEFT variant: KK advances by 2 after this tile. */
#ifdef LEFT
add KK, 2, KK
#endif
#endif
/*
 * .LL270: handle the case where bit 0 of M is set (I = M & 1); otherwise
 * return via .LL999.
 */
.LL270:
and M, 1, I
cmp I, 0
ble,pn %icc, .LL999
nop
/*
 * .LL271: set up the single-element tile (one value stored to C1[0]).
 *
 * Only two accumulators (c01, c02) are zeroed here, together with t1..t4.
 * a1..a4 are preloaded from AO[0..3] and b1..b3 from BO[0..2]; b4 is
 * loaded in the delay slot of the final branch.  L holds the unrolled trip
 * count (depth >> 2), and the compare of L against 0 is consumed by the
 * ble at the end of the block.
 */
.LL271:
#if !defined(TRMMKERNEL)
/* non-TRMM: L = K >> 2, BO = B (set before the first BO-relative load). */
LDF [AO + 0 * SIZE], a1
sra K, 2, L
FMOV FZERO, t1
LDF [AO + 1 * SIZE], a2
mov B, BO
FMOV FZERO, c01
LDF [AO + 2 * SIZE], a3
cmp L, 0
FMOV FZERO, t2
LDF [AO + 3 * SIZE], a4
FMOV FZERO, c02
LDF [BO + 0 * SIZE], b1
FMOV FZERO, t3
LDF [BO + 1 * SIZE], b2
FMOV FZERO, t4
LDF [BO + 2 * SIZE], b3
#else
/* TRMM: either start at the head of B, or offset both panels by
 * KK << BASE_SHIFT (same shift for AO and BO here). */
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov B, BO
#else
sll KK, 0 + BASE_SHIFT, TEMP1
sll KK, 0 + BASE_SHIFT, TEMP2
add AO, TEMP1, AO
add B, TEMP2, BO
#endif
/* TRMM depth: K - KK, or KK + 1 (both LEFT and non-LEFT fallbacks). */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub K, KK, L
#elif defined(LEFT)
add KK, 1, L
#else
add KK, 1, L
#endif
sra L, 2, L
cmp L, 0
LDF [AO + 0 * SIZE], a1
FMOV FZERO, t1
LDF [AO + 1 * SIZE], a2
FMOV FZERO, c01
LDF [AO + 2 * SIZE], a3
FMOV FZERO, t2
LDF [AO + 3 * SIZE], a4
FMOV FZERO, c02
LDF [BO + 0 * SIZE], b1
FMOV FZERO, t3
LDF [BO + 1 * SIZE], b2
FMOV FZERO, t4
LDF [BO + 2 * SIZE], b3
#endif
/* No unrolled trips: go to the remainder handling.  The delay-slot load of
 * b4 executes on both paths because the annul bit is not set. */
ble,pn %icc, .LL275
LDF [BO + 3 * SIZE], b4
/*
 * .LL272: unrolled dot-product loop for the single-element tile.
 *
 * Per trip: four products a1*b1, a2*b2, a3*b3, a4*b4 go to t1..t4.  t1 and
 * t3 are accumulated into c01, t2 and t4 into c02.  AO and BO both advance
 * by 4 * SIZE near the top, so all reloads use offsets 0..3.
 *
 * Software-pipelined: each FADD folds in the product from the previous
 * FMUL into the same tN, so four products are pending in t1..t4 when the
 * loop exits.
 */
.LL272:
FADD c01, t1, c01
add L, -1, L
add AO, 4 * SIZE, AO
FMUL a1, b1, t1
add BO, 4 * SIZE, BO
LDF [AO + 0 * SIZE], a1
FADD c02, t2, c02
/* Condition codes for the bg at the bottom. */
cmp L, 0
LDF [BO + 0 * SIZE], b1
FMUL a2, b2, t2
LDF [AO + 1 * SIZE], a2
FADD c01, t3, c01
LDF [BO + 1 * SIZE], b2
FMUL a3, b3, t3
LDF [AO + 2 * SIZE], a3
FADD c02, t4, c02
LDF [BO + 2 * SIZE], b3
FMUL a4, b4, t4
LDF [AO + 3 * SIZE], a4
bg,pt %icc, .LL272
/* Delay slot (no annul bit): executed on every trip, including the exit. */
LDF [BO + 3 * SIZE], b4
/*
 * .LL275: remainder count for the single-element tile.
 *   non-TRMM: L = K & 3
 *   TRMM:     L = (K - KK) & 3 or (KK + 1) & 3.
 * When L <= 0 control goes to the write-back at .LL279.
 */
.LL275:
#ifndef TRMMKERNEL
and K, 3, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub K, KK, L
#elif defined(LEFT)
add KK, 1, L
#else
add KK, 1, L
#endif
and L, 3, L
#endif
cmp L, 0
/* Annulled branch with a nop delay slot: no side effect on either path. */
ble,a,pn %icc, .LL279
nop
/*
 * .LL276: remainder loop, one product per trip (a1 * b1 into t1, folded
 * into c01 on the next FADD).  AO and BO each advance by 1 * SIZE; the
 * loads use offset 1 because they are issued before the pointer bumps.
 */
.LL276:
FADD c01, t1, c01
add L, -1, L
FMUL a1, b1, t1
LDF [AO + 1 * SIZE], a1
LDF [BO + 1 * SIZE], b1
add BO, 1 * SIZE, BO
cmp L, 0
bg,pt %icc, .LL276
/* Delay slot (no annul bit): AO is advanced on every trip, including the
 * exit. */
add AO, 1 * SIZE, AO
/*
 * .LL279: write-back for the single-element tile.
 * Pending products are folded in (t1, t3 into c01; t2, t4 into c02), the
 * two accumulators are summed into c01 and the result is scaled by ALPHA.
 *   non-TRMM: C1[0] is loaded (a1 as scratch) and added before the store.
 *   TRMM:     the scaled sum is stored without reading C1.
 * C1 is not advanced in this block.
 */
.LL279:
#ifndef TRMMKERNEL
FADD c01, t1, c01
LDF [C1 + 0 * SIZE], a1
FADD c02, t2, c02
FADD c01, t3, c01
FADD c02, t4, c02
FADD c01, c02, c01
FMUL c01, ALPHA, c01
FADD c01, a1, c01
STF c01, [C1 + 0 * SIZE]
#else
FADD c01, t1, c01
FADD c02, t2, c02
FADD c01, t3, c01
FADD c02, t4, c02
FADD c01, c02, c01
FMUL c01, ALPHA, c01
STF c01, [C1 + 0 * SIZE]
/* TRMM only: for these LEFT/TRANSA combinations, move AO and BO forward by
 * (K - KK - 1) << BASE_SHIFT.  Both pointers use the same shift and both
 * #ifdef LEFT arms subtract 1. */
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
sub K, KK, TEMP1
#ifdef LEFT
add TEMP1, -1, TEMP1
#else
add TEMP1, -1, TEMP1
#endif
sll TEMP1, 0 + BASE_SHIFT, TEMP2
sll TEMP1, 0 + BASE_SHIFT, TEMP1
add AO, TEMP2, AO
add BO, TEMP1, BO
#endif
/* LEFT variant: KK advances by 1 after this tile. */
#ifdef LEFT
add KK, 1, KK
#endif
#endif
/*
 * .LL999: common exit.  Control returns to %i7 + 8 and %o0 is cleared in
 * the delay slot.
 * NOTE(review): per SPARC V9, `return` also restores the caller's register
 * window before the delay-slot instruction runs, so the cleared %o0 would
 * be the caller-visible return value (0) -- confirm against the
 * architecture manual.  EPILOGUE is a macro defined outside this chunk.
 */
.LL999:
return %i7 + 8
clr %o0
EPILOGUE