/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/
/*********************************************************************/

/*
 * Complex GEMM / TRMM micro-kernel for SPARC (preprocessed assembly).
 *
 * All instruction macros (PROLOGUE, SAVESP, LDF, STF, FADD, FSUB, FMUL,
 * FMOV, FCLR, EPILOGUE) and the constants SIZE, ZBASE_SHIFT and
 * STACK_START come from common.h.
 *
 * Structure
 *   .LL11  .. .LL99   : loop over N >> 1 pairs of C panels (C1, C2 = C1 + LDC)
 *     .LL21 .. .LL29  :   M >> 1 tiles, 4 SIZE-elements stored to C1 and to C2
 *     .LL50 .. .LL59  :   M & 1 tile,   2 SIZE-elements stored to C1 and to C2
 *   .LL100 .. .LL999  : N & 1 remainder panel (C1 only)
 *     .LL121 .. .LL129:   M >> 1 tiles, 4 SIZE-elements stored to C1
 *     .LL150 .. .LL159:   M & 1 tile,   2 SIZE-elements stored to C1
 *
 * Every K loop is unrolled four times (L = K >> 2) and followed by a
 * remainder loop (L = K & 3).
 *
 * Result handling
 *   - without TRMMKERNEL: C is loaded, alpha * accumulator is added with the
 *     usual complex product (real: R*re - I*im, imag: R*im + I*re), and the
 *     result is stored back.
 *   - with TRMMKERNEL: the accumulators are stored to C directly; C is not
 *     read and ALPHA_R / ALPHA_I are not applied.
 *     NOTE(review): presumably the caller passes alpha == 1 here - confirm.
 *
 * FADD1..FADD4 are FADD or FSUB depending on which of the variant macros
 * (NN, NT, ..., NR, ..., RN, ...) is defined at build time.
 *
 * a5 shares its register with ALPHA_I, so the 2x2 tile reloads ALPHA_I from
 * STACK_ALPHA at .LL25 after the unrolled loop has used a5.
 *
 * Branches have delay slots: the instruction written after each branch is
 * executed as part of the branch.
 */

#define ASSEMBLER
#include "common.h"

/* Integer registers: arguments and loop state. */
#define M	%i0
#define N	%i1
#define K	%i2
#define A	%i5
#define B	%i3
#define C	%i4

#define LDC	%o0
#define AO	%o1
#define BO	%o2
#define I	%o3
#define J	%o4
#define L	%o5

#define C1	%l0
#define C2	%l1

#define OFFSET	%l2
#define	KK	%l3
#define TEMP1	%l4
#define TEMP2	%l5

/* Floating-point registers: c01..c16 accumulators, t1..t4 products,
   a1..a5 / b1..b5 operand staging. */
#ifdef DOUBLE
#define c01	%f0
#define c02	%f2
#define c03	%f4
#define c04	%f6
#define c05	%f8
#define c06	%f10
#define c07	%f12
#define c08	%f14
#define c09	%f16
#define c10	%f18
#define c11	%f20
#define c12	%f22
#define c13	%f24
#define c14	%f26
#define c15	%f28
#define c16	%f30

#define t1	%f32
#define	t2 	%f34
#define t3	%f36
#define	t4 	%f38

#define a1	%f40
#define a2	%f42
#define a3	%f44
#define a4	%f46
#define a5	%f62

#define b1	%f48
#define b2	%f50
#define b3	%f52
#define b4	%f54
#define b5	%f56

#define FZERO	%f58
#define ALPHA_R	%f60
#define ALPHA_I	%f62

#else
#define c01	%f0
#define c02	%f1
#define c03	%f2
#define c04	%f3
#define c05	%f4
#define c06	%f5
#define c07	%f6
#define c08	%f7
#define c09	%f8
#define c10	%f9
#define c11	%f10
#define c12	%f11
#define c13	%f12
#define c14	%f13
#define c15	%f14
#define c16	%f15

#define t1	%f16
#define	t2 	%f17
#define t3	%f18
#define	t4 	%f19

#define a1	%f20
#define a2	%f21
#define a3	%f22
#define a4	%f23
#define a5	%f31

#define b1	%f24
#define b2	%f25
#define b3	%f26
#define b4	%f27
#define b5	%f28

#define FZERO	%f29
#define ALPHA_R	%f30
#define ALPHA_I	%f31
#endif

/* Sign selection for the four partial-product accumulations. */
#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define FADD1	FADD
#define FADD2	FADD
#define FADD3	FADD
#define FADD4	FSUB
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define FADD1	FADD
#define FADD2	FADD
#define FADD3	FSUB
#define FADD4	FADD
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define FADD1	FADD
#define FADD2	FSUB
#define FADD3	FADD
#define FADD4	FADD
#else
#define FADD1	FADD
#define FADD2	FSUB
#define FADD3	FSUB
#define FADD4	FSUB
#endif

#define APREFETCHSIZE 40
#define BPREFETCHSIZE 40

#define APREFETCH_CATEGORY 0
#define BPREFETCH_CATEGORY 0

	PROLOGUE
	SAVESP

/* Stack slot that keeps a copy of the imaginary part of alpha. */
#ifndef __64BIT__
#ifdef DOUBLE
#define STACK_ALPHA	[%sp + STACK_START + 24]
#else
#define STACK_ALPHA	[%sp + STACK_START + 20]
#endif
#else
#define STACK_ALPHA	[%sp + STACK_START + 40]
#endif

/* Argument pickup: alpha is moved into ALPHA_R / ALPHA_I and the remaining
   arguments are loaded from the stack. */
#ifndef __64BIT__
#ifdef DOUBLE
	/* spill %i3..%i5 so alpha can be reloaded as two doubles */
	st	%i3, [%sp + STACK_START + 16]
	st	%i4, [%sp + STACK_START + 20]
	st	%i5, [%sp + STACK_START + 24]

	ld	[%sp + STACK_START + 32], A
	ld	[%sp + STACK_START + 36], B
	ld	[%sp + STACK_START + 40], C
	ld	[%sp + STACK_START + 44], LDC
#ifdef TRMMKERNEL
	ld	[%sp + STACK_START + 48], OFFSET
#endif

	ldd	[%sp + STACK_START + 16], ALPHA_R
	ldd	[%sp + STACK_START + 24], ALPHA_I
#else
	st	%i3, [%sp + STACK_START + 16]
	st	%i4, [%sp + STACK_START + 20]

	ld	[%sp + STACK_START + 28], B
	ld	[%sp + STACK_START + 32], C
	ld	[%sp + STACK_START + 36], LDC
#ifdef TRMMKERNEL
	ld	[%sp + STACK_START + 40], OFFSET
#endif

	ld	[%sp + STACK_START + 16], ALPHA_R
	ld	[%sp + STACK_START + 20], ALPHA_I
#endif
#else

#ifdef DOUBLE
	FMOV	%f6, ALPHA_R
	FMOV	%f8, ALPHA_I
	STF	%f8, STACK_ALPHA
#else
	FMOV	%f7, ALPHA_R
	FMOV	%f9, ALPHA_I
	STF	%f9, STACK_ALPHA
#endif

	ldx	[%sp+ STACK_START + 56], B
	nop
	ldx	[%sp+ STACK_START + 64], C
	nop
	ldx	[%sp+ STACK_START + 72], LDC
#ifdef TRMMKERNEL
	ldx	[%sp+ STACK_START + 80], OFFSET
#endif

	LDF	[%sp + STACK_START + 32], FZERO
#endif

	/* NOTE(review): FCLR is defined in common.h; presumably 27 / 29 select
	   the FZERO register for the double / single build - confirm. */
#ifdef DOUBLE
	FCLR(27)
#else
	FCLR(29)
#endif

#if defined(TRMMKERNEL) && !defined(LEFT)
	neg	OFFSET, KK
#endif

	sra	N, 1, J
	cmp	J, 0
	ble,pn	%icc, .LL100
	sll	LDC, ZBASE_SHIFT, LDC		/* delay slot: LDC <<= ZBASE_SHIFT */

/* ------------------------------------------------------------------ */
/* Panel pair: C1 = C, C2 = C + LDC; C advances by 2 * LDC.            */
/* ------------------------------------------------------------------ */
.LL11:
	sra	M, 1, I
	FMOV	FZERO, t1
	add	C, LDC, C2
	FMOV	FZERO, t2

	mov	C, C1
	FMOV	FZERO, t3
	cmp	I, 0

#if defined(TRMMKERNEL) &&  defined(LEFT)
	mov	OFFSET, KK
#endif

	mov	A, AO
	add	C2, LDC, C
	nop
	ble,pn	%icc, .LL50
	FMOV	FZERO, t4

/* 2x2 tile: clear c01..c16, preload a1..a5 / b1..b5, L = k-count >> 2. */
.LL21:
#if !defined(TRMMKERNEL)
	sra	K, 2, L
	FMOV	FZERO, c01
	cmp	L, 0
	FMOV	FZERO, c02

	LDF	[AO + 0 * SIZE], a1
	FMOV	FZERO, c03
	LDF	[B + 0 * SIZE], b1
	FMOV	FZERO, c04

	LDF	[AO + 1 * SIZE], a2
	FMOV	FZERO, c05
	LDF	[B + 1 * SIZE], b2
	FMOV	FZERO, c06

	LDF	[AO + 2 * SIZE], a3
	FMOV	FZERO, c07
	LDF	[B + 2 * SIZE], b3
	FMOV	FZERO, c08

	LDF	[AO + 3 * SIZE], a4
	FMOV	FZERO, c09
	LDF	[B + 3 * SIZE], b4
	FMOV	FZERO, c10

	LDF	[B + 4 * SIZE], b5
	FMOV	FZERO, c11
	LDF	[AO + 4 * SIZE], a5		/* overwrites ALPHA_I (same register) */
	FMOV	FZERO, c12

	prefetch [C1 + 3 * SIZE], 3
	FMOV	FZERO, c13
	prefetch [C2 + 3 * SIZE], 3
	FMOV	FZERO, c14
	mov	B, BO

#else

#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	B, BO
#else
	sll	KK, 1 + ZBASE_SHIFT, TEMP1
	add	AO, TEMP1, AO
	add	B, TEMP1, BO
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 2, L
#else
	add	KK, 2, L
#endif
	sra	L, 2, L
	cmp	L, 0

	FMOV	FZERO, c01
	FMOV	FZERO, c02

	LDF	[AO + 0 * SIZE], a1
	FMOV	FZERO, c03
	LDF	[BO + 0 * SIZE], b1
	FMOV	FZERO, c04

	LDF	[AO + 1 * SIZE], a2
	FMOV	FZERO, c05
	LDF	[BO + 1 * SIZE], b2
	FMOV	FZERO, c06

	LDF	[AO + 2 * SIZE], a3
	FMOV	FZERO, c07
	LDF	[BO + 2 * SIZE], b3
	FMOV	FZERO, c08

	LDF	[AO + 3 * SIZE], a4
	FMOV	FZERO, c09
	LDF	[BO + 3 * SIZE], b4
	FMOV	FZERO, c10

	LDF	[BO + 4 * SIZE], b5
	FMOV	FZERO, c11
	LDF	[AO + 4 * SIZE], a5		/* overwrites ALPHA_I (same register) */
	FMOV	FZERO, c12

	prefetch [C1 + 3 * SIZE], 3
	FMOV	FZERO, c13
	prefetch [C2 + 3 * SIZE], 3
	FMOV	FZERO, c14

#endif

	FMOV	FZERO, c15
	ble,pn	%icc, .LL25
	FMOV	FZERO, c16

/* 2x2 tile, main loop: four k-steps per iteration; AO and BO each
   advance by 16 * SIZE. */
.LL22:
	/* k-step 0 */
	FADD2	c04, t1, c04
	prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY
	FMUL	a1, b1, t1
	nop

	FADD4	c08, t2, c08
	prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY
	FMUL	a1, b2, t2
	add	AO, 16 * SIZE, AO

	FADD2	c12, t3, c12
	LDF	[AO - 13 * SIZE], a4
	FMUL	a1, b3, t3
	add	BO, 16 * SIZE, BO

	FADD4	c16, t4, c16
	nop
	FMUL	a1, b4, t4
	LDF	[AO - 8 * SIZE], a1

	FADD1	c01, t1, c01
	nop
	FMUL	a2, b1, t1
	nop

	FADD3	c05, t2, c05
	nop
	FMUL	a2, b2, t2
	nop

	FADD1	c09, t3, c09
	nop
	FMUL	a2, b3, t3
	nop

	FADD3	c13, t4, c13
	add	L, -1, L
	FMUL	a2, b4, t4
	LDF	[AO - 11 * SIZE], a2

	FADD2	c02, t1, c02
	nop
	FMUL	a3, b1, t1
	nop

	FADD4	c06, t2, c06
	nop
	FMUL	a3, b2, t2
	nop

	FADD2	c10, t3, c10
	nop
	FMUL	a3, b3, t3
	nop

	FADD4	c14, t4, c14
	nop
	FMUL	a3, b4, t4
	LDF	[AO - 10 * SIZE], a3

	FADD1	c03, t1, c03
	nop
	FMUL	a4, b1, t1
	LDF	[BO - 8 * SIZE], b1

	FADD3	c07, t2, c07
	nop
	FMUL	a4, b2, t2
	LDF	[BO - 11 * SIZE], b2

	FADD1	c11, t3, c11
	nop
	FMUL	a4, b3, t3
	LDF	[BO - 10 * SIZE], b3

	FADD3	c15, t4, c15
	nop
	FMUL	a4, b4, t4
	LDF	[BO - 9 * SIZE], b4

	/* k-step 1 (uses a5 / b5 as the leading operands) */
	FADD2	c04, t1, c04
	nop
	FMUL	a5, b5, t1
	LDF	[AO - 9 * SIZE], a4

	FADD4	c08, t2, c08
	nop
	FMUL	a5, b2, t2
	nop

	FADD2	c12, t3, c12
	nop
	FMUL	a5, b3, t3
	nop

	FADD4	c16, t4, c16
	nop
	FMUL	a5, b4, t4
	LDF	[AO - 4 * SIZE], a5

	FADD1	c01, t1, c01
	nop
	FMUL	a2, b5, t1
	nop

	FADD3	c05, t2, c05
	nop
	FMUL	a2, b2, t2
	nop

	FADD1	c09, t3, c09
	nop
	FMUL	a2, b3, t3
	nop

	FADD3	c13, t4, c13
	nop
	FMUL	a2, b4, t4
	LDF	[AO - 7 * SIZE], a2

	FADD2	c02, t1, c02
	nop
	FMUL	a3, b5, t1
	nop

	FADD4	c06, t2, c06
	nop
	FMUL	a3, b2, t2
	nop

	FADD2	c10, t3, c10
	nop
	FMUL	a3, b3, t3
	nop

	FADD4	c14, t4, c14
	nop
	FMUL	a3, b4, t4
	LDF	[AO - 6 * SIZE], a3

	FADD1	c03, t1, c03
	nop
	FMUL	a4, b5, t1
	LDF	[BO - 4 * SIZE], b5

	FADD3	c07, t2, c07
	nop
	FMUL	a4, b2, t2
	LDF	[BO - 7 * SIZE], b2

	FADD1	c11, t3, c11
	nop
	FMUL	a4, b3, t3
	LDF	[BO - 6 * SIZE], b3

	FADD3	c15, t4, c15
	nop
	FMUL	a4, b4, t4
	LDF	[BO - 5 * SIZE], b4

	/* k-step 2 */
	FADD2	c04, t1, c04
	nop
	FMUL	a1, b1, t1
	LDF	[AO - 5 * SIZE], a4

	FADD4	c08, t2, c08
	nop
	FMUL	a1, b2, t2
	nop

	FADD2	c12, t3, c12
	nop
	FMUL	a1, b3, t3
	nop

	FADD4	c16, t4, c16
	nop
	FMUL	a1, b4, t4
	LDF	[AO - 0 * SIZE], a1

	FADD1	c01, t1, c01
	nop
	FMUL	a2, b1, t1
	nop

#ifdef DOUBLE
	prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY
#else
	nop
#endif
	FADD3	c05, t2, c05
	nop
	FMUL	a2, b2, t2

	FADD1	c09, t3, c09
	nop
	FMUL	a2, b3, t3
	nop

	FADD3	c13, t4, c13
	nop
	FMUL	a2, b4, t4
	nop

	FADD2	c02, t1, c02
	nop
	FMUL	a3, b1, t1
	LDF	[AO - 3 * SIZE], a2

	FADD4	c06, t2, c06
#ifdef DOUBLE
	prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY
#else
	nop
#endif
	FMUL	a3, b2, t2
	nop

	FADD2	c10, t3, c10
	nop
	FMUL	a3, b3, t3
	nop

	FADD4	c14, t4, c14
	nop
	FMUL	a3, b4, t4
	LDF	[AO - 2 * SIZE], a3

	FADD1	c03, t1, c03
	nop
	FMUL	a4, b1, t1
	LDF	[BO - 0 * SIZE], b1

	FADD3	c07, t2, c07
	nop
	FMUL	a4, b2, t2
	LDF	[BO - 3 * SIZE], b2

	FADD1	c11, t3, c11
	nop
	FMUL	a4, b3, t3
	LDF	[BO - 2 * SIZE], b3

	FADD3	c15, t4, c15
	nop
	FMUL	a4, b4, t4
	LDF	[BO - 1 * SIZE], b4

	/* k-step 3 (uses a5 / b5 as the leading operands) */
	FADD2	c04, t1, c04
	nop
	FMUL	a5, b5, t1
	LDF	[AO - 1 * SIZE], a4

	FADD4	c08, t2, c08
	FMUL	a5, b2, t2
	FADD2	c12, t3, c12
	FMUL	a5, b3, t3

	FADD4	c16, t4, c16
	nop
	FMUL	a5, b4, t4
	LDF	[AO + 4 * SIZE], a5

	FADD1	c01, t1, c01
	nop
	FMUL	a2, b5, t1
	nop

	FADD3	c05, t2, c05
	nop
	FMUL	a2, b2, t2
	nop

	FADD1	c09, t3, c09
	nop
	FMUL	a2, b3, t3
	nop

	FADD3	c13, t4, c13
	nop
	FMUL	a2, b4, t4
	LDF	[AO + 1 * SIZE], a2

	FADD2	c02, t1, c02
	nop
	FMUL	a3, b5, t1
	nop

	FADD4	c06, t2, c06
	nop
	FMUL	a3, b2, t2
	nop

	FADD2	c10, t3, c10
	nop
	FMUL	a3, b3, t3
	nop

	FADD4	c14, t4, c14
	nop
	FMUL	a3, b4, t4
	LDF	[AO + 2 * SIZE], a3

	FADD1	c03, t1, c03
	cmp	L, 0
	FMUL	a4, b5, t1
	LDF	[BO + 4 * SIZE], b5

	FADD3	c07, t2, c07
	nop
	FMUL	a4, b2, t2
	LDF	[BO + 1 * SIZE], b2

	FADD1	c11, t3, c11
	nop
	FMUL	a4, b3, t3
	LDF	[BO + 2 * SIZE], b3

	FADD3	c15, t4, c15
	FMUL	a4, b4, t4
	bg,pt	%icc, .LL22
	LDF	[BO + 3 * SIZE], b4

/* 2x2 tile, remainder count: L = k-count & 3. */
.LL25:
#ifndef TRMMKERNEL
	and	K, 3, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 2, L
#else
	add	KK, 2, L
#endif
	and	L, 3, L
#endif
	cmp	L, 0
	ble,pn %icc, .LL29
	LDF	STACK_ALPHA, ALPHA_I		/* delay slot: restore ALPHA_I after a5 use */

/* 2x2 tile, remainder loop: one k-step per iteration. */
.LL26:
	FADD2	c04, t1, c04
	LDF	[AO + 3 * SIZE], a4
	FMUL	a1, b1, t1
	add	AO, 4 * SIZE, AO

	FADD4	c08, t2, c08
	add	BO, 4 * SIZE, BO
	FMUL	a1, b2, t2
	add	L, -1, L

	FADD2	c12, t3, c12
	nop
	FMUL	a1, b3, t3
	cmp	L, 0

	FADD4	c16, t4, c16
	nop
	FMUL	a1, b4, t4
	LDF	[AO + 0 * SIZE], a1

	FADD1	c01, t1, c01
	nop
	FMUL	a2, b1, t1
	nop

	FADD3	c05, t2, c05
	nop
	FMUL	a2, b2, t2
	nop

	FADD1	c09, t3, c09
	nop
	FMUL	a2, b3, t3
	nop

	FADD3	c13, t4, c13
	nop
	FMUL	a2, b4, t4
	LDF	[AO + 1 * SIZE], a2

	FADD2	c02, t1, c02
	nop
	FMUL	a3, b1, t1
	nop

	FADD4	c06, t2, c06
	nop
	FMUL	a3, b2, t2
	nop

	FADD2	c10, t3, c10
	nop
	FMUL	a3, b3, t3
	nop

	FADD4	c14, t4, c14
	nop
	FMUL	a3, b4, t4
	LDF	[AO + 2 * SIZE], a3

	FADD1	c03, t1, c03
	nop
	FMUL	a4, b1, t1
	LDF	[BO + 0 * SIZE], b1

	FADD3	c07, t2, c07
	nop
	FMUL	a4, b2, t2
	LDF	[BO + 1 * SIZE], b2

	FADD1	c11, t3, c11
	nop
	FMUL	a4, b3, t3
	LDF	[BO + 2 * SIZE], b3

	FADD3	c15, t4, c15
	FMUL	a4, b4, t4
	bg,pt	%icc, .LL26
	LDF	[BO + 3 * SIZE], b4

/* 2x2 tile, write-back: drain t1..t4, fold the 16 partial sums into
   8 values, then update C1 / C2. */
.LL29:
#ifndef TRMMKERNEL
	FADD2	c04, t1, c04
	LDF	[C1 + 0 * SIZE], a1
	FADD4	c08, t2, c08
	LDF	[C1 + 1 * SIZE], a2
	FADD2	c12, t3, c12
	LDF	[C1 + 2 * SIZE], a3
	FADD4	c16, t4, c16
	LDF	[C1 + 3 * SIZE], a4

	FADD	c01, c06, c01
	LDF	[C2 + 0 * SIZE], b1
	FADD	c02, c05, c02
	LDF	[C2 + 1 * SIZE], b2
	FADD	c03, c08, c03
	LDF	[C2 + 2 * SIZE], b3
	FADD	c04, c07, c04
	LDF	[C2 + 3 * SIZE], b4

	FADD	c09, c14, c09
	FMUL	ALPHA_R, c01, t1
	FADD	c10, c13, c10
	FMUL	ALPHA_R, c02, t2
	FADD	c11, c16, c11
	FMUL	ALPHA_R, c03, t3
	FADD	c12, c15, c12
	FMUL	ALPHA_R, c04, t4

	FADD	a1, t1, a1
	FMUL	ALPHA_I, c02, t1
	FADD	a2, t2, a2
	FMUL	ALPHA_I, c01, t2
	FADD	a3, t3, a3
	FMUL	ALPHA_I, c04, t3
	FADD	a4, t4, a4
	FMUL	ALPHA_I, c03, t4

	FSUB	a1, t1, a1
	FMUL	ALPHA_R, c09, t1
	FADD	a2, t2, a2
	FMUL	ALPHA_R, c10, t2
	FSUB	a3, t3, a3
	FMUL	ALPHA_R, c11, t3
	FADD	a4, t4, a4
	FMUL	ALPHA_R, c12, t4

	FADD	b1, t1, b1
	FMUL	ALPHA_I, c10, t1
	FADD	b2, t2, b2
	FMUL	ALPHA_I, c09, t2
	FADD	b3, t3, b3
	FMUL	ALPHA_I, c12, t3
	FADD	b4, t4, b4
	FMUL	ALPHA_I, c11, t4

	STF	a1, [C1 + 0 * SIZE]
	FSUB	b1, t1, b1
	STF	a2, [C1 + 1 * SIZE]
	FADD	b2, t2, b2
	STF	a3, [C1 + 2 * SIZE]
	FSUB	b3, t3, b3
	STF	a4, [C1 + 3 * SIZE]
	FADD	b4, t4, b4

	STF	b1, [C2 + 0 * SIZE]
	FMOV	FZERO, t1
	STF	b2, [C2 + 1 * SIZE]
	FMOV	FZERO, t2
	STF	b3, [C2 + 2 * SIZE]
	FMOV	FZERO, t3
	STF	b4, [C2 + 3 * SIZE]
	FMOV	FZERO, t4
#else
	FADD2	c04, t1, c04
	FADD4	c08, t2, c08
	FADD2	c12, t3, c12
	FADD4	c16, t4, c16

	FADD	c01, c06, c01
	FADD	c02, c05, c02
	FADD	c03, c08, c03
	FADD	c04, c07, c04

	STF	c01, [C1 + 0 * SIZE]
	FADD	c09, c14, c09
	STF	c02, [C1 + 1 * SIZE]
	FADD	c10, c13, c10
	STF	c03, [C1 + 2 * SIZE]
	FADD	c11, c16, c11
	STF	c04, [C1 + 3 * SIZE]
	FADD	c12, c15, c12

	STF	c09, [C2 + 0 * SIZE]
	FMOV	FZERO, t1
	STF	c10, [C2 + 1 * SIZE]
	FMOV	FZERO, t2
	STF	c11, [C2 + 2 * SIZE]
	FMOV	FZERO, t3
	STF	c12, [C2 + 3 * SIZE]
	FMOV	FZERO, t4
#endif

	add	C1, 4 * SIZE, C1
	add	C2, 4 * SIZE, C2

#ifdef TRMMKERNEL
#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	/* skip the part of the A / B panels this tile did not consume */
	sub	K, KK, TEMP1
#ifdef LEFT
	add	TEMP1, -2, TEMP1
#else
	add	TEMP1, -2, TEMP1
#endif
	sll	TEMP1, 1 + ZBASE_SHIFT, TEMP1
	add	AO, TEMP1, AO
	add	BO, TEMP1, BO
#endif

#ifdef LEFT
	add	KK, 2, KK
#endif
#endif

	add	I, -1, I
	cmp	I, 0

	bg,pt	%icc, .LL21
	FMOV	FZERO, c01

/* 1x2 tile (M & 1): accumulators c01..c08. */
.LL50:
	and	M, 1, I
	FMOV	FZERO, c02
	cmp	I, 0
	FMOV	FZERO, t1
	ble,pn	%icc, .LL99
	FMOV	FZERO, c04

#if !defined(TRMMKERNEL)
	LDF	[AO + 0 * SIZE], a1
	sra	K, 2, L
	FMOV	FZERO, t2
	LDF	[B + 0 * SIZE], b1
	mov	B, BO
	FMOV	FZERO, c06
	LDF	[AO + 1 * SIZE], a2
	cmp	L, 0
	FMOV	FZERO, t3
	LDF	[B + 1 * SIZE], b2
	FMOV	FZERO, c08
	LDF	[AO + 2 * SIZE], a3
	FMOV	FZERO, t4
	LDF	[B + 2 * SIZE], b3
	FMOV	FZERO, c01
	LDF	[AO + 3 * SIZE], a4
	FMOV	FZERO, c03
	LDF	[B + 3 * SIZE], b4
	FMOV	FZERO, c05
#else
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	B, BO
#else
	sll	KK, 0 + ZBASE_SHIFT, TEMP1
	sll	KK, 1 + ZBASE_SHIFT, TEMP2
	add	AO, TEMP1, AO
	add	B, TEMP2, BO
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 1, L
#else
	add	KK, 2, L
#endif
	sra	L, 2, L
	cmp	L, 0

	LDF	[AO + 0 * SIZE], a1
	FMOV	FZERO, t2
	LDF	[BO + 0 * SIZE], b1
	FMOV	FZERO, c06
	LDF	[AO + 1 * SIZE], a2
	FMOV	FZERO, t3
	LDF	[BO + 1 * SIZE], b2
	FMOV	FZERO, c08
	LDF	[AO + 2 * SIZE], a3
	FMOV	FZERO, t4
	LDF	[BO + 2 * SIZE], b3
	FMOV	FZERO, c01
	LDF	[AO + 3 * SIZE], a4
	FMOV	FZERO, c03
	LDF	[BO + 3 * SIZE], b4
	FMOV	FZERO, c05
#endif

	ble,pn	%icc, .LL55
	FMOV	FZERO, c07

/* 1x2 tile, main loop: four k-steps; AO += 8 * SIZE, BO += 16 * SIZE. */
.LL52:
	FADD2	c02, t1, c02
	add	AO, 8 * SIZE, AO
	prefetch [AO + APREFETCHSIZE * SIZE], 0

	FMUL	a1, b1, t1
	add	BO, 16 * SIZE, BO

	FADD4	c04, t2, c04
	add	L, -1, L
	FMUL	a1, b2, t2

	FADD2	c06, t3, c06
	cmp	L, 0
	FMUL	a1, b3, t3

	FADD4	c08, t4, c08
	FMUL	a1, b4, t4
	LDF	[AO - 4 * SIZE], a1

	FADD1	c01, t1, c01
	FMUL	a2, b1, t1
	LDF	[BO - 12 * SIZE], b1
	FADD3	c03, t2, c03
	FMUL	a2, b2, t2
	LDF	[BO - 11 * SIZE], b2

	FADD1	c05, t3, c05
	FMUL	a2, b3, t3
	LDF	[BO - 10 * SIZE], b3
	FADD3	c07, t4, c07
	FMUL	a2, b4, t4
	LDF	[BO - 9 * SIZE], b4

	FADD2	c02, t1, c02
	FMUL	a3, b1, t1
	LDF	[AO - 3 * SIZE], a2
	FADD4	c04, t2, c04
	FMUL	a3, b2, t2

	FADD2	c06, t3, c06
	FMUL	a3, b3, t3
	FADD4	c08, t4, c08
	FMUL	a3, b4, t4
	LDF	[AO - 2 * SIZE], a3

	FADD1	c01, t1, c01
	FMUL	a4, b1, t1
	LDF	[BO - 8 * SIZE], b1
	FADD3	c03, t2, c03
	FMUL	a4, b2, t2
	LDF	[BO - 7 * SIZE], b2

	FADD1	c05, t3, c05
	FMUL	a4, b3, t3
	LDF	[BO - 6 * SIZE], b3
	FADD3	c07, t4, c07
	FMUL	a4, b4, t4
	LDF	[BO - 5 * SIZE], b4

	FADD2	c02, t1, c02
	FMUL	a1, b1, t1
	LDF	[AO - 1 * SIZE], a4
	FADD4	c04, t2, c04
	FMUL	a1, b2, t2

	FADD2	c06, t3, c06
	FMUL	a1, b3, t3
	FADD4	c08, t4, c08
	FMUL	a1, b4, t4
	LDF	[AO + 0 * SIZE], a1

	FADD1	c01, t1, c01
	FMUL	a2, b1, t1
	LDF	[BO - 4 * SIZE], b1

	FADD3	c03, t2, c03
	FMUL	a2, b2, t2
	LDF	[BO - 3 * SIZE], b2

	FADD1	c05, t3, c05
	FMUL	a2, b3, t3
	LDF	[BO - 2 * SIZE], b3

	FADD3	c07, t4, c07
	FMUL	a2, b4, t4
	LDF	[BO - 1 * SIZE], b4

	FADD2	c02, t1, c02
	FMUL	a3, b1, t1
	LDF	[AO + 1 * SIZE], a2

	FADD4	c04, t2, c04
	FMUL	a3, b2, t2

	FADD2	c06, t3, c06
	FMUL	a3, b3, t3

	FADD4	c08, t4, c08
	FMUL	a3, b4, t4
	LDF	[AO + 2 * SIZE], a3

	FADD1	c01, t1, c01
	FMUL	a4, b1, t1
	LDF	[BO + 0 * SIZE], b1

	FADD3	c03, t2, c03
	FMUL	a4, b2, t2
	LDF	[BO + 1 * SIZE], b2

	FADD1	c05, t3, c05
	FMUL	a4, b3, t3
	LDF	[BO + 2 * SIZE], b3

	FADD3	c07, t4, c07
	FMUL	a4, b4, t4
	LDF	[BO + 3 * SIZE], b4

	bg,pt	%icc, .LL52
	LDF	[AO + 3 * SIZE], a4

/* 1x2 tile, remainder count. */
.LL55:
#ifndef TRMMKERNEL
	and	K, 3, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 1, L
#else
	add	KK, 2, L
#endif
	and	L, 3, L
#endif
	cmp	L, 0
	ble,a,pn %icc, .LL59
	nop

/* 1x2 tile, remainder loop: one k-step per iteration. */
.LL56:
	FADD2	c02, t1, c02
	add	AO, 2 * SIZE, AO
	FMUL	a1, b1, t1
	add	L, -1, L

	add	BO, 4 * SIZE, BO
	FADD4	c04, t2, c04
	cmp	L, 0
	FMUL	a1, b2, t2

	FADD2	c06, t3, c06
	FMUL	a1, b3, t3

	FADD4	c08, t4, c08
	FMUL	a1, b4, t4
	LDF	[AO + 0 * SIZE], a1

	FADD1	c01, t1, c01
	FMUL	a2, b1, t1
	LDF	[BO + 0 * SIZE], b1
	FADD3	c03, t2, c03
	FMUL	a2, b2, t2
	LDF	[BO + 1 * SIZE], b2

	FADD1	c05, t3, c05
	FMUL	a2, b3, t3
	LDF	[BO + 2 * SIZE], b3
	FADD3	c07, t4, c07
	FMUL	a2, b4, t4
	LDF	[BO + 3 * SIZE], b4

	bg,pt	%icc, .LL56
	LDF	[AO + 1 * SIZE], a2

/* 1x2 tile, write-back. */
.LL59:
#ifndef TRMMKERNEL
	FADD2	c02, t1, c02
	LDF	[C1 + 0 * SIZE], a1
	FADD4	c04, t2, c04
	LDF	[C1 + 1 * SIZE], a2
	FADD2	c06, t3, c06
	LDF	[C2 + 0 * SIZE], a3
	FADD4	c08, t4, c08
	LDF	[C2 + 1 * SIZE], a4

	FADD	c01, c04, c01
	FMUL	ALPHA_R, c01, t1
	FADD	c02, c03, c02
	FMUL	ALPHA_R, c02, t2
	FADD	c05, c08, c05
	FMUL	ALPHA_R, c05, t3
	FADD	c06, c07, c06
	FMUL	ALPHA_R, c06, t4

	FADD	a1, t1, a1
	FMUL	ALPHA_I, c02, t1
	FADD	a2, t2, a2
	FMUL	ALPHA_I, c01, t2
	FADD	a3, t3, a3
	FMUL	ALPHA_I, c06, t3
	FADD	a4, t4, a4
	FMUL	ALPHA_I, c05, t4

	FSUB	a1, t1, a1
	FADD	a2, t2, a2
	FSUB	a3, t3, a3
	FADD	a4, t4, a4

	STF	a1, [C1 + 0 * SIZE]
	FMOV	FZERO, t1
	STF	a2, [C1 + 1 * SIZE]
	FMOV	FZERO, t2
	STF	a3, [C2 + 0 * SIZE]
	FMOV	FZERO, t3
	STF	a4, [C2 + 1 * SIZE]
	FMOV	FZERO, t4
#else

	FADD2	c02, t1, c02
	FADD4	c04, t2, c04
	FADD2	c06, t3, c06
	FADD4	c08, t4, c08

	FADD	c01, c04, c01
	FADD	c02, c03, c02
	FADD	c05, c08, c05
	FADD	c06, c07, c06

	STF	c01, [C1 + 0 * SIZE]
	FMOV	FZERO, t1
	STF	c02, [C1 + 1 * SIZE]
	FMOV	FZERO, t2
	STF	c05, [C2 + 0 * SIZE]
	FMOV	FZERO, t3
	STF	c06, [C2 + 1 * SIZE]
	FMOV	FZERO, t4
#endif

	add	C1, 2 * SIZE, C1
	add	C2, 2 * SIZE, C2

#ifdef TRMMKERNEL
#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	sub	K, KK, TEMP1
#ifdef LEFT
	add	TEMP1, -1, TEMP1
#else
	add	TEMP1, -2, TEMP1
#endif
	sll	TEMP1, 0 + ZBASE_SHIFT, TEMP2
	sll	TEMP1, 1 + ZBASE_SHIFT, TEMP1
	add	AO, TEMP2, AO
	add	BO, TEMP1, BO
#endif

#ifdef LEFT
	add	KK, 1, KK
#endif
#endif

/* End of one panel pair: B continues where BO stopped. */
.LL99:
	add	J, -1, J
	mov	BO, B
	cmp	J, 0
	bg,pt	%icc, .LL11
#if defined(TRMMKERNEL) && !defined(LEFT)
	add	KK, 2, KK
#else
	nop
#endif

/* ------------------------------------------------------------------ */
/* Remainder panel (N & 1): only C1 is written.                        */
/* ------------------------------------------------------------------ */
.LL100:
	sra	M, 1, I
	and	N, 1, J

	cmp	J, 0
	ble,pn	%icc, .LL999
	mov	A, AO

	mov	C, C1
	add	C, LDC, C

#if defined(TRMMKERNEL) &&  defined(LEFT)
	mov	OFFSET, KK
#endif

	cmp	I, 0
	ble,pn	%icc, .LL150
	FMOV	FZERO, c03

/* 2x1 tile: accumulators c01..c08. */
.LL121:
#if !defined(TRMMKERNEL)
	LDF	[AO + 0 * SIZE], a1
	sra	K, 2, L
	FMOV	FZERO, t1
	LDF	[B + 0 * SIZE], b1
	mov	B, BO
	FMOV	FZERO, c07

	LDF	[AO + 1 * SIZE], a2
	cmp	L, 0
	FMOV	FZERO, t2
	LDF	[B + 1 * SIZE], b2
	FMOV	FZERO, c04

	LDF	[AO + 2 * SIZE], a3
	FMOV	FZERO, t3
	LDF	[B + 2 * SIZE], b3
	FMOV	FZERO, c08

	LDF	[AO + 3 * SIZE], a4
	FMOV	FZERO, t4
	LDF	[B + 3 * SIZE], b4
	FMOV	FZERO, c01

	prefetch [C1 + 3 * SIZE], 3
	FMOV	FZERO, c05
	FMOV	FZERO, c02
#else
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	B, BO
#else
	sll	KK, 1 + ZBASE_SHIFT, TEMP1
	sll	KK, 0 + ZBASE_SHIFT, TEMP2
	add	AO, TEMP1, AO
	add	B, TEMP2, BO
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 2, L
#else
	add	KK, 1, L
#endif
	sra	L, 2, L
	cmp	L, 0

	LDF	[AO + 0 * SIZE], a1
	FMOV	FZERO, t1
	LDF	[BO + 0 * SIZE], b1
	FMOV	FZERO, c07

	LDF	[AO + 1 * SIZE], a2
	FMOV	FZERO, t2
	LDF	[BO + 1 * SIZE], b2
	FMOV	FZERO, c04

	LDF	[AO + 2 * SIZE], a3
	FMOV	FZERO, t3
	LDF	[BO + 2 * SIZE], b3
	FMOV	FZERO, c08

	LDF	[AO + 3 * SIZE], a4
	FMOV	FZERO, t4
	LDF	[BO + 3 * SIZE], b4
	FMOV	FZERO, c01

	prefetch [C1 + 3 * SIZE], 3
	FMOV	FZERO, c05
	FMOV	FZERO, c02
#endif

	ble,pn	%icc, .LL125
	FMOV	FZERO, c06

/* 2x1 tile, main loop: four k-steps; AO += 16 * SIZE, BO += 8 * SIZE. */
.LL122:
	FADD1	c03, t1, c03
	add	L, -1, L
	FMUL	a1, b1, t1
	prefetch [AO + APREFETCHSIZE * SIZE], 0

	FADD3	c07, t2, c07
	add	BO, 8 * SIZE, BO
	FMUL	a1, b2, t2
	LDF	[AO + 4 * SIZE], a1

	FADD2	c04, t3, c04
	add	AO, 16 * SIZE, AO
	FMUL	a2, b1, t3
	cmp	L, 0

	FADD4	c08, t4, c08
	nop
	FMUL	a2, b2, t4
	LDF	[AO - 11 * SIZE], a2

	FADD1	c01, t1, c01
	nop
	FMUL	a3, b1, t1
	nop

	FADD3	c05, t2, c05
	nop
	FMUL	a3, b2, t2
	LDF	[AO - 10 * SIZE], a3

	FADD2	c02, t3, c02
	nop
	FMUL	a4, b1, t3
	LDF	[BO - 4 * SIZE], b1

	FADD4	c06, t4, c06
	nop
	FMUL	a4, b2, t4
	LDF	[BO - 3 * SIZE], b2

	FADD1	c03, t1, c03
	nop
	FMUL	a1, b3, t1
	LDF	[AO - 9 * SIZE], a4

	FADD3	c07, t2, c07
	nop
	FMUL	a1, b4, t2
	LDF	[AO - 8 * SIZE], a1

	FADD2	c04, t3, c04
	nop
	FMUL	a2, b3, t3
	nop

	FADD4	c08, t4, c08
	nop
	FMUL	a2, b4, t4
	LDF	[AO - 7 * SIZE], a2

	FADD1	c01, t1, c01
	nop
	FMUL	a3, b3, t1
	nop

	FADD3	c05, t2, c05
	nop
	FMUL	a3, b4, t2
	LDF	[AO - 6 * SIZE], a3

	FADD2	c02, t3, c02
	nop
	FMUL	a4, b3, t3
	LDF	[BO - 2 * SIZE], b3

	FADD4	c06, t4, c06
	nop
	FMUL	a4, b4, t4
	LDF	[BO - 1 * SIZE], b4

	FADD1	c03, t1, c03
	nop
	FMUL	a1, b1, t1
	LDF	[AO - 5 * SIZE], a4

	FADD3	c07, t2, c07
	nop
	FMUL	a1, b2, t2
	LDF	[AO - 4 * SIZE], a1

	FADD2	c04, t3, c04
	nop
	FMUL	a2, b1, t3
	nop

	FADD4	c08, t4, c08
	nop
	FMUL	a2, b2, t4
	LDF	[AO - 3 * SIZE], a2

	FADD1	c01, t1, c01
	nop
	FMUL	a3, b1, t1
	nop

	FADD3	c05, t2, c05
	nop
	FMUL	a3, b2, t2
	LDF	[AO - 2 * SIZE], a3

	FADD2	c02, t3, c02
	nop
	FMUL	a4, b1, t3
	LDF	[BO + 0 * SIZE], b1

	FADD4	c06, t4, c06
	nop
	FMUL	a4, b2, t4
	LDF	[BO + 1 * SIZE], b2

	FADD1	c03, t1, c03
	nop
	FMUL	a1, b3, t1
	LDF	[AO - 1 * SIZE], a4

	FADD3	c07, t2, c07
	nop
	FMUL	a1, b4, t2
	LDF	[AO + 0 * SIZE], a1

	FADD2	c04, t3, c04
	nop
	FMUL	a2, b3, t3
	nop

	FADD4	c08, t4, c08
	nop
	FMUL	a2, b4, t4
	LDF	[AO + 1 * SIZE], a2

	FADD1	c01, t1, c01
	nop
	FMUL	a3, b3, t1
	nop

	FADD3	c05, t2, c05
	nop
	FMUL	a3, b4, t2
	LDF	[AO + 2 * SIZE], a3

	FADD2	c02, t3, c02
	nop
	FMUL	a4, b3, t3
	LDF	[BO + 2 * SIZE], b3

	FADD4	c06, t4, c06
	FMUL	a4, b4, t4
	LDF	[AO + 3 * SIZE], a4

	bg,pt	%icc, .LL122
	LDF	[BO + 3 * SIZE], b4

/* 2x1 tile, remainder count. */
.LL125:
#ifndef TRMMKERNEL
	and	K, 3, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 2, L
#else
	add	KK, 1, L
#endif
	and	L, 3, L
#endif
	cmp	L, 0
	ble,a,pn %icc, .LL129
	nop

/* 2x1 tile, remainder loop: one k-step per iteration. */
.LL126:
	FADD1	c03, t1, c03
	add	AO, 4 * SIZE, AO
	FMUL	a1, b1, t1
	add	BO, 2 * SIZE, BO

	FADD3	c07, t2, c07
	add	L, -1, L
	FMUL	a1, b2, t2
	LDF	[AO + 0 * SIZE], a1

	FADD2	c04, t3, c04
	cmp	L, 0
	FMUL	a2, b1, t3

	FADD4	c08, t4, c08
	FMUL	a2, b2, t4
	LDF	[AO + 1 * SIZE], a2

	FADD1	c01, t1, c01
	FMUL	a3, b1, t1
	FADD3	c05, t2, c05
	FMUL	a3, b2, t2
	LDF	[AO + 2 * SIZE], a3

	FADD2	c02, t3, c02
	FMUL	a4, b1, t3
	LDF	[BO + 0 * SIZE], b1
	FADD4	c06, t4, c06
	FMUL	a4, b2, t4
	LDF	[BO + 1 * SIZE], b2

	bg,pt	%icc, .LL126
	LDF	[AO + 3 * SIZE], a4

/* 2x1 tile, write-back. */
.LL129:
#ifndef TRMMKERNEL
	FADD1	c03, t1, c03
	LDF	[C1 + 0 * SIZE], a1
	FADD3	c07, t2, c07
	LDF	[C1 + 1 * SIZE], a2
	FADD2	c04, t3, c04
	LDF	[C1 + 2 * SIZE], a3
	FADD4	c08, t4, c08
	LDF	[C1 + 3 * SIZE], a4

	FADD	c01, c06, c01
	FMUL	ALPHA_R, c01, t1
	FADD	c02, c05, c02
	FMUL	ALPHA_R, c02, t2
	FADD	c03, c08, c03
	FMUL	ALPHA_R, c03, t3
	FADD	c04, c07, c04
	FMUL	ALPHA_R, c04, t4

	FADD	a1, t1, a1
	FMUL	ALPHA_I, c02, t1
	FADD	a2, t2, a2
	FMUL	ALPHA_I, c01, t2
	FADD	a3, t3, a3
	FMUL	ALPHA_I, c04, t3
	FADD	a4, t4, a4
	FMUL	ALPHA_I, c03, t4

	FSUB	a1, t1, a1
	FADD	a2, t2, a2
	FSUB	a3, t3, a3
	FADD	a4, t4, a4

	STF	a1, [C1 + 0 * SIZE]
	FMOV	FZERO, t1
	STF	a2, [C1 + 1 * SIZE]
	FMOV	FZERO, t2
	STF	a3, [C1 + 2 * SIZE]
	FMOV	FZERO, t3
	STF	a4, [C1 + 3 * SIZE]
	FMOV	FZERO, t4
#else
	FADD1	c03, t1, c03
	FADD3	c07, t2, c07
	FADD2	c04, t3, c04
	FADD4	c08, t4, c08

	FADD	c01, c06, c01
	FADD	c02, c05, c02
	FADD	c03, c08, c03
	FADD	c04, c07, c04

	STF	c01, [C1 + 0 * SIZE]
	FMOV	FZERO, t1
	STF	c02, [C1 + 1 * SIZE]
	FMOV	FZERO, t2
	STF	c03, [C1 + 2 * SIZE]
	FMOV	FZERO, t3
	STF	c04, [C1 + 3 * SIZE]
	FMOV	FZERO, t4
#endif

	add	C1, 4 * SIZE, C1

#ifdef TRMMKERNEL
#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	sub	K, KK, TEMP1
#ifdef LEFT
	add	TEMP1, -2, TEMP1
#else
	add	TEMP1, -1, TEMP1
#endif
	sll	TEMP1, 1 + ZBASE_SHIFT, TEMP2
	sll	TEMP1, 0 + ZBASE_SHIFT, TEMP1
	add	AO, TEMP2, AO
	add	BO, TEMP1, BO
#endif

#ifdef LEFT
	add	KK, 2, KK
#endif
#endif

	add	I, -1, I
	cmp	I, 0
	bg,pt	%icc, .LL121
	FMOV	FZERO, c03

/* 1x1 tile (M & 1, N & 1): accumulators c01..c04. */
.LL150:
	and	M, 1, I
	cmp	I, 0
	ble,pn	%icc, .LL999
	nop

#if !defined(TRMMKERNEL)
	LDF	[AO + 0 * SIZE], a1
	sra	K, 2, L
	FMOV	FZERO, c01

	LDF	[B + 0 * SIZE], b1
	mov	B, BO
	FMOV	FZERO, t1

	LDF	[AO + 1 * SIZE], a2
	cmp	L, 0
	FMOV	FZERO, c02
	LDF	[B + 1 * SIZE], b2
	FMOV	FZERO, t2

	LDF	[AO + 2 * SIZE], a3
	FMOV	FZERO, c03
	LDF	[B + 2 * SIZE], b3
	FMOV	FZERO, t3

	LDF	[AO + 3 * SIZE], a4
	FMOV	FZERO, c04
	LDF	[B + 3 * SIZE], b4
	FMOV	FZERO, t4
#else
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	B, BO
#else
	sll	KK, 0 + ZBASE_SHIFT, TEMP1
	sll	KK, 0 + ZBASE_SHIFT, TEMP2
	add	AO, TEMP1, AO
	add	B, TEMP2, BO
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 1, L
#else
	add	KK, 1, L
#endif
	sra	L, 2, L
	cmp	L, 0

	LDF	[AO + 0 * SIZE], a1
	FMOV	FZERO, c01
	LDF	[BO + 0 * SIZE], b1
	FMOV	FZERO, t1

	LDF	[AO + 1 * SIZE], a2
	FMOV	FZERO, c02
	LDF	[BO + 1 * SIZE], b2
	FMOV	FZERO, t2

	LDF	[AO + 2 * SIZE], a3
	FMOV	FZERO, c03
	LDF	[BO + 2 * SIZE], b3
	FMOV	FZERO, t3

	LDF	[AO + 3 * SIZE], a4
	FMOV	FZERO, c04
	LDF	[BO + 3 * SIZE], b4
	FMOV	FZERO, t4
#endif

	ble,pn	%icc, .LL155
	nop

/* 1x1 tile, main loop: four k-steps; AO and BO each advance 8 * SIZE. */
.LL152:
	FADD1	c01, t1, c01
	add	L, -1, L
	FMUL	a1, b1, t1
	prefetch [AO + APREFETCHSIZE * SIZE], 0

	FADD3	c02, t2, c02
	add	BO, 8 * SIZE, BO
	FMUL	a1, b2, t2
	LDF	[AO + 4 * SIZE], a1

	FADD2	c03, t3, c03
	cmp	L, 0
	FMUL	a2, b1, t3
	LDF	[BO - 4 * SIZE], b1

	FADD4	c04, t4, c04
	nop
	FMUL	a2, b2, t4
	LDF	[AO + 5 * SIZE], a2

	FADD1	c01, t1, c01
	nop
	FMUL	a3, b3, t1
	LDF	[BO - 3 * SIZE], b2

	FADD3	c02, t2, c02
	nop
	FMUL	a3, b4, t2
	LDF	[AO + 6 * SIZE], a3

	FADD2	c03, t3, c03
	nop
	FMUL	a4, b3, t3
	LDF	[BO - 2 * SIZE], b3

	FADD4	c04, t4, c04
	nop
	FMUL	a4, b4, t4
	LDF	[AO + 7 * SIZE], a4

	FADD1	c01, t1, c01
	nop
	FMUL	a1, b1, t1
	LDF	[BO - 1 * SIZE], b4

	FADD3	c02, t2, c02
	FMUL	a1, b2, t2
	LDF	[AO + 8 * SIZE], a1

	FADD2	c03, t3, c03
	FMUL	a2, b1, t3
	LDF	[BO + 0 * SIZE], b1

	FADD4	c04, t4, c04
	FMUL	a2, b2, t4
	LDF	[AO + 9 * SIZE], a2

	FADD1	c01, t1, c01
	FMUL	a3, b3, t1
	LDF	[BO + 1 * SIZE], b2

	FADD3	c02, t2, c02
	FMUL	a3, b4, t2
	LDF	[AO + 10 * SIZE], a3

	FADD2	c03, t3, c03
	FMUL	a4, b3, t3
	LDF	[BO + 2 * SIZE], b3

	FADD4	c04, t4, c04
	FMUL	a4, b4, t4
	LDF	[AO + 11 * SIZE], a4

	add	AO, 8 * SIZE, AO
	bg,pt	%icc, .LL152
	LDF	[BO + 3 * SIZE], b4

/* 1x1 tile, remainder count. */
.LL155:
#ifndef TRMMKERNEL
	and	K, 3, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 1, L
#else
	add	KK, 1, L
#endif
	and	L, 3, L
#endif
	cmp	L, 0
	ble,a,pn %icc, .LL159
	nop

/* 1x1 tile, remainder loop: one k-step per iteration. */
.LL156:
	FADD1	c01, t1, c01
	add	AO, 2 * SIZE, AO
	FMUL	a1, b1, t1
	add	BO, 2 * SIZE, BO
	FADD3	c02, t2, c02
	add	L, -1, L
	FMUL	a1, b2, t2
	LDF	[AO + 0 * SIZE], a1
	FADD2	c03, t3, c03
	FMUL	a2, b1, t3
	LDF	[BO + 0 * SIZE], b1
	cmp	L, 0
	FADD4	c04, t4, c04
	FMUL	a2, b2, t4
	LDF	[BO + 1 * SIZE], b2

	bg,pt	%icc, .LL156
	LDF	[AO + 1 * SIZE], a2

/* 1x1 tile, write-back. */
.LL159:
#ifndef TRMMKERNEL
	FADD1	c01, t1, c01
	FADD3	c02, t2, c02
	FADD2	c03, t3, c03
	FADD4	c04, t4, c04

	LDF	[C1 + 0 * SIZE], a1
	LDF	[C1 + 1 * SIZE], a2

	FADD	c01, c04, c01
	FADD	c02, c03, c02

	FMUL	ALPHA_R, c01, t1
	FMUL	ALPHA_R, c02, t2
	FMUL	ALPHA_I, c02, t3
	FMUL	ALPHA_I, c01, t4

	FADD	a1, t1, a1
	FADD	a2, t2, a2
	FSUB	a1, t3, a1
	FADD	a2, t4, a2

	STF	a1, [C1 + 0 * SIZE]
	STF	a2, [C1 + 1 * SIZE]
#else
	FADD1	c01, t1, c01
	FADD3	c02, t2, c02
	FADD2	c03, t3, c03
	FADD4	c04, t4, c04

	FADD	c01, c04, c01
	FADD	c02, c03, c02

	STF	c01, [C1 + 0 * SIZE]
	STF	c02, [C1 + 1 * SIZE]
#endif

	add	C1, 2 * SIZE, C1

	/* KK / pointer bookkeeping only exists in the TRMM build, as for the
	   three tiles above.  (KK is never initialised when TRMMKERNEL is not
	   defined, so this must not run in the plain GEMM build.) */
#ifdef TRMMKERNEL
#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	sub	K, KK, TEMP1
#ifdef LEFT
	add	TEMP1, -1, TEMP1
#else
	add	TEMP1, -1, TEMP1
#endif
	sll	TEMP1, 0 + ZBASE_SHIFT, TEMP2
	sll	TEMP1, 0 + ZBASE_SHIFT, TEMP1
	add	AO, TEMP2, AO
	add	BO, TEMP1, BO
#endif

#ifdef LEFT
	add	KK, 1, KK
#endif
#endif

/* Return to the caller; %o0 is cleared in the delay slot. */
.LL999:
	return	%i7 + 8
	clr	%o0

	EPILOGUE