/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#define P	4000

#define M	%i0
#define N	%i1
#define A	%i5
#define LDA	%i2
#define X	%i3
#define INCX	%i4
#define Y	%l0
#define INCY	%l1
#define BUFFER	%l2

#define I	%l3
#define IS	%l4
#define J	%l5
#define MIN_M	%l6
#define XP	%l7

#define A1	%o0
#define A2	%o1
#define A3	%o2
#define A4	%o3
#define X1	%o4
#define Y1	%o5
#define PNLDA	%g1
#define Y2	%o7	/* Danger? */
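/* This kernel appears to compute a complex GEMV update of the form
   y[j] += alpha * (sum_i A[i][j] * x[i]), with the conjugation variants
   selected by the CONJ/XCONJ macros below.  Rows are processed in strips
   of P complex elements so the active piece of x stays cache resident;
   columns are handled four, two, then one at a time.  The defines above
   are the pointer/counter working set; c1..c16 below hold partial dot
   products. */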
#ifdef DOUBLE
#define t1	%f0
#define t2	%f2
#define t3	%f4
#define t4	%f6

#define c1	%f8
#define c2	%f10
#define c3	%f12
#define c4	%f14
#define c5	%f16
#define c6	%f18
#define c7	%f20
#define c8	%f22
#define c9	%f24
#define c10	%f26
#define c11	%f28
#define c12	%f30
#define c13	%f32
#define c14	%f34
#define c15	%f36
#define c16	%f38

#define a1	%f40
#define a2	%f42
#define a3	%f44
#define a4	%f46
#define a5	%f48
#define a6	%f50
#define a7	%f52
#define a8	%f54

#define b1	%f56
#define b2	%f58
#define b3	%f60
#define b4	%f62
#else
#define t1	%f0
#define t2	%f1
#define t3	%f2
#define t4	%f3

#define c1	%f4
#define c2	%f5
#define c3	%f6
#define c4	%f7
#define c5	%f8
#define c6	%f9
#define c7	%f10
#define c8	%f11
#define c9	%f12
#define c10	%f13
#define c11	%f14
#define c12	%f15
#define c13	%f16
#define c14	%f17
#define c15	%f18
#define c16	%f19

#define a1	%f20
#define a2	%f21
#define a3	%f22
#define a4	%f23
#define a5	%f24
#define a6	%f25
#define a7	%f26
#define a8	%f27

#define b1	%f28
#define b2	%f29
#define b3	%f30
#define b4	%f31
#endif

#ifndef __64BIT__
#define ALPHA_R	[%sp + STACK_START + 16]
#ifndef DOUBLE
#define ALPHA_I	[%sp + STACK_START + 20]
#else
#define ALPHA_I	[%sp + STACK_START + 24]
#endif
#else
#define ALPHA_R	[%sp + STACK_START + 32]
#define ALPHA_I	[%sp + STACK_START + 40]
#endif

#ifdef DOUBLE
#define PREFETCHSIZE 18
#else
#define PREFETCHSIZE 36
#endif

	PROLOGUE
	SAVESP
	nop

#ifndef __64BIT__
#ifdef DOUBLE
	st	%i3, [%sp + STACK_START + 16]	/* ALPHA_R */
	st	%i4, [%sp + STACK_START + 20]
	st	%i5, [%sp + STACK_START + 24]	/* ALPHA_I */

	ld	[%sp + STACK_START + 32], A
	ld	[%sp + STACK_START + 36], LDA
	ld	[%sp + STACK_START + 40], X
	ld	[%sp + STACK_START + 44], INCX
	ld	[%sp + STACK_START + 48], Y
	ld	[%sp + STACK_START + 52], INCY
	ld	[%sp + STACK_START + 56], BUFFER
#else
	st	%i3, [%sp + STACK_START + 16]	/* ALPHA_R */
	st	%i4, [%sp + STACK_START + 20]	/* ALPHA_I */

	ld	[%sp + STACK_START + 28], LDA
	ld	[%sp + STACK_START + 32], X
	ld	[%sp + STACK_START + 36], INCX
	ld	[%sp + STACK_START + 40], Y
	ld	[%sp + STACK_START + 44], INCY
	ld	[%sp + STACK_START + 48], BUFFER
#endif
#else
	ldx	[%sp + STACK_START + 56], LDA
	ldx	[%sp + STACK_START + 64], X
	ldx	[%sp + STACK_START + 72], INCX
	ldx	[%sp + STACK_START + 80], Y
	ldx	[%sp + STACK_START + 88], INCY
	ldx	[%sp + STACK_START + 96], BUFFER
#ifdef DOUBLE
	std	%f6, ALPHA_R
	std	%f8, ALPHA_I
#else
	st	%f7, ALPHA_R
	st	%f9, ALPHA_I
#endif
#endif

	clr	IS
	mov	P, I

	sll	LDA, ZBASE_SHIFT, LDA
	sll	I,   ZBASE_SHIFT, I

	smul	LDA, N, PNLDA

	sll	INCX, ZBASE_SHIFT, INCX
	sll	INCY, ZBASE_SHIFT, INCY

	sub	I, PNLDA, PNLDA

.LL10:
	sll	IS, ZBASE_SHIFT, I
	sub	M, IS, MIN_M
	mov	P, J
	cmp	MIN_M, J
	nop
	movg	%icc, J, MIN_M
	nop

	cmp	INCX, 2 * SIZE
	beq	.LL100
	add	X, I, XP

	sra	MIN_M, 2, I
	mov	BUFFER, XP
	cmp	I, 0
	ble,pn	%icc, .LL15
	mov	BUFFER, Y1

.LL11:
	LDF	[X + 0 * SIZE], a1
	LDF	[X + 1 * SIZE], a2
	add	X, INCX, X
	LDF	[X + 0 * SIZE], a3
	LDF	[X + 1 * SIZE], a4
	add	X, INCX, X
	LDF	[X + 0 * SIZE], a5
	LDF	[X + 1 * SIZE], a6
	add	X, INCX, X
	LDF	[X + 0 * SIZE], a7
	LDF	[X + 1 * SIZE], a8
	add	X, INCX, X

	STF	a1, [Y1 + 0 * SIZE]
	add	I, -1, I
	STF	a2, [Y1 + 1 * SIZE]
	cmp	I, 0
	STF	a3, [Y1 + 2 * SIZE]
	STF	a4, [Y1 + 3 * SIZE]
	STF	a5, [Y1 + 4 * SIZE]
	STF	a6, [Y1 + 5 * SIZE]
	STF	a7, [Y1 + 6 * SIZE]
	STF	a8, [Y1 + 7 * SIZE]

	bg,pn	%icc, .LL11
	add	Y1, 8 * SIZE, Y1

.LL15:
	and	MIN_M, 3, I
	cmp	I, 0
	ble,pn	%icc, .LL100
	nop

.LL16:
	LDF	[X + 0 * SIZE], a1
	LDF	[X + 1 * SIZE], a2
	add	X, INCX, X
	add	I, -1, I
	cmp	I, 0
	nop
	STF	a1, [Y1 + 0 * SIZE]
	STF	a2, [Y1 + 1 * SIZE]
	bg,pn	%icc, .LL16
	add	Y1, 2 * SIZE, Y1
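/* If INCX is not one complex element (2 * SIZE), the strip of x needed by
   this row block was just packed into BUFFER (loops .LL11/.LL16 above), so
   the dot-product loops below always stream x with unit stride via XP.
   A minimal C sketch of that packing step; the names and the element-wise
   incx are illustrative, the asm tracks byte offsets:

       // gather min_m complex elements of x into a contiguous buffer
       static void pack_x(double *buffer, const double *x,
                          long min_m, long incx) {
           for (long i = 0; i < min_m; i++) {
               buffer[2 * i + 0] = x[2 * i * incx + 0];  // real part
               buffer[2 * i + 1] = x[2 * i * incx + 1];  // imaginary part
           }
       }
*/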
.LL100:
	sra	N, 2, J
	cmp	J, 0
	ble	%icc, .LL200
	mov	Y, Y1

.LL110:
	FCLR(0)

	FMOV	t1, c1
	sra	MIN_M, 2, I
	FMOV	t1, c2
	add	A, LDA, A2
	FMOV	t1, c3
	mov	A, A1
	FMOV	t1, c4
	add	A2, LDA, A3
	FMOV	t1, c5
	FMOV	t1, c6
	FMOV	t1, c7
	FMOV	t1, c8
	FMOV	t1, c9
	FMOV	t1, c10
	FMOV	t1, c11
	FMOV	t1, c12
	FMOV	t1, c13
	FMOV	t1, c14
	FMOV	t1, c15
	FMOV	t1, c16
	add	A3, LDA, A4
	FMOV	t1, t2
	mov	XP, X1
	FMOV	t1, t3
	add	A4, LDA, A
	cmp	I, 0
	ble	%icc, .LL115
	FMOV	t1, t4

	LDF	[A1 + 0 * SIZE], a1
	nop
	LDF	[A1 + 1 * SIZE], a2
	add	A1, 2 * SIZE, A1
	LDF	[A2 + 0 * SIZE], a3
	LDF	[A2 + 1 * SIZE], a4
	add	A2, 2 * SIZE, A2
	LDF	[A3 + 0 * SIZE], a5
	LDF	[A3 + 1 * SIZE], a6
	add	A3, 2 * SIZE, A3
	LDF	[A4 + 0 * SIZE], a7
	LDF	[A4 + 1 * SIZE], a8
	add	A4, 2 * SIZE, A4

	LDF	[X1 + 0 * SIZE], b1
	nop
	LDF	[X1 + 1 * SIZE], b2
	nop
	LDF	[X1 + 2 * SIZE], b3
	add	X1, 4 * SIZE, X1

	deccc	I
	ble	.LL112
	prefetch [Y1 + 7 * SIZE], 2

#ifndef XCONJ
#define FADDX	FADD
#else
#define FADDX	FSUB
#endif
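/* Unrolled four-column inner loop.  Each complex multiply-accumulate is
   split into four independent real FMUL/FADD chains so the FPU pipeline
   stays full: for column j and row i the code forms the partial products
   separately and only combines them after the loop (.LL119).  Roughly,
   in the default non-conjugated case:

       // c_rr += A_re * x_re;  c_ri += A_re * x_im;
       // c_ir += A_im * x_re;  c_ii += A_im * x_im;
       // after the loop:  re = c_rr - c_ii;  im = c_ri + c_ir;

   FADDX expands to FADD or FSUB so the XCONJ sign is folded into the
   imaginary-part accumulation itself; the interleaved nop, LDF and
   prefetch slots appear to be pipeline scheduling. */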
.LL111:
	FADD	c13, t1, c13
	prefetch [A1 + PREFETCHSIZE * SIZE], 1
	FMUL	a1, b1, t1
	nop

	FADDX	c14, t2, c14
	nop
	FMUL	a1, b2, t2
	LDF	[A1 + 0 * SIZE], a1

	FADD	c15, t3, c15
	nop
	FMUL	a2, b1, t3
	LDF	[X1 - 1 * SIZE], b4

	FADD	c16, t4, c16
	nop
	FMUL	a2, b2, t4
	LDF	[A1 + 1 * SIZE], a2

	FADD	c1, t1, c1
	nop
	FMUL	a3, b1, t1
	nop

	FADDX	c2, t2, c2
	nop
	FMUL	a3, b2, t2
	LDF	[A2 + 0 * SIZE], a3

	FADD	c3, t3, c3
	nop
	FMUL	a4, b1, t3
	nop

	FADD	c4, t4, c4
	nop
	FMUL	a4, b2, t4
	LDF	[A2 + 1 * SIZE], a4

	FADD	c5, t1, c5
	nop
	FMUL	a5, b1, t1
	nop

	FADDX	c6, t2, c6
	nop
	FMUL	a5, b2, t2
	LDF	[A3 + 0 * SIZE], a5

	FADD	c7, t3, c7
	nop
	FMUL	a6, b1, t3
	nop

	FADD	c8, t4, c8
	nop
	FMUL	a6, b2, t4
	LDF	[A3 + 1 * SIZE], a6

	FADD	c9, t1, c9
	nop
	FMUL	a7, b1, t1
	nop

	FADDX	c10, t2, c10
	nop
	FMUL	a7, b2, t2
	LDF	[A4 + 0 * SIZE], a7

	FADD	c11, t3, c11
	nop
	FMUL	a8, b1, t3
	LDF	[X1 + 0 * SIZE], b1

	FADD	c12, t4, c12
	nop
	FMUL	a8, b2, t4
	LDF	[A4 + 1 * SIZE], a8

	FADD	c13, t1, c13
	nop
	FMUL	a1, b3, t1
	prefetch [A2 + PREFETCHSIZE * SIZE], 1

	FADDX	c14, t2, c14
	nop
	FMUL	a1, b4, t2
	LDF	[A1 + 2 * SIZE], a1

	FADD	c15, t3, c15
	nop
	FMUL	a2, b3, t3
	LDF	[X1 + 1 * SIZE], b2

	FADD	c16, t4, c16
	nop
	FMUL	a2, b4, t4
	LDF	[A1 + 3 * SIZE], a2

	FADD	c1, t1, c1
	nop
	FMUL	a3, b3, t1
	nop

	FADDX	c2, t2, c2
	nop
	FMUL	a3, b4, t2
	LDF	[A2 + 2 * SIZE], a3

	FADD	c3, t3, c3
	nop
	FMUL	a4, b3, t3
	nop

	FADD	c4, t4, c4
	nop
	FMUL	a4, b4, t4
	LDF	[A2 + 3 * SIZE], a4

	FADD	c5, t1, c5
	nop
	FMUL	a5, b3, t1
	nop

	FADDX	c6, t2, c6
	nop
	FMUL	a5, b4, t2
	LDF	[A3 + 2 * SIZE], a5

	FADD	c7, t3, c7
	nop
	FMUL	a6, b3, t3
	nop

	FADD	c8, t4, c8
	nop
	FMUL	a6, b4, t4
	LDF	[A3 + 3 * SIZE], a6

	FADD	c9, t1, c9
	nop
	FMUL	a7, b3, t1
	nop

	FADDX	c10, t2, c10
	nop
	FMUL	a7, b4, t2
	LDF	[A4 + 2 * SIZE], a7

	FADD	c11, t3, c11
	nop
	FMUL	a8, b3, t3
	LDF	[X1 + 2 * SIZE], b3

	FADD	c12, t4, c12
	nop
	FMUL	a8, b4, t4
	LDF	[A4 + 3 * SIZE], a8

	FADD	c13, t1, c13
	prefetch [A3 + PREFETCHSIZE * SIZE], 1
	FMUL	a1, b1, t1
	nop

	FADDX	c14, t2, c14
	nop
	FMUL	a1, b2, t2
	LDF	[A1 + 4 * SIZE], a1

	FADD	c15, t3, c15
	nop
	FMUL	a2, b1, t3
	LDF	[X1 + 3 * SIZE], b4

	FADD	c16, t4, c16
	nop
	FMUL	a2, b2, t4
	LDF	[A1 + 5 * SIZE], a2

	FADD	c1, t1, c1
	nop
	FMUL	a3, b1, t1
	nop

	FADDX	c2, t2, c2
	nop
	FMUL	a3, b2, t2
	LDF	[A2 + 4 * SIZE], a3

	FADD	c3, t3, c3
	nop
	FMUL	a4, b1, t3
	nop

	FADD	c4, t4, c4
	nop
	FMUL	a4, b2, t4
	LDF	[A2 + 5 * SIZE], a4

	FADD	c5, t1, c5
	nop
	FMUL	a5, b1, t1
	nop

	FADDX	c6, t2, c6
	nop
	FMUL	a5, b2, t2
	LDF	[A3 + 4 * SIZE], a5

	FADD	c7, t3, c7
	deccc	I
	FMUL	a6, b1, t3
	nop

	FADD	c8, t4, c8
	nop
	FMUL	a6, b2, t4
	LDF	[A3 + 5 * SIZE], a6

	FADD	c9, t1, c9
	nop
	FMUL	a7, b1, t1
	nop

	FADDX	c10, t2, c10
	nop
	FMUL	a7, b2, t2
	LDF	[A4 + 4 * SIZE], a7

	FADD	c11, t3, c11
	nop
	FMUL	a8, b1, t3
	LDF	[X1 + 4 * SIZE], b1

	FADD	c12, t4, c12
	nop
	FMUL	a8, b2, t4
	LDF	[A4 + 5 * SIZE], a8

	FADD	c13, t1, c13
	prefetch [A4 + PREFETCHSIZE * SIZE], 1
	FMUL	a1, b3, t1
	nop

	FADDX	c14, t2, c14
	nop
	FMUL	a1, b4, t2
	LDF	[A1 + 6 * SIZE], a1

	FADD	c15, t3, c15
	nop
	FMUL	a2, b3, t3
	LDF	[X1 + 5 * SIZE], b2

	FADD	c16, t4, c16
	nop
	FMUL	a2, b4, t4
	LDF	[A1 + 7 * SIZE], a2

	FADD	c1, t1, c1
	add	A1, 8 * SIZE, A1
	FMUL	a3, b3, t1
	nop

	FADDX	c2, t2, c2
	nop
	FMUL	a3, b4, t2
	LDF	[A2 + 6 * SIZE], a3

	FADD	c3, t3, c3
	nop
	FMUL	a4, b3, t3
	nop

	FADD	c4, t4, c4
	nop
	FMUL	a4, b4, t4
	LDF	[A2 + 7 * SIZE], a4

	FADD	c5, t1, c5
	add	A2, 8 * SIZE, A2
	FMUL	a5, b3, t1
	nop

	FADDX	c6, t2, c6
	nop
	FMUL	a5, b4, t2
	LDF	[A3 + 6 * SIZE], a5

	FADD	c7, t3, c7
	add	A4, 8 * SIZE, A4
	FMUL	a6, b3, t3
	nop

	FADD	c8, t4, c8
	nop
	FMUL	a6, b4, t4
	LDF	[A3 + 7 * SIZE], a6

	FADD	c9, t1, c9
	add	A3, 8 * SIZE, A3
	FMUL	a7, b3, t1
	nop

	FADDX	c10, t2, c10
	add	X1, 8 * SIZE, X1
	FMUL	a7, b4, t2
	LDF	[A4 - 2 * SIZE], a7

	FADD	c11, t3, c11
	nop
	FMUL	a8, b3, t3
	LDF	[X1 - 2 * SIZE], b3

	FADD	c12, t4, c12
	FMUL	a8, b4, t4
	bg,pn	%icc, .LL111
	LDF	[A4 - 1 * SIZE], a8
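/* Loop tail: the body above preloads the next iteration's operands, so
   the final unrolled iteration is drained here without reading past the
   end of the columns.  Note the pointers advance by 6 * SIZE instead of
   8 * SIZE because 2 * SIZE was already consumed by the preload before
   the loop. */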
.LL112:
	FADD	c13, t1, c13
	nop
	FMUL	a1, b1, t1
	LDF	[X1 - 1 * SIZE], b4

	FADDX	c14, t2, c14
	nop
	FMUL	a1, b2, t2
	LDF	[A1 + 0 * SIZE], a1

	FADD	c15, t3, c15
	nop
	FMUL	a2, b1, t3
	LDF	[X1 - 1 * SIZE], b4

	FADD	c16, t4, c16
	nop
	FMUL	a2, b2, t4
	LDF	[A1 + 1 * SIZE], a2

	FADD	c1, t1, c1
	nop
	FMUL	a3, b1, t1
	nop

	FADDX	c2, t2, c2
	nop
	FMUL	a3, b2, t2
	LDF	[A2 + 0 * SIZE], a3

	FADD	c3, t3, c3
	nop
	FMUL	a4, b1, t3
	nop

	FADD	c4, t4, c4
	nop
	FMUL	a4, b2, t4
	LDF	[A2 + 1 * SIZE], a4

	FADD	c5, t1, c5
	nop
	FMUL	a5, b1, t1
	nop

	FADDX	c6, t2, c6
	nop
	FMUL	a5, b2, t2
	LDF	[A3 + 0 * SIZE], a5

	FADD	c7, t3, c7
	nop
	FMUL	a6, b1, t3
	nop

	FADD	c8, t4, c8
	nop
	FMUL	a6, b2, t4
	LDF	[A3 + 1 * SIZE], a6

	FADD	c9, t1, c9
	nop
	FMUL	a7, b1, t1
	nop

	FADDX	c10, t2, c10
	nop
	FMUL	a7, b2, t2
	LDF	[A4 + 0 * SIZE], a7

	FADD	c11, t3, c11
	nop
	FMUL	a8, b1, t3
	LDF	[X1 + 0 * SIZE], b1

	FADD	c12, t4, c12
	nop
	FMUL	a8, b2, t4
	LDF	[A4 + 1 * SIZE], a8

	FADD	c13, t1, c13
	nop
	FMUL	a1, b3, t1
	LDF	[X1 + 1 * SIZE], b2

	FADDX	c14, t2, c14
	nop
	FMUL	a1, b4, t2
	LDF	[A1 + 2 * SIZE], a1

	FADD	c15, t3, c15
	nop
	FMUL	a2, b3, t3
	nop

	FADD	c16, t4, c16
	nop
	FMUL	a2, b4, t4
	LDF	[A1 + 3 * SIZE], a2

	FADD	c1, t1, c1
	nop
	FMUL	a3, b3, t1
	nop

	FADDX	c2, t2, c2
	nop
	FMUL	a3, b4, t2
	LDF	[A2 + 2 * SIZE], a3

	FADD	c3, t3, c3
	nop
	FMUL	a4, b3, t3
	nop

	FADD	c4, t4, c4
	nop
	FMUL	a4, b4, t4
	LDF	[A2 + 3 * SIZE], a4

	FADD	c5, t1, c5
	nop
	FMUL	a5, b3, t1
	nop

	FADDX	c6, t2, c6
	nop
	FMUL	a5, b4, t2
	LDF	[A3 + 2 * SIZE], a5

	FADD	c7, t3, c7
	nop
	FMUL	a6, b3, t3
	nop

	FADD	c8, t4, c8
	nop
	FMUL	a6, b4, t4
	LDF	[A3 + 3 * SIZE], a6

	FADD	c9, t1, c9
	nop
	FMUL	a7, b3, t1
	nop

	FADDX	c10, t2, c10
	nop
	FMUL	a7, b4, t2
	LDF	[A4 + 2 * SIZE], a7

	FADD	c11, t3, c11
	nop
	FMUL	a8, b3, t3
	LDF	[X1 + 2 * SIZE], b3

	FADD	c12, t4, c12
	nop
	FMUL	a8, b4, t4
	LDF	[A4 + 3 * SIZE], a8

	FADD	c13, t1, c13
	nop
	FMUL	a1, b1, t1
	LDF	[X1 + 3 * SIZE], b4

	FADDX	c14, t2, c14
	add	X1, 4 * SIZE, X1
	FMUL	a1, b2, t2
	LDF	[A1 + 4 * SIZE], a1

	FADD	c15, t3, c15
	nop
	FMUL	a2, b1, t3
	nop

	FADD	c16, t4, c16
	nop
	FMUL	a2, b2, t4
	LDF	[A1 + 5 * SIZE], a2

	FADD	c1, t1, c1
	add	A1, 6 * SIZE, A1
	FMUL	a3, b1, t1
	nop

	FADDX	c2, t2, c2
	nop
	FMUL	a3, b2, t2
	LDF	[A2 + 4 * SIZE], a3

	FADD	c3, t3, c3
	nop
	FMUL	a4, b1, t3
	nop

	FADD	c4, t4, c4
	nop
	FMUL	a4, b2, t4
	LDF	[A2 + 5 * SIZE], a4

	FADD	c5, t1, c5
	add	A2, 6 * SIZE, A2
	FMUL	a5, b1, t1
	nop

	FADDX	c6, t2, c6
	nop
	FMUL	a5, b2, t2
	LDF	[A3 + 4 * SIZE], a5

	FADD	c7, t3, c7
	nop
	FMUL	a6, b1, t3
	nop

	FADD	c8, t4, c8
	nop
	FMUL	a6, b2, t4
	LDF	[A3 + 5 * SIZE], a6

	FADD	c9, t1, c9
	add	A3, 6 * SIZE, A3
	FMUL	a7, b1, t1
	nop

	FADDX	c10, t2, c10
	nop
	FMUL	a7, b2, t2
	LDF	[A4 + 4 * SIZE], a7

	FADD	c11, t3, c11
	nop
	FMUL	a8, b1, t3
	nop

	FADD	c12, t4, c12
	nop
	FMUL	a8, b2, t4
	LDF	[A4 + 5 * SIZE], a8

	FADD	c13, t1, c13
	add	A4, 6 * SIZE, A4
	FMUL	a1, b3, t1
	nop

	FADDX	c14, t2, c14
	nop
	FMUL	a1, b4, t2
	nop

	FADD	c15, t3, c15
	FMUL	a2, b3, t3
	FADD	c16, t4, c16
	FMUL	a2, b4, t4

	FADD	c1, t1, c1
	FMUL	a3, b3, t1
	FADDX	c2, t2, c2
	FMUL	a3, b4, t2

	FADD	c3, t3, c3
	FMUL	a4, b3, t3
	FADD	c4, t4, c4
	FMUL	a4, b4, t4

	FADD	c5, t1, c5
	FMUL	a5, b3, t1
	FADDX	c6, t2, c6
	FMUL	a5, b4, t2

	FADD	c7, t3, c7
	FMUL	a6, b3, t3
	FADD	c8, t4, c8
	FMUL	a6, b4, t4

	FADD	c9, t1, c9
	FMUL	a7, b3, t1
	FADDX	c10, t2, c10
	FMUL	a7, b4, t2

	FADD	c11, t3, c11
	FMUL	a8, b3, t3
	FADD	c12, t4, c12
	FMUL	a8, b4, t4

.LL115:
	andcc	MIN_M, 3, I
	LDF	ALPHA_R, b3
	mov	Y1, Y2
	ble,pn	%icc, .LL119
	LDF	ALPHA_I, b4

.L116:
	LDF	[A1 + 0 * SIZE], a1
	LDF	[A1 + 1 * SIZE], a2
	add	A1, 2 * SIZE, A1
	LDF	[X1 + 0 * SIZE], b1
	LDF	[X1 + 1 * SIZE], b2
	add	X1, 2 * SIZE, X1
	LDF	[A2 + 0 * SIZE], a3
	LDF	[A2 + 1 * SIZE], a4
	add	A2, 2 * SIZE, A2
	LDF	[A3 + 0 * SIZE], a5
	LDF	[A3 + 1 * SIZE], a6
	add	A3, 2 * SIZE, A3
	LDF	[A4 + 0 * SIZE], a7
	LDF	[A4 + 1 * SIZE], a8
	add	A4, 2 * SIZE, A4

	FADD	c13, t1, c13
	FMUL	a1, b1, t1
	FADDX	c14, t2, c14
	FMUL	a1, b2, t2
	FADD	c15, t3, c15
	FMUL	a2, b1, t3
	FADD	c16, t4, c16
	FMUL	a2, b2, t4

	FADD	c1, t1, c1
	FMUL	a3, b1, t1
	FADDX	c2, t2, c2
	FMUL	a3, b2, t2
	FADD	c3, t3, c3
	FMUL	a4, b1, t3
	FADD	c4, t4, c4
	FMUL	a4, b2, t4

	FADD	c5, t1, c5
	FMUL	a5, b1, t1
	FADDX	c6, t2, c6
	FMUL	a5, b2, t2
	FADD	c7, t3, c7
	FMUL	a6, b1, t3
	FADD	c8, t4, c8
	FMUL	a6, b2, t4

	FADD	c9, t1, c9
	FMUL	a7, b1, t1
	FADDX	c10, t2, c10
	FMUL	a7, b2, t2
	FADD	c11, t3, c11
	FMUL	a8, b1, t3
	FADD	c12, t4, c12
	FMUL	a8, b2, t4

	deccc	I
	bg	%icc, .L116
	nop

.LL119:
	FADD	c13, t1, c13
	LDF	[Y1 + 0 * SIZE], a1
	FADDX	c14, t2, c14
	LDF	[Y1 + 1 * SIZE], a2
	add	Y1, INCY, Y1
	FADD	c15, t3, c15
	LDF	[Y1 + 0 * SIZE], a3
	FADD	c16, t4, c16
	LDF	[Y1 + 1 * SIZE], a4
	add	Y1, INCY, Y1

#if (!defined(XCONJ) && !defined(CONJ)) || (defined(XCONJ) && defined(CONJ))
	FSUB	c1, c4, c1
	LDF	[Y1 + 0 * SIZE], a5
	FSUB	c5, c8, c5
	LDF	[Y1 + 1 * SIZE], a6
	add	Y1, INCY, Y1
	FSUB	c9, c12, c9
	LDF	[Y1 + 0 * SIZE], a7
	FSUB	c13, c16, c13
	LDF	[Y1 + 1 * SIZE], a8
	add	Y1, INCY, Y1
#else
	FADD	c1, c4, c1
	LDF	[Y1 + 0 * SIZE], a5
	FADD	c5, c8, c5
	LDF	[Y1 + 1 * SIZE], a6
	add	Y1, INCY, Y1
	FADD	c9, c12, c9
	LDF	[Y1 + 0 * SIZE], a7
	FADD	c13, c16, c13
	LDF	[Y1 + 1 * SIZE], a8
	add	Y1, INCY, Y1
#endif

#ifndef CONJ
	FADD	c2, c3, c2
	FCLR(0)
	FADD	c6, c7, c6
	FADD	c10, c11, c10
	FADD	c14, c15, c14
#else
	FSUB	c2, c3, c2
	FCLR(0)
	FSUB	c6, c7, c6
	FSUB	c10, c11, c10
	FSUB	c14, c15, c14
#endif

	FMUL	b3, c1, c3
	FMOV	t1, t2
	FMUL	b4, c1, c4
	FMOV	t1, t3
	FMUL	b4, c2, c1
	FMOV	t1, t4
	FMUL	b3, c2, c2

	FMUL	b3, c5, c7
	FMUL	b4, c5, c8
	FMUL	b4, c6, c5
	FMUL	b3, c6, c6

	FMUL	b3, c9, c11
	FMUL	b4, c9, c12
	FMUL	b4, c10, c9
	FMUL	b3, c10, c10

	FMUL	b3, c13, c15
	FSUB	c3, c1, c1
	FMUL	b4, c13, c16
	FADD	c2, c4, c2
	FMUL	b4, c14, c13
	FSUB	c7, c5, c5
	FMUL	b3, c14, c14
	FADD	c6, c8, c6

	FSUB	c11, c9, c9
	FADD	c10, c12, c10
	FSUB	c15, c13, c13
	FADD	c14, c16, c14

	FADD	a1, c1, a1
	FADD	a2, c2, a2
	FADD	a3, c5, a3
	FADD	a4, c6, a4

	STF	a1, [Y2 + 0 * SIZE]
	FADD	a5, c9, a5
	STF	a2, [Y2 + 1 * SIZE]
	FADD	a6, c10, a6
	add	Y2, INCY, Y2

	STF	a3, [Y2 + 0 * SIZE]
	FADD	a7, c13, a7
	STF	a4, [Y2 + 1 * SIZE]
	FADD	a8, c14, a8
	add	Y2, INCY, Y2

	STF	a5, [Y2 + 0 * SIZE]
	FMOV	t1, c1
	add	J, -1, J
	STF	a6, [Y2 + 1 * SIZE]
	FMOV	t1, c2
	cmp	J, 0
	add	Y2, INCY, Y2

	STF	a7, [Y2 + 0 * SIZE]
	FMOV	t1, c3
	STF	a8, [Y2 + 1 * SIZE]
	FMOV	t1, c4
	add	Y2, INCY, Y2

	FMOV	t1, c5
	bg	%icc, .LL110
	FMOV	t1, c6
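/* Reduction and update for four columns: each dot product is folded to
   (re, im) with the sign pattern selected by CONJ/XCONJ above, scaled by
   alpha = (alpha_r, alpha_i) held in b3/b4, and accumulated into y.  The
   FMUL/FSUB/FADD lattice above corresponds to this C sketch (names are
   illustrative):

       // scale one reduced dot product by complex alpha, update y[j]
       static void update_y(double *y, long j, double dot_r, double dot_i,
                            double alpha_r, double alpha_i) {
           y[2 * j + 0] += alpha_r * dot_r - alpha_i * dot_i;
           y[2 * j + 1] += alpha_r * dot_i + alpha_i * dot_r;
       }
*/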
.LL200:
	FCLR(0)

	and	N, 2, J
	cmp	J, 0
	FMOV	t1, c1
	ble	%icc, .LL300
	FMOV	t1, c2

	sra	MIN_M, 2, I
	FMOV	t1, t2
	add	A, LDA, A2
	FMOV	t1, c3
	mov	A, A1
	FMOV	t1, t3
	cmp	I, 0
	FMOV	t1, c4
	FMOV	t1, c5
	FMOV	t1, c6
	FMOV	t1, c7
	FMOV	t1, c8
	add	A2, LDA, A
	FMOV	t1, t4
	ble	%icc, .LL215
	mov	XP, X1

	LDF	[A1 + 0 * SIZE], a1
	LDF	[A1 + 1 * SIZE], a2
	LDF	[A1 + 2 * SIZE], a5
	LDF	[A1 + 3 * SIZE], a6
	add	A1, 4 * SIZE, A1
	LDF	[A2 + 0 * SIZE], a3
	LDF	[A2 + 1 * SIZE], a4
	LDF	[A2 + 2 * SIZE], a7
	LDF	[A2 + 3 * SIZE], a8
	add	A2, 4 * SIZE, A2

	LDF	[X1 + 0 * SIZE], b1
	add	I, -1, I
	LDF	[X1 + 1 * SIZE], b2
	cmp	I, 0
	LDF	[X1 + 2 * SIZE], b3
	LDF	[X1 + 3 * SIZE], b4
	ble	%icc, .LL212
	add	X1, 4 * SIZE, X1

.LL211:
	prefetch [A1 + PREFETCHSIZE * SIZE], 1

	FADD	c5, t1, c5
	FMUL	a1, b1, t1
	FADDX	c6, t2, c6
	FMUL	a1, b2, t2
	LDF	[A1 + 0 * SIZE], a1
	FADD	c7, t3, c7
	FMUL	a2, b1, t3
	FADD	c8, t4, c8
	FMUL	a2, b2, t4
	LDF	[A1 + 1 * SIZE], a2

	FADD	c1, t1, c1
	FMUL	a3, b1, t1
	FADDX	c2, t2, c2
	FMUL	a3, b2, t2
	LDF	[A2 + 0 * SIZE], a3
	FADD	c3, t3, c3
	FMUL	a4, b1, t3
	LDF	[X1 + 0 * SIZE], b1
	FADD	c4, t4, c4
	FMUL	a4, b2, t4
	LDF	[A2 + 1 * SIZE], a4

	FADD	c5, t1, c5
	LDF	[X1 + 1 * SIZE], b2
	FMUL	a5, b3, t1
	FADDX	c6, t2, c6
	FMUL	a5, b4, t2
	LDF	[A1 + 2 * SIZE], a5
	FADD	c7, t3, c7
	add	I, -1, I
	FMUL	a6, b3, t3
	FADD	c8, t4, c8
	cmp	I, 0
	FMUL	a6, b4, t4
	LDF	[A1 + 3 * SIZE], a6

	FADD	c1, t1, c1
	FMUL	a7, b3, t1
	FADDX	c2, t2, c2
	FMUL	a7, b4, t2
	LDF	[A2 + 2 * SIZE], a7
	FADD	c3, t3, c3
	FMUL	a8, b3, t3
	LDF	[X1 + 2 * SIZE], b3
	FADD	c4, t4, c4
	FMUL	a8, b4, t4
	LDF	[A2 + 3 * SIZE], a8

	prefetch [A2 + PREFETCHSIZE * SIZE], 1

	FADD	c5, t1, c5
	LDF	[X1 + 3 * SIZE], b4
	FMUL	a1, b1, t1
	FADDX	c6, t2, c6
	FMUL	a1, b2, t2
	LDF	[A1 + 4 * SIZE], a1
	FADD	c7, t3, c7
	FMUL	a2, b1, t3
	FADD	c8, t4, c8
	FMUL	a2, b2, t4
	LDF	[A1 + 5 * SIZE], a2

	FADD	c1, t1, c1
	FMUL	a3, b1, t1
	FADDX	c2, t2, c2
	FMUL	a3, b2, t2
	LDF	[A2 + 4 * SIZE], a3
	FADD	c3, t3, c3
	FMUL	a4, b1, t3
	LDF	[X1 + 4 * SIZE], b1
	FADD	c4, t4, c4
	FMUL	a4, b2, t4
	LDF	[A2 + 5 * SIZE], a4

	FADD	c5, t1, c5
	LDF	[X1 + 5 * SIZE], b2
	FMUL	a5, b3, t1
	FADDX	c6, t2, c6
	FMUL	a5, b4, t2
	LDF	[A1 + 6 * SIZE], a5
	FADD	c7, t3, c7
	FMUL	a6, b3, t3
	FADD	c8, t4, c8
	FMUL	a6, b4, t4
	LDF	[A1 + 7 * SIZE], a6
	add	A1, 8 * SIZE, A1

	FADD	c1, t1, c1
	FMUL	a7, b3, t1
	FADDX	c2, t2, c2
	FMUL	a7, b4, t2
	LDF	[A2 + 6 * SIZE], a7
	FADD	c3, t3, c3
	FMUL	a8, b3, t3
	LDF	[X1 + 6 * SIZE], b3
	FADD	c4, t4, c4
	add	X1, 8 * SIZE, X1
	FMUL	a8, b4, t4
	LDF	[A2 + 7 * SIZE], a8
	add	A2, 8 * SIZE, A2

	bg,pn	%icc, .LL211
	LDF	[X1 - 1 * SIZE], b4

.LL212:
	FADD	c5, t1, c5
	FMUL	a1, b1, t1
	FADDX	c6, t2, c6
	FMUL	a1, b2, t2
	LDF	[A1 + 0 * SIZE], a1
	FADD	c7, t3, c7
	FMUL	a2, b1, t3
	FADD	c8, t4, c8
	FMUL	a2, b2, t4
	LDF	[A1 + 1 * SIZE], a2

	FADD	c1, t1, c1
	FMUL	a3, b1, t1
	FADDX	c2, t2, c2
	FMUL	a3, b2, t2
	LDF	[A2 + 0 * SIZE], a3
	FADD	c3, t3, c3
	FMUL	a4, b1, t3
	LDF	[X1 + 0 * SIZE], b1
	FADD	c4, t4, c4
	FMUL	a4, b2, t4
	LDF	[A2 + 1 * SIZE], a4

	FADD	c5, t1, c5
	LDF	[X1 + 1 * SIZE], b2
	FMUL	a5, b3, t1
	FADDX	c6, t2, c6
	FMUL	a5, b4, t2
	LDF	[A1 + 2 * SIZE], a5
	FADD	c7, t3, c7
	FMUL	a6, b3, t3
	FADD	c8, t4, c8
	FMUL	a6, b4, t4
	LDF	[A1 + 3 * SIZE], a6
	add	A1, 4 * SIZE, A1

	FADD	c1, t1, c1
	FMUL	a7, b3, t1
	FADDX	c2, t2, c2
	FMUL	a7, b4, t2
	LDF	[A2 + 2 * SIZE], a7
	FADD	c3, t3, c3
	FMUL	a8, b3, t3
	LDF	[X1 + 2 * SIZE], b3
	FADD	c4, t4, c4
	FMUL	a8, b4, t4
	LDF	[A2 + 3 * SIZE], a8
	add	A2, 4 * SIZE, A2

	FADD	c5, t1, c5
	LDF	[X1 + 3 * SIZE], b4
	add	X1, 4 * SIZE, X1
	FMUL	a1, b1, t1
	FADDX	c6, t2, c6
	FMUL	a1, b2, t2
	FADD	c7, t3, c7
	FMUL	a2, b1, t3
	FADD	c8, t4, c8
	FMUL	a2, b2, t4

	FADD	c1, t1, c1
	FMUL	a3, b1, t1
	FADDX	c2, t2, c2
	FMUL	a3, b2, t2
	FADD	c3, t3, c3
	FMUL	a4, b1, t3
	FADD	c4, t4, c4
	FMUL	a4, b2, t4

	FADD	c5, t1, c5
	FMUL	a5, b3, t1
	FADDX	c6, t2, c6
	FMUL	a5, b4, t2
	FADD	c7, t3, c7
	FMUL	a6, b3, t3
	FADD	c8, t4, c8
	FMUL	a6, b4, t4

	FADD	c1, t1, c1
	FMUL	a7, b3, t1
	FADDX	c2, t2, c2
	FMUL	a7, b4, t2
	FADD	c3, t3, c3
	FMUL	a8, b3, t3
	FADD	c4, t4, c4
	FMUL	a8, b4, t4
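/* Remainder for the two-column path: .LL215/.LL216 below process the
   MIN_M % 4 leftover rows one complex element per iteration, then .LL219
   reduces, scales by alpha, and stores, exactly as in the four-column
   case. */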
.LL215:
	andcc	MIN_M, 3, I
	LDF	ALPHA_R, b3
	mov	Y1, Y2
	ble	%icc, .LL219
	LDF	ALPHA_I, b4

	LDF	[A1 + 0 * SIZE], a1
	add	I, -1, I
	LDF	[A1 + 1 * SIZE], a2
	cmp	I, 0
	add	A1, 2 * SIZE, A1
	LDF	[A2 + 0 * SIZE], a3
	LDF	[A2 + 1 * SIZE], a4
	add	A2, 2 * SIZE, A2
	LDF	[X1 + 0 * SIZE], b1
	LDF	[X1 + 1 * SIZE], b2
	ble	%icc, .LL217
	add	X1, 2 * SIZE, X1

.LL216:
	FADD	c5, t1, c5
	FMUL	a1, b1, t1
	FADDX	c6, t2, c6
	FMUL	a1, b2, t2
	LDF	[A1 + 0 * SIZE], a1
	FADD	c7, t3, c7
	add	I, -1, I
	FMUL	a2, b1, t3
	FADD	c8, t4, c8
	cmp	I, 0
	FMUL	a2, b2, t4
	LDF	[A1 + 1 * SIZE], a2
	add	A1, 2 * SIZE, A1

	FADD	c1, t1, c1
	FMUL	a3, b1, t1
	FADDX	c2, t2, c2
	FMUL	a3, b2, t2
	LDF	[A2 + 0 * SIZE], a3
	FADD	c3, t3, c3
	FMUL	a4, b1, t3
	LDF	[X1 + 0 * SIZE], b1
	FADD	c4, t4, c4
	add	X1, 2 * SIZE, X1
	FMUL	a4, b2, t4
	LDF	[A2 + 1 * SIZE], a4
	add	A2, 2 * SIZE, A2

	bg,pn	%icc, .LL216
	LDF	[X1 - 1 * SIZE], b2

.LL217:
	FADD	c5, t1, c5
	FMUL	a1, b1, t1
	FADDX	c6, t2, c6
	FMUL	a1, b2, t2
	FADD	c7, t3, c7
	FMUL	a2, b1, t3
	FADD	c8, t4, c8
	FMUL	a2, b2, t4

	FADD	c1, t1, c1
	FMUL	a3, b1, t1
	FADDX	c2, t2, c2
	FMUL	a3, b2, t2
	FADD	c3, t3, c3
	FMUL	a4, b1, t3
	FADD	c4, t4, c4
	FMUL	a4, b2, t4

.LL219:
	FADD	c5, t1, c5
	LDF	[Y1 + 0 * SIZE], a1
	FADDX	c6, t2, c6
	LDF	[Y1 + 1 * SIZE], a2
	add	Y1, INCY, Y1
	FADD	c7, t3, c7
	LDF	[Y1 + 0 * SIZE], a3
	FADD	c8, t4, c8
	LDF	[Y1 + 1 * SIZE], a4
	add	Y1, INCY, Y1

#if (!defined(XCONJ) && !defined(CONJ)) || (defined(XCONJ) && defined(CONJ))
	FSUB	c1, c4, c1
	FSUB	c5, c8, c5
#else
	FADD	c1, c4, c1
	FADD	c5, c8, c5
#endif

#ifndef CONJ
	FADD	c2, c3, c2
	FADD	c6, c7, c6
#else
	FSUB	c2, c3, c2
	FSUB	c6, c7, c6
#endif

	FMUL	b3, c1, c3
	FMUL	b4, c1, c4
	FMUL	b4, c2, c1
	FMUL	b3, c2, c2

	FMUL	b3, c5, c7
	FMUL	b4, c5, c8
	FMUL	b4, c6, c5
	FMUL	b3, c6, c6

	FSUB	c3, c1, c1
	FADD	c2, c4, c2
	FSUB	c7, c5, c5
	FADD	c6, c8, c6

	FADD	a1, c1, a1
	FADD	a2, c2, a2
	FADD	a3, c5, a3
	FADD	a4, c6, a4

	STF	a1, [Y2 + 0 * SIZE]
	STF	a2, [Y2 + 1 * SIZE]
	add	Y2, INCY, Y2
	STF	a3, [Y2 + 0 * SIZE]
	STF	a4, [Y2 + 1 * SIZE]

.LL300:
	andcc	N, 1, J
	FCLR(0)
	ble	%icc, .LL400
	FMOV	t1, c1

.LL310:
	sra	MIN_M, 2, I
	FMOV	t1, c2
	FMOV	t1, c3
	FMOV	t1, c4
	mov	A, A1
	FMOV	t1, t2
	add	A, LDA, A
	FMOV	t1, t3
	cmp	I, 0
	FMOV	t1, t4
	ble	%icc, .LL315
	mov	XP, X1

	LDF	[A1 + 0 * SIZE], a1
	LDF	[A1 + 1 * SIZE], a2
	LDF	[A1 + 2 * SIZE], a3
	LDF	[A1 + 3 * SIZE], a4
	LDF	[A1 + 4 * SIZE], a5
	LDF	[A1 + 5 * SIZE], a6
	LDF	[A1 + 6 * SIZE], a7
	LDF	[A1 + 7 * SIZE], a8
	add	A1, 8 * SIZE, A1

	LDF	[X1 + 0 * SIZE], c9
	add	I, -1, I
	LDF	[X1 + 1 * SIZE], c10
	cmp	I, 0
	LDF	[X1 + 2 * SIZE], c11
	LDF	[X1 + 3 * SIZE], c12
	LDF	[X1 + 4 * SIZE], c13
	LDF	[X1 + 5 * SIZE], c14
	LDF	[X1 + 6 * SIZE], c15
	LDF	[X1 + 7 * SIZE], c16
	ble	%icc, .LL312
	add	X1, 8 * SIZE, X1
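/* Single-column inner loop: only c1..c4 are needed as accumulators here,
   so the spare c9..c16 registers are reused above to stream eight
   prefetched x values per unrolled iteration. */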
.LL311:
	prefetch [A1 + PREFETCHSIZE * SIZE], 1

	FADD	c1, t1, c1
	FMUL	a1, c9, t1
	FADDX	c2, t2, c2
	FMUL	a1, c10, t2
	LDF	[A1 + 0 * SIZE], a1
	FADD	c3, t3, c3
	FMUL	a2, c9, t3
	LDF	[X1 + 0 * SIZE], c9
	FADD	c4, t4, c4
	FMUL	a2, c10, t4
	LDF	[A1 + 1 * SIZE], a2
	LDF	[X1 + 1 * SIZE], c10

	FADD	c1, t1, c1
	FMUL	a3, c11, t1
	FADDX	c2, t2, c2
	FMUL	a3, c12, t2
	LDF	[A1 + 2 * SIZE], a3
	FADD	c3, t3, c3
	add	I, -1, I
	FMUL	a4, c11, t3
	LDF	[X1 + 2 * SIZE], c11
	FADD	c4, t4, c4
	cmp	I, 0
	FMUL	a4, c12, t4
	LDF	[A1 + 3 * SIZE], a4
	LDF	[X1 + 3 * SIZE], c12

	FADD	c1, t1, c1
	FMUL	a5, c13, t1
	FADDX	c2, t2, c2
	FMUL	a5, c14, t2
	LDF	[A1 + 4 * SIZE], a5
	FADD	c3, t3, c3
	FMUL	a6, c13, t3
	LDF	[X1 + 4 * SIZE], c13
	FADD	c4, t4, c4
	FMUL	a6, c14, t4
	LDF	[A1 + 5 * SIZE], a6
	LDF	[X1 + 5 * SIZE], c14

	FADD	c1, t1, c1
	FMUL	a7, c15, t1
	FADDX	c2, t2, c2
	FMUL	a7, c16, t2
	LDF	[A1 + 6 * SIZE], a7
	FADD	c3, t3, c3
	FMUL	a8, c15, t3
	LDF	[X1 + 6 * SIZE], c15
	FADD	c4, t4, c4
	add	X1, 8 * SIZE, X1
	FMUL	a8, c16, t4
	LDF	[A1 + 7 * SIZE], a8
	add	A1, 8 * SIZE, A1

	bg,pn	%icc, .LL311
	LDF	[X1 - 1 * SIZE], c16

.LL312:
	FADD	c1, t1, c1
	FMUL	a1, c9, t1
	FADDX	c2, t2, c2
	FMUL	a1, c10, t2
	FADD	c3, t3, c3
	FMUL	a2, c9, t3
	FADD	c4, t4, c4
	FMUL	a2, c10, t4

	FADD	c1, t1, c1
	FMUL	a3, c11, t1
	FADDX	c2, t2, c2
	FMUL	a3, c12, t2
	FADD	c3, t3, c3
	FMUL	a4, c11, t3
	FADD	c4, t4, c4
	FMUL	a4, c12, t4

	FADD	c1, t1, c1
	FMUL	a5, c13, t1
	FADDX	c2, t2, c2
	FMUL	a5, c14, t2
	FADD	c3, t3, c3
	FMUL	a6, c13, t3
	FADD	c4, t4, c4
	FMUL	a6, c14, t4

	FADD	c1, t1, c1
	FMUL	a7, c15, t1
	FADDX	c2, t2, c2
	FMUL	a7, c16, t2
	FADD	c3, t3, c3
	FMUL	a8, c15, t3
	FADD	c4, t4, c4
	FMUL	a8, c16, t4

.LL315:
	andcc	MIN_M, 3, I
	LDF	ALPHA_R, b3
	mov	Y1, Y2
	ble	%icc, .LL319
	LDF	ALPHA_I, b4

	LDF	[A1 + 0 * SIZE], a1
	add	I, -1, I
	LDF	[A1 + 1 * SIZE], a2
	add	A1, 2 * SIZE, A1
	LDF	[X1 + 0 * SIZE], b1
	cmp	I, 0
	LDF	[X1 + 1 * SIZE], b2
	ble	%icc, .LL317
	add	X1, 2 * SIZE, X1

.LL316:
	FADD	c1, t1, c1
	add	I, -1, I
	FMUL	a1, b1, t1
	FADDX	c2, t2, c2
	FMUL	a1, b2, t2
	LDF	[A1 + 0 * SIZE], a1
	FADD	c3, t3, c3
	cmp	I, 0
	FMUL	a2, b1, t3
	LDF	[X1 + 0 * SIZE], b1
	FADD	c4, t4, c4
	add	X1, 2 * SIZE, X1
	FMUL	a2, b2, t4
	LDF	[A1 + 1 * SIZE], a2
	add	A1, 2 * SIZE, A1

	bg,pn	%icc, .LL316
	LDF	[X1 - 1 * SIZE], b2

.LL317:
	FADD	c1, t1, c1
	FMUL	a1, b1, t1
	FADDX	c2, t2, c2
	FMUL	a1, b2, t2
	FADD	c3, t3, c3
	FMUL	a2, b1, t3
	FADD	c4, t4, c4
	FMUL	a2, b2, t4

.LL319:
	FADD	c1, t1, c1
	LDF	[Y1 + 0 * SIZE], a1
	FADDX	c2, t2, c2
	LDF	[Y1 + 1 * SIZE], a2
	add	Y1, INCY, Y1
	FADD	c3, t3, c3
	FADD	c4, t4, c4

#if (!defined(XCONJ) && !defined(CONJ)) || (defined(XCONJ) && defined(CONJ))
	FSUB	c1, c4, c1
#else
	FADD	c1, c4, c1
#endif

#ifndef CONJ
	FADD	c2, c3, c2
#else
	FSUB	c2, c3, c2
#endif

	FMUL	b3, c1, c3
	FMUL	b4, c1, c4
	FMUL	b4, c2, c1
	FMUL	b3, c2, c2

	FSUB	c3, c1, c1
	FADD	c2, c4, c2

	FADD	a1, c1, a1
	FADD	a2, c2, a2

	STF	a1, [Y2 + 0 * SIZE]
	STF	a2, [Y2 + 1 * SIZE]

.LL400:
	mov	P, I
	add	IS, I, IS
	cmp	IS, M
	bl	%icc, .LL10
	add	A, PNLDA, A

.LL999:
	return	%i7 + 8
	clr	%o0

	EPILOGUE