/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/*   1. Redistributions of source code must retain the above */
/*      copyright notice, this list of conditions and the following */
/*      disclaimer. */
/* */
/*   2. Redistributions in binary form must reproduce the above */
/*      copyright notice, this list of conditions and the following */
/*      disclaimer in the documentation and/or other materials */
/*      provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"
#include "version.h"

#if !defined(EV4) && !defined(EV5) && !defined(EV6)
#error "Architecture is not specified."
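
/*
 * Overview, inferred from the code below: this appears to be the Alpha
 * (EV4/EV5/EV6) triangular-solve (TRSM-style) kernel.  The LN/LT/RN/RT
 * macros select whether the triangular factor is applied from the left
 * or the right and whether it is transposed.  For each tile of C (4x4,
 * then 2-wide and 1-wide remainders) the inner loops accumulate the
 * matrix-matrix update into registers c01..c16, the accumulated values
 * are subtracted from the packed panel, the small triangular system for
 * the tile is solved in registers, and the result is stored back both
 * to the packed buffer (through AO/BO) and to the C tile addressed
 * through C1..C4.
 *
 * A rough, illustrative C sketch of the accumulation performed by the
 * unrolled loop at $L12 (names here are made up, not from this file):
 *
 *   for (l = 0; l < kk; l++)
 *     for (j = 0; j < 4; j++)
 *       for (i = 0; i < 4; i++)
 *         acc[j][i] += a[4 * l + i] * b[4 * l + j];
 */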
#endif

#ifdef EV6
#define PREFETCHSIZE 56
#define UNOP unop
#endif

#ifdef EV5
#define PREFETCHSIZE 56
#define UNOP
#endif

#ifdef EV4
#define UNOP
#endif

#define STACKSIZE 80

#define M	$16
#define N	$17
#define K	$18
#define A	$20
#define B	$21
#define C	$22
#define LDC	$23

#define C1	$19
#define C2	$24
#define C3	$25
#define C4	$27

#define AO	$at
#define BO	$5
#define I	$6
#define J	$7
#define L	$8

#define a1	$f16
#define a2	$f17
#define a3	$f18
#define a4	$f19

#define b1	$f20
#define b2	$f21
#define b3	$f22
#define b4	$f23

#define t1	$f24
#define t2	$f25
#define t3	$f26
#define t4	$f27

#define a5	$f28
#define a6	$f30
#define b5	$f29

#define alpha	$f30

#define c01	$f0
#define c02	$f1
#define c03	$f2
#define c04	$f3
#define c05	$f4
#define c06	$f5
#define c07	$f6
#define c08	$f7
#define c09	$f8
#define c10	$f9
#define c11	$f10
#define c12	$f11
#define c13	$f12
#define c14	$f13
#define c15	$f14
#define c16	$f15

#define TMP1	$0
#define TMP2	$1
#define KK	$2
#define AORIG	$3
#define OFFSET	$4

	PROLOGUE
	PROFCODE
	.frame	$sp, STACKSIZE, $26, 0

	lda	$sp, -STACKSIZE($sp)

	ldq	C,      0 + STACKSIZE($sp)
	ldq	LDC,    8 + STACKSIZE($sp)
	ldq	OFFSET, 16 + STACKSIZE($sp)

	SXADDQ	LDC, 0, LDC

	stt	$f2,  0($sp)
	stt	$f3,  8($sp)
	stt	$f4, 16($sp)
	stt	$f5, 24($sp)
	stt	$f6, 32($sp)
	stt	$f7, 40($sp)
	stt	$f8, 48($sp)
	stt	$f9, 56($sp)

	cmple	M, 0, $0
	cmple	N, 0, $1
	cmple	K, 0, $2

	or	$0, $1, $0
	or	$0, $2, $0
	bne	$0, $L999

#ifdef LN
	mulq	M, K, TMP1
	SXADDQ	TMP1, A, A
	SXADDQ	M, C, C
#endif

#ifdef RN
	negq	OFFSET, KK
#endif

#ifdef RT
	mulq	N, K, TMP1
	SXADDQ	TMP1, B, B

	mulq	N, LDC, TMP1
	addq	TMP1, C, C

	subq	N, OFFSET, KK
#endif

	sra	N, 2, J
	ble	J, $L40
	.align 4

$L01:
#ifdef RT
	sll	K, 2 + BASE_SHIFT, TMP1
	subq	B, TMP1, B

	s4addq	LDC, 0, TMP1
	subq	C, TMP1, C
#endif

	mov	C,  C1
	addq	C,  LDC, C2
	addq	C2, LDC, C3
#ifndef RT
	s4addq	LDC, C, C
#endif

	fclr	t1
	addq	C3, LDC, C4
	fclr	t2

#ifdef LN
	addq	M, OFFSET, KK
#endif

#ifdef LT
	mov	OFFSET, KK
#endif

#if defined(LN) || defined(RT)
	mov	A, AORIG
#else
	mov	A, AO
#endif

	sra	M, 2, I
	fclr	t3
	fclr	t4
	ble	I, $L20
	.align 4

$L11:
#if defined(LT) || defined(RN)

	LD	a1,  0 * SIZE(AO)
	fclr	c11
	LD	a2,  1 * SIZE(AO)
	fclr	c12

	LD	a3,  2 * SIZE(AO)
	fclr	c16
	LD	a4,  3 * SIZE(AO)
	fclr	c15

	LD	b1,  0 * SIZE(B)
	fclr	c01
	LD	b2,  1 * SIZE(B)
	fclr	c02

	LD	b3,  2 * SIZE(B)
	fclr	c06
	LD	b4,  3 * SIZE(B)
	fclr	c05

	lds	$f31, 4 * SIZE(C1)
	fclr	c03
	lda	L, -2(KK)
	fclr	c04

	lds	$f31, 7 * SIZE(C2)
	fclr	c08
	lda	BO, 4 * SIZE(B)
	fclr	c13

	lds	$f31, 4 * SIZE(C3)
	fclr	c09
	lda	AO, 4 * SIZE(AO)
	fclr	c10

	lds	$f31, 7 * SIZE(C4)
	fclr	c14
	fclr	c07
	ble	KK, $L18

#else

#ifdef LN
	sll	K, BASE_SHIFT + 2, TMP1
	subq	AORIG, TMP1, AORIG
#endif

	sll	KK, BASE_SHIFT + 2, TMP1
	addq	AORIG, TMP1, AO
	addq	B,     TMP1, BO

	subq	K, KK, TMP1

	LD	a1,  0 * SIZE(AO)
	fclr	c11
	LD	a2,  1 * SIZE(AO)
	fclr	c12

	LD	a3,  2 * SIZE(AO)
	fclr	c16
	LD	a4,  3 * SIZE(AO)
	fclr	c15

	LD	b1,  0 * SIZE(BO)
	fclr	c01
	LD	b2,  1 * SIZE(BO)
	fclr	c02

	LD	b3,  2 * SIZE(BO)
	fclr	c06
	LD	b4,  3 * SIZE(BO)
	fclr	c05

	lds	$f31, 4 * SIZE(C1)
	fclr	c03
	lda	L, -2(TMP1)
	fclr	c04

	lds	$f31, 7 * SIZE(C2)
	fclr	c08
	lda	BO, 4 * SIZE(BO)
	fclr	c13

	lds	$f31, 4 * SIZE(C3)
	fclr	c09
	lda	AO, 4 * SIZE(AO)
	fclr	c10

	lds	$f31, 7 * SIZE(C4)
	fclr	c14
	fclr	c07
	ble	TMP1, $L18
#endif

	ble	L, $L15
	.align 5

$L12:
/*  1 */
	ADD	c11, t1, c11
#ifndef EV4
	ldq	$31, PREFETCHSIZE * SIZE(AO)
#else
	unop
#endif
	MUL	b1, a1, t1
#ifndef EV4
	ldl	$31, PREFETCHSIZE * SIZE(BO)
#else
	unop
#endif

	ADD	c12, t2, c12
	unop
	MUL	b1, a2, t2
	unop

	ADD	c16, t3, c16
	unop
	MUL	b2, a2, t3
	LD	a5, 0 * SIZE(AO)

	ADD	c15, t4, c15
	unop
	MUL	b2, a1, t4
	LD	b5, 0 * SIZE(BO)

/*  2 */
	ADD	c01, t1, c01
	UNOP
	MUL
b1, a3, t1 UNOP ADD c02, t2, c02 UNOP MUL b1, a4, t2 UNOP ADD c06, t3, c06 unop MUL b2, a4, t3 unop ADD c05, t4, c05 unop MUL b4, a1, t4 unop /* 3 */ ADD c03, t1, c03 unop MUL b3, a1, t1 unop ADD c04, t2, c04 unop MUL b3, a2, t2 unop ADD c08, t3, c08 unop MUL b4, a2, t3 LD a2, 1 * SIZE(AO) ADD c13, t4, c13 unop MUL b2, a3, t4 LD b2, 1 * SIZE(BO) /* 4 */ ADD c09, t1, c09 unop MUL b3, a3, t1 LD a6, 2 * SIZE(AO) ADD c10, t2, c10 unop MUL b3, a4, t2 LD b3, 2 * SIZE(BO) ADD c14, t3, c14 unop MUL b4, a4, t3 LD a4, 3 * SIZE(AO) ADD c07, t4, c07 unop MUL b4, a3, t4 LD b4, 3 * SIZE(BO) /* 5 */ ADD c11, t1, c11 unop MUL b5, a5, t1 LD a1, 4 * SIZE(AO) ADD c12, t2, c12 lda L, -2(L) MUL b5, a2, t2 LD b1, 4 * SIZE(BO) ADD c16, t3, c16 unop MUL b2, a2, t3 unop ADD c15, t4, c15 unop MUL b2, a5, t4 unop /* 6 */ ADD c01, t1, c01 unop MUL b5, a6, t1 unop ADD c02, t2, c02 unop MUL b5, a4, t2 unop ADD c06, t3, c06 unop MUL b2, a4, t3 unop ADD c05, t4, c05 unop MUL b4, a5, t4 unop /* 7 */ ADD c03, t1, c03 lda AO, 8 * SIZE(AO) MUL b3, a5, t1 unop ADD c04, t2, c04 lda BO, 8 * SIZE(BO) MUL b3, a2, t2 unop ADD c08, t3, c08 unop MUL b4, a2, t3 LD a2, -3 * SIZE(AO) ADD c13, t4, c13 unop MUL b2, a6, t4 LD b2, -3 * SIZE(BO) /* 8 */ ADD c09, t1, c09 unop MUL b3, a6, t1 LD a3, -2 * SIZE(AO) ADD c10, t2, c10 unop MUL b3, a4, t2 LD b3, -2 * SIZE(BO) ADD c14, t3, c14 unop MUL b4, a4, t3 LD a4, -1 * SIZE(AO) ADD c07, t4, c07 MUL b4, a6, t4 LD b4, -1 * SIZE(BO) bgt L, $L12 .align 4 $L15: ADD c11, t1, c11 MUL b1, a1, t1 #if defined(LT) || defined(RN) blbs KK, $L17 #else blbs TMP1, $L17 #endif .align 4 ADD c12, t2, c12 MUL b1, a2, t2 ADD c16, t3, c16 MUL b2, a2, t3 ADD c15, t4, c15 MUL b2, a1, t4 ADD c01, t1, c01 MUL b1, a3, t1 ADD c02, t2, c02 unop MUL b1, a4, t2 LD b1, 0 * SIZE(BO) ADD c06, t3, c06 MUL b2, a4, t3 ADD c05, t4, c05 MUL b4, a1, t4 ADD c03, t1, c03 unop MUL b3, a1, t1 LD a1, 0 * SIZE(AO) ADD c04, t2, c04 unop MUL b3, a2, t2 unop ADD c08, t3, c08 unop MUL b4, a2, t3 LD a2, 1 * SIZE(AO) ADD c13, t4, c13 unop MUL b2, a3, t4 LD b2, 1 * SIZE(BO) ADD c09, t1, c09 unop MUL b3, a3, t1 lda AO, 4 * SIZE(AO) ADD c10, t2, c10 unop MUL b3, a4, t2 LD b3, 2 * SIZE(BO) ADD c14, t3, c14 unop MUL b4, a4, t3 LD a4, -1 * SIZE(AO) ADD c07, t4, c07 unop MUL b4, a3, t4 LD a3, -2 * SIZE(AO) ADD c11, t1, c11 LD b4, 3 * SIZE(BO) MUL b1, a1, t1 lda BO, 4 * SIZE(BO) .align 4 $L17: ADD c12, t2, c12 MUL b1, a2, t2 ADD c16, t3, c16 MUL b2, a2, t3 ADD c15, t4, c15 MUL b2, a1, t4 ADD c01, t1, c01 MUL b1, a3, t1 ADD c02, t2, c02 MUL b1, a4, t2 ADD c06, t3, c06 MUL b2, a4, t3 ADD c05, t4, c05 MUL b4, a1, t4 ADD c03, t1, c03 MUL b3, a1, t1 ADD c04, t2, c04 MUL b3, a2, t2 ADD c08, t3, c08 MUL b4, a2, t3 ADD c13, t4, c13 MUL b2, a3, t4 ADD c09, t1, c09 MUL b3, a3, t1 ADD c10, t2, c10 MUL b3, a4, t2 ADD c14, t3, c14 MUL b4, a4, t3 ADD c07, t4, c07 lda AO, 4 * SIZE(AO) MUL b4, a3, t4 lda BO, 4 * SIZE(BO) ADD c11, t1, c11 ADD c12, t2, c12 ADD c16, t3, c16 ADD c15, t4, c15 .align 4 $L18: #if defined(LN) || defined(RT) #ifdef LN subq KK, 4, TMP1 #else subq KK, 4, TMP1 #endif sll TMP1, BASE_SHIFT + 2, TMP2 addq AORIG, TMP2, AO sll TMP1, BASE_SHIFT + 2, TMP2 addq B, TMP2, BO #else lda AO, -4 * SIZE(AO) lda BO, -4 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) LD b1, 4 * SIZE(BO) LD b2, 5 * SIZE(BO) LD b3, 6 * SIZE(BO) LD b4, 7 * SIZE(BO) SUB a1, c01, c01 SUB a2, c05, c05 SUB a3, c09, c09 SUB a4, c13, c13 SUB b1, c02, c02 SUB b2, c06, c06 SUB b3, c10, c10 SUB b4, c14, c14 LD 
a1, 8 * SIZE(BO) LD a2, 9 * SIZE(BO) LD a3, 10 * SIZE(BO) LD a4, 11 * SIZE(BO) LD b1, 12 * SIZE(BO) LD b2, 13 * SIZE(BO) LD b3, 14 * SIZE(BO) LD b4, 15 * SIZE(BO) SUB a1, c03, c03 SUB a2, c07, c07 SUB a3, c11, c11 SUB a4, c15, c15 SUB b1, c04, c04 SUB b2, c08, c08 SUB b3, c12, c12 SUB b4, c16, c16 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 4 * SIZE(AO) LD b2, 5 * SIZE(AO) LD b3, 6 * SIZE(AO) LD b4, 7 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 SUB a4, c04, c04 SUB b1, c05, c05 SUB b2, c06, c06 SUB b3, c07, c07 SUB b4, c08, c08 LD a1, 8 * SIZE(AO) LD a2, 9 * SIZE(AO) LD a3, 10 * SIZE(AO) LD a4, 11 * SIZE(AO) LD b1, 12 * SIZE(AO) LD b2, 13 * SIZE(AO) LD b3, 14 * SIZE(AO) LD b4, 15 * SIZE(AO) SUB a1, c09, c09 SUB a2, c10, c10 SUB a3, c11, c11 SUB a4, c12, c12 SUB b1, c13, c13 SUB b2, c14, c14 SUB b3, c15, c15 SUB b4, c16, c16 #endif #ifdef LN LD a1, 15 * SIZE(AO) LD a2, 14 * SIZE(AO) LD a3, 13 * SIZE(AO) LD a4, 12 * SIZE(AO) MUL a1, c04, c04 MUL a1, c08, c08 MUL a1, c12, c12 MUL a1, c16, c16 MUL a2, c04, t1 MUL a2, c08, t2 MUL a2, c12, t3 MUL a2, c16, t4 SUB c03, t1, c03 SUB c07, t2, c07 SUB c11, t3, c11 SUB c15, t4, c15 MUL a3, c04, t1 MUL a3, c08, t2 MUL a3, c12, t3 MUL a3, c16, t4 SUB c02, t1, c02 SUB c06, t2, c06 SUB c10, t3, c10 SUB c14, t4, c14 MUL a4, c04, t1 MUL a4, c08, t2 MUL a4, c12, t3 MUL a4, c16, t4 SUB c01, t1, c01 SUB c05, t2, c05 SUB c09, t3, c09 SUB c13, t4, c13 LD b1, 10 * SIZE(AO) LD b2, 9 * SIZE(AO) LD b3, 8 * SIZE(AO) MUL b1, c03, c03 MUL b1, c07, c07 MUL b1, c11, c11 MUL b1, c15, c15 MUL b2, c03, t1 MUL b2, c07, t2 MUL b2, c11, t3 MUL b2, c15, t4 SUB c02, t1, c02 SUB c06, t2, c06 SUB c10, t3, c10 SUB c14, t4, c14 MUL b3, c03, t1 MUL b3, c07, t2 MUL b3, c11, t3 MUL b3, c15, t4 SUB c01, t1, c01 SUB c05, t2, c05 SUB c09, t3, c09 SUB c13, t4, c13 LD a1, 5 * SIZE(AO) LD a2, 4 * SIZE(AO) LD a3, 0 * SIZE(AO) MUL a1, c02, c02 MUL a1, c06, c06 MUL a1, c10, c10 MUL a1, c14, c14 MUL a2, c02, t1 MUL a2, c06, t2 MUL a2, c10, t3 MUL a2, c14, t4 SUB c01, t1, c01 SUB c05, t2, c05 SUB c09, t3, c09 SUB c13, t4, c13 MUL a3, c01, c01 MUL a3, c05, c05 MUL a3, c09, c09 MUL a3, c13, c13 #endif #ifdef LT LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MUL a1, c01, c01 MUL a1, c05, c05 MUL a1, c09, c09 MUL a1, c13, c13 MUL a2, c01, t1 MUL a2, c05, t2 MUL a2, c09, t3 MUL a2, c13, t4 SUB c02, t1, c02 SUB c06, t2, c06 SUB c10, t3, c10 SUB c14, t4, c14 MUL a3, c01, t1 MUL a3, c05, t2 MUL a3, c09, t3 MUL a3, c13, t4 SUB c03, t1, c03 SUB c07, t2, c07 SUB c11, t3, c11 SUB c15, t4, c15 MUL a4, c01, t1 MUL a4, c05, t2 MUL a4, c09, t3 MUL a4, c13, t4 SUB c04, t1, c04 SUB c08, t2, c08 SUB c12, t3, c12 SUB c16, t4, c16 LD b1, 5 * SIZE(AO) LD b2, 6 * SIZE(AO) LD b3, 7 * SIZE(AO) MUL b1, c02, c02 MUL b1, c06, c06 MUL b1, c10, c10 MUL b1, c14, c14 MUL b2, c02, t1 MUL b2, c06, t2 MUL b2, c10, t3 MUL b2, c14, t4 SUB c03, t1, c03 SUB c07, t2, c07 SUB c11, t3, c11 SUB c15, t4, c15 MUL b3, c02, t1 MUL b3, c06, t2 MUL b3, c10, t3 MUL b3, c14, t4 SUB c04, t1, c04 SUB c08, t2, c08 SUB c12, t3, c12 SUB c16, t4, c16 LD a1, 10 * SIZE(AO) LD a2, 11 * SIZE(AO) LD a3, 15 * SIZE(AO) MUL a1, c03, c03 MUL a1, c07, c07 MUL a1, c11, c11 MUL a1, c15, c15 MUL a2, c03, t1 MUL a2, c07, t2 MUL a2, c11, t3 MUL a2, c15, t4 SUB c04, t1, c04 SUB c08, t2, c08 SUB c12, t3, c12 SUB c16, t4, c16 MUL a3, c04, c04 MUL a3, c08, c08 MUL a3, c12, c12 MUL a3, c16, c16 #endif #ifdef RN LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD 
a4, 3 * SIZE(BO) MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c03, c03 MUL a1, c04, c04 MUL a2, c01, t1 MUL a2, c02, t2 MUL a2, c03, t3 MUL a2, c04, t4 SUB c05, t1, c05 SUB c06, t2, c06 SUB c07, t3, c07 SUB c08, t4, c08 MUL a3, c01, t1 MUL a3, c02, t2 MUL a3, c03, t3 MUL a3, c04, t4 SUB c09, t1, c09 SUB c10, t2, c10 SUB c11, t3, c11 SUB c12, t4, c12 MUL a4, c01, t1 MUL a4, c02, t2 MUL a4, c03, t3 MUL a4, c04, t4 SUB c13, t1, c13 SUB c14, t2, c14 SUB c15, t3, c15 SUB c16, t4, c16 LD b1, 5 * SIZE(BO) LD b2, 6 * SIZE(BO) LD b3, 7 * SIZE(BO) MUL b1, c05, c05 MUL b1, c06, c06 MUL b1, c07, c07 MUL b1, c08, c08 MUL b2, c05, t1 MUL b2, c06, t2 MUL b2, c07, t3 MUL b2, c08, t4 SUB c09, t1, c09 SUB c10, t2, c10 SUB c11, t3, c11 SUB c12, t4, c12 MUL b3, c05, t1 MUL b3, c06, t2 MUL b3, c07, t3 MUL b3, c08, t4 SUB c13, t1, c13 SUB c14, t2, c14 SUB c15, t3, c15 SUB c16, t4, c16 LD a1, 10 * SIZE(BO) LD a2, 11 * SIZE(BO) LD a3, 15 * SIZE(BO) MUL a1, c09, c09 MUL a1, c10, c10 MUL a1, c11, c11 MUL a1, c12, c12 MUL a2, c09, t1 MUL a2, c10, t2 MUL a2, c11, t3 MUL a2, c12, t4 SUB c13, t1, c13 SUB c14, t2, c14 SUB c15, t3, c15 SUB c16, t4, c16 MUL a3, c13, c13 MUL a3, c14, c14 MUL a3, c15, c15 MUL a3, c16, c16 #endif #ifdef RT LD a1, 15 * SIZE(BO) LD a2, 14 * SIZE(BO) LD a3, 13 * SIZE(BO) LD a4, 12 * SIZE(BO) MUL a1, c13, c13 MUL a1, c14, c14 MUL a1, c15, c15 MUL a1, c16, c16 MUL a2, c13, t1 MUL a2, c14, t2 MUL a2, c15, t3 MUL a2, c16, t4 SUB c09, t1, c09 SUB c10, t2, c10 SUB c11, t3, c11 SUB c12, t4, c12 MUL a3, c13, t1 MUL a3, c14, t2 MUL a3, c15, t3 MUL a3, c16, t4 SUB c05, t1, c05 SUB c06, t2, c06 SUB c07, t3, c07 SUB c08, t4, c08 MUL a4, c13, t1 MUL a4, c14, t2 MUL a4, c15, t3 MUL a4, c16, t4 SUB c01, t1, c01 SUB c02, t2, c02 SUB c03, t3, c03 SUB c04, t4, c04 LD b1, 10 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 8 * SIZE(BO) MUL b1, c09, c09 MUL b1, c10, c10 MUL b1, c11, c11 MUL b1, c12, c12 MUL b2, c09, t1 MUL b2, c10, t2 MUL b2, c11, t3 MUL b2, c12, t4 SUB c05, t1, c05 SUB c06, t2, c06 SUB c07, t3, c07 SUB c08, t4, c08 MUL b3, c09, t1 MUL b3, c10, t2 MUL b3, c11, t3 MUL b3, c12, t4 SUB c01, t1, c01 SUB c02, t2, c02 SUB c03, t3, c03 SUB c04, t4, c04 LD a1, 5 * SIZE(BO) LD a2, 4 * SIZE(BO) LD a3, 0 * SIZE(BO) MUL a1, c05, c05 MUL a1, c06, c06 MUL a1, c07, c07 MUL a1, c08, c08 MUL a2, c05, t1 MUL a2, c06, t2 MUL a2, c07, t3 MUL a2, c08, t4 SUB c01, t1, c01 SUB c02, t2, c02 SUB c03, t3, c03 SUB c04, t4, c04 MUL a3, c01, c01 MUL a3, c02, c02 MUL a3, c03, c03 MUL a3, c04, c04 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c05, 1 * SIZE(BO) ST c09, 2 * SIZE(BO) ST c13, 3 * SIZE(BO) ST c02, 4 * SIZE(BO) ST c06, 5 * SIZE(BO) ST c10, 6 * SIZE(BO) ST c14, 7 * SIZE(BO) ST c03, 8 * SIZE(BO) ST c07, 9 * SIZE(BO) ST c11, 10 * SIZE(BO) ST c15, 11 * SIZE(BO) ST c04, 12 * SIZE(BO) ST c08, 13 * SIZE(BO) ST c12, 14 * SIZE(BO) ST c16, 15 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) ST c03, 2 * SIZE(AO) ST c04, 3 * SIZE(AO) ST c05, 4 * SIZE(AO) ST c06, 5 * SIZE(AO) ST c07, 6 * SIZE(AO) ST c08, 7 * SIZE(AO) ST c09, 8 * SIZE(AO) ST c10, 9 * SIZE(AO) ST c11, 10 * SIZE(AO) ST c12, 11 * SIZE(AO) ST c13, 12 * SIZE(AO) ST c14, 13 * SIZE(AO) ST c15, 14 * SIZE(AO) ST c16, 15 * SIZE(AO) #endif #ifdef LN lda C1, -4 * SIZE(C1) lda C2, -4 * SIZE(C2) lda C3, -4 * SIZE(C3) lda C4, -4 * SIZE(C4) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) ST c03, 2 * SIZE(C1) ST c04, 3 * SIZE(C1) ST c05, 0 * SIZE(C2) ST c06, 1 * SIZE(C2) ST c07, 2 * SIZE(C2) ST c08, 3 * SIZE(C2) ST c09, 0 * SIZE(C3) ST c10, 1 * SIZE(C3) ST c11, 2 * 
SIZE(C3) ST c12, 3 * SIZE(C3) ST c13, 0 * SIZE(C4) ST c14, 1 * SIZE(C4) ST c15, 2 * SIZE(C4) ST c16, 3 * SIZE(C4) #ifndef LN lda C1, 4 * SIZE(C1) lda C2, 4 * SIZE(C2) lda C3, 4 * SIZE(C3) lda C4, 4 * SIZE(C4) #endif fclr t1 fclr t2 fclr t3 fclr t4 #ifdef RT sll K, 2 + BASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 2, TMP1 addq AO, TMP1, AO addq BO, TMP1, BO #endif #ifdef LT addq KK, 4, KK #endif #ifdef LN subq KK, 4, KK #endif lda I, -1(I) bgt I, $L11 .align 4 $L20: and M, 2, I ble I, $L30 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr c09 LD a2, 1 * SIZE(AO) fclr c13 LD a3, 2 * SIZE(AO) fclr c10 LD a4, 3 * SIZE(AO) fclr c14 LD b1, 0 * SIZE(B) lda L, -2(KK) LD b2, 1 * SIZE(B) lda AO, 2 * SIZE(AO) LD b3, 2 * SIZE(B) fclr c01 LD b4, 3 * SIZE(B) fclr c05 lda BO, 4 * SIZE(B) fclr c02 fclr c06 ble KK, $L28 ble L, $L25 #else #ifdef LN sll K, BASE_SHIFT + 1, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 1, TMP1 addq AORIG, TMP1, AO sll KK, BASE_SHIFT + 2, TMP2 addq B, TMP2, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c09 LD a2, 1 * SIZE(AO) fclr c13 LD a3, 2 * SIZE(AO) fclr c10 LD a4, 3 * SIZE(AO) fclr c14 LD b1, 0 * SIZE(BO) lda L, -2(TMP1) LD b2, 1 * SIZE(BO) lda AO, 2 * SIZE(AO) LD b3, 2 * SIZE(BO) fclr c01 LD b4, 3 * SIZE(BO) fclr c05 lda BO, 4 * SIZE(BO) fclr c02 fclr c06 ble TMP1, $L28 ble L, $L25 #endif .align 4 $L22: ADD c09, t1, c09 unop MUL a1, b1, t1 unop ADD c10, t2, c10 unop MUL a2, b1, t2 LD b1, 0 * SIZE(BO) ADD c13, t3, c13 unop MUL a1, b2, t3 lda BO, 8 * SIZE(BO) ADD c14, t4, c14 unop MUL a2, b2, t4 LD b2, -7 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b3, t1 unop ADD c02, t2, c02 unop MUL a2, b3, t2 LD b3, -6 * SIZE(BO) ADD c05, t3, c05 unop MUL a1, b4, t3 LD a1, 2 * SIZE(AO) ADD c06, t4, c06 MUL a2, b4, t4 LD b5, -5 * SIZE(BO) ADD c09, t1, c09 unop MUL a3, b1, t1 LD a2, 3 * SIZE(AO) ADD c10, t2, c10 unop MUL a4, b1, t2 LD b1, -4 * SIZE(BO) ADD c13, t3, c13 unop MUL a3, b2, t3 lda AO, 4 * SIZE(AO) ADD c14, t4, c14 MUL a4, b2, t4 LD b2, -3 * SIZE(BO) ADD c01, t1, c01 lda L, -2(L) MUL a3, b3, t1 LD b4, -1 * SIZE(BO) ADD c02, t2, c02 unop MUL a4, b3, t2 LD b3, -2 * SIZE(BO) ADD c05, t3, c05 unop MUL a3, b5, t3 LD a3, 0 * SIZE(AO) ADD c06, t4, c06 MUL a4, b5, t4 LD a4, 1 * SIZE(AO) bgt L, $L22 .align 4 $L25: ADD c09, t1, c09 MUL a1, b1, t1 #if defined(LT) || defined(RN) blbs KK, $L27 #else blbs TMP1, $L27 #endif ADD c10, t2, c10 unop MUL a2, b1, t2 LD b1, 0 * SIZE(BO) ADD c13, t3, c13 unop MUL a1, b2, t3 unop ADD c14, t4, c14 unop MUL a2, b2, t4 LD b2, 1 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b3, t1 lda AO, 2 * SIZE(AO) ADD c02, t2, c02 unop MUL a2, b3, t2 LD b3, 2 * SIZE(BO) ADD c05, t3, c05 unop MUL a1, b4, t3 LD a1, -2 * SIZE(AO) ADD c06, t4, c06 unop MUL a2, b4, t4 LD a2, -1 * SIZE(AO) ADD c09, t1, c09 LD b4, 3 * SIZE(BO) MUL a1, b1, t1 lda BO, 4 * SIZE(BO) .align 4 $L27: ADD c10, t2, c10 MUL a2, b1, t2 ADD c13, t3, c13 MUL a1, b2, t3 ADD c14, t4, c14 MUL a2, b2, t4 ADD c01, t1, c01 MUL a1, b3, t1 ADD c02, t2, c02 MUL a2, b3, t2 ADD c05, t3, c05 MUL a1, b4, t3 ADD c06, t4, c06 lda AO, 2 * SIZE(AO) MUL a2, b4, t4 lda BO, 4 * SIZE(BO) ADD c09, t1, c09 ADD c10, t2, c10 ADD c13, t3, c13 ADD c14, t4, c14 .align 4 $L28: #if defined(LN) || defined(RT) #ifdef LN subq KK, 2, TMP1 #else subq KK, 4, TMP1 #endif sll TMP1, BASE_SHIFT + 1, TMP2 addq AORIG, TMP2, AO sll TMP1, BASE_SHIFT + 2, TMP2 addq B, TMP2, BO #else lda AO, -2 * SIZE(AO) lda BO, -4 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD 
a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) LD b1, 4 * SIZE(BO) LD b2, 5 * SIZE(BO) LD b3, 6 * SIZE(BO) LD b4, 7 * SIZE(BO) SUB a1, c01, c01 SUB a2, c05, c05 SUB a3, c09, c09 SUB a4, c13, c13 SUB b1, c02, c02 SUB b2, c06, c06 SUB b3, c10, c10 SUB b4, c14, c14 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 4 * SIZE(AO) LD b2, 5 * SIZE(AO) LD b3, 6 * SIZE(AO) LD b4, 7 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c05, c05 SUB a4, c06, c06 SUB b1, c09, c09 SUB b2, c10, c10 SUB b3, c13, c13 SUB b4, c14, c14 #endif #ifdef LN LD a1, 3 * SIZE(AO) LD a2, 2 * SIZE(AO) LD a3, 0 * SIZE(AO) MUL a1, c02, c02 MUL a1, c06, c06 MUL a1, c10, c10 MUL a1, c14, c14 MUL a2, c02, t1 MUL a2, c06, t2 MUL a2, c10, t3 MUL a2, c14, t4 SUB c01, t1, c01 SUB c05, t2, c05 SUB c09, t3, c09 SUB c13, t4, c13 MUL a3, c01, c01 MUL a3, c05, c05 MUL a3, c09, c09 MUL a3, c13, c13 #endif #ifdef LT LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 3 * SIZE(AO) MUL a1, c01, c01 MUL a1, c05, c05 MUL a1, c09, c09 MUL a1, c13, c13 MUL a2, c01, t1 MUL a2, c05, t2 MUL a2, c09, t3 MUL a2, c13, t4 SUB c02, t1, c02 SUB c06, t2, c06 SUB c10, t3, c10 SUB c14, t4, c14 MUL a3, c02, c02 MUL a3, c06, c06 MUL a3, c10, c10 MUL a3, c14, c14 #endif #ifdef RN LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) MUL a1, c01, c01 MUL a1, c02, c02 MUL a2, c01, t1 MUL a2, c02, t2 SUB c05, t1, c05 SUB c06, t2, c06 MUL a3, c01, t1 MUL a3, c02, t2 SUB c09, t1, c09 SUB c10, t2, c10 MUL a4, c01, t1 MUL a4, c02, t2 SUB c13, t1, c13 SUB c14, t2, c14 LD b1, 5 * SIZE(BO) LD b2, 6 * SIZE(BO) LD b3, 7 * SIZE(BO) MUL b1, c05, c05 MUL b1, c06, c06 MUL b2, c05, t1 MUL b2, c06, t2 SUB c09, t1, c09 SUB c10, t2, c10 MUL b3, c05, t1 MUL b3, c06, t2 SUB c13, t1, c13 SUB c14, t2, c14 LD a1, 10 * SIZE(BO) LD a2, 11 * SIZE(BO) LD a3, 15 * SIZE(BO) MUL a1, c09, c09 MUL a1, c10, c10 MUL a2, c09, t1 MUL a2, c10, t2 SUB c13, t1, c13 SUB c14, t2, c14 MUL a3, c13, c13 MUL a3, c14, c14 #endif #ifdef RT LD a1, 15 * SIZE(BO) LD a2, 14 * SIZE(BO) LD a3, 13 * SIZE(BO) LD a4, 12 * SIZE(BO) MUL a1, c13, c13 MUL a1, c14, c14 MUL a2, c13, t1 MUL a2, c14, t2 SUB c09, t1, c09 SUB c10, t2, c10 MUL a3, c13, t1 MUL a3, c14, t2 SUB c05, t1, c05 SUB c06, t2, c06 MUL a4, c13, t1 MUL a4, c14, t2 SUB c01, t1, c01 SUB c02, t2, c02 LD b1, 10 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 8 * SIZE(BO) MUL b1, c09, c09 MUL b1, c10, c10 MUL b2, c09, t1 MUL b2, c10, t2 SUB c05, t1, c05 SUB c06, t2, c06 MUL b3, c09, t1 MUL b3, c10, t2 SUB c01, t1, c01 SUB c02, t2, c02 LD a1, 5 * SIZE(BO) LD a2, 4 * SIZE(BO) LD a3, 0 * SIZE(BO) MUL a1, c05, c05 MUL a1, c06, c06 MUL a2, c05, t1 MUL a2, c06, t2 SUB c01, t1, c01 SUB c02, t2, c02 MUL a3, c01, c01 MUL a3, c02, c02 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c05, 1 * SIZE(BO) ST c09, 2 * SIZE(BO) ST c13, 3 * SIZE(BO) ST c02, 4 * SIZE(BO) ST c06, 5 * SIZE(BO) ST c10, 6 * SIZE(BO) ST c14, 7 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) ST c05, 2 * SIZE(AO) ST c06, 3 * SIZE(AO) ST c09, 4 * SIZE(AO) ST c10, 5 * SIZE(AO) ST c13, 6 * SIZE(AO) ST c14, 7 * SIZE(AO) #endif #ifdef LN lda C1, -2 * SIZE(C1) lda C2, -2 * SIZE(C2) lda C3, -2 * SIZE(C3) lda C4, -2 * SIZE(C4) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) ST c05, 0 * SIZE(C2) ST c06, 1 * SIZE(C2) ST c09, 0 * SIZE(C3) ST c10, 1 * SIZE(C3) ST c13, 0 * SIZE(C4) ST c14, 1 * SIZE(C4) #ifndef LN lda C1, 2 * SIZE(C1) lda C2, 2 * SIZE(C2) lda C3, 2 * SIZE(C3) lda C4, 2 * SIZE(C4) #endif 
fclr t1 fclr t2 fclr t3 fclr t4 #ifdef RT sll K, 1 + BASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 1, TMP2 addq AO, TMP2, AO sll TMP1, BASE_SHIFT + 2, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 2, KK #endif #ifdef LN subq KK, 2, KK #endif .align 4 $L30: and M, 1, I ble I, $L39 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr c01 LD a2, 1 * SIZE(AO) fclr c05 LD b1, 0 * SIZE(B) lda L, -2(KK) LD b2, 1 * SIZE(B) lda AO, 1 * SIZE(AO) LD b3, 2 * SIZE(B) fclr c09 LD b4, 3 * SIZE(B) fclr c13 lda BO, 4 * SIZE(B) ble KK, $L38 ble L, $L35 #else #ifdef LN sll K, BASE_SHIFT + 0, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 0, TMP1 addq AORIG, TMP1, AO sll KK, BASE_SHIFT + 2, TMP2 addq B, TMP2, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c01 LD a2, 1 * SIZE(AO) fclr c05 LD b1, 0 * SIZE(BO) lda L, -2(TMP1) LD b2, 1 * SIZE(BO) lda AO, 1 * SIZE(AO) LD b3, 2 * SIZE(BO) fclr c09 LD b4, 3 * SIZE(BO) fclr c13 lda BO, 4 * SIZE(BO) ble TMP1, $L38 ble L, $L35 #endif .align 4 $L32: ADD c01, t1, c01 lda L, -2(L) MUL a1, b1, t1 LD b1, 0 * SIZE(BO) ADD c05, t2, c05 lda AO, 2 * SIZE(AO) MUL a1, b2, t2 LD b2, 1 * SIZE(BO) ADD c09, t3, c09 LD b5, 3 * SIZE(BO) MUL a1, b3, t3 LD b3, 2 * SIZE(BO) ADD c13, t4, c13 MUL a1, b4, t4 LD a1, -1 * SIZE(AO) ADD c01, t1, c01 MUL a2, b1, t1 LD b1, 4 * SIZE(BO) lda BO, 8 * SIZE(BO) ADD c05, t2, c05 MUL a2, b2, t2 LD b2, -3 * SIZE(BO) ADD c09, t3, c09 LD b4, -1 * SIZE(BO) MUL a2, b3, t3 LD b3, -2 * SIZE(BO) ADD c13, t4, c13 MUL a2, b5, t4 LD a2, 0 * SIZE(AO) bgt L, $L32 .align 4 $L35: ADD c01, t1, c01 MUL a1, b1, t1 #if defined(LT) || defined(RN) blbs KK, $L37 #else blbs TMP1, $L37 #endif .align 4 ADD c05, t2, c05 LD b1, 0 * SIZE(BO) MUL a1, b2, t2 LD b2, 1 * SIZE(BO) ADD c09, t3, c09 MUL a1, b3, t3 LD b3, 2 * SIZE(BO) ADD c13, t4, c13 MUL a1, b4, t4 LD a1, 0 * SIZE(AO) lda AO, 1 * SIZE(AO) ADD c01, t1, c01 LD b4, 3 * SIZE(BO) MUL a1, b1, t1 lda BO, 4 * SIZE(BO) .align 4 $L37: ADD c05, t2, c05 MUL a1, b2, t2 ADD c09, t3, c09 MUL a1, b3, t3 ADD c13, t4, c13 lda AO, 1 * SIZE(AO) MUL a1, b4, t4 lda BO, 4 * SIZE(BO) ADD c01, t1, c01 ADD c05, t2, c05 ADD c09, t3, c09 ADD c13, t4, c13 $L38: #if defined(LN) || defined(RT) #ifdef LN subq KK, 1, TMP1 #else subq KK, 4, TMP1 #endif sll TMP1, BASE_SHIFT + 0, TMP2 addq AORIG, TMP2, AO sll TMP1, BASE_SHIFT + 2, TMP2 addq B, TMP2, BO #else lda AO, -1 * SIZE(AO) lda BO, -4 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) SUB a1, c01, c01 SUB a2, c05, c05 SUB a3, c09, c09 SUB a4, c13, c13 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) SUB a1, c01, c01 SUB a2, c05, c05 SUB a3, c09, c09 SUB a4, c13, c13 #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(AO) MUL a1, c01, c01 MUL a1, c05, c05 MUL a1, c09, c09 MUL a1, c13, c13 #endif #ifdef RN LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) MUL a1, c01, c01 MUL a2, c01, t1 SUB c05, t1, c05 MUL a3, c01, t1 SUB c09, t1, c09 MUL a4, c01, t1 SUB c13, t1, c13 LD b1, 5 * SIZE(BO) LD b2, 6 * SIZE(BO) LD b3, 7 * SIZE(BO) MUL b1, c05, c05 MUL b2, c05, t1 SUB c09, t1, c09 MUL b3, c05, t1 SUB c13, t1, c13 LD a1, 10 * SIZE(BO) LD a2, 11 * SIZE(BO) LD a3, 15 * SIZE(BO) MUL a1, c09, c09 MUL a2, c09, t1 SUB c13, t1, c13 MUL a3, c13, c13 #endif #ifdef RT LD a1, 15 * SIZE(BO) LD a2, 14 * SIZE(BO) LD a3, 13 * SIZE(BO) LD a4, 12 * SIZE(BO) MUL a1, c13, c13 MUL a2, c13, t1 SUB 
c09, t1, c09 MUL a3, c13, t1 SUB c05, t1, c05 MUL a4, c13, t1 SUB c01, t1, c01 LD b1, 10 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 8 * SIZE(BO) MUL b1, c09, c09 MUL b2, c09, t1 SUB c05, t1, c05 MUL b3, c09, t1 SUB c01, t1, c01 LD a1, 5 * SIZE(BO) LD a2, 4 * SIZE(BO) LD a3, 0 * SIZE(BO) MUL a1, c05, c05 MUL a2, c05, t1 SUB c01, t1, c01 MUL a3, c01, c01 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c05, 1 * SIZE(BO) ST c09, 2 * SIZE(BO) ST c13, 3 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c05, 1 * SIZE(AO) ST c09, 2 * SIZE(AO) ST c13, 3 * SIZE(AO) #endif #ifdef LN lda C1, -1 * SIZE(C1) lda C2, -1 * SIZE(C2) lda C3, -1 * SIZE(C3) lda C4, -1 * SIZE(C4) #endif ST c01, 0 * SIZE(C1) ST c05, 0 * SIZE(C2) ST c09, 0 * SIZE(C3) ST c13, 0 * SIZE(C4) #ifdef RT sll K, 0 + BASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 0, TMP2 addq AO, TMP2, AO sll TMP1, BASE_SHIFT + 2, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 1, KK #endif #ifdef LN subq KK, 1, KK #endif .align 4 $L39: #ifdef LN sll K, 2 + BASE_SHIFT, TMP1 addq B, TMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN addq KK, 4, KK #endif #ifdef RT subq KK, 4, KK #endif lda J, -1(J) bgt J, $L01 .align 4 $L40: and N, 2, J ble J, $L80 #ifdef RT sll K, 1 + BASE_SHIFT, TMP1 subq B, TMP1, B addq LDC, LDC, TMP1 subq C, TMP1, C #endif mov C, C1 addq C, LDC, C2 fclr t1 #ifndef RT addq C2, LDC, C #endif fclr t2 #ifdef LN addq M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif sra M, 2, I fclr t3 fclr t4 ble I, $L60 .align 4 $L51: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr c03 LD a2, 1 * SIZE(AO) fclr c07 LD a3, 2 * SIZE(AO) fclr c04 LD a4, 3 * SIZE(AO) fclr c08 LD b1, 0 * SIZE(B) fclr c01 LD b2, 1 * SIZE(B) fclr c05 LD b3, 2 * SIZE(B) fclr c02 LD b4, 3 * SIZE(B) fclr c06 lda L, -2(KK) lda BO, 2 * SIZE(B) lda AO, 4 * SIZE(AO) ble KK, $L58 ble L, $L55 #else #ifdef LN sll K, BASE_SHIFT + 2, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 2, TMP1 addq AORIG, TMP1, AO sll KK, BASE_SHIFT + 1, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c03 LD a2, 1 * SIZE(AO) fclr c07 LD a3, 2 * SIZE(AO) fclr c04 LD a4, 3 * SIZE(AO) fclr c08 LD b1, 0 * SIZE(BO) fclr c01 LD b2, 1 * SIZE(BO) fclr c05 LD b3, 2 * SIZE(BO) fclr c02 LD b4, 3 * SIZE(BO) fclr c06 lda L, -2(TMP1) lda BO, 2 * SIZE(BO) lda AO, 4 * SIZE(AO) ble TMP1, $L58 ble L, $L55 #endif .align 4 $L52: ADD c05, t1, c05 unop MUL a1, b1, t1 unop ADD c06, t2, c06 lda L, -2(L) MUL a2, b1, t2 unop ADD c07, t3, c07 unop MUL a3, b1, t3 unop ADD c08, t4, c08 unop MUL a4, b1, t4 LD b1, 2 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b2, t1 LD a1, 0 * SIZE(AO) ADD c02, t2, c02 lda BO, 4 * SIZE(BO) MUL a2, b2, t2 LD a2, 1 * SIZE(AO) ADD c03, t3, c03 unop MUL a3, b2, t3 LD a3, 2 * SIZE(AO) ADD c04, t4, c04 unop MUL a4, b2, t4 LD a5, 3 * SIZE(AO) ADD c05, t1, c05 unop MUL a1, b3, t1 LD b2, -1 * SIZE(BO) ADD c06, t2, c06 unop MUL a2, b3, t2 unop ADD c07, t3, c07 unop MUL a3, b3, t3 lda AO, 8 * SIZE(AO) ADD c08, t4, c08 unop MUL a5, b3, t4 LD b3, 0 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b4, t1 LD a1, -4 * SIZE(AO) ADD c02, t2, c02 unop MUL a2, b4, t2 LD a2, -3 * SIZE(AO) ADD c03, t3, c03 LD a4, -1 * SIZE(AO) MUL a3, b4, t3 LD a3, -2 * SIZE(AO) ADD c04, t4, c04 MUL a5, b4, t4 LD b4, 1 * SIZE(BO) bgt L, $L52 .align 4 $L55: ADD c05, t1, c05 MUL a1, b1, t1 #if defined(LT) || defined(RN) blbs KK, $L57 #else blbs TMP1, $L57 #endif 
.align 4 ADD c06, t2, c06 MUL a2, b1, t2 ADD c07, t3, c07 MUL a3, b1, t3 ADD c08, t4, c08 unop MUL a4, b1, t4 LD b1, 0 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b2, t1 LD a1, 0 * SIZE(AO) ADD c02, t2, c02 unop MUL a2, b2, t2 LD a2, 1 * SIZE(AO) ADD c03, t3, c03 unop MUL a3, b2, t3 LD a3, 2 * SIZE(AO) ADD c04, t4, c04 MUL a4, b2, t4 LD a4, 3 * SIZE(AO) lda AO, 4 * SIZE(AO) ADD c05, t1, c05 LD b2, 1 * SIZE(BO) MUL a1, b1, t1 lda BO, 2 * SIZE(BO) .align 4 $L57: ADD c06, t2, c06 MUL a2, b1, t2 ADD c07, t3, c07 MUL a3, b1, t3 ADD c08, t4, c08 MUL a4, b1, t4 ADD c01, t1, c01 MUL a1, b2, t1 ADD c02, t2, c02 MUL a2, b2, t2 ADD c03, t3, c03 MUL a3, b2, t3 ADD c04, t4, c04 lda AO, 4 * SIZE(AO) MUL a4, b2, t4 lda BO, 2 * SIZE(BO) ADD c05, t1, c05 ADD c06, t2, c06 ADD c07, t3, c07 ADD c08, t4, c08 .align 4 $L58: #if defined(LN) || defined(RT) #ifdef LN subq KK, 4, TMP1 #else subq KK, 2, TMP1 #endif sll TMP1, BASE_SHIFT + 2, TMP2 addq AORIG, TMP2, AO sll TMP1, BASE_SHIFT + 1, TMP2 addq B, TMP2, BO #else lda AO, -4 * SIZE(AO) lda BO, -2 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) LD b1, 4 * SIZE(BO) LD b2, 5 * SIZE(BO) LD b3, 6 * SIZE(BO) LD b4, 7 * SIZE(BO) SUB a1, c01, c01 SUB a2, c05, c05 SUB a3, c02, c02 SUB a4, c06, c06 SUB b1, c03, c03 SUB b2, c07, c07 SUB b3, c04, c04 SUB b4, c08, c08 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 4 * SIZE(AO) LD b2, 5 * SIZE(AO) LD b3, 6 * SIZE(AO) LD b4, 7 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 SUB a4, c04, c04 SUB b1, c05, c05 SUB b2, c06, c06 SUB b3, c07, c07 SUB b4, c08, c08 #endif #ifdef LN LD a1, 15 * SIZE(AO) LD a2, 14 * SIZE(AO) LD a3, 13 * SIZE(AO) LD a4, 12 * SIZE(AO) MUL a1, c04, c04 MUL a1, c08, c08 MUL a2, c04, t1 MUL a2, c08, t2 SUB c03, t1, c03 SUB c07, t2, c07 MUL a3, c04, t1 MUL a3, c08, t2 SUB c02, t1, c02 SUB c06, t2, c06 MUL a4, c04, t1 MUL a4, c08, t2 SUB c01, t1, c01 SUB c05, t2, c05 LD b1, 10 * SIZE(AO) LD b2, 9 * SIZE(AO) LD b3, 8 * SIZE(AO) MUL b1, c03, c03 MUL b1, c07, c07 MUL b2, c03, t1 MUL b2, c07, t2 SUB c02, t1, c02 SUB c06, t2, c06 MUL b3, c03, t1 MUL b3, c07, t2 SUB c01, t1, c01 SUB c05, t2, c05 LD a1, 5 * SIZE(AO) LD a2, 4 * SIZE(AO) LD a3, 0 * SIZE(AO) MUL a1, c02, c02 MUL a1, c06, c06 MUL a2, c02, t1 MUL a2, c06, t2 SUB c01, t1, c01 SUB c05, t2, c05 MUL a3, c01, c01 MUL a3, c05, c05 #endif #ifdef LT LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MUL a1, c01, c01 MUL a1, c05, c05 MUL a2, c01, t1 MUL a2, c05, t2 SUB c02, t1, c02 SUB c06, t2, c06 MUL a3, c01, t1 MUL a3, c05, t2 SUB c03, t1, c03 SUB c07, t2, c07 MUL a4, c01, t1 MUL a4, c05, t2 SUB c04, t1, c04 SUB c08, t2, c08 LD b1, 5 * SIZE(AO) LD b2, 6 * SIZE(AO) LD b3, 7 * SIZE(AO) MUL b1, c02, c02 MUL b1, c06, c06 MUL b2, c02, t1 MUL b2, c06, t2 SUB c03, t1, c03 SUB c07, t2, c07 MUL b3, c02, t1 MUL b3, c06, t2 SUB c04, t1, c04 SUB c08, t2, c08 LD a1, 10 * SIZE(AO) LD a2, 11 * SIZE(AO) LD a3, 15 * SIZE(AO) MUL a1, c03, c03 MUL a1, c07, c07 MUL a2, c03, t1 MUL a2, c07, t2 SUB c04, t1, c04 SUB c08, t2, c08 MUL a3, c04, c04 MUL a3, c08, c08 #endif #ifdef RN LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 3 * SIZE(BO) MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c03, c03 MUL a1, c04, c04 MUL a2, c01, t1 MUL a2, c02, t2 MUL a2, c03, t3 MUL a2, c04, t4 SUB c05, t1, c05 SUB c06, t2, c06 SUB c07, t3, c07 SUB c08, t4, c08 MUL a3, c05, c05 MUL a3, c06, c06 MUL a3, c07, c07 MUL a3, c08, c08 #endif #ifdef RT 
LD a1, 3 * SIZE(BO) LD a2, 2 * SIZE(BO) LD a3, 0 * SIZE(BO) MUL a1, c05, c05 MUL a1, c06, c06 MUL a1, c07, c07 MUL a1, c08, c08 MUL a2, c05, t1 MUL a2, c06, t2 MUL a2, c07, t3 MUL a2, c08, t4 SUB c01, t1, c01 SUB c02, t2, c02 SUB c03, t3, c03 SUB c04, t4, c04 MUL a3, c01, c01 MUL a3, c02, c02 MUL a3, c03, c03 MUL a3, c04, c04 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c05, 1 * SIZE(BO) ST c02, 2 * SIZE(BO) ST c06, 3 * SIZE(BO) ST c03, 4 * SIZE(BO) ST c07, 5 * SIZE(BO) ST c04, 6 * SIZE(BO) ST c08, 7 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) ST c03, 2 * SIZE(AO) ST c04, 3 * SIZE(AO) ST c05, 4 * SIZE(AO) ST c06, 5 * SIZE(AO) ST c07, 6 * SIZE(AO) ST c08, 7 * SIZE(AO) #endif #ifdef LN lda C1, -4 * SIZE(C1) lda C2, -4 * SIZE(C2) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) ST c03, 2 * SIZE(C1) ST c04, 3 * SIZE(C1) ST c05, 0 * SIZE(C2) ST c06, 1 * SIZE(C2) ST c07, 2 * SIZE(C2) ST c08, 3 * SIZE(C2) #ifndef LN lda C1, 4 * SIZE(C1) lda C2, 4 * SIZE(C2) #endif fclr t1 fclr t2 fclr t3 fclr t4 #ifdef RT sll K, 2 + BASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 2, TMP2 addq AO, TMP2, AO sll TMP1, BASE_SHIFT + 1, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 4, KK #endif #ifdef LN subq KK, 4, KK #endif lda I, -1(I) bgt I, $L51 .align 4 $L60: and M, 2, I ble I, $L70 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr c01 LD a2, 1 * SIZE(AO) fclr c05 LD a3, 2 * SIZE(AO) fclr c02 LD a4, 3 * SIZE(AO) fclr c06 LD b1, 0 * SIZE(B) lda L, -2(KK) LD b2, 1 * SIZE(B) lda AO, 2 * SIZE(AO) LD b3, 2 * SIZE(B) LD b4, 3 * SIZE(B) lda BO, 2 * SIZE(B) ble KK, $L68 ble L, $L65 #else #ifdef LN sll K, BASE_SHIFT + 1, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 1, TMP1 addq AORIG, TMP1, AO sll KK, BASE_SHIFT + 1, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c01 LD a2, 1 * SIZE(AO) fclr c05 LD a3, 2 * SIZE(AO) fclr c02 LD a4, 3 * SIZE(AO) fclr c06 LD b1, 0 * SIZE(BO) lda L, -2(TMP1) LD b2, 1 * SIZE(BO) lda AO, 2 * SIZE(AO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) lda BO, 2 * SIZE(BO) ble TMP1, $L68 ble L, $L65 #endif .align 4 $L62: ADD c01, t1, c01 unop MUL a1, b1, t1 unop ADD c02, t2, c02 lda AO, 4 * SIZE(AO) MUL a2, b1, t2 LD b1, 2 * SIZE(BO) ADD c05, t3, c05 lda L, -2(L) MUL a1, b2, t3 LD a1, -2 * SIZE(AO) ADD c06, t4, c06 unop MUL a2, b2, t4 LD a2, -1 * SIZE(AO) ADD c01, t1, c01 LD b2, 3 * SIZE(BO) MUL a3, b3, t1 lda BO, 4 * SIZE(BO) ADD c02, t2, c02 unop MUL a4, b3, t2 LD b3, 0 * SIZE(BO) ADD c05, t3, c05 unop MUL a3, b4, t3 LD a3, 0 * SIZE(AO) ADD c06, t4, c06 MUL a4, b4, t4 LD b4, 1 * SIZE(BO) unop LD a4, 1 * SIZE(AO) unop unop bgt L, $L62 .align 4 $L65: ADD c01, t1, c01 MUL a1, b1, t1 #if defined(LT) || defined(RN) blbs KK, $L67 #else blbs TMP1, $L67 #endif .align 4 ADD c02, t2, c02 unop MUL a2, b1, t2 LD b1, 0 * SIZE(BO) ADD c05, t3, c05 lda BO, 2 * SIZE(BO) MUL a1, b2, t3 LD a1, 0 * SIZE(AO) ADD c06, t4, c06 unop MUL a2, b2, t4 LD a2, 1 * SIZE(AO) ADD c01, t1, c01 LD b2, -1 * SIZE(BO) MUL a1, b1, t1 lda AO, 2 * SIZE(AO) .align 4 $L67: ADD c02, t2, c02 MUL a2, b1, t2 ADD c05, t3, c05 MUL a1, b2, t3 ADD c06, t4, c06 lda AO, 2 * SIZE(AO) MUL a2, b2, t4 lda BO, 2 * SIZE(BO) ADD c01, t1, c01 ADD c02, t2, c02 ADD c05, t3, c05 ADD c06, t4, c06 .align 4 $L68: #if defined(LN) || defined(RT) #ifdef LN subq KK, 2, TMP1 #else subq KK, 2, TMP1 #endif sll TMP1, BASE_SHIFT + 1, TMP2 addq AORIG, TMP2, AO sll TMP1, BASE_SHIFT + 1, TMP2 addq B, TMP2, BO #else lda AO, -2 * SIZE(AO) lda 
BO, -2 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) SUB a1, c01, c01 SUB a2, c05, c05 SUB a3, c02, c02 SUB a4, c06, c06 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c05, c05 SUB a4, c06, c06 #endif #ifdef LN LD a1, 3 * SIZE(AO) LD a2, 2 * SIZE(AO) LD a3, 0 * SIZE(AO) MUL a1, c02, c02 MUL a1, c06, c06 MUL a2, c02, t1 MUL a2, c06, t2 SUB c01, t1, c01 SUB c05, t2, c05 MUL a3, c01, c01 MUL a3, c05, c05 #endif #ifdef LT LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 3 * SIZE(AO) MUL a1, c01, c01 MUL a1, c05, c05 MUL a2, c01, t1 MUL a2, c05, t2 SUB c02, t1, c02 SUB c06, t2, c06 MUL a3, c02, c02 MUL a3, c06, c06 #endif #ifdef RN LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 3 * SIZE(BO) MUL a1, c01, c01 MUL a1, c02, c02 MUL a2, c01, t1 MUL a2, c02, t2 SUB c05, t1, c05 SUB c06, t2, c06 MUL a3, c05, c05 MUL a3, c06, c06 #endif #ifdef RT LD a1, 3 * SIZE(BO) LD a2, 2 * SIZE(BO) LD a3, 0 * SIZE(BO) MUL a1, c05, c05 MUL a1, c06, c06 MUL a2, c05, t1 MUL a2, c06, t2 SUB c01, t1, c01 SUB c02, t2, c02 MUL a3, c01, c01 MUL a3, c02, c02 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c05, 1 * SIZE(BO) ST c02, 2 * SIZE(BO) ST c06, 3 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) ST c05, 2 * SIZE(AO) ST c06, 3 * SIZE(AO) #endif #ifdef LN lda C1, -2 * SIZE(C1) lda C2, -2 * SIZE(C2) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) ST c05, 0 * SIZE(C2) ST c06, 1 * SIZE(C2) #ifndef LN lda C1, 2 * SIZE(C1) lda C2, 2 * SIZE(C2) #endif fclr t1 fclr t2 fclr t3 fclr t4 #ifdef RT sll K, 1 + BASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 1, TMP2 addq AO, TMP2, AO sll TMP1, BASE_SHIFT + 1, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 2, KK #endif #ifdef LN subq KK, 2, KK #endif .align 4 $L70: and M, 1, I ble I, $L79 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr c01 LD a2, 1 * SIZE(AO) fclr c05 LD b1, 0 * SIZE(B) fclr c02 LD b2, 1 * SIZE(B) fclr c06 lda L, -2(KK) LD b3, 2 * SIZE(B) lda AO, 1 * SIZE(AO) LD b4, 3 * SIZE(B) lda BO, 2 * SIZE(B) ble KK, $L78 ble L, $L75 #else #ifdef LN sll K, BASE_SHIFT + 0, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 0, TMP1 addq AORIG, TMP1, AO sll KK, BASE_SHIFT + 1, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c01 LD a2, 1 * SIZE(AO) fclr c05 LD b1, 0 * SIZE(BO) fclr c02 LD b2, 1 * SIZE(BO) fclr c06 lda L, -2(TMP1) LD b3, 2 * SIZE(BO) lda AO, 1 * SIZE(AO) LD b4, 3 * SIZE(BO) lda BO, 2 * SIZE(BO) ble TMP1, $L78 ble L, $L75 #endif .align 4 $L72: ADD c01, t1, c01 lda L, -2(L) MUL a1, b1, t1 LD b1, 2 * SIZE(BO) ADD c05, t2, c05 MUL a1, b2, t2 LD a1, 1 * SIZE(AO) LD b2, 3 * SIZE(BO) ADD c02, t3, c02 lda AO, 2 * SIZE(AO) MUL a2, b3, t3 LD b3, 4 * SIZE(BO) ADD c06, t4, c06 MUL a2, b4, t4 LD a2, 0 * SIZE(AO) LD b4, 5 * SIZE(BO) lda BO, 4 * SIZE(BO) unop unop bgt L, $L72 .align 4 $L75: ADD c01, t1, c01 MUL a1, b1, t1 #if defined(LT) || defined(RN) blbs KK, $L77 #else blbs TMP1, $L77 #endif .align 4 ADD c05, t2, c05 MUL a1, b2, t2 LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) ADD c01, t1, c01 LD b2, 1 * SIZE(BO) lda AO, 1 * SIZE(AO) MUL a1, b1, t1 lda BO, 2 * SIZE(BO) .align 4 $L77: ADD c05, t2, c05 MUL a1, b2, t2 ADD c02, t3, c02 ADD c06, t4, c06 ADD c01, c02, c01 lda AO, 1 * SIZE(AO) ADD c05, c06, c05 lda BO, 2 * SIZE(BO) ADD c01, t1, c01 ADD c05, t2, c05 .align 4 $L78: #if defined(LN) || 
defined(RT) #ifdef LN subq KK, 1, TMP1 #else subq KK, 2, TMP1 #endif sll TMP1, BASE_SHIFT + 0, TMP2 addq AORIG, TMP2, AO sll TMP1, BASE_SHIFT + 1, TMP2 addq B, TMP2, BO #else lda AO, -1 * SIZE(AO) lda BO, -2 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) SUB a1, c01, c01 SUB a2, c05, c05 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) SUB a1, c01, c01 SUB a2, c05, c05 #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(AO) MUL a1, c01, c01 MUL a1, c05, c05 #endif #ifdef RN LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 3 * SIZE(BO) MUL a1, c01, c01 MUL a2, c01, t1 SUB c05, t1, c05 MUL a3, c05, c05 #endif #ifdef RT LD a1, 3 * SIZE(BO) LD a2, 2 * SIZE(BO) LD a3, 0 * SIZE(BO) MUL a1, c05, c05 MUL a2, c05, t1 SUB c01, t1, c01 MUL a3, c01, c01 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c05, 1 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c05, 1 * SIZE(AO) #endif #ifdef LN lda C1, -1 * SIZE(C1) lda C2, -1 * SIZE(C2) #endif ST c01, 0 * SIZE(C1) ST c05, 0 * SIZE(C2) fclr t1 fclr t2 fclr t3 fclr t4 #ifdef RT sll K, 0 + BASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 0, TMP2 addq AO, TMP2, AO sll TMP1, BASE_SHIFT + 1, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 1, KK #endif #ifdef LN subq KK, 1, KK #endif .align 4 $L79: #ifdef LN sll K, 1 + BASE_SHIFT, TMP1 addq B, TMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN addq KK, 2, KK #endif #ifdef RT subq KK, 2, KK #endif .align 4 $L80: and N, 1, J ble J, $L999 #ifdef RT sll K, BASE_SHIFT, TMP1 subq B, TMP1, B subq C, LDC, C #endif mov C, C1 #ifndef RT addq C, LDC, C #endif #ifdef LN addq M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif sra M, 2, I ble I, $L100 .align 4 $L91: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(B) fclr c01 LD b2, 1 * SIZE(B) fclr c02 LD b3, 2 * SIZE(B) fclr c03 LD b4, 3 * SIZE(B) fclr c04 sra KK, 2, L mov B, BO ble L, $L95 #else #ifdef LN sll K, BASE_SHIFT + 2, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 2, TMP1 addq AORIG, TMP1, AO sll KK, BASE_SHIFT + 0, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(BO) fclr c01 LD b2, 1 * SIZE(BO) fclr c02 LD b3, 2 * SIZE(BO) fclr c03 LD b4, 3 * SIZE(BO) fclr c04 sra TMP1, 2, L unop ble L, $L95 #endif .align 5 $L92: ADD c01, t1, c01 unop MUL a1, b1, t1 LD a1, 4 * SIZE(AO) ADD c02, t2, c02 lda L, -1(L) MUL a2, b1, t2 LD a2, 5 * SIZE(AO) ADD c03, t3, c03 unop MUL a3, b1, t3 LD a3, 6 * SIZE(AO) ADD c04, t4, c04 MUL a4, b1, t4 LD a4, 7 * SIZE(AO) LD b1, 4 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b2, t1 LD a1, 8 * SIZE(AO) ADD c02, t2, c02 unop MUL a2, b2, t2 LD a2, 9 * SIZE(AO) ADD c03, t3, c03 unop MUL a3, b2, t3 LD a3, 10 * SIZE(AO) ADD c04, t4, c04 MUL a4, b2, t4 LD a4, 11 * SIZE(AO) LD b2, 5 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b3, t1 LD a1, 12 * SIZE(AO) ADD c02, t2, c02 unop MUL a2, b3, t2 LD a2, 13 * SIZE(AO) ADD c03, t3, c03 unop MUL a3, b3, t3 LD a3, 14 * SIZE(AO) ADD c04, t4, c04 MUL a4, b3, t4 LD a5, 15 * SIZE(AO) LD b3, 6 * SIZE(BO) ADD c01, t1, c01 MUL a1, b4, t1 LD a1, 16 * SIZE(AO) lda AO, 16 * SIZE(AO) ADD c02, t2, c02 lda BO, 4 * SIZE(BO) MUL a2, b4, t2 LD a2, 1 * SIZE(AO) ADD c03, t3, c03 LD a4, 3 * SIZE(AO) 
MUL a3, b4, t3 LD a3, 2 * SIZE(AO) ADD c04, t4, c04 MUL a5, b4, t4 LD b4, 3 * SIZE(BO) bgt L, $L92 .align 4 $L95: #if defined(LT) || defined(RN) and KK, 3, L #else and TMP1, 3, L #endif unop ble L, $L98 .align 4 $L96: ADD c01, t1, c01 lda L, -1(L) MUL a1, b1, t1 LD a1, 4 * SIZE(AO) ADD c02, t2, c02 lda BO, 1 * SIZE(BO) MUL a2, b1, t2 LD a2, 5 * SIZE(AO) ADD c03, t3, c03 unop MUL a3, b1, t3 LD a3, 6 * SIZE(AO) ADD c04, t4, c04 MUL a4, b1, t4 LD a4, 7 * SIZE(AO) LD b1, 0 * SIZE(BO) lda AO, 4 * SIZE(AO) bgt L, $L96 .align 4 $L98: ADD c01, t1, c01 ADD c02, t2, c02 ADD c03, t3, c03 ADD c04, t4, c04 #if defined(LN) || defined(RT) #ifdef LN subq KK, 4, TMP1 #else subq KK, 1, TMP1 #endif sll TMP1, BASE_SHIFT + 2, TMP2 addq AORIG, TMP2, AO sll TMP1, BASE_SHIFT + 0, TMP2 addq B, TMP2, BO #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 SUB a4, c04, c04 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 SUB a4, c04, c04 #endif #ifdef LN LD a1, 15 * SIZE(AO) LD a2, 14 * SIZE(AO) LD a3, 13 * SIZE(AO) LD a4, 12 * SIZE(AO) MUL a1, c04, c04 MUL a2, c04, t1 SUB c03, t1, c03 MUL a3, c04, t1 SUB c02, t1, c02 MUL a4, c04, t1 SUB c01, t1, c01 LD b1, 10 * SIZE(AO) LD b2, 9 * SIZE(AO) LD b3, 8 * SIZE(AO) MUL b1, c03, c03 MUL b2, c03, t1 SUB c02, t1, c02 MUL b3, c03, t1 SUB c01, t1, c01 LD a1, 5 * SIZE(AO) LD a2, 4 * SIZE(AO) LD a3, 0 * SIZE(AO) MUL a1, c02, c02 MUL a2, c02, t1 SUB c01, t1, c01 MUL a3, c01, c01 #endif #ifdef LT LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MUL a1, c01, c01 MUL a2, c01, t1 SUB c02, t1, c02 MUL a3, c01, t1 SUB c03, t1, c03 MUL a4, c01, t1 SUB c04, t1, c04 LD b1, 5 * SIZE(AO) LD b2, 6 * SIZE(AO) LD b3, 7 * SIZE(AO) MUL b1, c02, c02 MUL b2, c02, t1 SUB c03, t1, c03 MUL b3, c02, t1 SUB c04, t1, c04 LD a1, 10 * SIZE(AO) LD a2, 11 * SIZE(AO) LD a3, 15 * SIZE(AO) MUL a1, c03, c03 MUL a2, c03, t1 SUB c04, t1, c04 MUL a3, c04, c04 #endif #if defined(RN) || defined(RT) LD a1, 0 * SIZE(BO) MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c03, c03 MUL a1, c04, c04 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c02, 1 * SIZE(BO) ST c03, 2 * SIZE(BO) ST c04, 3 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) ST c03, 2 * SIZE(AO) ST c04, 3 * SIZE(AO) #endif #ifdef LN lda C1, -4 * SIZE(C1) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) ST c03, 2 * SIZE(C1) ST c04, 3 * SIZE(C1) #ifndef LN lda C1, 4 * SIZE(C1) #endif fclr t1 fclr t2 fclr t3 fclr t4 #ifdef RT sll K, 2 + BASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 2, TMP2 addq AO, TMP2, AO sll TMP1, BASE_SHIFT + 0, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 4, KK #endif #ifdef LN subq KK, 4, KK #endif lda I, -1(I) bgt I, $L91 .align 4 $L100: and M, 2, I ble I, $L110 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(B) fclr c01 LD b2, 1 * SIZE(B) fclr c02 LD b3, 2 * SIZE(B) fclr c03 LD b4, 3 * SIZE(B) fclr c04 sra KK, 2, L mov B, BO ble L, $L105 #else #ifdef LN sll K, BASE_SHIFT + 1, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 1, TMP1 addq AORIG, TMP1, AO sll KK, BASE_SHIFT + 0, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr 
t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(BO) fclr c01 LD b2, 1 * SIZE(BO) fclr c02 LD b3, 2 * SIZE(BO) fclr c03 LD b4, 3 * SIZE(BO) fclr c04 sra TMP1, 2, L ble L, $L105 #endif .align 5 $L102: ADD c01, t1, c01 lda L, -1(L) MUL a1, b1, t1 LD a1, 4 * SIZE(AO) ADD c02, t2, c02 MUL a2, b1, t2 LD a2, 5 * SIZE(AO) LD b1, 4 * SIZE(BO) ADD c03, t3, c03 lda BO, 4 * SIZE(BO) MUL a3, b2, t3 LD a3, 6 * SIZE(AO) ADD c04, t4, c04 MUL a4, b2, t4 LD a5, 7 * SIZE(AO) LD b2, 1 * SIZE(BO) ADD c01, t1, c01 MUL a1, b3, t1 LD a1, 8 * SIZE(AO) lda AO, 8 * SIZE(AO) ADD c02, t2, c02 MUL a2, b3, t2 LD b3, 2 * SIZE(BO) LD a2, 1 * SIZE(AO) ADD c03, t3, c03 LD a4, 3 * SIZE(AO) MUL a3, b4, t3 LD a3, 2 * SIZE(AO) ADD c04, t4, c04 MUL a5, b4, t4 LD b4, 3 * SIZE(BO) bgt L, $L102 .align 4 $L105: #if defined(LT) || defined(RN) and KK, 3, L #else and TMP1, 3, L #endif ble L, $L108 .align 4 $L106: ADD c01, t1, c01 lda L, -1(L) MUL a1, b1, t1 LD a1, 2 * SIZE(AO) ADD c02, t2, c02 MUL a2, b1, t2 LD a2, 3 * SIZE(AO) LD b1, 1 * SIZE(BO) lda AO, 2 * SIZE(AO) unop lda BO, 1 * SIZE(BO) bgt L, $L106 .align 4 $L108: ADD c01, t1, c01 ADD c02, t2, c02 ADD c03, t3, c03 ADD c04, t4, c04 ADD c01, c03, c01 ADD c02, c04, c02 #if defined(LN) || defined(RT) #ifdef LN subq KK, 2, TMP1 #else subq KK, 1, TMP1 #endif sll TMP1, BASE_SHIFT + 1, TMP2 addq AORIG, TMP2, AO sll TMP1, BASE_SHIFT + 0, TMP2 addq B, TMP2, BO #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) SUB a1, c01, c01 SUB a2, c02, c02 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 #endif #ifdef LN LD a1, 3 * SIZE(AO) LD a2, 2 * SIZE(AO) LD a3, 0 * SIZE(AO) MUL a1, c02, c02 MUL a2, c02, t1 SUB c01, t1, c01 MUL a3, c01, c01 #endif #ifdef LT LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 3 * SIZE(AO) MUL a1, c01, c01 MUL a2, c01, t1 SUB c02, t1, c02 MUL a3, c02, c02 #endif #if defined(RN) || defined(RT) LD a1, 0 * SIZE(BO) MUL a1, c01, c01 MUL a1, c02, c02 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c02, 1 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) #endif #ifdef LN lda C1, -2 * SIZE(C1) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) #ifndef LN lda C1, 2 * SIZE(C1) #endif fclr t1 fclr t2 fclr t3 fclr t4 #ifdef RT sll K, 1 + BASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 1, TMP2 addq AO, TMP2, AO sll TMP1, BASE_SHIFT + 0, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 2, KK #endif #ifdef LN subq KK, 2, KK #endif .align 4 $L110: and M, 1, I ble I, $L119 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(B) fclr c01 LD b2, 1 * SIZE(B) fclr c02 LD b3, 2 * SIZE(B) fclr c03 LD b4, 3 * SIZE(B) fclr c04 sra KK, 2, L mov B, BO unop ble L, $L115 #else #ifdef LN sll K, BASE_SHIFT + 0, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 0, TMP1 addq AORIG, TMP1, AO sll KK, BASE_SHIFT + 0, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(BO) fclr c01 LD b2, 1 * SIZE(BO) fclr c02 LD b3, 2 * SIZE(BO) fclr c03 LD b4, 3 * SIZE(BO) fclr c04 sra TMP1, 2, L unop ble L, $L115 #endif .align 4 $L112: ADD c01, t1, c01 MUL a1, b1, t1 LD a1, 4 * SIZE(AO) LD b1, 4 * SIZE(BO) ADD c02, t2, c02 MUL a2, b2, t2 LD a2, 5 * SIZE(AO) LD b2, 5 * SIZE(BO) ADD c03, t3, c03 MUL a3, b3, t3 LD a3, 6 * SIZE(AO) LD b3, 6 * SIZE(BO) 
ADD c04, t4, c04 MUL a4, b4, t4 LD a4, 7 * SIZE(AO) LD b4, 7 * SIZE(BO) lda L, -1(L) lda AO, 4 * SIZE(AO) lda BO, 4 * SIZE(BO) bgt L, $L112 .align 4 $L115: #if defined(LT) || defined(RN) and KK, 3, L #else and TMP1, 3, L #endif ble L, $L118 .align 4 $L116: ADD c01, t1, c01 MUL a1, b1, t1 LD a1, 1 * SIZE(AO) LD b1, 1 * SIZE(BO) lda L, -1(L) lda AO, 1 * SIZE(AO) lda BO, 1 * SIZE(BO) bgt L, $L116 .align 4 $L118: ADD c01, t1, c01 ADD c02, t2, c02 ADD c03, t3, c03 ADD c04, t4, c04 ADD c01, c02, c01 ADD c03, c04, c03 ADD c01, c03, c01 #if defined(LN) || defined(RT) subq KK, 1, TMP1 sll TMP1, BASE_SHIFT + 0, TMP2 addq AORIG, TMP2, AO addq B, TMP2, BO #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) SUB a1, c01, c01 #else LD a1, 0 * SIZE(AO) SUB a1, c01, c01 #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(AO) MUL a1, c01, c01 #endif #if defined(RN) || defined(RT) LD a1, 0 * SIZE(BO) MUL a1, c01, c01 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) #else ST c01, 0 * SIZE(AO) #endif #ifdef LN lda C1, -1 * SIZE(C1) #endif ST c01, 0 * SIZE(C1) #ifndef LN lda C1, 1 * SIZE(C1) #endif #ifdef RT SXADDQ K, AORIG, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 0, TMP2 addq AO, TMP2, AO addq BO, TMP2, BO #endif #ifdef LT addq KK, 1, KK #endif #ifdef LN subq KK, 1, KK #endif .align 4 $L119: #ifdef LN SXADDQ K, B, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN addq KK, 1, KK #endif #ifdef RT subq KK, 1, KK #endif .align 4 $L999: ldt $f2, 0($sp) ldt $f3, 8($sp) ldt $f4, 16($sp) ldt $f5, 24($sp) ldt $f6, 32($sp) ldt $f7, 40($sp) ldt $f8, 48($sp) ldt $f9, 56($sp) clr $0 lda $sp, STACKSIZE($sp) ret EPILOGUE