/*********************************************************************/
/* Copyright 2005-2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
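
/* SPARC micro-kernel for complex GEMM (and, with TRMMKERNEL defined,
   TRMM): each pass computes one row of C against a panel of B columns,
   with N unrolled by 4 (plus 2- and 1-column tails) and K unrolled by 8
   in the main loop. */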

#define ASSEMBLER
#include "common.h"

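/* Distance (in FLOAT elements) and prefetch-function code used when
   streaming the A panel through the inner loops. */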
#define APREFETCHSIZE 24
#define APREFETCH_CATEGORY 0

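/* Integer register map: dimensions (M, N, K) and panel base pointers
   (A, B, C) from the argument registers; working pointers (AO, BO,
   C1..C4, and BB for prefetching ahead in B), loop counters (I, J, L),
   and TRMM bookkeeping (OFFSET, KK, TEMP1, TEMP2). */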
#define M	%i0
#define N	%i1
#define K	%i2
#define A	%i5
#define B	%i3
#define C	%i4

#define LDC	%o0
#define AO	%o1
#define BO	%o2
#define I	%o3
#define J	%o4
#define L	%o5

#define BB	%o7

#define C1	%l0
#define C2	%l1
#define C3	%l2
#define C4	%l3

#define OFFSET	%l4
#define	KK	%l5
#define TEMP1	%l6
#define TEMP2	%l7

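/* Floating-point register map.  Double precision uses the even-numbered
   registers; the numeric aliases (cc01.., aa1.., bb1.., alpha_r/i) are
   the 5-bit register fields consumed by the FMADD1..4, FNMSUB, and FCLR
   macros, which appear to assemble the instruction words by hand.  For
   %f32 and above, SPARC V9 folds bit 5 of the register number into
   bit 0, which is why a1 = %f32 encodes as aa1 = 1. */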
#ifdef DOUBLE
#define c01	%f0
#define c02	%f2
#define c03	%f4
#define c04	%f6
#define c05	%f8
#define c06	%f10
#define c07	%f12
#define c08	%f14
#define c09	%f16
#define c10	%f18
#define c11	%f20
#define c12	%f22
#define c13	%f24
#define c14	%f26
#define c15	%f28
#define c16	%f30

#define a1	%f32
#define a2	%f34
#define a3	%f36
#define a4	%f38
#define a5	%f40

#define b1	%f42
#define b2	%f44
#define b3	%f46
#define b4	%f48
#define b5	%f50
#define b6	%f52
#define b7	%f54
#define b8	%f56
#define b9	%f58

#define ALPHA_R	%f60
#define ALPHA_I	%f62

#define cc01	0
#define cc02	2
#define cc03	4
#define cc04	6
#define cc05	8
#define cc06	10
#define cc07	12
#define cc08	14
#define cc09	16
#define cc10	18
#define cc11	20
#define cc12	22
#define cc13	24
#define cc14	26
#define cc15	28
#define cc16	30

#define aa1	 1
#define aa2	 3
#define aa3	 5
#define aa4	 7
#define aa5	 9

#define bb1	11
#define bb2	13
#define bb3	15
#define bb4	17
#define bb5	19
#define bb6	21
#define bb7	23
#define bb8	25
#define bb9	27

#define alpha_r	29
#define alpha_i	31
#else
#define c01	%f0
#define c02	%f1
#define c03	%f2
#define c04	%f3
#define c05	%f4
#define c06	%f5
#define c07	%f6
#define c08	%f7
#define c09	%f8
#define c10	%f9
#define c11	%f10
#define c12	%f11
#define c13	%f12
#define c14	%f13
#define c15	%f14
#define c16	%f15

#define a1	%f16
#define a2	%f17
#define a3	%f18
#define a4	%f19
#define a5	%f20

#define b1	%f21
#define b2	%f22
#define b3	%f23
#define b4	%f24
#define b5	%f25
#define b6	%f26
#define b7	%f27
#define b8	%f28
#define b9	%f29

#define ALPHA_R	%f30
#define ALPHA_I	%f31

#define cc01	0
#define cc02	1
#define cc03	2
#define cc04	3
#define cc05	4
#define cc06	5
#define cc07	6
#define cc08	7
#define cc09	8
#define cc10	9
#define cc11	10
#define cc12	11
#define cc13	12
#define cc14	13
#define cc15	14
#define cc16	15

#define aa1	16
#define aa2	17
#define aa3	18
#define aa4	19
#define aa5	20

#define bb1	21
#define bb2	22
#define bb3	23
#define bb4	24
#define bb5	25
#define bb6	26
#define bb7	27
#define bb8	28
#define bb9	29

#define alpha_r	30
#define alpha_i	31

#endif

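/* Per-variant sign pattern for the four partial products of a complex
   multiply-accumulate.  Conjugating A and/or B (the R and C variants)
   only flips which of the four terms is subtracted, i.e. which FMADD
   slot becomes an FNMSUB. */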
#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define FMADD1	FMADD
#define FMADD2	FMADD
#define FMADD3	FMADD
#define FMADD4	FNMSUB
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define FMADD1	FMADD
#define FMADD2	FMADD
#define FMADD3	FNMSUB
#define FMADD4	FMADD
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define FMADD1	FMADD
#define FMADD2	FNMSUB
#define FMADD3	FMADD
#define FMADD4	FMADD
#else
#define FMADD1	FMADD
#define FMADD2	FNMSUB
#define FMADD3	FNMSUB
#define FMADD4	FNMSUB
#endif

        .register %g2, #scratch
        .register %g3, #scratch
	
	PROLOGUE
	SAVESP
	
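/* Fetch the arguments that did not fit in registers.  The stack layout
   differs between the 32-bit and 64-bit ABIs, and in 32-bit mode a
   double-precision alpha is rebuilt from its two halves via the stack. */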
#ifndef __64BIT__
#ifdef DOUBLE
	st	%i3, [%sp + STACK_START + 16]
	st	%i4, [%sp + STACK_START + 20]
	st	%i5, [%sp + STACK_START + 24]

	ld	[%sp + STACK_START + 32], A
	ld	[%sp + STACK_START + 36], B
	ld	[%sp + STACK_START + 40], C
	ld	[%sp + STACK_START + 44], LDC
#ifdef TRMMKERNEL
	ld	[%sp + STACK_START + 48], OFFSET
#endif

	ldd	[%sp + STACK_START + 16], ALPHA_R
	ldd	[%sp + STACK_START + 24], ALPHA_I
#else
	st	%i3, [%sp + STACK_START + 16]
	st	%i4, [%sp + STACK_START + 20]

	ld	[%sp + STACK_START + 28], B
	ld	[%sp + STACK_START + 32], C
	ld	[%sp + STACK_START + 36], LDC
#ifdef TRMMKERNEL
	ld	[%sp + STACK_START + 40], OFFSET
#endif

	ld	[%sp + STACK_START + 16], ALPHA_R
	ld	[%sp + STACK_START + 20], ALPHA_I
#endif
#else
	ldx	[%sp + STACK_START + 56], B
	ldx	[%sp + STACK_START + 64], C
	ldx	[%sp + STACK_START + 72], LDC
#ifdef TRMMKERNEL
	ldx	[%sp + STACK_START + 80], OFFSET
#endif

#ifdef DOUBLE
	FMOV	%f6, ALPHA_R
	FMOV	%f8, ALPHA_I
#else
	FMOV	%f7, ALPHA_R
	FMOV	%f9, ALPHA_I
#endif
#endif

#if defined(TRMMKERNEL) && !defined(LEFT)
	neg	OFFSET, KK
#endif

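/* Nothing to do for an empty matrix; otherwise walk N in panels of
   four columns (J = N >> 2), converting LDC from complex elements to
   bytes in the delay slot. */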
	cmp	M, 0
	ble,pn	%icc, .LL999
	nop

	sra	N, 2, J
	cmp	J, 0
	ble,pn	%icc, .LL20
	sll	LDC, ZBASE_SHIFT, LDC

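/* .LL11: head of the four-column panel loop.  Set up the four C column
   pointers and point BB one K-panel ahead in B for prefetching. */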
.LL11:
	mov	C,  C1
	add	C,  LDC, C2
	add	C2, LDC, C3
	add	C3, LDC, C4
	add	C4,  LDC, C

	sll	K, ZBASE_SHIFT + 2, BB

#if defined(TRMMKERNEL) &&  defined(LEFT)
	mov	OFFSET, KK
#endif

	mov	A, AO

	mov	M, I
	add	B, BB, BB
	.align 4

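/* .LL12: loop over the M rows of the panel.  For TRMM the A and B
   pointers are first advanced by the KK offset. */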
.LL12:
	prefetch [BB +  0 * SIZE], 1
#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))))
	mov	B, BO
#else
	sll	KK, ZBASE_SHIFT + 0, TEMP1
	sll	KK, ZBASE_SHIFT + 2, TEMP2

	add	AO, TEMP1, AO
	add	B,  TEMP2, BO
#endif

	LDF	[AO +  0 * SIZE], a1
	FCLR	(cc01)
	LDF	[AO +  1 * SIZE], a2
	FCLR	(cc05)
	LDF	[AO +  8 * SIZE], a5
	FCLR	(cc09)
	LDF	[BO +  0 * SIZE], b1
	FCLR	(cc13)

	LDF	[BO +  1 * SIZE], b2
	FCLR	(cc02)
	LDF	[BO +  2 * SIZE], b3
	FCLR	(cc06)
	LDF	[BO +  3 * SIZE], b4
	FCLR	(cc10)
	LDF	[BO +  4 * SIZE], b5
	FCLR	(cc14)

	LDF	[BO +  5 * SIZE], b6
	FCLR	(cc03)
	LDF	[BO +  6 * SIZE], b7
	FCLR	(cc07)
	LDF	[BO +  7 * SIZE], b8
	FCLR	(cc11)
	LDF	[BO +  8 * SIZE], b9
	FCLR	(cc15)

	prefetch [C1 + 1 * SIZE], 3
	FCLR	(cc04)
	prefetch [C2 + 2 * SIZE], 3
	FCLR	(cc08)
	prefetch [C3 + 1 * SIZE], 3
	FCLR	(cc12)
	prefetch [C4 + 2 * SIZE], 3
	FCLR	(cc16)

#ifndef TRMMKERNEL
	sra	K,  3, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 1, L
#else
	add	KK, 4, L
#endif
	sra	L,  3, L
#endif
	cmp	L,  0
	ble,pn	%icc, .LL15
	add	 BB, 32 * SIZE, BB
	.align 4

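/* .LL13: K loop unrolled by 8.  Loads for the next iteration are
   interleaved with the multiply-adds, and the body is laid out twice,
   with a mid-body early exit (the ble to .LL15), so the backward branch
   only closes every second pass. */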
.LL13:
	FMADD1	(aa1, bb1, cc01, cc01)
	FMADD2	(aa2, bb1, cc02, cc02)
	FMADD3	(aa1, bb2, cc03, cc03)
	FMADD4	(aa2, bb2, cc04, cc04)

	FMADD1	(aa1, bb3, cc05, cc05)
	LDF	[BO + 16 * SIZE], b1
	FMADD2	(aa2, bb3, cc06, cc06)
	LDF	[BO +  9 * SIZE], b2

	FMADD3	(aa1, bb4, cc07, cc07)
	LDF	[BO + 10 * SIZE], b3
	FMADD4	(aa2, bb4, cc08, cc08)
	LDF	[BO + 11 * SIZE], b4

	FMADD1	(aa1, bb5, cc09, cc09)
	LDF	[AO +  2 * SIZE], a3
	FMADD2	(aa2, bb5, cc10, cc10)
	LDF	[AO +  3 * SIZE], a4

	FMADD3	(aa1, bb6, cc11, cc11)
	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
	FMADD4	(aa2, bb6, cc12, cc12)
	nop

	FMADD1	(aa1, bb7, cc13, cc13)
	LDF	[BO + 12 * SIZE], b5
	FMADD2	(aa2, bb7, cc14, cc14)
	LDF	[BO + 13 * SIZE], b6

	FMADD3	(aa1, bb8, cc15, cc15)
	LDF	[BO + 14 * SIZE], b7
	FMADD4	(aa2, bb8, cc16, cc16)
	LDF	[BO + 15 * SIZE], b8

	FMADD1	(aa3, bb9, cc01, cc01)
	FMADD2	(aa4, bb9, cc02, cc02)
	FMADD3	(aa3, bb2, cc03, cc03)
	FMADD4	(aa4, bb2, cc04, cc04)

	FMADD1	(aa3, bb3, cc05, cc05)
	LDF	[BO + 24 * SIZE], b9
	FMADD2	(aa4, bb3, cc06, cc06)
	LDF	[BO + 17 * SIZE], b2

	FMADD3	(aa3, bb4, cc07, cc07)
	LDF	[BO + 18 * SIZE], b3
	FMADD4	(aa4, bb4, cc08, cc08)
	LDF	[BO + 19 * SIZE], b4

	FMADD1	(aa3, bb5, cc09, cc09)
	LDF	[AO +  4 * SIZE], a1
	FMADD2	(aa4, bb5, cc10, cc10)
	LDF	[AO +  5 * SIZE], a2

	FMADD3	(aa3, bb6, cc11, cc11)
	add	L, -1, L
	FMADD4	(aa4, bb6, cc12, cc12)
	nop

	FMADD1	(aa3, bb7, cc13, cc13)
	LDF	[BO + 20 * SIZE], b5
	FMADD2	(aa4, bb7, cc14, cc14)
	LDF	[BO + 21 * SIZE], b6

	FMADD3	(aa3, bb8, cc15, cc15)
	LDF	[BO + 22 * SIZE], b7
	FMADD4	(aa4, bb8, cc16, cc16)
	LDF	[BO + 23 * SIZE], b8

	FMADD1	(aa1, bb1, cc01, cc01)
	FMADD2	(aa2, bb1, cc02, cc02)
	FMADD3	(aa1, bb2, cc03, cc03)
	FMADD4	(aa2, bb2, cc04, cc04)

	FMADD1	(aa1, bb3, cc05, cc05)
	LDF	[BO + 32 * SIZE], b1
	FMADD2	(aa2, bb3, cc06, cc06)
	LDF	[BO + 25 * SIZE], b2

	FMADD3	(aa1, bb4, cc07, cc07)
	LDF	[BO + 26 * SIZE], b3
	FMADD4	(aa2, bb4, cc08, cc08)
	LDF	[BO + 27 * SIZE], b4

	FMADD1	(aa1, bb5, cc09, cc09)
	LDF	[AO +  6 * SIZE], a3
	FMADD2	(aa2, bb5, cc10, cc10)
	LDF	[AO +  7 * SIZE], a4

	FMADD3	(aa1, bb6, cc11, cc11)
	nop
	FMADD4	(aa2, bb6, cc12, cc12)
	nop

	FMADD1	(aa1, bb7, cc13, cc13)
	LDF	[BO + 28 * SIZE], b5
	FMADD2	(aa2, bb7, cc14, cc14)
	LDF	[BO + 29 * SIZE], b6

	FMADD3	(aa1, bb8, cc15, cc15)
	LDF	[BO + 30 * SIZE], b7
	FMADD4	(aa2, bb8, cc16, cc16)
	LDF	[BO + 31 * SIZE], b8

	FMADD1	(aa3, bb9, cc01, cc01)
	FMADD2	(aa4, bb9, cc02, cc02)
	FMADD3	(aa3, bb2, cc03, cc03)
	FMADD4	(aa4, bb2, cc04, cc04)

	FMADD1	(aa3, bb3, cc05, cc05)
	LDF	[BO + 40 * SIZE], b9
	FMADD2	(aa4, bb3, cc06, cc06)
	LDF	[BO + 33 * SIZE], b2

	FMADD3	(aa3, bb4, cc07, cc07)
	LDF	[BO + 34 * SIZE], b3
	FMADD4	(aa4, bb4, cc08, cc08)
	LDF	[BO + 35 * SIZE], b4

	FMADD1	(aa3, bb5, cc09, cc09)
	LDF	[AO + 16 * SIZE], a1  /****/
	FMADD2	(aa4, bb5, cc10, cc10)
	LDF	[AO +  9 * SIZE], a2

	FMADD3	(aa3, bb6, cc11, cc11)
	nop
	FMADD4	(aa4, bb6, cc12, cc12)
	nop

	FMADD1	(aa3, bb7, cc13, cc13)
	LDF	[BO + 36 * SIZE], b5
	FMADD2	(aa4, bb7, cc14, cc14)
	LDF	[BO + 37 * SIZE], b6

	FMADD3	(aa3, bb8, cc15, cc15)
	LDF	[BO + 38 * SIZE], b7
	FMADD4	(aa4, bb8, cc16, cc16)
	LDF	[BO + 39 * SIZE], b8

	FMADD1	(aa5, bb1, cc01, cc01)
	FMADD2	(aa2, bb1, cc02, cc02)
	FMADD3	(aa5, bb2, cc03, cc03)
	FMADD4	(aa2, bb2, cc04, cc04)

	FMADD1	(aa5, bb3, cc05, cc05)
	LDF	[BO + 48 * SIZE], b1
	FMADD2	(aa2, bb3, cc06, cc06)
	LDF	[BO + 41 * SIZE], b2

	FMADD3	(aa5, bb4, cc07, cc07)
	LDF	[BO + 42 * SIZE], b3
	FMADD4	(aa2, bb4, cc08, cc08)
	LDF	[BO + 43 * SIZE], b4

	FMADD1	(aa5, bb5, cc09, cc09)
	LDF	[AO + 10 * SIZE], a3
	FMADD2	(aa2, bb5, cc10, cc10)
	LDF	[AO + 11 * SIZE], a4

	FMADD3	(aa5, bb6, cc11, cc11)
	prefetch [AO + (APREFETCHSIZE +  8) * SIZE], APREFETCH_CATEGORY
	FMADD4	(aa2, bb6, cc12, cc12)
	nop

	FMADD1	(aa5, bb7, cc13, cc13)
	LDF	[BO + 44 * SIZE], b5
	FMADD2	(aa2, bb7, cc14, cc14)
	LDF	[BO + 45 * SIZE], b6

	FMADD3	(aa5, bb8, cc15, cc15)
	LDF	[BO + 46 * SIZE], b7
	FMADD4	(aa2, bb8, cc16, cc16)
	LDF	[BO + 47 * SIZE], b8

	FMADD1	(aa3, bb9, cc01, cc01)
	FMADD2	(aa4, bb9, cc02, cc02)
	FMADD3	(aa3, bb2, cc03, cc03)
	FMADD4	(aa4, bb2, cc04, cc04)

	FMADD1	(aa3, bb3, cc05, cc05)
	LDF	[BO + 56 * SIZE], b9
	FMADD2	(aa4, bb3, cc06, cc06)
	LDF	[BO + 49 * SIZE], b2

	FMADD3	(aa3, bb4, cc07, cc07)
	LDF	[BO + 50 * SIZE], b3
	FMADD4	(aa4, bb4, cc08, cc08)
	LDF	[BO + 51 * SIZE], b4

	FMADD1	(aa3, bb5, cc09, cc09)
	LDF	[AO + 12 * SIZE], a5
	FMADD2	(aa4, bb5, cc10, cc10)
	LDF	[AO + 13 * SIZE], a2

	FMADD3	(aa3, bb6, cc11, cc11)
	cmp	L, 0
	FMADD4	(aa4, bb6, cc12, cc12)
	nop

	FMADD1	(aa3, bb7, cc13, cc13)
	LDF	[BO + 52 * SIZE], b5
	FMADD2	(aa4, bb7, cc14, cc14)
	LDF	[BO + 53 * SIZE], b6

	FMADD3	(aa3, bb8, cc15, cc15)
	LDF	[BO + 54 * SIZE], b7
	FMADD4	(aa4, bb8, cc16, cc16)
	LDF	[BO + 55 * SIZE], b8

	FMADD1	(aa5, bb1, cc01, cc01)
	FMADD2	(aa2, bb1, cc02, cc02)
	FMADD3	(aa5, bb2, cc03, cc03)
	FMADD4	(aa2, bb2, cc04, cc04)

	FMADD1	(aa5, bb3, cc05, cc05)
	LDF	[BO + 64 * SIZE], b1
	FMADD2	(aa2, bb3, cc06, cc06)
	LDF	[BO + 57 * SIZE], b2

	FMADD3	(aa5, bb4, cc07, cc07)
	LDF	[BO + 58 * SIZE], b3
	FMADD4	(aa2, bb4, cc08, cc08)
	LDF	[BO + 59 * SIZE], b4

	FMADD1	(aa5, bb5, cc09, cc09)
	LDF	[AO + 14 * SIZE], a3
	FMADD2	(aa2, bb5, cc10, cc10)
	LDF	[AO + 15 * SIZE], a4

	FMADD3	(aa5, bb6, cc11, cc11)
	add	BO, 64 * SIZE, BO
	FMADD4	(aa2, bb6, cc12, cc12)
	add	AO, 16 * SIZE, AO

	FMADD1	(aa5, bb7, cc13, cc13)
	LDF	[BO -  4 * SIZE], b5
	FMADD2	(aa2, bb7, cc14, cc14)
	LDF	[BO -  3 * SIZE], b6

	FMADD3	(aa5, bb8, cc15, cc15)
	LDF	[BO -  2 * SIZE], b7
	FMADD4	(aa2, bb8, cc16, cc16)
	LDF	[BO -  1 * SIZE], b8

	FMADD1	(aa3, bb9, cc01, cc01)
	FMADD2	(aa4, bb9, cc02, cc02)
	FMADD3	(aa3, bb2, cc03, cc03)
	FMADD4	(aa4, bb2, cc04, cc04)

	FMADD1	(aa3, bb3, cc05, cc05)
	LDF	[BO +  8 * SIZE], b9
	FMADD2	(aa4, bb3, cc06, cc06)
	LDF	[BO +  1 * SIZE], b2

	FMADD3	(aa3, bb4, cc07, cc07)
	LDF	[BO +  2 * SIZE], b3
	FMADD4	(aa4, bb4, cc08, cc08)
	LDF	[BO +  3 * SIZE], b4

	FMADD1	(aa3, bb5, cc09, cc09)
	LDF	[AO +  8 * SIZE], a5  /****/
	FMADD2	(aa4, bb5, cc10, cc10)
	LDF	[AO +  1 * SIZE], a2

	FMADD3	(aa3, bb6, cc11, cc11)
	FMADD4	(aa4, bb6, cc12, cc12)

	FMADD1	(aa3, bb7, cc13, cc13)
	LDF	[BO +  4 * SIZE], b5
	FMADD2	(aa4, bb7, cc14, cc14)
	LDF	[BO +  5 * SIZE], b6

	FMADD3	(aa3, bb8, cc15, cc15)
	LDF	[BO +  6 * SIZE], b7
	FMADD4	(aa4, bb8, cc16, cc16)
	ble,pn	%icc, .LL15
	LDF	[BO +  7 * SIZE], b8

	FMADD1	(aa1, bb1, cc01, cc01)
	FMADD2	(aa2, bb1, cc02, cc02)
	FMADD3	(aa1, bb2, cc03, cc03)
	FMADD4	(aa2, bb2, cc04, cc04)

	FMADD1	(aa1, bb3, cc05, cc05)
	LDF	[BO + 16 * SIZE], b1
	FMADD2	(aa2, bb3, cc06, cc06)
	LDF	[BO +  9 * SIZE], b2

	FMADD3	(aa1, bb4, cc07, cc07)
	LDF	[BO + 10 * SIZE], b3
	FMADD4	(aa2, bb4, cc08, cc08)
	LDF	[BO + 11 * SIZE], b4

	FMADD1	(aa1, bb5, cc09, cc09)
	LDF	[AO +  2 * SIZE], a3
	FMADD2	(aa2, bb5, cc10, cc10)
	LDF	[AO +  3 * SIZE], a4

	FMADD3	(aa1, bb6, cc11, cc11)
	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
	FMADD4	(aa2, bb6, cc12, cc12)
	nop

	FMADD1	(aa1, bb7, cc13, cc13)
	LDF	[BO + 12 * SIZE], b5
	FMADD2	(aa2, bb7, cc14, cc14)
	LDF	[BO + 13 * SIZE], b6

	FMADD3	(aa1, bb8, cc15, cc15)
	LDF	[BO + 14 * SIZE], b7
	FMADD4	(aa2, bb8, cc16, cc16)
	LDF	[BO + 15 * SIZE], b8

	FMADD1	(aa3, bb9, cc01, cc01)
	FMADD2	(aa4, bb9, cc02, cc02)
	FMADD3	(aa3, bb2, cc03, cc03)
	FMADD4	(aa4, bb2, cc04, cc04)

	FMADD1	(aa3, bb3, cc05, cc05)
	LDF	[BO + 24 * SIZE], b9
	FMADD2	(aa4, bb3, cc06, cc06)
	LDF	[BO + 17 * SIZE], b2

	FMADD3	(aa3, bb4, cc07, cc07)
	LDF	[BO + 18 * SIZE], b3
	FMADD4	(aa4, bb4, cc08, cc08)
	LDF	[BO + 19 * SIZE], b4

	FMADD1	(aa3, bb5, cc09, cc09)
	LDF	[AO +  4 * SIZE], a1
	FMADD2	(aa4, bb5, cc10, cc10)
	LDF	[AO +  5 * SIZE], a2

	FMADD3	(aa3, bb6, cc11, cc11)
	add	L, -1, L
	FMADD4	(aa4, bb6, cc12, cc12)
	nop

	FMADD1	(aa3, bb7, cc13, cc13)
	LDF	[BO + 20 * SIZE], b5
	FMADD2	(aa4, bb7, cc14, cc14)
	LDF	[BO + 21 * SIZE], b6

	FMADD3	(aa3, bb8, cc15, cc15)
	LDF	[BO + 22 * SIZE], b7
	FMADD4	(aa4, bb8, cc16, cc16)
	LDF	[BO + 23 * SIZE], b8

	FMADD1	(aa1, bb1, cc01, cc01)
	FMADD2	(aa2, bb1, cc02, cc02)
	FMADD3	(aa1, bb2, cc03, cc03)
	FMADD4	(aa2, bb2, cc04, cc04)

	FMADD1	(aa1, bb3, cc05, cc05)
	LDF	[BO + 32 * SIZE], b1
	FMADD2	(aa2, bb3, cc06, cc06)
	LDF	[BO + 25 * SIZE], b2

	FMADD3	(aa1, bb4, cc07, cc07)
	LDF	[BO + 26 * SIZE], b3
	FMADD4	(aa2, bb4, cc08, cc08)
	LDF	[BO + 27 * SIZE], b4

	FMADD1	(aa1, bb5, cc09, cc09)
	LDF	[AO +  6 * SIZE], a3
	FMADD2	(aa2, bb5, cc10, cc10)
	LDF	[AO +  7 * SIZE], a4

	FMADD3	(aa1, bb6, cc11, cc11)
	nop
	FMADD4	(aa2, bb6, cc12, cc12)
	nop

	FMADD1	(aa1, bb7, cc13, cc13)
	LDF	[BO + 28 * SIZE], b5
	FMADD2	(aa2, bb7, cc14, cc14)
	LDF	[BO + 29 * SIZE], b6

	FMADD3	(aa1, bb8, cc15, cc15)
	LDF	[BO + 30 * SIZE], b7
	FMADD4	(aa2, bb8, cc16, cc16)
	LDF	[BO + 31 * SIZE], b8

	FMADD1	(aa3, bb9, cc01, cc01)
	FMADD2	(aa4, bb9, cc02, cc02)
	FMADD3	(aa3, bb2, cc03, cc03)
	FMADD4	(aa4, bb2, cc04, cc04)

	FMADD1	(aa3, bb3, cc05, cc05)
	LDF	[BO + 40 * SIZE], b9
	FMADD2	(aa4, bb3, cc06, cc06)
	LDF	[BO + 33 * SIZE], b2

	FMADD3	(aa3, bb4, cc07, cc07)
	LDF	[BO + 34 * SIZE], b3
	FMADD4	(aa4, bb4, cc08, cc08)
	LDF	[BO + 35 * SIZE], b4

	FMADD1	(aa3, bb5, cc09, cc09)
	LDF	[AO + 16 * SIZE], a1  /****/
	FMADD2	(aa4, bb5, cc10, cc10)
	LDF	[AO +  9 * SIZE], a2

	FMADD3	(aa3, bb6, cc11, cc11)
	nop
	FMADD4	(aa4, bb6, cc12, cc12)
	nop

	FMADD1	(aa3, bb7, cc13, cc13)
	LDF	[BO + 36 * SIZE], b5
	FMADD2	(aa4, bb7, cc14, cc14)
	LDF	[BO + 37 * SIZE], b6

	FMADD3	(aa3, bb8, cc15, cc15)
	LDF	[BO + 38 * SIZE], b7
	FMADD4	(aa4, bb8, cc16, cc16)
	LDF	[BO + 39 * SIZE], b8

	FMADD1	(aa5, bb1, cc01, cc01)
	FMADD2	(aa2, bb1, cc02, cc02)
	FMADD3	(aa5, bb2, cc03, cc03)
	FMADD4	(aa2, bb2, cc04, cc04)

	FMADD1	(aa5, bb3, cc05, cc05)
	LDF	[BO + 48 * SIZE], b1
	FMADD2	(aa2, bb3, cc06, cc06)
	LDF	[BO + 41 * SIZE], b2

	FMADD3	(aa5, bb4, cc07, cc07)
	LDF	[BO + 42 * SIZE], b3
	FMADD4	(aa2, bb4, cc08, cc08)
	LDF	[BO + 43 * SIZE], b4

	FMADD1	(aa5, bb5, cc09, cc09)
	LDF	[AO + 10 * SIZE], a3
	FMADD2	(aa2, bb5, cc10, cc10)
	LDF	[AO + 11 * SIZE], a4

	FMADD3	(aa5, bb6, cc11, cc11)
	prefetch [AO + (APREFETCHSIZE +  8) * SIZE], APREFETCH_CATEGORY
	FMADD4	(aa2, bb6, cc12, cc12)
	nop

	FMADD1	(aa5, bb7, cc13, cc13)
	LDF	[BO + 44 * SIZE], b5
	FMADD2	(aa2, bb7, cc14, cc14)
	LDF	[BO + 45 * SIZE], b6

	FMADD3	(aa5, bb8, cc15, cc15)
	LDF	[BO + 46 * SIZE], b7
	FMADD4	(aa2, bb8, cc16, cc16)
	LDF	[BO + 47 * SIZE], b8

	FMADD1	(aa3, bb9, cc01, cc01)
	FMADD2	(aa4, bb9, cc02, cc02)
	FMADD3	(aa3, bb2, cc03, cc03)
	FMADD4	(aa4, bb2, cc04, cc04)

	FMADD1	(aa3, bb3, cc05, cc05)
	LDF	[BO + 56 * SIZE], b9
	FMADD2	(aa4, bb3, cc06, cc06)
	LDF	[BO + 49 * SIZE], b2

	FMADD3	(aa3, bb4, cc07, cc07)
	LDF	[BO + 50 * SIZE], b3
	FMADD4	(aa4, bb4, cc08, cc08)
	LDF	[BO + 51 * SIZE], b4

	FMADD1	(aa3, bb5, cc09, cc09)
	LDF	[AO + 12 * SIZE], a5
	FMADD2	(aa4, bb5, cc10, cc10)
	LDF	[AO + 13 * SIZE], a2

	FMADD3	(aa3, bb6, cc11, cc11)
	cmp	L, 0
	FMADD4	(aa4, bb6, cc12, cc12)
	nop

	FMADD1	(aa3, bb7, cc13, cc13)
	LDF	[BO + 52 * SIZE], b5
	FMADD2	(aa4, bb7, cc14, cc14)
	LDF	[BO + 53 * SIZE], b6

	FMADD3	(aa3, bb8, cc15, cc15)
	LDF	[BO + 54 * SIZE], b7
	FMADD4	(aa4, bb8, cc16, cc16)
	LDF	[BO + 55 * SIZE], b8

	FMADD1	(aa5, bb1, cc01, cc01)
	FMADD2	(aa2, bb1, cc02, cc02)
	FMADD3	(aa5, bb2, cc03, cc03)
	FMADD4	(aa2, bb2, cc04, cc04)

	FMADD1	(aa5, bb3, cc05, cc05)
	LDF	[BO + 64 * SIZE], b1
	FMADD2	(aa2, bb3, cc06, cc06)
	LDF	[BO + 57 * SIZE], b2

	FMADD3	(aa5, bb4, cc07, cc07)
	LDF	[BO + 58 * SIZE], b3
	FMADD4	(aa2, bb4, cc08, cc08)
	LDF	[BO + 59 * SIZE], b4

	FMADD1	(aa5, bb5, cc09, cc09)
	LDF	[AO + 14 * SIZE], a3
	FMADD2	(aa2, bb5, cc10, cc10)
	LDF	[AO + 15 * SIZE], a4

	FMADD3	(aa5, bb6, cc11, cc11)
	add	BO, 64 * SIZE, BO
	FMADD4	(aa2, bb6, cc12, cc12)
	add	AO, 16 * SIZE, AO

	FMADD1	(aa5, bb7, cc13, cc13)
	LDF	[BO -  4 * SIZE], b5
	FMADD2	(aa2, bb7, cc14, cc14)
	LDF	[BO -  3 * SIZE], b6

	FMADD3	(aa5, bb8, cc15, cc15)
	LDF	[BO -  2 * SIZE], b7
	FMADD4	(aa2, bb8, cc16, cc16)
	LDF	[BO -  1 * SIZE], b8

	FMADD1	(aa3, bb9, cc01, cc01)
	FMADD2	(aa4, bb9, cc02, cc02)
	FMADD3	(aa3, bb2, cc03, cc03)
	FMADD4	(aa4, bb2, cc04, cc04)

	FMADD1	(aa3, bb3, cc05, cc05)
	LDF	[BO +  8 * SIZE], b9
	FMADD2	(aa4, bb3, cc06, cc06)
	LDF	[BO +  1 * SIZE], b2

	FMADD3	(aa3, bb4, cc07, cc07)
	LDF	[BO +  2 * SIZE], b3
	FMADD4	(aa4, bb4, cc08, cc08)
	LDF	[BO +  3 * SIZE], b4

	FMADD1	(aa3, bb5, cc09, cc09)
	LDF	[AO +  8 * SIZE], a5  /****/
	FMADD2	(aa4, bb5, cc10, cc10)
	LDF	[AO +  1 * SIZE], a2

	FMADD3	(aa3, bb6, cc11, cc11)
	FMADD4	(aa4, bb6, cc12, cc12)

	FMADD1	(aa3, bb7, cc13, cc13)
	LDF	[BO +  4 * SIZE], b5
	FMADD2	(aa4, bb7, cc14, cc14)
	LDF	[BO +  5 * SIZE], b6

	FMADD3	(aa3, bb8, cc15, cc15)
	LDF	[BO +  6 * SIZE], b7
	FMADD4	(aa4, bb8, cc16, cc16)
	bg,pt	%icc, .LL13
	LDF	[BO +  7 * SIZE], b8
	.align 4

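/* .LL15: tail of the K loop, handling the remaining K mod 8 iterations
   one complex multiply per pass. */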
.LL15:
#ifndef TRMMKERNEL
	and	K,  7, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 1, L
#else
	add	KK, 4, L
#endif
	and	L,  7, L
#endif
	cmp	L,  0
	ble,a,pn %icc, .LL18
	nop
	.align 4

.LL17:
	FMADD1	(aa1, bb1, cc01, cc01)
	add	L, -1, L
	FMADD2	(aa2, bb1, cc02, cc02)
	nop

	FMADD3	(aa1, bb2, cc03, cc03)
	LDF	[BO +  8 * SIZE], b1
	FMADD4	(aa2, bb2, cc04, cc04)
	LDF	[BO +  9 * SIZE], b2

	FMADD1	(aa1, bb3, cc05, cc05)
	cmp	L, 0
	FMADD2	(aa2, bb3, cc06, cc06)
	nop

	FMADD3	(aa1, bb4, cc07, cc07)
	LDF	[BO + 10 * SIZE], b3
	FMADD4	(aa2, bb4, cc08, cc08)
	LDF	[BO + 11 * SIZE], b4

	FMADD1	(aa1, bb5, cc09, cc09)
	nop
	FMADD2	(aa2, bb5, cc10, cc10)
	nop

	FMADD3	(aa1, bb6, cc11, cc11)
	LDF	[BO + 12 * SIZE], b5
	FMADD4	(aa2, bb6, cc12, cc12)
	LDF	[BO + 13 * SIZE], b6

	FMADD1	(aa1, bb7, cc13, cc13)
	add	AO, 2 * SIZE, AO
	FMADD2	(aa2, bb7, cc14, cc14)
	add	BO, 8 * SIZE, BO

	FMADD3	(aa1, bb8, cc15, cc15)
	LDF	[AO +  0 * SIZE], a1
	FMADD4	(aa2, bb8, cc16, cc16)
	LDF	[AO +  1 * SIZE], a2

	LDF	[BO +  6 * SIZE], b7
	bg,pt	%icc, .LL17
	LDF	[BO +  7 * SIZE], b8
	nop
	.align 4

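/* .LL18: reduce the paired accumulators, scale by the complex alpha,
   and write the 1x4 result back, accumulating into C for GEMM or
   overwriting it for TRMM. */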
.LL18:
#ifndef TRMMKERNEL
	LDF	[C1 + 0 * SIZE], a1
	FADD	  c01, c04, c01
	LDF	[C1 + 1 * SIZE], a2
	FADD	  c02, c03, c02
	LDF	[C2 + 0 * SIZE], a3
	FADD	  c05, c08, c05
	LDF	[C2 + 1 * SIZE], a4
	FADD	  c06, c07, c06

	LDF	[C3 + 0 * SIZE], b1
	FADD	  c09, c12, c09
	LDF	[C3 + 1 * SIZE], b2
	FADD	  c10, c11, c10
	LDF	[C4 + 0 * SIZE], b3
	FADD	  c13, c16, c13
	LDF	[C4 + 1 * SIZE], b4
	FADD	  c14, c15, c14

	FMADD	(alpha_r, cc01, aa1, aa1)
	FMADD	(alpha_r, cc02, aa2, aa2)
	FMADD	(alpha_r, cc05, aa3, aa3)
	FMADD	(alpha_r, cc06, aa4, aa4)

	FMADD	(alpha_r, cc09, bb1, bb1)
	FMADD	(alpha_r, cc10, bb2, bb2)
	FMADD	(alpha_r, cc13, bb3, bb3)
	FMADD	(alpha_r, cc14, bb4, bb4)

#else
	FADD	c01, c04, c01
	FADD	c02, c03, c02
	FADD	c05, c08, c05
	FADD	c06, c07, c06

	FADD	c09, c12, c09
	FADD	c10, c11, c10
	FADD	c13, c16, c13
	FADD	c14, c15, c14

	FMUL	ALPHA_R, c01, a1
	FMUL	ALPHA_R, c02, a2
	FMUL	ALPHA_R, c05, a3
	FMUL	ALPHA_R, c06, a4

	FMUL	ALPHA_R, c09, b1
	FMUL	ALPHA_R, c10, b2
	FMUL	ALPHA_R, c13, b3
	FMUL	ALPHA_R, c14, b4
#endif

	FNMSUB	(alpha_i, cc02, aa1, aa1)
	FMADD	(alpha_i, cc01, aa2, aa2)
	FNMSUB	(alpha_i, cc06, aa3, aa3)
	FMADD	(alpha_i, cc05, aa4, aa4)

	FNMSUB	(alpha_i, cc10, bb1, bb1)
	STF	a1, [C1 + 0 * SIZE]
	FMADD	(alpha_i, cc09, bb2, bb2)
	STF	a2, [C1 + 1 * SIZE]
	FNMSUB	(alpha_i, cc14, bb3, bb3)
	STF	a3, [C2 + 0 * SIZE]
	FMADD	(alpha_i, cc13, bb4, bb4)
	STF	a4, [C2 + 1 * SIZE]

	STF	b1, [C3 + 0 * SIZE]
	add	C1, 2 * SIZE, C1
	STF	b2, [C3 + 1 * SIZE]
	add	C2, 2 * SIZE, C2
	STF	b3, [C4 + 0 * SIZE]
	add	C3, 2 * SIZE, C3
	STF	b4, [C4 + 1 * SIZE]
	add	C4, 2 * SIZE, C4

#ifdef TRMMKERNEL
#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	sub	K, KK, TEMP1
#ifdef LEFT
	add	TEMP1, -1, TEMP1
#else
	add	TEMP1, -4, TEMP1
#endif
	sll	TEMP1, ZBASE_SHIFT + 0, TEMP2
	sll	TEMP1, ZBASE_SHIFT + 2, TEMP1

	add	AO, TEMP2, AO
	add	BO, TEMP1, BO
#endif

#ifdef LEFT
	add	KK, 1, KK
#endif
#endif

	add	I, -1, I
	cmp	I, 0
	bg,pt	%icc, .LL12
	nop

#if defined(TRMMKERNEL) && !defined(LEFT)
	add	KK, 4, KK
#endif

	add	J, -1, J
	cmp	J, 0
	bg,pt	%icc, .LL11
	mov	BO, B
	.align 4

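/* .LL20: same kernel specialized for a two-column tail of N, with K
   unrolled by 4. */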
.LL20:
	and	N, 2, J
	cmp	J, 0
	ble,pn	%icc, .LL30
	mov	C,  C1

	add	C,  LDC, C2
	add	C2, LDC, C

#if defined(TRMMKERNEL) &&  defined(LEFT)
	mov	OFFSET, KK
#endif

	mov	M, I
	mov	A, AO
	.align 4

.LL22:
#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))))
	mov	B, BO
#else
	sll	KK, ZBASE_SHIFT + 0, TEMP1
	sll	KK, ZBASE_SHIFT + 1, TEMP2

	add	AO, TEMP1, AO
	add	B,  TEMP2, BO
#endif

	LDF	[AO +  0 * SIZE], a1
	LDF	[AO +  1 * SIZE], a2

	LDF	[BO +  0 * SIZE], b1
	LDF	[BO +  1 * SIZE], b2
	LDF	[BO +  2 * SIZE], b3
	LDF	[BO +  3 * SIZE], b4
	LDF	[BO +  4 * SIZE], b5
	FCLR	(cc01)

	LDF	[BO +  5 * SIZE], b6
	FCLR	(cc02)
	LDF	[BO +  6 * SIZE], b7
	FCLR	(cc03)
	LDF	[BO +  7 * SIZE], b8
	FCLR	(cc04)
	LDF	[BO +  8 * SIZE], b9
	FCLR	(cc05)

	prefetch [C1 + 2 * SIZE], 3
	FCLR	(cc06)
	prefetch [C2 + 2 * SIZE], 3
	FCLR	(cc07)

#ifndef TRMMKERNEL
	sra	K,  2, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 1, L
#else
	add	KK, 2, L
#endif
	sra	L,  2, L
#endif
	cmp	L,  0
	ble,pn	%icc, .LL25
	FCLR	(cc08)
	.align 4

.LL23:
	FMADD1	(aa1, bb1, cc01, cc01)
	LDF	[AO +  2 * SIZE], a3
	FMADD2	(aa2, bb1, cc02, cc02)
	LDF	[AO +  3 * SIZE], a4

	FMADD3	(aa1, bb2, cc03, cc03)
	LDF	[BO + 16 * SIZE], b1
	FMADD4	(aa2, bb2, cc04, cc04)
	LDF	[BO +  9 * SIZE], b2

	FMADD1	(aa1, bb3, cc05, cc05)
	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
	FMADD2	(aa2, bb3, cc06, cc06)
	add	L, -1, L

	FMADD3	(aa1, bb4, cc07, cc07)
	LDF	[BO + 10 * SIZE], b3
	FMADD4	(aa2, bb4, cc08, cc08)
	LDF	[BO + 11 * SIZE], b4

	FMADD1	(aa3, bb5, cc01, cc01)
	LDF	[AO +  4 * SIZE], a1
	FMADD2	(aa4, bb5, cc02, cc02)
	LDF	[AO +  5 * SIZE], a2

	FMADD3	(aa3, bb6, cc03, cc03)
	LDF	[BO + 12 * SIZE], b5
	FMADD4	(aa4, bb6, cc04, cc04)
	LDF	[BO + 13 * SIZE], b6

	FMADD1	(aa3, bb7, cc05, cc05)
	cmp	L, 0
	FMADD2	(aa4, bb7, cc06, cc06)
	add	AO,  8 * SIZE, AO

	FMADD3	(aa3, bb8, cc07, cc07)
	LDF	[BO + 14 * SIZE], b7
	FMADD4	(aa4, bb8, cc08, cc08)
	LDF	[BO + 15 * SIZE], b8

	FMADD1	(aa1, bb9, cc01, cc01)
	LDF	[AO -  2 * SIZE], a3
	FMADD2	(aa2, bb9, cc02, cc02)
	LDF	[AO -  1 * SIZE], a4

	FMADD3	(aa1, bb2, cc03, cc03)
	LDF	[BO + 24 * SIZE], b9
	FMADD4	(aa2, bb2, cc04, cc04)
	LDF	[BO + 17 * SIZE], b2

	FMADD1	(aa1, bb3, cc05, cc05)
	add	BO, 16 * SIZE, BO
	FMADD2	(aa2, bb3, cc06, cc06)
	nop

	FMADD3	(aa1, bb4, cc07, cc07)
	LDF	[BO +  2 * SIZE], b3
	FMADD4	(aa2, bb4, cc08, cc08)
	LDF	[BO +  3 * SIZE], b4

	FMADD1	(aa3, bb5, cc01, cc01)
	LDF	[AO +  0 * SIZE], a1
	FMADD2	(aa4, bb5, cc02, cc02)
	LDF	[AO +  1 * SIZE], a2

	FMADD3	(aa3, bb6, cc03, cc03)
	LDF	[BO +  4 * SIZE], b5
	FMADD4	(aa4, bb6, cc04, cc04)
	LDF	[BO +  5 * SIZE], b6

	FMADD1	(aa3, bb7, cc05, cc05)
	nop
	FMADD2	(aa4, bb7, cc06, cc06)
	LDF	[BO +  6 * SIZE], b7

	FMADD3	(aa3, bb8, cc07, cc07)
	FMADD4	(aa4, bb8, cc08, cc08)
	bg,pt	%icc, .LL23
	LDF	[BO +  7 * SIZE], b8
	.align 4

.LL25:
#ifndef TRMMKERNEL
	and	K,  3, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 1, L
#else
	add	KK, 2, L
#endif
	and	L,  3, L
#endif
	cmp	L,  0
	ble,a,pn %icc, .LL28
	nop
	.align 4

.LL27:
	FMADD1	(aa1, bb1, cc01, cc01)
	add	L, -1, L
	FMADD2	(aa2, bb1, cc02, cc02)
	LDF	[BO + 4 * SIZE], b1

	FMADD3	(aa1, bb2, cc03, cc03)
	add	AO, 2 * SIZE, AO
	FMADD4	(aa2, bb2, cc04, cc04)
	LDF	[BO + 5 * SIZE], b2

	FMADD1	(aa1, bb3, cc05, cc05)
	cmp	L, 0
	FMADD2	(aa2, bb3, cc06, cc06)
	LDF	[BO + 6 * SIZE], b3

	FMADD3	(aa1, bb4, cc07, cc07)
	LDF	[AO + 0 * SIZE], a1
	FMADD4	(aa2, bb4, cc08, cc08)
	LDF	[AO + 1 * SIZE], a2

	LDF	[BO + 7 * SIZE], b4
	bg,pt	%icc, .LL27
	add	BO, 4 * SIZE, BO
	.align 4

.LL28:
#ifndef TRMMKERNEL
	LDF	[C1 + 0 * SIZE], a1
	FADD	  c01, c04, c01
	LDF	[C1 + 1 * SIZE], a2
	FADD	  c02, c03, c02
	LDF	[C2 + 0 * SIZE], a3
	FADD	  c05, c08, c05
	LDF	[C2 + 1 * SIZE], a4
	FADD	  c06, c07, c06

	FMADD	(alpha_r, cc01, aa1, aa1)
	FMADD	(alpha_r, cc02, aa2, aa2)
	FMADD	(alpha_r, cc05, aa3, aa3)
	FMADD	(alpha_r, cc06, aa4, aa4)
#else
	FADD	  c01, c04, c01
	FADD	  c02, c03, c02
	FADD	  c05, c08, c05
	FADD	  c06, c07, c06

	FMUL	ALPHA_R, c01, a1
	FMUL	ALPHA_R, c02, a2
	FMUL	ALPHA_R, c05, a3
	FMUL	ALPHA_R, c06, a4
#endif

	FNMSUB	(alpha_i, cc02, aa1, aa1)
	FMADD	(alpha_i, cc01, aa2, aa2)
	FNMSUB	(alpha_i, cc06, aa3, aa3)
	FMADD	(alpha_i, cc05, aa4, aa4)

	STF	a1, [C1 + 0 * SIZE]
	add	I, -1, I
	STF	a2, [C1 + 1 * SIZE]
	cmp	I, 0
	STF	a3, [C2 + 0 * SIZE]
	add	C1, 2 * SIZE, C1
	STF	a4, [C2 + 1 * SIZE]
	add	C2, 2 * SIZE, C2

#ifdef TRMMKERNEL
#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	sub	K, KK, TEMP1
#ifdef LEFT
	add	TEMP1, -1, TEMP1
#else
	add	TEMP1, -2, TEMP1
#endif
	sll	TEMP1, ZBASE_SHIFT + 0, TEMP2
	sll	TEMP1, ZBASE_SHIFT + 1, TEMP1

	add	AO, TEMP2, AO
	add	BO, TEMP1, BO
#endif

#ifdef LEFT
	add	KK, 1, KK
#endif
#endif

	bg,pt	%icc, .LL22
	nop

#if defined(TRMMKERNEL) && !defined(LEFT)
	add	KK, 2, KK
#endif

	mov	BO, B
	.align 4

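/* .LL30: final single-column tail when N is odd, with K unrolled by 4. */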
.LL30:
	and	N, 1, J
	cmp	J, 0
	ble,pn	%icc, .LL999
	mov	C,  C1

#if defined(TRMMKERNEL) &&  defined(LEFT)
	mov	OFFSET, KK
#endif

	mov	M, I
	mov	A, AO
	.align 4

.LL32:
#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))))
	mov	B, BO
#else
	sll	KK, ZBASE_SHIFT + 0, TEMP1
	sll	KK, ZBASE_SHIFT + 0, TEMP2

	add	AO, TEMP1, AO
	add	B,  TEMP2, BO
#endif

	LDF	[AO +  0 * SIZE], a1
	LDF	[AO +  1 * SIZE], a2
	LDF	[AO +  2 * SIZE], a3
	LDF	[AO +  3 * SIZE], a4

	LDF	[BO +  0 * SIZE], b1
	LDF	[BO +  1 * SIZE], b2
	LDF	[BO +  2 * SIZE], b3
	FCLR	(cc01)
	LDF	[BO +  3 * SIZE], b4
	FCLR	(cc02)

	LDF	[BO +  4 * SIZE], b5
	FCLR	(cc03)
	LDF	[BO +  5 * SIZE], b6
	FCLR	(cc04)
	LDF	[BO +  6 * SIZE], b7
	FCLR	(cc05)
	LDF	[BO +  7 * SIZE], b8
	FCLR	(cc06)

	prefetch [C1 + 2 * SIZE], 3
	FCLR	(cc07)

#ifndef TRMMKERNEL
	sra	K,  2, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 1, L
#else
	add	KK, 1, L
#endif
	sra	L,  2, L
#endif
	cmp	L,  0
	ble,pn	%icc, .LL35
	FCLR	(cc08)
	.align 4

.LL33:
	FMADD1	(aa1, bb1, cc01, cc01)
	prefetch [AO + (APREFETCHSIZE +  0) * SIZE], APREFETCH_CATEGORY
	FMADD2	(aa2, bb1, cc02, cc02)
	LDF	[BO +  8 * SIZE], b1

	FMADD3	(aa1, bb2, cc03, cc03)
	LDF	[AO +  4 * SIZE], a1
	FMADD4	(aa2, bb2, cc04, cc04)
	LDF	[AO +  5 * SIZE], a2

	FMADD1	(aa3, bb3, cc01, cc01)
	LDF	[BO +  9 * SIZE], b2
	FMADD2	(aa4, bb3, cc02, cc02)
	LDF	[BO + 10 * SIZE], b3

	FMADD3	(aa3, bb4, cc03, cc03)
	LDF	[AO +  6 * SIZE], a3
	FMADD4	(aa4, bb4, cc04, cc04)
	LDF	[AO +  7 * SIZE], a4

	FMADD1	(aa1, bb5, cc01, cc01)
	LDF	[BO + 11 * SIZE], b4
	FMADD2	(aa2, bb5, cc02, cc02)
	LDF	[BO + 12 * SIZE], b5

	FMADD3	(aa1, bb6, cc03, cc03)
	LDF	[AO +  8 * SIZE], a1
	FMADD4	(aa2, bb6, cc04, cc04)
	LDF	[AO +  9 * SIZE], a2

	FMADD1	(aa3, bb7, cc01, cc01)
	LDF	[BO + 13 * SIZE], b6
	FMADD2	(aa4, bb7, cc02, cc02)
	LDF	[BO + 14 * SIZE], b7

	FMADD3	(aa3, bb8, cc03, cc03)
	LDF	[AO + 10 * SIZE], a3
	FMADD4	(aa4, bb8, cc04, cc04)
	LDF	[AO + 11 * SIZE], a4

	add	AO,  8 * SIZE, AO
	add	L, -1, L
	add	BO,  8 * SIZE, BO
	cmp	L, 0

	bg,pt	%icc, .LL33
	LDF	[BO +  7 * SIZE], b8
	.align 4

.LL35:
#ifndef TRMMKERNEL
	and	K,  3, L
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	K, KK, L
#elif defined(LEFT)
	add	KK, 1, L
#else
	add	KK, 1, L
#endif
	and	L,  3, L
#endif
	cmp	L,  0
	ble,a,pn %icc, .LL38
	nop
	.align 4

.LL37:
	FMADD1	(aa1, bb1, cc01, cc01)
	add	L, -1, L
	FMADD2	(aa2, bb1, cc02, cc02)
	LDF	[BO + 2 * SIZE], b1

	FMADD3	(aa1, bb2, cc03, cc03)
	LDF	[AO + 2 * SIZE], a1
	FMADD4	(aa2, bb2, cc04, cc04)
	LDF	[AO + 3 * SIZE], a2

	add	AO, 2 * SIZE, AO
	cmp	L, 0
	add	BO, 2 * SIZE, BO
	bg,pt	%icc, .LL37
	LDF	[BO + 1 * SIZE], b2
	.align 4

.LL38:
#ifndef TRMMKERNEL
	LDF	[C1 + 0 * SIZE], a1
	FADD	  c01, c04, c01
	LDF	[C1 + 1 * SIZE], a2
	FADD	  c02, c03, c02

	FMADD	(alpha_r, cc01, aa1, aa1)
	FMADD	(alpha_r, cc02, aa2, aa2)
#else
	FADD	  c01, c04, c01
	FADD	  c02, c03, c02

	FMUL	ALPHA_R, c01, a1
	FMUL	ALPHA_R, c02, a2
#endif

	FNMSUB	(alpha_i, cc02, aa1, aa1)
	FMADD	(alpha_i, cc01, aa2, aa2)

	STF	a1, [C1 + 0 * SIZE]
	STF	a2, [C1 + 1 * SIZE]

#ifdef TRMMKERNEL
#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	sub	K, KK, TEMP1
#ifdef LEFT
	add	TEMP1, -1, TEMP1
#else
	add	TEMP1, -1, TEMP1
#endif
	sll	TEMP1, ZBASE_SHIFT + 0, TEMP2
	sll	TEMP1, ZBASE_SHIFT + 0, TEMP1

	add	AO, TEMP2, AO
	add	BO, TEMP1, BO
#endif

#ifdef LEFT
	add	KK, 1, KK
#endif
#endif

	add	I, -1, I
	cmp	I, 0
	bg,pt	%icc, .LL32
	add	C1, 2 * SIZE, C1
	.align 4

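/* .LL999: all panels done; return 0 (cleared in the delay slot). */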
.LL999:
	return	%i7 + 8
	clr	%o0

	EPILOGUE