/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* Row-panel blocking factor: M is processed in panels of at most P complex
   elements so a panel of X can be staged contiguously in BUFFER. */
#define P 4000

/* Integer register roles.  SPARC register windows: %i = incoming arguments,
   %l = locals, %o = outgoing/scratch, %g = globals. */
#define M	%i0
#define N	%i1
#define A	%i5
#define LDA	%i2
#define X	%i3
#define INCX	%i4	

#define Y	%l0
#define INCY	%l1
#define BUFFER	%l2

#define I	%l3	/* generic loop counter */
#define IS	%l4	/* start row of the current panel */
#define J	%l5	/* column-group counter */
#define MIN_M	%l6	/* rows in the current panel: min(M - IS, P) */
#define XP	%l7	/* source of X data for this panel (X itself or BUFFER) */

#define A1	%o0	/* column pointers, up to four columns at a time */
#define A2	%o1
#define A3	%o2
#define A4	%o3

#define X1	%o4	/* read cursor into XP */
#define Y1	%o5	/* read cursor into Y */
#define PNLDA	%g1	/* P*2*SIZE - N*LDA(bytes): A adjustment between panels */
#define Y2	%o7	/* write cursor into Y */ /* Danger? */

/* Floating-point register roles.
   t1-t4  : pipelined product temporaries
   c1-c16 : dot-product accumulators (four per column: re*re, re*im, im*re, im*im)
   a1-a8  : matrix element staging
   b1-b4  : X element staging
   In DOUBLE mode only even-numbered %f registers are usable (64-bit pairs),
   in single mode consecutive %f registers are used. */
#ifdef DOUBLE
#define t1	%f0
#define	t2 	%f2
#define t3	%f4
#define	t4 	%f6

#define c1	%f8
#define c2	%f10
#define c3	%f12
#define c4	%f14
#define c5	%f16
#define c6	%f18
#define c7	%f20
#define c8	%f22
#define c9	%f24
#define c10	%f26
#define c11	%f28
#define c12	%f30
#define c13	%f32
#define c14	%f34
#define c15	%f36
#define c16	%f38

#define a1	%f40
#define a2	%f42
#define a3	%f44
#define a4	%f46
#define a5	%f48
#define a6	%f50
#define a7	%f52
#define a8	%f54

#define b1	%f56
#define b2	%f58
#define b3	%f60
#define b4	%f62
#else
#define t1	%f0
#define	t2 	%f1
#define t3	%f2
#define	t4 	%f3

#define c1	%f4
#define c2	%f5
#define c3	%f6
#define c4	%f7
#define c5	%f8
#define c6	%f9
#define c7	%f10
#define c8	%f11
#define c9	%f12
#define c10	%f13
#define c11	%f14
#define c12	%f15
#define c13	%f16
#define c14	%f17
#define c15	%f18
#define c16	%f19

#define a1	%f20
#define a2	%f21
#define a3	%f22
#define a4	%f23
#define a5	%f24
#define a6	%f25
#define a7	%f26
#define a8	%f27

#define b1	%f28
#define b2	%f29
#define b3	%f30
#define b4	%f31
#endif

/* Stack slots where alpha (complex scalar) is kept for the duration of the
   routine; offsets depend on the 32/64-bit ABI and element size. */
#ifndef __64BIT__
#define ALPHA_R	[%sp + STACK_START + 16]
#ifndef DOUBLE
#define ALPHA_I	[%sp + STACK_START + 20]
#else
#define ALPHA_I	[%sp + STACK_START + 24]
#endif
#else
#define ALPHA_R	[%sp + STACK_START + 32]
#define ALPHA_I	[%sp + STACK_START + 40]
#endif

/* Prefetch distance (in elements); halved for DOUBLE so the byte distance
   stays comparable. */
#ifdef DOUBLE
#define PREFETCHSIZE 18
#else
#define PREFETCHSIZE 36
#endif

	PROLOGUE
	SAVESP
	nop

/* Fetch the remaining arguments.  In the 32-bit ABI everything past the
   first few registers arrives on the stack (and alpha arrives in %i3/%i4/%i5,
   which we spill to ALPHA_R/ALPHA_I); in the 64-bit ABI alpha arrives in FP
   registers and the pointer/stride arguments are loaded as 64-bit words. */
#ifndef __64BIT__

#ifdef DOUBLE
	st	%i3, [%sp + STACK_START + 16]   /* ALPHA_R */
	st	%i4, [%sp + STACK_START + 20]
	st	%i5, [%sp + STACK_START + 24]   /* ALPHA_I */

	ld	[%sp + STACK_START + 32], A
	ld	[%sp + STACK_START + 36], LDA
	ld	[%sp + STACK_START + 40], X
	ld	[%sp + STACK_START + 44], INCX
	ld	[%sp + STACK_START + 48], Y
	ld	[%sp + STACK_START + 52], INCY
	ld	[%sp + STACK_START + 56], BUFFER
#else
	st	%i3, [%sp + STACK_START + 16]   /* ALPHA_R */
	st	%i4, [%sp + STACK_START + 20]   /* ALPHA_I */

	ld	[%sp + STACK_START + 28], LDA
	ld	[%sp + STACK_START + 32], X
	ld	[%sp + STACK_START + 36], INCX
	ld	[%sp + STACK_START + 40], Y
	ld	[%sp + STACK_START + 44], INCY
	ld	[%sp + STACK_START + 48], BUFFER
#endif
#else
	ldx	[%sp + STACK_START + 56], LDA
	ldx	[%sp + STACK_START + 64], X
	ldx	[%sp + STACK_START + 72], INCX
	ldx	[%sp + STACK_START + 80], Y
	ldx	[%sp + STACK_START + 88], INCY
	ldx	[%sp + STACK_START + 96], BUFFER
#ifdef DOUBLE
	std	%f6, ALPHA_R
	std	%f8, ALPHA_I
#else
	st	%f7, ALPHA_R
	st	%f9, ALPHA_I
#endif
#endif

	/* Convert strides from complex elements to bytes and precompute
	   PNLDA = P*2*SIZE - N*LDA, the pointer adjustment that moves A
	   from the end of one row panel to the start of the next. */
	clr	IS
	mov	P, I
	sll	LDA, ZBASE_SHIFT, LDA
	sll	I, ZBASE_SHIFT, I
	smul	LDA, N, PNLDA
	sll	INCX, ZBASE_SHIFT, INCX
	sll	INCY, ZBASE_SHIFT, INCY
	sub	I, PNLDA, PNLDA

/* Top of the outer row-panel loop.  MIN_M = min(M - IS, P).  If X is already
   contiguous (INCX == 2*SIZE) it is used in place via XP; otherwise the
   panel's stretch of X is gathered into BUFFER first. */
.LL10:
	sll	IS, ZBASE_SHIFT, I
	sub	M, IS, MIN_M
	mov	P, J

	cmp	MIN_M, J
	nop
	movg	%icc, J, MIN_M		/* MIN_M = min(MIN_M, P) */
	nop
	cmp	INCX, 2 * SIZE
	beq	.LL100
	add	X, I, XP		/* delay slot: XP = &X[IS] (contiguous case) */

	/* Gather loop, 4 complex elements per iteration. */
	sra	MIN_M, 2, I
	mov	BUFFER, XP
	cmp	I, 0
	ble,pn	%icc, .LL15
	mov	BUFFER, Y1

.LL11:
	LDF	[X + 0 * SIZE], a1
	LDF	[X + 1 * SIZE], a2
	add	X, INCX, X
	LDF	[X + 0 * SIZE], a3
	LDF	[X + 1 * SIZE], a4
	add	X, INCX, X
	LDF	[X + 0 * SIZE], a5
	LDF	[X + 1 * SIZE], a6
	add	X, INCX, X
	LDF	[X + 0 * SIZE], a7
	LDF	[X + 1 * SIZE], a8
	add	X, INCX, X

	STF	a1, [Y1 + 0 * SIZE]
	add	I, -1, I
	STF	a2, [Y1 + 1 * SIZE]
	cmp	I, 0
	STF	a3, [Y1 + 2 * SIZE]
	STF	a4, [Y1 + 3 * SIZE]
	STF	a5, [Y1 + 4 * SIZE]
	STF	a6, [Y1 + 5 * SIZE]
	STF	a7, [Y1 + 6 * SIZE]
	STF	a8, [Y1 + 7 * SIZE]
	bg,pn	%icc, .LL11
	add	Y1, 8 * SIZE, Y1	/* delay slot */

/* Gather the remaining 0-3 complex elements one at a time. */
.LL15:
	and	MIN_M, 3, I
	cmp	I, 0
	ble,pn	%icc, .LL100
	nop

.LL16:
	LDF	[X + 0 * SIZE], a1
	LDF	[X + 1 * SIZE], a2
	add	X, INCX, X
	add	I, -1, I
	cmp	I, 0
	nop
	STF	a1, [Y1 + 0 * SIZE]
	STF	a2, [Y1 + 1 * SIZE]
	bg,pn	%icc, .LL16
	add	Y1, 2 * SIZE, Y1	/* delay slot */

/* Process columns four at a time (J = N >> 2 groups).  For each group the
   sixteen accumulators c1..c16 (four per column) and the temporaries t1..t4
   are cleared, the four column pointers A1..A4 are set up, and the first
   operands are preloaded to prime the software pipeline. */
.LL100:
	sra	N, 2, J
	cmp	J, 0
	ble	%icc, .LL200
	mov	Y, Y1			/* delay slot: Y1 = read cursor into Y */

.LL110:
	FCLR(0)				/* t1 = 0; all clears below copy from t1 */

	FMOV	t1, c1
	sra	MIN_M, 2, I
	FMOV	t1, c2
	add	A,  LDA, A2
	FMOV	t1, c3
	mov	A,  A1
	FMOV	t1, c4
	add	A2, LDA, A3

	FMOV	t1, c5
	FMOV	t1, c6
	FMOV	t1, c7
	FMOV	t1, c8
	FMOV	t1, c9
	FMOV	t1, c10
	FMOV	t1, c11
	FMOV	t1, c12
	FMOV	t1, c13
	FMOV	t1, c14
	FMOV	t1, c15
	FMOV	t1, c16

	add	A3, LDA, A4
	FMOV	t1, t2
	mov	XP, X1
	FMOV	t1, t3
	add	A4, LDA, A		/* A advances past the four columns */
	cmp	I, 0
	ble	%icc, .LL115
	FMOV	t1, t4			/* delay slot */

	/* Prime the pipeline: first complex element of each column and the
	   first X operands. */
	LDF	[A1 + 0 * SIZE], a1
	nop
	LDF	[A1 + 1 * SIZE], a2
	add	A1, 2 * SIZE, A1
	LDF	[A2 + 0 * SIZE], a3
	LDF	[A2 + 1 * SIZE], a4
	add	A2, 2 * SIZE, A2
	LDF	[A3 + 0 * SIZE], a5
	LDF	[A3 + 1 * SIZE], a6
	add	A3, 2 * SIZE, A3
	LDF	[A4 + 0 * SIZE], a7
	LDF	[A4 + 1 * SIZE], a8
	add	A4, 2 * SIZE, A4

	LDF	[X1 + 0 * SIZE], b1
	nop
	LDF	[X1 + 1 * SIZE], b2
	nop
	LDF	[X1 + 2 * SIZE], b3
	add	X1, 4 * SIZE, X1

	deccc	 I
	ble	 .LL112
	prefetch [Y1 + 7 * SIZE], 2	/* delay slot */

/* Imaginary-part products are subtracted instead of added when X is
   conjugated. */
#ifndef XCONJ
#define FADDX	FADD
#else
#define FADDX	FSUB
#endif

/* Main loop: four complex rows x four columns per iteration, fully software
   pipelined.  Each 4-line group is add-accumulate / multiply / load for the
   next stage; the FADDs consume products started in the previous group, so
   instruction order must not be disturbed. */
.LL111:
	/* --- row pair 1 (operands b1/b2) --- */
	FADD	c13, t1, c13
	prefetch [A1 + PREFETCHSIZE * SIZE], 1
	FMUL	a1, b1, t1
	nop

	FADDX	c14, t2, c14
	nop
	FMUL	a1, b2, t2
	LDF	[A1 + 0 * SIZE], a1

	FADD	c15, t3, c15
	nop
	FMUL	a2, b1, t3
	LDF	[X1 - 1 * SIZE], b4

	FADD	c16, t4, c16
	nop
	FMUL	a2, b2, t4
	LDF	[A1 + 1 * SIZE], a2

	FADD	c1, t1, c1
	nop
	FMUL	a3, b1, t1
	nop

	FADDX	c2, t2, c2
	nop
	FMUL	a3, b2, t2
	LDF	[A2 + 0 * SIZE], a3

	FADD	c3, t3, c3
	nop
	FMUL	a4, b1, t3
	nop

	FADD	c4, t4, c4
	nop
	FMUL	a4, b2, t4
	LDF	[A2 + 1 * SIZE], a4

	FADD	c5, t1, c5
	nop
	FMUL	a5, b1, t1
	nop

	FADDX	c6, t2, c6
	nop
	FMUL	a5, b2, t2
	LDF	[A3 + 0 * SIZE], a5

	FADD	c7, t3, c7
	nop
	FMUL	a6, b1, t3
	nop

	FADD	c8, t4, c8
	nop
	FMUL	a6, b2, t4
	LDF	[A3 + 1 * SIZE], a6

	FADD	c9, t1, c9
	nop
	FMUL	a7, b1, t1
	nop

	FADDX	c10, t2, c10
	nop
	FMUL	a7, b2, t2
	LDF	[A4 + 0 * SIZE], a7

	FADD	c11, t3, c11
	nop
	FMUL	a8, b1, t3
	LDF	[X1 + 0 * SIZE], b1

	FADD	c12, t4, c12
	nop
	FMUL	a8, b2, t4
	LDF	[A4 + 1 * SIZE], a8

	/* --- row pair 2 (operands b3/b4) --- */
	FADD	c13, t1, c13
	nop
	FMUL	a1, b3, t1
	prefetch [A2 + PREFETCHSIZE * SIZE], 1

	FADDX	c14, t2, c14
	nop
	FMUL	a1, b4, t2
	LDF	[A1 + 2 * SIZE], a1

	FADD	c15, t3, c15
	nop
	FMUL	a2, b3, t3
	LDF	[X1 + 1 * SIZE], b2

	FADD	c16, t4, c16
	nop
	FMUL	a2, b4, t4
	LDF	[A1 + 3 * SIZE], a2

	FADD	c1, t1, c1
	nop
	FMUL	a3, b3, t1
	nop

	FADDX	c2, t2, c2
	nop
	FMUL	a3, b4, t2
	LDF	[A2 + 2 * SIZE], a3

	FADD	c3, t3, c3
	nop
	FMUL	a4, b3, t3
	nop

	FADD	c4, t4, c4
	nop
	FMUL	a4, b4, t4
	LDF	[A2 + 3 * SIZE], a4

	FADD	c5, t1, c5
	nop
	FMUL	a5, b3, t1
	nop

	FADDX	c6, t2, c6
	nop
	FMUL	a5, b4, t2
	LDF	[A3 + 2 * SIZE], a5

	FADD	c7, t3, c7
	nop
	FMUL	a6, b3, t3
	nop

	FADD	c8, t4, c8
	nop
	FMUL	a6, b4, t4
	LDF	[A3 + 3 * SIZE], a6

	FADD	c9, t1, c9
	nop
	FMUL	a7, b3, t1
	nop

	FADDX	c10, t2, c10
	nop
	FMUL	a7, b4, t2
	LDF	[A4 + 2 * SIZE], a7

	FADD	c11, t3, c11
	nop
	FMUL	a8, b3, t3
	LDF	[X1 + 2 * SIZE], b3

	FADD	c12, t4, c12
	nop
	FMUL	a8, b4, t4
	LDF	[A4 + 3 * SIZE], a8

	/* --- row pair 3 (operands b1/b2) --- */
	FADD	c13, t1, c13
	prefetch [A3 + PREFETCHSIZE * SIZE], 1
	FMUL	a1, b1, t1
	nop

	FADDX	c14, t2, c14
	nop
	FMUL	a1, b2, t2
	LDF	[A1 + 4 * SIZE], a1

	FADD	c15, t3, c15
	nop
	FMUL	a2, b1, t3
	LDF	[X1 + 3 * SIZE], b4

	FADD	c16, t4, c16
	nop
	FMUL	a2, b2, t4
	LDF	[A1 + 5 * SIZE], a2

	FADD	c1, t1, c1
	nop
	FMUL	a3, b1, t1
	nop

	FADDX	c2, t2, c2
	nop
	FMUL	a3, b2, t2
	LDF	[A2 + 4 * SIZE], a3

	FADD	c3, t3, c3
	nop
	FMUL	a4, b1, t3
	nop

	FADD	c4, t4, c4
	nop
	FMUL	a4, b2, t4
	LDF	[A2 + 5 * SIZE], a4

	FADD	c5, t1, c5
	nop
	FMUL	a5, b1, t1
	nop

	FADDX	c6, t2, c6
	nop
	FMUL	a5, b2, t2
	LDF	[A3 + 4 * SIZE], a5

	FADD	c7, t3, c7
	deccc	I			/* loop counter decremented mid-body */
	FMUL	a6, b1, t3
	nop

	FADD	c8, t4, c8
	nop
	FMUL	a6, b2, t4
	LDF	[A3 + 5 * SIZE], a6

	FADD	c9, t1, c9
	nop
	FMUL	a7, b1, t1
	nop

	FADDX	c10, t2, c10
	nop
	FMUL	a7, b2, t2
	LDF	[A4 + 4 * SIZE], a7

	FADD	c11, t3, c11
	nop
	FMUL	a8, b1, t3
	LDF	[X1 + 4 * SIZE], b1

	FADD	c12, t4, c12
	nop
	FMUL	a8, b2, t4
	LDF	[A4 + 5 * SIZE], a8

	/* --- row pair 4 (operands b3/b4); pointers advance by 8*SIZE --- */
	FADD	c13, t1, c13
	prefetch [A4 + PREFETCHSIZE * SIZE], 1
	FMUL	a1, b3, t1
	nop

	FADDX	c14, t2, c14
	nop
	FMUL	a1, b4, t2
	LDF	[A1 + 6 * SIZE], a1

	FADD	c15, t3, c15
	nop
	FMUL	a2, b3, t3
	LDF	[X1 + 5 * SIZE], b2

	FADD	c16, t4, c16
	nop
	FMUL	a2, b4, t4
	LDF	[A1 + 7 * SIZE], a2

	FADD	c1, t1, c1
	add	A1, 8 * SIZE, A1
	FMUL	a3, b3, t1
	nop

	FADDX	c2, t2, c2
	nop
	FMUL	a3, b4, t2
	LDF	[A2 + 6 * SIZE], a3

	FADD	c3, t3, c3
	nop
	FMUL	a4, b3, t3
	nop

	FADD	c4, t4, c4
	nop
	FMUL	a4, b4, t4
	LDF	[A2 + 7 * SIZE], a4

	FADD	c5, t1, c5
	add	A2, 8 * SIZE, A2
	FMUL	a5, b3, t1
	nop

	FADDX	c6, t2, c6
	nop
	FMUL	a5, b4, t2
	LDF	[A3 + 6 * SIZE], a5

	FADD	c7, t3, c7
	add	A4, 8 * SIZE, A4
	FMUL	a6, b3, t3
	nop

	FADD	c8, t4, c8
	nop
	FMUL	a6, b4, t4
	LDF	[A3 + 7 * SIZE], a6

	FADD	c9, t1, c9
	add	A3, 8 * SIZE, A3
	FMUL	a7, b3, t1
	nop

	FADDX	c10, t2, c10
	add	X1, 8 * SIZE, X1
	FMUL	a7, b4, t2
	LDF	[A4 - 2 * SIZE], a7	/* A4 already advanced */

	FADD	c11, t3, c11
	nop
	FMUL	a8, b3, t3
	LDF	[X1 - 2 * SIZE], b3	/* X1 already advanced */

	FADD	c12, t4, c12
	FMUL	a8, b4, t4
	bg,pn	%icc, .LL111
	LDF	[A4 - 1 * SIZE], a8	/* delay slot */

/* Pipeline drain: process the last unrolled iteration's four row pairs
   without starting loads for another iteration at the end. */
.LL112:
	FADD	c13, t1, c13
	nop
	FMUL	a1, b1, t1
	LDF	[X1 - 1 * SIZE], b4

	FADDX	c14, t2, c14
	nop
	FMUL	a1, b2, t2
	LDF	[A1 + 0 * SIZE], a1

	FADD	c15, t3, c15
	nop
	FMUL	a2, b1, t3
	LDF	[X1 - 1 * SIZE], b4	/* NOTE(review): duplicate of the b4 load above — appears redundant; left as-is to preserve the pipeline layout */

	FADD	c16, t4, c16
	nop
	FMUL	a2, b2, t4
	LDF	[A1 + 1 * SIZE], a2

	FADD	c1, t1, c1
	nop
	FMUL	a3, b1, t1
	nop

	FADDX	c2, t2, c2
	nop
	FMUL	a3, b2, t2
	LDF	[A2 + 0 * SIZE], a3

	FADD	c3, t3, c3
	nop
	FMUL	a4, b1, t3
	nop

	FADD	c4, t4, c4
	nop
	FMUL	a4, b2, t4
	LDF	[A2 + 1 * SIZE], a4

	FADD	c5, t1, c5
	nop
	FMUL	a5, b1, t1
	nop

	FADDX	c6, t2, c6
	nop
	FMUL	a5, b2, t2
	LDF	[A3 + 0 * SIZE], a5

	FADD	c7, t3, c7
	nop
	FMUL	a6, b1, t3
	nop

	FADD	c8, t4, c8
	nop
	FMUL	a6, b2, t4
	LDF	[A3 + 1 * SIZE], a6

	FADD	c9, t1, c9
	nop
	FMUL	a7, b1, t1
	nop

	FADDX	c10, t2, c10
	nop
	FMUL	a7, b2, t2
	LDF	[A4 + 0 * SIZE], a7

	FADD	c11, t3, c11
	nop
	FMUL	a8, b1, t3
	LDF	[X1 + 0 * SIZE], b1

	FADD	c12, t4, c12
	nop
	FMUL	a8, b2, t4
	LDF	[A4 + 1 * SIZE], a8

	/* --- drain, row pair 2 --- */
	FADD	c13, t1, c13
	nop
	FMUL	a1, b3, t1
	LDF	[X1 + 1 * SIZE], b2

	FADDX	c14, t2, c14
	nop
	FMUL	a1, b4, t2
	LDF	[A1 + 2 * SIZE], a1

	FADD	c15, t3, c15
	nop
	FMUL	a2, b3, t3
	nop

	FADD	c16, t4, c16
	nop
	FMUL	a2, b4, t4
	LDF	[A1 + 3 * SIZE], a2

	FADD	c1, t1, c1
	nop
	FMUL	a3, b3, t1
	nop

	FADDX	c2, t2, c2
	nop
	FMUL	a3, b4, t2
	LDF	[A2 + 2 * SIZE], a3

	FADD	c3, t3, c3
	nop
	FMUL	a4, b3, t3
	nop

	FADD	c4, t4, c4
	nop
	FMUL	a4, b4, t4
	LDF	[A2 + 3 * SIZE], a4

	FADD	c5, t1, c5
	nop
	FMUL	a5, b3, t1
	nop

	FADDX	c6, t2, c6
	nop
	FMUL	a5, b4, t2
	LDF	[A3 + 2 * SIZE], a5

	FADD	c7, t3, c7
	nop
	FMUL	a6, b3, t3
	nop

	FADD	c8, t4, c8
	nop
	FMUL	a6, b4, t4
	LDF	[A3 + 3 * SIZE], a6

	FADD	c9, t1, c9
	nop
	FMUL	a7, b3, t1
	nop

	FADDX	c10, t2, c10
	nop
	FMUL	a7, b4, t2
	LDF	[A4 + 2 * SIZE], a7

	FADD	c11, t3, c11
	nop
	FMUL	a8, b3, t3
	LDF	[X1 + 2 * SIZE], b3

	FADD	c12, t4, c12
	nop
	FMUL	a8, b4, t4
	LDF	[A4 + 3 * SIZE], a8

	/* --- drain, row pair 3; pointers advance by the 6*SIZE consumed here --- */
	FADD	c13, t1, c13
	nop
	FMUL	a1, b1, t1
	LDF	[X1 + 3 * SIZE], b4

	FADDX	c14, t2, c14
	add	X1, 4 * SIZE, X1
	FMUL	a1, b2, t2
	LDF	[A1 + 4 * SIZE], a1

	FADD	c15, t3, c15
	nop
	FMUL	a2, b1, t3
	nop

	FADD	c16, t4, c16
	nop
	FMUL	a2, b2, t4
	LDF	[A1 + 5 * SIZE], a2

	FADD	c1, t1, c1
	add	A1, 6 * SIZE, A1
	FMUL	a3, b1, t1
	nop

	FADDX	c2, t2, c2
	nop
	FMUL	a3, b2, t2
	LDF	[A2 + 4 * SIZE], a3

	FADD	c3, t3, c3
	nop
	FMUL	a4, b1, t3
	nop

	FADD	c4, t4, c4
	nop
	FMUL	a4, b2, t4
	LDF	[A2 + 5 * SIZE], a4

	FADD	c5, t1, c5
	add	A2, 6 * SIZE, A2
	FMUL	a5, b1, t1
	nop

	FADDX	c6, t2, c6
	nop
	FMUL	a5, b2, t2
	LDF	[A3 + 4 * SIZE], a5

	FADD	c7, t3, c7
	nop
	FMUL	a6, b1, t3
	nop

	FADD	c8, t4, c8
	nop
	FMUL	a6, b2, t4
	LDF	[A3 + 5 * SIZE], a6

	FADD	c9, t1, c9
	add	A3, 6 * SIZE, A3
	FMUL	a7, b1, t1
	nop

	FADDX	c10, t2, c10
	nop
	FMUL	a7, b2, t2
	LDF	[A4 + 4 * SIZE], a7

	FADD	c11, t3, c11
	nop
	FMUL	a8, b1, t3
	nop

	FADD	c12, t4, c12
	nop
	FMUL	a8, b2, t4
	LDF	[A4 + 5 * SIZE], a8

	/* --- drain, row pair 4: final products, no further loads --- */
	FADD	c13, t1, c13
	add	A4, 6 * SIZE, A4
	FMUL	a1, b3, t1
	nop

	FADDX	c14, t2, c14
	nop
	FMUL	a1, b4, t2
	nop

	FADD	c15, t3, c15
	FMUL	a2, b3, t3
	FADD	c16, t4, c16
	FMUL	a2, b4, t4

	FADD	c1, t1, c1
	FMUL	a3, b3, t1
	FADDX	c2, t2, c2
	FMUL	a3, b4, t2
	FADD	c3, t3, c3
	FMUL	a4, b3, t3
	FADD	c4, t4, c4
	FMUL	a4, b4, t4

	FADD	c5, t1, c5
	FMUL	a5, b3, t1
	FADDX	c6, t2, c6
	FMUL	a5, b4, t2
	FADD	c7, t3, c7
	FMUL	a6, b3, t3
	FADD	c8, t4, c8
	FMUL	a6, b4, t4

	FADD	c9, t1, c9
	FMUL	a7, b3, t1
	FADDX	c10, t2, c10
	FMUL	a7, b4, t2
	FADD	c11, t3, c11
	FMUL	a8, b3, t3
	FADD	c12, t4, c12
	FMUL	a8, b4, t4

/* Remaining 0-3 rows for the four-column group, one complex element per
   iteration.  alpha is loaded into b3/b4 here (b3=real, b4=imag) for the
   combine step that follows. */
.LL115:
	andcc	MIN_M, 3, I
	LDF	ALPHA_R, b3
	mov	Y1, Y2			/* Y2 = write cursor, trailing Y1 */
	ble,pn	%icc, .LL119
	LDF	ALPHA_I, b4		/* delay slot */

.L116:
	LDF	[A1 + 0 * SIZE], a1
	LDF	[A1 + 1 * SIZE], a2
	add	A1, 2 * SIZE, A1
	LDF	[X1 + 0 * SIZE], b1
	LDF	[X1 + 1 * SIZE], b2
	add	X1, 2 * SIZE, X1
	LDF	[A2 + 0 * SIZE], a3
	LDF	[A2 + 1 * SIZE], a4
	add	A2, 2 * SIZE, A2
	LDF	[A3 + 0 * SIZE], a5
	LDF	[A3 + 1 * SIZE], a6
	add	A3, 2 * SIZE, A3
	LDF	[A4 + 0 * SIZE], a7
	LDF	[A4 + 1 * SIZE], a8
	add	A4, 2 * SIZE, A4

	FADD	c13, t1, c13
	FMUL	a1, b1, t1
	FADDX	c14, t2, c14
	FMUL	a1, b2, t2
	FADD	c15, t3, c15
	FMUL	a2, b1, t3
	FADD	c16, t4, c16
	FMUL	a2, b2, t4

	FADD	c1, t1, c1
	FMUL	a3, b1, t1
	FADDX	c2, t2, c2
	FMUL	a3, b2, t2
	FADD	c3, t3, c3
	FMUL	a4, b1, t3
	FADD	c4, t4, c4
	FMUL	a4, b2, t4

	FADD	c5, t1, c5
	FMUL	a5, b1, t1
	FADDX	c6, t2, c6
	FMUL	a5, b2, t2
	FADD	c7, t3, c7
	FMUL	a6, b1, t3
	FADD	c8, t4, c8
	FMUL	a6, b2, t4

	FADD	c9, t1, c9
	FMUL	a7, b1, t1
	FADDX	c10, t2, c10
	FMUL	a7, b2, t2
	FADD	c11, t3, c11
	FMUL	a8, b1, t3
	FADD	c12, t4, c12
	FMUL	a8, b2, t4

	deccc	I
	bg	%icc, .L116
	nop

/* Fold in the last pending products, combine the partial sums into four
   complex dot products, scale by alpha, and accumulate into Y.
   For each column: real = sum(re*re) -/+ sum(im*im),
                    imag = sum(re*im) +/- sum(im*re),
   with signs chosen by CONJ/XCONJ.  Old Y values a1..a8 are loaded via Y1;
   updated values are stored via the trailing cursor Y2. */
.LL119:
	FADD	c13, t1, c13
	LDF	[Y1 + 0 * SIZE], a1
	FADDX	c14, t2, c14
	LDF	[Y1 + 1 * SIZE] ,a2
	add	Y1, INCY, Y1
	FADD	c15, t3, c15
	LDF	[Y1 + 0 * SIZE], a3
	FADD	c16, t4, c16
	LDF	[Y1 + 1 * SIZE] ,a4
	add	Y1, INCY, Y1

#if (!defined(XCONJ) && !defined(CONJ)) || (defined(XCONJ) && defined(CONJ))
	FSUB	c1,  c4,  c1		/* real part: re*re - im*im */
	LDF	[Y1 + 0 * SIZE], a5
	FSUB	c5,  c8,  c5
	LDF	[Y1 + 1 * SIZE] ,a6
	add	Y1, INCY, Y1
	FSUB	c9,  c12, c9
	LDF	[Y1 + 0 * SIZE], a7
	FSUB	c13, c16, c13
	LDF	[Y1 + 1 * SIZE] ,a8
	add	Y1, INCY, Y1
#else
	FADD	c1,  c4,  c1		/* real part: re*re + im*im */
	LDF	[Y1 + 0 * SIZE], a5
	FADD	c5,  c8,  c5
	LDF	[Y1 + 1 * SIZE] ,a6
	add	Y1, INCY, Y1
	FADD	c9,  c12, c9
	LDF	[Y1 + 0 * SIZE], a7
	FADD	c13, c16, c13
	LDF	[Y1 + 1 * SIZE] ,a8
	add	Y1, INCY, Y1
#endif

#ifndef CONJ
	FADD	c2,  c3,  c2		/* imag part: re*im + im*re */
	FCLR(0)
	FADD	c6,  c7,  c6
	FADD	c10, c11, c10
	FADD	c14, c15, c14
#else
	FSUB	c2,  c3,  c2		/* imag part: re*im - im*re */
	FCLR(0)
	FSUB	c6,  c7,  c6
	FSUB	c10, c11, c10
	FSUB	c14, c15, c14
#endif

	/* Complex alpha scaling: result = alpha * (c_re + i*c_im),
	   interleaved with resetting t2..t4 from the freshly cleared t1. */
	FMUL	b3, c1, c3		/* alpha_r * re */
	FMOV	t1, t2
	FMUL	b4, c1, c4		/* alpha_i * re */
	FMOV	t1, t3
	FMUL	b4, c2, c1		/* alpha_i * im */
	FMOV	t1, t4
	FMUL	b3, c2, c2		/* alpha_r * im */

	FMUL	b3, c5, c7
	FMUL	b4, c5, c8
	FMUL	b4, c6, c5
	FMUL	b3, c6, c6

	FMUL	b3, c9,  c11
	FMUL	b4, c9,  c12
	FMUL	b4, c10, c9
	FMUL	b3, c10, c10

	FMUL	b3, c13, c15
	FSUB	c3,  c1,  c1		/* new_re = alpha_r*re - alpha_i*im */
	FMUL	b4, c13, c16
	FADD	c2,  c4,  c2		/* new_im = alpha_r*im + alpha_i*re */
	FMUL	b4, c14, c13
	FSUB	c7,  c5,  c5
	FMUL	b3, c14, c14
	FADD	c6,  c8,  c6

	FSUB	c11, c9,  c9
	FADD	c10, c12, c10
	FSUB	c15, c13, c13
	FADD	c14, c16, c14

	/* y += alpha * A * x for the four columns' contributions */
	FADD	a1, c1, a1
	FADD	a2, c2, a2
	FADD	a3, c5, a3
	FADD	a4, c6, a4

	STF	a1, [Y2 + 0 * SIZE]
	FADD	a5, c9,  a5
	STF	a2, [Y2 + 1 * SIZE]
	FADD	a6, c10, a6
	add	Y2, INCY, Y2
	STF	a3, [Y2 + 0 * SIZE]
	FADD	a7, c13, a7
	STF	a4, [Y2 + 1 * SIZE]
	FADD	a8, c14, a8
	add	Y2, INCY, Y2

	/* Store the last two results while pre-clearing accumulators for the
	   next column group and testing the group counter. */
	STF	a5, [Y2 + 0 * SIZE]
	FMOV	t1, c1
	add	J, -1, J
	STF	a6, [Y2 + 1 * SIZE]
	FMOV	t1, c2
	cmp	J, 0
	add	Y2, INCY, Y2
	STF	a7, [Y2 + 0 * SIZE]
	FMOV	t1, c3
	STF	a8, [Y2 + 1 * SIZE]
	FMOV	t1, c4
	add	Y2, INCY, Y2

	FMOV	t1, c5
	bg	%icc, .LL110
	FMOV	t1, c6			/* delay slot */

/* Handle two leftover columns (N & 2): same scheme as the four-column path
   but with accumulators c1..c8 and column pointers A1/A2 only. */
.LL200:
	FCLR(0)				/* t1 = 0 */

	and	N, 2, J
	cmp	J, 0
	FMOV	t1, c1
	ble	%icc, .LL300

	FMOV	t1, c2			/* delay slot */
	sra	MIN_M, 2, I
	FMOV	t1, t2
	add	A,  LDA, A2
	FMOV	t1, c3
	mov	A,  A1
	FMOV	t1, t3
	cmp	I, 0
	FMOV	t1, c4

	FMOV	t1, c5
	FMOV	t1, c6
	FMOV	t1, c7
	FMOV	t1, c8

	add	A2, LDA, A		/* A advances past the two columns */
	FMOV	t1, t4
	ble	%icc, .LL215
	mov	XP, X1			/* delay slot */

	/* Prime the pipeline: two complex elements per column plus the first
	   two X elements. */
	LDF	[A1 + 0 * SIZE], a1
	LDF	[A1 + 1 * SIZE], a2
	LDF	[A1 + 2 * SIZE], a5
	LDF	[A1 + 3 * SIZE], a6
	add	A1, 4 * SIZE, A1

	LDF	[A2 + 0 * SIZE], a3
	LDF	[A2 + 1 * SIZE], a4
	LDF	[A2 + 2 * SIZE], a7
	LDF	[A2 + 3 * SIZE], a8
	add	A2, 4 * SIZE, A2

	LDF	[X1 + 0 * SIZE], b1
	add	I, -1, I
	LDF	[X1 + 1 * SIZE], b2
	cmp	I, 0
	LDF	[X1 + 2 * SIZE], b3
	LDF	[X1 + 3 * SIZE], b4
	ble	%icc, .LL212
	add	X1, 4 * SIZE, X1	/* delay slot */

/* Two-column main loop: four complex rows per iteration, software pipelined
   over b1/b2 and b3/b4 alternately. */
.LL211:
	prefetch [A1 + PREFETCHSIZE * SIZE], 1

	FADD	c5, t1, c5
	FMUL	a1, b1, t1
	FADDX	c6, t2, c6
	FMUL	a1, b2, t2
	LDF	[A1 + 0 * SIZE], a1
	FADD	c7, t3, c7
	FMUL	a2, b1, t3
	FADD	c8, t4, c8
	FMUL	a2, b2, t4
	LDF	[A1 + 1 * SIZE], a2

	FADD	c1, t1, c1
	FMUL	a3, b1, t1
	FADDX	c2, t2, c2
	FMUL	a3, b2, t2
	LDF	[A2 + 0 * SIZE], a3
	FADD	c3, t3, c3
	FMUL	a4, b1, t3
	LDF	[X1 + 0 * SIZE], b1
	FADD	c4, t4, c4
	FMUL	a4, b2, t4
	LDF	[A2 + 1 * SIZE], a4

	FADD	c5, t1, c5
	LDF	[X1 + 1 * SIZE], b2
	FMUL	a5, b3, t1
	FADDX	c6, t2, c6
	FMUL	a5, b4, t2
	LDF	[A1 + 2 * SIZE], a5
	FADD	c7, t3, c7
	add	I, -1, I
	FMUL	a6, b3, t3
	FADD	c8, t4, c8
	cmp	I, 0
	FMUL	a6, b4, t4
	LDF	[A1 + 3 * SIZE], a6

	FADD	c1, t1, c1
	FMUL	a7, b3, t1
	FADDX	c2, t2, c2
	FMUL	a7, b4, t2
	LDF	[A2 + 2 * SIZE], a7
	FADD	c3, t3, c3
	FMUL	a8, b3, t3
	LDF	[X1 + 2 * SIZE], b3
	FADD	c4, t4, c4
	FMUL	a8, b4, t4
	LDF	[A2 + 3 * SIZE], a8

	prefetch [A2 + PREFETCHSIZE * SIZE], 1
	FADD	c5, t1, c5
	LDF	[X1 + 3 * SIZE], b4
	FMUL	a1, b1, t1
	FADDX	c6, t2, c6
	FMUL	a1, b2, t2
	LDF	[A1 + 4 * SIZE], a1
	FADD	c7, t3, c7
	FMUL	a2, b1, t3
	FADD	c8, t4, c8
	FMUL	a2, b2, t4
	LDF	[A1 + 5 * SIZE], a2

	FADD	c1, t1, c1
	FMUL	a3, b1, t1
	FADDX	c2, t2, c2
	FMUL	a3, b2, t2
	LDF	[A2 + 4 * SIZE], a3
	FADD	c3, t3, c3
	FMUL	a4, b1, t3
	LDF	[X1 + 4 * SIZE], b1
	FADD	c4, t4, c4
	FMUL	a4, b2, t4
	LDF	[A2 + 5 * SIZE], a4

	FADD	c5, t1, c5
	LDF	[X1 + 5 * SIZE], b2
	FMUL	a5, b3, t1
	FADDX	c6, t2, c6
	FMUL	a5, b4, t2
	LDF	[A1 + 6 * SIZE], a5
	FADD	c7, t3, c7
	FMUL	a6, b3, t3
	FADD	c8, t4, c8
	FMUL	a6, b4, t4
	LDF	[A1 + 7 * SIZE], a6
	add	A1, 8 * SIZE, A1

	FADD	c1, t1, c1
	FMUL	a7, b3, t1
	FADDX	c2, t2, c2
	FMUL	a7, b4, t2
	LDF	[A2 + 6 * SIZE], a7
	FADD	c3, t3, c3
	FMUL	a8, b3, t3
	LDF	[X1 + 6 * SIZE], b3
	FADD	c4, t4, c4
	add	X1, 8 * SIZE, X1
	FMUL	a8, b4, t4
	LDF	[A2 + 7 * SIZE], a8
	add	A2, 8 * SIZE, A2
	bg,pn	%icc, .LL211
	LDF	[X1 - 1 * SIZE], b4	/* delay slot */

/* Two-column pipeline drain: last four rows without restarting loads. */
.LL212:
	FADD	c5, t1, c5
	FMUL	a1, b1, t1
	FADDX	c6, t2, c6
	FMUL	a1, b2, t2
	LDF	[A1 + 0 * SIZE], a1
	FADD	c7, t3, c7
	FMUL	a2, b1, t3
	FADD	c8, t4, c8
	FMUL	a2, b2, t4
	LDF	[A1 + 1 * SIZE], a2

	FADD	c1, t1, c1
	FMUL	a3, b1, t1
	FADDX	c2, t2, c2
	FMUL	a3, b2, t2
	LDF	[A2 + 0 * SIZE], a3
	FADD	c3, t3, c3
	FMUL	a4, b1, t3
	LDF	[X1 + 0 * SIZE], b1
	FADD	c4, t4, c4
	FMUL	a4, b2, t4
	LDF	[A2 + 1 * SIZE], a4

	FADD	c5, t1, c5
	LDF	[X1 + 1 * SIZE], b2
	FMUL	a5, b3, t1
	FADDX	c6, t2, c6
	FMUL	a5, b4, t2
	LDF	[A1 + 2 * SIZE], a5
	FADD	c7, t3, c7
	FMUL	a6, b3, t3
	FADD	c8, t4, c8
	FMUL	a6, b4, t4
	LDF	[A1 + 3 * SIZE], a6
	add	A1, 4 * SIZE, A1

	FADD	c1, t1, c1
	FMUL	a7, b3, t1
	FADDX	c2, t2, c2
	FMUL	a7, b4, t2
	LDF	[A2 + 2 * SIZE], a7
	FADD	c3, t3, c3
	FMUL	a8, b3, t3
	LDF	[X1 + 2 * SIZE], b3
	FADD	c4, t4, c4
	FMUL	a8, b4, t4
	LDF	[A2 + 3 * SIZE], a8
	add	A2, 4 * SIZE, A2

	FADD	c5, t1, c5
	LDF	[X1 + 3 * SIZE], b4
	add	X1, 4 * SIZE, X1
	FMUL	a1, b1, t1
	FADDX	c6, t2, c6
	FMUL	a1, b2, t2
	FADD	c7, t3, c7
	FMUL	a2, b1, t3
	FADD	c8, t4, c8
	FMUL	a2, b2, t4

	FADD	c1, t1, c1
	FMUL	a3, b1, t1
	FADDX	c2, t2, c2
	FMUL	a3, b2, t2
	FADD	c3, t3, c3
	FMUL	a4, b1, t3
	FADD	c4, t4, c4
	FMUL	a4, b2, t4

	FADD	c5, t1, c5
	FMUL	a5, b3, t1
	FADDX	c6, t2, c6
	FMUL	a5, b4, t2
	FADD	c7, t3, c7
	FMUL	a6, b3, t3
	FADD	c8, t4, c8
	FMUL	a6, b4, t4

	FADD	c1, t1, c1
	FMUL	a7, b3, t1
	FADDX	c2, t2, c2
	FMUL	a7, b4, t2
	FADD	c3, t3, c3
	FMUL	a8, b3, t3
	FADD	c4, t4, c4
	FMUL	a8, b4, t4

/* Remaining 0-3 rows for the two-column group; alpha is loaded into b3/b4
   for the combine step. */
.LL215:
	andcc	MIN_M, 3, I
	LDF	ALPHA_R, b3
	mov	Y1, Y2
	ble	%icc, .LL219
	LDF	ALPHA_I, b4		/* delay slot */

	/* Prime one element per column. */
	LDF	[A1 + 0 * SIZE], a1
	add	I, -1, I
	LDF	[A1 + 1 * SIZE], a2
	cmp	I, 0
	add	A1, 2 * SIZE, A1

	LDF	[A2 + 0 * SIZE], a3
	LDF	[A2 + 1 * SIZE], a4
	add	A2, 2 * SIZE, A2

	LDF	[X1 + 0 * SIZE], b1
	LDF	[X1 + 1 * SIZE], b2
	ble	%icc, .LL217
	add	X1, 2 * SIZE, X1	/* delay slot */

.LL216:
	FADD	c5, t1, c5
	FMUL	a1, b1, t1
	FADDX	c6, t2, c6
	FMUL	a1, b2, t2
	LDF	[A1 + 0 * SIZE], a1
	FADD	c7, t3, c7
	add	I, -1, I
	FMUL	a2, b1, t3
	FADD	c8, t4, c8
	cmp	I, 0
	FMUL	a2, b2, t4
	LDF	[A1 + 1 * SIZE], a2
	add	A1, 2 * SIZE, A1

	FADD	c1, t1, c1
	FMUL	a3, b1, t1
	FADDX	c2, t2, c2
	FMUL	a3, b2, t2
	LDF	[A2 + 0 * SIZE], a3
	FADD	c3, t3, c3
	FMUL	a4, b1, t3
	LDF	[X1 + 0 * SIZE], b1
	FADD	c4, t4, c4
	add	X1, 2 * SIZE, X1
	FMUL	a4, b2, t4
	LDF	[A2 + 1 * SIZE], a4
	add	A2, 2 * SIZE, A2
	bg,pn	%icc, .LL216
	LDF	[X1 - 1 * SIZE], b2	/* delay slot */

/* Drain the last primed element. */
.LL217:
	FADD	c5, t1, c5
	FMUL	a1, b1, t1
	FADDX	c6, t2, c6
	FMUL	a1, b2, t2
	FADD	c7, t3, c7
	FMUL	a2, b1, t3
	FADD	c8, t4, c8
	FMUL	a2, b2, t4

	FADD	c1, t1, c1
	FMUL	a3, b1, t1
	FADDX	c2, t2, c2
	FMUL	a3, b2, t2
	FADD	c3, t3, c3
	FMUL	a4, b1, t3
	FADD	c4, t4, c4
	FMUL	a4, b2, t4

/* Combine the two columns' partial sums, apply complex alpha, and
   accumulate into Y (same sign scheme as the four-column path). */
.LL219:
	FADD	c5, t1, c5
	LDF	[Y1 + 0 * SIZE], a1
	FADDX	c6, t2, c6
	LDF	[Y1 + 1 * SIZE] ,a2
	add	Y1, INCY, Y1
	FADD	c7, t3, c7
	LDF	[Y1 + 0 * SIZE], a3
	FADD	c8, t4, c8
	LDF	[Y1 + 1 * SIZE] ,a4
	add	Y1, INCY, Y1

#if (!defined(XCONJ) && !defined(CONJ)) || (defined(XCONJ) && defined(CONJ))
	FSUB	c1, c4, c1		/* real part: re*re - im*im */
	FSUB	c5, c8, c5
#else
	FADD	c1, c4, c1
	FADD	c5, c8, c5
#endif

#ifndef CONJ
	FADD	c2, c3, c2		/* imag part: re*im + im*re */
	FADD	c6, c7, c6
#else
	FSUB	c2, c3, c2
	FSUB	c6, c7, c6
#endif

	FMUL	b3, c1, c3		/* alpha_r * re */
	FMUL	b4, c1, c4		/* alpha_i * re */
	FMUL	b4, c2, c1		/* alpha_i * im */
	FMUL	b3, c2, c2		/* alpha_r * im */

	FMUL	b3, c5, c7
	FMUL	b4, c5, c8
	FMUL	b4, c6, c5
	FMUL	b3, c6, c6

	FSUB	c3, c1, c1		/* new_re = alpha_r*re - alpha_i*im */
	FADD	c2, c4, c2		/* new_im = alpha_r*im + alpha_i*re */
	FSUB	c7, c5, c5
	FADD	c6, c8, c6

	FADD	a1, c1, a1
	FADD	a2, c2, a2
	FADD	a3, c5, a3
	FADD	a4, c6, a4

	STF	a1, [Y2 + 0 * SIZE]
	STF	a2, [Y2 + 1 * SIZE]
	add	Y2, INCY, Y2
	STF	a3, [Y2 + 0 * SIZE]
	STF	a4, [Y2 + 1 * SIZE]
	
	
/* Handle one leftover column (N & 1); accumulators c1..c4, X values are
   staged in c9..c16 since only one column of A data is needed. */
.LL300:
	andcc	N, 1, J
	FCLR(0)				/* t1 = 0 */
	ble	%icc, .LL400
	FMOV	t1, c1			/* delay slot */

.LL310:
	sra	MIN_M, 2, I
	FMOV	t1, c2
	FMOV	t1, c3
	FMOV	t1, c4
	mov	A, A1
	FMOV	t1, t2
	add	A, LDA, A
	FMOV	t1, t3
	cmp	I, 0
	FMOV	t1, t4
	ble	%icc, .LL315
	mov	XP, X1			/* delay slot */

	/* Prime four complex elements of the column and of X. */
	LDF	[A1 + 0 * SIZE], a1
	LDF	[A1 + 1 * SIZE], a2
	LDF	[A1 + 2 * SIZE], a3
	LDF	[A1 + 3 * SIZE], a4
	LDF	[A1 + 4 * SIZE], a5
	LDF	[A1 + 5 * SIZE], a6
	LDF	[A1 + 6 * SIZE], a7
	LDF	[A1 + 7 * SIZE], a8
	add	A1, 8 * SIZE, A1

	LDF	[X1 + 0 * SIZE], c9
	add	I, -1, I
	LDF	[X1 + 1 * SIZE], c10
	cmp	I, 0
	LDF	[X1 + 2 * SIZE], c11
	LDF	[X1 + 3 * SIZE], c12
	LDF	[X1 + 4 * SIZE], c13
	LDF	[X1 + 5 * SIZE], c14
	LDF	[X1 + 6 * SIZE], c15
	LDF	[X1 + 7 * SIZE], c16
	ble	%icc, .LL312
	add	X1, 8 * SIZE, X1	/* delay slot */

/* Single-column main loop: four complex rows per iteration, pipelined. */
.LL311:
	prefetch [A1 + PREFETCHSIZE * SIZE], 1

	FADD	c1, t1, c1
	FMUL	a1, c9,  t1
	FADDX	c2, t2, c2
	FMUL	a1, c10, t2
	LDF	[A1 + 0 * SIZE], a1
	FADD	c3, t3, c3
	FMUL	a2, c9,  t3
	LDF	[X1 + 0 * SIZE], c9
	FADD	c4, t4, c4
	FMUL	a2, c10, t4
	LDF	[A1 + 1 * SIZE], a2
	LDF	[X1 + 1 * SIZE], c10

	FADD	c1, t1, c1
	FMUL	a3, c11, t1
	FADDX	c2, t2, c2
	FMUL	a3, c12, t2
	LDF	[A1 + 2 * SIZE], a3
	FADD	c3, t3, c3
	add	I, -1, I
	FMUL	a4, c11, t3
	LDF	[X1 + 2 * SIZE], c11
	FADD	c4, t4, c4
	cmp	I, 0
	FMUL	a4, c12, t4
	LDF	[A1 + 3 * SIZE], a4
	LDF	[X1 + 3 * SIZE], c12

	FADD	c1, t1, c1
	FMUL	a5, c13, t1
	FADDX	c2, t2, c2
	FMUL	a5, c14, t2
	LDF	[A1 + 4 * SIZE], a5
	FADD	c3, t3, c3
	FMUL	a6, c13, t3
	LDF	[X1 + 4 * SIZE], c13
	FADD	c4, t4, c4
	FMUL	a6, c14, t4
	LDF	[A1 + 5 * SIZE], a6
	LDF	[X1 + 5 * SIZE], c14

	FADD	c1, t1, c1
	FMUL	a7, c15, t1
	FADDX	c2, t2, c2
	FMUL	a7, c16, t2
	LDF	[A1 + 6 * SIZE], a7

	FADD	c3, t3, c3
	FMUL	a8, c15, t3
	LDF	[X1 + 6 * SIZE], c15
	FADD	c4, t4, c4
	add	X1, 8 * SIZE, X1
	FMUL	a8, c16, t4
	LDF	[A1 + 7 * SIZE], a8
	add	A1, 8 * SIZE, A1
	bg,pn	%icc, .LL311
	LDF	[X1 - 1 * SIZE], c16	/* delay slot */

/* Single-column pipeline drain: last four primed rows, no further loads. */
.LL312:
	FADD	c1, t1, c1
	FMUL	a1, c9,  t1
	FADDX	c2, t2, c2
	FMUL	a1, c10, t2
	FADD	c3, t3, c3
	FMUL	a2, c9,  t3
	FADD	c4, t4, c4
	FMUL	a2, c10, t4

	FADD	c1, t1, c1
	FMUL	a3, c11, t1
	FADDX	c2, t2, c2
	FMUL	a3, c12, t2
	FADD	c3, t3, c3
	FMUL	a4, c11, t3
	FADD	c4, t4, c4
	FMUL	a4, c12, t4

	FADD	c1, t1, c1
	FMUL	a5, c13, t1
	FADDX	c2, t2, c2
	FMUL	a5, c14, t2
	FADD	c3, t3, c3
	FMUL	a6, c13, t3
	FADD	c4, t4, c4
	FMUL	a6, c14, t4

	FADD	c1, t1, c1
	FMUL	a7, c15, t1
	FADDX	c2, t2, c2
	FMUL	a7, c16, t2
	FADD	c3, t3, c3
	FMUL	a8, c15, t3
	FADD	c4, t4, c4
	FMUL	a8, c16, t4

/* Remaining 0-3 rows for the single column; alpha loaded into b3/b4. */
.LL315:
	andcc	MIN_M, 3, I
	LDF	ALPHA_R, b3
	mov	Y1, Y2
	ble	%icc, .LL319
	LDF	ALPHA_I, b4		/* delay slot */

	/* Prime one element. */
	LDF	[A1 + 0 * SIZE], a1
	add	I, -1, I
	LDF	[A1 + 1 * SIZE], a2
	add	A1, 2 * SIZE, A1
	LDF	[X1 + 0 * SIZE], b1
	cmp	I, 0
	LDF	[X1 + 1 * SIZE], b2
	ble	%icc, .LL317
	add	X1, 2 * SIZE, X1	/* delay slot */

.LL316:
	FADD	c1, t1, c1
	add	I, -1, I
	FMUL	a1, b1, t1
	FADDX	c2, t2, c2
	FMUL	a1, b2, t2
	LDF	[A1 + 0 * SIZE], a1
	FADD	c3, t3, c3
	cmp	I, 0
	FMUL	a2, b1, t3
	LDF	[X1 + 0 * SIZE], b1
	FADD	c4, t4, c4
	add	X1, 2 * SIZE, X1
	FMUL	a2, b2, t4
	LDF	[A1 + 1 * SIZE], a2
	add	A1, 2 * SIZE, A1

	bg,pn	%icc, .LL316
	LDF	[X1 - 1 * SIZE], b2	/* delay slot */

/* Drain the last primed element. */
.LL317:
	FADD	c1, t1, c1
	FMUL	a1, b1, t1
	FADDX	c2, t2, c2
	FMUL	a1, b2, t2
	FADD	c3, t3, c3
	FMUL	a2, b1, t3
	FADD	c4, t4, c4
	FMUL	a2, b2, t4

/* Combine the single column's partial sums, apply complex alpha, and
   accumulate one complex element into Y. */
.LL319:
	FADD	c1, t1, c1
	LDF	[Y1 + 0 * SIZE], a1
	FADDX	c2, t2, c2
	LDF	[Y1 + 1 * SIZE] ,a2
	add	Y1, INCY, Y1
	FADD	c3, t3, c3
	FADD	c4, t4, c4

#if (!defined(XCONJ) && !defined(CONJ)) || (defined(XCONJ) && defined(CONJ))
	FSUB	c1, c4, c1		/* real part: re*re - im*im */
#else
	FADD	c1, c4, c1
#endif

#ifndef CONJ
	FADD	c2, c3, c2		/* imag part: re*im + im*re */
#else
	FSUB	c2, c3, c2
#endif

	FMUL	b3, c1, c3		/* alpha_r * re */
	FMUL	b4, c1, c4		/* alpha_i * re */
	FMUL	b4, c2, c1		/* alpha_i * im */
	FMUL	b3, c2, c2		/* alpha_r * im */

	FSUB	c3, c1, c1		/* new_re = alpha_r*re - alpha_i*im */
	FADD	c2, c4, c2		/* new_im = alpha_r*im + alpha_i*re */
	FADD	a1, c1, a1
	FADD	a2, c2, a2

	STF	a1, [Y2 + 0 * SIZE]
	STF	a2, [Y2 + 1 * SIZE]

/* Advance to the next row panel: IS += P; rewind A by N columns and forward
   P rows (PNLDA precomputed in the prologue).  Loop while IS < M. */
.LL400:
	mov	P, I
	add	IS, I, IS
	cmp	IS, M
	bl	%icc, .LL10
	add	A, PNLDA, A		/* delay slot */
		
.LL999:
	return	%i7 + 8			/* restore register window and return */
	clr	%o0			/* delay slot: return value 0 */

	EPILOGUE