/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"
#include "version.h"

#define STACKSIZE     64
#define PREFETCHSIZE  32

#define M	$16
#define N	$17
#define A	$21
#define	LDA	$18

#define X	$19
#define	INCX	$20
#define Y	$22
#define	INCY	$23

#define BUFFER	$24

#define I	$25
#define J	$27

#define	X1	$3
#define Y1	$4
#define A1	$5
#define A2	$6

#define	alpha_r	$f19
#define	alpha_i	$f20

#define	s0	$f0
#define	s1	$f1
#define	s2	$f10
#define	s3	$f11

#define	t0	$f12
#define	t1	$f13
#define	t2	$f14
#define	t3	$f15

#define	x0	$f16
#define	x1	$f17
#define	x2	$f18
#define	x3	$f21

#define	a0	$f22
#define	a1	$f23
#define	a2	$f24
#define	a3	$f25
#define	a4	$f26
#define	a5	$f27
#define	a6	$f28
#define	a7	$f29

#define	a8	$f2
#define	a9	$f3
#define	a10	$f4
#define	a11	$f5
#define	a12	$f6
#define	a13	$f7
#define	a14	$f8
#define	a15	$f9

#if   !defined(CONJ) && !defined(XCONJ)
#define ADD1	ADD
#define ADD2	ADD
#define ADD3	SUB
#define ADD4	ADD
#elif !defined(CONJ) &&  defined(XCONJ)
#define ADD1	ADD
#define ADD2	ADD
#define ADD3	ADD
#define ADD4	SUB
#elif  defined(CONJ) && !defined(XCONJ)
#define ADD1	ADD
#define ADD2	SUB
#define ADD3	ADD
#define ADD4	ADD
#else
#define ADD1	ADD
#define ADD2	SUB
#define ADD3	SUB
#define ADD4	SUB
#endif

	PROLOGUE
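
/* Allocate the stack frame, load the stack-resident arguments
   (LDA, X, INCX, Y, INCY, BUFFER) and save the callee-saved
   floating-point registers $f2-$f9. */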

	lda	$sp,  -STACKSIZE($sp)
	ldq	LDA,     0 + STACKSIZE($sp)
	ldq	X,       8 + STACKSIZE($sp)
	ldq	INCX,   16 + STACKSIZE($sp)
	ldq	Y,      24 + STACKSIZE($sp)
	ldq	INCY,   32 + STACKSIZE($sp)
	ldq	BUFFER, 40 + STACKSIZE($sp)

	stt	$f2,    0($sp)
	stt	$f3,    8($sp)
	stt	$f4,   16($sp)
	stt	$f5,   24($sp)
	stt	$f6,   32($sp)
	stt	$f7,   40($sp)
	stt	$f8,   48($sp)
	stt	$f9,   56($sp)

	PROFCODE
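
/* Quick return if M <= 0 or N <= 0; INCX and INCY are scaled from
   complex-element strides to byte strides. */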

	cmple	M, 0, $0
	sll	INCX, ZBASE_SHIFT, INCX
	cmple	N, 0, $1
	sll	INCY, ZBASE_SHIFT, INCY

	or	$0, $1, $0
	bne	$0,  $L999
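
/* If x is not contiguous (INCX != 2 * SIZE), pack it into BUFFER;
   LDA is scaled to a byte stride here as well. */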

	cmpeq	INCX, 2 * SIZE, $0
	mov	X, X1
	sll	LDA, ZBASE_SHIFT, LDA
	bne	$0, $L10

	sra	M, 2, I
	mov	BUFFER, Y1
	mov	BUFFER, X
	ble	I, $L05
	.align 4
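
/* Packing loop: copy four complex elements of x per iteration
   into the contiguous buffer. */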

$L02:
	ldl	$31, (PREFETCHSIZE + 0) * SIZE(X1)
	lda	I, -1(I)

	LD	a0,  0 * SIZE(X1)
	LD	a1,  1 * SIZE(X1)
	addq	X1, INCX, X1
	LD	a2,  0 * SIZE(X1)
	LD	a3,  1 * SIZE(X1)
	addq	X1, INCX, X1

	ST	a0,  0 * SIZE(Y1)
	ST	a1,  1 * SIZE(Y1)
	ST	a2,  2 * SIZE(Y1)
	ST	a3,  3 * SIZE(Y1)

	LD	a4,  0 * SIZE(X1)
	LD	a5,  1 * SIZE(X1)
	addq	X1, INCX, X1
	LD	a6,  0 * SIZE(X1)
	LD	a7,  1 * SIZE(X1)
	addq	X1, INCX, X1

	ST	a4,  4 * SIZE(Y1)
	ST	a5,  5 * SIZE(Y1)
	ST	a6,  6 * SIZE(Y1)
	ST	a7,  7 * SIZE(Y1)

	lda	Y1,  8 * SIZE(Y1)
	bgt	I, $L02
	.align 4
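
/* Copy the remaining M % 4 complex elements of x. */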

$L05:
	and	M, 3, I
	ble	I, $L10
	.align 4

$L06:
	LD	a0,  0 * SIZE(X1)
	LD	a1,  1 * SIZE(X1)
	addq	X1, INCX, X1

	ST	a0,  0 * SIZE(Y1)
	ST	a1,  1 * SIZE(Y1)
	lda	Y1,  2 * SIZE(Y1)

	lda	I, -1(I)
	bgt	I, $L06
	.align 4
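
/* Main computation.  Columns are processed in pairs (J = N / 2);
   t0..t3 hold the pipelined product terms and are cleared before
   the first accumulation. */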

$L10:
	mov	Y, Y1
	fclr	t0
	unop
	fclr	t1

	sra	N, 1, J
	fclr	t2
	fclr	t3
	ble	J,  $L20
	.align 4
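
/* One column pair: A1 and A2 walk down the two columns, s0/s1
   accumulate the complex dot product of column A1 with x and
   s2/s3 that of column A2.  The lds into $f31 (the always-zero
   FP register) acts as a prefetch of y. */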

$L11:
	mov	A, A1
	fclr	s0
	addq	A,  LDA, A2
	fclr	s1

	addq	A2, LDA, A
	unop
	mov	X, X1
	lds	$f31, 3 * SIZE(Y)

	sra	M,  2, I
	fclr	s2
	fclr	s3
	ble	I,  $L15

	LD	a0,   0 * SIZE(A1)
	LD	a1,   1 * SIZE(A1)
	LD	a2,   0 * SIZE(A2)
	LD	a3,   1 * SIZE(A2)
	LD	a4,   2 * SIZE(A1)
	LD	a5,   3 * SIZE(A1)
	LD	a6,   2 * SIZE(A2)
	LD	a7,   3 * SIZE(A2)

	LD	a8,   4 * SIZE(A1)
	LD	a9,   5 * SIZE(A1)
	LD	a10,  4 * SIZE(A2)
	LD	a11,  5 * SIZE(A2)
	LD	a12,  6 * SIZE(A1)
	LD	a13,  7 * SIZE(A1)
	LD	a14,  6 * SIZE(A2)
	LD	a15,  7 * SIZE(A2)

	LD	x0,   0 * SIZE(X1)
	LD	x1,   1 * SIZE(X1)
	LD	x2,   2 * SIZE(X1)

	lda	I,   -1(I)
	ble	I, $L13
	.align 4
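
/* Inner loop, unrolled 4x over M and software pipelined: each
   ADDx retires the product started by a MUL one stage earlier,
   and the ldl into $31 prefetches A1, A2 and x ahead of use. */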

$L12:
	ADD3	s0, t0,  s0
	unop
	MUL	x0, a0,  t0
	LD	x3,   3 * SIZE(X1)

	ADD4	s1, t1,  s1
	ldl	$31, (PREFETCHSIZE + 0) * SIZE(A1)
	MUL	x0, a1,  t1
	unop

	ADD3	s2, t2,  s2
	unop
	MUL	x0, a2,  t2
	unop

	ADD4	s3, t3,  s3
	unop
	MUL	x0, a3,  t3
	LD	x0,   4 * SIZE(X1)

	ADD1	s0, t0,  s0
	unop
	MUL	x1, a1,  t0
	LD	a1,   9 * SIZE(A1)

	ADD2	s1, t1,  s1
	unop
	MUL	x1, a0,  t1
	LD	a0,   8 * SIZE(A1)

	ADD1	s2, t2,  s2
	unop
	MUL	x1, a3,  t2
	LD	a3,   9 * SIZE(A2)

	ADD2	s3, t3,  s3
	unop
	MUL	x1, a2,  t3
	LD	a2,   8 * SIZE(A2)

	ADD3	s0, t0,  s0
	unop
	MUL	x2, a4,  t0
	LD	x1,   5 * SIZE(X1)

	ADD4	s1, t1,  s1
	MUL	x2, a5,  t1
	ADD3	s2, t2,  s2
	MUL	x2, a6,  t2

	ADD4	s3, t3,  s3
	unop
	MUL	x2, a7,  t3
	LD	x2,   6 * SIZE(X1)

	ADD1	s0, t0,  s0
	unop
	MUL	x3, a5,  t0
	LD	a5,  11 * SIZE(A1)

	ADD2	s1, t1,  s1
	unop
	MUL	x3, a4,  t1
	LD	a4,  10 * SIZE(A1)

	ADD1	s2, t2,  s2
	unop
	MUL	x3, a7,  t2
	LD	a7,  11 * SIZE(A2)

	ADD2	s3, t3,  s3
	unop
	MUL	x3, a6,  t3
	LD	a6,  10 * SIZE(A2)

	ADD3	s0, t0,  s0
	unop
	MUL	x0, a8,  t0
	LD	x3,   7 * SIZE(X1)

	ADD4	s1, t1,  s1
	ldl	$31, (PREFETCHSIZE + 0) * SIZE(A2)
	MUL	x0, a9,  t1
	unop

	ADD3	s2, t2,  s2
	lda	I,   -1(I)
	MUL	x0, a10, t2
	unop

	ADD4	s3, t3,  s3
	unop
	MUL	x0, a11, t3
	LD	x0,   8 * SIZE(X1)

	ADD1	s0, t0,  s0
	unop
	MUL	x1, a9,  t0
	LD	a9,  13 * SIZE(A1)

	ADD2	s1, t1,  s1
	unop
	MUL	x1, a8,  t1
	LD	a8,  12 * SIZE(A1)

	ADD1	s2, t2,  s2
	lda	A1,   8 * SIZE(A1)
	MUL	x1, a11, t2
	LD	a11, 13 * SIZE(A2)

	ADD2	s3, t3,  s3
	unop
	MUL	x1, a10, t3
	LD	a10, 12 * SIZE(A2)

	ADD3	s0, t0,  s0
	unop
	MUL	x2, a12, t0
	LD	x1,   9 * SIZE(X1)

	ADD4	s1, t1,  s1
	ldl	$31, (PREFETCHSIZE + 0) * SIZE(X1)
	MUL	x2, a13, t1
	lda	A2,   8 * SIZE(A2)

	ADD3	s2, t2,  s2
	unop
	MUL	x2, a14, t2
	unop

	ADD4	s3, t3,  s3
	unop
	MUL	x2, a15, t3
	LD	x2,  10 * SIZE(X1)

	ADD1	s0, t0,  s0
	unop
	MUL	x3, a13, t0
	LD	a13,  7 * SIZE(A1)

	ADD2	s1, t1,  s1
	lda	X1,   8 * SIZE(X1)
	MUL	x3, a12, t1
	LD	a12,  6 * SIZE(A1)

	ADD1	s2, t2,  s2
	unop
	MUL	x3, a15, t2
	LD	a15,  7 * SIZE(A2)

	ADD2	s3, t3,  s3
	MUL	x3, a14, t3
	LD	a14,  6 * SIZE(A2)
	bgt	I, $L12
	.align 4
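
/* Pipeline drain: finish the last unrolled iteration without
   fetching the a-values of a following one. */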

$L13:
	ADD3	s0, t0,  s0
	unop
	MUL	x0, a0,  t0
	LD	x3,   3 * SIZE(X1)

	ADD4	s1, t1,  s1
	MUL	x0, a1,  t1
	ADD3	s2, t2,  s2
	MUL	x0, a2,  t2

	ADD4	s3, t3,  s3
	unop
	MUL	x0, a3,  t3
	LD	x0,   4 * SIZE(X1)

	ADD1	s0, t0,  s0
	MUL	x1, a1,  t0
	ADD2	s1, t1,  s1
	MUL	x1, a0,  t1

	ADD1	s2, t2,  s2
	unop
	MUL	x1, a3,  t2
	unop

	ADD2	s3, t3,  s3
	lda	A1,   8 * SIZE(A1)
	MUL	x1, a2,  t3
	LD	x1,   5 * SIZE(X1)

	ADD3	s0, t0,  s0
	MUL	x2, a4,  t0
	ADD4	s1, t1,  s1
	MUL	x2, a5,  t1

	ADD3	s2, t2,  s2
	unop
	MUL	x2, a6,  t2
	unop

	ADD4	s3, t3,  s3
	lda	A2,   8 * SIZE(A2)
	MUL	x2, a7,  t3
	LD	x2,   6 * SIZE(X1)

	ADD1	s0, t0,  s0
	MUL	x3, a5,  t0
	ADD2	s1, t1,  s1
	MUL	x3, a4,  t1

	ADD1	s2, t2,  s2
	unop
	MUL	x3, a7,  t2
	lda	X1,   8 * SIZE(X1)

	ADD2	s3, t3,  s3
	unop
	MUL	x3, a6,  t3
	LD	x3,  -1 * SIZE(X1)

	ADD3	s0, t0,  s0
	MUL	x0, a8,  t0
	ADD4	s1, t1,  s1
	MUL	x0, a9,  t1

	ADD3	s2, t2,  s2
	MUL	x0, a10, t2
	ADD4	s3, t3,  s3
	MUL	x0, a11, t3

	ADD1	s0, t0,  s0
	MUL	x1, a9,  t0
	ADD2	s1, t1,  s1
	MUL	x1, a8,  t1

	ADD1	s2, t2,  s2
	MUL	x1, a11, t2
	ADD2	s3, t3,  s3
	MUL	x1, a10, t3

	ADD3	s0, t0,  s0
	MUL	x2, a12, t0
	ADD4	s1, t1,  s1
	MUL	x2, a13, t1

	ADD3	s2, t2,  s2
	MUL	x2, a14, t2
	ADD4	s3, t3,  s3
	MUL	x2, a15, t3

	ADD1	s0, t0,  s0
	MUL	x3, a13, t0
	ADD2	s1, t1,  s1
	MUL	x3, a12, t1

	ADD1	s2, t2,  s2
	MUL	x3, a15, t2
	ADD2	s3, t3,  s3
	MUL	x3, a14, t3
	.align 4
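
/* Handle the remaining M % 4 rows, one complex element per
   iteration. */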

$L15:
	and	M, 3, I
	ble	I,  $L18

	LD	a0,   0 * SIZE(A1)
	LD	a1,   1 * SIZE(A1)
	LD	a2,   0 * SIZE(A2)
	LD	a3,   1 * SIZE(A2)

	LD	x0,   0 * SIZE(X1)

	lda	I,   -1(I)
	ble	I, $L17
	.align 4

$L16:
	ADD3	s0, t0, s0
	lda	I,   -1(I)
	MUL	x0, a0, t0
	LD	x1,   1 * SIZE(X1)

	ADD4	s1, t1, s1
	MUL	x0, a1, t1
	ADD3	s2, t2, s2
	MUL	x0, a2, t2

	ADD4	s3, t3, s3
	unop
	MUL	x0, a3, t3
	LD	x0,   2 * SIZE(X1)

	ADD1	s0, t0, s0
	lda	A2,   2 * SIZE(A2)
	MUL	x1, a1, t0
	LD	a1,   3 * SIZE(A1)

	ADD2	s1, t1, s1
	lda	X1,   2 * SIZE(X1)
	MUL	x1, a0, t1
	LD	a0,   2 * SIZE(A1)

	ADD1	s2, t2, s2
	lda	A1,   2 * SIZE(A1)
	MUL	x1, a3, t2
	LD	a3,   1 * SIZE(A2)

	ADD2	s3, t3, s3
	MUL	x1, a2, t3
	LD	a2,   0 * SIZE(A2)
	bgt	I, $L16
	.align 4

$L17:
	ADD3	s0, t0, s0
	unop
	MUL	x0, a0, t0
	LD	x1,   1 * SIZE(X1)

	ADD4	s1, t1, s1
	unop
	MUL	x0, a1, t1
	unop

	ADD3	s2, t2, s2
	MUL	x0, a2, t2
	ADD4	s3, t3, s3
	MUL	x0, a3, t3

	ADD1	s0, t0, s0
	MUL	x1, a1, t0
	ADD2	s1, t1, s1
	MUL	x1, a0, t1

	ADD1	s2, t2, s2
	MUL	x1, a3, t2
	ADD2	s3, t3, s3
	MUL	x1, a2, t3
	.align 4
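
/* Retire the last products, multiply both column sums by alpha
   (a full complex multiply) and add the results into y.  y is
   read through Y and written through Y1; both advance by INCY. */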

$L18:
	LD	a0,    0 * SIZE(Y)
	unop
	LD	a1,    1 * SIZE(Y)
	addq	Y, INCY, Y

	LD	a2,    0 * SIZE(Y)
	unop
	LD	a3,    1 * SIZE(Y)
	addq	Y, INCY, Y

	ADD3	s0, t0, s0
	ADD4	s1, t1, s1
	ADD3	s2, t2, s2
	ADD4	s3, t3, s3

	MUL	alpha_r, s0, t0
	MUL	alpha_r, s1, t1
	MUL	alpha_r, s2, t2
	MUL	alpha_r, s3, t3

	ADD	a0, t0, a0
	MUL	alpha_i, s1, t0
	ADD	a1, t1, a1
	MUL	alpha_i, s0, t1
	ADD	a2, t2, a2
	MUL	alpha_i, s3, t2
	ADD	a3, t3, a3
	MUL	alpha_i, s2, t3

	SUB	a0, t0, a0
	ADD	a1, t1, a1
	SUB	a2, t2, a2
	ADD	a3, t3, a3

	ST	a0,    0 * SIZE(Y1)
	fclr	t0
	ST	a1,    1 * SIZE(Y1)
	addq	Y1, INCY, Y1

	ST	a2,    0 * SIZE(Y1)
	fclr	t1
	ST	a3,    1 * SIZE(Y1)
	addq	Y1, INCY, Y1

	fclr	t2
	lda	J, -1(J)
	fclr	t3
	bgt	J,  $L11
	.align 4
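
/* N is odd: process the final single column.  s0/s2 and s1/s3
   act as two independent partial sums that are folded together
   at $L28. */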

$L20:
	blbc	N,  $L999

	mov	A, A1
	fclr	s0
	fclr	s1
	mov	X, X1

	sra	M,  2, I
	fclr	s2
	fclr	s3
	ble	I,  $L25

	LD	a0,   0 * SIZE(A1)
	LD	a1,   1 * SIZE(A1)
	LD	a4,   2 * SIZE(A1)
	LD	a5,   3 * SIZE(A1)
	LD	a8,   4 * SIZE(A1)
	LD	a9,   5 * SIZE(A1)
	LD	a12,  6 * SIZE(A1)
	LD	a13,  7 * SIZE(A1)

	LD	x0,   0 * SIZE(X1)
	LD	x1,   1 * SIZE(X1)
	LD	x2,   2 * SIZE(X1)

	lda	I,   -1(I)
	ble	I, $L23
	.align 4

$L22:
	ADD3	s0, t0,  s0
	ldl	$31, (PREFETCHSIZE + 0) * SIZE(A1)
	MUL	x0, a0,  t0
	LD	x3,   3 * SIZE(X1)

	ADD4	s1, t1,  s1
	unop
	MUL	x0, a1,  t1
	LD	x0,   4 * SIZE(X1)

	ADD1	s2, t0,  s2
	lda	I,   -1(I)
	MUL	x1, a1,  t0
	LD	a1,   9 * SIZE(A1)

	ADD2	s3, t1,  s3
	unop
	MUL	x1, a0,  t1
	LD	a0,   8 * SIZE(A1)

	ADD3	s0, t0,  s0
	unop
	MUL	x2, a4,  t0
	LD	x1,   5 * SIZE(X1)

	ADD4	s1, t1,  s1
	unop
	MUL	x2, a5,  t1
	LD	x2,   6 * SIZE(X1)

	ADD1	s2, t0,  s2
	unop
	MUL	x3, a5,  t0
	LD	a5,  11 * SIZE(A1)

	ADD2	s3, t1,  s3
	unop
	MUL	x3, a4,  t1
	LD	a4,  10 * SIZE(A1)

	ADD3	s0, t0,  s0
	unop
	MUL	x0, a8,  t0
	LD	x3,   7 * SIZE(X1)

	ADD4	s1, t1,  s1
	unop
	MUL	x0, a9,  t1
	LD	x0,   8 * SIZE(X1)

	ADD1	s2, t0,  s2
	unop
	MUL	x1, a9,  t0
	LD	a9,  13 * SIZE(A1)

	ADD2	s3, t1,  s3
	unop
	MUL	x1, a8,  t1
	LD	a8,  12 * SIZE(A1)

	ADD3	s0, t0,  s0
	unop
	MUL	x2, a12, t0
	LD	x1,   9 * SIZE(X1)

	ADD4	s1, t1,  s1
	lda	A1,   8 * SIZE(A1)
	MUL	x2, a13, t1
	LD	x2,  10 * SIZE(X1)

	ADD1	s2, t0,  s2
	lda	X1,   8 * SIZE(X1)
	MUL	x3, a13, t0
	LD	a13,  7 * SIZE(A1)

	ADD2	s3, t1,  s3
	MUL	x3, a12, t1
	LD	a12,  6 * SIZE(A1)
	bgt	I, $L22
	.align 4

$L23:
	ADD3	s0, t0,  s0
	unop
	MUL	x0, a0,  t0
	LD	x3,   3 * SIZE(X1)

	ADD4	s1, t1,  s1
	unop
	MUL	x0, a1,  t1
	LD	x0,   4 * SIZE(X1)

	ADD1	s2, t0,  s2
	unop
	MUL	x1, a1,  t0
	lda	A1,   8 * SIZE(A1)

	ADD2	s3, t1,  s3
	unop
	MUL	x1, a0,  t1
	LD	x1,   5 * SIZE(X1)

	ADD3	s0, t0,  s0
	unop
	MUL	x2, a4,  t0
	unop

	ADD4	s1, t1,  s1
	unop
	MUL	x2, a5,  t1
	LD	x2,   6 * SIZE(X1)

	ADD1	s2, t0,  s2
	unop
	MUL	x3, a5,  t0
	lda	X1,   8 * SIZE(X1)

	ADD2	s3, t1,  s3
	unop
	MUL	x3, a4,  t1
	LD	x3,  -1 * SIZE(X1)

	ADD3	s0, t0,  s0
	MUL	x0, a8,  t0
	ADD4	s1, t1,  s1
	MUL	x0, a9,  t1

	ADD1	s2, t0,  s2
	MUL	x1, a9,  t0
	ADD2	s3, t1,  s3
	MUL	x1, a8,  t1

	ADD3	s0, t0,  s0
	MUL	x2, a12, t0
	ADD4	s1, t1,  s1
	MUL	x2, a13, t1

	ADD1	s2, t0,  s2
	MUL	x3, a13, t0
	ADD2	s3, t1,  s3
	MUL	x3, a12, t1
	.align 4

$L25:
	and	M, 3, I
	ble	I,  $L28

	LD	a0,   0 * SIZE(A1)
	LD	a1,   1 * SIZE(A1)

	LD	x0,   0 * SIZE(X1)

	lda	I,   -1(I)
	ble	I, $L27
	.align 4

$L26:
	ADD3	s0, t0, s0
	lda	A1,   2 * SIZE(A1)
	MUL	x0, a0, t0
	LD	x1,   1 * SIZE(X1)

	ADD4	s1, t1, s1
	lda	I,   -1(I)
	MUL	x0, a1, t1
	LD	x0,   2 * SIZE(X1)

	ADD1	s0, t0, s0
	lda	X1,   2 * SIZE(X1)
	MUL	x1, a1, t0
	LD	a1,   1 * SIZE(A1)

	ADD2	s1, t1, s1
	MUL	x1, a0, t1
	LD	a0,   0 * SIZE(A1)
	bgt	I, $L26
	.align 4

$L27:
	ADD3	s0, t0, s0
	unop
	MUL	x0, a0, t0
	LD	x1,   1 * SIZE(X1)

	ADD4	s1, t1, s1
	unop
	MUL	x0, a1, t1
	unop

	ADD1	s0, t0, s0
	MUL	x1, a1, t0
	ADD2	s1, t1, s1
	MUL	x1, a0, t1
	.align 4
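
/* Fold the two partial sums, scale by alpha and update the last
   element of y. */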

$L28:
	LD	a0,    0 * SIZE(Y)
	LD	a1,    1 * SIZE(Y)

	ADD3	s0, t0, s0
	ADD4	s1, t1, s1
	ADD3	s2, t2, s2
	ADD4	s3, t3, s3

	ADD	s0, s2, s0
	ADD	s1, s3, s1

	MUL	alpha_r, s0, t0
	MUL	alpha_r, s1, t1

	ADD	a0, t0, a0
	MUL	alpha_i, s1, t0
	ADD	a1, t1, a1
	MUL	alpha_i, s0, t1

	SUB	a0, t0, a0
	ADD	a1, t1, a1

	ST	a0,    0 * SIZE(Y1)
	ST	a1,    1 * SIZE(Y1)
	.align 4
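
/* Restore the callee-saved floating-point registers and the
   stack frame, then return. */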

$L999:
	ldt	$f2,    0($sp)
	ldt	$f3,    8($sp)
	ldt	$f4,   16($sp)
	ldt	$f5,   24($sp)
	ldt	$f6,   32($sp)
	ldt	$f7,   40($sp)
	ldt	$f8,   48($sp)
	ldt	$f9,   56($sp)

	lda	$sp,  STACKSIZE($sp)
	ret
	EPILOGUE