/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"
#include "version.h"

#define N	$16			/* number of complex elements */
#define X	$17			/* source vector */
#define INCX	$18			/* stride, in complex elements */

/* CMPLT(a, b) sets its target to nonzero when a < b; with USE_MIN the
   operands are swapped, so the same compare/select sequence tracks a
   minimum instead of a maximum. */
#ifndef USE_MIN
#define CMPLT(a, b) cmptlt a, b
#else
#define CMPLT(a, b) cmptlt b, a
#endif

#define STACKSIZE 8 * 8			/* eight 8-byte slots for $f2..$f9 */

	PROLOGUE
	PROFCODE
	.frame	$sp, STACKSIZE, $26, 0

	lda	$sp, -STACKSIZE($sp)

	stt	$f2,   0($sp)
	fclr	$f16
	cmplt	$31, N,    $2		# $2 = (N > 0)

	stt	$f3,   8($sp)
	fclr	$f17
	cmplt	$31, INCX, $3		# $3 = (INCX > 0)
	unop

	stt	$f4,  16($sp)
	fclr	$f18
	SXADDQ	INCX, $31, INCX		# INCX *= SIZE bytes
	unop

	stt	$f5,  24($sp)
	fclr	$f19
	and	$2,  $3,  $0		# $0 = (N > 0) && (INCX > 0)
	unop

	stt	$f6,  32($sp)
	unop

	stt	$f7,  40($sp)
	stt	$f8,  48($sp)
	stt	$f9,  56($sp)

	fclr	$f0			# return 0.0 on early exit
	beq	$0,  $End		# if (n <= 0) or (incx <= 0) return
	.align 4
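/* Initialize the running extremum with the first element's |Re| + |Im|. */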

	LD	$f20,  0 * SIZE(X)
	LD	$f21,  1 * SIZE(X)
	sra	N, 2, $1		# $1 = N / 4 unrolled groups
	addq	INCX, INCX, INCX	# byte stride per complex element

	fabs	$f20, $f20
	fabs	$f21, $f21
	addt	$f20, $f21, $f0		# $f0 = |Re(x[0])| + |Im(x[0])|
	ble	$1,  $L15		# fewer than four elements: tail only
	.align 4
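/* Preload elements 1..3 and seed four independent accumulators
   ($f0..$f3) so loads, fabs, and compares can overlap below. */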

	lda	$1,  -1($1)
	unop
	addq	X, INCX, X
	unop

	LD	$f22,  0 * SIZE(X)
	fmov	$f0,  $f1
	LD	$f23,  1 * SIZE(X)
	addq	X, INCX, X

	LD	$f24,  0 * SIZE(X)
	fmov	$f0,  $f2
	LD	$f25,  1 * SIZE(X)
	addq	X, INCX, X

	LD	$f26,  0 * SIZE(X)
	fmov	$f0,  $f3
	LD	$f27,  1 * SIZE(X)
	addq	X, INCX, X

	fabs	$f20, $f8
	fabs	$f21, $f9
	fabs	$f22, $f10
	fabs	$f23, $f11

	fabs	$f24, $f12
	fabs	$f25, $f13
	fabs	$f26, $f14
	fabs	$f27, $f15

	ble	$1, $L14		# a single group: fold it at $L14
	.align 4
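/* Prime the software pipeline: load the second group of four values
   before entering the main loop. */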

	LD	$f20,  0 * SIZE(X)
	LD	$f21,  1 * SIZE(X)
	lda	$1,  -1($1)
	addq	X, INCX, X

	LD	$f22,  0 * SIZE(X)
	LD	$f23,  1 * SIZE(X)
	unop
	addq	X, INCX, X

	LD	$f24,  0 * SIZE(X)
	LD	$f25,  1 * SIZE(X)
	unop
	addq	X, INCX, X

	LD	$f26,  0 * SIZE(X)
	LD	$f27,  1 * SIZE(X)
	addq	X, INCX, X
	ble	$1, $L13		# two groups total: skip the main loop
	.align 4
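/* Main loop: fold the previous group's four |Re| + |Im| sums into the
   accumulators while loading and computing absolute values for the
   next group. */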

$L12:
	addt	$f8,  $f9,  $f16
	unop
	fabs	$f20, $f8
	ldl	$31, 64 * SIZE(X)	# prefetch ahead of the stream

	addt	$f10, $f11, $f17
	unop
	fabs	$f21, $f9
	LD	$f20,  0 * SIZE(X)

	addt	$f12, $f13, $f18
	LD	$f21,  1 * SIZE(X)
	fabs	$f22, $f10
	addq	X, INCX, X

	addt	$f14, $f15, $f19
	LD	$f22,  0 * SIZE(X)
	fabs	$f23, $f11
	unop

	CMPLT($f0,  $f16), $f4
	LD	$f23,  1 * SIZE(X)
	fabs	$f24, $f12
	addq	X, INCX, X

	CMPLT($f1,  $f17), $f5
	LD	$f24,  0 * SIZE(X)
	fabs	$f25, $f13
	unop

	CMPLT($f2,  $f18), $f6
	LD	$f25,  1 * SIZE(X)
	fabs	$f26, $f14
	addq	X, INCX, X

	CMPLT($f3,  $f19), $f7
	LD	$f26,  0 * SIZE(X)
	fabs	$f27, $f15
	unop

	fcmovne	$f4, $f16, $f0
	LD	$f27,  1 * SIZE(X)
	addq	X, INCX, X
	lda	$1,   -1($1)		# i --

	fcmovne	$f5, $f17, $f1
	fcmovne	$f6, $f18, $f2
	fcmovne	$f7, $f19, $f3
	bgt	$1, $L12		# more groups to process
	.align 4
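/* Pipeline drain: fold the next-to-last group and compute absolute
   values for the last one (no further loads). */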

$L13:
	addt	$f8,  $f9,  $f16
	fabs	$f20, $f8

	addt	$f10, $f11, $f17
	fabs	$f21, $f9

	addt	$f12, $f13, $f18
	fabs	$f22, $f10

	addt	$f14, $f15, $f19
	fabs	$f23, $f11

	CMPLT($f0,  $f16), $f4
	fabs	$f24, $f12

	CMPLT($f1,  $f17), $f5
	fabs	$f25, $f13

	CMPLT($f2,  $f18), $f6
	fabs	$f26, $f14
	CMPLT($f3,  $f19), $f7
	fabs	$f27, $f15

	fcmovne	$f4, $f16, $f0
	fcmovne	$f5, $f17, $f1
	fcmovne	$f6, $f18, $f2
	fcmovne	$f7, $f19, $f3
	.align 4
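/* Fold the last group, then reduce the four accumulators to $f0. */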
	
$L14:
	addt	$f8,  $f9,  $f16
	addt	$f10, $f11, $f17
	addt	$f12, $f13, $f18
	addt	$f14, $f15, $f19

	CMPLT($f0,  $f16), $f4
	CMPLT($f1,  $f17), $f5
	CMPLT($f2,  $f18), $f6
	CMPLT($f3,  $f19), $f7

	fcmovne	$f4, $f16, $f0
	fcmovne	$f5, $f17, $f1
	fcmovne	$f6, $f18, $f2
	fcmovne	$f7, $f19, $f3

	CMPLT($f0,  $f1), $f16
	CMPLT($f2,  $f3), $f17

	fcmovne	$f16, $f1, $f0		# $f0 = extremum($f0, $f1)
	fcmovne	$f17, $f3, $f2		# $f2 = extremum($f2, $f3)

	CMPLT($f0,  $f2), $f16
	fcmovne	$f16, $f2, $f0		# final extremum in $f0
	.align 4
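/* Scalar tail: process the remaining N % 4 elements one at a time. */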

$L15:
	and	N, 3, $1		# $1 = N % 4 tail elements
	unop
	unop
	ble	$1,  $End		# no tail: finished
	.align 4

$L16:
	LD	$f20,  0 * SIZE(X)
	LD	$f21,  1 * SIZE(X)
	unop
	addq	X, INCX, X

	fabs	$f20, $f29
	fabs	$f21, $f30
	addt	$f29, $f30, $f29	# |Re| + |Im|

	CMPLT($f0,  $f29), $f16
	fcmovne	$f16, $f29, $f0		# keep it if it beats the extremum

	lda	$1,   -1($1)		# i --
	bgt	$1, $L16
	.align 4
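/* Restore the callee-saved FP registers and return the result in $f0. */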

$End:
	ldt	$f2,   0($sp)
	ldt	$f3,   8($sp)
	ldt	$f4,  16($sp)
	ldt	$f5,  24($sp)

	ldt	$f6,  32($sp)
	ldt	$f7,  40($sp)
	ldt	$f8,  48($sp)
	ldt	$f9,  56($sp)
	lda	$sp,  STACKSIZE($sp)
	ret

	EPILOGUE