kusano 2b45e8
/*********************************************************************/
kusano 2b45e8
/* Copyright 2009, 2010 The University of Texas at Austin.           */
kusano 2b45e8
/* All rights reserved.                                              */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
/* Redistribution and use in source and binary forms, with or        */
kusano 2b45e8
/* without modification, are permitted provided that the following   */
kusano 2b45e8
/* conditions are met:                                               */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
/*   1. Redistributions of source code must retain the above         */
kusano 2b45e8
/*      copyright notice, this list of conditions and the following  */
kusano 2b45e8
/*      disclaimer.                                                  */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
/*   2. Redistributions in binary form must reproduce the above      */
kusano 2b45e8
/*      copyright notice, this list of conditions and the following  */
kusano 2b45e8
/*      disclaimer in the documentation and/or other materials       */
kusano 2b45e8
/*      provided with the distribution.                              */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
kusano 2b45e8
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
kusano 2b45e8
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
kusano 2b45e8
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
kusano 2b45e8
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
kusano 2b45e8
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
kusano 2b45e8
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
kusano 2b45e8
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
kusano 2b45e8
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
kusano 2b45e8
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
kusano 2b45e8
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
kusano 2b45e8
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
kusano 2b45e8
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
kusano 2b45e8
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
/* The views and conclusions contained in the software and           */
kusano 2b45e8
/* documentation are those of the authors and should not be          */
kusano 2b45e8
/* interpreted as representing official policies, either expressed   */
kusano 2b45e8
/* or implied, of The University of Texas at Austin.                 */
kusano 2b45e8
/*********************************************************************/
kusano 2b45e8
kusano 2b45e8
#define ASSEMBLER
kusano 2b45e8
#include "common.h"
kusano 2b45e8
#include "version.h"
kusano 2b45e8
kusano 2b45e8
#define PREFETCHSIZE 40
kusano 2b45e8
kusano 2b45e8
	/*
	 * Routine entry and dispatch.
	 *
	 * The code below updates one vector in place with a scaled copy of
	 * another: y[i] = y[i] + alpha * x[i] for i = 0 .. n-1.
	 *
	 * Inputs as they are used by this file:
	 *   $16      element count n
	 *   $f19     scalar alpha (copied to $f30, which every MUL uses)
	 *   $20      pointer to x (the operand that is multiplied)
	 *   $21      stride of x; the value 1 selects the contiguous path
	 *   0($sp)   on entry: pointer to y (loaded, added to, stored) -> $24
	 *   8($sp)   on entry: stride of y -> $23
	 *
	 * Values derived here:
	 *   $1 = n >> 3   number of 8-element blocks
	 *   $2 = n & 7    leftover elements handled one at a time
	 *   $3 = 1 only when both strides are 1
	 *
	 * PROLOGUE, PROFCODE, EPILOGUE, LD, ST, MUL, ADD, SIZE, SXADDQ and
	 * SXSUBL are not defined in this file; presumably they are macros
	 * from common.h - TODO confirm their exact expansion.
	 */
	PROLOGUE
kusano 2b45e8
	PROFCODE
kusano 2b45e8
	.frame	$sp, 16, $26, 0
kusano 2b45e8
kusano 2b45e8
	/* Stack arguments are read before $sp is moved, so offsets 0 and 8
	   are relative to the stack pointer at entry. */
	ldq	$24,   0($sp)
kusano 2b45e8
	fmov	$f19,  $f30
kusano 2b45e8
	/* NOTE(review): ldl reads only 32 bits of the y-stride slot, while
	   the x stride in $21 is used as a full register - confirm that the
	   stride argument never needs more than 32 bits. */
	ldl	$23,   8($sp)
kusano 2b45e8
	/* Allocate a 16-byte frame; it holds the spilled $f2 and $f3. */
	lda	$sp, -16($sp)
kusano 2b45e8
#ifndef PROFILE
kusano 2b45e8
	.prologue 0
kusano 2b45e8
#else
kusano 2b45e8
	.prologue 1
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
	nop
kusano 2b45e8
	sra	$16,  3,  $1
kusano 2b45e8
	/* $f2 and $f3 are used as temporaries by the block loops; their
	   entry values are saved here and reloaded at $End / $SubEnd. */
	stt	$f2,   0($sp)
kusano 2b45e8
	cmpeq	$21,  1,  $3
kusano 2b45e8
kusano 2b45e8
	stt	$f3,   8($sp)
kusano 2b45e8
	cmpeq	$23,  1, $4
kusano 2b45e8
	and	$16,  7,  $2
kusano 2b45e8
	/* n <= 0: nothing to do. Both saves above have already been
	   issued, so the restore sequence at $End is valid. */
	ble	$16, $End
kusano 2b45e8
kusano 2b45e8
	and	$3,  $4,  $3
kusano 2b45e8
	/* alpha == 0: exit without reading or writing y. */
	fbeq	$f30, $End
kusano 2b45e8
kusano 2b45e8
	/* Either stride differs from 1: take the strided path. */
	beq	$3,  $Sub
kusano 2b45e8
	/* Fewer than 8 elements: only the one-at-a-time tail runs. */
	ble	$1,  $Remain
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
	/*
	 * Contiguous path, pipeline fill.
	 *
	 * Loads the first 8 elements of x into $f10-$f17 and the first
	 * 8 elements of y into $f18-$f25 before any arithmetic is issued.
	 * Afterwards the x pointer ($20) is advanced by 8 elements, but the
	 * y pointer ($24) is left alone: the results for this block are
	 * still to be stored at 0..7*SIZE($24).
	 */
	LD	$f10,  0*SIZE($20)
kusano 2b45e8
	LD	$f11,  1*SIZE($20)
kusano 2b45e8
	LD	$f12,  2*SIZE($20)
kusano 2b45e8
	LD	$f13,  3*SIZE($20)
kusano 2b45e8
kusano 2b45e8
	LD	$f18,  0*SIZE($24)
kusano 2b45e8
	LD	$f19,  1*SIZE($24)
kusano 2b45e8
	LD	$f20,  2*SIZE($24)
kusano 2b45e8
	LD	$f21,  3*SIZE($24)
kusano 2b45e8
kusano 2b45e8
	LD	$f14,  4*SIZE($20)
kusano 2b45e8
	LD	$f15,  5*SIZE($20)
kusano 2b45e8
	LD	$f16,  6*SIZE($20)
kusano 2b45e8
	LD	$f17,  7*SIZE($20)
kusano 2b45e8
kusano 2b45e8
	LD	$f22,  4*SIZE($24)
kusano 2b45e8
	LD	$f23,  5*SIZE($24)
kusano 2b45e8
	LD	$f24,  6*SIZE($24)
kusano 2b45e8
	LD	$f25,  7*SIZE($24)
kusano 2b45e8
kusano 2b45e8
	/* One block is now in flight; $1 counts the blocks still to load. */
	subq	$1,   1,  $1
kusano 2b45e8
	addq	$20, 8*SIZE, $20
kusano 2b45e8
	unop
kusano 2b45e8
	/* No further full block: skip the loop and drain at $LoopEnd. */
	ble	$1,  $LoopEnd
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
/*
 * Contiguous path, steady state (8 elements per iteration).
 *
 * On entry to every iteration $f10-$f17 hold one block of x and
 * $f18-$f25 hold the matching block of y. The iteration computes and
 * stores that block at 0..7*SIZE($24) while loading the next block:
 * x from 0..7*SIZE($20) (already advanced) and y from 8..15*SIZE($24).
 * Each input register is reloaded only after the instruction that
 * consumes its old value, so the statement order must be preserved.
 * Temporaries: $f26-$f29 products, $f0-$f3 sums.
 */
$Loop:
kusano 2b45e8
	/* Loads into the zero registers $f31 / $31 discard their data;
	   issued PREFETCHSIZE elements ahead, presumably as prefetches
	   of y and x. */
	ldt	$f31, PREFETCHSIZE * SIZE($24)
kusano 2b45e8
	ldl	$31,  PREFETCHSIZE * SIZE($20)
kusano 2b45e8
kusano 2b45e8
	/* Products for elements 0-3; refill $f10-$f13 with the next x. */
	MUL	$f30, $f10, $f26		# ctemp1 = da * atemp1
kusano 2b45e8
	LD	$f10,  0*SIZE($20)
kusano 2b45e8
	MUL	$f30, $f11, $f27
kusano 2b45e8
	LD	$f11,  1*SIZE($20)
kusano 2b45e8
kusano 2b45e8
	MUL	$f30, $f12, $f28
kusano 2b45e8
	LD	$f12,  2*SIZE($20)
kusano 2b45e8
	MUL	$f30, $f13, $f29
kusano 2b45e8
	LD	$f13,  3*SIZE($20)
kusano 2b45e8
kusano 2b45e8
	/* Sums for elements 0-3 interleaved with products for 4-7;
	   $f18-$f21 and $f14-$f17 are refilled for the next block. */
	ADD	$f18, $f26, $f0
kusano 2b45e8
	LD	$f18,  8*SIZE($24)
kusano 2b45e8
	MUL	$f30, $f14, $f26		# ctemp1 = da * atemp1
kusano 2b45e8
	LD	$f14,  4*SIZE($20)
kusano 2b45e8
kusano 2b45e8
	ADD	$f19, $f27, $f1
kusano 2b45e8
	LD	$f19,  9*SIZE($24)
kusano 2b45e8
	MUL	$f30, $f15, $f27
kusano 2b45e8
	LD	$f15,  5*SIZE($20)
kusano 2b45e8
kusano 2b45e8
	ADD	$f20, $f28, $f2
kusano 2b45e8
	LD	$f20, 10*SIZE($24)
kusano 2b45e8
	MUL	$f30, $f16, $f28
kusano 2b45e8
	LD	$f16,  6*SIZE($20)
kusano 2b45e8
kusano 2b45e8
	ADD	$f21, $f29, $f3
kusano 2b45e8
	LD	$f21, 11*SIZE($24)
kusano 2b45e8
	MUL	$f30, $f17, $f29
kusano 2b45e8
	LD	$f17, 7*SIZE($20)
kusano 2b45e8
kusano 2b45e8
	/* Store results 0-3; each sum register is reused for elements
	   4-7 right after its store. */
	ST	$f0,   0*SIZE($24)
kusano 2b45e8
	ADD	$f22, $f26, $f0
kusano 2b45e8
	ST	$f1,   1*SIZE($24)
kusano 2b45e8
	ADD	$f23, $f27, $f1
kusano 2b45e8
kusano 2b45e8
	ST	$f2,   2*SIZE($24)
kusano 2b45e8
	ADD	$f24, $f28, $f2
kusano 2b45e8
	ST	$f3,   3*SIZE($24)
kusano 2b45e8
	ADD	$f25, $f29, $f3
kusano 2b45e8
kusano 2b45e8
	/* Next block of y, elements 4-7 (old values were consumed above). */
	LD	$f22, 12*SIZE($24)
kusano 2b45e8
	LD	$f23, 13*SIZE($24)
kusano 2b45e8
	LD	$f24, 14*SIZE($24)
kusano 2b45e8
	LD	$f25, 15*SIZE($24)
kusano 2b45e8
kusano 2b45e8
	/* Store results 4-7 of the current block. */
	ST	$f0,  4*SIZE($24)
kusano 2b45e8
	ST	$f1,  5*SIZE($24)
kusano 2b45e8
	ST	$f2,  6*SIZE($24)
kusano 2b45e8
	ST	$f3,  7*SIZE($24)
kusano 2b45e8
kusano 2b45e8
	/* Advance both pointers by one block; loop while blocks remain. */
	subq	$1,  1, $1
kusano 2b45e8
	addq	$24, 8*SIZE, $24
kusano 2b45e8
	addq	$20, 8*SIZE, $20
kusano 2b45e8
	bgt	$1, $Loop
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
/*
 * Contiguous path, pipeline drain.
 *
 * Finishes the last block that is already held in $f10-$f17 (x) and
 * $f18-$f25 (y). Same arithmetic and store order as $Loop, but no
 * further loads are issued. The final addq moves $24 past the block so
 * that the tail loop continues with the next unprocessed element
 * ($20 was advanced when the block was loaded).
 */
$LoopEnd:
kusano 2b45e8
	MUL	$f30, $f10, $f26		# ctemp1 = da * atemp1
kusano 2b45e8
	MUL	$f30, $f11, $f27
kusano 2b45e8
	MUL	$f30, $f12, $f28
kusano 2b45e8
	MUL	$f30, $f13, $f29
kusano 2b45e8
kusano 2b45e8
	/* Sums for elements 0-3, products for elements 4-7. */
	ADD	$f18, $f26, $f0
kusano 2b45e8
	MUL	$f30, $f14, $f26		# ctemp1 = da * atemp1
kusano 2b45e8
	ADD	$f19, $f27, $f1
kusano 2b45e8
	MUL	$f30, $f15, $f27
kusano 2b45e8
kusano 2b45e8
	ADD	$f20, $f28, $f2
kusano 2b45e8
	MUL	$f30, $f16, $f28
kusano 2b45e8
	ADD	$f21, $f29, $f3
kusano 2b45e8
	MUL	$f30, $f17, $f29
kusano 2b45e8
kusano 2b45e8
	/* Store 0-3, then reuse $f0-$f3 for the sums of elements 4-7. */
	ST	$f0,   0*SIZE($24)
kusano 2b45e8
	ADD	$f22, $f26, $f0
kusano 2b45e8
	ST	$f1,   1*SIZE($24)
kusano 2b45e8
	ADD	$f23, $f27, $f1
kusano 2b45e8
kusano 2b45e8
	ST	$f2,   2*SIZE($24)
kusano 2b45e8
	ADD	$f24, $f28, $f2
kusano 2b45e8
	ST	$f3,   3*SIZE($24)
kusano 2b45e8
	ADD	$f25, $f29, $f3
kusano 2b45e8
kusano 2b45e8
	ST	$f0,   4*SIZE($24)
kusano 2b45e8
	ST	$f1,   5*SIZE($24)
kusano 2b45e8
	ST	$f2,   6*SIZE($24)
kusano 2b45e8
	ST	$f3,   7*SIZE($24)
kusano 2b45e8
	addq	$24, 8*SIZE, $24
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
/*
 * Contiguous path, tail.
 *
 * Handles the $2 = n & 7 leftover elements one per iteration:
 * y[i] = y[i] + alpha * x[i]. Falls through to $End when done.
 */
$Remain:
kusano 2b45e8
	/* No leftover elements: go straight to the exit sequence. */
	ble	$2, $End
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
$RemainLoop:
kusano 2b45e8
	LD	$f10,  0*SIZE($20)
kusano 2b45e8
	LD	$f11,  0*SIZE($24)
kusano 2b45e8
	/* Both pointers are bumped before the store is issued ... */
	addq	$20, SIZE, $20
kusano 2b45e8
	addq	$24, SIZE, $24
kusano 2b45e8
kusano 2b45e8
	MUL	$f30, $f10, $f12
kusano 2b45e8
	subq	$2,  1,  $2
kusano 2b45e8
	ADD	$f11, $f12, $f13
kusano 2b45e8
	/* ... so the element just read from y lives at -1*SIZE($24). */
	ST	$f13,  -1*SIZE($24)
kusano 2b45e8
	bgt	$2,  $RemainLoop
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
/*
 * Exit for the contiguous path and for the early-out cases
 * (n <= 0, alpha == 0): reload the $f2 / $f3 values spilled at entry,
 * release the 16-byte frame and return.
 */
$End:
kusano 2b45e8
	ldt	$f2,   0($sp)
kusano 2b45e8
	ldt	$f3,   8($sp)
kusano 2b45e8
	lda	$sp,  16($sp)
kusano 2b45e8
	ret
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
/*
 * Strided path, entry and pipeline fill (taken when either stride is
 * not 1).
 *
 * Pointers are stepped after every element with SXADDQ (presumably
 * "pointer += stride * SIZE" - TODO confirm in common.h): x through
 * $20 with stride $21, y through a separate read cursor $22 with
 * stride $23. $24 is kept as the y store cursor and is not moved in
 * this block, so after the fill $22 is 8 elements ahead of $24.
 *
 * Register contents after the fill match the contiguous path:
 * $f10-$f17 = x block, $f18-$f25 = y block.
 * $4 = $1 - 1 is the number of blocks that still have to be loaded.
 */
$Sub:
kusano 2b45e8
	/* NOTE(review): the value written to $22 here is never read - $22
	   is overwritten by the first "SXADDQ $23, $24, $22" below, and
	   $SubRemain does not reference $22. Looks like dead code;
	   confirm before removing. */
	SXSUBL	$16,  SIZE, $22
kusano 2b45e8
	subq	$1,  1, $4
kusano 2b45e8
	/* Fewer than 8 elements: only the one-at-a-time tail runs. */
	ble	$1, $SubRemain
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
	/* x elements 0-3 */
	LD	$f10,  0($20)
kusano 2b45e8
	SXADDQ	$21, $20, $20
kusano 2b45e8
kusano 2b45e8
	LD	$f11,  0($20)
kusano 2b45e8
	SXADDQ	$21, $20, $20
kusano 2b45e8
	LD	$f12,  0($20)
kusano 2b45e8
	SXADDQ	$21, $20, $20
kusano 2b45e8
kusano 2b45e8
	LD	$f13,  0($20)
kusano 2b45e8
	SXADDQ	$21, $20, $20
kusano 2b45e8
	/* y elements 0-3; the first is read through $24, after which the
	   read cursor $22 is derived from $24 and used for the rest. */
	LD	$f18,  0($24)
kusano 2b45e8
	SXADDQ	$23, $24, $22
kusano 2b45e8
kusano 2b45e8
	LD	$f19,  0($22)
kusano 2b45e8
	SXADDQ	$23, $22, $22
kusano 2b45e8
	LD	$f20,  0($22)
kusano 2b45e8
	SXADDQ	$23, $22, $22
kusano 2b45e8
kusano 2b45e8
	LD	$f21,  0($22)
kusano 2b45e8
	SXADDQ	$23, $22, $22
kusano 2b45e8
	/* x elements 4-7 */
	LD	$f14,  0($20)
kusano 2b45e8
	SXADDQ	$21, $20, $20
kusano 2b45e8
kusano 2b45e8
	LD	$f15,  0($20)
kusano 2b45e8
	SXADDQ	$21, $20, $20
kusano 2b45e8
	LD	$f16,  0($20)
kusano 2b45e8
	SXADDQ	$21, $20, $20
kusano 2b45e8
kusano 2b45e8
	LD	$f17,  0($20)
kusano 2b45e8
	SXADDQ	$21, $20, $20
kusano 2b45e8
	/* y elements 4-7 */
	LD	$f22,  0($22)
kusano 2b45e8
	SXADDQ	$23, $22, $22
kusano 2b45e8
kusano 2b45e8
	LD	$f23,  0($22)
kusano 2b45e8
	SXADDQ	$23, $22, $22
kusano 2b45e8
	LD	$f24,  0($22)
kusano 2b45e8
	SXADDQ	$23, $22, $22
kusano 2b45e8
kusano 2b45e8
	LD	$f25,  0($22)
kusano 2b45e8
	SXADDQ	$23, $22, $22
kusano 2b45e8
	unop
kusano 2b45e8
	/* No further full block: skip the loop and drain at $SubLoopEnd. */
	ble	$4,  $SubLoopEnd
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
/*
 * Strided path, steady state (8 elements per iteration).
 *
 * Same pipeline as the contiguous loop: the block held in $f10-$f17
 * (x) and $f18-$f25 (y) is computed and stored while the next block is
 * loaded. Three cursors are stepped element by element:
 *   $20  x read cursor   (stride $21)
 *   $22  y read cursor   (stride $23), 8 elements ahead of $24
 *   $24  y store cursor  (stride $23)
 * $4 counts the blocks that remain to be loaded. Each input register
 * is reloaded only after its old value has been consumed, so the
 * statement order must be preserved.
 */
$SubLoop:
kusano 2b45e8
	/* Products for elements 0-3; refill $f10-$f13 with the next x. */
	MUL	$f30, $f10, $f26		# ctemp1 = da * atemp1
kusano 2b45e8
	LD	$f10,  0($20)
kusano 2b45e8
	unop
kusano 2b45e8
	SXADDQ	$21, $20, $20
kusano 2b45e8
kusano 2b45e8
	MUL	$f30, $f11, $f27
kusano 2b45e8
	LD	$f11,  0($20)
kusano 2b45e8
	unop
kusano 2b45e8
	SXADDQ	$21, $20, $20
kusano 2b45e8
kusano 2b45e8
	MUL	$f30, $f12, $f28
kusano 2b45e8
	LD	$f12,  0($20)
kusano 2b45e8
	unop
kusano 2b45e8
	SXADDQ	$21, $20, $20
kusano 2b45e8
kusano 2b45e8
	MUL	$f30, $f13, $f29
kusano 2b45e8
	LD	$f13,  0($20)
kusano 2b45e8
	unop
kusano 2b45e8
	SXADDQ	$21, $20, $20
kusano 2b45e8
kusano 2b45e8
	/* Sums for elements 0-3, products for 4-7; refill $f14-$f17. */
	ADD	$f18, $f26, $f0
kusano 2b45e8
	MUL	$f30, $f14, $f26		# ctemp1 = da * atemp1
kusano 2b45e8
	LD	$f14,  0($20)
kusano 2b45e8
	SXADDQ	$21, $20, $20
kusano 2b45e8
kusano 2b45e8
	ADD	$f19, $f27, $f1
kusano 2b45e8
	MUL	$f30, $f15, $f27
kusano 2b45e8
	LD	$f15,  0($20)
kusano 2b45e8
	SXADDQ	$21, $20, $20
kusano 2b45e8
kusano 2b45e8
	ADD	$f20, $f28, $f2
kusano 2b45e8
	MUL	$f30, $f16, $f28
kusano 2b45e8
	LD	$f16,  0($20)
kusano 2b45e8
	SXADDQ	$21, $20, $20
kusano 2b45e8
kusano 2b45e8
	ADD	$f21, $f29, $f3
kusano 2b45e8
	MUL	$f30, $f17, $f29
kusano 2b45e8
	LD	$f17,  0($20)
kusano 2b45e8
	SXADDQ	$21, $20, $20
kusano 2b45e8
kusano 2b45e8
	/* Store results 0-3 through $24; each sum register is reused for
	   elements 4-7 right after its store. */
	ST	$f0,   0($24)
kusano 2b45e8
	SXADDQ	$23, $24, $24
kusano 2b45e8
	ADD	$f22, $f26, $f0
kusano 2b45e8
	unop
kusano 2b45e8
kusano 2b45e8
	ST	$f1,   0($24)
kusano 2b45e8
	SXADDQ	$23, $24, $24
kusano 2b45e8
	ADD	$f23, $f27, $f1
kusano 2b45e8
	unop
kusano 2b45e8
kusano 2b45e8
	ST	$f2,   0($24)
kusano 2b45e8
	SXADDQ	$23, $24, $24
kusano 2b45e8
	ADD	$f24, $f28, $f2
kusano 2b45e8
	unop
kusano 2b45e8
kusano 2b45e8
	ST	$f3,   0($24)
kusano 2b45e8
	SXADDQ	$23, $24, $24
kusano 2b45e8
	ADD	$f25, $f29, $f3
kusano 2b45e8
	unop
kusano 2b45e8
kusano 2b45e8
	/* Load the whole next block of y through the read cursor $22. */
	LD	$f18,  0($22)
kusano 2b45e8
	SXADDQ	$23, $22, $22
kusano 2b45e8
	LD	$f19,  0($22)
kusano 2b45e8
	SXADDQ	$23, $22, $22
kusano 2b45e8
kusano 2b45e8
	LD	$f20,  0($22)
kusano 2b45e8
	SXADDQ	$23, $22, $22
kusano 2b45e8
	LD	$f21,  0($22)
kusano 2b45e8
	SXADDQ	$23, $22, $22
kusano 2b45e8
kusano 2b45e8
	LD	$f22,  0($22)
kusano 2b45e8
	SXADDQ	$23, $22, $22
kusano 2b45e8
	LD	$f23,  0($22)
kusano 2b45e8
	SXADDQ	$23, $22, $22
kusano 2b45e8
kusano 2b45e8
	LD	$f24,  0($22)
kusano 2b45e8
	SXADDQ	$23, $22, $22
kusano 2b45e8
	LD	$f25,  0($22)
kusano 2b45e8
	SXADDQ	$23, $22, $22
kusano 2b45e8
kusano 2b45e8
	/* Store results 4-7 of the current block. */
	ST	$f0,  0($24)
kusano 2b45e8
	SXADDQ	$23, $24, $24
kusano 2b45e8
	ST	$f1,  0($24)
kusano 2b45e8
	SXADDQ	$23, $24, $24
kusano 2b45e8
	ST	$f2,  0($24)
kusano 2b45e8
	SXADDQ	$23, $24, $24
kusano 2b45e8
	ST	$f3,  0($24)
kusano 2b45e8
	SXADDQ	$23, $24, $24
kusano 2b45e8
kusano 2b45e8
	subq	$4,   1,  $4
kusano 2b45e8
	bgt	$4, $SubLoop
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
/*
 * Strided path, pipeline drain.
 *
 * Computes and stores the last block already held in $f10-$f17 (x) and
 * $f18-$f25 (y); no further loads are issued. The store cursor $24 is
 * stepped after every store, so on exit it addresses the first element
 * after the block, which is where $SubRemain continues.
 */
$SubLoopEnd:
kusano 2b45e8
	MUL	$f30, $f10, $f26		# ctemp1 = da * atemp1
kusano 2b45e8
	MUL	$f30, $f11, $f27
kusano 2b45e8
	MUL	$f30, $f12, $f28
kusano 2b45e8
	MUL	$f30, $f13, $f29
kusano 2b45e8
kusano 2b45e8
	/* Sums for elements 0-3, products for elements 4-7. */
	ADD	$f18, $f26, $f0
kusano 2b45e8
	MUL	$f30, $f14, $f26		# ctemp1 = da * atemp1
kusano 2b45e8
	ADD	$f19, $f27, $f1
kusano 2b45e8
	MUL	$f30, $f15, $f27
kusano 2b45e8
kusano 2b45e8
	ADD	$f20, $f28, $f2
kusano 2b45e8
	MUL	$f30, $f16, $f28
kusano 2b45e8
	ADD	$f21, $f29, $f3
kusano 2b45e8
	MUL	$f30, $f17, $f29
kusano 2b45e8
kusano 2b45e8
	/* Store results 0-3. */
	ST	$f0,   0($24)
kusano 2b45e8
	SXADDQ	$23, $24, $24
kusano 2b45e8
	ST	$f1,   0($24)
kusano 2b45e8
	SXADDQ	$23, $24, $24
kusano 2b45e8
kusano 2b45e8
	ST	$f2,   0($24)
kusano 2b45e8
	SXADDQ	$23, $24, $24
kusano 2b45e8
	ST	$f3,   0($24)
kusano 2b45e8
	SXADDQ	$23, $24, $24
kusano 2b45e8
kusano 2b45e8
	/* Sums for elements 4-7, reusing $f0-$f3 after their stores. */
	ADD	$f22, $f26, $f0
kusano 2b45e8
	ADD	$f23, $f27, $f1
kusano 2b45e8
	ADD	$f24, $f28, $f2
kusano 2b45e8
	ADD	$f25, $f29, $f3
kusano 2b45e8
kusano 2b45e8
	/* Store results 4-7. */
	ST	$f0,   0($24)
kusano 2b45e8
	SXADDQ	$23, $24, $24
kusano 2b45e8
	ST	$f1,   0($24)
kusano 2b45e8
	SXADDQ	$23, $24, $24
kusano 2b45e8
kusano 2b45e8
	ST	$f2,   0($24)
kusano 2b45e8
	SXADDQ	$23, $24, $24
kusano 2b45e8
	ST	$f3,   0($24)
kusano 2b45e8
	SXADDQ	$23, $24, $24
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
/*
 * Strided path, tail.
 *
 * Handles the $2 = n & 7 leftover elements one per iteration:
 * y[i] = y[i] + alpha * x[i], stepping $20 by the x stride ($21) and
 * $24 by the y stride ($23). Only $24 is used for y here; the separate
 * read cursor $22 is not referenced. Falls through to $SubEnd.
 */
$SubRemain:
kusano 2b45e8
	/* No leftover elements: go straight to the exit sequence. */
	ble	$2, $SubEnd
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
$SubRemainLoop:
kusano 2b45e8
	LD	$f10,  0($20)
kusano 2b45e8
	LD	$f11,  0($24)
kusano 2b45e8
	SXADDQ	$21, $20, $20
kusano 2b45e8
kusano 2b45e8
	MUL	$f30, $f10, $f12
kusano 2b45e8
	subq	$2,  1,  $2
kusano 2b45e8
	ADD	$f11, $f12, $f13
kusano 2b45e8
	/* Store back to the element just read, then step the y cursor. */
	ST	$f13,  0($24)
kusano 2b45e8
	SXADDQ	$23, $24, $24
kusano 2b45e8
kusano 2b45e8
	bgt	$2,  $SubRemainLoop
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
/*
 * Exit for the strided path: reload the $f2 / $f3 values spilled at
 * entry, release the 16-byte frame and return. Same sequence as $End.
 */
$SubEnd:
kusano 2b45e8
	ldt	$f2,   0($sp)
kusano 2b45e8
	ldt	$f3,   8($sp)
kusano 2b45e8
	lda	$sp,  16($sp)
kusano 2b45e8
	ret
kusano 2b45e8
	EPILOGUE