Blame thirdparty/openblas/xianyi-OpenBLAS-e6e87a2/kernel/ia64/zgemm_ncopy.S

kusano 2b45e8
/*********************************************************************/
kusano 2b45e8
/* Copyright 2009, 2010 The University of Texas at Austin.           */
kusano 2b45e8
/* All rights reserved.                                              */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
/* Redistribution and use in source and binary forms, with or        */
kusano 2b45e8
/* without modification, are permitted provided that the following   */
kusano 2b45e8
/* conditions are met:                                               */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
/*   1. Redistributions of source code must retain the above         */
kusano 2b45e8
/*      copyright notice, this list of conditions and the following  */
kusano 2b45e8
/*      disclaimer.                                                  */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
/*   2. Redistributions in binary form must reproduce the above      */
kusano 2b45e8
/*      copyright notice, this list of conditions and the following  */
kusano 2b45e8
/*      disclaimer in the documentation and/or other materials       */
kusano 2b45e8
/*      provided with the distribution.                              */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
kusano 2b45e8
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
kusano 2b45e8
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
kusano 2b45e8
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
kusano 2b45e8
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
kusano 2b45e8
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
kusano 2b45e8
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
kusano 2b45e8
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
kusano 2b45e8
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
kusano 2b45e8
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
kusano 2b45e8
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
kusano 2b45e8
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
kusano 2b45e8
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
kusano 2b45e8
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
/* The views and conclusions contained in the software and           */
kusano 2b45e8
/* documentation are those of the authors and should not be          */
kusano 2b45e8
/* interpreted as representing official policies, either expressed   */
kusano 2b45e8
/* or implied, of The University of Texas at Austin.                 */
kusano 2b45e8
/*********************************************************************/
kusano 2b45e8
kusano 2b45e8
#define ASSEMBLER
kusano 2b45e8
#include "common.h"
kusano 2b45e8
kusano 2b45e8
/* Look-ahead distances, in units of SIZE, used when seeding the       */
/* prefetch pointers: PREA = source + PREFETCHSIZE * SIZE and          */
/* PREB = destination + WPREFETCHSIZE * SIZE.                          */
#define PREFETCHSIZE   64
kusano 2b45e8
#define WPREFETCHSIZE  32
kusano 2b45e8
kusano 2b45e8
/* Scalar load/store used by every copy below.  LDF8 and STF8_NTA are  */
/* not defined in this file (presumably supplied via common.h).        */
#define LD	LDF8
kusano 2b45e8
#define ST	STF8_NTA
kusano 2b45e8
	
kusano 2b45e8
/* TEMP: scratch holding 4 * LDA - 16 * SIZE for the PREA rewind.       */
#define TEMP	r2
kusano 2b45e8
kusano 2b45e8
/* I: M >> 2, then decremented and written to ar.lc (row-block count). */
/* J: N >> 2, remaining 4-column panels.                               */
/* PREB / PREA: prefetch pointers into the destination / source.       */
#define I	r14
kusano 2b45e8
#define J	r15
kusano 2b45e8
#define PREB	r16
kusano 2b45e8
#define PREA	r17
kusano 2b45e8
kusano 2b45e8
/* A1..A4: read pointers for up to four consecutive columns of A.      */
/* A5..A8: A1..A4 offset by 4 * SIZE (second half of each row block).  */
/* B1:     second write pointer, B + 4 * SIZE.                         */
#define A1	r18
kusano 2b45e8
#define A2	r19
kusano 2b45e8
#define A3	r20
kusano 2b45e8
#define A4	r21
kusano 2b45e8
#define A5	r22
kusano 2b45e8
#define A6	r23
kusano 2b45e8
#define A7	r24
kusano 2b45e8
#define A8	r25
kusano 2b45e8
#define B1	r26
kusano 2b45e8
kusano 2b45e8
/* COUNT: loop-iteration counter; only its low bit is tested (p7).     */
#define COUNT	r28
kusano 2b45e8
kusano 2b45e8
/* ARLC / PR: copies of ar.lc and the predicate register taken on      */
/* entry and written back at .L999.                                    */
#define ARLC	r30
kusano 2b45e8
#define PR	r31
kusano 2b45e8
kusano 2b45e8
/* r32..r36 are read as M, N, A, LDA, B -- presumably the incoming     */
/* arguments in that order (confirm against the C prototype).          */
#define M	r32
kusano 2b45e8
#define N	r33
kusano 2b45e8
#define A	r34
kusano 2b45e8
#define LDA	r35
kusano 2b45e8
#define B	r36
kusano 2b45e8
kusano 2b45e8
	/* Entry.  Scales LDA, saves the predicate register and ar.lc,  */
	/* derives the panel count and the M-remainder predicates, and  */
	/* skips the 4-column path when there is no full panel.         */
	PROLOGUE
kusano 2b45e8
	.prologue
kusano 2b45e8
	PROFCODE
kusano 2b45e8
kusano 2b45e8
	.body	
kusano 2b45e8
	{ .mii
kusano 2b45e8
	/* LDA <<= ZBASE_SHIFT (presumably elements -> bytes; the       */
	/* constant comes from outside this file).                      */
	shladd	LDA= LDA, ZBASE_SHIFT, r0
kusano 2b45e8
	/* Keep all predicates so .L999 can restore them.               */
	mov	PR = pr
kusano 2b45e8
	/* J = number of full 4-column panels.                          */
	shr	J = N, 2
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mii
kusano 2b45e8
	mov COUNT=r0
kusano 2b45e8
	/* p10 = bit 1 of M set, p11 = bit 0 of M set.  They gate the   */
	/* row-remainder code at .L15 / .L25 / .L35.                    */
	tbit.nz p10, p0 =M, 1
kusano 2b45e8
	tbit.nz p11, p0 =M, 0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mib
kusano 2b45e8
	/* p8 = (J == 0): no 4-column panel, go straight to .L20.       */
	cmp.eq	p8,p0 = 0, J
kusano 2b45e8
	/* Keep the caller's loop count; restored at .L999.             */
	mov	ARLC = ar.lc
kusano 2b45e8
	(p8) br.cond.dpnt .L20
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
      .align 32
kusano 2b45e8
kusano 2b45e8
/* .L11: set-up for one 4-column panel.                                */
/*   A1..A4 = A, A + LDA, A + 2*LDA, A + 3*LDA                         */
/*   A5..A8 = A1..A4 + 4 * SIZE,  B1 = B + 4 * SIZE                    */
/*   I = M >> 2 (row blocks of four), J -= 1, A += 4 * LDA             */
/* Arms the pipelined loop at .L12 (p16 = 1, ar.ec = 3, ar.lc = I - 1) */
/* or jumps to the row remainder at .L15 when I == 0.                  */
.L11:
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	mov	A1 = A
kusano 2b45e8
	add	A2 = A, LDA
kusano 2b45e8
	/* Clear the rotating predicates before p16 is set below.       */
	mov	pr.rot = 0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	shladd A3 = LDA, 1, A
kusano 2b45e8
	adds   B1 = 4 * SIZE, B
kusano 2b45e8
	shr    I  = M, 2
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	/* A4 = A2 + 2 * LDA = A + 3 * LDA.                             */
	shladd	A4 = LDA, 1, A2
kusano 2b45e8
	/* p16 = 1: enables the first pipeline stage.                   */
	cmp.eq	p16,p0 = r0, r0
kusano 2b45e8
	mov	ar.ec = 3
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	/* p6 tests I before the decrement issued in this same group.   */
	cmp.eq	p6,p0 = 0,I
kusano 2b45e8
	adds	I =-1, I
kusano 2b45e8
	adds	J =-1, J
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	/* Advance A to the next panel; A1..A4 already hold this one.   */
	shladd	A = LDA, 2, A
kusano 2b45e8
	adds	A5 = 4 * SIZE, A1
kusano 2b45e8
	adds	A6 = 4 * SIZE, A2
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	adds	A7 = 4 * SIZE, A3
kusano 2b45e8
	adds	A8 = 4 * SIZE, A4
kusano 2b45e8
	adds	PREA = PREFETCHSIZE * SIZE,A1
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mib
kusano 2b45e8
	adds	PREB = WPREFETCHSIZE * SIZE, B
kusano 2b45e8
	mov	ar.lc = I
kusano 2b45e8
	/* No full block of four rows: only the remainder is copied.    */
	(p6) br.cond.dpnt.few .L15
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 32
kusano 2b45e8
kusano 2b45e8
/* .L12: pipelined copy of four rows from each of four columns.        */
/* Loads are predicated on p16 and stores on p18; br.ctop rotates the  */
/* registers, so a value loaded into fN is stored from fN+2 two        */
/* passes later (e.g. LD f32 -> ST f34).                               */
/*                                                                     */
/* Per pass each column pointer pair (A1/A5 .. A4/A8) reads 8 scalars, */
/* advancing by 8 * SIZE.  B and B1 each write 16 scalars and advance  */
/* by 32 * SIZE; the 5 * SIZE steps hop over the 4 scalars written     */
/* through the other pointer.  For every row, B receives the two       */
/* scalars from column A1 then column A2, and B1 those from A3 then A4.*/
/*                                                                     */
/* Prefetch: PREA steps by LDA twice per pass, PREB by 16 * SIZE       */
/* twice.  When COUNT is odd (p7), PREA is pulled back by              */
/* 4 * LDA - 16 * SIZE.                                                */
.L12:
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	lfetch.nt1	[PREA], LDA
kusano 2b45e8
	(p16)	lfetch.excl.nt1	[PREB], 16 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* Stores: row 0 (loaded via A1..A4).  Loads: column A1.        */
	{ .mmb
kusano 2b45e8
	(p18)	ST	[B ] = f34, SIZE
kusano 2b45e8
	(p18)	ST	[B1] = f82, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f32 = [A1], SIZE
kusano 2b45e8
	(p16)	LD	f35 = [A5], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p18)	ST	[B ] = f40, SIZE
kusano 2b45e8
	(p18)	ST	[B1] = f88, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f38 = [A1], SIZE
kusano 2b45e8
	(p16)	LD	f41 = [A5], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p18)	ST	[B ] = f58,  SIZE
kusano 2b45e8
	(p18)	ST	[B1] = f106, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f44 = [A1], SIZE
kusano 2b45e8
	(p16)	LD	f47 = [A5], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p18)	ST	[B ] = f64,  5 * SIZE
kusano 2b45e8
	(p18)	ST	[B1] = f112, 5 * SIZE
kusano 2b45e8
	/* p7 = low bit of COUNT is set (sampled before the increment   */
	/* at the bottom of the loop).                                  */
	tbit.z	p0,p7 = COUNT,0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f50 = [A1], 5 * SIZE
kusano 2b45e8
	(p16)	LD	f53 = [A5], 5 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* Stores: row 1 (loaded via A1..A4).  Loads: column A2.        */
	{ .mmb
kusano 2b45e8
	(p18)	ST	[B ] = f46, SIZE
kusano 2b45e8
	(p18)	ST	[B1] = f94, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f56 = [A2], SIZE
kusano 2b45e8
	(p16)	LD	f59 = [A6], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p18)	ST	[B ] = f52, SIZE
kusano 2b45e8
	(p18)	ST	[B1] = f100, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f62 = [A2], SIZE
kusano 2b45e8
	(p16)	LD	f65 = [A6], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p18)	ST	[B ] = f70, SIZE
kusano 2b45e8
	(p18)	ST	[B1] = f118, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f68 = [A2], SIZE
kusano 2b45e8
	(p16)	LD	f71 = [A6], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p18)	ST	[B ]  = f76, 5 * SIZE
kusano 2b45e8
	(p18)	ST	[B1]  = f124, 5 * SIZE
kusano 2b45e8
	/* TEMP = 4 * LDA; reduced by 16 * SIZE further down.           */
	shladd	TEMP = LDA, 2, r0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f74 = [A2], 5 * SIZE
kusano 2b45e8
	(p16)	LD	f77 = [A6], 5 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* Second prefetch pair of this pass.                           */
	{ .mmb
kusano 2b45e8
	(p16)	lfetch.nt1	[PREA], LDA
kusano 2b45e8
	(p16)	lfetch.excl.nt1	[PREB], 16 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* Stores: row 2 (loaded via A5..A8).  Loads: column A3.        */
	{ .mmb
kusano 2b45e8
	(p18)	ST	[B ] = f37, SIZE
kusano 2b45e8
	(p18)	ST	[B1] = f85, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f80 = [A3], SIZE
kusano 2b45e8
	(p16)	LD	f83 = [A7], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p18)	ST	[B ] = f43, SIZE
kusano 2b45e8
	(p18)	ST	[B1] = f91, SIZE
kusano 2b45e8
	adds	TEMP = -16 * SIZE, TEMP
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f86 = [A3], SIZE
kusano 2b45e8
	(p16)	LD	f89 = [A7], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p18)	ST	[B ] = f61, SIZE
kusano 2b45e8
	(p18)	ST	[B1] = f109, SIZE
kusano 2b45e8
	/* Odd COUNT only: rewind PREA by 4 * LDA - 16 * SIZE.          */
	(p7)	sub	PREA = PREA, TEMP
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f92 = [A3], SIZE
kusano 2b45e8
	(p16)	LD	f95 = [A7], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p18)	ST	[B ] = f67, 5 * SIZE
kusano 2b45e8
	(p18)	ST	[B1] = f115, 5 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f98 = [A3], 5 * SIZE
kusano 2b45e8
	(p16)	LD	f101 = [A7], 5 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* Stores: row 3 (loaded via A5..A8).  Loads: column A4.        */
	{ .mmb
kusano 2b45e8
	(p18)	ST	[B ] = f49, SIZE
kusano 2b45e8
	(p18)	ST	[B1] = f97, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f104 = [A4], SIZE
kusano 2b45e8
	(p16)	LD	f107 = [A8], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p18)	ST	[B ] = f55, SIZE
kusano 2b45e8
	(p18)	ST	[B1] = f103, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f110 = [A4], SIZE
kusano 2b45e8
	(p16)	LD	f113 = [A8], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p18)	ST	[B ] = f73, SIZE
kusano 2b45e8
	(p18)	ST	[B1] = f121, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f116 = [A4], SIZE
kusano 2b45e8
	(p16)	LD	f119 = [A8], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p18)	ST	[B ] = f79, 5 * SIZE
kusano 2b45e8
	(p18)	ST	[B1] = f127, 5 * SIZE
kusano 2b45e8
	/* Counted only while the load stage is active.                 */
	(p16)	adds	COUNT =  1, COUNT
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f122 = [A4], 5 * SIZE
kusano 2b45e8
	(p16)	LD	f125 = [A8], 5 * SIZE
kusano 2b45e8
	br.ctop.sptk.few .L12
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 32
kusano 2b45e8
kusano 2b45e8
/* .L15: row remainder of a 4-column panel.                            */
/*   p10 (bit 1 of M): copy two more rows, 4 scalars per column.       */
/*   p11 (bit 0 of M): copy one more row,  2 scalars per column.       */
/* Everything is loaded first, then stored with the same layout as     */
/* .L12: per row, B gets columns A1, A2 and B1 gets columns A3, A4.    */
/* Finally COUNT is cleared and control returns to .L11 while J != 0.  */
.L15:
kusano 2b45e8
	/* p10 loads: f32..f35 <- A1, f40..f43 <- A2.                   */
	{ .mmb
kusano 2b45e8
	(p10)	LD	f32 = [A1], SIZE
kusano 2b45e8
	(p10)	LD	f40 = [A2], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	LD	f33 = [A1], SIZE
kusano 2b45e8
	(p10)	LD	f41 = [A2], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	LD	f34 = [A1], SIZE
kusano 2b45e8
	(p10)	LD	f42 = [A2], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	LD	f35 = [A1], SIZE
kusano 2b45e8
	(p10)	LD	f43 = [A2], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* p10 loads: f50..f53 <- A3, f60..f63 <- A4.                   */
	{ .mmb
kusano 2b45e8
	(p10)	LD	f50 = [A3], SIZE
kusano 2b45e8
	(p10)	LD	f60 = [A4], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	LD	f51 = [A3], SIZE
kusano 2b45e8
	(p10)	LD	f61 = [A4], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	LD	f52 = [A3], SIZE
kusano 2b45e8
	(p10)	LD	f62 = [A4], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	LD	f53 = [A3], SIZE
kusano 2b45e8
	(p10)	LD	f63 = [A4], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* p11 loads: one more row from each of the four columns.       */
	{ .mmb
kusano 2b45e8
	(p11)	LD	f36 = [A1], SIZE
kusano 2b45e8
	(p11)	LD	f44 = [A2], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p11)	LD	f37 = [A1]
kusano 2b45e8
	(p11)	LD	f45 = [A2]
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p11)	LD	f54 = [A3], SIZE
kusano 2b45e8
	(p11)	LD	f64 = [A4], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p11)	LD	f55 = [A3]
kusano 2b45e8
	(p11)	LD	f65 = [A4]
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* p10 stores, first of the two rows.                           */
	{ .mmb
kusano 2b45e8
	(p10)	ST	[B ] = f32, SIZE
kusano 2b45e8
	(p10)	ST	[B1] = f50, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	ST	[B ] = f33, SIZE
kusano 2b45e8
	(p10)	ST	[B1] = f51, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	ST	[B ] = f40, SIZE
kusano 2b45e8
	(p10)	ST	[B1] = f60, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	ST	[B ] = f41, 5 * SIZE
kusano 2b45e8
	(p10)	ST	[B1] = f61, 5 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* p10 stores, second of the two rows.                          */
	{ .mmb
kusano 2b45e8
	(p10)	ST	[B ] = f34, SIZE
kusano 2b45e8
	(p10)	ST	[B1] = f52, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	ST	[B ] = f35, SIZE
kusano 2b45e8
	(p10)	ST	[B1] = f53, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	ST	[B ] = f42, SIZE
kusano 2b45e8
	(p10)	ST	[B1] = f62, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	ST	[B ] = f43, 5 * SIZE
kusano 2b45e8
	(p10)	ST	[B1] = f63, 5 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* p11 stores, the single remaining row.                        */
	{ .mmb
kusano 2b45e8
	(p11)	ST	[B ] = f36, SIZE
kusano 2b45e8
	(p11)	ST	[B1] = f54, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p11)	ST	[B ] = f37, SIZE
kusano 2b45e8
	(p11)	ST	[B1] = f55, SIZE
kusano 2b45e8
	/* Reset the parity counter for the next loop that uses it.     */
	mov	COUNT = r0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p11)	ST	[B ] = f44, SIZE
kusano 2b45e8
	(p11)	ST	[B1] = f64, SIZE
kusano 2b45e8
	/* p6 = (J != 0): more 4-column panels remain.                  */
	cmp.eq	p0,p6 = 0,J
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p11)	ST	[B ] = f45, 5 * SIZE
kusano 2b45e8
	(p11)	ST	[B1] = f65, 5 * SIZE
kusano 2b45e8
	(p6)	br.cond.dptk.few .L11
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 32
kusano 2b45e8
kusano 2b45e8
/* .L20: set-up for a 2-column panel, taken only when bit 1 of N is    */
/* set; otherwise control jumps to .L30.                               */
/*   A1 = A, A2 = A + LDA, A5 = A + 4 * SIZE, A6 = A2 + 4 * SIZE       */
/*   B1 = B + 4 * SIZE, I = M >> 2                                     */
/* A is advanced by 2 * LDA only after the branch to .L30 is declined. */
.L20:
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	mov	A1 = A
kusano 2b45e8
	add	A2 = A,LDA
kusano 2b45e8
	mov	pr.rot = 0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	adds	A5 = 4 * SIZE, A
kusano 2b45e8
	adds	B1 = 4 * SIZE, B
kusano 2b45e8
	/* p8 = bit 1 of N is clear: no 2-column panel.                 */
	tbit.z	p8, p0 = N, 1
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	/* p16 = 1 and ar.ec = 3 arm the pipelined loop at .L21.        */
	cmp.eq	p16,p0 = r0,r0
kusano 2b45e8
	adds	PREA = PREFETCHSIZE * SIZE, A
kusano 2b45e8
	mov	ar.ec = 3
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mib
kusano 2b45e8
	adds	PREB = WPREFETCHSIZE * SIZE,B
kusano 2b45e8
	shr	I = M, 2
kusano 2b45e8
	(p8)	br.cond.dpnt.few .L30
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	shladd	A = LDA, 1, A
kusano 2b45e8
	/* p6 tests I before the decrement issued in this same group.   */
	cmp.eq	p6, p0 = 0, I
kusano 2b45e8
	adds	I = -1, I
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mib
kusano 2b45e8
	adds	A6 = 4 * SIZE, A2
kusano 2b45e8
	mov	ar.lc = I
kusano 2b45e8
	/* No full block of four rows: only the remainder is copied.    */
	(p6)	br.cond.dpnt.few .L25
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 32
kusano 2b45e8
kusano 2b45e8
/* .L21: pipelined copy of four rows from each of two columns.         */
/* Same scheme as the 4-column loop: loads under p16, stores under     */
/* p18 from the register two above the load target.                    */
/* Per pass A1/A5 and A2/A6 each read 8 scalars; B and B1 each write   */
/* 8 scalars and advance by 16 * SIZE.  B receives rows 0 and 2        */
/* (column A1 then A2), B1 receives rows 1 and 3.                      */
/*                                                                     */
/* TEMP is read (adds) before it is written (shladd at the bottom) on  */
/* the first pass; that value is only consumed under p7, which is      */
/* false while COUNT is even, and COUNT is zero on entry.              */
/* NOTE(review): PREA advances by LDA once per pass here, yet the p7   */
/* rewind still subtracts 4 * LDA - 16 * SIZE as in the 4-column loop. */
/* PREA feeds only lfetch, so copied data is unaffected -- confirm     */
/* whether the prefetch stride is intended.                            */
.L21:
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	lfetch.nt1	[PREA],LDA
kusano 2b45e8
	(p16)	lfetch.excl.nt1	[PREB ],16 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* Stores: rows 0 (B) and 1 (B1).  Loads: column A1.            */
	{ .mmb
kusano 2b45e8
	(p18)	ST	[B ] = f34, SIZE
kusano 2b45e8
	(p18)	ST	[B1] = f46, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f32 = [A1], SIZE
kusano 2b45e8
	(p16)	LD	f35 = [A5], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p18)	ST	[B ] = f40, SIZE
kusano 2b45e8
	(p18)	ST	[B1] = f52, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f38 = [A1], SIZE
kusano 2b45e8
	(p16)	LD	f41 = [A5], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p18)	ST	[B ] = f58, SIZE
kusano 2b45e8
	(p18)	ST	[B1] = f70, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f44 = [A1], SIZE
kusano 2b45e8
	(p16)	LD	f47 = [A5], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p18)	ST	[B ] = f64, 5 * SIZE
kusano 2b45e8
	(p18)	ST	[B1] = f76, 5 * SIZE
kusano 2b45e8
	/* p7 = low bit of COUNT is set.                                */
	tbit.z	p0,p7 = COUNT,0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f50 = [A1], 5 * SIZE
kusano 2b45e8
	(p16)	LD	f53 = [A5], 5 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* Stores: rows 2 (B) and 3 (B1).  Loads: column A2.            */
	{ .mmi
kusano 2b45e8
	(p18)	ST	[B ] = f37, SIZE
kusano 2b45e8
	(p18)	ST	[B1] = f49, SIZE
kusano 2b45e8
	/* Uses the TEMP = 4 * LDA left by the previous pass.           */
	adds	TEMP = -16 * SIZE,TEMP
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f56 = [A2], SIZE
kusano 2b45e8
	(p16)	LD	f59 = [A6], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p18)	ST	[B ] = f43, SIZE
kusano 2b45e8
	(p18)	ST	[B1] = f55, SIZE
kusano 2b45e8
	(p7)	sub	PREA = PREA,TEMP
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f62 = [A2], SIZE
kusano 2b45e8
	(p16)	LD	f65 = [A6], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p18)	ST	[B ] = f61, SIZE
kusano 2b45e8
	(p18)	ST	[B1] = f73, SIZE
kusano 2b45e8
	/* Counted only while the load stage is active.                 */
	(p16)	adds	COUNT = 1,COUNT
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f68 = [A2], SIZE
kusano 2b45e8
	(p16)	LD	f71 = [A6], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p18)	ST	[B ] = f67, 5 * SIZE
kusano 2b45e8
	(p18)	ST	[B1] = f79, 5 * SIZE
kusano 2b45e8
	/* TEMP = 4 * LDA, consumed by the next pass.                   */
	shladd	TEMP = LDA,2,r0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f74 = [A2], 5 * SIZE
kusano 2b45e8
	(p16)	LD	f77 = [A6], 5 * SIZE
kusano 2b45e8
	br.ctop.sptk.few .L21
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 32
kusano 2b45e8
kusano 2b45e8
/* .L25: row remainder of a 2-column panel.                            */
/*   p10 (bit 1 of M): two more rows -- B gets the first row (column   */
/*        A1 then A2), B1 the second; both then advance by 8 * SIZE.   */
/*   p11 (bit 0 of M): one more row, written through B only            */
/*        (4 scalars, column A1 then A2).                              */
/* Falls through into .L30.                                            */
.L25:
kusano 2b45e8
	/* p10 loads: f32..f35 <- A1, f40..f43 <- A2.                   */
	{ .mmb
kusano 2b45e8
	(p10)	LD	f32 = [A1], SIZE
kusano 2b45e8
	(p10)	LD	f40 = [A2], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	LD	f33 = [A1], SIZE
kusano 2b45e8
	(p10)	LD	f41 = [A2], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	LD	f34 = [A1], SIZE
kusano 2b45e8
	(p10)	LD	f42 = [A2], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	LD	f35 = [A1], SIZE
kusano 2b45e8
	(p10)	LD	f43 = [A2], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* p11 loads: f36, f37 <- A1 and f44, f45 <- A2.                */
	{ .mmb
kusano 2b45e8
	(p11)	LD	f36 = [A1], SIZE
kusano 2b45e8
	(p11)	LD	f44 = [A2], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p11)	LD	f37 = [A1]
kusano 2b45e8
	(p11)	LD	f45 = [A2]
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* p10 stores: first row through B, second row through B1.      */
	{ .mmb
kusano 2b45e8
	(p10)	ST	[B ] = f32, SIZE
kusano 2b45e8
	(p10)	ST	[B1] = f34, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	ST	[B ] = f33, SIZE
kusano 2b45e8
	(p10)	ST	[B1] = f35, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	ST	[B ] = f40, SIZE
kusano 2b45e8
	(p10)	ST	[B1] = f42, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	ST	[B ] = f41, 5 * SIZE
kusano 2b45e8
	(p10)	ST	[B1] = f43, 5 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* p11 stores: sequential through B; B1 is not used here.       */
	{ .mmi
kusano 2b45e8
	(p11)	ST	[B ] = f36, SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p11)	ST	[B ] = f37, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p11)	ST	[B ] = f44, SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p11)	ST	[B ] = f45, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 32
kusano 2b45e8
kusano 2b45e8
/* .L30: set-up for the last single column, taken only when bit 0 of   */
/* N is set; otherwise control jumps to .L999.                         */
/*   A1 = A, A5 = A + 4 * SIZE, B1 = B + 4 * SIZE, I = M >> 2          */
/* COUNT is cleared and the pipelined loop at .L31 is armed            */
/* (p16 = 1, ar.ec = 3, ar.lc = I - 1).                                */
.L30:
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	mov	A1 = A
kusano 2b45e8
	mov	COUNT = r0
kusano 2b45e8
	mov	pr.rot = 0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	adds	A5 = 4 * SIZE,A
kusano 2b45e8
	adds	B1 = 4 * SIZE,B
kusano 2b45e8
	/* p8 = bit 0 of N is clear: no single column left.             */
	tbit.z	p8,p0 = N,0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	cmp.eq	p16,p0 = r0,r0
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	mov	ar.ec = 3
kusano 2b45e8
	}
kusano 2b45e8
	{ .mib
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	shr	I = M,2
kusano 2b45e8
	(p8)	br.cond.dptk.few .L999
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	/* p6 tests I before the decrement issued in this same group.   */
	cmp.eq	p6 ,p0 = 0, I
kusano 2b45e8
	adds	PREA = PREFETCHSIZE * SIZE, A
kusano 2b45e8
	adds	I = -1, I
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mib
kusano 2b45e8
	adds	PREB = WPREFETCHSIZE * SIZE, B
kusano 2b45e8
	mov	ar.lc = I
kusano 2b45e8
	/* No full block of four rows: only the remainder is copied.    */
	(p6)	br.cond.dpnt.few .L35
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 32
kusano 2b45e8
kusano 2b45e8
/* .L31: pipelined copy of four rows from a single column.             */
/* Loads under p16 via A1 (rows 0-1) and A5 (rows 2-3); stores under   */
/* p18 from the register two above the load target.  B writes what     */
/* A1 read and B1 what A5 read; each of the four pointers advances by  */
/* 8 * SIZE per pass, so the output keeps the input order.             */
/* NOTE(review): PREA advances by LDA once per pass and the p7 rewind  */
/* subtracts 4 * LDA - 16 * SIZE, mirroring the 4-column loop.  PREA   */
/* feeds only lfetch, so copied data is unaffected -- confirm whether  */
/* this prefetch pattern is intended for a single column.              */
.L31:
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p16)	lfetch.nt1	[PREA], LDA
kusano 2b45e8
	(p16)	lfetch.excl.nt1	[PREB ], 16 * SIZE
kusano 2b45e8
	/* p7 = low bit of COUNT is set.                                */
	tbit.z	p0, p7 = COUNT, 0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p18)	ST	[B ] = f34, SIZE
kusano 2b45e8
	(p18)	ST	[B1] = f37, SIZE
kusano 2b45e8
	/* TEMP = 4 * LDA; reduced by 16 * SIZE in the next group.      */
	shladd	TEMP = LDA,2,r0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f32 = [A1], SIZE
kusano 2b45e8
	(p16)	LD	f35 = [A5], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p18)	ST	[B ] = f40, SIZE
kusano 2b45e8
	(p18)	ST	[B1] = f43, SIZE
kusano 2b45e8
	adds	TEMP = -16 * SIZE,TEMP
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f38 = [A1], SIZE
kusano 2b45e8
	(p16)	LD	f41 = [A5], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p18)	ST	[B ] = f46, SIZE
kusano 2b45e8
	(p18)	ST	[B1] = f49, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p16)	LD	f44 = [A1], SIZE
kusano 2b45e8
	(p16)	LD	f47 = [A5], SIZE
kusano 2b45e8
	/* Odd COUNT only: rewind the source prefetch pointer.          */
	(p7)	sub	PREA = PREA,TEMP
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p18)	ST	[B ] = f52, 5 * SIZE
kusano 2b45e8
	(p18)	ST	[B1] = f55, 5 * SIZE
kusano 2b45e8
	/* Counted only while the load stage is active.                 */
	(p16)	adds	COUNT = 1,COUNT
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f50 = [A1], 5 * SIZE
kusano 2b45e8
	(p16)	LD	f53 = [A5], 5 * SIZE
kusano 2b45e8
	br.ctop.sptk.few .L31
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 32
kusano 2b45e8
kusano 2b45e8
/* .L35: row remainder of the single column.                           */
/*   p10 (bit 1 of M): copy 4 scalars (f32..f35) from A1 to B.         */
/*   p11 (bit 0 of M): copy 2 scalars (f36, f37) from A1 to B.         */
/* Only A1 and B are used; every access is sequential, with a stop     */
/* between the two dependent post-increment accesses of each bundle.   */
/* Falls through into .L999.                                           */
.L35:
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p10)	LD	f32 = [A1], SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p10)	LD	f33 = [A1], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p10)	LD	f34 = [A1], SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p10)	LD	f35 = [A1], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p11)	LD	f36 = [A1], SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p11)	LD	f37 = [A1]
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p10)	ST	[B ] = f32, SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p10)	ST	[B ] = f33, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p10)	ST	[B ] = f34, SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p10)	ST	[B ] = f35, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p11)	ST	[B ] = f36, SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p11)	ST	[B ] = f37, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 32
kusano 2b45e8
kusano 2b45e8
/* .L999: common exit.  Writes back the predicate register (mask -1,   */
/* i.e. every predicate) and ar.lc from the copies taken on entry,     */
/* then returns through b0.                                            */
.L999:
kusano 2b45e8
	mov	pr = PR,-1
kusano 2b45e8
	mov	ar.lc = ARLC
kusano 2b45e8
	br.ret.sptk.many b0
kusano 2b45e8
	;;
kusano 2b45e8
	EPILOGUE
kusano 2b45e8