Blame thirdparty/openblas/xianyi-OpenBLAS-e6e87a2/kernel/ia64/zgemm_tcopy.S

kusano 2b45e8
/*********************************************************************/
kusano 2b45e8
/* Copyright 2009, 2010 The University of Texas at Austin.           */
kusano 2b45e8
/* All rights reserved.                                              */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
/* Redistribution and use in source and binary forms, with or        */
kusano 2b45e8
/* without modification, are permitted provided that the following   */
kusano 2b45e8
/* conditions are met:                                               */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
/*   1. Redistributions of source code must retain the above         */
kusano 2b45e8
/*      copyright notice, this list of conditions and the following  */
kusano 2b45e8
/*      disclaimer.                                                  */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
/*   2. Redistributions in binary form must reproduce the above      */
kusano 2b45e8
/*      copyright notice, this list of conditions and the following  */
kusano 2b45e8
/*      disclaimer in the documentation and/or other materials       */
kusano 2b45e8
/*      provided with the distribution.                              */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
kusano 2b45e8
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
kusano 2b45e8
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
kusano 2b45e8
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
kusano 2b45e8
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
kusano 2b45e8
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
kusano 2b45e8
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
kusano 2b45e8
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
kusano 2b45e8
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
kusano 2b45e8
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
kusano 2b45e8
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
kusano 2b45e8
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
kusano 2b45e8
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
kusano 2b45e8
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
/* The views and conclusions contained in the software and           */
kusano 2b45e8
/* documentation are those of the authors and should not be          */
kusano 2b45e8
/* interpreted as representing official policies, either expressed   */
kusano 2b45e8
/* or implied, of The University of Texas at Austin.                 */
kusano 2b45e8
/*********************************************************************/
kusano 2b45e8
kusano 2b45e8
/* Build configuration and symbolic register names for this routine.   */
/* SIZE, BASE_SHIFT, ZBASE_SHIFT, LDF8, STF8_NTA, PROLOGUE, PROFCODE   */
/* and EPILOGUE are not defined in this file; presumably they come     */
/* from common.h - TODO confirm.                                       */
#define ASSEMBLER
kusano 2b45e8
#include "common.h"
kusano 2b45e8
kusano 2b45e8
/* Read-prefetch distance: PREA starts PREFETCHSIZE * SIZE past A1.    */
#define PREFETCHSIZE   24
kusano 2b45e8
/* Write-prefetch distance: PREB starts WPREFETCHSIZE * SIZE past B.   */
#define WPREFETCHSIZE  48
kusano 2b45e8
kusano 2b45e8
/* Load/store macros used for every copied value. Presumably an        */
/* 8-byte FP load and a non-temporal 8-byte FP store - TODO confirm.   */
#define LD	LDF8
kusano 2b45e8
#define ST	STF8_NTA
kusano 2b45e8
kusano 2b45e8
/* Addresses handed to lfetch (source side) and lfetch.excl (dest).    */
#define PREA	r2
kusano 2b45e8
#define PREB	r3
kusano 2b45e8
kusano 2b45e8
/* I: inner trip count (N >> 2, minus one when written to ar.lc).      */
/* J: outer trip count (M >> 2), decremented once per .L11 pass.       */
#define I	r14
kusano 2b45e8
#define J	r15
kusano 2b45e8
kusano 2b45e8
/* A1..A4: read pointers A, A+LDA, A+2*LDA, A+3*LDA.                   */
/* A5..A8: the same four pointers advanced by 4 * SIZE.                */
#define A1	r16
kusano 2b45e8
#define A2	r17
kusano 2b45e8
#define A3	r18
kusano 2b45e8
#define A4	r19
kusano 2b45e8
#define A5	r20
kusano 2b45e8
#define A6	r21
kusano 2b45e8
#define A7	r22
kusano 2b45e8
#define A8	r23
kusano 2b45e8
/* B1/B2: write pointers; B2 is kept 4 * SIZE ahead of its partner.    */
#define B1	r24
kusano 2b45e8
#define B2	r25
kusano 2b45e8
kusano 2b45e8
/* COUNT: iteration counter whose bit 0 decides when PREA is rewound.  */
/* TEMP:  scratch holding the PREA rewind amount.                      */
#define COUNT	r26
kusano 2b45e8
#define TEMP	r27
kusano 2b45e8
kusano 2b45e8
/* BO2: destination of the tail written when bit 1 of N is set.        */
/* BO3: destination of the tail written when bit 0 of N is set.        */
/* LDB: byte step added to B1/B2 after each pipelined iteration        */
/*      (M << (BASE_SHIFT + 3)). Shares r8 with an early temporary.    */
#define BO2	r28
kusano 2b45e8
#define BO3	r29
kusano 2b45e8
#define LDB	r8
kusano 2b45e8
kusano 2b45e8
/* Saved copies of ar.lc and pr, restored at .L999.                    */
#define ARLC	r30
kusano 2b45e8
#define PR	r31
kusano 2b45e8
kusano 2b45e8
/* Values read from r32..r36; presumably the incoming arguments in     */
/* this order - TODO confirm against the C prototype.                  */
#define M	r32
kusano 2b45e8
#define N	r33
kusano 2b45e8
#define A	r34
kusano 2b45e8
#define LDA	r35
kusano 2b45e8
#define B	r36
kusano 2b45e8
kusano 2b45e8
	/* Entry and one-time setup.                                       */
	/*   - saves ar.lc and pr (restored at .L999)                      */
	/*   - LDA is scaled in place by 1 << ZBASE_SHIFT                  */
	/*   - LDB = M << (BASE_SHIFT + 3)                                 */
	/*   - BO2 = B + ((M * (N & -4)) << ZBASE_SHIFT)                   */
	/*   - BO3 = B + ((M * (N & -2)) << ZBASE_SHIFT)                   */
	/*   - p10 = bit 1 of N, p11 = bit 0 of N (tail selectors)         */
	/*   - J = M >> 2; when J == 0 control goes straight to .L20       */
	/*   - COUNT = 0 so that the prefetch-rewind parity test in the    */
	/*     pipelined loops starts from a known value on the very       */
	/*     first pass (it is otherwise only cleared after a tail).     */
	PROLOGUE
kusano 2b45e8
	.prologue
kusano 2b45e8
	PROFCODE
kusano 2b45e8
kusano 2b45e8
	.body
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	/* f32 <- M for the integer multiplies below; r8 = N & -4. */
	setf.sig f32 = M
kusano 2b45e8
	and	r8  = -4, N
kusano 2b45e8
	mov	ARLC  = ar.lc
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	setf.sig f33  = r8
kusano 2b45e8
	and	r9  = -2, N
kusano 2b45e8
	mov	PR = pr
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	setf.sig f34  = r9
kusano 2b45e8
	shladd	LDA = LDA, ZBASE_SHIFT, r0
kusano 2b45e8
	/* r8 is reused here: from now on it holds LDB. */
	shl	LDB = M, BASE_SHIFT + 3
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	nop	 __LINE__
kusano 2b45e8
	/* f33 = M * (N & -4) */
	xmpy.l	f33  = f32, f33
kusano 2b45e8
	shr	J = M, 2
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	nop	 __LINE__
kusano 2b45e8
	/* f34 = M * (N & -2) */
	xmpy.l	f34  = f32, f34
kusano 2b45e8
	/* Was a nop: give COUNT a defined start value (prefetch only). */
	mov	COUNT = r0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	getf.sig BO2 = f33
kusano 2b45e8
	getf.sig BO3 = f34
kusano 2b45e8
	nop	 __LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	/* Turn the two element counts into addresses inside B. */
	shladd	BO2 = BO2, ZBASE_SHIFT, B
kusano 2b45e8
	shladd	BO3 = BO3, ZBASE_SHIFT, B
kusano 2b45e8
	tbit.nz p10, p0 =N, 1
kusano 2b45e8
	}
kusano 2b45e8
	{ .mib
kusano 2b45e8
	cmp.eq	p6, p0 = 0, J
kusano 2b45e8
	tbit.nz p11, p0 =N, 0
kusano 2b45e8
	/* No full group of four lines: skip the .L11 outer loop. */
	(p6)	br.cond.dpnt .L20
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 32
kusano 2b45e8
kusano 2b45e8
/* .L11: head of the outer loop; one pass handles four lines of A      */
/* (A, A+LDA, A+2*LDA, A+3*LDA). Sets up the eight read pointers, the  */
/* two write pointers and both prefetch pointers, arms the pipelined   */
/* loop (p16 set, ar.ec = 3, ar.lc = (N >> 2) - 1), then advances A by */
/* 4*LDA and B by 32 * SIZE for the next pass. J is decremented here.  */
/* If N >> 2 is zero the pipelined loop is skipped (branch to .L15).   */
.L11:
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	mov	A1 = A
kusano 2b45e8
	add	A2 = A, LDA
kusano 2b45e8
	/* Clear all rotating predicates before re-arming the pipeline. */
	mov	pr.rot = 0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	shladd A3 = LDA, 1, A
kusano 2b45e8
	mov    B1 = B
kusano 2b45e8
	shr    I  = N, 2
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	shladd	A4 = LDA, 1, A2
kusano 2b45e8
	/* p16 = 1: enable the first (load) stage. */
	cmp.eq	p16,p0 = r0, r0
kusano 2b45e8
	mov	ar.ec = 3
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	/* p6 is computed from I before the decrement in this group. */
	cmp.eq	p6,p0 = 0,I
kusano 2b45e8
	adds	I =-1, I
kusano 2b45e8
	adds	J =-1, J
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	shladd	A = LDA, 2, A
kusano 2b45e8
	adds	A5 = 4 * SIZE, A1
kusano 2b45e8
	adds	A6 = 4 * SIZE, A2
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	adds	A7 = 4 * SIZE, A3
kusano 2b45e8
	adds	A8 = 4 * SIZE, A4
kusano 2b45e8
	adds	PREA = PREFETCHSIZE * SIZE,A1
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	adds   B2 = 4 * SIZE, B
kusano 2b45e8
	adds	PREB = WPREFETCHSIZE * SIZE, B
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mib
kusano 2b45e8
	/* B2/PREB above use the old B; B itself moves on by 32 values. */
	adds   B  = 32 * SIZE, B
kusano 2b45e8
	mov	ar.lc = I
kusano 2b45e8
	(p6) br.cond.dpnt.few .L15
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
kusano 2b45e8
/* .L12: software-pipelined copy loop for four lines of A.             */
/* Stage p16 loads 32 values per iteration: four each through          */
/* A1/A5, A2/A6, A3/A7 and A4/A8; every pointer ends up 8 * SIZE       */
/* further on. Stage p18 stores the values loaded two iterations       */
/* earlier (register rotation makes a value loaded into fN appear as   */
/* fN+2) to 32 consecutive slots starting at B1, with B2 covering the  */
/* slots 4 * SIZE ahead; the final -27 * SIZE post-increment brings    */
/* B1/B2 back to their start before LDB is added.                      */
/* Prefetch: two lfetch pairs per iteration move PREA by LDA and PREB  */
/* by LDB each. When bit 0 of COUNT is set (p7), PREA is pulled back   */
/* by TEMP = 4*LDA - 16 * SIZE. COUNT is incremented under p16.        */
.L12:
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	lfetch.nt1	[PREA], LDA
kusano 2b45e8
	(p16)	lfetch.excl.nt1	[PREB], LDB
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* Line 1: stores of A1/A5 data, loads through A1/A5. */
	{ .mmb
kusano 2b45e8
	(p18)	ST	[B1] = f34, SIZE
kusano 2b45e8
	(p18)	ST	[B2] = f37, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f32 = [A1], SIZE
kusano 2b45e8
	(p16)	LD	f35 = [A5], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p18)	ST	[B1] = f40, SIZE
kusano 2b45e8
	(p18)	ST	[B2] = f43, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f38 = [A1], SIZE
kusano 2b45e8
	(p16)	LD	f41 = [A5], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p18)	ST	[B1] = f46,  SIZE
kusano 2b45e8
	(p18)	ST	[B2] = f49,  SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f44 = [A1], SIZE
kusano 2b45e8
	(p16)	LD	f47 = [A5], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	/* 5 * SIZE skips the four slots written through the partner. */
	(p18)	ST	[B1] = f52,  5 * SIZE
kusano 2b45e8
	(p18)	ST	[B2] = f55,  5 * SIZE
kusano 2b45e8
	/* p7 = bit 0 of COUNT (set on odd counts). */
	tbit.z	p0,p7 = COUNT,0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f50 = [A1], 5 * SIZE
kusano 2b45e8
	(p16)	LD	f53 = [A5], 5 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* Line 2: stores of A2/A6 data, loads through A2/A6. */
	{ .mmb
kusano 2b45e8
	(p18)	ST	[B1] = f58, SIZE
kusano 2b45e8
	(p18)	ST	[B2] = f61, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f56 = [A2], SIZE
kusano 2b45e8
	(p16)	LD	f59 = [A6], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p18)	ST	[B1] = f64, SIZE
kusano 2b45e8
	(p18)	ST	[B2] = f67, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f62 = [A2], SIZE
kusano 2b45e8
	(p16)	LD	f65 = [A6], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p18)	ST	[B1] = f70, SIZE
kusano 2b45e8
	(p18)	ST	[B2] = f73, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f68 = [A2], SIZE
kusano 2b45e8
	(p16)	LD	f71 = [A6], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p18)	ST	[B1]  = f76, 5 * SIZE
kusano 2b45e8
	(p18)	ST	[B2]  = f79, 5 * SIZE
kusano 2b45e8
	/* TEMP = 4 * LDA (first half of the rewind amount). */
	shladd	TEMP = LDA, 2, r0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f74 = [A2], 5 * SIZE
kusano 2b45e8
	(p16)	LD	f77 = [A6], 5 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* Line 3 stores start here; second prefetch pair of the iteration. */
	{ .mmb
kusano 2b45e8
	(p18)	ST	[B1] = f82, SIZE
kusano 2b45e8
	(p18)	ST	[B2] = f85, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	lfetch.nt1	[PREA], LDA
kusano 2b45e8
	(p16)	lfetch.excl.nt1	[PREB], LDB
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p18)	ST	[B1] = f88, SIZE
kusano 2b45e8
	(p18)	ST	[B2] = f91, SIZE
kusano 2b45e8
	/* TEMP = 4 * LDA - 16 * SIZE. */
	adds	TEMP = -16 * SIZE, TEMP
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f80 = [A3], SIZE
kusano 2b45e8
	(p16)	LD	f83 = [A7], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p18)	ST	[B1] = f94, SIZE
kusano 2b45e8
	(p18)	ST	[B2] = f97, SIZE
kusano 2b45e8
	/* On odd COUNT pull the read-prefetch pointer back by TEMP. */
	(p7)	sub	PREA = PREA, TEMP
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f86 = [A3], SIZE
kusano 2b45e8
	(p16)	LD	f89 = [A7], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p18)	ST	[B1] = f100, 5 * SIZE
kusano 2b45e8
	(p18)	ST	[B2] = f103, 5 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f92 = [A3], SIZE
kusano 2b45e8
	(p16)	LD	f95 = [A7], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* Line 4 stores start here. */
	{ .mmb
kusano 2b45e8
	(p18)	ST	[B1] = f106, SIZE
kusano 2b45e8
	(p18)	ST	[B2] = f109, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f98  = [A3], 5 * SIZE
kusano 2b45e8
	(p16)	LD	f101 = [A7], 5 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p18)	ST	[B1] = f112, SIZE
kusano 2b45e8
	(p18)	ST	[B2] = f115, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f104 = [A4], SIZE
kusano 2b45e8
	(p16)	LD	f107 = [A8], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p18)	ST	[B1] = f118, SIZE
kusano 2b45e8
	(p18)	ST	[B2] = f121, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f110 = [A4], SIZE
kusano 2b45e8
	(p16)	LD	f113 = [A8], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	/* Last store of the 32: -27 * SIZE returns B1/B2 to their start. */
	(p18)	ST	[B1] = f124, -27 * SIZE
kusano 2b45e8
	(p18)	ST	[B2] = f127, -27 * SIZE
kusano 2b45e8
	(p16)	adds	COUNT =  1, COUNT
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f116 = [A4], SIZE
kusano 2b45e8
	(p16)	LD	f119 = [A8], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	/* Step both write pointers by LDB for the next stored group. */
	(p18) add	B1 = B1, LDB
kusano 2b45e8
	(p18) add	B2 = B2, LDB
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f122 = [A4], 5 * SIZE
kusano 2b45e8
	(p16)	LD	f125 = [A8], 5 * SIZE
kusano 2b45e8
	/* Counted, rotating loop branch (uses ar.lc and ar.ec). */
	br.ctop.sptk.few .L12
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 32
kusano 2b45e8
kusano 2b45e8
/* .L15: tail of a four-line pass, then the outer-loop branch.         */
/* Under p10 (bit 1 of N) four more values are read from each of       */
/* A1..A4 and written as 16 consecutive values at BO2, which ends up   */
/* 16 * SIZE further on. Under p11 (bit 0 of N) two more values are    */
/* read from each of A1..A4 and written as 8 consecutive values at     */
/* BO3, which ends up 8 * SIZE further on. B2 is reused as the         */
/* second write pointer (4 * SIZE ahead of BO2, later of BO3).         */
/* COUNT is reset to zero, and control returns to .L11 while J != 0.   */
.L15:
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	LD	f32 = [A1], SIZE
kusano 2b45e8
	(p10)	LD	f40 = [A2], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	LD	f33 = [A1], SIZE
kusano 2b45e8
	(p10)	LD	f41 = [A2], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	LD	f34 = [A1], SIZE
kusano 2b45e8
	(p10)	LD	f42 = [A2], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	LD	f35 = [A1], SIZE
kusano 2b45e8
	(p10)	LD	f43 = [A2], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	LD	f50 = [A3], SIZE
kusano 2b45e8
	(p10)	LD	f60 = [A4], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	LD	f51 = [A3], SIZE
kusano 2b45e8
	(p10)	LD	f61 = [A4], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	LD	f52 = [A3], SIZE
kusano 2b45e8
	(p10)	LD	f62 = [A4], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	LD	f53 = [A3], SIZE
kusano 2b45e8
	(p10)	LD	f63 = [A4], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* p11 loads: two values per line, continuing from where p10 left. */
	{ .mmb
kusano 2b45e8
	(p11)	LD	f36 = [A1], SIZE
kusano 2b45e8
	(p11)	LD	f44 = [A2], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p11)	LD	f37 = [A1]
kusano 2b45e8
	(p11)	LD	f45 = [A2]
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p11)	LD	f54 = [A3], SIZE
kusano 2b45e8
	(p11)	LD	f64 = [A4], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p11)	LD	f55 = [A3]
kusano 2b45e8
	(p11)	LD	f65 = [A4]
kusano 2b45e8
	/* Second write pointer for the BO2 block. */
	adds	B2 = 4 * SIZE, BO2
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* p10 stores: A1 data via BO2, A2 data via B2, then A3 / A4. */
	{ .mmb
kusano 2b45e8
	(p10)	ST	[BO2] = f32, SIZE
kusano 2b45e8
	(p10)	ST	[B2]  = f40, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	ST	[BO2] = f33, SIZE
kusano 2b45e8
	(p10)	ST	[B2]  = f41, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	ST	[BO2] = f34, SIZE
kusano 2b45e8
	(p10)	ST	[B2]  = f42, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	ST	[BO2] = f35, 5 * SIZE
kusano 2b45e8
	(p10)	ST	[B2]  = f43, 5 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	ST	[BO2] = f50, SIZE
kusano 2b45e8
	(p10)	ST	[B2]  = f60, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	ST	[BO2] = f51, SIZE
kusano 2b45e8
	(p10)	ST	[B2]  = f61, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	ST	[BO2] = f52, SIZE
kusano 2b45e8
	(p10)	ST	[B2]  = f62, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p10)	ST	[BO2] = f53, 5 * SIZE
kusano 2b45e8
	/* No post-increment: B2 is re-pointed in this same group. */
	(p10)	ST	[B2]  = f63
kusano 2b45e8
	adds	B2 = 4 * SIZE, BO3
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* p11 stores: A1 then A2 data via BO3, A3 then A4 data via B2. */
	{ .mmb
kusano 2b45e8
	(p11)	ST	[BO3] = f36, SIZE
kusano 2b45e8
	(p11)	ST	[B2] = f54, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p11)	ST	[BO3] = f37, SIZE
kusano 2b45e8
	(p11)	ST	[B2] = f55, SIZE
kusano 2b45e8
	/* Restart the prefetch parity counter for the next pass. */
	mov	COUNT = r0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p11)	ST	[BO3] = f44, SIZE
kusano 2b45e8
	(p11)	ST	[B2] = f64, SIZE
kusano 2b45e8
	/* p6 = (J != 0): more four-line groups remain. */
	cmp.eq	p0,p6 = 0,J
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p11)	ST	[BO3] = f45, 5 * SIZE
kusano 2b45e8
	(p11)	ST	[B2] = f65, 5 * SIZE
kusano 2b45e8
	(p6)	br.cond.dptk.few .L11
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 32
kusano 2b45e8
kusano 2b45e8
/* .L20: setup for a two-line pass, taken only when bit 1 of M is set  */
/* (otherwise branch to .L30). Prepares A1/A2 and A5/A6 (4 * SIZE      */
/* ahead), B1/B2, both prefetch pointers and the pipeline state        */
/* (p16 set, ar.ec = 3, ar.lc = (N >> 2) - 1), then advances A by      */
/* 2*LDA and B by 16 * SIZE. If N >> 2 is zero the pipelined loop is   */
/* skipped (branch to .L25).                                           */
.L20:
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	mov	A1 = A
kusano 2b45e8
	add	A2 = A, LDA
kusano 2b45e8
	mov	pr.rot = 0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	mov    B1 = B
kusano 2b45e8
	adds	PREA = PREFETCHSIZE * SIZE,A
kusano 2b45e8
	/* p6 = 1 when bit 1 of M is clear: no two-line pass needed. */
	tbit.z	p6, p0 = M, 1
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	cmp.eq	p16,p0 = r0, r0
kusano 2b45e8
	adds   B2 = 4 * SIZE, B
kusano 2b45e8
	mov	ar.ec = 3
kusano 2b45e8
	}
kusano 2b45e8
	{ .mib
kusano 2b45e8
	adds	PREB = WPREFETCHSIZE * SIZE, B
kusano 2b45e8
	shr    I  = N, 2
kusano 2b45e8
	(p6)	br.cond.dpnt .L30
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	/* p6 now means "N >> 2 is zero" (tested before the decrement). */
	cmp.eq	p6, p0 = 0, I
kusano 2b45e8
	adds	I =-1, I
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	shladd	A = LDA, 1, A
kusano 2b45e8
	adds	A5 = 4 * SIZE, A1
kusano 2b45e8
	adds	A6 = 4 * SIZE, A2
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mib
kusano 2b45e8
	adds   B  = 16 * SIZE, B
kusano 2b45e8
	mov	ar.lc = I
kusano 2b45e8
	(p6) br.cond.dpnt.few .L25
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
kusano 2b45e8
/* .L22: software-pipelined copy loop for two lines of A.              */
/* Stage p16 loads 16 values per iteration (four each through A1, A5,  */
/* A2, A6; every pointer ends up 8 * SIZE further on). Stage p18       */
/* stores the values loaded two iterations earlier to 16 consecutive   */
/* slots starting at B1 (B2 covers the slots 4 * SIZE ahead); the      */
/* final -11 * SIZE post-increment returns B1/B2 to their start        */
/* before LDB is added.                                                */
/* Prefetch: one lfetch pair per iteration; when bit 0 of COUNT is     */
/* set (p7), PREA is pulled back by TEMP = 2*LDA - 16 * SIZE.          */
.L22:
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p16)	lfetch.nt1	[PREA], LDA
kusano 2b45e8
	(p16)	lfetch.excl.nt1	[PREB], LDB
kusano 2b45e8
	/* TEMP = 2 * LDA (first half of the rewind amount). */
	shladd	TEMP = LDA, 1, r0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* Line 1: stores of A1/A5 data, loads through A1/A5. */
	{ .mmb
kusano 2b45e8
	(p18)	ST	[B1] = f34, SIZE
kusano 2b45e8
	(p18)	ST	[B2] = f37, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f32 = [A1], SIZE
kusano 2b45e8
	(p16)	LD	f35 = [A5], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p18)	ST	[B1] = f40, SIZE
kusano 2b45e8
	(p18)	ST	[B2] = f43, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f38 = [A1], SIZE
kusano 2b45e8
	(p16)	LD	f41 = [A5], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p18)	ST	[B1] = f46,  SIZE
kusano 2b45e8
	(p18)	ST	[B2] = f49,  SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f44 = [A1], SIZE
kusano 2b45e8
	(p16)	LD	f47 = [A5], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p18)	ST	[B1] = f52,  5 * SIZE
kusano 2b45e8
	(p18)	ST	[B2] = f55,  5 * SIZE
kusano 2b45e8
	/* p7 = bit 0 of COUNT (set on odd counts). */
	tbit.z	p0,p7 = COUNT,0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f50 = [A1], 5 * SIZE
kusano 2b45e8
	(p16)	LD	f53 = [A5], 5 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* Line 2: stores of A2/A6 data, loads through A2/A6. */
	{ .mmb
kusano 2b45e8
	(p18)	ST	[B1] = f58, SIZE
kusano 2b45e8
	(p18)	ST	[B2] = f61, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f56 = [A2], SIZE
kusano 2b45e8
	(p16)	LD	f59 = [A6], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p18)	ST	[B1] = f64, SIZE
kusano 2b45e8
	(p18)	ST	[B2] = f67, SIZE
kusano 2b45e8
	/* TEMP = 2 * LDA - 16 * SIZE. */
	adds	TEMP = -16 * SIZE, TEMP
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f62 = [A2], SIZE
kusano 2b45e8
	(p16)	LD	f65 = [A6], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p18)	ST	[B1] = f70,  SIZE
kusano 2b45e8
	(p18)	ST	[B2] = f73,  SIZE
kusano 2b45e8
	/* On odd COUNT pull the read-prefetch pointer back by TEMP. */
	(p7)	sub	PREA = PREA, TEMP
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f68 = [A2], SIZE
kusano 2b45e8
	(p16)	LD	f71 = [A6], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	/* Last store of the 16: -11 * SIZE returns B1/B2 to their start. */
	(p18)	ST	[B1] = f76, -11 * SIZE
kusano 2b45e8
	(p18)	ST	[B2] = f79, -11 * SIZE
kusano 2b45e8
	(p16)	adds	COUNT =  1, COUNT
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f74 = [A2], 5 * SIZE
kusano 2b45e8
	(p16)	LD	f77 = [A6], 5 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	/* Step both write pointers by LDB for the next stored group. */
	(p18) add	B1 = B1, LDB
kusano 2b45e8
	(p18) add	B2 = B2, LDB
kusano 2b45e8
	/* Counted, rotating loop branch (uses ar.lc and ar.ec). */
	br.ctop.sptk.few .L22
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 32
kusano 2b45e8
kusano 2b45e8
/* .L25: tail of the two-line pass.                                    */
/* Under p10 (bit 1 of N) four more values are read from each of A1    */
/* and A2 and written as 8 consecutive values at BO2 (A1 data via      */
/* BO2, A2 data via B2 = BO2 + 4 * SIZE); BO2 ends up 8 * SIZE         */
/* further on. Under p11 (bit 0 of N) two more values are read from    */
/* each of A1 and A2 and written as 4 consecutive values at BO3.       */
/* COUNT is reset to zero. Execution falls through into .L30.          */
.L25:
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	LD	f32 = [A1], SIZE
kusano 2b45e8
	(p10)	LD	f40 = [A2], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	LD	f33 = [A1], SIZE
kusano 2b45e8
	(p10)	LD	f41 = [A2], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	LD	f34 = [A1], SIZE
kusano 2b45e8
	(p10)	LD	f42 = [A2], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	LD	f35 = [A1], SIZE
kusano 2b45e8
	(p10)	LD	f43 = [A2], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* p11 loads: two values per line, continuing from where p10 left. */
	{ .mmb
kusano 2b45e8
	(p11)	LD	f36 = [A1], SIZE
kusano 2b45e8
	(p11)	LD	f44 = [A2], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p11)	LD	f37 = [A1]
kusano 2b45e8
	(p11)	LD	f45 = [A2]
kusano 2b45e8
	/* Second write pointer for the BO2 block. */
	adds	B2 = 4 * SIZE, BO2
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	ST	[BO2] = f32, SIZE
kusano 2b45e8
	(p10)	ST	[B2]  = f40, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	ST	[BO2] = f33, SIZE
kusano 2b45e8
	(p10)	ST	[B2]  = f41, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	ST	[BO2] = f34, SIZE
kusano 2b45e8
	(p10)	ST	[B2]  = f42, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p10)	ST	[BO2] = f35, 5 * SIZE
kusano 2b45e8
	(p10)	ST	[B2]  = f43, 5 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* p11 stores go through BO3 only, one per instruction group. */
	{ .mmi
kusano 2b45e8
	(p11)	ST	[BO3] = f36, SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p11)	ST	[BO3] = f37, SIZE
kusano 2b45e8
	/* Restart the prefetch parity counter. */
	mov	COUNT = r0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p11)	ST	[BO3] = f44, SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p11)	ST	[BO3] = f45, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 32
kusano 2b45e8
kusano 2b45e8
/* .L30: setup for the single-line pass, taken only when bit 0 of M    */
/* is set (otherwise branch to .L999). Prepares A1 and A5 = A + 4 *    */
/* SIZE, B1 and B2 = B + 4 * SIZE, and the pipeline state (rotating    */
/* predicates cleared, p16 set, ar.ec = 3, ar.lc = (N >> 2) - 1).      */
/* If N >> 2 is zero the pipelined loop is skipped (branch to .L35).   */
/* No prefetch pointers are set up for this pass.                      */
.L30:
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	mov	A1 = A
kusano 2b45e8
	adds	A5 = 4 * SIZE, A
kusano 2b45e8
	mov	pr.rot = 0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	mov    B1 = B
kusano 2b45e8
	adds   B2 = 4 * SIZE, B
kusano 2b45e8
	/* p6 = 1 when bit 0 of M is clear: nothing left to copy. */
	tbit.z	p6, p0 = M, 0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mib
kusano 2b45e8
	cmp.eq	p16,p0 = r0, r0
kusano 2b45e8
	shr    I  = N, 2
kusano 2b45e8
	(p6)	br.cond.dpnt .L999
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	/* p6 now means "N >> 2 is zero" (tested before the decrement). */
	cmp.eq	p6, p0 = 0, I
kusano 2b45e8
	adds	I =-1, I
kusano 2b45e8
	mov	ar.ec = 3
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mib
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	mov	ar.lc = I
kusano 2b45e8
	(p6) br.cond.dpnt.few .L35
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 32
kusano 2b45e8
kusano 2b45e8
/* .L32: software-pipelined copy loop for a single line of A.          */
/* Stage p16 loads 8 values per iteration (four through A1, four       */
/* through A5; both pointers end up 8 * SIZE further on). Stage p18    */
/* stores the values loaded two iterations earlier to 8 consecutive    */
/* slots starting at B1 (B2 covers the upper four); the final          */
/* -3 * SIZE post-increment returns B1/B2 to their start before LDB    */
/* is added. This loop issues no prefetches.                           */
.L32:
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p18)	ST	[B1] = f34, SIZE
kusano 2b45e8
	(p18)	ST	[B2] = f37, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f32 = [A1], SIZE
kusano 2b45e8
	(p16)	LD	f35 = [A5], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p18)	ST	[B1] = f40, SIZE
kusano 2b45e8
	(p18)	ST	[B2] = f43, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f38 = [A1], SIZE
kusano 2b45e8
	(p16)	LD	f41 = [A5], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p18)	ST	[B1] = f46,  SIZE
kusano 2b45e8
	(p18)	ST	[B2] = f49,  SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	(p16)	LD	f44 = [A1], SIZE
kusano 2b45e8
	(p16)	LD	f47 = [A5], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	/* Last store of the 8: -3 * SIZE returns B1/B2 to their start. */
	(p18)	ST	[B1] = f52, -3 * SIZE
kusano 2b45e8
	(p18)	ST	[B2] = f55, -3 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	/* 5 * SIZE skips the four values read through the partner. */
	(p16)	LD	f50 = [A1], 5 * SIZE
kusano 2b45e8
	(p16)	LD	f53 = [A5], 5 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	/* Step both write pointers by LDB for the next stored group. */
	(p18) add	B1 = B1, LDB
kusano 2b45e8
	(p18) add	B2 = B2, LDB
kusano 2b45e8
	/* Counted, rotating loop branch (uses ar.lc and ar.ec). */
	br.ctop.sptk.few .L32
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 32
kusano 2b45e8
kusano 2b45e8
/* .L35: tail of the single-line pass.                                 */
/* Under p10 (bit 1 of N) four more values are read through A1 and     */
/* written to BO2; under p11 (bit 0 of N) two more values are read     */
/* through A1 and written to BO3. Every access uses the same pointer   */
/* as its neighbour, so each bundle is split by a stop (;;) after its  */
/* first slot. Execution falls through into .L999.                     */
.L35:
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p10)	LD	f32 = [A1], SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p10)	LD	f33 = [A1], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p10)	LD	f34 = [A1], SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p10)	LD	f35 = [A1], SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p11)	LD	f36 = [A1], SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p11)	LD	f37 = [A1]
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* Stores: four values to BO2 (p10), then two values to BO3 (p11). */
	{ .mmi
kusano 2b45e8
	(p10)	ST	[BO2] = f32, SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p10)	ST	[BO2] = f33, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p10)	ST	[BO2] = f34, SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p10)	ST	[BO2] = f35, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p11)	ST	[BO3] = f36, SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p11)	ST	[BO3] = f37, SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 32
kusano 2b45e8
kusano 2b45e8
/* .L999: common exit. Restores the predicate registers and ar.lc      */
/* from the copies taken at entry (PR, ARLC) and returns through b0.   */
.L999:
kusano 2b45e8
	/* Mask -1: every predicate bit is written back from PR. */
	mov pr    = PR, -1
kusano 2b45e8
	mov	 ar.lc = ARLC
kusano 2b45e8
	br.ret.sptk.many b0
kusano 2b45e8
	EPILOGUE