kusano 2b45e8
/*********************************************************************/
kusano 2b45e8
/* Copyright 2009, 2010 The University of Texas at Austin.           */
kusano 2b45e8
/* All rights reserved.                                              */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
/* Redistribution and use in source and binary forms, with or        */
kusano 2b45e8
/* without modification, are permitted provided that the following   */
kusano 2b45e8
/* conditions are met:                                               */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
/*   1. Redistributions of source code must retain the above         */
kusano 2b45e8
/*      copyright notice, this list of conditions and the following  */
kusano 2b45e8
/*      disclaimer.                                                  */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
/*   2. Redistributions in binary form must reproduce the above      */
kusano 2b45e8
/*      copyright notice, this list of conditions and the following  */
kusano 2b45e8
/*      disclaimer in the documentation and/or other materials       */
kusano 2b45e8
/*      provided with the distribution.                              */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
kusano 2b45e8
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
kusano 2b45e8
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
kusano 2b45e8
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
kusano 2b45e8
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
kusano 2b45e8
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
kusano 2b45e8
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
kusano 2b45e8
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
kusano 2b45e8
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
kusano 2b45e8
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
kusano 2b45e8
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
kusano 2b45e8
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
kusano 2b45e8
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
kusano 2b45e8
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
/* The views and conclusions contained in the software and           */
kusano 2b45e8
/* documentation are those of the authors and should not be          */
kusano 2b45e8
/* interpreted as representing official policies, either expressed   */
kusano 2b45e8
/* or implied, of The University of Texas at Austin.                 */
kusano 2b45e8
/*********************************************************************/
kusano 2b45e8
kusano 2b45e8
#define ASSEMBLER
kusano 2b45e8
#include "common.h"
kusano 2b45e8
kusano 2b45e8
/* P  : upper bound on MIN_M.  It is loaded into r2, compared against M - IS,
 *      and MIN_M is clamped to it, so one .LIs_loop pass works on at most
 *      P elements of X.
 * SP : stack pointer.  Stack-passed arguments are read at positive offsets
 *      from it, and it is lowered by 8 * 16 bytes for the f16-f23 spill
 *      area in the prologue. */
#define P	4096
kusano 2b45e8
#define SP	r12
kusano 2b45e8
kusano 2b45e8
/* Argument aliases.  The prologue's alloc declares 8 input registers
 * (r32-r39).
 *
 *   M, N, A, LDA, X, INCX  are used as they arrive in r32, r33, r36-r39.
 *   Y, INCY                are NOT taken from the incoming r34/r35; the
 *                          prologue overwrites those registers with values
 *                          loaded from the stack at entry SP + 16 and
 *                          entry SP + 24.
 *   BUFFER                 is loaded from the stack at entry SP + 32 into r11.
 *
 * INCX, LDA and INCY are shifted left by BASE_SHIFT right after the loads, so
 * from then on they are used as address increments (BASE_SHIFT is defined
 * outside this file -- presumably log2 of the element size; confirm in
 * common.h).  LDA is copied to f11 before that shift. */
#define M	r32
kusano 2b45e8
#define N	r33
kusano 2b45e8
#define A	r36
kusano 2b45e8
#define LDA	r37
kusano 2b45e8
#define X	r38
kusano 2b45e8
#define INCX	r39
kusano 2b45e8
#define Y	r34
kusano 2b45e8
#define INCY	r35
kusano 2b45e8
#define BUFFER	r11
kusano 2b45e8
kusano 2b45e8
/* Loop state and working pointers.
 *
 *   MIN_M     element count for the current pass: M - IS clamped to P, minus
 *             one when p8 is set (p8 = bit BASE_SHIFT of the address in A).
 *   I         trip count written to ar.lc for the software-pipelined loops;
 *             also tested bit-wise inside them to alternate prefetches.
 *   J         counter of 8-column groups (N >> 3), decremented per group.
 *   IS        starting offset of the current pass; initialised to 0.
 *   AO1..AO8  pointers to eight consecutive columns, A + k * LDA for
 *             k = 0..7.  AO1/AO2 double as store pointers into BUFFER while
 *             X is being copied, and AO1 as the store pointer into Y.
 *   BO        read pointer: X + 4 * INCX during the copy loop, BUFFER during
 *             the compute loops.
 *   LDAP      P - N * LDA, computed in the prologue from the unscaled LDA;
 *             its use is outside the visible part of this file.
 *
 * r14, r15 and r16 are also used as scratch addresses for the stack argument
 * loads in the prologue before they take on the roles above. */
#define MIN_M	r14
kusano 2b45e8
#define I	r15
kusano 2b45e8
#define J	r16
kusano 2b45e8
#define IS	r17
kusano 2b45e8
#define AO1	r18
kusano 2b45e8
#define AO2	r19
kusano 2b45e8
#define AO3	r20
kusano 2b45e8
#define AO4	r21
kusano 2b45e8
#define AO5	r22
kusano 2b45e8
#define AO6	r23
kusano 2b45e8
#define AO7	r24
kusano 2b45e8
#define AO8	r25
kusano 2b45e8
#define BO	r26
kusano 2b45e8
#define LDAP	r27
kusano 2b45e8
kusano 2b45e8
/* Aliases for local stacked registers (the prologue's alloc reserves 16).
 *
 *   RPRE1..RPRE8  read-prefetch addresses, one per column pointer AO1..AO8.
 *                 Odd-numbered ones start RPREFETCH elements ahead of their
 *                 column pointer, even-numbered ones RPREFETCH + 8 elements
 *                 ahead; each PREFETCH advances its address by 16 * SIZE.
 *   AO21..AO81    not referenced in the visible part of this file
 *                 (NOTE(review): presumably extra column pointers for the
 *                 .L100 path -- confirm against the rest of the kernel). */
#define RPRE1	loc0
kusano 2b45e8
#define RPRE2	loc1
kusano 2b45e8
#define RPRE3	loc2
kusano 2b45e8
#define RPRE4	loc3
kusano 2b45e8
#define RPRE5	loc4
kusano 2b45e8
#define RPRE6	loc5
kusano 2b45e8
#define RPRE7	loc6
kusano 2b45e8
#define RPRE8	loc7
kusano 2b45e8
kusano 2b45e8
#define AO21	loc8
kusano 2b45e8
#define AO41	loc9
kusano 2b45e8
#define AO61	loc10
kusano 2b45e8
#define AO81	loc11
kusano 2b45e8
	
kusano 2b45e8
/* Scratch pointers and saved machine state.
 *
 *   PREB    prefetch address running RPREFETCH elements ahead of BO.
 *   WPRE    address handed to lfetch.excl, a few elements past CO.
 *   OFFSET  alias of PREB, i.e. the same register r8.  In the X copy loop it
 *           holds 5 * INCX, the post-increment applied after the fourth load
 *           of each group.  The two roles never overlap in the visible code,
 *           but r8 must not be assumed to survive from one to the other.
 *   CO      read pointer walking Y with stride INCY.
 *
 *   ARLC    copy of ar.lc taken at entry.
 *   PR      copy of the predicate registers taken at entry.
 *   ARPFS   previous ar.pfs as returned by alloc.
 *
 * r8 and r9 additionally serve as the spill addresses for f16-f23 in the
 * prologue.  The code that restores ar.lc, pr and ar.pfs is outside the
 * visible part of this file. */
#define PREB	r8
kusano 2b45e8
#define WPRE	r9
kusano 2b45e8
#define OFFSET	PREB
kusano 2b45e8
#define CO	r10
kusano 2b45e8
kusano 2b45e8
#define ARLC	r29
kusano 2b45e8
#define PR	r30
kusano 2b45e8
#define ARPFS	r31
kusano 2b45e8
	
kusano 2b45e8
/* RPREFETCH : read-prefetch distance in elements (it is multiplied by SIZE
 *             wherever it is used): 56 when DOUBLE is defined, 64 otherwise.
 * PREFETCH  : the lfetch variant used for the read prefetches of the columns
 *             of A and of BUFFER.
 * ALPHA     : f6, which receives a copy of f8 in the prologue.  f8 is then
 *             reused as an accumulator, and ALPHA scales the accumulated
 *             sums when they are added into Y. */
#ifdef DOUBLE
kusano 2b45e8
#define RPREFETCH	(16 * 3 +  8)
kusano 2b45e8
#else
kusano 2b45e8
#define RPREFETCH	(16 * 3 + 16)
kusano 2b45e8
#endif
kusano 2b45e8
#define PREFETCH	lfetch.nt1
kusano 2b45e8
kusano 2b45e8
#define ALPHA	f6
kusano 2b45e8
kusano 2b45e8
	PROLOGUE
kusano 2b45e8
	.prologue
kusano 2b45e8
	PROFCODE
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	.save	ar.pfs, ARPFS
kusano 2b45e8
	alloc	ARPFS = ar.pfs, 8, 16, 8, 0
kusano 2b45e8
	setf.sig f11 = LDA
kusano 2b45e8
	mov	ARLC  = ar.lc
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	adds	r15 = 24, SP
kusano 2b45e8
	adds	r16 = 32, SP
kusano 2b45e8
	adds	r14 = 16, SP
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	setf.sig f10 = N
kusano 2b45e8
	ld8	Y      = [r14]
kusano 2b45e8
	mov	PR = pr
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	ld8	INCY   = [r15]
kusano 2b45e8
	adds	r8 = -8 * 16, SP
kusano 2b45e8
	adds	r9 = -7 * 16, SP
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	stf.spill  [r8] = f16, 32
kusano 2b45e8
	stf.spill  [r9] = f17, 32
kusano 2b45e8
	adds	SP = -8 * 16, SP
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	stf.spill  [r8] = f18, 32
kusano 2b45e8
	stf.spill  [r9] = f19, 32
kusano 2b45e8
	mov	ALPHA = f8
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	stf.spill  [r8] = f20, 32
kusano 2b45e8
	stf.spill  [r9] = f21, 32
kusano 2b45e8
	mov	IS = 0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	stf.spill  [r8] = f22
kusano 2b45e8
	stf.spill  [r9] = f23
kusano 2b45e8
	xmpy.l f10 = f10, f11
kusano 2b45e8
	}
kusano 2b45e8
	.body
kusano 2b45e8
	;;	
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	ld8	BUFFER = [r16]
kusano 2b45e8
	cmp.ge	p7, p0 = r0, M
kusano 2b45e8
	cmp.ge	p6, p0 = r0, N
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	shladd	INCX = INCX, BASE_SHIFT, r0
kusano 2b45e8
	shladd	LDA  = LDA, BASE_SHIFT, r0
kusano 2b45e8
	shladd	INCY = INCY, BASE_SHIFT, r0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	getf.sig LDAP = f10
kusano 2b45e8
	mov	r2 = P
kusano 2b45e8
	tbit.nz	p8, p0 = A,   BASE_SHIFT
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	tbit.nz	p9, p0 = LDA, BASE_SHIFT
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mbb
kusano 2b45e8
	sub	LDAP = r2, LDAP
kusano 2b45e8
	(p7) br.cond.dpnt .L999
kusano 2b45e8
	(p6) br.cond.dpnt .L999
kusano 2b45e8
	}
kusano 2b45e8
	.align 16
kusano 2b45e8
	;;
kusano 2b45e8
kusano 2b45e8
.LIs_loop:
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	sub	MIN_M = M, IS
kusano 2b45e8
	(p8) LDFD f32 = [X],  INCX
kusano 2b45e8
	mov	pr.rot= 0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	mov	AO1 = BUFFER
kusano 2b45e8
	adds	AO2 = 4 * SIZE, BUFFER
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	cmp.le	p6, p0 = r2, MIN_M
kusano 2b45e8
	;;
kusano 2b45e8
	(p6) mov MIN_M = P
kusano 2b45e8
	;;
kusano 2b45e8
	(p8) adds MIN_M = -1, MIN_M
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	shladd	OFFSET = INCX, 2, INCX
kusano 2b45e8
	shladd	BO  = INCX, 2, X
kusano 2b45e8
	shr	I = MIN_M, 3
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	adds I = -1, I
kusano 2b45e8
	cmp.eq	p16, p0 = r0, r0
kusano 2b45e8
	mov	ar.ec= 5
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	(p8) STFD [AO1] = f32, 2 * SIZE
kusano 2b45e8
	(p8) adds	AO2 = 6 * SIZE, BUFFER
kusano 2b45e8
	mov	ar.lc = I
kusano 2b45e8
	}
kusano 2b45e8
	{ .mib
kusano 2b45e8
	cmp.gt	p6, p0 = 0, I
kusano 2b45e8
	tbit.nz	p13, p0 = MIN_M, 2
kusano 2b45e8
	(p6) br.cond.dpnt .L05
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 16
kusano 2b45e8
kusano 2b45e8
.L01:
kusano 2b45e8
	(p20) STFD [AO1] = f36,  SIZE
kusano 2b45e8
	(p20) STFD [AO2] = f56,  SIZE
kusano 2b45e8
	(p16) LDFD f32 = [X],  INCX
kusano 2b45e8
	(p16) LDFD f52 = [BO], INCX
kusano 2b45e8
	;;
kusano 2b45e8
	(p20) STFD [AO1] = f41,  SIZE
kusano 2b45e8
	(p20) STFD [AO2] = f61,  SIZE
kusano 2b45e8
	(p16) LDFD f37 = [X],  INCX
kusano 2b45e8
	(p16) LDFD f57 = [BO], INCX
kusano 2b45e8
	;;
kusano 2b45e8
	(p20) STFD [AO1] = f46,  SIZE
kusano 2b45e8
	(p20) STFD [AO2] = f66,  SIZE
kusano 2b45e8
	(p16) LDFD f42 = [X],  INCX
kusano 2b45e8
	(p16) LDFD f62 = [BO], INCX
kusano 2b45e8
	;;
kusano 2b45e8
	(p20) STFD [AO1] = f51,  5 * SIZE
kusano 2b45e8
	(p20) STFD [AO2] = f71,  5 * SIZE
kusano 2b45e8
	(p16) LDFD f47 = [X],  OFFSET
kusano 2b45e8
	(p16) LDFD f67 = [BO], OFFSET
kusano 2b45e8
	br.ctop.sptk.few .L01
kusano 2b45e8
	;;
kusano 2b45e8
	.align 16
kusano 2b45e8
kusano 2b45e8
.L05:
kusano 2b45e8
	(p13) LDFD f32 = [X],  INCX
kusano 2b45e8
	tbit.nz	p14, p0 = MIN_M, 1
kusano 2b45e8
	;;
kusano 2b45e8
	(p13) LDFD f33 = [X],  INCX
kusano 2b45e8
	tbit.nz	p15, p0 = MIN_M, 0
kusano 2b45e8
	;;
kusano 2b45e8
	(p13) LDFD f34 = [X],  INCX
kusano 2b45e8
	;;
kusano 2b45e8
	(p13) LDFD f35 = [X],  INCX
kusano 2b45e8
	;;
kusano 2b45e8
	(p14) LDFD f36 = [X],  INCX
kusano 2b45e8
	;;
kusano 2b45e8
	(p13) STFD [AO1] = f32, SIZE
kusano 2b45e8
	(p14) LDFD f37 = [X],  INCX
kusano 2b45e8
	;;
kusano 2b45e8
	(p13) STFD [AO1] = f33, SIZE
kusano 2b45e8
	(p15) LDFD f38 = [X],  INCX
kusano 2b45e8
	;;
kusano 2b45e8
	(p13) STFD [AO1] = f34, SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p13) STFD [AO1] = f35, SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p14) STFD [AO1] = f36, SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p14) STFD [AO1] = f37, SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p15) STFD [AO1] = f38, SIZE
kusano 2b45e8
	(p9) br.cond.dpnt .L100
kusano 2b45e8
	;;
kusano 2b45e8
	.align 16
kusano 2b45e8
kusano 2b45e8
.L10:
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	mov	CO  = Y
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	shr	J   = N, 3
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mib
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	cmp.eq	p6, p0 = r0, J
kusano 2b45e8
	(p6) br.cond.dpnt .L20
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 16
kusano 2b45e8
kusano 2b45e8
.L11:
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	mov	AO1 = A
kusano 2b45e8
	mov	f8  = f0
kusano 2b45e8
	mov	pr.rot= 0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	add	AO2 = LDA, A
kusano 2b45e8
	mov	f10 = f0
kusano 2b45e8
	shr	I = MIN_M, 4
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	shladd	AO3 = LDA, 1, A
kusano 2b45e8
	shladd	AO4 = LDA, 1, AO2
kusano 2b45e8
	mov	f12 = f0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p8) LDFD f32 = [AO1], SIZE
kusano 2b45e8
	(p8) LDFD f33 = [AO2], SIZE
kusano 2b45e8
	mov	f14 = f0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	shladd	AO5 = LDA, 1, AO3
kusano 2b45e8
	shladd	AO6 = LDA, 1, AO4
kusano 2b45e8
	mov	f16 = f0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p8) LDFD f34 = [AO3], SIZE
kusano 2b45e8
	(p8) LDFD f35 = [AO4], SIZE
kusano 2b45e8
	mov	f18 = f0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	shladd	AO7 = LDA, 1, AO5
kusano 2b45e8
	shladd	AO8 = LDA, 1, AO6
kusano 2b45e8
	mov	f20 = f0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p8) LDFD f36 = [AO5], SIZE
kusano 2b45e8
	(p8) LDFD f37 = [AO6], SIZE
kusano 2b45e8
	mov	f22 = f0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p8) LDFD f38 = [AO7], SIZE
kusano 2b45e8
	mov	f9  = f0
kusano 2b45e8
	mov	ar.ec= 2
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p8) LDFD f39 = [AO8], SIZE
kusano 2b45e8
	mov	BO  = BUFFER
kusano 2b45e8
	mov	f11 = f0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p8) LDFD f40 = [BO], 2 * SIZE
kusano 2b45e8
	cmp.eq	p6, p0 = 0, I
kusano 2b45e8
	mov	f13 = f0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	shladd	A   = LDA, 3, A
kusano 2b45e8
	cmp.eq	p16, p0 = r0, r0
kusano 2b45e8
	mov	f15 = f0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	add	I = I, I
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	mov	f17 = f0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	adds	RPRE1  = RPREFETCH * SIZE, AO1
kusano 2b45e8
	adds	RPRE2  = (RPREFETCH + 8) * SIZE, AO2
kusano 2b45e8
	mov	f19 = f0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	adds	I = -1, I
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	mov	f21 = f0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	adds	RPRE3  = RPREFETCH * SIZE, AO3
kusano 2b45e8
	adds	RPRE4  = (RPREFETCH + 8) * SIZE, AO4
kusano 2b45e8
	mov	f23 = f0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p8) FMPY	f8  = f40, f32
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	adds	RPRE5  = RPREFETCH * SIZE, AO5
kusano 2b45e8
	adds	RPRE6  = (RPREFETCH + 8) * SIZE, AO6
kusano 2b45e8
	(p8) FMPY	f10 = f40, f33
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p8) FMPY	f12 = f40, f34
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	adds	RPRE7  = RPREFETCH * SIZE, AO7
kusano 2b45e8
	adds	RPRE8  = (RPREFETCH + 8) * SIZE, AO8
kusano 2b45e8
	(p8) FMPY	f14 = f40, f35
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p8) FMPY	f16 = f40, f36
kusano 2b45e8
	mov	ar.lc = I
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	adds	WPRE = 8 * SIZE, CO
kusano 2b45e8
	adds	PREB  = RPREFETCH * SIZE, BO
kusano 2b45e8
	(p8) FMPY	f18 = f40, f37
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	lfetch.excl.nt1	[WPRE]
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p8) FMPY	f20 = f40, f38
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfb
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p8) FMPY	f22 = f40, f39
kusano 2b45e8
	(p6) br.cond.dpnt .L15
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 16
kusano 2b45e8
kusano 2b45e8
.L12:
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p17) LDFPD	f95, f96 = [AO8], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f8  = f104, f33, f8
kusano 2b45e8
	(p16) tbit.nz.unc p14, p15 = I, 0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p17) LDFPD	f110, f111 = [BO], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f9  = f105, f34, f9
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f32, f33 = [AO1], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f10 = f104, f35, f10
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p14) PREFETCH [RPRE1], 16 * SIZE
kusano 2b45e8
	(p17) FMA	f11 = f105, f36, f11
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f34, f35 = [AO2], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f12 = f104, f37, f12
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p15) PREFETCH [RPRE2], 16 * SIZE
kusano 2b45e8
	(p17) FMA	f13 = f105, f38, f13
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f36, f37 = [AO3], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f14 = f104, f39, f14
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p14) PREFETCH [RPRE3], 16 * SIZE
kusano 2b45e8
	(p17) FMA	f15 = f105, f40, f15
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f38, f39 = [AO4], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f16 = f104, f41, f16
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p15) PREFETCH [RPRE4], 16 * SIZE
kusano 2b45e8
	(p17) FMA	f17 = f105, f42, f17
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f40, f41 = [AO5], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f18 = f104, f43, f18
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p14) PREFETCH [RPRE5], 16 * SIZE
kusano 2b45e8
	(p17) FMA	f19 = f105, f44, f19
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f42, f43 = [AO6], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f20 = f104, f45, f20
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p15) PREFETCH [RPRE6], 16 * SIZE
kusano 2b45e8
	(p17) FMA	f21 = f105, f46, f21
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f44, f45 = [AO7], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f22 = f104, f47, f22
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p14) PREFETCH [RPRE7], 16 * SIZE
kusano 2b45e8
	(p17) FMA	f23 = f105, f48, f23
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f46, f47 = [AO8], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f8  = f106, f49, f8
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p15) PREFETCH [RPRE8], 16 * SIZE
kusano 2b45e8
	(p17) FMA	f9  = f107, f50, f9
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f48, f49 = [AO1], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f10 = f106, f51, f10
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p14) PREFETCH [PREB], 16 * SIZE
kusano 2b45e8
	(p17) FMA	f11 = f107, f52, f11
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f50, f51 = [AO2], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f12 = f106, f53, f12
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f103, f104 = [BO], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f13 = f107, f54, f13
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f52, f53 = [AO3], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f14 = f106, f55, f14
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f15 = f107, f56, f15
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f54, f55 = [AO4], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f16 = f106, f57, f16
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f17 = f107, f58, f17
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f56, f57 = [AO5], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f18 = f106, f59, f18
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f19 = f107, f60, f19
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f58, f59 = [AO6], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f20 = f106, f61, f20
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f21 = f107, f62, f21
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f60, f61 = [AO7], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f22 = f106, f63, f22
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f23 = f107, f64, f23
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f62, f63 = [AO8], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f8  = f108, f65, f8
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f105, f106 = [BO], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f9  = f109, f66, f9
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f64, f65 = [AO1], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f10 = f108, f67, f10
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f11 = f109, f68, f11
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f66, f67 = [AO2], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f12 = f108, f69, f12
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f13 = f109, f70, f13
kusano 2b45e8
 	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f68, f69 = [AO3], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f14 = f108, f71, f14
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f15 = f109, f72, f15
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f70, f71 = [AO4], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f16 = f108, f73, f16
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f17 = f109, f74, f17
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f72, f73 = [AO5], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f18 = f108, f75, f18
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f19 = f109, f76, f19
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f74, f75 = [AO6], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f20 = f108, f77, f20
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f21 = f109, f78, f21
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f76, f77 = [AO7], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f22 = f108, f79, f22
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f23 = f109, f80, f23
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f107, f108 = [BO], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f8  = f110, f81, f8
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f78, f79 = [AO8], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f9  = f111, f82, f9
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f80, f81 = [AO1], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f10 = f110, f83, f10
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f11 = f111, f84, f11
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f82, f83 = [AO2], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f12 = f110, f85, f12
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f13 = f111, f86, f13
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f84, f85 = [AO3], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f14 = f110, f87, f14
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f15 = f111, f88, f15
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f86, f87 = [AO4], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f16 = f110, f89, f16
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f17 = f111, f90, f17
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f88, f89 = [AO5], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f18 = f110, f91, f18
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f19 = f111, f92, f19
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f90, f91 = [AO6], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f20 = f110, f93, f20
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f21 = f111, f94, f21
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f92, f93 = [AO7], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f22 = f110, f95, f22
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfb
kusano 2b45e8
	adds	I = -1, I
kusano 2b45e8
	(p17) FMA	f23 = f111, f96, f23
kusano 2b45e8
	br.ctop.sptk.few .L12
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 16
kusano 2b45e8
kusano 2b45e8
.L15:
kusano 2b45e8
	and	I = 15, MIN_M
kusano 2b45e8
	mov	pr.rot= 0
kusano 2b45e8
	;;
kusano 2b45e8
	cmp.eq	p6,  p0 = 0, I
kusano 2b45e8
	cmp.eq	p16, p15 = r0, r0
kusano 2b45e8
	;;
kusano 2b45e8
	adds	I = 1, I
kusano 2b45e8
	;;
kusano 2b45e8
	shr	I = I, 1
kusano 2b45e8
	;;
kusano 2b45e8
	adds	I = -1, I
kusano 2b45e8
	;;
kusano 2b45e8
	mov	ar.lc = I
kusano 2b45e8
	mov	ar.ec= 3
kusano 2b45e8
	and	I = 15, MIN_M
kusano 2b45e8
	(p6) br.cond.dpnt .L18
kusano 2b45e8
	;;
kusano 2b45e8
	.align 16
kusano 2b45e8
kusano 2b45e8
.L16:
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f104, f107 = [BO], 2 * SIZE
kusano 2b45e8
	(p18) FMA	f8  = f106, f34, f8
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f32,  f35  = [AO1], 2 * SIZE
kusano 2b45e8
	(p15) FMA	f9  = f109, f37, f9
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f38,  f41  = [AO2], 2 * SIZE
kusano 2b45e8
	(p18) FMA	f10 = f106, f40, f10
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p15) FMA	f11 = f109, f43, f11
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f44,  f47  = [AO3], 2 * SIZE
kusano 2b45e8
	(p18) FMA	f12 = f106, f46, f12
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p15) FMA	f13 = f109, f49, f13
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f50,  f53  = [AO4], 2 * SIZE
kusano 2b45e8
	(p18) FMA	f14 = f106, f52, f14
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p15) FMA	f15 = f109, f55, f15
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f56,  f59  = [AO5], 2 * SIZE
kusano 2b45e8
	(p18) FMA	f16 = f106, f58, f16
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p15) FMA	f17 = f109, f61, f17
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f62,  f65  = [AO6], 2 * SIZE
kusano 2b45e8
	(p18) FMA	f18 = f106, f64, f18
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p15) FMA	f19 = f109, f67, f19
kusano 2b45e8
	(p17) adds	I = -2, I
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f68,  f71  = [AO7], 2 * SIZE
kusano 2b45e8
	(p18) FMA	f20 = f106, f70, f20
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p15) FMA	f21 = f109, f73, f21
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p16) LDFPD	f74,  f77  = [AO8], 2 * SIZE
kusano 2b45e8
	(p15) FMA	f23 = f109, f79, f23
kusano 2b45e8
	(p17) cmp.ne.unc p15, p0 = -1, I
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfb
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p18) FMA	f22 = f106, f76, f22
kusano 2b45e8
	br.ctop.sptk.few .L16
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
kusano 2b45e8
.L18:
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	mov	AO1 = CO
kusano 2b45e8
	LDFD	f32 = [CO], INCY
kusano 2b45e8
	FADD	f8  = f8,  f9
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	LDFD	f33 = [CO], INCY
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FADD	f10 = f10, f11
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	LDFD	f34 = [CO], INCY
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FADD	f12 = f12, f13
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	LDFD	f35 = [CO], INCY
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FADD	f14 = f14, f15
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	LDFD	f36 = [CO], INCY
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FADD	f16 = f16, f17
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	LDFD	f37 = [CO], INCY
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FADD	f18 = f18, f19
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	LDFD	f38 = [CO], INCY
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FADD	f20 = f20, f21
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	LDFD	f39 = [CO], INCY
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FADD	f22 = f22, f23
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FMA	f32 = ALPHA, f8,  f32
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FMA	f33 = ALPHA, f10, f33
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FMA	f34 = ALPHA, f12, f34
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FMA	f35 = ALPHA, f14, f35
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	STFD [AO1] = f32
kusano 2b45e8
	add	AO1 = AO1, INCY
kusano 2b45e8
	FMA	f36 = ALPHA, f16, f36
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	STFD [AO1] = f33
kusano 2b45e8
	add	AO1 = AO1, INCY
kusano 2b45e8
	FMA	f37 = ALPHA, f18, f37
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	STFD [AO1] = f34
kusano 2b45e8
	add	AO1 = AO1, INCY
kusano 2b45e8
	FMA	f38 = ALPHA, f20, f38
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	STFD [AO1] = f35
kusano 2b45e8
	add	AO1 = AO1, INCY
kusano 2b45e8
	FMA	f39 = ALPHA, f22, f39
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	STFD [AO1] = f36
kusano 2b45e8
	add	AO1 = AO1, INCY
kusano 2b45e8
	adds J = -1, J
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	STFD [AO1] = f37
kusano 2b45e8
	add	AO1 = AO1, INCY
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	STFD [AO1] = f38
kusano 2b45e8
	add	AO1 = AO1, INCY
kusano 2b45e8
	cmp4.lt p6, p0 = 0, J
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mib
kusano 2b45e8
	STFD [AO1] = f39
kusano 2b45e8
	add	AO1 = AO1, INCY
kusano 2b45e8
	(p6) br.cond.dptk .L11
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 16
kusano 2b45e8
kusano 2b45e8
/* .L20: set-up for the pass taken when bit 2 of N is set (bit clear -> .L30).     */
/* Four input streams are addressed: AO1 = A, AO2 = A + LDA, AO3 = A + 2*LDA,      */
/* AO4 = A + 3*LDA. A itself is then advanced by 4*LDA. The accumulators f8..f15   */
/* are cleared from f0. I = MIN_M >> 4 selects between the unrolled loop .L22      */
/* and a direct jump to the remainder code at .L25.                                */
/* NOTE(review): LDFD, FMPY, SIZE, RPREFETCH and the register aliases (AO1, BO,    */
/* CO, I, ...) are macros defined outside this view.                               */
.L20:
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	mov	AO1 = A
kusano 2b45e8
	mov	f8  = f0
kusano 2b45e8
	/* clear the rotating predicates before p16 is armed below */
	mov	pr.rot= 0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	add	AO2 = LDA, A
kusano 2b45e8
	mov	f10 = f0
kusano 2b45e8
	/* p6 = (bit 2 of N is clear) */
	tbit.z	p6, p0  = N, 2
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	shladd	AO3 = LDA, 1, A
kusano 2b45e8
	mov	f12 = f0
kusano 2b45e8
	shr	I = MIN_M, 4
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfb
kusano 2b45e8
	shladd	AO4 = LDA, 1, AO2
kusano 2b45e8
	mov	f14 = f0
kusano 2b45e8
	(p6) br.cond.dpnt .L30
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* p8 is written outside this view. When it is set, one element is read from   */
	/* each stream and from BO (BO steps by 2 * SIZE) and the products seed         */
	/* f8/f10/f12/f14 below. Presumably a peeled leading element - confirm against  */
	/* the code that sets p8.                                                       */
	{ .mmf
kusano 2b45e8
	(p8) LDFD f32 = [AO1], SIZE
kusano 2b45e8
	(p8) LDFD f33 = [AO2], SIZE
kusano 2b45e8
	mov	f9  = f0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	mov	BO  = BUFFER
kusano 2b45e8
	/* A += 4 * LDA, ready for the next pass */
	shladd	A   = LDA, 2, A
kusano 2b45e8
	mov	f11 = f0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p8) LDFD f40 = [BO], 2 * SIZE
kusano 2b45e8
	/* p6 = (MIN_M >> 4 == 0), tested before I is doubled */
	cmp.eq	p6, p0 = 0, I
kusano 2b45e8
	mov	f13 = f0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p8) LDFD f34 = [AO3], SIZE
kusano 2b45e8
	(p8) LDFD f35 = [AO4], SIZE
kusano 2b45e8
	mov	f15 = f0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* read-prefetch pointers. RPRE2 and RPRE4 start 8 elements further on */
	{ .mmi
kusano 2b45e8
	adds	RPRE1  = RPREFETCH * SIZE, AO1
kusano 2b45e8
	adds	RPRE2  = (RPREFETCH + 8) * SIZE, AO2
kusano 2b45e8
	/* two pipeline stages in .L22 (p16 and p17) */
	mov	ar.ec= 2
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	/* arm the first pipeline stage */
	cmp.eq	p16, p0 = r0, r0
kusano 2b45e8
	add	I = I, I
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	adds	WPRE =  4 * SIZE, CO
kusano 2b45e8
	adds	PREB  = RPREFETCH * SIZE, BO
kusano 2b45e8
	(p8) FMPY	f8  = f40, f32
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	adds	RPRE3  = RPREFETCH * SIZE, AO3
kusano 2b45e8
	/* I = 2 * (MIN_M >> 4) - 1. .L22 consumes 8 elements per stream per trip */
	adds	I = -1, I
kusano 2b45e8
	(p8) FMPY	f10 = f40, f33
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	/* prefetch the output location with the exclusive hint */
	lfetch.excl.nt1	[WPRE]
kusano 2b45e8
	(p8) FMPY	f12 = f40, f34
kusano 2b45e8
	mov	ar.lc = I
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfb
kusano 2b45e8
	adds	RPRE4  = (RPREFETCH + 8) * SIZE, AO4
kusano 2b45e8
	(p8) FMPY	f14 = f40, f35
kusano 2b45e8
	/* fewer than 16 elements: skip the unrolled loop */
	(p6) br.cond.dpnt .L25
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 16
kusano 2b45e8
kusano 2b45e8
/* .L22: software-pipelined loop (br.ctop, stages p16 and p17) over four streams.  */
/* Stage p16 issues the LDFPD loads: four pairs from each of AO1, AO2 and AO3,     */
/* three pairs from AO4 and three pairs from BO. Stage p17 issues the fourth       */
/* AO4/BO pair (f87/f88 and f110/f111) and all FMAs. One trip therefore consumes   */
/* 8 elements per stream.                                                          */
/* Because registers f32 and up rotate by one per trip, a value loaded into fN at  */
/* stage p16 is read as f(N+1) at stage p17 (f32,f33 -> f33,f34 and so on).        */
/* Accumulators: f8/f9 take AO1, f10/f11 take AO2, f12/f13 take AO3 and f14/f15    */
/* take AO4 (first/second element of every pair).                                  */
/* The issue order matters: each p17 FMA reads its operand in an instruction group */
/* that precedes the p16 load which reuses the same register name.                 */
.L22:
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p17) LDFPD	f87, f88 = [AO4], 2 * SIZE
kusano 2b45e8
	(p17) LDFPD	f110, f111 = [BO], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f8  = f104, f33, f8
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f9  = f105, f34, f9
kusano 2b45e8
	/* p14/p15 follow bit 0 of I and steer which PREFETCHes fire this trip.       */
	/* The .unc form clears both when p16 is off (epilogue trip).                  */
	(p16) tbit.nz.unc p14, p15 = I, 0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p14) PREFETCH [RPRE1], 16 * SIZE
kusano 2b45e8
	(p16) LDFPD	f32, f33 = [AO1], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f10 = f104, f35, f10
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f11 = f105, f36, f11
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p15) PREFETCH [RPRE2], 16 * SIZE
kusano 2b45e8
	(p16) LDFPD	f34, f35 = [AO2], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f12 = f104, f37, f12
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f13 = f105, f38, f13
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p14) PREFETCH [RPRE3], 16 * SIZE
kusano 2b45e8
	(p16) LDFPD	f36, f37 = [AO3], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f14 = f104, f39, f14
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f15 = f105, f40, f15
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* second pair of the trip: BO values f106/f107 */
	{ .mmf
kusano 2b45e8
	(p15) PREFETCH [RPRE4], 16 * SIZE
kusano 2b45e8
	(p16) LDFPD	f38, f39 = [AO4], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f8  = f106, f49, f8
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f9  = f107, f50, f9
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p14) PREFETCH [PREB], 16 * SIZE
kusano 2b45e8
	(p16) LDFPD	f48, f49 = [AO1], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f10 = f106, f51, f10
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f11 = f107, f52, f11
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f50, f51 = [AO2], 2 * SIZE
kusano 2b45e8
	(p16) LDFPD	f103, f104 = [BO], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f12 = f106, f53, f12
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f13 = f107, f54, f13
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f52, f53 = [AO3], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f14 = f106, f55, f14
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f15 = f107, f56, f15
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* third pair of the trip: BO values f108/f109 */
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f54, f55 = [AO4], 2 * SIZE
kusano 2b45e8
	(p16) LDFPD	f105, f106 = [BO], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f8  = f108, f65, f8
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f9  = f109, f66, f9
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f64, f65 = [AO1], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f10 = f108, f67, f10
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f11 = f109, f68, f11
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f66, f67 = [AO2], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f12 = f108, f69, f12
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f13 = f109, f70, f13
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f68, f69 = [AO3], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f14 = f108, f71, f14
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f15 = f109, f72, f15
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* fourth pair of the trip: BO values f110/f111 loaded at the top under p17 */
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f70, f71 = [AO4], 2 * SIZE
kusano 2b45e8
	(p16) LDFPD	f107, f108 = [BO], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f8  = f110, f81, f8
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f9  = f111, f82, f9
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f80, f81 = [AO1], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f10 = f110, f83, f10
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f11 = f111, f84, f11
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f82, f83 = [AO2], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f12 = f110, f85, f12
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f13 = f111, f86, f13
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f84, f85 = [AO3], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f14 = f110, f87, f14
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfb
kusano 2b45e8
	/* I only drives the p14/p15 alternation here. The trip count lives in ar.lc */
	adds	I = -1, I
kusano 2b45e8
	(p17) FMA	f15 = f111, f88, f15
kusano 2b45e8
	br.ctop.sptk.few .L22
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 16
kusano 2b45e8
kusano 2b45e8
/* .L25: set-up for the remainder loop .L26 of the four-stream pass.              */
/* rem = MIN_M & 15. When rem == 0 control goes straight to .L28. Otherwise       */
/* ar.lc = ((rem + 1) >> 1) - 1 (one trip per pair of elements, rounded up),      */
/* ar.ec = 3 (stages p16..p18) and I is reloaded with rem as an element countdown. */
.L25:
kusano 2b45e8
	and	I = 15, MIN_M
kusano 2b45e8
	mov	pr.rot= 0
kusano 2b45e8
	;;
kusano 2b45e8
	cmp.eq	p6,  p0 = 0, I
kusano 2b45e8
	/* p16 = 1 arms the first stage. p15 = 0 because no pair has been validated yet */
	cmp.eq	p16, p15 = r0, r0
kusano 2b45e8
	;;
kusano 2b45e8
	adds	I = 1, I
kusano 2b45e8
	;;
kusano 2b45e8
	shr	I = I, 1
kusano 2b45e8
	;;
kusano 2b45e8
	adds	I = -1, I
kusano 2b45e8
	;;
kusano 2b45e8
	mov	ar.lc = I
kusano 2b45e8
	mov	ar.ec= 3
kusano 2b45e8
	/* restore I = rem for the countdown done inside .L26 */
	and	I = 15, MIN_M
kusano 2b45e8
	(p6) br.cond.dpnt .L28
kusano 2b45e8
	;;
kusano 2b45e8
	.align 16
kusano 2b45e8
kusano 2b45e8
/* .L26: software-pipelined remainder loop (br.ctop, stages p16..p18) for the     */
/* four-stream pass. One trip handles one pair of elements.                       */
/*   p16: load one pair from BO and from each of AO1..AO4.                        */
/*   p17: I -= 2, then p15 = (I != -1). I reaches -1 only when the pair just      */
/*        counted held a single remaining element, so p15 marks the second        */
/*        element of that pair as valid.                                          */
/*   p18: FMA of the first element of the pair. The second element is             */
/*        accumulated under p15 in the same trip.                                 */
/* Registers are read two rotations after the load (f104,f107 -> f106,f109 and    */
/* f32,f35 -> f34,f37, and so on).                                                */
.L26:
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f104, f107 = [BO], 2 * SIZE
kusano 2b45e8
	(p16) LDFPD	f32,  f35  = [AO1], 2 * SIZE
kusano 2b45e8
	(p18) FMA	f8  = f106, f34, f8
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p15) FMA	f9  = f109, f37, f9
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f38,  f41  = [AO2], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p18) FMA	f10 = f106, f40, f10
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p15) FMA	f11 = f109, f43, f11
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f44,  f47  = [AO3], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p18) FMA	f12 = f106, f46, f12
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) adds	I = -2, I
kusano 2b45e8
	(p15) FMA	f13 = f109, f49, f13
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* The p15-predicated FMA into f15 is issued ahead of the cmp that rewrites   */
	/* p15 within the same instruction group, so it still sees the p15 value      */
	/* belonging to the pair now at stage p18.                                    */
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f50,  f53  = [AO4], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p15) FMA	f15 = f109, f55, f15
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfb
kusano 2b45e8
	/* the .unc form clears p15 whenever p17 is off */
	(p17) cmp.ne.unc p15, p0 = -1, I
kusano 2b45e8
	(p18) FMA	f14 = f106, f52, f14
kusano 2b45e8
	br.ctop.sptk.few .L26
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
kusano 2b45e8
/* .L28: finish the four-stream pass.                                             */
/* Each accumulator pair is summed (f8+f9, f10+f11, f12+f13, f14+f15), four       */
/* values are loaded from CO (stepping by INCY), and each result is formed by     */
/* the FMA macro from ALPHA, the sum and the loaded value, then stored back       */
/* through AO1 (also stepping by INCY). CO ends up four INCY steps further on.    */
/* NOTE(review): FMA/FADD/LDFD/STFD and ALPHA are macros defined outside this     */
/* view. Presumably the result is ALPHA * sum + old value - confirm.              */
.L28:
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	/* AO1 keeps the pre-increment CO so it can serve as the store pointer */
	mov	AO1 = CO
kusano 2b45e8
	LDFD	f32 = [CO], INCY
kusano 2b45e8
	FADD	f8  = f8,  f9
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	LDFD	f33 = [CO], INCY
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FADD	f10 = f10, f11
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	LDFD	f34 = [CO], INCY
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FADD	f12 = f12, f13
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	LDFD	f35 = [CO], INCY
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FADD	f14 = f14, f15
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* scale and accumulate into the loaded values */
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FMA	f32 = ALPHA, f8,  f32
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FMA	f33 = ALPHA, f10, f33
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FMA	f34 = ALPHA, f12, f34
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FMA	f35 = ALPHA, f14, f35
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* write the four results back, one INCY step apart */
	{ .mmf
kusano 2b45e8
	STFD [AO1] = f32
kusano 2b45e8
	add	AO1 = AO1, INCY
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	STFD [AO1] = f33
kusano 2b45e8
	add	AO1 = AO1, INCY
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	STFD [AO1] = f34
kusano 2b45e8
	add	AO1 = AO1, INCY
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	STFD [AO1] = f35
kusano 2b45e8
	add	AO1 = AO1, INCY
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 16
kusano 2b45e8
kusano 2b45e8
/* .L30: set-up for the pass taken when bit 1 of N is set (bit clear -> .L40).    */
/* Two input streams are addressed: AO1 = A and AO2 = A + LDA. A is then          */
/* advanced by 2*LDA. f8..f15 are cleared from f0. I = MIN_M >> 4 selects between */
/* the unrolled loop .L32 and the remainder code at .L35.                         */
/* NOTE(review): p8 is written outside this view. When set, one element per       */
/* stream and one from BO are consumed up front - presumably a peeled leading     */
/* element, confirm.                                                              */
.L30:
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	mov	AO1 = A
kusano 2b45e8
	mov	f8  = f0
kusano 2b45e8
	mov	pr.rot= 0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	add	AO2 = LDA, A
kusano 2b45e8
	mov	f10 = f0
kusano 2b45e8
	/* p6 = (bit 1 of N is clear) */
	tbit.z	p6, p0  = N, 1
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	mov	BO  = BUFFER
kusano 2b45e8
	mov	f12 = f0
kusano 2b45e8
	shr	I = MIN_M, 4
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfb
kusano 2b45e8
	adds	WPRE =  4 * SIZE, CO
kusano 2b45e8
	mov	f14 = f0
kusano 2b45e8
	(p6) br.cond.dpnt .L40
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p8) LDFD f32 = [AO1], SIZE
kusano 2b45e8
	(p8) LDFD f33 = [AO2], SIZE
kusano 2b45e8
	mov	f9  = f0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	/* A += 2 * LDA, ready for the next pass */
	shladd	A   = LDA, 1, A
kusano 2b45e8
	mov	f11 = f0
kusano 2b45e8
	/* two pipeline stages in .L32 (p16 and p17) */
	mov	ar.ec= 2
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p8) LDFD f40 = [BO], 2 * SIZE
kusano 2b45e8
	/* p6 = (MIN_M >> 4 == 0). The doubling of I in the next bundle belongs to    */
	/* the same instruction group and comes after this read.                      */
	cmp.eq	p6, p0 = 0, I
kusano 2b45e8
	mov	f13 = f0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	adds	RPRE1  = RPREFETCH * SIZE, AO1
kusano 2b45e8
	add	I = I, I
kusano 2b45e8
	mov	f15 = f0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	/* arm the first pipeline stage */
	cmp.eq	p16, p0 = r0, r0
kusano 2b45e8
	adds	RPRE2  = (RPREFETCH + 8) * SIZE, AO2
kusano 2b45e8
	/* I = 2 * (MIN_M >> 4) - 1. .L32 consumes 8 elements per stream per trip */
	adds	I = -1, I
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	lfetch.excl.nt1	[WPRE]
kusano 2b45e8
	(p8) FMPY	f8  = f40, f32
kusano 2b45e8
	mov	ar.lc = I
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfb
kusano 2b45e8
	adds	PREB  = RPREFETCH * SIZE, BO
kusano 2b45e8
	(p8) FMPY	f10 = f40, f33
kusano 2b45e8
	/* fewer than 16 elements: skip the unrolled loop */
	(p6) br.cond.dpnt .L35
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 16
kusano 2b45e8
kusano 2b45e8
/* .L32: software-pipelined loop (br.ctop, stages p16 and p17) over two streams.  */
/* Stage p16 loads four pairs from AO1, three pairs from AO2 and three pairs from */
/* BO. Stage p17 loads the fourth AO2/BO pair (f83/f84 and f110/f111) and issues  */
/* the FMAs. One trip consumes 8 elements per stream.                             */
/* A value loaded into fN at stage p16 is read as f(N+1) at stage p17 because of  */
/* register rotation. f8/f9 accumulate AO1, f10/f11 accumulate AO2.               */
/* The issue order matters: every FMA reads its operand in a group that precedes  */
/* the load reusing the same register name.                                       */
.L32:
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p17) LDFPD	f83, f84 = [AO2], 2 * SIZE
kusano 2b45e8
	(p17) LDFPD	f110, f111 = [BO], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f8  = f104, f33, f8
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f9  = f105, f34, f9
kusano 2b45e8
	/* p14/p15 follow bit 0 of I and pick which PREFETCHes fire this trip */
	(p16) tbit.nz.unc p14, p15 = I, 0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p14) PREFETCH [RPRE1], 16 * SIZE
kusano 2b45e8
	(p16) LDFPD	f32, f33 = [AO1], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f10 = f104, f35, f10
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f11 = f105, f36, f11
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p15) PREFETCH [RPRE2], 16 * SIZE
kusano 2b45e8
	(p16) LDFPD	f34, f35 = [AO2], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f8  = f106, f49, f8
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f9  = f107, f50, f9
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p14) PREFETCH [PREB], 16 * SIZE
kusano 2b45e8
	(p16) LDFPD	f48, f49 = [AO1], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f10 = f106, f51, f10
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f11 = f107, f52, f11
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f50, f51 = [AO2], 2 * SIZE
kusano 2b45e8
	(p16) LDFPD	f103, f104 = [BO], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f8  = f108, f65, f8
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f9  = f109, f66, f9
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f105, f106 = [BO], 2 * SIZE
kusano 2b45e8
	(p16) LDFPD	f64, f65 = [AO1], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f10 = f108, f67, f10
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f11 = f109, f68, f11
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f66, f67 = [AO2], 2 * SIZE
kusano 2b45e8
	(p16) LDFPD	f107, f108 = [BO], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f8  = f110, f81, f8
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f9  = f111, f82, f9
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f80, f81 = [AO1], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f10 = f110, f83, f10
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfb
kusano 2b45e8
	/* I only drives the p14/p15 alternation. The trip count lives in ar.lc */
	adds	I = -1, I
kusano 2b45e8
	(p17) FMA	f11 = f111, f84, f11
kusano 2b45e8
	br.ctop.sptk.few .L32
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 16
kusano 2b45e8
kusano 2b45e8
/* .L35: straight-line remainder for the two-stream pass (MIN_M & 15 elements).   */
/* The low four bits of MIN_M gate fixed-size pieces:                             */
/*   p12 (bit 3): 8 elements, p13 (bit 2): 4 elements,                            */
/*   p14 (bit 1): 2 elements, p15 (bit 0): 1 element.                             */
/* All loads are issued first, then the FMAs. AO1 products are spread over        */
/* f8/f9/f12/f13 and AO2 products over f10/f11/f14/f15. .L38 sums them.           */
.L35:
kusano 2b45e8
	and	I = 15, MIN_M
kusano 2b45e8
	;;
kusano 2b45e8
	cmp.eq	p6,  p0 = 0, I
kusano 2b45e8
	/* nothing left over: go straight to the reduction */
	(p6) br.cond.dpnt .L38
kusano 2b45e8
	;;
kusano 2b45e8
	tbit.nz	p12, p0 = MIN_M, 3
kusano 2b45e8
	tbit.nz	p13, p0 = MIN_M, 2
kusano 2b45e8
	tbit.nz	p14, p0 = MIN_M, 1
kusano 2b45e8
	tbit.nz	p15, p0 = MIN_M, 0
kusano 2b45e8
	;;
kusano 2b45e8
	/* 8-element piece: four pairs per stream and from BO */
	(p12) LDFPD	f32,  f33  = [AO1], 2 * SIZE
kusano 2b45e8
	(p12) LDFPD	f34,  f35  = [AO2], 2 * SIZE
kusano 2b45e8
	(p12) LDFPD	f100, f101 = [BO], 2 * SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p12) LDFPD	f36,  f37  = [AO1], 2 * SIZE
kusano 2b45e8
	(p12) LDFPD	f38,  f39  = [AO2], 2 * SIZE
kusano 2b45e8
	(p12) LDFPD	f102, f103 = [BO], 2 * SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p12) LDFPD	f40,  f41  = [AO1], 2 * SIZE
kusano 2b45e8
	(p12) LDFPD	f42,  f43  = [AO2], 2 * SIZE
kusano 2b45e8
	(p12) LDFPD	f104, f105 = [BO], 2 * SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p12) LDFPD	f44,  f45  = [AO1], 2 * SIZE
kusano 2b45e8
	(p12) LDFPD	f46,  f47  = [AO2], 2 * SIZE
kusano 2b45e8
	(p12) LDFPD	f106, f107 = [BO], 2 * SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	/* 4-element piece */
	(p13) LDFPD	f48,  f49  = [AO1], 2 * SIZE
kusano 2b45e8
	(p13) LDFPD	f50,  f51  = [AO2], 2 * SIZE
kusano 2b45e8
	(p13) LDFPD	f108, f109 = [BO], 2 * SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p13) LDFPD	f52,  f53  = [AO1], 2 * SIZE
kusano 2b45e8
	(p13) LDFPD	f54,  f55  = [AO2], 2 * SIZE
kusano 2b45e8
	(p13) LDFPD	f110, f111 = [BO], 2 * SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	/* 2-element piece */
	(p14) LDFPD	f56,  f57  = [AO1], 2 * SIZE
kusano 2b45e8
	(p14) LDFPD	f58,  f59  = [AO2], 2 * SIZE
kusano 2b45e8
	(p14) LDFPD	f112, f113 = [BO], 2 * SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	/* final single element, pointers are not advanced any further */
	(p15) LDFD	f60        = [AO1]
kusano 2b45e8
	(p15) LDFD	f61        = [AO2]
kusano 2b45e8
	(p15) LDFD	f114       = [BO]
kusano 2b45e8
	;;
kusano 2b45e8
	(p12) FMA	f8  = f100, f32, f8
kusano 2b45e8
	(p12) FMA	f9  = f101, f33, f9
kusano 2b45e8
	(p12) FMA	f10 = f100, f34, f10
kusano 2b45e8
	(p12) FMA	f11 = f101, f35, f11
kusano 2b45e8
	;;
kusano 2b45e8
	(p12) FMA	f12 = f102, f36, f12
kusano 2b45e8
	(p12) FMA	f13 = f103, f37, f13
kusano 2b45e8
	(p12) FMA	f14 = f102, f38, f14
kusano 2b45e8
	(p12) FMA	f15 = f103, f39, f15
kusano 2b45e8
	;;
kusano 2b45e8
	(p12) FMA	f8  = f104, f40, f8
kusano 2b45e8
	(p12) FMA	f9  = f105, f41, f9
kusano 2b45e8
	(p12) FMA	f10 = f104, f42, f10
kusano 2b45e8
	(p12) FMA	f11 = f105, f43, f11
kusano 2b45e8
	;;
kusano 2b45e8
	(p12) FMA	f12 = f106, f44, f12
kusano 2b45e8
	(p12) FMA	f13 = f107, f45, f13
kusano 2b45e8
	(p12) FMA	f14 = f106, f46, f14
kusano 2b45e8
	(p12) FMA	f15 = f107, f47, f15
kusano 2b45e8
	;;
kusano 2b45e8
	(p13) FMA	f8  = f108, f48, f8
kusano 2b45e8
	(p13) FMA	f9  = f109, f49, f9
kusano 2b45e8
	(p13) FMA	f10 = f108, f50, f10
kusano 2b45e8
	(p13) FMA	f11 = f109, f51, f11
kusano 2b45e8
	;;
kusano 2b45e8
	(p13) FMA	f12 = f110, f52, f12
kusano 2b45e8
	(p13) FMA	f13 = f111, f53, f13
kusano 2b45e8
	(p13) FMA	f14 = f110, f54, f14
kusano 2b45e8
	(p13) FMA	f15 = f111, f55, f15
kusano 2b45e8
	;;
kusano 2b45e8
	(p14) FMA	f8  = f112, f56, f8
kusano 2b45e8
	(p14) FMA	f9  = f113, f57, f9
kusano 2b45e8
	(p14) FMA	f10 = f112, f58, f10
kusano 2b45e8
	(p14) FMA	f11 = f113, f59, f11
kusano 2b45e8
	;;
kusano 2b45e8
	(p15) FMA	f12 = f114, f60, f12
kusano 2b45e8
	(p15) FMA	f14 = f114, f61, f14
kusano 2b45e8
	;;
kusano 2b45e8
/* .L38: finish the two-stream pass.                                              */
/* f8 = f8 + f9 + f12 + f13 (AO1 total) and f10 = f10 + f11 + f14 + f15 (AO2      */
/* total). Two values are loaded from CO (stepping by INCY), combined with the    */
/* totals by the FMA macro using ALPHA, and stored back through AO1.              */
/* CO ends up two INCY steps further on.                                          */
/* NOTE(review): presumably the result is ALPHA * total + old value - the FMA     */
/* macro is defined outside this view.                                            */
.L38:
kusano 2b45e8
	FADD	f8  = f8,  f9
kusano 2b45e8
	FADD	f10 = f10, f11
kusano 2b45e8
	FADD	f12 = f12, f13
kusano 2b45e8
	FADD	f14 = f14, f15
kusano 2b45e8
	;;
kusano 2b45e8
	FADD	f8  = f8,  f12
kusano 2b45e8
	FADD	f10 = f10, f14
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	/* AO1 keeps the pre-increment CO so it can serve as the store pointer */
	mov	AO1 = CO
kusano 2b45e8
	LDFD	f32 = [CO], INCY
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	LDFD	f33 = [CO], INCY
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FMA	f32 = ALPHA, f8,  f32
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FMA	f33 = ALPHA, f10, f33
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	STFD [AO1] = f32
kusano 2b45e8
	add	AO1 = AO1, INCY
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	STFD [AO1] = f33
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 16
kusano 2b45e8
kusano 2b45e8
/* .L40: set-up for the pass taken when bit 0 of N is set (bit clear -> .L99).    */
/* A single input stream AO1 = A is used. A is then advanced by LDA. f8..f15 are  */
/* cleared from f0. I = MIN_M >> 4 selects between the unrolled loop .L42 and the */
/* remainder code at .L45.                                                        */
/* NOTE(review): p8 is written outside this view. When set, one element from AO1  */
/* and one from BO are consumed up front - presumably a peeled leading element,   */
/* confirm.                                                                       */
.L40:
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	mov	AO1 = A
kusano 2b45e8
	mov	f8  = f0
kusano 2b45e8
	shr	I = MIN_M, 4
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	mov	BO  = BUFFER
kusano 2b45e8
	mov	f10 = f0
kusano 2b45e8
	/* p7 = (bit 0 of N is clear) */
	tbit.z	p7, p0  = N, 0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	/* p6 = (MIN_M >> 4 == 0), read before I is doubled in the next bundle */
	cmp.eq	p6, p0 = 0, I
kusano 2b45e8
	mov	f12 = f0
kusano 2b45e8
	mov	pr.rot= 0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfb
kusano 2b45e8
	add	I = I, I
kusano 2b45e8
	mov	f14 = f0
kusano 2b45e8
	(p7) br.cond.dpnt .L99
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p8) LDFD f32 = [AO1], SIZE
kusano 2b45e8
	mov	f9  = f0
kusano 2b45e8
	/* two pipeline stages in .L42 (p16 and p17) */
	mov	ar.ec= 2
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p8) LDFD f40 = [BO], 2 * SIZE
kusano 2b45e8
	/* A += LDA, ready for whatever follows at .L99 */
	add	A   = A, LDA
kusano 2b45e8
	mov	f11 = f0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	adds	WPRE =  1 * SIZE, CO
kusano 2b45e8
	/* PREB is computed here but no PREFETCH in .L42 consumes it */
	adds	PREB  = RPREFETCH * SIZE, BO
kusano 2b45e8
	mov	f13 = f0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	/* arm the first pipeline stage */
	cmp.eq	p16, p0 = r0, r0
kusano 2b45e8
	/* I = 2 * (MIN_M >> 4) - 1. .L42 consumes 8 elements per trip */
	adds	I = -1, I
kusano 2b45e8
	mov	f15 = f0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	lfetch.excl.nt1	[WPRE]
kusano 2b45e8
	(p8) FMPY	f8  = f40, f32
kusano 2b45e8
	mov	ar.lc = I
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	/* fewer than 16 elements: skip the unrolled loop */
	(p6) br.cond.dpnt .L45
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 16
kusano 2b45e8
kusano 2b45e8
/* .L42: software-pipelined loop (br.ctop, stages p16 and p17) over one stream.   */
/* Stage p16 loads three pairs from AO1 and three from BO. Stage p17 loads the    */
/* fourth pair of each (f81/f82 and f110/f111) and issues the FMAs. One trip      */
/* consumes 8 elements. f8 accumulates the first element of every pair and f9     */
/* the second. A value loaded into fN at stage p16 is read as f(N+1) at stage p17 */
/* because of register rotation.                                                  */
.L42:
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p17) LDFPD	f81, f82   = [AO1], 2 * SIZE
kusano 2b45e8
	(p17) LDFPD	f110, f111 = [BO], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f8  = f104, f33, f8
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f9  = f105, f34, f9
kusano 2b45e8
	/* p14/p15 are produced here but nothing inside this loop is predicated on them */
	(p16) tbit.nz.unc p14, p15 = I, 0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f32, f33   = [AO1], 2 * SIZE
kusano 2b45e8
	(p16) LDFPD	f103, f104 = [BO], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f8  = f106, f49, f8
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f9  = f107, f50, f9
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f105, f106 = [BO], 2 * SIZE
kusano 2b45e8
	(p16) LDFPD	f48, f49 = [AO1], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f8  = f108, f65, f8
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f9  = f109, f66, f9
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f64, f65  = [AO1], 2 * SIZE
kusano 2b45e8
	(p16) LDFPD	f107, f108 = [BO], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f8  = f110, f81, f8
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfb
kusano 2b45e8
	/* the trip count lives in ar.lc. I is only decremented alongside it */
	adds	I = -1, I
kusano 2b45e8
	(p17) FMA	f9  = f111, f82, f9
kusano 2b45e8
	br.ctop.sptk.few .L42
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 16
kusano 2b45e8
kusano 2b45e8
/* .L45: straight-line remainder for the one-stream pass (MIN_M & 15 elements).   */
/* The low four bits of MIN_M gate fixed-size pieces:                             */
/*   p12 (bit 3): 8 elements, p13 (bit 2): 4 elements,                            */
/*   p14 (bit 1): 2 elements, p15 (bit 0): 1 element.                             */
/* All loads are issued first, then the FMAs, which spread the products over      */
/* f8..f15. .L48 sums them.                                                       */
.L45:
kusano 2b45e8
	and	I = 15, MIN_M
kusano 2b45e8
	;;
kusano 2b45e8
	cmp.eq	p6,  p0 = 0, I
kusano 2b45e8
	/* nothing left over: go straight to the reduction */
	(p6) br.cond.dpnt .L48
kusano 2b45e8
	;;
kusano 2b45e8
	tbit.nz	p12, p0 = MIN_M, 3
kusano 2b45e8
	tbit.nz	p13, p0 = MIN_M, 2
kusano 2b45e8
	tbit.nz	p14, p0 = MIN_M, 1
kusano 2b45e8
	tbit.nz	p15, p0 = MIN_M, 0
kusano 2b45e8
	;;
kusano 2b45e8
	/* 8-element piece */
	(p12) LDFPD	f32,  f33  = [AO1], 2 * SIZE
kusano 2b45e8
	(p12) LDFPD	f100, f101 = [BO], 2 * SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p12) LDFPD	f36,  f37  = [AO1], 2 * SIZE
kusano 2b45e8
	(p12) LDFPD	f102, f103 = [BO], 2 * SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p12) LDFPD	f40,  f41  = [AO1], 2 * SIZE
kusano 2b45e8
	(p12) LDFPD	f104, f105 = [BO], 2 * SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p12) LDFPD	f44,  f45  = [AO1], 2 * SIZE
kusano 2b45e8
	(p12) LDFPD	f106, f107 = [BO], 2 * SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	/* 4-element piece */
	(p13) LDFPD	f48,  f49  = [AO1], 2 * SIZE
kusano 2b45e8
	(p13) LDFPD	f108, f109 = [BO], 2 * SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p13) LDFPD	f52,  f53  = [AO1], 2 * SIZE
kusano 2b45e8
	(p13) LDFPD	f110, f111 = [BO], 2 * SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	/* 2-element piece */
	(p14) LDFPD	f56,  f57  = [AO1], 2 * SIZE
kusano 2b45e8
	(p14) LDFPD	f112, f113 = [BO], 2 * SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	/* final single element, pointers are not advanced any further */
	(p15) LDFD	f60        = [AO1]
kusano 2b45e8
	(p15) LDFD	f114       = [BO]
kusano 2b45e8
	;;
kusano 2b45e8
	/* eight independent accumulators, one per element of the 8-element piece */
	(p12) FMA	f8  = f100, f32, f8
kusano 2b45e8
	(p12) FMA	f9  = f101, f33, f9
kusano 2b45e8
	(p12) FMA	f10 = f102, f36, f10
kusano 2b45e8
	(p12) FMA	f11 = f103, f37, f11
kusano 2b45e8
	(p12) FMA	f12 = f104, f40, f12
kusano 2b45e8
	(p12) FMA	f13 = f105, f41, f13
kusano 2b45e8
	(p12) FMA	f14 = f106, f44, f14
kusano 2b45e8
	(p12) FMA	f15 = f107, f45, f15
kusano 2b45e8
	;;
kusano 2b45e8
	(p13) FMA	f8  = f108, f48, f8
kusano 2b45e8
	(p13) FMA	f9  = f109, f49, f9
kusano 2b45e8
	(p13) FMA	f10 = f110, f52, f10
kusano 2b45e8
	(p13) FMA	f11 = f111, f53, f11
kusano 2b45e8
	(p14) FMA	f12 = f112, f56, f12
kusano 2b45e8
	(p14) FMA	f13 = f113, f57, f13
kusano 2b45e8
	(p15) FMA	f14 = f114, f60, f14
kusano 2b45e8
	;;
kusano 2b45e8
/* .L48: finish the one-stream pass.                                              */
/* The eight accumulators f8..f15 are summed pairwise down to f8, one value is    */
/* loaded from CO, combined with the total by the FMA macro using ALPHA, and      */
/* stored back to the same address. CO is not advanced here.                      */
/* NOTE(review): presumably the result is ALPHA * total + old value - the FMA     */
/* macro is defined outside this view.                                            */
.L48:
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	LDFD	f32 = [CO]
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FADD	f8  = f8,  f9
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FADD	f10 = f10, f11
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FADD	f12 = f12, f13
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FADD	f14 = f14, f15
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FADD	f8  = f8,  f12
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FADD	f10 = f10, f14
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FADD	f8  = f8,  f10
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FMA	f32 = ALPHA, f8,  f32
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	STFD [CO] = f32
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 16
kusano 2b45e8
kusano 2b45e8
/* .L99: end of one outer step. IS is advanced by the constant P and A by         */
/* LDAP << BASE_SHIFT. While M > IS the outer loop at .LIs_loop is re-entered,    */
/* otherwise control leaves through .L999. Both targets lie outside this view.    */
.L99:
kusano 2b45e8
	adds	IS = P, IS
kusano 2b45e8
	/* A = (LDAP << BASE_SHIFT) + A */
	shladd	A  = LDAP, BASE_SHIFT, A
kusano 2b45e8
	;;
kusano 2b45e8
	cmp.gt	p6, p0 = M, IS
kusano 2b45e8
	(p6) br.cond.dptk .LIs_loop
kusano 2b45e8
	br   .L999
kusano 2b45e8
	.align 4
kusano 2b45e8
	;;
kusano 2b45e8
kusano 2b45e8
/* .L100: entry of a second code path (the branch that reaches it is outside this */
/* view). J = N >> 3 counts groups of eight and CO starts at Y. With J == 0       */
/* control skips to .L120, otherwise it falls through into .L111.                 */
.L100:
kusano 2b45e8
	shr	J   = N, 3
kusano 2b45e8
	mov	CO  = Y
kusano 2b45e8
	;;
kusano 2b45e8
	cmp.eq	p6, p0 = r0, J
kusano 2b45e8
	(p6) br.cond.dpnt .L120
kusano 2b45e8
	;;
kusano 2b45e8
	.align 16
kusano 2b45e8
kusano 2b45e8
/* .L111: set-up for one eight-stream pass.                                       */
/* AO1..AO8 are set to A + k*LDA for k = 0..7 and A is then advanced by 8*LDA.    */
/* The accumulators f8..f23 are cleared from f0. AO21/AO41/AO61/AO81 are          */
/* secondary pointers 7 elements beyond AO2/AO4/AO6/AO8. I = MIN_M >> 4 selects   */
/* between the loop at .L112 (fall-through) and .L115 (outside this view).        */
/* NOTE(review): p8 is written outside this view. When set, one element per       */
/* stream and one from BO are consumed up front and seed the even accumulators -  */
/* presumably a peeled leading element, confirm.                                  */
.L111:
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	mov	AO1 = A
kusano 2b45e8
	mov	f8  = f0
kusano 2b45e8
	mov	pr.rot= 0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	add	AO2 = LDA, A
kusano 2b45e8
	mov	f10 = f0
kusano 2b45e8
	shr	I = MIN_M, 4
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	/* Each shladd below reads its base pointer in the same instruction group in  */
	/* which a (p8) LDFD post-increments that pointer. The shladd comes first, so */
	/* it uses the value from before the increment.                               */
	{ .mmf
kusano 2b45e8
	shladd	AO3 = LDA, 1, A
kusano 2b45e8
	shladd	AO4 = LDA, 1, AO2
kusano 2b45e8
	mov	f12 = f0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p8) LDFD f32 = [AO1], SIZE
kusano 2b45e8
	(p8) LDFD f33 = [AO2], SIZE
kusano 2b45e8
	mov	f14 = f0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	shladd	AO5 = LDA, 1, AO3
kusano 2b45e8
	shladd	AO6 = LDA, 1, AO4
kusano 2b45e8
	mov	f16 = f0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p8) LDFD f34 = [AO3], SIZE
kusano 2b45e8
	(p8) LDFD f35 = [AO4], SIZE
kusano 2b45e8
	mov	f18 = f0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	shladd	AO7 = LDA, 1, AO5
kusano 2b45e8
	shladd	AO8 = LDA, 1, AO6
kusano 2b45e8
	mov	f20 = f0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p8) LDFD f36 = [AO5], SIZE
kusano 2b45e8
	(p8) LDFD f37 = [AO6], SIZE
kusano 2b45e8
	mov	f22 = f0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p8) LDFD f38 = [AO7], SIZE
kusano 2b45e8
	mov	f9  = f0
kusano 2b45e8
	/* two pipeline stages in the loop that follows */
	mov	ar.ec= 2
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p8) LDFD f39 = [AO8], SIZE
kusano 2b45e8
	mov	BO  = BUFFER
kusano 2b45e8
	mov	f11 = f0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p8) LDFD f40 = [BO], 2 * SIZE
kusano 2b45e8
	/* p6 = (MIN_M >> 4 == 0), tested before I is doubled */
	cmp.eq	p6, p0 = 0, I
kusano 2b45e8
	mov	f13 = f0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	/* A += 8 * LDA, ready for the next pass */
	shladd	A   = LDA, 3, A
kusano 2b45e8
	/* arm the first pipeline stage */
	cmp.eq	p16, p0 = r0, r0
kusano 2b45e8
	mov	f15 = f0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	add	I = I, I
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	mov	f17 = f0
kusano 2b45e8
	}
kusano 2b45e8
	/* read-prefetch pointers. Even-numbered ones start 8 elements further on */
	{ .mmf
kusano 2b45e8
	adds	RPRE1  = RPREFETCH * SIZE, AO1
kusano 2b45e8
	adds	RPRE2  = (RPREFETCH + 8) * SIZE, AO2
kusano 2b45e8
	mov	f19 = f0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	/* I = 2 * (MIN_M >> 4) - 1, later copied into ar.lc */
	adds	I = -1, I
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	mov	f21 = f0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	adds	RPRE3  = RPREFETCH * SIZE, AO3
kusano 2b45e8
	adds	RPRE4  = (RPREFETCH + 8) * SIZE, AO4
kusano 2b45e8
	mov	f23 = f0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p8) FMPY	f8  = f40, f32
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	adds	RPRE5  = RPREFETCH * SIZE, AO5
kusano 2b45e8
	adds	RPRE6  = (RPREFETCH + 8) * SIZE, AO6
kusano 2b45e8
	(p8) FMPY	f10 = f40, f33
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	/* secondary pointers, taken after any (p8) increment of AO2/AO4 */
	adds	AO21 = 7 * SIZE, AO2
kusano 2b45e8
	adds	AO41 = 7 * SIZE, AO4
kusano 2b45e8
	(p8) FMPY	f12 = f40, f34
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	adds	RPRE7  = RPREFETCH * SIZE, AO7
kusano 2b45e8
	adds	RPRE8  = (RPREFETCH + 8) * SIZE, AO8
kusano 2b45e8
	(p8) FMPY	f14 = f40, f35
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p8) FMPY	f16 = f40, f36
kusano 2b45e8
	mov	ar.lc = I
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	adds	WPRE = 8 * SIZE, CO
kusano 2b45e8
	adds	PREB  = RPREFETCH * SIZE, BO
kusano 2b45e8
	(p8) FMPY	f18 = f40, f37
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	/* prefetch the output location with the exclusive hint */
	lfetch.excl.nt1	[WPRE]
kusano 2b45e8
	adds	AO61 = 7 * SIZE, AO6
kusano 2b45e8
	(p8) FMPY	f20 = f40, f38
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfb
kusano 2b45e8
	adds	AO81 = 7 * SIZE, AO8
kusano 2b45e8
	(p8) FMPY	f22 = f40, f39
kusano 2b45e8
	/* fewer than 16 elements: skip the unrolled loop */
	(p6) br.cond.dpnt .L115
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 16
kusano 2b45e8
kusano 2b45e8
/*
 * .L112 -- modulo-scheduled (br.ctop) main loop of the 8-column block.
 *
 * Stage p16 issues the loads; stage p17 consumes what the previous pass
 * loaded.  The register numbering relies on rotation by one per pass: a
 * value loaded into fN under p16 is read as fN+1 under p17.
 *
 * Per pass, 8 elements are read from BO and from each of AO1..AO8:
 *   - AO1/AO3/AO5/AO7: four pair loads (LDFPD) each.
 *   - AO2/AO4/AO6/AO8: one single load (LDFD), three pair loads, and a
 *     final single load through the companion pointer AO21/41/61/81.
 *     NOTE(review): presumably this keeps the pair loads aligned for the
 *     even-numbered pointers -- confirm against the setup code outside
 *     this chunk.
 *
 * Accumulators: pointer AOk (k = 1..8) sums into f(6+2k) using the first
 * element of each B pair and into f(7+2k) using the second.
 *
 * Prefetch: bit 0 of I (decremented once per pass) selects p14 or p15, so
 * RPRE1/3/5/7 + PREB and RPRE2/4/6/8 are touched on alternating passes.
 */
.L112:
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p17) LDFPD	f80, f95 = [AO8]
kusano 2b45e8
	(p17) LDFPD	f110, f111 = [BO], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f8  = f104, f33, f8
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p17) adds	AO8 = 3 * SIZE, AO8
kusano 2b45e8
	(p17) FMA	f9  = f105, f34, f9
kusano 2b45e8
	(p16) tbit.nz.unc p14, p15 = I, 0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p14) PREFETCH [RPRE1], 16 * SIZE
kusano 2b45e8
	(p16) LDFPD	f32, f33 = [AO1], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f10 = f104, f35, f10
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f11 = f105, f36, f11
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p15) PREFETCH [RPRE2], 16 * SIZE
kusano 2b45e8
	(p16) LDFD	f34      = [AO2], 1 * SIZE
kusano 2b45e8
	(p17) FMA	f12 = f104, f37, f12
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p17) LDFD	f84      = [AO21], 8 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f13 = f105, f38, f13
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p14) PREFETCH [RPRE3], 16 * SIZE
kusano 2b45e8
	(p16) LDFPD	f36, f37 = [AO3], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f14 = f104, f39, f14
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f15 = f105, f40, f15
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p15) PREFETCH [RPRE4], 16 * SIZE
kusano 2b45e8
	(p16) LDFD	f38      = [AO4], 1 * SIZE
kusano 2b45e8
	(p17) FMA	f16 = f104, f41, f16
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p17) LDFD	f88      = [AO41], 8 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f17 = f105, f42, f17
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p14) PREFETCH [RPRE5], 16 * SIZE
kusano 2b45e8
	(p16) LDFPD	f40, f41 = [AO5], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f18 = f104, f43, f18
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f19 = f105, f44, f19
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p15) PREFETCH [RPRE6], 16 * SIZE
kusano 2b45e8
	(p16) LDFD	f42      = [AO6], 1 * SIZE
kusano 2b45e8
	(p17) FMA	f20 = f104, f45, f20
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p17) LDFD	f92      = [AO61], 8 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f21 = f105, f46, f21
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p14) PREFETCH [RPRE7], 16 * SIZE
kusano 2b45e8
	(p16) LDFPD	f44, f45 = [AO7], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f22 = f104, f47, f22
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f23 = f105, f48, f23
kusano 2b45e8
	}
kusano 2b45e8
	;;
/* Second B pair of the pass (f106/f107) starts here. */
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p15) PREFETCH [RPRE8], 16 * SIZE
kusano 2b45e8
	(p16) LDFD	f46      = [AO8], 1 * SIZE
kusano 2b45e8
	(p17) FMA	f8  = f106, f49, f8
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p17) LDFD	f96      = [AO81], 8 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f9  = f107, f50, f9
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p14) PREFETCH [PREB], 16 * SIZE
kusano 2b45e8
	(p16) LDFPD	f48, f49 = [AO1], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f10 = f106, f51, f10
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f11 = f107, f52, f11
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f35, f50 = [AO2], 2 * SIZE
kusano 2b45e8
	(p16) LDFPD	f103, f104 = [BO], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f12 = f106, f53, f12
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f13 = f107, f54, f13
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f52, f53 = [AO3], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f14 = f106, f55, f14
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f15 = f107, f56, f15
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f39, f54 = [AO4], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f16 = f106, f57, f16
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f17 = f107, f58, f17
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f56, f57 = [AO5], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f18 = f106, f59, f18
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f19 = f107, f60, f19
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f43, f58 = [AO6], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f20 = f106, f61, f20
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f21 = f107, f62, f21
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f60, f61 = [AO7], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f22 = f106, f63, f22
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f23 = f107, f64, f23
kusano 2b45e8
	}
kusano 2b45e8
	;;
/* Third B pair of the pass (f108/f109) starts here. */
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f47, f62 = [AO8], 2 * SIZE
kusano 2b45e8
	(p16) LDFPD	f105, f106 = [BO], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f8  = f108, f65, f8
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f9  = f109, f66, f9
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f64, f65 = [AO1], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f10 = f108, f67, f10
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f11 = f109, f68, f11
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f51, f66 = [AO2], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f12 = f108, f69, f12
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f13 = f109, f70, f13
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f68, f69 = [AO3], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f14 = f108, f71, f14
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f15 = f109, f72, f15
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f55, f70 = [AO4], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f16 = f108, f73, f16
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f17 = f109, f74, f17
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f72, f73 = [AO5], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f18 = f108, f75, f18
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f19 = f109, f76, f19
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f59, f74 = [AO6], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f20 = f108, f77, f20
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f21 = f109, f78, f21
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f76, f77 = [AO7], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f22 = f108, f79, f22
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f23 = f109, f80, f23
kusano 2b45e8
	}
kusano 2b45e8
	;;
/* Fourth B pair of the pass (f110/f111, loaded under p17 at the loop top). */
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f63, f78 = [AO8], 2 * SIZE
kusano 2b45e8
	(p16) LDFPD	f107, f108 = [BO], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f8  = f110, f81, f8
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f9  = f111, f82, f9
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f80, f81 = [AO1], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f10 = f110, f83, f10
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f11 = f111, f84, f11
kusano 2b45e8
	}
kusano 2b45e8
	;;
/* The last pair load of AO2/AO4/AO6 has no post-increment; the pointer is
   advanced by 3 elements in the following bundle instead. */
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f67, f82 = [AO2]
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f12 = f110, f85, f12
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p16) adds	AO2 = 3 * SIZE, AO2
kusano 2b45e8
	(p17) FMA	f13 = f111, f86, f13
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f84, f85 = [AO3], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f14 = f110, f87, f14
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f15 = f111, f88, f15
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f71, f86 = [AO4]
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f16 = f110, f89, f16
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p16) adds	AO4 = 3 * SIZE, AO4
kusano 2b45e8
	(p17) FMA	f17 = f111, f90, f17
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f88, f89 = [AO5], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f18 = f110, f91, f18
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f19 = f111, f92, f19
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f75, f90 = [AO6]
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f20 = f110, f93, f20
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p16) adds	AO6 = 3 * SIZE, AO6
kusano 2b45e8
	(p17) FMA	f21 = f111, f94, f21
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f92, f93 = [AO7], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f22 = f110, f95, f22
kusano 2b45e8
	}
/* I is decremented unconditionally once per pass; only its low bit is
   consumed inside the loop (prefetch parity at the loop top). */
kusano 2b45e8
	{ .mfb
kusano 2b45e8
	adds	I = -1, I
kusano 2b45e8
	(p17) FMA	f23 = f111, f96, f23
kusano 2b45e8
	br.ctop.sptk.few .L112
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 16
kusano 2b45e8
kusano 2b45e8
/*
 * .L115 -- set up the remainder loop (.L116) of the 8-column block.
 *
 *   I      = MIN_M & 15                 rows left over from the main loop
 *   p6     = (I == 0)                   nothing left: skip to .L118
 *   p16/p15: p16 is set, p15 is cleared (first pipeline stage only)
 *   ar.lc  = ((I + 1) >> 1) - 1         remainder is processed two rows per pass
 *   ar.ec  = 3                          three pipeline stages in .L116
 *   AO21/41/61/81 = AO2/4/6/8 + 1 * SIZE   second-row pointers for .L116
 *
 * I is reloaded with MIN_M & 15 at the end because .L116 counts it down.
 */
.L115:
kusano 2b45e8
	and	I = 15, MIN_M
kusano 2b45e8
	mov	pr.rot= 0
kusano 2b45e8
	;;
kusano 2b45e8
	cmp.eq	p6,  p0 = 0, I
kusano 2b45e8
	cmp.eq	p16, p15 = r0, r0
kusano 2b45e8
	;;
kusano 2b45e8
	adds	I = 1, I
kusano 2b45e8
	;;
kusano 2b45e8
	shr	I = I, 1
kusano 2b45e8
	;;
kusano 2b45e8
	adds	I = -1, I
kusano 2b45e8
	adds	AO21 = 1 * SIZE, AO2
kusano 2b45e8
	adds	AO41 = 1 * SIZE, AO4
kusano 2b45e8
	adds	AO61 = 1 * SIZE, AO6
kusano 2b45e8
	adds	AO81 = 1 * SIZE, AO8
kusano 2b45e8
	;;
kusano 2b45e8
	mov	ar.lc = I
kusano 2b45e8
	mov	ar.ec= 3
kusano 2b45e8
	and	I = 15, MIN_M
kusano 2b45e8
	(p6) br.cond.dpnt .L118
kusano 2b45e8
	;;
kusano 2b45e8
	.align 16
kusano 2b45e8
kusano 2b45e8
/*
 * .L116 -- remainder loop of the 8-column block: two rows per pass,
 * three pipeline stages (ar.ec = 3 is set in .L115).
 *
 *   p16: load a B pair (f104, f107) and the first-row A elements:
 *        pair loads from AO1/3/5/7, single loads from AO2/4/6/8.
 *   p17: single loads of the second row via AO21/41/61/81;
 *        I -= 2; p15 = (I != -1).
 *   p18: FMA of the first row of the pair into f8, f10, ..., f22.
 *   p15: FMA of the second row of the pair into f9, f11, ..., f23.
 *        p15 was computed one pass earlier (under p17) and stays clear when
 *        I reached -1, i.e. when the remainder is odd and the pair has no
 *        second row.
 */
.L116:
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f104, f107 = [BO], 2 * SIZE
kusano 2b45e8
	(p16) LDFPD	f32,  f35  = [AO1], 2 * SIZE
kusano 2b45e8
	(p18) FMA	f8  = f106, f34, f8
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p15) FMA	f9  = f109, f37, f9
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFD	f38  = [AO2], 2 * SIZE
kusano 2b45e8
	(p17) LDFD	f42  = [AO21], 2 * SIZE
kusano 2b45e8
	(p18) FMA	f10 = f106, f40, f10
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p15) FMA	f11 = f109, f43, f11
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f44,  f47  = [AO3], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p18) FMA	f12 = f106, f46, f12
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p15) FMA	f13 = f109, f49, f13
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFD	f50  = [AO4], 2 * SIZE
kusano 2b45e8
	(p17) LDFD	f54  = [AO41], 2 * SIZE
kusano 2b45e8
	(p18) FMA	f14 = f106, f52, f14
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p15) FMA	f15 = f109, f55, f15
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f56,  f59  = [AO5], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p18) FMA	f16 = f106, f58, f16
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p15) FMA	f17 = f109, f61, f17
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFD	f62  = [AO6], 2 * SIZE
kusano 2b45e8
	(p17) LDFD	f66  = [AO61], 2 * SIZE
kusano 2b45e8
	(p18) FMA	f18 = f106, f64, f18
kusano 2b45e8
	}
/* Count down the rows consumed by the pair now in stage p17. */
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) adds	I = -2, I
kusano 2b45e8
	(p15) FMA	f19 = f109, f67, f19
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f68,  f71  = [AO7], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p18) FMA	f20 = f106, f70, f20
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p15) FMA	f21 = f109, f73, f21
kusano 2b45e8
	}
kusano 2b45e8
	;;
/* The last (p15) FMA is issued before p15 is recomputed for the next pass. */
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFD	f74  = [AO8], 2 * SIZE
kusano 2b45e8
	(p17) LDFD	f78  = [AO81], 2 * SIZE
kusano 2b45e8
	(p15) FMA	f23 = f109, f79, f23
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfb
kusano 2b45e8
	(p17) cmp.ne.unc p15, p0 = -1, I
kusano 2b45e8
	(p18) FMA	f22 = f106, f76, f22
kusano 2b45e8
	br.ctop.sptk.few .L116
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
kusano 2b45e8
/*
 * .L118 -- finish the 8-column block.
 *
 *   1. Fold each accumulator pair: f8 += f9, f10 += f11, ..., f22 += f23.
 *   2. Load eight output elements into f32..f39 through CO, stepping CO by
 *      INCY after each load.
 *   3. Combine each with its sum via FMA with ALPHA (FMA is a macro defined
 *      outside this chunk; presumably f32 = ALPHA * f8 + f32, etc.).
 *   4. Store the eight results back through AO1, which was set to the
 *      pre-load value of CO, stepping by INCY.
 *   5. J -= 1; branch back to .L111 while 0 < J (32-bit compare).
 */
.L118:
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	mov	AO1 = CO
kusano 2b45e8
	LDFD	f32 = [CO], INCY
kusano 2b45e8
	FADD	f8  = f8,  f9
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	LDFD	f33 = [CO], INCY
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FADD	f10 = f10, f11
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	LDFD	f34 = [CO], INCY
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FADD	f12 = f12, f13
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	LDFD	f35 = [CO], INCY
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FADD	f14 = f14, f15
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	LDFD	f36 = [CO], INCY
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FADD	f16 = f16, f17
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	LDFD	f37 = [CO], INCY
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FADD	f18 = f18, f19
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	LDFD	f38 = [CO], INCY
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FADD	f20 = f20, f21
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	LDFD	f39 = [CO], INCY
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FADD	f22 = f22, f23
kusano 2b45e8
	}
kusano 2b45e8
	;;
/* Scale-and-accumulate for the first four outputs; the remaining four are
   interleaved with the stores below. */
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FMA	f32 = ALPHA, f8,  f32
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FMA	f33 = ALPHA, f10, f33
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FMA	f34 = ALPHA, f12, f34
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FMA	f35 = ALPHA, f14, f35
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	STFD [AO1] = f32
kusano 2b45e8
	add	AO1 = AO1, INCY
kusano 2b45e8
	FMA	f36 = ALPHA, f16, f36
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	STFD [AO1] = f33
kusano 2b45e8
	add	AO1 = AO1, INCY
kusano 2b45e8
	FMA	f37 = ALPHA, f18, f37
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	STFD [AO1] = f34
kusano 2b45e8
	add	AO1 = AO1, INCY
kusano 2b45e8
	FMA	f38 = ALPHA, f20, f38
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	STFD [AO1] = f35
kusano 2b45e8
	add	AO1 = AO1, INCY
kusano 2b45e8
	FMA	f39 = ALPHA, f22, f39
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	STFD [AO1] = f36
kusano 2b45e8
	add	AO1 = AO1, INCY
kusano 2b45e8
	adds J = -1, J
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	STFD [AO1] = f37
kusano 2b45e8
	add	AO1 = AO1, INCY
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	STFD [AO1] = f38
kusano 2b45e8
	add	AO1 = AO1, INCY
kusano 2b45e8
	cmp4.lt p6, p0 = 0, J
kusano 2b45e8
	}
kusano 2b45e8
	;;
/* p6 = (0 < J): another 8-column block remains. */
kusano 2b45e8
	{ .mib
kusano 2b45e8
	STFD [AO1] = f39
kusano 2b45e8
	add	AO1 = AO1, INCY
kusano 2b45e8
	(p6) br.cond.dptk .L111
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 16
kusano 2b45e8
kusano 2b45e8
/*
 * .L120 -- set up the 4-column block; taken only when bit 2 of N is set
 * (otherwise branch to .L130).
 *
 *   AO1..AO4 = A, A + LDA, A + 2*LDA, A + 3*LDA;  then A += 4*LDA.
 *   BO       = BUFFER.
 *   f8..f15  = 0 (accumulators; f0 reads as zero).
 *   Under p8 (set outside this chunk) one leading element of each column
 *   is loaded and multiplied by one element of BO into f8/f10/f12/f14.
 *   NOTE(review): presumably p8 marks an odd leading row -- confirm
 *   where p8 is computed.
 *   I        = MIN_M >> 4;  p6 = (I == 0) -> skip the main loop (.L125).
 *   ar.lc    = 2 * I - 1,  ar.ec = 2,  p16 = 1 for the .L122 pipeline.
 *   RPRE1..4, PREB, WPRE and AO21/AO41 (= AO2/AO4 + 7 * SIZE) are derived
 *   from the pointers above.
 */
.L120:
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	mov	AO1 = A
kusano 2b45e8
	mov	f8  = f0
kusano 2b45e8
	mov	pr.rot= 0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	add	AO2 = LDA, A
kusano 2b45e8
	mov	f10 = f0
kusano 2b45e8
	tbit.z	p6, p0  = N, 2
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	shladd	AO3 = LDA, 1, A
kusano 2b45e8
	mov	f12 = f0
kusano 2b45e8
	shr	I = MIN_M, 4
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfb
kusano 2b45e8
	shladd	AO4 = LDA, 1, AO2
kusano 2b45e8
	mov	f14 = f0
kusano 2b45e8
	(p6) br.cond.dpnt .L130
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p8) LDFD f32 = [AO1], SIZE
kusano 2b45e8
	(p8) LDFD f33 = [AO2], SIZE
kusano 2b45e8
	mov	f9  = f0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	mov	BO  = BUFFER
kusano 2b45e8
	shladd	A   = LDA, 2, A
kusano 2b45e8
	mov	f11 = f0
kusano 2b45e8
	}
kusano 2b45e8
	;;
/* p6 is re-used here: it now means "no full 16-row chunk" (I still holds
   MIN_M >> 4 at this point; it is doubled below). */
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p8) LDFD f40 = [BO], 2 * SIZE
kusano 2b45e8
	cmp.eq	p6, p0 = 0, I
kusano 2b45e8
	mov	f13 = f0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p8) LDFD f34 = [AO3], SIZE
kusano 2b45e8
	(p8) LDFD f35 = [AO4], SIZE
kusano 2b45e8
	mov	f15 = f0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	adds	RPRE1  = RPREFETCH * SIZE, AO1
kusano 2b45e8
	adds	RPRE2  = (RPREFETCH + 8) * SIZE, AO2
kusano 2b45e8
	mov	ar.ec= 2
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	cmp.eq	p16, p0 = r0, r0
kusano 2b45e8
	add	I = I, I
kusano 2b45e8
	adds	AO21 = 7 * SIZE, AO2
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	adds	WPRE =  4 * SIZE, CO
kusano 2b45e8
	adds	PREB  = RPREFETCH * SIZE, BO
kusano 2b45e8
	(p8) FMPY	f8  = f40, f32
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	adds	RPRE3  = RPREFETCH * SIZE, AO3
kusano 2b45e8
	adds	I = -1, I
kusano 2b45e8
	(p8) FMPY	f10 = f40, f33
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	adds	AO41 = 7 * SIZE, AO4
kusano 2b45e8
	(p8) FMPY	f12 = f40, f34
kusano 2b45e8
	mov	ar.lc = I
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfb
kusano 2b45e8
	adds	RPRE4  = (RPREFETCH + 8) * SIZE, AO4
kusano 2b45e8
	(p8) FMPY	f14 = f40, f35
kusano 2b45e8
	(p6) br.cond.dpnt .L125
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 16
kusano 2b45e8
kusano 2b45e8
/*
 * .L122 -- modulo-scheduled (br.ctop) main loop of the 4-column block.
 * The loop runs 2 * (MIN_M >> 4) passes (ar.lc is set in .L120) with two
 * stages: p16 loads, p17 multiplies what the previous pass loaded (the
 * register numbering relies on rotation by one per pass).
 *
 * Per pass, 8 elements are read from BO and from each of AO1..AO4:
 *   - AO1/AO3: four pair loads each.
 *   - AO2/AO4: one single load, three pair loads, and a final single load
 *     through AO21/AO41.  The last AO4 pair is fetched under p17 at the
 *     top of the following pass.
 * Accumulators: AO1 -> f8/f9, AO2 -> f10/f11, AO3 -> f12/f13,
 * AO4 -> f14/f15 (first/second element of each B pair).
 * Bit 0 of I alternates the prefetch targets between p14 and p15.
 */
.L122:
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p17) LDFPD	f72, f87 = [AO4]
kusano 2b45e8
	(p17) LDFPD	f110, f111 = [BO], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f8  = f104, f33, f8
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p17) adds	AO4 = 3 * SIZE, AO4
kusano 2b45e8
	(p17) FMA	f9  = f105, f34, f9
kusano 2b45e8
	(p16) tbit.nz.unc p14, p15 = I, 0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p14) PREFETCH [RPRE1], 16 * SIZE
kusano 2b45e8
	(p16) LDFPD	f32, f33 = [AO1], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f10 = f104, f35, f10
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f11 = f105, f36, f11
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p15) PREFETCH [RPRE2], 16 * SIZE
kusano 2b45e8
	(p16) LDFD	f34      = [AO2], 1 * SIZE
kusano 2b45e8
	(p17) FMA	f12 = f104, f37, f12
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p17) LDFD	f84      = [AO21], 8 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f13 = f105, f38, f13
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p14) PREFETCH [RPRE3], 16 * SIZE
kusano 2b45e8
	(p16) LDFPD	f36, f37 = [AO3], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f14 = f104, f39, f14
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f15 = f105, f40, f15
kusano 2b45e8
	}
kusano 2b45e8
	;;
/* Second B pair of the pass (f106/f107). */
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p15) PREFETCH [RPRE4], 16 * SIZE
kusano 2b45e8
	(p16) LDFD	f38      = [AO4], 1 * SIZE
kusano 2b45e8
	(p17) FMA	f8  = f106, f49, f8
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p17) LDFD	f88      = [AO41], 8 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f9  = f107, f50, f9
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p14) PREFETCH [PREB], 16 * SIZE
kusano 2b45e8
	(p16) LDFPD	f48, f49 = [AO1], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f10 = f106, f51, f10
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f11 = f107, f52, f11
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f35, f50 = [AO2], 2 * SIZE
kusano 2b45e8
	(p16) LDFPD	f103, f104 = [BO], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f12 = f106, f53, f12
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f13 = f107, f54, f13
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f52, f53 = [AO3], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f14 = f106, f55, f14
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f15 = f107, f56, f15
kusano 2b45e8
	}
kusano 2b45e8
	;;
/* Third B pair of the pass (f108/f109). */
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f39, f54 = [AO4], 2 * SIZE
kusano 2b45e8
	(p16) LDFPD	f105, f106 = [BO], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f8  = f108, f65, f8
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f9  = f109, f66, f9
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f64, f65 = [AO1], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f10 = f108, f67, f10
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f11 = f109, f68, f11
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f51, f66 = [AO2], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f12 = f108, f69, f12
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f13 = f109, f70, f13
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f68, f69 = [AO3], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f14 = f108, f71, f14
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f15 = f109, f72, f15
kusano 2b45e8
	}
kusano 2b45e8
	;;
/* Fourth B pair of the pass (f110/f111, loaded under p17 at the loop top). */
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f55, f70 = [AO4], 2 * SIZE
kusano 2b45e8
	(p16) LDFPD	f107, f108 = [BO], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f8  = f110, f81, f8
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f9  = f111, f82, f9
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f80, f81 = [AO1], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f10 = f110, f83, f10
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f11 = f111, f84, f11
kusano 2b45e8
	}
kusano 2b45e8
	;;
/* Last AO2 pair load has no post-increment; AO2 is advanced by 3 elements
   in the following bundle. */
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f67, f82 = [AO2]
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f12 = f110, f85, f12
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p16) adds	AO2 = 3 * SIZE, AO2
kusano 2b45e8
	(p17) FMA	f13 = f111, f86, f13
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f84, f85 = [AO3], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f14 = f110, f87, f14
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfb
kusano 2b45e8
	adds	I = -1, I
kusano 2b45e8
	(p17) FMA	f15 = f111, f88, f15
kusano 2b45e8
	br.ctop.sptk.few .L122
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 16
kusano 2b45e8
kusano 2b45e8
/*
 * .L125 -- set up the remainder loop (.L126) of the 4-column block.
 *
 *   I      = MIN_M & 15;  p6 = (I == 0) -> nothing left, go to .L128.
 *   p16 is set and p15 is cleared; the rotating predicates are zeroed first.
 *   ar.lc  = ((I + 1) >> 1) - 1   (two rows per pass),  ar.ec = 3.
 *   AO21/AO41 = AO2/AO4 + 1 * SIZE (second-row pointers for .L126).
 *   I is reloaded with MIN_M & 15 because .L126 counts it down.
 */
.L125:
kusano 2b45e8
	and	I = 15, MIN_M
kusano 2b45e8
	mov	pr.rot= 0
kusano 2b45e8
	;;
kusano 2b45e8
	cmp.eq	p6,  p0 = 0, I
kusano 2b45e8
	cmp.eq	p16, p15 = r0, r0
kusano 2b45e8
	;;
kusano 2b45e8
	adds	I = 1, I
kusano 2b45e8
	adds	AO21 = 1 * SIZE, AO2
kusano 2b45e8
	adds	AO41 = 1 * SIZE, AO4
kusano 2b45e8
	;;
kusano 2b45e8
	shr	I = I, 1
kusano 2b45e8
	;;
kusano 2b45e8
	adds	I = -1, I
kusano 2b45e8
	;;
kusano 2b45e8
	mov	ar.lc = I
kusano 2b45e8
	mov	ar.ec= 3
kusano 2b45e8
	and	I = 15, MIN_M
kusano 2b45e8
	(p6) br.cond.dpnt .L128
kusano 2b45e8
	;;
kusano 2b45e8
	.align 16
kusano 2b45e8
kusano 2b45e8
/*
 * .L126 -- remainder loop of the 4-column block: two rows per pass,
 * three pipeline stages (ar.ec = 3 is set in .L125).
 *
 *   p16: load a B pair (f104, f107), pair loads from AO1/AO3, single
 *        loads from AO2/AO4.
 *   p17: single loads of the second row via AO21/AO41;
 *        I -= 2; p15 = (I != -1).
 *   p18: FMA of the first row of the pair into f8/f10/f12/f14.
 *   p15: FMA of the second row into f9/f11/f13/f15; clear when I reached
 *        -1, i.e. the remainder is odd and the pair has no second row.
 */
.L126:
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f104, f107 = [BO], 2 * SIZE
kusano 2b45e8
	(p16) LDFPD	f32,  f35  = [AO1], 2 * SIZE
kusano 2b45e8
	(p18) FMA	f8  = f106, f34, f8
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p15) FMA	f9  = f109, f37, f9
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p17) LDFD	f42        = [AO21], 2 * SIZE
kusano 2b45e8
	(p16) LDFD	f38        = [AO2], 2 * SIZE
kusano 2b45e8
	(p18) FMA	f10 = f106, f40, f10
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p15) FMA	f11 = f109, f43, f11
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f44,  f47  = [AO3], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p18) FMA	f12 = f106, f46, f12
kusano 2b45e8
	}
/* Count down the rows consumed by the pair now in stage p17. */
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) adds	I = -2, I
kusano 2b45e8
	(p15) FMA	f13 = f109, f49, f13
kusano 2b45e8
	}
kusano 2b45e8
	;;
/* The last (p15) FMA is issued before p15 is recomputed for the next pass. */
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p17) LDFD	f54        = [AO41], 2 * SIZE
kusano 2b45e8
	(p16) LDFD	f50        = [AO4], 2 * SIZE
kusano 2b45e8
	(p15) FMA	f15 = f109, f55, f15
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfb
kusano 2b45e8
	(p17) cmp.ne.unc p15, p0 = -1, I
kusano 2b45e8
	(p18) FMA	f14 = f106, f52, f14
kusano 2b45e8
	br.ctop.sptk.few .L126
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
kusano 2b45e8
/*
 * .L128 -- finish the 4-column block.
 *
 *   1. Fold each accumulator pair: f8 += f9, f10 += f11, f12 += f13,
 *      f14 += f15.
 *   2. Load four output elements into f32..f35 through CO, stepping CO by
 *      INCY after each load.
 *   3. Combine each with its sum via FMA with ALPHA (FMA is a macro defined
 *      outside this chunk; presumably f32 = ALPHA * f8 + f32, etc.).
 *   4. Store the four results back through AO1, which was set to the
 *      pre-load value of CO, stepping by INCY.
 *
 * Control then falls through to .L130.
 */
.L128:
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	mov	AO1 = CO
kusano 2b45e8
	LDFD	f32 = [CO], INCY
kusano 2b45e8
	FADD	f8  = f8,  f9
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	LDFD	f33 = [CO], INCY
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FADD	f10 = f10, f11
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	LDFD	f34 = [CO], INCY
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FADD	f12 = f12, f13
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	LDFD	f35 = [CO], INCY
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FADD	f14 = f14, f15
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FMA	f32 = ALPHA, f8,  f32
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FMA	f33 = ALPHA, f10, f33
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FMA	f34 = ALPHA, f12, f34
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FMA	f35 = ALPHA, f14, f35
kusano 2b45e8
	}
kusano 2b45e8
	;;
/* NOTE(review): the store bundles below name the .mmf template but list
   only two instructions -- confirm the assembler fills the third slot. */
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	STFD [AO1] = f32
kusano 2b45e8
	add	AO1 = AO1, INCY
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	STFD [AO1] = f33
kusano 2b45e8
	add	AO1 = AO1, INCY
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	STFD [AO1] = f34
kusano 2b45e8
	add	AO1 = AO1, INCY
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	STFD [AO1] = f35
kusano 2b45e8
	add	AO1 = AO1, INCY
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 16
kusano 2b45e8
kusano 2b45e8
/*
 * .L130 -- set up the 2-column block; taken only when bit 1 of N is set
 * (otherwise branch to .L140).
 *
 *   AO1, AO2 = A, A + LDA;  then A += 2*LDA.
 *   BO       = BUFFER;  WPRE = CO + 4 * SIZE.
 *   f8..f15  = 0 (f0 reads as zero).
 *   Under p8 (set outside this chunk) one leading element of each column
 *   is loaded and multiplied by one element of BO into f8/f10.
 *   I        = MIN_M >> 4;  p6 = (I == 0) -> skip the main loop (.L135).
 *   ar.lc    = 2 * I - 1,  ar.ec = 2,  p16 = 1 for the .L132 pipeline.
 *   RPRE1/RPRE2, PREB and AO21 (= AO2 + 7 * SIZE) are derived from the
 *   pointers above.
 */
.L130:
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	mov	AO1 = A
kusano 2b45e8
	mov	f8  = f0
kusano 2b45e8
	mov	pr.rot= 0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	add	AO2 = LDA, A
kusano 2b45e8
	mov	f10 = f0
kusano 2b45e8
	tbit.z	p6, p0  = N, 1
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	mov	BO  = BUFFER
kusano 2b45e8
	mov	f12 = f0
kusano 2b45e8
	shr	I = MIN_M, 4
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfb
kusano 2b45e8
	adds	WPRE =  4 * SIZE, CO
kusano 2b45e8
	mov	f14 = f0
kusano 2b45e8
	(p6) br.cond.dpnt .L140
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p8) LDFD f32 = [AO1], SIZE
kusano 2b45e8
	(p8) LDFD f33 = [AO2], SIZE
kusano 2b45e8
	mov	f9  = f0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	shladd	A   = LDA, 1, A
kusano 2b45e8
	mov	f11 = f0
kusano 2b45e8
	mov	ar.ec= 2
kusano 2b45e8
	}
kusano 2b45e8
	;;
/* p6 is re-used here: it now means "no full 16-row chunk" (I still holds
   MIN_M >> 4 when compared; it is doubled in the next bundle). */
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p8) LDFD f40 = [BO], 2 * SIZE
kusano 2b45e8
	cmp.eq	p6, p0 = 0, I
kusano 2b45e8
	mov	f13 = f0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	adds	RPRE1  = RPREFETCH * SIZE, AO1
kusano 2b45e8
	add	I = I, I
kusano 2b45e8
	mov	f15 = f0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmi
kusano 2b45e8
	cmp.eq	p16, p0 = r0, r0
kusano 2b45e8
	adds	RPRE2  = (RPREFETCH + 8) * SIZE, AO2
kusano 2b45e8
	adds	I = -1, I
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	adds	AO21 = 7 * SIZE, AO2
kusano 2b45e8
	(p8) FMPY	f8  = f40, f32
kusano 2b45e8
	mov	ar.lc = I
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfb
kusano 2b45e8
	adds	PREB  = RPREFETCH * SIZE, BO
kusano 2b45e8
	(p8) FMPY	f10 = f40, f33
kusano 2b45e8
	(p6) br.cond.dpnt .L135
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 16
kusano 2b45e8
kusano 2b45e8
/*
 * .L132 -- modulo-scheduled (br.ctop) main loop of the 2-column block.
 * The loop runs 2 * (MIN_M >> 4) passes (ar.lc is set in .L130) with two
 * stages: p16 loads, p17 multiplies what the previous pass loaded (the
 * register numbering relies on rotation by one per pass).
 *
 * Per pass, 8 elements are read from BO and from each of AO1/AO2:
 *   - AO1: four pair loads.
 *   - AO2: one single load, three pair loads (the last one under p17 at
 *     the top of the following pass, followed by AO2 += 3 * SIZE), and a
 *     final single load through AO21.
 * Accumulators: AO1 -> f8/f9, AO2 -> f10/f11 (first/second element of
 * each B pair).
 * Bit 0 of I alternates the prefetch targets: p14 -> RPRE1 and PREB,
 * p15 -> RPRE2.
 */
.L132:
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p17) LDFPD	f68, f83 = [AO2]
kusano 2b45e8
	(p17) LDFPD	f110, f111 = [BO], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f8  = f104, f33, f8
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p17) adds	AO2 = 3 * SIZE, AO2
kusano 2b45e8
	(p17) FMA	f9  = f105, f34, f9
kusano 2b45e8
	(p16) tbit.nz.unc p14, p15 = I, 0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p14) PREFETCH [RPRE1], 16 * SIZE
kusano 2b45e8
	(p16) LDFPD	f32, f33 = [AO1], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f10 = f104, f35, f10
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f11 = f105, f36, f11
kusano 2b45e8
	}
kusano 2b45e8
	;;
/* Second B pair of the pass (f106/f107). */
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p15) PREFETCH [RPRE2], 16 * SIZE
kusano 2b45e8
	(p16) LDFD	f34      = [AO2], 1 * SIZE
kusano 2b45e8
	(p17) FMA	f8  = f106, f49, f8
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p17) LDFD	f84      = [AO21], 8 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f9  = f107, f50, f9
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p14) PREFETCH [PREB], 16 * SIZE
kusano 2b45e8
	(p16) LDFPD	f48, f49 = [AO1], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f10 = f106, f51, f10
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f11 = f107, f52, f11
kusano 2b45e8
	}
kusano 2b45e8
	;;
/* Third B pair of the pass (f108/f109). */
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f35, f50 = [AO2], 2 * SIZE
kusano 2b45e8
	(p16) LDFPD	f103, f104 = [BO], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f8  = f108, f65, f8
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f9  = f109, f66, f9
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f105, f106 = [BO], 2 * SIZE
kusano 2b45e8
	(p16) LDFPD	f64, f65 = [AO1], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f10 = f108, f67, f10
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f11 = f109, f68, f11
kusano 2b45e8
	}
kusano 2b45e8
	;;
/* Fourth B pair of the pass (f110/f111, loaded under p17 at the loop top). */
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f51, f66 = [AO2], 2 * SIZE
kusano 2b45e8
	(p16) LDFPD	f107, f108 = [BO], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f8  = f110, f81, f8
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f9  = f111, f82, f9
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f80, f81 = [AO1], 2 * SIZE
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f10 = f110, f83, f10
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfb
kusano 2b45e8
	adds	I = -1, I
kusano 2b45e8
	(p17) FMA	f11 = f111, f84, f11
kusano 2b45e8
	br.ctop.sptk.few .L132
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 16
kusano 2b45e8
kusano 2b45e8
.L135:
/*
 * .L135: tail of the two-column case for the MIN_M & 15 leftover elements.
 * When that remainder is zero control goes straight to .L138.
 * Otherwise p12/p13/p14/p15 are set from bits 3/2/1/0 of MIN_M and gate
 * chunks of 8/4/2/1 elements.  All loads are issued first, then the FMAs:
 *   f8, f9, f12, f13   accumulate BO values times AO1 values
 *   f10, f11, f14, f15 accumulate BO values times AO2 values
 * Within each 8- and 4-element chunk the AO2 stream is read as
 * single, pair(s), single, whereas AO1 is read in pairs throughout.
 * NOTE(review): LDFPD/LDFD/FMA are macros defined outside this chunk;
 * FMA d = a, b, c is assumed to compute a * b + c.
 */
kusano 2b45e8
	and	I = 15, MIN_M
kusano 2b45e8
	;;
kusano 2b45e8
	cmp.eq	p6,  p0 = 0, I
kusano 2b45e8
	(p6) br.cond.dpnt .L138
kusano 2b45e8
	;;
kusano 2b45e8
	tbit.nz	p12, p0 = MIN_M, 3
kusano 2b45e8
	tbit.nz	p13, p0 = MIN_M, 2
kusano 2b45e8
	tbit.nz	p14, p0 = MIN_M, 1
kusano 2b45e8
	tbit.nz	p15, p0 = MIN_M, 0
kusano 2b45e8
	;;
kusano 2b45e8
	/* p12: eight elements (BO -> f100..f107, AO1 -> f32..f45, AO2 -> f34..f47) */
	(p12) LDFPD	f100, f101 = [BO], 2 * SIZE
kusano 2b45e8
	(p12) LDFPD	f32,  f33  = [AO1], 2 * SIZE
kusano 2b45e8
	(p12) LDFD	f34        = [AO2], 1 * SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p12) LDFPD	f36,  f37  = [AO1], 2 * SIZE
kusano 2b45e8
	(p12) LDFPD	f35,  f38  = [AO2], 2 * SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p12) LDFPD	f102, f103 = [BO],  2 * SIZE
kusano 2b45e8
	(p12) LDFPD	f39,  f42  = [AO2], 2 * SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p12) LDFPD	f40,  f41  = [AO1], 2 * SIZE
kusano 2b45e8
	(p12) LDFPD	f43,  f46  = [AO2], 2 * SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p12) LDFPD	f104, f105 = [BO], 2 * SIZE
kusano 2b45e8
	(p12) LDFPD	f44,  f45  = [AO1], 2 * SIZE
kusano 2b45e8
	(p12) LDFD	f47        = [AO2], 1 * SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p12) LDFPD	f106, f107 = [BO], 2 * SIZE
kusano 2b45e8
	/* p13: four elements */
	(p13) LDFD	f50        = [AO2], 1 * SIZE
kusano 2b45e8
	(p13) LDFPD	f48,  f49  = [AO1], 2 * SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p13) LDFPD	f108, f109 = [BO], 2 * SIZE
kusano 2b45e8
	(p13) LDFPD	f51,  f54  = [AO2], 2 * SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p13) LDFPD	f110, f111 = [BO], 2 * SIZE
kusano 2b45e8
	(p13) LDFPD	f52,  f53  = [AO1], 2 * SIZE
kusano 2b45e8
	(p13) LDFD	f55        = [AO2], 1 * SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	/* p14: two elements */
	(p14) LDFPD	f56,  f57  = [AO1], 2 * SIZE
kusano 2b45e8
	(p14) LDFD	f58        = [AO2], 1 * SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p14) LDFPD	f112, f113 = [BO], 2 * SIZE
kusano 2b45e8
	/* p15: final single element; pointers are not advanced any further */
	(p15) LDFD	f60        = [AO1]
kusano 2b45e8
	(p14) LDFD	f59        = [AO2], 1 * SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p15) LDFD	f61        = [AO2]
kusano 2b45e8
	(p15) LDFD	f114       = [BO]
kusano 2b45e8
	;;
kusano 2b45e8
	/* multiply-accumulate phase, same predicate gating as the loads */
	(p12) FMA	f8  = f100, f32, f8
kusano 2b45e8
	(p12) FMA	f9  = f101, f33, f9
kusano 2b45e8
	(p12) FMA	f10 = f100, f34, f10
kusano 2b45e8
	(p12) FMA	f11 = f101, f35, f11
kusano 2b45e8
	;;
kusano 2b45e8
	(p12) FMA	f12 = f102, f36, f12
kusano 2b45e8
	(p12) FMA	f13 = f103, f37, f13
kusano 2b45e8
	(p12) FMA	f14 = f102, f38, f14
kusano 2b45e8
	(p12) FMA	f15 = f103, f39, f15
kusano 2b45e8
	;;
kusano 2b45e8
	(p12) FMA	f8  = f104, f40, f8
kusano 2b45e8
	(p12) FMA	f9  = f105, f41, f9
kusano 2b45e8
	(p12) FMA	f10 = f104, f42, f10
kusano 2b45e8
	(p12) FMA	f11 = f105, f43, f11
kusano 2b45e8
	;;
kusano 2b45e8
	(p12) FMA	f12 = f106, f44, f12
kusano 2b45e8
	(p12) FMA	f13 = f107, f45, f13
kusano 2b45e8
	(p12) FMA	f14 = f106, f46, f14
kusano 2b45e8
	(p12) FMA	f15 = f107, f47, f15
kusano 2b45e8
	;;
kusano 2b45e8
	(p13) FMA	f8  = f108, f48, f8
kusano 2b45e8
	(p13) FMA	f9  = f109, f49, f9
kusano 2b45e8
	(p13) FMA	f10 = f108, f50, f10
kusano 2b45e8
	(p13) FMA	f11 = f109, f51, f11
kusano 2b45e8
	;;
kusano 2b45e8
	(p13) FMA	f12 = f110, f52, f12
kusano 2b45e8
	(p13) FMA	f13 = f111, f53, f13
kusano 2b45e8
	(p13) FMA	f14 = f110, f54, f14
kusano 2b45e8
	(p13) FMA	f15 = f111, f55, f15
kusano 2b45e8
	;;
kusano 2b45e8
	(p14) FMA	f8  = f112, f56, f8
kusano 2b45e8
	(p14) FMA	f9  = f113, f57, f9
kusano 2b45e8
	(p14) FMA	f10 = f112, f58, f10
kusano 2b45e8
	(p14) FMA	f11 = f113, f59, f11
kusano 2b45e8
	;;
kusano 2b45e8
	(p15) FMA	f12 = f114, f60, f12
kusano 2b45e8
	(p15) FMA	f14 = f114, f61, f14
kusano 2b45e8
	;;
kusano 2b45e8
.L138:
/*
 * .L138: finish the two-column case.
 * Reduces the partial sums (f8+f9+f12+f13 -> f8, f10+f11+f14+f15 -> f10),
 * then updates two values addressed through CO with stride INCY:
 *   first  = ALPHA * f8  + first
 *   second = ALPHA * f10 + second
 * CO is advanced by INCY twice by the loads; AO1 is reused as the store
 * pointer, seeded with CO's value from before the first load.
 * Control then falls through into .L140.
 * NOTE(review): FADD/FMA/LDFD/STFD are macros defined outside this chunk;
 * FMA d = a, b, c is assumed to compute a * b + c.
 */
kusano 2b45e8
	FADD	f8  = f8,  f9
kusano 2b45e8
	FADD	f10 = f10, f11
kusano 2b45e8
	FADD	f12 = f12, f13
kusano 2b45e8
	FADD	f14 = f14, f15
kusano 2b45e8
	;;
kusano 2b45e8
	FADD	f8  = f8,  f12
kusano 2b45e8
	FADD	f10 = f10, f14
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	/* same instruction group: the mov reads CO before the load's post-increment takes effect */
	mov	AO1 = CO
kusano 2b45e8
	LDFD	f32 = [CO], INCY
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	LDFD	f33 = [CO], INCY
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FMA	f32 = ALPHA, f8,  f32
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FMA	f33 = ALPHA, f10, f33
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	/* store first result, then step the store pointer by INCY */
	STFD [AO1] = f32
kusano 2b45e8
	add	AO1 = AO1, INCY
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	STFD [AO1] = f33
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 16
kusano 2b45e8
kusano 2b45e8
.L140:
/*
 * .L140: set-up for the single-column case.
 *   - AO1 = A, BO = BUFFER, accumulators f8..f15 cleared from f0.
 *   - If bit 0 of N is clear (p7) there is no such column: branch to .L199.
 *   - p6 = ((MIN_M >> 4) == 0); tested later to skip the loop (go to .L145).
 *   - Loop count: I = 2 * (MIN_M >> 4) - 1 is written to ar.lc, ar.ec = 2,
 *     rotating predicates cleared (pr.rot = 0) and p16 then forced true.
 *   - A is advanced by LDA; WPRE = CO + 1 * SIZE is prefetched with
 *     lfetch.excl; PREB = BO + RPREFETCH * SIZE.
 *   - When p8 is true, one element is loaded through AO1 and one through
 *     BO (BO steps by 2 * SIZE) and f8 starts as their product.
 * NOTE(review): p8 is set before this chunk and its meaning is not
 * visible here; confirm why BO advances by 2 * SIZE while AO1 advances
 * by SIZE on that path.
 */
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	mov	AO1 = A
kusano 2b45e8
	mov	f8  = f0
kusano 2b45e8
	shr	I = MIN_M, 4
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	mov	BO  = BUFFER
kusano 2b45e8
	mov	f10 = f0
kusano 2b45e8
	tbit.z	p7, p0  = N, 0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	/* reads I (= MIN_M >> 4) before the add below, in the same group, doubles it */
	cmp.eq	p6, p0 = 0, I
kusano 2b45e8
	mov	f12 = f0
kusano 2b45e8
	mov	pr.rot= 0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfb
kusano 2b45e8
	add	I = I, I
kusano 2b45e8
	mov	f14 = f0
kusano 2b45e8
	(p7) br.cond.dpnt .L199
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	(p8) LDFD f32 = [AO1], SIZE
kusano 2b45e8
	mov	f9  = f0
kusano 2b45e8
	mov	ar.ec= 2
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p8) LDFD f40 = [BO], 2 * SIZE
kusano 2b45e8
	add	A   = A, LDA
kusano 2b45e8
	mov	f11 = f0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	adds	WPRE =  1 * SIZE, CO
kusano 2b45e8
	adds	PREB  = RPREFETCH * SIZE, BO
kusano 2b45e8
	mov	f13 = f0
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	/* r0 == r0 always holds, so this sets p16 (first pipeline stage) */
	cmp.eq	p16, p0 = r0, r0
kusano 2b45e8
	adds	I = -1, I
kusano 2b45e8
	mov	f15 = f0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	lfetch.excl.nt1	[WPRE]
kusano 2b45e8
	(p8) FMPY	f8  = f40, f32
kusano 2b45e8
	mov	ar.lc = I
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmb
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	/* fewer than 16 elements: go straight to the tail code */
	(p6) br.cond.dpnt .L145
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 16
kusano 2b45e8
kusano 2b45e8
.L142:
/*
 * .L142: single-column counted loop closed by br.ctop, with stages
 * selected by rotating predicates p16 (loads) and p17 (multiply-adds
 * plus the fourth pair of loads).
 * Each pass consumes four pairs through AO1 and four pairs through BO
 * and accumulates the eight products alternately into f8 and f9.
 * br.ctop rotates the register file by one, so a value loaded into fN
 * under p16 is read as fN+1 by the p17 FMAs on the next pass; the pairs
 * f81/f82 and f110/f111 are loaded under p17 and used in that same pass.
 * p14/p15 are computed from bit 0 of I but are not consumed by any
 * instruction inside this loop.
 * NOTE(review): LDFPD and FMA are macros defined outside this chunk;
 * FMA d = a, b, c is assumed to compute a * b + c.
 */
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p17) LDFPD	f81, f82   = [AO1], 2 * SIZE
kusano 2b45e8
	(p17) LDFPD	f110, f111 = [BO], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f8  = f104, f33, f8
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfi
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f9  = f105, f34, f9
kusano 2b45e8
	(p16) tbit.nz.unc p14, p15 = I, 0
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f32, f33   = [AO1], 2 * SIZE
kusano 2b45e8
	(p16) LDFPD	f103, f104 = [BO], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f8  = f106, f49, f8
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f9  = f107, f50, f9
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f105, f106 = [BO], 2 * SIZE
kusano 2b45e8
	(p16) LDFPD	f48, f49 = [AO1], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f8  = f108, f65, f8
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	(p17) FMA	f9  = f109, f66, f9
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	(p16) LDFPD	f64, f65  = [AO1], 2 * SIZE
kusano 2b45e8
	(p16) LDFPD	f107, f108 = [BO], 2 * SIZE
kusano 2b45e8
	(p17) FMA	f8  = f110, f81, f8
kusano 2b45e8
	}
kusano 2b45e8
	{ .mfb
kusano 2b45e8
	/* unpredicated decrement of I on every pass */
	adds	I = -1, I
kusano 2b45e8
	(p17) FMA	f9  = f111, f82, f9
kusano 2b45e8
	br.ctop.sptk.few .L142
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 16
kusano 2b45e8
kusano 2b45e8
.L145:
/*
 * .L145: tail of the single-column case for the MIN_M & 15 leftover
 * elements.  When that remainder is zero control goes to .L148.
 * Otherwise p12/p13/p14/p15 are set from bits 3/2/1/0 of MIN_M and gate
 * chunks of 8/4/2/1 elements: paired loads through AO1 and BO (a single
 * load each for the last element), followed by FMAs that spread the
 * products over the accumulators f8..f15.
 * NOTE(review): LDFPD/LDFD/FMA are macros defined outside this chunk;
 * FMA d = a, b, c is assumed to compute a * b + c.
 */
kusano 2b45e8
	and	I = 15, MIN_M
kusano 2b45e8
	;;
kusano 2b45e8
	cmp.eq	p6,  p0 = 0, I
kusano 2b45e8
	(p6) br.cond.dpnt .L148
kusano 2b45e8
	;;
kusano 2b45e8
	tbit.nz	p12, p0 = MIN_M, 3
kusano 2b45e8
	tbit.nz	p13, p0 = MIN_M, 2
kusano 2b45e8
	tbit.nz	p14, p0 = MIN_M, 1
kusano 2b45e8
	tbit.nz	p15, p0 = MIN_M, 0
kusano 2b45e8
	;;
kusano 2b45e8
	/* p12: eight elements */
	(p12) LDFPD	f32,  f33  = [AO1], 2 * SIZE
kusano 2b45e8
	(p12) LDFPD	f100, f101 = [BO], 2 * SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p12) LDFPD	f36,  f37  = [AO1], 2 * SIZE
kusano 2b45e8
	(p12) LDFPD	f102, f103 = [BO], 2 * SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p12) LDFPD	f40,  f41  = [AO1], 2 * SIZE
kusano 2b45e8
	(p12) LDFPD	f104, f105 = [BO], 2 * SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p12) LDFPD	f44,  f45  = [AO1], 2 * SIZE
kusano 2b45e8
	(p12) LDFPD	f106, f107 = [BO], 2 * SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	/* p13: four elements */
	(p13) LDFPD	f48,  f49  = [AO1], 2 * SIZE
kusano 2b45e8
	(p13) LDFPD	f108, f109 = [BO], 2 * SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	(p13) LDFPD	f52,  f53  = [AO1], 2 * SIZE
kusano 2b45e8
	(p13) LDFPD	f110, f111 = [BO], 2 * SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	/* p14: two elements */
	(p14) LDFPD	f56,  f57  = [AO1], 2 * SIZE
kusano 2b45e8
	(p14) LDFPD	f112, f113 = [BO], 2 * SIZE
kusano 2b45e8
	;;
kusano 2b45e8
	/* p15: final single element; pointers are not advanced any further */
	(p15) LDFD	f60        = [AO1]
kusano 2b45e8
	(p15) LDFD	f114       = [BO]
kusano 2b45e8
	;;
kusano 2b45e8
	/* multiply-accumulate phase, same predicate gating as the loads */
	(p12) FMA	f8  = f100, f32, f8
kusano 2b45e8
	(p12) FMA	f9  = f101, f33, f9
kusano 2b45e8
	(p12) FMA	f10 = f102, f36, f10
kusano 2b45e8
	(p12) FMA	f11 = f103, f37, f11
kusano 2b45e8
	(p12) FMA	f12 = f104, f40, f12
kusano 2b45e8
	(p12) FMA	f13 = f105, f41, f13
kusano 2b45e8
	(p12) FMA	f14 = f106, f44, f14
kusano 2b45e8
	(p12) FMA	f15 = f107, f45, f15
kusano 2b45e8
	;;
kusano 2b45e8
	(p13) FMA	f8  = f108, f48, f8
kusano 2b45e8
	(p13) FMA	f9  = f109, f49, f9
kusano 2b45e8
	(p13) FMA	f10 = f110, f52, f10
kusano 2b45e8
	(p13) FMA	f11 = f111, f53, f11
kusano 2b45e8
	(p14) FMA	f12 = f112, f56, f12
kusano 2b45e8
	(p14) FMA	f13 = f113, f57, f13
kusano 2b45e8
	(p15) FMA	f14 = f114, f60, f14
kusano 2b45e8
	;;
kusano 2b45e8
.L148:
/*
 * .L148: finish the single-column case.
 * Sums the eight partial accumulators f8..f15 pairwise down into f8,
 * loads the value at [CO], computes ALPHA * f8 + that value, and stores
 * the result back to [CO].  CO itself is not advanced here.
 * Control then falls through into .L199.
 * NOTE(review): FADD/FMA/LDFD/STFD are macros defined outside this chunk;
 * FMA d = a, b, c is assumed to compute a * b + c.
 */
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	/* the load is issued first, ahead of the reduction that follows */
	LDFD	f32 = [CO]
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FADD	f8  = f8,  f9
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FADD	f10 = f10, f11
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FADD	f12 = f12, f13
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FADD	f14 = f14, f15
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FADD	f8  = f8,  f12
kusano 2b45e8
	}
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FADD	f10 = f10, f14
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	/* f8 now holds the complete sum */
	FADD	f8  = f8,  f10
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	FMA	f32 = ALPHA, f8,  f32
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	{ .mmf
kusano 2b45e8
	STFD [CO] = f32
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	nop	__LINE__
kusano 2b45e8
	}
kusano 2b45e8
	;;
kusano 2b45e8
	.align 16
kusano 2b45e8
kusano 2b45e8
.L199:
/*
 * .L199: advance the outer loop.
 *   IS = IS + P
 *   A  = A + (LDAP << BASE_SHIFT)
 * While M > IS, branch back to .LIs_loop (defined before this chunk);
 * otherwise fall through into the exit sequence at .L999.
 */
kusano 2b45e8
	adds	IS = P, IS
kusano 2b45e8
	shladd	A  = LDAP, BASE_SHIFT, A
kusano 2b45e8
	;;
kusano 2b45e8
	cmp.gt	p6, p0 = M, IS
kusano 2b45e8
	(p6) br.cond.dptk .LIs_loop
kusano 2b45e8
	.align 4
kusano 2b45e8
	;;
kusano 2b45e8
kusano 2b45e8
.L999:
/*
 * .L999: common exit.
 * Sets r8 to zero (r8 is the integer return-value register in the
 * Itanium software conventions), reloads f16..f23 with ldf.fill from a
 * 128-byte area walked by two interleaved pointers (SP and r9 = SP + 16,
 * each stepping by 32), restores ar.lc, the predicate registers and
 * ar.pfs from ARLC / PR / ARPFS, and returns through b0.
 * SP is left 128 bytes above its value on entry to this label.
 * NOTE(review): the matching spills and the ARLC/PR/ARPFS saves are in
 * the prologue, which is outside this chunk.
 */
kusano 2b45e8
	mov	r8 = r0
kusano 2b45e8
	adds	r9 = 1 * 16, SP
kusano 2b45e8
	;;
kusano 2b45e8
	ldf.fill  f16 = [SP], 32
kusano 2b45e8
	ldf.fill  f17 = [r9], 32
kusano 2b45e8
	mov	 ar.lc = ARLC
kusano 2b45e8
	;;	
kusano 2b45e8
	ldf.fill  f18 = [SP], 32
kusano 2b45e8
	ldf.fill  f19 = [r9], 32
kusano 2b45e8
	/* mask -1: restore every predicate register from PR */
	mov pr    = PR, -1
kusano 2b45e8
	;;	
kusano 2b45e8
	ldf.fill  f20 = [SP], 32
kusano 2b45e8
	ldf.fill  f21 = [r9], 32
kusano 2b45e8
	mov	ar.pfs = ARPFS
kusano 2b45e8
	;;	
kusano 2b45e8
	ldf.fill  f22 = [SP], 32
kusano 2b45e8
	ldf.fill  f23 = [r9]
kusano 2b45e8
	br.ret.sptk.many b0
kusano 2b45e8
	;;
kusano 2b45e8
	EPILOGUE