Blame thirdparty/openblas/xianyi-OpenBLAS-e6e87a2/kernel/power/zgemm_kernel_altivec.S

kusano 2b45e8
/*********************************************************************/
kusano 2b45e8
/* Copyright 2009, 2010 The University of Texas at Austin.           */
kusano 2b45e8
/* All rights reserved.                                              */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
/* Redistribution and use in source and binary forms, with or        */
kusano 2b45e8
/* without modification, are permitted provided that the following   */
kusano 2b45e8
/* conditions are met:                                               */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
/*   1. Redistributions of source code must retain the above         */
kusano 2b45e8
/*      copyright notice, this list of conditions and the following  */
kusano 2b45e8
/*      disclaimer.                                                  */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
/*   2. Redistributions in binary form must reproduce the above      */
kusano 2b45e8
/*      copyright notice, this list of conditions and the following  */
kusano 2b45e8
/*      disclaimer in the documentation and/or other materials       */
kusano 2b45e8
/*      provided with the distribution.                              */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
kusano 2b45e8
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
kusano 2b45e8
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
kusano 2b45e8
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
kusano 2b45e8
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
kusano 2b45e8
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
kusano 2b45e8
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
kusano 2b45e8
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
kusano 2b45e8
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
kusano 2b45e8
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
kusano 2b45e8
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
kusano 2b45e8
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
kusano 2b45e8
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
kusano 2b45e8
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
/* The views and conclusions contained in the software and           */
kusano 2b45e8
/* documentation are those of the authors and should not be          */
kusano 2b45e8
/* interpreted as representing official policies, either expressed   */
kusano 2b45e8
/* or implied, of The University of Texas at Austin.                 */
kusano 2b45e8
/*********************************************************************/
kusano 2b45e8
kusano 2b45e8
#define ASSEMBLER
kusano 2b45e8
#include "common.h"
kusano 2b45e8
		
kusano 2b45e8
/* LOAD is the pointer-width integer load mnemonic: lwz when __64BIT__ is
   not defined, ld when it is. */
#ifndef __64BIT__
kusano 2b45e8
#define LOAD	lwz
kusano 2b45e8
#else
kusano 2b45e8
#define LOAD	ld
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
/* STACKSIZE is the number of bytes the prologue subtracts from SP.
   Within that area the prologue stores v20-v31 at offsets 0 .. 11*16 and
   r31 down to r14 from offset 192 upward (8-byte slots ending at 328 in
   64-bit builds, 4-byte slots ending at 260 otherwise).  Incoming stack
   arguments are addressed as <offset> + STACKSIZE(SP). */
#ifdef __64BIT__
kusano 2b45e8
#define STACKSIZE 360
kusano 2b45e8
#else
kusano 2b45e8
#define STACKSIZE 272
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
/* NOTE(review): ALIGN_SIZE is not referenced in the part of the file that
   was reviewed; the prologue realigns SP with a literal -8192 mask
   instead.  Confirm whether this define is still needed. */
#define ALIGN_SIZE	0xffff
kusano 2b45e8
/* Byte offsets, relative to the realigned SP, of the constants the
   prologue builds in memory.  Each of the first four is one 16-byte
   vector, later fetched with lvx into swap / neg / alpha_r / alpha_i.

   SWAP    : vperm control words 0x04050607, 0x00010203, 0x0c0d0e0f,
             0x08090a0b (in storage order), i.e. a pattern that exchanges
             the two 32-bit words of each 8-byte pair.
   NEG     : 0x80000000 in words 0 and 2 or in words 1 and 3 (which pair
             depends on the NN/NT/.../CC build macro), zero elsewhere; it
             is XORed into accumulators to flip selected signs.
   ALPHA_R : four single-precision words built from f1 (and, in one
             build variant, its negation f3).
   ALPHA_I : four single-precision words built from f2 (and, in one
             build variant, its negation f4).
   FZERO   : a single 32-bit zero word, read back with lfs to clear
             floating-point accumulators. */
#define SWAP		  0
kusano 2b45e8
#define NEG		 16
kusano 2b45e8
#define ALPHA_R		 32
kusano 2b45e8
#define ALPHA_I		 48
kusano 2b45e8
#define FZERO		 64
kusano 2b45e8
kusano 2b45e8
/* Incoming argument registers.
   M, N, K : loop bounds; the kernel derives its row-, column- and
             depth-direction trip counts from them with srawi./andi.
   A, B    : base addresses read through the running pointers AO / BO.
   C       : base address of the output, advanced by 2*LDC per outer
             iteration.
   LDC     : output stride; scaled to bytes in the prologue with
             slwi LDC, LDC, ZBASE_SHIFT.
   Which register (or stack slot) carries A, B, C and LDC depends on the
   platform and word size, hence the conditional definitions below. */
#define	M	r3
kusano 2b45e8
#define	N	r4
kusano 2b45e8
#define	K	r5
kusano 2b45e8
kusano 2b45e8
#ifdef linux
kusano 2b45e8
#ifndef __64BIT__
kusano 2b45e8
/* 32-bit Linux: the prologue loads none of these from the stack. */
#define A	r6
kusano 2b45e8
#define	B	r7
kusano 2b45e8
#define	C	r8
kusano 2b45e8
#define	LDC	r9
kusano 2b45e8
#else
kusano 2b45e8
/* 64-bit Linux: A, B, C are used as received; LDC is only a name for r6
   here, the prologue fills it with ld from 112 + STACKSIZE(SP). */
#define A	r8
kusano 2b45e8
#define	B	r9
kusano 2b45e8
#define	C	r10
kusano 2b45e8
#define	LDC	r6
kusano 2b45e8
#endif
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
#if defined(_AIX) || defined(__APPLE__)
kusano 2b45e8
#if !defined(__64BIT__) && defined(DOUBLE)
kusano 2b45e8
/* 32-bit AIX/Apple with DOUBLE: only A is used as received (r10).  B, C
   and LDC merely name r6-r8; the prologue loads them with lwz from
   56/60/64 + STACKSIZE(SP). */
#define A	r10
kusano 2b45e8
#define	B	r6
kusano 2b45e8
#define	C	r7
kusano 2b45e8
#define	LDC	r8
kusano 2b45e8
#else
kusano 2b45e8
/* All other AIX/Apple builds: A, B, C are used as received; LDC (r6) is
   loaded by the prologue from 112 + STACKSIZE(SP) in 64-bit builds or
   from 56 + STACKSIZE(SP) in 32-bit non-DOUBLE builds. */
#define A	r8
kusano 2b45e8
#define	B	r9
kusano 2b45e8
#define	C	r10
kusano 2b45e8
#define	LDC	r6
kusano 2b45e8
#endif
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
/* STACK receives a copy of SP immediately after the frame is allocated,
   i.e. before SP is lowered further and masked for the constant area. */
#define STACK	r11
kusano 2b45e8
kusano 2b45e8
/* Loop state (all in non-volatile GPRs that the prologue saves):
   I        : inner (M-direction) loop counter; also used as scratch
              while the prologue builds the SWAP / NEG constants.
   J        : outer loop counter, initialised from N >> 1.
   AO, BO   : running read pointers, reset from A and B.
   CO1, CO2 : output pointers for the two lines of C handled together;
              CO2 = CO1 + LDC. */
#define	I	r21
kusano 2b45e8
#define J	r22
kusano 2b45e8
#define AO	r23
kusano 2b45e8
#define	BO	r24
kusano 2b45e8
#define	CO1	r25
kusano 2b45e8
#define CO2	r26
kusano 2b45e8
kusano 2b45e8
/* Prefetch distances and VRsave backup.
   PREA and PREB are two names for the same register (r29), so both
   prefetch streams necessarily use one distance value.
   PREC is the index operand of the dcbtst hints issued on CO1 / CO2.
   VREG holds the VRsave value read with mfspr before VRsave is
   overwritten with -1. */
#define PREA	r29
kusano 2b45e8
#define PREB	r29
kusano 2b45e8
#define PREC	r30
kusano 2b45e8
#define VREG	r31
kusano 2b45e8
kusano 2b45e8
/* Vector load mnemonics for A and B data; both are plain lvx. */
#define LOAD_A	lvx
kusano 2b45e8
#define LOAD_B	lvx
kusano 2b45e8
kusano 2b45e8
/* Index operands for lvx/stvx.  OFFSET_0 is the literal 0; OFFSET_1 ..
   OFFSET_7 are registers that the prologue loads with 4*SIZE, 8*SIZE,
   ... 28*SIZE, giving seven consecutive steps of 4*SIZE bytes. */
#define OFFSET_0	  0
kusano 2b45e8
#define OFFSET_1	r14
kusano 2b45e8
#define OFFSET_2	r15
kusano 2b45e8
#define OFFSET_3	r16
kusano 2b45e8
#define OFFSET_4	r17
kusano 2b45e8
#define OFFSET_5	r18
kusano 2b45e8
#define OFFSET_6	r19
kusano 2b45e8
#define OFFSET_7	r20
kusano 2b45e8
kusano 2b45e8
/* c01-c16 (v0-v15): accumulators.  They are cleared with vxor at the top
   of each tile and updated with vmaddfp in the K loops. */
#define	c01	v0
kusano 2b45e8
#define	c02	v1
kusano 2b45e8
#define	c03	v2
kusano 2b45e8
#define	c04	v3
kusano 2b45e8
#define	c05	v4
kusano 2b45e8
#define	c06	v5
kusano 2b45e8
#define	c07	v6
kusano 2b45e8
#define	c08	v7
kusano 2b45e8
#define	c09	v8
kusano 2b45e8
#define	c10	v9
kusano 2b45e8
#define	c11	v10
kusano 2b45e8
#define	c12	v11
kusano 2b45e8
#define	c13	v12
kusano 2b45e8
#define	c14	v13
kusano 2b45e8
#define	c15	v14
kusano 2b45e8
#define	c16	v15
kusano 2b45e8
kusano 2b45e8
/* a1-a8 (v16-v23): vectors loaded from AO with LOAD_A. */
#define	a1	v16
kusano 2b45e8
#define	a2	v17
kusano 2b45e8
#define	a3	v18
kusano 2b45e8
#define	a4	v19
kusano 2b45e8
#define	a5	v20
kusano 2b45e8
#define	a6	v21
kusano 2b45e8
#define	a7	v22
kusano 2b45e8
#define	a8	v23
kusano 2b45e8
kusano 2b45e8
/* b1, b2 (v24, v25): vectors loaded from B / BO with LOAD_B.
   bp1, bp2 (v26, v27): one word of b1 or b2 replicated with vspltw, used
   as the multiplier operand of vmaddfp. */
#define	b1	v24
kusano 2b45e8
#define	b2	v25
kusano 2b45e8
#define	bp1	v26
kusano 2b45e8
#define	bp2	v27
kusano 2b45e8
kusano 2b45e8
/* Write-back aliases.  C1-C5 are the SAME registers as a1-a5 (v16-v20);
   after the K loop they hold the vectors loaded from CO1 / CO2 that the
   computed results are added to. */
#define C1	v16
kusano 2b45e8
#define C2	v17
kusano 2b45e8
#define C3	v18
kusano 2b45e8
#define C4	v19
kusano 2b45e8
#define C5	v20
kusano 2b45e8
kusano 2b45e8
/* c00 is the same register as b1 (v24); during write-back it receives the
   leading vector produced by vperm VZERO, cNN, PERMRSHIFTn. */
#define c00	v24
kusano 2b45e8
kusano 2b45e8
/* VZERO is the same register as b2 (v25) and is cleared with vxor before
   use.  PERMRSHIFT1 / PERMRSHIFT2 are the same registers as bp1 / bp2
   (v26 / v27) and are filled by lvsr from CO1 / CO2 respectively. */
#define VZERO		 v25
kusano 2b45e8
#define PERMRSHIFT1	 v26
kusano 2b45e8
#define PERMRSHIFT2	 v27
kusano 2b45e8
kusano 2b45e8
/* swap, neg, alpha_r, alpha_i (v28-v31): loaded with lvx from the SWAP,
   NEG, ALPHA_R and ALPHA_I slots of the constant area at SP (through
   OFFSET_0 .. OFFSET_3). */
#define swap		 v28
kusano 2b45e8
#define neg		 v29
kusano 2b45e8
#define alpha_r		 v30
kusano 2b45e8
#define alpha_i		 v31
kusano 2b45e8
kusano 2b45e8
#ifndef NEEDPARAM
kusano 2b45e8
kusano 2b45e8
	PROLOGUE
kusano 2b45e8
	PROFCODE
kusano 2b45e8
kusano 2b45e8
	addi	SP, SP, -STACKSIZE
kusano 2b45e8
	mr	STACK, SP
kusano 2b45e8
kusano 2b45e8
	li	r0,  0 * 16
kusano 2b45e8
	stvx	v20, SP, r0
kusano 2b45e8
	li	r0,  1 * 16
kusano 2b45e8
	stvx	v21, SP, r0
kusano 2b45e8
	li	r0,  2 * 16
kusano 2b45e8
	stvx	v22, SP, r0
kusano 2b45e8
	li	r0,  3 * 16
kusano 2b45e8
	stvx	v23, SP, r0
kusano 2b45e8
	li	r0,  4 * 16
kusano 2b45e8
	stvx	v24, SP, r0
kusano 2b45e8
	li	r0,  5 * 16
kusano 2b45e8
	stvx	v25, SP, r0
kusano 2b45e8
	li	r0,  6 * 16
kusano 2b45e8
	stvx	v26, SP, r0
kusano 2b45e8
	li	r0,  7 * 16
kusano 2b45e8
	stvx	v27, SP, r0
kusano 2b45e8
	li	r0,  8 * 16
kusano 2b45e8
	stvx	v28, SP, r0
kusano 2b45e8
	li	r0,  9 * 16
kusano 2b45e8
	stvx	v29, SP, r0
kusano 2b45e8
	li	r0, 10 * 16
kusano 2b45e8
	stvx	v30, SP, r0
kusano 2b45e8
	li	r0, 11 * 16
kusano 2b45e8
	stvx	v31, SP, r0
kusano 2b45e8
kusano 2b45e8
#ifdef __64BIT__
kusano 2b45e8
	std	r31,  192(SP)
kusano 2b45e8
	std	r30,  200(SP)
kusano 2b45e8
	std	r29,  208(SP)
kusano 2b45e8
	std	r28,  216(SP)
kusano 2b45e8
	std	r27,  224(SP)
kusano 2b45e8
	std	r26,  232(SP)
kusano 2b45e8
	std	r25,  240(SP)
kusano 2b45e8
	std	r24,  248(SP)
kusano 2b45e8
	std	r23,  256(SP)
kusano 2b45e8
	std	r22,  264(SP)
kusano 2b45e8
	std	r21,  272(SP)
kusano 2b45e8
	std	r20,  280(SP)
kusano 2b45e8
	std	r19,  288(SP)
kusano 2b45e8
	std	r18,  296(SP)
kusano 2b45e8
	std	r17,  304(SP)
kusano 2b45e8
	std	r16,  312(SP)
kusano 2b45e8
	std	r15,  320(SP)
kusano 2b45e8
	std	r14,  328(SP)
kusano 2b45e8
#else
kusano 2b45e8
	stw	r31,  192(SP)
kusano 2b45e8
	stw	r30,  196(SP)
kusano 2b45e8
	stw	r29,  200(SP)
kusano 2b45e8
	stw	r28,  204(SP)
kusano 2b45e8
	stw	r27,  208(SP)
kusano 2b45e8
	stw	r26,  212(SP)
kusano 2b45e8
	stw	r25,  216(SP)
kusano 2b45e8
	stw	r24,  220(SP)
kusano 2b45e8
	stw	r23,  224(SP)
kusano 2b45e8
	stw	r22,  228(SP)
kusano 2b45e8
	stw	r21,  232(SP)
kusano 2b45e8
	stw	r20,  236(SP)
kusano 2b45e8
	stw	r19,  240(SP)
kusano 2b45e8
	stw	r18,  244(SP)
kusano 2b45e8
	stw	r17,  248(SP)
kusano 2b45e8
	stw	r16,  252(SP)
kusano 2b45e8
	stw	r15,  256(SP)
kusano 2b45e8
	stw	r14,  260(SP)
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#ifdef linux
kusano 2b45e8
#ifdef __64BIT__
kusano 2b45e8
	ld	LDC,    112 + STACKSIZE(SP)
kusano 2b45e8
#endif
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
#if defined(_AIX) || defined(__APPLE__)
kusano 2b45e8
#ifdef __64BIT__
kusano 2b45e8
	ld	LDC,    112 + STACKSIZE(SP)
kusano 2b45e8
#else
kusano 2b45e8
#ifdef DOUBLE
kusano 2b45e8
	lwz	B,       56 + STACKSIZE(SP)
kusano 2b45e8
	lwz	C,       60 + STACKSIZE(SP)
kusano 2b45e8
	lwz	LDC,     64 + STACKSIZE(SP)
kusano 2b45e8
#else
kusano 2b45e8
	lwz	LDC,     56 + STACKSIZE(SP)
kusano 2b45e8
#endif
kusano 2b45e8
#endif
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
#ifndef PREFETCHTEST
kusano 2b45e8
#ifdef PPC970
kusano 2b45e8
	li	PREC,   16 * SIZE
kusano 2b45e8
#endif
kusano 2b45e8
#else
kusano 2b45e8
kusano 2b45e8
#ifdef linux
kusano 2b45e8
#ifndef __64BIT__
kusano 2b45e8
	lwz	PREB,   16 + STACKSIZE(SP)
kusano 2b45e8
	lwz	PREC,   20 + STACKSIZE(SP)
kusano 2b45e8
#else
kusano 2b45e8
	ld	PREB,  136 + STACKSIZE(SP)
kusano 2b45e8
	ld	PREC,  144 + STACKSIZE(SP)
kusano 2b45e8
#endif
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
#if defined(_AIX) || defined(__APPLE__)
kusano 2b45e8
#ifdef __64BIT__
kusano 2b45e8
	ld	PREB,  136 + STACKSIZE(SP)
kusano 2b45e8
	ld	PREC,  144 + STACKSIZE(SP)
kusano 2b45e8
#else
kusano 2b45e8
#ifdef DOUBLE
kusano 2b45e8
	lwz	PREB,   72 + STACKSIZE(SP)
kusano 2b45e8
	lwz	PREC,   76 + STACKSIZE(SP)
kusano 2b45e8
#else
kusano 2b45e8
	lwz	PREB,   68 + STACKSIZE(SP)
kusano 2b45e8
	lwz	PREC,   72 + STACKSIZE(SP)
kusano 2b45e8
#endif
kusano 2b45e8
#endif
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
#ifndef PREFETCHTEST
kusano 2b45e8
#ifdef CELL
kusano 2b45e8
	li	PREB,   (3 * 32 * SIZE)
kusano 2b45e8
#else
kusano 2b45e8
	li	PREB,   (5 * 32 * SIZE)
kusano 2b45e8
#endif
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
	li	r0, -1
kusano 2b45e8
	mfspr	VREG, VRsave
kusano 2b45e8
kusano 2b45e8
	mtspr	VRsave, r0
kusano 2b45e8
kusano 2b45e8
	addi	SP, SP, -128
kusano 2b45e8
	li	r0, -8192
kusano 2b45e8
kusano 2b45e8
	and	SP, SP, r0
kusano 2b45e8
kusano 2b45e8
	fneg	f3, f1
kusano 2b45e8
	fneg	f4, f2
kusano 2b45e8
kusano 2b45e8
#if   defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
kusano 2b45e8
      defined(NC) || defined(TC) || defined(NR) || defined(TR)
kusano 2b45e8
	stfs	f1,  ALPHA_R +  0(SP)
kusano 2b45e8
	stfs	f1,  ALPHA_R +  4(SP)
kusano 2b45e8
	stfs	f1,  ALPHA_R +  8(SP)
kusano 2b45e8
	stfs	f1,  ALPHA_R + 12(SP)
kusano 2b45e8
kusano 2b45e8
	stfs	f4,  ALPHA_I +  0(SP)
kusano 2b45e8
	stfs	f2,  ALPHA_I +  4(SP)
kusano 2b45e8
	stfs	f4,  ALPHA_I +  8(SP)
kusano 2b45e8
	stfs	f2,  ALPHA_I + 12(SP)
kusano 2b45e8
#else
kusano 2b45e8
	stfs	f1,  ALPHA_R +  0(SP)
kusano 2b45e8
	stfs	f3,  ALPHA_R +  4(SP)
kusano 2b45e8
	stfs	f1,  ALPHA_R +  8(SP)
kusano 2b45e8
	stfs	f3,  ALPHA_R + 12(SP)
kusano 2b45e8
kusano 2b45e8
	stfs	f2,  ALPHA_I +  0(SP)
kusano 2b45e8
	stfs	f2,  ALPHA_I +  4(SP)
kusano 2b45e8
	stfs	f2,  ALPHA_I +  8(SP)
kusano 2b45e8
	stfs	f2,  ALPHA_I + 12(SP)
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
	li	I,    Address_L(0x04050607)
kusano 2b45e8
	addis	I, I, Address_H(0x04050607)
kusano 2b45e8
	stw	I, SWAP +  0(SP)
kusano 2b45e8
	li	I,    Address_L(0x00010203)
kusano 2b45e8
	addis	I, I, Address_H(0x00010203)
kusano 2b45e8
	stw	I, SWAP +  4(SP)
kusano 2b45e8
	li	I,    Address_L(0x0c0d0e0f)
kusano 2b45e8
	addis	I, I, Address_H(0x0c0d0e0f)
kusano 2b45e8
	stw	I, SWAP +  8(SP)
kusano 2b45e8
	li	I,    Address_L(0x08090a0b)
kusano 2b45e8
	addis	I, I, Address_H(0x08090a0b)
kusano 2b45e8
	stw	I, SWAP + 12(SP)
kusano 2b45e8
kusano 2b45e8
#if   defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
kusano 2b45e8
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
kusano 2b45e8
	lis	I, 0x8000
kusano 2b45e8
	stw	I, NEG +  0(SP)
kusano 2b45e8
	stw	I, NEG +  8(SP)
kusano 2b45e8
	li	I, 0
kusano 2b45e8
	stw	I, NEG +  4(SP)
kusano 2b45e8
	stw	I, NEG + 12(SP)
kusano 2b45e8
#else
kusano 2b45e8
	li	I, 0
kusano 2b45e8
	stw	I, NEG +  0(SP)
kusano 2b45e8
	stw	I, NEG +  8(SP)
kusano 2b45e8
	lis	I, 0x8000
kusano 2b45e8
	stw	I, NEG +  4(SP)
kusano 2b45e8
	stw	I, NEG + 12(SP)
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
	li	r0, 0
kusano 2b45e8
	stw	r0, FZERO(SP)
kusano 2b45e8
kusano 2b45e8
	slwi	LDC, LDC, ZBASE_SHIFT
kusano 2b45e8
kusano 2b45e8
	li	OFFSET_1,  4 * SIZE
kusano 2b45e8
	li	OFFSET_2,  8 * SIZE
kusano 2b45e8
	li	OFFSET_3, 12 * SIZE
kusano 2b45e8
	li	OFFSET_4, 16 * SIZE
kusano 2b45e8
	li	OFFSET_5, 20 * SIZE
kusano 2b45e8
	li	OFFSET_6, 24 * SIZE
kusano 2b45e8
	li	OFFSET_7, 28 * SIZE
kusano 2b45e8
kusano 2b45e8
	cmpwi	cr0, M, 0
kusano 2b45e8
	ble	LL(999)
kusano 2b45e8
	cmpwi	cr0, N, 0
kusano 2b45e8
	ble	LL(999)
kusano 2b45e8
	cmpwi	cr0, K, 0
kusano 2b45e8
	ble	LL(999)
kusano 2b45e8
kusano 2b45e8
	srawi.	J, N,  1
kusano 2b45e8
	ble	LL(50)
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(01):
kusano 2b45e8
	mr	CO1, C
kusano 2b45e8
	add	CO2, C,  LDC
kusano 2b45e8
	add	C,   CO2, LDC
kusano 2b45e8
kusano 2b45e8
	mr	AO, A
kusano 2b45e8
	srawi.	I, M,  3
kusano 2b45e8
	ble	LL(20)
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(11):
kusano 2b45e8
	vxor	c01, c01, c01
kusano 2b45e8
	LOAD_B	b1, OFFSET_0, B
kusano 2b45e8
	vxor	c02, c02, c02
kusano 2b45e8
	LOAD_B	b2, OFFSET_1, B
kusano 2b45e8
	vxor	c03, c03, c03
kusano 2b45e8
	LOAD_A	a1, OFFSET_0, AO
kusano 2b45e8
	vxor	c04, c04, c04
kusano 2b45e8
	LOAD_A	a2, OFFSET_1, AO
kusano 2b45e8
	vxor	c05, c05, c05
kusano 2b45e8
	LOAD_A	a3, OFFSET_2, AO
kusano 2b45e8
	vxor	c06, c06, c06
kusano 2b45e8
	LOAD_A	a4, OFFSET_3, AO
kusano 2b45e8
	vxor	c07, c07, c07
kusano 2b45e8
	LOAD_A	a5, OFFSET_4, AO
kusano 2b45e8
	vxor	c08, c08, c08
kusano 2b45e8
kusano 2b45e8
	vxor	c09, c09, c09
kusano 2b45e8
	dcbtst	CO1, PREC
kusano 2b45e8
	vxor	c10, c10, c10
kusano 2b45e8
	dcbtst	CO2, PREC
kusano 2b45e8
	vxor	c11, c11, c11
kusano 2b45e8
	vxor	c12, c12, c12
kusano 2b45e8
	vxor	c13, c13, c13
kusano 2b45e8
	mr	BO, B
kusano 2b45e8
	vxor	c14, c14, c14
kusano 2b45e8
	srawi.	r0,  K,  1
kusano 2b45e8
	vxor	c15, c15, c15
kusano 2b45e8
	mtspr	CTR, r0
kusano 2b45e8
	vxor	c16, c16, c16
kusano 2b45e8
	vspltw	bp1, b1, 0
kusano 2b45e8
	ble	LL(15)
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(12):
kusano 2b45e8
	vmaddfp	c01, a1, bp1, c01
kusano 2b45e8
	vspltw	bp2, b1, 1
kusano 2b45e8
	vmaddfp	c02, a2, bp1, c02
kusano 2b45e8
	DCBT(BO, PREB)
kusano 2b45e8
	vmaddfp	c03, a3, bp1, c03
kusano 2b45e8
	nop
kusano 2b45e8
	vmaddfp	c04, a4, bp1, c04
kusano 2b45e8
	LOAD_A	a6, OFFSET_5, AO
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c05, a1, bp2, c05
kusano 2b45e8
	vspltw	bp1, b1, 2
kusano 2b45e8
	vmaddfp	c06, a2, bp2, c06
kusano 2b45e8
#ifdef CELL
kusano 2b45e8
	DCBT(AO, PREA)
kusano 2b45e8
#else
kusano 2b45e8
	nop
kusano 2b45e8
#endif
kusano 2b45e8
	vmaddfp	c07, a3, bp2, c07
kusano 2b45e8
	nop
kusano 2b45e8
	vmaddfp	c08, a4, bp2, c08
kusano 2b45e8
	LOAD_A	a7, OFFSET_6, AO
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c09, a1, bp1, c09
kusano 2b45e8
	vspltw	bp2, b1, 3
kusano 2b45e8
	vmaddfp	c10, a2, bp1, c10
kusano 2b45e8
	LOAD_B	b1, OFFSET_2, BO
kusano 2b45e8
	vmaddfp	c11, a3, bp1, c11
kusano 2b45e8
	nop
kusano 2b45e8
	vmaddfp	c12, a4, bp1, c12
kusano 2b45e8
	LOAD_A	a8, OFFSET_7, AO
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c13, a1, bp2, c13
kusano 2b45e8
	vspltw	bp1, b2, 0
kusano 2b45e8
	vmaddfp	c14, a2, bp2, c14
kusano 2b45e8
	addi	AO, AO, 32 * SIZE
kusano 2b45e8
	vmaddfp	c15, a3, bp2, c15
kusano 2b45e8
	nop
kusano 2b45e8
	vmaddfp	c16, a4, bp2, c16
kusano 2b45e8
	LOAD_A	a1, OFFSET_0, AO
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c01, a5, bp1, c01
kusano 2b45e8
	vspltw	bp2, b2, 1
kusano 2b45e8
	vmaddfp	c02, a6, bp1, c02
kusano 2b45e8
	nop
kusano 2b45e8
	vmaddfp	c03, a7, bp1, c03
kusano 2b45e8
	nop
kusano 2b45e8
	vmaddfp	c04, a8, bp1, c04
kusano 2b45e8
	LOAD_A	a2, OFFSET_1, AO
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c05, a5, bp2, c05
kusano 2b45e8
	vspltw	bp1, b2, 2
kusano 2b45e8
	vmaddfp	c06, a6, bp2, c06
kusano 2b45e8
	nop
kusano 2b45e8
	vmaddfp	c07, a7, bp2, c07
kusano 2b45e8
	nop
kusano 2b45e8
	vmaddfp	c08, a8, bp2, c08
kusano 2b45e8
	LOAD_A	a3, OFFSET_2, AO
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c09, a5, bp1, c09
kusano 2b45e8
	vspltw	bp2, b2, 3
kusano 2b45e8
	vmaddfp	c10, a6, bp1, c10
kusano 2b45e8
	LOAD_B	b2, OFFSET_3, BO
kusano 2b45e8
	vmaddfp	c11, a7, bp1, c11
kusano 2b45e8
	nop
kusano 2b45e8
	vmaddfp	c12, a8, bp1, c12
kusano 2b45e8
	LOAD_A	a4, OFFSET_3, AO
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c13, a5, bp2, c13
kusano 2b45e8
	vspltw	bp1, b1, 0
kusano 2b45e8
	vmaddfp	c14, a6, bp2, c14
kusano 2b45e8
	addi	BO, BO,  8 * SIZE
kusano 2b45e8
	vmaddfp	c15, a7, bp2, c15
kusano 2b45e8
	LOAD_A	a5, OFFSET_4, AO
kusano 2b45e8
	vmaddfp	c16, a8, bp2, c16
kusano 2b45e8
	bdnz+	LL(12)
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(15):
kusano 2b45e8
	lvx	swap,    OFFSET_0, SP
kusano 2b45e8
	lvx	neg,     OFFSET_1, SP
kusano 2b45e8
	lvx	alpha_r, OFFSET_2, SP
kusano 2b45e8
	lvx	alpha_i, OFFSET_3, SP
kusano 2b45e8
kusano 2b45e8
	andi.	r0,  K,  1
kusano 2b45e8
	ble+	LL(18)
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(16):
kusano 2b45e8
	vmaddfp	c01, a1, bp1, c01
kusano 2b45e8
	vspltw	bp2, b1, 1
kusano 2b45e8
	vmaddfp	c02, a2, bp1, c02
kusano 2b45e8
	nop
kusano 2b45e8
	vmaddfp	c03, a3, bp1, c03
kusano 2b45e8
	nop
kusano 2b45e8
	vmaddfp	c04, a4, bp1, c04
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c05, a1, bp2, c05
kusano 2b45e8
	vspltw	bp1, b1, 2
kusano 2b45e8
	vmaddfp	c06, a2, bp2, c06
kusano 2b45e8
	nop
kusano 2b45e8
	vmaddfp	c07, a3, bp2, c07
kusano 2b45e8
	nop
kusano 2b45e8
	vmaddfp	c08, a4, bp2, c08
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c09, a1, bp1, c09
kusano 2b45e8
	vspltw	bp2, b1, 3
kusano 2b45e8
	vmaddfp	c10, a2, bp1, c10
kusano 2b45e8
	addi	AO, AO, 16 * SIZE
kusano 2b45e8
	vmaddfp	c11, a3, bp1, c11
kusano 2b45e8
	addi	BO, BO,  4 * SIZE
kusano 2b45e8
	vmaddfp	c12, a4, bp1, c12
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c13, a1, bp2, c13
kusano 2b45e8
	vmaddfp	c14, a2, bp2, c14
kusano 2b45e8
	vmaddfp	c15, a3, bp2, c15
kusano 2b45e8
	vmaddfp	c16, a4, bp2, c16
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(18):
kusano 2b45e8
	vxor	VZERO, VZERO, VZERO
kusano 2b45e8
kusano 2b45e8
	vperm	c05, c05, c05, swap
kusano 2b45e8
	vperm	c06, c06, c06, swap
kusano 2b45e8
	vperm	c07, c07, c07, swap
kusano 2b45e8
	vperm	c08, c08, c08, swap
kusano 2b45e8
kusano 2b45e8
	vperm	c13, c13, c13, swap
kusano 2b45e8
	vperm	c14, c14, c14, swap
kusano 2b45e8
	vperm	c15, c15, c15, swap
kusano 2b45e8
	vperm	c16, c16, c16, swap
kusano 2b45e8
kusano 2b45e8
	vxor	c05, c05, neg
kusano 2b45e8
	vxor	c06, c06, neg
kusano 2b45e8
	vxor	c07, c07, neg
kusano 2b45e8
	vxor	c08, c08, neg
kusano 2b45e8
kusano 2b45e8
	vxor	c13, c13, neg
kusano 2b45e8
	vxor	c14, c14, neg
kusano 2b45e8
	vxor	c15, c15, neg
kusano 2b45e8
	vxor	c16, c16, neg
kusano 2b45e8
kusano 2b45e8
	vaddfp	c01, c01, c05
kusano 2b45e8
	vaddfp	c02, c02, c06
kusano 2b45e8
	vaddfp	c03, c03, c07
kusano 2b45e8
	vaddfp	c04, c04, c08
kusano 2b45e8
kusano 2b45e8
	vaddfp	c09, c09, c13
kusano 2b45e8
	vaddfp	c10, c10, c14
kusano 2b45e8
	vaddfp	c11, c11, c15
kusano 2b45e8
	vaddfp	c12, c12, c16
kusano 2b45e8
kusano 2b45e8
	vperm	c05, c01, c01, swap
kusano 2b45e8
	vperm	c06, c02, c02, swap
kusano 2b45e8
	vperm	c07, c03, c03, swap
kusano 2b45e8
	vperm	c08, c04, c04, swap
kusano 2b45e8
kusano 2b45e8
	vperm	c13, c09, c09, swap
kusano 2b45e8
	vperm	c14, c10, c10, swap
kusano 2b45e8
	vperm	c15, c11, c11, swap
kusano 2b45e8
	vperm	c16, c12, c12, swap
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c01, alpha_r, c01, VZERO
kusano 2b45e8
	vmaddfp	c02, alpha_r, c02, VZERO
kusano 2b45e8
	vmaddfp	c03, alpha_r, c03, VZERO
kusano 2b45e8
	vmaddfp	c04, alpha_r, c04, VZERO
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c01, alpha_i, c05, c01
kusano 2b45e8
	vmaddfp	c02, alpha_i, c06, c02
kusano 2b45e8
	vmaddfp	c03, alpha_i, c07, c03
kusano 2b45e8
	vmaddfp	c04, alpha_i, c08, c04
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c09, alpha_r, c09, VZERO
kusano 2b45e8
	vmaddfp	c10, alpha_r, c10, VZERO
kusano 2b45e8
	vmaddfp	c11, alpha_r, c11, VZERO
kusano 2b45e8
	vmaddfp	c12, alpha_r, c12, VZERO
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c09, alpha_i, c13, c09
kusano 2b45e8
	vmaddfp	c10, alpha_i, c14, c10
kusano 2b45e8
	vmaddfp	c11, alpha_i, c15, c11
kusano 2b45e8
	vmaddfp	c12, alpha_i, c16, c12
kusano 2b45e8
kusano 2b45e8
	lvx	C1, OFFSET_0, CO1
kusano 2b45e8
	lvx	C2, OFFSET_1, CO1
kusano 2b45e8
	lvx	C3, OFFSET_2, CO1
kusano 2b45e8
	lvx	C4, OFFSET_3, CO1
kusano 2b45e8
	lvx	C5, OFFSET_4, CO1
kusano 2b45e8
kusano 2b45e8
	lvsr	PERMRSHIFT1, 0, CO1
kusano 2b45e8
	lvsr	PERMRSHIFT2, 0, CO2
kusano 2b45e8
kusano 2b45e8
	vperm	c00, VZERO, c01,   PERMRSHIFT1
kusano 2b45e8
	vperm	c01, c01,   c02,   PERMRSHIFT1
kusano 2b45e8
	vperm	c02, c02,   c03,   PERMRSHIFT1
kusano 2b45e8
	vperm	c03, c03,   c04,   PERMRSHIFT1
kusano 2b45e8
	vperm	c04, c04,   VZERO, PERMRSHIFT1
kusano 2b45e8
kusano 2b45e8
	vaddfp	c00, c00, C1
kusano 2b45e8
	vaddfp	c01, c01, C2
kusano 2b45e8
	vaddfp	c02, c02, C3
kusano 2b45e8
	vaddfp	c03, c03, C4
kusano 2b45e8
	vaddfp	c04, c04, C5
kusano 2b45e8
kusano 2b45e8
	stvx	c00, OFFSET_0, CO1
kusano 2b45e8
	stvx	c01, OFFSET_1, CO1
kusano 2b45e8
	stvx	c02, OFFSET_2, CO1
kusano 2b45e8
	stvx	c03, OFFSET_3, CO1
kusano 2b45e8
	stvx	c04, OFFSET_4, CO1
kusano 2b45e8
kusano 2b45e8
	lvx	C1, OFFSET_0, CO2
kusano 2b45e8
	lvx	C2, OFFSET_1, CO2
kusano 2b45e8
	lvx	C3, OFFSET_2, CO2
kusano 2b45e8
	lvx	C4, OFFSET_3, CO2
kusano 2b45e8
	lvx	C5, OFFSET_4, CO2
kusano 2b45e8
kusano 2b45e8
	vperm	c00, VZERO, c09,   PERMRSHIFT2
kusano 2b45e8
	vperm	c09, c09,   c10,   PERMRSHIFT2
kusano 2b45e8
	vperm	c10, c10,   c11,   PERMRSHIFT2
kusano 2b45e8
	vperm	c11, c11,   c12,   PERMRSHIFT2
kusano 2b45e8
	vperm	c12, c12,   VZERO, PERMRSHIFT2
kusano 2b45e8
kusano 2b45e8
	vaddfp	c00, c00, C1
kusano 2b45e8
	vaddfp	c09, c09, C2
kusano 2b45e8
	vaddfp	c10, c10, C3
kusano 2b45e8
	vaddfp	c11, c11, C4
kusano 2b45e8
	vaddfp	c12, c12, C5
kusano 2b45e8
kusano 2b45e8
	stvx	c00, OFFSET_0, CO2
kusano 2b45e8
	stvx	c09, OFFSET_1, CO2
kusano 2b45e8
	stvx	c10, OFFSET_2, CO2
kusano 2b45e8
	stvx	c11, OFFSET_3, CO2
kusano 2b45e8
	stvx	c12, OFFSET_4, CO2
kusano 2b45e8
kusano 2b45e8
	addi	CO1, CO1, 16 * SIZE
kusano 2b45e8
	addi	CO2, CO2, 16 * SIZE
kusano 2b45e8
	addic.	I, I, -1
kusano 2b45e8
	bgt+	LL(11)
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(20):
kusano 2b45e8
	andi.	I, M,  4
kusano 2b45e8
	ble	LL(30)
kusano 2b45e8
kusano 2b45e8
	vxor	c01, c01, c01
kusano 2b45e8
	LOAD_A	a1, OFFSET_0, AO
kusano 2b45e8
	vxor	c02, c02, c02
kusano 2b45e8
	LOAD_A	a2, OFFSET_1, AO
kusano 2b45e8
	vxor	c05, c05, c05
kusano 2b45e8
	LOAD_A	a3, OFFSET_2, AO
kusano 2b45e8
	vxor	c06, c06, c06
kusano 2b45e8
	LOAD_A	a4, OFFSET_3, AO
kusano 2b45e8
	vxor	c09, c09, c09
kusano 2b45e8
	LOAD_B	b1, OFFSET_0, B
kusano 2b45e8
	vxor	c10, c10, c10
kusano 2b45e8
	LOAD_B	b2, OFFSET_1, B
kusano 2b45e8
	vxor	c13, c13, c13
kusano 2b45e8
	vxor	c14, c14, c14
kusano 2b45e8
	mr	BO, B
kusano 2b45e8
	vspltw	bp1, b1, 0
kusano 2b45e8
kusano 2b45e8
	srawi.	r0,  K,  1
kusano 2b45e8
	mtspr	CTR, r0
kusano 2b45e8
	ble	LL(25)
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(22):
kusano 2b45e8
	vmaddfp	c01, a1, bp1, c01
kusano 2b45e8
	vspltw	bp2, b1, 1
kusano 2b45e8
	addi	AO, AO, 16 * SIZE
kusano 2b45e8
	vmaddfp	c02, a2, bp1, c02
kusano 2b45e8
	addi	BO, BO,  8 * SIZE
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c05, a1, bp2, c05
kusano 2b45e8
	vspltw	bp1, b1, 2
kusano 2b45e8
	vmaddfp	c06, a2, bp2, c06
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c09, a1, bp1, c09
kusano 2b45e8
	vspltw	bp2, b1, 3
kusano 2b45e8
	LOAD_B	b1, OFFSET_0, BO
kusano 2b45e8
	vmaddfp	c10, a2, bp1, c10
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c13, a1, bp2, c13
kusano 2b45e8
	LOAD_A	a1, OFFSET_0, AO
kusano 2b45e8
	vspltw	bp1, b2, 0
kusano 2b45e8
	vmaddfp	c14, a2, bp2, c14
kusano 2b45e8
	LOAD_A	a2, OFFSET_1, AO
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c01, a3, bp1, c01
kusano 2b45e8
	vspltw	bp2, b2, 1
kusano 2b45e8
	vmaddfp	c02, a4, bp1, c02
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c05, a3, bp2, c05
kusano 2b45e8
	vspltw	bp1, b2, 2
kusano 2b45e8
	vmaddfp	c06, a4, bp2, c06
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c09, a3, bp1, c09
kusano 2b45e8
	vspltw	bp2, b2, 3
kusano 2b45e8
	LOAD_B	b2, OFFSET_1, BO
kusano 2b45e8
	vmaddfp	c10, a4, bp1, c10
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c13, a3, bp2, c13
kusano 2b45e8
	LOAD_A	a3, OFFSET_2, AO
kusano 2b45e8
	vmaddfp	c14, a4, bp2, c14
kusano 2b45e8
	LOAD_A	a4, OFFSET_3, AO
kusano 2b45e8
	vspltw	bp1, b1, 0
kusano 2b45e8
	bdnz	LL(22)
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(25):
kusano 2b45e8
	andi.	r0,  K,  1
kusano 2b45e8
	ble+	LL(28)
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(26):
kusano 2b45e8
	vmaddfp	c01, a1, bp1, c01
kusano 2b45e8
	vspltw	bp2, b1, 1
kusano 2b45e8
	vmaddfp	c02, a2, bp1, c02
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c05, a1, bp2, c05
kusano 2b45e8
	vspltw	bp1, b1, 2
kusano 2b45e8
	vmaddfp	c06, a2, bp2, c06
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c09, a1, bp1, c09
kusano 2b45e8
	vspltw	bp2, b1, 3
kusano 2b45e8
	vmaddfp	c10, a2, bp1, c10
kusano 2b45e8
	addi	AO, AO,  8 * SIZE
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c13, a1, bp2, c13
kusano 2b45e8
	addi	BO, BO,  4 * SIZE
kusano 2b45e8
	vmaddfp	c14, a2, bp2, c14
kusano 2b45e8
	nop
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(28):
kusano 2b45e8
	vxor	VZERO, VZERO, VZERO
kusano 2b45e8
kusano 2b45e8
	lvx	swap,    OFFSET_0, SP
kusano 2b45e8
	lvx	neg,     OFFSET_1, SP
kusano 2b45e8
	lvx	alpha_r, OFFSET_2, SP
kusano 2b45e8
	lvx	alpha_i, OFFSET_3, SP
kusano 2b45e8
kusano 2b45e8
	vperm	c05, c05, c05, swap
kusano 2b45e8
	vperm	c06, c06, c06, swap
kusano 2b45e8
	vperm	c13, c13, c13, swap
kusano 2b45e8
	vperm	c14, c14, c14, swap
kusano 2b45e8
kusano 2b45e8
	vxor	c05, c05, neg
kusano 2b45e8
	vxor	c06, c06, neg
kusano 2b45e8
	vxor	c13, c13, neg
kusano 2b45e8
	vxor	c14, c14, neg
kusano 2b45e8
kusano 2b45e8
	vaddfp	c01, c01, c05
kusano 2b45e8
	vaddfp	c02, c02, c06
kusano 2b45e8
	vaddfp	c09, c09, c13
kusano 2b45e8
	vaddfp	c10, c10, c14
kusano 2b45e8
kusano 2b45e8
	vperm	c05, c01, c01, swap
kusano 2b45e8
	vperm	c06, c02, c02, swap
kusano 2b45e8
	vperm	c13, c09, c09, swap
kusano 2b45e8
	vperm	c14, c10, c10, swap
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c01, alpha_r, c01, VZERO
kusano 2b45e8
	vmaddfp	c02, alpha_r, c02, VZERO
kusano 2b45e8
	vmaddfp	c01, alpha_i, c05, c01
kusano 2b45e8
	vmaddfp	c02, alpha_i, c06, c02
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c09, alpha_r, c09, VZERO
kusano 2b45e8
	vmaddfp	c10, alpha_r, c10, VZERO
kusano 2b45e8
	vmaddfp	c09, alpha_i, c13, c09
kusano 2b45e8
	vmaddfp	c10, alpha_i, c14, c10
kusano 2b45e8
kusano 2b45e8
	lvx	C1, OFFSET_0, CO1
kusano 2b45e8
	lvx	C2, OFFSET_1, CO1
kusano 2b45e8
	lvx	C3, OFFSET_2, CO1
kusano 2b45e8
kusano 2b45e8
	lvsr	PERMRSHIFT1, 0, CO1
kusano 2b45e8
	lvsr	PERMRSHIFT2, 0, CO2
kusano 2b45e8
kusano 2b45e8
	vperm	c00, VZERO, c01,   PERMRSHIFT1
kusano 2b45e8
	vperm	c01, c01,   c02,   PERMRSHIFT1
kusano 2b45e8
	vperm	c02, c02, VZERO,   PERMRSHIFT1
kusano 2b45e8
kusano 2b45e8
	vaddfp	c00, c00, C1
kusano 2b45e8
	vaddfp	c01, c01, C2
kusano 2b45e8
	vaddfp	c02, c02, C3
kusano 2b45e8
kusano 2b45e8
	stvx	c00, OFFSET_0, CO1
kusano 2b45e8
	stvx	c01, OFFSET_1, CO1
kusano 2b45e8
	stvx	c02, OFFSET_2, CO1
kusano 2b45e8
kusano 2b45e8
	lvx	C1, OFFSET_0, CO2
kusano 2b45e8
	lvx	C2, OFFSET_1, CO2
kusano 2b45e8
	lvx	C3, OFFSET_2, CO2
kusano 2b45e8
kusano 2b45e8
	vperm	c00, VZERO, c09,   PERMRSHIFT2
kusano 2b45e8
	vperm	c09, c09,   c10,   PERMRSHIFT2
kusano 2b45e8
	vperm	c10, c10,   VZERO, PERMRSHIFT2
kusano 2b45e8
kusano 2b45e8
	vaddfp	c00, c00, C1
kusano 2b45e8
	vaddfp	c09, c09, C2
kusano 2b45e8
	vaddfp	c10, c10, C3
kusano 2b45e8
kusano 2b45e8
	stvx	c00, OFFSET_0, CO2
kusano 2b45e8
	stvx	c09, OFFSET_1, CO2
kusano 2b45e8
	stvx	c10, OFFSET_2, CO2
kusano 2b45e8
kusano 2b45e8
	addi	CO1, CO1, 8 * SIZE
kusano 2b45e8
	addi	CO2, CO2, 8 * SIZE
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(30):
kusano 2b45e8
	andi.	I, M,  2
kusano 2b45e8
	ble	LL(40)
kusano 2b45e8
kusano 2b45e8
	vxor	c01, c01, c01
kusano 2b45e8
	LOAD_A	a1, OFFSET_0, AO
kusano 2b45e8
	vxor	c02, c02, c02
kusano 2b45e8
	LOAD_A	a2, OFFSET_1, AO
kusano 2b45e8
	vxor	c05, c05, c05
kusano 2b45e8
	LOAD_B	b1, OFFSET_0, B
kusano 2b45e8
	vxor	c06, c06, c06
kusano 2b45e8
	LOAD_B	b2, OFFSET_1, B
kusano 2b45e8
	vxor	c09, c09, c09
kusano 2b45e8
	vxor	c10, c10, c10
kusano 2b45e8
	vxor	c13, c13, c13
kusano 2b45e8
	vxor	c14, c14, c14
kusano 2b45e8
kusano 2b45e8
	vspltw	bp1, b1, 0
kusano 2b45e8
	mr	BO, B
kusano 2b45e8
kusano 2b45e8
	srawi.	r0,  K,  1
kusano 2b45e8
	mtspr	CTR, r0
kusano 2b45e8
	ble	LL(35)
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(32):
kusano 2b45e8
	vmaddfp	c01, a1, bp1, c01
kusano 2b45e8
	addi	AO, AO,  8 * SIZE
kusano 2b45e8
	vspltw	bp2, b1, 1
kusano 2b45e8
	vmaddfp	c05, a1, bp2, c05
kusano 2b45e8
	addi	BO, BO,  8 * SIZE
kusano 2b45e8
	vspltw	bp1, b1, 2
kusano 2b45e8
	vmaddfp	c09, a1, bp1, c09
kusano 2b45e8
	vspltw	bp2, b1, 3
kusano 2b45e8
	vmaddfp	c13, a1, bp2, c13
kusano 2b45e8
	LOAD_A	a1, OFFSET_0, AO
kusano 2b45e8
	vspltw	bp1, b2, 0
kusano 2b45e8
	LOAD_B	b1, OFFSET_0, BO
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c02, a2, bp1, c02
kusano 2b45e8
	vspltw	bp2, b2, 1
kusano 2b45e8
	vmaddfp	c06, a2, bp2, c06
kusano 2b45e8
	vspltw	bp1, b2, 2
kusano 2b45e8
	vmaddfp	c10, a2, bp1, c10
kusano 2b45e8
	vspltw	bp2, b2, 3
kusano 2b45e8
	LOAD_B	b2, OFFSET_1, BO
kusano 2b45e8
	vmaddfp	c14, a2, bp2, c14
kusano 2b45e8
	LOAD_A	a2, OFFSET_1, AO
kusano 2b45e8
kusano 2b45e8
	vspltw	bp1, b1, 0
kusano 2b45e8
	bdnz	LL(32)
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(35):
kusano 2b45e8
	andi.	r0,  K,  1
kusano 2b45e8
	ble+	LL(38)
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(36):
kusano 2b45e8
	vmaddfp	c01, a1, bp1, c01
kusano 2b45e8
	vspltw	bp2, b1, 1
kusano 2b45e8
	vmaddfp	c05, a1, bp2, c05
kusano 2b45e8
	vspltw	bp1, b1, 2
kusano 2b45e8
	vmaddfp	c09, a1, bp1, c09
kusano 2b45e8
	vspltw	bp2, b1, 3
kusano 2b45e8
	vmaddfp	c13, a1, bp2, c13
kusano 2b45e8
	addi	AO, AO,  4 * SIZE
kusano 2b45e8
	addi	BO, BO,  4 * SIZE
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(38):
kusano 2b45e8
	vaddfp	c01, c01, c02
kusano 2b45e8
	vaddfp	c05, c05, c06
kusano 2b45e8
	vaddfp	c09, c09, c10
kusano 2b45e8
	vaddfp	c13, c13, c14
kusano 2b45e8
kusano 2b45e8
	vxor	VZERO, VZERO, VZERO
kusano 2b45e8
kusano 2b45e8
	lvx	swap,    OFFSET_0, SP
kusano 2b45e8
	lvx	neg,     OFFSET_1, SP
kusano 2b45e8
	lvx	alpha_r, OFFSET_2, SP
kusano 2b45e8
	lvx	alpha_i, OFFSET_3, SP
kusano 2b45e8
kusano 2b45e8
	vperm	c05, c05, c05, swap
kusano 2b45e8
	vperm	c13, c13, c13, swap
kusano 2b45e8
kusano 2b45e8
	vxor	c05, c05, neg
kusano 2b45e8
	vxor	c13, c13, neg
kusano 2b45e8
kusano 2b45e8
	vaddfp	c01, c01, c05
kusano 2b45e8
	vaddfp	c09, c09, c13
kusano 2b45e8
kusano 2b45e8
	vperm	c05, c01, c01, swap
kusano 2b45e8
	vperm	c13, c09, c09, swap
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c01, alpha_r, c01, VZERO
kusano 2b45e8
	vmaddfp	c01, alpha_i, c05, c01
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c09, alpha_r, c09, VZERO
kusano 2b45e8
	vmaddfp	c09, alpha_i, c13, c09
kusano 2b45e8
kusano 2b45e8
	lvx	C1, OFFSET_0, CO1
kusano 2b45e8
	lvx	C2, OFFSET_1, CO1
kusano 2b45e8
kusano 2b45e8
	lvsr	PERMRSHIFT1, 0, CO1
kusano 2b45e8
	lvsr	PERMRSHIFT2, 0, CO2
kusano 2b45e8
kusano 2b45e8
	vperm	c00, VZERO, c01,   PERMRSHIFT1
kusano 2b45e8
	vperm	c01, c01, VZERO,   PERMRSHIFT1
kusano 2b45e8
kusano 2b45e8
	vaddfp	c00, c00, C1
kusano 2b45e8
	vaddfp	c01, c01, C2
kusano 2b45e8
kusano 2b45e8
	stvx	c00, OFFSET_0, CO1
kusano 2b45e8
	stvx	c01, OFFSET_1, CO1
kusano 2b45e8
kusano 2b45e8
	lvx	C1, OFFSET_0, CO2
kusano 2b45e8
	lvx	C2, OFFSET_1, CO2
kusano 2b45e8
kusano 2b45e8
	vperm	c00, VZERO, c09,   PERMRSHIFT2
kusano 2b45e8
	vperm	c09, c09,   VZERO, PERMRSHIFT2
kusano 2b45e8
kusano 2b45e8
	vaddfp	c00, c00, C1
kusano 2b45e8
	vaddfp	c09, c09, C2
kusano 2b45e8
kusano 2b45e8
	stvx	c00, OFFSET_0, CO2
kusano 2b45e8
	stvx	c09, OFFSET_1, CO2
kusano 2b45e8
kusano 2b45e8
	addi	CO1, CO1,  4 * SIZE
kusano 2b45e8
	addi	CO2, CO2,  4 * SIZE
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(40):
kusano 2b45e8
	andi.	I, M,  1
kusano 2b45e8
	ble	LL(49)
kusano 2b45e8
kusano 2b45e8
	mr	BO, B
kusano 2b45e8
kusano 2b45e8
	LFD	f8,   0 * SIZE(AO)
kusano 2b45e8
	LFD	f9,   1 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LFD	f10,  0 * SIZE(BO)
kusano 2b45e8
	LFD	f11,  1 * SIZE(BO)
kusano 2b45e8
	LFD	f12,  2 * SIZE(BO)
kusano 2b45e8
	LFD	f13,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	lfs	f0,  FZERO(SP)
kusano 2b45e8
 	fmr	f1,  f0
kusano 2b45e8
	fmr	f2,  f0
kusano 2b45e8
	fmr	f3,  f0
kusano 2b45e8
kusano 2b45e8
	fmr	f4,  f0
kusano 2b45e8
	fmr	f5,  f0
kusano 2b45e8
	fmr	f6,  f0
kusano 2b45e8
	fmr	f7,  f0
kusano 2b45e8
kusano 2b45e8
	srawi.	r0,  K,  1
kusano 2b45e8
	mtspr	CTR, r0
kusano 2b45e8
	ble	LL(45)
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(42):
kusano 2b45e8
	fmadd	f0,  f8, f10, f0
kusano 2b45e8
	fmadd	f2,  f8, f11, f2
kusano 2b45e8
	fmadd	f4,  f8, f12, f4
kusano 2b45e8
	fmadd	f6,  f8, f13, f6
kusano 2b45e8
kusano 2b45e8
	fmadd	f1,  f9, f10, f1
kusano 2b45e8
	fmadd	f3,  f9, f11, f3
kusano 2b45e8
	fmadd	f5,  f9, f12, f5
kusano 2b45e8
	fmadd	f7,  f9, f13, f7
kusano 2b45e8
kusano 2b45e8
	LFD	f8,   2 * SIZE(AO)
kusano 2b45e8
	LFD	f9,   3 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LFD	f10,  4 * SIZE(BO)
kusano 2b45e8
	LFD	f11,  5 * SIZE(BO)
kusano 2b45e8
	LFD	f12,  6 * SIZE(BO)
kusano 2b45e8
	LFD	f13,  7 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	fmadd	f0,  f8, f10, f0
kusano 2b45e8
	fmadd	f2,  f8, f11, f2
kusano 2b45e8
	fmadd	f4,  f8, f12, f4
kusano 2b45e8
	fmadd	f6,  f8, f13, f6
kusano 2b45e8
kusano 2b45e8
	fmadd	f1,  f9, f10, f1
kusano 2b45e8
	fmadd	f3,  f9, f11, f3
kusano 2b45e8
	fmadd	f5,  f9, f12, f5
kusano 2b45e8
	fmadd	f7,  f9, f13, f7
kusano 2b45e8
kusano 2b45e8
	LFD	f8,   4 * SIZE(AO)
kusano 2b45e8
	LFD	f9,   5 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LFD	f10,  8 * SIZE(BO)
kusano 2b45e8
	LFD	f11,  9 * SIZE(BO)
kusano 2b45e8
	LFD	f12, 10 * SIZE(BO)
kusano 2b45e8
	LFD	f13, 11 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
	addi	AO, AO,  4 * SIZE
kusano 2b45e8
	addi	BO, BO,  8 * SIZE
kusano 2b45e8
	bdnz	LL(42)
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(45):
kusano 2b45e8
	andi.	r0,  K,  1
kusano 2b45e8
	ble	LL(48)
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(46):
kusano 2b45e8
	fmadd	f0,  f8, f10, f0
kusano 2b45e8
	fmadd	f2,  f8, f11, f2
kusano 2b45e8
	fmadd	f4,  f8, f12, f4
kusano 2b45e8
	fmadd	f6,  f8, f13, f6
kusano 2b45e8
kusano 2b45e8
	fmadd	f1,  f9, f10, f1
kusano 2b45e8
	fmadd	f3,  f9, f11, f3
kusano 2b45e8
	fmadd	f5,  f9, f12, f5
kusano 2b45e8
	fmadd	f7,  f9, f13, f7
kusano 2b45e8
kusano 2b45e8
	addi	AO, AO,  2 * SIZE
kusano 2b45e8
	addi	BO, BO,  4 * SIZE
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(48):
/* Write-back of the scalar 1-row x 2-column tile.                         */
/* f0..f7 hold the eight partial sums built in LL(42)/LL(46):              */
/*   column 1: f0=a0*b0 f1=a1*b0 f2=a0*b1 f3=a1*b1                         */
/*   column 2: f4=a0*b2 f5=a1*b2 f6=a0*b3 f7=a1*b3                         */
/* The combinations below are consistent with (a0,a1) and (b0,b1)/(b2,b3)  */
/* being (real, imag) pairs; the #if selects the sign pattern for the      */
/* plain/conjugated operand variants.  Result: f0/f1 = column 1 real/imag, */
/* f4/f5 = column 2 real/imag.                                             */
kusano 2b45e8
#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
kusano 2b45e8
	fsub	f0, f0, f3
kusano 2b45e8
	fadd	f1, f1, f2
kusano 2b45e8
	fsub	f4, f4, f7
kusano 2b45e8
	fadd	f5, f5, f6
kusano 2b45e8
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
kusano 2b45e8
	fadd	f0, f0, f3
kusano 2b45e8
	fsub	f1, f1, f2
kusano 2b45e8
	fadd	f4, f4, f7
kusano 2b45e8
	fsub	f5, f5, f6
kusano 2b45e8
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
kusano 2b45e8
	fadd	f0, f0, f3
kusano 2b45e8
	fsub	f1, f2, f1
kusano 2b45e8
	fadd	f4, f4, f7
kusano 2b45e8
	fsub	f5, f6, f5
kusano 2b45e8
#else /* RR, RC, CR, CC */
/* Same arithmetic as the first branch; the sign of the imaginary part is */
/* instead applied by the fnmsub/fmadd pattern of the alpha step below.   */
kusano 2b45e8
	fsub	f0, f0, f3
kusano 2b45e8
	fadd	f1, f1, f2
kusano 2b45e8
	fsub	f4, f4, f7
kusano 2b45e8
	fadd	f5, f5, f6
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
/* Current C values: one (real, imag) pair from each of the two columns. */
	LFD	f8,  0 * SIZE(CO1)
kusano 2b45e8
	LFD	f9,  1 * SIZE(CO1)
kusano 2b45e8
	LFD	f10, 0 * SIZE(CO2)
kusano 2b45e8
	LFD	f11, 1 * SIZE(CO2)
kusano 2b45e8
kusano 2b45e8
/* Scalar alpha from the stack slots ALPHA_R / ALPHA_I.                   */
/* NOTE(review): alpha_i is read from word 1 (+4) of its slot; presumably */
/* the slot holds a sign-alternating vector and word 1 is the positively  */
/* signed copy -- confirm against the prologue.                           */
	lfs	f12,  ALPHA_R + 0(SP)
kusano 2b45e8
	lfs	f13,  ALPHA_I + 4(SP)
kusano 2b45e8
kusano 2b45e8
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
/* C_re += alpha_r*f0 + alpha_i*f1 ;  C_im += alpha_i*f0 - alpha_r*f1     */
/* (fnmsub d,a,b,c computes c - a*b).                                     */
kusano 2b45e8
	fmadd	f8,  f12, f0, f8
kusano 2b45e8
	fnmsub	f9,  f12, f1, f9
kusano 2b45e8
	fmadd	f10, f12, f4, f10
kusano 2b45e8
	fnmsub	f11, f12, f5, f11
kusano 2b45e8
kusano 2b45e8
	fmadd	f8,  f13, f1, f8
kusano 2b45e8
	fmadd	f9,  f13, f0, f9
kusano 2b45e8
	fmadd	f10, f13, f5, f10
kusano 2b45e8
	fmadd	f11, f13, f4, f11
kusano 2b45e8
#else
/* C_re += alpha_r*f0 - alpha_i*f1 ;  C_im += alpha_r*f1 + alpha_i*f0     */
kusano 2b45e8
	fmadd	f8,  f12, f0, f8
kusano 2b45e8
	fmadd	f9,  f12, f1, f9
kusano 2b45e8
	fmadd	f10, f12, f4, f10
kusano 2b45e8
	fmadd	f11, f12, f5, f11
kusano 2b45e8
kusano 2b45e8
	fnmsub	f8,  f13, f1, f8
kusano 2b45e8
	fmadd	f9,  f13, f0, f9
kusano 2b45e8
	fnmsub	f10, f13, f5, f10
kusano 2b45e8
	fmadd	f11, f13, f4, f11
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
	STFD	f8,  0 * SIZE(CO1)
kusano 2b45e8
	STFD	f9,  1 * SIZE(CO1)
kusano 2b45e8
	STFD	f10, 0 * SIZE(CO2)
kusano 2b45e8
	STFD	f11, 1 * SIZE(CO2)
kusano 2b45e8
kusano 2b45e8
LL(49):
/* End of one pass of the column loop: B continues where BO stopped, and  */
/* the loop branches back to LL(01) (outside this excerpt) while the      */
/* decremented J is still positive.                                       */
kusano 2b45e8
	mr	B, BO
kusano 2b45e8
kusano 2b45e8
	addic.	J, J, -1
kusano 2b45e8
	bgt	LL(01)
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(50):
/* Remaining single column: taken only when N is odd.  andi. yields 0 or  */
/* 1, so ble leaves for the epilogue LL(999) when the low bit is clear.   */
kusano 2b45e8
	andi.	J, N,  1
kusano 2b45e8
	ble	LL(999)
kusano 2b45e8
kusano 2b45e8
/* Only one output column pointer is needed; A is restarted from its base. */
	mr	CO1, C
kusano 2b45e8
	mr	AO, A
kusano 2b45e8
kusano 2b45e8
/* I = M / 8: number of full vector tiles (each tile stores 16 * SIZE of  */
/* C, see LL(68)).  Skip to the remainder handling when there is none.    */
	srawi.	I, M,  3
kusano 2b45e8
	ble	LL(70)
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(61):
/* One full vector tile of the single-column path (repeated I times).     */
/* Clears the accumulators c01-c08 and pre-loads the first B vector (b1)  */
/* and the first four A vectors (a1-a4).  LOAD_A / LOAD_B are macros      */
/* defined outside this excerpt (presumably AltiVec vector loads).        */
kusano 2b45e8
	vxor	c01, c01, c01
kusano 2b45e8
	LOAD_B	b1, OFFSET_0, B
kusano 2b45e8
	vxor	c02, c02, c02
kusano 2b45e8
	vxor	c03, c03, c03
kusano 2b45e8
	LOAD_A	a1, OFFSET_0, AO
kusano 2b45e8
	vxor	c04, c04, c04
kusano 2b45e8
	LOAD_A	a2, OFFSET_1, AO
kusano 2b45e8
	vxor	c05, c05, c05
kusano 2b45e8
	LOAD_A	a3, OFFSET_2, AO
kusano 2b45e8
	vxor	c06, c06, c06
kusano 2b45e8
	LOAD_A	a4, OFFSET_3, AO
kusano 2b45e8
	vxor	c07, c07, c07
kusano 2b45e8
	vxor	c08, c08, c08
kusano 2b45e8
kusano 2b45e8
/* Every tile walks the same column of B from its start. */
	mr	BO, B
kusano 2b45e8
/* Touch-for-store hints for the output lines.                            */
/* NOTE(review): CO2 is never assigned in the visible single-column path  */
/* (LL(50) sets only CO1), so the second hint looks like a leftover from  */
/* the two-column code -- confirm before removing.                        */
	dcbtst	CO1, PREC
kusano 2b45e8
	dcbtst	CO2, PREC
kusano 2b45e8
kusano 2b45e8
/* bp1 = word 0 of b1 replicated into all four lanes. */
	vspltw	bp1, b1, 0
kusano 2b45e8
kusano 2b45e8
/* CTR = K / 2 (the loop below does two K steps per pass).  mtspr does    */
/* not touch CR0, so ble still tests the srawi. result.                   */
	srawi.	r0,  K,  1
kusano 2b45e8
	mtspr	CTR, r0
kusano 2b45e8
	ble	LL(65)
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(62):
/* Two K steps per pass.  b1 words 0/1 pair with a1-a4 (first step) and   */
/* words 2/3 with a5-a8 (second step).  c01-c04 collect the products with */
/* the even words of b1, c05-c08 those with the odd words (presumably the */
/* real and imaginary part of the B element -- confirm against packing).  */
kusano 2b45e8
	LOAD_A	a5, OFFSET_4, AO
kusano 2b45e8
	LOAD_A	a6, OFFSET_5, AO
kusano 2b45e8
	LOAD_A	a7, OFFSET_6, AO
kusano 2b45e8
	LOAD_A	a8, OFFSET_7, AO
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c01, a1, bp1, c01
kusano 2b45e8
	vspltw	bp2, b1, 1
kusano 2b45e8
	vmaddfp	c02, a2, bp1, c02
kusano 2b45e8
	vmaddfp	c03, a3, bp1, c03
kusano 2b45e8
	vmaddfp	c04, a4, bp1, c04
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c05, a1, bp2, c05
kusano 2b45e8
	vspltw	bp1, b1, 2
kusano 2b45e8
	vmaddfp	c06, a2, bp2, c06
kusano 2b45e8
	vmaddfp	c07, a3, bp2, c07
kusano 2b45e8
	vmaddfp	c08, a4, bp2, c08
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c01, a5, bp1, c01
kusano 2b45e8
	vspltw	bp2, b1, 3
kusano 2b45e8
	vmaddfp	c02, a6, bp1, c02
kusano 2b45e8
	vmaddfp	c03, a7, bp1, c03
kusano 2b45e8
	vmaddfp	c04, a8, bp1, c04
kusano 2b45e8
kusano 2b45e8
/* All four words of b1 have been splatted: fetch the next B vector and  */
/* prepare bp1 for the following pass (bp2 still holds old word 3).      */
	LOAD_B	b1, OFFSET_1, BO
kusano 2b45e8
	vspltw	bp1, b1, 0
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c05, a5, bp2, c05
kusano 2b45e8
	vmaddfp	c06, a6, bp2, c06
kusano 2b45e8
	vmaddfp	c07, a7, bp2, c07
kusano 2b45e8
	vmaddfp	c08, a8, bp2, c08
kusano 2b45e8
kusano 2b45e8
/* Two K steps consumed: 2 x 16 values of A, 2 x 2 values of B. */
	addi	AO, AO, 32 * SIZE
kusano 2b45e8
	addi	BO, BO,  4 * SIZE
kusano 2b45e8
kusano 2b45e8
/* Pre-load the first half of the next pass (also feeds the tail LL(66)). */
	LOAD_A	a1, OFFSET_0, AO
kusano 2b45e8
	LOAD_A	a2, OFFSET_1, AO
kusano 2b45e8
	LOAD_A	a3, OFFSET_2, AO
kusano 2b45e8
	LOAD_A	a4, OFFSET_3, AO
kusano 2b45e8
	bdnz	LL(62)
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(65):
/* Odd-K tail; the '+' hint marks the skip (K even) as the likely path. */
kusano 2b45e8
	andi.	r0,  K,  1
kusano 2b45e8
	ble+	LL(68)
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(66):
/* One remaining K step on the pre-loaded a1-a4 and b1 words 0/1.  The    */
/* pointer updates are interleaved with the multiply-adds; AO advances by */
/* the 16 values of A this step used.                                     */
kusano 2b45e8
	vmaddfp	c01, a1, bp1, c01
kusano 2b45e8
	vspltw	bp2, b1, 1
kusano 2b45e8
	vmaddfp	c02, a2, bp1, c02
kusano 2b45e8
	addi	AO, AO, 16 * SIZE
kusano 2b45e8
	vmaddfp	c03, a3, bp1, c03
kusano 2b45e8
	addi	BO, BO,  2 * SIZE
kusano 2b45e8
	vmaddfp	c04, a4, bp1, c04
kusano 2b45e8
	nop
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c05, a1, bp2, c05
kusano 2b45e8
	vmaddfp	c06, a2, bp2, c06
kusano 2b45e8
	vmaddfp	c07, a3, bp2, c07
kusano 2b45e8
	vmaddfp	c08, a4, bp2, c08
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(68):
/* Write-back of one full vector tile (four result vectors) to column CO1. */
/* Input: c01-c04 = sums of A times the even words of B, c05-c08 = sums of */
/* A times the odd words of B (see LL(62)).                                */
kusano 2b45e8
	vxor	VZERO, VZERO, VZERO
kusano 2b45e8
kusano 2b45e8
/* Constants kept in the first four vector slots of the stack frame; they */
/* are stored there outside this excerpt.  Judging by the names: a        */
/* permute control exchanging neighbouring words, an XOR sign mask, and   */
/* the two alpha vectors -- confirm against the prologue.                 */
	lvx	swap,    OFFSET_0, SP
kusano 2b45e8
	lvx	neg,     OFFSET_1, SP
kusano 2b45e8
	lvx	alpha_r, OFFSET_2, SP
kusano 2b45e8
	lvx	alpha_i, OFFSET_3, SP
kusano 2b45e8
kusano 2b45e8
/* Fold the odd-word products into the even-word ones: permute, flip the */
/* signs selected by 'neg', then add.                                    */
	vperm	c05, c05, c05, swap
kusano 2b45e8
	vperm	c06, c06, c06, swap
kusano 2b45e8
	vperm	c07, c07, c07, swap
kusano 2b45e8
	vperm	c08, c08, c08, swap
kusano 2b45e8
kusano 2b45e8
	vxor	c05, c05, neg
kusano 2b45e8
	vxor	c06, c06, neg
kusano 2b45e8
	vxor	c07, c07, neg
kusano 2b45e8
	vxor	c08, c08, neg
kusano 2b45e8
kusano 2b45e8
	vaddfp	c01, c01, c05
kusano 2b45e8
	vaddfp	c02, c02, c06
kusano 2b45e8
	vaddfp	c03, c03, c07
kusano 2b45e8
	vaddfp	c04, c04, c08
kusano 2b45e8
kusano 2b45e8
/* Scale by alpha: result = alpha_r * x + alpha_i * permuted(x). */
	vperm	c05, c01, c01, swap
kusano 2b45e8
	vperm	c06, c02, c02, swap
kusano 2b45e8
	vperm	c07, c03, c03, swap
kusano 2b45e8
	vperm	c08, c04, c04, swap
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c01, alpha_r, c01, VZERO
kusano 2b45e8
	vmaddfp	c02, alpha_r, c02, VZERO
kusano 2b45e8
	vmaddfp	c03, alpha_r, c03, VZERO
kusano 2b45e8
	vmaddfp	c04, alpha_r, c04, VZERO
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c01, alpha_i, c05, c01
kusano 2b45e8
	vmaddfp	c02, alpha_i, c06, c02
kusano 2b45e8
	vmaddfp	c03, alpha_i, c07, c03
kusano 2b45e8
	vmaddfp	c04, alpha_i, c08, c04
kusano 2b45e8
kusano 2b45e8
/* CO1 may not be 16-byte aligned and lvx/stvx ignore the low address    */
/* bits, so the four result vectors can straddle five aligned vectors of */
/* C.  Load all five.                                                    */
	lvx	C1, OFFSET_0, CO1
kusano 2b45e8
	lvx	C2, OFFSET_1, CO1
kusano 2b45e8
	lvx	C3, OFFSET_2, CO1
kusano 2b45e8
	lvx	C4, OFFSET_3, CO1
kusano 2b45e8
	lvx	C5, OFFSET_4, CO1
kusano 2b45e8
kusano 2b45e8
/* Shift-right permute control derived from the misalignment of CO1. */
	lvsr	PERMRSHIFT1, 0, CO1
kusano 2b45e8
kusano 2b45e8
/* Slide the results into place.  Zeros are shifted in at both ends, so */
/* lanes of C outside the tile only have zero added to them.            */
	vperm	c00, VZERO, c01,   PERMRSHIFT1
kusano 2b45e8
	vperm	c01, c01,   c02,   PERMRSHIFT1
kusano 2b45e8
	vperm	c02, c02,   c03,   PERMRSHIFT1
kusano 2b45e8
	vperm	c03, c03,   c04,   PERMRSHIFT1
kusano 2b45e8
	vperm	c04, c04,   VZERO, PERMRSHIFT1
kusano 2b45e8
kusano 2b45e8
	vaddfp	c00, c00, C1
kusano 2b45e8
	vaddfp	c01, c01, C2
kusano 2b45e8
	vaddfp	c02, c02, C3
kusano 2b45e8
	vaddfp	c03, c03, C4
kusano 2b45e8
	vaddfp	c04, c04, C5
kusano 2b45e8
kusano 2b45e8
	stvx	c00, OFFSET_0, CO1
kusano 2b45e8
	stvx	c01, OFFSET_1, CO1
kusano 2b45e8
	stvx	c02, OFFSET_2, CO1
kusano 2b45e8
	stvx	c03, OFFSET_3, CO1
kusano 2b45e8
	stvx	c04, OFFSET_4, CO1
kusano 2b45e8
kusano 2b45e8
/* Next tile of the column; loop while full tiles remain. */
	addi	CO1, CO1, 16 * SIZE
kusano 2b45e8
	addic.	I, I, -1
kusano 2b45e8
	bgt+	LL(61)
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(70):
/* Half-size vector tile of the single-column path, run once when bit 2   */
/* of M is set (andi. is 0 or non-zero positive, ble skips on zero).      */
/* Each K step uses two A vectors; the write-back at LL(78) stores        */
/* 8 * SIZE of C.                                                         */
kusano 2b45e8
	andi.	I, M,  4
kusano 2b45e8
	ble	LL(80)
kusano 2b45e8
kusano 2b45e8
/* Clear the accumulators and pre-load b1 and a1-a4, which here cover    */
/* TWO K steps (a1,a2 = first step, a3,a4 = second step).                */
	vxor	c01, c01, c01
kusano 2b45e8
	LOAD_B	b1, OFFSET_0, B
kusano 2b45e8
	vxor	c02, c02, c02
kusano 2b45e8
	vxor	c03, c03, c03
kusano 2b45e8
	LOAD_A	a1, OFFSET_0, AO
kusano 2b45e8
	vxor	c04, c04, c04
kusano 2b45e8
	LOAD_A	a2, OFFSET_1, AO
kusano 2b45e8
	vxor	c05, c05, c05
kusano 2b45e8
	LOAD_A	a3, OFFSET_2, AO
kusano 2b45e8
	vxor	c06, c06, c06
kusano 2b45e8
	LOAD_A	a4, OFFSET_3, AO
kusano 2b45e8
	vxor	c07, c07, c07
kusano 2b45e8
	vxor	c08, c08, c08
kusano 2b45e8
kusano 2b45e8
	mr	BO, B
kusano 2b45e8
kusano 2b45e8
/* CTR = K / 2; mtspr leaves CR0 alone, so ble tests the srawi. result. */
	vspltw	bp1, b1, 0
kusano 2b45e8
	srawi.	r0,  K,  1
kusano 2b45e8
	mtspr	CTR, r0
kusano 2b45e8
	ble	LL(75)
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(72):
/* Two K steps per pass, kept in separate accumulator sets:               */
/*   first  step: c01,c02 += a1,a2 * b1[0]    c05,c06 += a1,a2 * b1[1]    */
/*   second step: c03,c04 += a3,a4 * b1[2]    c07,c08 += a3,a4 * b1[3]    */
/* The two sets are summed at LL(78).                                     */
kusano 2b45e8
	vmaddfp	c01, a1, bp1, c01
kusano 2b45e8
	vspltw	bp2, b1, 1
kusano 2b45e8
	vmaddfp	c02, a2, bp1, c02
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c05, a1, bp2, c05
kusano 2b45e8
	vspltw	bp1, b1, 2
kusano 2b45e8
	vmaddfp	c06, a2, bp2, c06
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c03, a3, bp1, c03
kusano 2b45e8
	vspltw	bp2, b1, 3
kusano 2b45e8
	vmaddfp	c04, a4, bp1, c04
kusano 2b45e8
kusano 2b45e8
/* b1 fully consumed: fetch the next B vector and its word 0. */
	LOAD_B	b1, OFFSET_1, BO
kusano 2b45e8
	vspltw	bp1, b1, 0
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c07, a3, bp2, c07
kusano 2b45e8
	vmaddfp	c08, a4, bp2, c08
kusano 2b45e8
kusano 2b45e8
/* Two K steps consumed: 2 x 8 values of A, 2 x 2 values of B. */
	addi	AO, AO, 16 * SIZE
kusano 2b45e8
	addi	BO, BO,  4 * SIZE
kusano 2b45e8
kusano 2b45e8
	LOAD_A	a1, OFFSET_0, AO
kusano 2b45e8
	LOAD_A	a2, OFFSET_1, AO
kusano 2b45e8
	LOAD_A	a3, OFFSET_2, AO
kusano 2b45e8
	LOAD_A	a4, OFFSET_3, AO
kusano 2b45e8
	bdnz	LL(72)
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(75):
/* Odd-K tail, skipped (hinted as likely) when K is even. */
kusano 2b45e8
	andi.	r0,  K,  1
kusano 2b45e8
	ble+	LL(78)
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(76):
/* One remaining K step: only a1,a2 and b1 words 0/1 are used. */
kusano 2b45e8
	vmaddfp	c01, a1, bp1, c01
kusano 2b45e8
	vspltw	bp2, b1, 1
kusano 2b45e8
	vmaddfp	c02, a2, bp1, c02
kusano 2b45e8
	addi	AO, AO,  8 * SIZE
kusano 2b45e8
	vmaddfp	c05, a1, bp2, c05
kusano 2b45e8
	addi	BO, BO,  2 * SIZE
kusano 2b45e8
	vmaddfp	c06, a2, bp2, c06
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(78):
/* Write-back of the half-size tile (two result vectors) to column CO1.  */
/* First merge the per-K-step accumulator sets kept apart by LL(72).     */
kusano 2b45e8
	vaddfp	c01, c01, c03
kusano 2b45e8
	vaddfp	c02, c02, c04
kusano 2b45e8
	vaddfp	c05, c05, c07
kusano 2b45e8
	vaddfp	c06, c06, c08
kusano 2b45e8
kusano 2b45e8
	vxor	VZERO, VZERO, VZERO
kusano 2b45e8
kusano 2b45e8
/* Constants from the first four vector slots of the stack frame (stored */
/* outside this excerpt): permute control, XOR sign mask, alpha vectors. */
	lvx	swap,    OFFSET_0, SP
kusano 2b45e8
	lvx	neg,     OFFSET_1, SP
kusano 2b45e8
	lvx	alpha_r, OFFSET_2, SP
kusano 2b45e8
	lvx	alpha_i, OFFSET_3, SP
kusano 2b45e8
kusano 2b45e8
/* Fold the odd-word products into the even-word ones. */
	vperm	c05, c05, c05, swap
kusano 2b45e8
	vperm	c06, c06, c06, swap
kusano 2b45e8
kusano 2b45e8
	vxor	c05, c05, neg
kusano 2b45e8
	vxor	c06, c06, neg
kusano 2b45e8
kusano 2b45e8
	vaddfp	c01, c01, c05
kusano 2b45e8
	vaddfp	c02, c02, c06
kusano 2b45e8
kusano 2b45e8
/* Scale by alpha: result = alpha_r * x + alpha_i * permuted(x). */
	vperm	c05, c01, c01, swap
kusano 2b45e8
	vperm	c06, c02, c02, swap
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c01, alpha_r, c01, VZERO
kusano 2b45e8
	vmaddfp	c02, alpha_r, c02, VZERO
kusano 2b45e8
	vmaddfp	c01, alpha_i, c05, c01
kusano 2b45e8
	vmaddfp	c02, alpha_i, c06, c02
kusano 2b45e8
kusano 2b45e8
/* Two result vectors at a possibly unaligned CO1 can straddle three */
/* aligned vectors of C (lvx/stvx ignore the low address bits).      */
	lvx	C1, OFFSET_0, CO1
kusano 2b45e8
	lvx	C2, OFFSET_1, CO1
kusano 2b45e8
	lvx	C3, OFFSET_2, CO1
kusano 2b45e8
kusano 2b45e8
	lvsr	PERMRSHIFT1, 0, CO1
kusano 2b45e8
kusano 2b45e8
/* Slide the results into place; zeros enter at both ends so lanes of C */
/* outside the tile only have zero added to them.                       */
	vperm	c00, VZERO, c01,   PERMRSHIFT1
kusano 2b45e8
	vperm	c01, c01,   c02,   PERMRSHIFT1
kusano 2b45e8
	vperm	c02, c02, VZERO,   PERMRSHIFT1
kusano 2b45e8
kusano 2b45e8
	vaddfp	c00, c00, C1
kusano 2b45e8
	vaddfp	c01, c01, C2
kusano 2b45e8
	vaddfp	c02, c02, C3
kusano 2b45e8
kusano 2b45e8
	stvx	c00, OFFSET_0, CO1
kusano 2b45e8
	stvx	c01, OFFSET_1, CO1
kusano 2b45e8
	stvx	c02, OFFSET_2, CO1
kusano 2b45e8
kusano 2b45e8
	addi	CO1, CO1,  8 * SIZE
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(80):
/* Quarter-size vector tile of the single-column path, run once when bit */
/* 1 of M is set.  Each K step uses a single A vector; the write-back at */
/* LL(88) stores 4 * SIZE of C.                                          */
kusano 2b45e8
	andi.	I, M,  2
kusano 2b45e8
	ble	LL(90)
kusano 2b45e8
kusano 2b45e8
/* Clear the four accumulators used here and pre-load b1 and a1,a2, */
/* which cover TWO K steps (a1 = first step, a2 = second step).     */
	vxor	c01, c01, c01
kusano 2b45e8
	LOAD_B	b1, OFFSET_0, B
kusano 2b45e8
	vxor	c02, c02, c02
kusano 2b45e8
	LOAD_A	a1, OFFSET_0, AO
kusano 2b45e8
	LOAD_A	a2, OFFSET_1, AO
kusano 2b45e8
	vxor	c05, c05, c05
kusano 2b45e8
	vxor	c06, c06, c06
kusano 2b45e8
kusano 2b45e8
	mr	BO, B
kusano 2b45e8
kusano 2b45e8
	vspltw	bp1, b1, 0
kusano 2b45e8
kusano 2b45e8
/* CTR = K / 2; mtspr leaves CR0 alone, so ble tests the srawi. result. */
	srawi.	r0,  K,  1
kusano 2b45e8
	mtspr	CTR, r0
kusano 2b45e8
	ble	LL(85)
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(82):
/* Two K steps per pass, kept in separate accumulators:              */
/*   first  step: c01 += a1 * b1[0]    c05 += a1 * b1[1]             */
/*   second step: c02 += a2 * b1[2]    c06 += a2 * b1[3]             */
/* The pairs (c01,c02) and (c05,c06) are summed at LL(88).           */
kusano 2b45e8
	vmaddfp	c01, a1, bp1, c01
kusano 2b45e8
	vspltw	bp2, b1, 1
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c05, a1, bp2, c05
kusano 2b45e8
	vspltw	bp1, b1, 2
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c02, a2, bp1, c02
kusano 2b45e8
	vspltw	bp2, b1, 3
kusano 2b45e8
kusano 2b45e8
/* b1 fully consumed: fetch the next B vector and its word 0. */
	LOAD_B	b1, OFFSET_1, BO
kusano 2b45e8
	vspltw	bp1, b1, 0
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c06, a2, bp2, c06
kusano 2b45e8
kusano 2b45e8
/* Two K steps consumed: 2 x 4 values of A, 2 x 2 values of B. */
	addi	AO, AO,  8 * SIZE
kusano 2b45e8
	addi	BO, BO,  4 * SIZE
kusano 2b45e8
kusano 2b45e8
	LOAD_A	a1, OFFSET_0, AO
kusano 2b45e8
	LOAD_A	a2, OFFSET_1, AO
kusano 2b45e8
	bdnz	LL(82)
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(85):
/* Odd-K tail, skipped (hinted as likely) when K is even. */
kusano 2b45e8
	andi.	r0,  K,  1
kusano 2b45e8
	ble+	LL(88)
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(86):
/* One remaining K step: only a1 and b1 words 0/1 are used. */
kusano 2b45e8
	vspltw	bp2, b1, 1
kusano 2b45e8
	vmaddfp	c01, a1, bp1, c01
kusano 2b45e8
	vmaddfp	c05, a1, bp2, c05
kusano 2b45e8
	addi	AO, AO,  4 * SIZE
kusano 2b45e8
	addi	BO, BO,  2 * SIZE
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(88):
/* Write-back of the quarter-size tile (one result vector) to column CO1. */
/* First merge the per-K-step accumulators kept apart by LL(82).  Only    */
/* c01/c02 and c05/c06 are live in this path; the former additions on     */
/* c09/c10 and c13/c14 operated on registers that this path never         */
/* initialises or reads, so they have been dropped.                       */
kusano 2b45e8
	vaddfp	c01, c01, c02
kusano 2b45e8
	vaddfp	c05, c05, c06
kusano 2b45e8
kusano 2b45e8
	vxor	VZERO, VZERO, VZERO
kusano 2b45e8
kusano 2b45e8
/* Constants from the first four vector slots of the stack frame (stored */
/* outside this excerpt): permute control, XOR sign mask, alpha vectors. */
	lvx	swap,    OFFSET_0, SP
kusano 2b45e8
	lvx	neg,     OFFSET_1, SP
kusano 2b45e8
	lvx	alpha_r, OFFSET_2, SP
kusano 2b45e8
	lvx	alpha_i, OFFSET_3, SP
kusano 2b45e8
kusano 2b45e8
/* Fold the odd-word products into the even-word ones: permute, flip the */
/* signs selected by 'neg', then add.                                    */
	vperm	c05, c05, c05, swap
kusano 2b45e8
kusano 2b45e8
	vxor	c05, c05, neg
kusano 2b45e8
kusano 2b45e8
	vaddfp	c01, c01, c05
kusano 2b45e8
kusano 2b45e8
/* Scale by alpha: result = alpha_r * x + alpha_i * permuted(x). */
	vperm	c05, c01, c01, swap
kusano 2b45e8
kusano 2b45e8
	vmaddfp	c01, alpha_r, c01, VZERO
kusano 2b45e8
	vmaddfp	c01, alpha_i, c05, c01
kusano 2b45e8
kusano 2b45e8
/* One result vector at a possibly unaligned CO1 can straddle two aligned */
/* vectors of C (lvx/stvx ignore the low address bits).                   */
	lvx	C1, OFFSET_0, CO1
kusano 2b45e8
	lvx	C2, OFFSET_1, CO1
kusano 2b45e8
kusano 2b45e8
	lvsr	PERMRSHIFT1, 0, CO1
kusano 2b45e8
kusano 2b45e8
/* Slide the result into place; zeros enter at both ends so lanes of C */
/* outside the tile only have zero added to them.                      */
	vperm	c00, VZERO, c01,   PERMRSHIFT1
kusano 2b45e8
	vperm	c01, c01, VZERO,   PERMRSHIFT1
kusano 2b45e8
kusano 2b45e8
	vaddfp	c00, c00, C1
kusano 2b45e8
	vaddfp	c01, c01, C2
kusano 2b45e8
kusano 2b45e8
	stvx	c00, OFFSET_0, CO1
kusano 2b45e8
	stvx	c01, OFFSET_1, CO1
kusano 2b45e8
kusano 2b45e8
	addi	CO1, CO1,  4 * SIZE
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(90):
/* Scalar path for the last row of the single-column case, run when M is */
/* odd; otherwise go straight to the epilogue.                           */
kusano 2b45e8
	andi.	I, M,  1
kusano 2b45e8
	ble	LL(999)
kusano 2b45e8
kusano 2b45e8
	mr	BO, B
kusano 2b45e8
kusano 2b45e8
/* Pre-load one A pair (f8,f9) and the B pairs of two K steps:           */
/* f10,f11 = first step, f12,f13 = second step.  LFD is a macro defined  */
/* outside this excerpt (presumably the precision-dependent FP load).    */
	LFD	f8,   0 * SIZE(AO)
kusano 2b45e8
	LFD	f9,   1 * SIZE(AO)
kusano 2b45e8
kusano 2b45e8
	LFD	f10,  0 * SIZE(BO)
kusano 2b45e8
	LFD	f11,  1 * SIZE(BO)
kusano 2b45e8
	LFD	f12,  2 * SIZE(BO)
kusano 2b45e8
	LFD	f13,  3 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
/* Initialise the four accumulators from the FZERO stack slot           */
/* (presumably 0.0, written by the prologue -- confirm).                */
	lfs	f0,  FZERO(SP)
kusano 2b45e8
 	fmr	f1,  f0
kusano 2b45e8
	fmr	f2,  f0
kusano 2b45e8
	fmr	f3,  f0
kusano 2b45e8
kusano 2b45e8
/* CTR = K / 2; mtspr leaves CR0 alone, so ble tests the srawi. result. */
	srawi.	r0,  K,  1
kusano 2b45e8
	mtspr	CTR, r0
kusano 2b45e8
	ble	LL(95)
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(92):
/* Two K steps per pass.  With a0,a1 = f8,f9 and b0,b1 = the B pair of   */
/* the current step:  f0 += a0*b0  f2 += a0*b1  f1 += a1*b0  f3 += a1*b1 */
kusano 2b45e8
	fmadd	f0,  f8, f10, f0
kusano 2b45e8
	fmadd	f2,  f8, f11, f2
kusano 2b45e8
	fmadd	f1,  f9, f10, f1
kusano 2b45e8
	fmadd	f3,  f9, f11, f3
kusano 2b45e8
kusano 2b45e8
/* A pair of the second step; f10,f11 are refilled with the B pair of */
/* the first step of the NEXT pass (offsets 4,5).                     */
	LFD	f8,   2 * SIZE(AO)
kusano 2b45e8
	LFD	f9,   3 * SIZE(AO)
kusano 2b45e8
	LFD	f10,  4 * SIZE(BO)
kusano 2b45e8
	LFD	f11,  5 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
/* Second step uses the B pair held in f12,f13. */
	fmadd	f0,  f8, f12, f0
kusano 2b45e8
	fmadd	f2,  f8, f13, f2
kusano 2b45e8
	fmadd	f1,  f9, f12, f1
kusano 2b45e8
	fmadd	f3,  f9, f13, f3
kusano 2b45e8
kusano 2b45e8
/* Pre-load for the next pass before the pointers move. */
	LFD	f8,   4 * SIZE(AO)
kusano 2b45e8
	LFD	f9,   5 * SIZE(AO)
kusano 2b45e8
	LFD	f12,  6 * SIZE(BO)
kusano 2b45e8
	LFD	f13,  7 * SIZE(BO)
kusano 2b45e8
kusano 2b45e8
/* Two K steps consumed: 2 x 2 values of A and of B. */
	addi	AO, AO,  4 * SIZE
kusano 2b45e8
	addi	BO, BO,  4 * SIZE
kusano 2b45e8
	bdnz	LL(92)
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(95):
/* Odd-K tail, skipped when K is even. */
kusano 2b45e8
	andi.	r0,  K,  1
kusano 2b45e8
	ble	LL(98)
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(96):
/* One remaining K step on the pre-loaded f8-f11.  AO/BO are not        */
/* advanced: this is the last tile before the epilogue.                 */
kusano 2b45e8
	fmadd	f0,  f8, f10, f0
kusano 2b45e8
	fmadd	f2,  f8, f11, f2
kusano 2b45e8
	fmadd	f1,  f9, f10, f1
kusano 2b45e8
	fmadd	f3,  f9, f11, f3
kusano 2b45e8
	.align 4
kusano 2b45e8
kusano 2b45e8
LL(98):
/* Write-back of the scalar 1x1 tile.  f0..f3 hold the partial sums from  */
/* LL(92)/LL(96): f0=a0*b0, f1=a1*b0, f2=a0*b1, f3=a1*b1.  The            */
/* combinations are consistent with (a0,a1) and (b0,b1) being (real,      */
/* imag) pairs; the #if selects the sign pattern for the plain/conjugated */
/* operand variants.  Result: f0 = real part, f1 = imaginary part.        */
kusano 2b45e8
#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
kusano 2b45e8
	fsub	f0, f0, f3
kusano 2b45e8
	fadd	f1, f1, f2
kusano 2b45e8
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
kusano 2b45e8
	fadd	f0, f0, f3
kusano 2b45e8
	fsub	f1, f1, f2
kusano 2b45e8
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
kusano 2b45e8
	fadd	f0, f0, f3
kusano 2b45e8
	fsub	f1, f2, f1
kusano 2b45e8
#else /* RR, RC, CR, CC */
/* Same arithmetic as the first branch; the sign of the imaginary part is */
/* instead applied by the fnmsub/fmadd pattern of the alpha step below.   */
kusano 2b45e8
	fsub	f0, f0, f3
kusano 2b45e8
	fadd	f1, f1, f2
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
/* Current C value (real, imag). */
	LFD	f8,  0 * SIZE(CO1)
kusano 2b45e8
	LFD	f9,  1 * SIZE(CO1)
kusano 2b45e8
kusano 2b45e8
/* Scalar alpha from the stack slots ALPHA_R / ALPHA_I.                   */
/* NOTE(review): alpha_i is read from word 1 (+4) of its slot; presumably */
/* the slot holds a sign-alternating vector and word 1 is the positively  */
/* signed copy -- confirm against the prologue.                           */
	lfs	f12,  ALPHA_R + 0(SP)
kusano 2b45e8
	lfs	f13,  ALPHA_I + 4(SP)
kusano 2b45e8
kusano 2b45e8
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
/* C_re += alpha_r*f0 + alpha_i*f1 ;  C_im += alpha_i*f0 - alpha_r*f1     */
/* (fnmsub d,a,b,c computes c - a*b).                                     */
kusano 2b45e8
	fmadd	f8,  f12, f0, f8
kusano 2b45e8
	fnmsub	f9,  f12, f1, f9
kusano 2b45e8
kusano 2b45e8
	fmadd	f8,  f13, f1, f8
kusano 2b45e8
	fmadd	f9,  f13, f0, f9
kusano 2b45e8
#else
/* C_re += alpha_r*f0 - alpha_i*f1 ;  C_im += alpha_r*f1 + alpha_i*f0     */
kusano 2b45e8
	fmadd	f8,  f12, f0, f8
kusano 2b45e8
	fmadd	f9,  f12, f1, f9
kusano 2b45e8
kusano 2b45e8
	fnmsub	f8,  f13, f1, f8
kusano 2b45e8
	fmadd	f9,  f13, f0, f9
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
	STFD	f8,  0 * SIZE(CO1)
kusano 2b45e8
	STFD	f9,  1 * SIZE(CO1)
kusano 2b45e8
	.align 4
kusano 2b45e8
	
kusano 2b45e8
LL(999):
/* Common exit: restore the saved registers, release the frame, return.  */
/* SP is first reset from STACK (presumably the frame base recorded by   */
/* the prologue, which is outside this excerpt).                         */
kusano 2b45e8
	mr	SP, STACK
kusano 2b45e8
kusano 2b45e8
/* Reload vector registers v20-v31 from twelve consecutive 16-byte slots */
/* at SP+0 .. SP+176.  lvx has no immediate-offset form, so each offset  */
/* goes through r0.                                                      */
	li	r0,  0 * 16
kusano 2b45e8
	lvx	v20, SP, r0
kusano 2b45e8
	li	r0,  1 * 16
kusano 2b45e8
	lvx	v21, SP, r0
kusano 2b45e8
	li	r0,  2 * 16
kusano 2b45e8
	lvx	v22, SP, r0
kusano 2b45e8
	li	r0,  3 * 16
kusano 2b45e8
	lvx	v23, SP, r0
kusano 2b45e8
	li	r0,  4 * 16
kusano 2b45e8
	lvx	v24, SP, r0
kusano 2b45e8
	li	r0,  5 * 16
kusano 2b45e8
	lvx	v25, SP, r0
kusano 2b45e8
	li	r0,  6 * 16
kusano 2b45e8
	lvx	v26, SP, r0
kusano 2b45e8
	li	r0,  7 * 16
kusano 2b45e8
	lvx	v27, SP, r0
kusano 2b45e8
	li	r0,  8 * 16
kusano 2b45e8
	lvx	v28, SP, r0
kusano 2b45e8
	li	r0,  9 * 16
kusano 2b45e8
	lvx	v29, SP, r0
kusano 2b45e8
	li	r0, 10 * 16
kusano 2b45e8
	lvx	v30, SP, r0
kusano 2b45e8
	li	r0, 11 * 16
kusano 2b45e8
	lvx	v31, SP, r0
kusano 2b45e8
kusano 2b45e8
/* Put back the VRsave value held in VREG. */
	mtspr	VRsave, VREG
kusano 2b45e8
kusano 2b45e8
/* Reload general-purpose registers r31 down to r14 from the area that   */
/* starts right after the vector slots (offset 192): 8-byte slots in the */
/* 64-bit build, 4-byte slots otherwise.                                 */
#ifdef __64BIT__
kusano 2b45e8
	ld	r31,  192(SP)
kusano 2b45e8
	ld	r30,  200(SP)
kusano 2b45e8
	ld	r29,  208(SP)
kusano 2b45e8
	ld	r28,  216(SP)
kusano 2b45e8
	ld	r27,  224(SP)
kusano 2b45e8
	ld	r26,  232(SP)
kusano 2b45e8
	ld	r25,  240(SP)
kusano 2b45e8
	ld	r24,  248(SP)
kusano 2b45e8
	ld	r23,  256(SP)
kusano 2b45e8
	ld	r22,  264(SP)
kusano 2b45e8
	ld	r21,  272(SP)
kusano 2b45e8
	ld	r20,  280(SP)
kusano 2b45e8
	ld	r19,  288(SP)
kusano 2b45e8
	ld	r18,  296(SP)
kusano 2b45e8
	ld	r17,  304(SP)
kusano 2b45e8
	ld	r16,  312(SP)
kusano 2b45e8
	ld	r15,  320(SP)
kusano 2b45e8
	ld	r14,  328(SP)
kusano 2b45e8
#else
kusano 2b45e8
	lwz	r31,  192(SP)
kusano 2b45e8
	lwz	r30,  196(SP)
kusano 2b45e8
	lwz	r29,  200(SP)
kusano 2b45e8
	lwz	r28,  204(SP)
kusano 2b45e8
	lwz	r27,  208(SP)
kusano 2b45e8
	lwz	r26,  212(SP)
kusano 2b45e8
	lwz	r25,  216(SP)
kusano 2b45e8
	lwz	r24,  220(SP)
kusano 2b45e8
	lwz	r23,  224(SP)
kusano 2b45e8
	lwz	r22,  228(SP)
kusano 2b45e8
	lwz	r21,  232(SP)
kusano 2b45e8
	lwz	r20,  236(SP)
kusano 2b45e8
	lwz	r19,  240(SP)
kusano 2b45e8
	lwz	r18,  244(SP)
kusano 2b45e8
	lwz	r17,  248(SP)
kusano 2b45e8
	lwz	r16,  252(SP)
kusano 2b45e8
	lwz	r15,  256(SP)
kusano 2b45e8
	lwz	r14,  260(SP)
kusano 2b45e8
#endif
kusano 2b45e8
kusano 2b45e8
/* Release the STACKSIZE-byte frame and return to the caller. */
	addi	SP, SP, STACKSIZE
kusano 2b45e8
kusano 2b45e8
	blr
kusano 2b45e8
kusano 2b45e8
	EPILOGUE
kusano 2b45e8
#endif