/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* Pointer-width scalar load: lwz on 32-bit ABIs, ld on 64-bit. */
#ifndef __64BIT__
#define LOAD	lwz
#else
#define LOAD	ld
#endif

/* Stack frame size reserved in the prologue: the vector/GPR save area
   plus the ALPHA/FZERO scratch slots addressed below. */
#ifdef __64BIT__
#define STACKSIZE 360
#else
#define STACKSIZE 272
#endif

/* Offsets (from the 128-byte aligned SP) of scratch data:
   ALPHA: alpha replicated into four consecutive floats, reloaded with
          lvx as a splat vector during the C write-back.
   FZERO: one 0.0f word used to clear the scalar FPU accumulators. */
#define ALPHA		  0
#define FZERO		 16

/* GEMM problem dimensions. */
#define	M	r3
#define	N	r4
#define	K	r5

/* Argument registers for A, B, C and LDC are ABI dependent. */
#ifdef linux
#ifndef __64BIT__
#define A	r6
#define	B	r7
#define	C	r8
#define	LDC	r9
#else
#define A	r7
#define	B	r8
#define	C	r9
#define	LDC	r10
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
/* 32-bit AIX/Darwin with double: first args occupy more slots, and
   LDC arrives on the stack (loaded in the prologue). */
#define A	r8
#define	B	r9
#define	C	r10
#define	LDC	r7
#else
#define A	r7
#define	B	r8
#define	C	r9
#define	LDC	r10
#endif
#endif

/* Copy of the (unaligned) frame pointer taken before SP is rounded
   down to 128 bytes; presumably used by the epilogue (not visible in
   this chunk). */
#define STACK	r11

/* Loop counters and panel pointers:
   I/J  : counters over M and N tiles.
   AO/BO: running pointers into the current A and B panels.
   CO1-4: pointers to the (up to) four C columns of the current tile. */
#define	I	r21
#define J	r22
#define AO	r23
#define	BO	r24
#define	CO1	r25
#define CO2	r26
#define	CO3	r27
#define	CO4	r28

/* Prefetch distances for dcbt/dcbtst.
   NOTE(review): PREA and PREB are both r29, so a single distance value
   serves both the A-stream and B-stream prefetches. */
#define PREA	r29
#define PREB	r29
#define PREC	r30
/* VREG holds the caller's VRsave mask while the kernel runs. */
#define VREG	r31

#define LOAD_A	lvx
#define LOAD_B	lvx

/* Byte offsets n * 4 * SIZE, kept in registers (OFFSET_0 is the
   immediate 0) for lvx/stvx indexed addressing. */
#define OFFSET_0	  0
#define OFFSET_1	r14
#define OFFSET_2	r15
#define OFFSET_3	r16
#define OFFSET_4	r17
#define OFFSET_5	r18
#define OFFSET_6	r19
#define OFFSET_7	r20

/* Accumulators for a 16x4 C tile: c01-c04 are column 1 (16 floats in
   four vectors), c05-c08 column 2, c09-c12 column 3, c13-c16 col 4. */
#define	c01	v0
#define	c02	v1
#define	c03	v2
#define	c04	v3
#define	c05	v4
#define	c06	v5
#define	c07	v6
#define	c08	v7
#define	c09	v8
#define	c10	v9
#define	c11	v10
#define	c12	v11
#define	c13	v12
#define	c14	v13
#define	c15	v14
#define	c16	v15

/* Streamed A vectors (double-buffered across the unrolled loop). */
#define	a1	v16
#define	a2	v17
#define	a3	v18
#define	a4	v19
#define	a5	v20
#define	a6	v21
#define	a7	v22
#define	a8	v23

/* Streamed B vectors and the currently splatted B element. */
#define	b1	v24
#define	b2	v25
#define	bp1	v26
#define	bp2	v27

/* Write-back temporaries.  NOTE: C1-C9, c00 and PERMRSHIFT1-4 alias
   v16-v29 (the a*/b*/bp* registers above); this is safe because the
   accumulation phase is finished before the write-back begins. */
#define C1	v16
#define C2	v17
#define C3	v18
#define C4	v19
#define C5	v20
#define C6	v21
#define C7	v22
#define C8	v23
#define C9	v24

#define c00	v25

/* lvsr-generated permute controls used to realign accumulator vectors
   to the (possibly unaligned) C column addresses. */
#define PERMRSHIFT1	 v26
#define PERMRSHIFT2	 v27
#define PERMRSHIFT3	 v28
#define PERMRSHIFT4	 v29

#define VZERO	v30
#define alpha	v31

#ifndef NEEDPARAM

	PROLOGUE
	PROFCODE

	/* Reserve the frame and remember the unaligned frame pointer. */
	addi	SP, SP, -STACKSIZE
	mr	STACK, SP

	/* Save the non-volatile AltiVec registers v20-v31 in 16-byte
	   slots at the bottom of the frame. */
	li	r0,  0 * 16
	stvx	v20, SP, r0
	li	r0,  1 * 16
	stvx	v21, SP, r0
	li	r0,  2 * 16
	stvx	v22, SP, r0
	li	r0,  3 * 16
	stvx	v23, SP, r0
	li	r0,  4 * 16
	stvx	v24, SP, r0
	li	r0,  5 * 16
	stvx	v25, SP, r0
	li	r0,  6 * 16
	stvx	v26, SP, r0
	li	r0,  7 * 16
	stvx	v27, SP, r0
	li	r0,  8 * 16
	stvx	v28, SP, r0
	li	r0,  9 * 16
	stvx	v29, SP, r0
	li	r0, 10 * 16
	stvx	v30, SP, r0
	li	r0, 11 * 16
	stvx	v31, SP, r0

	/* Save the non-volatile GPRs r14-r31 used by the kernel. */
#ifdef __64BIT__
	std	r31,  192(SP)
	std	r30,  200(SP)
	std	r29,  208(SP)
	std	r28,  216(SP)
	std	r27,  224(SP)
	std	r26,  232(SP)
	std	r25,  240(SP)
	std	r24,  248(SP)
	std	r23,  256(SP)
	std	r22,  264(SP)
	std	r21,  272(SP)
	std	r20,  280(SP)
	std	r19,  288(SP)
	std	r18,  296(SP)
	std	r17,  304(SP)
	std	r16,  312(SP)
	std	r15,  320(SP)
	std	r14,  328(SP)
#else
	stw	r31,  192(SP)
	stw	r30,  196(SP)
	stw	r29,  200(SP)
	stw	r28,  204(SP)
	stw	r27,  208(SP)
	stw	r26,  212(SP)
	stw	r25,  216(SP)
	stw	r24,  220(SP)
	stw	r23,  224(SP)
	stw	r22,  228(SP)
	stw	r21,  232(SP)
	stw	r20,  236(SP)
	stw	r19,  240(SP)
	stw	r18,  244(SP)
	stw	r17,  248(SP)
	stw	r16,  252(SP)
	stw	r15,  256(SP)
	stw	r14,  260(SP)
#endif


	/* 32-bit AIX/Darwin with DOUBLE: LDC was passed on the stack. */
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
	lwz	LDC,    56 + STACKSIZE(SP)
#endif
#endif

	li	r0, -1

	/* Keep the caller's VRsave mask and enable all vector regs. */
	mfspr	VREG, VRsave
	mtspr	VRsave, r0

	/* Carve out 128 more bytes and round SP down to a 128-byte
	   boundary so the SP-based lvx/stvx scratch accesses below are
	   aligned. */
	addi	SP, SP, -128
	li	r0, -128
	and	SP, SP, r0

	/* Materialize the byte offsets n * 4 * SIZE for indexed
	   vector loads/stores. */
	li	OFFSET_1,  4 * SIZE
	li	OFFSET_2,  8 * SIZE
	li	OFFSET_3, 12 * SIZE
	li	OFFSET_4, 16 * SIZE
	li	OFFSET_5, 20 * SIZE
	li	OFFSET_6, 24 * SIZE
	li	OFFSET_7, 28 * SIZE

	/* Replicate alpha (arrives in f1) into four floats at ALPHA so
	   it can be reloaded as a splat vector for the write-back. */
	stfs	f1,  ALPHA +  0(SP)
	stfs	f1,  ALPHA +  4(SP)
	stfs	f1,  ALPHA +  8(SP)
	stfs	f1,  ALPHA + 12(SP)

	/* 0.0f constant for zeroing the scalar FPU accumulators. */
	li	r29, 0
	stw	r29, FZERO(SP)

	/* Convert LDC from elements to bytes. */
	slwi	LDC, LDC, BASE_SHIFT

	/* Prefetch distances.  NOTE(review): PREB aliases PREA (r29),
	   so this value serves both A- and B-stream dcbt hints. */
	li	PREC,   (15 * SIZE)
	li	PREB,   (25 * 8 * SIZE)

	/* Nothing to do if any problem dimension is zero. */
	cmpwi	cr0, M, 0
	ble	LL(999)
	cmpwi	cr0, N, 0
	ble	LL(999)
	cmpwi	cr0, K, 0
	ble	LL(999)

	/* Outer loop over groups of four C columns (J = N / 4). */
	srawi.	J, N,  2
	ble	LL(60)
	.align 4

LL(01):
	/* Point CO1-CO4 at the four C columns of this panel and advance
	   C past them for the next J iteration. */
	mr	CO1, C
	add	CO2, C,  LDC
	add	CO3, CO2, LDC
	add	CO4, CO3, LDC
	add	C,   CO4, LDC

	/* A is re-walked from the top for every column panel;
	   I counts 16-row tiles (M / 16). */
	mr	AO, A
	srawi.	I, M,  4
	ble	LL(20)
	.align 4

LL(11):
	/* 16x4 tile: clear the 16 vector accumulators while warming up
	   the A/B streams and prefetching the C destinations for the
	   coming read-modify-write. */
	vxor	c01, c01, c01
	LOAD_B	b1, OFFSET_0, B
	vxor	c02, c02, c02
	LOAD_A	a1, OFFSET_0, AO
	vxor	c03, c03, c03
	LOAD_A	a2, OFFSET_1, AO
	vxor	c04, c04, c04
	LOAD_A	a3, OFFSET_2, AO
	vxor	c05, c05, c05
	LOAD_A	a4, OFFSET_3, AO
	vxor	c06, c06, c06
	LOAD_B	b2, OFFSET_2, B
	vxor	c07, c07, c07
	LOAD_A	a5, OFFSET_4, AO
	vxor	c08, c08, c08
	LOAD_A	a6, OFFSET_5, AO
	vxor	c09, c09, c09
	dcbtst	CO1, PREC
	vxor	c10, c10, c10
	dcbtst	CO2, PREC
	vxor	c11, c11, c11
	dcbtst	CO3, PREC
	vxor	c12, c12, c12
	dcbtst	CO4, PREC
	vxor	c13, c13, c13
	mr	BO, B
	vxor	c14, c14, c14
	/* Inner loop below is unrolled 4x in K. */
	srawi.	r0,  K,  2
	vxor	c15, c15, c15
	mtspr	CTR, r0
	vxor	c16, c16, c16
	vspltw	bp1, b1, 0
	ble	LL(15)
	.align 4

LL(12):
	/* Main 16x4 kernel body, K unrolled by 4 and software-pipelined.
	   Each numbered group performs one rank-1 update of a 16x1
	   column strip (4 A vectors times one splatted B element) while
	   the spare slots splat the next B element, stream in the next
	   A/B vectors, advance pointers, and issue dcbt prefetches. */
/* 1 */
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	addi	AO, AO,  8 * SIZE
	vmaddfp	c03, a3, bp1, c03
	LOAD_A	a7, OFFSET_4, AO
	vmaddfp	c04, a4, bp1, c04
	LOAD_A	a8, OFFSET_5, AO

/* 2 */
	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06
	dcbt	BO, PREB
	vmaddfp	c07, a3, bp2, c07
	dcbt	AO, PREB
	vmaddfp	c08, a4, bp2, c08
	addi	AO, AO,  8 * SIZE

/* 3 */
	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c10, a2, bp1, c10
	LOAD_B	b1, OFFSET_1, BO
	vmaddfp	c11, a3, bp1, c11
	dcbt	AO, PREB
	vmaddfp	c12, a4, bp1, c12
	addi	AO, AO, 8 * SIZE

/* 4 */
	vmaddfp	c13, a1, bp2, c13
	vspltw	bp1, b1, 0
	vmaddfp	c14, a2, bp2, c14
	LOAD_A	a1, OFFSET_2, AO
	vmaddfp	c15, a3, bp2, c15
	dcbt	AO, PREB
	vmaddfp	c16, a4, bp2, c16
	addi	AO, AO,  8 * SIZE

/* 5 */
	vmaddfp	c01, a5, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a6, bp1, c02
	LOAD_A	a2, OFFSET_1, AO
	vmaddfp	c03, a7, bp1, c03
	LOAD_A	a3, OFFSET_2, AO
	vmaddfp	c04, a8, bp1, c04
	LOAD_A	a4, OFFSET_3, AO

/* 6 */
	vmaddfp	c05, a5, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a6, bp2, c06
	nop
	vmaddfp	c07, a7, bp2, c07
	dcbt	AO, PREA
	vmaddfp	c08, a8, bp2, c08
	addi	AO, AO,  8 * SIZE

/* 7 */
	vmaddfp	c09, a5, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c10, a6, bp1, c10
	LOAD_B	b1, OFFSET_4, BO
	vmaddfp	c11, a7, bp1, c11
	nop
	vmaddfp	c12, a8, bp1, c12
	nop

/* 8 */
	vmaddfp	c13, a5, bp2, c13
	vspltw	bp1, b2, 0
	vmaddfp	c14, a6, bp2, c14
	LOAD_A	a5, OFFSET_2, AO
	vmaddfp	c15, a7, bp2, c15
	LOAD_A	a6, OFFSET_3, AO
	vmaddfp	c16, a8, bp2, c16
	LOAD_A	a7, OFFSET_4, AO

/* 9 */
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b2, 1
	vmaddfp	c02, a2, bp1, c02
	LOAD_A	a8, OFFSET_5, AO
	vmaddfp	c03, a3, bp1, c03
	addi	BO, BO,  8 * SIZE
	vmaddfp	c04, a4, bp1, c04
	nop

/* 10 */
	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b2, 2
	vmaddfp	c06, a2, bp2, c06
	nop
	vmaddfp	c07, a3, bp2, c07
	nop
	vmaddfp	c08, a4, bp2, c08
	nop

/* 11 */
	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b2, 3
	vmaddfp	c10, a2, bp1, c10
	LOAD_B	b2, OFFSET_1, BO
	vmaddfp	c11, a3, bp1, c11
	dcbt	AO, PREA
	vmaddfp	c12, a4, bp1, c12
	addi	AO, AO,  8 * SIZE

/* 12 */
	vmaddfp	c13, a1, bp2, c13
	vspltw	bp1, b2, 0
	vmaddfp	c14, a2, bp2, c14
	LOAD_A	a1, OFFSET_4, AO
	vmaddfp	c15, a3, bp2, c15
	LOAD_A	a2, OFFSET_5, AO
	vmaddfp	c16, a4, bp2, c16
	LOAD_A	a3, OFFSET_6, AO

/* 13 */
	vmaddfp	c01, a5, bp1, c01
	vspltw	bp2, b2, 1
	vmaddfp	c02, a6, bp1, c02
	LOAD_A	a4, OFFSET_7, AO
	vmaddfp	c03, a7, bp1, c03
	dcbt	AO, PREA
	vmaddfp	c04, a8, bp1, c04
	addi	AO, AO,  8 * SIZE

/* 14 */
	vmaddfp	c05, a5, bp2, c05
	vspltw	bp1, b2, 2
	vmaddfp	c06, a6, bp2, c06
	nop
	vmaddfp	c07, a7, bp2, c07
	dcbt	AO, PREA
	vmaddfp	c08, a8, bp2, c08
	addi	AO, AO,  8 * SIZE

/* 15 */
	vmaddfp	c09, a5, bp1, c09
	vspltw	bp2, b2, 3
	vmaddfp	c10, a6, bp1, c10
	LOAD_B	b2, OFFSET_4, BO
	vmaddfp	c11, a7, bp1, c11
	dcbt	AO, PREA
	vmaddfp	c12, a8, bp1, c12
	addi	BO, BO,  8 * SIZE

/* 16 */
	vmaddfp	c13, a5, bp2, c13
	vspltw	bp1, b1, 0
	vmaddfp	c14, a6, bp2, c14
	LOAD_A	a5, OFFSET_4, AO
	vmaddfp	c15, a7, bp2, c15
	LOAD_A	a6, OFFSET_5, AO
	vmaddfp	c16, a8, bp2, c16
	bdnz+	LL(12)
	.align 4

LL(15):
	/* Handle the remaining K & 3 iterations one at a time.  Also
	   load the alpha splat vector and build the zero vector used by
	   the write-back that follows. */
	andi.	r0,  K,  3
	lvx	alpha, OFFSET_0, SP
	vxor	VZERO, VZERO, VZERO
	mtspr	CTR, r0
	ble+	LL(18)
	.align 4

LL(16):
	/* One K step: four columns updated from one B vector (lanes
	   0-3), 16 rows per column in a1-a4. */
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	nop
	vmaddfp	c03, a3, bp1, c03
	nop
	vmaddfp	c04, a4, bp1, c04
	nop

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06
	nop
	vmaddfp	c07, a3, bp2, c07
	nop
	vmaddfp	c08, a4, bp2, c08
	nop

	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c10, a2, bp1, c10
	LOAD_B	b1, OFFSET_1, BO
	vmaddfp	c11, a3, bp1, c11
	addi	AO, AO, 16 * SIZE
	vmaddfp	c12, a4, bp1, c12
	addi	BO, BO,  4 * SIZE

	vmaddfp	c13, a1, bp2, c13
	vspltw	bp1, b1, 0
	vmaddfp	c14, a2, bp2, c14
	LOAD_A	a1, OFFSET_0, AO
	vmaddfp	c15, a3, bp2, c15
	LOAD_A	a2, OFFSET_1, AO
	vmaddfp	c16, a4, bp2, c16
	LOAD_A	a3, OFFSET_2, AO

	LOAD_A	a4, OFFSET_3, AO
	bdnz+	LL(16)
	.align 4

LL(18):
	/* Write back the 16x4 tile: C := alpha * acc + C.
	   lvsr + vperm shift the accumulator vectors so they line up
	   with the (possibly unaligned) C column addresses; the edge
	   vectors are merged with VZERO, so out-of-tile lanes add
	   alpha*0 and leave the existing C bytes unchanged.
	   When LDC <= 32*SIZE we branch to LL(19), where each column's
	   C loads are issued only after the previous column's stores —
	   presumably to avoid hazards between closely spaced columns;
	   TODO confirm against upstream sources. */
	lvx	C1, OFFSET_0, CO1
	cmpwi	cr0, LDC, 32 * SIZE
	lvx	C2, OFFSET_1, CO1
	lvsr	PERMRSHIFT1, 0, CO1
	lvx	C3, OFFSET_2, CO1
	lvsr	PERMRSHIFT2, 0, CO2
	lvx	C4, OFFSET_3, CO1
	lvsr	PERMRSHIFT3, 0, CO3
	lvx	C5, OFFSET_4, CO1
	lvsr	PERMRSHIFT4, 0, CO4
	ble	LL(19)

	/* Column 1: realign c01-c04 into five store vectors. */
	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   c02,   PERMRSHIFT1
	vperm	c02, c02,   c03,   PERMRSHIFT1
	vperm	c03, c03,   c04,   PERMRSHIFT1
	vperm	c04, c04,   VZERO, PERMRSHIFT1

	/* Scale by alpha and add the old C; meanwhile start loading
	   column 2's C vectors. */
	vmaddfp	c00, alpha, c00, C1
	lvx	C1, OFFSET_0, CO2
	vmaddfp	c01, alpha, c01, C2
	lvx	C6, OFFSET_1, CO2
	vmaddfp	c02, alpha, c02, C3
	lvx	C7, OFFSET_2, CO2
	vmaddfp	c03, alpha, c03, C4
	lvx	C8, OFFSET_3, CO2
	vmaddfp	c04, alpha, c04, C5
	lvx	C9, OFFSET_4, CO2

	/* Store column 1 while realigning column 2. */
	stvx	c00, OFFSET_0, CO1
	vperm	c00, VZERO, c05,   PERMRSHIFT2
	stvx	c01, OFFSET_1, CO1
	vperm	c05, c05,   c06,   PERMRSHIFT2
	stvx	c02, OFFSET_2, CO1
	vperm	c06, c06,   c07,   PERMRSHIFT2
	stvx	c03, OFFSET_3, CO1
	vperm	c07, c07,   c08,   PERMRSHIFT2
	stvx	c04, OFFSET_4, CO1
	vperm	c08, c08,   VZERO, PERMRSHIFT2

	/* Column 2 update; start loading column 3's C. */
	vmaddfp	c00, alpha, c00, C1
	lvx	C1, OFFSET_0, CO3
	vmaddfp	c05, alpha, c05, C6
	lvx	C2, OFFSET_1, CO3
	vmaddfp	c06, alpha, c06, C7
	lvx	C3, OFFSET_2, CO3
	vmaddfp	c07, alpha, c07, C8
	lvx	C4, OFFSET_3, CO3
	vmaddfp	c08, alpha, c08, C9
	lvx	C5, OFFSET_4, CO3

	/* Store column 2 while realigning column 3. */
	stvx	c00, OFFSET_0, CO2
	vperm	c00, VZERO, c09,   PERMRSHIFT3
	stvx	c05, OFFSET_1, CO2
	vperm	c09, c09,   c10,   PERMRSHIFT3
	stvx	c06, OFFSET_2, CO2
	vperm	c10, c10,   c11,   PERMRSHIFT3
	stvx	c07, OFFSET_3, CO2
	vperm	c11, c11,   c12,   PERMRSHIFT3
	stvx	c08, OFFSET_4, CO2
	vperm	c12, c12,   VZERO, PERMRSHIFT3

	/* Column 3 update; start loading column 4's C. */
	vmaddfp	c00, alpha, c00, C1
	lvx	C9, OFFSET_4, CO4
	vmaddfp	c09, alpha, c09, C2
	lvx	C1, OFFSET_0, CO4
	vmaddfp	c10, alpha, c10, C3
	lvx	C6, OFFSET_1, CO4
	vmaddfp	c11, alpha, c11, C4
	lvx	C7, OFFSET_2, CO4
	vmaddfp	c12, alpha, c12, C5
	lvx	C8, OFFSET_3, CO4

	/* Store column 3 while realigning column 4. */
	stvx	c00, OFFSET_0, CO3
	vperm	c00, VZERO, c13,   PERMRSHIFT4
	stvx	c09, OFFSET_1, CO3
	vperm	c13, c13,   c14,   PERMRSHIFT4
	stvx	c10, OFFSET_2, CO3
	vperm	c14, c14,   c15,   PERMRSHIFT4
	stvx	c11, OFFSET_3, CO3
	vperm	c15, c15,   c16,   PERMRSHIFT4
	stvx	c12, OFFSET_4, CO3
	vperm	c16, c16,   VZERO, PERMRSHIFT4

	/* Column 4 update and store. */
	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c13, alpha, c13, C6
	vmaddfp	c14, alpha, c14, C7
	vmaddfp	c15, alpha, c15, C8
	vmaddfp	c16, alpha, c16, C9

	stvx	c00, OFFSET_0, CO4
	stvx	c13, OFFSET_1, CO4
	stvx	c14, OFFSET_2, CO4
	stvx	c15, OFFSET_3, CO4
	stvx	c16, OFFSET_4, CO4

	/* Advance the C pointers past this 16-row tile. */
	addi	CO1, CO1, 16 * SIZE
	addi	CO2, CO2, 16 * SIZE
	addi	CO3, CO3, 16 * SIZE
	addi	CO4, CO4, 16 * SIZE

	addic.	I, I, -1
	bgt+	LL(11)
	b	LL(20)
	.align 4

LL(19):
	/* Same 16x4 write-back as LL(18), but used when LDC <= 32*SIZE:
	   each column's C vectors are loaded only after the previous
	   column's stores have been issued (C1 of column 1 was already
	   loaded at LL(18)). */
	lvx	C6, OFFSET_1, CO2
	lvx	C7, OFFSET_2, CO2
	lvx	C8, OFFSET_3, CO2
	lvx	C9, OFFSET_4, CO2

	/* Column 1: realign, scale by alpha, add old C, store. */
	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   c02,   PERMRSHIFT1
	vperm	c02, c02,   c03,   PERMRSHIFT1
	vperm	c03, c03,   c04,   PERMRSHIFT1
	vperm	c04, c04,   VZERO, PERMRSHIFT1

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c01, alpha, c01, C2
	lvx	C2, OFFSET_1, CO3
	vmaddfp	c02, alpha, c02, C3
	lvx	C3, OFFSET_2, CO3
	vmaddfp	c03, alpha, c03, C4
	lvx	C4, OFFSET_3, CO3
	vmaddfp	c04, alpha, c04, C5
	lvx	C5, OFFSET_4, CO3

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1
	stvx	c03, OFFSET_3, CO1
	stvx	c04, OFFSET_4, CO1

	/* Column 2: first C vector loaded after column 1's stores. */
	lvx	C1, OFFSET_0, CO2

	vperm	c00, VZERO, c05,   PERMRSHIFT2
	vperm	c05, c05,   c06,   PERMRSHIFT2
	vperm	c06, c06,   c07,   PERMRSHIFT2
	vperm	c07, c07,   c08,   PERMRSHIFT2
	vperm	c08, c08,   VZERO, PERMRSHIFT2

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c05, alpha, c05, C6
	lvx	C6, OFFSET_1, CO4
	vmaddfp	c06, alpha, c06, C7
	lvx	C7, OFFSET_2, CO4
	vmaddfp	c07, alpha, c07, C8
	lvx	C8, OFFSET_3, CO4
	vmaddfp	c08, alpha, c08, C9
	lvx	C9, OFFSET_4, CO4

	stvx	c00, OFFSET_0, CO2
	stvx	c05, OFFSET_1, CO2
	stvx	c06, OFFSET_2, CO2
	stvx	c07, OFFSET_3, CO2
	stvx	c08, OFFSET_4, CO2

	/* Column 3. */
	lvx	C1, OFFSET_0, CO3

	vperm	c00, VZERO, c09,   PERMRSHIFT3
	vperm	c09, c09,   c10,   PERMRSHIFT3
	vperm	c10, c10,   c11,   PERMRSHIFT3
	vperm	c11, c11,   c12,   PERMRSHIFT3
	vperm	c12, c12,   VZERO, PERMRSHIFT3

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c09, alpha, c09, C2
	vmaddfp	c10, alpha, c10, C3
	vmaddfp	c11, alpha, c11, C4
	vmaddfp	c12, alpha, c12, C5

	stvx	c00, OFFSET_0, CO3
	stvx	c09, OFFSET_1, CO3
	stvx	c10, OFFSET_2, CO3
	stvx	c11, OFFSET_3, CO3
	stvx	c12, OFFSET_4, CO3

	/* Column 4. */
	lvx	C1, OFFSET_0, CO4

	vperm	c00, VZERO, c13,   PERMRSHIFT4
	vperm	c13, c13,   c14,   PERMRSHIFT4
	vperm	c14, c14,   c15,   PERMRSHIFT4
	vperm	c15, c15,   c16,   PERMRSHIFT4
	vperm	c16, c16,   VZERO, PERMRSHIFT4

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c13, alpha, c13, C6
	vmaddfp	c14, alpha, c14, C7
	vmaddfp	c15, alpha, c15, C8
	vmaddfp	c16, alpha, c16, C9

	stvx	c00, OFFSET_0, CO4
	stvx	c13, OFFSET_1, CO4
	stvx	c14, OFFSET_2, CO4
	stvx	c15, OFFSET_3, CO4
	stvx	c16, OFFSET_4, CO4

	/* Advance C pointers past this 16-row tile. */
	addi	CO1, CO1, 16 * SIZE
	addi	CO2, CO2, 16 * SIZE
	addi	CO3, CO3, 16 * SIZE
	addi	CO4, CO4, 16 * SIZE

	addic.	I, I, -1
	bgt+	LL(11)
	.align 4

LL(20):
	/* M & 8 tail: an 8x4 tile (two accumulator vectors/column). */
	andi.	I, M,  8
	ble	LL(30)

	/* a1/a2 carry K-step 0, a3/a4 carry K-step 1 (loop below is
	   unrolled 2x in K). */
	vxor	c01, c01, c01
	LOAD_A	a1, OFFSET_0, AO
	vxor	c02, c02, c02
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	LOAD_A	a3, OFFSET_2, AO
	vxor	c06, c06, c06
	LOAD_A	a4, OFFSET_3, AO
	vxor	c09, c09, c09
	LOAD_B	b1, OFFSET_0, B
	vxor	c10, c10, c10
	LOAD_B	b2, OFFSET_1, B
	vxor	c13, c13, c13
	vxor	c14, c14, c14
	mr	BO, B
	vspltw	bp1, b1, 0

	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(25)
	.align 4

LL(22):
	/* Two K steps: b1 lanes 0-3 drive step 0 (a1/a2), b2 lanes 0-3
	   drive step 1 (a3/a4). */
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	addi	AO, AO, 16 * SIZE
	vmaddfp	c02, a2, bp1, c02
	addi	BO, BO,  8 * SIZE

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06

	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	LOAD_B	b1, OFFSET_0, BO
	vmaddfp	c10, a2, bp1, c10

	vmaddfp	c13, a1, bp2, c13
	LOAD_A	a1, OFFSET_0, AO
	vspltw	bp1, b2, 0
	vmaddfp	c14, a2, bp2, c14
	LOAD_A	a2, OFFSET_1, AO

	vmaddfp	c01, a3, bp1, c01
	vspltw	bp2, b2, 1
	vmaddfp	c02, a4, bp1, c02

	vmaddfp	c05, a3, bp2, c05
	vspltw	bp1, b2, 2
	vmaddfp	c06, a4, bp2, c06

	vmaddfp	c09, a3, bp1, c09
	vspltw	bp2, b2, 3
	LOAD_B	b2, OFFSET_1, BO
	vmaddfp	c10, a4, bp1, c10

	vmaddfp	c13, a3, bp2, c13
	LOAD_A	a3, OFFSET_2, AO
	vmaddfp	c14, a4, bp2, c14
	LOAD_A	a4, OFFSET_3, AO
	vspltw	bp1, b1, 0
	bdnz	LL(22)
	.align 4

LL(25):
	/* K & 1 remainder; also fetch the alpha splat vector. */
	andi.	r0,  K,  1
	lvx	alpha, OFFSET_0, SP
	vxor	VZERO, VZERO, VZERO
	ble+	LL(28)
	.align 4

LL(26):
	/* Single remainder K step (no loop; falls through to LL(28)). */
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	nop

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06
	nop

	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c10, a2, bp1, c10
	addi	AO, AO,  8 * SIZE

	vmaddfp	c13, a1, bp2, c13
	addi	BO, BO,  4 * SIZE
	vmaddfp	c14, a2, bp2, c14
	nop
	.align 4

LL(28):
	/* Write back the 8x4 tile: three realigned store vectors per
	   column; edge lanes merge with VZERO so alpha*0 + C preserves
	   memory outside the tile. */
	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1
	lvx	C3, OFFSET_2, CO1

	lvsr	PERMRSHIFT1, 0, CO1
	lvsr	PERMRSHIFT2, 0, CO2
	lvsr	PERMRSHIFT3, 0, CO3
	lvsr	PERMRSHIFT4, 0, CO4

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   c02,   PERMRSHIFT1
	vperm	c02, c02,   VZERO, PERMRSHIFT1

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c01, alpha, c01, C2
	vmaddfp	c02, alpha, c02, C3

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1

	lvx	C1, OFFSET_0, CO2
	lvx	C2, OFFSET_1, CO2
	lvx	C3, OFFSET_2, CO2

	vperm	c00, VZERO, c05,   PERMRSHIFT2
	vperm	c05, c05,   c06,   PERMRSHIFT2
	vperm	c06, c06,   VZERO, PERMRSHIFT2

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c05, alpha, c05, C2
	vmaddfp	c06, alpha, c06, C3

	stvx	c00, OFFSET_0, CO2
	stvx	c05, OFFSET_1, CO2
	stvx	c06, OFFSET_2, CO2

	lvx	C1, OFFSET_0, CO3
	lvx	C2, OFFSET_1, CO3
	lvx	C3, OFFSET_2, CO3

	vperm	c00, VZERO, c09,   PERMRSHIFT3
	vperm	c09, c09,   c10,   PERMRSHIFT3
	vperm	c10, c10,   VZERO, PERMRSHIFT3

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c09, alpha, c09, C2
	vmaddfp	c10, alpha, c10, C3

	stvx	c00, OFFSET_0, CO3
	stvx	c09, OFFSET_1, CO3
	stvx	c10, OFFSET_2, CO3

	lvx	C1, OFFSET_0, CO4
	lvx	C2, OFFSET_1, CO4
	lvx	C3, OFFSET_2, CO4

	vperm	c00, VZERO, c13,   PERMRSHIFT4
	vperm	c13, c13,   c14,   PERMRSHIFT4
	vperm	c14, c14,   VZERO, PERMRSHIFT4

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c13, alpha, c13, C2
	vmaddfp	c14, alpha, c14, C3

	stvx	c00, OFFSET_0, CO4
	stvx	c13, OFFSET_1, CO4
	stvx	c14, OFFSET_2, CO4

	addi	CO1, CO1, 8 * SIZE
	addi	CO2, CO2, 8 * SIZE
	addi	CO3, CO3, 8 * SIZE
	addi	CO4, CO4, 8 * SIZE
	.align 4

LL(30):
	/* M & 4 tail: a 4x4 tile (one vector per column).  c01/c05/c09/
	   c13 accumulate even K steps, c02/c06/c10/c14 odd K steps;
	   the pairs are summed in LL(38). */
	andi.	I, M,  4
	ble	LL(40)

	vxor	c01, c01, c01
	LOAD_A	a1, OFFSET_0, AO
	vxor	c02, c02, c02
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	LOAD_B	b1, OFFSET_0, B
	vxor	c06, c06, c06
	LOAD_B	b2, OFFSET_1, B
	vxor	c09, c09, c09
	vxor	c10, c10, c10
	vxor	c13, c13, c13
	vxor	c14, c14, c14

	vspltw	bp1, b1, 0
	mr	BO, B

	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(35)
	.align 4

LL(32):
	/* Two K steps: a1 with b1's lanes, a2 with b2's lanes. */
	vmaddfp	c01, a1, bp1, c01
	addi	AO, AO,  8 * SIZE
	vspltw	bp2, b1, 1
	vmaddfp	c05, a1, bp2, c05
	addi	BO, BO,  8 * SIZE
	vspltw	bp1, b1, 2
	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c13, a1, bp2, c13
	LOAD_A	a1, OFFSET_0, AO
	vspltw	bp1, b2, 0
	LOAD_B	b1, OFFSET_0, BO

	vmaddfp	c02, a2, bp1, c02
	vspltw	bp2, b2, 1
	vmaddfp	c06, a2, bp2, c06
	vspltw	bp1, b2, 2
	vmaddfp	c10, a2, bp1, c10
	vspltw	bp2, b2, 3
	LOAD_B	b2, OFFSET_1, BO
	vmaddfp	c14, a2, bp2, c14
	LOAD_A	a2, OFFSET_1, AO

	vspltw	bp1, b1, 0
	bdnz	LL(32)
	.align 4

LL(35):
	/* K & 1 remainder; fetch alpha splat and zero vector. */
	andi.	r0,  K,  1
	lvx	alpha, OFFSET_0, SP
	vxor	VZERO, VZERO, VZERO
	ble+	LL(38)
	.align 4

LL(36):
	/* Single remainder K step (no loop; falls through). */
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c13, a1, bp2, c13
	addi	AO, AO,  4 * SIZE
	addi	BO, BO,  4 * SIZE
	.align 4

LL(38):
	/* Fold the even/odd partial sums, then write the 4x4 tile with
	   the usual realign + alpha*acc + C sequence. */
	vaddfp	c01, c01, c02
	vaddfp	c05, c05, c06
	vaddfp	c09, c09, c10
	vaddfp	c13, c13, c14

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1

	lvsr	PERMRSHIFT1, 0, CO1
	lvsr	PERMRSHIFT2, 0, CO2
	lvsr	PERMRSHIFT3, 0, CO3
	lvsr	PERMRSHIFT4, 0, CO4

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   VZERO, PERMRSHIFT1

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c01, alpha, c01, C2

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1

	lvx	C1, OFFSET_0, CO2
	lvx	C2, OFFSET_1, CO2

	vperm	c00, VZERO, c05,   PERMRSHIFT2
	vperm	c05, c05,   VZERO, PERMRSHIFT2

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c05, alpha, c05, C2

	stvx	c00, OFFSET_0, CO2
	stvx	c05, OFFSET_1, CO2

	lvx	C1, OFFSET_0, CO3
	lvx	C2, OFFSET_1, CO3

	vperm	c00, VZERO, c09,   PERMRSHIFT3
	vperm	c09, c09,   VZERO, PERMRSHIFT3

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c09, alpha, c09, C2

	stvx	c00, OFFSET_0, CO3
	stvx	c09, OFFSET_1, CO3

	lvx	C1, OFFSET_0, CO4
	lvx	C2, OFFSET_1, CO4

	vperm	c00, VZERO, c13,   PERMRSHIFT4
	vperm	c13, c13,   VZERO, PERMRSHIFT4

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c13, alpha, c13, C2

	stvx	c00, OFFSET_0, CO4
	stvx	c13, OFFSET_1, CO4

	addi	CO1, CO1, 4 * SIZE
	addi	CO2, CO2, 4 * SIZE
	addi	CO3, CO3, 4 * SIZE
	addi	CO4, CO4, 4 * SIZE
	.align 4

LL(40):
	/* M & 2 tail: 2x4 tile done with the scalar FPU.
	   f0..f7 = accumulators (f0/f1 column 1 rows 0/1, f2/f3
	   column 2, f4/f5 column 3, f6/f7 column 4). */
	andi.	I, M,  2
	ble	LL(50)

	mr	BO, B

	LFD	f8,   0 * SIZE(AO)
	LFD	f9,   1 * SIZE(AO)

	LFD	f10,  0 * SIZE(B)
	LFD	f11,  1 * SIZE(B)
	LFD	f12,  2 * SIZE(B)
	LFD	f13,  3 * SIZE(B)

	/* Zero the accumulators from the FZERO scratch word. */
	lfs	f0,  FZERO(SP)
 	fmr	f1,  f0
	fmr	f2,  f0
	fmr	f3,  f0

	fmr	f4,  f0
	fmr	f5,  f0
	fmr	f6,  f0
	fmr	f7,  f0

	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(45)
	.align 4

LL(42):
	/* Two K steps per iteration: f8/f9 = A rows, f10-f13 = one
	   K-row of B (four columns). */
	FMADD	f0,  f8, f10, f0
	FMADD	f2,  f8, f11, f2
	FMADD	f4,  f8, f12, f4
	FMADD	f6,  f8, f13, f6

	FMADD	f1,  f9, f10, f1
	FMADD	f3,  f9, f11, f3
	FMADD	f5,  f9, f12, f5
	FMADD	f7,  f9, f13, f7

	LFD	f8,   2 * SIZE(AO)
	LFD	f9,   3 * SIZE(AO)

	LFD	f10,  4 * SIZE(BO)
	LFD	f11,  5 * SIZE(BO)
	LFD	f12,  6 * SIZE(BO)
	LFD	f13,  7 * SIZE(BO)

	FMADD	f0,  f8, f10, f0
	FMADD	f2,  f8, f11, f2
	FMADD	f4,  f8, f12, f4
	FMADD	f6,  f8, f13, f6

	FMADD	f1,  f9, f10, f1
	FMADD	f3,  f9, f11, f3
	FMADD	f5,  f9, f12, f5
	FMADD	f7,  f9, f13, f7

	LFD	f8,   4 * SIZE(AO)
	LFD	f9,   5 * SIZE(AO)

	LFD	f10,  8 * SIZE(BO)
	LFD	f11,  9 * SIZE(BO)
	LFD	f12, 10 * SIZE(BO)
	LFD	f13, 11 * SIZE(BO)

	addi	AO, AO,  4 * SIZE
	addi	BO, BO,  8 * SIZE
	bdnz	LL(42)
	.align 4

LL(45):
	/* K & 1 remainder. */
	andi.	r0,  K,  1
	ble	LL(48)
	.align 4

LL(46):
	/* Single remainder K step. */
	FMADD	f0,  f8, f10, f0
	FMADD	f2,  f8, f11, f2
	FMADD	f4,  f8, f12, f4
	FMADD	f6,  f8, f13, f6

	FMADD	f1,  f9, f10, f1
	FMADD	f3,  f9, f11, f3
	FMADD	f5,  f9, f12, f5
	FMADD	f7,  f9, f13, f7

	LFD	f8,   2 * SIZE(AO)
	LFD	f9,   3 * SIZE(AO)

	LFD	f10,  4 * SIZE(BO)
	LFD	f11,  5 * SIZE(BO)
	LFD	f12,  6 * SIZE(BO)
	LFD	f13,  7 * SIZE(BO)

	addi	AO, AO,  2 * SIZE
	addi	BO, BO,  4 * SIZE
	.align 4

LL(48):
	/* Write-back: C[i][j] = alpha * acc + C[i][j], two rows per
	   column.  f13 holds the scalar alpha. */
	lfs	f13,  ALPHA(SP)

	LFD	f8,  0 * SIZE(CO1)
	LFD	f9,  1 * SIZE(CO1)
	LFD	f10, 0 * SIZE(CO2)
	LFD	f11, 1 * SIZE(CO2)

	FMADD	f0,  f0, f13, f8
	FMADD	f1,  f1, f13, f9
	FMADD	f2,  f2, f13, f10
	FMADD	f3,  f3, f13, f11

	LFD	f8,  0 * SIZE(CO3)
	LFD	f9,  1 * SIZE(CO3)
	LFD	f10, 0 * SIZE(CO4)
	LFD	f11, 1 * SIZE(CO4)

	FMADD	f4,  f4, f13, f8
	FMADD	f5,  f5, f13, f9
	FMADD	f6,  f6, f13, f10
	FMADD	f7,  f7, f13, f11

	STFD	f0,  0 * SIZE(CO1)
	STFD	f1,  1 * SIZE(CO1)
	STFD	f2,  0 * SIZE(CO2)
	STFD	f3,  1 * SIZE(CO2)

	STFD	f4,  0 * SIZE(CO3)
	STFD	f5,  1 * SIZE(CO3)
	STFD	f6,  0 * SIZE(CO4)
	STFD	f7,  1 * SIZE(CO4)

	addi	CO1, CO1, 2 * SIZE
	addi	CO2, CO2, 2 * SIZE
	addi	CO3, CO3, 2 * SIZE
	addi	CO4, CO4, 2 * SIZE
	.align 4

LL(50):
	/* M & 1 tail: a single row across the four columns, scalar FPU.
	   f0-f3 accumulate one element for each column. */
	andi.	I, M,  1
	ble	LL(59)

	mr	BO, B

	LFD	f8,   0 * SIZE(AO)
	LFD	f9,   1 * SIZE(AO)

	LFD	f10,  0 * SIZE(B)
	LFD	f11,  1 * SIZE(B)
	LFD	f12,  2 * SIZE(B)
	LFD	f13,  3 * SIZE(B)

	/* Zero the accumulators from the FZERO scratch word. */
	lfs	f0,  FZERO(SP)
 	fmr	f1,  f0
	fmr	f2,  f0
	fmr	f3,  f0

	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(55)
	.align 4

LL(52):
	/* Two K steps per iteration (f8 = A[k], f9 = A[k+1]). */
	FMADD	f0,  f8, f10, f0
	FMADD	f1,  f8, f11, f1
	FMADD	f2,  f8, f12, f2
	FMADD	f3,  f8, f13, f3

	LFD	f8,   2 * SIZE(AO)

	LFD	f10,  4 * SIZE(BO)
	LFD	f11,  5 * SIZE(BO)
	LFD	f12,  6 * SIZE(BO)
	LFD	f13,  7 * SIZE(BO)

	FMADD	f0,  f9, f10, f0
	FMADD	f1,  f9, f11, f1
	FMADD	f2,  f9, f12, f2
	FMADD	f3,  f9, f13, f3

	LFD	f9,   3 * SIZE(AO)

	LFD	f10,  8 * SIZE(BO)
	LFD	f11,  9 * SIZE(BO)
	LFD	f12, 10 * SIZE(BO)
	LFD	f13, 11 * SIZE(BO)

	addi	AO, AO,  2 * SIZE
	addi	BO, BO,  8 * SIZE
	bdnz	LL(52)
	.align 4

LL(55):
	/* K & 1 remainder. */
	andi.	r0,  K,  1
	ble	LL(58)
	.align 4

LL(56):
	/* Single remainder K step. */
	FMADD	f0,  f8, f10, f0
	FMADD	f1,  f8, f11, f1
	FMADD	f2,  f8, f12, f2
	FMADD	f3,  f8, f13, f3

	LFD	f8,   2 * SIZE(AO)

	LFD	f10,  4 * SIZE(BO)
	LFD	f11,  5 * SIZE(BO)
	LFD	f12,  6 * SIZE(BO)
	LFD	f13,  7 * SIZE(BO)

	addi	AO, AO,  1 * SIZE
	addi	BO, BO,  4 * SIZE
	.align 4

LL(58):
	/* Write-back: one element per column, scaled by alpha. */
	lfs	f13,  ALPHA(SP)

	LFD	f8,  0 * SIZE(CO1)
	LFD	f9,  0 * SIZE(CO2)
	LFD	f10, 0 * SIZE(CO3)
	LFD	f11, 0 * SIZE(CO4)

	FMADD	f0,  f0, f13, f8
	FMADD	f1,  f1, f13, f9
	FMADD	f2,  f2, f13, f10
	FMADD	f3,  f3, f13, f11

	STFD	f0,  0 * SIZE(CO1)
	STFD	f1,  0 * SIZE(CO2)
	STFD	f2,  0 * SIZE(CO3)
	STFD	f3,  0 * SIZE(CO4)
	.align 4

LL(59):
	/* BO now points past the consumed 4-column B panel; advance B
	   and loop to the next group of four columns. */
	mr	B, BO

	addic.	J, J, -1
	bgt	LL(01)
	.align 4

LL(60):
	/* N & 2 path: two remaining columns. */
	andi.	r0, N,  2
	ble	LL(120)

	mr	CO1, C
	add	CO2, C,  LDC
	add	C,  CO2, LDC

	mr	AO, A
	srawi.	I, M,  4
	ble	LL(80)
	.align 4

LL(71):
	/* 16x2 tile: c01-c04 column 1, c05-c08 column 2. */
	vxor	c01, c01, c01
	LOAD_B	b1, OFFSET_0, B
	vxor	c02, c02, c02
	vxor	c03, c03, c03
	LOAD_A	a1, OFFSET_0, AO
	vxor	c04, c04, c04
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	LOAD_A	a3, OFFSET_2, AO
	vxor	c06, c06, c06
	LOAD_A	a4, OFFSET_3, AO
	vxor	c07, c07, c07
	vxor	c08, c08, c08

	mr	BO, B
	dcbtst	CO1, PREC
	dcbtst	CO2, PREC

	vspltw	bp1, b1, 0

	/* Inner loop unrolled 2x in K (b1 holds two K rows of 2). */
	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(75)
	.align 4

LL(72):
	/* a1-a4 = K-step 0, a5-a8 = K-step 1; b1 lanes 0/1 and 2/3 are
	   the two columns for each step. */
	LOAD_A	a5, OFFSET_4, AO
	LOAD_A	a6, OFFSET_5, AO
	LOAD_A	a7, OFFSET_6, AO
	LOAD_A	a8, OFFSET_7, AO

	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06
	vmaddfp	c07, a3, bp2, c07
	vmaddfp	c08, a4, bp2, c08

	vmaddfp	c01, a5, bp1, c01
	vspltw	bp2, b1, 3
	vmaddfp	c02, a6, bp1, c02
	vmaddfp	c03, a7, bp1, c03
	vmaddfp	c04, a8, bp1, c04

	LOAD_B	b1, OFFSET_1, BO
	vspltw	bp1, b1, 0

	vmaddfp	c05, a5, bp2, c05
	vmaddfp	c06, a6, bp2, c06
	vmaddfp	c07, a7, bp2, c07
	vmaddfp	c08, a8, bp2, c08

	addi	AO, AO, 32 * SIZE
	addi	BO, BO,  4 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	bdnz	LL(72)
	.align 4

LL(75):
	/* K & 1 remainder; fetch alpha splat and zero vector. */
	andi.	r0,  K,  1
	lvx	alpha, OFFSET_0, SP
	vxor	VZERO, VZERO, VZERO
	ble+	LL(78)
	.align 4

LL(76):
	/* Single remainder K step (falls through). */
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	addi	AO, AO, 16 * SIZE
	vmaddfp	c03, a3, bp1, c03
	addi	BO, BO,  2 * SIZE
	vmaddfp	c04, a4, bp1, c04
	nop

	vmaddfp	c05, a1, bp2, c05
	vmaddfp	c06, a2, bp2, c06
	vmaddfp	c07, a3, bp2, c07
	vmaddfp	c08, a4, bp2, c08
	.align 4

LL(78):
	/* Write back the 16x2 tile.
	   NOTE(review): PERMRSHIFT3/4 are computed from CO3/CO4, which
	   are not set up in this 2-column path (leftover from the
	   4-column code) — the results are never used; lvsr touches no
	   memory, so this is harmless dead code. */
	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1
	lvx	C3, OFFSET_2, CO1
	lvx	C4, OFFSET_3, CO1
	lvx	C5, OFFSET_4, CO1

	lvsr	PERMRSHIFT1, 0, CO1
	lvsr	PERMRSHIFT2, 0, CO2
	lvsr	PERMRSHIFT3, 0, CO3
	lvsr	PERMRSHIFT4, 0, CO4

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   c02,   PERMRSHIFT1
	vperm	c02, c02,   c03,   PERMRSHIFT1
	vperm	c03, c03,   c04,   PERMRSHIFT1
	vperm	c04, c04,   VZERO, PERMRSHIFT1

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c01, alpha, c01, C2
	vmaddfp	c02, alpha, c02, C3
	vmaddfp	c03, alpha, c03, C4
	vmaddfp	c04, alpha, c04, C5

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1
	stvx	c03, OFFSET_3, CO1
	stvx	c04, OFFSET_4, CO1

	lvx	C1, OFFSET_0, CO2
	lvx	C2, OFFSET_1, CO2
	lvx	C3, OFFSET_2, CO2
	lvx	C4, OFFSET_3, CO2
	lvx	C5, OFFSET_4, CO2

	vperm	c00, VZERO, c05,   PERMRSHIFT2
	vperm	c05, c05,   c06,   PERMRSHIFT2
	vperm	c06, c06,   c07,   PERMRSHIFT2
	vperm	c07, c07,   c08,   PERMRSHIFT2
	vperm	c08, c08,   VZERO, PERMRSHIFT2

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c05, alpha, c05, C2
	vmaddfp	c06, alpha, c06, C3
	vmaddfp	c07, alpha, c07, C4
	vmaddfp	c08, alpha, c08, C5

	stvx	c00, OFFSET_0, CO2
	stvx	c05, OFFSET_1, CO2
	stvx	c06, OFFSET_2, CO2
	stvx	c07, OFFSET_3, CO2
	stvx	c08, OFFSET_4, CO2

	addi	CO1, CO1, 16 * SIZE
	addi	CO2, CO2, 16 * SIZE
	addic.	I, I, -1
	bgt+	LL(71)
	.align 4

LL(80):
	/* N=2, M & 8 tail: 8x2 tile.  c01/c02 and c05/c06 accumulate
	   even K steps; c03/c04 and c07/c08 odd steps; folded in
	   LL(88). */
	andi.	I, M,  8
	ble	LL(90)

	vxor	c01, c01, c01
	LOAD_B	b1, OFFSET_0, B
	vxor	c02, c02, c02
	vxor	c03, c03, c03
	LOAD_A	a1, OFFSET_0, AO
	vxor	c04, c04, c04
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	LOAD_A	a3, OFFSET_2, AO
	vxor	c06, c06, c06
	LOAD_A	a4, OFFSET_3, AO
	vxor	c07, c07, c07
	vxor	c08, c08, c08

	mr	BO, B

	vspltw	bp1, b1, 0
	srawi.	r0,  K,  1
	mtspr	CTR, r0
	ble	LL(85)
	.align 4

LL(82):
	/* Two K steps: a1/a2 with b1 lanes 0/1, a3/a4 with lanes 2/3. */
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06

	vmaddfp	c03, a3, bp1, c03
	vspltw	bp2, b1, 3
	vmaddfp	c04, a4, bp1, c04

	LOAD_B	b1, OFFSET_1, BO
	vspltw	bp1, b1, 0

	vmaddfp	c07, a3, bp2, c07
	vmaddfp	c08, a4, bp2, c08

	addi	AO, AO, 16 * SIZE
	addi	BO, BO,  4 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	bdnz	LL(82)
	.align 4

LL(85):
	/* K & 1 remainder; fetch alpha splat and zero vector. */
	andi.	r0,  K,  1
	lvx	alpha, OFFSET_0, SP
	vxor	VZERO, VZERO, VZERO
	ble+	LL(88)
	.align 4

LL(86):
	/* Single remainder K step (falls through). */
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	addi	AO, AO,  8 * SIZE
	vmaddfp	c05, a1, bp2, c05
	addi	BO, BO,  2 * SIZE
	vmaddfp	c06, a2, bp2, c06
	.align 4

LL(88):
	/* Fold the even/odd partials and write the 8x2 tile.
	   NOTE(review): PERMRSHIFT3/4 from CO3/CO4 are unused leftovers
	   of the 4-column path (lvsr touches no memory; harmless). */
	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1
	lvx	C3, OFFSET_2, CO1

	vaddfp	c01, c01, c03
	vaddfp	c02, c02, c04
	vaddfp	c05, c05, c07
	vaddfp	c06, c06, c08

	lvsr	PERMRSHIFT1, 0, CO1
	lvsr	PERMRSHIFT2, 0, CO2
	lvsr	PERMRSHIFT3, 0, CO3
	lvsr	PERMRSHIFT4, 0, CO4

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   c02,   PERMRSHIFT1
	vperm	c02, c02,   VZERO, PERMRSHIFT1

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c01, alpha, c01, C2
	vmaddfp	c02, alpha, c02, C3

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1

	lvx	C1, OFFSET_0, CO2
	lvx	C2, OFFSET_1, CO2
	lvx	C3, OFFSET_2, CO2

	vperm	c00, VZERO, c05,   PERMRSHIFT2
	vperm	c05, c05,   c06,   PERMRSHIFT2
	vperm	c06, c06,   VZERO, PERMRSHIFT2

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c05, alpha, c05, C2
	vmaddfp	c06, alpha, c06, C3

	stvx	c00, OFFSET_0, CO2
	stvx	c05, OFFSET_1, CO2
	stvx	c06, OFFSET_2, CO2

	addi	CO1, CO1, 8 * SIZE
	addi	CO2, CO2, 8 * SIZE
	.align 4

/* LL(90): M&4 x N&2 tile — one 4-float vector per C column.
   c01/c02 accumulate column 0 (even/odd K), c05/c06 column 1. */
LL(90):
	andi.	I, M,  4
	ble	LL(100)

	vxor	c01, c01, c01		/* clear accumulators */
	LOAD_B	b1, OFFSET_0, B
	vxor	c02, c02, c02
	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	vxor	c06, c06, c06

	mr	BO, B

	vspltw	bp1, b1, 0		/* bp1 = splat(b[0,0]) */

	srawi.	r0,  K,  1		/* CTR = K / 2: two k-steps per pass */
	mtspr	CTR, r0
	ble	LL(95)
	.align 4

/* LL(92): unroll-by-2 inner loop.  b1 = { b[k,0], b[k,1],
   b[k+1,0], b[k+1,1] }; a1/a2 are the A vectors for k and k+1. */
LL(92):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2

	vmaddfp	c02, a2, bp1, c02
	vspltw	bp2, b1, 3

	LOAD_B	b1, OFFSET_1, BO
	vspltw	bp1, b1, 0		/* pre-splat first b of next pass */

	vmaddfp	c06, a2, bp2, c06

	addi	AO, AO,  8 * SIZE
	addi	BO, BO,  4 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	bdnz	LL(92)
	.align 4

/* LL(95): set up for the K-odd tail and the write-back.  The alpha
   vector was stashed at *SP by the prologue (out of view here). */
LL(95):
	andi.	r0,  K,  1
	lvx	alpha, OFFSET_0, SP
	vxor	VZERO, VZERO, VZERO
	ble+	LL(98)
	.align 4

/* LL(96): final odd-K rank-1 update (one k-step, both columns). */
LL(96):
	vspltw	bp2, b1, 1
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c05, a1, bp2, c05
	addi	AO, AO,  4 * SIZE
	addi	BO, BO,  2 * SIZE
	.align 4

/* LL(98): write-back of the M&4 x N&2 tile (4 floats per C row,
   possibly unaligned, so each row spans up to two 16-byte vectors).
   lvsr/vperm shift the accumulator into store alignment with VZERO
   padding at both edges, then c = alpha*acc + C is merged and stored.
   NOTE(review): removed dead code present in the original: the
   vaddfp folds into c09/c13 (those accumulators are never zeroed nor
   read anywhere in this path) and the unused PERMRSHIFT3/PERMRSHIFT4
   lvsr computations (only rows CO1/CO2 are written here).  Both were
   copy-paste leftovers from the wider N=4 store path. */
LL(98):
	vaddfp	c01, c01, c02		/* fold even/odd-K partial sums */
	vaddfp	c05, c05, c06

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1

	lvsr	PERMRSHIFT1, 0, CO1
	lvsr	PERMRSHIFT2, 0, CO2

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   VZERO, PERMRSHIFT1

	vmaddfp	c00, alpha, c00, C1	/* C = alpha * acc + C */
	vmaddfp	c01, alpha, c01, C2

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1

	lvx	C1, OFFSET_0, CO2
	lvx	C2, OFFSET_1, CO2

	vperm	c00, VZERO, c05,   PERMRSHIFT2
	vperm	c05, c05,   VZERO, PERMRSHIFT2

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c05, alpha, c05, C2

	stvx	c00, OFFSET_0, CO2
	stvx	c05, OFFSET_1, CO2

	addi	CO1, CO1, 4 * SIZE
	addi	CO2, CO2, 4 * SIZE
	.align 4

/* LL(100): M&2 x N&2 tile — scalar FPU path (2 C elements per
   column).  f0..f3 accumulate the even-K products, f4..f7 the odd-K
   products; they are folded in LL(108).  FZERO(SP) is a stack slot
   holding 0.0f (set up in the prologue, out of view). */
LL(100):
	andi.	I, M,  2
	ble	LL(110)

	mr	BO, B

	LFD	f8,   0 * SIZE(AO)	/* a[m=0..1] for k = 0 */
	LFD	f9,   1 * SIZE(AO)

	LFD	f10,  0 * SIZE(B)	/* b[k=0,col0], b[k=0,col1] */
	LFD	f11,  1 * SIZE(B)
	LFD	f12,  2 * SIZE(B)	/* b[k=1,col0], b[k=1,col1] */
	LFD	f13,  3 * SIZE(B)

	lfs	f0,  FZERO(SP)		/* clear the 8 accumulators */
 	fmr	f1,  f0
	fmr	f2,  f0
	fmr	f3,  f0

	fmr	f4,  f0
	fmr	f5,  f0
	fmr	f6,  f0
	fmr	f7,  f0

	srawi.	r0,  K,  1		/* CTR = K / 2 */
	mtspr	CTR, r0
	ble	LL(105)
	.align 4

/* LL(102): unroll-by-2 loop: step k uses f8/f9 with f10/f11, then
   step k+1 reloads f8/f9 and uses f12/f13. */
LL(102):
	FMADD	f0,  f8, f10, f0
	FMADD	f1,  f9, f10, f1
	FMADD	f2,  f8, f11, f2
	FMADD	f3,  f9, f11, f3

	LFD	f8,   2 * SIZE(AO)	/* a values for k+1 */
	LFD	f9,   3 * SIZE(AO)

	FMADD	f4,  f8, f12, f4
	FMADD	f5,  f9, f12, f5
	FMADD	f6,  f8, f13, f6
	FMADD	f7,  f9, f13, f7

	LFD	f8,  4 * SIZE(AO)	/* preload next pass */
	LFD	f9,  5 * SIZE(AO)

	LFD	f10,  4 * SIZE(BO)
	LFD	f11,  5 * SIZE(BO)
	LFD	f12,  6 * SIZE(BO)
	LFD	f13,  7 * SIZE(BO)

	addi	AO, AO,  4 * SIZE
	addi	BO, BO,  4 * SIZE
	bdnz	LL(102)
	.align 4

/* LL(105): K-odd check; from here f13 is reused to hold alpha. */
LL(105):
	andi.	r0,  K,  1
	lfs	f13,  ALPHA(SP)
	ble	LL(108)
	.align 4

/* LL(106): final odd-K step.
   NOTE(review): the reloads below look like leftovers — f8..f11 are
   overwritten in LL(108) before being read again.  Confirm before
   removing (they may intentionally warm the cache). */
LL(106):
	FMADD	f0,  f8, f10, f0
	FMADD	f1,  f9, f10, f1
	FMADD	f2,  f8, f11, f2
	FMADD	f3,  f9, f11, f3

	LFD	f8,   2 * SIZE(AO)
	LFD	f9,   3 * SIZE(AO)

	LFD	f10,  2 * SIZE(BO)
	LFD	f11,  3 * SIZE(BO)

	addi	AO, AO,  2 * SIZE
	addi	BO, BO,  2 * SIZE
	.align 4

/* LL(108): fold partial sums and merge: C = alpha*acc + C, two
   elements per column; then advance the row pointers. */
LL(108):
	LFD	f8,  0 * SIZE(CO1)
	LFD	f9,  1 * SIZE(CO1)
	LFD	f10, 0 * SIZE(CO2)
	LFD	f11, 1 * SIZE(CO2)

	FADD	f0, f0, f4
	FADD	f1, f1, f5
	FADD	f2, f2, f6
	FADD	f3, f3, f7

	FMADD	f0,  f0, f13, f8
	FMADD	f1,  f1, f13, f9
	FMADD	f2,  f2, f13, f10
	FMADD	f3,  f3, f13, f11

	STFD	f0,  0 * SIZE(CO1)
	STFD	f1,  1 * SIZE(CO1)
	STFD	f2,  0 * SIZE(CO2)
	STFD	f3,  1 * SIZE(CO2)

	addi	CO1, CO1, 2 * SIZE
	addi	CO2, CO2, 2 * SIZE
	.align 4

/* LL(110): M&1 x N&2 — a single A value per k feeds both columns.
   f0/f1 hold the even-K column sums, f2/f3 the odd-K ones. */
LL(110):
	andi.	I, M,  1
	ble	LL(119)

	mr	BO, B

	LFD	f8,   0 * SIZE(AO)	/* a[k=0], a[k=1] */
	LFD	f9,   1 * SIZE(AO)

	LFD	f10,  0 * SIZE(B)	/* b[k=0,col0..1], b[k=1,col0..1] */
	LFD	f11,  1 * SIZE(B)
	LFD	f12,  2 * SIZE(B)
	LFD	f13,  3 * SIZE(B)

	lfs	f0,  FZERO(SP)		/* clear accumulators */
 	fmr	f1,  f0
	fmr	f2,  f0
	fmr	f3,  f0

	srawi.	r0,  K,  1		/* CTR = K / 2 */
	mtspr	CTR, r0
	ble	LL(115)
	.align 4

/* LL(112): two k-steps per pass. */
LL(112):
	FMADD	f0,  f8, f10, f0
	FMADD	f1,  f8, f11, f1
	FMADD	f2,  f9, f12, f2
	FMADD	f3,  f9, f13, f3

	LFD	f8,   2 * SIZE(AO)
	LFD	f9,   3 * SIZE(AO)

	LFD	f10,  4 * SIZE(BO)
	LFD	f11,  5 * SIZE(BO)
	LFD	f12,  6 * SIZE(BO)
	LFD	f13,  7 * SIZE(BO)

	addi	AO, AO,  2 * SIZE
	addi	BO, BO,  4 * SIZE
	bdnz	LL(112)
	.align 4

/* LL(115): K-odd check; f13 becomes alpha from here on. */
LL(115):
	andi.	r0,  K,  1
	lfs	f13,  ALPHA(SP)
	ble	LL(118)
	.align 4

/* LL(116): final odd-K step.
   NOTE(review): the f8/f10/f11 reloads look like leftovers — they
   are overwritten or unused before LL(118) reads those registers. */
LL(116):
	FMADD	f0,  f8, f10, f0
	FMADD	f1,  f8, f11, f1

	LFD	f8,   1 * SIZE(AO)

	LFD	f10,  2 * SIZE(BO)
	LFD	f11,  3 * SIZE(BO)

	addi	AO, AO,  1 * SIZE
	addi	BO, BO,  2 * SIZE
	.align 4

/* LL(118): fold and merge: C = alpha*acc + C, one element per
   column. */
LL(118):
	LFD	f8,  0 * SIZE(CO1)
	LFD	f9,  0 * SIZE(CO2)

	FADD	f0, f0, f2
	FADD	f1, f1, f3

	FMADD	f0,  f0, f13, f8
	FMADD	f1,  f1, f13, f9

	STFD	f0,  0 * SIZE(CO1)
	STFD	f1,  0 * SIZE(CO2)
	.align 4

/* LL(119): advance B past the two consumed columns of the panel. */
LL(119):
	mr	B, BO
	.align 4

/* LL(120): last-column path (N & 1).  CO1 points at the single
   remaining C column; M is processed in chunks of 16 first. */
LL(120):
	andi.	r0, N,  1
	ble	LL(999)

	mr	CO1, C
	mr	AO, A
	srawi.	I, M,  4		/* I = M / 16 */
	ble	LL(140)
	.align 4

/* LL(130): M=16 x N=1 tile.  c01..c04 accumulate 16 C elements.
   If B is not 16-byte aligned, peel one or two k-steps first so the
   remaining quadword loads of B are aligned.
   NOTE(review): the peel splats words 2 and 3 of the quadword
   containing *B, i.e. it appears to assume B is misaligned by
   exactly 8 bytes — confirm against the packing code. */
LL(130):
	vxor	c01, c01, c01
	vxor	c02, c02, c02
	vxor	c03, c03, c03
	vxor	c04, c04, c04

	mr	BO, B

	dcbtst	CO1, PREC		/* prefetch the C destination for store */

	mr	J, K			/* J counts remaining k-steps */

	andi.	r0,  B,  15
	ble+	LL(131)			/* aligned: straight to the main loop */

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	LOAD_B	b1, OFFSET_0, BO
	vspltw	bp1, b1,  2		/* first valid b is word 2 of the quad */
	vspltw	bp2, b1,  3

	addi	AO, AO, 16 * SIZE
	addi	BO, BO, SIZE

	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04
	subi	J, J, 1
	cmpwi	cr0, J, 0
	ble	LL(138)			/* K exhausted by the peel */

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO

	addi	AO, AO, 16 * SIZE
	addi	BO, BO, SIZE

	vmaddfp	c01, a1, bp2, c01
	vmaddfp	c02, a2, bp2, c02
	vmaddfp	c03, a3, bp2, c03
	vmaddfp	c04, a4, bp2, c04
	subi	J, J, 1
	cmpwi	cr0, J, 0
	ble	LL(138)
	.align 4


/* LL(131): main M=16 x N=1 loop setup.  a1..a8 hold 32 A elements
   (two k-steps of 16); b1 holds four b values, one per k-step of the
   4x-unrolled loop body. */
LL(131):
	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	LOAD_A	a5, OFFSET_4, AO
	LOAD_A	a6, OFFSET_5, AO
	LOAD_A	a7, OFFSET_6, AO
	LOAD_A	a8, OFFSET_7, AO

	LOAD_B	b1, OFFSET_0, BO

	srawi.	r0,  J,  2		/* CTR = J / 4 */
	mtspr	CTR, r0
	ble	LL(135)
	.align 4

/* LL(133): four rank-1 updates (k .. k+3) per iteration, reloading
   the A vectors in two halves of 32 * SIZE each. */
LL(133):
	vspltw	bp1, b1,  0
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

	vspltw	bp2, b1,  1
	vmaddfp	c01, a5, bp2, c01
	vmaddfp	c02, a6, bp2, c02
	vmaddfp	c03, a7, bp2, c03
	vmaddfp	c04, a8, bp2, c04

	addi	AO, AO, 32 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO

	vspltw	bp1, b1,  2
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

	LOAD_A	a5, OFFSET_4, AO
	LOAD_A	a6, OFFSET_5, AO
	LOAD_A	a7, OFFSET_6, AO
	LOAD_A	a8, OFFSET_7, AO

	vspltw	bp2, b1,  3
	vmaddfp	c01, a5, bp2, c01
	vmaddfp	c02, a6, bp2, c02
	vmaddfp	c03, a7, bp2, c03
	vmaddfp	c04, a8, bp2, c04

	addi	AO, AO, 32 * SIZE
	addi	BO, BO,  4 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO

	LOAD_A	a5, OFFSET_4, AO
	LOAD_A	a6, OFFSET_5, AO
	LOAD_A	a7, OFFSET_6, AO
	LOAD_A	a8, OFFSET_7, AO

	LOAD_B	b1, OFFSET_0, BO

	bdnz	LL(133)
	.align 4

/* LL(135): K-remainder dispatch for the M=16 x N=1 tile: handle the
   3, 2 or 1 leftover k-steps, each case a slice of the LL(133)
   body using b1 words 0..2. */
LL(135):
	andi.	r0,  J,  3
	ble+	LL(138)

	cmpwi	cr0, r0, 3
	bne	LL(136)

	/* remainder == 3: three rank-1 updates using b words 0, 1, 2 */
	vspltw	bp1, b1,  0
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

	addi	AO, AO, 16 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO

	vspltw	bp2, b1,  1
	vmaddfp	c01, a1, bp2, c01
	vmaddfp	c02, a2, bp2, c02
	vmaddfp	c03, a3, bp2, c03
	vmaddfp	c04, a4, bp2, c04

	addi	AO, AO, 16 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO

	vspltw	bp1, b1,  2
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

	addi	AO, AO, 16 * SIZE
	addi	BO, BO,  3 * SIZE
	b	LL(138)
	.align 4

/* LL(136): remainder == 2 — two rank-1 updates (b words 0 and 1). */
LL(136):
	cmpwi	cr0, r0, 2
	bne	LL(137)

	vspltw	bp1, b1,  0
	vspltw	bp2, b1,  1

	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

	LOAD_A	a1, OFFSET_4, AO
	LOAD_A	a2, OFFSET_5, AO
	LOAD_A	a3, OFFSET_6, AO
	LOAD_A	a4, OFFSET_7, AO

	vmaddfp	c01, a1, bp2, c01
	vmaddfp	c02, a2, bp2, c02
	vmaddfp	c03, a3, bp2, c03
	vmaddfp	c04, a4, bp2, c04

	addi	AO, AO, 32 * SIZE
	addi	BO, BO,  2 * SIZE
	b	LL(138)
	.align 4

/* LL(137): remainder == 1 — single rank-1 update (b word 0). */
LL(137):
	cmpwi	cr0, r0, 1
	bne	LL(138)

	vspltw	bp1, b1,  0

	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

	addi	AO, AO, 16 * SIZE
	addi	BO, BO,  1 * SIZE
	.align 4

/* LL(138): write back 16 C elements: load the five overlapping
   16-byte vectors covering the possibly-unaligned span, shift the
   accumulators into store alignment (lvsr/vperm, VZERO-padded at
   both ends), merge c = alpha*acc + C, store, and loop for the next
   16-row chunk. */
LL(138):
	lvx	alpha, OFFSET_0, SP	/* alpha vector stashed by the prologue */
	vxor	VZERO, VZERO, VZERO

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1
	lvx	C3, OFFSET_2, CO1
	lvx	C4, OFFSET_3, CO1
	lvx	C5, OFFSET_4, CO1

	lvsr	PERMRSHIFT1, 0, CO1

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   c02,   PERMRSHIFT1
	vperm	c02, c02,   c03,   PERMRSHIFT1
	vperm	c03, c03,   c04,   PERMRSHIFT1
	vperm	c04, c04,   VZERO, PERMRSHIFT1

	vmaddfp	c00, alpha, c00, C1	/* C = alpha * acc + C */
	vmaddfp	c01, alpha, c01, C2
	vmaddfp	c02, alpha, c02, C3
	vmaddfp	c03, alpha, c03, C4
	vmaddfp	c04, alpha, c04, C5

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1
	stvx	c03, OFFSET_3, CO1
	stvx	c04, OFFSET_4, CO1

	addi	CO1, CO1, 16 * SIZE
	addic.	I, I, -1
	bgt+	LL(130)
	.align 4

/* LL(140): M&8 x N=1 tile — same structure as LL(130) but with two
   accumulator vectors (c01/c02) and 8 A elements per k-step,
   including the B-alignment peel (see the note at LL(130)). */
LL(140):
	andi.	I, M,  8
	ble	LL(150)

	vxor	c01, c01, c01
	vxor	c02, c02, c02

	mr	BO, B

	mr	J, K

	andi.	r0,  B,  15
	ble+	LL(141)

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_B	b1, OFFSET_0, BO
	vspltw	bp1, b1,  2		/* first valid b is word 2 of the quad */
	vspltw	bp2, b1,  3

	addi	AO, AO, 8 * SIZE
	addi	BO, BO, SIZE

	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02
	subi	J, J, 1
	cmpwi	cr0, J, 0
	ble	LL(148)

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO

	addi	AO, AO, 8 * SIZE
	addi	BO, BO, SIZE

	vmaddfp	c01, a1, bp2, c01
	vmaddfp	c02, a2, bp2, c02
	subi	J, J, 1
	cmpwi	cr0, J, 0
	ble	LL(148)
	.align 4


/* LL(141): main M&8 x N=1 loop setup: a1..a8 hold four k-steps of
   8 A elements; b1 holds the matching four b values. */
LL(141):
	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	LOAD_A	a5, OFFSET_4, AO
	LOAD_A	a6, OFFSET_5, AO
	LOAD_A	a7, OFFSET_6, AO
	LOAD_A	a8, OFFSET_7, AO

	LOAD_B	b1, OFFSET_0, BO

	srawi.	r0,  J,  2		/* CTR = J / 4 */
	mtspr	CTR, r0
	ble	LL(145)
	.align 4

/* LL(143): four rank-1 updates per iteration. */
LL(143):
	vspltw	bp1, b1,  0
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02

	vspltw	bp2, b1,  1
	vmaddfp	c01, a3, bp2, c01
	vmaddfp	c02, a4, bp2, c02

	vspltw	bp1, b1,  2
	vmaddfp	c01, a5, bp1, c01
	vmaddfp	c02, a6, bp1, c02

	vspltw	bp2, b1,  3
	vmaddfp	c01, a7, bp2, c01
	vmaddfp	c02, a8, bp2, c02

	addi	AO, AO, 32 * SIZE
	addi	BO, BO,  4 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO

	LOAD_A	a5, OFFSET_4, AO
	LOAD_A	a6, OFFSET_5, AO
	LOAD_A	a7, OFFSET_6, AO
	LOAD_A	a8, OFFSET_7, AO

	LOAD_B	b1, OFFSET_0, BO

	bdnz	LL(143)
	.align 4

/* LL(145): K-remainder dispatch for M&8 x N=1 (3 / 2 / 1 leftover
   k-steps). */
LL(145):
	andi.	r0,  J,  3
	ble+	LL(148)

	cmpwi	cr0, r0, 3
	bne	LL(146)

	/* remainder == 3 */
	vspltw	bp1, b1,  0
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02

	vspltw	bp2, b1,  1
	vmaddfp	c01, a3, bp2, c01
	vmaddfp	c02, a4, bp2, c02

	LOAD_A	a1, OFFSET_4, AO
	LOAD_A	a2, OFFSET_5, AO

	vspltw	bp1, b1,  2
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02


	addi	AO, AO, 24 * SIZE
	addi	BO, BO,  3 * SIZE
	b	LL(148)
	.align 4

/* LL(146): remainder == 2 */
LL(146):
	cmpwi	cr0, r0, 2
	bne	LL(147)

	vspltw	bp1, b1,  0
	vspltw	bp2, b1,  1

	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02

	vmaddfp	c01, a3, bp2, c01
	vmaddfp	c02, a4, bp2, c02

	addi	AO, AO, 16 * SIZE
	addi	BO, BO,  2 * SIZE
	b	LL(148)
	.align 4

/* LL(147): remainder == 1 */
LL(147):
	cmpwi	cr0, r0, 1
	bne	LL(148)

	vspltw	bp1, b1,  0

	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02

	addi	AO, AO,  8 * SIZE
	addi	BO, BO,  1 * SIZE
	.align 4

/* LL(148): write back 8 C elements (three overlapping vector loads,
   lvsr/vperm alignment shift, c = alpha*acc + C, store). */
LL(148):
	lvx	alpha, OFFSET_0, SP
	vxor	VZERO, VZERO, VZERO

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1
	lvx	C3, OFFSET_2, CO1

	lvsr	PERMRSHIFT1, 0, CO1

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   c02,   PERMRSHIFT1
	vperm	c02, c02,   VZERO, PERMRSHIFT1

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c01, alpha, c01, C2
	vmaddfp	c02, alpha, c02, C3

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1
	addi	CO1, CO1, 8 * SIZE
	.align 4

/* LL(150): M&4 x N=1 tile — single accumulator vector c01, with the
   same B-alignment peel as LL(130)/LL(140). */
LL(150):
	andi.	I, M,  4
	ble	LL(160)

	vxor	c01, c01, c01

	mr	BO, B

	mr	J, K

	andi.	r0,  B,  15
	ble+	LL(151)

	LOAD_A	a1, OFFSET_0, AO
	LOAD_B	b1, OFFSET_0, BO
	vspltw	bp1, b1,  2		/* first valid b is word 2 of the quad */
	vspltw	bp2, b1,  3

	addi	AO, AO, 4 * SIZE
	addi	BO, BO, SIZE

	vmaddfp	c01, a1, bp1, c01
	subi	J, J, 1
	cmpwi	cr0, J, 0
	ble	LL(158)

	LOAD_A	a1, OFFSET_0, AO
	addi	AO, AO, 4 * SIZE
	addi	BO, BO, SIZE

	vmaddfp	c01, a1, bp2, c01
	subi	J, J, 1
	cmpwi	cr0, J, 0
	ble	LL(158)
	.align 4


/* LL(151): main M&4 x N=1 loop setup: a1..a4 hold four k-steps of
   4 A elements; b1 holds the matching four b values. */
LL(151):
	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	LOAD_B	b1, OFFSET_0, BO

	srawi.	r0,  J,  2		/* CTR = J / 4 */
	mtspr	CTR, r0
	ble	LL(155)
	.align 4

/* LL(153): four rank-1 updates per iteration. */
LL(153):
	vspltw	bp1, b1,  0
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1,  1
	vmaddfp	c01, a2, bp2, c01
	vspltw	bp1, b1,  2
	vmaddfp	c01, a3, bp1, c01
	vspltw	bp2, b1,  3
	vmaddfp	c01, a4, bp2, c01

	addi	AO, AO, 16 * SIZE
	addi	BO, BO,  4 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO

	LOAD_B	b1, OFFSET_0, BO

	bdnz	LL(153)
	.align 4

/* LL(155): K-remainder dispatch for M&4 x N=1 (3 / 2 / 1 leftover
   k-steps). */
LL(155):
	andi.	r0,  J,  3
	ble+	LL(158)

	cmpwi	cr0, r0, 3
	bne	LL(156)

	/* remainder == 3 */
	vspltw	bp1, b1,  0
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1,  1
	vmaddfp	c01, a2, bp2, c01
	vspltw	bp1, b1,  2
	vmaddfp	c01, a3, bp1, c01

	addi	AO, AO, 12 * SIZE
	addi	BO, BO,  3 * SIZE
	b	LL(158)
	.align 4

/* LL(156): remainder == 2 */
LL(156):
	cmpwi	cr0, r0, 2
	bne	LL(157)

	vspltw	bp1, b1,  0
	vspltw	bp2, b1,  1

	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c01, a2, bp2, c01

	addi	AO, AO,  8 * SIZE
	addi	BO, BO,  2 * SIZE
	b	LL(158)
	.align 4

/* LL(157): remainder == 1 */
LL(157):
	cmpwi	cr0, r0, 1
	bne	LL(158)

	vspltw	bp1, b1,  0

	vmaddfp	c01, a1, bp1, c01

	addi	AO, AO,  4 * SIZE
	addi	BO, BO,  1 * SIZE
	.align 4

/* LL(158): write back 4 C elements (two overlapping vector loads,
   lvsr/vperm alignment shift, c = alpha*acc + C, store). */
LL(158):
	lvx	alpha, OFFSET_0, SP
	vxor	VZERO, VZERO, VZERO

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1

	lvsr	PERMRSHIFT1, 0, CO1

	vperm	c00, VZERO, c01,   PERMRSHIFT1
	vperm	c01, c01,   VZERO, PERMRSHIFT1

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c01, alpha, c01, C2

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	addi	CO1, CO1, 4 * SIZE
	.align 4

/* LL(160): M&2 x N=1 — scalar FPU path.  f0/f1 accumulate the two
   C elements for even k, f2/f3 for odd k; folded in LL(168). */
LL(160):
	andi.	I, M,  2
	ble	LL(170)

	mr	BO, B

	LFD	f8,   0 * SIZE(AO)	/* a[m=0..1] for k and k+1 */
	LFD	f9,   1 * SIZE(AO)
	LFD	f10,  2 * SIZE(AO)
	LFD	f11,  3 * SIZE(AO)

	LFD	f12,  0 * SIZE(B)	/* b[k], b[k+1] */
	LFD	f13,  1 * SIZE(B)

	lfs	f0,  FZERO(SP)		/* clear accumulators */
 	fmr	f1,  f0
	fmr	f2,  f0
	fmr	f3,  f0

	srawi.	r0,  K,  1		/* CTR = K / 2 */
	mtspr	CTR, r0
	ble	LL(165)
	.align 4

/* LL(162): two k-steps per pass. */
LL(162):
	FMADD	f0,  f8,  f12, f0
	FMADD	f1,  f9,  f12, f1
	FMADD	f2,  f10, f13, f2
	FMADD	f3,  f11, f13, f3

	LFD	f8,   4 * SIZE(AO)
	LFD	f9,   5 * SIZE(AO)
	LFD	f10,  6 * SIZE(AO)
	LFD	f11,  7 * SIZE(AO)

	LFD	f12,  2 * SIZE(BO)
	LFD	f13,  3 * SIZE(BO)

	addi	AO, AO,  4 * SIZE
	addi	BO, BO,  2 * SIZE
	bdnz	LL(162)
	.align 4

/* LL(165): K-odd check; f13 becomes alpha. */
LL(165):
	andi.	r0,  K,  1
	lfs	f13,  ALPHA(SP)
	ble	LL(168)
	.align 4

/* LL(166): final odd-K step. */
LL(166):
	FMADD	f0,  f8, f12, f0
	FMADD	f1,  f9, f12, f1

	addi	AO, AO,  2 * SIZE
	addi	BO, BO,  1 * SIZE
	.align 4

/* LL(168): fold and merge: C = alpha*acc + C for the two elements. */
LL(168):
	LFD	f8,  0 * SIZE(CO1)
	LFD	f9,  1 * SIZE(CO1)

	FADD	f0, f0, f2
	FADD	f1, f1, f3

	FMADD	f0,  f0, f13, f8
	FMADD	f1,  f1, f13, f9

	STFD	f0,  0 * SIZE(CO1)
	STFD	f1,  1 * SIZE(CO1)

	addi	CO1, CO1, 2 * SIZE
	.align 4

/* LL(170): M&1 x N=1 — the last scalar element.  f0 accumulates the
   even-k products, f1 the odd-k products; folded in LL(178). */
LL(170):
	andi.	I, M,  1
	ble	LL(999)

	mr	BO, B

	LFD	f8,   0 * SIZE(AO)	/* a[k], a[k+1] */
	LFD	f9,   1 * SIZE(AO)

	LFD	f10,  0 * SIZE(B)	/* b[k], b[k+1] */
	LFD	f11,  1 * SIZE(B)

	lfs	f0,  FZERO(SP)		/* clear accumulators */
 	fmr	f1,  f0

	srawi.	r0,  K,  1		/* CTR = K / 2 */
	mtspr	CTR, r0
	ble	LL(175)
	.align 4

/* LL(172): two k-steps per pass. */
LL(172):
	FMADD	f0,  f8, f10, f0
	FMADD	f1,  f9, f11, f1

	LFD	f8,   2 * SIZE(AO)
	LFD	f9,   3 * SIZE(AO)
	LFD	f10,  2 * SIZE(BO)
	LFD	f11,  3 * SIZE(BO)

	addi	AO, AO,  2 * SIZE
	addi	BO, BO,  2 * SIZE
	bdnz	LL(172)
	.align 4

/* LL(175): K-odd check; f13 becomes alpha. */
LL(175):
	andi.	r0,  K,  1
	lfs	f13,  ALPHA(SP)
	ble	LL(178)
	.align 4

/* LL(176): final odd-K step. */
LL(176):
	FMADD	f0,  f8, f10, f0

	addi	AO, AO,  1 * SIZE
	addi	BO, BO,  1 * SIZE
	.align 4

/* LL(178): fold and merge the single element: C = alpha*acc + C. */
LL(178):
	LFD	f8,  0 * SIZE(CO1)

	FADD	f0, f0, f1

	FMADD	f0,  f0, f13, f8

	STFD	f0,  0 * SIZE(CO1)
	.align 4
	
/* LL(999): common epilogue.  Restore the non-volatile AltiVec
   registers v20-v31 from the save area at the frame base, restore
   VRsave, then the non-volatile GPRs r14-r31 (offsets differ between
   the 64-bit and 32-bit ABI layouts), release the stack frame and
   return.  The matching saves are in the prologue, out of view. */
LL(999):
	mr	SP, STACK		/* SP = frame base saved at entry */

	li	r0,  0 * 16
	lvx	v20, SP, r0
	li	r0,  1 * 16
	lvx	v21, SP, r0
	li	r0,  2 * 16
	lvx	v22, SP, r0
	li	r0,  3 * 16
	lvx	v23, SP, r0
	li	r0,  4 * 16
	lvx	v24, SP, r0
	li	r0,  5 * 16
	lvx	v25, SP, r0
	li	r0,  6 * 16
	lvx	v26, SP, r0
	li	r0,  7 * 16
	lvx	v27, SP, r0
	li	r0,  8 * 16
	lvx	v28, SP, r0
	li	r0,  9 * 16
	lvx	v29, SP, r0
	li	r0, 10 * 16
	lvx	v30, SP, r0
	li	r0, 11 * 16
	lvx	v31, SP, r0

	mtspr	VRsave, VREG		/* restore caller's VRsave mask */

#ifdef __64BIT__
	ld	r31,  192(SP)
	ld	r30,  200(SP)
	ld	r29,  208(SP)
	ld	r28,  216(SP)
	ld	r27,  224(SP)
	ld	r26,  232(SP)
	ld	r25,  240(SP)
	ld	r24,  248(SP)
	ld	r23,  256(SP)
	ld	r22,  264(SP)
	ld	r21,  272(SP)
	ld	r20,  280(SP)
	ld	r19,  288(SP)
	ld	r18,  296(SP)
	ld	r17,  304(SP)
	ld	r16,  312(SP)
	ld	r15,  320(SP)
	ld	r14,  328(SP)
#else
	lwz	r31,  192(SP)
	lwz	r30,  196(SP)
	lwz	r29,  200(SP)
	lwz	r28,  204(SP)
	lwz	r27,  208(SP)
	lwz	r26,  212(SP)
	lwz	r25,  216(SP)
	lwz	r24,  220(SP)
	lwz	r23,  224(SP)
	lwz	r22,  228(SP)
	lwz	r21,  232(SP)
	lwz	r20,  236(SP)
	lwz	r19,  240(SP)
	lwz	r18,  244(SP)
	lwz	r17,  248(SP)
	lwz	r16,  252(SP)
	lwz	r15,  256(SP)
	lwz	r14,  260(SP)
#endif

	addi	SP, SP, STACKSIZE	/* pop the frame */

	blr

	EPILOGUE
#endif