/* (Stray web-viewer navigation text "Blob Blame Raw" stood here; it is not part of the source.) */
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"
	
/*
 * Register aliases.
 *
 * M, N, A, LDA and B name r3..r7, the registers the routine reads its
 * inputs from.  As used by the code in this file:
 *   M   - number of elements copied out of each source vector
 *   N   - number of source vectors; consecutive vectors are LDA apart
 *   A   - source base address
 *   LDA - vector stride; shifted left by BASE_SHIFT before use
 *   B   - destination address, written sequentially
 * NOTE(review): the argument order is inferred from these aliases;
 * confirm against the C-side prototype.
 */
#define	M	r3
#define	N	r4
#define	A	r5
#define	LDA	r6
#define B	r7

/* Read cursors, one per source vector of the panel being copied. */
#define AO1	r8
#define AO2	r9
#define AO3	r10
#define AO4	r11

/* Counter of the outer loop over 8-vector panels. */
#define J	r12

/* r26..r31 are pushed on entry and popped at .L999.               */
/* INC / INC2 are index registers loaded with 1*SIZE and 2*SIZE.   */
#define AO5	r26
#define AO6	r27
#define AO7	r28
#define AO8	r29
#define INC	r30
#define INC2	r31

/* c01..c32 map one-to-one onto the floating-point registers f0..f31. */
#define c01	f0
#define c02	f1
#define c03	f2
#define c04	f3
#define c05	f4
#define c06	f5
#define c07	f6
#define c08	f7
#define c09	f8
#define c10	f9
#define c11	f10
#define c12	f11
#define c13	f12
#define c14	f13
#define c15	f14
#define c16	f15

#define c17	f16
#define c18	f17
#define c19	f18
#define c20	f19
#define c21	f20
#define c22	f21
#define c23	f22
#define c24	f23
#define c25	f24
#define c26	f25
#define c27	f26
#define c28	f27
#define c29	f28
#define c30	f29
#define c31	f30
#define c32	f31

/* fpsel selector registers, used only by the aligned path.  They     */
/* alias c31/c32 (f30/f31): the unaligned path loads data into        */
/* c31/c32 and never executes fpsel, so the two uses do not collide.  */
#define	sel_p	f30
#define	sel_s	f31


	/*
	 * Entry: save the registers this routine overwrites, build the two
	 * fpsel selectors, return early for empty input and choose between
	 * the aligned path (falls through to .L11/.L20) and the unaligned
	 * path (.L100).
	 * PROLOGUE, PROFCODE, SP, CTR, SIZE and BASE_SHIFT are not defined
	 * in this file (presumably they come from common.h).
	 */
	PROLOGUE
	PROFCODE

	li	r0, -16

	/* Push f14..f31, one 16-byte slot each (update form: SP moves by r0 = -16). */
	stfpdux	f14, SP, r0
	stfpdux	f15, SP, r0
	stfpdux	f16, SP, r0
	stfpdux	f17, SP, r0
	stfpdux	f18, SP, r0
	stfpdux	f19, SP, r0
	stfpdux	f20, SP, r0
	stfpdux	f21, SP, r0
	stfpdux	f22, SP, r0
	stfpdux	f23, SP, r0
	stfpdux	f24, SP, r0
	stfpdux	f25, SP, r0
	stfpdux	f26, SP, r0
	stfpdux	f27, SP, r0
	stfpdux	f28, SP, r0
	stfpdux	f29, SP, r0
	stfpdux	f30, SP, r0
	stfpdux	f31, SP, r0
	
	/* Push r31..r26 (they back AO5..AO8, INC and INC2). */
	stwu	r31,  -4(SP)
	stwu	r30,  -4(SP)
	stwu	r29,  -4(SP)
	stwu	r28,  -4(SP)

	stwu	r27,  -4(SP)
	stwu	r26,  -4(SP)

	/* 0x3f800000 and 0xbf800000 are the single-precision encodings of +1.0 and -1.0. */
	lis	r9,   0x3f80
	lis	r10,  0xbf80

	/* After these four pushes memory holds, from SP upward: +1.0, -1.0, -1.0, +1.0. */
	stwu	r9,    -4(SP)
	stwu	r10,   -4(SP)
	stwu	r10,   -4(SP)
	stwu	r9,    -4(SP)

	/* Scale LDA by BASE_SHIFT so it can be added to addresses directly. */
	slwi	LDA, LDA, BASE_SHIFT

	/*
	 * sel_p <- the pair at SP+0 = (+1.0, -1.0)
	 * sel_s <- the pair at SP+8 = (-1.0, +1.0)
	 * Both loads are update forms, so SP is left 8 bytes above the
	 * constants; the restore sequence at .L999 accounts for that.
	 * NOTE(review): fpsel is expected to take, per register half, its
	 * third operand where the selector half is >= 0 and its fourth
	 * operand otherwise - confirm against the FP2 ISA description.
	 */
	li	r0, 0
	lfpsux	sel_p, SP, r0
	li	r0, 8
	lfpsux	sel_s, SP, r0

	/* Nothing to copy when M <= 0 or N <= 0. */
	cmpwi	cr0, M, 0
	ble-	.L999
	cmpwi	cr0, N, 0
	ble-	.L999

	li	INC,  1 * SIZE
	li	INC2, 2 * SIZE

	/* Every store to B is an indexed form with offset INC2, so bias B down once here. */
	subi	B, B, 2 * SIZE

	/* Take the unaligned path unless both A and the scaled LDA are multiples of 2*SIZE. */
	andi.	r0, A,   2 * SIZE - 1
	bne	.L100
	andi.	r0, LDA, 2 * SIZE - 1
	bne	.L100

	/* Aligned path: loads are update forms with offset INC2, so bias A down as well. */
	subi	A, A, 2 * SIZE
	/* J = N / 8 panels of eight vectors; none -> go straight to the N & 4 remainder. */
	srawi.	J,  N,  3
	ble	.L20
	.align 4
/*
 * Aligned path, one panel of 8 source vectors per pass (J passes).
 *
 * Output order: for each element index i, the eight values
 * AO1[i], AO2[i], ..., AO8[i] are written consecutively to B.
 *
 * Technique: vectors 1,3,5,7 are read with the straight pair load and
 * vectors 2,4,6,8 with the cross pair load.  fpsel with sel_p then
 * combines a neighbouring pair of vectors into the row for element i
 * (stored straight), and fpsel with sel_s into the row for element i+1
 * (stored with the cross store).
 * NOTE(review): this reading relies on the FP2 semantics of the
 * cross load/store and fpsel - confirm against the ISA manual.
 */
.L11:
	/* Point AO1..AO8 at eight consecutive vectors and advance A past the panel. */
	mr	AO1, A
	add	AO2, A,   LDA
	add	AO3, AO2, LDA
	add	AO4, AO3, LDA
	add	AO5, AO4, LDA
	add	AO6, AO5, LDA
	add	AO7, AO6, LDA
	add	AO8, AO7, LDA
	add	A,   AO8, LDA

	/* CTR = M / 4: each .L12 iteration consumes 4 elements of every vector. */
	srawi.	r0,  M,  2
	mtspr	CTR, r0
	ble	.L15
	.align 4

.L12:
	/* Elements i, i+1 of all eight vectors -> c01..c08. */
	LFPDUX	c01,   AO1, INC2
	LFXDUX	c02,   AO2, INC2
	LFPDUX	c03,   AO3, INC2
	LFXDUX	c04,   AO4, INC2

	LFPDUX	c05,   AO5, INC2
	LFXDUX	c06,   AO6, INC2
	LFPDUX	c07,   AO7, INC2
	LFXDUX	c08,   AO8, INC2

	/* Elements i+2, i+3 -> c09..c16, interleaved with the selects of the first half. */
	LFPDUX	c09,   AO1, INC2
	LFXDUX	c10,   AO2, INC2
	LFPDUX	c11,   AO3, INC2
	LFXDUX	c12,   AO4, INC2
	fpsel	c17, sel_p, c01, c02

	LFPDUX	c13,   AO5, INC2
	fpsel	c18, sel_p, c03, c04
	LFXDUX	c14,   AO6, INC2
	fpsel	c19, sel_p, c05, c06
	LFPDUX	c15,   AO7, INC2
	fpsel	c20, sel_p, c07, c08
	LFXDUX	c16,   AO8, INC2
	fpsel	c21, sel_s, c01, c02

	fpsel	c22, sel_s, c03, c04
	STFPDUX	c17,   B, INC2
	fpsel	c23, sel_s, c05, c06
	STFPDUX	c18,   B, INC2
	fpsel	c24, sel_s, c07, c08
	STFPDUX	c19,   B, INC2

	/* c01..c08 are free again from here on and are reused for the second half. */
	fpsel	c01, sel_p, c09, c10
	STFPDUX	c20,   B, INC2
	fpsel	c02, sel_p, c11, c12
	STFXDUX	c21,   B, INC2
	fpsel	c03, sel_p, c13, c14
	STFXDUX	c22,   B, INC2
	fpsel	c04, sel_p, c15, c16
	STFXDUX	c23,   B, INC2

	fpsel	c05, sel_s, c09, c10
	STFXDUX	c24,   B, INC2
	fpsel	c06, sel_s, c11, c12
	STFPDUX	c01,   B, INC2
	fpsel	c07, sel_s, c13, c14
	STFPDUX	c02,   B, INC2
	fpsel	c08, sel_s, c15, c16
	STFPDUX	c03,   B, INC2

	STFPDUX	c04,   B, INC2
	STFXDUX	c05,   B, INC2
	STFXDUX	c06,   B, INC2
	STFXDUX	c07,   B, INC2
	STFXDUX	c08,   B, INC2
	bdnz	.L12
	.align 4
	
.L15:
	/* Remainder M & 3.  The andi. result is never negative, so "ble" branches exactly when it is zero. */
	andi.	r0,  M,  3
	ble	.L19

	/* Two leftover elements per vector. */
	andi.	r0,  M,  2
	beq	.L17

	LFPDUX	c01,   AO1, INC2
	LFXDUX	c02,   AO2, INC2
	LFPDUX	c03,   AO3, INC2
	LFXDUX	c04,   AO4, INC2

	LFPDUX	c05,   AO5, INC2
	fpsel	c09, sel_p, c01, c02
	LFXDUX	c06,   AO6, INC2
	fpsel	c10, sel_p, c03, c04
	LFPDUX	c07,   AO7, INC2
	fpsel	c11, sel_p, c05, c06
	LFXDUX	c08,   AO8, INC2
	fpsel	c12, sel_p, c07, c08

	fpsel	c13, sel_s, c01, c02
	fpsel	c14, sel_s, c03, c04
	STFPDUX	c09,   B, INC2
	fpsel	c15, sel_s, c05, c06
	STFPDUX	c10,   B, INC2
	fpsel	c16, sel_s, c07, c08
	STFPDUX	c11,   B, INC2

	STFPDUX	c12,   B, INC2
	STFXDUX	c13,   B, INC2
	STFXDUX	c14,   B, INC2
	STFXDUX	c15,   B, INC2
	STFXDUX	c16,   B, INC2
	.align 4

.L17:
	/* One leftover element per vector: the odd-numbered cursors fill one half of */
	/* c01..c04 (LFDUX), the even-numbered cursors the other half (LFSDUX).       */
	andi.	r0,  M,  1
	beq	.L19

	LFDUX	c01,   AO1, INC2
	LFDUX	c02,   AO3, INC2
	LFDUX	c03,   AO5, INC2
	LFDUX	c04,   AO7, INC2

	LFSDUX	c01,   AO2, INC2
	LFSDUX	c02,   AO4, INC2
	LFSDUX	c03,   AO6, INC2
	LFSDUX	c04,   AO8, INC2

	STFPDUX	c01,   B, INC2
	STFPDUX	c02,   B, INC2
	STFPDUX	c03,   B, INC2
	STFPDUX	c04,   B, INC2
	.align 4

.L19:
	/* Next 8-vector panel. */
	addic.	J, J, -1
	bgt	.L11
	.align 4

/*
 * Aligned path, remainder panel of 4 vectors (executed once when N & 4).
 *
 * Output order: for each element index i, AO1[i], AO2[i], AO3[i],
 * AO4[i] are written consecutively to B.  Same load / fpsel / store
 * scheme as the 8-vector panel: sel_p results are stored straight,
 * sel_s results with the cross store.
 */
.L20:
	/* andi. never yields a negative value, so "ble" is taken exactly when the bit is clear. */
	andi.	J,  N,  4
	ble	.L30
	.align 4
.L21:
	mr	AO1, A
	add	AO2, A,   LDA
	add	AO3, AO2, LDA
	add	AO4, AO3, LDA
	add	A,   AO4, LDA

	/* CTR = M / 8: each .L22 iteration consumes 8 elements of every vector. */
	srawi.	r0,  M,  3
	mtspr	CTR, r0
	ble	.L25
	.align 4

.L22:
	/* c01..c04: elements i,i+1   c05..c08: i+2,i+3   c09..c12: i+4,i+5   c13..c16: i+6,i+7 */
	LFPDUX	c01,   AO1, INC2
	LFXDUX	c02,   AO2, INC2
	LFPDUX	c03,   AO3, INC2
	LFXDUX	c04,   AO4, INC2

	LFPDUX	c05,   AO1, INC2
	LFXDUX	c06,   AO2, INC2
	LFPDUX	c07,   AO3, INC2
	LFXDUX	c08,   AO4, INC2

	LFPDUX	c09,   AO1, INC2
	LFXDUX	c10,   AO2, INC2
	LFPDUX	c11,   AO3, INC2
	LFXDUX	c12,   AO4, INC2
	fpsel	c17, sel_p, c01, c02

	LFPDUX	c13,   AO1, INC2
	fpsel	c18, sel_p, c03, c04
	LFXDUX	c14,   AO2, INC2
	fpsel	c19, sel_s, c01, c02
	LFPDUX	c15,   AO3, INC2
	fpsel	c20, sel_s, c03, c04
	LFXDUX	c16,   AO4, INC2
	fpsel	c21, sel_p, c05, c06

	fpsel	c22, sel_p, c07, c08
	STFPDUX	c17,   B, INC2
	fpsel	c23, sel_s, c05, c06
	STFPDUX	c18,   B, INC2
	fpsel	c24, sel_s, c07, c08
	STFXDUX	c19,   B, INC2

	/* c01..c08 have been consumed by the selects above and are reused as outputs. */
	fpsel	c01, sel_p, c09, c10
	STFXDUX	c20,   B, INC2
	fpsel	c02, sel_p, c11, c12
	STFPDUX	c21,   B, INC2
	fpsel	c03, sel_s, c09, c10
	STFPDUX	c22,   B, INC2
	fpsel	c04, sel_s, c11, c12
	STFXDUX	c23,   B, INC2

	fpsel	c05, sel_p, c13, c14
	STFXDUX	c24,   B, INC2
	fpsel	c06, sel_p, c15, c16
	STFPDUX	c01,   B, INC2
	fpsel	c07, sel_s, c13, c14
	STFPDUX	c02,   B, INC2
	fpsel	c08, sel_s, c15, c16
	STFXDUX	c03,   B, INC2

	STFXDUX	c04,   B, INC2
	STFPDUX	c05,   B, INC2
	STFPDUX	c06,   B, INC2
	STFXDUX	c07,   B, INC2
	STFXDUX	c08,   B, INC2
	bdnz	.L22
	.align 4
	
.L25:
	/* Remainder M & 7, handled as 4 + 2 + 1 elements. */
	andi.	r0,  M,  7
	ble	.L30

	andi.	r0,  M,  4
	beq	.L26

	LFPDUX	c01,   AO1, INC2
	LFXDUX	c02,   AO2, INC2
	LFPDUX	c03,   AO3, INC2
	LFXDUX	c04,   AO4, INC2

	LFPDUX	c05,   AO1, INC2
	fpsel	c09, sel_p, c01, c02
	LFXDUX	c06,   AO2, INC2
	fpsel	c10, sel_p, c03, c04
	LFPDUX	c07,   AO3, INC2
	fpsel	c11, sel_s, c01, c02
	LFXDUX	c08,   AO4, INC2
	fpsel	c12, sel_s, c03, c04

	fpsel	c13, sel_p, c05, c06
	fpsel	c14, sel_p, c07, c08
	STFPDUX	c09,   B, INC2
	fpsel	c15, sel_s, c05, c06
	STFPDUX	c10,   B, INC2
	fpsel	c16, sel_s, c07, c08
	STFXDUX	c11,   B, INC2

	STFXDUX	c12,   B, INC2
	STFPDUX	c13,   B, INC2
	STFPDUX	c14,   B, INC2
	STFXDUX	c15,   B, INC2
	STFXDUX	c16,   B, INC2
	.align 4

.L26:
	/* Two leftover elements per vector. */
	andi.	r0,  M,  2
	beq	.L27

	LFPDUX	c01,   AO1, INC2
	LFXDUX	c02,   AO2, INC2
	LFPDUX	c03,   AO3, INC2
	LFXDUX	c04,   AO4, INC2

	fpsel	c05, sel_p, c01, c02
	fpsel	c06, sel_p, c03, c04
	fpsel	c07, sel_s, c01, c02
	fpsel	c08, sel_s, c03, c04

	STFPDUX	c05,   B, INC2
	STFPDUX	c06,   B, INC2
	STFXDUX	c07,   B, INC2
	STFXDUX	c08,   B, INC2
	.align 4

.L27:
	/* One leftover element per vector: four scalar loads, merged two by two */
	/* with fsmfp into c01 and c03 before the pair stores.                   */
	andi.	r0,  M,  1
	beq	.L30

	LFDUX	c01,   AO1, INC2
	LFDUX	c02,   AO2, INC2
	LFDUX	c03,   AO3, INC2
	LFDUX	c04,   AO4, INC2

	fsmfp	c01, c02
	fsmfp	c03, c04
	
	STFPDUX	c01,   B, INC2
	STFPDUX	c03,   B, INC2
	.align 4

	
/*
 * Aligned path, remainder panel of 2 vectors (executed once when N & 2).
 *
 * Output order: AO1[i], AO2[i] for each element index i.
 * AO1 is read with the straight pair load, AO2 with the cross pair
 * load; the sel_p result is stored straight (element i) and the sel_s
 * result with the cross store (element i+1).
 */
.L30:
	/* andi. never yields a negative value, so "ble" is taken exactly when the bit is clear. */
	andi.	J,  N,  2
	ble	.L40

	mr	AO1, A
	add	AO2, A,   LDA
	add	A,   AO2, LDA

	/* CTR = M / 8: each .L32 iteration consumes 8 elements of both vectors. */
	srawi.	r0,  M,  3
	mtspr	CTR, r0
	ble	.L35
	.align 4

.L32:
	/* c01..c04 come from AO1, c05..c08 from AO2. */
	LFPDUX	c01,   AO1, INC2
	LFXDUX	c05,   AO2, INC2
	LFPDUX	c02,   AO1, INC2
	LFXDUX	c06,   AO2, INC2

	LFPDUX	c03,   AO1, INC2
	fpsel	c09, sel_p, c01, c05
	LFXDUX	c07,   AO2, INC2
	fpsel	c10, sel_s, c01, c05
	LFPDUX	c04,   AO1, INC2
	fpsel	c11, sel_p, c02, c06
	LFXDUX	c08,   AO2, INC2
	fpsel	c12, sel_s, c02, c06

	fpsel	c13, sel_p, c03, c07
	fpsel	c14, sel_s, c03, c07
	STFPDUX	c09,   B, INC2
	fpsel	c15, sel_p, c04, c08
	STFXDUX	c10,   B, INC2
	fpsel	c16, sel_s, c04, c08
	STFPDUX	c11,   B, INC2
	STFXDUX	c12,   B, INC2

	STFPDUX	c13,   B, INC2
	STFXDUX	c14,   B, INC2
	STFPDUX	c15,   B, INC2
	STFXDUX	c16,   B, INC2
	bdnz	.L32
	.align 4
	
.L35:
	/* Remainder M & 7, handled as 4 + 2 + 1 elements. */
	andi.	r0,  M,  7
	ble	.L40

	andi.	r0,  M,  4
	beq	.L36

	LFPDUX	c01,   AO1, INC2
	LFXDUX	c03,   AO2, INC2
	LFPDUX	c02,   AO1, INC2
	LFXDUX	c04,   AO2, INC2

	fpsel	c05, sel_p, c01, c03
	fpsel	c06, sel_s, c01, c03
	fpsel	c07, sel_p, c02, c04
	fpsel	c08, sel_s, c02, c04

	STFPDUX	c05,   B, INC2
	STFXDUX	c06,   B, INC2
	STFPDUX	c07,   B, INC2
	STFXDUX	c08,   B, INC2
	.align 4

.L36:
	/* Two leftover elements per vector. */
	andi.	r0,  M,  2
	beq	.L37

	LFPDUX	c01,   AO1, INC2
	LFXDUX	c02,   AO2, INC2

	fpsel	c03, sel_p, c01, c02
	fpsel	c04, sel_s, c01, c02

	STFPDUX	c03,   B, INC2
	STFXDUX	c04,   B, INC2
	.align 4

.L37:
	/* One leftover element per vector: two scalar loads merged with fsmfp, one pair store. */
	andi.	r0,  M,  1
	beq	.L40

	LFDUX	c01,   AO1, INC2
	LFDUX	c02,   AO2, INC2

	fsmfp	c01, c02
	STFPDUX	c01,   B, INC2
	.align 4

/*
 * Aligned path, last single vector (executed once when N & 1).
 * No interleaving is needed: the vector is copied to B two elements at
 * a time with pair loads and pair stores.
 */
.L40:
	/* andi. never yields a negative value, so "ble" is taken exactly when the bit is clear. */
	andi.	J,  N,  1
	ble	.L999

	mr	AO1, A

	/* CTR = M / 8: each .L42 iteration copies 8 elements. */
	srawi.	r0,  M,  3
	mtspr	CTR, r0
	ble	.L45
	.align 4

.L42:
	LFPDUX	c01,   AO1, INC2
	LFPDUX	c02,   AO1, INC2
	LFPDUX	c03,   AO1, INC2
	LFPDUX	c04,   AO1, INC2

	STFPDUX	c01,   B, INC2
	STFPDUX	c02,   B, INC2
	STFPDUX	c03,   B, INC2
	STFPDUX	c04,   B, INC2
	bdnz	.L42
	.align 4
	
.L45:
	/* Remainder M & 7, handled as 4 + 2 + 1 elements. */
	andi.	r0,  M,  7
	ble	.L999

	andi.	r0,  M,  4
	beq	.L46

	LFPDUX	c01,   AO1, INC2
	LFPDUX	c02,   AO1, INC2

	STFPDUX	c01,   B, INC2
	STFPDUX	c02,   B, INC2
	.align 4

.L46:
	andi.	r0,  M,  2
	beq	.L47

	LFPDUX	c01,   AO1, INC2
	STFPDUX	c01,   B, INC2
	.align 4

.L47:
	andi.	r0,  M,  1
	beq	.L999

	/* Final element: scalar, non-update load and store (the cursors are not needed afterwards). */
	LFDX	c01,   AO1, INC2
	STFDX	c01,   B,  INC2
	/* Skip over the unaligned path to the common exit. */
	b	.L999
	.align 4


/*
 * Unaligned path (reached when A or the scaled LDA is not a multiple of
 * 2*SIZE), panels of 8 source vectors.
 *
 * The source is read one element at a time: LFDUX fills one half of a
 * register from an odd-numbered cursor (AO1/3/5/7) and LFSDUX fills the
 * other half from the following even-numbered cursor (AO2/4/6/8).  Each
 * register then holds the same element index of two neighbouring
 * vectors and is written with a pair store.
 *
 * Output order is the same as in the aligned path: for each element
 * index i, AO1[i] .. AO8[i] are written consecutively to B.
 */
.L100:
	/* Loads here are update forms with offset INC, so bias A down by one element. */
	subi	A, A, 1 * SIZE
	/* J = N / 8 panels; none -> go to the N & 4 remainder. */
	srawi.	J,  N,  3
	ble	.L120
	.align 4
.L111:
	mr	AO1, A
	add	AO2, A,   LDA
	add	AO3, AO2, LDA
	add	AO4, AO3, LDA
	add	AO5, AO4, LDA
	add	AO6, AO5, LDA
	add	AO7, AO6, LDA
	add	AO8, AO7, LDA
	add	A,   AO8, LDA

	/* CTR = M / 8: each .L112 iteration consumes 8 elements of every vector. */
	srawi.	r0,  M,  3
	mtspr	CTR, r0
	ble	.L115
	.align 4

.L112:
	/* Register layout: element k (0..7) of vector pair p (1..4) lives in c(4k + p). */
	/* All 32 FP registers are in use; c31/c32 overwrite sel_p/sel_s, which this     */
	/* path does not need.                                                            */
	LFDUX	c01,   AO1, INC
	LFDUX	c05,   AO1, INC
	LFDUX	c09,   AO1, INC
	LFDUX	c13,   AO1, INC

	LFDUX	c17,   AO1, INC
	LFDUX	c21,   AO1, INC
	LFDUX	c25,   AO1, INC
	LFDUX	c29,   AO1, INC

	LFSDUX	c01,   AO2, INC
	LFSDUX	c05,   AO2, INC
	LFSDUX	c09,   AO2, INC
	LFSDUX	c13,   AO2, INC

	LFSDUX	c17,   AO2, INC
	LFSDUX	c21,   AO2, INC
	LFSDUX	c25,   AO2, INC
	LFSDUX	c29,   AO2, INC

	LFDUX	c02,   AO3, INC
	LFDUX	c06,   AO3, INC
	LFDUX	c10,   AO3, INC
	LFDUX	c14,   AO3, INC

	LFDUX	c18,   AO3, INC
	LFDUX	c22,   AO3, INC
	LFDUX	c26,   AO3, INC
	LFDUX	c30,   AO3, INC

	LFSDUX	c02,   AO4, INC
	LFSDUX	c06,   AO4, INC
	LFSDUX	c10,   AO4, INC
	LFSDUX	c14,   AO4, INC

	LFSDUX	c18,   AO4, INC
	LFSDUX	c22,   AO4, INC
	LFSDUX	c26,   AO4, INC
	LFSDUX	c30,   AO4, INC

	LFDUX	c03,   AO5, INC
	LFDUX	c07,   AO5, INC
	LFDUX	c11,   AO5, INC
	LFDUX	c15,   AO5, INC

	LFDUX	c19,   AO5, INC
	LFDUX	c23,   AO5, INC
	LFDUX	c27,   AO5, INC
	LFDUX	c31,   AO5, INC

	LFSDUX	c03,   AO6, INC
	LFSDUX	c07,   AO6, INC
	LFSDUX	c11,   AO6, INC
	LFSDUX	c15,   AO6, INC

	LFSDUX	c19,   AO6, INC
	LFSDUX	c23,   AO6, INC
	LFSDUX	c27,   AO6, INC
	LFSDUX	c31,   AO6, INC

	LFDUX	c04,   AO7, INC
	LFDUX	c08,   AO7, INC
	LFDUX	c12,   AO7, INC
	LFDUX	c16,   AO7, INC

	LFDUX	c20,   AO7, INC
	LFDUX	c24,   AO7, INC
	LFDUX	c28,   AO7, INC
	LFDUX	c32,   AO7, INC

	LFSDUX	c04,   AO8, INC
	LFSDUX	c08,   AO8, INC
	LFSDUX	c12,   AO8, INC
	LFSDUX	c16,   AO8, INC

	LFSDUX	c20,   AO8, INC
	LFSDUX	c24,   AO8, INC
	LFSDUX	c28,   AO8, INC
	LFSDUX	c32,   AO8, INC

	/* Storing c01..c32 in numeric order emits the eight rows one after another. */
	STFPDUX	c01,   B, INC2
	STFPDUX	c02,   B, INC2
	STFPDUX	c03,   B, INC2
	STFPDUX	c04,   B, INC2
	STFPDUX	c05,   B, INC2
	STFPDUX	c06,   B, INC2
	STFPDUX	c07,   B, INC2
	STFPDUX	c08,   B, INC2

	STFPDUX	c09,   B, INC2
	STFPDUX	c10,   B, INC2
	STFPDUX	c11,   B, INC2
	STFPDUX	c12,   B, INC2
	STFPDUX	c13,   B, INC2
	STFPDUX	c14,   B, INC2
	STFPDUX	c15,   B, INC2
	STFPDUX	c16,   B, INC2

	STFPDUX	c17,   B, INC2
	STFPDUX	c18,   B, INC2
	STFPDUX	c19,   B, INC2
	STFPDUX	c20,   B, INC2
	STFPDUX	c21,   B, INC2
	STFPDUX	c22,   B, INC2
	STFPDUX	c23,   B, INC2
	STFPDUX	c24,   B, INC2

	STFPDUX	c25,   B, INC2
	STFPDUX	c26,   B, INC2
	STFPDUX	c27,   B, INC2
	STFPDUX	c28,   B, INC2
	STFPDUX	c29,   B, INC2
	STFPDUX	c30,   B, INC2
	STFPDUX	c31,   B, INC2
	STFPDUX	c32,   B, INC2
	bdnz	.L112
	.align 4
	
.L115:
	/* Remainder M & 7, handled as 4 + 2 + 1 elements.  The andi. result is never */
	/* negative, so "ble" branches exactly when it is zero.                        */
	andi.	r0,  M,  7
	ble	.L119

	andi.	r0,  M,  4
	beq	.L116

	LFDUX	c01,   AO1, INC
	LFDUX	c05,   AO1, INC
	LFDUX	c09,   AO1, INC
	LFDUX	c13,   AO1, INC

	LFSDUX	c01,   AO2, INC
	LFSDUX	c05,   AO2, INC
	LFSDUX	c09,   AO2, INC
	LFSDUX	c13,   AO2, INC

	LFDUX	c02,   AO3, INC
	LFDUX	c06,   AO3, INC
	LFDUX	c10,   AO3, INC
	LFDUX	c14,   AO3, INC

	LFSDUX	c02,   AO4, INC
	LFSDUX	c06,   AO4, INC
	LFSDUX	c10,   AO4, INC
	LFSDUX	c14,   AO4, INC

	LFDUX	c03,   AO5, INC
	LFDUX	c07,   AO5, INC
	LFDUX	c11,   AO5, INC
	LFDUX	c15,   AO5, INC

	LFSDUX	c03,   AO6, INC
	LFSDUX	c07,   AO6, INC
	LFSDUX	c11,   AO6, INC
	LFSDUX	c15,   AO6, INC

	LFDUX	c04,   AO7, INC
	LFDUX	c08,   AO7, INC
	LFDUX	c12,   AO7, INC
	LFDUX	c16,   AO7, INC

	LFSDUX	c04,   AO8, INC
	LFSDUX	c08,   AO8, INC
	LFSDUX	c12,   AO8, INC
	LFSDUX	c16,   AO8, INC

	STFPDUX	c01,   B, INC2
	STFPDUX	c02,   B, INC2
	STFPDUX	c03,   B, INC2
	STFPDUX	c04,   B, INC2
	STFPDUX	c05,   B, INC2
	STFPDUX	c06,   B, INC2
	STFPDUX	c07,   B, INC2
	STFPDUX	c08,   B, INC2

	STFPDUX	c09,   B, INC2
	STFPDUX	c10,   B, INC2
	STFPDUX	c11,   B, INC2
	STFPDUX	c12,   B, INC2
	STFPDUX	c13,   B, INC2
	STFPDUX	c14,   B, INC2
	STFPDUX	c15,   B, INC2
	STFPDUX	c16,   B, INC2
	.align 4

.L116:
	/* Two leftover elements per vector. */
	andi.	r0,  M,  2
	beq	.L117

	LFDUX	c01,   AO1, INC
	LFDUX	c05,   AO1, INC
	LFDUX	c02,   AO3, INC
	LFDUX	c06,   AO3, INC

	LFSDUX	c01,   AO2, INC
	LFSDUX	c05,   AO2, INC
	LFSDUX	c02,   AO4, INC
	LFSDUX	c06,   AO4, INC

	LFDUX	c03,   AO5, INC
	LFDUX	c07,   AO5, INC
	LFDUX	c04,   AO7, INC
	LFDUX	c08,   AO7, INC

	LFSDUX	c03,   AO6, INC
	LFSDUX	c07,   AO6, INC
	LFSDUX	c04,   AO8, INC
	LFSDUX	c08,   AO8, INC

	STFPDUX	c01,   B, INC2
	STFPDUX	c02,   B, INC2
	STFPDUX	c03,   B, INC2
	STFPDUX	c04,   B, INC2
	STFPDUX	c05,   B, INC2
	STFPDUX	c06,   B, INC2
	STFPDUX	c07,   B, INC2
	STFPDUX	c08,   B, INC2
	.align 4

.L117:
	/* One leftover element per vector. */
	andi.	r0,  M,  1
	beq	.L119

	LFDUX	c01,   AO1, INC
	LFDUX	c02,   AO3, INC
	LFDUX	c03,   AO5, INC
	LFDUX	c04,   AO7, INC

	LFSDUX	c01,   AO2, INC
	LFSDUX	c02,   AO4, INC
	LFSDUX	c03,   AO6, INC
	LFSDUX	c04,   AO8, INC

	STFPDUX	c01,   B, INC2
	STFPDUX	c02,   B, INC2
	STFPDUX	c03,   B, INC2
	STFPDUX	c04,   B, INC2
	.align 4

.L119:
	/* Next 8-vector panel. */
	addic.	J, J, -1
	bgt	.L111
	.align 4

/*
 * Unaligned path, remainder panel of 4 vectors (executed once when N & 4).
 *
 * Element-wise loads: LFDUX from AO1/AO3 fills one half of a register,
 * LFSDUX from AO2/AO4 the other half.  Registers filled from AO1/AO2
 * and from AO3/AO4 are then stored alternately, so B receives
 * AO1[i], AO2[i], AO3[i], AO4[i] for each element index i.
 */
.L120:
	/* andi. never yields a negative value, so "ble" is taken exactly when the bit is clear. */
	andi.	J,  N,  4
	ble	.L130
	.align 4
.L121:
	mr	AO1, A
	add	AO2, A,   LDA
	add	AO3, AO2, LDA
	add	AO4, AO3, LDA
	add	A,   AO4, LDA

	/* CTR = M / 8: each .L122 iteration consumes 8 elements of every vector. */
	srawi.	r0,  M,  3
	mtspr	CTR, r0
	ble	.L125
	.align 4

.L122:
	/* AO1/AO2 -> c01..c04, c09..c12;   AO3/AO4 -> c05..c08, c13..c16. */
	LFDUX	c01,   AO1, INC
	LFDUX	c02,   AO1, INC
	LFDUX	c03,   AO1, INC
	LFDUX	c04,   AO1, INC

	LFDUX	c09,   AO1, INC
	LFDUX	c10,   AO1, INC
	LFDUX	c11,   AO1, INC
	LFDUX	c12,   AO1, INC

	LFSDUX	c01,   AO2, INC
	LFSDUX	c02,   AO2, INC
	LFSDUX	c03,   AO2, INC
	LFSDUX	c04,   AO2, INC

	LFSDUX	c09,   AO2, INC
	LFSDUX	c10,   AO2, INC
	LFSDUX	c11,   AO2, INC
	LFSDUX	c12,   AO2, INC

	LFDUX	c05,   AO3, INC
	LFDUX	c06,   AO3, INC
	LFDUX	c07,   AO3, INC
	LFDUX	c08,   AO3, INC

	LFDUX	c13,   AO3, INC
	LFDUX	c14,   AO3, INC
	LFDUX	c15,   AO3, INC
	LFDUX	c16,   AO3, INC

	LFSDUX	c05,   AO4, INC
	LFSDUX	c06,   AO4, INC
	LFSDUX	c07,   AO4, INC
	LFSDUX	c08,   AO4, INC

	LFSDUX	c13,   AO4, INC
	LFSDUX	c14,   AO4, INC
	LFSDUX	c15,   AO4, INC
	LFSDUX	c16,   AO4, INC

	/* Alternate (AO1,AO2) and (AO3,AO4) registers to emit one 4-wide row per element. */
	STFPDUX	c01,   B, INC2
	STFPDUX	c05,   B, INC2
	STFPDUX	c02,   B, INC2
	STFPDUX	c06,   B, INC2
	STFPDUX	c03,   B, INC2
	STFPDUX	c07,   B, INC2
	STFPDUX	c04,   B, INC2
	STFPDUX	c08,   B, INC2

	STFPDUX	c09,   B, INC2
	STFPDUX	c13,   B, INC2
	STFPDUX	c10,   B, INC2
	STFPDUX	c14,   B, INC2
	STFPDUX	c11,   B, INC2
	STFPDUX	c15,   B, INC2
	STFPDUX	c12,   B, INC2
	STFPDUX	c16,   B, INC2
	bdnz	.L122
	.align 4
	
.L125:
	/* Remainder M & 7, handled as 4 + 2 + 1 elements. */
	andi.	r0,  M,  7
	ble	.L130

	andi.	r0,  M,  4
	beq	.L126

	LFDUX	c01,   AO1, INC
	LFDUX	c02,   AO1, INC
	LFDUX	c03,   AO1, INC
	LFDUX	c04,   AO1, INC

	LFSDUX	c01,   AO2, INC
	LFSDUX	c02,   AO2, INC
	LFSDUX	c03,   AO2, INC
	LFSDUX	c04,   AO2, INC

	LFDUX	c05,   AO3, INC
	LFDUX	c06,   AO3, INC
	LFDUX	c07,   AO3, INC
	LFDUX	c08,   AO3, INC

	LFSDUX	c05,   AO4, INC
	LFSDUX	c06,   AO4, INC
	LFSDUX	c07,   AO4, INC
	LFSDUX	c08,   AO4, INC

	STFPDUX	c01,   B, INC2
	STFPDUX	c05,   B, INC2
	STFPDUX	c02,   B, INC2
	STFPDUX	c06,   B, INC2
	STFPDUX	c03,   B, INC2
	STFPDUX	c07,   B, INC2
	STFPDUX	c04,   B, INC2
	STFPDUX	c08,   B, INC2
	.align 4

.L126:
	/* Two leftover elements per vector. */
	andi.	r0,  M,  2
	beq	.L127

	LFDUX	c01,   AO1, INC
	LFDUX	c02,   AO1, INC

	LFSDUX	c01,   AO2, INC
	LFSDUX	c02,   AO2, INC

	LFDUX	c05,   AO3, INC
	LFDUX	c06,   AO3, INC

	LFSDUX	c05,   AO4, INC
	LFSDUX	c06,   AO4, INC

	STFPDUX	c01,   B, INC2
	STFPDUX	c05,   B, INC2
	STFPDUX	c02,   B, INC2
	STFPDUX	c06,   B, INC2
	.align 4

.L127:
	/* One leftover element per vector. */
	andi.	r0,  M,  1
	beq	.L130

	LFDUX	c01,   AO1, INC
	LFDUX	c05,   AO3, INC

	/* NOTE(review): purpose of these two nops is not evident from this file */
	/* (possibly scheduling padding) - they have no architectural effect.   */
	nop
	nop

	LFSDUX	c01,   AO2, INC
	LFSDUX	c05,   AO4, INC

	STFPDUX	c01,   B, INC2
	STFPDUX	c05,   B, INC2
	.align 4

	
/*
 * Unaligned path, remainder panel of 2 vectors (executed once when N & 2).
 *
 * Each register receives element i of AO1 (LFDUX) and element i of AO2
 * (LFSDUX) and is written with one pair store, so B receives
 * AO1[i], AO2[i] for each element index i.
 */
.L130:
	/* andi. never yields a negative value, so "ble" is taken exactly when the bit is clear. */
	andi.	J,  N,  2
	ble	.L140

	mr	AO1, A
	add	AO2, A,   LDA
	add	A,   AO2, LDA

	/* CTR = M / 8: each .L132 iteration consumes 8 elements of both vectors. */
	srawi.	r0,  M,  3
	mtspr	CTR, r0
	ble	.L135
	.align 4

.L132:
	LFDUX	c01,   AO1, INC
	LFDUX	c02,   AO1, INC
	LFDUX	c03,   AO1, INC
	LFDUX	c04,   AO1, INC

	LFDUX	c09,   AO1, INC
	LFDUX	c10,   AO1, INC
	LFDUX	c11,   AO1, INC
	LFDUX	c12,   AO1, INC

	LFSDUX	c01,   AO2, INC
	LFSDUX	c02,   AO2, INC
	LFSDUX	c03,   AO2, INC
	LFSDUX	c04,   AO2, INC

	LFSDUX	c09,   AO2, INC
	LFSDUX	c10,   AO2, INC
	LFSDUX	c11,   AO2, INC
	LFSDUX	c12,   AO2, INC

	STFPDUX	c01,   B, INC2
	STFPDUX	c02,   B, INC2
	STFPDUX	c03,   B, INC2
	STFPDUX	c04,   B, INC2

	STFPDUX	c09,   B, INC2
	STFPDUX	c10,   B, INC2
	STFPDUX	c11,   B, INC2
	STFPDUX	c12,   B, INC2
	bdnz	.L132
	.align 4
	
.L135:
	/* Remainder M & 7, handled as 4 + 2 + 1 elements. */
	andi.	r0,  M,  7
	ble	.L140

	andi.	r0,  M,  4
	beq	.L136

	LFDUX	c01,   AO1, INC
	LFDUX	c02,   AO1, INC
	LFDUX	c03,   AO1, INC
	LFDUX	c04,   AO1, INC

	LFSDUX	c01,   AO2, INC
	LFSDUX	c02,   AO2, INC
	LFSDUX	c03,   AO2, INC
	LFSDUX	c04,   AO2, INC

	STFPDUX	c01,   B, INC2
	STFPDUX	c02,   B, INC2
	STFPDUX	c03,   B, INC2
	STFPDUX	c04,   B, INC2
	.align 4

.L136:
	/* Two leftover elements per vector. */
	andi.	r0,  M,  2
	beq	.L137

	LFDUX	c01,   AO1, INC
	LFDUX	c02,   AO1, INC

	LFSDUX	c01,   AO2, INC
	LFSDUX	c02,   AO2, INC

	STFPDUX	c01,   B, INC2
	STFPDUX	c02,   B, INC2
	.align 4

.L137:
	/* One leftover element per vector: two scalar loads merged with fsmfp, one pair store. */
	andi.	r0,  M,  1
	beq	.L140

	LFDUX	c01,   AO1, INC
	LFDUX	c02,   AO2, INC

	fsmfp	c01, c02
	STFPDUX	c01,   B, INC2
	.align 4

/*
 * Unaligned path, last single vector (executed once when N & 1).
 *
 * Elements are loaded one at a time; fsmfp merges each pair of
 * consecutive elements into one register so that B can still be
 * written with pair stores.
 */
.L140:
	/* andi. never yields a negative value, so "ble" is taken exactly when the bit is clear. */
	andi.	J,  N,  1
	ble	.L999

	mr	AO1, A

	/* CTR = M / 8: each .L142 iteration copies 8 elements. */
	srawi.	r0,  M,  3
	mtspr	CTR, r0
	ble	.L145
	.align 4

.L142:
	LFDUX	c01,   AO1, INC
	LFDUX	c02,   AO1, INC
	LFDUX	c03,   AO1, INC
	LFDUX	c04,   AO1, INC

	LFDUX	c05,   AO1, INC
	LFDUX	c06,   AO1, INC
	LFDUX	c07,   AO1, INC
	LFDUX	c08,   AO1, INC

	/* Fold the even-numbered registers into the odd-numbered ones; only c01, c03, c05, c07 are stored. */
	fsmfp	c01, c02
	fsmfp	c03, c04
	fsmfp	c05, c06
	fsmfp	c07, c08

	STFPDUX	c01,   B, INC2
	STFPDUX	c03,   B, INC2
	STFPDUX	c05,   B, INC2
	STFPDUX	c07,   B, INC2
	bdnz	.L142
	.align 4
	
.L145:
	/* Remainder M & 7, handled as 4 + 2 + 1 elements. */
	andi.	r0,  M,  7
	ble	.L999

	andi.	r0,  M,  4
	beq	.L146

	LFDUX	c01,   AO1, INC
	LFDUX	c02,   AO1, INC
	LFDUX	c03,   AO1, INC
	LFDUX	c04,   AO1, INC

	fsmfp	c01, c02
	fsmfp	c03, c04

	STFPDUX	c01,   B, INC2
	STFPDUX	c03,   B, INC2
	.align 4

.L146:
	andi.	r0,  M,  2
	beq	.L147

	LFDUX	c01,   AO1, INC
	LFDUX	c02,   AO1, INC

	fsmfp	c01, c02
	STFPDUX	c01,   B, INC2
	.align 4

.L147:
	andi.	r0,  M,  1
	beq	.L999

	/* Final element: scalar, non-update load and store; execution then falls through to .L999. */
	LFDX	c01,   AO1, INC
	STFDX	c01,   B,  INC2
	.align 4

/*
 * Common exit: pop everything the entry code pushed, in reverse order,
 * and return.
 *
 * On arrival SP is 8 bytes above the lowest of the four selector
 * constant words (the second selector load at entry was an update form
 * with offset 8).
 */
.L999:
	/* Step to the last constant word, i.e. one word below the saved r26, so the lwzu chain can pop. */
	addi	SP, SP, 4

	lwzu	r26,   4(SP)
	lwzu	r27,   4(SP)

	lwzu	r28,   4(SP)
	lwzu	r29,   4(SP)
	lwzu	r30,   4(SP)
	lwzu	r31,   4(SP)

	/* SP now addresses the r31 slot; back up 12 so that SP + 16 is the 16-byte slot holding f31. */
	subi	SP, SP, 12
	li	r0, 16

	/* Pop f31..f14 (update form: SP moves by r0 = +16 before each load). */
	lfpdux	f31, SP, r0
	lfpdux	f30, SP, r0
	lfpdux	f29, SP, r0
	lfpdux	f28, SP, r0
	lfpdux	f27, SP, r0
	lfpdux	f26, SP, r0
	lfpdux	f25, SP, r0
	lfpdux	f24, SP, r0
	lfpdux	f23, SP, r0
	lfpdux	f22, SP, r0
	lfpdux	f21, SP, r0
	lfpdux	f20, SP, r0
	lfpdux	f19, SP, r0
	lfpdux	f18, SP, r0
	lfpdux	f17, SP, r0
	lfpdux	f16, SP, r0
	lfpdux	f15, SP, r0
	lfpdux	f14, SP, r0
	/* SP addresses the f14 slot (entry SP - 16); this restores the entry value. */
	addi	SP, SP, 16
	blr
	EPILOGUE