kusano 2b45e8
kusano 2b45e8
/* Copyright 2009, 2010 The University of Texas at Austin.           */
kusano 2b45e8
/* All rights reserved.                                              */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
/* Redistribution and use in source and binary forms, with or        */
kusano 2b45e8
/* without modification, are permitted provided that the following   */
kusano 2b45e8
/* conditions are met:                                               */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
/*   1. Redistributions of source code must retain the above         */
kusano 2b45e8
/*      copyright notice, this list of conditions and the following  */
kusano 2b45e8
/*      disclaimer.                                                  */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
/*   2. Redistributions in binary form must reproduce the above      */
kusano 2b45e8
/*      copyright notice, this list of conditions and the following  */
kusano 2b45e8
/*      disclaimer in the documentation and/or other materials       */
kusano 2b45e8
/*      provided with the distribution.                              */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
/* The views and conclusions contained in the software and           */
kusano 2b45e8
/* documentation are those of the authors and should not be          */
kusano 2b45e8
/* interpreted as representing official policies, either expressed   */
kusano 2b45e8
/* or implied, of The University of Texas at Austin.                 */
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#include "common.h"
kusano 2b45e8
kusano 2b45e8
/* NOTE(review): the interleaved `kusano 2b45e8` lines below are not   */
/* assembler or preprocessor syntax; they look like version-control   */
/* annotation residue and would have to be removed before this file   */
/* can be assembled -- confirm against the upstream copy.             */

/* Incoming-argument layout.                                          */
/* STACK is the 16 bytes taken by the four registers pushed at entry  */
/* (%ebp, %edi, %esi, %ebx).  ARGS is an extra displacement, zero     */
/* here.  The additional 4 bytes at the start of the OLD_* offsets    */
/* are presumably the return address -- TODO confirm.                 */
#define STACK	16
kusano 2b45e8
#define ARGS	 0
kusano 2b45e8
kusano 2b45e8
/* Caller arguments, addressed through %esi, which the entry code     */
/* loads with the stack pointer right after the four pushes.          */
/* OLD_ALPHA is 8 bytes wide (it is read with movsd), which is why    */
/* OLD_A starts at 24 rather than 20.                                 */
/* These macros are only valid until C1 is first written, because C1  */
/* is an alias for the same %esi register.                            */
#define OLD_M	 4 + STACK + ARGS(%esi)
kusano 2b45e8
#define OLD_N	 8 + STACK + ARGS(%esi)
kusano 2b45e8
#define OLD_K	12 + STACK + ARGS(%esi)
kusano 2b45e8
#define OLD_ALPHA	16 + STACK + ARGS(%esi)
kusano 2b45e8
#define OLD_A	24 + STACK + ARGS(%esi)
kusano 2b45e8
#define OLD_B	28 + STACK + ARGS(%esi)
kusano 2b45e8
#define OLD_C	32 + STACK + ARGS(%esi)
kusano 2b45e8
#define OLD_LDC	36 + STACK + ARGS(%esi)
kusano 2b45e8
#define OLD_OFFT	40 + STACK + ARGS(%esi)
kusano 2b45e8
kusano 2b45e8
/* Local frame, addressed through the realigned %esp.                 */
/* ALPHA is a 16-byte slot: alpha is duplicated into both halves of   */
/* an xmm register and stored with movapd, so it must stay 16-byte    */
/* aligned (the entry code aligns %esp with andl $-4096).             */
#define ALPHA	 0(%esp)
kusano 2b45e8
/* Copies of the K, N, M, A and C arguments. */
#define K	16(%esp)
kusano 2b45e8
#define N	20(%esp)
kusano 2b45e8
#define M	24(%esp)
kusano 2b45e8
#define A	28(%esp)
kusano 2b45e8
#define C	32(%esp)
kusano 2b45e8
/* J: outer loop counter, initialised to N >> 1. */
#define J	36(%esp)
kusano 2b45e8
/* BX: saved copy of the B pointer, advanced and used as a prefetch   */
/* address inside the row-tile loop.                                  */
#define BX	40(%esp)
kusano 2b45e8
/* OLD_STACK: the pre-alignment stack pointer (value of %esi at       */
/* entry), kept so the original stack can be restored.                */
#define OLD_STACK 44(%esp)
kusano 2b45e8
/* OFFSET holds the OLD_OFFT argument.  KK is seeded from the same    */
/* value (negated under #ifndef LEFT) and advanced in the             */
/* TRMMKERNEL-conditional code.  KKK holds a trip count derived from  */
/* K and KK for the current tile.                                     */
#define OFFSET  48(%esp)
kusano 2b45e8
#define KK	52(%esp)
kusano 2b45e8
#define KKK	56(%esp)
kusano 2b45e8
/* BUFFER: start of the on-stack scratch area into which B is copied  */
/* with every element duplicated (movddup + movapd).  The code always */
/* addresses it as 16 * SIZE + BUFFER with negative displacements.    */
#define BUFFER 256(%esp)
kusano 2b45e8
kusano 2b45e8
/* Prefetch tuning.  PREFETCH_R is the look-ahead, in elements,       */
/* applied to B in the copy loop.  PREFETCH_W, PREFETCHSIZE and       */
/* PREFETCH are not referenced in the visible part of the file.       */
#define PREFETCH_R    (8 * 16 + 0)
kusano 2b45e8
#define PREFETCH_W    (PREFETCH_R * 2)
kusano 2b45e8
kusano 2b45e8
#define PREFETCHSIZE  (8 * 7 + 4)
kusano 2b45e8
#define PREFETCH     prefetcht0
kusano 2b45e8
kusano 2b45e8
/* Register roles.                                                    */
/*   AA  - running pointer into A                                     */
/*   BB  - running pointer into the duplicated-B buffer               */
/*   LDC - leading dimension of C, scaled to bytes at entry           */
/*   B   - running pointer into the caller's B                        */
/*   C1  - pointer to the current tile of C (aliases %esi, see the    */
/*         OLD_* remark)                                              */
/*   I   - row-tile loop counter                                      */
#define AA	%edx
kusano 2b45e8
#define BB	%ecx
kusano 2b45e8
#define LDC	%ebp
kusano 2b45e8
#define B	%edi
kusano 2b45e8
#define C1	%esi
kusano 2b45e8
#define I	%ebx
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	pushl	%ebp
kusano 2b45e8
	pushl	%edi
kusano 2b45e8
	pushl	%esi
kusano 2b45e8
	pushl	%ebx
kusano 2b45e8
kusano 2b45e8
	movl	%esp, %esi	# save old stack
kusano 2b45e8
kusano 2b45e8
	subl	$512 + LOCAL_BUFFER_SIZE, %esp
kusano 2b45e8
	andl	$-4096, %esp	# align stack
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movl	OLD_M, %ebx
kusano 2b45e8
	movl	OLD_N, %eax
kusano 2b45e8
	movl	OLD_K, %ecx
kusano 2b45e8
	movl	OLD_A, %edx
kusano 2b45e8
	movsd	OLD_ALPHA,  %xmm3
kusano 2b45e8
kusano 2b45e8
	movd	OLD_OFFT, %mm4
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movl	%ebx, M
kusano 2b45e8
	movl	%eax, N
kusano 2b45e8
	movl	%ecx, K
kusano 2b45e8
	movl	%edx, A
kusano 2b45e8
	movl	%esi, OLD_STACK
kusano 2b45e8
kusano 2b45e8
	unpcklpd %xmm3, %xmm3
kusano 2b45e8
	movl	OLD_B, B
kusano 2b45e8
	movl	OLD_C, %ebx
kusano 2b45e8
kusano 2b45e8
	movapd	 %xmm3, ALPHA
kusano 2b45e8
	movl	%ebx, C
kusano 2b45e8
	movl	OLD_LDC, LDC
kusano 2b45e8
kusano 2b45e8
	movd	%mm4, OFFSET
kusano 2b45e8
	movd	%mm4, KK
kusano 2b45e8
#ifndef LEFT
kusano 2b45e8
	negl	KK
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	subl	$-16 * SIZE, A
kusano 2b45e8
	subl	$-16 * SIZE, B
kusano 2b45e8
kusano 2b45e8
	leal	(, LDC, SIZE), LDC
kusano 2b45e8
kusano 2b45e8
	sarl	$1, %eax
kusano 2b45e8
	movl	%eax, J
kusano 2b45e8
	jle	.L40
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	leal	16 * SIZE + BUFFER, BB
kusano 2b45e8
kusano 2b45e8
#if defined(TRMMKERNEL) && defined(LEFT)
kusano 2b45e8
	movl	OFFSET, %eax
kusano 2b45e8
	movl	%eax, KK
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movl	K, %eax
kusano 2b45e8
	sarl	$2, %eax
kusano 2b45e8
	jle	.L05
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movddup	 -16 * SIZE(B), %xmm0
kusano 2b45e8
	movddup	 -15 * SIZE(B), %xmm1
kusano 2b45e8
	movddup	 -14 * SIZE(B), %xmm2
kusano 2b45e8
	movddup	 -13 * SIZE(B), %xmm3
kusano 2b45e8
	movddup	 -12 * SIZE(B), %xmm4
kusano 2b45e8
	movddup	 -11 * SIZE(B), %xmm5
kusano 2b45e8
	movddup	 -10 * SIZE(B), %xmm6
kusano 2b45e8
	movddup	  -9 * SIZE(B), %xmm7
kusano 2b45e8
kusano 2b45e8
	prefetcht0	(PREFETCH_R + 0) * SIZE(B)
kusano 2b45e8
kusano 2b45e8
	movapd	%xmm0,  -16 * SIZE(BB)
kusano 2b45e8
	movapd	%xmm1,  -14 * SIZE(BB)
kusano 2b45e8
	movapd	%xmm2,  -12 * SIZE(BB)
kusano 2b45e8
	movapd	%xmm3,  -10 * SIZE(BB)
kusano 2b45e8
kusano 2b45e8
	movapd	%xmm4,   -8 * SIZE(BB)
kusano 2b45e8
	movapd	%xmm5,   -6 * SIZE(BB)
kusano 2b45e8
	movapd	%xmm6,   -4 * SIZE(BB)
kusano 2b45e8
	movapd	%xmm7,   -2 * SIZE(BB)
kusano 2b45e8
kusano 2b45e8
	addl	$ 8 * SIZE, B
kusano 2b45e8
	addl	$16 * SIZE, BB
kusano 2b45e8
	decl	%eax
kusano 2b45e8
	jne	.L02
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movl	K, %eax
kusano 2b45e8
	andl	$3, %eax
kusano 2b45e8
kusano 2b45e8
	jle	.L10
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movddup	 -16 * SIZE(B), %xmm0
kusano 2b45e8
	movddup	 -15 * SIZE(B), %xmm1
kusano 2b45e8
kusano 2b45e8
	movapd	%xmm0,  -16 * SIZE(BB)
kusano 2b45e8
	movapd	%xmm1,  -14 * SIZE(BB)
kusano 2b45e8
	addl	$2 * SIZE, B
kusano 2b45e8
	addl	$4 * SIZE, BB
kusano 2b45e8
	decl	%eax
kusano 2b45e8
	jne	.L06
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movl	B, BX
kusano 2b45e8
kusano 2b45e8
	movl	C, C1
kusano 2b45e8
	movl	A, AA
kusano 2b45e8
	movl	M,  I
kusano 2b45e8
	sarl	$2, I
kusano 2b45e8
	jle	.L20
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#if !defined(TRMMKERNEL) || \
kusano 2b45e8
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
kusano 2b45e8
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
kusano 2b45e8
	leal	16 * SIZE + BUFFER, BB
kusano 2b45e8
kusano 2b45e8
	leal	16 * SIZE + BUFFER, BB
kusano 2b45e8
	movl	KK, %eax
kusano 2b45e8
	leal	(, %eax, SIZE), %eax
kusano 2b45e8
	leal	(AA, %eax, 4), AA
kusano 2b45e8
	leal	(BB, %eax, 4), BB /* because it's doubled */
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movapd	-16 * SIZE(AA), %xmm0
kusano 2b45e8
	pxor	%xmm4, %xmm4
kusano 2b45e8
	movapd	-16 * SIZE(BB), %xmm1
kusano 2b45e8
	pxor	%xmm5, %xmm5
kusano 2b45e8
	movapd	 -8 * SIZE(AA), %xmm3
kusano 2b45e8
	pxor	%xmm6, %xmm6
kusano 2b45e8
	prefetcht0	3 * SIZE(C1)
kusano 2b45e8
	pxor	%xmm7, %xmm7
kusano 2b45e8
	prefetcht0	7 * SIZE(C1, LDC)
kusano 2b45e8
	movapd	%xmm1,  %xmm2
kusano 2b45e8
kusano 2b45e8
	movl	BX, %eax
kusano 2b45e8
	prefetcht0   (%eax)
kusano 2b45e8
	subl	$-8 * SIZE, %eax
kusano 2b45e8
	movl	%eax, BX
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movl	K, %eax
kusano 2b45e8
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
kusano 2b45e8
	movl	K, %eax
kusano 2b45e8
	subl	KK, %eax
kusano 2b45e8
	movl	%eax, KKK	
kusano 2b45e8
kusano 2b45e8
	movl	KK, %eax
kusano 2b45e8
#ifdef LEFT
kusano 2b45e8
	addl	$4, %eax
kusano 2b45e8
kusano 2b45e8
	addl	$2, %eax
kusano 2b45e8
kusano 2b45e8
	movl	%eax, KKK
kusano 2b45e8
kusano 2b45e8
	sarl	$3, %eax
kusano 2b45e8
	je	.L15
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	mulpd	%xmm0,  %xmm1
kusano 2b45e8
	addpd	%xmm1,  %xmm4
kusano 2b45e8
	movapd	-14 * SIZE(BB), %xmm1
kusano 2b45e8
	mulpd	%xmm1,  %xmm0
kusano 2b45e8
	addpd	%xmm0,  %xmm5
kusano 2b45e8
	movapd	-14 * SIZE(AA), %xmm0
kusano 2b45e8
	mulpd	%xmm0,  %xmm2
kusano 2b45e8
	addpd	%xmm2,  %xmm6
kusano 2b45e8
	movapd	-12 * SIZE(BB), %xmm2
kusano 2b45e8
	mulpd	%xmm0,  %xmm1
kusano 2b45e8
	movapd	-12 * SIZE(AA), %xmm0
kusano 2b45e8
	addpd	%xmm1,  %xmm7
kusano 2b45e8
kusano 2b45e8
	movapd	%xmm2,  %xmm1
kusano 2b45e8
kusano 2b45e8
	mulpd	%xmm0,  %xmm2
kusano 2b45e8
	addpd	%xmm2,  %xmm4
kusano 2b45e8
	movapd	-10 * SIZE(BB), %xmm2
kusano 2b45e8
	mulpd	%xmm2,  %xmm0
kusano 2b45e8
	addpd	%xmm0,  %xmm5
kusano 2b45e8
	movapd	-10 * SIZE(AA), %xmm0
kusano 2b45e8
	mulpd	%xmm0,  %xmm1
kusano 2b45e8
	addpd	%xmm1,  %xmm6
kusano 2b45e8
	movapd	 -8 * SIZE(BB), %xmm1
kusano 2b45e8
	mulpd	%xmm0,  %xmm2
kusano 2b45e8
kusano 2b45e8
	movapd	  0 * SIZE(AA), %xmm0
kusano 2b45e8
	addpd	%xmm2,  %xmm7
kusano 2b45e8
kusano 2b45e8
	movapd	%xmm1,  %xmm2
kusano 2b45e8
kusano 2b45e8
	mulpd	%xmm3,  %xmm1
kusano 2b45e8
	addpd	%xmm1,  %xmm4
kusano 2b45e8
	movapd	 -6 * SIZE(BB), %xmm1
kusano 2b45e8
	mulpd	%xmm1,  %xmm3
kusano 2b45e8
	addpd	%xmm3,  %xmm5
kusano 2b45e8
	movapd	 -6 * SIZE(AA), %xmm3
kusano 2b45e8
	mulpd	%xmm3,  %xmm2
kusano 2b45e8
	addpd	%xmm2,  %xmm6
kusano 2b45e8
	movapd	 -4 * SIZE(BB), %xmm2
kusano 2b45e8
	mulpd	%xmm3,  %xmm1
kusano 2b45e8
	movapd	 -4 * SIZE(AA), %xmm3
kusano 2b45e8
	addpd	%xmm1,  %xmm7
kusano 2b45e8
kusano 2b45e8
	movapd	%xmm2,  %xmm1
kusano 2b45e8
kusano 2b45e8
	mulpd	%xmm3,  %xmm2
kusano 2b45e8
	addpd	%xmm2,  %xmm4
kusano 2b45e8
	movapd	 -2 * SIZE(BB), %xmm2
kusano 2b45e8
	mulpd	%xmm2,  %xmm3
kusano 2b45e8
	addpd	%xmm3,  %xmm5
kusano 2b45e8
	movapd	 -2 * SIZE(AA), %xmm3
kusano 2b45e8
	mulpd	%xmm3,  %xmm1
kusano 2b45e8
	addpd	%xmm1,  %xmm6
kusano 2b45e8
kusano 2b45e8
	movapd	  0 * SIZE(BB), %xmm1
kusano 2b45e8
	mulpd	%xmm3,  %xmm2
kusano 2b45e8
	movapd	  8 * SIZE(AA), %xmm3
kusano 2b45e8
	addpd	%xmm2,  %xmm7
kusano 2b45e8
kusano 2b45e8
	movapd	%xmm1,  %xmm2
kusano 2b45e8
kusano 2b45e8
	mulpd	%xmm0,  %xmm1
kusano 2b45e8
	addpd	%xmm1,  %xmm4
kusano 2b45e8
	movapd	  2 * SIZE(BB), %xmm1
kusano 2b45e8
	mulpd	%xmm1,  %xmm0
kusano 2b45e8
	addpd	%xmm0,  %xmm5
kusano 2b45e8
	movapd	  2 * SIZE(AA), %xmm0
kusano 2b45e8
	mulpd	%xmm0,  %xmm2
kusano 2b45e8
	addpd	%xmm2,  %xmm6
kusano 2b45e8
	movapd	  4 * SIZE(BB), %xmm2
kusano 2b45e8
	mulpd	%xmm0,  %xmm1
kusano 2b45e8
	movapd	  4 * SIZE(AA), %xmm0
kusano 2b45e8
	addpd	%xmm1,  %xmm7
kusano 2b45e8
kusano 2b45e8
	movapd	%xmm2,  %xmm1
kusano 2b45e8
kusano 2b45e8
	mulpd	%xmm0,  %xmm2
kusano 2b45e8
	addpd	%xmm2,  %xmm4
kusano 2b45e8
	movapd	  6 * SIZE(BB), %xmm2
kusano 2b45e8
	mulpd	%xmm2,  %xmm0
kusano 2b45e8
	addpd	%xmm0,  %xmm5
kusano 2b45e8
	movapd	  6 * SIZE(AA), %xmm0
kusano 2b45e8
	mulpd	%xmm0,  %xmm1
kusano 2b45e8
	addpd	%xmm1,  %xmm6
kusano 2b45e8
	movapd	  8 * SIZE(BB), %xmm1
kusano 2b45e8
	mulpd	%xmm0,  %xmm2
kusano 2b45e8
	movapd	 16 * SIZE(AA), %xmm0
kusano 2b45e8
	addpd	%xmm2,  %xmm7
kusano 2b45e8
kusano 2b45e8
	movapd	%xmm1,  %xmm2
kusano 2b45e8
kusano 2b45e8
	mulpd	%xmm3,  %xmm1
kusano 2b45e8
	addpd	%xmm1,  %xmm4
kusano 2b45e8
	movapd	 10 * SIZE(BB), %xmm1
kusano 2b45e8
	mulpd	%xmm1,  %xmm3
kusano 2b45e8
	addpd	%xmm3,  %xmm5
kusano 2b45e8
	movapd	 10 * SIZE(AA), %xmm3
kusano 2b45e8
	mulpd	%xmm3,  %xmm2
kusano 2b45e8
	addpd	%xmm2,  %xmm6
kusano 2b45e8
	movapd	 12 * SIZE(BB), %xmm2
kusano 2b45e8
	mulpd	%xmm3,  %xmm1
kusano 2b45e8
	movapd	 12 * SIZE(AA), %xmm3
kusano 2b45e8
	addpd	%xmm1,  %xmm7
kusano 2b45e8
kusano 2b45e8
	movapd	%xmm2,  %xmm1
kusano 2b45e8
kusano 2b45e8
	mulpd	%xmm3,  %xmm2
kusano 2b45e8
	addpd	%xmm2,  %xmm4
kusano 2b45e8
	movapd	 14 * SIZE(BB), %xmm2
kusano 2b45e8
	mulpd	%xmm2,  %xmm3
kusano 2b45e8
	subl   $-32 * SIZE, BB
kusano 2b45e8
	addpd	%xmm3,  %xmm5
kusano 2b45e8
	movapd	 14 * SIZE(AA), %xmm3
kusano 2b45e8
	mulpd	%xmm3,  %xmm1
kusano 2b45e8
	addpd	%xmm1,  %xmm6
kusano 2b45e8
	movapd	-16 * SIZE(BB), %xmm1
kusano 2b45e8
	mulpd	%xmm3,  %xmm2
kusano 2b45e8
	movapd	 24 * SIZE(AA), %xmm3
kusano 2b45e8
	addpd	%xmm2,  %xmm7
kusano 2b45e8
kusano 2b45e8
	movapd	%xmm1,  %xmm2
kusano 2b45e8
kusano 2b45e8
	subl   $-32 * SIZE, AA
kusano 2b45e8
	decl   %eax
kusano 2b45e8
kusano 2b45e8
	jne  .L12
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movl	K, %eax
kusano 2b45e8
kusano 2b45e8
	movl	KKK, %eax
kusano 2b45e8
kusano 2b45e8
	movapd	ALPHA,  %xmm3
kusano 2b45e8
kusano 2b45e8
	andl	$7, %eax
kusano 2b45e8
kusano 2b45e8
	je .L18
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	mulpd	%xmm0,  %xmm1
kusano 2b45e8
	addpd	%xmm1,  %xmm4
kusano 2b45e8
	movapd	-14 * SIZE(BB), %xmm1
kusano 2b45e8
	mulpd	%xmm1,  %xmm0
kusano 2b45e8
	addpd	%xmm0,  %xmm5
kusano 2b45e8
	movapd	-14 * SIZE(AA), %xmm0
kusano 2b45e8
	mulpd	%xmm0,  %xmm2
kusano 2b45e8
	mulpd	%xmm0,  %xmm1
kusano 2b45e8
	movapd	-12 * SIZE(AA), %xmm0
kusano 2b45e8
	addpd	%xmm2,  %xmm6
kusano 2b45e8
	addpd	%xmm1,  %xmm7
kusano 2b45e8
	movapd	-12 * SIZE(BB), %xmm1
kusano 2b45e8
	movapd	%xmm1,  %xmm2
kusano 2b45e8
kusano 2b45e8
	addl	$4 * SIZE, AA
kusano 2b45e8
	addl	$4 * SIZE, BB
kusano 2b45e8
	decl	%eax
kusano 2b45e8
	jg	.L16
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	mulpd	%xmm3, %xmm4
kusano 2b45e8
	mulpd	%xmm3, %xmm5
kusano 2b45e8
	mulpd	%xmm3, %xmm6
kusano 2b45e8
	mulpd	%xmm3, %xmm7
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movsd	0 * SIZE(C1), %xmm0
kusano 2b45e8
	movhpd	1 * SIZE(C1), %xmm0
kusano 2b45e8
	movsd	2 * SIZE(C1), %xmm2
kusano 2b45e8
	movhpd	3 * SIZE(C1), %xmm2
kusano 2b45e8
kusano 2b45e8
	movsd	0 * SIZE(C1, LDC), %xmm1
kusano 2b45e8
	movhpd	1 * SIZE(C1, LDC), %xmm1
kusano 2b45e8
	movsd	2 * SIZE(C1, LDC), %xmm3
kusano 2b45e8
	movhpd	3 * SIZE(C1, LDC), %xmm3
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm0, %xmm4
kusano 2b45e8
	addpd	%xmm1, %xmm5
kusano 2b45e8
	addpd	%xmm2, %xmm6
kusano 2b45e8
	addpd	%xmm3, %xmm7
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movsd	%xmm4, 0 * SIZE(C1)
kusano 2b45e8
	movhpd	%xmm4, 1 * SIZE(C1)
kusano 2b45e8
	movsd	%xmm6, 2 * SIZE(C1)
kusano 2b45e8
	movhpd	%xmm6, 3 * SIZE(C1)
kusano 2b45e8
kusano 2b45e8
	movsd	%xmm5, 0 * SIZE(C1, LDC)
kusano 2b45e8
	movhpd	%xmm5, 1 * SIZE(C1, LDC)
kusano 2b45e8
	movsd	%xmm7, 2 * SIZE(C1, LDC)
kusano 2b45e8
	movhpd	%xmm7, 3 * SIZE(C1, LDC)
kusano 2b45e8
kusano 2b45e8
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
kusano 2b45e8
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	movl	K, %eax
kusano 2b45e8
	subl	KKK, %eax
kusano 2b45e8
	leal	(,%eax, SIZE), %eax
kusano 2b45e8
	leal	(AA, %eax, 4), AA
kusano 2b45e8
	leal	(BB, %eax, 4), BB
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#if defined(TRMMKERNEL) && defined(LEFT)
kusano 2b45e8
	addl	$4, KK
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	addl	$4 * SIZE, C1
kusano 2b45e8
	decl	I
kusano 2b45e8
	jg	.L11
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movl	M,  I
kusano 2b45e8
	testl	$2, I
kusano 2b45e8
	jle	.L30
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#if !defined(TRMMKERNEL) || \
kusano 2b45e8
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
kusano 2b45e8
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	leal	16 * SIZE + BUFFER, BB
kusano 2b45e8
kusano 2b45e8
	leal	16 * SIZE + BUFFER, BB
kusano 2b45e8
	movl	KK, %eax
kusano 2b45e8
	leal	(, %eax, SIZE), %eax
kusano 2b45e8
	leal	(AA, %eax, 2), AA
kusano 2b45e8
	leal	(BB, %eax, 4), BB /* because it's doubled */
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movapd	-16 * SIZE(AA), %xmm0
kusano 2b45e8
	pxor	%xmm4, %xmm4
kusano 2b45e8
	movapd	-16 * SIZE(BB), %xmm1
kusano 2b45e8
	pxor	%xmm5, %xmm5
kusano 2b45e8
	movapd	 -8 * SIZE(AA), %xmm2
kusano 2b45e8
	pxor	%xmm6, %xmm6
kusano 2b45e8
	movapd	 -8 * SIZE(BB), %xmm3
kusano 2b45e8
	pxor	%xmm7, %xmm7
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movl	K, %eax
kusano 2b45e8
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
kusano 2b45e8
	movl	K, %eax
kusano 2b45e8
	subl	KK, %eax
kusano 2b45e8
	movl	%eax, KKK	
kusano 2b45e8
kusano 2b45e8
	movl	KK, %eax
kusano 2b45e8
	addl	$2, %eax
kusano 2b45e8
	movl	%eax, KKK
kusano 2b45e8
kusano 2b45e8
	sarl	$3, %eax
kusano 2b45e8
	je	.L25
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	mulpd	%xmm0,  %xmm1
kusano 2b45e8
	mulpd	-14 * SIZE(BB), %xmm0
kusano 2b45e8
	addpd	%xmm1, %xmm4
kusano 2b45e8
	movapd	-12 * SIZE(BB), %xmm1
kusano 2b45e8
	addpd	%xmm0, %xmm5
kusano 2b45e8
	movapd	-14 * SIZE(AA), %xmm0
kusano 2b45e8
	mulpd	%xmm0,  %xmm1
kusano 2b45e8
	mulpd	-10 * SIZE(BB), %xmm0
kusano 2b45e8
	addpd	%xmm1, %xmm6
kusano 2b45e8
	movapd	  0 * SIZE(BB), %xmm1
kusano 2b45e8
	addpd	%xmm0, %xmm7
kusano 2b45e8
	movapd	-12 * SIZE(AA), %xmm0
kusano 2b45e8
	mulpd	%xmm0,  %xmm3
kusano 2b45e8
	mulpd	 -6 * SIZE(BB), %xmm0
kusano 2b45e8
	addpd	%xmm3, %xmm4
kusano 2b45e8
	movapd	 -4 * SIZE(BB), %xmm3
kusano 2b45e8
	addpd	%xmm0, %xmm5
kusano 2b45e8
	movapd	-10 * SIZE(AA), %xmm0
kusano 2b45e8
	mulpd	%xmm0,  %xmm3
kusano 2b45e8
	mulpd	 -2 * SIZE(BB), %xmm0
kusano 2b45e8
	addpd	%xmm3, %xmm6
kusano 2b45e8
	movapd	  8 * SIZE(BB), %xmm3
kusano 2b45e8
	addpd	%xmm0, %xmm7
kusano 2b45e8
	movapd	  0 * SIZE(AA), %xmm0
kusano 2b45e8
	mulpd	%xmm2,  %xmm1
kusano 2b45e8
	mulpd	  2 * SIZE(BB), %xmm2
kusano 2b45e8
	addpd	%xmm1, %xmm4
kusano 2b45e8
	movapd	  4 * SIZE(BB), %xmm1
kusano 2b45e8
	addpd	%xmm2, %xmm5
kusano 2b45e8
	movapd	 -6 * SIZE(AA), %xmm2
kusano 2b45e8
	mulpd	%xmm2,  %xmm1
kusano 2b45e8
	mulpd	  6 * SIZE(BB), %xmm2
kusano 2b45e8
	addpd	%xmm1, %xmm6
kusano 2b45e8
	movapd	 16 * SIZE(BB), %xmm1
kusano 2b45e8
	addpd	%xmm2, %xmm7
kusano 2b45e8
	movapd	 -4 * SIZE(AA), %xmm2
kusano 2b45e8
	mulpd	%xmm2,  %xmm3
kusano 2b45e8
	mulpd	 10 * SIZE(BB), %xmm2
kusano 2b45e8
	addpd	%xmm3, %xmm4
kusano 2b45e8
	movapd	 12 * SIZE(BB), %xmm3
kusano 2b45e8
	addpd	%xmm2, %xmm5
kusano 2b45e8
	movapd	 -2 * SIZE(AA), %xmm2
kusano 2b45e8
	mulpd	%xmm2,  %xmm3
kusano 2b45e8
	mulpd	 14 * SIZE(BB), %xmm2
kusano 2b45e8
	addpd	%xmm3, %xmm6
kusano 2b45e8
	movapd	 24 * SIZE(BB), %xmm3
kusano 2b45e8
	addpd	%xmm2, %xmm7
kusano 2b45e8
	movapd	  8 * SIZE(AA), %xmm2
kusano 2b45e8
kusano 2b45e8
	subl   $-16 * SIZE, AA
kusano 2b45e8
	addl   $ 32 * SIZE, BB
kusano 2b45e8
	decl   %eax
kusano 2b45e8
	jne    .L22
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movapd	ALPHA,  %xmm3
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movl	K, %eax
kusano 2b45e8
kusano 2b45e8
	movl	KKK, %eax
kusano 2b45e8
kusano 2b45e8
	andl	$7, %eax
kusano 2b45e8
kusano 2b45e8
	je .L28
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	mulpd	%xmm0,  %xmm1
kusano 2b45e8
	mulpd	-14 * SIZE(BB), %xmm0
kusano 2b45e8
	addpd	%xmm1, %xmm4
kusano 2b45e8
	movapd	-12 * SIZE(BB), %xmm1
kusano 2b45e8
	addpd	%xmm0, %xmm5
kusano 2b45e8
	movapd	-14 * SIZE(AA), %xmm0
kusano 2b45e8
kusano 2b45e8
	addl	$2 * SIZE, AA
kusano 2b45e8
	addl	$4 * SIZE, BB
kusano 2b45e8
	decl	%eax
kusano 2b45e8
	jg	.L26
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm6, %xmm4
kusano 2b45e8
	addpd	%xmm7, %xmm5
kusano 2b45e8
kusano 2b45e8
	mulpd	%xmm3, %xmm4
kusano 2b45e8
	mulpd	%xmm3, %xmm5
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movsd	0 * SIZE(C1), %xmm0
kusano 2b45e8
	movhpd	1 * SIZE(C1), %xmm0
kusano 2b45e8
kusano 2b45e8
	movsd	0 * SIZE(C1, LDC), %xmm1
kusano 2b45e8
	movhpd	1 * SIZE(C1, LDC), %xmm1
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm0, %xmm4
kusano 2b45e8
	addpd	%xmm1, %xmm5
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movsd	%xmm4, 0 * SIZE(C1)
kusano 2b45e8
	movhpd	%xmm4, 1 * SIZE(C1)
kusano 2b45e8
	movsd	%xmm5, 0 * SIZE(C1, LDC)
kusano 2b45e8
	movhpd	%xmm5, 1 * SIZE(C1, LDC)
kusano 2b45e8
kusano 2b45e8
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
kusano 2b45e8
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	movl	K, %eax
kusano 2b45e8
	subl	KKK, %eax
kusano 2b45e8
	leal	(,%eax, SIZE), %eax
kusano 2b45e8
	leal	(AA, %eax, 2), AA
kusano 2b45e8
	leal	(BB, %eax, 4), BB
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#if defined(TRMMKERNEL) && defined(LEFT)
kusano 2b45e8
	addl	$2, KK
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	addl	$2 * SIZE, C1
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movl	M,  I
kusano 2b45e8
	testl	$1, I
kusano 2b45e8
	jle	.L39
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#if !defined(TRMMKERNEL) || \
kusano 2b45e8
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
kusano 2b45e8
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	leal	16 * SIZE + BUFFER, BB
kusano 2b45e8
kusano 2b45e8
	leal	16 * SIZE + BUFFER, BB
kusano 2b45e8
	movl	KK, %eax
kusano 2b45e8
	leal	(, %eax, SIZE), %eax
kusano 2b45e8
	leal	(AA, %eax, 1), AA
kusano 2b45e8
	leal	(BB, %eax, 4), BB /* because it's doubled */
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movsd	-16 * SIZE(AA), %xmm0
kusano 2b45e8
	pxor	%xmm4, %xmm4
kusano 2b45e8
	movsd	-16 * SIZE(BB), %xmm1
kusano 2b45e8
	pxor	%xmm5, %xmm5
kusano 2b45e8
	movsd	-12 * SIZE(AA), %xmm2
kusano 2b45e8
	pxor	%xmm6, %xmm6
kusano 2b45e8
	movsd	 -8 * SIZE(BB), %xmm3
kusano 2b45e8
	pxor	%xmm7, %xmm7
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movl	K, %eax
kusano 2b45e8
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
kusano 2b45e8
	movl	K, %eax
kusano 2b45e8
	subl	KK, %eax
kusano 2b45e8
	movl	%eax, KKK	
kusano 2b45e8
kusano 2b45e8
	movl	KK, %eax
kusano 2b45e8
#ifdef LEFT
kusano 2b45e8
	addl	$1, %eax
kusano 2b45e8
kusano 2b45e8
	addl	$2, %eax
kusano 2b45e8
kusano 2b45e8
	movl	%eax, KKK
kusano 2b45e8
kusano 2b45e8
	sarl	$3, %eax
kusano 2b45e8
	je	.L35
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	mulsd	%xmm0,  %xmm1
kusano 2b45e8
	mulsd	-14 * SIZE(BB), %xmm0
kusano 2b45e8
	addsd	%xmm1, %xmm4
kusano 2b45e8
 	movsd	-12 * SIZE(BB), %xmm1
kusano 2b45e8
	addsd	%xmm0, %xmm5
kusano 2b45e8
	movsd	-15 * SIZE(AA), %xmm0
kusano 2b45e8
	mulsd	%xmm0,  %xmm1
kusano 2b45e8
	mulsd	-10 * SIZE(BB), %xmm0
kusano 2b45e8
	addsd	%xmm1, %xmm6
kusano 2b45e8
	movsd	  0 * SIZE(BB), %xmm1
kusano 2b45e8
	addsd	%xmm0, %xmm7
kusano 2b45e8
	movsd	-14 * SIZE(AA), %xmm0
kusano 2b45e8
	mulsd	%xmm0,  %xmm3
kusano 2b45e8
	mulsd	 -6 * SIZE(BB), %xmm0
kusano 2b45e8
	addsd	%xmm3, %xmm4
kusano 2b45e8
	movsd	 -4 * SIZE(BB), %xmm3
kusano 2b45e8
	addsd	%xmm0, %xmm5
kusano 2b45e8
	movsd	-13 * SIZE(AA), %xmm0
kusano 2b45e8
	mulsd	%xmm0,  %xmm3
kusano 2b45e8
	mulsd	 -2 * SIZE(BB), %xmm0
kusano 2b45e8
	addsd	%xmm3, %xmm6
kusano 2b45e8
	movsd	  8 * SIZE(BB), %xmm3
kusano 2b45e8
	addsd	%xmm0, %xmm7
kusano 2b45e8
	movsd	 -8 * SIZE(AA), %xmm0
kusano 2b45e8
	mulsd	%xmm2,  %xmm1
kusano 2b45e8
	mulsd	  2 * SIZE(BB), %xmm2
kusano 2b45e8
	addsd	%xmm1, %xmm4
kusano 2b45e8
	movsd	  4 * SIZE(BB), %xmm1
kusano 2b45e8
	addsd	%xmm2, %xmm5
kusano 2b45e8
	movsd	-11 * SIZE(AA), %xmm2
kusano 2b45e8
	mulsd	%xmm2,  %xmm1
kusano 2b45e8
	mulsd	  6 * SIZE(BB), %xmm2
kusano 2b45e8
	addsd	%xmm1, %xmm6
kusano 2b45e8
	movsd	 16 * SIZE(BB), %xmm1
kusano 2b45e8
	addsd	%xmm2, %xmm7
kusano 2b45e8
	movsd	-10 * SIZE(AA), %xmm2
kusano 2b45e8
	mulsd	%xmm2,  %xmm3
kusano 2b45e8
	mulsd	 10 * SIZE(BB), %xmm2
kusano 2b45e8
	addsd	%xmm3, %xmm4
kusano 2b45e8
	movsd	 12 * SIZE(BB), %xmm3
kusano 2b45e8
	addsd	%xmm2, %xmm5
kusano 2b45e8
	movsd	 -9 * SIZE(AA), %xmm2
kusano 2b45e8
	mulsd	%xmm2,  %xmm3
kusano 2b45e8
	mulsd	 14 * SIZE(BB), %xmm2
kusano 2b45e8
	addsd	%xmm3, %xmm6
kusano 2b45e8
	movsd	 24 * SIZE(BB), %xmm3
kusano 2b45e8
	addsd	%xmm2, %xmm7
kusano 2b45e8
	movsd	 -4 * SIZE(AA), %xmm2
kusano 2b45e8
kusano 2b45e8
	subl   $-8 * SIZE, AA
kusano 2b45e8
	addl   $32 * SIZE, BB
kusano 2b45e8
	decl   %eax
kusano 2b45e8
	jne    .L32
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movsd	ALPHA,  %xmm3
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movl	K, %eax
kusano 2b45e8
kusano 2b45e8
	movl	KKK, %eax
kusano 2b45e8
kusano 2b45e8
	andl	$7, %eax
kusano 2b45e8
kusano 2b45e8
	je .L38
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	mulsd	%xmm0,  %xmm1
kusano 2b45e8
	mulsd	-14 * SIZE(BB), %xmm0
kusano 2b45e8
	addsd	%xmm1, %xmm4
kusano 2b45e8
	movsd	-12 * SIZE(BB), %xmm1
kusano 2b45e8
	addsd	%xmm0, %xmm5
kusano 2b45e8
	movsd	-15 * SIZE(AA), %xmm0
kusano 2b45e8
kusano 2b45e8
	addl	$1 * SIZE, AA
kusano 2b45e8
	addl	$4 * SIZE, BB
kusano 2b45e8
	decl	%eax
kusano 2b45e8
	jg	.L36
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	addsd	%xmm6, %xmm4
kusano 2b45e8
	addsd	%xmm7, %xmm5
kusano 2b45e8
kusano 2b45e8
	mulsd	%xmm3, %xmm4
kusano 2b45e8
	mulsd	%xmm3, %xmm5
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movsd	0 * SIZE(C1), %xmm0
kusano 2b45e8
	movsd	0 * SIZE(C1, LDC), %xmm1
kusano 2b45e8
kusano 2b45e8
	addsd	%xmm0, %xmm4
kusano 2b45e8
	addsd	%xmm1, %xmm5
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movsd	%xmm4, 0 * SIZE(C1)
kusano 2b45e8
	movsd	%xmm5, 0 * SIZE(C1, LDC)
kusano 2b45e8
kusano 2b45e8
#if defined(TRMMKERNEL) && defined(LEFT)
kusano 2b45e8
	addl	$1, KK
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#if defined(TRMMKERNEL) && !defined(LEFT)
kusano 2b45e8
	addl	$2, KK
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	leal	(, LDC, 2), %eax
kusano 2b45e8
	addl	%eax, C
kusano 2b45e8
	decl	J
kusano 2b45e8
	jg	.L01
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movl	N, %eax
kusano 2b45e8
	testl	$1, %eax
kusano 2b45e8
	jle	.L999
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	leal	16 * SIZE + BUFFER, BB
kusano 2b45e8
kusano 2b45e8
#if defined(TRMMKERNEL) && defined(LEFT)
kusano 2b45e8
	movl	OFFSET, %eax
kusano 2b45e8
	movl	%eax, KK
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movl	K, %eax
kusano 2b45e8
	sarl	$3, %eax
kusano 2b45e8
	jle	.L45
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movddup	 -16 * SIZE(B), %xmm0
kusano 2b45e8
	movddup	 -15 * SIZE(B), %xmm1
kusano 2b45e8
	movddup	 -14 * SIZE(B), %xmm2
kusano 2b45e8
	movddup	 -13 * SIZE(B), %xmm3
kusano 2b45e8
	movddup	 -12 * SIZE(B), %xmm4
kusano 2b45e8
	movddup	 -11 * SIZE(B), %xmm5
kusano 2b45e8
	movddup	 -10 * SIZE(B), %xmm6
kusano 2b45e8
	movddup	  -9 * SIZE(B), %xmm7
kusano 2b45e8
kusano 2b45e8
	movapd	%xmm0,  -16 * SIZE(BB)
kusano 2b45e8
	movapd	%xmm1,  -14 * SIZE(BB)
kusano 2b45e8
	movapd	%xmm2,  -12 * SIZE(BB)
kusano 2b45e8
	movapd	%xmm3,  -10 * SIZE(BB)
kusano 2b45e8
	movapd	%xmm4,   -8 * SIZE(BB)
kusano 2b45e8
	movapd	%xmm5,   -6 * SIZE(BB)
kusano 2b45e8
	movapd	%xmm6,   -4 * SIZE(BB)
kusano 2b45e8
	movapd	%xmm7,   -2 * SIZE(BB)
kusano 2b45e8
kusano 2b45e8
	addl	$ 8 * SIZE, B
kusano 2b45e8
	addl	$16 * SIZE, BB
kusano 2b45e8
	decl	%eax
kusano 2b45e8
	jne	.L42
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movl	K, %eax
kusano 2b45e8
	andl	$7, %eax
kusano 2b45e8
kusano 2b45e8
	jle	.L50
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movddup	 -16 * SIZE(B), %xmm0
kusano 2b45e8
kusano 2b45e8
	movapd	%xmm0,  -16 * SIZE(BB)
kusano 2b45e8
	addl	$1 * SIZE, B
kusano 2b45e8
	addl	$2 * SIZE, BB
kusano 2b45e8
	decl	%eax
kusano 2b45e8
	jne	.L46
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movl	C, C1
kusano 2b45e8
	movl	A, AA
kusano 2b45e8
	movl	M,  I
kusano 2b45e8
	sarl	$2, I
kusano 2b45e8
	jle	.L60
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#if !defined(TRMMKERNEL) || \
kusano 2b45e8
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
kusano 2b45e8
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	leal	16 * SIZE + BUFFER, BB
kusano 2b45e8
kusano 2b45e8
	leal	16 * SIZE + BUFFER, BB
kusano 2b45e8
	movl	KK, %eax
kusano 2b45e8
	leal	(, %eax, SIZE), %eax
kusano 2b45e8
	leal	(AA, %eax, 4), AA
kusano 2b45e8
	leal	(BB, %eax, 2), BB
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movapd	-16 * SIZE(AA), %xmm0
kusano 2b45e8
	pxor	%xmm4, %xmm4
kusano 2b45e8
	movapd	-16 * SIZE(BB), %xmm1
kusano 2b45e8
	pxor	%xmm5, %xmm5
kusano 2b45e8
	movapd	 -8 * SIZE(AA), %xmm2
kusano 2b45e8
	pxor	%xmm6, %xmm6
kusano 2b45e8
	movapd	 -8 * SIZE(BB), %xmm3
kusano 2b45e8
	pxor	%xmm7, %xmm7
kusano 2b45e8
kusano 2b45e8
	prefetcht0	3 * SIZE(C1)
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movl	K, %eax
kusano 2b45e8
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
kusano 2b45e8
	movl	K, %eax
kusano 2b45e8
	subl	KK, %eax
kusano 2b45e8
	movl	%eax, KKK	
kusano 2b45e8
kusano 2b45e8
	movl	KK, %eax
kusano 2b45e8
#ifdef LEFT
kusano 2b45e8
	addl	$4, %eax
kusano 2b45e8
kusano 2b45e8
	addl	$1, %eax
kusano 2b45e8
kusano 2b45e8
	movl	%eax, KKK
kusano 2b45e8
kusano 2b45e8
	sarl	$3, %eax
kusano 2b45e8
	je	.L55
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	mulpd	%xmm1, %xmm0
kusano 2b45e8
	mulpd	-14 * SIZE(AA), %xmm1
kusano 2b45e8
	addpd	%xmm0, %xmm4
kusano 2b45e8
	movapd	-12 * SIZE(AA), %xmm0
kusano 2b45e8
	addpd	%xmm1, %xmm6
kusano 2b45e8
	movapd	-14 * SIZE(BB), %xmm1
kusano 2b45e8
	mulpd	%xmm1, %xmm0
kusano 2b45e8
	mulpd	-10 * SIZE(AA), %xmm1
kusano 2b45e8
	addpd	%xmm0, %xmm5
kusano 2b45e8
	movapd	  0 * SIZE(AA), %xmm0
kusano 2b45e8
	addpd	%xmm1, %xmm7
kusano 2b45e8
	movapd	-12 * SIZE(BB), %xmm1
kusano 2b45e8
	mulpd	%xmm1, %xmm2
kusano 2b45e8
	mulpd	 -6 * SIZE(AA), %xmm1
kusano 2b45e8
	addpd	%xmm2, %xmm4
kusano 2b45e8
	movapd	 -4 * SIZE(AA), %xmm2
kusano 2b45e8
	addpd	%xmm1, %xmm6
kusano 2b45e8
	movapd	-10 * SIZE(BB), %xmm1
kusano 2b45e8
	mulpd	%xmm1, %xmm2
kusano 2b45e8
	mulpd	 -2 * SIZE(AA), %xmm1
kusano 2b45e8
	addpd	%xmm2, %xmm5
kusano 2b45e8
	movapd	  8 * SIZE(AA), %xmm2
kusano 2b45e8
	addpd	%xmm1, %xmm7
kusano 2b45e8
	movapd	  0 * SIZE(BB), %xmm1
kusano 2b45e8
	mulpd	%xmm3, %xmm0
kusano 2b45e8
	mulpd	  2 * SIZE(AA), %xmm3
kusano 2b45e8
	addpd	%xmm0, %xmm4
kusano 2b45e8
	movapd	  4 * SIZE(AA), %xmm0
kusano 2b45e8
	addpd	%xmm3, %xmm6
kusano 2b45e8
	movapd	 -6 * SIZE(BB), %xmm3
kusano 2b45e8
	mulpd	%xmm3, %xmm0
kusano 2b45e8
	mulpd	  6 * SIZE(AA), %xmm3
kusano 2b45e8
	addpd	%xmm0, %xmm5
kusano 2b45e8
	movapd	 16 * SIZE(AA), %xmm0
kusano 2b45e8
	addpd	%xmm3, %xmm7
kusano 2b45e8
	movapd	 -4 * SIZE(BB), %xmm3
kusano 2b45e8
	mulpd	%xmm3, %xmm2
kusano 2b45e8
	mulpd	 10 * SIZE(AA), %xmm3
kusano 2b45e8
	addpd	%xmm2, %xmm4
kusano 2b45e8
	movapd	 12 * SIZE(AA), %xmm2
kusano 2b45e8
	addpd	%xmm3, %xmm6
kusano 2b45e8
	movapd	 -2 * SIZE(BB), %xmm3
kusano 2b45e8
	mulpd	%xmm3, %xmm2
kusano 2b45e8
	mulpd	 14 * SIZE(AA), %xmm3
kusano 2b45e8
	addpd	%xmm2, %xmm5
kusano 2b45e8
	movapd	 24 * SIZE(AA), %xmm2
kusano 2b45e8
	addpd	%xmm3, %xmm7
kusano 2b45e8
	movapd	  8 * SIZE(BB), %xmm3
kusano 2b45e8
kusano 2b45e8
	addl   $ 32 * SIZE, AA
kusano 2b45e8
	subl   $-16 * SIZE, BB
kusano 2b45e8
	decl   %eax
kusano 2b45e8
	jne    .L52
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movapd	ALPHA,  %xmm3
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movl	K, %eax
kusano 2b45e8
kusano 2b45e8
	movl	KKK, %eax
kusano 2b45e8
kusano 2b45e8
	andl	$7, %eax
kusano 2b45e8
kusano 2b45e8
	je .L58
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	mulpd	%xmm1, %xmm0
kusano 2b45e8
	mulpd	-14 * SIZE(AA), %xmm1
kusano 2b45e8
	addpd	%xmm0, %xmm4
kusano 2b45e8
	movapd	-12 * SIZE(AA), %xmm0
kusano 2b45e8
	addpd	%xmm1, %xmm6
kusano 2b45e8
	movapd	-14 * SIZE(BB), %xmm1
kusano 2b45e8
kusano 2b45e8
	addl	$4 * SIZE, AA
kusano 2b45e8
	addl	$2 * SIZE, BB
kusano 2b45e8
	decl	%eax
kusano 2b45e8
	jg	.L56
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm5, %xmm4
kusano 2b45e8
	addpd	%xmm7, %xmm6
kusano 2b45e8
kusano 2b45e8
	mulpd	%xmm3, %xmm4
kusano 2b45e8
	mulpd	%xmm3, %xmm6
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movsd	0 * SIZE(C1), %xmm0
kusano 2b45e8
	movhpd	1 * SIZE(C1), %xmm0
kusano 2b45e8
	movsd	2 * SIZE(C1), %xmm2
kusano 2b45e8
	movhpd	3 * SIZE(C1), %xmm2
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm0, %xmm4
kusano 2b45e8
	addpd	%xmm2, %xmm6
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movsd	%xmm4, 0 * SIZE(C1)
kusano 2b45e8
	movhpd	%xmm4, 1 * SIZE(C1)
kusano 2b45e8
	movsd	%xmm6, 2 * SIZE(C1)
kusano 2b45e8
	movhpd	%xmm6, 3 * SIZE(C1)
kusano 2b45e8
kusano 2b45e8
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
kusano 2b45e8
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	movl	K, %eax
kusano 2b45e8
	subl	KKK, %eax
kusano 2b45e8
	leal	(,%eax, SIZE), %eax
kusano 2b45e8
	leal	(AA, %eax, 4), AA
kusano 2b45e8
	leal	(BB, %eax, 2), BB
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#if defined(TRMMKERNEL) && defined(LEFT)
kusano 2b45e8
	addl	$4, KK
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	addl	$4 * SIZE, C1
kusano 2b45e8
	decl	I
kusano 2b45e8
	jg	.L51
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	/* M & 2 remainder tile.
	 * Accumulates one packed pair of doubles from the AA and BB streams over
	 * the k loop, scales it by ALPHA and adds it into C1[0..1].  The whole
	 * tile is skipped (branch to .L70) when bit 1 of M is clear.
	 * NOTE(review): this view of the file has lost its label definitions and
	 * its else / endif / opening-if directives, so the preprocessor arms below
	 * are described as they appear to pair up - confirm against the upstream
	 * source before relying on that pairing. */
	movl	M,  I
kusano 2b45e8
	testl	$2, I
kusano 2b45e8
	/* testl clears OF and the masked result cannot be negative, so jle is
	 * taken exactly when the tested bit is zero. */
	jle	.L70
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	/* Stream setup.  When the condition below holds, BB simply starts at
	 * BUFFER + 16 * SIZE. */
#if !defined(TRMMKERNEL) || \
kusano 2b45e8
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
kusano 2b45e8
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	leal	16 * SIZE + BUFFER, BB
kusano 2b45e8
kusano 2b45e8
	/* Presumably the alternative arm (its directive is not visible here):
	 * same BB base, then both streams are moved forward by KK k-steps,
	 * i.e. AA += KK * SIZE * 2 and BB += KK * SIZE * 2. */
	leal	16 * SIZE + BUFFER, BB
kusano 2b45e8
	movl	KK, %eax
kusano 2b45e8
	leal	(, %eax, SIZE), %eax
kusano 2b45e8
	leal	(AA, %eax, 2), AA
kusano 2b45e8
	leal	(BB, %eax, 2), BB
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	/* Preload operands and clear the two accumulators (xmm4, xmm5).
	 * xmm0/xmm1 feed the first four unrolled steps; xmm2/xmm3, loaded from
	 * 8 * SIZE further on, feed the last four. */
	movapd	-16 * SIZE(AA), %xmm0
kusano 2b45e8
	pxor	%xmm4, %xmm4
kusano 2b45e8
	movapd	-16 * SIZE(BB), %xmm1
kusano 2b45e8
	pxor	%xmm5, %xmm5
kusano 2b45e8
	movapd	 -8 * SIZE(AA), %xmm2
kusano 2b45e8
	movapd	 -8 * SIZE(BB), %xmm3
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	/* Trip count into %eax.  The first arm (its opening directive is not
	 * visible) uses K unchanged; the elif arm uses K - KK; the remaining arm
	 * uses KK + 2 when LEFT is defined and apparently KK + 1 otherwise.  The
	 * last two arms also save the count in KKK. */
	movl	K, %eax
kusano 2b45e8
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
kusano 2b45e8
	movl	K, %eax
kusano 2b45e8
	subl	KK, %eax
kusano 2b45e8
	movl	%eax, KKK	
kusano 2b45e8
kusano 2b45e8
	movl	KK, %eax
kusano 2b45e8
#ifdef LEFT
kusano 2b45e8
	addl	$2, %eax
kusano 2b45e8
kusano 2b45e8
	addl	$1, %eax
kusano 2b45e8
kusano 2b45e8
	movl	%eax, KKK
kusano 2b45e8
kusano 2b45e8
	/* Count of full 8-step passes; none means go straight to .L65. */
	sarl	$3, %eax
kusano 2b45e8
	je	.L65
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	/* Unrolled body (branch target of the jne .L62 below): eight k-steps per
	 * pass.  Each step multiplies one 2-double vector from AA by one from BB
	 * and adds the product to xmm4 and xmm5 alternately; the loads for the
	 * following step are interleaved with the arithmetic.
	 * Presumably BUFFER holds every B value twice so the packed multiply
	 * scales both A rows by the same factor - confirm against the packing
	 * code, which is outside this view. */
	mulpd	%xmm0,  %xmm1
kusano 2b45e8
	movapd	-14 * SIZE(AA), %xmm0
kusano 2b45e8
	addpd	%xmm1, %xmm4
kusano 2b45e8
	movapd	-14 * SIZE(BB), %xmm1
kusano 2b45e8
	mulpd	%xmm0,  %xmm1
kusano 2b45e8
	movapd	-12 * SIZE(AA), %xmm0
kusano 2b45e8
	addpd	%xmm1, %xmm5
kusano 2b45e8
	movapd	-12 * SIZE(BB), %xmm1
kusano 2b45e8
	mulpd	%xmm0,  %xmm1
kusano 2b45e8
	movapd	-10 * SIZE(AA), %xmm0
kusano 2b45e8
	addpd	%xmm1, %xmm4
kusano 2b45e8
	movapd	-10 * SIZE(BB), %xmm1
kusano 2b45e8
	mulpd	%xmm0,  %xmm1
kusano 2b45e8
	/* Offset 0 is the first operand of the next pass (pointers move on by
	 * 16 * SIZE at the bottom of the loop). */
	movapd	  0 * SIZE(AA), %xmm0
kusano 2b45e8
	addpd	%xmm1, %xmm5
kusano 2b45e8
	movapd	  0 * SIZE(BB), %xmm1
kusano 2b45e8
	/* Second half of the pass, driven by the xmm2/xmm3 pair. */
	mulpd	%xmm2,  %xmm3
kusano 2b45e8
	movapd	 -6 * SIZE(AA), %xmm2
kusano 2b45e8
	addpd	%xmm3, %xmm4
kusano 2b45e8
	movapd	 -6 * SIZE(BB), %xmm3
kusano 2b45e8
	mulpd	%xmm2,  %xmm3
kusano 2b45e8
	movapd	 -4 * SIZE(AA), %xmm2
kusano 2b45e8
	addpd	%xmm3, %xmm5
kusano 2b45e8
	movapd	 -4 * SIZE(BB), %xmm3
kusano 2b45e8
	mulpd	%xmm2,  %xmm3
kusano 2b45e8
	movapd	 -2 * SIZE(AA), %xmm2
kusano 2b45e8
	addpd	%xmm3, %xmm4
kusano 2b45e8
	movapd	 -2 * SIZE(BB), %xmm3
kusano 2b45e8
	mulpd	%xmm2,  %xmm3
kusano 2b45e8
	movapd	  8 * SIZE(AA), %xmm2
kusano 2b45e8
	addpd	%xmm3, %xmm5
kusano 2b45e8
	movapd	  8 * SIZE(BB), %xmm3
kusano 2b45e8
kusano 2b45e8
	/* Advance both streams by 16 * SIZE, written as subtracting a negative
	 * immediate - presumably so the constant fits a sign-extended byte when
	 * SIZE is 8; confirm. */
	subl   $-16 * SIZE, AA
kusano 2b45e8
	subl   $-16 * SIZE, BB
kusano 2b45e8
	decl   %eax
kusano 2b45e8
	jne    .L62
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	/* Scaling factor for the write-back; xmm3 is free once the unrolled
	 * loop is done. */
	movapd	ALPHA,  %xmm3
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	/* Leftover k-steps: K or KKK (apparently the two arms of a conditional
	 * whose directives are not visible), reduced to the low three bits.
	 * Zero means nothing is left and control goes to .L68. */
	movl	K, %eax
kusano 2b45e8
kusano 2b45e8
	movl	KKK, %eax
kusano 2b45e8
kusano 2b45e8
	andl	$7, %eax
kusano 2b45e8
kusano 2b45e8
	je .L68
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	/* Remainder loop (branch target of the jg .L66 below): one k-step per
	 * iteration, each stream advancing by 2 * SIZE. */
	mulpd	%xmm0,  %xmm1
kusano 2b45e8
	movapd	-14 * SIZE(AA), %xmm0
kusano 2b45e8
	addpd	%xmm1, %xmm4
kusano 2b45e8
	movapd	-14 * SIZE(BB), %xmm1
kusano 2b45e8
kusano 2b45e8
	addl	$2 * SIZE, AA
kusano 2b45e8
	addl	$2 * SIZE, BB
kusano 2b45e8
	decl	%eax
kusano 2b45e8
	jg	.L66
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	/* Fold the two partial sums into xmm4 and scale by ALPHA (xmm3). */
	addpd	%xmm5, %xmm4
kusano 2b45e8
	mulpd	%xmm3, %xmm4
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	/* Read the current C1[0..1] as two separate halves (movsd fills the low
	 * half, movhpd the high half), which places no 16-byte alignment
	 * requirement on C1, then accumulate.
	 * NOTE(review): a TRMM build would presumably not want this read-and-add;
	 * any directive guarding it is not visible in this view - confirm. */
	movsd	0 * SIZE(C1), %xmm0
kusano 2b45e8
	movhpd	1 * SIZE(C1), %xmm0
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm0, %xmm4
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	/* Write the pair back, again one half at a time. */
	movsd	%xmm4, 0 * SIZE(C1)
kusano 2b45e8
	movhpd	%xmm4, 1 * SIZE(C1)
kusano 2b45e8
kusano 2b45e8
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
kusano 2b45e8
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	/* TRMM pointer fix-up: add (K - KKK) * SIZE * 2 to both AA and BB, i.e.
	 * step over the k range this tile did not consume. */
	movl	K, %eax
kusano 2b45e8
	subl	KKK, %eax
kusano 2b45e8
	leal	(,%eax, SIZE), %eax
kusano 2b45e8
	leal	(AA, %eax, 2), AA
kusano 2b45e8
	leal	(BB, %eax, 2), BB
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#if defined(TRMMKERNEL) && defined(LEFT)
kusano 2b45e8
	/* KK moves on by 2, matching the two C elements just written. */
	addl	$2, KK
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	/* Step the C pointer past the two elements handled by this tile. */
	addl	$2 * SIZE, C1
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	/* M & 1 remainder tile.
	 * Scalar version of the k loop: accumulates a single double from the AA
	 * and BB streams, scales it by ALPHA and adds it into C1[0].  Skipped
	 * (branch to .L79) when bit 0 of M is clear.
	 * NOTE(review): this view of the file has lost its label definitions and
	 * its else / endif / opening-if directives, so the preprocessor arms below
	 * are described as they appear to pair up - confirm against the upstream
	 * source before relying on that pairing. */
	movl	M,  I
kusano 2b45e8
	testl	$1, I
kusano 2b45e8
	/* testl clears OF and the masked result cannot be negative, so jle is
	 * taken exactly when the tested bit is zero. */
	jle	.L79
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	/* Stream setup.  When the condition below holds, BB simply starts at
	 * BUFFER + 16 * SIZE. */
#if !defined(TRMMKERNEL) || \
kusano 2b45e8
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
kusano 2b45e8
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	leal	16 * SIZE + BUFFER, BB
kusano 2b45e8
kusano 2b45e8
	/* Presumably the alternative arm (its directive is not visible here):
	 * same BB base, then skip KK k-steps.  Note the different strides:
	 * AA += KK * SIZE * 1 but BB += KK * SIZE * 2. */
	leal	16 * SIZE + BUFFER, BB
kusano 2b45e8
	movl	KK, %eax
kusano 2b45e8
	leal	(, %eax, SIZE), %eax
kusano 2b45e8
	leal	(AA, %eax, 1), AA
kusano 2b45e8
	leal	(BB, %eax, 2), BB
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	/* Preload operands and clear the two accumulators (xmm4, xmm5).
	 * xmm0/xmm1 feed the first four unrolled steps; xmm2/xmm3 feed the last
	 * four, so they start 4 * SIZE further on in AA and 8 * SIZE further on
	 * in BB. */
	movsd	-16 * SIZE(AA), %xmm0
kusano 2b45e8
	pxor	%xmm4, %xmm4
kusano 2b45e8
	movsd	-16 * SIZE(BB), %xmm1
kusano 2b45e8
	pxor	%xmm5, %xmm5
kusano 2b45e8
	movsd	 -8 * SIZE(BB), %xmm3
kusano 2b45e8
	movsd	-12 * SIZE(AA), %xmm2
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	/* Trip count into %eax.  The first arm (its opening directive is not
	 * visible) uses K unchanged; the elif arm uses K - KK; the remaining arm
	 * uses KK + 1.  The last two arms also save the count in KKK. */
	movl	K, %eax
kusano 2b45e8
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
kusano 2b45e8
	movl	K, %eax
kusano 2b45e8
	subl	KK, %eax
kusano 2b45e8
	movl	%eax, KKK	
kusano 2b45e8
kusano 2b45e8
	movl	KK, %eax
kusano 2b45e8
	addl	$1, %eax
kusano 2b45e8
	movl	%eax, KKK
kusano 2b45e8
kusano 2b45e8
	/* Count of full 8-step passes; none means go straight to .L75. */
	sarl	$3, %eax
kusano 2b45e8
	je	.L75
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	/* Unrolled body (branch target of the jne .L72 below): eight scalar
	 * k-steps per pass, products added to xmm4 and xmm5 alternately.  AA is
	 * read at consecutive SIZE offsets while BB is read at every second one
	 * (-16, -14, -12, ...), so only the first double of each BB pair is
	 * used here. */
	mulsd	%xmm0,  %xmm1
kusano 2b45e8
	movsd	-15 * SIZE(AA), %xmm0
kusano 2b45e8
	addsd	%xmm1, %xmm4
kusano 2b45e8
	movsd	-14 * SIZE(BB), %xmm1
kusano 2b45e8
	mulsd	%xmm0,  %xmm1
kusano 2b45e8
	movsd	-14 * SIZE(AA), %xmm0
kusano 2b45e8
	addsd	%xmm1, %xmm5
kusano 2b45e8
	movsd	-12 * SIZE(BB), %xmm1
kusano 2b45e8
	mulsd	%xmm0,  %xmm1
kusano 2b45e8
	movsd	-13 * SIZE(AA), %xmm0
kusano 2b45e8
	addsd	%xmm1, %xmm4
kusano 2b45e8
	movsd	-10 * SIZE(BB), %xmm1
kusano 2b45e8
	mulsd	%xmm0,  %xmm1
kusano 2b45e8
	/* AA offset -8 and BB offset 0 are the first operands of the next pass
	 * (AA moves on by 8 * SIZE and BB by 16 * SIZE at the loop bottom). */
	movsd	 -8 * SIZE(AA), %xmm0
kusano 2b45e8
	addsd	%xmm1, %xmm5
kusano 2b45e8
	movsd	 -0 * SIZE(BB), %xmm1
kusano 2b45e8
	/* Second half of the pass, driven by the xmm2/xmm3 pair. */
	mulsd	%xmm2,  %xmm3
kusano 2b45e8
	movsd	-11 * SIZE(AA), %xmm2
kusano 2b45e8
	addsd	%xmm3, %xmm4
kusano 2b45e8
	movsd	 -6 * SIZE(BB), %xmm3
kusano 2b45e8
	mulsd	%xmm2,  %xmm3
kusano 2b45e8
	movsd	-10 * SIZE(AA), %xmm2
kusano 2b45e8
	addsd	%xmm3, %xmm5
kusano 2b45e8
	movsd	 -4 * SIZE(BB), %xmm3
kusano 2b45e8
	mulsd	%xmm2,  %xmm3
kusano 2b45e8
	movsd	 -9 * SIZE(AA), %xmm2
kusano 2b45e8
	addsd	%xmm3, %xmm4
kusano 2b45e8
	movsd	 -2 * SIZE(BB), %xmm3
kusano 2b45e8
	mulsd	%xmm2,  %xmm3
kusano 2b45e8
	movsd	 -4 * SIZE(AA), %xmm2
kusano 2b45e8
	addsd	%xmm3, %xmm5
kusano 2b45e8
	movsd	  8 * SIZE(BB), %xmm3
kusano 2b45e8
kusano 2b45e8
	/* Advance AA by 8 * SIZE and BB by 16 * SIZE, written as subtracting
	 * negative immediates - presumably an encoding-size choice; confirm. */
	subl   $ -8 * SIZE, AA
kusano 2b45e8
	subl   $-16 * SIZE, BB
kusano 2b45e8
	decl   %eax
kusano 2b45e8
	jne    .L72
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	/* Scaling factor for the write-back (scalar load). */
	movsd	ALPHA,  %xmm3
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	/* Leftover k-steps: K or KKK (apparently the two arms of a conditional
	 * whose directives are not visible), reduced to the low three bits.
	 * Zero means nothing is left and control goes to .L78. */
	movl	K, %eax
kusano 2b45e8
kusano 2b45e8
	movl	KKK, %eax
kusano 2b45e8
kusano 2b45e8
	andl	$7, %eax
kusano 2b45e8
kusano 2b45e8
	je .L78
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	/* Remainder loop (branch target of the jg .L76 below): one scalar k-step
	 * per iteration; AA advances by 1 * SIZE, BB by 2 * SIZE. */
	mulsd	%xmm0,  %xmm1
kusano 2b45e8
	movsd	-15 * SIZE(AA), %xmm0
kusano 2b45e8
	addsd	%xmm1, %xmm4
kusano 2b45e8
	movsd	-14 * SIZE(BB), %xmm1
kusano 2b45e8
kusano 2b45e8
	addl	$1 * SIZE, AA
kusano 2b45e8
	addl	$2 * SIZE, BB
kusano 2b45e8
	decl	%eax
kusano 2b45e8
	jg	.L76
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	/* Fold the two partial sums into xmm4 and scale by ALPHA (xmm3). */
	addsd	%xmm5, %xmm4
kusano 2b45e8
	mulsd	%xmm3, %xmm4
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	/* Accumulate into the existing C1[0].
	 * NOTE(review): a TRMM build would presumably not want this read-and-add;
	 * any directive guarding it is not visible in this view - confirm. */
	movsd	0 * SIZE(C1), %xmm0
kusano 2b45e8
	addsd	%xmm0, %xmm4
kusano 2b45e8
kusano 2b45e8
	/* Write the single result element back. */
	movsd	%xmm4, 0 * SIZE(C1)
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	addl	LDC, C
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movl	OLD_STACK, %esp
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	popl	%ebx
kusano 2b45e8
	popl	%esi
kusano 2b45e8
	popl	%edi
kusano 2b45e8
	popl	%ebp
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8