kusano 2b45e8
kusano 2b45e8
/* Copyright 2009, 2010 The University of Texas at Austin.           */
kusano 2b45e8
/* All rights reserved.                                              */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
/* Redistribution and use in source and binary forms, with or        */
kusano 2b45e8
/* without modification, are permitted provided that the following   */
kusano 2b45e8
/* conditions are met:                                               */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
/*   1. Redistributions of source code must retain the above         */
kusano 2b45e8
/*      copyright notice, this list of conditions and the following  */
kusano 2b45e8
/*      disclaimer.                                                  */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
/*   2. Redistributions in binary form must reproduce the above      */
kusano 2b45e8
/*      copyright notice, this list of conditions and the following  */
kusano 2b45e8
/*      disclaimer in the documentation and/or other materials       */
kusano 2b45e8
/*      provided with the distribution.                              */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
kusano 2b45e8
/*                                                                   */
kusano 2b45e8
/* The views and conclusions contained in the software and           */
kusano 2b45e8
/* documentation are those of the authors and should not be          */
kusano 2b45e8
/* interpreted as representing official policies, either expressed   */
kusano 2b45e8
/* or implied, of The University of Texas at Austin.                 */
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#include "common.h"
kusano 2b45e8
kusano 2b45e8
/* Register aliases used throughout the kernel.                        */
/* OLD_M / OLD_N / OLD_K receive ARG1..ARG3 in the prologue and are     */
/* copied into M / N / K before the registers are reused: AO and BO    */
/* below alias the same %rdi / %rsi, and PREA aliases %rdx.            */
#define OLD_M	%rdi
kusano 2b45e8
#define OLD_N	%rsi
kusano 2b45e8
#define OLD_K	%rdx
kusano 2b45e8
kusano 2b45e8
/* Problem dimensions as used by the loops (loaded from OLD_M/N/K).    */
#define M	%r13
kusano 2b45e8
#define N	%r14
kusano 2b45e8
#define K	%r15
kusano 2b45e8
kusano 2b45e8
/* Base pointers of the A, B and C operands and the C column stride.   */
/* LDC is multiplied by SIZE in the prologue, so afterwards it is a    */
/* byte stride.                                                        */
#define A	%rcx
kusano 2b45e8
#define B	%r8
kusano 2b45e8
#define C	%r9
kusano 2b45e8
#define LDC	%r10
kusano 2b45e8
kusano 2b45e8
/* I   : row-tile loop counter (initialised from M >> 2, decremented). */
/* AO  : running pointer into A (reset from A per column panel).       */
/* BO  : running pointer into B (reset from B per row tile).           */
/* CO1 : pointer to the current C tile, first column (starts at C).    */
/* CO2 : pointer to the second column (C + LDC).                       */
/* BB  : B + (K << (BASE_SHIFT + n)), advanced after each tile.        */
/*       NOTE(review): presumably a prefetch cursor into the next B    */
/*       panel; the prefetch instruction itself is not in this text.   */
#define I	%r11
kusano 2b45e8
#define AO	%rdi
kusano 2b45e8
#define BO	%rsi
kusano 2b45e8
#define	CO1	%rbx
kusano 2b45e8
#define CO2	%rbp
kusano 2b45e8
#define BB	%r12
kusano 2b45e8
kusano 2b45e8
/* PREA is not referenced in the portion of the file visible here.     */
#define PREA	%rdx
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
/* Stack-frame layout.                                                  */
/* NOTE(review): two alternative layouts follow back to back; the       */
/* preprocessor conditional that selects between them is missing from   */
/* this text (presumably non-Windows ABI first, Windows ABI second --   */
/* confirm against the pristine file).                                  */
/*                                                                      */
/* Layout 1: 128-byte frame.  Offsets 0..40 hold the six saved integer  */
/* registers stored by the prologue, so the locals start at 48.         */
/* OLD_LDC / OLD_OFFSET are read from the caller's side of the frame    */
/* (STACKSIZE + 8 and STACKSIZE + 16 from the adjusted %rsp).           */
#define STACKSIZE 128
kusano 2b45e8
kusano 2b45e8
#define OLD_LDC		 8 + STACKSIZE(%rsp)
kusano 2b45e8
#define OLD_OFFSET	16 + STACKSIZE(%rsp)
kusano 2b45e8
kusano 2b45e8
/* Locals: ALPHA = scalar alpha, J = column-panel counter,              */
/* OFFSET / KK / KKK = bookkeeping used by the TRMMKERNEL paths.        */
#define ALPHA	   48(%rsp)
kusano 2b45e8
#define J	   56(%rsp)
kusano 2b45e8
#define OFFSET	   64(%rsp)
kusano 2b45e8
#define KK	   72(%rsp)
kusano 2b45e8
#define KKK	   80(%rsp)
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
/* Layout 2: 512-byte frame.  Here A, B, C, LDC and OFFSET are all      */
/* fetched from the caller's side of the frame.  The prologue stores    */
/* %rdi, %rsi and %xmm6-%xmm15 at offsets 48..208 (the last 16-byte     */
/* slot ends at 224), so the locals start at 224.                       */
#define STACKSIZE 512
kusano 2b45e8
kusano 2b45e8
#define OLD_A		40 + STACKSIZE(%rsp)
kusano 2b45e8
#define OLD_B		48 + STACKSIZE(%rsp)
kusano 2b45e8
#define OLD_C		56 + STACKSIZE(%rsp)
kusano 2b45e8
#define OLD_LDC		64 + STACKSIZE(%rsp)
kusano 2b45e8
#define OLD_OFFSET	72 + STACKSIZE(%rsp)
kusano 2b45e8
kusano 2b45e8
/* Same locals as in layout 1, relocated above the register save area.  */
#define ALPHA	  224(%rsp)
kusano 2b45e8
#define J	  232(%rsp)
kusano 2b45e8
#define OFFSET	  240(%rsp)
kusano 2b45e8
#define KK	  248(%rsp)
kusano 2b45e8
#define KKK	  256(%rsp)
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
/* Prefetch tuning: which prefetch instruction each PREFETCH* macro     */
/* expands to, and the look-ahead distance PREFETCHSIZE.                */
/* NOTE(review): only the NANO test and the PREFETCH guard survive in   */
/* this text; the remaining groups below were presumably each wrapped   */
/* in their own per-CPU conditional (and closed by #endif lines) that   */
/* are no longer visible.  As written the macros would be redefined.    */
/* Of these macros, only PREFETCHW is used in the visible part of the   */
/* kernel (on the C tile, before the k loop of the 4x4 tile).           */
#ifdef NANO
kusano 2b45e8
#define PREFETCHSIZE  (8 * 2 + 4)
kusano 2b45e8
#define PREFETCHW     prefetcht0
kusano 2b45e8
#define PREFETCHB     prefetcht0
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
/* Second tuning group (selecting condition not visible).               */
#define PREFETCHSIZE  (8 * 97 + 4)
kusano 2b45e8
#define PREFETCHB     prefetcht2
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
/* Fallback: PREFETCH defaults to prefetcht0 unless already defined.    */
#ifndef PREFETCH
kusano 2b45e8
#define PREFETCH      prefetcht0
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
/* Remaining defaults (their guarding conditionals are not visible).    */
#define PREFETCHW     prefetcht2
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#define PREFETCHB     prefetcht0
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#define PREFETCHSIZE  (8 * 17 + 4)
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	/* Prologue: reserve the frame and save the registers this kernel   */
	/* overwrites (CO1=%rbx, CO2=%rbp, BB=%r12, M/N/K=%r13-%r15).        */
	subq	$STACKSIZE, %rsp
kusano 2b45e8
kusano 2b45e8
	movq	%rbx,  0(%rsp)
kusano 2b45e8
	movq	%rbp,  8(%rsp)
kusano 2b45e8
	movq	%r12, 16(%rsp)
kusano 2b45e8
	movq	%r13, 24(%rsp)
kusano 2b45e8
	movq	%r14, 32(%rsp)
kusano 2b45e8
	movq	%r15, 40(%rsp)
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	/* Additional saves of %rdi, %rsi and %xmm6-%xmm15 at 48..208.      */
	/* NOTE(review): presumably assembled only for the Windows ABI,     */
	/* where these are callee-saved; the guarding conditional is not    */
	/* visible here.  These offsets match the 512-byte frame layout     */
	/* (locals from 224) and would overlap the locals of the 128-byte   */
	/* layout.                                                          */
	movq	%rdi,    48(%rsp)
kusano 2b45e8
	movq	%rsi,    56(%rsp)
kusano 2b45e8
	movups	%xmm6,   64(%rsp)
kusano 2b45e8
	movups	%xmm7,   80(%rsp)
kusano 2b45e8
	movups	%xmm8,   96(%rsp)
kusano 2b45e8
	movups	%xmm9,  112(%rsp)
kusano 2b45e8
	movups	%xmm10, 128(%rsp)
kusano 2b45e8
	movups	%xmm11, 144(%rsp)
kusano 2b45e8
	movups	%xmm12, 160(%rsp)
kusano 2b45e8
	movups	%xmm13, 176(%rsp)
kusano 2b45e8
	movups	%xmm14, 192(%rsp)
kusano 2b45e8
	movups	%xmm15, 208(%rsp)
kusano 2b45e8
kusano 2b45e8
	/* Bring the arguments into the registers the kernel expects:       */
	/* ARG1..ARG3 (defined outside this file) -> OLD_M/OLD_N/OLD_K,     */
	/* and A, B, C, LDC from their stack slots.                         */
	movq	ARG1,      OLD_M
kusano 2b45e8
	movq	ARG2,      OLD_N
kusano 2b45e8
	movq	ARG3,      OLD_K
kusano 2b45e8
	movq	OLD_A,     A
kusano 2b45e8
	movq	OLD_B,     B
kusano 2b45e8
	movq	OLD_C,     C
kusano 2b45e8
	movq	OLD_LDC,   LDC
kusano 2b45e8
kusano 2b45e8
	/* %r11 temporarily carries the offset argument until it is stored  */
	/* into OFFSET / KK below.                                          */
	movq	OLD_OFFSET, %r11
kusano 2b45e8
kusano 2b45e8
	/* NOTE(review): presumably alpha arrives in %xmm3 on this path and */
	/* in %xmm0 on the other -- confirm against the calling convention. */
	movaps	%xmm3, %xmm0
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	/* Same two loads again: presumably the alternative ABI path whose  */
	/* #else line is missing from this text.                            */
	movq	OLD_LDC,   LDC
kusano 2b45e8
kusano 2b45e8
	movq	OLD_OFFSET, %r11
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	/* Spill the low 64 bits of %xmm0 (alpha) to its stack slot; it is  */
	/* reloaded with movddup before each C tile is scaled.              */
	movlps	 %xmm0, ALPHA
kusano 2b45e8
kusano 2b45e8
	/* Bias the panel pointers: A by +16 elements and B by +17, so the  */
	/* loops address the first elements as -16 * SIZE(AO) and           */
	/* -17 * SIZE(BO).                                                  */
	subq	$-16 * SIZE, A
kusano 2b45e8
	subq	$-17 * SIZE, B
kusano 2b45e8
kusano 2b45e8
	/* Move the dimensions out of %rdi/%rsi/%rdx, which are reused as   */
	/* AO / BO / PREA from here on.                                     */
	movq	OLD_M, M
kusano 2b45e8
	movq	OLD_N, N
kusano 2b45e8
	movq	OLD_K, K
kusano 2b45e8
kusano 2b45e8
	/* LDC = LDC * SIZE: convert the column stride to bytes.            */
	leaq	(, LDC, SIZE), LDC
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	/* Record the offset; when LEFT is not defined KK starts at         */
	/* -OFFSET.  (The matching #endif and any enclosing conditional     */
	/* are not visible in this text.)                                   */
	movq	%r11, OFFSET
kusano 2b45e8
#ifndef LEFT
kusano 2b45e8
	negq	%r11
kusano 2b45e8
kusano 2b45e8
	movq	%r11, KK
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	/* J = N >> 2: number of 4-column panels.  sarq sets the flags, so  */
	/* jle skips the 4-column section when there is no full panel.      */
	movq	N,  J
kusano 2b45e8
	sarq	$2, J
kusano 2b45e8
kusano 2b45e8
	jle	.L40
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#if defined(TRMMKERNEL) && defined(LEFT)
kusano 2b45e8
        movq    OFFSET, %rax
kusano 2b45e8
	movq    %rax, KK
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movq	C, CO1
kusano 2b45e8
	leaq	(C, LDC, 1), CO2
kusano 2b45e8
	movq	A, AO
kusano 2b45e8
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
	salq	$BASE_SHIFT + 2, %rax
kusano 2b45e8
	leaq	(B, %rax), BB
kusano 2b45e8
kusano 2b45e8
	movq	M,  I
kusano 2b45e8
	sarq	$2, I	# i = (m >> 2)
kusano 2b45e8
kusano 2b45e8
	jle	.L20
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#if !defined(TRMMKERNEL) || \
kusano 2b45e8
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
kusano 2b45e8
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
kusano 2b45e8
	movq	B, BO
kusano 2b45e8
kusano 2b45e8
	movq	B, BO
kusano 2b45e8
kusano 2b45e8
	movq	KK, %rax
kusano 2b45e8
	leaq	(, %rax, SIZE), %rax
kusano 2b45e8
	leaq	(AO, %rax, 4), AO
kusano 2b45e8
	leaq	(BO, %rax, 4), BO
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movaps	-16 * SIZE(AO), %xmm0
kusano 2b45e8
	xorpd	%xmm3, %xmm3
kusano 2b45e8
	movaps	-14 * SIZE(AO), %xmm1
kusano 2b45e8
	xorpd	%xmm4, %xmm4
kusano 2b45e8
	movaps	-17 * SIZE(BO), %xmm2
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	xorpd	%xmm5, %xmm5
kusano 2b45e8
	xorpd	%xmm6, %xmm6
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movaps	%xmm4, %xmm8
kusano 2b45e8
	movaps	%xmm4, %xmm9
kusano 2b45e8
kusano 2b45e8
	movaps	%xmm4, %xmm10
kusano 2b45e8
	movaps	%xmm4, %xmm11
kusano 2b45e8
kusano 2b45e8
	PREFETCHW     3 * SIZE(CO1, LDC, 2)
kusano 2b45e8
	movaps	%xmm4, %xmm12
kusano 2b45e8
	movaps	%xmm4, %xmm13
kusano 2b45e8
	PREFETCHW     7 * SIZE(CO2, LDC, 2)
kusano 2b45e8
	movapd	%xmm4, %xmm14
kusano 2b45e8
	movapd	%xmm4, %xmm15
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
	subq	KK, %rax
kusano 2b45e8
	movq	%rax, KKK	
kusano 2b45e8
kusano 2b45e8
	movq	KK, %rax
kusano 2b45e8
#ifdef LEFT
kusano 2b45e8
	addq	$4, %rax
kusano 2b45e8
kusano 2b45e8
	addq	$4, %rax
kusano 2b45e8
kusano 2b45e8
	movq	%rax, KKK
kusano 2b45e8
kusano 2b45e8
	sarq	$2, %rax
kusano 2b45e8
kusano 2b45e8
	jle	.L15
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm3, %xmm11
kusano 2b45e8
	movaps	-15 * SIZE(BO), %xmm3
kusano 2b45e8
	addpd	%xmm4, %xmm15
kusano 2b45e8
	movaps	%xmm2, %xmm4
kusano 2b45e8
	pshufd	$0x4e, %xmm2, %xmm7
kusano 2b45e8
	mulpd	%xmm0, %xmm2
kusano 2b45e8
	mulpd	%xmm1, %xmm4
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm5, %xmm10
kusano 2b45e8
	addpd	%xmm6, %xmm14
kusano 2b45e8
	movaps	%xmm7, %xmm6
kusano 2b45e8
	mulpd	%xmm0, %xmm7
kusano 2b45e8
	mulpd	%xmm1, %xmm6
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm2, %xmm9
kusano 2b45e8
	movaps	-13 * SIZE(BO), %xmm2
kusano 2b45e8
	addpd	%xmm4, %xmm13
kusano 2b45e8
	movaps	%xmm3, %xmm4
kusano 2b45e8
	pshufd	$0x4e, %xmm3, %xmm5
kusano 2b45e8
	mulpd	%xmm0, %xmm3
kusano 2b45e8
	mulpd	%xmm1, %xmm4
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm7, %xmm8
kusano 2b45e8
	addpd	%xmm6, %xmm12
kusano 2b45e8
	movaps	%xmm5, %xmm6
kusano 2b45e8
	mulpd	%xmm0, %xmm5
kusano 2b45e8
	movaps	-12 * SIZE(AO), %xmm0
kusano 2b45e8
	mulpd	%xmm1, %xmm6
kusano 2b45e8
	movaps	-10 * SIZE(AO), %xmm1
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm3, %xmm11
kusano 2b45e8
	movaps	-11 * SIZE(BO), %xmm3
kusano 2b45e8
	addpd	%xmm4, %xmm15
kusano 2b45e8
	movaps	%xmm2, %xmm4
kusano 2b45e8
	pshufd	$0x4e, %xmm2, %xmm7
kusano 2b45e8
	mulpd	%xmm0, %xmm2
kusano 2b45e8
	mulpd	%xmm1, %xmm4
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm5, %xmm10
kusano 2b45e8
	addpd	%xmm6, %xmm14
kusano 2b45e8
	movaps	%xmm7, %xmm6
kusano 2b45e8
	mulpd	%xmm0, %xmm7
kusano 2b45e8
	mulpd	%xmm1, %xmm6
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm2, %xmm9
kusano 2b45e8
	movaps	 -9 * SIZE(BO), %xmm2
kusano 2b45e8
	addpd	%xmm4, %xmm13
kusano 2b45e8
	movaps	%xmm3, %xmm4
kusano 2b45e8
	pshufd	$0x4e, %xmm3, %xmm5
kusano 2b45e8
	mulpd	%xmm0, %xmm3
kusano 2b45e8
	mulpd	%xmm1, %xmm4
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm7, %xmm8
kusano 2b45e8
	addpd	%xmm6, %xmm12
kusano 2b45e8
	movaps	%xmm5, %xmm6
kusano 2b45e8
	mulpd	%xmm0, %xmm5
kusano 2b45e8
	movaps	 -8 * SIZE(AO), %xmm0
kusano 2b45e8
	mulpd	%xmm1, %xmm6
kusano 2b45e8
	movaps	 -6 * SIZE(AO), %xmm1
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm3, %xmm11
kusano 2b45e8
	movaps	 -7 * SIZE(BO), %xmm3
kusano 2b45e8
	addpd	%xmm4, %xmm15
kusano 2b45e8
	movapd	%xmm2, %xmm4
kusano 2b45e8
	pshufd	$0x4e, %xmm2, %xmm7
kusano 2b45e8
	mulpd	%xmm0, %xmm2
kusano 2b45e8
	mulpd	%xmm1, %xmm4
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm5, %xmm10
kusano 2b45e8
	addpd	%xmm6, %xmm14
kusano 2b45e8
	movapd	%xmm7, %xmm6
kusano 2b45e8
	mulpd	%xmm0, %xmm7
kusano 2b45e8
	mulpd	%xmm1, %xmm6
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm2, %xmm9
kusano 2b45e8
	movaps	 -5 * SIZE(BO), %xmm2
kusano 2b45e8
	addpd	%xmm4, %xmm13
kusano 2b45e8
	movaps	%xmm3, %xmm4
kusano 2b45e8
	pshufd	$0x4e, %xmm3, %xmm5
kusano 2b45e8
	mulpd	%xmm0, %xmm3
kusano 2b45e8
	mulpd	%xmm1, %xmm4
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm7, %xmm8
kusano 2b45e8
	addpd	%xmm6, %xmm12
kusano 2b45e8
	movaps	%xmm5, %xmm6
kusano 2b45e8
	mulpd	%xmm0, %xmm5
kusano 2b45e8
	movaps	 -4 * SIZE(AO), %xmm0
kusano 2b45e8
	mulpd	%xmm1, %xmm6
kusano 2b45e8
	movaps	 -2 * SIZE(AO), %xmm1
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm3, %xmm11
kusano 2b45e8
	subq	$-16 * SIZE, AO
kusano 2b45e8
	movaps	 -3 * SIZE(BO), %xmm3
kusano 2b45e8
	addpd	%xmm4, %xmm15
kusano 2b45e8
	movaps	%xmm2, %xmm4
kusano 2b45e8
	pshufd	$0x4e, %xmm2, %xmm7
kusano 2b45e8
	mulpd	%xmm0, %xmm2
kusano 2b45e8
	mulpd	%xmm1, %xmm4
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm5, %xmm10
kusano 2b45e8
	addpd	%xmm6, %xmm14
kusano 2b45e8
	movaps	%xmm7, %xmm6
kusano 2b45e8
	mulpd	%xmm0, %xmm7
kusano 2b45e8
	mulpd	%xmm1, %xmm6
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm2, %xmm9
kusano 2b45e8
	movaps	 -1 * SIZE(BO), %xmm2
kusano 2b45e8
	addpd	%xmm4, %xmm13
kusano 2b45e8
	movaps	%xmm3, %xmm4
kusano 2b45e8
	pshufd	$0x4e, %xmm3, %xmm5
kusano 2b45e8
	subq	$-16 * SIZE, BO
kusano 2b45e8
	mulpd	%xmm0, %xmm3
kusano 2b45e8
	mulpd	%xmm1, %xmm4
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm7, %xmm8
kusano 2b45e8
	addpd	%xmm6, %xmm12
kusano 2b45e8
	movaps	%xmm5, %xmm6
kusano 2b45e8
	mulpd	%xmm0, %xmm5
kusano 2b45e8
	movaps	-16 * SIZE(AO), %xmm0
kusano 2b45e8
	mulpd	%xmm1, %xmm6
kusano 2b45e8
	movaps	-14 * SIZE(AO), %xmm1
kusano 2b45e8
kusano 2b45e8
	subq	$1, %rax
kusano 2b45e8
kusano 2b45e8
	jg	.L12
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	/* Remainder of the k loop for the 4x4 tile: the unrolled loop      */
	/* above consumed k in groups of four (its count was rax >> 2).     */
	/* The two loads below are presumably alternatives (plain GEMM      */
	/* uses K, TRMM uses KKK); the conditional lines that separate      */
	/* them are missing from this text.                                 */
	movq	K, %rax
kusano 2b45e8
kusano 2b45e8
	movq	KKK, %rax
kusano 2b45e8
kusano 2b45e8
	andq	$3, %rax		# rax = k & 3 (leftover iterations)
kusano 2b45e8
kusano 2b45e8
	je	.L18
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm3, %xmm11
kusano 2b45e8
	movaps	-15 * SIZE(BO), %xmm3
kusano 2b45e8
	addpd	%xmm4, %xmm15
kusano 2b45e8
	movaps	%xmm2, %xmm4
kusano 2b45e8
	pshufd	$0x4e, %xmm2, %xmm7
kusano 2b45e8
	mulpd	%xmm0, %xmm2
kusano 2b45e8
	mulpd	%xmm1, %xmm4
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm5, %xmm10
kusano 2b45e8
	addpd	%xmm6, %xmm14
kusano 2b45e8
	movaps	%xmm7, %xmm6
kusano 2b45e8
	mulpd	%xmm0, %xmm7
kusano 2b45e8
	mulpd	%xmm1, %xmm6
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm2, %xmm9
kusano 2b45e8
	movaps	-13 * SIZE(BO), %xmm2
kusano 2b45e8
	addpd	%xmm4, %xmm13
kusano 2b45e8
	movaps	%xmm3, %xmm4
kusano 2b45e8
	pshufd	$0x4e, %xmm3, %xmm5
kusano 2b45e8
	mulpd	%xmm0, %xmm3
kusano 2b45e8
	mulpd	%xmm1, %xmm4
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm7, %xmm8
kusano 2b45e8
	addpd	%xmm6, %xmm12
kusano 2b45e8
	movaps	%xmm5, %xmm6
kusano 2b45e8
	mulpd	%xmm0, %xmm5
kusano 2b45e8
	movaps	-12 * SIZE(AO), %xmm0
kusano 2b45e8
	mulpd	%xmm1, %xmm6
kusano 2b45e8
	movaps	-10 * SIZE(AO), %xmm1
kusano 2b45e8
kusano 2b45e8
	addq	$4 * SIZE, AO
kusano 2b45e8
	addq	$4 * SIZE, BO
kusano 2b45e8
kusano 2b45e8
	subq	$1, %rax
kusano 2b45e8
kusano 2b45e8
	jg	.L16
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movddup	ALPHA, %xmm1
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	subq		 $-16 * SIZE, BB
kusano 2b45e8
kusano 2b45e8
	subq		 $-32 * SIZE, BB
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm3, %xmm11
kusano 2b45e8
	addpd	%xmm4, %xmm15
kusano 2b45e8
	addpd	%xmm5, %xmm10
kusano 2b45e8
	addpd	%xmm6, %xmm14
kusano 2b45e8
kusano 2b45e8
	movaps	 %xmm8, %xmm0
kusano 2b45e8
	movsd    %xmm9, %xmm8
kusano 2b45e8
	mulpd	%xmm1, %xmm8
kusano 2b45e8
	movsd    %xmm0, %xmm9
kusano 2b45e8
	mulpd	%xmm1, %xmm9
kusano 2b45e8
kusano 2b45e8
	movaps	 %xmm10, %xmm0
kusano 2b45e8
	movsd    %xmm11, %xmm10
kusano 2b45e8
	mulpd	%xmm1, %xmm10
kusano 2b45e8
	movsd    %xmm0,  %xmm11
kusano 2b45e8
	mulpd	%xmm1, %xmm11
kusano 2b45e8
kusano 2b45e8
	movaps	 %xmm12, %xmm0
kusano 2b45e8
	movsd    %xmm13, %xmm12
kusano 2b45e8
	mulpd	%xmm1, %xmm12
kusano 2b45e8
	movsd    %xmm0,  %xmm13
kusano 2b45e8
	mulpd	%xmm1, %xmm13
kusano 2b45e8
kusano 2b45e8
	movaps	 %xmm14, %xmm0
kusano 2b45e8
	movsd    %xmm15, %xmm14
kusano 2b45e8
	mulpd	%xmm1, %xmm14
kusano 2b45e8
	movsd    %xmm0,  %xmm15
kusano 2b45e8
	mulpd	%xmm1, %xmm15
kusano 2b45e8
kusano 2b45e8
	/* Alignment test: OR the C tile pointer with the byte stride and   */
	/* check the low four bits.  If either is not a multiple of 16,     */
	/* branch to .L18x; otherwise fall through to the update that uses  */
	/* addpd with memory operands and movaps stores, both of which      */
	/* need 16-byte-aligned addresses.                                  */
	movq	CO1, %rax
kusano 2b45e8
	orq	LDC, %rax
kusano 2b45e8
	testq	$15, %rax
kusano 2b45e8
kusano 2b45e8
	jne	.L18x
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	addpd	0 * SIZE(CO1), %xmm8
kusano 2b45e8
	addpd	2 * SIZE(CO1), %xmm12
kusano 2b45e8
	addpd	0 * SIZE(CO2), %xmm9
kusano 2b45e8
	addpd	2 * SIZE(CO2), %xmm13
kusano 2b45e8
kusano 2b45e8
	addpd	0 * SIZE(CO1, LDC, 2), %xmm10
kusano 2b45e8
	addpd	2 * SIZE(CO1, LDC, 2), %xmm14
kusano 2b45e8
	addpd	0 * SIZE(CO2, LDC, 2), %xmm11
kusano 2b45e8
	addpd	2 * SIZE(CO2, LDC, 2), %xmm15
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movaps	%xmm8,  0 * SIZE(CO1)
kusano 2b45e8
	movaps	%xmm12, 2 * SIZE(CO1)
kusano 2b45e8
	movaps	%xmm9,  0 * SIZE(CO2)
kusano 2b45e8
	movaps	%xmm13, 2 * SIZE(CO2)
kusano 2b45e8
kusano 2b45e8
	movaps	%xmm10, 0 * SIZE(CO1, LDC, 2)
kusano 2b45e8
	movaps	%xmm14, 2 * SIZE(CO1, LDC, 2)
kusano 2b45e8
	movaps	%xmm11, 0 * SIZE(CO2, LDC, 2)
kusano 2b45e8
	movaps	%xmm15, 2 * SIZE(CO2, LDC, 2)
kusano 2b45e8
kusano 2b45e8
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
kusano 2b45e8
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
	subq	KKK, %rax
kusano 2b45e8
	leaq	(,%rax, SIZE), %rax
kusano 2b45e8
	leaq	(AO, %rax, 4), AO
kusano 2b45e8
	leaq	(BO, %rax, 4), BO
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#if defined(TRMMKERNEL) && defined(LEFT)
kusano 2b45e8
	addq	$4, KK
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	addq	$4 * SIZE, CO1		# coffset += 4
kusano 2b45e8
	addq	$4 * SIZE, CO2		# coffset += 4
kusano 2b45e8
kusano 2b45e8
	decq	I			# i --
kusano 2b45e8
kusano 2b45e8
	jg	.L11
kusano 2b45e8
	jmp	.L20
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movsd	0 * SIZE(CO1), %xmm0
kusano 2b45e8
	movhpd	1 * SIZE(CO1), %xmm0
kusano 2b45e8
	movsd	2 * SIZE(CO1), %xmm1
kusano 2b45e8
	movhpd	3 * SIZE(CO1), %xmm1
kusano 2b45e8
	movsd	0 * SIZE(CO2), %xmm2
kusano 2b45e8
	movhpd	1 * SIZE(CO2), %xmm2
kusano 2b45e8
	movsd	2 * SIZE(CO2), %xmm3
kusano 2b45e8
	movhpd	3 * SIZE(CO2), %xmm3
kusano 2b45e8
kusano 2b45e8
	movsd	0 * SIZE(CO1, LDC, 2), %xmm4
kusano 2b45e8
	movhpd	1 * SIZE(CO1, LDC, 2), %xmm4
kusano 2b45e8
	movsd	2 * SIZE(CO1, LDC, 2), %xmm5
kusano 2b45e8
	movhpd	3 * SIZE(CO1, LDC, 2), %xmm5
kusano 2b45e8
	movsd	0 * SIZE(CO2, LDC, 2), %xmm6
kusano 2b45e8
	movhpd	1 * SIZE(CO2, LDC, 2), %xmm6
kusano 2b45e8
	movsd	2 * SIZE(CO2, LDC, 2), %xmm7
kusano 2b45e8
	movhpd	3 * SIZE(CO2, LDC, 2), %xmm7
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm0, %xmm8
kusano 2b45e8
	addpd	%xmm1, %xmm12
kusano 2b45e8
	addpd	%xmm2, %xmm9
kusano 2b45e8
	addpd	%xmm3, %xmm13
kusano 2b45e8
	addpd	%xmm4, %xmm10
kusano 2b45e8
	addpd	%xmm5, %xmm14
kusano 2b45e8
	addpd	%xmm6, %xmm11
kusano 2b45e8
	addpd	%xmm7, %xmm15
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movsd	%xmm8,  0 * SIZE(CO1)
kusano 2b45e8
	movhpd	%xmm8,  1 * SIZE(CO1)
kusano 2b45e8
	movsd	%xmm12, 2 * SIZE(CO1)
kusano 2b45e8
	movhpd	%xmm12, 3 * SIZE(CO1)
kusano 2b45e8
	movsd	%xmm9,  0 * SIZE(CO2)
kusano 2b45e8
	movhpd	%xmm9,  1 * SIZE(CO2)
kusano 2b45e8
	movsd	%xmm13, 2 * SIZE(CO2)
kusano 2b45e8
	movhpd	%xmm13, 3 * SIZE(CO2)
kusano 2b45e8
kusano 2b45e8
	movsd	%xmm10, 0 * SIZE(CO1, LDC, 2)
kusano 2b45e8
	movhpd	%xmm10, 1 * SIZE(CO1, LDC, 2)
kusano 2b45e8
	movsd	%xmm14, 2 * SIZE(CO1, LDC, 2)
kusano 2b45e8
	movhpd	%xmm14, 3 * SIZE(CO1, LDC, 2)
kusano 2b45e8
	movsd	%xmm11, 0 * SIZE(CO2, LDC, 2)
kusano 2b45e8
	movhpd	%xmm11, 1 * SIZE(CO2, LDC, 2)
kusano 2b45e8
	movsd	%xmm15, 2 * SIZE(CO2, LDC, 2)
kusano 2b45e8
	movhpd	%xmm15, 3 * SIZE(CO2, LDC, 2)
kusano 2b45e8
kusano 2b45e8
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
kusano 2b45e8
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
	subq	KKK, %rax
kusano 2b45e8
	leaq	(,%rax, SIZE), %rax
kusano 2b45e8
	leaq	(AO, %rax, 4), AO
kusano 2b45e8
	leaq	(BO, %rax, 4), BO
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#if defined(TRMMKERNEL) && defined(LEFT)
kusano 2b45e8
	addq	$4, KK
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	addq	$4 * SIZE, CO1		# coffset += 4
kusano 2b45e8
	addq	$4 * SIZE, CO2		# coffset += 4
kusano 2b45e8
	decq	I			# i --
kusano 2b45e8
kusano 2b45e8
	jg	.L11
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	testq	$2, M
kusano 2b45e8
kusano 2b45e8
	jle	.L30
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#if !defined(TRMMKERNEL) || \
kusano 2b45e8
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
kusano 2b45e8
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
kusano 2b45e8
	movq	B, BO
kusano 2b45e8
kusano 2b45e8
	movq	B, BO
kusano 2b45e8
kusano 2b45e8
	movq	KK, %rax
kusano 2b45e8
	leaq	(, %rax, SIZE), %rax
kusano 2b45e8
	leaq	(AO, %rax, 2), AO
kusano 2b45e8
	leaq	(BO, %rax, 4), BO
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movaps	-16 * SIZE(AO), %xmm0
kusano 2b45e8
	movaps	-17 * SIZE(BO), %xmm2
kusano 2b45e8
	movaps	-15 * SIZE(BO), %xmm3
kusano 2b45e8
kusano 2b45e8
	xorps	%xmm3, %xmm3
kusano 2b45e8
	xorps	%xmm4, %xmm4
kusano 2b45e8
	xorps	%xmm5, %xmm5
kusano 2b45e8
	xorps	%xmm6, %xmm6
kusano 2b45e8
kusano 2b45e8
	movaps	%xmm3, %xmm8
kusano 2b45e8
	movaps	%xmm3, %xmm9
kusano 2b45e8
	movaps	%xmm3, %xmm10
kusano 2b45e8
	movaps	%xmm3, %xmm11
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
	subq	KK, %rax
kusano 2b45e8
	movq	%rax, KKK	
kusano 2b45e8
kusano 2b45e8
	movq	KK, %rax
kusano 2b45e8
#ifdef LEFT
kusano 2b45e8
	addq	$2, %rax
kusano 2b45e8
kusano 2b45e8
	addq	$4, %rax
kusano 2b45e8
kusano 2b45e8
	movq	%rax, KKK
kusano 2b45e8
kusano 2b45e8
	sarq	$2, %rax
kusano 2b45e8
kusano 2b45e8
	jle	.L25
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm3, %xmm11
kusano 2b45e8
	movaps	-15 * SIZE(BO), %xmm3
kusano 2b45e8
	pshufd	$0x4e, %xmm2, %xmm7
kusano 2b45e8
kusano 2b45e8
	mulpd	%xmm0, %xmm2
kusano 2b45e8
	addpd	%xmm5, %xmm10
kusano 2b45e8
	mulpd	%xmm0, %xmm7
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm2, %xmm9
kusano 2b45e8
	movaps	-13 * SIZE(BO), %xmm2
kusano 2b45e8
	pshufd	$0x4e, %xmm3, %xmm5
kusano 2b45e8
	mulpd	%xmm0, %xmm3
kusano 2b45e8
	addpd	%xmm7, %xmm8
kusano 2b45e8
	mulpd	%xmm0, %xmm5
kusano 2b45e8
	movaps	-14 * SIZE(AO), %xmm0
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm3, %xmm11
kusano 2b45e8
	movaps	-11 * SIZE(BO), %xmm3
kusano 2b45e8
	pshufd	$0x4e, %xmm2, %xmm7
kusano 2b45e8
	mulpd	%xmm0, %xmm2
kusano 2b45e8
	addpd	%xmm5, %xmm10
kusano 2b45e8
	mulpd	%xmm0, %xmm7
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm2, %xmm9
kusano 2b45e8
	movaps	 -9 * SIZE(BO), %xmm2
kusano 2b45e8
	pshufd	$0x4e, %xmm3, %xmm5
kusano 2b45e8
	mulpd	%xmm0, %xmm3
kusano 2b45e8
	addpd	%xmm7, %xmm8
kusano 2b45e8
	mulpd	%xmm0, %xmm5
kusano 2b45e8
	movaps	-12 * SIZE(AO), %xmm0
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm3, %xmm11
kusano 2b45e8
	movaps	 -7 * SIZE(BO), %xmm3
kusano 2b45e8
	pshufd	$0x4e, %xmm2, %xmm7
kusano 2b45e8
	mulpd	%xmm0, %xmm2
kusano 2b45e8
	addpd	%xmm5, %xmm10
kusano 2b45e8
	mulpd	%xmm0, %xmm7
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm2, %xmm9
kusano 2b45e8
	movaps	 -5 * SIZE(BO), %xmm2
kusano 2b45e8
	pshufd	$0x4e, %xmm3, %xmm5
kusano 2b45e8
	mulpd	%xmm0, %xmm3
kusano 2b45e8
	addpd	%xmm7, %xmm8
kusano 2b45e8
	mulpd	%xmm0, %xmm5
kusano 2b45e8
	movaps	-10 * SIZE(AO), %xmm0
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm3, %xmm11
kusano 2b45e8
	movaps	 -3 * SIZE(BO), %xmm3
kusano 2b45e8
	pshufd	$0x4e, %xmm2, %xmm7
kusano 2b45e8
	mulpd	%xmm0, %xmm2
kusano 2b45e8
	addpd	%xmm5, %xmm10
kusano 2b45e8
	mulpd	%xmm0, %xmm7
kusano 2b45e8
	subq	$ -8 * SIZE, AO
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm2, %xmm9
kusano 2b45e8
	movaps	 -1 * SIZE(BO), %xmm2
kusano 2b45e8
	pshufd	$0x4e, %xmm3, %xmm5
kusano 2b45e8
	mulpd	%xmm0, %xmm3
kusano 2b45e8
	addpd	%xmm7, %xmm8
kusano 2b45e8
	mulpd	%xmm0, %xmm5
kusano 2b45e8
	movaps	-16 * SIZE(AO), %xmm0
kusano 2b45e8
kusano 2b45e8
	subq	$-16 * SIZE, BO
kusano 2b45e8
	subq	$1, %rax
kusano 2b45e8
kusano 2b45e8
	jg	.L22
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	/* Remainder of the k loop for the 2x4 tile: the unrolled loop      */
	/* above consumed k in groups of four (its count was rax >> 2).     */
	/* The two loads below are presumably alternatives (plain GEMM      */
	/* uses K, TRMM uses KKK); the conditional lines that separate      */
	/* them are missing from this text.                                 */
	movq	K, %rax
kusano 2b45e8
kusano 2b45e8
	movq	KKK, %rax
kusano 2b45e8
kusano 2b45e8
	andq	$3, %rax		# rax = k & 3 (leftover iterations)
kusano 2b45e8
kusano 2b45e8
	je	.L28
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm3, %xmm11
kusano 2b45e8
	movaps	-15 * SIZE(BO), %xmm3
kusano 2b45e8
	pshufd	$0x4e, %xmm2, %xmm7
kusano 2b45e8
	mulpd	%xmm0, %xmm2
kusano 2b45e8
	addpd	%xmm5, %xmm10
kusano 2b45e8
	mulpd	%xmm0, %xmm7
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm2, %xmm9
kusano 2b45e8
	movaps	-13 * SIZE(BO), %xmm2
kusano 2b45e8
	pshufd	$0x4e, %xmm3, %xmm5
kusano 2b45e8
	mulpd	%xmm0, %xmm3
kusano 2b45e8
	addpd	%xmm7, %xmm8
kusano 2b45e8
	mulpd	%xmm0, %xmm5
kusano 2b45e8
	movaps	-14 * SIZE(AO), %xmm0
kusano 2b45e8
kusano 2b45e8
	addq	$2 * SIZE, AO
kusano 2b45e8
	addq	$4 * SIZE, BO
kusano 2b45e8
kusano 2b45e8
	subq	$1, %rax
kusano 2b45e8
kusano 2b45e8
	jg	.L26
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm3, %xmm11
kusano 2b45e8
	addpd	%xmm5, %xmm10
kusano 2b45e8
kusano 2b45e8
	movddup	ALPHA, %xmm3
kusano 2b45e8
kusano 2b45e8
	movaps	 %xmm8, %xmm0
kusano 2b45e8
	movsd    %xmm9, %xmm8
kusano 2b45e8
	mulpd	%xmm3, %xmm8
kusano 2b45e8
	movsd    %xmm0, %xmm9
kusano 2b45e8
	mulpd	%xmm3, %xmm9
kusano 2b45e8
kusano 2b45e8
	movaps	 %xmm10, %xmm0
kusano 2b45e8
	movsd    %xmm11, %xmm10
kusano 2b45e8
	mulpd	%xmm3, %xmm10
kusano 2b45e8
	movsd    %xmm0,  %xmm11
kusano 2b45e8
	mulpd	%xmm3, %xmm11
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movsd	0 * SIZE(CO1), %xmm0
kusano 2b45e8
	movhpd	1 * SIZE(CO1), %xmm0
kusano 2b45e8
	movsd	0 * SIZE(CO2), %xmm2
kusano 2b45e8
	movhpd	1 * SIZE(CO2), %xmm2
kusano 2b45e8
kusano 2b45e8
	movsd	0 * SIZE(CO1, LDC, 2), %xmm4
kusano 2b45e8
	movhpd	1 * SIZE(CO1, LDC, 2), %xmm4
kusano 2b45e8
	movsd	0 * SIZE(CO2, LDC, 2), %xmm6
kusano 2b45e8
	movhpd	1 * SIZE(CO2, LDC, 2), %xmm6
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm0, %xmm8
kusano 2b45e8
	addpd	%xmm2, %xmm9
kusano 2b45e8
	addpd	%xmm4, %xmm10
kusano 2b45e8
	addpd	%xmm6, %xmm11
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movsd	%xmm8,  0 * SIZE(CO1)
kusano 2b45e8
	movhpd	%xmm8,  1 * SIZE(CO1)
kusano 2b45e8
	movsd	%xmm9,  0 * SIZE(CO2)
kusano 2b45e8
	movhpd	%xmm9,  1 * SIZE(CO2)
kusano 2b45e8
kusano 2b45e8
	movsd	%xmm10, 0 * SIZE(CO1, LDC, 2)
kusano 2b45e8
	movhpd	%xmm10, 1 * SIZE(CO1, LDC, 2)
kusano 2b45e8
	movsd	%xmm11, 0 * SIZE(CO2, LDC, 2)
kusano 2b45e8
	movhpd	%xmm11, 1 * SIZE(CO2, LDC, 2)
kusano 2b45e8
kusano 2b45e8
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
kusano 2b45e8
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
	subq	KKK, %rax
kusano 2b45e8
	leaq	(,%rax, SIZE), %rax
kusano 2b45e8
	leaq	(AO, %rax, 2), AO
kusano 2b45e8
	leaq	(BO, %rax, 4), BO
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#if defined(TRMMKERNEL) && defined(LEFT)
kusano 2b45e8
	addq	$2, KK
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	addq	$2 * SIZE, CO1		# coffset += 2
kusano 2b45e8
	addq	$2 * SIZE, CO2		# coffset += 2
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	testq	$1, M
kusano 2b45e8
kusano 2b45e8
	jle	.L39
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#if !defined(TRMMKERNEL) || \
kusano 2b45e8
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
kusano 2b45e8
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
kusano 2b45e8
	movq	B, BO
kusano 2b45e8
kusano 2b45e8
	movq	B, BO
kusano 2b45e8
kusano 2b45e8
	movq	KK, %rax
kusano 2b45e8
	leaq	(, %rax, SIZE), %rax
kusano 2b45e8
	addq	%rax, AO
kusano 2b45e8
	leaq	(BO, %rax, 4), BO
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movsd	-16 * SIZE(AO), %xmm0
kusano 2b45e8
	movaps	-17 * SIZE(BO), %xmm2
kusano 2b45e8
	movaps	-15 * SIZE(BO), %xmm3
kusano 2b45e8
kusano 2b45e8
	xorps	%xmm8,  %xmm8
kusano 2b45e8
	xorps	%xmm9,  %xmm9
kusano 2b45e8
	xorps	%xmm10, %xmm10
kusano 2b45e8
	xorps	%xmm11, %xmm11
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
	subq	KK, %rax
kusano 2b45e8
	movq	%rax, KKK	
kusano 2b45e8
kusano 2b45e8
	movq	KK, %rax
kusano 2b45e8
#ifdef LEFT
kusano 2b45e8
	addq	$1, %rax
kusano 2b45e8
kusano 2b45e8
	addq	$4, %rax
kusano 2b45e8
kusano 2b45e8
	movq	%rax, KKK
kusano 2b45e8
kusano 2b45e8
	sarq	$2, %rax
kusano 2b45e8
kusano 2b45e8
	jle	.L35
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	shufps	$0x44, %xmm0, %xmm0
kusano 2b45e8
	mulpd	%xmm0, %xmm2
kusano 2b45e8
	mulpd	%xmm0, %xmm3
kusano 2b45e8
	movsd	-15 * SIZE(AO), %xmm0
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm2, %xmm8
kusano 2b45e8
	movaps	-13 * SIZE(BO), %xmm2
kusano 2b45e8
	addpd	%xmm3, %xmm9
kusano 2b45e8
	movaps	-11 * SIZE(BO), %xmm3
kusano 2b45e8
kusano 2b45e8
	shufps	$0x44, %xmm0, %xmm0
kusano 2b45e8
	mulpd	%xmm0, %xmm2
kusano 2b45e8
	mulpd	%xmm0, %xmm3
kusano 2b45e8
	movsd	-14 * SIZE(AO), %xmm0
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm2, %xmm10
kusano 2b45e8
	movaps	 -9 * SIZE(BO), %xmm2
kusano 2b45e8
	addpd	%xmm3, %xmm11
kusano 2b45e8
	movaps	 -7 * SIZE(BO), %xmm3
kusano 2b45e8
kusano 2b45e8
	shufps	$0x44, %xmm0, %xmm0
kusano 2b45e8
	mulpd	%xmm0, %xmm2
kusano 2b45e8
	mulpd	%xmm0, %xmm3
kusano 2b45e8
	movsd	-13 * SIZE(AO), %xmm0
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm2, %xmm8
kusano 2b45e8
	movaps	 -5 * SIZE(BO), %xmm2
kusano 2b45e8
	addpd	%xmm3, %xmm9
kusano 2b45e8
	movaps	 -3 * SIZE(BO), %xmm3
kusano 2b45e8
kusano 2b45e8
	shufps	$0x44, %xmm0, %xmm0
kusano 2b45e8
	mulpd	%xmm0, %xmm2
kusano 2b45e8
	mulpd	%xmm0, %xmm3
kusano 2b45e8
	movsd	-12 * SIZE(AO), %xmm0
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm2, %xmm10
kusano 2b45e8
	movaps	 -1 * SIZE(BO), %xmm2
kusano 2b45e8
	addpd	%xmm3, %xmm11
kusano 2b45e8
	movaps	  1 * SIZE(BO), %xmm3
kusano 2b45e8
kusano 2b45e8
	subq	$ -4 * SIZE, AO
kusano 2b45e8
	subq	$-16 * SIZE, BO
kusano 2b45e8
	subq	$1, %rax
kusano 2b45e8
kusano 2b45e8
	jg	.L32
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	/* Remainder of the k loop for the 1x4 tile: the unrolled loop      */
	/* above consumed k in groups of four (its count was rax >> 2).     */
	/* The two loads below are presumably alternatives (plain GEMM      */
	/* uses K, TRMM uses KKK); the conditional lines that separate      */
	/* them are missing from this text.                                 */
	movq	K, %rax
kusano 2b45e8
kusano 2b45e8
	movq	KKK, %rax
kusano 2b45e8
kusano 2b45e8
	andq	$3, %rax		# rax = k & 3 (leftover iterations)
kusano 2b45e8
kusano 2b45e8
	je	.L38
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	shufps	$0x44, %xmm0, %xmm0
kusano 2b45e8
	mulpd	%xmm0, %xmm2
kusano 2b45e8
	mulpd	%xmm0, %xmm3
kusano 2b45e8
	movsd	-15 * SIZE(AO), %xmm0
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm2, %xmm8
kusano 2b45e8
	movaps	-13 * SIZE(BO), %xmm2
kusano 2b45e8
	addpd	%xmm3, %xmm9
kusano 2b45e8
	movaps	-11 * SIZE(BO), %xmm3
kusano 2b45e8
kusano 2b45e8
	addq	$1 * SIZE, AO
kusano 2b45e8
	addq	$4 * SIZE, BO
kusano 2b45e8
kusano 2b45e8
	subq	$1, %rax
kusano 2b45e8
kusano 2b45e8
	jg	.L36
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movddup	ALPHA, %xmm3
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm10, %xmm8
kusano 2b45e8
	addpd	%xmm11, %xmm9
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movsd	0 * SIZE(CO1), %xmm0
kusano 2b45e8
	movhpd	0 * SIZE(CO2), %xmm0
kusano 2b45e8
kusano 2b45e8
	movsd	0 * SIZE(CO1, LDC, 2), %xmm1
kusano 2b45e8
	movhpd	0 * SIZE(CO2, LDC, 2), %xmm1
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	mulpd	%xmm3, %xmm8
kusano 2b45e8
	mulpd	%xmm3, %xmm9
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm0, %xmm8
kusano 2b45e8
	addpd	%xmm1, %xmm9
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movlpd	%xmm8,  0 * SIZE(CO1)
kusano 2b45e8
	movhpd	%xmm8,  0 * SIZE(CO2)
kusano 2b45e8
kusano 2b45e8
	movlpd	%xmm9, 0 * SIZE(CO1, LDC, 2)
kusano 2b45e8
	movhpd	%xmm9, 0 * SIZE(CO2, LDC, 2)
kusano 2b45e8
kusano 2b45e8
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
kusano 2b45e8
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
	subq	KKK, %rax
kusano 2b45e8
	leaq	(,%rax, SIZE), %rax
kusano 2b45e8
	addq	%rax, AO
kusano 2b45e8
	leaq	(BO, %rax, 4), BO
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#if defined(TRMMKERNEL) && defined(LEFT)
kusano 2b45e8
	addq	$1, KK
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
/* End of one 4-column panel.  For the right-side TRMM case KK moves   */
/* on by the panel width (4).  The matching #endif is not visible in   */
/* this text.                                                          */
#if defined(TRMMKERNEL) && !defined(LEFT)
kusano 2b45e8
	addq	$4, KK
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	/* B continues from where BO stopped (presumably the start of the   */
	/* next packed B panel -- confirm).                                 */
	movq	BO, B
kusano 2b45e8
kusano 2b45e8
	/* C += 4 * LDC: step four columns (LDC was scaled to bytes in the  */
	/* prologue).                                                       */
	leaq	(C, LDC, 4), C
kusano 2b45e8
kusano 2b45e8
	/* One panel done; loop while panels remain.                        */
	subq	$1, J
kusano 2b45e8
kusano 2b45e8
	jg	.L01
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	testq	$2, N
kusano 2b45e8
kusano 2b45e8
	jle	.L80
kusano 2b45e8
kusano 2b45e8
	movq	C, CO1
kusano 2b45e8
	leaq	(C, LDC, 1), CO2
kusano 2b45e8
	movq	A, AO
kusano 2b45e8
kusano 2b45e8
#if defined(TRMMKERNEL) && defined(LEFT)
kusano 2b45e8
        movq    OFFSET, %rax
kusano 2b45e8
	movq    %rax, KK
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
	salq	$BASE_SHIFT + 1, %rax
kusano 2b45e8
	leaq	(B, %rax), BB
kusano 2b45e8
kusano 2b45e8
	movq	M,  I
kusano 2b45e8
	sarq	$2, I	# i = (m >> 2)
kusano 2b45e8
kusano 2b45e8
	jle	.L60
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#if !defined(TRMMKERNEL) || \
kusano 2b45e8
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
kusano 2b45e8
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
kusano 2b45e8
	movq	B, BO
kusano 2b45e8
kusano 2b45e8
	movq	B, BO
kusano 2b45e8
kusano 2b45e8
	movq	KK, %rax
kusano 2b45e8
	leaq	(, %rax, SIZE), %rax
kusano 2b45e8
	leaq	(AO, %rax, 4), AO
kusano 2b45e8
	leaq	(BO, %rax, 2), BO
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	subq		 $-4 * SIZE, BB
kusano 2b45e8
kusano 2b45e8
	movaps	-16 * SIZE(AO), %xmm0
kusano 2b45e8
	movaps	-14 * SIZE(AO), %xmm1
kusano 2b45e8
	movaps	-17 * SIZE(BO), %xmm2
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	xorps	%xmm8,  %xmm8
kusano 2b45e8
	xorps	%xmm9,  %xmm9
kusano 2b45e8
kusano 2b45e8
	xorps	%xmm12, %xmm12
kusano 2b45e8
	xorps	%xmm13, %xmm13
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
	subq	KK, %rax
kusano 2b45e8
	movq	%rax, KKK	
kusano 2b45e8
kusano 2b45e8
	movq	KK, %rax
kusano 2b45e8
#ifdef LEFT
kusano 2b45e8
	addq	$4, %rax
kusano 2b45e8
kusano 2b45e8
	addq	$2, %rax
kusano 2b45e8
kusano 2b45e8
	movq	%rax, KKK
kusano 2b45e8
kusano 2b45e8
	sarq	$2, %rax
kusano 2b45e8
kusano 2b45e8
	jle	.L55
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movaps	%xmm2, %xmm4
kusano 2b45e8
	pshufd	$0x4e, %xmm2, %xmm7
kusano 2b45e8
	mulpd	%xmm0, %xmm2
kusano 2b45e8
kusano 2b45e8
	mulpd	%xmm1, %xmm4
kusano 2b45e8
kusano 2b45e8
	movaps	%xmm7, %xmm6
kusano 2b45e8
	mulpd	%xmm0, %xmm7
kusano 2b45e8
	movaps	-12 * SIZE(AO), %xmm0
kusano 2b45e8
	mulpd	%xmm1, %xmm6
kusano 2b45e8
	movaps	-10 * SIZE(AO), %xmm1
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm2, %xmm9
kusano 2b45e8
	movaps	-15 * SIZE(BO), %xmm2
kusano 2b45e8
	addpd	%xmm4, %xmm13
kusano 2b45e8
	addpd	%xmm7, %xmm8
kusano 2b45e8
	addpd	%xmm6, %xmm12
kusano 2b45e8
kusano 2b45e8
	movaps	%xmm2, %xmm4
kusano 2b45e8
	pshufd	$0x4e, %xmm2, %xmm7
kusano 2b45e8
	mulpd	%xmm0, %xmm2
kusano 2b45e8
kusano 2b45e8
	mulpd	%xmm1, %xmm4
kusano 2b45e8
kusano 2b45e8
	movaps	%xmm7, %xmm6
kusano 2b45e8
	mulpd	%xmm0, %xmm7
kusano 2b45e8
	movaps	 -8 * SIZE(AO), %xmm0
kusano 2b45e8
	mulpd	%xmm1, %xmm6
kusano 2b45e8
	movaps	 -6 * SIZE(AO), %xmm1
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm2, %xmm9
kusano 2b45e8
	movaps	-13 * SIZE(BO), %xmm2
kusano 2b45e8
	addpd	%xmm4, %xmm13
kusano 2b45e8
	addpd	%xmm7, %xmm8
kusano 2b45e8
	addpd	%xmm6, %xmm12
kusano 2b45e8
kusano 2b45e8
	movaps	%xmm2, %xmm4
kusano 2b45e8
	pshufd	$0x4e, %xmm2, %xmm7
kusano 2b45e8
	mulpd	%xmm0, %xmm2
kusano 2b45e8
	mulpd	%xmm1, %xmm4
kusano 2b45e8
kusano 2b45e8
	movaps	%xmm7, %xmm6
kusano 2b45e8
	mulpd	%xmm0, %xmm7
kusano 2b45e8
	movaps	 -4 * SIZE(AO), %xmm0
kusano 2b45e8
	mulpd	%xmm1, %xmm6
kusano 2b45e8
	movaps	 -2 * SIZE(AO), %xmm1
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm2, %xmm9
kusano 2b45e8
	movaps	-11 * SIZE(BO), %xmm2
kusano 2b45e8
	addpd	%xmm4, %xmm13
kusano 2b45e8
	addpd	%xmm7, %xmm8
kusano 2b45e8
	addpd	%xmm6, %xmm12
kusano 2b45e8
kusano 2b45e8
	movaps	%xmm2, %xmm4
kusano 2b45e8
	pshufd	$0x4e, %xmm2, %xmm7
kusano 2b45e8
	mulpd	%xmm0, %xmm2
kusano 2b45e8
	mulpd	%xmm1, %xmm4
kusano 2b45e8
kusano 2b45e8
	movaps	%xmm7, %xmm6
kusano 2b45e8
	mulpd	%xmm0, %xmm7
kusano 2b45e8
	movaps	  0 * SIZE(AO), %xmm0
kusano 2b45e8
	mulpd	%xmm1, %xmm6
kusano 2b45e8
	movaps	  2 * SIZE(AO), %xmm1
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm2, %xmm9
kusano 2b45e8
	movaps	 -9 * SIZE(BO), %xmm2
kusano 2b45e8
	addpd	%xmm4, %xmm13
kusano 2b45e8
	addpd	%xmm7, %xmm8
kusano 2b45e8
	addpd	%xmm6, %xmm12
kusano 2b45e8
kusano 2b45e8
	subq	$-16 * SIZE, AO
kusano 2b45e8
	subq	$ -8 * SIZE, BO
kusano 2b45e8
	subq	$1, %rax
kusano 2b45e8
kusano 2b45e8
	jg	.L52
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
kusano 2b45e8
	movq	KKK, %rax
kusano 2b45e8
kusano 2b45e8
	andq	$3, %rax		# rax &= 3: single-step iterations left after the 4x-unrolled loop
kusano 2b45e8
kusano 2b45e8
	je	.L58
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movaps	%xmm2, %xmm4
kusano 2b45e8
	pshufd	$0x4e, %xmm2, %xmm7
kusano 2b45e8
	mulpd	%xmm0, %xmm2
kusano 2b45e8
	mulpd	%xmm1, %xmm4
kusano 2b45e8
kusano 2b45e8
	movaps	%xmm7, %xmm6
kusano 2b45e8
	mulpd	%xmm0, %xmm7
kusano 2b45e8
	movaps	-12 * SIZE(AO), %xmm0
kusano 2b45e8
	mulpd	%xmm1, %xmm6
kusano 2b45e8
	movaps	-10 * SIZE(AO), %xmm1
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm2, %xmm9
kusano 2b45e8
	movaps	-15 * SIZE(BO), %xmm2
kusano 2b45e8
	addpd	%xmm4, %xmm13
kusano 2b45e8
	addpd	%xmm7, %xmm8
kusano 2b45e8
	addpd	%xmm6, %xmm12
kusano 2b45e8
kusano 2b45e8
	addq	$4 * SIZE, AO
kusano 2b45e8
	addq	$2 * SIZE, BO
kusano 2b45e8
kusano 2b45e8
	subq	$1, %rax
kusano 2b45e8
kusano 2b45e8
	jg	.L56
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movddup	ALPHA, %xmm3
kusano 2b45e8
kusano 2b45e8
	movaps	 %xmm8, %xmm0
kusano 2b45e8
	movsd    %xmm9, %xmm8
kusano 2b45e8
	mulpd	%xmm3, %xmm8
kusano 2b45e8
	movsd    %xmm0, %xmm9
kusano 2b45e8
	mulpd	%xmm3, %xmm9
kusano 2b45e8
kusano 2b45e8
	movaps	 %xmm12, %xmm0
kusano 2b45e8
	movsd    %xmm13, %xmm12
kusano 2b45e8
	mulpd	%xmm3, %xmm12
kusano 2b45e8
	movsd    %xmm0,  %xmm13
kusano 2b45e8
	mulpd	%xmm3, %xmm13
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movsd	0 * SIZE(CO1), %xmm0
kusano 2b45e8
	movhpd	1 * SIZE(CO1), %xmm0
kusano 2b45e8
	movsd	2 * SIZE(CO1), %xmm1
kusano 2b45e8
	movhpd	3 * SIZE(CO1), %xmm1
kusano 2b45e8
	movsd	0 * SIZE(CO2), %xmm2
kusano 2b45e8
	movhpd	1 * SIZE(CO2), %xmm2
kusano 2b45e8
	movsd	2 * SIZE(CO2), %xmm3
kusano 2b45e8
	movhpd	3 * SIZE(CO2), %xmm3
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm0, %xmm8
kusano 2b45e8
	addpd	%xmm1, %xmm12
kusano 2b45e8
	addpd	%xmm2, %xmm9
kusano 2b45e8
	addpd	%xmm3, %xmm13
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movsd	%xmm8,  0 * SIZE(CO1)
kusano 2b45e8
	movhpd	%xmm8,  1 * SIZE(CO1)
kusano 2b45e8
	movsd	%xmm12, 2 * SIZE(CO1)
kusano 2b45e8
	movhpd	%xmm12, 3 * SIZE(CO1)
kusano 2b45e8
	movsd	%xmm9,  0 * SIZE(CO2)
kusano 2b45e8
	movhpd	%xmm9,  1 * SIZE(CO2)
kusano 2b45e8
	movsd	%xmm13, 2 * SIZE(CO2)
kusano 2b45e8
	movhpd	%xmm13, 3 * SIZE(CO2)
kusano 2b45e8
kusano 2b45e8
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
kusano 2b45e8
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
	subq	KKK, %rax
kusano 2b45e8
	leaq	(,%rax, SIZE), %rax
kusano 2b45e8
	leaq	(AO, %rax, 4), AO
kusano 2b45e8
	leaq	(BO, %rax, 2), BO
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#if defined(TRMMKERNEL) && defined(LEFT)
kusano 2b45e8
	addq	$4, KK
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	addq	$4 * SIZE, CO1
kusano 2b45e8
	addq	$4 * SIZE, CO2
kusano 2b45e8
	decq	I
kusano 2b45e8
kusano 2b45e8
	jg	.L51
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	testq	$2, M
kusano 2b45e8
kusano 2b45e8
	jle	.L70
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#if !defined(TRMMKERNEL) || \
kusano 2b45e8
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
kusano 2b45e8
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
kusano 2b45e8
	movq	B, BO
kusano 2b45e8
kusano 2b45e8
	movq	B, BO
kusano 2b45e8
kusano 2b45e8
	movq	KK, %rax
kusano 2b45e8
	leaq	(, %rax, SIZE), %rax
kusano 2b45e8
	leaq	(AO, %rax, 2), AO
kusano 2b45e8
	leaq	(BO, %rax, 2), BO
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movaps	-16 * SIZE(AO), %xmm0
kusano 2b45e8
	xorps	%xmm8,  %xmm8
kusano 2b45e8
	xorps	%xmm9,  %xmm9
kusano 2b45e8
	movaps	-17 * SIZE(BO), %xmm2
kusano 2b45e8
	xorps	%xmm10, %xmm10
kusano 2b45e8
	xorps	%xmm11, %xmm11
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
	subq	KK, %rax
kusano 2b45e8
	movq	%rax, KKK	
kusano 2b45e8
kusano 2b45e8
	movq	KK, %rax
kusano 2b45e8
#ifdef LEFT
kusano 2b45e8
	addq	$2, %rax
kusano 2b45e8
kusano 2b45e8
	addq	$2, %rax
kusano 2b45e8
kusano 2b45e8
	movq	%rax, KKK
kusano 2b45e8
kusano 2b45e8
	sarq	$2, %rax
kusano 2b45e8
kusano 2b45e8
	jle	.L65
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	pshufd	$0x4e, %xmm2, %xmm7
kusano 2b45e8
	mulpd	%xmm0, %xmm2
kusano 2b45e8
	mulpd	%xmm0, %xmm7
kusano 2b45e8
	movaps	-14 * SIZE(AO), %xmm0
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm2, %xmm9
kusano 2b45e8
	addpd	%xmm7, %xmm8
kusano 2b45e8
	movaps	-15 * SIZE(BO), %xmm2
kusano 2b45e8
kusano 2b45e8
	pshufd	$0x4e, %xmm2, %xmm7
kusano 2b45e8
	mulpd	%xmm0, %xmm2
kusano 2b45e8
	mulpd	%xmm0, %xmm7
kusano 2b45e8
	movaps	-12 * SIZE(AO), %xmm0
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm2, %xmm11
kusano 2b45e8
	addpd	%xmm7, %xmm10
kusano 2b45e8
	movaps	-13 * SIZE(BO), %xmm2
kusano 2b45e8
kusano 2b45e8
	pshufd	$0x4e, %xmm2, %xmm7
kusano 2b45e8
	mulpd	%xmm0, %xmm2
kusano 2b45e8
	mulpd	%xmm0, %xmm7
kusano 2b45e8
	movaps	-10 * SIZE(AO), %xmm0
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm2, %xmm9
kusano 2b45e8
	addpd	%xmm7, %xmm8
kusano 2b45e8
	movaps	-11 * SIZE(BO), %xmm2
kusano 2b45e8
kusano 2b45e8
	pshufd	$0x4e, %xmm2, %xmm7
kusano 2b45e8
	mulpd	%xmm0, %xmm2
kusano 2b45e8
	mulpd	%xmm0, %xmm7
kusano 2b45e8
	movaps	 -8 * SIZE(AO), %xmm0
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm2, %xmm11
kusano 2b45e8
	addpd	%xmm7, %xmm10
kusano 2b45e8
	movaps	 -9 * SIZE(BO), %xmm2
kusano 2b45e8
kusano 2b45e8
	subq	$-8 * SIZE, AO
kusano 2b45e8
	subq	$-8 * SIZE, BO
kusano 2b45e8
	subq	$1, %rax
kusano 2b45e8
kusano 2b45e8
	jg	.L62
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
kusano 2b45e8
	movq	KKK, %rax
kusano 2b45e8
kusano 2b45e8
	andq	$3, %rax		# rax &= 3: single-step iterations left after the 4x-unrolled loop
kusano 2b45e8
kusano 2b45e8
	je	.L68
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	pshufd	$0x4e, %xmm2, %xmm7
kusano 2b45e8
	mulpd	%xmm0, %xmm2
kusano 2b45e8
	mulpd	%xmm0, %xmm7
kusano 2b45e8
	movaps	-14 * SIZE(AO), %xmm0
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm2, %xmm9
kusano 2b45e8
	addpd	%xmm7, %xmm8
kusano 2b45e8
	movaps	-15 * SIZE(BO), %xmm2
kusano 2b45e8
kusano 2b45e8
	addq	$2 * SIZE, AO
kusano 2b45e8
	addq	$2 * SIZE, BO
kusano 2b45e8
kusano 2b45e8
	subq	$1, %rax
kusano 2b45e8
kusano 2b45e8
	jg	.L66
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm10, %xmm8
kusano 2b45e8
	addpd	%xmm11, %xmm9
kusano 2b45e8
kusano 2b45e8
	movddup	ALPHA, %xmm3
kusano 2b45e8
kusano 2b45e8
	movaps	 %xmm8, %xmm0
kusano 2b45e8
	movsd    %xmm9, %xmm8
kusano 2b45e8
	mulpd	%xmm3, %xmm8
kusano 2b45e8
	movsd    %xmm0, %xmm9
kusano 2b45e8
	mulpd	%xmm3, %xmm9
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movsd	0 * SIZE(CO1), %xmm0
kusano 2b45e8
	movhpd	1 * SIZE(CO1), %xmm0
kusano 2b45e8
	movsd	0 * SIZE(CO2), %xmm2
kusano 2b45e8
	movhpd	1 * SIZE(CO2), %xmm2
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm0, %xmm8
kusano 2b45e8
	addpd	%xmm2, %xmm9
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movsd	%xmm8,  0 * SIZE(CO1)
kusano 2b45e8
	movhpd	%xmm8,  1 * SIZE(CO1)
kusano 2b45e8
	movsd	%xmm9,  0 * SIZE(CO2)
kusano 2b45e8
	movhpd	%xmm9,  1 * SIZE(CO2)
kusano 2b45e8
kusano 2b45e8
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
kusano 2b45e8
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
	subq	KKK, %rax
kusano 2b45e8
	leaq	(,%rax, SIZE), %rax
kusano 2b45e8
	leaq	(AO, %rax, 2), AO
kusano 2b45e8
	leaq	(BO, %rax, 2), BO
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#if defined(TRMMKERNEL) && defined(LEFT)
kusano 2b45e8
	addq	$2, KK
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	addq	$2 * SIZE, CO1
kusano 2b45e8
	addq	$2 * SIZE, CO2
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	testq	$1, M
kusano 2b45e8
kusano 2b45e8
	jle	.L79
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#if !defined(TRMMKERNEL) || \
kusano 2b45e8
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
kusano 2b45e8
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
kusano 2b45e8
	movq	B, BO
kusano 2b45e8
kusano 2b45e8
	movq	B, BO
kusano 2b45e8
kusano 2b45e8
	movq	KK, %rax
kusano 2b45e8
	leaq	(, %rax, SIZE), %rax
kusano 2b45e8
	addq	%rax, AO
kusano 2b45e8
	leaq	(BO, %rax, 2), BO
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movsd	-16 * SIZE(AO), %xmm0
kusano 2b45e8
	movaps	-17 * SIZE(BO), %xmm2
kusano 2b45e8
kusano 2b45e8
	xorps	%xmm8,  %xmm8
kusano 2b45e8
	xorps	%xmm9,  %xmm9
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
	subq	KK, %rax
kusano 2b45e8
	movq	%rax, KKK	
kusano 2b45e8
kusano 2b45e8
	movq	KK, %rax
kusano 2b45e8
#ifdef LEFT
kusano 2b45e8
	addq	$1, %rax
kusano 2b45e8
kusano 2b45e8
	addq	$2, %rax
kusano 2b45e8
kusano 2b45e8
	movq	%rax, KKK
kusano 2b45e8
kusano 2b45e8
	sarq	$2, %rax
kusano 2b45e8
kusano 2b45e8
	jle	.L75
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	shufps	$0x44, %xmm0, %xmm0
kusano 2b45e8
	mulpd	%xmm0, %xmm2
kusano 2b45e8
	movsd	-15 * SIZE(AO), %xmm0
kusano 2b45e8
	addpd	%xmm2, %xmm8
kusano 2b45e8
	movaps	-15 * SIZE(BO), %xmm2
kusano 2b45e8
kusano 2b45e8
	shufps	$0x44, %xmm0, %xmm0
kusano 2b45e8
	mulpd	%xmm0, %xmm2
kusano 2b45e8
	movsd	-14 * SIZE(AO), %xmm0
kusano 2b45e8
	addpd	%xmm2, %xmm9
kusano 2b45e8
	movaps	-13 * SIZE(BO), %xmm2
kusano 2b45e8
kusano 2b45e8
	shufps	$0x44, %xmm0, %xmm0
kusano 2b45e8
	mulpd	%xmm0, %xmm2
kusano 2b45e8
	movsd	-13 * SIZE(AO), %xmm0
kusano 2b45e8
	addpd	%xmm2, %xmm8
kusano 2b45e8
	movaps	-11 * SIZE(BO), %xmm2
kusano 2b45e8
kusano 2b45e8
	shufps	$0x44, %xmm0, %xmm0
kusano 2b45e8
	mulpd	%xmm0, %xmm2
kusano 2b45e8
	movsd	-12 * SIZE(AO), %xmm0
kusano 2b45e8
	addpd	%xmm2, %xmm9
kusano 2b45e8
	movaps	 -9 * SIZE(BO), %xmm2
kusano 2b45e8
kusano 2b45e8
	subq	$-4 * SIZE, AO
kusano 2b45e8
	subq	$-8 * SIZE, BO
kusano 2b45e8
	subq	$1, %rax
kusano 2b45e8
kusano 2b45e8
	jg	.L72
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
kusano 2b45e8
	movq	KKK, %rax
kusano 2b45e8
kusano 2b45e8
	andq	$3, %rax		# rax &= 3: single-step iterations left after the 4x-unrolled loop
kusano 2b45e8
kusano 2b45e8
	je	.L78
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	shufps	$0x44, %xmm0, %xmm0
kusano 2b45e8
	mulpd	%xmm0, %xmm2
kusano 2b45e8
	movsd	-15 * SIZE(AO), %xmm0
kusano 2b45e8
	addpd	%xmm2, %xmm8
kusano 2b45e8
	movaps	-15 * SIZE(BO), %xmm2
kusano 2b45e8
kusano 2b45e8
	addq	$1 * SIZE, AO
kusano 2b45e8
	addq	$2 * SIZE, BO
kusano 2b45e8
kusano 2b45e8
	subq	$1, %rax
kusano 2b45e8
kusano 2b45e8
	jg	.L76
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movddup	ALPHA, %xmm3
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm9, %xmm8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movsd	0 * SIZE(CO1), %xmm0
kusano 2b45e8
	movhpd	0 * SIZE(CO2), %xmm0
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	mulpd	%xmm3, %xmm8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm0, %xmm8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movlpd	%xmm8,  0 * SIZE(CO1)
kusano 2b45e8
	movhpd	%xmm8,  0 * SIZE(CO2)
kusano 2b45e8
kusano 2b45e8
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
kusano 2b45e8
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
	subq	KKK, %rax
kusano 2b45e8
	leaq	(,%rax, SIZE), %rax
kusano 2b45e8
	addq	%rax, AO
kusano 2b45e8
	leaq	(BO, %rax, 2), BO
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#if defined(TRMMKERNEL) && defined(LEFT)
kusano 2b45e8
	addq	$1, KK
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#if defined(TRMMKERNEL) && !defined(LEFT)
kusano 2b45e8
	addq	$2, KK
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	leaq	(C, LDC, 2), C
kusano 2b45e8
	movq	BO, B
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	testq	$1, N
kusano 2b45e8
kusano 2b45e8
	jle	.L999
kusano 2b45e8
kusano 2b45e8
#if defined(TRMMKERNEL) && defined(LEFT)
kusano 2b45e8
        movq    OFFSET, %rax
kusano 2b45e8
	movq    %rax, KK
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movq	C, CO1
kusano 2b45e8
	movq	A, AO
kusano 2b45e8
kusano 2b45e8
	movq	M,  I
kusano 2b45e8
	sarq	$2, I	# i = (m >> 2)
kusano 2b45e8
kusano 2b45e8
	jle	.L100
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#if !defined(TRMMKERNEL) || \
kusano 2b45e8
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
kusano 2b45e8
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
kusano 2b45e8
	movq	B, BO
kusano 2b45e8
kusano 2b45e8
	movq	B, BO
kusano 2b45e8
kusano 2b45e8
	movq	KK, %rax
kusano 2b45e8
	leaq	(, %rax, SIZE), %rax
kusano 2b45e8
	leaq	(AO, %rax, 4), AO
kusano 2b45e8
	addq	%rax, BO
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movaps	-16 * SIZE(AO), %xmm0
kusano 2b45e8
	movaps	-14 * SIZE(AO), %xmm1
kusano 2b45e8
	movsd	-17 * SIZE(BO), %xmm2
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	xorps	%xmm8,  %xmm8
kusano 2b45e8
	xorps	%xmm9,  %xmm9
kusano 2b45e8
	xorps	%xmm12, %xmm12
kusano 2b45e8
	xorps	%xmm13, %xmm13
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
	subq	KK, %rax
kusano 2b45e8
	movq	%rax, KKK	
kusano 2b45e8
kusano 2b45e8
	movq	KK, %rax
kusano 2b45e8
#ifdef LEFT
kusano 2b45e8
	addq	$4, %rax
kusano 2b45e8
kusano 2b45e8
	addq	$1, %rax
kusano 2b45e8
kusano 2b45e8
	movq	%rax, KKK
kusano 2b45e8
kusano 2b45e8
	sarq	$2, %rax
kusano 2b45e8
kusano 2b45e8
	jle	.L95
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	pshufd	$0x44, %xmm2, %xmm3
kusano 2b45e8
	pshufd	$0x44, %xmm2, %xmm4
kusano 2b45e8
	movsd	-16 * SIZE(BO), %xmm2
kusano 2b45e8
kusano 2b45e8
	mulpd	%xmm0, %xmm3
kusano 2b45e8
	movaps	-12 * SIZE(AO), %xmm0
kusano 2b45e8
	mulpd	%xmm1, %xmm4
kusano 2b45e8
	movaps	-10 * SIZE(AO), %xmm1
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm3, %xmm8
kusano 2b45e8
	addpd	%xmm4, %xmm12
kusano 2b45e8
kusano 2b45e8
	pshufd	$0x44, %xmm2, %xmm3
kusano 2b45e8
	pshufd	$0x44, %xmm2, %xmm4
kusano 2b45e8
	movsd	-15 * SIZE(BO), %xmm2
kusano 2b45e8
kusano 2b45e8
	mulpd	%xmm0, %xmm3
kusano 2b45e8
	movaps	 -8 * SIZE(AO), %xmm0
kusano 2b45e8
	mulpd	%xmm1, %xmm4
kusano 2b45e8
	movaps	 -6 * SIZE(AO), %xmm1
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm3, %xmm8
kusano 2b45e8
	addpd	%xmm4, %xmm12
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	pshufd	$0x44, %xmm2, %xmm3
kusano 2b45e8
	pshufd	$0x44, %xmm2, %xmm4
kusano 2b45e8
	movsd	-14 * SIZE(BO), %xmm2
kusano 2b45e8
kusano 2b45e8
	mulpd	%xmm0, %xmm3
kusano 2b45e8
	movaps	 -4 * SIZE(AO), %xmm0
kusano 2b45e8
	mulpd	%xmm1, %xmm4
kusano 2b45e8
	movaps	 -2 * SIZE(AO), %xmm1
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm3, %xmm8
kusano 2b45e8
	addpd	%xmm4, %xmm12
kusano 2b45e8
kusano 2b45e8
	pshufd	$0x44, %xmm2, %xmm3
kusano 2b45e8
	pshufd	$0x44, %xmm2, %xmm4
kusano 2b45e8
	movsd	-13 * SIZE(BO), %xmm2
kusano 2b45e8
kusano 2b45e8
	mulpd	%xmm0, %xmm3
kusano 2b45e8
	movaps	  0 * SIZE(AO), %xmm0
kusano 2b45e8
	mulpd	%xmm1, %xmm4
kusano 2b45e8
	movaps	  2 * SIZE(AO), %xmm1
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm3, %xmm8
kusano 2b45e8
	addpd	%xmm4, %xmm12
kusano 2b45e8
kusano 2b45e8
	subq	$-16 * SIZE, AO
kusano 2b45e8
	subq	$ -4 * SIZE, BO
kusano 2b45e8
	subq	$1, %rax
kusano 2b45e8
kusano 2b45e8
	jg	.L92
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
kusano 2b45e8
	movq	KKK, %rax
kusano 2b45e8
kusano 2b45e8
	andq	$3, %rax		# rax &= 3: single-step iterations left after the 4x-unrolled loop
kusano 2b45e8
kusano 2b45e8
	je	.L98
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	pshufd	$0x44, %xmm2, %xmm3
kusano 2b45e8
	pshufd	$0x44, %xmm2, %xmm4
kusano 2b45e8
	movsd	-16 * SIZE(BO), %xmm2
kusano 2b45e8
kusano 2b45e8
	mulpd	%xmm0, %xmm3
kusano 2b45e8
	movaps	-12 * SIZE(AO), %xmm0
kusano 2b45e8
	mulpd	%xmm1, %xmm4
kusano 2b45e8
	movaps	-10 * SIZE(AO), %xmm1
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm3, %xmm8
kusano 2b45e8
	addpd	%xmm4, %xmm12
kusano 2b45e8
kusano 2b45e8
	addq	$4 * SIZE, AO
kusano 2b45e8
	addq	$1 * SIZE, BO
kusano 2b45e8
kusano 2b45e8
	subq	$1, %rax
kusano 2b45e8
kusano 2b45e8
	jg	.L96
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movddup	ALPHA, %xmm3
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movsd	0 * SIZE(CO1), %xmm0
kusano 2b45e8
	movhpd	1 * SIZE(CO1), %xmm0
kusano 2b45e8
	movsd	2 * SIZE(CO1), %xmm1
kusano 2b45e8
	movhpd	3 * SIZE(CO1), %xmm1
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	mulpd	%xmm3, %xmm8
kusano 2b45e8
	mulpd	%xmm3, %xmm12
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm0, %xmm8
kusano 2b45e8
	addpd	%xmm1, %xmm12
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movsd	%xmm8,  0 * SIZE(CO1)
kusano 2b45e8
	movhpd	%xmm8,  1 * SIZE(CO1)
kusano 2b45e8
	movsd	%xmm12, 2 * SIZE(CO1)
kusano 2b45e8
	movhpd	%xmm12, 3 * SIZE(CO1)
kusano 2b45e8
kusano 2b45e8
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
kusano 2b45e8
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
	subq	KKK, %rax
kusano 2b45e8
	leaq	(,%rax, SIZE), %rax
kusano 2b45e8
	leaq	(AO, %rax, 4), AO
kusano 2b45e8
	addq	%rax, BO
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#if defined(TRMMKERNEL) && defined(LEFT)
kusano 2b45e8
	addq	$4, KK
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	addq	$4 * SIZE, CO1
kusano 2b45e8
	decq	I
kusano 2b45e8
kusano 2b45e8
	jg	.L91
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	testq	$2, M
kusano 2b45e8
kusano 2b45e8
	jle	.L110
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#if !defined(TRMMKERNEL) || \
kusano 2b45e8
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
kusano 2b45e8
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
kusano 2b45e8
	movq	B, BO
kusano 2b45e8
kusano 2b45e8
	movq	B, BO
kusano 2b45e8
kusano 2b45e8
	movq	KK, %rax
kusano 2b45e8
	leaq	(, %rax, SIZE), %rax
kusano 2b45e8
	leaq	(AO, %rax, 2), AO
kusano 2b45e8
	addq	%rax, BO
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movaps	-16 * SIZE(AO), %xmm0
kusano 2b45e8
	xorps	%xmm8,  %xmm8
kusano 2b45e8
	movaps	-17 * SIZE(BO), %xmm2
kusano 2b45e8
	xorps	%xmm9,  %xmm9
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
	subq	KK, %rax
kusano 2b45e8
	movq	%rax, KKK	
kusano 2b45e8
kusano 2b45e8
	movq	KK, %rax
kusano 2b45e8
#ifdef LEFT
kusano 2b45e8
	addq	$2, %rax
kusano 2b45e8
kusano 2b45e8
	addq	$1, %rax
kusano 2b45e8
kusano 2b45e8
	movq	%rax, KKK
kusano 2b45e8
kusano 2b45e8
	sarq	$2, %rax
kusano 2b45e8
kusano 2b45e8
	jle	.L105
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	pshufd	$0x44, %xmm2, %xmm3
kusano 2b45e8
	movsd	-16 * SIZE(BO), %xmm2
kusano 2b45e8
kusano 2b45e8
	mulpd	%xmm0, %xmm3
kusano 2b45e8
	movaps	-14 * SIZE(AO), %xmm0
kusano 2b45e8
	addpd	%xmm3, %xmm8
kusano 2b45e8
kusano 2b45e8
	pshufd	$0x44, %xmm2, %xmm3
kusano 2b45e8
	movsd	-15 * SIZE(BO), %xmm2
kusano 2b45e8
kusano 2b45e8
	mulpd	%xmm0, %xmm3
kusano 2b45e8
	movaps	-12 * SIZE(AO), %xmm0
kusano 2b45e8
	addpd	%xmm3, %xmm9
kusano 2b45e8
kusano 2b45e8
	pshufd	$0x44, %xmm2, %xmm3
kusano 2b45e8
	movsd	-14 * SIZE(BO), %xmm2
kusano 2b45e8
kusano 2b45e8
	mulpd	%xmm0, %xmm3
kusano 2b45e8
	movaps	-10 * SIZE(AO), %xmm0
kusano 2b45e8
	addpd	%xmm3, %xmm8
kusano 2b45e8
kusano 2b45e8
	pshufd	$0x44, %xmm2, %xmm3
kusano 2b45e8
	movsd	-13 * SIZE(BO), %xmm2
kusano 2b45e8
kusano 2b45e8
	mulpd	%xmm0, %xmm3
kusano 2b45e8
	movaps	 -8 * SIZE(AO), %xmm0
kusano 2b45e8
	addpd	%xmm3, %xmm9
kusano 2b45e8
kusano 2b45e8
	subq	$-8 * SIZE, AO
kusano 2b45e8
	subq	$-4 * SIZE, BO
kusano 2b45e8
	subq	$1, %rax
kusano 2b45e8
kusano 2b45e8
	jg	.L102
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
kusano 2b45e8
	movq	KKK, %rax
kusano 2b45e8
kusano 2b45e8
	andq	$3, %rax		# rax &= 3: single-step iterations left after the 4x-unrolled loop
kusano 2b45e8
kusano 2b45e8
	je	.L108
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	pshufd	$0x44, %xmm2, %xmm3
kusano 2b45e8
	movsd	-16 * SIZE(BO), %xmm2
kusano 2b45e8
kusano 2b45e8
	mulpd	%xmm0, %xmm3
kusano 2b45e8
	movaps	-14 * SIZE(AO), %xmm0
kusano 2b45e8
	addpd	%xmm3, %xmm8
kusano 2b45e8
kusano 2b45e8
	addq	$2 * SIZE, AO
kusano 2b45e8
	addq	$1 * SIZE, BO
kusano 2b45e8
kusano 2b45e8
	subq	$1, %rax
kusano 2b45e8
kusano 2b45e8
	jg	.L106
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm9, %xmm8
kusano 2b45e8
kusano 2b45e8
	movddup	ALPHA, %xmm3
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movsd	0 * SIZE(CO1), %xmm0
kusano 2b45e8
	movhpd	1 * SIZE(CO1), %xmm0
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	mulpd	%xmm3, %xmm8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm0, %xmm8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movsd	%xmm8,  0 * SIZE(CO1)
kusano 2b45e8
	movhpd	%xmm8,  1 * SIZE(CO1)
kusano 2b45e8
kusano 2b45e8
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
kusano 2b45e8
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
	subq	KKK, %rax
kusano 2b45e8
	leaq	(,%rax, SIZE), %rax
kusano 2b45e8
	leaq	(AO, %rax, 2), AO
kusano 2b45e8
	addq	%rax, BO
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
#if defined(TRMMKERNEL) && defined(LEFT)
kusano 2b45e8
	addq	$2, KK
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	addq	$2 * SIZE, CO1
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	testq	$1, M
kusano 2b45e8
kusano 2b45e8
	jle	.L999
kusano 2b45e8
kusano 2b45e8
#if !defined(TRMMKERNEL) || \
kusano 2b45e8
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
kusano 2b45e8
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
kusano 2b45e8
kusano 2b45e8
	movq	B, BO
kusano 2b45e8
kusano 2b45e8
	movq	B, BO
kusano 2b45e8
kusano 2b45e8
	movq	KK, %rax
kusano 2b45e8
	leaq	(, %rax, SIZE), %rax
kusano 2b45e8
	addq	%rax, AO
kusano 2b45e8
	addq	%rax, BO
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movsd	-16 * SIZE(AO), %xmm0
kusano 2b45e8
	movsd	-17 * SIZE(BO), %xmm2
kusano 2b45e8
kusano 2b45e8
	xorps	%xmm8,  %xmm8
kusano 2b45e8
	xorps	%xmm9,  %xmm9
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
	subq	KK, %rax
kusano 2b45e8
	movq	%rax, KKK	
kusano 2b45e8
kusano 2b45e8
	movq	KK, %rax
kusano 2b45e8
#ifdef LEFT
kusano 2b45e8
	addq	$1, %rax
kusano 2b45e8
kusano 2b45e8
	addq	$1, %rax
kusano 2b45e8
kusano 2b45e8
	movq	%rax, KKK
kusano 2b45e8
kusano 2b45e8
	sarq	$2, %rax
kusano 2b45e8
kusano 2b45e8
	jle	.L115
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	mulsd	%xmm0, %xmm2
kusano 2b45e8
	addsd	%xmm2, %xmm8
kusano 2b45e8
	movsd	-15 * SIZE(AO), %xmm0
kusano 2b45e8
	movsd	-16 * SIZE(BO), %xmm2
kusano 2b45e8
kusano 2b45e8
	mulsd	%xmm0, %xmm2
kusano 2b45e8
	addsd	%xmm2, %xmm8
kusano 2b45e8
	movsd	-14 * SIZE(AO), %xmm0
kusano 2b45e8
	movsd	-15 * SIZE(BO), %xmm2
kusano 2b45e8
kusano 2b45e8
	mulsd	%xmm0, %xmm2
kusano 2b45e8
	addsd	%xmm2, %xmm8
kusano 2b45e8
	movsd	-13 * SIZE(AO), %xmm0
kusano 2b45e8
	movsd	-14 * SIZE(BO), %xmm2
kusano 2b45e8
kusano 2b45e8
	mulsd	%xmm0, %xmm2
kusano 2b45e8
	addsd	%xmm2, %xmm8
kusano 2b45e8
	movsd	-12 * SIZE(AO), %xmm0
kusano 2b45e8
	movsd	-13 * SIZE(BO), %xmm2
kusano 2b45e8
kusano 2b45e8
	subq	$-4 * SIZE, AO
kusano 2b45e8
	subq	$-4 * SIZE, BO
kusano 2b45e8
	subq	$1, %rax
kusano 2b45e8
kusano 2b45e8
	jg	.L112
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movq	K, %rax
kusano 2b45e8
kusano 2b45e8
	movq	KKK, %rax
kusano 2b45e8
kusano 2b45e8
	andq	$3, %rax		# rax &= 3: single-step iterations left after the 4x-unrolled loop
kusano 2b45e8
kusano 2b45e8
	je	.L118
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	mulsd	%xmm0, %xmm2
kusano 2b45e8
	addsd	%xmm2, %xmm8
kusano 2b45e8
	movsd	-15 * SIZE(AO), %xmm0
kusano 2b45e8
	movsd	-16 * SIZE(BO), %xmm2
kusano 2b45e8
kusano 2b45e8
	addq	$1 * SIZE, AO
kusano 2b45e8
	addq	$1 * SIZE, BO
kusano 2b45e8
kusano 2b45e8
	subq	$1, %rax
kusano 2b45e8
kusano 2b45e8
	jg	.L116
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movddup	ALPHA, %xmm3
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm9, %xmm8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movsd	0 * SIZE(CO1), %xmm0
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	mulsd	%xmm3, %xmm8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	addpd	%xmm0, %xmm8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movlpd	%xmm8,  0 * SIZE(CO1)
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movq	  0(%rsp), %rbx
kusano 2b45e8
	movq	  8(%rsp), %rbp
kusano 2b45e8
	movq	 16(%rsp), %r12
kusano 2b45e8
	movq	 24(%rsp), %r13
kusano 2b45e8
	movq	 32(%rsp), %r14
kusano 2b45e8
	movq	 40(%rsp), %r15
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	movq	 48(%rsp), %rdi
kusano 2b45e8
	movq	 56(%rsp), %rsi
kusano 2b45e8
	movups	 64(%rsp), %xmm6
kusano 2b45e8
	movups	 80(%rsp), %xmm7
kusano 2b45e8
	movups	 96(%rsp), %xmm8
kusano 2b45e8
	movups	112(%rsp), %xmm9
kusano 2b45e8
	movups	128(%rsp), %xmm10
kusano 2b45e8
	movups	144(%rsp), %xmm11
kusano 2b45e8
	movups	160(%rsp), %xmm12
kusano 2b45e8
	movups	176(%rsp), %xmm13
kusano 2b45e8
	movups	192(%rsp), %xmm14
kusano 2b45e8
	movups	208(%rsp), %xmm15
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8
	addq	$STACKSIZE, %rsp
kusano 2b45e8
kusano 2b45e8
kusano 2b45e8