/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
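/* dsymv kernel: y := alpha * A * x + y, double precision, SSE2.      */
/* A is symmetric and only one triangle is referenced; the access     */
/* pattern (columns walked downward from each diagonal block)         */
/* suggests the lower-triangle, column-major variant. x and, when     */
/* INCY != 1, y are first staged in BUFFER so the compute loops see   */
/* unit-stride data.                                                  */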
#define ASSEMBLER
#include "common.h"
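/* Per-microarchitecture prefetch selection: instruction flavor and */
/* distance (a byte displacement used as PREFETCH PREFETCHSIZE(ptr)). */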
#ifdef ATOM
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 12)
#endif
#ifdef CORE2
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 12)
#endif
#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 12)
#endif
#ifdef NEHALEM
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 12)
#endif
#ifdef PENTIUM4
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 20)
#endif
#ifdef OPTERON
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 8)
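/* movlpd loads the low 64 bits without zeroing the upper half; it is */
/* substituted for movsd on Opteron. */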
#define movsd movlpd
#endif
#if defined(BARCELONA) || defined(SHANGHAI)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)
#endif
#ifdef NANO
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (8 * 24)
#endif
#ifdef GENERIC
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 20)
#endif
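/* Argument locations. On the System V ABI the first six integer  */
/* arguments arrive in registers and the rest (y, incy, buffer)   */
/* come from the stack; on the Windows ABI most arguments spill   */
/* to the stack and are reloaded below.                           */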
#ifndef WINDOWS_ABI
#define STACKSIZE 80
#define OLD_Y 8 + STACKSIZE(%rsp)
#define OLD_INCY 16 + STACKSIZE(%rsp)
#define OLD_BUFFER 24 + STACKSIZE(%rsp)
#define M ARG1
#define N ARG2
#define A ARG3
#define LDA ARG4
#define X ARG5
#define INCX ARG6
#else
#define STACKSIZE 256
#define OLD_LDA 40 + STACKSIZE(%rsp)
#define OLD_X 48 + STACKSIZE(%rsp)
#define OLD_INCX 56 + STACKSIZE(%rsp)
#define OLD_Y 64 + STACKSIZE(%rsp)
#define OLD_INCY 72 + STACKSIZE(%rsp)
#define OLD_BUFFER 80 + STACKSIZE(%rsp)
#define M ARG1
#define N ARG2
#define A ARG4
#define LDA ARG3
#define X %rdi
#define INCX %rsi
#endif
#define Y %r10
#define INCY %r11
#define BUFFER %r12
#define TEMP %rax
#define I %rax
#define A1 %rbx
#define A2 %rbp
#define XX %r13
#define YY %r14
#define IS %r15
#define NEW_X BUFFER
#define NEW_Y X
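/* NEW_X aliases BUFFER (the alpha-scaled copy of x); NEW_Y reuses */
/* X's register once the original x has been consumed.             */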
#define ALPHA %xmm0
#define xtemp1 %xmm0
#define xtemp2 %xmm1
#define yy1 %xmm2
#define yy2 %xmm3
#define atemp1 %xmm4
#define atemp2 %xmm5
#define atemp3 %xmm6
#define atemp4 %xmm7
#define xsum1 %xmm8
#define xsum2 %xmm9
#define xsum3 %xmm10
#define xsum4 %xmm11
#define a1 %xmm12
#define a2 %xmm13
#define a3 %xmm14
#define xt1 %xmm15
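/* xsum1..xsum4 accumulate one dot product per column of the current */
/* 4-column panel; yy1/yy2 hold y elements being updated; xtemp1/2   */
/* carry x operands and atemp1..4 the per-column broadcast x values. */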
PROLOGUE
PROFCODE
subq $STACKSIZE, %rsp
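/* Save callee-saved general-purpose registers. */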
movq %rbx, 0(%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
movq %r13, 24(%rsp)
movq %r14, 32(%rsp)
movq %r15, 40(%rsp)
#ifdef WINDOWS_ABI
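/* Windows ABI: %rdi, %rsi and %xmm6-%xmm15 are also callee-saved. */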
movq %rdi, 48(%rsp)
movq %rsi, 56(%rsp)
movups %xmm6, 64(%rsp)
movups %xmm7, 80(%rsp)
movups %xmm8, 96(%rsp)
movups %xmm9, 112(%rsp)
movups %xmm10, 128(%rsp)
movups %xmm11, 144(%rsp)
movups %xmm12, 160(%rsp)
movups %xmm13, 176(%rsp)
movups %xmm14, 192(%rsp)
movups %xmm15, 208(%rsp)
movq OLD_LDA, LDA
movq OLD_X, X
movq OLD_INCX, INCX
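/* Under Win64 the scalar alpha arrives in %xmm2 (third argument slot); */
/* move it into ALPHA (%xmm0). */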
movaps %xmm2, %xmm0
#endif
movq OLD_Y, Y
movq OLD_INCY, INCY
movq OLD_BUFFER, BUFFER
leaq (,INCX, SIZE), INCX
leaq (,INCY, SIZE), INCY
leaq (,LDA, SIZE), LDA
testq M, M
jle .L999
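/* Broadcast alpha into both lanes, then copy x into BUFFER scaled by */
/* alpha: 8 elements per iteration in .L01, remainder in .L03.        */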
unpcklpd ALPHA, ALPHA
movq BUFFER, XX
movq M, %rax
sarq $3, %rax
jle .L02
ALIGN_3
.L01:
movsd 0 * SIZE(X), %xmm1
addq INCX, X
movhpd 0 * SIZE(X), %xmm1
addq INCX, X
movsd 0 * SIZE(X), %xmm2
addq INCX, X
movhpd 0 * SIZE(X), %xmm2
addq INCX, X
movsd 0 * SIZE(X), %xmm3
addq INCX, X
movhpd 0 * SIZE(X), %xmm3
addq INCX, X
movsd 0 * SIZE(X), %xmm4
addq INCX, X
movhpd 0 * SIZE(X), %xmm4
addq INCX, X
mulpd ALPHA, %xmm1
mulpd ALPHA, %xmm2
mulpd ALPHA, %xmm3
mulpd ALPHA, %xmm4
movapd %xmm1, 0 * SIZE(XX)
movapd %xmm2, 2 * SIZE(XX)
movapd %xmm3, 4 * SIZE(XX)
movapd %xmm4, 6 * SIZE(XX)
addq $8 * SIZE, XX
decq %rax
jg .L01
ALIGN_3
.L02:
movq M, %rax
andq $7, %rax
jle .L05
ALIGN_3
.L03:
movsd 0 * SIZE(X), %xmm1
addq INCX, X
mulsd ALPHA, %xmm1
movlpd %xmm1, 0 * SIZE(XX)
addq $1 * SIZE, XX
decq %rax
jg .L03
ALIGN_3
.L05:
/* the scaled copy of x is complete; the original X is no longer needed */
movq Y, NEW_Y
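/* Round XX up to the next 512-byte boundary. If INCY != 1, copy y */
/* here as well so the kernel can update y with unit stride.       */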
addq $512, XX
andq $-512, XX
cmpq $SIZE, INCY
je .L10
movq Y, YY
movq XX, NEW_Y
movq M, %rax
sarq $3, %rax
jle .L07
ALIGN_3
.L06:
movsd 0 * SIZE(YY), %xmm0
addq INCY, YY
movhpd 0 * SIZE(YY), %xmm0
addq INCY, YY
movsd 0 * SIZE(YY), %xmm1
addq INCY, YY
movhpd 0 * SIZE(YY), %xmm1
addq INCY, YY
movsd 0 * SIZE(YY), %xmm2
addq INCY, YY
movhpd 0 * SIZE(YY), %xmm2
addq INCY, YY
movsd 0 * SIZE(YY), %xmm3
addq INCY, YY
movhpd 0 * SIZE(YY), %xmm3
addq INCY, YY
movapd %xmm0, 0 * SIZE(XX)
movapd %xmm1, 2 * SIZE(XX)
movapd %xmm2, 4 * SIZE(XX)
movapd %xmm3, 6 * SIZE(XX)
addq $8 * SIZE, XX
decq %rax
jg .L06
ALIGN_3
.L07:
movq M, %rax
andq $7, %rax
jle .L10
ALIGN_3
.L08:
movsd 0 * SIZE(YY), %xmm0
addq INCY, YY
movsd %xmm0, 0 * SIZE(XX)
addq $1 * SIZE, XX
decq %rax
jg .L08
ALIGN_3
.L10:
xorq IS, IS # is = 0
cmpq $4, N
jl .L20
ALIGN_3
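/* Outer loop (.L11): process A in 4-column panels. A1 points at the */
/* first column of the panel and A2 at the third; (A1, LDA, 1) and   */
/* (A2, LDA, 1) reach the second and fourth. The 4x4 diagonal block  */
/* is accumulated into xsum1..4 first, mirroring entries across the  */
/* diagonal, then the rows below the block are streamed in .L12.     */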
.L11:
movq A, A1
leaq (A, LDA, 2), A2
leaq 4 * SIZE(A, LDA, 4), A
leaq (NEW_X, IS, SIZE), XX
leaq 4 * SIZE(NEW_Y, IS, SIZE), YY
movapd 0 * SIZE(XX), atemp2
movapd 2 * SIZE(XX), atemp4
movsd 0 * SIZE(A1), xsum1
movhpd 1 * SIZE(A1), xsum1
mulpd atemp2, xsum1
movsd 1 * SIZE(A1), xsum2
movhpd 1 * SIZE(A1, LDA, 1), xsum2
mulpd atemp2, xsum2
movsd 2 * SIZE(A1), xsum3
movhpd 2 * SIZE(A1, LDA, 1), xsum3
mulpd atemp2, xsum3
movsd 3 * SIZE(A1), xsum4
movhpd 3 * SIZE(A1, LDA, 1), xsum4
mulpd atemp2, xsum4
movsd 2 * SIZE(A1), a1
movhpd 3 * SIZE(A1), a1
mulpd atemp4, a1
addpd a1, xsum1
movsd 2 * SIZE(A1, LDA, 1), a1
movhpd 3 * SIZE(A1, LDA, 1), a1
mulpd atemp4, a1
addpd a1, xsum2
movsd 2 * SIZE(A2), a1
movhpd 3 * SIZE(A2), a1
mulpd atemp4, a1
addpd a1, xsum3
movsd 3 * SIZE(A2), a1
movhpd 3 * SIZE(A2, LDA, 1), a1
mulpd atemp4, a1
addpd a1, xsum4
movapd 4 * SIZE(XX), xtemp1
movapd 6 * SIZE(XX), xtemp2
movsd 4 * SIZE(A1), a1
movhpd 5 * SIZE(A1), a1
movsd 6 * SIZE(A1), a2
movhpd 7 * SIZE(A1), a2
movsd 4 * SIZE(A1, LDA, 1), a3
movhpd 5 * SIZE(A1, LDA, 1), a3
movsd 0 * SIZE(YY), yy1
movhpd 1 * SIZE(YY), yy1
movsd 2 * SIZE(YY), yy2
movhpd 3 * SIZE(YY), yy2
#ifndef HAVE_SSE3
movapd atemp2, atemp1
unpcklpd atemp1, atemp1
unpckhpd atemp2, atemp2
movapd atemp4, atemp3
unpcklpd atemp3, atemp3
unpckhpd atemp4, atemp4
#else
movddup atemp2, atemp1
unpckhpd atemp2, atemp2
movddup atemp4, atemp3
unpckhpd atemp4, atemp4
#endif
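/* atemp1..atemp4 now hold x[is+0..is+3], each broadcast to both lanes. */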
addq $4 * SIZE, XX
addq $4 * SIZE, A1
addq $4 * SIZE, A2
movq M, I
subq IS, I
subq $4, I
sarq $3, I
jle .L15
ALIGN_3
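/* Inner loop (.L12): 8 rows per iteration. Each element a(i,j) is   */
/* used twice, exploiting symmetry: y[i] += a(i,j) * x[j] (the mulpd */
/* with atemp*) and xsum_j += a(i,j) * x[i] (the mulpd with xtemp*). */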
.L12:
movapd xtemp1, xt1
mulpd a1, xt1
mulpd atemp1, a1
addpd xt1, xsum1
addpd a1, yy1
movsd 2 * SIZE(A1, LDA, 1), a1
movhpd 3 * SIZE(A1, LDA, 1), a1
PREFETCH PREFETCHSIZE(A1)
movapd xtemp2, xt1
mulpd a2, xt1
mulpd atemp1, a2
addpd xt1, xsum1
addpd a2, yy2
movsd 0 * SIZE(A2), a2
movhpd 1 * SIZE(A2), a2
movapd xtemp1, xt1
mulpd a3, xt1
mulpd atemp2, a3
addpd xt1, xsum2
addpd a3, yy1
movsd 2 * SIZE(A2), a3
movhpd 3 * SIZE(A2), a3
#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
PREFETCH PREFETCHSIZE(XX)
#endif
movapd xtemp2, xt1
mulpd a1, xt1
mulpd atemp2, a1
addpd xt1, xsum2
addpd a1, yy2
movsd 0 * SIZE(A2, LDA, 1), a1
movhpd 1 * SIZE(A2, LDA, 1), a1
movapd xtemp1, xt1
mulpd a2, xt1
mulpd atemp3, a2
addpd xt1, xsum3
addpd a2, yy1
movsd 2 * SIZE(A2, LDA, 1), a2
movhpd 3 * SIZE(A2, LDA, 1), a2
PREFETCH PREFETCHSIZE(A1, LDA, 1)
movapd xtemp2, xt1
mulpd a3, xt1
mulpd atemp3, a3
addpd xt1, xsum3
addpd a3, yy2
movsd 4 * SIZE(A1), a3
movhpd 5 * SIZE(A1), a3
movapd xtemp1, xt1
movapd 4 * SIZE(XX), xtemp1
mulpd a1, xt1
mulpd atemp4, a1
addpd xt1, xsum4
addpd a1, yy1
movsd 6 * SIZE(A1), a1
movhpd 7 * SIZE(A1), a1
movapd xtemp2, xt1
movapd 6 * SIZE(XX), xtemp2
mulpd a2, xt1
mulpd atemp4, a2
addpd xt1, xsum4
addpd a2, yy2
movsd 4 * SIZE(A1, LDA, 1), a2
movhpd 5 * SIZE(A1, LDA, 1), a2
movsd yy1, 0 * SIZE(YY)
movhpd yy1, 1 * SIZE(YY)
movsd 4 * SIZE(YY), yy1
movhpd 5 * SIZE(YY), yy1
movsd yy2, 2 * SIZE(YY)
movhpd yy2, 3 * SIZE(YY)
movsd 6 * SIZE(YY), yy2
movhpd 7 * SIZE(YY), yy2
movapd xtemp1, xt1
mulpd a3, xt1
mulpd atemp1, a3
addpd xt1, xsum1
addpd a3, yy1
movsd 6 * SIZE(A1, LDA, 1), a3
movhpd 7 * SIZE(A1, LDA, 1), a3
PREFETCH PREFETCHSIZE(A2)
movapd xtemp2, xt1
mulpd a1, xt1
mulpd atemp1, a1
addpd xt1, xsum1
addpd a1, yy2
movsd 4 * SIZE(A2), a1
movhpd 5 * SIZE(A2), a1
movapd xtemp1, xt1
mulpd a2, xt1
mulpd atemp2, a2
addpd xt1, xsum2
addpd a2, yy1
movsd 6 * SIZE(A2), a2
movhpd 7 * SIZE(A2), a2
#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
PREFETCHW PREFETCHSIZE(YY)
#endif
movapd xtemp2, xt1
mulpd a3, xt1
mulpd atemp2, a3
addpd xt1, xsum2
addpd a3, yy2
movsd 4 * SIZE(A2, LDA, 1), a3
movhpd 5 * SIZE(A2, LDA, 1), a3
movapd xtemp1, xt1
mulpd a1, xt1
mulpd atemp3, a1
addpd xt1, xsum3
addpd a1, yy1
movsd 6 * SIZE(A2, LDA, 1), a1
movhpd 7 * SIZE(A2, LDA, 1), a1
PREFETCH PREFETCHSIZE(A2, LDA, 1)
movapd xtemp2, xt1
mulpd a2, xt1
mulpd atemp3, a2
addpd xt1, xsum3
addpd a2, yy2
movsd 10 * SIZE(A1), a2
movhpd 11 * SIZE(A1), a2
movapd xtemp1, xt1
movapd 8 * SIZE(XX), xtemp1
mulpd a3, xt1
mulpd atemp4, a3
addpd xt1, xsum4
addpd a3, yy1
movsd 8 * SIZE(A1, LDA, 1), a3
movhpd 9 * SIZE(A1, LDA, 1), a3
movapd xtemp2, xt1
movapd 10 * SIZE(XX), xtemp2
mulpd a1, xt1
mulpd atemp4, a1
addpd xt1, xsum4
addpd a1, yy2
movsd 8 * SIZE(A1), a1
movhpd 9 * SIZE(A1), a1
movsd yy1, 4 * SIZE(YY)
movhpd yy1, 5 * SIZE(YY)
movsd 8 * SIZE(YY), yy1
movhpd 9 * SIZE(YY), yy1
movsd yy2, 6 * SIZE(YY)
movhpd yy2, 7 * SIZE(YY)
movsd 10 * SIZE(YY), yy2
movhpd 11 * SIZE(YY), yy2
addq $8 * SIZE, XX
addq $8 * SIZE, YY
addq $8 * SIZE, A1
addq $8 * SIZE, A2
decq I
jg .L12
ALIGN_3
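/* Panel remainder: four more rows when bit 2 of (M - IS - 4) is set. */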
.L15:
movq M, I
subq IS, I
subq $4, I
testq $4, I
jle .L17
movapd xtemp1, xt1
mulpd a1, xt1
mulpd atemp1, a1
addpd xt1, xsum1
addpd a1, yy1
movsd 2 * SIZE(A1, LDA, 1), a1
movhpd 3 * SIZE(A1, LDA, 1), a1
movapd xtemp2, xt1
mulpd a2, xt1
mulpd atemp1, a2
addpd xt1, xsum1
addpd a2, yy2
movsd 0 * SIZE(A2), a2
movhpd 1 * SIZE(A2), a2
movapd xtemp1, xt1
mulpd a3, xt1
mulpd atemp2, a3
addpd xt1, xsum2
addpd a3, yy1
movsd 2 * SIZE(A2), a3
movhpd 3 * SIZE(A2), a3
movapd xtemp2, xt1
mulpd a1, xt1
mulpd atemp2, a1
addpd xt1, xsum2
addpd a1, yy2
movsd 0 * SIZE(A2, LDA, 1), a1
movhpd 1 * SIZE(A2, LDA, 1), a1
movapd xtemp1, xt1
mulpd a2, xt1
mulpd atemp3, a2
addpd xt1, xsum3
addpd a2, yy1
movsd 2 * SIZE(A2, LDA, 1), a2
movhpd 3 * SIZE(A2, LDA, 1), a2
movapd xtemp2, xt1
mulpd a3, xt1
mulpd atemp3, a3
addpd xt1, xsum3
addpd a3, yy2
movsd 4 * SIZE(A1, LDA, 1), a3
movhpd 5 * SIZE(A1, LDA, 1), a3
movapd xtemp1, xt1
movapd 4 * SIZE(XX), xtemp1
mulpd a1, xt1
mulpd atemp4, a1
addpd xt1, xsum4
addpd a1, yy1
movsd 4 * SIZE(A1), a1
movhpd 5 * SIZE(A1), a1
movapd xtemp2, xt1
movapd 6 * SIZE(XX), xtemp2
mulpd a2, xt1
mulpd atemp4, a2
addpd xt1, xsum4
addpd a2, yy2
movsd 6 * SIZE(A1), a2
movhpd 7 * SIZE(A1), a2
movsd yy1, 0 * SIZE(YY)
movhpd yy1, 1 * SIZE(YY)
movsd 4 * SIZE(YY), yy1
movhpd 5 * SIZE(YY), yy1
movsd yy2, 2 * SIZE(YY)
movhpd yy2, 3 * SIZE(YY)
movsd 6 * SIZE(YY), yy2
movhpd 7 * SIZE(YY), yy2
addq $4 * SIZE, XX
addq $4 * SIZE, YY
addq $4 * SIZE, A1
addq $4 * SIZE, A2
ALIGN_3
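/* Panel remainder: two rows. */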
.L17:
testq $2, M
jle .L18
movapd xtemp1, xt1
mulpd a1, xt1
mulpd atemp1, a1
addpd xt1, xsum1
addpd a1, yy1
movsd 0 * SIZE(A1, LDA, 1), a1
movhpd 1 * SIZE(A1, LDA, 1), a1
movapd xtemp1, xt1
mulpd a1, xt1
mulpd atemp2, a1
addpd xt1, xsum2
addpd a1, yy1
movsd 0 * SIZE(A2), a1
movhpd 1 * SIZE(A2), a1
movapd xtemp1, xt1
mulpd a1, xt1
mulpd atemp3, a1
addpd xt1, xsum3
addpd a1, yy1
movsd 0 * SIZE(A2, LDA, 1), a1
movhpd 1 * SIZE(A2, LDA, 1), a1
movapd xtemp1, xt1
movapd 2 * SIZE(XX), xtemp1
mulpd a1, xt1
mulpd atemp4, a1
addpd xt1, xsum4
addpd a1, yy1
movsd 2 * SIZE(A1), a1
movsd yy1, 0 * SIZE(YY)
movhpd yy1, 1 * SIZE(YY)
movsd 2 * SIZE(YY), yy1
addq $2 * SIZE, XX
addq $2 * SIZE, YY
addq $2 * SIZE, A1
addq $2 * SIZE, A2
ALIGN_3
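/* Panel remainder: one row, handled with scalar (sd) arithmetic. */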
.L18:
testq $1, M
jle .L19
movapd xtemp1, xt1
mulsd a1, xt1
mulsd atemp1, a1
addsd xt1, xsum1
addsd a1, yy1
movsd 0 * SIZE(A1, LDA, 1), a1
movapd xtemp1, xt1
mulsd a1, xt1
mulsd atemp2, a1
addsd xt1, xsum2
addsd a1, yy1
movsd 0 * SIZE(A2), a1
movapd xtemp1, xt1
mulsd a1, xt1
mulsd atemp3, a1
addsd xt1, xsum3
addsd a1, yy1
movsd 0 * SIZE(A2, LDA, 1), a1
movapd xtemp1, xt1
mulsd a1, xt1
mulsd atemp4, a1
addsd xt1, xsum4
addsd a1, yy1
movsd yy1, 0 * SIZE(YY)
ALIGN_3
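/* Reduce the per-column accumulators horizontally and add the four */
/* results into y[is..is+3].                                        */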
.L19:
#ifndef HAVE_SSE3
movapd xsum1, atemp1
movapd xsum3, atemp3
unpcklpd xsum2, xsum1
unpcklpd xsum4, xsum3
unpckhpd xsum2, atemp1
unpckhpd xsum4, atemp3
addpd atemp1, xsum1
addpd atemp3, xsum3
#else
haddpd xsum2, xsum1
haddpd xsum4, xsum3
#endif
movsd 0 * SIZE(NEW_Y, IS, SIZE), yy1
movhpd 1 * SIZE(NEW_Y, IS, SIZE), yy1
movsd 2 * SIZE(NEW_Y, IS, SIZE), yy2
movhpd 3 * SIZE(NEW_Y, IS, SIZE), yy2
addpd xsum1, yy1
addpd xsum3, yy2
movsd yy1, 0 * SIZE(NEW_Y, IS, SIZE)
movhpd yy1, 1 * SIZE(NEW_Y, IS, SIZE)
movsd yy2, 2 * SIZE(NEW_Y, IS, SIZE)
movhpd yy2, 3 * SIZE(NEW_Y, IS, SIZE)
addq $4, IS
movq IS, I
addq $4, I
cmpq N, I
jle .L11
ALIGN_3
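/* Left-over columns when N is not a multiple of 4: a 2-column panel */
/* (.L20), then a final single column (.L30).                        */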
.L20:
testq $2, N
jle .L30
movq A, A1
leaq 2 * SIZE(A, LDA, 2), A
movapd 0 * SIZE(NEW_X, IS, SIZE), atemp2
movsd 0 * SIZE(A1), xsum1
movhpd 1 * SIZE(A1), xsum1
mulpd atemp2, xsum1
movsd 1 * SIZE(A1), xsum2
movhpd 1 * SIZE(A1, LDA, 1), xsum2
mulpd atemp2, xsum2
#ifndef HAVE_SSE3
movapd atemp2, atemp1
unpcklpd atemp1, atemp1
#else
movddup atemp2, atemp1
#endif
unpckhpd atemp2, atemp2
testq $1, M
jle .L29
movsd 2 * SIZE(A1), a1
movsd 2 * SIZE(A1, LDA, 1), a2
movsd 2 * SIZE(NEW_X, IS, SIZE), xtemp1
movsd 2 * SIZE(NEW_Y, IS, SIZE), yy1
movapd xtemp1, xt1
mulsd a1, xt1
mulsd atemp1, a1
addsd xt1, xsum1
addsd a1, yy1
movapd xtemp1, xt1
mulsd a2, xt1
mulsd atemp2, a2
addsd xt1, xsum2
addsd a2, yy1
movsd yy1, 2 * SIZE(NEW_Y, IS, SIZE)
ALIGN_3
.L29:
#ifndef HAVE_SSE3
movapd xsum1, atemp1
unpcklpd xsum2, xsum1
unpckhpd xsum2, atemp1
addpd atemp1, xsum1
#else
haddpd xsum2, xsum1
#endif
movsd 0 * SIZE(NEW_Y, IS, SIZE), yy1
movhpd 1 * SIZE(NEW_Y, IS, SIZE), yy1
addpd xsum1, yy1
movsd yy1, 0 * SIZE(NEW_Y, IS, SIZE)
movhpd yy1, 1 * SIZE(NEW_Y, IS, SIZE)
addq $2, IS
ALIGN_3
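/* Final single column: only the diagonal element contributes, */
/* y[is] += a(is,is) * new_x[is].                              */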
.L30:
testq $1, N
jle .L990
movsd 0 * SIZE(A), xsum1
movsd 0 * SIZE(NEW_X, IS, SIZE), atemp1
movsd 0 * SIZE(NEW_Y, IS, SIZE), yy1
mulsd atemp1, xsum1
addsd xsum1, yy1
movsd yy1, 0 * SIZE(NEW_Y, IS, SIZE)
ALIGN_3
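/* If y was staged in BUFFER (INCY != 1), copy the result back to the */
/* caller's y with stride INCY: 8 elements per iteration, then the tail. */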
.L990:
cmpq $SIZE, INCY
je .L999
movq M, %rax
sarq $3, %rax
jle .L997
ALIGN_3
.L996:
movapd 0 * SIZE(NEW_Y), %xmm0
movapd 2 * SIZE(NEW_Y), %xmm1
movapd 4 * SIZE(NEW_Y), %xmm2
movapd 6 * SIZE(NEW_Y), %xmm3
movsd %xmm0, 0 * SIZE(Y)
addq INCY, Y
movhpd %xmm0, 0 * SIZE(Y)
addq INCY, Y
movsd %xmm1, 0 * SIZE(Y)
addq INCY, Y
movhpd %xmm1, 0 * SIZE(Y)
addq INCY, Y
movsd %xmm2, 0 * SIZE(Y)
addq INCY, Y
movhpd %xmm2, 0 * SIZE(Y)
addq INCY, Y
movsd %xmm3, 0 * SIZE(Y)
addq INCY, Y
movhpd %xmm3, 0 * SIZE(Y)
addq INCY, Y
addq $8 * SIZE, NEW_Y
decq %rax
jg .L996
ALIGN_3
.L997:
movq M, %rax
andq $7, %rax
jle .L999
ALIGN_3
.L998:
movsd 0 * SIZE(NEW_Y), %xmm0
movsd %xmm0, 0 * SIZE(Y)
addq INCY, Y
addq $1 * SIZE, NEW_Y
decq %rax
jg .L998
ALIGN_3
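/* Restore callee-saved registers and return. */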
.L999:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
movq 16(%rsp), %r12
movq 24(%rsp), %r13
movq 32(%rsp), %r14
movq 40(%rsp), %r15
#ifdef WINDOWS_ABI
movq 48(%rsp), %rdi
movq 56(%rsp), %rsi
movups 64(%rsp), %xmm6
movups 80(%rsp), %xmm7
movups 96(%rsp), %xmm8
movups 112(%rsp), %xmm9
movups 128(%rsp), %xmm10
movups 144(%rsp), %xmm11
movups 160(%rsp), %xmm12
movups 176(%rsp), %xmm13
movups 192(%rsp), %xmm14
movups 208(%rsp), %xmm15
#endif
addq $STACKSIZE, %rsp
ret
EPILOGUE