/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
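
/* Double-precision SYMV kernel: y := alpha * A * x + y, with A symmetric
 * and stored column-major. From the access pattern below (diagonal block
 * first, then the rows beneath it), only the lower triangle of A is read.
 *
 * A minimal C reference sketch of the computation, for reading the code.
 * This is a hypothetical helper, not part of the build; it assumes the
 * square case m == n and unit strides:
 *
 *   static void dsymv_lower_ref(long n, double alpha, const double *a,
 *                               long lda, const double *x, double *y) {
 *       for (long j = 0; j < n; j++) {
 *           double xj  = alpha * x[j];
 *           double sum = a[j + j * lda] * xj;             // diagonal term
 *           for (long i = j + 1; i < n; i++) {
 *               y[i] += xj * a[i + j * lda];              // column below diagonal
 *               sum  += a[i + j * lda] * (alpha * x[i]);  // mirrored row term
 *           }
 *           y[j] += sum;
 *       }
 *   }
 */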
#define ASSEMBLER
#include "common.h"

#ifdef ATOM
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#ifdef CORE2
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#ifdef NEHALEM
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 20)
#endif

#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 8)
#define movsd		movlpd
#endif

#if defined(BARCELONA) || defined(SHANGHAI)
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 16)
#endif

#ifdef NANO
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(8 * 24)
#endif

#ifdef GENERIC
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 20)
#endif

#ifndef WINDOWS_ABI

#define STACKSIZE	80

#define OLD_Y		8 + STACKSIZE(%rsp)
#define OLD_INCY	16 + STACKSIZE(%rsp)
#define OLD_BUFFER	24 + STACKSIZE(%rsp)

#define M	ARG1
#define N	ARG2
#define A	ARG3
#define LDA	ARG4
#define X	ARG5
#define INCX	ARG6

#else

#define STACKSIZE	256

#define OLD_LDA		40 + STACKSIZE(%rsp)
#define OLD_X		48 + STACKSIZE(%rsp)
#define OLD_INCX	56 + STACKSIZE(%rsp)
#define OLD_Y		64 + STACKSIZE(%rsp)
#define OLD_INCY	72 + STACKSIZE(%rsp)
#define OLD_BUFFER	80 + STACKSIZE(%rsp)

#define M	ARG1
#define N	ARG2
#define A	ARG4
#define LDA	ARG3
#define X	%rdi
#define INCX	%rsi

#endif

#define Y	%r10
#define INCY	%r11
#define BUFFER	%r12

#define TEMP	%rax
#define I	%rax
#define A1	%rbx
#define A2	%rbp
#define XX	%r13
#define YY	%r14
#define IS	%r15

#define NEW_X	BUFFER
#define NEW_Y	X

#define ALPHA	%xmm0

#define xtemp1	%xmm0
#define xtemp2	%xmm1
#define yy1	%xmm2
#define yy2	%xmm3

#define atemp1	%xmm4
#define atemp2	%xmm5
#define atemp3	%xmm6
#define atemp4	%xmm7

#define xsum1	%xmm8
#define xsum2	%xmm9
#define xsum3	%xmm10
#define xsum4	%xmm11

#define a1	%xmm12
#define a2	%xmm13
#define a3	%xmm14
#define xt1	%xmm15

	PROLOGUE
	PROFCODE

	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	OLD_LDA,  LDA
	movq	OLD_X,    X
	movq	OLD_INCX, INCX

	movaps	%xmm2, %xmm0
#endif

	movq	OLD_Y,      Y
	movq	OLD_INCY,   INCY
	movq	OLD_BUFFER, BUFFER

	leaq	(,INCX, SIZE), INCX
	leaq	(,INCY, SIZE), INCY
	leaq	(,LDA,  SIZE), LDA

	testq	M, M
	jle	.L999

	unpcklpd ALPHA, ALPHA

	movq	BUFFER, XX

	movq	M,  %rax
	sarq	$3, %rax
	jle	.L02
	ALIGN_3
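/* .L01/.L03: copy x into the 16-byte-aligned BUFFER, pre-scaled by alpha
 * (8 elements per iteration, then a scalar remainder loop), so the main
 * loops can use unit-stride aligned loads regardless of INCX. */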
.L01:
	movsd	0 * SIZE(X), %xmm1
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm1
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm2
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm2
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm3
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm3
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm4
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm4
	addq	INCX, X

	mulpd	ALPHA, %xmm1
	mulpd	ALPHA, %xmm2
	mulpd	ALPHA, %xmm3
	mulpd	ALPHA, %xmm4

	movapd	%xmm1, 0 * SIZE(XX)
	movapd	%xmm2, 2 * SIZE(XX)
	movapd	%xmm3, 4 * SIZE(XX)
	movapd	%xmm4, 6 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L01
	ALIGN_3

.L02:
	movq	M,  %rax
	andq	$7, %rax
	jle	.L05
	ALIGN_3

.L03:
	movsd	0 * SIZE(X), %xmm1
	addq	INCX, X

	mulsd	ALPHA, %xmm1

	movlpd	%xmm1, 0 * SIZE(XX)
	addq	$1 * SIZE, XX

	decq	%rax
	jg	.L03
	ALIGN_3

.L05:
	/* now we don't need original X */
	movq	Y, NEW_Y

	addq	$512, XX
	andq	$-512, XX

	cmpq	$SIZE, INCY
	je	.L10

	movq	Y,  YY
	movq	XX, NEW_Y

	movq	M,  %rax
	sarq	$3, %rax
	jle	.L07
	ALIGN_3

.L06:
	movsd	0 * SIZE(YY), %xmm0
	addq	INCY, YY
	movhpd	0 * SIZE(YY), %xmm0
	addq	INCY, YY
	movsd	0 * SIZE(YY), %xmm1
	addq	INCY, YY
	movhpd	0 * SIZE(YY), %xmm1
	addq	INCY, YY
	movsd	0 * SIZE(YY), %xmm2
	addq	INCY, YY
	movhpd	0 * SIZE(YY), %xmm2
	addq	INCY, YY
	movsd	0 * SIZE(YY), %xmm3
	addq	INCY, YY
	movhpd	0 * SIZE(YY), %xmm3
	addq	INCY, YY

	movapd	%xmm0, 0 * SIZE(XX)
	movapd	%xmm1, 2 * SIZE(XX)
	movapd	%xmm2, 4 * SIZE(XX)
	movapd	%xmm3, 6 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L06
	ALIGN_3

.L07:
	movq	M,  %rax
	andq	$7, %rax
	jle	.L10
	ALIGN_3

.L08:
	movsd	0 * SIZE(YY), %xmm0
	addq	INCY, YY

	movsd	%xmm0, 0 * SIZE(XX)
	addq	$1 * SIZE, XX

	decq	%rax
	jg	.L08
	ALIGN_3

.L10:
	xorq	IS, IS		# is = 0

	cmpq	$4, N
	jl	.L20
	ALIGN_3

.L11:
	movq	A,  A1
	leaq	(A, LDA, 2), A2
	leaq	4 * SIZE(A, LDA, 4), A

	leaq	(NEW_X, IS, SIZE), XX
	leaq	4 * SIZE(NEW_Y, IS, SIZE), YY

	movapd	0 * SIZE(XX), atemp2
	movapd	2 * SIZE(XX), atemp4

	movsd	0 * SIZE(A1), xsum1
	movhpd	1 * SIZE(A1), xsum1
	mulpd	atemp2, xsum1

	movsd	1 * SIZE(A1), xsum2
	movhpd	1 * SIZE(A1, LDA, 1), xsum2
	mulpd	atemp2, xsum2

	movsd	2 * SIZE(A1), xsum3
	movhpd	2 * SIZE(A1, LDA, 1), xsum3
	mulpd	atemp2, xsum3

	movsd	3 * SIZE(A1), xsum4
	movhpd	3 * SIZE(A1, LDA, 1), xsum4
	mulpd	atemp2, xsum4

	movsd	2 * SIZE(A1), a1
	movhpd	3 * SIZE(A1), a1
	mulpd	atemp4, a1
	addpd	a1, xsum1

	movsd	2 * SIZE(A1, LDA, 1), a1
	movhpd	3 * SIZE(A1, LDA, 1), a1
	mulpd	atemp4, a1
	addpd	a1, xsum2

	movsd	2 * SIZE(A2), a1
	movhpd	3 * SIZE(A2), a1
	mulpd	atemp4, a1
	addpd	a1, xsum3

	movsd	3 * SIZE(A2), a1
	movhpd	3 * SIZE(A2, LDA, 1), a1
	mulpd	atemp4, a1
	addpd	a1, xsum4

	movapd	4 * SIZE(XX), xtemp1
	movapd	6 * SIZE(XX), xtemp2

	movsd	4 * SIZE(A1), a1
	movhpd	5 * SIZE(A1), a1
	movsd	6 * SIZE(A1), a2
	movhpd	7 * SIZE(A1), a2
	movsd	4 * SIZE(A1, LDA, 1), a3
	movhpd	5 * SIZE(A1, LDA, 1), a3

	movsd	0 * SIZE(YY), yy1
	movhpd	1 * SIZE(YY), yy1
	movsd	2 * SIZE(YY), yy2
	movhpd	3 * SIZE(YY), yy2

#ifndef HAVE_SSE3
	movapd	atemp2, atemp1
	unpcklpd atemp1, atemp1
	unpckhpd atemp2, atemp2

	movapd	atemp4, atemp3
	unpcklpd atemp3, atemp3
	unpckhpd atemp4, atemp4
#else
	movddup	atemp2, atemp1
	unpckhpd atemp2, atemp2
	movddup	atemp4, atemp3
	unpckhpd atemp4, atemp4
#endif

	addq	$4 * SIZE, XX
	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2

	movq	M,  I
	subq	IS, I
	subq	$4, I
	sarq	$3, I
	jle	.L15
	ALIGN_3
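/* .L12: main loop for the current 4-column block, 8 rows per iteration.
 * Each xsum register accumulates one column's dot product with x (the
 * transposed half of the symmetric update), while yy1/yy2 accumulate the
 * direct A*x contributions, which stream straight back into the y buffer. */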
.L12:
	movapd	xtemp1, xt1
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy1
	movsd	2 * SIZE(A1, LDA, 1), a1
	movhpd	3 * SIZE(A1, LDA, 1), a1

	PREFETCH PREFETCHSIZE(A1)

	movapd	xtemp2, xt1
	mulpd	a2, xt1
	mulpd	atemp1, a2
	addpd	xt1, xsum1
	addpd	a2, yy2
	movsd	0 * SIZE(A2), a2
	movhpd	1 * SIZE(A2), a2

	movapd	xtemp1, xt1
	mulpd	a3, xt1
	mulpd	atemp2, a3
	addpd	xt1, xsum2
	addpd	a3, yy1
	movsd	2 * SIZE(A2), a3
	movhpd	3 * SIZE(A2), a3

#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
	PREFETCH PREFETCHSIZE(XX)
#endif

	movapd	xtemp2, xt1
	mulpd	a1, xt1
	mulpd	atemp2, a1
	addpd	xt1, xsum2
	addpd	a1, yy2
	movsd	0 * SIZE(A2, LDA, 1), a1
	movhpd	1 * SIZE(A2, LDA, 1), a1

	movapd	xtemp1, xt1
	mulpd	a2, xt1
	mulpd	atemp3, a2
	addpd	xt1, xsum3
	addpd	a2, yy1
	movsd	2 * SIZE(A2, LDA, 1), a2
	movhpd	3 * SIZE(A2, LDA, 1), a2

	PREFETCH PREFETCHSIZE(A1, LDA, 1)

	movapd	xtemp2, xt1
	mulpd	a3, xt1
	mulpd	atemp3, a3
	addpd	xt1, xsum3
	addpd	a3, yy2
	movsd	4 * SIZE(A1), a3
	movhpd	5 * SIZE(A1), a3

	movapd	xtemp1, xt1
	movapd	4 * SIZE(XX), xtemp1
	mulpd	a1, xt1
	mulpd	atemp4, a1
	addpd	xt1, xsum4
	addpd	a1, yy1
	movsd	6 * SIZE(A1), a1
	movhpd	7 * SIZE(A1), a1

	movapd	xtemp2, xt1
	movapd	6 * SIZE(XX), xtemp2
	mulpd	a2, xt1
	mulpd	atemp4, a2
	addpd	xt1, xsum4
	addpd	a2, yy2
	movsd	4 * SIZE(A1, LDA, 1), a2
	movhpd	5 * SIZE(A1, LDA, 1), a2

	movsd	yy1, 0 * SIZE(YY)
	movhpd	yy1, 1 * SIZE(YY)
	movsd	4 * SIZE(YY), yy1
	movhpd	5 * SIZE(YY), yy1

	movsd	yy2, 2 * SIZE(YY)
	movhpd	yy2, 3 * SIZE(YY)
	movsd	6 * SIZE(YY), yy2
	movhpd	7 * SIZE(YY), yy2

	movapd	xtemp1, xt1
	mulpd	a3, xt1
	mulpd	atemp1, a3
	addpd	xt1, xsum1
	addpd	a3, yy1
	movsd	6 * SIZE(A1, LDA, 1), a3
	movhpd	7 * SIZE(A1, LDA, 1), a3

	PREFETCH PREFETCHSIZE(A2)

	movapd	xtemp2, xt1
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy2
	movsd	4 * SIZE(A2), a1
	movhpd	5 * SIZE(A2), a1

	movapd	xtemp1, xt1
	mulpd	a2, xt1
	mulpd	atemp2, a2
	addpd	xt1, xsum2
	addpd	a2, yy1
	movsd	6 * SIZE(A2), a2
	movhpd	7 * SIZE(A2), a2

#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
	PREFETCHW PREFETCHSIZE(YY)
#endif

	movapd	xtemp2, xt1
	mulpd	a3, xt1
	mulpd	atemp2, a3
	addpd	xt1, xsum2
	addpd	a3, yy2
	movsd	4 * SIZE(A2, LDA, 1), a3
	movhpd	5 * SIZE(A2, LDA, 1), a3

	movapd	xtemp1, xt1
	mulpd	a1, xt1
	mulpd	atemp3, a1
	addpd	xt1, xsum3
	addpd	a1, yy1
	movsd	6 * SIZE(A2, LDA, 1), a1
	movhpd	7 * SIZE(A2, LDA, 1), a1

	PREFETCH PREFETCHSIZE(A2, LDA, 1)

	movapd	xtemp2, xt1
	mulpd	a2, xt1
	mulpd	atemp3, a2
	addpd	xt1, xsum3
	addpd	a2, yy2
	movsd	10 * SIZE(A1), a2
	movhpd	11 * SIZE(A1), a2

	movapd	xtemp1, xt1
	movapd	8 * SIZE(XX), xtemp1
	mulpd	a3, xt1
	mulpd	atemp4, a3
	addpd	xt1, xsum4
	addpd	a3, yy1
	movsd	8 * SIZE(A1, LDA, 1), a3
	movhpd	9 * SIZE(A1, LDA, 1), a3

	movapd	xtemp2, xt1
	movapd	10 * SIZE(XX), xtemp2
	mulpd	a1, xt1
	mulpd	atemp4, a1
	addpd	xt1, xsum4
	addpd	a1, yy2
	movsd	8 * SIZE(A1), a1
	movhpd	9 * SIZE(A1), a1

	movsd	yy1, 4 * SIZE(YY)
	movhpd	yy1, 5 * SIZE(YY)
	movsd	8 * SIZE(YY), yy1
	movhpd	9 * SIZE(YY), yy1

	movsd	yy2, 6 * SIZE(YY)
	movhpd	yy2, 7 * SIZE(YY)
	movsd	10 * SIZE(YY), yy2
	movhpd	11 * SIZE(YY), yy2

	addq	$8 * SIZE, XX
	addq	$8 * SIZE, YY
	addq	$8 * SIZE, A1
	addq	$8 * SIZE, A2

	decq	I
	jg	.L12
	ALIGN_3

.L15:
	movq	M,  I
	subq	IS, I
	subq	$4, I
	testq	$4, I
	jle	.L17

	movapd	xtemp1, xt1
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy1
	movsd	2 * SIZE(A1, LDA, 1), a1
	movhpd	3 * SIZE(A1, LDA, 1), a1

	movapd	xtemp2, xt1
	mulpd	a2, xt1
	mulpd	atemp1, a2
	addpd	xt1, xsum1
	addpd	a2, yy2
	movsd	0 * SIZE(A2), a2
	movhpd	1 * SIZE(A2), a2

	movapd	xtemp1, xt1
	mulpd	a3, xt1
	mulpd	atemp2, a3
	addpd	xt1, xsum2
	addpd	a3, yy1
	movsd	2 * SIZE(A2), a3
	movhpd	3 * SIZE(A2), a3

	movapd	xtemp2, xt1
	mulpd	a1, xt1
	mulpd	atemp2, a1
	addpd	xt1, xsum2
	addpd	a1, yy2
	movsd	0 * SIZE(A2, LDA, 1), a1
	movhpd	1 * SIZE(A2, LDA, 1), a1

	movapd	xtemp1, xt1
	mulpd	a2, xt1
	mulpd	atemp3, a2
	addpd	xt1, xsum3
	addpd	a2, yy1
	movsd	2 * SIZE(A2, LDA, 1), a2
	movhpd	3 * SIZE(A2, LDA, 1), a2

	movapd	xtemp2, xt1
	mulpd	a3, xt1
	mulpd	atemp3, a3
	addpd	xt1, xsum3
	addpd	a3, yy2
	movsd	4 * SIZE(A1, LDA, 1), a3
	movhpd	5 * SIZE(A1, LDA, 1), a3

	movapd	xtemp1, xt1
	movapd	4 * SIZE(XX), xtemp1
	mulpd	a1, xt1
	mulpd	atemp4, a1
	addpd	xt1, xsum4
	addpd	a1, yy1
	movsd	4 * SIZE(A1), a1
	movhpd	5 * SIZE(A1), a1

	movapd	xtemp2, xt1
	movapd	6 * SIZE(XX), xtemp2
	mulpd	a2, xt1
	mulpd	atemp4, a2
	addpd	xt1, xsum4
	addpd	a2, yy2
	movsd	6 * SIZE(A1), a2
	movhpd	7 * SIZE(A1), a2

	movsd	yy1, 0 * SIZE(YY)
	movhpd	yy1, 1 * SIZE(YY)
	movsd	4 * SIZE(YY), yy1
	movhpd	5 * SIZE(YY), yy1

	movsd	yy2, 2 * SIZE(YY)
	movhpd	yy2, 3 * SIZE(YY)
	movsd	6 * SIZE(YY), yy2
	movhpd	7 * SIZE(YY), yy2

	addq	$4 * SIZE, XX
	addq	$4 * SIZE, YY
	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	ALIGN_3
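/* Row remainders for the 4-column block: .L15 above handled 4 leftover
 * rows; .L17 handles 2 more and .L18 the final row, dropping to scalar
 * (sd) arithmetic where only one lane is live. */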
.L17:
	testq	$2, M
	jle	.L18

	movapd	xtemp1, xt1
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy1
	movsd	0 * SIZE(A1, LDA, 1), a1
	movhpd	1 * SIZE(A1, LDA, 1), a1

	movapd	xtemp1, xt1
	mulpd	a1, xt1
	mulpd	atemp2, a1
	addpd	xt1, xsum2
	addpd	a1, yy1
	movsd	0 * SIZE(A2), a1
	movhpd	1 * SIZE(A2), a1

	movapd	xtemp1, xt1
	mulpd	a1, xt1
	mulpd	atemp3, a1
	addpd	xt1, xsum3
	addpd	a1, yy1
	movsd	0 * SIZE(A2, LDA, 1), a1
	movhpd	1 * SIZE(A2, LDA, 1), a1

	movapd	xtemp1, xt1
	movapd	2 * SIZE(XX), xtemp1
	mulpd	a1, xt1
	mulpd	atemp4, a1
	addpd	xt1, xsum4
	addpd	a1, yy1
	movsd	2 * SIZE(A1), a1

	movsd	yy1, 0 * SIZE(YY)
	movhpd	yy1, 1 * SIZE(YY)
	movsd	2 * SIZE(YY), yy1

	addq	$2 * SIZE, XX
	addq	$2 * SIZE, YY
	addq	$2 * SIZE, A1
	addq	$2 * SIZE, A2
	ALIGN_3

.L18:
	testq	$1, M
	jle	.L19

	movapd	xtemp1, xt1
	mulsd	a1, xt1
	mulsd	atemp1, a1
	addsd	xt1, xsum1
	addpd	a1, yy1
	movsd	0 * SIZE(A1, LDA, 1), a1

	movapd	xtemp1, xt1
	mulsd	a1, xt1
	mulsd	atemp2, a1
	addsd	xt1, xsum2
	addsd	a1, yy1
	movsd	0 * SIZE(A2), a1

	movapd	xtemp1, xt1
	mulsd	a1, xt1
	mulsd	atemp3, a1
	addsd	xt1, xsum3
	addsd	a1, yy1
	movsd	0 * SIZE(A2, LDA, 1), a1

	movapd	xtemp1, xt1
	mulsd	a1, xt1
	mulsd	atemp4, a1
	addsd	xt1, xsum4
	addsd	a1, yy1

	movsd	yy1, 0 * SIZE(YY)
	ALIGN_3

.L19:
#ifndef HAVE_SSE3
	movapd	xsum1, atemp1
	movapd	xsum3, atemp3

	unpcklpd xsum2, xsum1
	unpcklpd xsum4, xsum3

	unpckhpd xsum2, atemp1
	unpckhpd xsum4, atemp3

	addpd	atemp1, xsum1
	addpd	atemp3, xsum3
#else
	haddpd	xsum2, xsum1
	haddpd	xsum4, xsum3
#endif

	movsd	0 * SIZE(NEW_Y, IS, SIZE), yy1
	movhpd	1 * SIZE(NEW_Y, IS, SIZE), yy1
	movsd	2 * SIZE(NEW_Y, IS, SIZE), yy2
	movhpd	3 * SIZE(NEW_Y, IS, SIZE), yy2

	addpd	xsum1, yy1
	addpd	xsum3, yy2

	movsd	yy1, 0 * SIZE(NEW_Y, IS, SIZE)
	movhpd	yy1, 1 * SIZE(NEW_Y, IS, SIZE)
	movsd	yy2, 2 * SIZE(NEW_Y, IS, SIZE)
	movhpd	yy2, 3 * SIZE(NEW_Y, IS, SIZE)

	addq	$4, IS

	movq	IS, I
	addq	$4, I
	cmpq	N,  I
	jle	.L11
	ALIGN_3

.L20:
	testq	$2, N
	jle	.L30

	movq	A, A1
	leaq	2 * SIZE(A, LDA, 2), A

	movapd	0 * SIZE(NEW_X, IS, SIZE), atemp2

	movsd	0 * SIZE(A1), xsum1
	movhpd	1 * SIZE(A1), xsum1
	mulpd	atemp2, xsum1

	movsd	1 * SIZE(A1), xsum2
	movhpd	1 * SIZE(A1, LDA, 1), xsum2
	mulpd	atemp2, xsum2

#ifndef HAVE_SSE3
	movapd	atemp2, atemp1
	unpcklpd atemp1, atemp1
#else
	movddup	atemp2, atemp1
#endif
	unpckhpd atemp2, atemp2

	testq	$1, M
	jle	.L29

	movsd	2 * SIZE(A1), a1
	movsd	2 * SIZE(A1, LDA, 1), a2

	movsd	2 * SIZE(NEW_X, IS, SIZE), xtemp1
	movsd	2 * SIZE(NEW_Y, IS, SIZE), yy1

	movapd	xtemp1, xt1
	mulsd	a1, xt1
	mulsd	atemp1, a1
	addsd	xt1, xsum1
	addpd	a1, yy1

	movapd	xtemp1, xt1
	mulsd	a2, xt1
	mulsd	atemp2, a2
	addsd	xt1, xsum2
	addsd	a2, yy1

	movsd	yy1, 2 * SIZE(NEW_Y, IS, SIZE)
	ALIGN_3

.L29:
#ifndef HAVE_SSE3
	movapd	xsum1, atemp1
	unpcklpd xsum2, xsum1
	unpckhpd xsum2, atemp1
	addpd	atemp1, xsum1
#else
	haddpd	xsum2, xsum1
#endif

	movsd	0 * SIZE(NEW_Y, IS, SIZE), yy1
	movhpd	1 * SIZE(NEW_Y, IS, SIZE), yy1

	addpd	xsum1, yy1

	movsd	yy1, 0 * SIZE(NEW_Y, IS, SIZE)
	movhpd	yy1, 1 * SIZE(NEW_Y, IS, SIZE)

	addq	$2, IS
	ALIGN_3

.L30:
	testq	$1, N
	jle	.L990

	movsd	0 * SIZE(A), xsum1

	movsd	0 * SIZE(NEW_X, IS, SIZE), atemp1
	movsd	0 * SIZE(NEW_Y, IS, SIZE), yy1

	mulsd	atemp1, xsum1
	addsd	xsum1, yy1

	movsd	yy1, 0 * SIZE(NEW_Y, IS, SIZE)
	ALIGN_3
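/* .L990: if INCY != 1 the result was accumulated in the aligned buffer
 * (NEW_Y); copy it back to the caller's y with stride INCY. With unit
 * stride the kernel wrote into y directly, so nothing remains to do. */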
.L990:
	cmpq	$SIZE, INCY
	je	.L999

	movq	M,  %rax
	sarq	$3, %rax
	jle	.L997
	ALIGN_3

.L996:
	movapd	0 * SIZE(NEW_Y), %xmm0
	movapd	2 * SIZE(NEW_Y), %xmm1
	movapd	4 * SIZE(NEW_Y), %xmm2
	movapd	6 * SIZE(NEW_Y), %xmm3

	movsd	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm1, 0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm1, 0 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm2, 0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm2, 0 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm3, 0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm3, 0 * SIZE(Y)
	addq	INCY, Y

	addq	$8 * SIZE, NEW_Y
	decq	%rax
	jg	.L996
	ALIGN_3

.L997:
	movq	M,  %rax
	andq	$7, %rax
	jle	.L999
	ALIGN_3

.L998:
	movsd	0 * SIZE(NEW_Y), %xmm0

	movsd	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y

	addq	$1 * SIZE, NEW_Y

	decq	%rax
	jg	.L998
	ALIGN_3

.L999:
	movq	 0(%rsp), %rbx
	movq	 8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret
	EPILOGUE