/*
 * Tuning parameters and SSE load/store helper macros, selected by the
 * target-CPU macro that is defined when this header is included.
 * Going by the guard name and the GEMV_UNROLL / ZGEMV_UNROLL defaults,
 * this is presumably meant for GEMV kernels - confirm against the includers.
 *
 * The include guard makes a second inclusion a no-op.
 */
#ifndef GEMV_PARAM_H
#define GEMV_PARAM_H
/*
 * Mnemonic remapping.
 * movsd : any earlier macro definition is dropped (#undef of a name that
 *         is not defined is a no-op, so no #ifdef guard is needed).
 * movapd: every later use of the token expands to movaps.
 */
#undef movsd

#undef movapd
#define movapd	movaps
/*
 * ATHLON parameter set.
 * All five MOVUPS_* selectors map to movaps (the alignment-requiring move),
 * and ALIGNED_ACCESS is defined as a value-less flag; it is not tested in
 * this header, so presumably the including kernels read it.
 * PREFETCHSIZE expands to an unparenthesized product.
 * PREFETCHW is not defined for this target.
 */
#if defined(ATHLON)
#define ALIGNED_ACCESS

#define PREFETCH	prefetcht0
#define PREFETCHSIZE	64 * 3

#define MOVUPS_A	movaps
#define MOVUPS_XL	movaps
#define MOVUPS_XS	movaps
#define MOVUPS_YL	movaps
#define MOVUPS_YS	movaps
#endif
/*
 * PENTIUM4 parameter set.
 * All five MOVUPS_* selectors map to movaps (the alignment-requiring move),
 * and ALIGNED_ACCESS is defined as a value-less flag; it is not tested in
 * this header, so presumably the including kernels read it.
 * PREFETCHSIZE expands to an unparenthesized product.
 * PREFETCHW is not defined for this target.
 */
#if defined(PENTIUM4)
#define ALIGNED_ACCESS

#define PREFETCH	prefetcht0
#define PREFETCHSIZE	64 * 2

#define MOVUPS_A	movaps
#define MOVUPS_XL	movaps
#define MOVUPS_XS	movaps
#define MOVUPS_YL	movaps
#define MOVUPS_YS	movaps
#endif
/*
 * CORE2 and PENRYN parameter set (the two targets use identical values,
 * so they share one conditional).
 * All five MOVUPS_* selectors map to movaps (the alignment-requiring move),
 * and ALIGNED_ACCESS is defined as a value-less flag; it is not tested in
 * this header, so presumably the including kernels read it.
 * PREFETCHSIZE expands to an unparenthesized product.
 * PREFETCHW is not defined for these targets.
 */
#if defined(CORE2) || defined(PENRYN)
#define ALIGNED_ACCESS

#define PREFETCH	prefetcht0
#define PREFETCHSIZE	64 * 4

#define MOVUPS_A	movaps
#define MOVUPS_XL	movaps
#define MOVUPS_XS	movaps
#define MOVUPS_YL	movaps
#define MOVUPS_YS	movaps
#endif
/*
 * NEHALEM parameter set.
 * All five MOVUPS_* selectors map to movups (no alignment requirement) and
 * ALIGNED_ACCESS is left undefined.
 * PREFETCH and PREFETCHW both map to prefetcht0.
 * PREFETCHSIZE expands to an unparenthesized product.
 */
#if defined(NEHALEM)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	64 * 3

#define MOVUPS_A	movups
#define MOVUPS_XL	movups
#define MOVUPS_XS	movups
#define MOVUPS_YL	movups
#define MOVUPS_YS	movups
#endif
/*
 * OPTERON parameter set.
 * No MOVUPS_* selector is defined here, so the MOVUPS_*1 / MOVUPS_A2 helper
 * macros take their two-instruction fallback form.
 * movsd is remapped to movlps, which also affects those fallback forms.
 * PREFETCHSIZE has the same value whether or not COMPLEX is defined, and
 * expands to an unparenthesized product.
 */
#if defined(OPTERON)
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	64 * 1

#define movsd		movlps
#endif
/*
 * BARCELONA / SHANGHAI parameter set.
 * All five MOVUPS_* selectors map to movaps (the alignment-requiring move),
 * and ALIGNED_ACCESS is defined as a value-less flag; it is not tested in
 * this header, so presumably the including kernels read it.
 * PREFETCHSIZE depends on COMPLEX and expands to an unparenthesized product.
 */
#if defined(BARCELONA) || defined(SHANGHAI)
#define ALIGNED_ACCESS

#define PREFETCH	prefetch
#define PREFETCHW	prefetchw

#ifdef COMPLEX
#define PREFETCHSIZE	64 * 4
#else
#define PREFETCHSIZE	64 * 2
#endif

#define MOVUPS_A	movaps
#define MOVUPS_XL	movaps
#define MOVUPS_XS	movaps
#define MOVUPS_YL	movaps
#define MOVUPS_YS	movaps
#endif
/*
 * NANO parameter set.
 * All five MOVUPS_* selectors map to movaps (the alignment-requiring move),
 * and ALIGNED_ACCESS is defined as a value-less flag; it is not tested in
 * this header, so presumably the including kernels read it.
 * PREFETCHSIZE depends on COMPLEX and expands to an unparenthesized product.
 * PREFETCHW is not defined for this target.
 */
#if defined(NANO)
#define ALIGNED_ACCESS

#define PREFETCH	prefetcht0

#ifdef COMPLEX
#define PREFETCHSIZE	64 * 2
#else
#define PREFETCHSIZE	64 * 1
#endif

#define MOVUPS_A	movaps
#define MOVUPS_XL	movaps
#define MOVUPS_XS	movaps
#define MOVUPS_YL	movaps
#define MOVUPS_YS	movaps
#endif
/*
 * PREOFFSET default, in priority order:
 *   1. a value already defined before this point is kept;
 *   2. otherwise half of L1_DATA_LINESIZE when that macro is defined;
 *   3. otherwise 32.
 */
#if defined(PREOFFSET)
/* keep the existing definition */
#elif defined(L1_DATA_LINESIZE)
#define PREOFFSET	(L1_DATA_LINESIZE >> 1)
#else
#define PREOFFSET	32
#endif
/*
 * Unroll-factor defaults. Each one applies only when the name has not been
 * defined before this point, so an earlier definition takes precedence.
 */
#if !defined(GEMV_UNROLL)
#define GEMV_UNROLL	4
#endif

#if !defined(ZGEMV_UNROLL)
#define ZGEMV_UNROLL	4
#endif
/* #define COPY_FORCE */ /* Always copy X or Y to the buffer */
/* #define NOCOPY_UNALIGNED */ /* Do not copy if X or Y is not aligned */
/*
 * 16-byte loads through the MOVUPS_A selector.
 *   MOVUPS_A1(OFF, ADDR, REGS)              load from OFF(ADDR)
 *   MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) load from OFF(ADDR, BASE, SCALE)
 *
 * When MOVUPS_A is not defined, the load is split in two: movsd fills the
 * low 64 bits from OFF and movhps fills the high 64 bits from OFF + 8.
 * OFF is pasted in front of "+ 8" without parentheses.
 */
#ifndef MOVUPS_A

#define MOVUPS_A1(OFF, ADDR, REGS) \
	movsd	OFF(ADDR), REGS; \
	movhps	OFF + 8(ADDR), REGS

#define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) \
	movsd	OFF(ADDR, BASE, SCALE), REGS; \
	movhps	OFF + 8(ADDR, BASE, SCALE), REGS

#else

#define MOVUPS_A1(OFF, ADDR, REGS) \
	MOVUPS_A	OFF(ADDR), REGS

#define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) \
	MOVUPS_A	OFF(ADDR, BASE, SCALE), REGS

#endif
/*
 * Half-swapped 16-byte loads: movsd fills the low 64 bits of REGS from
 * OFF + 8 and movhps fills the high 64 bits from OFF, i.e. the two halves
 * arrive in the opposite order to the split form of MOVUPS_A1 / MOVUPS_A2.
 * Always two instructions; no selector macro is consulted.
 */
#define MOVRPS_A1(OFF, ADDR, REGS) \
	movsd	OFF + 8(ADDR), REGS; \
	movhps	OFF(ADDR), REGS

#define MOVRPS_A2(OFF, ADDR, BASE, SCALE, REGS) \
	movsd	OFF + 8(ADDR, BASE, SCALE), REGS; \
	movhps	OFF(ADDR, BASE, SCALE), REGS
/*
 * MOVUPS_XL1(OFF, ADDR, REGS): 16-byte load from OFF(ADDR) into REGS.
 * Uses the MOVUPS_XL selector when it is defined; otherwise splits the load
 * into movsd (low 64 bits, from OFF) and movhps (high 64 bits, from OFF + 8).
 */
#ifndef MOVUPS_XL

#define MOVUPS_XL1(OFF, ADDR, REGS) \
	movsd	OFF(ADDR), REGS; \
	movhps	OFF + 8(ADDR), REGS

#else

#define MOVUPS_XL1(OFF, ADDR, REGS) \
	MOVUPS_XL	OFF(ADDR), REGS

#endif
/*
 * MOVUPS_XS1(OFF, ADDR, REGS): 16-byte store of REGS to OFF(ADDR).
 * Uses the MOVUPS_XS selector when it is defined; otherwise splits the store
 * into movsd (low 64 bits, to OFF) and movhps (high 64 bits, to OFF + 8).
 */
#ifndef MOVUPS_XS

#define MOVUPS_XS1(OFF, ADDR, REGS) \
	movsd	REGS, OFF(ADDR); \
	movhps	REGS, OFF + 8(ADDR)

#else

#define MOVUPS_XS1(OFF, ADDR, REGS) \
	MOVUPS_XS	REGS, OFF(ADDR)

#endif
/*
 * MOVUPS_YL1(OFF, ADDR, REGS): 16-byte load from OFF(ADDR) into REGS.
 * Uses the MOVUPS_YL selector when it is defined; otherwise splits the load
 * into movsd (low 64 bits, from OFF) and movhps (high 64 bits, from OFF + 8).
 */
#ifndef MOVUPS_YL

#define MOVUPS_YL1(OFF, ADDR, REGS) \
	movsd	OFF(ADDR), REGS; \
	movhps	OFF + 8(ADDR), REGS

#else

#define MOVUPS_YL1(OFF, ADDR, REGS) \
	MOVUPS_YL	OFF(ADDR), REGS

#endif
/*
 * MOVUPS_YS1(OFF, ADDR, REGS): 16-byte store of REGS to OFF(ADDR).
 * Uses the MOVUPS_YS selector when it is defined; otherwise splits the store
 * into movsd (low 64 bits, to OFF) and movhps (high 64 bits, to OFF + 8).
 */
#ifndef MOVUPS_YS

#define MOVUPS_YS1(OFF, ADDR, REGS) \
	movsd	REGS, OFF(ADDR); \
	movhps	REGS, OFF + 8(ADDR)

#else

#define MOVUPS_YS1(OFF, ADDR, REGS) \
	MOVUPS_YS	REGS, OFF(ADDR)

#endif
#endif /* GEMV_PARAM_H */