;
; jsimdext.inc - common declarations
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2010, 2016, 2019, D. R. Commander.
; Copyright (C) 2018, Matthieu Darbois.
;
; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
;
; Copyright (C) 1999-2006, MIYASAKA Masaru.
;
; This software is provided 'as-is', without any express or implied
; warranty.  In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
;    claim that you wrote the original software. If you use this software
;    in a product, an acknowledgment in the product documentation would be
;    appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
;    misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.

; ==========================================================================
;  System-dependent configurations
;
;  One branch of the chain below is chosen by the object-format symbol
;  passed on the assembler command line (-DWIN32, -DWIN64, -DOBJ32, -DELF,
;  -DAOUT or -DMACHO); when none of them is defined the generic branch at
;  the end is used.  Every branch defines
;    SEG_TEXT    section name + attributes used for code
;    SEG_CONST   section name + attributes used for constant data
;  and some branches additionally define
;    EXTN(name)  external symbol spelling (WIN64 and ELF: no decoration)
;    GOT_SYMBOL  PIC base symbol (ELF, AOUT and MACHO only)
;    PIC         forced on by the MACHO branch
;  Branches that leave EXTN undefined get the underscore-prefixed default
;  that this file defines further down.

%ifdef WIN32    ; ----(nasm -fwin32 -DWIN32 ...)--------
; * Microsoft Visual C++
; * MinGW (Minimalist GNU for Windows)
; * CygWin
; * LCC-Win32

; -- segment definition --
;
%ifdef __YASM_VER__
%define SEG_TEXT   .text  align=32
%define SEG_CONST  .rdata align=32
%else
%define SEG_TEXT   .text  align=32 public use32 class=CODE
%define SEG_CONST  .rdata align=32 public use32 class=CONST
%endif

%elifdef WIN64  ; ----(nasm -fwin64 -DWIN64 ...)--------
; * Microsoft Visual C++

; -- segment definition --
;
%ifdef __YASM_VER__
%define SEG_TEXT    .text  align=32
%define SEG_CONST   .rdata align=32
%else
%define SEG_TEXT    .text  align=32 public use64 class=CODE
%define SEG_CONST   .rdata align=32 public use64 class=CONST
%endif
%define EXTN(name)  name                ; foo() -> foo

%elifdef OBJ32  ; ----(nasm -fobj -DOBJ32 ...)----------
; * Borland C++ (Win32)

; -- segment definition --
;
%define SEG_TEXT   _text align=32 public use32 class=CODE
%define SEG_CONST  _data align=32 public use32 class=DATA

%elifdef ELF    ; ----(nasm -felf[64] -DELF ...)------------
; * Linux
; * *BSD family Unix using elf format
; * Unix System V, including Solaris x86, UnixWare and SCO Unix

; mark stack as non-executable
section .note.GNU-stack noalloc noexec nowrite progbits

; -- segment definition --
;
%ifdef __x86_64__
%define SEG_TEXT   .text   progbits align=32
%define SEG_CONST  .rodata progbits align=32
%else
%define SEG_TEXT   .text   progbits alloc exec   nowrite align=32
%define SEG_CONST  .rodata progbits alloc noexec nowrite align=32
%endif

; To make the code position-independent, append -DPIC to the commandline
;
%define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_  ; ELF supports PIC
%define EXTN(name)  name                   ; foo() -> foo

%elifdef AOUT   ; ----(nasm -faoutb/aout -DAOUT ...)----
; * Older Linux using a.out format  (nasm -f aout -DAOUT ...)
; * *BSD family Unix using a.out format  (nasm -f aoutb -DAOUT ...)

; -- segment definition --
;
%define SEG_TEXT   .text
%define SEG_CONST  .data

; To make the code position-independent, append -DPIC to the commandline
;
%define GOT_SYMBOL  __GLOBAL_OFFSET_TABLE_  ; BSD-style a.out supports PIC

%elifdef MACHO  ; ----(nasm -fmacho -DMACHO ...)--------
; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)

; -- segment definition --
;
%define SEG_TEXT   .text  ;align=32     ; nasm doesn't accept align=32. why?
%define SEG_CONST  .rodata align=32

; The generation of position-independent code (PIC) is the default on Darwin.
;
%define PIC
%define GOT_SYMBOL  _MACHO_PIC_         ; Mach-O style code-relative addressing

%else           ; ----(Other case)----------------------

; -- segment definition --
;
%define SEG_TEXT   .text
%define SEG_CONST  .data

%endif          ; ----------------------------------------------

; ==========================================================================

; --------------------------------------------------------------------------
;  Common types
;
;  Each type comes as a triple: the NASM size keyword, SIZEOF_<type> in
;  bytes and <type>_BIT in bits.  The SIZEOF_xWORD / xWORD_BIT constants the
;  triples refer to are defined at the bottom of this section; that works
;  because single-line macros are expanded where they are used, not where
;  they are defined.
;
%ifdef __x86_64__
%define POINTER         qword           ; general pointer type
%define SIZEOF_POINTER  SIZEOF_QWORD    ; sizeof(POINTER)
%define POINTER_BIT     QWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
%else
%define POINTER         dword           ; general pointer type
%define SIZEOF_POINTER  SIZEOF_DWORD    ; sizeof(POINTER)
%define POINTER_BIT     DWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
%endif

%define INT             dword           ; signed integer type
%define SIZEOF_INT      SIZEOF_DWORD    ; sizeof(INT)
%define INT_BIT         DWORD_BIT       ; sizeof(INT)*BYTE_BIT

%define FP32            dword           ; IEEE754 single
%define SIZEOF_FP32     SIZEOF_DWORD    ; sizeof(FP32)
%define FP32_BIT        DWORD_BIT       ; sizeof(FP32)*BYTE_BIT

%define MMWORD          qword           ; int64  (MMX register)
%define SIZEOF_MMWORD   SIZEOF_QWORD    ; sizeof(MMWORD)
%define MMWORD_BIT      QWORD_BIT       ; sizeof(MMWORD)*BYTE_BIT

; NASM is buggy and doesn't properly handle operand sizes for SSE
; instructions, so for now we have to define XMMWORD as blank.
; (XMMWORD and YMMWORD therefore expand to nothing: operands written with
; them carry no explicit size keyword.)
%define XMMWORD                         ; int128 (SSE register)
%define SIZEOF_XMMWORD  SIZEOF_OWORD    ; sizeof(XMMWORD)
%define XMMWORD_BIT     OWORD_BIT       ; sizeof(XMMWORD)*BYTE_BIT

%define YMMWORD                         ; int256 (AVX register)
%define SIZEOF_YMMWORD  SIZEOF_YWORD    ; sizeof(YMMWORD)
%define YMMWORD_BIT     YWORD_BIT       ; sizeof(YMMWORD)*BYTE_BIT

; Similar hacks for when we load a dword or MMWORD into an xmm# register
; (both are likewise defined as blank)
%define XMM_DWORD
%define XMM_MMWORD

%define SIZEOF_BYTE   1                 ; sizeof(byte)
%define SIZEOF_WORD   2                 ; sizeof(word)
%define SIZEOF_DWORD  4                 ; sizeof(dword)
%define SIZEOF_QWORD  8                 ; sizeof(qword)
%define SIZEOF_OWORD  16                ; sizeof(oword)
%define SIZEOF_YWORD  32                ; sizeof(yword)

%define BYTE_BIT      8                 ; CHAR_BIT in C
%define WORD_BIT      16                ; sizeof(word)*BYTE_BIT
%define DWORD_BIT     32                ; sizeof(dword)*BYTE_BIT
%define QWORD_BIT     64                ; sizeof(qword)*BYTE_BIT
%define OWORD_BIT     128               ; sizeof(oword)*BYTE_BIT
%define YWORD_BIT     256               ; sizeof(yword)*BYTE_BIT

; --------------------------------------------------------------------------
;  External Symbol Name
;
;  Default decoration for symbols shared with C code.  It only takes effect
;  when the platform branch above did not already define EXTN (the WIN64
;  and ELF branches define it without a prefix).
;
%ifndef EXTN
%define EXTN(name)  _ %+ name           ; foo() -> _foo
%endif

; --------------------------------------------------------------------------
;  Hidden symbols
;
;  GLOBAL_FUNCTION(name) / GLOBAL_DATA(name) emit the `global` directive for
;  EXTN(name).  Where the visibility qualifiers below are available the
;  symbol is additionally marked hidden (ELF) or private_extern (Mach-O);
;  in every other case the plain `global` fallback at the end is used.
;
%ifdef ELF      ; ----(nasm -felf[64] -DELF ...)--------
%define GLOBAL_FUNCTION(name)  global EXTN(name):function hidden
%define GLOBAL_DATA(name)      global EXTN(name):data hidden
%elifdef MACHO  ; ----(nasm -fmacho -DMACHO ...)--------
%ifdef __YASM_VER__
%define GLOBAL_FUNCTION(name)  global EXTN(name):private_extern
%define GLOBAL_DATA(name)      global EXTN(name):private_extern
%else
; private_extern is only used when the NASM version id is at least
; 0x020E0000 (2.14); older NASM falls through to the plain definitions.
%if __NASM_VERSION_ID__ >= 0x020E0000
%define GLOBAL_FUNCTION(name)  global EXTN(name):private_extern
%define GLOBAL_DATA(name)      global EXTN(name):private_extern
%endif
%endif
%endif

; Fallback: plain global symbols, no visibility qualifier.
%ifndef GLOBAL_FUNCTION
%define GLOBAL_FUNCTION(name)  global EXTN(name)
%endif
%ifndef GLOBAL_DATA
%define GLOBAL_DATA(name)      global EXTN(name)
%endif

; --------------------------------------------------------------------------
;  Macros for position-independent code (PIC) support
;
;  PIC is only honoured when the selected platform branch defined
;  GOT_SYMBOL; otherwise a -DPIC given on the command line is cancelled
;  here so that the non-PIC macro set is used.
;
%ifndef GOT_SYMBOL
%undef PIC
%endif

; PIC helper macros.  Three variants of the same interface are provided:
;
;   GOTOFF(got, sym)  operand expression addressing `sym` through the base
;                     register `got`
;   get_GOT reg       load the PIC base into `reg`
;   pushpic / poppic / movpic
;                     push / pop / mov that are only emitted in PIC builds
;
; In non-PIC builds GOTOFF(got, sym) is just (sym) and the four macros
; expand to nothing.  Both get_GOT implementations use 32-bit registers
; (esp, ecx, ebx, ebp).

%ifdef PIC  ; -------------------------------------------

%ifidn GOT_SYMBOL, _MACHO_PIC_  ; --------------------

; At present, nasm doesn't seem to support PIC generation for Mach-O.
; The PIC support code below is a little tricky.
;
; Instead of a GOT, the label const_base (placed in SEG_CONST at the point
; where this file is included) serves as the base: GOTOFF() addresses a
; symbol by its distance from const_base, and get_GOT computes the run-time
; address of const_base.

    SECTION     SEG_CONST
const_base:

%define GOTOFF(got, sym)  (got) + (sym) - const_base

; get_GOT reg : reg = run-time address of const_base.
;
; Steps:
;   1. call/ret thunk: ecx = return address, i.e. the address of the `add`
;      that follows the call.
;   2. add ecx, (%%ref - $): ecx = address of %%ref.
;   3. The two db bytes followed by the assembler-generated `jmp near`
;      (E9 + rel32, rel32 = const_base - %%ref) form one `lea` instruction
;      with displacement (const_base - %%ref), base ecx and index ebp*8.
;      ebp is zeroed first so the index contributes nothing; the result is
;      the address of const_base.
; ebp is saved and restored around the sequence; ecx is clobbered.
%imacro get_GOT 1
    ; NOTE: this macro destroys ecx register.
    call        %%geteip
    add         ecx, byte (%%ref - $)
    jmp         short %%adjust
%%geteip:
    mov         ecx, POINTER [esp]
    ret
%%adjust:
    push        ebp
    xor         ebp, ebp                ; ebp = 0
%ifidni %1, ebx  ; (%1 == ebx)
    ; db 0x8D,0x9C + jmp near const_base =
    ;   lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
    db          0x8D, 0x9C              ; 8D,9C
    jmp         near const_base         ; E9,(const_base-%%ref)
%%ref:
%else  ; (%1 != ebx)
    ; db 0x8D,0x8C + jmp near const_base =
    ;   lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
    db          0x8D, 0x8C              ; 8D,8C
    jmp         near const_base         ; E9,(const_base-%%ref)
%%ref:
    mov         %1, ecx
%endif  ; (%1 == ebx)
    pop         ebp
%endmacro

%else     ; GOT_SYMBOL != _MACHO_PIC_ ----------------

%define GOTOFF(got, sym)  (got) + (sym) wrt ..gotoff

; get_GOT reg : reg = address of GOT_SYMBOL.
; The call/ret thunk loads the return address (the address of the `add`)
; into reg; the `add` then applies the ..gotpc-relative offset of
; GOT_SYMBOL.  `jmp short %%done` skips over the thunk body.
%imacro get_GOT 1
    extern      GOT_SYMBOL
    call        %%geteip
    add         %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
    jmp         short %%done
%%geteip:
    mov         %1, POINTER [esp]
    ret
%%done:
%endmacro

%endif    ; GOT_SYMBOL == _MACHO_PIC_ ----------------

; PIC builds: these are ordinary push / pop / mov.
%imacro pushpic 1.nolist
    push        %1
%endmacro
%imacro poppic  1.nolist
    pop         %1
%endmacro
%imacro movpic  2.nolist
    mov         %1, %2
%endmacro

%else    ; !PIC -----------------------------------------

%define GOTOFF(got, sym)  (sym)

; Non-PIC builds: every PIC helper expands to nothing.
%imacro get_GOT 1.nolist
%endmacro
%imacro pushpic 1.nolist
%endmacro
%imacro poppic  1.nolist
%endmacro
%imacro movpic  2.nolist
%endmacro

%endif   ;  PIC -----------------------------------------

; --------------------------------------------------------------------------
;  Align the next instruction on {2,4,8,16,..}-byte boundary.
;  ".balign n,,m" in GNU as
;
;  alignx n [, m]
;    n  alignment in bytes; must be a power of two (FILLB masks with n-1)
;    m  optional upper limit on the number of padding bytes (default
;       0xFFFF); if more padding than m would be needed, nothing is emitted
;
;  Helper expressions:
;    MSKLE(x, y)  mask with all low-order bits set when x <= y, 0 when
;                 x > y (both operands are taken modulo 0x10000)
;    FILLB(b, n)  number of bytes from position b up to the next multiple
;                 of n, measured from the start of the current section ($$)
;
;  Padding strategy: if 16 or more bytes are still missing they are all
;  filled with one-byte nops; otherwise the gap is covered with the longest
;  fitting no-op encodings from the list below (7, 7, 6, 4, 3, 2, 1 bytes).
;  Each `times` count re-evaluates FILLB($, n), so it always sees what is
;  left after the previous lines.  The 7-byte form is only used first while
;  at least 9 bytes remain.
;
;  NOTE: do not add comments to the `\`-continued lines inside the macro.
;
%define MSKLE(x, y)  (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
%define FILLB(b, n)  (($$-(b)) & ((n)-1))

%imacro alignx 1-2.nolist 0xFFFF
%%bs: \
  times MSKLE(FILLB(%%bs, %1), %2) & MSKLE(16, FILLB($, %1)) & FILLB($, %1) \
        db 0x90                                      ; nop
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 9 \
        db 0x8D, 0x9C, 0x23, 0x00, 0x00, 0x00, 0x00  ; lea ebx,[ebx+0x00000000]
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 7 \
        db 0x8D, 0xAC, 0x25, 0x00, 0x00, 0x00, 0x00  ; lea ebp,[ebp+0x00000000]
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 6 \
        db 0x8D, 0xAD, 0x00, 0x00, 0x00, 0x00        ; lea ebp,[ebp+0x00000000]
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 4 \
        db 0x8D, 0x6C, 0x25, 0x00                    ; lea ebp,[ebp+0x00]
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 3 \
        db 0x8D, 0x6D, 0x00                          ; lea ebp,[ebp+0x00]
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 2 \
        db 0x8B, 0xED                                ; mov ebp,ebp
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 1 \
        db 0x90                                      ; nop
%endmacro

; Align the next data on {2,4,8,16,..}-byte boundary.
;
; alignz n : thin wrapper around NASM's `align` that pads with zero bytes.
;
%imacro alignz 1.nolist
    align       %1, db 0                ; filling zeros
%endmacro

; --------------------------------------------------------------------------
;  x86-64 argument collection
;
;  collect_args n    copies the first n (1..6) integer arguments into
;                    r10, r11, r12, r13, r14, r15 (in that order), saving
;                    registers on the stack first as listed per variant
;  uncollect_args n  undoes collect_args; must be given the same n
;  push_xmm n        Win64: saves xmm8..xmm(7+n), n = 1..4, on the stack;
;                    elsewhere: expands to nothing
;  pop_xmm n         counterpart of push_xmm; must be given the same n
;
%ifdef __x86_64__

%ifdef WIN64

; Win64 variant.  Arguments 1-4 come from rcx, rdx, r8, r9; arguments 5 and
; 6 are read from [rax+48] and [rax+56].
; NOTE(review): rax is not set by this macro - the invoking code must have
; loaded it beforehand (presumably with the stack pointer as it was right
; after the function's first push, which would make [rax+48] the fifth
; argument slot) - confirm against callers.
; Saved on the stack: xmm6, xmm7, then r12..r15 as needed, then rsi, rdi.
; movaps is used for the xmm saves, so rsp has to be 16-byte aligned when
; the macro is invoked.
%imacro collect_args 1
    sub         rsp, SIZEOF_XMMWORD
    movaps      XMMWORD [rsp], xmm6
    sub         rsp, SIZEOF_XMMWORD
    movaps      XMMWORD [rsp], xmm7
    mov         r10, rcx
%if %1 > 1
    mov         r11, rdx
%endif
%if %1 > 2
    push        r12
    mov         r12, r8
%endif
%if %1 > 3
    push        r13
    mov         r13, r9
%endif
%if %1 > 4
    push        r14
    mov         r14, [rax+48]
%endif
%if %1 > 5
    push        r15
    mov         r15, [rax+56]
%endif
    push        rsi
    push        rdi
%endmacro

; Restores, in reverse order, everything the Win64 collect_args saved.
%imacro uncollect_args 1
    pop         rdi
    pop         rsi
%if %1 > 5
    pop         r15
%endif
%if %1 > 4
    pop         r14
%endif
%if %1 > 3
    pop         r13
%endif
%if %1 > 2
    pop         r12
%endif
    movaps      xmm7, XMMWORD [rsp]
    add         rsp, SIZEOF_XMMWORD
    movaps      xmm6, XMMWORD [rsp]
    add         rsp, SIZEOF_XMMWORD
%endmacro

; Reserves n 16-byte slots and stores xmm8..xmm11 (as many as requested)
; into them.  rsp must be 16-byte aligned here because of movaps.
; A count of 0 emits nothing; without the guard the xmm8 store would
; overwrite the 16 bytes at the current top of the stack.
%imacro push_xmm 1
%if %1 > 0
    sub         rsp, (%1) * SIZEOF_XMMWORD
    movaps      XMMWORD [rsp+0*SIZEOF_XMMWORD], xmm8
%endif
%if %1 > 1
    movaps      XMMWORD [rsp+1*SIZEOF_XMMWORD], xmm9
%endif
%if %1 > 2
    movaps      XMMWORD [rsp+2*SIZEOF_XMMWORD], xmm10
%endif
%if %1 > 3
    movaps      XMMWORD [rsp+3*SIZEOF_XMMWORD], xmm11
%endif
%endmacro

; Reloads the registers stored by push_xmm and releases the slots.
; A count of 0 emits nothing, mirroring push_xmm.
%imacro pop_xmm 1
%if %1 > 0
    movaps      xmm8, XMMWORD [rsp+0*SIZEOF_XMMWORD]
%endif
%if %1 > 1
    movaps      xmm9, XMMWORD [rsp+1*SIZEOF_XMMWORD]
%endif
%if %1 > 2
    movaps      xmm10, XMMWORD [rsp+2*SIZEOF_XMMWORD]
%endif
%if %1 > 3
    movaps      xmm11, XMMWORD [rsp+3*SIZEOF_XMMWORD]
%endif
%if %1 > 0
    add         rsp, (%1) * SIZEOF_XMMWORD
%endif
%endmacro

%else  ; !WIN64

; Non-Windows x86-64 variant.  Arguments come from rdi, rsi, rdx, rcx, r8,
; r9 (System V AMD64 order); each destination register is pushed before it
; is overwritten.
%imacro collect_args 1
    push        r10
    mov         r10, rdi
%if %1 > 1
    push        r11
    mov         r11, rsi
%endif
%if %1 > 2
    push        r12
    mov         r12, rdx
%endif
%if %1 > 3
    push        r13
    mov         r13, rcx
%endif
%if %1 > 4
    push        r14
    mov         r14, r8
%endif
%if %1 > 5
    push        r15
    mov         r15, r9
%endif
%endmacro

; Pops r15..r10 in the reverse of the order collect_args pushed them.
%imacro uncollect_args 1
%if %1 > 5
    pop         r15
%endif
%if %1 > 4
    pop         r14
%endif
%if %1 > 3
    pop         r13
%endif
%if %1 > 2
    pop         r12
%endif
%if %1 > 1
    pop         r11
%endif
    pop         r10
%endmacro

; Intentionally empty outside Win64.
%imacro push_xmm 1
%endmacro

%imacro pop_xmm 1
%endmacro

%endif  ; WIN64

%endif  ; __x86_64__

; --------------------------------------------------------------------------
;  Defines picked up from the C headers
;
%include "jsimdcfg.inc"

; --------------------------------------------------------------------------