;
; jcphuff-sse2.asm - prepare data for progressive Huffman encoding
; (64-bit SSE2)
;
; Copyright (C) 2016, 2018, Matthieu Darbois
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains an SSE2 implementation of data preparation for progressive
; Huffman encoding.  See jcphuff.c for more details.
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
%include "jsimdext.inc"
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
; --------------------------------------------------------------------------
shun-iwasawa 82a8f5
    SECTION     SEG_TEXT
shun-iwasawa 82a8f5
    BITS        64
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
; --------------------------------------------------------------------------
shun-iwasawa 82a8f5
; Macros to load coefficient data (and to reduce the prepared values to a
; bit mask) for jsimd_encode_mcu_AC_first_prepare_sse2() and
; jsimd_encode_mcu_AC_refine_prepare_sse2()
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
; LOAD16
;
; Gather 16 words from BLOCK into X0 (lanes 0-7) and X1 (lanes 0-7), using
; LUT entries 0-7 and 8-15 respectively as word indices into BLOCK.  Every
; lane of X0 and X1 is overwritten, so neither register is cleared first.
; N0 and N1 are zeroed for the sign computation the callers perform next.
;
; Reads:    BLOCK, LUT (16 entries, each read with the INT size specifier)
; Writes:   X0, X1, N0, N1
; Clobbers: T0, T1
;
%macro LOAD16 0
    pxor        N0, N0
    pxor        N1, N1

    ; Each 32-bit load of a LUT entry zero-extends into the full T0/T1
    ; register, which is then used (scaled by 2) to address a word in BLOCK.
    mov         T0d, INT [LUT +  0*SIZEOF_INT]
    mov         T1d, INT [LUT +  8*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0
    pinsrw      X1, word [BLOCK + T1 * 2], 0

    mov         T0d, INT [LUT +  1*SIZEOF_INT]
    mov         T1d, INT [LUT +  9*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 1
    pinsrw      X1, word [BLOCK + T1 * 2], 1

    mov         T0d, INT [LUT +  2*SIZEOF_INT]
    mov         T1d, INT [LUT + 10*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 2
    pinsrw      X1, word [BLOCK + T1 * 2], 2

    mov         T0d, INT [LUT +  3*SIZEOF_INT]
    mov         T1d, INT [LUT + 11*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 3
    pinsrw      X1, word [BLOCK + T1 * 2], 3

    mov         T0d, INT [LUT +  4*SIZEOF_INT]
    mov         T1d, INT [LUT + 12*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 4
    pinsrw      X1, word [BLOCK + T1 * 2], 4

    mov         T0d, INT [LUT +  5*SIZEOF_INT]
    mov         T1d, INT [LUT + 13*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 5
    pinsrw      X1, word [BLOCK + T1 * 2], 5

    mov         T0d, INT [LUT +  6*SIZEOF_INT]
    mov         T1d, INT [LUT + 14*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 6
    pinsrw      X1, word [BLOCK + T1 * 2], 6

    mov         T0d, INT [LUT +  7*SIZEOF_INT]
    mov         T1d, INT [LUT + 15*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 7
    pinsrw      X1, word [BLOCK + T1 * 2], 7
%endmacro
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
; LOAD15
;
; Gather 8 + LENEND words from BLOCK: X0 receives LUT entries 0-7 in full,
; X1 receives LUT entry 8 unconditionally and entries 9-14 only while
; LENEND is large enough (entry 8+i is loaded when LENEND >= i+1).  X1 is
; cleared first so the lanes that are not loaded read as zero.  N0 and N1
; are zeroed for the sign computation the callers perform next.
;
; Both callers in this file reach this macro only when (LEN & 8) != 0 and
; (LEN & 7) != 0, with LENEND = LEN & 7, i.e. LENEND is in 1..7.
;
; Reads:    BLOCK, LUT, LENEND
; Writes:   X0, X1, N0, N1
; Clobbers: T0, T1, flags (cmp)
;
%macro LOAD15 0
    pxor        N0, N0
    pxor        N1, N1
    pxor        X1, X1                  ; unloaded lanes of X1 must be zero

    mov         T0d, INT [LUT +  0*SIZEOF_INT]
    mov         T1d, INT [LUT +  8*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0
    pinsrw      X1, word [BLOCK + T1 * 2], 0

    mov         T0d, INT [LUT +  1*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 1

    mov         T0d, INT [LUT +  2*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 2

    mov         T0d, INT [LUT +  3*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 3

    mov         T0d, INT [LUT +  4*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 4

    mov         T0d, INT [LUT +  5*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 5

    mov         T0d, INT [LUT +  6*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 6

    mov         T0d, INT [LUT +  7*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 7

    ; Remaining lanes of X1: stop as soon as LENEND is exhausted.
    cmp         LENEND, 2
    jl          %%.ELOAD15
    mov         T1d, INT [LUT +  9*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 1

    cmp         LENEND, 3
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + 10*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 2

    cmp         LENEND, 4
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + 11*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 3

    cmp         LENEND, 5
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + 12*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 4

    cmp         LENEND, 6
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + 13*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 5

    cmp         LENEND, 7
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + 14*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 6
%%.ELOAD15:
%endmacro
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
; LOAD8
;
; Gather exactly 8 words from BLOCK into X0, using LUT entries 0-7 as word
; indices into BLOCK.  Every lane of X0 is overwritten, so X0 is not cleared
; first.  N0 is zeroed for the sign computation the callers perform next.
;
; Reads:    BLOCK, LUT
; Writes:   X0, N0
; Clobbers: T0
;
%macro LOAD8 0
    pxor        N0, N0

    mov         T0d, INT [LUT +  0*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0

    mov         T0d, INT [LUT +  1*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 1

    mov         T0d, INT [LUT +  2*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 2

    mov         T0d, INT [LUT +  3*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 3

    mov         T0d, INT [LUT +  4*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 4

    mov         T0d, INT [LUT +  5*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 5

    mov         T0d, INT [LUT +  6*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 6

    mov         T0d, INT [LUT +  7*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 7
%endmacro
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
; LOAD7
;
; Gather up to 7 words from BLOCK into X0.  LUT entry 0 is loaded
; unconditionally; entry i (1..6) is loaded only when LENEND >= i+1.  X0 is
; cleared first so the lanes that are not loaded read as zero.  N0 is zeroed
; for the sign computation the callers perform next.
;
; NOTE: because lane 0 is always loaded, one word is fetched even when
; LENEND is 0; callers must not enter this macro with nothing left to load
; unless reading LUT entry 0 is still acceptable.
;
; Reads:    BLOCK, LUT, LENEND
; Writes:   X0, N0
; Clobbers: T1, flags (cmp)
;
%macro LOAD7 0
    pxor        N0, N0
    pxor        X0, X0                  ; unloaded lanes of X0 must be zero

    mov         T1d, INT [LUT +  0*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 0

    ; Remaining lanes: stop as soon as LENEND is exhausted.
    cmp         LENEND, 2
    jl          %%.ELOAD7
    mov         T1d, INT [LUT +  1*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 1

    cmp         LENEND, 3
    jl          %%.ELOAD7
    mov         T1d, INT [LUT +  2*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 2

    cmp         LENEND, 4
    jl          %%.ELOAD7
    mov         T1d, INT [LUT +  3*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 3

    cmp         LENEND, 5
    jl          %%.ELOAD7
    mov         T1d, INT [LUT +  4*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 4

    cmp         LENEND, 6
    jl          %%.ELOAD7
    mov         T1d, INT [LUT +  5*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 5

    cmp         LENEND, 7
    jl          %%.ELOAD7
    mov         T1d, INT [LUT +  6*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 6
%%.ELOAD7:
%endmacro
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
; REDUCE0
;
; Reduce the 64 words starting at VALUES to a 64-bit mask in which bit i is
; set when word i is non-zero, and store that mask at [r15].
;
; VALUES must be 16-byte aligned (the loads use movdqa) and ZERO must hold
; all-zero words.  The destination pointer is hard-wired to r15 rather than
; taken through a %define.
;
; Reads:    VALUES (64 words), ZERO, r15
; Writes:   qword at [r15]
; Clobbers: xmm0-xmm7, rax, rcx, rdx, rsi, flags
;
%macro REDUCE0 0
    movdqa      xmm0, XMMWORD [VALUES + ( 0*2)]
    movdqa      xmm1, XMMWORD [VALUES + ( 8*2)]
    movdqa      xmm2, XMMWORD [VALUES + (16*2)]
    movdqa      xmm3, XMMWORD [VALUES + (24*2)]
    movdqa      xmm4, XMMWORD [VALUES + (32*2)]
    movdqa      xmm5, XMMWORD [VALUES + (40*2)]
    movdqa      xmm6, XMMWORD [VALUES + (48*2)]
    movdqa      xmm7, XMMWORD [VALUES + (56*2)]

    ; Each word becomes 0xFFFF where the value is zero, 0x0000 otherwise.
    pcmpeqw     xmm0, ZERO
    pcmpeqw     xmm1, ZERO
    pcmpeqw     xmm2, ZERO
    pcmpeqw     xmm3, ZERO
    pcmpeqw     xmm4, ZERO
    pcmpeqw     xmm5, ZERO
    pcmpeqw     xmm6, ZERO
    pcmpeqw     xmm7, ZERO

    ; Narrow the word masks to byte masks (0xFFFF -> 0xFF, 0 -> 0), two
    ; registers (16 words) per result register.
    packsswb    xmm0, xmm1
    packsswb    xmm2, xmm3
    packsswb    xmm4, xmm5
    packsswb    xmm6, xmm7

    ; One bit per byte; the 32-bit destinations zero-extend, so the 64-bit
    ; shifts below see clean upper halves.
    pmovmskb    eax, xmm0
    pmovmskb    ecx, xmm2
    pmovmskb    edx, xmm4
    pmovmskb    esi, xmm6

    shl         rcx, 16
    shl         rdx, 32
    shl         rsi, 48

    or          rax, rcx
    or          rdx, rsi
    or          rax, rdx

    not         rax                     ; "is zero" mask -> "is non-zero" mask

    mov         MMWORD [r15], rax
%endmacro
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
;
shun-iwasawa 82a8f5
; Prepare data for jsimd_encode_mcu_AC_first().
;
; GLOBAL(void)
; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
;                                        const int *jpeg_natural_order_start,
;                                        int Sl, int Al, JCOEF *values,
;                                        size_t *zerobits)
;
; r10 = const JCOEF *block
; r11 = const int *jpeg_natural_order_start
; r12 = int Sl
; r13 = int Al
; r14 = JCOEF *values
; r15 = size_t *zerobits
;
; For each of the Sl coefficients selected through jpeg_natural_order_start
; this routine stores
;     values[k]            = abs(coef) >> Al
;     values[k + DCTSIZE2] = (abs(coef) >> Al) ^ (coef < 0 ? 0xFFFF : 0)
; zero-fills values[] from Sl (rounded up to a multiple of 8) to DCTSIZE2,
; and writes to *zerobits a mask with bit k set when values[k] != 0.
;
; values must be 16-byte aligned (movdqa stores).  collect_args and
; uncollect_args come from jsimdext.inc, which is not visible here;
; presumably they move the ABI argument registers into r10-r15 and
; preserve the callee-saved registers this routine touches -- confirm there.

%define ZERO    xmm9
%define X0      xmm0
%define X1      xmm1
%define N0      xmm2
%define N1      xmm3
%define AL      xmm4
%define K       eax
%define LUT     r11
%define T0      rcx
%define T0d     ecx
%define T1      rdx
%define T1d     edx
%define BLOCK   r10
%define VALUES  r14
%define LEN     r12d
%define LENEND  r13d

    align       32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
    push        rbp
    mov         rax, rsp                     ; rax = rsp after saving rbp
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax                   ; remember the unaligned rsp
    mov         rbp, rsp                     ; rbp = aligned frame base
    lea         rsp, [rbp - 16]              ; 16-byte slot for saving ZERO
    collect_args 6

    movdqa      XMMWORD [rbp - 16], ZERO     ; preserve xmm9 for the caller

    movd        AL, r13d                ; capture Al before r13d becomes LENEND
    pxor        ZERO, ZERO
    mov         K, LEN
    mov         LENEND, LEN
    and         K, -16
    and         LENEND, 7               ; LENEND = Sl % 8
    shr         K, 4                    ; K = number of full 16-coefficient groups
    jz          .ELOOP16
.BLOOP16:
    LOAD16
    pcmpgtw     N0, X0                  ; N = 0xFFFF where coef < 0
    pcmpgtw     N1, X1
    paddw       X0, N0                  ; (x + N) ^ N = abs(x)
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL                  ; abs(x) >> Al
    psrlw       X1, AL
    pxor        N0, X0                  ; complement the value where coef < 0
    pxor        N1, X1
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add         VALUES, 16*2
    add         LUT, 16*SIZEOF_INT
    dec         K
    jnz         .BLOOP16
    test        LEN, 15                 ; Sl a multiple of 16: no tail to load
    je          .PADDING
.ELOOP16:
    test        LEN, 8
    jz          .TRY7                   ; fewer than 8 coefficients remain
    test        LEN, 7
    jz          .TRY8                   ; exactly 8 coefficients remain

    ; 9..15 coefficients remain.
    LOAD15
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    pxor        N0, X0
    pxor        N1, X1
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add         VALUES, 16*2
    jmp         .PADDING
.TRY8:
    LOAD8
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    pxor        N0, X0
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add         VALUES, 8*2
    jmp         .PADDING
.TRY7:
    LOAD7
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    pxor        N0, X0
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add         VALUES, 8*2
.PADDING:
    ; K = ceil(Sl / 8) - DCTSIZE2/8, i.e. minus the number of 8-word groups
    ; that still have to be zero-filled; it counts up to zero below.
    mov         K, LEN
    add         K, 7
    and         K, -8
    shr         K, 3
    sub         K, DCTSIZE2/8
    jz          .EPADDING
    align       16
.ZEROLOOP:
    movdqa      XMMWORD [VALUES + 0], ZERO
    add         VALUES, 8*2
    inc         K
    jnz         .ZEROLOOP
.EPADDING:
    sub         VALUES, DCTSIZE2*2      ; rewind VALUES to the start of values[]

    REDUCE0

    movdqa      ZERO, XMMWORD [rbp - 16]
    uncollect_args 6
    mov         rsp, rbp                ; rsp <- aligned frame base
    pop         rsp                     ; rsp <- value saved in the prologue
    pop         rbp
    ret

%undef ZERO
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef LUT
%undef T0
%undef T0d
%undef T1
%undef T1d
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
;
shun-iwasawa 82a8f5
; Prepare data for jsimd_encode_mcu_AC_refine().
;
; GLOBAL(int)
; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
;                                         const int *jpeg_natural_order_start,
;                                         int Sl, int Al, JCOEF *absvalues,
;                                         size_t *bits)
;
; r10 = const JCOEF *block
; r11 = const int *jpeg_natural_order_start
; r12 = int Sl
; r13 = int Al
; r14 = JCOEF *values
; r15 = size_t *bits
;
; For each of the Sl coefficients selected through jpeg_natural_order_start
; this routine stores absvalues[k] = abs(coef) >> Al and zero-fills
; absvalues[] from Sl (rounded up to a multiple of 8) to DCTSIZE2.  It writes
;     bits[0] = mask with bit k set when absvalues[k] != 0   (REDUCE0)
;     bits[1] = complement of the mask with bit k set when coef k < 0
; and returns in eax the index of the last coefficient whose shifted
; absolute value equals 1 (0 when there is none).
;
; absvalues must be 16-byte aligned (movdqa stores).  collect_args and
; uncollect_args come from jsimdext.inc, which is not visible here;
; presumably they move the ABI argument registers into r10-r15 and
; preserve the callee-saved registers this routine touches -- confirm there.

%define ZERO    xmm9
%define ONE     xmm5
%define X0      xmm0
%define X1      xmm1
%define N0      xmm2
%define N1      xmm3
%define AL      xmm4
%define K       eax
%define KK      r9d
%define EOB     r8d
%define SIGN    rdi
%define LUT     r11
%define T0      rcx
%define T0d     ecx
%define T1      rdx
%define T1d     edx
%define BLOCK   r10
%define VALUES  r14
%define LEN     r12d
%define LENEND  r13d

    align       32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
    push        rbp
    mov         rax, rsp                     ; rax = rsp after saving rbp
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax                   ; remember the unaligned rsp
    mov         rbp, rsp                     ; rbp = aligned frame base
    lea         rsp, [rbp - 16]              ; 16-byte slot for saving ZERO
    collect_args 6

    movdqa      XMMWORD [rbp - 16], ZERO     ; preserve xmm9 for the caller

    xor         SIGN, SIGN
    xor         EOB, EOB
    xor         KK, KK
    movd        AL, r13d                ; capture Al before r13d becomes LENEND
    pxor        ZERO, ZERO
    pcmpeqw     ONE, ONE
    psrlw       ONE, 15                 ; every word of ONE = 1
    mov         K, LEN
    mov         LENEND, LEN
    and         K, -16
    and         LENEND, 7               ; LENEND = Sl % 8
    shr         K, 4                    ; K = number of full 16-coefficient groups
    jz          .ELOOPR16
.BLOOPR16:
    LOAD16
    pcmpgtw     N0, X0                  ; N = 0xFFFF where coef < 0
    pcmpgtw     N1, X1
    paddw       X0, N0                  ; (x + N) ^ N = abs(x)
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL                  ; abs(x) >> Al
    psrlw       X1, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw     X0, ONE                 ; 0xFFFF where the shifted value == 1
    pcmpeqw     X1, ONE
    packsswb    N0, N1
    packsswb    X0, X1
    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
    shr         SIGN, 16                ; make room for sizebits
    shl         T0, 48
    or          SIGN, T0
    bsr         T1d, T1d                ;  idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER16            ; if (idx) {
    mov         EOB, KK
    add         EOB, T1d                ; EOB = k + idx;
.CONTINUER16:
    add         VALUES, 16*2
    add         LUT, 16*SIZEOF_INT
    add         KK, 16
    dec         K
    jnz         .BLOOPR16
    ; When Sl is a multiple of 16 nothing is left to load.  Falling through
    ; would enter .TRYR7 with LENEND == 0, which still loads one coefficient
    ; beyond Sl, shifts SIGN by 8 extra bits and advances VALUES by 8 extra
    ; words, so the zero-fill loop would run past the end of the buffer.
    test        LEN, 15
    je          .PADDINGR
.ELOOPR16:
    test        LEN, 8
    jz          .TRYR7                  ; fewer than 8 coefficients remain
    test        LEN, 7
    jz          .TRYR8                  ; exactly 8 coefficients remain

    ; 9..15 coefficients remain.
    LOAD15
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw     X0, ONE
    pcmpeqw     X1, ONE
    packsswb    N0, N1
    packsswb    X0, X1
    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
    shr         SIGN, 16                ; make room for sizebits
    shl         T0, 48
    or          SIGN, T0
    bsr         T1d, T1d                ;  idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER15            ; if (idx) {
    mov         EOB, KK
    add         EOB, T1d                ; EOB = k + idx;
.CONTINUER15:
    add         VALUES, 16*2
    jmp         .PADDINGR
.TRYR8:
    LOAD8

    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw     X0, ONE
    packsswb    N0, ZERO
    packsswb    X0, ZERO
    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
    shr         SIGN, 8                 ; make room for sizebits
    shl         T0, 56
    or          SIGN, T0
    bsr         T1d, T1d                ;  idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER8             ; if (idx) {
    mov         EOB, KK
    add         EOB, T1d                ; EOB = k + idx;
.CONTINUER8:
    add         VALUES, 8*2
    jmp         .PADDINGR
.TRYR7:
    LOAD7

    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw     X0, ONE
    packsswb    N0, ZERO
    packsswb    X0, ZERO
    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
    shr         SIGN, 8                 ; make room for sizebits
    shl         T0, 56
    or          SIGN, T0
    bsr         T1d, T1d                ; idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER7             ; if (idx) {
    mov         EOB, KK
    add         EOB, T1d                ; EOB = k + idx;
.CONTINUER7:
    add         VALUES, 8*2
.PADDINGR:
    ; K = ceil(Sl / 8) - DCTSIZE2/8, i.e. minus the number of 8-word groups
    ; that still have to be zero-filled; it counts up to zero below.
    mov         K, LEN
    add         K, 7
    and         K, -8
    shr         K, 3
    sub         K, DCTSIZE2/8
    jz          .EPADDINGR
    align       16
.ZEROLOOPR:
    movdqa      XMMWORD [VALUES + 0], ZERO
    shr         SIGN, 8                 ; keep sign bit k aligned with coefficient k
    add         VALUES, 8*2
    inc         K
    jnz         .ZEROLOOPR
.EPADDINGR:
    not         SIGN
    sub         VALUES, DCTSIZE2*2      ; rewind VALUES to the start of the buffer
    mov         MMWORD [r15+SIZEOF_MMWORD], SIGN

    REDUCE0

    mov         eax, EOB                ; return value (REDUCE0 clobbered rax)
    movdqa      ZERO, XMMWORD [rbp - 16]
    uncollect_args 6
    mov         rsp, rbp                ; rsp <- aligned frame base
    pop         rsp                     ; rsp <- value saved in the prologue
    pop         rbp
    ret

%undef ZERO
%undef ONE
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef KK
%undef EOB
%undef SIGN
%undef LUT
%undef T0
%undef T0d
%undef T1
%undef T1d
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
; For some reason, the OS X linker does not honor the request to align the
shun-iwasawa 82a8f5
; segment unless we do this.
shun-iwasawa 82a8f5
    align       32