;
; jcphuff-sse2.asm - prepare data for progressive Huffman encoding (SSE2)
;
; Copyright (C) 2016, 2018, Matthieu Darbois
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it can *not*
; be assembled with Microsoft's MASM or any compatible assembler (including
; Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains an SSE2 implementation of data preparation for progressive
; Huffman encoding.  See jcphuff.c for more details.

%include "jsimdext.inc"

; --------------------------------------------------------------------------
; All code in this file is 32-bit and is placed in the text segment.
; (SEG_TEXT is not defined in this file; presumably it comes from
; jsimdext.inc.)
    SECTION     SEG_TEXT
    BITS        32

; --------------------------------------------------------------------------
; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
; jsimd_encode_mcu_AC_refine_prepare_sse2()
;
; The macros take no arguments.  They operate on the symbolic register names
; (X0, X1, N0, N1, T0, T1, LUT, BLOCK, LENEND) that must be %define'd at the
; point where the macro is expanded.

; LOAD16: gather 16 words from BLOCK through the index table at LUT.
;
;   for lane = 0..7:
;       X0[lane] = BLOCK[LUT[lane]]
;       X1[lane] = BLOCK[LUT[lane + 8]]
;
; BLOCK elements are 16-bit words; LUT entries are SIZEOF_INT bytes wide.
; N0 and N1 are cleared.  T0 and T1 are clobbered.
%macro LOAD16 0
    pxor        N0, N0
    pxor        N1, N1

%assign %%lane 0
%rep 8
    mov         T0, INT [LUT + (%%lane + 0)*SIZEOF_INT]
    mov         T1, INT [LUT + (%%lane + 8)*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], %%lane
    pinsrw      X1, word [BLOCK + T1 * 2], %%lane
%assign %%lane %%lane + 1
%endrep
%endmacro

; LOAD15: gather a full X0 and a partial X1 from BLOCK through LUT.
;
;   X0[lane] = BLOCK[LUT[lane]]          for lane = 0..7
;   X1[0]    = BLOCK[LUT[8]]             (always)
;   X1[lane] = BLOCK[LUT[8 + lane]]      for lane = 1..6 while LENEND > lane
;                                        (signed compare)
;
; X1 is cleared first, so lanes that are not loaded (including lane 7) are
; zero.  N0 and N1 are cleared.  T0, T1 and the flags are clobbered.
%macro LOAD15 0
    pxor        N0, N0
    pxor        N1, N1
    pxor        X1, X1

    mov         T0, INT [LUT +  0*SIZEOF_INT]
    mov         T1, INT [LUT +  8*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0
    pinsrw      X1, word [BLOCK + T1 * 2], 0

    ; Remaining seven lanes of X0 are loaded unconditionally.
%assign %%lane 1
%rep 7
    mov         T0, INT [LUT + (%%lane)*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], %%lane
%assign %%lane %%lane + 1
%endrep

    ; Lanes 1..6 of X1 are loaded only while LENEND says they exist.
%assign %%lane 1
%rep 6
    cmp         LENEND, %%lane + 1
    jl          %%.ELOAD15
    mov         T1, INT [LUT + (%%lane + 8)*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], %%lane
%assign %%lane %%lane + 1
%endrep
%%.ELOAD15:
%endmacro

; LOAD8: gather 8 words from BLOCK through the index table at LUT.
;
;   X0[lane] = BLOCK[LUT[lane]]          for lane = 0..7
;
; N0 is cleared.  T0 is clobbered.
%macro LOAD8 0
    pxor        N0, N0

%assign %%lane 0
%rep 8
    mov         T0, INT [LUT + (%%lane)*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], %%lane
%assign %%lane %%lane + 1
%endrep
%endmacro

; LOAD7: gather a partial X0 from BLOCK through the index table at LUT.
;
;   X0[0]    = BLOCK[LUT[0]]             (always, even when LENEND is 0)
;   X0[lane] = BLOCK[LUT[lane]]          for lane = 1..6 while LENEND > lane
;                                        (signed compare)
;
; X0 is cleared first, so lanes that are not loaded (including lane 7) are
; zero.  N0 is cleared.  T1 and the flags are clobbered.
%macro LOAD7 0
    pxor        N0, N0
    pxor        X0, X0

    mov         T1, INT [LUT +  0*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 0

%assign %%lane 1
%rep 6
    cmp         LENEND, %%lane + 1
    jl          %%.ELOAD7
    mov         T1, INT [LUT + (%%lane)*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], %%lane
%assign %%lane %%lane + 1
%endrep
%%.ELOAD7:
%endmacro

; REDUCE0: build the 64-bit "non-zero" bitmap of the 64 words at VALUES and
; store it through the pointer held in ZEROBITS.
;
;   bit i of INT [zerobits + 0]           is set iff VALUES[i]      != 0
;   bit i of INT [zerobits + SIZEOF_INT]  is set iff VALUES[i + 32] != 0
;   (i = 0..31)
;
; Requirements on entry:
;   - VALUES is 16-byte aligned (movdqa loads).
;   - ZERO holds all zeros.  The last compare uses xmm7 directly, so ZERO is
;     expected to be xmm7, as both functions in this file define it.
;
; Clobbers xmm0-xmm7, eax, ecx, edx, esi, edi (edi ends up holding the
; zerobits pointer) and the flags.
%macro REDUCE0 0
    movdqa      xmm0, XMMWORD [VALUES + ( 0*2)]
    movdqa      xmm1, XMMWORD [VALUES + ( 8*2)]
    movdqa      xmm2, XMMWORD [VALUES + (16*2)]
    movdqa      xmm3, XMMWORD [VALUES + (24*2)]
    movdqa      xmm4, XMMWORD [VALUES + (32*2)]
    movdqa      xmm5, XMMWORD [VALUES + (40*2)]
    movdqa      xmm6, XMMWORD [VALUES + (48*2)]

    ; Each word becomes 0xFFFF where the value is zero, 0x0000 otherwise.
    pcmpeqw     xmm0, ZERO
    pcmpeqw     xmm1, ZERO
    pcmpeqw     xmm2, ZERO
    pcmpeqw     xmm3, ZERO
    pcmpeqw     xmm4, ZERO
    pcmpeqw     xmm5, ZERO
    pcmpeqw     xmm6, ZERO
    pcmpeqw     xmm7, XMMWORD [VALUES + (56*2)]  ; xmm7 (zero) is consumed here

    ; Narrow the word masks to byte masks, 16 values per register.
    packsswb    xmm0, xmm1
    packsswb    xmm2, xmm3
    packsswb    xmm4, xmm5
    packsswb    xmm6, xmm7

    ; One bit per value: set where the value is zero.
    pmovmskb    eax, xmm0
    pmovmskb    ecx, xmm2
    pmovmskb    edx, xmm4
    pmovmskb    esi, xmm6

    shl         ecx, 16
    shl         esi, 16

    or          eax, ecx                ; eax = "is zero" bits for values  0..31
    or          edx, esi                ; edx = "is zero" bits for values 32..63

    not         eax                     ; flip to "is non-zero"
    not         edx

    mov         edi, ZEROBITS

    mov         INT [edi], eax
    mov         INT [edi+SIZEOF_INT], edx
%endmacro

;
; Prepare data for jsimd_encode_mcu_AC_first().
;
; GLOBAL(void)
; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
;                                        const int *jpeg_natural_order_start,
;                                        int Sl, int Al, JCOEF *values,
;                                        size_t *zerobits)
;
; For each of the Sl coefficients c = block[jpeg_natural_order_start[k]]:
;   values[k]            = abs(c) >> Al        (logical shift of 16-bit words)
;   values[k + DCTSIZE2] = values[k] ^ sign    (sign = 0xFFFF if c < 0, else 0)
; values[k] is then zero-filled up to DCTSIZE2 words (in groups of 8), and the
; non-zero bitmap of values[0..DCTSIZE2-1] is stored at zerobits (see REDUCE0).
; values must be 16-byte aligned (movdqa stores).
;
; eax + 8 = const JCOEF *block
; eax + 12 = const int *jpeg_natural_order_start
; eax + 16 = int Sl
; eax + 20 = int Al
; eax + 24 = JCOEF *values
; eax + 28 = size_t *zerobits

%define ZERO    xmm7
%define X0      xmm0
%define X1      xmm1
%define N0      xmm2
%define N1      xmm3
%define AL      xmm4
%define K       eax
%define LENEND  eax
%define LUT     ebx
%define T0      ecx
%define T1      edx
%define BLOCK   esi
%define VALUES  edi
%define LEN     ebp

; Stack slot just below the aligned frame base (above the five pushed
; registers) that keeps the zerobits pointer.
%define ZEROBITS  INT [esp + 5 * 4]

    align       32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
    push        ebp
    mov         eax, esp                     ; eax -> saved ebp; args at eax + 8
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [esp], eax                   ; remember the unaligned esp
    mov         ebp, esp                     ; ebp = aligned ebp
    sub         esp, 4                       ; room for ZEROBITS
    push        ebx
    push        ecx
;   push        edx                     ; need not be preserved
    push        esi
    push        edi
    push        ebp                          ; ebp is reused as LEN below

    mov         BLOCK, INT [eax + 8]
    mov         LUT, INT [eax + 12]
    mov         VALUES, INT [eax + 24]
    movd        AL, INT [eax + 20]
    mov         T0, INT [eax + 28]
    mov         ZEROBITS, T0
    mov         LEN, INT [eax + 16]
    pxor        ZERO, ZERO
    mov         K, LEN
    and         K, -16
    shr         K, 4                         ; K = number of full 16-word groups
    jz          .ELOOP16
.BLOOP16:
    LOAD16
    pcmpgtw     N0, X0                       ; N = 0xFFFF where X < 0
    pcmpgtw     N1, X1
    paddw       X0, N0                       ; X = (X + N) ^ N = abs(X)
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL                       ; X = abs(X) >> Al
    psrlw       X1, AL
    pxor        N0, X0                       ; N = X ^ sign mask
    pxor        N1, X1
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add         VALUES, 16*2
    add         LUT, 16*SIZEOF_INT
    dec         K
    jnz         .BLOOP16
    test        LEN, 15                      ; nothing left after the full groups?
    je          .PADDING
.ELOOP16:
    mov         LENEND, LEN
    and         LENEND, 7                    ; LENEND = Sl % 8

    test        LEN, 8
    jz          .TRY7                        ; fewer than 8 words remain
    test        LEN, 7
    jz          .TRY8                        ; exactly 8 words remain

    ; 9..15 words remain.
    LOAD15
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    pxor        N0, X0
    pxor        N1, X1
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add         VALUES, 16*2
    jmp         .PADDING
.TRY8:
    LOAD8
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    pxor        N0, X0
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add         VALUES, 8*2
    jmp         .PADDING
.TRY7:
    LOAD7
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    pxor        N0, X0
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add         VALUES, 8*2
.PADDING:
    ; K = (Sl rounded up to a multiple of 8) / 8 - DCTSIZE2 / 8, i.e. minus
    ; the number of 8-word groups still to be zero-filled.
    mov         K, LEN
    add         K, 7
    and         K, -8
    shr         K, 3
    sub         K, DCTSIZE2/8
    jz          .EPADDING
    align       16
.ZEROLOOP:
    movdqa      XMMWORD [VALUES + 0], ZERO
    add         VALUES, 8*2
    inc         K
    jnz         .ZEROLOOP
.EPADDING:
    sub         VALUES, DCTSIZE2*2           ; rewind to the start of values

    REDUCE0

    pop         ebp
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
    pop         ecx
    pop         ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- original ebp
    pop         ebp
    ret

%undef ZERO
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef LENEND
%undef LUT
%undef T0
%undef T1
%undef BLOCK
%undef VALUES
%undef LEN
%undef ZEROBITS

;
; Prepare data for jsimd_encode_mcu_AC_refine().
;
; GLOBAL(int)
; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
;                                         const int *jpeg_natural_order_start,
;                                         int Sl, int Al, JCOEF *absvalues,
;                                         size_t *bits)
;
; For each of the Sl coefficients c = block[jpeg_natural_order_start[k]]:
;   absvalues[k] = abs(c) >> Al                (logical shift of 16-bit words)
; absvalues is then zero-filled up to DCTSIZE2 words (in groups of 8).
;
; Output through bits (viewed as an array of INT):
;   bits[0], bits[1]  non-zero bitmap of absvalues[0..DCTSIZE2-1] (see REDUCE0)
;   bits[2], bits[3]  initialised to -1; for every processed group of up to 16
;                     coefficients the corresponding 16-bit word is replaced by
;                     the inverted "coefficient is negative" mask
;
; Return value (eax): index k of the last coefficient whose absvalues[k] is
; exactly 1, or 0 if there is none.
; absvalues must be 16-byte aligned (movdqa stores).
;
; eax + 8 = const JCOEF *block
; eax + 12 = const int *jpeg_natural_order_start
; eax + 16 = int Sl
; eax + 20 = int Al
; eax + 24 = JCOEF *values
; eax + 28 = size_t *bits

%define ZERO    xmm7
%define ONE     xmm5
%define X0      xmm0
%define X1      xmm1
%define N0      xmm2
%define N1      xmm3
%define AL      xmm4
%define K       eax
%define LENEND  eax
%define LUT     ebx
%define T0      ecx
%define T0w      cx
%define T1      edx
%define BLOCK   esi
%define VALUES  edi
%define KK      ebp

; Locals in the 16 bytes reserved below the aligned frame base (above the five
; pushed registers).
%define ZEROBITS  INT [esp + 5 * 4]
%define EOB       INT [esp + 5 * 4 + 4]
%define LEN       INT [esp + 5 * 4 + 8]

    align       32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
    push        ebp
    mov         eax, esp                     ; eax -> saved ebp; args at eax + 8
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [esp], eax                   ; remember the unaligned esp
    mov         ebp, esp                     ; ebp = aligned ebp
    sub         esp, 16                      ; room for ZEROBITS, EOB, LEN
    push        ebx
    push        ecx
;   push        edx                     ; need not be preserved
    push        esi
    push        edi
    push        ebp                          ; ebp is reused as KK below

    pcmpeqw     ONE, ONE
    psrlw       ONE, 15                      ; every word of ONE = 1
    mov         BLOCK, INT [eax + 8]
    mov         LUT, INT [eax + 12]
    mov         VALUES, INT [eax + 24]
    movd        AL, INT [eax + 20]
    mov         T0, INT [eax + 28]
    mov         K,  INT [eax + 16]
    mov         INT [T0 + 2 * SIZEOF_INT], -1
    mov         INT [T0 + 3 * SIZEOF_INT], -1
    mov         ZEROBITS, T0
    mov         LEN, K
    pxor        ZERO, ZERO
    and         K, -16
    mov         EOB, 0
    xor         KK, KK                       ; KK = byte offset into the sign words
    shr         K, 4                         ; K = number of full 16-word groups
    jz          .ELOOPR16
.BLOOPR16:
    LOAD16
    pcmpgtw     N0, X0                       ; N = 0xFFFF where X < 0
    pcmpgtw     N1, X1
    paddw       X0, N0                       ; X = (X + N) ^ N = abs(X)
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL                       ; X = abs(X) >> Al
    psrlw       X1, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw     X0, ONE                      ; X = 0xFFFF where abs value == 1
    pcmpeqw     X1, ONE
    packsswb    N0, N1
    packsswb    X0, X1
    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    mov         T1, ZEROBITS
    not         T0
    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER16            ; if (idx) {
    lea         T1, [T1+KK*8]           ; KK*8 = index of the group's first word
    mov         EOB, T1                 ; EOB = k + idx;
.CONTINUER16:
    add         VALUES, 16*2
    add         LUT, 16*SIZEOF_INT
    add         KK, 2
    dec         K
    jnz         .BLOOPR16
    ; Sl is a multiple of 16: every coefficient has been handled.  Skip the
    ; tail code, which would otherwise (via LOAD7 with LENEND == 0) gather one
    ; coefficient past the scan and advance VALUES by 8 extra words, so that
    ; the zero padding runs past absvalues[DCTSIZE2] and REDUCE0 scans from
    ; the wrong address.
    test        LEN, 15
    je          .PADDINGR
.ELOOPR16:
    mov         LENEND, LEN

    test        LENEND, 8
    jz          .TRYR7                       ; fewer than 8 words remain
    test        LENEND, 7
    jz          .TRYR8                       ; exactly 8 words remain

    ; 9..15 words remain.
    and         LENEND, 7
    LOAD15
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw     X0, ONE
    pcmpeqw     X1, ONE
    packsswb    N0, N1
    packsswb    X0, X1
    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    mov         T1, ZEROBITS
    not         T0
    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER15            ; if (idx) {
    lea         T1, [T1+KK*8]
    mov         EOB, T1                 ; EOB = k + idx;
.CONTINUER15:
    add         VALUES, 16*2
    jmp         .PADDINGR
.TRYR8:
    LOAD8

    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw     X0, ONE
    packsswb    N0, ZERO
    packsswb    X0, ZERO
    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    mov         T1, ZEROBITS
    not         T0
    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER8             ; if (idx) {
    lea         T1, [T1+KK*8]
    mov         EOB, T1                 ; EOB = k + idx;
.CONTINUER8:
    add         VALUES, 8*2
    jmp         .PADDINGR
.TRYR7:
    and         LENEND, 7
    LOAD7

    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw     X0, ONE
    packsswb    N0, ZERO
    packsswb    X0, ZERO
    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    mov         T1, ZEROBITS
    not         T0
    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER7             ; if (idx) {
    lea         T1, [T1+KK*8]
    mov         EOB, T1                 ; EOB = k + idx;
.CONTINUER7:
    add         VALUES, 8*2
.PADDINGR:
    ; K = (Sl rounded up to a multiple of 8) / 8 - DCTSIZE2 / 8, i.e. minus
    ; the number of 8-word groups still to be zero-filled.
    mov         K, LEN
    add         K, 7
    and         K, -8
    shr         K, 3
    sub         K, DCTSIZE2/8
    jz          .EPADDINGR
    align       16
.ZEROLOOPR:
    movdqa      XMMWORD [VALUES + 0], ZERO
    add         VALUES, 8*2
    inc         K
    jnz         .ZEROLOOPR
.EPADDINGR:
    sub         VALUES, DCTSIZE2*2           ; rewind to the start of absvalues

    REDUCE0

    mov         eax, EOB                     ; return value

    pop         ebp
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
    pop         ecx
    pop         ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- original ebp
    pop         ebp
    ret

%undef ZERO
%undef ONE
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef KK
%undef EOB
%undef SIGN
%undef LUT
%undef T0
%undef T0w
%undef T1
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND
%undef ZEROBITS

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32