;
; jcphuff-sse2.asm - prepare data for progressive Huffman encoding (SSE2)
;
; Copyright (C) 2016, 2018, Matthieu Darbois
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains an SSE2 implementation of data preparation for progressive
; Huffman encoding. See jcphuff.c for more details.
%include "jsimdext.inc"

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32

; --------------------------------------------------------------------------
; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
; jsimd_encode_mcu_AC_refine_prepare_sse2()
; LOAD16: gather 16 coefficients from BLOCK in natural (zigzag) order.
; Reads indices 0..15 from LUT (jpeg_natural_order_start) and inserts the
; corresponding 16-bit coefficients into X0 (elements 0..7) and X1
; (elements 8..15).  N0/N1 are cleared for the sign computation that the
; callers perform afterwards.  Clobbers T0, T1.
%macro LOAD16 0
    pxor    N0, N0
    pxor    N1, N1

    mov     T0, INT [LUT + 0*SIZEOF_INT]
    mov     T1, INT [LUT + 8*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 0
    pinsrw  X1, word [BLOCK + T1 * 2], 0

    mov     T0, INT [LUT + 1*SIZEOF_INT]
    mov     T1, INT [LUT + 9*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 1
    pinsrw  X1, word [BLOCK + T1 * 2], 1

    mov     T0, INT [LUT + 2*SIZEOF_INT]
    mov     T1, INT [LUT + 10*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 2
    pinsrw  X1, word [BLOCK + T1 * 2], 2

    mov     T0, INT [LUT + 3*SIZEOF_INT]
    mov     T1, INT [LUT + 11*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 3
    pinsrw  X1, word [BLOCK + T1 * 2], 3

    mov     T0, INT [LUT + 4*SIZEOF_INT]
    mov     T1, INT [LUT + 12*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 4
    pinsrw  X1, word [BLOCK + T1 * 2], 4

    mov     T0, INT [LUT + 5*SIZEOF_INT]
    mov     T1, INT [LUT + 13*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 5
    pinsrw  X1, word [BLOCK + T1 * 2], 5

    mov     T0, INT [LUT + 6*SIZEOF_INT]
    mov     T1, INT [LUT + 14*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 6
    pinsrw  X1, word [BLOCK + T1 * 2], 6

    mov     T0, INT [LUT + 7*SIZEOF_INT]
    mov     T1, INT [LUT + 15*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 7
    pinsrw  X1, word [BLOCK + T1 * 2], 7
%endmacro
; LOAD15: gather up to 15 coefficients (full X0, partial X1).
; Elements 0..7 are always loaded into X0; elements 8..8+LENEND-1 are
; loaded into X1, where LENEND (1..7) is the remaining length mod 8.
; X1 is pre-zeroed so unread lanes stay 0.  Clobbers T0, T1.
%macro LOAD15 0
    pxor    N0, N0
    pxor    N1, N1
    pxor    X1, X1

    mov     T0, INT [LUT + 0*SIZEOF_INT]
    mov     T1, INT [LUT + 8*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 0
    pinsrw  X1, word [BLOCK + T1 * 2], 0

    mov     T0, INT [LUT + 1*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 1

    mov     T0, INT [LUT + 2*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 2

    mov     T0, INT [LUT + 3*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 3

    mov     T0, INT [LUT + 4*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 4

    mov     T0, INT [LUT + 5*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 5

    mov     T0, INT [LUT + 6*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 6

    mov     T0, INT [LUT + 7*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 7

    ; Tail of X1: stop as soon as LENEND is exhausted.
    cmp     LENEND, 2
    jl      %%.ELOAD15
    mov     T1, INT [LUT + 9*SIZEOF_INT]
    pinsrw  X1, word [BLOCK + T1 * 2], 1

    cmp     LENEND, 3
    jl      %%.ELOAD15
    mov     T1, INT [LUT + 10*SIZEOF_INT]
    pinsrw  X1, word [BLOCK + T1 * 2], 2

    cmp     LENEND, 4
    jl      %%.ELOAD15
    mov     T1, INT [LUT + 11*SIZEOF_INT]
    pinsrw  X1, word [BLOCK + T1 * 2], 3

    cmp     LENEND, 5
    jl      %%.ELOAD15
    mov     T1, INT [LUT + 12*SIZEOF_INT]
    pinsrw  X1, word [BLOCK + T1 * 2], 4

    cmp     LENEND, 6
    jl      %%.ELOAD15
    mov     T1, INT [LUT + 13*SIZEOF_INT]
    pinsrw  X1, word [BLOCK + T1 * 2], 5

    cmp     LENEND, 7
    jl      %%.ELOAD15
    mov     T1, INT [LUT + 14*SIZEOF_INT]
    pinsrw  X1, word [BLOCK + T1 * 2], 6
%%.ELOAD15:
%endmacro
; LOAD8: gather exactly 8 coefficients (indices 0..7 of LUT) into X0.
; N0 is cleared for the subsequent sign computation.  Clobbers T0.
%macro LOAD8 0
    pxor    N0, N0

    mov     T0, INT [LUT + 0*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 0

    mov     T0, INT [LUT + 1*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 1

    mov     T0, INT [LUT + 2*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 2

    mov     T0, INT [LUT + 3*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 3

    mov     T0, INT [LUT + 4*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 4

    mov     T0, INT [LUT + 5*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 5

    mov     T0, INT [LUT + 6*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 6

    mov     T0, INT [LUT + 7*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 7
%endmacro
; LOAD7: gather up to 7 coefficients (LENEND = 1..7) into X0.
; X0 is pre-zeroed so unread lanes stay 0; element 0 is always loaded.
; N0 is cleared for the subsequent sign computation.  Clobbers T1.
%macro LOAD7 0
    pxor    N0, N0
    pxor    X0, X0

    mov     T1, INT [LUT + 0*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T1 * 2], 0

    cmp     LENEND, 2
    jl      %%.ELOAD7
    mov     T1, INT [LUT + 1*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T1 * 2], 1

    cmp     LENEND, 3
    jl      %%.ELOAD7
    mov     T1, INT [LUT + 2*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T1 * 2], 2

    cmp     LENEND, 4
    jl      %%.ELOAD7
    mov     T1, INT [LUT + 3*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T1 * 2], 3

    cmp     LENEND, 5
    jl      %%.ELOAD7
    mov     T1, INT [LUT + 4*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T1 * 2], 4

    cmp     LENEND, 6
    jl      %%.ELOAD7
    mov     T1, INT [LUT + 5*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T1 * 2], 5

    cmp     LENEND, 7
    jl      %%.ELOAD7
    mov     T1, INT [LUT + 6*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T1 * 2], 6
%%.ELOAD7:
%endmacro
; REDUCE0: build the 64-bit zero bitmap for the 64 words at VALUES.
; Compares all 64 coefficients against zero (ZERO = xmm7 holds all zeroes;
; the last 8 words are compared directly against memory into xmm7), packs
; the per-word masks into bytes, extracts them with pmovmskb, inverts
; (bit set = coefficient is NONZERO), and stores the result as two 32-bit
; ints at *ZEROBITS.  Clobbers xmm0-xmm7, eax, ecx, edx, esi, edi.
%macro REDUCE0 0
    movdqa   xmm0, XMMWORD [VALUES + ( 0*2)]
    movdqa   xmm1, XMMWORD [VALUES + ( 8*2)]
    movdqa   xmm2, XMMWORD [VALUES + (16*2)]
    movdqa   xmm3, XMMWORD [VALUES + (24*2)]
    movdqa   xmm4, XMMWORD [VALUES + (32*2)]
    movdqa   xmm5, XMMWORD [VALUES + (40*2)]
    movdqa   xmm6, XMMWORD [VALUES + (48*2)]

    pcmpeqw  xmm0, ZERO
    pcmpeqw  xmm1, ZERO
    pcmpeqw  xmm2, ZERO
    pcmpeqw  xmm3, ZERO
    pcmpeqw  xmm4, ZERO
    pcmpeqw  xmm5, ZERO
    pcmpeqw  xmm6, ZERO
    pcmpeqw  xmm7, XMMWORD [VALUES + (56*2)]  ; xmm7 (ZERO) vs last 8 words

    packsswb xmm0, xmm1
    packsswb xmm2, xmm3
    packsswb xmm4, xmm5
    packsswb xmm6, xmm7

    pmovmskb eax, xmm0
    pmovmskb ecx, xmm2
    pmovmskb edx, xmm4
    pmovmskb esi, xmm6

    shl      ecx, 16
    shl      esi, 16

    or       eax, ecx
    or       edx, esi

    not      eax                    ; 1 bit = nonzero coefficient
    not      edx

    mov      edi, ZEROBITS

    mov      INT [edi], eax
    mov      INT [edi+SIZEOF_INT], edx
%endmacro
;
; Prepare data for jsimd_encode_mcu_AC_first().
;
; GLOBAL(void)
; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
;                                        const int *jpeg_natural_order_start,
;                                        int Sl, int Al, JCOEF *values,
;                                        size_t *zerobits)
;
; eax + 8 = const JCOEF *block
; eax + 12 = const int *jpeg_natural_order_start
; eax + 16 = int Sl
; eax + 20 = int Al
; eax + 24 = JCOEF *values
; eax + 28 = size_t *zerobits
%define ZERO      xmm7              ; all-zero constant
%define X0        xmm0              ; coefficients 0..7 of a group
%define X1        xmm1              ; coefficients 8..15 of a group
%define N0        xmm2              ; sign masks for X0
%define N1        xmm3              ; sign masks for X1
%define AL        xmm4              ; Al (point-transform shift count)
%define K         eax               ; group counter
%define LENEND    eax               ; Sl mod 8 (aliases K; disjoint lifetimes)
%define LUT       ebx               ; jpeg_natural_order_start
%define T0        ecx               ; scratch (LUT index)
%define T1        edx               ; scratch (LUT index)
%define BLOCK     esi               ; coefficient block
%define VALUES    edi               ; output pointer
%define LEN       ebp               ; Sl

%define ZEROBITS  INT [esp + 5 * 4] ; saved zerobits pointer (slot above the
                                    ; 5 pushed registers)

    align   32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
    push    ebp
    mov     eax, esp                ; eax = original ebp
    sub     esp, byte 4
    and     esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov     [esp], eax
    mov     ebp, esp                ; ebp = aligned ebp
    sub     esp, 4                  ; reserve the ZEROBITS slot
    push    ebx
    push    ecx
;   push    edx                     ; need not be preserved
    push    esi
    push    edi
    push    ebp

    mov     BLOCK, INT [eax + 8]
    mov     LUT, INT [eax + 12]
    mov     VALUES, INT [eax + 24]
    movd    AL, INT [eax + 20]
    mov     T0, INT [eax + 28]
    mov     ZEROBITS, T0
    mov     LEN, INT [eax + 16]
    pxor    ZERO, ZERO

    ; Main loop: process full groups of 16 coefficients.
    mov     K, LEN
    and     K, -16
    shr     K, 4
    jz      .ELOOP16
.BLOOP16:
    LOAD16
    ; |x| >> Al with sign extraction: neg = (x < 0), x = (abs(x)) >> Al,
    ; then neg ^= x so that jcphuff.c can recover the sign bits.
    pcmpgtw N0, X0
    pcmpgtw N1, X1
    paddw   X0, N0
    paddw   X1, N1
    pxor    X0, N0
    pxor    X1, N1
    psrlw   X0, AL
    psrlw   X1, AL
    pxor    N0, X0
    pxor    N1, X1
    movdqa  XMMWORD [VALUES + (0) * 2], X0
    movdqa  XMMWORD [VALUES + (8) * 2], X1
    movdqa  XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa  XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add     VALUES, 16*2
    add     LUT, 16*SIZEOF_INT
    dec     K
    jnz     .BLOOP16
    test    LEN, 15
    je      .PADDING
.ELOOP16:
    ; Tail: 1..15 remaining coefficients.
    mov     LENEND, LEN
    and     LENEND, 7

    test    LEN, 8
    jz      .TRY7                   ; remaining < 8
    test    LEN, 7
    jz      .TRY8                   ; remaining == 8

    ; 9..15 remaining
    LOAD15
    pcmpgtw N0, X0
    pcmpgtw N1, X1
    paddw   X0, N0
    paddw   X1, N1
    pxor    X0, N0
    pxor    X1, N1
    psrlw   X0, AL
    psrlw   X1, AL
    pxor    N0, X0
    pxor    N1, X1
    movdqa  XMMWORD [VALUES + (0) * 2], X0
    movdqa  XMMWORD [VALUES + (8) * 2], X1
    movdqa  XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa  XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add     VALUES, 16*2
    jmp     .PADDING
.TRY8:
    LOAD8
    pcmpgtw N0, X0
    paddw   X0, N0
    pxor    X0, N0
    psrlw   X0, AL
    pxor    N0, X0
    movdqa  XMMWORD [VALUES + (0) * 2], X0
    movdqa  XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add     VALUES, 8*2
    jmp     .PADDING
.TRY7:
    LOAD7
    pcmpgtw N0, X0
    paddw   X0, N0
    pxor    X0, N0
    psrlw   X0, AL
    pxor    N0, X0
    movdqa  XMMWORD [VALUES + (0) * 2], X0
    movdqa  XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add     VALUES, 8*2
.PADDING:
    ; Zero-fill the values buffer up to DCTSIZE2 entries (8 at a time).
    mov     K, LEN
    add     K, 7
    and     K, -8
    shr     K, 3
    sub     K, DCTSIZE2/8
    jz      .EPADDING
    align   16
.ZEROLOOP:
    movdqa  XMMWORD [VALUES + 0], ZERO
    add     VALUES, 8*2
    inc     K
    jnz     .ZEROLOOP
.EPADDING:
    sub     VALUES, DCTSIZE2*2

    REDUCE0

    pop     ebp
    pop     edi
    pop     esi
;   pop     edx                     ; need not be preserved
    pop     ecx
    pop     ebx
    mov     esp, ebp                ; esp <- aligned ebp
    pop     esp                     ; esp <- original ebp
    pop     ebp
    ret

%undef ZERO
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef LUT
%undef T0
%undef T1
%undef BLOCK
%undef VALUES
%undef LEN
;
; Prepare data for jsimd_encode_mcu_AC_refine().
;
; GLOBAL(int)
; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
;                                         const int *jpeg_natural_order_start,
;                                         int Sl, int Al, JCOEF *absvalues,
;                                         size_t *bits)
;
; eax + 8 = const JCOEF *block
; eax + 12 = const int *jpeg_natural_order_start
; eax + 16 = int Sl
; eax + 20 = int Al
; eax + 24 = JCOEF *values
; eax + 28 = size_t *bits
%define ZERO      xmm7              ; all-zero constant
%define ONE       xmm5              ; all-ones-per-word constant (0x0001)
%define X0        xmm0              ; coefficients 0..7 of a group
%define X1        xmm1              ; coefficients 8..15 of a group
%define N0        xmm2              ; sign masks for X0
%define N1        xmm3              ; sign masks for X1
%define AL        xmm4              ; Al (point-transform shift count)
%define K         eax               ; group counter
%define LENEND    eax               ; Sl mod 8 (aliases K; disjoint lifetimes)
%define LUT       ebx               ; jpeg_natural_order_start
%define T0        ecx               ; scratch
%define T0w       cx                ; low word of T0 (sign-bit store)
%define T1        edx               ; scratch
%define BLOCK     esi               ; coefficient block
%define VALUES    edi               ; output pointer
%define KK        ebp               ; k/8: running group index (2 per LOAD16)

%define ZEROBITS  INT [esp + 5 * 4]      ; saved bits pointer
%define EOB       INT [esp + 5 * 4 + 4]  ; end-of-block position
%define LEN       INT [esp + 5 * 4 + 8]  ; Sl

    align   32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
    push    ebp
    mov     eax, esp                ; eax = original ebp
    sub     esp, byte 4
    and     esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov     [esp], eax
    mov     ebp, esp                ; ebp = aligned ebp
    sub     esp, 16                 ; ZEROBITS/EOB/LEN slots (+ padding)
    push    ebx
    push    ecx
;   push    edx                     ; need not be preserved
    push    esi
    push    edi
    push    ebp

    pcmpeqw ONE, ONE
    psrlw   ONE, 15                 ; ONE = 0x0001 in every word
    mov     BLOCK, INT [eax + 8]
    mov     LUT, INT [eax + 12]
    mov     VALUES, INT [eax + 24]
    movd    AL, INT [eax + 20]
    mov     T0, INT [eax + 28]
    mov     K, INT [eax + 16]
    mov     INT [T0 + 2 * SIZEOF_INT], -1  ; initialize sign bits to all-ones
    mov     INT [T0 + 3 * SIZEOF_INT], -1
    mov     ZEROBITS, T0
    mov     LEN, K
    pxor    ZERO, ZERO
    and     K, -16
    mov     EOB, 0
    xor     KK, KK
    shr     K, 4
    jz      .ELOOPR16
.BLOOPR16:
    LOAD16
    ; absvalues = |x| >> Al; track sign bits and the last value == 1 (EOB).
    pcmpgtw N0, X0
    pcmpgtw N1, X1
    paddw   X0, N0
    paddw   X1, N1
    pxor    X0, N0
    pxor    X1, N1
    psrlw   X0, AL
    psrlw   X1, AL
    movdqa  XMMWORD [VALUES + (0) * 2], X0
    movdqa  XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw X0, ONE
    pcmpeqw X1, ONE
    packsswb N0, N1
    packsswb X0, X1
    pmovmskb T0, N0               ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    mov     T1, ZEROBITS
    not     T0
    mov     word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb T1, X0               ; idx = _mm_movemask_epi8(x1);
    bsr     T1, T1                ; idx = 16 - (__builtin_clz(idx)>>1);
    jz      .CONTINUER16          ; if (idx) {
    lea     T1, [T1+KK*8]
    mov     EOB, T1               ; EOB = k + idx;
.CONTINUER16:
    add     VALUES, 16*2
    add     LUT, 16*SIZEOF_INT
    add     KK, 2
    dec     K
    jnz     .BLOOPR16
.ELOOPR16:
    ; Tail: 1..15 remaining coefficients.
    mov     LENEND, LEN

    test    LENEND, 8
    jz      .TRYR7                ; remaining < 8
    test    LENEND, 7
    jz      .TRYR8                ; remaining == 8

    ; 9..15 remaining
    and     LENEND, 7
    LOAD15
    pcmpgtw N0, X0
    pcmpgtw N1, X1
    paddw   X0, N0
    paddw   X1, N1
    pxor    X0, N0
    pxor    X1, N1
    psrlw   X0, AL
    psrlw   X1, AL
    movdqa  XMMWORD [VALUES + (0) * 2], X0
    movdqa  XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw X0, ONE
    pcmpeqw X1, ONE
    packsswb N0, N1
    packsswb X0, X1
    pmovmskb T0, N0               ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    mov     T1, ZEROBITS
    not     T0
    mov     word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb T1, X0               ; idx = _mm_movemask_epi8(x1);
    bsr     T1, T1                ; idx = 16 - (__builtin_clz(idx)>>1);
    jz      .CONTINUER15          ; if (idx) {
    lea     T1, [T1+KK*8]
    mov     EOB, T1               ; EOB = k + idx;
.CONTINUER15:
    add     VALUES, 16*2
    jmp     .PADDINGR
.TRYR8:
    LOAD8

    pcmpgtw N0, X0
    paddw   X0, N0
    pxor    X0, N0
    psrlw   X0, AL
    movdqa  XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw X0, ONE
    packsswb N0, ZERO
    packsswb X0, ZERO
    pmovmskb T0, N0               ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    mov     T1, ZEROBITS
    not     T0
    mov     word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb T1, X0               ; idx = _mm_movemask_epi8(x1);
    bsr     T1, T1                ; idx = 16 - (__builtin_clz(idx)>>1);
    jz      .CONTINUER8           ; if (idx) {
    lea     T1, [T1+KK*8]
    mov     EOB, T1               ; EOB = k + idx;
.CONTINUER8:
    add     VALUES, 8*2
    jmp     .PADDINGR
.TRYR7:
    and     LENEND, 7
    LOAD7

    pcmpgtw N0, X0
    paddw   X0, N0
    pxor    X0, N0
    psrlw   X0, AL
    movdqa  XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw X0, ONE
    packsswb N0, ZERO
    packsswb X0, ZERO
    pmovmskb T0, N0               ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    mov     T1, ZEROBITS
    not     T0
    mov     word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb T1, X0               ; idx = _mm_movemask_epi8(x1);
    bsr     T1, T1                ; idx = 16 - (__builtin_clz(idx)>>1);
    jz      .CONTINUER7           ; if (idx) {
    lea     T1, [T1+KK*8]
    mov     EOB, T1               ; EOB = k + idx;
.CONTINUER7:
    add     VALUES, 8*2
.PADDINGR:
    ; Zero-fill the values buffer up to DCTSIZE2 entries (8 at a time).
    mov     K, LEN
    add     K, 7
    and     K, -8
    shr     K, 3
    sub     K, DCTSIZE2/8
    jz      .EPADDINGR
    align   16
.ZEROLOOPR:
    movdqa  XMMWORD [VALUES + 0], ZERO
    add     VALUES, 8*2
    inc     K
    jnz     .ZEROLOOPR
.EPADDINGR:
    sub     VALUES, DCTSIZE2*2

    REDUCE0

    mov     eax, EOB              ; return EOB position

    pop     ebp
    pop     edi
    pop     esi
;   pop     edx                     ; need not be preserved
    pop     ecx
    pop     ebx
    mov     esp, ebp                ; esp <- aligned ebp
    pop     esp                     ; esp <- original ebp
    pop     ebp
    ret

%undef ZERO
%undef ONE
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef KK
%undef EOB
%undef SIGN
%undef LUT
%undef T0
%undef T1
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align   32