; File: thirdparty/libjpeg-turbo/libjpeg-turbo-2.0.6/simd/i386/jidctred-mmx.asm

;
; jidctred.asm - reduced-size IDCT (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains inverse-DCT routines that produce reduced-size
; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
; The following code is based directly on the IJG's original jidctred.c;
; see jidctred.c for more details.
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
%include "jsimdext.inc"
shun-iwasawa 82a8f5
%include "jdct.inc"
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
; --------------------------------------------------------------------------
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
; Fixed-point parameters.
;
;   CONST_BITS  - number of fractional bits carried by the F_x_xxx multiplier
;                 constants below (FIX(x) = x * 2^CONST_BITS, rounded).
;   PASS1_BITS  - extra fractional bits kept in the intermediate results that
;                 pass 1 hands to pass 2.
;
; DESCALE_Pn_m is the right-shift count applied at the end of pass n of the
; m x m reduced-size IDCT.  The matching rounding constants
; (1 << (DESCALE_Pn_m - 1)) live in the PD_DESCALE_Pn_m table entries.

%define CONST_BITS    13
%define PASS1_BITS    2

%define DESCALE_P1_4  (CONST_BITS - PASS1_BITS + 1)
%define DESCALE_P2_4  (CONST_BITS + PASS1_BITS + 3 + 1)
%define DESCALE_P1_2  (CONST_BITS - PASS1_BITS + 2)
%define DESCALE_P2_2  (CONST_BITS + PASS1_BITS + 3 + 2)

; Multiplier constants.  For the default CONST_BITS == 13 the values are
; spelled out literally; otherwise they are derived at assembly time from the
; same factors pre-scaled by 2^30, using integer arithmetic only.

%if CONST_BITS == 13
F_0_211 equ  1730  ; FIX(0.211164243)
F_0_509 equ  4176  ; FIX(0.509795579)
F_0_601 equ  4926  ; FIX(0.601344887)
F_0_720 equ  5906  ; FIX(0.720959822)
F_0_765 equ  6270  ; FIX(0.765366865)
F_0_850 equ  6967  ; FIX(0.850430095)
F_0_899 equ  7373  ; FIX(0.899976223)
F_1_061 equ  8697  ; FIX(1.061594337)
F_1_272 equ 10426  ; FIX(1.272758580)
F_1_451 equ 11893  ; FIX(1.451774981)
F_1_847 equ 15137  ; FIX(1.847759065)
F_2_172 equ 17799  ; FIX(2.172734803)
F_2_562 equ 20995  ; FIX(2.562915447)
F_3_624 equ 29692  ; FIX(3.624509785)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
; DESCALE(x, n) = round-to-nearest of x / 2^n (adds half an LSB, then shifts).
%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
F_0_211 equ DESCALE( 226735879, 30 - CONST_BITS)  ; FIX(0.211164243)
F_0_509 equ DESCALE( 547388834, 30 - CONST_BITS)  ; FIX(0.509795579)
F_0_601 equ DESCALE( 645689155, 30 - CONST_BITS)  ; FIX(0.601344887)
F_0_720 equ DESCALE( 774124714, 30 - CONST_BITS)  ; FIX(0.720959822)
F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
F_0_850 equ DESCALE( 913142361, 30 - CONST_BITS)  ; FIX(0.850430095)
F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
F_1_061 equ DESCALE(1139878239, 30 - CONST_BITS)  ; FIX(1.061594337)
F_1_272 equ DESCALE(1366614119, 30 - CONST_BITS)  ; FIX(1.272758580)
F_1_451 equ DESCALE(1558831516, 30 - CONST_BITS)  ; FIX(1.451774981)
F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
F_2_172 equ DESCALE(2332956230, 30 - CONST_BITS)  ; FIX(2.172734803)
F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS)  ; FIX(3.624509785)
%endif
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
; --------------------------------------------------------------------------
shun-iwasawa 82a8f5
    SECTION     SEG_CONST

; Constant table shared by the reduced-size IDCT routines in this file.
; Every entry is 8 bytes (one mmword):
;   PW_*  - a pair of 16-bit multipliers repeated twice, laid out so that a
;           single pmaddwd against interleaved (a, b) words yields
;           a * first + b * second in each 32-bit lane.  In the names, "F"
;           marks a positive factor and "MF" a negated one.
;   PD_*  - the rounding term 1 << (shift - 1), repeated in both 32-bit lanes,
;           added before the corresponding psrad by DESCALE_Pn_m.
;   PB_*  - CENTERJSAMPLE replicated into all eight bytes.

    alignz      32
    GLOBAL_DATA(jconst_idct_red_mmx)

EXTN(jconst_idct_red_mmx):

PW_F184_MF076   times 2 dw  F_1_847, -F_0_765
PW_F256_F089    times 2 dw  F_2_562,  F_0_899
PW_F106_MF217   times 2 dw  F_1_061, -F_2_172
PW_MF060_MF050  times 2 dw -F_0_601, -F_0_509
PW_F145_MF021   times 2 dw  F_1_451, -F_0_211
PW_F362_MF127   times 2 dw  F_3_624, -F_1_272
PW_F085_MF072   times 2 dw  F_0_850, -F_0_720
PD_DESCALE_P1_4 times 2 dd  1 << (DESCALE_P1_4 - 1)
PD_DESCALE_P2_4 times 2 dd  1 << (DESCALE_P2_4 - 1)
PD_DESCALE_P1_2 times 2 dd  1 << (DESCALE_P1_2 - 1)
PD_DESCALE_P2_2 times 2 dd  1 << (DESCALE_P2_2 - 1)
PB_CENTERJSAMP  times 8 db  CENTERJSAMPLE

    alignz      32
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
; --------------------------------------------------------------------------
shun-iwasawa 82a8f5
    SECTION     SEG_TEXT
    BITS        32
;
; Perform dequantization and inverse DCT on one block of coefficients,
; producing a reduced-size 4x4 output block.
;
; GLOBAL(void)
; jsimd_idct_4x4_mmx(void *dct_table, JCOEFPTR coef_block,
;                    JSAMPARRAY output_buf, JDIMENSION output_col)
;
; Arguments (all passed on the stack, addressed relative to the frame
; pointer captured right after "push ebp"):
;   dct_table   - multiplier (quantization) table, read as
;                 ISLOW_MULT_TYPE words
;   coef_block  - input coefficient block (JCOEF words)
;   output_buf  - array of output row pointers; rows 0..3 are written
;   output_col  - byte offset applied to each output row pointer
;
; Register roles:
;   Pass 1:  edx = quantptr, esi = inptr, edi = wsptr, ecx = loop counter,
;            eax = scratch for the zero-column test
;   Pass 2:  esi = wsptr (later reused for row pointers), edi = output_buf,
;            eax = output_col, edx = row pointer
;   ebx      = GOT base for GOTOFF() constant addressing (set by get_GOT)
;   mm0-mm7  = all used as scratch; emms is executed before returning
;
; Saved and restored here: ebp, esi, edi, and ebx via pushpic/poppic.
; eax, ecx and edx are clobbered.
;
; Coefficient row 4 is never loaded or tested by this routine; only rows
; 0, 1, 2, 3, 5, 6 and 7 contribute to the output.
; (MMBLOCK/DWBLOCK are provided by the include files; their first argument
; is presumably the row index within the block - confirm in jsimdext.inc.)
;

%define dct_table(b)   (b) + 8          ; void *dct_table
%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
%define output_col(b)  (b) + 20         ; JDIMENSION output_col

; Local frame, addressed from the mmword-aligned ebp:
;   [ebp + 0]               saved pointer to the caller-visible frame
;   wk(0), wk(1)            two mmword spill slots just below ebp
;   workspace               DCTSIZE2 JCOEFs immediately below wk(0)
%define original_ebp   ebp + 0
%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
                                        ; mmword wk[WK_NUM]
%define WK_NUM         2
%define workspace      wk(0) - DCTSIZE2 * SIZEOF_JCOEF
                                        ; JCOEF workspace[DCTSIZE2]

    align       32
    GLOBAL_FUNCTION(jsimd_idct_4x4_mmx)

EXTN(jsimd_idct_4x4_mmx):
    ; Prologue: build an mmword-aligned frame.  The unaligned frame pointer
    ; is kept in eax and also stored at [ebp] so the epilogue can restore
    ; esp with "pop esp".
    push        ebp
    mov         eax, esp                    ; eax = original ebp
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
    mov         [esp], eax
    mov         ebp, esp                    ; ebp = aligned ebp
    lea         esp, [workspace]            ; reserve wk[] and workspace[]
    pushpic     ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    get_GOT     ebx                     ; get GOT address

    ; ---- Pass 1: process columns from input, store into work array.
    ;
    ; Each iteration handles four columns at once (one mmword per input row)
    ; and stores the four results per column transposed, so every workspace
    ; row written here holds the pass-1 outputs of one input column.

    ; eax is expected to still hold the original frame pointer from the
    ; prologue (this relies on pushpic/get_GOT leaving eax untouched), hence
    ; the reload below stays disabled.
;   mov         eax, [original_ebp]
    mov         edx, POINTER [dct_table(eax)]    ; quantptr
    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
    lea         edi, [workspace]                 ; JCOEF *wsptr
    mov         ecx, DCTSIZE/4                   ; ctr
    alignx      16, 7
.columnloop:
%ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
    ; Cheap early-out: check the first two columns of rows 1 and 2 with
    ; plain integer loads before doing the full MMX test.
    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
    jnz         short .columnDCT

    ; OR together rows 1, 2, 3, 5, 6 and 7 for all four columns; any
    ; non-zero word survives the saturating pack and makes eax non-zero.
    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
    por         mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    por         mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    por         mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
    por         mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
    por         mm0, mm1
    packsswb    mm0, mm0
    movd        eax, mm0
    test        eax, eax
    jnz         short .columnDCT

    ; -- AC terms all zero
    ;
    ; Every output of a column equals its dequantized DC term scaled by
    ; 2^PASS1_BITS, so just broadcast it across the workspace row.

    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
    pmullw      mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]

    psllw       mm0, PASS1_BITS

    movq        mm2, mm0                ; mm0=in0=(00 01 02 03)
    punpcklwd   mm0, mm0                ; mm0=(00 00 01 01)
    punpckhwd   mm2, mm2                ; mm2=(02 02 03 03)

    movq        mm1, mm0
    punpckldq   mm0, mm0                ; mm0=(00 00 00 00)
    punpckhdq   mm1, mm1                ; mm1=(01 01 01 01)
    movq        mm3, mm2
    punpckldq   mm2, mm2                ; mm2=(02 02 02 02)
    punpckhdq   mm3, mm3                ; mm3=(03 03 03 03)

    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
    jmp         near .nextcolumn
    alignx      16, 7
%endif
.columnDCT:

    ; -- Odd part
    ;
    ; With in1/in3/in5/in7 the dequantized odd rows:
    ;   tmp2 = in1*F_2_562 + in3*F_0_899 - in5*F_0_601 - in7*F_0_509
    ;   tmp0 = in1*F_1_061 - in3*F_2_172 + in5*F_1_451 - in7*F_0_211
    ; "L"/"H" denote the 32-bit results for the low/high pair of columns.

    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    movq        mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    pmullw      mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
    pmullw      mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
    movq        mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    movq        mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
    pmullw      mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
    pmullw      mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]

    ; Interleave in1 with in3 so each pmaddwd lane sees one (in1, in3) pair.
    movq        mm4, mm0
    movq        mm5, mm0
    punpcklwd   mm4, mm1
    punpckhwd   mm5, mm1
    movq        mm0, mm4
    movq        mm1, mm5
    pmaddwd     mm4, [GOTOFF(ebx,PW_F256_F089)]   ; mm4=(tmp2L)
    pmaddwd     mm5, [GOTOFF(ebx,PW_F256_F089)]   ; mm5=(tmp2H)
    pmaddwd     mm0, [GOTOFF(ebx,PW_F106_MF217)]  ; mm0=(tmp0L)
    pmaddwd     mm1, [GOTOFF(ebx,PW_F106_MF217)]  ; mm1=(tmp0H)

    ; Same for the (in5, in7) pairs.
    movq        mm6, mm2
    movq        mm7, mm2
    punpcklwd   mm6, mm3
    punpckhwd   mm7, mm3
    movq        mm2, mm6
    movq        mm3, mm7
    pmaddwd     mm6, [GOTOFF(ebx,PW_MF060_MF050)]  ; mm6=(tmp2L)
    pmaddwd     mm7, [GOTOFF(ebx,PW_MF060_MF050)]  ; mm7=(tmp2H)
    pmaddwd     mm2, [GOTOFF(ebx,PW_F145_MF021)]   ; mm2=(tmp0L)
    pmaddwd     mm3, [GOTOFF(ebx,PW_F145_MF021)]   ; mm3=(tmp0H)

    paddd       mm6, mm4                ; mm6=tmp2L
    paddd       mm7, mm5                ; mm7=tmp2H
    paddd       mm2, mm0                ; mm2=tmp0L
    paddd       mm3, mm1                ; mm3=tmp0H

    ; Spill the odd tmp0 so its registers can be reused by the even part;
    ; the odd tmp2 stays live in mm6/mm7.
    movq        MMWORD [wk(0)], mm2     ; wk(0)=tmp0L
    movq        MMWORD [wk(1)], mm3     ; wk(1)=tmp0H

    ; -- Even part
    ;
    ;   tmp0  = in0 << (CONST_BITS+1)
    ;   tmp2  = in2*F_1_847 - in6*F_0_765
    ;   tmp10 = tmp0 + tmp2,  tmp12 = tmp0 - tmp2

    movq        mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
    movq        mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
    movq        mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
    pmullw      mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
    pmullw      mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
    pmullw      mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]

    ; Unpacking in0 into the upper word of each dword gives in0 << 16; a
    ; single arithmetic right shift then both sign-extends and leaves the
    ; value scaled by 2^(CONST_BITS+1).
    pxor        mm1, mm1
    pxor        mm2, mm2
    punpcklwd   mm1, mm4                ; mm1=tmp0L
    punpckhwd   mm2, mm4                ; mm2=tmp0H
    psrad       mm1, (16-CONST_BITS-1)  ; psrad mm1,16 & pslld mm1,CONST_BITS+1
    psrad       mm2, (16-CONST_BITS-1)  ; psrad mm2,16 & pslld mm2,CONST_BITS+1

    movq        mm3, mm5                ; mm5=in2=z2
    punpcklwd   mm5, mm0                ; mm0=in6=z3
    punpckhwd   mm3, mm0
    pmaddwd     mm5, [GOTOFF(ebx,PW_F184_MF076)]  ; mm5=tmp2L
    pmaddwd     mm3, [GOTOFF(ebx,PW_F184_MF076)]  ; mm3=tmp2H

    movq        mm4, mm1
    movq        mm0, mm2
    paddd       mm1, mm5                ; mm1=tmp10L
    paddd       mm2, mm3                ; mm2=tmp10H
    psubd       mm4, mm5                ; mm4=tmp12L
    psubd       mm0, mm3                ; mm0=tmp12H

    ; -- Final output stage
    ;
    ;   data0 = tmp10 + tmp2(odd),  data3 = tmp10 - tmp2(odd)
    ;   data1 = tmp12 + tmp0(odd),  data2 = tmp12 - tmp0(odd)
    ; each rounded and shifted right by DESCALE_P1_4, then packed to words.

    movq        mm5, mm1
    movq        mm3, mm2
    paddd       mm1, mm6                ; mm1=data0L
    paddd       mm2, mm7                ; mm2=data0H
    psubd       mm5, mm6                ; mm5=data3L
    psubd       mm3, mm7                ; mm3=data3H

    movq        mm6, [GOTOFF(ebx,PD_DESCALE_P1_4)]  ; mm6=[PD_DESCALE_P1_4]

    paddd       mm1, mm6
    paddd       mm2, mm6
    psrad       mm1, DESCALE_P1_4
    psrad       mm2, DESCALE_P1_4
    paddd       mm5, mm6
    paddd       mm3, mm6
    psrad       mm5, DESCALE_P1_4
    psrad       mm3, DESCALE_P1_4

    packssdw    mm1, mm2                ; mm1=data0=(00 01 02 03)
    packssdw    mm5, mm3                ; mm5=data3=(30 31 32 33)

    movq        mm7, MMWORD [wk(0)]     ; mm7=tmp0L
    movq        mm6, MMWORD [wk(1)]     ; mm6=tmp0H

    movq        mm2, mm4
    movq        mm3, mm0
    paddd       mm4, mm7                ; mm4=data1L
    paddd       mm0, mm6                ; mm0=data1H
    psubd       mm2, mm7                ; mm2=data2L
    psubd       mm3, mm6                ; mm3=data2H

    movq        mm7, [GOTOFF(ebx,PD_DESCALE_P1_4)]  ; mm7=[PD_DESCALE_P1_4]

    paddd       mm4, mm7
    paddd       mm0, mm7
    psrad       mm4, DESCALE_P1_4
    psrad       mm0, DESCALE_P1_4
    paddd       mm2, mm7
    paddd       mm3, mm7
    psrad       mm2, DESCALE_P1_4
    psrad       mm3, DESCALE_P1_4

    packssdw    mm4, mm0                ; mm4=data1=(10 11 12 13)
    packssdw    mm2, mm3                ; mm2=data2=(20 21 22 23)

    ; 4x4 word transpose so that each stored mmword holds the four outputs
    ; of a single input column.
    movq        mm6, mm1                ; transpose coefficients(phase 1)
    punpcklwd   mm1, mm4                ; mm1=(00 10 01 11)
    punpckhwd   mm6, mm4                ; mm6=(02 12 03 13)
    movq        mm7, mm2                ; transpose coefficients(phase 1)
    punpcklwd   mm2, mm5                ; mm2=(20 30 21 31)
    punpckhwd   mm7, mm5                ; mm7=(22 32 23 33)

    movq        mm0, mm1                ; transpose coefficients(phase 2)
    punpckldq   mm1, mm2                ; mm1=(00 10 20 30)
    punpckhdq   mm0, mm2                ; mm0=(01 11 21 31)
    movq        mm3, mm6                ; transpose coefficients(phase 2)
    punpckldq   mm6, mm7                ; mm6=(02 12 22 32)
    punpckhdq   mm3, mm7                ; mm3=(03 13 23 33)

    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1
    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6
    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3

.nextcolumn:
    ; Advance to the next four input columns / next four workspace rows.
    add         esi, byte 4*SIZEOF_JCOEF            ; coef_block
    add         edx, byte 4*SIZEOF_ISLOW_MULT_TYPE  ; quantptr
    add         edi, byte 4*DCTSIZE*SIZEOF_JCOEF    ; wsptr
    dec         ecx                                 ; ctr
    jnz         near .columnloop

    ; ---- Pass 2: process rows from work array, store into output array.
    ;
    ; Same butterfly as pass 1, applied once to the whole workspace (no
    ; dequantization, no loop), descaled by DESCALE_P2_4 and converted to
    ; samples.

    mov         eax, [original_ebp]
    lea         esi, [workspace]                   ; JCOEF *wsptr
    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
    mov         eax, JDIMENSION [output_col(eax)]

    ; -- Odd part

    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    movq        mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    movq        mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    movq        mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]

    movq        mm4, mm0
    movq        mm5, mm0
    punpcklwd   mm4, mm1
    punpckhwd   mm5, mm1
    movq        mm0, mm4
    movq        mm1, mm5
    pmaddwd     mm4, [GOTOFF(ebx,PW_F256_F089)]   ; mm4=(tmp2L)
    pmaddwd     mm5, [GOTOFF(ebx,PW_F256_F089)]   ; mm5=(tmp2H)
    pmaddwd     mm0, [GOTOFF(ebx,PW_F106_MF217)]  ; mm0=(tmp0L)
    pmaddwd     mm1, [GOTOFF(ebx,PW_F106_MF217)]  ; mm1=(tmp0H)

    movq        mm6, mm2
    movq        mm7, mm2
    punpcklwd   mm6, mm3
    punpckhwd   mm7, mm3
    movq        mm2, mm6
    movq        mm3, mm7
    pmaddwd     mm6, [GOTOFF(ebx,PW_MF060_MF050)]  ; mm6=(tmp2L)
    pmaddwd     mm7, [GOTOFF(ebx,PW_MF060_MF050)]  ; mm7=(tmp2H)
    pmaddwd     mm2, [GOTOFF(ebx,PW_F145_MF021)]   ; mm2=(tmp0L)
    pmaddwd     mm3, [GOTOFF(ebx,PW_F145_MF021)]   ; mm3=(tmp0H)

    paddd       mm6, mm4                ; mm6=tmp2L
    paddd       mm7, mm5                ; mm7=tmp2H
    paddd       mm2, mm0                ; mm2=tmp0L
    paddd       mm3, mm1                ; mm3=tmp0H

    movq        MMWORD [wk(0)], mm2     ; wk(0)=tmp0L
    movq        MMWORD [wk(1)], mm3     ; wk(1)=tmp0H

    ; -- Even part

    movq        mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
    movq        mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
    movq        mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]

    pxor        mm1, mm1
    pxor        mm2, mm2
    punpcklwd   mm1, mm4                ; mm1=tmp0L
    punpckhwd   mm2, mm4                ; mm2=tmp0H
    psrad       mm1, (16-CONST_BITS-1)  ; psrad mm1,16 & pslld mm1,CONST_BITS+1
    psrad       mm2, (16-CONST_BITS-1)  ; psrad mm2,16 & pslld mm2,CONST_BITS+1

    movq        mm3, mm5                ; mm5=in2=z2
    punpcklwd   mm5, mm0                ; mm0=in6=z3
    punpckhwd   mm3, mm0
    pmaddwd     mm5, [GOTOFF(ebx,PW_F184_MF076)]  ; mm5=tmp2L
    pmaddwd     mm3, [GOTOFF(ebx,PW_F184_MF076)]  ; mm3=tmp2H

    movq        mm4, mm1
    movq        mm0, mm2
    paddd       mm1, mm5                ; mm1=tmp10L
    paddd       mm2, mm3                ; mm2=tmp10H
    psubd       mm4, mm5                ; mm4=tmp12L
    psubd       mm0, mm3                ; mm0=tmp12H

    ; -- Final output stage

    movq        mm5, mm1
    movq        mm3, mm2
    paddd       mm1, mm6                ; mm1=data0L
    paddd       mm2, mm7                ; mm2=data0H
    psubd       mm5, mm6                ; mm5=data3L
    psubd       mm3, mm7                ; mm3=data3H

    movq        mm6, [GOTOFF(ebx,PD_DESCALE_P2_4)]  ; mm6=[PD_DESCALE_P2_4]

    paddd       mm1, mm6
    paddd       mm2, mm6
    psrad       mm1, DESCALE_P2_4
    psrad       mm2, DESCALE_P2_4
    paddd       mm5, mm6
    paddd       mm3, mm6
    psrad       mm5, DESCALE_P2_4
    psrad       mm3, DESCALE_P2_4

    packssdw    mm1, mm2                ; mm1=data0=(00 10 20 30)
    packssdw    mm5, mm3                ; mm5=data3=(03 13 23 33)

    movq        mm7, MMWORD [wk(0)]     ; mm7=tmp0L
    movq        mm6, MMWORD [wk(1)]     ; mm6=tmp0H

    movq        mm2, mm4
    movq        mm3, mm0
    paddd       mm4, mm7                ; mm4=data1L
    paddd       mm0, mm6                ; mm0=data1H
    psubd       mm2, mm7                ; mm2=data2L
    psubd       mm3, mm6                ; mm3=data2H

    movq        mm7, [GOTOFF(ebx,PD_DESCALE_P2_4)]  ; mm7=[PD_DESCALE_P2_4]

    paddd       mm4, mm7
    paddd       mm0, mm7
    psrad       mm4, DESCALE_P2_4
    psrad       mm0, DESCALE_P2_4
    paddd       mm2, mm7
    paddd       mm3, mm7
    psrad       mm2, DESCALE_P2_4
    psrad       mm3, DESCALE_P2_4

    packssdw    mm4, mm0                ; mm4=data1=(01 11 21 31)
    packssdw    mm2, mm3                ; mm2=data2=(02 12 22 32)

    ; Narrow to signed bytes with saturation, then add CENTERJSAMPLE to
    ; every byte to obtain the final sample values.
    movq        mm6, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; mm6=[PB_CENTERJSAMP]

    packsswb    mm1, mm2                ; mm1=(00 10 20 30 02 12 22 32)
    packsswb    mm4, mm5                ; mm4=(01 11 21 31 03 13 23 33)
    paddb       mm1, mm6
    paddb       mm4, mm6

    ; Byte transpose back to row-major order: afterwards mm1 holds output
    ; rows 0 and 1, mm0 holds output rows 2 and 3 (four samples each).
    movq        mm7, mm1                ; transpose coefficients(phase 1)
    punpcklbw   mm1, mm4                ; mm1=(00 01 10 11 20 21 30 31)
    punpckhbw   mm7, mm4                ; mm7=(02 03 12 13 22 23 32 33)

    movq        mm0, mm1                ; transpose coefficients(phase 2)
    punpcklwd   mm1, mm7                ; mm1=(00 01 02 03 10 11 12 13)
    punpckhwd   mm0, mm7                ; mm0=(20 21 22 23 30 31 32 33)

    ; Store rows 0 and 2 from the low dwords ...
    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
    mov         esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
    movd        dword [edx+eax*SIZEOF_JSAMPLE], mm1
    movd        dword [esi+eax*SIZEOF_JSAMPLE], mm0

    ; ... then shift the high dwords down and store rows 1 and 3.
    psrlq       mm1, 4*BYTE_BIT
    psrlq       mm0, 4*BYTE_BIT

    mov         edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
    mov         esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
    movd        dword [edx+eax*SIZEOF_JSAMPLE], mm1
    movd        dword [esi+eax*SIZEOF_JSAMPLE], mm0

    emms                                ; empty MMX state

    ; Epilogue: undo the pushes, then unwind the aligned frame via the
    ; pointer saved at [ebp] in the prologue.
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    poppic      ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- original ebp
    pop         ebp
    ret
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
; --------------------------------------------------------------------------
shun-iwasawa 82a8f5
;
shun-iwasawa 82a8f5
; Perform dequantization and inverse DCT on one block of coefficients,
shun-iwasawa 82a8f5
; producing a reduced-size 2x2 output block.
shun-iwasawa 82a8f5
;
shun-iwasawa 82a8f5
; GLOBAL(void)
shun-iwasawa 82a8f5
; jsimd_idct_2x2_mmx(void *dct_table, JCOEFPTR coef_block,
shun-iwasawa 82a8f5
;                    JSAMPARRAY output_buf, JDIMENSION output_col)
shun-iwasawa 82a8f5
;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
%define dct_table(b)   (b) + 8          ; void *dct_table
shun-iwasawa 82a8f5
%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
shun-iwasawa 82a8f5
%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
shun-iwasawa 82a8f5
%define output_col(b)  (b) + 20         ; JDIMENSION output_col
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    align       32
shun-iwasawa 82a8f5
    GLOBAL_FUNCTION(jsimd_idct_2x2_mmx)
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
EXTN(jsimd_idct_2x2_mmx):
shun-iwasawa 82a8f5
    push        ebp
shun-iwasawa 82a8f5
    mov         ebp, esp
shun-iwasawa 82a8f5
    push        ebx
shun-iwasawa 82a8f5
;   push        ecx                     ; need not be preserved
shun-iwasawa 82a8f5
;   push        edx                     ; need not be preserved
shun-iwasawa 82a8f5
    push        esi
shun-iwasawa 82a8f5
    push        edi
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    get_GOT     ebx                     ; get GOT address
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    ; ---- Pass 1: process columns from input.
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    mov         edx, POINTER [dct_table(ebp)]    ; quantptr
shun-iwasawa 82a8f5
    mov         esi, JCOEFPTR [coef_block(ebp)]  ; inptr
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    ; | input:                  | result:        |
shun-iwasawa 82a8f5
    ; | 00 01 ** 03 ** 05 ** 07 |                |
shun-iwasawa 82a8f5
    ; | 10 11 ** 13 ** 15 ** 17 |                |
shun-iwasawa 82a8f5
    ; | ** ** ** ** ** ** ** ** |                |
shun-iwasawa 82a8f5
    ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
shun-iwasawa 82a8f5
    ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
shun-iwasawa 82a8f5
    ; | 50 51 ** 53 ** 55 ** 57 |                |
shun-iwasawa 82a8f5
    ; | ** ** ** ** ** ** ** ** |                |
shun-iwasawa 82a8f5
    ; | 70 71 ** 73 ** 75 ** 77 |                |
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    ; -- Odd part
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
shun-iwasawa 82a8f5
    movq        mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
shun-iwasawa 82a8f5
    pmullw      mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
shun-iwasawa 82a8f5
    pmullw      mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
shun-iwasawa 82a8f5
    movq        mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
shun-iwasawa 82a8f5
    movq        mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
shun-iwasawa 82a8f5
    pmullw      mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
shun-iwasawa 82a8f5
    pmullw      mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    ; mm0=(10 11 ** 13), mm1=(30 31 ** 33)
shun-iwasawa 82a8f5
    ; mm2=(50 51 ** 53), mm3=(70 71 ** 73)
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    pcmpeqd     mm7, mm7
shun-iwasawa 82a8f5
    pslld       mm7, WORD_BIT           ; mm7={0x0000 0xFFFF 0x0000 0xFFFF}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    movq        mm4, mm0                ; mm4=(10 11 ** 13)
shun-iwasawa 82a8f5
    movq        mm5, mm2                ; mm5=(50 51 ** 53)
shun-iwasawa 82a8f5
    punpcklwd   mm4, mm1                ; mm4=(10 30 11 31)
shun-iwasawa 82a8f5
    punpcklwd   mm5, mm3                ; mm5=(50 70 51 71)
shun-iwasawa 82a8f5
    pmaddwd     mm4, [GOTOFF(ebx,PW_F362_MF127)]
shun-iwasawa 82a8f5
    pmaddwd     mm5, [GOTOFF(ebx,PW_F085_MF072)]
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    psrld       mm0, WORD_BIT           ; mm0=(11 -- 13 --)
shun-iwasawa 82a8f5
    pand        mm1, mm7                ; mm1=(-- 31 -- 33)
shun-iwasawa 82a8f5
    psrld       mm2, WORD_BIT           ; mm2=(51 -- 53 --)
shun-iwasawa 82a8f5
    pand        mm3, mm7                ; mm3=(-- 71 -- 73)
shun-iwasawa 82a8f5
    por         mm0, mm1                ; mm0=(11 31 13 33)
shun-iwasawa 82a8f5
    por         mm2, mm3                ; mm2=(51 71 53 73)
shun-iwasawa 82a8f5
    pmaddwd     mm0, [GOTOFF(ebx,PW_F362_MF127)]
shun-iwasawa 82a8f5
    pmaddwd     mm2, [GOTOFF(ebx,PW_F085_MF072)]
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    paddd       mm4, mm5                ; mm4=tmp0[col0 col1]
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    movq        mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)]
shun-iwasawa 82a8f5
    movq        mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)]
shun-iwasawa 82a8f5
    pmullw      mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
shun-iwasawa 82a8f5
    pmullw      mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
shun-iwasawa 82a8f5
    movq        mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)]
shun-iwasawa 82a8f5
    movq        mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)]
shun-iwasawa 82a8f5
    pmullw      mm3, MMWORD [MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
shun-iwasawa 82a8f5
    pmullw      mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    ; mm6=(** 15 ** 17), mm1=(** 35 ** 37)
shun-iwasawa 82a8f5
    ; mm3=(** 55 ** 57), mm5=(** 75 ** 77)
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    psrld       mm6, WORD_BIT           ; mm6=(15 -- 17 --)
shun-iwasawa 82a8f5
    pand        mm1, mm7                ; mm1=(-- 35 -- 37)
shun-iwasawa 82a8f5
    psrld       mm3, WORD_BIT           ; mm3=(55 -- 57 --)
shun-iwasawa 82a8f5
    pand        mm5, mm7                ; mm5=(-- 75 -- 77)
shun-iwasawa 82a8f5
    por         mm6, mm1                ; mm6=(15 35 17 37)
shun-iwasawa 82a8f5
    por         mm3, mm5                ; mm3=(55 75 57 77)
shun-iwasawa 82a8f5
    pmaddwd     mm6, [GOTOFF(ebx,PW_F362_MF127)]
shun-iwasawa 82a8f5
    pmaddwd     mm3, [GOTOFF(ebx,PW_F085_MF072)]
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    paddd       mm0, mm2                ; mm0=tmp0[col1 col3]
shun-iwasawa 82a8f5
    paddd       mm6, mm3                ; mm6=tmp0[col5 col7]
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    ; -- Even part
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    movq        mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
shun-iwasawa 82a8f5
    movq        mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)]
shun-iwasawa 82a8f5
    pmullw      mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
shun-iwasawa 82a8f5
    pmullw      mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    ; mm1=(00 01 ** 03), mm5=(** 05 ** 07)
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    movq        mm2, mm1                      ; mm2=(00 01 ** 03)
shun-iwasawa 82a8f5
    pslld       mm1, WORD_BIT                 ; mm1=(-- 00 -- **)
shun-iwasawa 82a8f5
    psrad       mm1, (WORD_BIT-CONST_BITS-2)  ; mm1=tmp10[col0 ****]
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    pand        mm2, mm7                      ; mm2=(-- 01 -- 03)
shun-iwasawa 82a8f5
    pand        mm5, mm7                      ; mm5=(-- 05 -- 07)
shun-iwasawa 82a8f5
    psrad       mm2, (WORD_BIT-CONST_BITS-2)  ; mm2=tmp10[col1 col3]
shun-iwasawa 82a8f5
    psrad       mm5, (WORD_BIT-CONST_BITS-2)  ; mm5=tmp10[col5 col7]
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    ; -- Final output stage
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    movq        mm3, mm1
shun-iwasawa 82a8f5
    paddd       mm1, mm4                ; mm1=data0[col0 ****]=(A0 **)
shun-iwasawa 82a8f5
    psubd       mm3, mm4                ; mm3=data1[col0 ****]=(B0 **)
shun-iwasawa 82a8f5
    punpckldq   mm1, mm3                ; mm1=(A0 B0)
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    movq        mm7, [GOTOFF(ebx,PD_DESCALE_P1_2)]  ; mm7=[PD_DESCALE_P1_2]
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    movq        mm4, mm2
shun-iwasawa 82a8f5
    movq        mm3, mm5
shun-iwasawa 82a8f5
    paddd       mm2, mm0                ; mm2=data0[col1 col3]=(A1 A3)
shun-iwasawa 82a8f5
    paddd       mm5, mm6                ; mm5=data0[col5 col7]=(A5 A7)
shun-iwasawa 82a8f5
    psubd       mm4, mm0                ; mm4=data1[col1 col3]=(B1 B3)
shun-iwasawa 82a8f5
    psubd       mm3, mm6                ; mm3=data1[col5 col7]=(B5 B7)
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    paddd       mm1, mm7
shun-iwasawa 82a8f5
    psrad       mm1, DESCALE_P1_2
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    paddd       mm2, mm7
shun-iwasawa 82a8f5
    paddd       mm5, mm7
shun-iwasawa 82a8f5
    psrad       mm2, DESCALE_P1_2
shun-iwasawa 82a8f5
    psrad       mm5, DESCALE_P1_2
shun-iwasawa 82a8f5
    paddd       mm4, mm7
shun-iwasawa 82a8f5
    paddd       mm3, mm7
shun-iwasawa 82a8f5
    psrad       mm4, DESCALE_P1_2
shun-iwasawa 82a8f5
    psrad       mm3, DESCALE_P1_2
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    ; ---- Pass 2: process rows, store into output array.
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    mov         edi, JSAMPARRAY [output_buf(ebp)]  ; (JSAMPROW *)
shun-iwasawa 82a8f5
    mov         eax, JDIMENSION [output_col(ebp)]
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    ; | input:| result:|
shun-iwasawa 82a8f5
    ; | A0 B0 |        |
shun-iwasawa 82a8f5
    ; | A1 B1 | C0 C1  |
shun-iwasawa 82a8f5
    ; | A3 B3 | D0 D1  |
shun-iwasawa 82a8f5
    ; | A5 B5 |        |
shun-iwasawa 82a8f5
    ; | A7 B7 |        |
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    ; -- Odd part
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    packssdw    mm2, mm4                ; mm2=(A1 A3 B1 B3)
shun-iwasawa 82a8f5
    packssdw    mm5, mm3                ; mm5=(A5 A7 B5 B7)
shun-iwasawa 82a8f5
    pmaddwd     mm2, [GOTOFF(ebx,PW_F362_MF127)]
shun-iwasawa 82a8f5
    pmaddwd     mm5, [GOTOFF(ebx,PW_F085_MF072)]
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    paddd       mm2, mm5                ; mm2=tmp0[row0 row1]
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    ; -- Even part
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    pslld       mm1, (CONST_BITS+2)     ; mm1=tmp10[row0 row1]
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    ; -- Final output stage
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    movq        mm0, [GOTOFF(ebx,PD_DESCALE_P2_2)]  ; mm0=[PD_DESCALE_P2_2]
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    movq        mm6, mm1
shun-iwasawa 82a8f5
    paddd       mm1, mm2                ; mm1=data0[row0 row1]=(C0 C1)
shun-iwasawa 82a8f5
    psubd       mm6, mm2                ; mm6=data1[row0 row1]=(D0 D1)
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    paddd       mm1, mm0
shun-iwasawa 82a8f5
    paddd       mm6, mm0
shun-iwasawa 82a8f5
    psrad       mm1, DESCALE_P2_2
shun-iwasawa 82a8f5
    psrad       mm6, DESCALE_P2_2
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    movq        mm7, mm1                ; transpose coefficients
shun-iwasawa 82a8f5
    punpckldq   mm1, mm6                ; mm1=(C0 D0)
shun-iwasawa 82a8f5
    punpckhdq   mm7, mm6                ; mm7=(C1 D1)
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    packssdw    mm1, mm7                ; mm1=(C0 D0 C1 D1)
shun-iwasawa 82a8f5
    packsswb    mm1, mm1                ; mm1=(C0 D0 C1 D1 C0 D0 C1 D1)
shun-iwasawa 82a8f5
    paddb       mm1, [GOTOFF(ebx,PB_CENTERJSAMP)]
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    movd        ecx, mm1
shun-iwasawa 82a8f5
    movd        ebx, mm1                ; ebx=(C0 D0 C1 D1)
shun-iwasawa 82a8f5
    shr         ecx, 2*BYTE_BIT         ; ecx=(C1 D1 -- --)
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
shun-iwasawa 82a8f5
    mov         esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
shun-iwasawa 82a8f5
    mov         word [edx+eax*SIZEOF_JSAMPLE], bx
shun-iwasawa 82a8f5
    mov         word [esi+eax*SIZEOF_JSAMPLE], cx
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    emms                                ; empty MMX state
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    pop         edi
shun-iwasawa 82a8f5
    pop         esi
shun-iwasawa 82a8f5
;   pop         edx                     ; need not be preserved
shun-iwasawa 82a8f5
;   pop         ecx                     ; need not be preserved
shun-iwasawa 82a8f5
    pop         ebx
shun-iwasawa 82a8f5
    pop         ebp
shun-iwasawa 82a8f5
    ret
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
; For some reason, the OS X linker does not honor the request to align the
shun-iwasawa 82a8f5
; segment unless we do this.
shun-iwasawa 82a8f5
    align       32