/*
 * Armv7 Neon optimizations for libjpeg-turbo
 *
 * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
 *                          All Rights Reserved.
 * Author:  Siarhei Siamashka <siarhei.siamashka@nokia.com>
 * Copyright (C) 2014, Siarhei Siamashka.  All Rights Reserved.
 * Copyright (C) 2014, Linaro Limited.  All Rights Reserved.
 * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
 * Copyright (C) 2015-2016, 2018, Matthieu Darbois.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
#endif


/* Assembler configuration: ARM (not Thumb) state, unified syntax,
 * Armv7-A instruction set with Neon.  .object_arch keeps the object
 * file's architecture attribute at armv4 (presumably for toolchain/linker
 * compatibility -- see the GAS .object_arch documentation). */
.text
.fpu neon
.arch armv7a
.object_arch armv4
.arm
.syntax unified


#define RESPECT_STRICT_ALIGNMENT  1
/*****************************************************************************/

/* Supplementary macro for setting function attributes.
 * Emits the platform-appropriate symbol directives and label for a global
 * function named \fname:
 *   - Mach-O (__APPLE__): leading-underscore name, .private_extern so the
 *     symbol is not exported from the final image.
 *   - ELF: .hidden visibility plus a %function type annotation.
 *   - other: plain .global label.
 */
.macro asm_function fname
#ifdef __APPLE__
    .private_extern _\fname
    .globl _\fname
_\fname:
#else
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm
/* Transpose a block of 4x4 16-bit coefficients held in four 64-bit
 * registers (one row of four coefficients per register).  Two passes of
 * vtrn (16-bit, then 32-bit element swaps) complete the transpose. */
.macro transpose_4x4 x0, x1, x2, x3
    vtrn.16         \x0, \x1
    vtrn.16         \x2, \x3
    vtrn.32         \x0, \x2
    vtrn.32         \x1, \x3
.endm
#define CENTERJSAMPLE  128


/*****************************************************************************/

/*
 * Perform dequantization and inverse DCT on one block of coefficients.
 *
 * GLOBAL(void)
 * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
 *                       JSAMPARRAY output_buf, JDIMENSION output_col)
 */

/* Fixed-point multiplier constants for the ISLOW iDCT
 * (value = round(x * 2^13), i.e. 13 fractional bits). */
#define FIX_0_298631336  (2446)
#define FIX_0_390180644  (3196)
#define FIX_0_541196100  (4433)
#define FIX_0_765366865  (6270)
#define FIX_0_899976223  (7373)
#define FIX_1_175875602  (9633)
#define FIX_1_501321110  (12299)
#define FIX_1_847759065  (15137)
#define FIX_1_961570560  (16069)
#define FIX_2_053119869  (16819)
#define FIX_2_562915447  (20995)
#define FIX_3_072711026  (25172)

/* Pre-combined sums/differences of the above, so each product in the
 * 1-D iDCT needs only a single multiply-accumulate. */
#define FIX_1_175875602_MINUS_1_961570560  (FIX_1_175875602 - FIX_1_961570560)
#define FIX_1_175875602_MINUS_0_390180644  (FIX_1_175875602 - FIX_0_390180644)
#define FIX_0_541196100_MINUS_1_847759065  (FIX_0_541196100 - FIX_1_847759065)
#define FIX_3_072711026_MINUS_2_562915447  (FIX_3_072711026 - FIX_2_562915447)
#define FIX_0_298631336_MINUS_0_899976223  (FIX_0_298631336 - FIX_0_899976223)
#define FIX_1_501321110_MINUS_0_899976223  (FIX_1_501321110 - FIX_0_899976223)
#define FIX_2_053119869_MINUS_2_562915447  (FIX_2_053119869 - FIX_2_562915447)
#define FIX_0_541196100_PLUS_0_765366865   (FIX_0_541196100 + FIX_0_765366865)
/*
 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
 * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
 * (kept here as documentation for the Neon code below; not assembled).
 */
#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) { \
  DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
  JLONG   q1, q2, q3, q4, q5, q6, q7; \
  JLONG   tmp11_plus_tmp2, tmp11_minus_tmp2; \
  \
  /* 1-D iDCT input data */ \
  row0 = xrow0; \
  row1 = xrow1; \
  row2 = xrow2; \
  row3 = xrow3; \
  row4 = xrow4; \
  row5 = xrow5; \
  row6 = xrow6; \
  row7 = xrow7; \
  \
  q5 = row7 + row3; \
  q4 = row5 + row1; \
  q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
       MULTIPLY(q4, FIX_1_175875602); \
  q7 = MULTIPLY(q5, FIX_1_175875602) + \
       MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
  q2 = MULTIPLY(row2, FIX_0_541196100) + \
       MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
  q4 = q6; \
  q3 = ((JLONG)row0 - (JLONG)row4) << 13; \
  q6 += MULTIPLY(row5, -FIX_2_562915447) + \
        MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
  /* now we can use q1 (reloadable constants have been used up) */ \
  q1 = q3 + q2; \
  q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
        MULTIPLY(row1, -FIX_0_899976223); \
  q5 = q7; \
  q1 = q1 + q6; \
  q7 += MULTIPLY(row7, -FIX_0_899976223) + \
        MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
  \
  /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
  tmp11_plus_tmp2 = q1; \
  row1 = 0; \
  \
  q1 = q1 - q6; \
  q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
        MULTIPLY(row3, -FIX_2_562915447); \
  q1 = q1 - q6; \
  q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
       MULTIPLY(row6, FIX_0_541196100); \
  q3 = q3 - q2; \
  \
  /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
  tmp11_minus_tmp2 = q1; \
  \
  q1 = ((JLONG)row0 + (JLONG)row4) << 13; \
  q2 = q1 + q6; \
  q1 = q1 - q6; \
  \
  /* pick up the results */ \
  tmp0  = q4; \
  tmp1  = q5; \
  tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
  tmp3  = q7; \
  tmp10 = q2; \
  tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
  tmp12 = q3; \
  tmp13 = q1; \
}
/* Scalar-lane aliases for the constants loaded into d0-d2 from
 * jsimd_idct_islow_neon_consts below; the XFIX_* names are used as the
 * scalar operand of vmull/vmlal/vmlsl instructions.  The lane mapping must
 * match the .short ordering of the constants table. */
#define XFIX_0_899976223                    d0[0]
#define XFIX_0_541196100                    d0[1]
#define XFIX_2_562915447                    d0[2]
#define XFIX_0_298631336_MINUS_0_899976223  d0[3]
#define XFIX_1_501321110_MINUS_0_899976223  d1[0]
#define XFIX_2_053119869_MINUS_2_562915447  d1[1]
#define XFIX_0_541196100_PLUS_0_765366865   d1[2]
#define XFIX_1_175875602                    d1[3]
#define XFIX_1_175875602_MINUS_0_390180644  d2[0]
#define XFIX_0_541196100_MINUS_1_847759065  d2[1]
#define XFIX_3_072711026_MINUS_2_562915447  d2[2]
#define XFIX_1_175875602_MINUS_1_961570560  d2[3]
/* Constant table loaded into d0-d2; ordering must match the XFIX_* lane
 * aliases above.  16-byte aligned so it can be loaded with vld1 [..., :128]. */
.balign 16
jsimd_idct_islow_neon_consts:
  .short FIX_0_899976223                    /* d0[0] */
  .short FIX_0_541196100                    /* d0[1] */
  .short FIX_2_562915447                    /* d0[2] */
  .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
  .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
  .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
  .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
  .short FIX_1_175875602                    /* d1[3] */
  /* reloadable constants */
  .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
  .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
  .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
  .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */

asm_function jsimd_idct_islow_neon
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    DCT_TABLE       .req r0
shun-iwasawa 82a8f5
    COEF_BLOCK      .req r1
shun-iwasawa 82a8f5
    OUTPUT_BUF      .req r2
shun-iwasawa 82a8f5
    OUTPUT_COL      .req r3
shun-iwasawa 82a8f5
    TMP1            .req r0
shun-iwasawa 82a8f5
    TMP2            .req r1
shun-iwasawa 82a8f5
    TMP3            .req r2
shun-iwasawa 82a8f5
    TMP4            .req ip
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    ROW0L           .req d16
shun-iwasawa 82a8f5
    ROW0R           .req d17
shun-iwasawa 82a8f5
    ROW1L           .req d18
shun-iwasawa 82a8f5
    ROW1R           .req d19
shun-iwasawa 82a8f5
    ROW2L           .req d20
shun-iwasawa 82a8f5
    ROW2R           .req d21
shun-iwasawa 82a8f5
    ROW3L           .req d22
shun-iwasawa 82a8f5
    ROW3R           .req d23
shun-iwasawa 82a8f5
    ROW4L           .req d24
shun-iwasawa 82a8f5
    ROW4R           .req d25
shun-iwasawa 82a8f5
    ROW5L           .req d26
shun-iwasawa 82a8f5
    ROW5R           .req d27
shun-iwasawa 82a8f5
    ROW6L           .req d28
shun-iwasawa 82a8f5
    ROW6R           .req d29
shun-iwasawa 82a8f5
    ROW7L           .req d30
shun-iwasawa 82a8f5
    ROW7R           .req d31
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Load and dequantize coefficients into Neon registers
shun-iwasawa 82a8f5
     * with the following allocation:
shun-iwasawa 82a8f5
     *       0 1 2 3 | 4 5 6 7
shun-iwasawa 82a8f5
     *      ---------+--------
shun-iwasawa 82a8f5
     *   0 | d16     | d17     ( q8  )
shun-iwasawa 82a8f5
     *   1 | d18     | d19     ( q9  )
shun-iwasawa 82a8f5
     *   2 | d20     | d21     ( q10 )
shun-iwasawa 82a8f5
     *   3 | d22     | d23     ( q11 )
shun-iwasawa 82a8f5
     *   4 | d24     | d25     ( q12 )
shun-iwasawa 82a8f5
     *   5 | d26     | d27     ( q13 )
shun-iwasawa 82a8f5
     *   6 | d28     | d29     ( q14 )
shun-iwasawa 82a8f5
     *   7 | d30     | d31     ( q15 )
shun-iwasawa 82a8f5
     */
shun-iwasawa 82a8f5
    adr             ip, jsimd_idct_islow_neon_consts
shun-iwasawa 82a8f5
    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
shun-iwasawa 82a8f5
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
shun-iwasawa 82a8f5
    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
shun-iwasawa 82a8f5
    vmul.s16        q8, q8, q0
shun-iwasawa 82a8f5
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
shun-iwasawa 82a8f5
    vmul.s16        q9, q9, q1
shun-iwasawa 82a8f5
    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
shun-iwasawa 82a8f5
    vmul.s16        q10, q10, q2
shun-iwasawa 82a8f5
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
shun-iwasawa 82a8f5
    vmul.s16        q11, q11, q3
shun-iwasawa 82a8f5
    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
shun-iwasawa 82a8f5
    vmul.s16        q12, q12, q0
shun-iwasawa 82a8f5
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
shun-iwasawa 82a8f5
    vmul.s16        q14, q14, q2
shun-iwasawa 82a8f5
    vmul.s16        q13, q13, q1
shun-iwasawa 82a8f5
    vld1.16         {d0, d1, d2, d3}, [ip, :128]  /* load constants */
shun-iwasawa 82a8f5
    add             ip, ip, #16
shun-iwasawa 82a8f5
    vmul.s16        q15, q15, q3
shun-iwasawa 82a8f5
    vpush           {d8 - d15}                    /* save Neon registers */
shun-iwasawa 82a8f5
    /* 1-D IDCT, pass 1, left 4x8 half */
shun-iwasawa 82a8f5
    vadd.s16        d4, ROW7L, ROW3L
shun-iwasawa 82a8f5
    vadd.s16        d5, ROW5L, ROW1L
shun-iwasawa 82a8f5
    vmull.s16       q6, d4, XFIX_1_175875602_MINUS_1_961570560
shun-iwasawa 82a8f5
    vmlal.s16       q6, d5, XFIX_1_175875602
shun-iwasawa 82a8f5
    vmull.s16       q7, d4, XFIX_1_175875602
shun-iwasawa 82a8f5
      /* Check for the zero coefficients in the right 4x8 half */
shun-iwasawa 82a8f5
      push            {r4, r5}
shun-iwasawa 82a8f5
    vmlal.s16       q7, d5, XFIX_1_175875602_MINUS_0_390180644
shun-iwasawa 82a8f5
    vsubl.s16       q3, ROW0L, ROW4L
shun-iwasawa 82a8f5
      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
shun-iwasawa 82a8f5
    vmull.s16       q2, ROW2L, XFIX_0_541196100
shun-iwasawa 82a8f5
    vmlal.s16       q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
shun-iwasawa 82a8f5
      orr             r0, r4, r5
shun-iwasawa 82a8f5
    vmov            q4, q6
shun-iwasawa 82a8f5
    vmlsl.s16       q6, ROW5L, XFIX_2_562915447
shun-iwasawa 82a8f5
      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
shun-iwasawa 82a8f5
    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
shun-iwasawa 82a8f5
    vshl.s32        q3, q3, #13
shun-iwasawa 82a8f5
      orr             r0, r0, r4
shun-iwasawa 82a8f5
    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
shun-iwasawa 82a8f5
      orr             r0, r0, r5
shun-iwasawa 82a8f5
    vadd.s32        q1, q3, q2
shun-iwasawa 82a8f5
      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
shun-iwasawa 82a8f5
    vmov            q5, q7
shun-iwasawa 82a8f5
    vadd.s32        q1, q1, q6
shun-iwasawa 82a8f5
      orr             r0, r0, r4
shun-iwasawa 82a8f5
    vmlsl.s16       q7, ROW7L, XFIX_0_899976223
shun-iwasawa 82a8f5
      orr             r0, r0, r5
shun-iwasawa 82a8f5
    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
shun-iwasawa 82a8f5
    vrshrn.s32      ROW1L, q1, #11
shun-iwasawa 82a8f5
      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
shun-iwasawa 82a8f5
    vsub.s32        q1, q1, q6
shun-iwasawa 82a8f5
    vmlal.s16       q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
shun-iwasawa 82a8f5
      orr             r0, r0, r4
shun-iwasawa 82a8f5
    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
shun-iwasawa 82a8f5
      orr             r0, r0, r5
shun-iwasawa 82a8f5
    vsub.s32        q1, q1, q6
shun-iwasawa 82a8f5
    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
shun-iwasawa 82a8f5
      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
shun-iwasawa 82a8f5
    vmlal.s16       q6, ROW6L, XFIX_0_541196100
shun-iwasawa 82a8f5
    vsub.s32        q3, q3, q2
shun-iwasawa 82a8f5
      orr             r0, r0, r4
shun-iwasawa 82a8f5
    vrshrn.s32      ROW6L, q1, #11
shun-iwasawa 82a8f5
      orr             r0, r0, r5
shun-iwasawa 82a8f5
    vadd.s32        q1, q3, q5
shun-iwasawa 82a8f5
      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
shun-iwasawa 82a8f5
    vsub.s32        q3, q3, q5
shun-iwasawa 82a8f5
    vaddl.s16       q5, ROW0L, ROW4L
shun-iwasawa 82a8f5
      orr             r0, r0, r4
shun-iwasawa 82a8f5
    vrshrn.s32      ROW2L, q1, #11
shun-iwasawa 82a8f5
      orr             r0, r0, r5
shun-iwasawa 82a8f5
    vrshrn.s32      ROW5L, q3, #11
shun-iwasawa 82a8f5
      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
shun-iwasawa 82a8f5
    vshl.s32        q5, q5, #13
shun-iwasawa 82a8f5
    vmlal.s16       q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
shun-iwasawa 82a8f5
      orr             r0, r0, r4
shun-iwasawa 82a8f5
    vadd.s32        q2, q5, q6
shun-iwasawa 82a8f5
      orrs            r0, r0, r5
shun-iwasawa 82a8f5
    vsub.s32        q1, q5, q6
shun-iwasawa 82a8f5
    vadd.s32        q6, q2, q7
shun-iwasawa 82a8f5
      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
shun-iwasawa 82a8f5
    vsub.s32        q2, q2, q7
shun-iwasawa 82a8f5
    vadd.s32        q5, q1, q4
shun-iwasawa 82a8f5
      orr             r0, r4, r5
shun-iwasawa 82a8f5
    vsub.s32        q3, q1, q4
shun-iwasawa 82a8f5
      pop             {r4, r5}
shun-iwasawa 82a8f5
    vrshrn.s32      ROW7L, q2, #11
shun-iwasawa 82a8f5
    vrshrn.s32      ROW3L, q5, #11
shun-iwasawa 82a8f5
    vrshrn.s32      ROW0L, q6, #11
shun-iwasawa 82a8f5
    vrshrn.s32      ROW4L, q3, #11
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
      beq             3f  /* Go to do some special handling for the sparse
shun-iwasawa 82a8f5
                             right 4x8 half */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* 1-D IDCT, pass 1, right 4x8 half */
shun-iwasawa 82a8f5
    vld1.s16        {d2}, [ip, :64]  /* reload constants */
shun-iwasawa 82a8f5
    vadd.s16        d10, ROW7R, ROW3R
shun-iwasawa 82a8f5
    vadd.s16        d8, ROW5R, ROW1R
shun-iwasawa 82a8f5
      /* Transpose left 4x8 half */
shun-iwasawa 82a8f5
      vtrn.16         ROW6L, ROW7L
shun-iwasawa 82a8f5
    vmull.s16       q6, d10, XFIX_1_175875602_MINUS_1_961570560
shun-iwasawa 82a8f5
    vmlal.s16       q6, d8, XFIX_1_175875602
shun-iwasawa 82a8f5
      vtrn.16         ROW2L, ROW3L
shun-iwasawa 82a8f5
    vmull.s16       q7, d10, XFIX_1_175875602
shun-iwasawa 82a8f5
    vmlal.s16       q7, d8, XFIX_1_175875602_MINUS_0_390180644
shun-iwasawa 82a8f5
      vtrn.16         ROW0L, ROW1L
shun-iwasawa 82a8f5
    vsubl.s16       q3, ROW0R, ROW4R
shun-iwasawa 82a8f5
    vmull.s16       q2, ROW2R, XFIX_0_541196100
shun-iwasawa 82a8f5
    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
shun-iwasawa 82a8f5
      vtrn.16         ROW4L, ROW5L
shun-iwasawa 82a8f5
    vmov            q4, q6
shun-iwasawa 82a8f5
    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
shun-iwasawa 82a8f5
    vmlal.s16       q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
shun-iwasawa 82a8f5
      vtrn.32         ROW1L, ROW3L
shun-iwasawa 82a8f5
    vshl.s32        q3, q3, #13
shun-iwasawa 82a8f5
    vmlsl.s16       q4, ROW1R, XFIX_0_899976223
shun-iwasawa 82a8f5
      vtrn.32         ROW4L, ROW6L
shun-iwasawa 82a8f5
    vadd.s32        q1, q3, q2
shun-iwasawa 82a8f5
    vmov            q5, q7
shun-iwasawa 82a8f5
    vadd.s32        q1, q1, q6
shun-iwasawa 82a8f5
      vtrn.32         ROW0L, ROW2L
shun-iwasawa 82a8f5
    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
shun-iwasawa 82a8f5
    vmlal.s16       q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
shun-iwasawa 82a8f5
    vrshrn.s32      ROW1R, q1, #11
shun-iwasawa 82a8f5
      vtrn.32         ROW5L, ROW7L
shun-iwasawa 82a8f5
    vsub.s32        q1, q1, q6
shun-iwasawa 82a8f5
    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
shun-iwasawa 82a8f5
    vmlsl.s16       q5, ROW3R, XFIX_2_562915447
shun-iwasawa 82a8f5
    vsub.s32        q1, q1, q6
shun-iwasawa 82a8f5
    vmull.s16       q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
shun-iwasawa 82a8f5
    vmlal.s16       q6, ROW6R, XFIX_0_541196100
shun-iwasawa 82a8f5
    vsub.s32        q3, q3, q2
shun-iwasawa 82a8f5
    vrshrn.s32      ROW6R, q1, #11
shun-iwasawa 82a8f5
    vadd.s32        q1, q3, q5
shun-iwasawa 82a8f5
    vsub.s32        q3, q3, q5
shun-iwasawa 82a8f5
    vaddl.s16       q5, ROW0R, ROW4R
shun-iwasawa 82a8f5
    vrshrn.s32      ROW2R, q1, #11
shun-iwasawa 82a8f5
    vrshrn.s32      ROW5R, q3, #11
shun-iwasawa 82a8f5
    vshl.s32        q5, q5, #13
shun-iwasawa 82a8f5
    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
shun-iwasawa 82a8f5
    vadd.s32        q2, q5, q6
shun-iwasawa 82a8f5
    vsub.s32        q1, q5, q6
shun-iwasawa 82a8f5
    vadd.s32        q6, q2, q7
shun-iwasawa 82a8f5
    vsub.s32        q2, q2, q7
shun-iwasawa 82a8f5
    vadd.s32        q5, q1, q4
shun-iwasawa 82a8f5
    vsub.s32        q3, q1, q4
shun-iwasawa 82a8f5
    vrshrn.s32      ROW7R, q2, #11
shun-iwasawa 82a8f5
    vrshrn.s32      ROW3R, q5, #11
shun-iwasawa 82a8f5
    vrshrn.s32      ROW0R, q6, #11
shun-iwasawa 82a8f5
    vrshrn.s32      ROW4R, q3, #11
shun-iwasawa 82a8f5
    /* Transpose right 4x8 half */
shun-iwasawa 82a8f5
    vtrn.16         ROW6R, ROW7R
shun-iwasawa 82a8f5
    vtrn.16         ROW2R, ROW3R
shun-iwasawa 82a8f5
    vtrn.16         ROW0R, ROW1R
shun-iwasawa 82a8f5
    vtrn.16         ROW4R, ROW5R
shun-iwasawa 82a8f5
    vtrn.32         ROW1R, ROW3R
shun-iwasawa 82a8f5
    vtrn.32         ROW4R, ROW6R
shun-iwasawa 82a8f5
    vtrn.32         ROW0R, ROW2R
shun-iwasawa 82a8f5
    vtrn.32         ROW5R, ROW7R
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
shun-iwasawa 82a8f5
    vld1.s16        {d2}, [ip, :64]               /* reload constants */
shun-iwasawa 82a8f5
    vmull.s16       q6, ROW1R, XFIX_1_175875602   /* ROW5L <-> ROW1R */
shun-iwasawa 82a8f5
    vmlal.s16       q6, ROW1L, XFIX_1_175875602
shun-iwasawa 82a8f5
    vmlal.s16       q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
shun-iwasawa 82a8f5
    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
shun-iwasawa 82a8f5
    vmull.s16       q7, ROW3R, XFIX_1_175875602   /* ROW7L <-> ROW3R */
shun-iwasawa 82a8f5
    vmlal.s16       q7, ROW3L, XFIX_1_175875602
shun-iwasawa 82a8f5
    vmlal.s16       q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
shun-iwasawa 82a8f5
    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
shun-iwasawa 82a8f5
    vsubl.s16       q3, ROW0L, ROW0R              /* ROW4L <-> ROW0R */
shun-iwasawa 82a8f5
    vmull.s16       q2, ROW2L, XFIX_0_541196100
shun-iwasawa 82a8f5
    vmlal.s16       q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065  /* ROW6L <-> ROW2R */
shun-iwasawa 82a8f5
    vmov            q4, q6
shun-iwasawa 82a8f5
    vmlsl.s16       q6, ROW1R, XFIX_2_562915447   /* ROW5L <-> ROW1R */
shun-iwasawa 82a8f5
    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
shun-iwasawa 82a8f5
    vshl.s32        q3, q3, #13
shun-iwasawa 82a8f5
    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
shun-iwasawa 82a8f5
    vadd.s32        q1, q3, q2
shun-iwasawa 82a8f5
    vmov            q5, q7
shun-iwasawa 82a8f5
    vadd.s32        q1, q1, q6
shun-iwasawa 82a8f5
    vmlsl.s16       q7, ROW3R, XFIX_0_899976223   /* ROW7L <-> ROW3R */
shun-iwasawa 82a8f5
    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
shun-iwasawa 82a8f5
    vshrn.s32       ROW1L, q1, #16
shun-iwasawa 82a8f5
    vsub.s32        q1, q1, q6
shun-iwasawa 82a8f5
    vmlal.s16       q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447  /* ROW5L <-> ROW1R */
shun-iwasawa 82a8f5
    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
shun-iwasawa 82a8f5
    vsub.s32        q1, q1, q6
shun-iwasawa 82a8f5
    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
shun-iwasawa 82a8f5
    vmlal.s16       q6, ROW2R, XFIX_0_541196100   /* ROW6L <-> ROW2R */
shun-iwasawa 82a8f5
    vsub.s32        q3, q3, q2
shun-iwasawa 82a8f5
    vshrn.s32       ROW2R, q1, #16                /* ROW6L <-> ROW2R */
shun-iwasawa 82a8f5
    vadd.s32        q1, q3, q5
shun-iwasawa 82a8f5
    vsub.s32        q3, q3, q5
shun-iwasawa 82a8f5
    vaddl.s16       q5, ROW0L, ROW0R              /* ROW4L <-> ROW0R */
shun-iwasawa 82a8f5
    vshrn.s32       ROW2L, q1, #16
shun-iwasawa 82a8f5
    vshrn.s32       ROW1R, q3, #16                /* ROW5L <-> ROW1R */
shun-iwasawa 82a8f5
    vshl.s32        q5, q5, #13
shun-iwasawa 82a8f5
    vmlal.s16       q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223  /* ROW7L <-> ROW3R */
shun-iwasawa 82a8f5
    vadd.s32        q2, q5, q6
shun-iwasawa 82a8f5
    vsub.s32        q1, q5, q6
shun-iwasawa 82a8f5
    vadd.s32        q6, q2, q7
shun-iwasawa 82a8f5
    vsub.s32        q2, q2, q7
shun-iwasawa 82a8f5
    vadd.s32        q5, q1, q4
shun-iwasawa 82a8f5
    vsub.s32        q3, q1, q4
shun-iwasawa 82a8f5
    vshrn.s32       ROW3R, q2, #16                /* ROW7L <-> ROW3R */
shun-iwasawa 82a8f5
    vshrn.s32       ROW3L, q5, #16
shun-iwasawa 82a8f5
    vshrn.s32       ROW0L, q6, #16
shun-iwasawa 82a8f5
    vshrn.s32       ROW0R, q3, #16                /* ROW4L <-> ROW0R */
shun-iwasawa 82a8f5
    /* 1-D IDCT, pass 2, right 4x8 half */
shun-iwasawa 82a8f5
    vld1.s16        {d2}, [ip, :64]               /* reload constants */
shun-iwasawa 82a8f5
    vmull.s16       q6, ROW5R, XFIX_1_175875602
shun-iwasawa 82a8f5
    vmlal.s16       q6, ROW5L, XFIX_1_175875602   /* ROW5L <-> ROW1R */
shun-iwasawa 82a8f5
    vmlal.s16       q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
shun-iwasawa 82a8f5
    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
shun-iwasawa 82a8f5
    vmull.s16       q7, ROW7R, XFIX_1_175875602
shun-iwasawa 82a8f5
    vmlal.s16       q7, ROW7L, XFIX_1_175875602   /* ROW7L <-> ROW3R */
shun-iwasawa 82a8f5
    vmlal.s16       q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
shun-iwasawa 82a8f5
    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
shun-iwasawa 82a8f5
    vsubl.s16       q3, ROW4L, ROW4R              /* ROW4L <-> ROW0R */
shun-iwasawa 82a8f5
    vmull.s16       q2, ROW6L, XFIX_0_541196100   /* ROW6L <-> ROW2R */
shun-iwasawa 82a8f5
    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
shun-iwasawa 82a8f5
    vmov            q4, q6
shun-iwasawa 82a8f5
    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
shun-iwasawa 82a8f5
    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447  /* ROW7L <-> ROW3R */
shun-iwasawa 82a8f5
    vshl.s32        q3, q3, #13
shun-iwasawa 82a8f5
    vmlsl.s16       q4, ROW5L, XFIX_0_899976223   /* ROW5L <-> ROW1R */
shun-iwasawa 82a8f5
    vadd.s32        q1, q3, q2
shun-iwasawa 82a8f5
    vmov            q5, q7
shun-iwasawa 82a8f5
    vadd.s32        q1, q1, q6
shun-iwasawa 82a8f5
    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
shun-iwasawa 82a8f5
    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223  /* ROW5L <-> ROW1R */
shun-iwasawa 82a8f5
    vshrn.s32       ROW5L, q1, #16                /* ROW5L <-> ROW1R */
shun-iwasawa 82a8f5
    vsub.s32        q1, q1, q6
shun-iwasawa 82a8f5
    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
shun-iwasawa 82a8f5
    vmlsl.s16       q5, ROW7L, XFIX_2_562915447   /* ROW7L <-> ROW3R */
shun-iwasawa 82a8f5
    vsub.s32        q1, q1, q6
shun-iwasawa 82a8f5
    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865  /* ROW6L <-> ROW2R */
shun-iwasawa 82a8f5
    vmlal.s16       q6, ROW6R, XFIX_0_541196100
shun-iwasawa 82a8f5
    vsub.s32        q3, q3, q2
shun-iwasawa 82a8f5
    vshrn.s32       ROW6R, q1, #16
shun-iwasawa 82a8f5
    vadd.s32        q1, q3, q5
shun-iwasawa 82a8f5
    vsub.s32        q3, q3, q5
shun-iwasawa 82a8f5
    vaddl.s16       q5, ROW4L, ROW4R              /* ROW4L <-> ROW0R */
shun-iwasawa 82a8f5
    vshrn.s32       ROW6L, q1, #16                /* ROW6L <-> ROW2R */
shun-iwasawa 82a8f5
    vshrn.s32       ROW5R, q3, #16
shun-iwasawa 82a8f5
    vshl.s32        q5, q5, #13
shun-iwasawa 82a8f5
    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
shun-iwasawa 82a8f5
    vadd.s32        q2, q5, q6
shun-iwasawa 82a8f5
    vsub.s32        q1, q5, q6
shun-iwasawa 82a8f5
    vadd.s32        q6, q2, q7
shun-iwasawa 82a8f5
    vsub.s32        q2, q2, q7
shun-iwasawa 82a8f5
    vadd.s32        q5, q1, q4
shun-iwasawa 82a8f5
    vsub.s32        q3, q1, q4
shun-iwasawa 82a8f5
    vshrn.s32       ROW7R, q2, #16
shun-iwasawa 82a8f5
    vshrn.s32       ROW7L, q5, #16                /* ROW7L <-> ROW3R */
shun-iwasawa 82a8f5
    vshrn.s32       ROW4L, q6, #16                /* ROW4L <-> ROW0R */
shun-iwasawa 82a8f5
    vshrn.s32       ROW4R, q3, #16
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
2:  /* Descale to 8-bit and range limit */
shun-iwasawa 82a8f5
    vqrshrn.s16     d16, q8, #2
shun-iwasawa 82a8f5
    vqrshrn.s16     d17, q9, #2
shun-iwasawa 82a8f5
    vqrshrn.s16     d18, q10, #2
shun-iwasawa 82a8f5
    vqrshrn.s16     d19, q11, #2
shun-iwasawa 82a8f5
    vpop            {d8 - d15}                    /* restore Neon registers */
shun-iwasawa 82a8f5
    vqrshrn.s16     d20, q12, #2
shun-iwasawa 82a8f5
      /* Transpose the final 8-bit samples and do signed->unsigned conversion */
shun-iwasawa 82a8f5
      vtrn.16         q8, q9
shun-iwasawa 82a8f5
    vqrshrn.s16     d21, q13, #2
shun-iwasawa 82a8f5
    vqrshrn.s16     d22, q14, #2
shun-iwasawa 82a8f5
      vmov.u8         q0, #(CENTERJSAMPLE)
shun-iwasawa 82a8f5
    vqrshrn.s16     d23, q15, #2
shun-iwasawa 82a8f5
      vtrn.8          d16, d17
shun-iwasawa 82a8f5
      vtrn.8          d18, d19
shun-iwasawa 82a8f5
      vadd.u8         q8, q8, q0
shun-iwasawa 82a8f5
      vadd.u8         q9, q9, q0
shun-iwasawa 82a8f5
      vtrn.16         q10, q11
shun-iwasawa 82a8f5
        /* Store results to the output buffer */
shun-iwasawa 82a8f5
        ldmia           OUTPUT_BUF!, {TMP1, TMP2}
shun-iwasawa 82a8f5
        add             TMP1, TMP1, OUTPUT_COL
shun-iwasawa 82a8f5
        add             TMP2, TMP2, OUTPUT_COL
shun-iwasawa 82a8f5
        vst1.8          {d16}, [TMP1]
shun-iwasawa 82a8f5
      vtrn.8          d20, d21
shun-iwasawa 82a8f5
        vst1.8          {d17}, [TMP2]
shun-iwasawa 82a8f5
        ldmia           OUTPUT_BUF!, {TMP1, TMP2}
shun-iwasawa 82a8f5
        add             TMP1, TMP1, OUTPUT_COL
shun-iwasawa 82a8f5
        add             TMP2, TMP2, OUTPUT_COL
shun-iwasawa 82a8f5
        vst1.8          {d18}, [TMP1]
shun-iwasawa 82a8f5
      vadd.u8         q10, q10, q0
shun-iwasawa 82a8f5
        vst1.8          {d19}, [TMP2]
shun-iwasawa 82a8f5
        ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
shun-iwasawa 82a8f5
        add             TMP1, TMP1, OUTPUT_COL
shun-iwasawa 82a8f5
        add             TMP2, TMP2, OUTPUT_COL
shun-iwasawa 82a8f5
        add             TMP3, TMP3, OUTPUT_COL
shun-iwasawa 82a8f5
        add             TMP4, TMP4, OUTPUT_COL
shun-iwasawa 82a8f5
      vtrn.8          d22, d23
shun-iwasawa 82a8f5
        vst1.8          {d20}, [TMP1]
shun-iwasawa 82a8f5
      vadd.u8         q11, q11, q0
shun-iwasawa 82a8f5
        vst1.8          {d21}, [TMP2]
shun-iwasawa 82a8f5
        vst1.8          {d22}, [TMP3]
shun-iwasawa 82a8f5
        vst1.8          {d23}, [TMP4]
shun-iwasawa 82a8f5
    bx              lr
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Transpose left 4x8 half */
shun-iwasawa 82a8f5
    vtrn.16         ROW6L, ROW7L
shun-iwasawa 82a8f5
    vtrn.16         ROW2L, ROW3L
shun-iwasawa 82a8f5
    vtrn.16         ROW0L, ROW1L
shun-iwasawa 82a8f5
    vtrn.16         ROW4L, ROW5L
shun-iwasawa 82a8f5
    vshl.s16        ROW0R, ROW0R, #2  /* PASS1_BITS */
shun-iwasawa 82a8f5
    vtrn.32         ROW1L, ROW3L
shun-iwasawa 82a8f5
    vtrn.32         ROW4L, ROW6L
shun-iwasawa 82a8f5
    vtrn.32         ROW0L, ROW2L
shun-iwasawa 82a8f5
    vtrn.32         ROW5L, ROW7L
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    cmp             r0, #0
shun-iwasawa 82a8f5
    beq             4f  /* Right 4x8 half has all zeros, go to 'sparse' second
shun-iwasawa 82a8f5
                           pass */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Only row 0 is non-zero for the right 4x8 half  */
shun-iwasawa 82a8f5
    vdup.s16        ROW1R, ROW0R[1]
shun-iwasawa 82a8f5
    vdup.s16        ROW2R, ROW0R[2]
shun-iwasawa 82a8f5
    vdup.s16        ROW3R, ROW0R[3]
shun-iwasawa 82a8f5
    vdup.s16        ROW4R, ROW0R[0]
shun-iwasawa 82a8f5
    vdup.s16        ROW5R, ROW0R[1]
shun-iwasawa 82a8f5
    vdup.s16        ROW6R, ROW0R[2]
shun-iwasawa 82a8f5
    vdup.s16        ROW7R, ROW0R[3]
shun-iwasawa 82a8f5
    vdup.s16        ROW0R, ROW0R[0]
shun-iwasawa 82a8f5
    b               1b  /* Go to 'normal' second pass */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
shun-iwasawa 82a8f5
    vld1.s16        {d2}, [ip, :64]               /* reload constants */
shun-iwasawa 82a8f5
    vmull.s16       q6, ROW1L, XFIX_1_175875602
shun-iwasawa 82a8f5
    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
shun-iwasawa 82a8f5
    vmull.s16       q7, ROW3L, XFIX_1_175875602
shun-iwasawa 82a8f5
    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
shun-iwasawa 82a8f5
    vmull.s16       q2, ROW2L, XFIX_0_541196100
shun-iwasawa 82a8f5
    vshll.s16       q3, ROW0L, #13
shun-iwasawa 82a8f5
    vmov            q4, q6
shun-iwasawa 82a8f5
    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
shun-iwasawa 82a8f5
    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
shun-iwasawa 82a8f5
    vadd.s32        q1, q3, q2
shun-iwasawa 82a8f5
    vmov            q5, q7
shun-iwasawa 82a8f5
    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
shun-iwasawa 82a8f5
    vadd.s32        q1, q1, q6
shun-iwasawa 82a8f5
    vadd.s32        q6, q6, q6
shun-iwasawa 82a8f5
    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
shun-iwasawa 82a8f5
    vshrn.s32       ROW1L, q1, #16
shun-iwasawa 82a8f5
    vsub.s32        q1, q1, q6
shun-iwasawa 82a8f5
    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
shun-iwasawa 82a8f5
    vsub.s32        q3, q3, q2
shun-iwasawa 82a8f5
    vshrn.s32       ROW2R, q1, #16                /* ROW6L <-> ROW2R */
shun-iwasawa 82a8f5
    vadd.s32        q1, q3, q5
shun-iwasawa 82a8f5
    vsub.s32        q3, q3, q5
shun-iwasawa 82a8f5
    vshll.s16       q5, ROW0L, #13
shun-iwasawa 82a8f5
    vshrn.s32       ROW2L, q1, #16
shun-iwasawa 82a8f5
    vshrn.s32       ROW1R, q3, #16                /* ROW5L <-> ROW1R */
shun-iwasawa 82a8f5
    vadd.s32        q2, q5, q6
shun-iwasawa 82a8f5
    vsub.s32        q1, q5, q6
shun-iwasawa 82a8f5
    vadd.s32        q6, q2, q7
shun-iwasawa 82a8f5
    vsub.s32        q2, q2, q7
shun-iwasawa 82a8f5
    vadd.s32        q5, q1, q4
shun-iwasawa 82a8f5
    vsub.s32        q3, q1, q4
shun-iwasawa 82a8f5
    vshrn.s32       ROW3R, q2, #16                /* ROW7L <-> ROW3R */
shun-iwasawa 82a8f5
    vshrn.s32       ROW3L, q5, #16
shun-iwasawa 82a8f5
    vshrn.s32       ROW0L, q6, #16
shun-iwasawa 82a8f5
    vshrn.s32       ROW0R, q3, #16                /* ROW4L <-> ROW0R */
shun-iwasawa 82a8f5
    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
shun-iwasawa 82a8f5
    vld1.s16        {d2}, [ip, :64]               /* reload constants */
shun-iwasawa 82a8f5
    vmull.s16       q6, ROW5L, XFIX_1_175875602
shun-iwasawa 82a8f5
    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
shun-iwasawa 82a8f5
    vmull.s16       q7, ROW7L, XFIX_1_175875602
shun-iwasawa 82a8f5
    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
shun-iwasawa 82a8f5
    vmull.s16       q2, ROW6L, XFIX_0_541196100
shun-iwasawa 82a8f5
    vshll.s16       q3, ROW4L, #13
shun-iwasawa 82a8f5
    vmov            q4, q6
shun-iwasawa 82a8f5
    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
shun-iwasawa 82a8f5
    vmlsl.s16       q4, ROW5L, XFIX_0_899976223
shun-iwasawa 82a8f5
    vadd.s32        q1, q3, q2
shun-iwasawa 82a8f5
    vmov            q5, q7
shun-iwasawa 82a8f5
    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
shun-iwasawa 82a8f5
    vadd.s32        q1, q1, q6
shun-iwasawa 82a8f5
    vadd.s32        q6, q6, q6
shun-iwasawa 82a8f5
    vmlsl.s16       q5, ROW7L, XFIX_2_562915447
shun-iwasawa 82a8f5
    vshrn.s32       ROW5L, q1, #16                /* ROW5L <-> ROW1R */
shun-iwasawa 82a8f5
    vsub.s32        q1, q1, q6
shun-iwasawa 82a8f5
    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
shun-iwasawa 82a8f5
    vsub.s32        q3, q3, q2
shun-iwasawa 82a8f5
    vshrn.s32       ROW6R, q1, #16
shun-iwasawa 82a8f5
    vadd.s32        q1, q3, q5
shun-iwasawa 82a8f5
    vsub.s32        q3, q3, q5
shun-iwasawa 82a8f5
    vshll.s16       q5, ROW4L, #13
shun-iwasawa 82a8f5
    vshrn.s32       ROW6L, q1, #16                /* ROW6L <-> ROW2R */
shun-iwasawa 82a8f5
    vshrn.s32       ROW5R, q3, #16
shun-iwasawa 82a8f5
    vadd.s32        q2, q5, q6
shun-iwasawa 82a8f5
    vsub.s32        q1, q5, q6
shun-iwasawa 82a8f5
    vadd.s32        q6, q2, q7
shun-iwasawa 82a8f5
    vsub.s32        q2, q2, q7
shun-iwasawa 82a8f5
    vadd.s32        q5, q1, q4
shun-iwasawa 82a8f5
    vsub.s32        q3, q1, q4
shun-iwasawa 82a8f5
    vshrn.s32       ROW7R, q2, #16
shun-iwasawa 82a8f5
    vshrn.s32       ROW7L, q5, #16                /* ROW7L <-> ROW3R */
shun-iwasawa 82a8f5
    vshrn.s32       ROW4L, q6, #16                /* ROW4L <-> ROW0R */
shun-iwasawa 82a8f5
    vshrn.s32       ROW4R, q3, #16
shun-iwasawa 82a8f5
    b               2b                            /* Go to epilogue */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    .unreq          DCT_TABLE
shun-iwasawa 82a8f5
    .unreq          COEF_BLOCK
shun-iwasawa 82a8f5
    .unreq          OUTPUT_BUF
shun-iwasawa 82a8f5
    .unreq          OUTPUT_COL
shun-iwasawa 82a8f5
    .unreq          TMP1
shun-iwasawa 82a8f5
    .unreq          TMP2
shun-iwasawa 82a8f5
    .unreq          TMP3
shun-iwasawa 82a8f5
    .unreq          TMP4
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    .unreq          ROW0L
shun-iwasawa 82a8f5
    .unreq          ROW0R
shun-iwasawa 82a8f5
    .unreq          ROW1L
shun-iwasawa 82a8f5
    .unreq          ROW1R
shun-iwasawa 82a8f5
    .unreq          ROW2L
shun-iwasawa 82a8f5
    .unreq          ROW2R
shun-iwasawa 82a8f5
    .unreq          ROW3L
shun-iwasawa 82a8f5
    .unreq          ROW3R
shun-iwasawa 82a8f5
    .unreq          ROW4L
shun-iwasawa 82a8f5
    .unreq          ROW4R
shun-iwasawa 82a8f5
    .unreq          ROW5L
shun-iwasawa 82a8f5
    .unreq          ROW5R
shun-iwasawa 82a8f5
    .unreq          ROW6L
shun-iwasawa 82a8f5
    .unreq          ROW6R
shun-iwasawa 82a8f5
    .unreq          ROW7L
shun-iwasawa 82a8f5
    .unreq          ROW7R
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*****************************************************************************/
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * jsimd_idct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
 * function from jidctfst.c
 *
 * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
 * But in Arm Neon case some extra additions are required because VQDMULH
 * instruction can't handle the constants larger than 1. So the expressions
 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
 * which introduces an extra addition. Overall, there are 6 extra additions
 * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
 *
 * Arguments (AAPCS, see the .req aliases below):
 *   r0 = dct_table, r1 = coef_block, r2 = output_buf, r3 = output_col
 * Callee-saved d8-d13 are preserved via vpush/vpop.
 */

/* Q0.15 multiplier constants, accessed as scalar lanes of d0 */
#define XFIX_1_082392200  d0[0]
#define XFIX_1_414213562  d0[1]
#define XFIX_1_847759065  d0[2]
#define XFIX_2_613125930  d0[3]

.balign 16
jsimd_idct_ifast_neon_consts:
  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */

asm_function jsimd_idct_ifast_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req r1
    TMP3            .req r2
    TMP4            .req ip

    /* Load and dequantize coefficients into Neon registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d16     | d17     ( q8  )
     *   1 | d18     | d19     ( q9  )
     *   2 | d20     | d21     ( q10 )
     *   3 | d22     | d23     ( q11 )
     *   4 | d24     | d25     ( q12 )
     *   5 | d26     | d27     ( q13 )
     *   6 | d28     | d29     ( q14 )
     *   7 | d30     | d31     ( q15 )
     */
    adr             ip, jsimd_idct_ifast_neon_consts
    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    vmul.s16        q8, q8, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q9, q9, q1
    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    vmul.s16        q10, q10, q2
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vmul.s16        q11, q11, q3
    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    vmul.s16        q12, q12, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q14, q14, q2
    vmul.s16        q13, q13, q1
    vld1.16         {d0}, [ip, :64]  /* load constants */
    vmul.s16        q15, q15, q3
    vpush           {d8 - d13}       /* save Neon registers */
    /* 1-D IDCT, pass 1 */
    vsub.s16        q2, q10, q14
    vadd.s16        q14, q10, q14
    vsub.s16        q1, q11, q13
    vadd.s16        q13, q11, q13
    vsub.s16        q5, q9, q15
    vadd.s16        q15, q9, q15
    vqdmulh.s16     q4, q2, XFIX_1_414213562
    vqdmulh.s16     q6, q1, XFIX_2_613125930
    vadd.s16        q3, q1, q1
    vsub.s16        q1, q5, q1
    vadd.s16        q10, q2, q4
    vqdmulh.s16     q4, q1, XFIX_1_847759065
    vsub.s16        q2, q15, q13
    vadd.s16        q3, q3, q6
    vqdmulh.s16     q6, q2, XFIX_1_414213562
    vadd.s16        q1, q1, q4
    vqdmulh.s16     q4, q5, XFIX_1_082392200
    vsub.s16        q10, q10, q14
    vadd.s16        q2, q2, q6
    vsub.s16        q6, q8, q12
    vadd.s16        q12, q8, q12
    vadd.s16        q9, q5, q4
    vadd.s16        q5, q6, q10
    vsub.s16        q10, q6, q10
    vadd.s16        q6, q15, q13
    vadd.s16        q8, q12, q14
    vsub.s16        q3, q6, q3
    vsub.s16        q12, q12, q14
    vsub.s16        q3, q3, q1
    vsub.s16        q1, q9, q1
    vadd.s16        q2, q3, q2
    vsub.s16        q15, q8, q6
    vadd.s16        q1, q1, q2
    vadd.s16        q8, q8, q6
    vadd.s16        q14, q5, q3
    vsub.s16        q9, q5, q3
    vsub.s16        q13, q10, q2
    vadd.s16        q10, q10, q2
      /* Transpose (interleaved with the arithmetic above/below for
       * instruction scheduling; the extra indent marks the transpose ops) */
      vtrn.16         q8, q9
    vsub.s16        q11, q12, q1
      vtrn.16         q14, q15
    vadd.s16        q12, q12, q1
      vtrn.16         q10, q11
      vtrn.16         q12, q13
      vtrn.32         q9, q11
      vtrn.32         q12, q14
      vtrn.32         q8, q10
      vtrn.32         q13, q15
      vswp            d28, d21
      vswp            d26, d19
    /* 1-D IDCT, pass 2 */
    vsub.s16        q2, q10, q14
      vswp            d30, d23
    vadd.s16        q14, q10, q14
      vswp            d24, d17
    vsub.s16        q1, q11, q13
    vadd.s16        q13, q11, q13
    vsub.s16        q5, q9, q15
    vadd.s16        q15, q9, q15
    vqdmulh.s16     q4, q2, XFIX_1_414213562
    vqdmulh.s16     q6, q1, XFIX_2_613125930
    vadd.s16        q3, q1, q1
    vsub.s16        q1, q5, q1
    vadd.s16        q10, q2, q4
    vqdmulh.s16     q4, q1, XFIX_1_847759065
    vsub.s16        q2, q15, q13
    vadd.s16        q3, q3, q6
    vqdmulh.s16     q6, q2, XFIX_1_414213562
    vadd.s16        q1, q1, q4
    vqdmulh.s16     q4, q5, XFIX_1_082392200
    vsub.s16        q10, q10, q14
    vadd.s16        q2, q2, q6
    vsub.s16        q6, q8, q12
    vadd.s16        q12, q8, q12
    vadd.s16        q9, q5, q4
    vadd.s16        q5, q6, q10
    vsub.s16        q10, q6, q10
    vadd.s16        q6, q15, q13
    vadd.s16        q8, q12, q14
    vsub.s16        q3, q6, q3
    vsub.s16        q12, q12, q14
    vsub.s16        q3, q3, q1
    vsub.s16        q1, q9, q1
    vadd.s16        q2, q3, q2
    vsub.s16        q15, q8, q6
    vadd.s16        q1, q1, q2
    vadd.s16        q8, q8, q6
    vadd.s16        q14, q5, q3
    vsub.s16        q9, q5, q3
    vsub.s16        q13, q10, q2
    vpop            {d8 - d13}    /* restore Neon registers */
    vadd.s16        q10, q10, q2
    vsub.s16        q11, q12, q1
    vadd.s16        q12, q12, q1
    /* Descale to 8-bit and range limit */
    vmov.u8         q0, #0x80
    vqshrn.s16      d16, q8, #5
    vqshrn.s16      d17, q9, #5
    vqshrn.s16      d18, q10, #5
    vqshrn.s16      d19, q11, #5
    vqshrn.s16      d20, q12, #5
    vqshrn.s16      d21, q13, #5
    vqshrn.s16      d22, q14, #5
    vqshrn.s16      d23, q15, #5
    vadd.u8         q8, q8, q0
    vadd.u8         q9, q9, q0
    vadd.u8         q10, q10, q0
    vadd.u8         q11, q11, q0
    /* Transpose the final 8-bit samples */
    vtrn.16         q8, q9
    vtrn.16         q10, q11
    vtrn.32         q8, q10
    vtrn.32         q9, q11
    vtrn.8          d16, d17
    vtrn.8          d18, d19
      /* Store results to the output buffer */
      ldmia           OUTPUT_BUF!, {TMP1, TMP2}
      add             TMP1, TMP1, OUTPUT_COL
      add             TMP2, TMP2, OUTPUT_COL
      vst1.8          {d16}, [TMP1]
      vst1.8          {d17}, [TMP2]
      ldmia           OUTPUT_BUF!, {TMP1, TMP2}
      add             TMP1, TMP1, OUTPUT_COL
      add             TMP2, TMP2, OUTPUT_COL
      vst1.8          {d18}, [TMP1]
    vtrn.8          d20, d21
      vst1.8          {d19}, [TMP2]
      ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
      add             TMP1, TMP1, OUTPUT_COL
      add             TMP2, TMP2, OUTPUT_COL
      add             TMP3, TMP3, OUTPUT_COL
      add             TMP4, TMP4, OUTPUT_COL
      vst1.8          {d20}, [TMP1]
    vtrn.8          d22, d23
      vst1.8          {d21}, [TMP2]
      vst1.8          {d22}, [TMP3]
      vst1.8          {d23}, [TMP4]
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*****************************************************************************/
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * jsimd_idct_4x4_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
 *       requires much less arithmetic operations and hence should be faster.
 *       The primary purpose of this particular Neon optimized function is
 *       bit exact compatibility with jpeg-6b.
 *
 * TODO: a bit better instructions scheduling can be achieved by expanding
 *       idct_helper/transpose_4x4 macros and reordering instructions,
 *       but readability will suffer somewhat.
 */

#define CONST_BITS  13

#define FIX_0_211164243  (1730)   /* FIX(0.211164243) */
#define FIX_0_509795579  (4176)   /* FIX(0.509795579) */
#define FIX_0_601344887  (4926)   /* FIX(0.601344887) */
#define FIX_0_720959822  (5906)   /* FIX(0.720959822) */
#define FIX_0_765366865  (6270)   /* FIX(0.765366865) */
#define FIX_0_850430095  (6967)   /* FIX(0.850430095) */
#define FIX_0_899976223  (7373)   /* FIX(0.899976223) */
#define FIX_1_061594337  (8697)   /* FIX(1.061594337) */
#define FIX_1_272758580  (10426)  /* FIX(1.272758580) */
#define FIX_1_451774981  (11893)  /* FIX(1.451774981) */
#define FIX_1_847759065  (15137)  /* FIX(1.847759065) */
#define FIX_2_172734803  (17799)  /* FIX(2.172734803) */
#define FIX_2_562915447  (20995)  /* FIX(2.562915447) */
#define FIX_3_624509785  (29692)  /* FIX(3.624509785) */

.balign 16
jsimd_idct_4x4_neon_consts:
  .short FIX_1_847759065        /* d0[0] */
  .short -FIX_0_765366865       /* d0[1] */
  .short -FIX_0_211164243       /* d0[2] */
  .short FIX_1_451774981        /* d0[3] */
  .short -FIX_2_172734803       /* d1[0] */
  .short FIX_1_061594337        /* d1[1] */
  .short -FIX_0_509795579       /* d1[2] */
  .short -FIX_0_601344887       /* d1[3] */
  .short FIX_0_899976223        /* d2[0] */
  .short FIX_2_562915447        /* d2[1] */
  .short 1 << (CONST_BITS + 1)  /* d2[2] */
  .short 0                      /* d2[3] */

/* 1-D 4-point IDCT on one set of four coefficient rows (x4..x16), writing
 * the four results to y26..y29 after a rounding right shift by \shift.
 * VRSHRN's immediate narrowing shift only encodes up to 16, so larger
 * descale shifts must use VRSHR followed by a separate VMOVN narrowing.
 */
.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    vmull.s16       q14, \x4, d2[2]
    vmlal.s16       q14, \x8, d0[0]
    vmlal.s16       q14, \x14, d0[1]

    vmull.s16       q13, \x16, d1[2]
    vmlal.s16       q13, \x12, d1[3]
    vmlal.s16       q13, \x10, d2[0]
    vmlal.s16       q13, \x6, d2[1]

    vmull.s16       q15, \x4, d2[2]
    vmlsl.s16       q15, \x8, d0[0]
    vmlsl.s16       q15, \x14, d0[1]

    vmull.s16       q12, \x16, d0[2]
    vmlal.s16       q12, \x12, d0[3]
    vmlal.s16       q12, \x10, d1[0]
    vmlal.s16       q12, \x6, d1[1]

    vadd.s32        q10, q14, q13
    vsub.s32        q14, q14, q13

  .if \shift > 16
    vrshr.s32       q10, q10, #\shift
    vrshr.s32       q14, q14, #\shift
    vmovn.s32       \y26, q10
    vmovn.s32       \y29, q14
  .else
    vrshrn.s32      \y26, q10, #\shift
    vrshrn.s32      \y29, q14, #\shift
  .endif

    vadd.s32        q10, q15, q12
    vsub.s32        q15, q15, q12

  .if \shift > 16
    vrshr.s32       q10, q10, #\shift
    vrshr.s32       q15, q15, #\shift
    vmovn.s32       \y27, q10
    vmovn.s32       \y28, q15
  .else
    vrshrn.s32      \y27, q10, #\shift
    vrshrn.s32      \y28, q15, #\shift
  .endif
.endm

asm_function jsimd_idct_4x4_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req r1
    TMP3            .req r2
    TMP4            .req ip

    vpush           {d8 - d15}

    /* Load constants (d3 is just used for padding) */
    adr             TMP4, jsimd_idct_4x4_neon_consts
    vld1.16         {d0, d1, d2, d3}, [TMP4, :128]

    /* Load all COEF_BLOCK into Neon registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d4      | d5
     *   1 | d6      | d7
     *   2 | d8      | d9
     *   3 | d10     | d11
     *   4 | -       | -
     *   5 | d12     | d13
     *   6 | d14     | d15
     *   7 | d16     | d17
     */
    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
    vld1.16         {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
    add COEF_BLOCK, COEF_BLOCK, #16  /* skip row 4 (unused by 4x4 IDCT) */
    vld1.16         {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
    /* dequantize */
    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
    vmul.s16        q2, q2, q9
    vld1.16         {d22, d23, d24, d25}, [DCT_TABLE, :128]!
    vmul.s16        q3, q3, q10
    vmul.s16        q4, q4, q11
    add             DCT_TABLE, DCT_TABLE, #16  /* skip quantizers for row 4 */
    vld1.16         {d26, d27, d28, d29}, [DCT_TABLE, :128]!
    vmul.s16        q5, q5, q12
    vmul.s16        q6, q6, q13
    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
    vmul.s16        q7, q7, q14
    vmul.s16        q8, q8, q15

    /* Pass 1 */
    idct_helper     d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
    transpose_4x4   d4, d6, d8, d10
    idct_helper     d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
    transpose_4x4   d5, d7, d9, d11

    /* Pass 2 */
    idct_helper     d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
    transpose_4x4   d26, d27, d28, d29

    /* Range limit */
    vmov.u16        q15, #0x80
    vadd.s16        q13, q13, q15
    vadd.s16        q14, q14, q15
    vqmovun.s16     d26, q13
    vqmovun.s16     d27, q14

    /* Store results to the output buffer */
    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL

#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use much less instructions on little endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses
     */
    vst1.32         {d26[0]}, [TMP1]!
    vst1.32         {d27[0]}, [TMP3]!
    vst1.32         {d26[1]}, [TMP2]!
    vst1.32         {d27[1]}, [TMP4]!
#else
    vst1.8          {d26[0]}, [TMP1]!
    vst1.8          {d27[0]}, [TMP3]!
    vst1.8          {d26[1]}, [TMP1]!
    vst1.8          {d27[1]}, [TMP3]!
    vst1.8          {d26[2]}, [TMP1]!
    vst1.8          {d27[2]}, [TMP3]!
    vst1.8          {d26[3]}, [TMP1]!
    vst1.8          {d27[3]}, [TMP3]!

    vst1.8          {d26[4]}, [TMP2]!
    vst1.8          {d27[4]}, [TMP4]!
    vst1.8          {d26[5]}, [TMP2]!
    vst1.8          {d27[5]}, [TMP4]!
    vst1.8          {d26[6]}, [TMP2]!
    vst1.8          {d27[6]}, [TMP4]!
    vst1.8          {d26[7]}, [TMP2]!
    vst1.8          {d27[7]}, [TMP4]!
#endif

    vpop            {d8 - d15}
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4

.purgem idct_helper
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*****************************************************************************/
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
shun-iwasawa 82a8f5
 * jsimd_idct_2x2_neon
shun-iwasawa 82a8f5
 *
shun-iwasawa 82a8f5
 * This function contains inverse-DCT code for getting reduced-size
shun-iwasawa 82a8f5
 * 2x2 pixels output from an 8x8 DCT block.  It uses the same calculations
shun-iwasawa 82a8f5
 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
shun-iwasawa 82a8f5
 * function from jpeg-6b (jidctred.c).
shun-iwasawa 82a8f5
 *
shun-iwasawa 82a8f5
 * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
shun-iwasawa 82a8f5
 *       requires much less arithmetic operations and hence should be faster.
shun-iwasawa 82a8f5
 *       The primary purpose of this particular Neon optimized function is
shun-iwasawa 82a8f5
 *       bit exact compatibility with jpeg-6b.
shun-iwasawa 82a8f5
 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
.balign 8
shun-iwasawa 82a8f5
jsimd_idct_2x2_neon_consts:
shun-iwasawa 82a8f5
  .short -FIX_0_720959822  /* d0[0] */
shun-iwasawa 82a8f5
  .short FIX_0_850430095   /* d0[1] */
shun-iwasawa 82a8f5
  .short -FIX_1_272758580  /* d0[2] */
shun-iwasawa 82a8f5
  .short FIX_3_624509785   /* d0[3] */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
shun-iwasawa 82a8f5
    vshll.s16       q14, \x4, #15
shun-iwasawa 82a8f5
    vmull.s16       q13, \x6, d0[3]
shun-iwasawa 82a8f5
    vmlal.s16       q13, \x10, d0[2]
shun-iwasawa 82a8f5
    vmlal.s16       q13, \x12, d0[1]
shun-iwasawa 82a8f5
    vmlal.s16       q13, \x16, d0[0]
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    vadd.s32        q10, q14, q13
shun-iwasawa 82a8f5
    vsub.s32        q14, q14, q13
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  .if \shift > 16
shun-iwasawa 82a8f5
    vrshr.s32       q10, q10, #\shift
shun-iwasawa 82a8f5
    vrshr.s32       q14, q14, #\shift
shun-iwasawa 82a8f5
    vmovn.s32       \y26, q10
shun-iwasawa 82a8f5
    vmovn.s32       \y27, q14
shun-iwasawa 82a8f5
  .else
shun-iwasawa 82a8f5
    vrshrn.s32      \y26, q10, #\shift
shun-iwasawa 82a8f5
    vrshrn.s32      \y27, q14, #\shift
shun-iwasawa 82a8f5
  .endif
shun-iwasawa 82a8f5
.endm
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
asm_function jsimd_idct_2x2_neon
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    DCT_TABLE       .req r0
shun-iwasawa 82a8f5
    COEF_BLOCK      .req r1
shun-iwasawa 82a8f5
    OUTPUT_BUF      .req r2
shun-iwasawa 82a8f5
    OUTPUT_COL      .req r3
shun-iwasawa 82a8f5
    TMP1            .req r0
shun-iwasawa 82a8f5
    TMP2            .req ip
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    vpush           {d8 - d15}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Load constants */
shun-iwasawa 82a8f5
    adr             TMP2, jsimd_idct_2x2_neon_consts
shun-iwasawa 82a8f5
    vld1.16         {d0}, [TMP2, :64]
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Load all COEF_BLOCK into Neon registers with the following allocation:
shun-iwasawa 82a8f5
     *       0 1 2 3 | 4 5 6 7
shun-iwasawa 82a8f5
     *      ---------+--------
shun-iwasawa 82a8f5
     *   0 | d4      | d5
shun-iwasawa 82a8f5
     *   1 | d6      | d7
shun-iwasawa 82a8f5
     *   2 | -       | -
shun-iwasawa 82a8f5
     *   3 | d10     | d11
shun-iwasawa 82a8f5
     *   4 | -       | -
shun-iwasawa 82a8f5
     *   5 | d12     | d13
shun-iwasawa 82a8f5
     *   6 | -       | -
shun-iwasawa 82a8f5
     *   7 | d16     | d17
shun-iwasawa 82a8f5
     */
shun-iwasawa 82a8f5
    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
shun-iwasawa 82a8f5
    add             COEF_BLOCK, COEF_BLOCK, #16
shun-iwasawa 82a8f5
    vld1.16         {d10, d11}, [COEF_BLOCK, :128]!
shun-iwasawa 82a8f5
    add             COEF_BLOCK, COEF_BLOCK, #16
shun-iwasawa 82a8f5
    vld1.16         {d12, d13}, [COEF_BLOCK, :128]!
shun-iwasawa 82a8f5
    add             COEF_BLOCK, COEF_BLOCK, #16
shun-iwasawa 82a8f5
    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
shun-iwasawa 82a8f5
    /* Dequantize */
shun-iwasawa 82a8f5
    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
shun-iwasawa 82a8f5
    vmul.s16        q2, q2, q9
shun-iwasawa 82a8f5
    vmul.s16        q3, q3, q10
shun-iwasawa 82a8f5
    add             DCT_TABLE, DCT_TABLE, #16
shun-iwasawa 82a8f5
    vld1.16         {d24, d25}, [DCT_TABLE, :128]!
shun-iwasawa 82a8f5
    vmul.s16        q5, q5, q12
shun-iwasawa 82a8f5
    add             DCT_TABLE, DCT_TABLE, #16
shun-iwasawa 82a8f5
    vld1.16         {d26, d27}, [DCT_TABLE, :128]!
shun-iwasawa 82a8f5
    vmul.s16        q6, q6, q13
shun-iwasawa 82a8f5
    add             DCT_TABLE, DCT_TABLE, #16
shun-iwasawa 82a8f5
    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
shun-iwasawa 82a8f5
    vmul.s16        q8, q8, q15
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Pass 1 */
shun-iwasawa 82a8f5
#if 0
shun-iwasawa 82a8f5
    idct_helper     d4, d6, d10, d12, d16, 13, d4, d6
shun-iwasawa 82a8f5
    transpose_4x4   d4, d6, d8, d10
shun-iwasawa 82a8f5
    idct_helper     d5, d7, d11, d13, d17, 13, d5, d7
shun-iwasawa 82a8f5
    transpose_4x4   d5, d7, d9, d11
shun-iwasawa 82a8f5
#else
shun-iwasawa 82a8f5
    vmull.s16       q13, d6, d0[3]
shun-iwasawa 82a8f5
    vmlal.s16       q13, d10, d0[2]
shun-iwasawa 82a8f5
    vmlal.s16       q13, d12, d0[1]
shun-iwasawa 82a8f5
    vmlal.s16       q13, d16, d0[0]
shun-iwasawa 82a8f5
    vmull.s16       q12, d7, d0[3]
shun-iwasawa 82a8f5
    vmlal.s16       q12, d11, d0[2]
shun-iwasawa 82a8f5
    vmlal.s16       q12, d13, d0[1]
shun-iwasawa 82a8f5
    vmlal.s16       q12, d17, d0[0]
shun-iwasawa 82a8f5
    vshll.s16       q14, d4, #15
shun-iwasawa 82a8f5
    vshll.s16       q15, d5, #15
shun-iwasawa 82a8f5
    vadd.s32        q10, q14, q13
shun-iwasawa 82a8f5
    vsub.s32        q14, q14, q13
shun-iwasawa 82a8f5
    vrshrn.s32      d4, q10, #13
shun-iwasawa 82a8f5
    vrshrn.s32      d6, q14, #13
shun-iwasawa 82a8f5
    vadd.s32        q10, q15, q12
shun-iwasawa 82a8f5
    vsub.s32        q14, q15, q12
shun-iwasawa 82a8f5
    vrshrn.s32      d5, q10, #13
shun-iwasawa 82a8f5
    vrshrn.s32      d7, q14, #13
shun-iwasawa 82a8f5
    vtrn.16         q2, q3
shun-iwasawa 82a8f5
    vtrn.32         q3, q5
shun-iwasawa 82a8f5
#endif
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Pass 2 */
shun-iwasawa 82a8f5
    idct_helper     d4, d6, d10, d7, d11, 20, d26, d27
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Range limit */
shun-iwasawa 82a8f5
    vmov.u16        q15, #0x80
shun-iwasawa 82a8f5
    vadd.s16        q13, q13, q15
shun-iwasawa 82a8f5
    vqmovun.s16     d26, q13
shun-iwasawa 82a8f5
    vqmovun.s16     d27, q13
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Store results to the output buffer */
shun-iwasawa 82a8f5
    ldmia           OUTPUT_BUF, {TMP1, TMP2}
shun-iwasawa 82a8f5
    add             TMP1, TMP1, OUTPUT_COL
shun-iwasawa 82a8f5
    add             TMP2, TMP2, OUTPUT_COL
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    vst1.8          {d26[0]}, [TMP1]!
shun-iwasawa 82a8f5
    vst1.8          {d27[4]}, [TMP1]!
shun-iwasawa 82a8f5
    vst1.8          {d26[1]}, [TMP2]!
shun-iwasawa 82a8f5
    vst1.8          {d27[5]}, [TMP2]!
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    vpop            {d8 - d15}
shun-iwasawa 82a8f5
    bx              lr
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    .unreq          DCT_TABLE
shun-iwasawa 82a8f5
    .unreq          COEF_BLOCK
shun-iwasawa 82a8f5
    .unreq          OUTPUT_BUF
shun-iwasawa 82a8f5
    .unreq          OUTPUT_COL
shun-iwasawa 82a8f5
    .unreq          TMP1
shun-iwasawa 82a8f5
    .unreq          TMP2
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
.purgem idct_helper
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*****************************************************************************/
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
shun-iwasawa 82a8f5
 * jsimd_ycc_extrgb_convert_neon
shun-iwasawa 82a8f5
 * jsimd_ycc_extbgr_convert_neon
shun-iwasawa 82a8f5
 * jsimd_ycc_extrgbx_convert_neon
shun-iwasawa 82a8f5
 * jsimd_ycc_extbgrx_convert_neon
shun-iwasawa 82a8f5
 * jsimd_ycc_extxbgr_convert_neon
shun-iwasawa 82a8f5
 * jsimd_ycc_extxrgb_convert_neon
shun-iwasawa 82a8f5
 *
shun-iwasawa 82a8f5
 * Colorspace conversion YCbCr -> RGB
shun-iwasawa 82a8f5
 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
.macro do_load size
shun-iwasawa 82a8f5
  .if \size == 8
shun-iwasawa 82a8f5
    vld1.8          {d4}, [U, :64]!
shun-iwasawa 82a8f5
    vld1.8          {d5}, [V, :64]!
shun-iwasawa 82a8f5
    vld1.8          {d0}, [Y, :64]!
shun-iwasawa 82a8f5
    pld             [U, #64]
shun-iwasawa 82a8f5
    pld             [V, #64]
shun-iwasawa 82a8f5
    pld             [Y, #64]
shun-iwasawa 82a8f5
  .elseif \size == 4
shun-iwasawa 82a8f5
    vld1.8          {d4[0]}, [U]!
shun-iwasawa 82a8f5
    vld1.8          {d4[1]}, [U]!
shun-iwasawa 82a8f5
    vld1.8          {d4[2]}, [U]!
shun-iwasawa 82a8f5
    vld1.8          {d4[3]}, [U]!
shun-iwasawa 82a8f5
    vld1.8          {d5[0]}, [V]!
shun-iwasawa 82a8f5
    vld1.8          {d5[1]}, [V]!
shun-iwasawa 82a8f5
    vld1.8          {d5[2]}, [V]!
shun-iwasawa 82a8f5
    vld1.8          {d5[3]}, [V]!
shun-iwasawa 82a8f5
    vld1.8          {d0[0]}, [Y]!
shun-iwasawa 82a8f5
    vld1.8          {d0[1]}, [Y]!
shun-iwasawa 82a8f5
    vld1.8          {d0[2]}, [Y]!
shun-iwasawa 82a8f5
    vld1.8          {d0[3]}, [Y]!
shun-iwasawa 82a8f5
  .elseif \size == 2
shun-iwasawa 82a8f5
    vld1.8          {d4[4]}, [U]!
shun-iwasawa 82a8f5
    vld1.8          {d4[5]}, [U]!
shun-iwasawa 82a8f5
    vld1.8          {d5[4]}, [V]!
shun-iwasawa 82a8f5
    vld1.8          {d5[5]}, [V]!
shun-iwasawa 82a8f5
    vld1.8          {d0[4]}, [Y]!
shun-iwasawa 82a8f5
    vld1.8          {d0[5]}, [Y]!
shun-iwasawa 82a8f5
  .elseif \size == 1
shun-iwasawa 82a8f5
    vld1.8          {d4[6]}, [U]!
shun-iwasawa 82a8f5
    vld1.8          {d5[6]}, [V]!
shun-iwasawa 82a8f5
    vld1.8          {d0[6]}, [Y]!
shun-iwasawa 82a8f5
  .else
shun-iwasawa 82a8f5
    .error unsupported macroblock size
shun-iwasawa 82a8f5
  .endif
shun-iwasawa 82a8f5
.endm
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
.macro do_store bpp, size
shun-iwasawa 82a8f5
  .if \bpp == 24
shun-iwasawa 82a8f5
    .if \size == 8
shun-iwasawa 82a8f5
      vst3.8        {d10, d11, d12}, [RGB]!
shun-iwasawa 82a8f5
    .elseif \size == 4
shun-iwasawa 82a8f5
      vst3.8        {d10[0], d11[0], d12[0]}, [RGB]!
shun-iwasawa 82a8f5
      vst3.8        {d10[1], d11[1], d12[1]}, [RGB]!
shun-iwasawa 82a8f5
      vst3.8        {d10[2], d11[2], d12[2]}, [RGB]!
shun-iwasawa 82a8f5
      vst3.8        {d10[3], d11[3], d12[3]}, [RGB]!
shun-iwasawa 82a8f5
    .elseif \size == 2
shun-iwasawa 82a8f5
      vst3.8        {d10[4], d11[4], d12[4]}, [RGB]!
shun-iwasawa 82a8f5
      vst3.8        {d10[5], d11[5], d12[5]}, [RGB]!
shun-iwasawa 82a8f5
    .elseif \size == 1
shun-iwasawa 82a8f5
      vst3.8        {d10[6], d11[6], d12[6]}, [RGB]!
shun-iwasawa 82a8f5
    .else
shun-iwasawa 82a8f5
      .error unsupported macroblock size
shun-iwasawa 82a8f5
    .endif
shun-iwasawa 82a8f5
  .elseif \bpp == 32
shun-iwasawa 82a8f5
    .if \size == 8
shun-iwasawa 82a8f5
      vst4.8        {d10, d11, d12, d13}, [RGB]!
shun-iwasawa 82a8f5
    .elseif \size == 4
shun-iwasawa 82a8f5
      vst4.8        {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
shun-iwasawa 82a8f5
      vst4.8        {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
shun-iwasawa 82a8f5
      vst4.8        {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
shun-iwasawa 82a8f5
      vst4.8        {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
shun-iwasawa 82a8f5
    .elseif \size == 2
shun-iwasawa 82a8f5
      vst4.8        {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
shun-iwasawa 82a8f5
      vst4.8        {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
shun-iwasawa 82a8f5
    .elseif \size == 1
shun-iwasawa 82a8f5
      vst4.8        {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
shun-iwasawa 82a8f5
    .else
shun-iwasawa 82a8f5
      .error unsupported macroblock size
shun-iwasawa 82a8f5
    .endif
shun-iwasawa 82a8f5
  .elseif \bpp == 16
shun-iwasawa 82a8f5
    .if \size == 8
shun-iwasawa 82a8f5
      vst1.16       {q15}, [RGB]!
shun-iwasawa 82a8f5
    .elseif \size == 4
shun-iwasawa 82a8f5
      vst1.16       {d30}, [RGB]!
shun-iwasawa 82a8f5
    .elseif \size == 2
shun-iwasawa 82a8f5
      vst1.16       {d31[0]}, [RGB]!
shun-iwasawa 82a8f5
      vst1.16       {d31[1]}, [RGB]!
shun-iwasawa 82a8f5
    .elseif \size == 1
shun-iwasawa 82a8f5
      vst1.16       {d31[2]}, [RGB]!
shun-iwasawa 82a8f5
    .else
shun-iwasawa 82a8f5
      .error unsupported macroblock size
shun-iwasawa 82a8f5
    .endif
shun-iwasawa 82a8f5
  .else
shun-iwasawa 82a8f5
    .error unsupported bpp
shun-iwasawa 82a8f5
  .endif
shun-iwasawa 82a8f5
.endm
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
shun-iwasawa 82a8f5
 * 2-stage pipelined YCbCr->RGB conversion
shun-iwasawa 82a8f5
 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
.macro do_yuv_to_rgb_stage1
shun-iwasawa 82a8f5
    vaddw.u8        q3, q1, d4      /* q3 = u - 128 */
shun-iwasawa 82a8f5
    vaddw.u8        q4, q1, d5      /* q2 = v - 128 */
shun-iwasawa 82a8f5
    vmull.s16       q10, d6, d1[1]  /* multiply by -11277 */
shun-iwasawa 82a8f5
    vmlal.s16       q10, d8, d1[2]  /* multiply by -23401 */
shun-iwasawa 82a8f5
    vmull.s16       q11, d7, d1[1]  /* multiply by -11277 */
shun-iwasawa 82a8f5
    vmlal.s16       q11, d9, d1[2]  /* multiply by -23401 */
shun-iwasawa 82a8f5
    vmull.s16       q12, d8, d1[0]  /* multiply by 22971 */
shun-iwasawa 82a8f5
    vmull.s16       q13, d9, d1[0]  /* multiply by 22971 */
shun-iwasawa 82a8f5
    vmull.s16       q14, d6, d1[3]  /* multiply by 29033 */
shun-iwasawa 82a8f5
    vmull.s16       q15, d7, d1[3]  /* multiply by 29033 */
shun-iwasawa 82a8f5
.endm
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
.macro do_yuv_to_rgb_stage2
shun-iwasawa 82a8f5
    vrshrn.s32      d20, q10, #15
shun-iwasawa 82a8f5
    vrshrn.s32      d21, q11, #15
shun-iwasawa 82a8f5
    vrshrn.s32      d24, q12, #14
shun-iwasawa 82a8f5
    vrshrn.s32      d25, q13, #14
shun-iwasawa 82a8f5
    vrshrn.s32      d28, q14, #14
shun-iwasawa 82a8f5
    vrshrn.s32      d29, q15, #14
shun-iwasawa 82a8f5
    vaddw.u8        q11, q10, d0
shun-iwasawa 82a8f5
    vaddw.u8        q12, q12, d0
shun-iwasawa 82a8f5
    vaddw.u8        q14, q14, d0
shun-iwasawa 82a8f5
  .if \bpp != 16
shun-iwasawa 82a8f5
    vqmovun.s16     d1\g_offs, q11
shun-iwasawa 82a8f5
    vqmovun.s16     d1\r_offs, q12
shun-iwasawa 82a8f5
    vqmovun.s16     d1\b_offs, q14
shun-iwasawa 82a8f5
  .else  /* rgb565 */
shun-iwasawa 82a8f5
    vqshlu.s16      q13, q11, #8
shun-iwasawa 82a8f5
    vqshlu.s16      q15, q12, #8
shun-iwasawa 82a8f5
    vqshlu.s16      q14, q14, #8
shun-iwasawa 82a8f5
    vsri.u16        q15, q13, #5
shun-iwasawa 82a8f5
    vsri.u16        q15, q14, #11
shun-iwasawa 82a8f5
  .endif
shun-iwasawa 82a8f5
.endm
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
.macro do_yuv_to_rgb_stage2_store_load_stage1
shun-iwasawa 82a8f5
                                       /* "do_yuv_to_rgb_stage2" and "store" */
shun-iwasawa 82a8f5
                                       vrshrn.s32      d20, q10, #15
shun-iwasawa 82a8f5
    /* "load" and "do_yuv_to_rgb_stage1" */
shun-iwasawa 82a8f5
    pld             [U, #64]
shun-iwasawa 82a8f5
                                       vrshrn.s32      d21, q11, #15
shun-iwasawa 82a8f5
    pld             [V, #64]
shun-iwasawa 82a8f5
                                       vrshrn.s32      d24, q12, #14
shun-iwasawa 82a8f5
                                       vrshrn.s32      d25, q13, #14
shun-iwasawa 82a8f5
    vld1.8          {d4}, [U, :64]!
shun-iwasawa 82a8f5
                                       vrshrn.s32      d28, q14, #14
shun-iwasawa 82a8f5
    vld1.8          {d5}, [V, :64]!
shun-iwasawa 82a8f5
                                       vrshrn.s32      d29, q15, #14
shun-iwasawa 82a8f5
    vaddw.u8        q3, q1, d4      /* q3 = u - 128 */
shun-iwasawa 82a8f5
    vaddw.u8        q4, q1, d5      /* q2 = v - 128 */
shun-iwasawa 82a8f5
                                       vaddw.u8        q11, q10, d0
shun-iwasawa 82a8f5
    vmull.s16       q10, d6, d1[1]  /* multiply by -11277 */
shun-iwasawa 82a8f5
    vmlal.s16       q10, d8, d1[2]  /* multiply by -23401 */
shun-iwasawa 82a8f5
                                       vaddw.u8        q12, q12, d0
shun-iwasawa 82a8f5
                                       vaddw.u8        q14, q14, d0
shun-iwasawa 82a8f5
  .if \bpp != 16  /**************** rgb24/rgb32 ******************************/
shun-iwasawa 82a8f5
                                       vqmovun.s16     d1\g_offs, q11
shun-iwasawa 82a8f5
    pld             [Y, #64]
shun-iwasawa 82a8f5
                                       vqmovun.s16     d1\r_offs, q12
shun-iwasawa 82a8f5
    vld1.8          {d0}, [Y, :64]!
shun-iwasawa 82a8f5
                                       vqmovun.s16     d1\b_offs, q14
shun-iwasawa 82a8f5
    vmull.s16       q11, d7, d1[1]  /* multiply by -11277 */
shun-iwasawa 82a8f5
    vmlal.s16       q11, d9, d1[2]  /* multiply by -23401 */
shun-iwasawa 82a8f5
                                       do_store        \bpp, 8
shun-iwasawa 82a8f5
    vmull.s16       q12, d8, d1[0]  /* multiply by 22971 */
shun-iwasawa 82a8f5
    vmull.s16       q13, d9, d1[0]  /* multiply by 22971 */
shun-iwasawa 82a8f5
    vmull.s16       q14, d6, d1[3]  /* multiply by 29033 */
shun-iwasawa 82a8f5
    vmull.s16       q15, d7, d1[3]  /* multiply by 29033 */
shun-iwasawa 82a8f5
  .else  /**************************** rgb565 ********************************/
shun-iwasawa 82a8f5
                                       vqshlu.s16      q13, q11, #8
shun-iwasawa 82a8f5
    pld             [Y, #64]
shun-iwasawa 82a8f5
                                       vqshlu.s16      q15, q12, #8
shun-iwasawa 82a8f5
                                       vqshlu.s16      q14, q14, #8
shun-iwasawa 82a8f5
    vld1.8          {d0}, [Y, :64]!
shun-iwasawa 82a8f5
    vmull.s16       q11, d7, d1[1]
shun-iwasawa 82a8f5
    vmlal.s16       q11, d9, d1[2]
shun-iwasawa 82a8f5
                                       vsri.u16        q15, q13, #5
shun-iwasawa 82a8f5
    vmull.s16       q12, d8, d1[0]
shun-iwasawa 82a8f5
                                       vsri.u16        q15, q14, #11
shun-iwasawa 82a8f5
    vmull.s16       q13, d9, d1[0]
shun-iwasawa 82a8f5
    vmull.s16       q14, d6, d1[3]
shun-iwasawa 82a8f5
                                       do_store        \bpp, 8
shun-iwasawa 82a8f5
    vmull.s16       q15, d7, d1[3]
shun-iwasawa 82a8f5
  .endif
shun-iwasawa 82a8f5
.endm
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
.macro do_yuv_to_rgb
shun-iwasawa 82a8f5
    do_yuv_to_rgb_stage1
shun-iwasawa 82a8f5
    do_yuv_to_rgb_stage2
shun-iwasawa 82a8f5
.endm
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/* Apple gas crashes on adrl, work around that by using adr.
shun-iwasawa 82a8f5
 * But this requires a copy of these constants for each function.
shun-iwasawa 82a8f5
 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
.balign 16
shun-iwasawa 82a8f5
jsimd_ycc_\colorid\()_neon_consts:
shun-iwasawa 82a8f5
  .short 0,      0,     0,      0
shun-iwasawa 82a8f5
  .short 22971, -11277, -23401, 29033
shun-iwasawa 82a8f5
  .short -128,  -128,   -128,   -128
shun-iwasawa 82a8f5
  .short -128,  -128,   -128,   -128
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
asm_function jsimd_ycc_\colorid\()_convert_neon
shun-iwasawa 82a8f5
    OUTPUT_WIDTH    .req r0
shun-iwasawa 82a8f5
    INPUT_BUF       .req r1
shun-iwasawa 82a8f5
    INPUT_ROW       .req r2
shun-iwasawa 82a8f5
    OUTPUT_BUF      .req r3
shun-iwasawa 82a8f5
    NUM_ROWS        .req r4
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    INPUT_BUF0      .req r5
shun-iwasawa 82a8f5
    INPUT_BUF1      .req r6
shun-iwasawa 82a8f5
    INPUT_BUF2      .req INPUT_BUF
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    RGB             .req r7
shun-iwasawa 82a8f5
    Y               .req r8
shun-iwasawa 82a8f5
    U               .req r9
shun-iwasawa 82a8f5
    V               .req r10
shun-iwasawa 82a8f5
    N               .req ip
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Load constants to d1, d2, d3 (d0 is just used for padding) */
shun-iwasawa 82a8f5
    adr             ip, jsimd_ycc_\colorid\()_neon_consts
shun-iwasawa 82a8f5
    vld1.16         {d0, d1, d2, d3}, [ip, :128]
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Save Arm registers and handle input arguments */
shun-iwasawa 82a8f5
    push            {r4, r5, r6, r7, r8, r9, r10, lr}
shun-iwasawa 82a8f5
    ldr             NUM_ROWS, [sp, #(4 * 8)]
shun-iwasawa 82a8f5
    ldr             INPUT_BUF0, [INPUT_BUF]
shun-iwasawa 82a8f5
    ldr             INPUT_BUF1, [INPUT_BUF, #4]
shun-iwasawa 82a8f5
    ldr             INPUT_BUF2, [INPUT_BUF, #8]
shun-iwasawa 82a8f5
    .unreq          INPUT_BUF
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Save Neon registers */
shun-iwasawa 82a8f5
    vpush           {d8 - d15}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Initially set d10, d11, d12, d13 to 0xFF */
shun-iwasawa 82a8f5
    vmov.u8         q5, #255
shun-iwasawa 82a8f5
    vmov.u8         q6, #255
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Outer loop over scanlines */
shun-iwasawa 82a8f5
    cmp             NUM_ROWS, #1
shun-iwasawa 82a8f5
    blt             9f
shun-iwasawa 82a8f5
0:
shun-iwasawa 82a8f5
    ldr             Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
shun-iwasawa 82a8f5
    ldr             U, [INPUT_BUF1, INPUT_ROW, lsl #2]
shun-iwasawa 82a8f5
    mov             N, OUTPUT_WIDTH
shun-iwasawa 82a8f5
    ldr             V, [INPUT_BUF2, INPUT_ROW, lsl #2]
shun-iwasawa 82a8f5
    add             INPUT_ROW, INPUT_ROW, #1
shun-iwasawa 82a8f5
    ldr             RGB, [OUTPUT_BUF], #4
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Inner loop over pixels */
shun-iwasawa 82a8f5
    subs            N, N, #8
shun-iwasawa 82a8f5
    blt             3f
shun-iwasawa 82a8f5
    do_load         8
shun-iwasawa 82a8f5
    do_yuv_to_rgb_stage1
shun-iwasawa 82a8f5
    subs            N, N, #8
shun-iwasawa 82a8f5
    blt             2f
shun-iwasawa 82a8f5
1:
shun-iwasawa 82a8f5
    do_yuv_to_rgb_stage2_store_load_stage1
shun-iwasawa 82a8f5
    subs            N, N, #8
shun-iwasawa 82a8f5
    bge             1b
shun-iwasawa 82a8f5
2:
shun-iwasawa 82a8f5
    do_yuv_to_rgb_stage2
shun-iwasawa 82a8f5
    do_store        \bpp, 8
shun-iwasawa 82a8f5
    tst             N, #7
shun-iwasawa 82a8f5
    beq             8f
shun-iwasawa 82a8f5
3:
shun-iwasawa 82a8f5
    tst             N, #4
shun-iwasawa 82a8f5
    beq             3f
shun-iwasawa 82a8f5
    do_load         4
shun-iwasawa 82a8f5
3:
shun-iwasawa 82a8f5
    tst             N, #2
shun-iwasawa 82a8f5
    beq             4f
shun-iwasawa 82a8f5
    do_load         2
shun-iwasawa 82a8f5
4:
shun-iwasawa 82a8f5
    tst             N, #1
shun-iwasawa 82a8f5
    beq             5f
shun-iwasawa 82a8f5
    do_load         1
shun-iwasawa 82a8f5
5:
shun-iwasawa 82a8f5
    do_yuv_to_rgb
shun-iwasawa 82a8f5
    tst             N, #4
shun-iwasawa 82a8f5
    beq             6f
shun-iwasawa 82a8f5
    do_store        \bpp, 4
shun-iwasawa 82a8f5
6:
shun-iwasawa 82a8f5
    tst             N, #2
shun-iwasawa 82a8f5
    beq             7f
shun-iwasawa 82a8f5
    do_store        \bpp, 2
shun-iwasawa 82a8f5
7:
shun-iwasawa 82a8f5
    tst             N, #1
shun-iwasawa 82a8f5
    beq             8f
shun-iwasawa 82a8f5
    do_store        \bpp, 1
shun-iwasawa 82a8f5
8:
shun-iwasawa 82a8f5
    subs            NUM_ROWS, NUM_ROWS, #1
shun-iwasawa 82a8f5
    bgt             0b
shun-iwasawa 82a8f5
9:
shun-iwasawa 82a8f5
    /* Restore all registers and return */
shun-iwasawa 82a8f5
    vpop            {d8 - d15}
shun-iwasawa 82a8f5
    pop             {r4, r5, r6, r7, r8, r9, r10, pc}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    .unreq          OUTPUT_WIDTH
shun-iwasawa 82a8f5
    .unreq          INPUT_ROW
shun-iwasawa 82a8f5
    .unreq          OUTPUT_BUF
shun-iwasawa 82a8f5
    .unreq          NUM_ROWS
shun-iwasawa 82a8f5
    .unreq          INPUT_BUF0
shun-iwasawa 82a8f5
    .unreq          INPUT_BUF1
shun-iwasawa 82a8f5
    .unreq          INPUT_BUF2
shun-iwasawa 82a8f5
    .unreq          RGB
shun-iwasawa 82a8f5
    .unreq          Y
shun-iwasawa 82a8f5
    .unreq          U
shun-iwasawa 82a8f5
    .unreq          V
shun-iwasawa 82a8f5
    .unreq          N
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
.purgem do_yuv_to_rgb
shun-iwasawa 82a8f5
.purgem do_yuv_to_rgb_stage1
shun-iwasawa 82a8f5
.purgem do_yuv_to_rgb_stage2
shun-iwasawa 82a8f5
.purgem do_yuv_to_rgb_stage2_store_load_stage1
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
.endm
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*--------------------------------- id ----- bpp R  G  B */
shun-iwasawa 82a8f5
generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, 1, 2
shun-iwasawa 82a8f5
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, 1, 0
shun-iwasawa 82a8f5
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
shun-iwasawa 82a8f5
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
shun-iwasawa 82a8f5
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
shun-iwasawa 82a8f5
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3
shun-iwasawa 82a8f5
generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, 0, 0
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
.purgem do_load
shun-iwasawa 82a8f5
.purgem do_store
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*****************************************************************************/
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
shun-iwasawa 82a8f5
 * jsimd_extrgb_ycc_convert_neon
shun-iwasawa 82a8f5
 * jsimd_extbgr_ycc_convert_neon
shun-iwasawa 82a8f5
 * jsimd_extrgbx_ycc_convert_neon
shun-iwasawa 82a8f5
 * jsimd_extbgrx_ycc_convert_neon
shun-iwasawa 82a8f5
 * jsimd_extxbgr_ycc_convert_neon
shun-iwasawa 82a8f5
 * jsimd_extxrgb_ycc_convert_neon
shun-iwasawa 82a8f5
 *
shun-iwasawa 82a8f5
 * Colorspace conversion RGB -> YCbCr
shun-iwasawa 82a8f5
 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
.macro do_store size
shun-iwasawa 82a8f5
  .if \size == 8
shun-iwasawa 82a8f5
    vst1.8          {d20}, [Y]!
shun-iwasawa 82a8f5
    vst1.8          {d21}, [U]!
shun-iwasawa 82a8f5
    vst1.8          {d22}, [V]!
shun-iwasawa 82a8f5
  .elseif \size == 4
shun-iwasawa 82a8f5
    vst1.8          {d20[0]}, [Y]!
shun-iwasawa 82a8f5
    vst1.8          {d20[1]}, [Y]!
shun-iwasawa 82a8f5
    vst1.8          {d20[2]}, [Y]!
shun-iwasawa 82a8f5
    vst1.8          {d20[3]}, [Y]!
shun-iwasawa 82a8f5
    vst1.8          {d21[0]}, [U]!
shun-iwasawa 82a8f5
    vst1.8          {d21[1]}, [U]!
shun-iwasawa 82a8f5
    vst1.8          {d21[2]}, [U]!
shun-iwasawa 82a8f5
    vst1.8          {d21[3]}, [U]!
shun-iwasawa 82a8f5
    vst1.8          {d22[0]}, [V]!
shun-iwasawa 82a8f5
    vst1.8          {d22[1]}, [V]!
shun-iwasawa 82a8f5
    vst1.8          {d22[2]}, [V]!
shun-iwasawa 82a8f5
    vst1.8          {d22[3]}, [V]!
shun-iwasawa 82a8f5
  .elseif \size == 2
shun-iwasawa 82a8f5
    vst1.8          {d20[4]}, [Y]!
shun-iwasawa 82a8f5
    vst1.8          {d20[5]}, [Y]!
shun-iwasawa 82a8f5
    vst1.8          {d21[4]}, [U]!
shun-iwasawa 82a8f5
    vst1.8          {d21[5]}, [U]!
shun-iwasawa 82a8f5
    vst1.8          {d22[4]}, [V]!
shun-iwasawa 82a8f5
    vst1.8          {d22[5]}, [V]!
shun-iwasawa 82a8f5
  .elseif \size == 1
shun-iwasawa 82a8f5
    vst1.8          {d20[6]}, [Y]!
shun-iwasawa 82a8f5
    vst1.8          {d21[6]}, [U]!
shun-iwasawa 82a8f5
    vst1.8          {d22[6]}, [V]!
shun-iwasawa 82a8f5
  .else
shun-iwasawa 82a8f5
    .error unsupported macroblock size
shun-iwasawa 82a8f5
  .endif
shun-iwasawa 82a8f5
.endm
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
.macro do_load bpp, size
shun-iwasawa 82a8f5
  .if \bpp == 24
shun-iwasawa 82a8f5
    .if \size == 8
shun-iwasawa 82a8f5
      vld3.8        {d10, d11, d12}, [RGB]!
shun-iwasawa 82a8f5
      pld           [RGB, #128]
shun-iwasawa 82a8f5
    .elseif \size == 4
shun-iwasawa 82a8f5
      vld3.8        {d10[0], d11[0], d12[0]}, [RGB]!
shun-iwasawa 82a8f5
      vld3.8        {d10[1], d11[1], d12[1]}, [RGB]!
shun-iwasawa 82a8f5
      vld3.8        {d10[2], d11[2], d12[2]}, [RGB]!
shun-iwasawa 82a8f5
      vld3.8        {d10[3], d11[3], d12[3]}, [RGB]!
shun-iwasawa 82a8f5
    .elseif \size == 2
shun-iwasawa 82a8f5
      vld3.8        {d10[4], d11[4], d12[4]}, [RGB]!
shun-iwasawa 82a8f5
      vld3.8        {d10[5], d11[5], d12[5]}, [RGB]!
shun-iwasawa 82a8f5
    .elseif \size == 1
shun-iwasawa 82a8f5
      vld3.8        {d10[6], d11[6], d12[6]}, [RGB]!
shun-iwasawa 82a8f5
    .else
shun-iwasawa 82a8f5
      .error unsupported macroblock size
shun-iwasawa 82a8f5
    .endif
shun-iwasawa 82a8f5
  .elseif \bpp == 32
shun-iwasawa 82a8f5
    .if \size == 8
shun-iwasawa 82a8f5
      vld4.8        {d10, d11, d12, d13}, [RGB]!
shun-iwasawa 82a8f5
      pld           [RGB, #128]
shun-iwasawa 82a8f5
    .elseif \size == 4
shun-iwasawa 82a8f5
      vld4.8        {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
shun-iwasawa 82a8f5
      vld4.8        {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
shun-iwasawa 82a8f5
      vld4.8        {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
shun-iwasawa 82a8f5
      vld4.8        {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
shun-iwasawa 82a8f5
    .elseif \size == 2
shun-iwasawa 82a8f5
      vld4.8        {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
shun-iwasawa 82a8f5
      vld4.8        {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
shun-iwasawa 82a8f5
    .elseif \size == 1
shun-iwasawa 82a8f5
      vld4.8        {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
shun-iwasawa 82a8f5
    .else
shun-iwasawa 82a8f5
      .error unsupported macroblock size
shun-iwasawa 82a8f5
    .endif
shun-iwasawa 82a8f5
  .else
shun-iwasawa 82a8f5
    .error unsupported bpp
shun-iwasawa 82a8f5
  .endif
shun-iwasawa 82a8f5
.endm
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * Generate one extended-RGB -> YCbCr (4:4:4) conversion function.
 *
 * Macro arguments:
 *   colorid - suffix used for the generated symbol names
 *   bpp     - bytes per input pixel (24 or 32; dispatched by do_load)
 *   r_offs, g_offs, b_offs - index of the R/G/B component register after
 *                            the de-interleaving load (pasted onto "d1" to
 *                            form d10..d13)
 *
 * The generated function converts 8 pixels per iteration using the
 * 2-stage software pipeline below.
 */
.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs

/*
 * 2-stage pipelined RGB->YCbCr conversion
 */

/*
 * Stage 1: widen the 8 loaded pixels and accumulate at 32-bit precision:
 *   Y  -> q7/q8,  Cb -> q9/q13,  Cr -> q14/q15
 * d0/d1 hold the fixed-point coefficients from the consts table below;
 * q1 holds the chroma bias/rounding constant.  Both 32-bit halves of each
 * d register of q1 are identical, so "vrev64.32 qN, q1" is effectively a
 * copy of that constant into the accumulator.
 */
.macro do_rgb_to_yuv_stage1
    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
    /* Y = 0.29900 * r + 0.58700 * g + 0.11400 * b */
    vmull.u16       q7, d4, d0[0]
    vmlal.u16       q7, d6, d0[1]
    vmlal.u16       q7, d8, d0[2]
    vmull.u16       q8, d5, d0[0]
    vmlal.u16       q8, d7, d0[1]
    vmlal.u16       q8, d9, d0[2]
    /* Cb = -0.16874 * r - 0.33126 * g + 0.50000 * b + bias */
    vrev64.32       q9, q1         /* seed accumulator with bias constant */
    vrev64.32       q13, q1
    vmlsl.u16       q9, d4, d0[3]
    vmlsl.u16       q9, d6, d1[0]
    vmlal.u16       q9, d8, d1[1]
    vmlsl.u16       q13, d5, d0[3]
    vmlsl.u16       q13, d7, d1[0]
    vmlal.u16       q13, d9, d1[1]
    /* Cr = 0.50000 * r - 0.41869 * g - 0.08131 * b + bias */
    vrev64.32       q14, q1        /* seed accumulator with bias constant */
    vrev64.32       q15, q1
    vmlal.u16       q14, d4, d1[1]
    vmlsl.u16       q14, d6, d1[2]
    vmlsl.u16       q14, d8, d1[3]
    vmlal.u16       q15, d5, d1[1]
    vmlsl.u16       q15, d7, d1[2]
    vmlsl.u16       q15, d9, d1[3]
.endm

/*
 * Stage 2: narrow the 32-bit accumulators to the final 8-bit samples.
 * Y uses a rounding shift (vrshrn); Cb/Cr use a truncating shift (vshrn)
 * because their bias constant already contains the rounding term.
 */
.macro do_rgb_to_yuv_stage2
    vrshrn.u32      d20, q7, #16
    vrshrn.u32      d21, q8, #16
    vshrn.u32       d22, q9, #16
    vshrn.u32       d23, q13, #16
    vshrn.u32       d24, q14, #16
    vshrn.u32       d25, q15, #16
    vmovn.u16       d20, q10       /* d20 = y */
    vmovn.u16       d21, q11       /* d21 = u */
    vmovn.u16       d22, q12       /* d22 = v */
.endm

/* Non-pipelined variant: both stages back to back (used for the tail). */
.macro do_rgb_to_yuv
    do_rgb_to_yuv_stage1
    do_rgb_to_yuv_stage2
.endm

/*
 * Pipelined steady-state iteration: finish stage 2 for the previous group
 * of 8 pixels, store its Y/Cb/Cr samples, and interleave the load + stage 1
 * of the next group to hide latencies.  Stage-2 instructions use the extra
 * indentation.
 */
.macro do_rgb_to_yuv_stage2_store_load_stage1
      vrshrn.u32      d20, q7, #16
      vrshrn.u32      d21, q8, #16
      vshrn.u32       d22, q9, #16
    vrev64.32       q9, q1
      vshrn.u32       d23, q13, #16
    vrev64.32       q13, q1
      vshrn.u32       d24, q14, #16
      vshrn.u32       d25, q15, #16
    do_load         \bpp, 8
      vmovn.u16       d20, q10     /* d20 = y */
    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
      vmovn.u16       d21, q11     /* d21 = u */
    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
      vmovn.u16       d22, q12     /* d22 = v */
    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
    vmull.u16       q7, d4, d0[0]
    vmlal.u16       q7, d6, d0[1]
    vmlal.u16       q7, d8, d0[2]
      vst1.8          {d20}, [Y]!
    vmull.u16       q8, d5, d0[0]
    vmlal.u16       q8, d7, d0[1]
    vmlal.u16       q8, d9, d0[2]
    vmlsl.u16       q9, d4, d0[3]
    vmlsl.u16       q9, d6, d1[0]
    vmlal.u16       q9, d8, d1[1]
      vst1.8          {d21}, [U]!
    vmlsl.u16       q13, d5, d0[3]
    vmlsl.u16       q13, d7, d1[0]
    vmlal.u16       q13, d9, d1[1]
    vrev64.32       q14, q1
    vrev64.32       q15, q1
    vmlal.u16       q14, d4, d1[1]
    vmlsl.u16       q14, d6, d1[2]
    vmlsl.u16       q14, d8, d1[3]
      vst1.8          {d22}, [V]!
    vmlal.u16       q15, d5, d1[1]
    vmlsl.u16       q15, d7, d1[2]
    vmlsl.u16       q15, d9, d1[3]
.endm

/* Conversion coefficients, fixed-point scaled by 65536. */
.balign 16
jsimd_\colorid\()_ycc_neon_consts:
  .short 19595, 38470, 7471,  11059  /* 0.29900, 0.58700, 0.11400, 0.16874 */
  .short 21709, 32768, 27439, 5329   /* 0.33126, 0.50000, 0.41869, 0.08131 */
  .short 32767, 128,   32767, 128    /* chroma bias: (128 << 16) + 32767   */
  .short 32767, 128,   32767, 128    /*   (the +32767 provides rounding)   */

/*
 * Arguments (AAPCS):
 *   r0   = output_width   (pixels per row)
 *   r1   = input_buf      (array of input row pointers)
 *   r2   = output_buf     (array of 3 plane-pointer arrays: Y, Cb, Cr)
 *   r3   = output_row     (first row index in the output planes)
 *   [sp] = num_rows
 */
asm_function jsimd_\colorid\()_ycc_convert_neon
    OUTPUT_WIDTH    .req r0
    INPUT_BUF       .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_ROW      .req r3
    NUM_ROWS        .req r4

    OUTPUT_BUF0     .req r5
    OUTPUT_BUF1     .req r6
    OUTPUT_BUF2     .req OUTPUT_BUF

    RGB             .req r7
    Y               .req r8
    U               .req r9
    V               .req r10
    N               .req ip

    /* Load constants to d0, d1, d2, d3 */
    adr             ip, jsimd_\colorid\()_ycc_neon_consts
    vld1.16         {d0, d1, d2, d3}, [ip, :128]

    /* Save Arm registers and handle input arguments */
    push            {r4, r5, r6, r7, r8, r9, r10, lr}
    ldr             NUM_ROWS, [sp, #(4 * 8)]   /* 5th arg, above the 8 pushes */
    ldr             OUTPUT_BUF0, [OUTPUT_BUF]       /* Y plane rows */
    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #4]   /* Cb plane rows */
    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #8]   /* Cr plane rows */
    .unreq          OUTPUT_BUF

    /* Save Neon registers */
    vpush           {d8 - d15}

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    blt             9f
0:
    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
    mov             N, OUTPUT_WIDTH
    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
    add             OUTPUT_ROW, OUTPUT_ROW, #1
    ldr             RGB, [INPUT_BUF], #4

    /* Inner loop over pixels */
    subs            N, N, #8
    blt             3f                 /* fewer than 8 pixels in the row */
    do_load         \bpp, 8
    do_rgb_to_yuv_stage1
    subs            N, N, #8
    blt             2f
1:  /* pipelined steady state: 8 pixels per iteration */
    do_rgb_to_yuv_stage2_store_load_stage1
    subs            N, N, #8
    bge             1b
2:  /* drain the pipeline */
    do_rgb_to_yuv_stage2
    do_store        8
    tst             N, #7
    beq             8f
3:  /* gather the remaining 1..7 pixels */
    tst             N, #4
    beq             3f
    do_load         \bpp, 4
3:
    tst             N, #2
    beq             4f
    do_load         \bpp, 2
4:
    tst             N, #1
    beq             5f
    do_load         \bpp, 1
5:  /* convert and scatter-store the tail */
    do_rgb_to_yuv
    tst             N, #4
    beq             6f
    do_store        4
6:
    tst             N, #2
    beq             7f
    do_store        2
7:
    tst             N, #1
    beq             8f
    do_store        1
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    bgt             0b
9:
    /* Restore all registers and return */
    vpop            {d8 - d15}
    pop             {r4, r5, r6, r7, r8, r9, r10, pc}

    .unreq          OUTPUT_WIDTH
    .unreq          OUTPUT_ROW
    .unreq          INPUT_BUF
    .unreq          NUM_ROWS
    .unreq          OUTPUT_BUF0
    .unreq          OUTPUT_BUF1
    .unreq          OUTPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N

.purgem do_rgb_to_yuv
.purgem do_rgb_to_yuv_stage1
.purgem do_rgb_to_yuv_stage2
.purgem do_rgb_to_yuv_stage2_store_load_stage1

.endm
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/* Instantiate one converter per extended-RGB pixel layout. */
/*--------------------------------- id ----- bpp R  G  B */
generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0
generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3

/* The shared load/store helpers are no longer needed past this point. */
.purgem do_load
.purgem do_store
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*****************************************************************************/
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * Load data into workspace, applying unsigned->signed conversion
 * (subtracts the 128 center point from each 8-bit sample and widens to
 * 16 bits, writing an 8x8 block of DCTELEMs to the workspace).
 *
 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
 *       rid of VST1.16 instructions
 */

/*
 * Arguments (AAPCS):
 *   r0 = sample_data (array of 8 row pointers)
 *   r1 = start_col   (byte offset added to each row pointer)
 *   r2 = workspace   (16-byte-aligned output, 64 x 16-bit)
 */
asm_function jsimd_convsamp_neon
    SAMPLE_DATA     .req r0
    START_COL       .req r1
    WORKSPACE       .req r2
    TMP1            .req r3
    TMP2            .req r4
    TMP3            .req r5
    TMP4            .req ip

    push            {r4, r5}
    vmov.u8         d0, #128       /* centering constant */

    /* Rows 0-3: load, subtract 128 with widening, store */
    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
    add             TMP1, TMP1, START_COL
    add             TMP2, TMP2, START_COL
    add             TMP3, TMP3, START_COL
    add             TMP4, TMP4, START_COL
    vld1.8          {d16}, [TMP1]
    vsubl.u8        q8, d16, d0
    vld1.8          {d18}, [TMP2]
    vsubl.u8        q9, d18, d0
    vld1.8          {d20}, [TMP3]
    vsubl.u8        q10, d20, d0
    vld1.8          {d22}, [TMP4]
    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
    vsubl.u8        q11, d22, d0
    vst1.16         {d16, d17, d18, d19}, [WORKSPACE, :128]!
    /* Rows 4-7 (pointer arithmetic interleaved with the stores above) */
    add             TMP1, TMP1, START_COL
    add             TMP2, TMP2, START_COL
    vst1.16         {d20, d21, d22, d23}, [WORKSPACE, :128]!
    add             TMP3, TMP3, START_COL
    add             TMP4, TMP4, START_COL
    vld1.8          {d24}, [TMP1]
    vsubl.u8        q12, d24, d0
    vld1.8          {d26}, [TMP2]
    vsubl.u8        q13, d26, d0
    vld1.8          {d28}, [TMP3]
    vsubl.u8        q14, d28, d0
    vld1.8          {d30}, [TMP4]
    vsubl.u8        q15, d30, d0
    vst1.16         {d24, d25, d26, d27}, [WORKSPACE, :128]!
    vst1.16         {d28, d29, d30, d31}, [WORKSPACE, :128]!
    pop             {r4, r5}
    bx              lr

    .unreq          SAMPLE_DATA
    .unreq          START_COL
    .unreq          WORKSPACE
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*****************************************************************************/
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * jsimd_fdct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the forward DCT (Discrete Cosine Transform). It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
 * function from jfdctfst.c
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
 *       rid of a bunch of VLD1.16 instructions
 */

/* Scaled-by-2^15 multipliers for vqdmulh (which doubles the product,
 * giving an effective multiply by the fractional constant). */
#define XFIX_0_382683433  d0[0]
#define XFIX_0_541196100  d0[1]
#define XFIX_0_707106781  d0[2]
#define XFIX_1_306562965  d0[3]

.balign 16
jsimd_fdct_ifast_neon_consts:
  .short (98 * 128)               /* XFIX_0_382683433 */
  .short (139 * 128)              /* XFIX_0_541196100 */
  .short (181 * 128)              /* XFIX_0_707106781 */
  .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 (fractional part only) */

/* In: r0 = data (64 x 16-bit coefficients, 16-byte aligned, in/out) */
asm_function jsimd_fdct_ifast_neon

    DATA            .req r0
    TMP             .req ip

    vpush           {d8 - d15}

    /* Load constants */
    adr             TMP, jsimd_fdct_ifast_neon_consts
    vld1.16         {d0}, [TMP, :64]

    /* Load all DATA into Neon registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d16     | d17    | q8
     *   1 | d18     | d19    | q9
     *   2 | d20     | d21    | q10
     *   3 | d22     | d23    | q11
     *   4 | d24     | d25    | q12
     *   5 | d26     | d27    | q13
     *   6 | d28     | d29    | q14
     *   7 | d30     | d31    | q15
     */

    vld1.16         {d16, d17, d18, d19}, [DATA, :128]!
    vld1.16         {d20, d21, d22, d23}, [DATA, :128]!
    vld1.16         {d24, d25, d26, d27}, [DATA, :128]!
    vld1.16         {d28, d29, d30, d31}, [DATA, :128]
    sub             DATA, DATA, #(128 - 32)  /* rewind to the start */

    /* Two passes: rows, then (after the transpose) columns.  The 1-D FDCT
     * instructions carry the extra indentation. */
    mov             TMP, #2
1:
    /* Transpose */
    vtrn.16         q12, q13
    vtrn.16         q10, q11
    vtrn.16         q8, q9
    vtrn.16         q14, q15
    vtrn.32         q9, q11
    vtrn.32         q13, q15
    vtrn.32         q8, q10
    vtrn.32         q12, q14
    vswp            d30, d23
    vswp            d24, d17
    vswp            d26, d19
      /* 1-D FDCT */
      vadd.s16        q2, q11, q12
    vswp            d28, d21
      vsub.s16        q12, q11, q12
      vsub.s16        q6, q10, q13
      vadd.s16        q10, q10, q13
      vsub.s16        q7, q9, q14
      vadd.s16        q9, q9, q14
      vsub.s16        q1, q8, q15
      vadd.s16        q8, q8, q15
      vsub.s16        q4, q9, q10
      vsub.s16        q5, q8, q2
      vadd.s16        q3, q9, q10
      vadd.s16        q4, q4, q5
      vadd.s16        q2, q8, q2
      vqdmulh.s16     q4, q4, XFIX_0_707106781
      vadd.s16        q11, q12, q6
      vadd.s16        q8, q2, q3
      vsub.s16        q12, q2, q3
      vadd.s16        q3, q6, q7
      vadd.s16        q7, q7, q1
      vqdmulh.s16     q3, q3, XFIX_0_707106781
      vsub.s16        q6, q11, q7
      vadd.s16        q10, q5, q4
      vqdmulh.s16     q6, q6, XFIX_0_382683433
      vsub.s16        q14, q5, q4
      vqdmulh.s16     q11, q11, XFIX_0_541196100
      vqdmulh.s16     q5, q7, XFIX_1_306562965
      vadd.s16        q4, q1, q3
      vsub.s16        q3, q1, q3
      vadd.s16        q7, q7, q6
      vadd.s16        q11, q11, q6
      vadd.s16        q7, q7, q5
      vadd.s16        q13, q3, q11
      vsub.s16        q11, q3, q11
      vadd.s16        q9, q4, q7
      vsub.s16        q15, q4, q7
    subs            TMP, TMP, #1
    bne             1b

    /* store results */
    vst1.16         {d16, d17, d18, d19}, [DATA, :128]!
    vst1.16         {d20, d21, d22, d23}, [DATA, :128]!
    vst1.16         {d24, d25, d26, d27}, [DATA, :128]!
    vst1.16         {d28, d29, d30, d31}, [DATA, :128]

    vpop            {d8 - d15}
    bx              lr

    .unreq          DATA
    .unreq          TMP
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*****************************************************************************/
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * GLOBAL(void)
 * jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
 *                     DCTELEM *workspace);
 *
 * The divisors table is laid out as 4 consecutive 64-entry sub-tables:
 * reciprocals (offset 0), corrections (+64*2 bytes) and shifts (+64*6
 * bytes); see the CORRECTION/SHIFT pointer setup below.
 *
 * Note: the code uses 2 stage pipelining in order to improve instructions
 *       scheduling and eliminate stalls (this provides ~15% better
 *       performance for this function on both Arm Cortex-A8 and
 *       Arm Cortex-A9 when compared to the non-pipelined variant).
 *       The instructions which belong to the second stage use different
 *       indentation for better readability.
 */
asm_function jsimd_quantize_neon

    COEF_BLOCK      .req r0
    DIVISORS        .req r1
    WORKSPACE       .req r2

    RECIPROCAL      .req DIVISORS
    CORRECTION      .req r3
    SHIFT           .req ip
    LOOP_COUNT      .req r4

    /* Prologue: stage 1 for the first group of 16 coefficients. */
    vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
    vabs.s16        q12, q0
    add             CORRECTION, DIVISORS, #(64 * 2)
    add             SHIFT, DIVISORS, #(64 * 6)
    vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
    vabs.s16        q13, q1
    vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
    vadd.u16        q12, q12, q10  /* add correction */
    vadd.u16        q13, q13, q11
    vmull.u16       q10, d24, d16  /* multiply by reciprocal */
    vmull.u16       q11, d25, d17
    vmull.u16       q8, d26, d18
    vmull.u16       q9, d27, d19
    vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
    vshrn.u32       d20, q10, #16
    vshrn.u32       d21, q11, #16
    vshrn.u32       d22, q8, #16
    vshrn.u32       d23, q9, #16
    vneg.s16        q12, q12       /* negate shift counts for vshl-right */
    vneg.s16        q13, q13
    vshr.s16        q2, q0, #15    /* extract sign */
    vshr.s16        q3, q1, #15
    vshl.u16        q14, q10, q12  /* shift */
    vshl.u16        q15, q11, q13

    /* 3 pipelined iterations: stage 2 (sign restore + store) for the
     * previous group overlaps stage 1 of the next group. */
    push            {r4, r5}
    mov             LOOP_COUNT, #3
1:
    vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
      veor.u16        q14, q14, q2  /* restore sign */
    vabs.s16        q12, q0
    vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
    vabs.s16        q13, q1
      veor.u16        q15, q15, q3
    vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
    vadd.u16        q12, q12, q10  /* add correction */
    vadd.u16        q13, q13, q11
    vmull.u16       q10, d24, d16  /* multiply by reciprocal */
    vmull.u16       q11, d25, d17
    vmull.u16       q8, d26, d18
    vmull.u16       q9, d27, d19
      vsub.u16        q14, q14, q2
    vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
      vsub.u16        q15, q15, q3
    vshrn.u32       d20, q10, #16
    vshrn.u32       d21, q11, #16
      vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
    vshrn.u32       d22, q8, #16
    vshrn.u32       d23, q9, #16
    vneg.s16        q12, q12
    vneg.s16        q13, q13
    vshr.s16        q2, q0, #15    /* extract sign */
    vshr.s16        q3, q1, #15
    vshl.u16        q14, q10, q12  /* shift */
    vshl.u16        q15, q11, q13
    subs            LOOP_COUNT, LOOP_COUNT, #1
    bne             1b
    pop             {r4, r5}

    /* Epilogue: drain stage 2 for the final group. */
      veor.u16        q14, q14, q2  /* restore sign */
      veor.u16        q15, q15, q3
      vsub.u16        q14, q14, q2
      vsub.u16        q15, q15, q3
      vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!

    bx              lr  /* return */

    .unreq          COEF_BLOCK
    .unreq          DIVISORS
    .unreq          WORKSPACE
    .unreq          RECIPROCAL
    .unreq          CORRECTION
    .unreq          SHIFT
    .unreq          LOOP_COUNT
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*****************************************************************************/
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * GLOBAL(void)
 * jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
 *                                JDIMENSION downsampled_width,
 *                                JSAMPARRAY input_data,
 *                                JSAMPARRAY *output_data_ptr);
 *
 * Note: the use of unaligned writes is the main remaining bottleneck in
 *       this code, which can be potentially solved to get up to tens
 *       of percents performance improvement on Cortex-A8/Cortex-A9.
 */

/*
 * Upsample 16 source pixels to 32 destination pixels. The new 16 source
 * pixels are loaded to q0. The previous 16 source pixels are in q1. The
 * shifted-by-one source pixels are constructed in q2 by using q0 and q1.
 * Register d28 is used for multiplication by 3. Register q15 is used
 * for adding +1 bias.
 *
 * Each output pair is (3 * near + far + bias) >> 2; the even outputs use
 * a rounding shift (vrshrn), the odd ones a truncating shift (vshrn) on
 * the accumulator that already got the +1 bias from q15.
 */
.macro upsample16 OUTPTR, INPTR
    vld1.8          {q0}, [\INPTR]!
    vmovl.u8        q8, d0
    vext.8          q2, q1, q0, #15    /* previous-pixel vector */
    vmovl.u8        q9, d1
    vaddw.u8        q10, q15, d4
    vaddw.u8        q11, q15, d5
    vmlal.u8        q8, d4, d28
    vmlal.u8        q9, d5, d28
    vmlal.u8        q10, d0, d28
    vmlal.u8        q11, d1, d28
    vmov            q1, q0        /* backup source pixels to q1 */
    vrshrn.u16      d6, q8, #2
    vrshrn.u16      d7, q9, #2
    vshrn.u16       d8, q10, #2
    vshrn.u16       d9, q11, #2
    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!   /* interleave even/odd */
.endm
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * Upsample 32 source pixels to 64 destination pixels. Compared to 'upsample16'
 * macro, the roles of q0 and q1 registers are reversed for even and odd
 * groups of 16 pixels, that's why "vmov q1, q0" instructions are not needed.
 * Also this unrolling allows to reorder loads and stores to compensate
 * multiplication latency and reduce stalls.  Odd-group instructions use the
 * extra indentation.
 */
.macro upsample32 OUTPTR, INPTR
    /* even 16 pixels group */
    vld1.8          {q0}, [\INPTR]!
    vmovl.u8        q8, d0
    vext.8          q2, q1, q0, #15    /* previous-pixel vector */
    vmovl.u8        q9, d1
    vaddw.u8        q10, q15, d4
    vaddw.u8        q11, q15, d5
    vmlal.u8        q8, d4, d28
    vmlal.u8        q9, d5, d28
    vmlal.u8        q10, d0, d28
    vmlal.u8        q11, d1, d28
      /* odd 16 pixels group */
      vld1.8          {q1}, [\INPTR]!
    vrshrn.u16      d6, q8, #2
    vrshrn.u16      d7, q9, #2
    vshrn.u16       d8, q10, #2
    vshrn.u16       d9, q11, #2
      vmovl.u8        q8, d2
      vext.8          q2, q0, q1, #15
      vmovl.u8        q9, d3
      vaddw.u8        q10, q15, d4
      vaddw.u8        q11, q15, d5
      vmlal.u8        q8, d4, d28
      vmlal.u8        q9, d5, d28
      vmlal.u8        q10, d2, d28
      vmlal.u8        q11, d3, d28
    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
      vrshrn.u16      d6, q8, #2
      vrshrn.u16      d7, q9, #2
      vshrn.u16       d8, q10, #2
      vshrn.u16       d9, q11, #2
      vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
.endm
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
 */
shun-iwasawa 82a8f5
.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
shun-iwasawa 82a8f5
    /* special case for the first and last pixels */
shun-iwasawa 82a8f5
    sub             \WIDTH, \WIDTH, #1
shun-iwasawa 82a8f5
    add             \OUTPTR, \OUTPTR, #1
shun-iwasawa 82a8f5
    ldrb            \TMP1, [\INPTR, \WIDTH]
shun-iwasawa 82a8f5
    strb            \TMP1, [\OUTPTR, \WIDTH, asl #1]
shun-iwasawa 82a8f5
    ldrb            \TMP1, [\INPTR], #1
shun-iwasawa 82a8f5
    strb            \TMP1, [\OUTPTR, #-1]
shun-iwasawa 82a8f5
    vmov.8          d3[7], \TMP1
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    subs            \WIDTH, \WIDTH, #32
shun-iwasawa 82a8f5
    blt             5f
shun-iwasawa 82a8f5
0:  /* process 32 pixels per iteration */
shun-iwasawa 82a8f5
    upsample32      \OUTPTR, \INPTR
shun-iwasawa 82a8f5
    subs            \WIDTH, \WIDTH, #32
shun-iwasawa 82a8f5
    bge             0b
shun-iwasawa 82a8f5
5:
shun-iwasawa 82a8f5
    adds            \WIDTH, \WIDTH, #16
shun-iwasawa 82a8f5
    blt             1f
shun-iwasawa 82a8f5
0:  /* process 16 pixels if needed */
shun-iwasawa 82a8f5
    upsample16      \OUTPTR, \INPTR
shun-iwasawa 82a8f5
    subs            \WIDTH, \WIDTH, #16
shun-iwasawa 82a8f5
1:
shun-iwasawa 82a8f5
    adds            \WIDTH, \WIDTH, #16
shun-iwasawa 82a8f5
    beq             9f
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* load the remaining 1-15 pixels */
shun-iwasawa 82a8f5
    add             \INPTR, \INPTR, \WIDTH
shun-iwasawa 82a8f5
    tst             \WIDTH, #1
shun-iwasawa 82a8f5
    beq             2f
shun-iwasawa 82a8f5
    sub             \INPTR, \INPTR, #1
shun-iwasawa 82a8f5
    vld1.8          {d0[0]}, [\INPTR]
shun-iwasawa 82a8f5
2:
shun-iwasawa 82a8f5
    tst             \WIDTH, #2
shun-iwasawa 82a8f5
    beq             2f
shun-iwasawa 82a8f5
    vext.8          d0, d0, d0, #6
shun-iwasawa 82a8f5
    sub             \INPTR, \INPTR, #1
shun-iwasawa 82a8f5
    vld1.8          {d0[1]}, [\INPTR]
shun-iwasawa 82a8f5
    sub             \INPTR, \INPTR, #1
shun-iwasawa 82a8f5
    vld1.8          {d0[0]}, [\INPTR]
shun-iwasawa 82a8f5
2:
shun-iwasawa 82a8f5
    tst             \WIDTH, #4
shun-iwasawa 82a8f5
    beq             2f
shun-iwasawa 82a8f5
    vrev64.32       d0, d0
shun-iwasawa 82a8f5
    sub             \INPTR, \INPTR, #1
shun-iwasawa 82a8f5
    vld1.8          {d0[3]}, [\INPTR]
shun-iwasawa 82a8f5
    sub             \INPTR, \INPTR, #1
shun-iwasawa 82a8f5
    vld1.8          {d0[2]}, [\INPTR]
shun-iwasawa 82a8f5
    sub             \INPTR, \INPTR, #1
shun-iwasawa 82a8f5
    vld1.8          {d0[1]}, [\INPTR]
shun-iwasawa 82a8f5
    sub             \INPTR, \INPTR, #1
shun-iwasawa 82a8f5
    vld1.8          {d0[0]}, [\INPTR]
shun-iwasawa 82a8f5
2:
shun-iwasawa 82a8f5
    tst             \WIDTH, #8
shun-iwasawa 82a8f5
    beq             2f
shun-iwasawa 82a8f5
    vmov            d1, d0
shun-iwasawa 82a8f5
    sub             \INPTR, \INPTR, #8
shun-iwasawa 82a8f5
    vld1.8          {d0}, [\INPTR]
shun-iwasawa 82a8f5
2:  /* upsample the remaining pixels */
shun-iwasawa 82a8f5
    vmovl.u8        q8, d0
shun-iwasawa 82a8f5
    vext.8          q2, q1, q0, #15
shun-iwasawa 82a8f5
    vmovl.u8        q9, d1
shun-iwasawa 82a8f5
    vaddw.u8        q10, q15, d4
shun-iwasawa 82a8f5
    vaddw.u8        q11, q15, d5
shun-iwasawa 82a8f5
    vmlal.u8        q8, d4, d28
shun-iwasawa 82a8f5
    vmlal.u8        q9, d5, d28
shun-iwasawa 82a8f5
    vmlal.u8        q10, d0, d28
shun-iwasawa 82a8f5
    vmlal.u8        q11, d1, d28
shun-iwasawa 82a8f5
    vrshrn.u16      d10, q8, #2
shun-iwasawa 82a8f5
    vrshrn.u16      d12, q9, #2
shun-iwasawa 82a8f5
    vshrn.u16       d11, q10, #2
shun-iwasawa 82a8f5
    vshrn.u16       d13, q11, #2
shun-iwasawa 82a8f5
    vzip.8          d10, d11
shun-iwasawa 82a8f5
    vzip.8          d12, d13
shun-iwasawa 82a8f5
    /* store the remaining pixels */
shun-iwasawa 82a8f5
    tst             \WIDTH, #8
shun-iwasawa 82a8f5
    beq             2f
shun-iwasawa 82a8f5
    vst1.8          {d10, d11}, [\OUTPTR]!
shun-iwasawa 82a8f5
    vmov            q5, q6
shun-iwasawa 82a8f5
2:
shun-iwasawa 82a8f5
    tst             \WIDTH, #4
shun-iwasawa 82a8f5
    beq             2f
shun-iwasawa 82a8f5
    vst1.8          {d10}, [\OUTPTR]!
shun-iwasawa 82a8f5
    vmov            d10, d11
shun-iwasawa 82a8f5
2:
shun-iwasawa 82a8f5
    tst             \WIDTH, #2
shun-iwasawa 82a8f5
    beq             2f
shun-iwasawa 82a8f5
    vst1.8          {d10[0]}, [\OUTPTR]!
shun-iwasawa 82a8f5
    vst1.8          {d10[1]}, [\OUTPTR]!
shun-iwasawa 82a8f5
    vst1.8          {d10[2]}, [\OUTPTR]!
shun-iwasawa 82a8f5
    vst1.8          {d10[3]}, [\OUTPTR]!
shun-iwasawa 82a8f5
    vext.8          d10, d10, d10, #4
shun-iwasawa 82a8f5
2:
shun-iwasawa 82a8f5
    tst             \WIDTH, #1
shun-iwasawa 82a8f5
    beq             2f
shun-iwasawa 82a8f5
    vst1.8          {d10[0]}, [\OUTPTR]!
shun-iwasawa 82a8f5
    vst1.8          {d10[1]}, [\OUTPTR]!
shun-iwasawa 82a8f5
2:
shun-iwasawa 82a8f5
9:
shun-iwasawa 82a8f5
.endm
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * GLOBAL(void)
 * jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
 *                                JDIMENSION downsampled_width,
 *                                JSAMPARRAY input_data,
 *                                JSAMPARRAY *output_data_ptr)
 *
 * Horizontally upsample each of max_v_samp_factor rows 1:2 using the
 * "fancy" (triangle-filter) algorithm implemented by the upsample_row
 * macro defined above.  Per-row work is delegated to upsample_row.
 */
asm_function jsimd_h2v1_fancy_upsample_neon

    MAX_V_SAMP_FACTOR .req r0
    DOWNSAMPLED_WIDTH .req r1
    INPUT_DATA        .req r2
    OUTPUT_DATA_PTR   .req r3
    OUTPUT_DATA       .req OUTPUT_DATA_PTR

    OUTPTR            .req r4
    INPTR             .req r5
    WIDTH             .req ip
    TMP               .req lr

    /* r6 is saved only to keep the stack 8-byte aligned (AAPCS) */
    push            {r4, r5, r6, lr}
    vpush           {d8 - d15}        /* d8-d15 are callee-saved */

    ldr             OUTPUT_DATA, [OUTPUT_DATA_PTR]
    cmp             MAX_V_SAMP_FACTOR, #0
    ble             99f               /* nothing to do for <= 0 rows */

    /* initialize constants used by upsample_row:
     * d28 = weight (3) for the nearer neighbor, q15 = rounding bias */
    vmov.u8         d28, #3
    vmov.u16        q15, #1
11: /* per-row loop */
    ldr             INPTR, [INPUT_DATA], #4    /* next input row pointer */
    ldr             OUTPTR, [OUTPUT_DATA], #4  /* next output row pointer */
    mov             WIDTH, DOWNSAMPLED_WIDTH
    upsample_row    OUTPTR, INPTR, WIDTH, TMP
    subs            MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
    bgt             11b

99:
    vpop            {d8 - d15}
    pop             {r4, r5, r6, pc}

    .unreq          MAX_V_SAMP_FACTOR
    .unreq          DOWNSAMPLED_WIDTH
    .unreq          INPUT_DATA
    .unreq          OUTPUT_DATA_PTR
    .unreq          OUTPUT_DATA

    .unreq          OUTPTR
    .unreq          INPTR
    .unreq          WIDTH
    .unreq          TMP

.purgem upsample16
.purgem upsample32
.purgem upsample_row
/*****************************************************************************/

/*
 * GLOBAL(JOCTET *)
 * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
 *                             JCOEFPTR block, int last_dc_val,
 *                             c_derived_tbl *dctbl, c_derived_tbl *actbl)
 *
 */
/*
 * Emit the next most-significant 8 bits of the bit accumulator.
 *
 * PUT_BITS is reduced by 8 and the byte now at the top of PUT_BUFFER is
 * stored to BUFFER (pre-incremented).  If that byte is 0xFF, a 0x00 stuff
 * byte is appended, as the JPEG bitstream requires after any 0xFF.
 * Clobbers: TMP, flags.  ZERO must contain 0 on entry.
 */
.macro emit_byte BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
    sub             \PUT_BITS, \PUT_BITS, #0x8
    lsr             \TMP, \PUT_BUFFER, \PUT_BITS
    uxtb            \TMP, \TMP
    strb            \TMP, [\BUFFER, #1]!
    cmp             \TMP, #0xff
    /*it eq*/                         /* IT needed if assembled for Thumb */
    strbeq          \ZERO, [\BUFFER, #1]!
.endm
/*
 * Append SIZE bits of CODE to the bit accumulator:
 *   PUT_BUFFER = (PUT_BUFFER << SIZE) | CODE;  PUT_BITS += SIZE
 * The shift and OR are fused into a single orr with a shifted operand.
 */
.macro put_bits PUT_BUFFER, PUT_BITS, CODE, SIZE
    /*lsl             \PUT_BUFFER, \PUT_BUFFER, \SIZE*/
    add             \PUT_BITS, \SIZE
    /*orr             \PUT_BUFFER, \PUT_BUFFER, \CODE*/
    orr             \PUT_BUFFER, \CODE, \PUT_BUFFER, lsl \SIZE
.endm
/*
 * If 16 or more bits are pending in the accumulator, flush two bytes to
 * BUFFER via emit_byte (which also handles 0xFF stuffing).  ZERO is
 * cleared here for emit_byte's use.  Clobbers: TMP, ZERO, flags.
 */
.macro checkbuf15 BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
  cmp               \PUT_BITS, #0x10
  blt               15f
    eor               \ZERO, \ZERO, \ZERO
    emit_byte         \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
    emit_byte         \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
15:
.endm
.balign 16
jsimd_huff_encode_one_block_neon_consts:
  /* Per-byte masks 1 << 0 .. 1 << 7 (loaded into d26); AND-ed with the
   * byte-wise zero-coefficient comparison results and pairwise-added to
   * collapse them into a 64-bit zero/nonzero bitmap. */
  .byte 0x01
  .byte 0x02
  .byte 0x04
  .byte 0x08
  .byte 0x10
  .byte 0x20
  .byte 0x40
  .byte 0x80
/*
 * Args (AAPCS): r0 = state, r1 = buffer, r2 = block (JCOEFPTR),
 * r3 = last_dc_val; dctbl and actbl are stack arguments reached
 * through r7 (set to sp + 0x1c after the 9-register push below).
 *
 * The 64 coefficients are loaded in zigzag order (the hard-coded byte
 * offsets below are 2 * jpeg_natural_order[i]), converted to the
 * magnitude/nbits form used by Huffman coding, and the nonzero positions
 * are collapsed into two 32-bit bitmaps (index0/index1) that drive the
 * run-length encoding loops.
 */
asm_function jsimd_huff_encode_one_block_neon
    push            {r4, r5, r6, r7, r8, r9, r10, r11, lr}
    add             r7, sp, #0x1c    /* r7 -> stack args (dctbl, actbl) */
    sub             r4, sp, #0x40
    bfc             r4, #0, #5
    mov             sp, r4           /* align sp on 32 bytes */
    vst1.64         {d8, d9, d10, d11}, [r4, :128]!   /* save callee-saved */
    vst1.64         {d12, d13, d14, d15}, [r4, :128]  /* NEON regs d8-d15 */
    sub             sp, #0x140       /* reserve 320 bytes */
    str             r0, [sp, #0x18]  /* working state > sp + 0x18 */
    add             r4, sp, #0x20    /* r4 = t1 */
    ldr             lr, [r7, #0x8]   /* lr = dctbl */
    sub             r10, r1, #0x1    /* r10=buffer-- */
    ldrsh           r1, [r2]         /* r1 = block[0] (DC coefficient) */
    mov             r9, #0x10
    mov             r8, #0x1
    adr             r5, jsimd_huff_encode_one_block_neon_consts
    /* prepare data */
    vld1.8          {d26}, [r5, :64] /* d26 = bitmap masks 1<<0 .. 1<<7 */
    veor            q8, q8, q8
    veor            q9, q9, q9
    vdup.16         q14, r9          /* q14 = 16 (for 16 - clz = nbits) */
    vdup.16         q15, r8          /* q15 = 1 */
    veor            q10, q10, q10
    veor            q11, q11, q11
    sub             r1, r1, r3       /* r1 = DC diff = block[0] - last_dc_val */
    add             r9, r2, #0x22
    add             r8, r2, #0x18
    add             r3, r2, #0x36
    /* Gather coefficients 0..31 in zigzag order into q0-q3 (lane loads) */
    vmov.16         d0[0], r1
    vld1.16         {d2[0]}, [r9, :16]
    vld1.16         {d4[0]}, [r8, :16]
    vld1.16         {d6[0]}, [r3, :16]
    add             r1, r2, #0x2
    add             r9, r2, #0x30
    add             r8, r2, #0x26
    add             r3, r2, #0x28
    vld1.16         {d0[1]}, [r1, :16]
    vld1.16         {d2[1]}, [r9, :16]
    vld1.16         {d4[1]}, [r8, :16]
    vld1.16         {d6[1]}, [r3, :16]
    add             r1, r2, #0x10
    add             r9, r2, #0x40
    add             r8, r2, #0x34
    add             r3, r2, #0x1a
    vld1.16         {d0[2]}, [r1, :16]
    vld1.16         {d2[2]}, [r9, :16]
    vld1.16         {d4[2]}, [r8, :16]
    vld1.16         {d6[2]}, [r3, :16]
    add             r1, r2, #0x20
    add             r9, r2, #0x32
    add             r8, r2, #0x42
    add             r3, r2, #0xc
    vld1.16         {d0[3]}, [r1, :16]
    vld1.16         {d2[3]}, [r9, :16]
    vld1.16         {d4[3]}, [r8, :16]
    vld1.16         {d6[3]}, [r3, :16]
    add             r1, r2, #0x12
    add             r9, r2, #0x24
    add             r8, r2, #0x50
    add             r3, r2, #0xe
    vld1.16         {d1[0]}, [r1, :16]
    vld1.16         {d3[0]}, [r9, :16]
    vld1.16         {d5[0]}, [r8, :16]
    vld1.16         {d7[0]}, [r3, :16]
    add             r1, r2, #0x4
    add             r9, r2, #0x16
    add             r8, r2, #0x60
    add             r3, r2, #0x1c
    vld1.16         {d1[1]}, [r1, :16]
    vld1.16         {d3[1]}, [r9, :16]
    vld1.16         {d5[1]}, [r8, :16]
    vld1.16         {d7[1]}, [r3, :16]
    add             r1, r2, #0x6
    add             r9, r2, #0x8
    add             r8, r2, #0x52
    add             r3, r2, #0x2a
    vld1.16         {d1[2]}, [r1, :16]
    vld1.16         {d3[2]}, [r9, :16]
    vld1.16         {d5[2]}, [r8, :16]
    vld1.16         {d7[2]}, [r3, :16]
    add             r1, r2, #0x14
    add             r9, r2, #0xa
    add             r8, r2, #0x44
    add             r3, r2, #0x38
    vld1.16         {d1[3]}, [r1, :16]
    vld1.16         {d3[3]}, [r9, :16]
    vld1.16         {d5[3]}, [r8, :16]
    vld1.16         {d7[3]}, [r3, :16]
    /* temp = coefficient < 0 ? ~coefficient : coefficient (via cgt/abs/eor);
     * this is the one's-complement form JPEG Huffman coding emits */
    vcgt.s16        q8, q8, q0
    vcgt.s16        q9, q9, q1
    vcgt.s16        q10, q10, q2
    vcgt.s16        q11, q11, q3
    vabs.s16        q0, q0
    vabs.s16        q1, q1
    vabs.s16        q2, q2
    vabs.s16        q3, q3
    veor            q8, q8, q0
    veor            q9, q9, q1
    veor            q10, q10, q2
    veor            q11, q11, q3
    add             r9, r4, #0x20
    add             r8, r4, #0x80
    add             r3, r4, #0xa0
    /* nbits = 16 - clz(|coefficient|) */
    vclz.i16        q0, q0
    vclz.i16        q1, q1
    vclz.i16        q2, q2
    vclz.i16        q3, q3
    vsub.i16        q0, q14, q0
    vsub.i16        q1, q14, q1
    vsub.i16        q2, q14, q2
    vsub.i16        q3, q14, q3
    vst1.16         {d0, d1, d2, d3}, [r4, :256]   /* t1[0..15]  = nbits */
    vst1.16         {d4, d5, d6, d7}, [r9, :256]   /* t1[16..31] = nbits */
    /* mask one's-complement values to nbits: & ((1 << nbits) - 1) */
    vshl.s16        q0, q15, q0
    vshl.s16        q1, q15, q1
    vshl.s16        q2, q15, q2
    vshl.s16        q3, q15, q3
    vsub.i16        q0, q0, q15
    vsub.i16        q1, q1, q15
    vsub.i16        q2, q2, q15
    vsub.i16        q3, q3, q15
    vand            q8, q8, q0
    vand            q9, q9, q1
    vand            q10, q10, q2
    vand            q11, q11, q3
    vst1.16         {d16, d17, d18, d19}, [r8, :256]  /* t2[0..15]  */
    vst1.16         {d20, d21, d22, d23}, [r3, :256]  /* t2[16..31] */
    /* Same processing for coefficients 32..63 (zigzag order) */
    add             r1, r2, #0x46
    add             r9, r2, #0x3a
    add             r8, r2, #0x74
    add             r3, r2, #0x6a
    vld1.16         {d8[0]}, [r1, :16]
    vld1.16         {d10[0]}, [r9, :16]
    vld1.16         {d12[0]}, [r8, :16]
    vld1.16         {d14[0]}, [r3, :16]
    veor            q8, q8, q8
    veor            q9, q9, q9
    veor            q10, q10, q10
    veor            q11, q11, q11
    add             r1, r2, #0x54
    add             r9, r2, #0x2c
    add             r8, r2, #0x76
    add             r3, r2, #0x78
    vld1.16         {d8[1]}, [r1, :16]
    vld1.16         {d10[1]}, [r9, :16]
    vld1.16         {d12[1]}, [r8, :16]
    vld1.16         {d14[1]}, [r3, :16]
    add             r1, r2, #0x62
    add             r9, r2, #0x1e
    add             r8, r2, #0x68
    add             r3, r2, #0x7a
    vld1.16         {d8[2]}, [r1, :16]
    vld1.16         {d10[2]}, [r9, :16]
    vld1.16         {d12[2]}, [r8, :16]
    vld1.16         {d14[2]}, [r3, :16]
    add             r1, r2, #0x70
    add             r9, r2, #0x2e
    add             r8, r2, #0x5a
    add             r3, r2, #0x6c
    vld1.16         {d8[3]}, [r1, :16]
    vld1.16         {d10[3]}, [r9, :16]
    vld1.16         {d12[3]}, [r8, :16]
    vld1.16         {d14[3]}, [r3, :16]
    add             r1, r2, #0x72
    add             r9, r2, #0x3c
    add             r8, r2, #0x4c
    add             r3, r2, #0x5e
    vld1.16         {d9[0]}, [r1, :16]
    vld1.16         {d11[0]}, [r9, :16]
    vld1.16         {d13[0]}, [r8, :16]
    vld1.16         {d15[0]}, [r3, :16]
    add             r1, r2, #0x64
    add             r9, r2, #0x4a
    add             r8, r2, #0x3e
    add             r3, r2, #0x6e
    vld1.16         {d9[1]}, [r1, :16]
    vld1.16         {d11[1]}, [r9, :16]
    vld1.16         {d13[1]}, [r8, :16]
    vld1.16         {d15[1]}, [r3, :16]
    add             r1, r2, #0x56
    add             r9, r2, #0x58
    add             r8, r2, #0x4e
    add             r3, r2, #0x7c
    vld1.16         {d9[2]}, [r1, :16]
    vld1.16         {d11[2]}, [r9, :16]
    vld1.16         {d13[2]}, [r8, :16]
    vld1.16         {d15[2]}, [r3, :16]
    add             r1, r2, #0x48
    add             r9, r2, #0x66
    add             r8, r2, #0x5c
    add             r3, r2, #0x7e
    vld1.16         {d9[3]}, [r1, :16]
    vld1.16         {d11[3]}, [r9, :16]
    vld1.16         {d13[3]}, [r8, :16]
    vld1.16         {d15[3]}, [r3, :16]
    vcgt.s16        q8, q8, q4
    vcgt.s16        q9, q9, q5
    vcgt.s16        q10, q10, q6
    vcgt.s16        q11, q11, q7
    vabs.s16        q4, q4
    vabs.s16        q5, q5
    vabs.s16        q6, q6
    vabs.s16        q7, q7
    veor            q8, q8, q4
    veor            q9, q9, q5
    veor            q10, q10, q6
    veor            q11, q11, q7
    add             r1, r4, #0x40
    add             r9, r4, #0x60
    add             r8, r4, #0xc0
    add             r3, r4, #0xe0
    vclz.i16        q4, q4
    vclz.i16        q5, q5
    vclz.i16        q6, q6
    vclz.i16        q7, q7
    vsub.i16        q4, q14, q4
    vsub.i16        q5, q14, q5
    vsub.i16        q6, q14, q6
    vsub.i16        q7, q14, q7
    vst1.16         {d8, d9, d10, d11}, [r1, :256]    /* t1[32..47] */
    vst1.16         {d12, d13, d14, d15}, [r9, :256]  /* t1[48..63] */
    vshl.s16        q4, q15, q4
    vshl.s16        q5, q15, q5
    vshl.s16        q6, q15, q6
    vshl.s16        q7, q15, q7
    vsub.i16        q4, q4, q15
    vsub.i16        q5, q5, q15
    vsub.i16        q6, q6, q15
    vsub.i16        q7, q7, q15
    vand            q8, q8, q4
    vand            q9, q9, q5
    vand            q10, q10, q6
    vand            q11, q11, q7
    vst1.16         {d16, d17, d18, d19}, [r8, :256]  /* t2[32..47] */
    vst1.16         {d20, d21, d22, d23}, [r3, :256]  /* t2[48..63] */
    /* Encode the DC coefficient */
    ldr             r12, [r7, #0xc]       /* r12 = actbl */
    add             r1, lr, #0x400        /* r1 = dctbl->ehufsi */
    mov             r9, r12               /* r9 = actbl */
    add             r6, r4, #0x80         /* r6 = t2 */
    ldr             r11, [r0, #0x8]       /* r11 = put_buffer */
    ldr             r4, [r0, #0xc]        /* r4  = put_bits */
    ldrh            r2, [r6, #-128]       /* r2  = nbits */
    ldrh            r3, [r6]              /* r3  = temp2 & (((JLONG)1)<<nbits) - 1) */
    ldr             r0, [lr, r2, lsl #2]  /* r0  = dctbl->ehufco[nbits] */
    ldrb            r5, [r1, r2]          /* r5  = dctbl->ehufsi[nbits] */
    put_bits        r11, r4, r0, r5
    checkbuf15      r10, r11, r4, r5, r0
    put_bits        r11, r4, r3, r2
    checkbuf15      r10, r11, r4, r5, r0
    mov             lr, r6                /* lr = t2 */
    add             r5, r9, #0x400        /* r5 = actbl->ehufsi */
    ldrsb           r6, [r5, #0xf0]       /* r6 = actbl->ehufsi[0xf0] */
    /* Build the zero-coefficient bitmaps: compare every value against 0,
     * narrow to bytes, AND with the d26 masks, and pairwise-add down to a
     * single 64-bit map in d0. */
    veor            q8, q8, q8
    vceq.i16        q0, q0, q8
    vceq.i16        q1, q1, q8
    vceq.i16        q2, q2, q8
    vceq.i16        q3, q3, q8
    vceq.i16        q4, q4, q8
    vceq.i16        q5, q5, q8
    vceq.i16        q6, q6, q8
    vceq.i16        q7, q7, q8
    vmovn.i16       d0, q0
    vmovn.i16       d2, q1
    vmovn.i16       d4, q2
    vmovn.i16       d6, q3
    vmovn.i16       d8, q4
    vmovn.i16       d10, q5
    vmovn.i16       d12, q6
    vmovn.i16       d14, q7
    vand            d0, d0, d26
    vand            d2, d2, d26
    vand            d4, d4, d26
    vand            d6, d6, d26
    vand            d8, d8, d26
    vand            d10, d10, d26
    vand            d12, d12, d26
    vand            d14, d14, d26
    vpadd.i8        d0, d0, d2
    vpadd.i8        d4, d4, d6
    vpadd.i8        d8, d8, d10
    vpadd.i8        d12, d12, d14
    vpadd.i8        d0, d0, d4
    vpadd.i8        d8, d8, d12
    vpadd.i8        d0, d0, d8
    vmov.32         r1, d0[1]
    vmov.32         r8, d0[0]
    mvn             r1, r1            /* invert: set bits now mark nonzeros */
    mvn             r8, r8
    lsrs            r1, r1, #0x1
    rrx             r8, r8            /* shift in last r1 bit while shifting out DC bit */
    rbit            r1, r1            /* r1 = index1 */
    rbit            r8, r8            /* r8 = index0 */
    ldr             r0, [r9, #0x3c0]  /* r0 = actbl->ehufco[0xf0] */
    str             r1, [sp, #0x14]   /* index1 > sp + 0x14 */
    cmp             r8, #0x0
    beq             6f                /* first half is all zeros */
1:  /* AC encoding loop, coefficients 1..31: clz finds the run of zeros */
    clz             r2, r8
    add             lr, lr, r2, lsl #1
    lsl             r8, r8, r2
    ldrh            r1, [lr, #-126]   /* r1 = nbits of this coefficient */
2:  /* emit a ZRL (0xF0) code for each full run of 16 zeros */
    cmp             r2, #0x10
    blt             3f
    sub             r2, r2, #0x10
    put_bits        r11, r4, r0, r6
    cmp             r4, #0x10
    blt             2b
    eor             r3, r3, r3
    emit_byte       r10, r11, r4, r3, r12
    emit_byte       r10, r11, r4, r3, r12
    b               2b
3:  /* emit (run << 4 | nbits) code followed by the value bits */
    add             r2, r1, r2, lsl #4
    ldrh            r3, [lr, #2]!
    ldr             r12, [r9, r2, lsl #2]
    ldrb            r2, [r5, r2]
    put_bits        r11, r4, r12, r2
    checkbuf15      r10, r11, r4, r2, r12
    put_bits        r11, r4, r3, r1
    checkbuf15      r10, r11, r4, r2, r12
    lsls            r8, r8, #0x1
    bne             1b
6:  /* second half: coefficients 32..63 via index1 */
    add             r12, sp, #0x20   /* r12 = t1 */
    ldr             r8, [sp, #0x14]  /* r8 = index1 */
    adds            r12, #0xc0       /* r12 = t2 + (DCTSIZE2/2) */
    cmp             r8, #0x0
    beq             6f
    clz             r2, r8
    sub             r12, r12, lr     /* carry zero-run over from first half */
    lsl             r8, r8, r2
    add             r2, r2, r12, lsr #1
    add             lr, lr, r2, lsl #1
    b               7f
1:
    clz             r2, r8
    add             lr, lr, r2, lsl #1
    lsl             r8, r8, r2
7:
    ldrh            r1, [lr, #-126]
2:  /* ZRL codes for runs of 16 zeros, as above */
    cmp             r2, #0x10
    blt             3f
    sub             r2, r2, #0x10
    put_bits        r11, r4, r0, r6
    cmp             r4, #0x10
    blt             2b
    eor             r3, r3, r3
    emit_byte       r10, r11, r4, r3, r12
    emit_byte       r10, r11, r4, r3, r12
    b               2b
3:
    add             r2, r1, r2, lsl #4
    ldrh            r3, [lr, #2]!
    ldr             r12, [r9, r2, lsl #2]
    ldrb            r2, [r5, r2]
    put_bits        r11, r4, r12, r2
    checkbuf15      r10, r11, r4, r2, r12
    put_bits        r11, r4, r3, r1
    checkbuf15      r10, r11, r4, r2, r12
    lsls            r8, r8, #0x1
    bne             1b
6:  /* emit EOB (end-of-block) unless the last coefficient was nonzero */
    add             r0, sp, #0x20
    add             r0, #0xfe
    cmp             lr, r0
    bhs             1f
    ldr             r1, [r9]          /* actbl->ehufco[0] */
    ldrb            r0, [r5]          /* actbl->ehufsi[0] */
    put_bits        r11, r4, r1, r0
    checkbuf15      r10, r11, r4, r0, r1
1:  /* write back bit-buffer state and return the advanced buffer pointer */
    ldr             r12, [sp, #0x18]
    str             r11, [r12, #0x8]
    str             r4, [r12, #0xc]
    add             r0, r10, #0x1
    add             r4, sp, #0x140
    vld1.64         {d8, d9, d10, d11}, [r4, :128]!   /* restore d8-d15 */
    vld1.64         {d12, d13, d14, d15}, [r4, :128]
    sub             r4, r7, #0x1c
    mov             sp, r4
    pop             {r4, r5, r6, r7, r8, r9, r10, r11, pc}

.purgem emit_byte
.purgem put_bits
.purgem checkbuf15