shun-iwasawa 82a8f5
/*
shun-iwasawa 82a8f5
 * Armv8 Neon optimizations for libjpeg-turbo
shun-iwasawa 82a8f5
 *
shun-iwasawa 82a8f5
 * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
shun-iwasawa 82a8f5
 *                          All Rights Reserved.
shun-iwasawa 82a8f5
 * Author:  Siarhei Siamashka <siarhei.siamashka@nokia.com>
shun-iwasawa 82a8f5
 * Copyright (C) 2013-2014, Linaro Limited.  All Rights Reserved.
shun-iwasawa 82a8f5
 * Author:  Ragesh Radhakrishnan <ragesh.r@linaro.org>
shun-iwasawa 82a8f5
 * Copyright (C) 2014-2016, 2020, D. R. Commander.  All Rights Reserved.
shun-iwasawa 82a8f5
 * Copyright (C) 2015-2016, 2018, Matthieu Darbois.  All Rights Reserved.
shun-iwasawa 82a8f5
 * Copyright (C) 2016, Siarhei Siamashka.  All Rights Reserved.
shun-iwasawa 82a8f5
 *
shun-iwasawa 82a8f5
 * This software is provided 'as-is', without any express or implied
shun-iwasawa 82a8f5
 * warranty.  In no event will the authors be held liable for any damages
shun-iwasawa 82a8f5
 * arising from the use of this software.
shun-iwasawa 82a8f5
 *
shun-iwasawa 82a8f5
 * Permission is granted to anyone to use this software for any purpose,
shun-iwasawa 82a8f5
 * including commercial applications, and to alter it and redistribute it
shun-iwasawa 82a8f5
 * freely, subject to the following restrictions:
shun-iwasawa 82a8f5
 *
shun-iwasawa 82a8f5
 * 1. The origin of this software must not be misrepresented; you must not
shun-iwasawa 82a8f5
 *    claim that you wrote the original software. If you use this software
shun-iwasawa 82a8f5
 *    in a product, an acknowledgment in the product documentation would be
shun-iwasawa 82a8f5
 *    appreciated but is not required.
shun-iwasawa 82a8f5
 * 2. Altered source versions must be plainly marked as such, and must not be
shun-iwasawa 82a8f5
 *    misrepresented as being the original software.
shun-iwasawa 82a8f5
 * 3. This notice may not be removed or altered from any source distribution.
shun-iwasawa 82a8f5
 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/* On Linux/ELF targets, emit an empty .note.GNU-stack section so that this
 * object file marks the stack as non-executable. */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack, "", %progbits
#endif
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/* Select the section that receives the constant tables defined below:
 * __DATA,__const on Apple targets, .rdata on Windows, .rodata elsewhere. */
#if defined(__APPLE__)
.section __DATA, __const
#elif defined(_WIN32)
.section .rdata
#else
.section .rodata, "a", %progbits
#endif
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * Constants for jsimd_idct_islow_neon()
 *
 * Sixteen halfwords, 16-byte aligned: twelve multipliers followed by four
 * halfwords of padding.  jsimd_idct_islow_neon() loads the whole table into
 * v0/v1 with one ld1, so the order here must stay in step with the
 * XFIX_P_xxx / XFIX_N_xxx lane aliases defined next to that function.  The
 * entries stored negated are the ones reached through an XFIX_N_xxx alias.
 */

#define F_0_298   2446  /* FIX(0.298631336) */
#define F_0_390   3196  /* FIX(0.390180644) */
#define F_0_541   4433  /* FIX(0.541196100) */
#define F_0_765   6270  /* FIX(0.765366865) */
#define F_0_899   7373  /* FIX(0.899976223) */
#define F_1_175   9633  /* FIX(1.175875602) */
#define F_1_501  12299  /* FIX(1.501321110) */
#define F_1_847  15137  /* FIX(1.847759065) */
#define F_1_961  16069  /* FIX(1.961570560) */
#define F_2_053  16819  /* FIX(2.053119869) */
#define F_2_562  20995  /* FIX(2.562915447) */
#define F_3_072  25172  /* FIX(3.072711026) */

.balign 16
Ljsimd_idct_islow_neon_consts:
  .short F_0_298, -F_0_390, F_0_541, F_0_765   /* v0.h[0..3] */
  .short -F_0_899, F_1_175, F_1_501, -F_1_847  /* v0.h[4..7] */
  .short -F_1_961, F_2_053, -F_2_562, F_3_072  /* v1.h[0..3] */
  .short 0, 0, 0, 0                            /* v1.h[4..7]: padding */

/* The short names are reused for the forward-DCT table further down. */
#undef F_0_298
#undef F_0_390
#undef F_0_541
#undef F_0_765
#undef F_0_899
#undef F_1_175
#undef F_1_501
#undef F_1_847
#undef F_1_961
#undef F_2_053
#undef F_2_562
#undef F_3_072
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * Constants for jsimd_idct_ifast_neon()
 *
 * Four halfwords, 16-byte aligned.  Each entry is k * 128 where k / 256
 * approximates the constant named in its comment, with 256 * 128 (i.e. 1.0)
 * or 512 * 128 (i.e. 2.0) subtracted, so only the excess over that integer
 * part is stored.
 * NOTE(review): presumably this keeps each multiplier within a signed
 * halfword -- confirm against the code that consumes the table.
 */

.balign 16
Ljsimd_idct_ifast_neon_consts:
  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * Constants for jsimd_idct_4x4_neon() and jsimd_idct_2x2_neon()
 *
 * The FIX_xxx values are the fixed-point constants named in their comments;
 * they stay defined after the tables.  CONST_BITS is also used by the last
 * entry of the 4x4 table, which stores 1 << (CONST_BITS + 1).
 */

#define CONST_BITS  13

#define FIX_0_211164243  (1730)   /* FIX(0.211164243) */
#define FIX_0_509795579  (4176)   /* FIX(0.509795579) */
#define FIX_0_601344887  (4926)   /* FIX(0.601344887) */
#define FIX_0_720959822  (5906)   /* FIX(0.720959822) */
#define FIX_0_765366865  (6270)   /* FIX(0.765366865) */
#define FIX_0_850430095  (6967)   /* FIX(0.850430095) */
#define FIX_0_899976223  (7373)   /* FIX(0.899976223) */
#define FIX_1_061594337  (8697)   /* FIX(1.061594337) */
#define FIX_1_272758580  (10426)  /* FIX(1.272758580) */
#define FIX_1_451774981  (11893)  /* FIX(1.451774981) */
#define FIX_1_847759065  (15137)  /* FIX(1.847759065) */
#define FIX_2_172734803  (17799)  /* FIX(2.172734803) */
#define FIX_2_562915447  (20995)  /* FIX(2.562915447) */
#define FIX_3_624509785  (29692)  /* FIX(3.624509785) */

/*
 * 4x4 table: twelve halfwords, 16-byte aligned.  The per-entry comments name
 * the register lane each value is expected to occupy.
 * NOTE(review): the middle group is labelled "d1[n]" while its neighbours
 * use "vN.h[n]"; presumably it corresponds to v1.h[n] -- confirm against the
 * load in jsimd_idct_4x4_neon().
 */
.balign 16
Ljsimd_idct_4x4_neon_consts:
  .short FIX_1_847759065        /* v0.h[0] */
  .short -FIX_0_765366865       /* v0.h[1] */
  .short -FIX_0_211164243       /* v0.h[2] */
  .short FIX_1_451774981        /* v0.h[3] */
  .short -FIX_2_172734803       /* d1[0] */
  .short FIX_1_061594337        /* d1[1] */
  .short -FIX_0_509795579       /* d1[2] */
  .short -FIX_0_601344887       /* d1[3] */
  .short FIX_0_899976223        /* v2.h[0] */
  .short FIX_2_562915447        /* v2.h[1] */
  .short 1 << (CONST_BITS + 1)  /* v2.h[2] */
  .short 0                      /* v2.h[3] */

/* 2x2 table: four halfwords, 8-byte aligned. */
.balign 8
Ljsimd_idct_2x2_neon_consts:
  .short -FIX_0_720959822  /* v14[0] */
  .short FIX_0_850430095   /* v14[1] */
  .short -FIX_1_272758580  /* v14[2] */
  .short FIX_3_624509785   /* v14[3] */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * Constants for jsimd_ycc_*_neon()
 *
 * Sixteen halfwords, 16-byte aligned: four zeros, four multipliers, then
 * eight copies of -128.
 * NOTE(review): the multipliers look like fixed-point YCbCr -> RGB
 * coefficients and -128 like the negated sample midpoint -- confirm against
 * the conversion routine, which is not part of this section of the file.
 */

.balign 16
Ljsimd_ycc_rgb_neon_consts:
  .short 0,      0,     0,      0
  .short 22971, -11277, -23401, 29033
  .short -128,  -128,   -128,   -128
  .short -128,  -128,   -128,   -128
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * Constants for jsimd_*_ycc_neon()
 *
 * Sixteen halfwords, 16-byte aligned: eight multipliers followed by the
 * pair (32767, 128) repeated four times.  Several entries (38470, 32768)
 * exceed the signed halfword range, so the table is only meaningful when
 * read as unsigned 16-bit data.
 * NOTE(review): the multipliers look like fixed-point RGB -> YCbCr
 * coefficients -- confirm against the conversion routine, which is not part
 * of this section of the file.
 */

.balign 16
Ljsimd_rgb_ycc_neon_consts:
  .short 19595, 38470, 7471, 11059
  .short 21709, 32768, 27439, 5329
  .short 32767, 128, 32767, 128
  .short 32767, 128, 32767, 128
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * Constants for jsimd_fdct_islow_neon()
 *
 * Sixteen halfwords, 16-byte aligned: twelve multipliers (some stored
 * negated) followed by four halfwords of padding.  The values and their
 * order are identical to the table used by the slow inverse DCT.
 */

#define F_0_298   2446  /* FIX(0.298631336) */
#define F_0_390   3196  /* FIX(0.390180644) */
#define F_0_541   4433  /* FIX(0.541196100) */
#define F_0_765   6270  /* FIX(0.765366865) */
#define F_0_899   7373  /* FIX(0.899976223) */
#define F_1_175   9633  /* FIX(1.175875602) */
#define F_1_501  12299  /* FIX(1.501321110) */
#define F_1_847  15137  /* FIX(1.847759065) */
#define F_1_961  16069  /* FIX(1.961570560) */
#define F_2_053  16819  /* FIX(2.053119869) */
#define F_2_562  20995  /* FIX(2.562915447) */
#define F_3_072  25172  /* FIX(3.072711026) */

.balign 16
Ljsimd_fdct_islow_neon_consts:
  .short F_0_298, -F_0_390, F_0_541, F_0_765   /* halfwords 0..3 */
  .short -F_0_899, F_1_175, F_1_501, -F_1_847  /* halfwords 4..7 */
  .short -F_1_961, F_2_053, -F_2_562, F_3_072  /* halfwords 8..11 */
  .short 0, 0, 0, 0                            /* halfwords 12..15: padding */

/* Drop the short names again so they cannot leak into later code. */
#undef F_0_298
#undef F_0_390
#undef F_0_541
#undef F_0_765
#undef F_0_899
#undef F_1_175
#undef F_1_501
#undef F_1_847
#undef F_1_961
#undef F_2_053
#undef F_2_562
#undef F_3_072
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * Constants for jsimd_fdct_ifast_neon()
 *
 * Four halfwords, 16-byte aligned.  Each entry is k * 128 where k / 256
 * approximates the constant named in its comment; the last entry (a constant
 * above 1.0) additionally has 256 * 128 (i.e. 1.0) subtracted.
 */

.balign 16
Ljsimd_fdct_ifast_neon_consts:
  .short (98 * 128)               /* XFIX_0_382683433 */
  .short (139 * 128)              /* XFIX_0_541196100 */
  .short (181 * 128)              /* XFIX_0_707106781 */
  .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * Constants for jsimd_h2*_downsample_neon()
 *
 * Sixteen rows of sixteen bytes, 16-byte aligned.  Row "diff d" holds the
 * byte indices 0..15 with every index greater than (15 - d) replaced by
 * (15 - d): row 0 is the identity sequence and row 15 is all zeros.
 * NOTE(review): presumably each row is a byte-shuffle index vector that
 * replicates the last valid sample into the d trailing positions -- confirm
 * against the downsample routines.
 */

.balign 16
Ljsimd_h2_downsample_neon_consts:
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F  /* diff 0 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E  /* diff 1 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D  /* diff 2 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C  /* diff 3 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B  /* diff 4 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A  /* diff 5 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09  /* diff 6 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08  /* diff 7 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07  /* diff 8 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
        0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06  /* diff 9 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
        0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05  /* diff 10 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
        0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04  /* diff 11 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
        0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03  /* diff 12 */
  .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02  /* diff 13 */
  .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
        0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01  /* diff 14 */
  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* diff 15 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * Constants for jsimd_huff_encode_one_block_neon()
 *
 * Thirteen rows of sixteen bytes, 16-byte aligned.  The first row is the
 * single-bit masks 0x01..0x80, stored twice.  The remaining twelve rows are
 * byte-index vectors in which 255 is used as a filler value; the trailing
 * comments are the original author's notes on which lines each row covers.
 * NOTE(review): presumably the index rows are table-lookup shuffle vectors
 * and 255 selects "no source byte" -- confirm against the encoder routine.
 */

.balign 16
Ljsimd_huff_encode_one_block_neon_consts:
    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
    .byte    0,   1,   2,   3,  16,  17,  32,  33, \
            18,  19,   4,   5,   6,   7,  20,  21  /* L0 => L3 : 4 lines OK */
    .byte   34,  35,  48,  49, 255, 255,  50,  51, \
            36,  37,  22,  23,   8,   9,  10,  11  /* L0 => L3 : 4 lines OK */
    .byte    8,   9,  22,  23,  36,  37,  50,  51, \
           255, 255, 255, 255, 255, 255,  52,  53  /* L1 => L4 : 4 lines OK */
    .byte   54,  55,  40,  41,  26,  27,  12,  13, \
            14,  15,  28,  29,  42,  43,  56,  57  /* L0 => L3 : 4 lines OK */
    .byte    6,   7,  20,  21,  34,  35,  48,  49, \
            50,  51,  36,  37,  22,  23,   8,   9  /* L4 => L7 : 4 lines OK */
    .byte   42,  43,  28,  29,  14,  15,  30,  31, \
            44,  45,  58,  59, 255, 255, 255, 255  /* L1 => L4 : 4 lines OK */
    .byte  255, 255, 255, 255,  56,  57,  42,  43, \
            28,  29,  14,  15,  30,  31,  44,  45  /* L3 => L6 : 4 lines OK */
    .byte   26,  27,  40,  41,  42,  43,  28,  29, \
            14,  15,  30,  31,  44,  45,  46,  47  /* L5 => L7 : 3 lines OK */
    .byte  255, 255, 255, 255,   0,   1, 255, 255, \
           255, 255, 255, 255, 255, 255, 255, 255  /* L4 : 1 line OK */
    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
             0,   1,  16,  17,   2,   3, 255, 255  /* L5 => L6 : 2 lines OK */
    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
           255, 255, 255, 255,   8,   9,  22,  23  /* L5 => L6 : 2 lines OK */
    .byte    4,   5,   6,   7, 255, 255, 255, 255, \
           255, 255, 255, 255, 255, 255, 255, 255  /* L7 : 1 line OK */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/* End of the constant tables: everything from here on is code. */
.text

/* NOTE(review): build-time switch; its consumers are not in this part of the
 * file.  Presumably it makes the routines avoid unaligned memory accesses --
 * confirm where it is tested. */
#define RESPECT_STRICT_ALIGNMENT  1
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*****************************************************************************/
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * asm_function fname
 *
 * Supplementary macro that opens the definition of function \fname: it
 * exports the symbol, restricts its visibility and emits the entry label.
 * The function body follows the macro invocation.
 *
 *   Apple targets:  the symbol is _\fname, declared .private_extern and
 *                   .globl.
 *   Other targets:  the symbol is \fname, declared .global; on ELF it is
 *                   additionally .hidden and typed as a function.
 */
.macro asm_function fname
#ifdef __APPLE__
    .private_extern _\fname
    .globl _\fname
_\fname:
#else
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * get_symbol_loc reg, symbol
 *
 * Load the address of \symbol into the 64-bit register \reg with a
 * PC-relative adrp/add pair: adrp produces the page address and the add
 * supplies the low 12 bits.  Apple targets spell the two relocations
 * @PAGE / @PAGEOFF; all other targets use the plain symbol and :lo12:.
 * Only \reg is written.
 */
.macro get_symbol_loc reg, symbol
#ifdef __APPLE__
    adrp            \reg, \symbol@PAGE
    add             \reg, \reg, \symbol@PAGEOFF
#else
    adrp            \reg, \symbol
    add             \reg, \reg, :lo12:\symbol
#endif
.endm
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * transpose_single x0, x1, xi, xilen, literal
 *
 * Transpose elements of a single 128-bit register.
 *
 *   x0, x1   vector register names (x0 is the source, x1 receives data)
 *   xi       scratch vector register
 *   xilen    element suffix used for the lane inserts (e.g. ".d")
 *   literal  arrangement suffix used for the trn1/trn2 interleave
 *
 * Lane 0 of \x0 is saved to lane 0 of \xi and lane 1 of \x0 is copied to
 * lane 0 of \x1; the pair (\x0, \x1) is then interleaved with trn1/trn2,
 * using the saved copy in \xi as the first trn2 source.
 * Writes \x0, \x1 and lane 0 of \xi.
 */
.macro transpose_single x0, x1, xi, xilen, literal
    ins             \xi\xilen[0], \x0\xilen[0]
    ins             \x1\xilen[0], \x0\xilen[1]
    trn1            \x0\literal, \x0\literal, \x1\literal
    trn2            \x1\literal, \xi\literal, \x1\literal
.endm
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * transpose x0, x1, xi, xilen, literal
 *
 * Transpose elements of two different registers.
 *
 *   x0, x1   vector register names, updated in place
 *   xi       scratch vector register
 *   xilen    arrangement suffix used to copy \x0 into \xi
 *   literal  arrangement suffix used for the trn1/trn2 interleave
 *
 * On exit \x0 = trn1(x0, x1) and \x1 = trn2(x0, x1), both computed from the
 * values on entry; \x0 is copied to \xi first because trn1 overwrites it
 * before trn2 needs it.  \xi is clobbered.
 */
.macro transpose x0, x1, xi, xilen, literal
    mov             \xi\xilen, \x0\xilen
    trn1            \x0\literal, \x0\literal, \x1\literal
    trn2            \x1\literal, \xi\literal, \x1\literal
.endm
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
 *
 * Second stage of the 4x4 coefficient transpose held in four 64-bit
 * registers: interleaves the register pairs (\x0, \x2) and (\x1, \x3) with
 * trn1/trn2.
 *
 *   x0..x3        vector register names, updated in place
 *   x0len..x3len  arrangement suffix for each register; the trn operands
 *                 mix them, so all four have to be the same arrangement
 *   xi, xilen     scratch register and the suffix used to copy into it
 *
 * \xi holds a copy of the first register of each pair while trn1 overwrites
 * that register; it is clobbered.
 */
.macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
    mov             \xi\xilen, \x0\xilen
    trn1            \x0\x0len, \x0\x0len, \x2\x2len
    trn2            \x2\x2len, \xi\x0len, \x2\x2len
    mov             \xi\xilen, \x1\xilen
    trn1            \x1\x1len, \x1\x1len, \x3\x3len
    trn2            \x3\x3len, \xi\x1len, \x3\x3len
.endm
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
 *
 * First stage of the 4x4 coefficient transpose: interleaves the adjacent
 * register pairs (\x0, \x1) and (\x2, \x3) with trn1/trn2.
 *
 *   x0..x3        vector register names, updated in place
 *   x0len..x3len  arrangement suffix for each register; the trn operands
 *                 mix them, so all four have to be the same arrangement
 *   xi, xilen     scratch register and the suffix used to copy into it
 *
 * Every operand carries the suffix of the register it names, and \xi
 * carries the suffix of the register it was copied from.  \xi is clobbered.
 */
.macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
    mov             \xi\xilen, \x0\xilen
    trn1            \x0\x0len, \x0\x0len, \x1\x1len
    trn2            \x1\x1len, \xi\x0len, \x1\x1len
    mov             \xi\xilen, \x2\xilen
    trn1            \x2\x2len, \x2\x2len, \x3\x3len
    trn2            \x3\x3len, \xi\x2len, \x3\x3len
.endm
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * transpose_4x4 x0, x1, x2, x3, x5
 *
 * Transpose a 4x4 block of 16-bit elements held one row per register in the
 * low halves (.4h) of \x0..\x3.  The 16-bit stage interleaves rows 0/1 and
 * 2/3; the 32-bit stage then interleaves rows 0/2 and 1/3, which completes
 * the transpose.  \x5 is a scratch register and is clobbered.
 */
.macro transpose_4x4 x0, x1, x2, x3, x5
    transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b
    transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b
.endm
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
 *
 * Transpose an 8x8 block of 16-bit elements in place.
 *
 *   l0..l7  vector registers holding rows 0..7 (.8h); on exit they hold
 *           columns 0..7 of the original block
 *   t0..t3  scratch vector registers, clobbered
 *
 * The work is done in three interleave stages of increasing element width
 * (.8h, .4s, .2d).  The instruction order within a stage matters: several
 * instructions read an l/t register that a later one in the same stage
 * overwrites.
 */
.macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
    /* Stage 1: interleave 16-bit elements of adjacent row pairs. */
    trn1            \t0\().8h, \l0\().8h, \l1\().8h
    trn1            \t1\().8h, \l2\().8h, \l3\().8h
    trn1            \t2\().8h, \l4\().8h, \l5\().8h
    trn1            \t3\().8h, \l6\().8h, \l7\().8h
    trn2            \l1\().8h, \l0\().8h, \l1\().8h
    trn2            \l3\().8h, \l2\().8h, \l3\().8h
    trn2            \l5\().8h, \l4\().8h, \l5\().8h
    trn2            \l7\().8h, \l6\().8h, \l7\().8h

    /* Stage 2: interleave 32-bit pairs of the stage 1 results. */
    trn1            \l4\().4s, \t2\().4s, \t3\().4s
    trn2            \t3\().4s, \t2\().4s, \t3\().4s
    trn1            \t2\().4s, \t0\().4s, \t1\().4s
    trn2            \l2\().4s, \t0\().4s, \t1\().4s
    trn1            \t0\().4s, \l1\().4s, \l3\().4s
    trn2            \l3\().4s, \l1\().4s, \l3\().4s
    trn2            \t1\().4s, \l5\().4s, \l7\().4s
    trn1            \l5\().4s, \l5\().4s, \l7\().4s

    /* Stage 3: combine 64-bit halves into the final columns. */
    trn2            \l6\().2d, \l2\().2d, \t3\().2d
    trn1            \l0\().2d, \t2\().2d, \l4\().2d
    trn1            \l1\().2d, \t0\().2d, \l5\().2d
    trn2            \l7\().2d, \l3\().2d, \t1\().2d
    trn1            \l2\().2d, \l2\().2d, \t3\().2d
    trn2            \l4\().2d, \t2\().2d, \l4\().2d
    trn1            \l3\().2d, \l3\().2d, \t1\().2d
    trn2            \l5\().2d, \t0\().2d, \l5\().2d
.endm
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/* NOTE(review): presumably mirrors CENTERJSAMPLE from the C headers (the
 * midpoint of the 8-bit sample range) -- confirm the two stay in sync. */
#define CENTERJSAMPLE  128
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*****************************************************************************/
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
shun-iwasawa 82a8f5
 * Perform dequantization and inverse DCT on one block of coefficients.
shun-iwasawa 82a8f5
 *
shun-iwasawa 82a8f5
 * GLOBAL(void)
shun-iwasawa 82a8f5
 * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
shun-iwasawa 82a8f5
 *                       JSAMPARRAY output_buf, JDIMENSION output_col)
shun-iwasawa 82a8f5
 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
#define CONST_BITS  13
shun-iwasawa 82a8f5
#define PASS1_BITS  2
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
#define XFIX_P_0_298  v0.h[0]
shun-iwasawa 82a8f5
#define XFIX_N_0_390  v0.h[1]
shun-iwasawa 82a8f5
#define XFIX_P_0_541  v0.h[2]
shun-iwasawa 82a8f5
#define XFIX_P_0_765  v0.h[3]
shun-iwasawa 82a8f5
#define XFIX_N_0_899  v0.h[4]
shun-iwasawa 82a8f5
#define XFIX_P_1_175  v0.h[5]
shun-iwasawa 82a8f5
#define XFIX_P_1_501  v0.h[6]
shun-iwasawa 82a8f5
#define XFIX_N_1_847  v0.h[7]
shun-iwasawa 82a8f5
#define XFIX_N_1_961  v1.h[0]
shun-iwasawa 82a8f5
#define XFIX_P_2_053  v1.h[1]
shun-iwasawa 82a8f5
#define XFIX_N_2_562  v1.h[2]
shun-iwasawa 82a8f5
#define XFIX_P_3_072  v1.h[3]
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
asm_function jsimd_idct_islow_neon
shun-iwasawa 82a8f5
    DCT_TABLE       .req x0
shun-iwasawa 82a8f5
    COEF_BLOCK      .req x1
shun-iwasawa 82a8f5
    OUTPUT_BUF      .req x2
shun-iwasawa 82a8f5
    OUTPUT_COL      .req x3
shun-iwasawa 82a8f5
    TMP1            .req x0
shun-iwasawa 82a8f5
    TMP2            .req x1
shun-iwasawa 82a8f5
    TMP3            .req x9
shun-iwasawa 82a8f5
    TMP4            .req x10
shun-iwasawa 82a8f5
    TMP5            .req x11
shun-iwasawa 82a8f5
    TMP6            .req x12
shun-iwasawa 82a8f5
    TMP7            .req x13
shun-iwasawa 82a8f5
    TMP8            .req x14
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
shun-iwasawa 82a8f5
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
shun-iwasawa 82a8f5
       instruction ensures that those bits are set to zero. */
shun-iwasawa 82a8f5
    uxtw x3, w3
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    sub             sp, sp, #64
shun-iwasawa 82a8f5
    get_symbol_loc  x15, Ljsimd_idct_islow_neon_consts
shun-iwasawa 82a8f5
    mov             x10, sp
shun-iwasawa 82a8f5
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
shun-iwasawa 82a8f5
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
shun-iwasawa 82a8f5
    ld1             {v0.8h, v1.8h}, [x15]
shun-iwasawa 82a8f5
    ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
shun-iwasawa 82a8f5
    ld1             {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
shun-iwasawa 82a8f5
    ld1             {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64
shun-iwasawa 82a8f5
    ld1             {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    cmeq            v16.8h, v3.8h, #0
shun-iwasawa 82a8f5
    cmeq            v26.8h, v4.8h, #0
shun-iwasawa 82a8f5
    cmeq            v27.8h, v5.8h, #0
shun-iwasawa 82a8f5
    cmeq            v28.8h, v6.8h, #0
shun-iwasawa 82a8f5
    cmeq            v29.8h, v7.8h, #0
shun-iwasawa 82a8f5
    cmeq            v30.8h, v8.8h, #0
shun-iwasawa 82a8f5
    cmeq            v31.8h, v9.8h, #0
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    and             v10.16b, v16.16b, v26.16b
shun-iwasawa 82a8f5
    and             v11.16b, v27.16b, v28.16b
shun-iwasawa 82a8f5
    and             v12.16b, v29.16b, v30.16b
shun-iwasawa 82a8f5
    and             v13.16b, v31.16b, v10.16b
shun-iwasawa 82a8f5
    and             v14.16b, v11.16b, v12.16b
shun-iwasawa 82a8f5
    mul             v2.8h, v2.8h, v18.8h
shun-iwasawa 82a8f5
    and             v15.16b, v13.16b, v14.16b
shun-iwasawa 82a8f5
    shl             v10.8h, v2.8h, #(PASS1_BITS)
shun-iwasawa 82a8f5
    sqxtn           v16.8b, v15.8h
shun-iwasawa 82a8f5
    mov             TMP1, v16.d[0]
shun-iwasawa 82a8f5
    mvn             TMP2, TMP1
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    cbnz            TMP2, 2f
shun-iwasawa 82a8f5
    /* case all AC coeffs are zeros */
shun-iwasawa 82a8f5
    dup             v2.2d, v10.d[0]
shun-iwasawa 82a8f5
    dup             v6.2d, v10.d[1]
shun-iwasawa 82a8f5
    mov             v3.16b, v2.16b
shun-iwasawa 82a8f5
    mov             v7.16b, v6.16b
shun-iwasawa 82a8f5
    mov             v4.16b, v2.16b
shun-iwasawa 82a8f5
    mov             v8.16b, v6.16b
shun-iwasawa 82a8f5
    mov             v5.16b, v2.16b
shun-iwasawa 82a8f5
    mov             v9.16b, v6.16b
shun-iwasawa 82a8f5
1:
shun-iwasawa 82a8f5
    /* for this transpose, we should organise data like this:
shun-iwasawa 82a8f5
     * 00, 01, 02, 03, 40, 41, 42, 43
shun-iwasawa 82a8f5
     * 10, 11, 12, 13, 50, 51, 52, 53
shun-iwasawa 82a8f5
     * 20, 21, 22, 23, 60, 61, 62, 63
shun-iwasawa 82a8f5
     * 30, 31, 32, 33, 70, 71, 72, 73
shun-iwasawa 82a8f5
     * 04, 05, 06, 07, 44, 45, 46, 47
shun-iwasawa 82a8f5
     * 14, 15, 16, 17, 54, 55, 56, 57
shun-iwasawa 82a8f5
     * 24, 25, 26, 27, 64, 65, 66, 67
shun-iwasawa 82a8f5
     * 34, 35, 36, 37, 74, 75, 76, 77
shun-iwasawa 82a8f5
     */
shun-iwasawa 82a8f5
    trn1            v28.8h, v2.8h, v3.8h
shun-iwasawa 82a8f5
    trn1            v29.8h, v4.8h, v5.8h
shun-iwasawa 82a8f5
    trn1            v30.8h, v6.8h, v7.8h
shun-iwasawa 82a8f5
    trn1            v31.8h, v8.8h, v9.8h
shun-iwasawa 82a8f5
    trn2            v16.8h, v2.8h, v3.8h
shun-iwasawa 82a8f5
    trn2            v17.8h, v4.8h, v5.8h
shun-iwasawa 82a8f5
    trn2            v18.8h, v6.8h, v7.8h
shun-iwasawa 82a8f5
    trn2            v19.8h, v8.8h, v9.8h
shun-iwasawa 82a8f5
    trn1            v2.4s, v28.4s, v29.4s
shun-iwasawa 82a8f5
    trn1            v6.4s, v30.4s, v31.4s
shun-iwasawa 82a8f5
    trn1            v3.4s, v16.4s, v17.4s
shun-iwasawa 82a8f5
    trn1            v7.4s, v18.4s, v19.4s
shun-iwasawa 82a8f5
    trn2            v4.4s, v28.4s, v29.4s
shun-iwasawa 82a8f5
    trn2            v8.4s, v30.4s, v31.4s
shun-iwasawa 82a8f5
    trn2            v5.4s, v16.4s, v17.4s
shun-iwasawa 82a8f5
    trn2            v9.4s, v18.4s, v19.4s
shun-iwasawa 82a8f5
    /* Even part: reverse the even part of the forward DCT. */
shun-iwasawa 82a8f5
    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
shun-iwasawa 82a8f5
    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
shun-iwasawa 82a8f5
    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
shun-iwasawa 82a8f5
    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
shun-iwasawa 82a8f5
    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
shun-iwasawa 82a8f5
    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
shun-iwasawa 82a8f5
    mov             v21.16b, v19.16b               /* tmp3 = z1 */
shun-iwasawa 82a8f5
    mov             v20.16b, v18.16b               /* tmp3 = z1 */
shun-iwasawa 82a8f5
    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
shun-iwasawa 82a8f5
    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
shun-iwasawa 82a8f5
    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
shun-iwasawa 82a8f5
    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
shun-iwasawa 82a8f5
    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
shun-iwasawa 82a8f5
    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
shun-iwasawa 82a8f5
    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
shun-iwasawa 82a8f5
    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
shun-iwasawa 82a8f5
    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
shun-iwasawa 82a8f5
    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
shun-iwasawa 82a8f5
    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
shun-iwasawa 82a8f5
    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
shun-iwasawa 82a8f5
    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
shun-iwasawa 82a8f5
    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
shun-iwasawa 82a8f5
    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Odd part per figure 8; the matrix is unitary and hence its
shun-iwasawa 82a8f5
     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
shun-iwasawa 82a8f5
     */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
shun-iwasawa 82a8f5
    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
shun-iwasawa 82a8f5
    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
shun-iwasawa 82a8f5
    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
shun-iwasawa 82a8f5
    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
shun-iwasawa 82a8f5
    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
shun-iwasawa 82a8f5
    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
shun-iwasawa 82a8f5
    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
shun-iwasawa 82a8f5
    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
shun-iwasawa 82a8f5
    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
shun-iwasawa 82a8f5
    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
shun-iwasawa 82a8f5
    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
shun-iwasawa 82a8f5
    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
shun-iwasawa 82a8f5
    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
shun-iwasawa 82a8f5
    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
shun-iwasawa 82a8f5
    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
shun-iwasawa 82a8f5
    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
shun-iwasawa 82a8f5
    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
shun-iwasawa 82a8f5
    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
shun-iwasawa 82a8f5
    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
shun-iwasawa 82a8f5
    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
shun-iwasawa 82a8f5
    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
shun-iwasawa 82a8f5
    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
shun-iwasawa 82a8f5
    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
shun-iwasawa 82a8f5
    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
shun-iwasawa 82a8f5
    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
shun-iwasawa 82a8f5
    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
shun-iwasawa 82a8f5
    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
shun-iwasawa 82a8f5
    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
shun-iwasawa 82a8f5
    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
shun-iwasawa 82a8f5
    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
shun-iwasawa 82a8f5
    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
shun-iwasawa 82a8f5
    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
shun-iwasawa 82a8f5
    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
shun-iwasawa 82a8f5
    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
shun-iwasawa 82a8f5
    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
shun-iwasawa 82a8f5
    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
shun-iwasawa 82a8f5
    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    add             v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
shun-iwasawa 82a8f5
    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
shun-iwasawa 82a8f5
    sub             v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
shun-iwasawa 82a8f5
    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
shun-iwasawa 82a8f5
    add             v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
shun-iwasawa 82a8f5
    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
shun-iwasawa 82a8f5
    sub             v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
shun-iwasawa 82a8f5
    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
shun-iwasawa 82a8f5
    add             v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
shun-iwasawa 82a8f5
    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
shun-iwasawa 82a8f5
    sub             v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
shun-iwasawa 82a8f5
    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
shun-iwasawa 82a8f5
    add             v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
shun-iwasawa 82a8f5
    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
shun-iwasawa 82a8f5
    sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
shun-iwasawa 82a8f5
    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    shrn            v2.4h, v18.4s, #16  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
shun-iwasawa 82a8f5
    shrn            v9.4h, v20.4s, #16  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
shun-iwasawa 82a8f5
    shrn            v3.4h, v22.4s, #16  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
shun-iwasawa 82a8f5
    shrn            v8.4h, v24.4s, #16  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
shun-iwasawa 82a8f5
    shrn            v4.4h, v26.4s, #16  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
shun-iwasawa 82a8f5
    shrn            v7.4h, v28.4s, #16  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
shun-iwasawa 82a8f5
    shrn            v5.4h, v14.4s, #16  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
shun-iwasawa 82a8f5
    shrn            v6.4h, v16.4s, #16  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
shun-iwasawa 82a8f5
    shrn2           v2.8h, v19.4s, #16  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
shun-iwasawa 82a8f5
    shrn2           v9.8h, v21.4s, #16  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
shun-iwasawa 82a8f5
    shrn2           v3.8h, v23.4s, #16  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
shun-iwasawa 82a8f5
    shrn2           v8.8h, v25.4s, #16  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
shun-iwasawa 82a8f5
    shrn2           v4.8h, v27.4s, #16  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
shun-iwasawa 82a8f5
    shrn2           v7.8h, v29.4s, #16  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
shun-iwasawa 82a8f5
    shrn2           v5.8h, v15.4s, #16  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
shun-iwasawa 82a8f5
    shrn2           v6.8h, v17.4s, #16  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
shun-iwasawa 82a8f5
    movi            v0.16b, #(CENTERJSAMPLE)
shun-iwasawa 82a8f5
    /* Prepare pointers (dual-issue with Neon instructions) */
shun-iwasawa 82a8f5
      ldp             TMP1, TMP2, [OUTPUT_BUF], 16
shun-iwasawa 82a8f5
    sqrshrn         v28.8b, v2.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
shun-iwasawa 82a8f5
      ldp             TMP3, TMP4, [OUTPUT_BUF], 16
shun-iwasawa 82a8f5
    sqrshrn         v29.8b, v3.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
shun-iwasawa 82a8f5
      add             TMP1, TMP1, OUTPUT_COL
shun-iwasawa 82a8f5
    sqrshrn         v30.8b, v4.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
shun-iwasawa 82a8f5
      add             TMP2, TMP2, OUTPUT_COL
shun-iwasawa 82a8f5
    sqrshrn         v31.8b, v5.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
shun-iwasawa 82a8f5
      add             TMP3, TMP3, OUTPUT_COL
shun-iwasawa 82a8f5
    sqrshrn2        v28.16b, v6.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
shun-iwasawa 82a8f5
      add             TMP4, TMP4, OUTPUT_COL
shun-iwasawa 82a8f5
    sqrshrn2        v29.16b, v7.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
shun-iwasawa 82a8f5
      ldp             TMP5, TMP6, [OUTPUT_BUF], 16
shun-iwasawa 82a8f5
    sqrshrn2        v30.16b, v8.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
shun-iwasawa 82a8f5
      ldp             TMP7, TMP8, [OUTPUT_BUF], 16
shun-iwasawa 82a8f5
    sqrshrn2        v31.16b, v9.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
shun-iwasawa 82a8f5
      add             TMP5, TMP5, OUTPUT_COL
shun-iwasawa 82a8f5
    add             v16.16b, v28.16b, v0.16b
shun-iwasawa 82a8f5
      add             TMP6, TMP6, OUTPUT_COL
shun-iwasawa 82a8f5
    add             v18.16b, v29.16b, v0.16b
shun-iwasawa 82a8f5
      add             TMP7, TMP7, OUTPUT_COL
shun-iwasawa 82a8f5
    add             v20.16b, v30.16b, v0.16b
shun-iwasawa 82a8f5
      add             TMP8, TMP8, OUTPUT_COL
shun-iwasawa 82a8f5
    add             v22.16b, v31.16b, v0.16b
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Transpose the final 8-bit samples */
shun-iwasawa 82a8f5
    trn1            v28.16b, v16.16b, v18.16b
shun-iwasawa 82a8f5
    trn1            v30.16b, v20.16b, v22.16b
shun-iwasawa 82a8f5
    trn2            v29.16b, v16.16b, v18.16b
shun-iwasawa 82a8f5
    trn2            v31.16b, v20.16b, v22.16b
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    trn1            v16.8h, v28.8h, v30.8h
shun-iwasawa 82a8f5
    trn2            v18.8h, v28.8h, v30.8h
shun-iwasawa 82a8f5
    trn1            v20.8h, v29.8h, v31.8h
shun-iwasawa 82a8f5
    trn2            v22.8h, v29.8h, v31.8h
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    uzp1            v28.4s, v16.4s, v18.4s
shun-iwasawa 82a8f5
    uzp2            v30.4s, v16.4s, v18.4s
shun-iwasawa 82a8f5
    uzp1            v29.4s, v20.4s, v22.4s
shun-iwasawa 82a8f5
    uzp2            v31.4s, v20.4s, v22.4s
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Store results to the output buffer.
     * TMP1..TMP8 are the eight row pointers loaded from OUTPUT_BUF above, each
     * already advanced by OUTPUT_COL.  Each row receives one 64-bit lane
     * (eight 8-bit samples) of the transposed result held in v28-v31.
     */
    st1             {v28.d}[0], [TMP1]
    st1             {v29.d}[0], [TMP2]
    st1             {v28.d}[1], [TMP3]
    st1             {v29.d}[1], [TMP4]
    st1             {v30.d}[0], [TMP5]
    st1             {v31.d}[0], [TMP6]
    st1             {v30.d}[1], [TMP7]
    st1             {v31.d}[1], [TMP8]
    /* Pop 64 bytes from the stack, reloading the low 64 bits of v8-v15 (the
     * part of those registers that AAPCS64 makes callee-saved).
     * NOTE(review): presumably pushed by the prologue, which is outside this
     * excerpt -- confirm the two sides stay in sync.
     */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
    /* Return through x30.  "ret" is used instead of "blr x30": blr is a call
     * (it rewrites x30 and unbalances the return-address predictor), and as an
     * indirect call it requires a BTI landing pad at the target, which a
     * return site is not.
     */
    ret
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
.balign 16
shun-iwasawa 82a8f5
2:
    /* Pass 1, sparse-coefficient entry.
     *
     * Rows v3-v9 are multiplied lane-wise by v19-v25 (the DEQUANTIZE step the
     * comments below refer to).  Interleaved with that, TMP2 is split into its
     * two 32-bit halves:
     *   TMP4 = TMP2 << 32   (non-zero iff the low  32 bits of TMP2 are set)
     *   TMP3 = TMP2 >> 32   (non-zero iff the high 32 bits of TMP2 are set)
     * "adds" also sets the flags from TMP3; the Neon "mul"s that follow do not
     * touch the flags, so b.ne still tests TMP3.
     * NOTE(review): TMP2 is produced before this label -- presumably an OR of
     * the AC coefficients, low half = left 4 columns, high half = right 4
     * columns; confirm against the code above.
     */
    mul             v3.8h, v3.8h, v19.8h
    mul             v4.8h, v4.8h, v20.8h
    mul             v5.8h, v5.8h, v21.8h
    add             TMP4, xzr, TMP2, LSL #32
    mul             v6.8h, v6.8h, v22.8h
    mul             v7.8h, v7.8h, v23.8h
    adds            TMP3, xzr, TMP2, LSR #32
    mul             v8.8h, v8.8h, v24.8h
    mul             v9.8h, v9.8h, v25.8h
    b.ne            3f
    /* Right AC coef is zero: only the low four lanes (.4h) of each row go
     * through the full 1-D IDCT below.
     * Capture v10.d[1] now, before v10 is reused for tmp0 in the odd part; it
     * is copied into v6-v9 at the end.
     * NOTE(review): v10 is set up before this label -- presumably the DC-only
     * pass-1 result; confirm.
     */
    dup             v15.2d, v10.d[1]
    /* Even part: reverse the even part of the forward DCT. */
    add             v18.4h, v4.4h, v8.4h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
    add             v22.4h, v2.4h, v6.4h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    sub             v26.4h, v2.4h, v6.4h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    mov             v20.16b, v18.16b               /* tmp3 = z1 */
    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */

    /* Odd part per figure 8; the matrix is unitary and hence its
     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
     */

    add             v22.4h, v9.4h, v5.4h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add             v24.4h, v7.4h, v3.4h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add             v18.4h, v9.4h, v3.4h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add             v20.4h, v7.4h, v5.4h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add             v26.4h, v22.4h, v24.4h  /* z5 = z3 + z4 */

    /* Widening multiplies: 16-bit lanes -> 32-bit products. */
    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */

    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */

    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */

    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */

    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */

    add             v18.4s, v2.4s, v16.4s  /* tmp10 + tmp3 */
    sub             v20.4s, v2.4s, v16.4s  /* tmp10 - tmp3 */
    add             v22.4s, v8.4s, v14.4s  /* tmp11 + tmp2 */
    sub             v24.4s, v8.4s, v14.4s  /* tmp11 - tmp2 */
    add             v26.4s, v4.4s, v12.4s  /* tmp12 + tmp1 */
    sub             v28.4s, v4.4s, v12.4s  /* tmp12 - tmp1 */
    add             v14.4s, v6.4s, v10.4s  /* tmp13 + tmp0 */
    sub             v16.4s, v6.4s, v10.4s  /* tmp13 - tmp0 */

    /* Rounding narrow back to 16 bits.  Outputs 0-3 land in the low halves of
     * v2-v5 and outputs 4-7 in the high halves of the same registers.
     */
    rshrn           v2.4h, v18.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
    rshrn           v3.4h, v22.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
    rshrn           v4.4h, v26.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
    rshrn           v5.4h, v14.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2          v2.8h, v16.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2          v3.8h, v28.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
    rshrn2          v4.8h, v24.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
    rshrn2          v5.8h, v20.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
    /* The remaining four registers all receive the value captured from
     * v10.d[1] at the top of this path.
     */
    mov             v6.16b, v15.16b
    mov             v7.16b, v15.16b
    mov             v8.16b, v15.16b
    mov             v9.16b, v15.16b
    b               1b
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
.balign 16
shun-iwasawa 82a8f5
3:
    /* Pass 1, reached from label 2 when TMP3 (TMP2 >> 32) is non-zero.
     * TMP4 (TMP2 << 32) non-zero as well means neither half can be skipped:
     * go to the full path at 4.
     */
    cbnz            TMP4, 4f
    /* Left AC coef is zero: only the high four lanes of each row go through
     * the full 1-D IDCT below, using the "2" (upper-half) instruction forms.
     * v14 = v10.d[0] broadcast to both halves; it is copied into v2-v5 at the
     * end.
     * NOTE(review): v10 is set up before this label -- presumably the DC-only
     * pass-1 result; confirm.
     */
    dup             v14.2d, v10.d[0]
    /* Even part: reverse the even part of the forward DCT. */
    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    mov             v21.16b, v19.16b               /* tmp3 = z1 */
    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */

    /* Odd part per figure 8; the matrix is unitary and hence its
     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
     */

    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */

    /* Widening multiplies on the upper halves only (16-bit -> 32-bit). */
    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */

    /* Only the high-half accumulators (v23, v25) exist on this path.  The
     * low-half counterparts (v22, v24, v26) still hold un-widened 16-bit
     * lanes and are never read again before the branch, so no low-half
     * "z3 += z5" / "z4 += z5" is performed.
     */
    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */

    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */

    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */

    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */

    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */

    /* v2-v5 all receive the value captured from v10.d[0] at the top of this
     * path.
     */
    mov             v2.16b, v14.16b
    mov             v3.16b, v14.16b
    mov             v4.16b, v14.16b
    mov             v5.16b, v14.16b
    /* Rounding narrow back to 16 bits.  Outputs 0-3 land in the low halves of
     * v6-v9 and outputs 4-7 in the high halves of the same registers.
     */
    rshrn           v6.4h, v19.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
    rshrn           v7.4h, v23.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
    rshrn           v8.4h, v27.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
    rshrn           v9.4h, v15.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2          v6.8h, v17.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2          v7.8h, v29.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
    rshrn2          v8.8h, v25.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
    rshrn2          v9.8h, v21.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
    b               1b
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
.balign 16
shun-iwasawa 82a8f5
4:
shun-iwasawa 82a8f5
    /* "No" AC coef is zero */
shun-iwasawa 82a8f5
    /* Even part: reverse the even part of the forward DCT. */
shun-iwasawa 82a8f5
    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
shun-iwasawa 82a8f5
    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
shun-iwasawa 82a8f5
    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
shun-iwasawa 82a8f5
    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
shun-iwasawa 82a8f5
    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
shun-iwasawa 82a8f5
    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
shun-iwasawa 82a8f5
    mov             v21.16b, v19.16b               /* tmp3 = z1 */
shun-iwasawa 82a8f5
    mov             v20.16b, v18.16b               /* tmp3 = z1 */
shun-iwasawa 82a8f5
    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
shun-iwasawa 82a8f5
    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
shun-iwasawa 82a8f5
    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
shun-iwasawa 82a8f5
    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
shun-iwasawa 82a8f5
    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
shun-iwasawa 82a8f5
    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
shun-iwasawa 82a8f5
    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
shun-iwasawa 82a8f5
    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
shun-iwasawa 82a8f5
    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
shun-iwasawa 82a8f5
    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
shun-iwasawa 82a8f5
    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
shun-iwasawa 82a8f5
    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
shun-iwasawa 82a8f5
    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
shun-iwasawa 82a8f5
    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
shun-iwasawa 82a8f5
    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Odd part per figure 8; the matrix is unitary and hence its
shun-iwasawa 82a8f5
     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
shun-iwasawa 82a8f5
     */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
shun-iwasawa 82a8f5
    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
shun-iwasawa 82a8f5
    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
shun-iwasawa 82a8f5
    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
shun-iwasawa 82a8f5
    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
shun-iwasawa 82a8f5
    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
shun-iwasawa 82a8f5
    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
shun-iwasawa 82a8f5
    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
shun-iwasawa 82a8f5
    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
shun-iwasawa 82a8f5
    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
shun-iwasawa 82a8f5
    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
shun-iwasawa 82a8f5
    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
shun-iwasawa 82a8f5
    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
shun-iwasawa 82a8f5
    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
shun-iwasawa 82a8f5
    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
shun-iwasawa 82a8f5
    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
shun-iwasawa 82a8f5
    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
shun-iwasawa 82a8f5
    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
shun-iwasawa 82a8f5
    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
shun-iwasawa 82a8f5
    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
shun-iwasawa 82a8f5
    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
shun-iwasawa 82a8f5
    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
shun-iwasawa 82a8f5
    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
shun-iwasawa 82a8f5
    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
shun-iwasawa 82a8f5
    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
shun-iwasawa 82a8f5
    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
shun-iwasawa 82a8f5
    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
shun-iwasawa 82a8f5
    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
shun-iwasawa 82a8f5
    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
shun-iwasawa 82a8f5
    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
shun-iwasawa 82a8f5
    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
shun-iwasawa 82a8f5
    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
shun-iwasawa 82a8f5
    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
shun-iwasawa 82a8f5
    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
shun-iwasawa 82a8f5
    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
shun-iwasawa 82a8f5
    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
shun-iwasawa 82a8f5
    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
shun-iwasawa 82a8f5
    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    add             v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
shun-iwasawa 82a8f5
    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
shun-iwasawa 82a8f5
    sub             v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
shun-iwasawa 82a8f5
    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
shun-iwasawa 82a8f5
    add             v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
shun-iwasawa 82a8f5
    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
shun-iwasawa 82a8f5
    sub             v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
shun-iwasawa 82a8f5
    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
shun-iwasawa 82a8f5
    add             v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
shun-iwasawa 82a8f5
    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
shun-iwasawa 82a8f5
    sub             v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
shun-iwasawa 82a8f5
    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
shun-iwasawa 82a8f5
    add             v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
shun-iwasawa 82a8f5
    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
shun-iwasawa 82a8f5
    sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
shun-iwasawa 82a8f5
    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    rshrn           v2.4h, v18.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
shun-iwasawa 82a8f5
    rshrn           v3.4h, v22.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
shun-iwasawa 82a8f5
    rshrn           v4.4h, v26.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
shun-iwasawa 82a8f5
    rshrn           v5.4h, v14.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
shun-iwasawa 82a8f5
    rshrn           v6.4h, v19.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
shun-iwasawa 82a8f5
    rshrn           v7.4h, v23.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
shun-iwasawa 82a8f5
    rshrn           v8.4h, v27.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
shun-iwasawa 82a8f5
    rshrn           v9.4h, v15.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
shun-iwasawa 82a8f5
    rshrn2          v2.8h, v16.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
shun-iwasawa 82a8f5
    rshrn2          v3.8h, v28.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
shun-iwasawa 82a8f5
    rshrn2          v4.8h, v24.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
shun-iwasawa 82a8f5
    rshrn2          v5.8h, v20.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
shun-iwasawa 82a8f5
    rshrn2          v6.8h, v17.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
shun-iwasawa 82a8f5
    rshrn2          v7.8h, v29.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
shun-iwasawa 82a8f5
    rshrn2          v8.8h, v25.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
shun-iwasawa 82a8f5
    rshrn2          v9.8h, v21.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
shun-iwasawa 82a8f5
    b               1b
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    .unreq          DCT_TABLE
shun-iwasawa 82a8f5
    .unreq          COEF_BLOCK
shun-iwasawa 82a8f5
    .unreq          OUTPUT_BUF
shun-iwasawa 82a8f5
    .unreq          OUTPUT_COL
shun-iwasawa 82a8f5
    .unreq          TMP1
shun-iwasawa 82a8f5
    .unreq          TMP2
shun-iwasawa 82a8f5
    .unreq          TMP3
shun-iwasawa 82a8f5
    .unreq          TMP4
shun-iwasawa 82a8f5
    .unreq          TMP5
shun-iwasawa 82a8f5
    .unreq          TMP6
shun-iwasawa 82a8f5
    .unreq          TMP7
shun-iwasawa 82a8f5
    .unreq          TMP8
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
#undef CENTERJSAMPLE
shun-iwasawa 82a8f5
#undef CONST_BITS
shun-iwasawa 82a8f5
#undef PASS1_BITS
shun-iwasawa 82a8f5
#undef XFIX_P_0_298
shun-iwasawa 82a8f5
#undef XFIX_N_0_390
shun-iwasawa 82a8f5
#undef XFIX_P_0_541
shun-iwasawa 82a8f5
#undef XFIX_P_0_765
shun-iwasawa 82a8f5
#undef XFIX_N_0_899
shun-iwasawa 82a8f5
#undef XFIX_P_1_175
shun-iwasawa 82a8f5
#undef XFIX_P_1_501
shun-iwasawa 82a8f5
#undef XFIX_N_1_847
shun-iwasawa 82a8f5
#undef XFIX_N_1_961
shun-iwasawa 82a8f5
#undef XFIX_P_2_053
shun-iwasawa 82a8f5
#undef XFIX_N_2_562
shun-iwasawa 82a8f5
#undef XFIX_P_3_072
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*****************************************************************************/
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
shun-iwasawa 82a8f5
 * jsimd_idct_ifast_neon
shun-iwasawa 82a8f5
 *
shun-iwasawa 82a8f5
 * This function contains a fast, not so accurate integer implementation of
shun-iwasawa 82a8f5
 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
shun-iwasawa 82a8f5
 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
shun-iwasawa 82a8f5
 * function from jidctfst.c
shun-iwasawa 82a8f5
 *
shun-iwasawa 82a8f5
 * Normally a 1-D AAN DCT needs 5 multiplications and 29 additions.
 * In the Arm Neon case, however, some extra additions are required because
 * the SQDMULH instruction (VQDMULH in AArch32) can't handle constants larger
 * than 1. Expressions like "x * 1.082392200" therefore have to be converted
 * to "x * 0.082392200 + x", which introduces an extra addition. Overall,
 * there are 6 extra additions per 1-D IDCT pass, for a total of 5 SQDMULH
 * and 35 ADD/SUB instructions.
shun-iwasawa 82a8f5
 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * Lanes of v0 that hold the four AAN multipliers once the constants have been
 * loaded from Ljsimd_idct_ifast_neon_consts.  Every sqdmulh below is followed
 * by an add of the original operand (twice for XFIX_2_613125930), so the lanes
 * are expected to hold only the fractional part of each constant; the table
 * itself is defined elsewhere in this file.
 */
#define XFIX_1_082392200  v0.h[0]
#define XFIX_1_414213562  v0.h[1]
#define XFIX_1_847759065  v0.h[2]
#define XFIX_2_613125930  v0.h[3]

/*
 * In:    x0 = DCT_TABLE   (dequantization multipliers, read as 8 rows x 8 halfwords)
 *        x1 = COEF_BLOCK  (coefficients, read as 8 rows x 8 halfwords)
 *        x2 = OUTPUT_BUF  (array of 8 output row pointers)
 *        w3 = OUTPUT_COL  (byte offset added to every row pointer)
 * Out:   8 bytes stored at each of the 8 row pointers + OUTPUT_COL
 * Clobb: x0-x3, x9-x14, v0-v6, v16-v23, v28-v31
 * Stack: none
 */
asm_function jsimd_idct_ifast_neon

    DCT_TABLE       .req x0
    COEF_BLOCK      .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_COL      .req x3
    /* TMP1/TMP2 alias DCT_TABLE/COEF_BLOCK; they are only written after the
       last load through those pointers. */
    TMP1            .req x0
    TMP2            .req x1
    TMP3            .req x9
    TMP4            .req x10
    TMP5            .req x11
    TMP6            .req x12
    TMP7            .req x13
    TMP8            .req x14

    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x3, w3

    /* Load and dequantize coefficients into Neon registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d16     | d17     ( v16.8h )
     *   1 | d18     | d19     ( v17.8h )
     *   2 | d20     | d21     ( v18.8h )
     *   3 | d22     | d23     ( v19.8h )
     *   4 | d24     | d25     ( v20.8h )
     *   5 | d26     | d27     ( v21.8h )
     *   6 | d28     | d29     ( v22.8h )
     *   7 | d30     | d31     ( v23.8h )
     */
    /* Every Neon register named in this function (v0-v6, v16-v23, v28-v31) is
       caller-saved under AAPCS64, so nothing has to be saved or restored.
       TMP5 temporarily holds the address of the constants table. */
    get_symbol_loc  TMP5, Ljsimd_idct_ifast_neon_consts
    /* Coefficient rows and table rows are loaded two at a time; the
       multiplies are interleaved with the loads.  v0-v3 are recycled as the
       table row buffers. */
    ld1             {v16.8h, v17.8h}, [COEF_BLOCK], 32
    ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
    ld1             {v18.8h, v19.8h}, [COEF_BLOCK], 32
    mul             v16.8h, v16.8h, v0.8h
    ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
    mul             v17.8h, v17.8h, v1.8h
    ld1             {v20.8h, v21.8h}, [COEF_BLOCK], 32
    mul             v18.8h, v18.8h, v2.8h
    ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
    mul             v19.8h, v19.8h, v3.8h
    ld1             {v22.8h, v23.8h}, [COEF_BLOCK], 32
    mul             v20.8h, v20.8h, v0.8h
    ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
    mul             v22.8h, v22.8h, v2.8h
    /* v21 must be multiplied before v0/v1 are reused: the next ld1 replaces
       the low half of v0 with the constants. */
    mul             v21.8h, v21.8h, v1.8h
    ld1             {v0.4h}, [TMP5]        /* load constants */
    mul             v23.8h, v23.8h, v3.8h

    /* 1-D IDCT, pass 1
     * Operates on v16-v23 in place, using v1-v6 as scratch.  Each
     * "sqdmulh t, x, XFIX_n" + "add ..., x, t" pair forms x * n (see the
     * XFIX_* definitions above). */
    sub             v2.8h, v18.8h, v22.8h
    add             v22.8h, v18.8h, v22.8h
    sub             v1.8h, v19.8h, v21.8h
    add             v21.8h, v19.8h, v21.8h
    sub             v5.8h, v17.8h, v23.8h
    add             v23.8h, v17.8h, v23.8h
    sqdmulh         v4.8h, v2.8h, XFIX_1_414213562
    sqdmulh         v6.8h, v1.8h, XFIX_2_613125930
    add             v3.8h, v1.8h, v1.8h
    sub             v1.8h, v5.8h, v1.8h
    add             v18.8h, v2.8h, v4.8h
    sqdmulh         v4.8h, v1.8h, XFIX_1_847759065
    sub             v2.8h, v23.8h, v21.8h
    add             v3.8h, v3.8h, v6.8h
    sqdmulh         v6.8h, v2.8h, XFIX_1_414213562
    add             v1.8h, v1.8h, v4.8h
    sqdmulh         v4.8h, v5.8h, XFIX_1_082392200
    sub             v18.8h, v18.8h, v22.8h
    add             v2.8h, v2.8h, v6.8h
    sub             v6.8h, v16.8h, v20.8h
    add             v20.8h, v16.8h, v20.8h
    add             v17.8h, v5.8h, v4.8h
    add             v5.8h, v6.8h, v18.8h
    sub             v18.8h, v6.8h, v18.8h
    add             v6.8h, v23.8h, v21.8h
    add             v16.8h, v20.8h, v22.8h
    sub             v3.8h, v6.8h, v3.8h
    sub             v20.8h, v20.8h, v22.8h
    sub             v3.8h, v3.8h, v1.8h
    sub             v1.8h, v17.8h, v1.8h
    add             v2.8h, v3.8h, v2.8h
    sub             v23.8h, v16.8h, v6.8h
    add             v1.8h, v1.8h, v2.8h
    add             v16.8h, v16.8h, v6.8h
    add             v22.8h, v5.8h, v3.8h
    sub             v17.8h, v5.8h, v3.8h
    sub             v21.8h, v18.8h, v2.8h
    add             v18.8h, v18.8h, v2.8h
    sub             v19.8h, v20.8h, v1.8h
    add             v20.8h, v20.8h, v1.8h
    /* Transpose so that pass 2 can reuse the same instruction sequence;
       v28-v31 are scratch registers for the macro. */
    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30, v31
    /* 1-D IDCT, pass 2 (same instruction sequence as pass 1) */
    sub             v2.8h, v18.8h, v22.8h
    add             v22.8h, v18.8h, v22.8h
    sub             v1.8h, v19.8h, v21.8h
    add             v21.8h, v19.8h, v21.8h
    sub             v5.8h, v17.8h, v23.8h
    add             v23.8h, v17.8h, v23.8h
    sqdmulh         v4.8h, v2.8h, XFIX_1_414213562
    sqdmulh         v6.8h, v1.8h, XFIX_2_613125930
    add             v3.8h, v1.8h, v1.8h
    sub             v1.8h, v5.8h, v1.8h
    add             v18.8h, v2.8h, v4.8h
    sqdmulh         v4.8h, v1.8h, XFIX_1_847759065
    sub             v2.8h, v23.8h, v21.8h
    add             v3.8h, v3.8h, v6.8h
    sqdmulh         v6.8h, v2.8h, XFIX_1_414213562
    add             v1.8h, v1.8h, v4.8h
    sqdmulh         v4.8h, v5.8h, XFIX_1_082392200
    sub             v18.8h, v18.8h, v22.8h
    add             v2.8h, v2.8h, v6.8h
    sub             v6.8h, v16.8h, v20.8h
    add             v20.8h, v16.8h, v20.8h
    add             v17.8h, v5.8h, v4.8h
    add             v5.8h, v6.8h, v18.8h
    sub             v18.8h, v6.8h, v18.8h
    add             v6.8h, v23.8h, v21.8h
    add             v16.8h, v20.8h, v22.8h
    sub             v3.8h, v6.8h, v3.8h
    sub             v20.8h, v20.8h, v22.8h
    sub             v3.8h, v3.8h, v1.8h
    sub             v1.8h, v17.8h, v1.8h
    add             v2.8h, v3.8h, v2.8h
    sub             v23.8h, v16.8h, v6.8h
    add             v1.8h, v1.8h, v2.8h
    add             v16.8h, v16.8h, v6.8h
    add             v22.8h, v5.8h, v3.8h
    sub             v17.8h, v5.8h, v3.8h
    sub             v21.8h, v18.8h, v2.8h
    add             v18.8h, v18.8h, v2.8h
    sub             v19.8h, v20.8h, v1.8h
    add             v20.8h, v20.8h, v1.8h
    /* Descale to 8-bit and range limit
     * The multiplier constants in v0 are dead after the last sqdmulh, so v0
     * is reused for the 0x80 bias.  sqshrn shifts right by 5 and saturates to
     * signed 8 bits; adding 0x80 to every byte then maps [-128, 127] onto
     * [0, 255]. */
    movi            v0.16b, #0x80
      /* Prepare pointers (dual-issue with Neon instructions) */
      ldp             TMP1, TMP2, [OUTPUT_BUF], 16
    sqshrn          v28.8b, v16.8h, #5
      ldp             TMP3, TMP4, [OUTPUT_BUF], 16
    sqshrn          v29.8b, v17.8h, #5
      add             TMP1, TMP1, OUTPUT_COL
    sqshrn          v30.8b, v18.8h, #5
      add             TMP2, TMP2, OUTPUT_COL
    sqshrn          v31.8b, v19.8h, #5
      add             TMP3, TMP3, OUTPUT_COL
    sqshrn2         v28.16b, v20.8h, #5
      add             TMP4, TMP4, OUTPUT_COL
    sqshrn2         v29.16b, v21.8h, #5
      /* TMP5 held the constants address earlier; it is free again here. */
      ldp             TMP5, TMP6, [OUTPUT_BUF], 16
    sqshrn2         v30.16b, v22.8h, #5
      ldp             TMP7, TMP8, [OUTPUT_BUF], 16
    sqshrn2         v31.16b, v23.8h, #5
      add             TMP5, TMP5, OUTPUT_COL
    add             v16.16b, v28.16b, v0.16b
      add             TMP6, TMP6, OUTPUT_COL
    add             v18.16b, v29.16b, v0.16b
      add             TMP7, TMP7, OUTPUT_COL
    add             v20.16b, v30.16b, v0.16b
      add             TMP8, TMP8, OUTPUT_COL
    add             v22.16b, v31.16b, v0.16b

    /* Transpose the final 8-bit samples */
    trn1            v28.16b, v16.16b, v18.16b
    trn1            v30.16b, v20.16b, v22.16b
    trn2            v29.16b, v16.16b, v18.16b
    trn2            v31.16b, v20.16b, v22.16b

    trn1            v16.8h, v28.8h, v30.8h
    trn2            v18.8h, v28.8h, v30.8h
    trn1            v20.8h, v29.8h, v31.8h
    trn2            v22.8h, v29.8h, v31.8h

    uzp1            v28.4s, v16.4s, v18.4s
    uzp2            v30.4s, v16.4s, v18.4s
    uzp1            v29.4s, v20.4s, v22.4s
    uzp2            v31.4s, v20.4s, v22.4s

    /* Store results to the output buffer
     * Each 64-bit half of v28-v31 is one 8-pixel output row; TMP1..TMP8 are
     * the row pointers in the order they were loaded from OUTPUT_BUF. */
    st1             {v28.d}[0], [TMP1]
    st1             {v29.d}[0], [TMP2]
    st1             {v28.d}[1], [TMP3]
    st1             {v29.d}[1], [TMP4]
    st1             {v30.d}[0], [TMP5]
    st1             {v31.d}[0], [TMP6]
    st1             {v30.d}[1], [TMP7]
    st1             {v31.d}[1], [TMP8]
    /* Return with ret, not "blr x30": blr is an indirect call, so it would
       overwrite the link register, unbalance the return-address predictor,
       and fault on the caller's return site if that code is BTI-guarded. */
    ret

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
    .unreq          TMP5
    .unreq          TMP6
    .unreq          TMP7
    .unreq          TMP8
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*****************************************************************************/
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
shun-iwasawa 82a8f5
 * jsimd_idct_4x4_neon
shun-iwasawa 82a8f5
 *
shun-iwasawa 82a8f5
 * This function contains inverse-DCT code for getting reduced-size
 * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
 *       requires far fewer arithmetic operations and hence should be faster.
 *       The primary purpose of this particular Neon optimized function is
 *       bit exact compatibility with jpeg-6b.
 *
 * TODO: slightly better instruction scheduling can be achieved by expanding
 *       the idct_helper/transpose_4x4 macros and reordering instructions,
 *       but readability will suffer somewhat.
shun-iwasawa 82a8f5
 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * idct_helper: one 1-D pass of the reduced-size 4x4 IDCT on four lanes.
 *
 * Inputs:   \x4 \x6 \x8 \x10 \x12 \x14 \x16  - seven .4h source operands
 *           \shift                           - rounding right shift applied
 *                                              to the 32-bit results
 *           v0, v1, v2                       - multiplier constants; the
 *                                              caller must have loaded them
 * Outputs:  \y26 \y27 \y28 \y29              - four .4h results
 * Clobbers: v20, v24, v26, v28, v30
 *
 * With
 *   A = \x4*v2.h[2] + \x8*v0.h[0] + \x14*v0.h[1]              (v28)
 *   B = \x4*v2.h[2] - \x8*v0.h[0] - \x14*v0.h[1]              (v30)
 *   C = \x16*v1.h[2] + \x12*v1.h[3] + \x10*v2.h[0] + \x6*v2.h[1]  (v26)
 *   D = \x16*v0.h[2] + \x12*v0.h[3] + \x10*v1.h[0] + \x6*v1.h[1]  (v24)
 * the results are
 *   \y26 = (A + C) >> shift     \y29 = (A - C) >> shift
 *   \y27 = (B + D) >> shift     \y28 = (B - D) >> shift
 * (rounding shifts, truncated to 16 bits).
 *
 * Instruction order matters: every input is read before the first output is
 * written, so outputs may alias inputs, and \y26/\y28 may even be v26/v28
 * because each is written only after the last read of that temporary.
 */
.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
shun-iwasawa 82a8f5
    /* v28 = A */
    smull           v28.4s, \x4, v2.h[2]
shun-iwasawa 82a8f5
    smlal           v28.4s, \x8, v0.h[0]
shun-iwasawa 82a8f5
    smlal           v28.4s, \x14, v0.h[1]
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* v26 = C */
    smull           v26.4s, \x16, v1.h[2]
shun-iwasawa 82a8f5
    smlal           v26.4s, \x12, v1.h[3]
shun-iwasawa 82a8f5
    smlal           v26.4s, \x10, v2.h[0]
shun-iwasawa 82a8f5
    smlal           v26.4s, \x6, v2.h[1]
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* v30 = B (same first product as A, the other two subtracted) */
    smull           v30.4s, \x4, v2.h[2]
shun-iwasawa 82a8f5
    smlsl           v30.4s, \x8, v0.h[0]
shun-iwasawa 82a8f5
    smlsl           v30.4s, \x14, v0.h[1]
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* v24 = D */
    smull           v24.4s, \x16, v0.h[2]
shun-iwasawa 82a8f5
    smlal           v24.4s, \x12, v0.h[3]
shun-iwasawa 82a8f5
    smlal           v24.4s, \x10, v1.h[0]
shun-iwasawa 82a8f5
    smlal           v24.4s, \x6, v1.h[1]
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* v20 = A + C, v28 = A - C; v26 is dead after this pair */
    add             v20.4s, v28.4s, v26.4s
shun-iwasawa 82a8f5
    sub             v28.4s, v28.4s, v26.4s
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* rshrn to .4h only encodes shifts of 1..16, so larger shifts are done
       as a 32-bit rounding shift followed by a plain narrow. */
  .if \shift > 16
shun-iwasawa 82a8f5
    srshr           v20.4s, v20.4s, #\shift
shun-iwasawa 82a8f5
    srshr           v28.4s, v28.4s, #\shift
shun-iwasawa 82a8f5
    xtn             \y26, v20.4s
shun-iwasawa 82a8f5
    xtn             \y29, v28.4s
shun-iwasawa 82a8f5
  .else
shun-iwasawa 82a8f5
    rshrn           \y26, v20.4s, #\shift
shun-iwasawa 82a8f5
    rshrn           \y29, v28.4s, #\shift
shun-iwasawa 82a8f5
  .endif
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* v20 = B + D, v30 = B - D; v28 is dead from here on */
    add             v20.4s, v30.4s, v24.4s
shun-iwasawa 82a8f5
    sub             v30.4s, v30.4s, v24.4s
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Same narrowing scheme as above for the second pair of outputs */
  .if \shift > 16
shun-iwasawa 82a8f5
    srshr           v20.4s, v20.4s, #\shift
shun-iwasawa 82a8f5
    srshr           v30.4s, v30.4s, #\shift
shun-iwasawa 82a8f5
    xtn             \y27, v20.4s
shun-iwasawa 82a8f5
    xtn             \y28, v30.4s
shun-iwasawa 82a8f5
  .else
shun-iwasawa 82a8f5
    rshrn           \y27, v20.4s, #\shift
shun-iwasawa 82a8f5
    rshrn           \y28, v30.4s, #\shift
shun-iwasawa 82a8f5
  .endif
shun-iwasawa 82a8f5
.endm
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * In:    x0 = DCT_TABLE   (dequantization multipliers, 8 halfwords per row)
 *        x1 = COEF_BLOCK  (coefficients, 8 halfwords per row)
 *        x2 = OUTPUT_BUF  (array of 4 output row pointers)
 *        w3 = OUTPUT_COL  (byte offset added to every row pointer)
 * Out:   4 bytes stored at each of the 4 row pointers + OUTPUT_COL
 * Clobb: x0-x3, x9, x15, v0-v7, v16-v31 (v8-v15 are saved and restored)
 * Stack: 64 bytes (save area for the low halves of v8-v15)
 */
asm_function jsimd_idct_4x4_neon

    DCT_TABLE       .req x0
    COEF_BLOCK      .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_COL      .req x3
    /* TMP1-TMP3 alias the argument registers and are only written once the
       corresponding argument is no longer needed. */
    TMP1            .req x0
    TMP2            .req x1
    TMP3            .req x2
    TMP4            .req x15

    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x3, w3

    /* Save all used Neon registers
     * Only the low 64 bits of v8-v15 are callee-saved under AAPCS64, hence
     * the .8b stores.  x9 walks the save area so that sp keeps pointing at
     * its start for the restore at the end. */
    sub             sp, sp, 64
    mov             x9, sp
    /* Load constants (v3.4h is just used for padding) */
    get_symbol_loc  TMP4, Ljsimd_idct_4x4_neon_consts
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]

    /* Load all COEF_BLOCK into Neon registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | v4.4h   | v5.4h
     *   1 | v6.4h   | v7.4h
     *   2 | v8.4h   | v9.4h
     *   3 | v10.4h  | v11.4h
     *   4 | -       | -
     *   5 | v12.4h  | v13.4h
     *   6 | v14.4h  | v15.4h
     *   7 | v16.4h  | v17.4h
     */
    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
    ld1             {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
    add             COEF_BLOCK, COEF_BLOCK, #16   /* skip row 4 (8 halfwords) */
    ld1             {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
    ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16
    /* dequantize
     * The table rows are loaded into v18-v31 in the same layout as the
     * coefficients (row 4 skipped) and multiplied in element-wise. */
    ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
    mul             v4.4h, v4.4h, v18.4h
    mul             v5.4h, v5.4h, v19.4h
    ins             v4.d[1], v5.d[0]              /* 128 bit q4 */
    ld1             {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
    mul             v6.4h, v6.4h, v20.4h
    mul             v7.4h, v7.4h, v21.4h
    ins             v6.d[1], v7.d[0]              /* 128 bit q6 */
    mul             v8.4h, v8.4h, v22.4h
    mul             v9.4h, v9.4h, v23.4h
    ins             v8.d[1], v9.d[0]              /* 128 bit q8 */
    add             DCT_TABLE, DCT_TABLE, #16     /* skip row 4 of the table */
    ld1             {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
    mul             v10.4h, v10.4h, v24.4h
    mul             v11.4h, v11.4h, v25.4h
    ins             v10.d[1], v11.d[0]            /* 128 bit q10 */
    mul             v12.4h, v12.4h, v26.4h
    mul             v13.4h, v13.4h, v27.4h
    ins             v12.d[1], v13.d[0]            /* 128 bit q12 */
    ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
    mul             v14.4h, v14.4h, v28.4h
    mul             v15.4h, v15.4h, v29.4h
    ins             v14.d[1], v15.d[0]            /* 128 bit q14 */
    mul             v16.4h, v16.4h, v30.4h
    mul             v17.4h, v17.4h, v31.4h
    ins             v16.d[1], v17.d[0]            /* 128 bit q16 */

    /* Pass 1
     * Run once on the left halves (even-numbered registers) and once on the
     * right halves (odd-numbered registers), with a rounding shift of 12.
     * The results overwrite the first four inputs of each call; v3, loaded
     * only as padding, is the scratch register handed to transpose_4x4. */
    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, \
                    v4.4h, v6.4h, v8.4h, v10.4h
    transpose_4x4   v4, v6, v8, v10, v3
    ins             v10.d[1], v11.d[0]
    idct_helper     v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, \
                    v5.4h, v7.4h, v9.4h, v11.4h
    transpose_4x4   v5, v7, v9, v11, v3
    ins             v10.d[1], v11.d[0]

    /* Pass 2
     * One call with a rounding shift of 19; v5 is not an input here.  The
     * outputs v26/v28 overlap temporaries of idct_helper, which is safe only
     * because of the macro's instruction order. */
    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, \
                    v26.4h, v27.4h, v28.4h, v29.4h
    transpose_4x4   v26, v27, v28, v29, v3

    /* Range limit
     * Pack the four result rows into v26/v28, add 0x80 to every halfword and
     * narrow with unsigned saturation to 8 bits. */
    movi            v30.8h, #0x80
    ins             v26.d[1], v27.d[0]
    ins             v28.d[1], v29.d[0]
    add             v26.8h, v26.8h, v30.8h
    add             v28.8h, v28.8h, v30.8h
    sqxtun          v26.8b, v26.8h
    sqxtun          v27.8b, v28.8h

    /* Store results to the output buffer
     * TMP3 aliases OUTPUT_BUF (x2), so the second ldp is the last use of
     * OUTPUT_BUF and must not use writeback. */
    ldp             TMP1, TMP2, [OUTPUT_BUF], 16
    ldp             TMP3, TMP4, [OUTPUT_BUF]
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL

    /* Both variants write bytes 0-3 of v26 to row 0 (TMP1), bytes 4-7 of v26
     * to row 1 (TMP2), bytes 0-3 of v27 to row 2 (TMP3) and bytes 4-7 of v27
     * to row 3 (TMP4).
     * NOTE(review): __ARMEL__ is an AArch32 predefine; AArch64 toolchains
     * appear to define __AARCH64EL__ instead, so the word-store variant is
     * probably never assembled here - confirm before changing the condition. */
#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use much less instructions on little endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses
     */
    st1             {v26.s}[0], [TMP1], 4
    st1             {v27.s}[0], [TMP3], 4
    st1             {v26.s}[1], [TMP2], 4
    st1             {v27.s}[1], [TMP4], 4
#else
    st1             {v26.b}[0], [TMP1], 1
    st1             {v27.b}[0], [TMP3], 1
    st1             {v26.b}[1], [TMP1], 1
    st1             {v27.b}[1], [TMP3], 1
    st1             {v26.b}[2], [TMP1], 1
    st1             {v27.b}[2], [TMP3], 1
    st1             {v26.b}[3], [TMP1], 1
    st1             {v27.b}[3], [TMP3], 1

    st1             {v26.b}[4], [TMP2], 1
    st1             {v27.b}[4], [TMP4], 1
    st1             {v26.b}[5], [TMP2], 1
    st1             {v27.b}[5], [TMP4], 1
    st1             {v26.b}[6], [TMP2], 1
    st1             {v27.b}[6], [TMP4], 1
    st1             {v26.b}[7], [TMP2], 1
    st1             {v27.b}[7], [TMP4], 1
#endif

    /* vpop            {v8.4h - v15.4h}    (not available) */
    /* The two post-indexed loads restore v8-v15 and release the 64-byte
       save area in one go. */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    /* Return with ret, not "blr x30": blr is an indirect call, so it would
       overwrite the link register, unbalance the return-address predictor,
       and fault on the caller's return site if that code is BTI-guarded. */
    ret

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4

.purgem idct_helper
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*****************************************************************************/
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
shun-iwasawa 82a8f5
 * jsimd_idct_2x2_neon
shun-iwasawa 82a8f5
 *
shun-iwasawa 82a8f5
 * This function contains inverse-DCT code for getting reduced-size
shun-iwasawa 82a8f5
 * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations
shun-iwasawa 82a8f5
 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
shun-iwasawa 82a8f5
 * function from jpeg-6b (jidctred.c).
shun-iwasawa 82a8f5
 *
shun-iwasawa 82a8f5
 * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
shun-iwasawa 82a8f5
 *       requires far fewer arithmetic operations and hence should be faster.
shun-iwasawa 82a8f5
 *       The primary purpose of this particular Neon optimized function is
shun-iwasawa 82a8f5
 *       bit exact compatibility with jpeg-6b.
shun-iwasawa 82a8f5
 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * idct_helper -- one 1-D pass of the reduced-size 2x2 inverse DCT, applied to
 * four 16-bit lanes at a time.
 *
 * Inputs (each a .4h vector operand):
 *   x4                  term that is only scaled: it is widened to 32 bits and
 *                       shifted left by 15
 *   x6, x10, x12, x16   terms multiplied by v14.h[3], v14.h[2], v14.h[1] and
 *                       v14.h[0] respectively and accumulated in 32 bits
 *   shift               rounding right shift applied to both results
 * Outputs (each a .4h vector operand):
 *   y26                 (scaled x4 + weighted sum) >> shift
 *   y27                 (scaled x4 - weighted sum) >> shift
 *
 * The multipliers are read from v14, which must be loaded before the macro is
 * used.  v15, v20 and v26 are overwritten as scratch registers, so operands
 * that alias them are only safe if they are consumed before being clobbered.
 */
.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
shun-iwasawa 82a8f5
    /* v15 = x4 << 15 (32-bit lanes) */
    sshll           v15.4s, \x4, #15
shun-iwasawa 82a8f5
    /* v26 = x6 * c3 + x10 * c2 + x12 * c1 + x16 * c0 (32-bit lanes) */
    smull           v26.4s, \x6, v14.h[3]
shun-iwasawa 82a8f5
    smlal           v26.4s, \x10, v14.h[2]
shun-iwasawa 82a8f5
    smlal           v26.4s, \x12, v14.h[1]
shun-iwasawa 82a8f5
    smlal           v26.4s, \x16, v14.h[0]
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Sum goes to v20; the difference reuses v15, so the add must come first */
    add             v20.4s, v15.4s, v26.4s
shun-iwasawa 82a8f5
    sub             v15.4s, v15.4s, v26.4s
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* rshrn can narrow 32 -> 16 bits with a shift of at most 16, so larger
     * shifts are done at full width (srshr) and then truncated (xtn).
     */
  .if \shift > 16
shun-iwasawa 82a8f5
    srshr           v20.4s, v20.4s, #\shift
shun-iwasawa 82a8f5
    srshr           v15.4s, v15.4s, #\shift
shun-iwasawa 82a8f5
    xtn             \y26, v20.4s
shun-iwasawa 82a8f5
    xtn             \y27, v15.4s
shun-iwasawa 82a8f5
  .else
shun-iwasawa 82a8f5
    rshrn           \y26, v20.4s, #\shift
shun-iwasawa 82a8f5
    rshrn           \y27, v15.4s, #\shift
shun-iwasawa 82a8f5
  .endif
shun-iwasawa 82a8f5
.endm
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * jsimd_idct_2x2_neon -- dequantize an 8x8 coefficient block and write the
 * reduced-size 2x2 block of output samples.
 *
 * Arguments (registers as bound by the .req aliases below):
 *   x0  DCT_TABLE    16-bit multipliers, read with the same row layout as the
 *                    coefficient block
 *   x1  COEF_BLOCK   16-bit coefficients, 8 per row (16 bytes per row)
 *   x2  OUTPUT_BUF   array of output row pointers; the first two are used
 *   x3  OUTPUT_COL   offset added to each row pointer (only w3 is valid on
 *                    entry)
 *
 * Only rows 0, 1, 3, 5 and 7 of the coefficient block and of the multiplier
 * table are read.  Two samples are stored to each of the two output rows.
 *
 * The low 64 bits of v8-v15 are saved on the stack on entry and restored
 * before returning.  x0 is reused as TMP1 once the multiplier table has been
 * consumed; x9 and x15 are used as scratch.
 */
asm_function jsimd_idct_2x2_neon

    DCT_TABLE       .req x0
    COEF_BLOCK      .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_COL      .req x3
    TMP1            .req x0
    TMP2            .req x15

    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw            x3, w3

    /* Reserve 64 bytes for the low halves of v8-v15.  x9 walks the save area
     * so that sp itself stays put until the restore at the end.
     */
    sub             sp, sp, 64
    mov             x9, sp

    /* Load constants (v14 is saved before it is overwritten) */
    get_symbol_loc  TMP2, Ljsimd_idct_2x2_neon_consts
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
    ld1             {v14.4h}, [TMP2]

    /* Load all COEF_BLOCK into Neon registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | v4.4h   | v5.4h
     *   1 | v6.4h   | v7.4h
     *   2 | -       | -
     *   3 | v10.4h  | v11.4h
     *   4 | -       | -
     *   5 | v12.4h  | v13.4h
     *   6 | -       | -
     *   7 | v16.4h  | v17.4h
     *
     * Each "add ..., #16" steps over one unused 16-byte row.
     */
    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
    add             COEF_BLOCK, COEF_BLOCK, #16
    ld1             {v10.4h, v11.4h}, [COEF_BLOCK], 16
    add             COEF_BLOCK, COEF_BLOCK, #16
    ld1             {v12.4h, v13.4h}, [COEF_BLOCK], 16
    add             COEF_BLOCK, COEF_BLOCK, #16
    ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16

    /* Dequantize: multiply every loaded row by the matching DCT_TABLE row and
     * merge the two 4-lane halves of each row into one 128-bit register.
     */
    ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
    mul             v4.4h, v4.4h, v18.4h
    mul             v5.4h, v5.4h, v19.4h
    ins             v4.d[1], v5.d[0]
    mul             v6.4h, v6.4h, v20.4h
    mul             v7.4h, v7.4h, v21.4h
    ins             v6.d[1], v7.d[0]
    add             DCT_TABLE, DCT_TABLE, #16
    ld1             {v24.4h, v25.4h}, [DCT_TABLE], 16
    mul             v10.4h, v10.4h, v24.4h
    mul             v11.4h, v11.4h, v25.4h
    ins             v10.d[1], v11.d[0]
    add             DCT_TABLE, DCT_TABLE, #16
    ld1             {v26.4h, v27.4h}, [DCT_TABLE], 16
    mul             v12.4h, v12.4h, v26.4h
    mul             v13.4h, v13.4h, v27.4h
    ins             v12.d[1], v13.d[0]
    add             DCT_TABLE, DCT_TABLE, #16
    ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
    mul             v16.4h, v16.4h, v30.4h
    mul             v17.4h, v17.4h, v31.4h
    ins             v16.d[1], v17.d[0]

    /* Pass 1
     *
     * This is the idct_helper arithmetic with a 13-bit descale, written out
     * by hand for columns 0-3 (v26/v15 chain) and columns 4-7 (v24/v30 chain)
     * so that the two independent chains are interleaved.
     */
    smull           v26.4s, v6.4h, v14.h[3]
    smlal           v26.4s, v10.4h, v14.h[2]
    smlal           v26.4s, v12.4h, v14.h[1]
    smlal           v26.4s, v16.4h, v14.h[0]
    smull           v24.4s, v7.4h, v14.h[3]
    smlal           v24.4s, v11.4h, v14.h[2]
    smlal           v24.4s, v13.4h, v14.h[1]
    smlal           v24.4s, v17.4h, v14.h[0]
    sshll           v15.4s, v4.4h, #15
    sshll           v30.4s, v5.4h, #15
    add             v20.4s, v15.4s, v26.4s
    sub             v15.4s, v15.4s, v26.4s
    rshrn           v4.4h, v20.4s, #13
    rshrn           v6.4h, v15.4s, #13
    add             v20.4s, v30.4s, v24.4s
    sub             v15.4s, v30.4s, v24.4s
    rshrn           v5.4h, v20.4s, #13
    rshrn           v7.4h, v15.4s, #13
    ins             v4.d[1], v5.d[0]
    ins             v6.d[1], v7.d[0]
    /* Rearrange the pass-1 results into the operand order pass 2 expects
     * (the transpose macro is defined elsewhere in this file), then split the
     * upper halves of v10 and v6 out into v11 and v7 so that they can be
     * passed as .4h operands.
     */
    transpose       v4, v6, v3, .16b, .8h
    transpose       v6, v10, v3, .16b, .4s
    ins             v11.d[0], v10.d[1]
    ins             v7.d[0], v6.d[1]

    /* Pass 2 */
    idct_helper     v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h

    /* Range limit
     *
     * Both results are packed into v26 (v26.h[0-3] = first, v26.h[4-7] =
     * second), biased by 0x80 and saturated to unsigned bytes in v30.  The
     * bytes are then copied over the low half of v26, so v26.b[0-1] hold the
     * first result's samples.  The upper half of v26 still holds the biased
     * 16-bit lanes of the second result, so narrowing v26 once more leaves
     * that result's samples in v27.b[4-5].
     */
    movi            v30.8h, #0x80
    ins             v26.d[1], v27.d[0]
    add             v26.8h, v26.8h, v30.8h
    sqxtun          v30.8b, v26.8h
    ins             v26.d[0], v30.d[0]
    sqxtun          v27.8b, v26.8h

    /* Store results to the output buffer: two bytes to each of the two rows */
    ldp             TMP1, TMP2, [OUTPUT_BUF]
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL

    st1             {v26.b}[0], [TMP1], 1
    st1             {v27.b}[4], [TMP1], 1
    st1             {v26.b}[1], [TMP2], 1
    st1             {v27.b}[5], [TMP2], 1

    /* Restore the saved Neon registers and release the save area */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    /* ret is the subroutine-return form of the branch to x30: it carries the
     * return hint and leaves x30 unmodified.
     */
    ret

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2

.purgem idct_helper
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*****************************************************************************/
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
shun-iwasawa 82a8f5
 * jsimd_ycc_extrgb_convert_neon
shun-iwasawa 82a8f5
 * jsimd_ycc_extbgr_convert_neon
shun-iwasawa 82a8f5
 * jsimd_ycc_extrgbx_convert_neon
shun-iwasawa 82a8f5
 * jsimd_ycc_extbgrx_convert_neon
shun-iwasawa 82a8f5
 * jsimd_ycc_extxbgr_convert_neon
shun-iwasawa 82a8f5
 * jsimd_ycc_extxrgb_convert_neon
shun-iwasawa 82a8f5
 *
shun-iwasawa 82a8f5
 * Colorspace conversion YCbCr -> RGB
shun-iwasawa 82a8f5
 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * do_load -- read input samples for "size" pixels and advance the pointers.
 *
 * Samples from [U] go to v4, from [V] to v5 and from [Y] to v0.
 *   size 8: one full 8-byte vector from each pointer, followed by a prefetch
 *           64 bytes ahead of each (already advanced) pointer
 *   size 4: byte lanes 0-3
 *   size 2: byte lanes 4-5
 *   size 1: byte lane 6
 * The partial sizes use disjoint lanes, so invoking 4, 2 and 1 in turn can
 * assemble up to seven pixels in a single vector.  Any other size is rejected
 * at assembly time.
 */
.macro do_load size
  .if \size == 8
    ld1             {v4.8b}, [U], #8
    ld1             {v5.8b}, [V], #8
    ld1             {v0.8b}, [Y], #8
    prfm            pldl1keep, [U, #64]
    prfm            pldl1keep, [V, #64]
    prfm            pldl1keep, [Y, #64]
  .elseif \size == 4
    /* One byte per lane, register by register: all of U, then V, then Y */
    .irp lane, 0, 1, 2, 3
    ld1             {v4.b}[\lane], [U], #1
    .endr
    .irp lane, 0, 1, 2, 3
    ld1             {v5.b}[\lane], [V], #1
    .endr
    .irp lane, 0, 1, 2, 3
    ld1             {v0.b}[\lane], [Y], #1
    .endr
  .elseif \size == 2
    .irp lane, 4, 5
    ld1             {v4.b}[\lane], [U], #1
    .endr
    .irp lane, 4, 5
    ld1             {v5.b}[\lane], [V], #1
    .endr
    .irp lane, 4, 5
    ld1             {v0.b}[\lane], [Y], #1
    .endr
  .elseif \size == 1
    ld1             {v4.b}[6], [U], #1
    ld1             {v5.b}[6], [V], #1
    ld1             {v0.b}[6], [Y], #1
  .else
    .error unsupported macroblock size
  .endif
.endm
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * do_store -- write "size" converted pixels to [RGB] and advance RGB.
 *
 *   bpp 24: channel bytes come from v10, v11, v12 (3 bytes per pixel)
 *   bpp 32: channel bytes come from v10, v11, v12, v13 (4 bytes per pixel)
 *   bpp 16: packed 16-bit pixels come from v25 (2 bytes per pixel)
 *
 *   size 8 stores lanes 0-7, size 4 lanes 0-3, size 2 lanes 4-5 and size 1
 *   lane 6, mirroring the lane assignment used when the samples were loaded.
 *
 *   fast_st3 is only consulted for bpp 24 with size 8: 1 selects a single
 *   interleaving st3, anything else emits the same bytes with 24 single-lane
 *   st1 stores.
 *
 * Unsupported bpp or size values are rejected at assembly time.
 */
.macro do_store bpp, size, fast_st3
  .if \bpp == 24
    .if \size == 8
      .if \fast_st3 == 1
        st3         {v10.8b, v11.8b, v12.8b}, [RGB], #24
      .else
        /* Pixel by pixel: one byte from each channel register per lane */
        .irp lane, 0, 1, 2, 3, 4, 5, 6, 7
        st1         {v10.b}[\lane], [RGB], #1
        st1         {v11.b}[\lane], [RGB], #1
        st1         {v12.b}[\lane], [RGB], #1
        .endr
      .endif
    .elseif \size == 4
      .irp lane, 0, 1, 2, 3
      st3           {v10.b, v11.b, v12.b}[\lane], [RGB], #3
      .endr
    .elseif \size == 2
      .irp lane, 4, 5
      st3           {v10.b, v11.b, v12.b}[\lane], [RGB], #3
      .endr
    .elseif \size == 1
      st3           {v10.b, v11.b, v12.b}[6], [RGB], #3
    .else
     .error unsupported macroblock size
    .endif
  .elseif \bpp == 32
    .if \size == 8
      st4           {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32
    .elseif \size == 4
      .irp lane, 0, 1, 2, 3
      st4           {v10.b, v11.b, v12.b, v13.b}[\lane], [RGB], #4
      .endr
    .elseif \size == 2
      .irp lane, 4, 5
      st4           {v10.b, v11.b, v12.b, v13.b}[\lane], [RGB], #4
      .endr
    .elseif \size == 1
      st4           {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4
    .else
      .error unsupported macroblock size
    .endif
  .elseif \bpp == 16
    .if \size == 8
      st1           {v25.8h}, [RGB], #16
    .elseif \size == 4
      st1           {v25.4h}, [RGB], #8
    .elseif \size == 2
      .irp lane, 4, 5
      st1           {v25.h}[\lane], [RGB], #2
      .endr
    .elseif \size == 1
      st1           {v25.h}[6], [RGB], #2
    .else
      .error unsupported macroblock size
    .endif
  .else
    .error unsupported bpp
  .endif
.endm
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * generate_jsimd_ycc_rgb_convert_neon -- emit one YCbCr -> RGB conversion
 * function.
 *
 * Macro parameters:
 *   colorid    inserted into the function name
 *              (jsimd_ycc_<colorid>_convert_neon)
 *   bpp        output bits per pixel: 16 selects the packed rgb565 path, any
 *              other value the byte-per-channel path; the value is also
 *              forwarded to do_store
 *   r_offs, g_offs, b_offs
 *              single digit appended to "v1" to pick the register (v10-v13)
 *              that receives the R, G or B bytes; unused when bpp is 16
 *   rsize, gsize, bsize
 *              accepted but not referenced by the macro body
 *   defsize    arrangement suffix of the byte-channel destination registers
 *   fast_st3   1 emits jsimd_ycc_<colorid>_convert_neon; any other value emits
 *              jsimd_ycc_<colorid>_convert_neon_slowst3.  The value is also
 *              forwarded to do_store.
 *
 * Register arguments of the generated function (per the .req aliases):
 *   w0  OUTPUT_WIDTH  pixels per row
 *   x1  INPUT_BUF     array of three plane pointers (Y, U, V row arrays)
 *   w2  INPUT_ROW     index of the first input row
 *   x3  OUTPUT_BUF    array of output row pointers
 *   w4  NUM_ROWS      number of rows to convert; nothing is done if it is
 *                     less than 1
 *
 * The low 64 bits of v8-v15 are saved on entry and restored before returning.
 */
.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \
                                           g_offs, gsize, b_offs, bsize, \
                                           defsize, fast_st3

/*
 * 2-stage pipelined YCbCr->RGB conversion
 */

/* Stage 1: bias the chroma samples and form the 32-bit chroma products.
 * v20/v22 feed the green channel, v24/v26 the red channel and v28/v30 the
 * blue channel (low/high four pixels respectively).
 */
.macro do_yuv_to_rgb_stage1
    uaddw           v6.8h, v2.8h, v4.8b     /* v6 = u - 128 */
    uaddw           v8.8h, v2.8h, v5.8b     /* v8 = v - 128 */
    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
    smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
    smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
.endm

/* Stage 2: descale the products, add luma and saturate into the output
 * registers (byte channels, or a packed 16-bit pixel in v25 when bpp is 16).
 */
.macro do_yuv_to_rgb_stage2
    rshrn           v20.4h, v20.4s, #15
    rshrn2          v20.8h, v22.4s, #15
    rshrn           v24.4h, v24.4s, #14
    rshrn2          v24.8h, v26.4s, #14
    rshrn           v28.4h, v28.4s, #14
    rshrn2          v28.8h, v30.4s, #14
    uaddw           v20.8h, v20.8h, v0.8b
    uaddw           v24.8h, v24.8h, v0.8b
    uaddw           v28.8h, v28.8h, v0.8b
  .if \bpp != 16
    sqxtun          v1\g_offs\defsize, v20.8h
    sqxtun          v1\r_offs\defsize, v24.8h
    sqxtun          v1\b_offs\defsize, v28.8h
  .else
    sqshlu          v21.8h, v20.8h, #8
    sqshlu          v25.8h, v24.8h, #8
    sqshlu          v29.8h, v28.8h, #8
    sri             v25.8h, v21.8h, #5
    sri             v25.8h, v29.8h, #11
  .endif
.endm

/* Steady-state loop body: stage 2 and the store for the current eight pixels,
 * interleaved with the loads and stage 1 for the next eight.
 */
.macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3
    rshrn           v20.4h, v20.4s, #15
    rshrn           v24.4h, v24.4s, #14
    rshrn           v28.4h, v28.4s, #14
    ld1             {v4.8b}, [U], 8
    rshrn2          v20.8h, v22.4s, #15
    rshrn2          v24.8h, v26.4s, #14
    rshrn2          v28.8h, v30.4s, #14
    ld1             {v5.8b}, [V], 8
    uaddw           v20.8h, v20.8h, v0.8b
    uaddw           v24.8h, v24.8h, v0.8b
    uaddw           v28.8h, v28.8h, v0.8b
  .if \bpp != 16  /**************** rgb24/rgb32 ******************************/
    sqxtun          v1\g_offs\defsize, v20.8h
    ld1             {v0.8b}, [Y], 8
    sqxtun          v1\r_offs\defsize, v24.8h
    prfm            pldl1keep, [U, #64]
    prfm            pldl1keep, [V, #64]
    prfm            pldl1keep, [Y, #64]
    sqxtun          v1\b_offs\defsize, v28.8h
    uaddw           v6.8h, v2.8h, v4.8b     /* v6 = u - 128 */
    uaddw           v8.8h, v2.8h, v5.8b     /* v8 = v - 128 */
    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
  .else  /**************************** rgb565 ********************************/
    sqshlu          v21.8h, v20.8h, #8
    sqshlu          v25.8h, v24.8h, #8
    sqshlu          v29.8h, v28.8h, #8
    uaddw           v6.8h, v2.8h, v4.8b     /* v6 = u - 128 */
    uaddw           v8.8h, v2.8h, v5.8b     /* v8 = v - 128 */
    ld1             {v0.8b}, [Y], 8
    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    sri             v25.8h, v21.8h, #5
    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
    prfm            pldl1keep, [U, #64]
    prfm            pldl1keep, [V, #64]
    prfm            pldl1keep, [Y, #64]
    sri             v25.8h, v29.8h, #11
  .endif
    do_store        \bpp, 8, \fast_st3
    smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
    smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
.endm

/* Unpipelined conversion of whatever is currently in v0, v4 and v5 */
.macro do_yuv_to_rgb
    do_yuv_to_rgb_stage1
    do_yuv_to_rgb_stage2
.endm

.if \fast_st3 == 1
asm_function jsimd_ycc_\colorid\()_convert_neon
.else
asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
.endif
    OUTPUT_WIDTH    .req w0
    INPUT_BUF       .req x1
    INPUT_ROW       .req w2
    OUTPUT_BUF      .req x3
    NUM_ROWS        .req w4

    INPUT_BUF0      .req x5
    INPUT_BUF1      .req x6
    INPUT_BUF2      .req x1

    RGB             .req x7
    Y               .req x9
    U               .req x10
    V               .req x11
    N               .req w15

    /* Reserve 64 bytes for the low halves of v8-v15.  x9 walks the save area
     * here and is only reused as the Y pointer afterwards.
     */
    sub             sp, sp, 64
    mov             x9, sp

    /* Locate the constants: v1.4h receives the four multipliers and v2.8h the
     * chroma bias (v0.4h is just padding and is later overwritten by Y
     * samples).  x15 is free here because N is not set up until the row loop.
     */
    get_symbol_loc  x15, Ljsimd_ycc_rgb_neon_consts

    /* Save Neon registers */
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
    ld1             {v0.4h, v1.4h}, [x15], 16
    ld1             {v2.8h}, [x15]

    /* Fetch the three plane pointers.  INPUT_BUF2 shares x1 with INPUT_BUF,
     * so it must be loaded last.
     */
    ldr             INPUT_BUF0, [INPUT_BUF]
    ldr             INPUT_BUF1, [INPUT_BUF, #8]
    ldr             INPUT_BUF2, [INPUT_BUF, #16]
    .unreq          INPUT_BUF

    /* Preset v10 and v13 to 0xFF.  For a 32-bit layout, whichever of the two
     * is not selected by the channel offsets is never written by stage 2 and
     * so supplies the constant fourth byte stored by st4.
     */
    movi            v10.16b, #255
    movi            v13.16b, #255

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    b.lt            9f
0:
    ldr             Y, [INPUT_BUF0, INPUT_ROW, uxtw #3]
    ldr             U, [INPUT_BUF1, INPUT_ROW, uxtw #3]
    mov             N, OUTPUT_WIDTH
    ldr             V, [INPUT_BUF2, INPUT_ROW, uxtw #3]
    add             INPUT_ROW, INPUT_ROW, #1
    ldr             RGB, [OUTPUT_BUF], #8

    /* Inner loop over pixels.  N is decremented by 8 ahead of each group, so
     * it goes negative once fewer than 8 pixels remain; its low three bits
     * still equal the number of leftover pixels.
     */
    subs            N, N, #8
    b.lt            3f
    do_load         8
    do_yuv_to_rgb_stage1
    subs            N, N, #8
    b.lt            2f
1:
    do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3
    subs            N, N, #8
    b.ge            1b
2:
    do_yuv_to_rgb_stage2
    do_store        \bpp, 8, \fast_st3
    tst             N, #7
    b.eq            8f
    /* Tail: gather the last 1-7 pixels into one vector (4, then 2, then 1),
     * convert them once, then store them in the same pieces.  Label 3 is
     * deliberately defined twice; each "3f" binds to the next definition.
     */
3:
    tst             N, #4
    b.eq            3f
    do_load         4
3:
    tst             N, #2
    b.eq            4f
    do_load         2
4:
    tst             N, #1
    b.eq            5f
    do_load         1
5:
    do_yuv_to_rgb
    tst             N, #4
    b.eq            6f
    do_store        \bpp, 4, \fast_st3
6:
    tst             N, #2
    b.eq            7f
    do_store        \bpp, 2, \fast_st3
7:
    tst             N, #1
    b.eq            8f
    do_store        \bpp, 1, \fast_st3
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    b.gt            0b
9:
    /* Restore all registers and return.  ret is the subroutine-return form of
     * the branch to x30 and carries the return hint.
     */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    ret
    .unreq          OUTPUT_WIDTH
    .unreq          INPUT_ROW
    .unreq          OUTPUT_BUF
    .unreq          NUM_ROWS
    .unreq          INPUT_BUF0
    .unreq          INPUT_BUF1
    .unreq          INPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N

.purgem do_yuv_to_rgb
.purgem do_yuv_to_rgb_stage1
.purgem do_yuv_to_rgb_stage2
.purgem do_yuv_to_rgb_stage2_store_load_stage1

.endm
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * Instantiate the YCbCr -> RGB converters.
 *
 * The first group passes fast_st3 = 1 and produces
 * jsimd_ycc_<id>_convert_neon for every supported pixel format.  The R, G and
 * B columns are the digits appended to "v1" to choose the register that
 * receives each channel; they are not used by the 16 bpp (rgb565) variant.
 * The rsize/gsize/bsize columns are not referenced by the generator.
 */
/*--------------------------------- id ----- bpp R  rsize G  gsize B  bsize defsize fast_st3*/
shun-iwasawa 82a8f5
generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
shun-iwasawa 82a8f5
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
shun-iwasawa 82a8f5
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
shun-iwasawa 82a8f5
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
shun-iwasawa 82a8f5
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h,  2, .4h,  1, .4h,  .8b,    1
shun-iwasawa 82a8f5
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,  2, .4h,  3, .4h,  .8b,    1
shun-iwasawa 82a8f5
generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, .4h,  0, .4h,  0, .4h,  .8b,    1
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * The second group passes fast_st3 = 0 and produces
 * jsimd_ycc_<id>_convert_neon_slowst3, whose full 8-pixel stores use
 * single-lane st1 instructions instead of one st3.  Only the 24 bpp formats
 * are repeated, because do_store consults fast_st3 on the 24 bpp path only.
 */
generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    0
shun-iwasawa 82a8f5
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    0
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/* Drop the load/store helpers so the names can be redefined further down */
.purgem do_load
shun-iwasawa 82a8f5
.purgem do_store
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*****************************************************************************/
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
shun-iwasawa 82a8f5
 * jsimd_extrgb_ycc_convert_neon
shun-iwasawa 82a8f5
 * jsimd_extbgr_ycc_convert_neon
shun-iwasawa 82a8f5
 * jsimd_extrgbx_ycc_convert_neon
shun-iwasawa 82a8f5
 * jsimd_extbgrx_ycc_convert_neon
shun-iwasawa 82a8f5
 * jsimd_extxbgr_ycc_convert_neon
shun-iwasawa 82a8f5
 * jsimd_extxrgb_ycc_convert_neon
shun-iwasawa 82a8f5
 *
shun-iwasawa 82a8f5
 * Colorspace conversion RGB -> YCbCr
shun-iwasawa 82a8f5
 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * do_store size
 *
 * Store converted samples from v20 (through pointer Y), v21 (through U) and
 * v22 (through V), post-incrementing each pointer by the bytes written.
 *
 *   size == 8   all eight byte lanes, one 8-byte st1 per plane
 *   size == 4   byte lanes 0-3, one byte at a time
 *   size == 2   byte lanes 4-5
 *   size == 1   byte lane 6
 *
 * The partial sizes use disjoint lanes, mirroring the lane assignment of
 * do_load, so a 4 + 2 + 1 tail can be emitted from a single conversion.
 * Any other size is rejected at assembly time.
 */
.macro do_store size
  .if \size == 8
    st1             {v20.8b}, [Y], #8
    st1             {v21.8b}, [U], #8
    st1             {v22.8b}, [V], #8
  .elseif \size == 4
    .irp lane, 0, 1, 2, 3
    st1             {v20.b}[\lane], [Y], #1
    .endr
    .irp lane, 0, 1, 2, 3
    st1             {v21.b}[\lane], [U], #1
    .endr
    .irp lane, 0, 1, 2, 3
    st1             {v22.b}[\lane], [V], #1
    .endr
  .elseif \size == 2
    .irp lane, 4, 5
    st1             {v20.b}[\lane], [Y], #1
    .endr
    .irp lane, 4, 5
    st1             {v21.b}[\lane], [U], #1
    .endr
    .irp lane, 4, 5
    st1             {v22.b}[\lane], [V], #1
    .endr
  .elseif \size == 1
    st1             {v20.b}[6], [Y], #1
    st1             {v21.b}[6], [U], #1
    st1             {v22.b}[6], [V], #1
  .else
    .error unsupported macroblock size
  .endif
.endm
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * do_load bpp, size, fast_ld3
 *
 * De-interleave packed pixels read through RGB (post-incremented) into
 * v10, v11, v12 and, for 32 bpp, v13 -- one register per byte of a pixel.
 *
 *   bpp        24 or 32; anything else is rejected at assembly time
 *   size       8: fill all eight byte lanes, then prefetch [RGB, #128]
 *              4: fill lanes 0-3      2: fill lanes 4-5      1: fill lane 6
 *   fast_ld3   only consulted for bpp == 24, size == 8: 1 uses a single
 *              ld3, any other value loads the 24 bytes one at a time with
 *              ld1 in the same memory order
 */
.macro do_load bpp, size, fast_ld3
  .if \bpp == 24
    .if \size == 8
      .if \fast_ld3 == 1
        ld3         {v10.8b, v11.8b, v12.8b}, [RGB], #24
      .else
        /* Same result as the ld3 above, built from single-byte loads. */
        .irp lane, 0, 1, 2, 3, 4, 5, 6, 7
        ld1         {v10.b}[\lane], [RGB], #1
        ld1         {v11.b}[\lane], [RGB], #1
        ld1         {v12.b}[\lane], [RGB], #1
        .endr
      .endif
      prfm          pldl1keep, [RGB, #128]
    .elseif \size == 4
      .irp lane, 0, 1, 2, 3
      ld3           {v10.b, v11.b, v12.b}[\lane], [RGB], #3
      .endr
    .elseif \size == 2
      .irp lane, 4, 5
      ld3           {v10.b, v11.b, v12.b}[\lane], [RGB], #3
      .endr
    .elseif \size == 1
      ld3           {v10.b, v11.b, v12.b}[6], [RGB], #3
    .else
      .error unsupported macroblock size
    .endif
  .elseif \bpp == 32
    .if \size == 8
      ld4           {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32
      prfm          pldl1keep, [RGB, #128]
    .elseif \size == 4
      .irp lane, 0, 1, 2, 3
      ld4           {v10.b, v11.b, v12.b, v13.b}[\lane], [RGB], #4
      .endr
    .elseif \size == 2
      .irp lane, 4, 5
      ld4           {v10.b, v11.b, v12.b, v13.b}[\lane], [RGB], #4
      .endr
    .elseif \size == 1
      ld4           {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4
    .else
      .error unsupported macroblock size
    .endif
  .else
    .error unsupported bpp
  .endif
.endm
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs,
 *                                     fast_ld3
 *
 * Emits one RGB -> YCbCr conversion function named
 *   jsimd_<colorid>_ycc_convert_neon           when fast_ld3 == 1
 *   jsimd_<colorid>_ycc_convert_neon_slowld3   otherwise
 *
 * Macro arguments:
 *   bpp                     bits per input pixel, forwarded to do_load
 *   r_offs, g_offs, b_offs  digit appended to "v1" to name the register
 *                           (v10..v13) in which do_load leaves that
 *                           component
 *   fast_ld3                forwarded to do_load
 *
 * Register arguments of the generated function:
 *   w0  OUTPUT_WIDTH  pixels per row
 *   x1  INPUT_BUF     array of input row pointers, one consumed per row
 *   x2  OUTPUT_BUF    pointer to three plane pointers (offsets 0, 8, 16);
 *                     each plane is an array of row pointers
 *   w3  OUTPUT_ROW    index of the first output row in each plane
 *   w4  NUM_ROWS      rows to convert; nothing is converted if < 1
 *
 * The low 64 bits of v8-v15 are saved on the stack on entry and restored
 * before returning.  The function returns with "ret".
 */
.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \
                                           b_offs, fast_ld3

/*
 * 2-stage pipelined RGB->YCbCr conversion
 */

/*
 * Stage 1: widen the 8 loaded R/G/B bytes to 16 bits and accumulate the
 * three 32-bit weighted sums (low four pixels / high four pixels):
 *   v14/v16 =  r * v0.h[0] + g * v0.h[1] + b * v0.h[2]
 *   v18/v26 = rev64(v1) - r * v0.h[3] - g * v0.h[4] + b * v0.h[5]
 *   v28/v30 = rev64(v1) + r * v0.h[5] - g * v0.h[6] - b * v0.h[7]
 */
.macro do_rgb_to_yuv_stage1
    ushll           v4.8h, v1\r_offs\().8b, #0  /* r = v4 */
    ushll           v6.8h, v1\g_offs\().8b, #0  /* g = v6 */
    ushll           v8.8h, v1\b_offs\().8b, #0  /* b = v8 */
    rev64           v18.4s, v1.4s
    rev64           v26.4s, v1.4s
    rev64           v28.4s, v1.4s
    rev64           v30.4s, v1.4s
    umull           v14.4s, v4.4h, v0.h[0]
    umull2          v16.4s, v4.8h, v0.h[0]
    umlsl           v18.4s, v4.4h, v0.h[3]
    umlsl2          v26.4s, v4.8h, v0.h[3]
    umlal           v28.4s, v4.4h, v0.h[5]
    umlal2          v30.4s, v4.8h, v0.h[5]
    umlal           v14.4s, v6.4h, v0.h[1]
    umlal2          v16.4s, v6.8h, v0.h[1]
    umlsl           v18.4s, v6.4h, v0.h[4]
    umlsl2          v26.4s, v6.8h, v0.h[4]
    umlsl           v28.4s, v6.4h, v0.h[6]
    umlsl2          v30.4s, v6.8h, v0.h[6]
    umlal           v14.4s, v8.4h, v0.h[2]
    umlal2          v16.4s, v8.8h, v0.h[2]
    umlal           v18.4s, v8.4h, v0.h[5]
    umlal2          v26.4s, v8.8h, v0.h[5]
    umlsl           v28.4s, v8.4h, v0.h[7]
    umlsl2          v30.4s, v8.8h, v0.h[7]
.endm

/*
 * Stage 2: drop the 16 fraction bits (rounding for the first sum,
 * truncating for the other two) and narrow each result to 8 bits.
 */
.macro do_rgb_to_yuv_stage2
    rshrn           v20.4h, v14.4s, #16
    shrn            v22.4h, v18.4s, #16
    shrn            v24.4h, v28.4s, #16
    rshrn2          v20.8h, v16.4s, #16
    shrn2           v22.8h, v26.4s, #16
    shrn2           v24.8h, v30.4s, #16
    xtn             v20.8b, v20.8h       /* v20 = y */
    xtn             v21.8b, v22.8h       /* v21 = u */
    xtn             v22.8b, v24.8h       /* v22 = v */
.endm

/* Unpipelined conversion of whatever is currently loaded (row tails). */
.macro do_rgb_to_yuv
    do_rgb_to_yuv_stage1
    do_rgb_to_yuv_stage2
.endm

/* TODO: expand macros and interleave instructions if some in-order
 *       AArch64 processor actually can dual-issue LOAD/STORE with ALU */
/*
 * Steady-state loop body: finish the previous 8 pixels, load the next 8,
 * store the finished ones, then start stage 1 on the new pixels.
 */
.macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3
    do_rgb_to_yuv_stage2
    do_load         \bpp, 8, \fast_ld3
    st1             {v20.8b}, [Y], #8
    st1             {v21.8b}, [U], #8
    st1             {v22.8b}, [V], #8
    do_rgb_to_yuv_stage1
.endm

.if \fast_ld3 == 1
asm_function jsimd_\colorid\()_ycc_convert_neon
.else
asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
.endif
    OUTPUT_WIDTH    .req w0
    INPUT_BUF       .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_ROW      .req w3
    NUM_ROWS        .req w4

    OUTPUT_BUF0     .req x5
    OUTPUT_BUF1     .req x6
    OUTPUT_BUF2     .req x2  /* OUTPUT_BUF */

    RGB             .req x7
    Y               .req x9
    U               .req x10
    V               .req x11
    N               .req w12

    /* Load constants into v0 and v1 */
    get_symbol_loc  x13, Ljsimd_rgb_ycc_neon_consts
    ld1             {v0.8h, v1.8h}, [x13]

    /* Fetch the three plane pointers; the last load overwrites x2, so
     * OUTPUT_BUF is retired immediately afterwards. */
    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #8]
    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #16]
    .unreq          OUTPUT_BUF

    /* Save Neon registers (x9 is free here; Y is not loaded until the
     * row loop starts) */
    sub             sp, sp, #64
    mov             x9, sp
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    b.lt            9f
0:
    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3]
    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3]
    mov             N, OUTPUT_WIDTH
    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3]
    add             OUTPUT_ROW, OUTPUT_ROW, #1
    ldr             RGB, [INPUT_BUF], #8

    /* Inner loop over pixels */
    subs            N, N, #8
    b.lt            3f                   /* fewer than 8 pixels in the row */
    do_load         \bpp, 8, \fast_ld3
    do_rgb_to_yuv_stage1
    subs            N, N, #8
    b.lt            2f                   /* no further full group of 8 */
1:
    do_rgb_to_yuv_stage2_store_load_stage1 \fast_ld3
    subs            N, N, #8
    b.ge            1b
2:
    /* Drain the pipeline: finish and store the last full group. */
    do_rgb_to_yuv_stage2
    do_store        8
    tst             N, #7
    b.eq            8f
3:
    /* Row tail.  N is negative here, but its low three bits still equal
     * the number of leftover pixels; load 4, 2 and 1 pixels into disjoint
     * lanes as those bits dictate.  The numeric label 3 is reused: the
     * "3f" below targets the next "3:". */
    tbz             N, #2, 3f
    do_load         \bpp, 4, \fast_ld3
3:
    tbz             N, #1, 4f
    do_load         \bpp, 2, \fast_ld3
4:
    tbz             N, #0, 5f
    do_load         \bpp, 1, \fast_ld3
5:
    /* Convert all eight lanes, then store only the lanes that were loaded. */
    do_rgb_to_yuv
    tbz             N, #2, 6f
    do_store        4
6:
    tbz             N, #1, 7f
    do_store        2
7:
    tbz             N, #0, 8f
    do_store        1
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    b.gt            0b
9:
    /* Restore all registers and return; the two post-incremented loads
     * also release the 64-byte save area. */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    ret

    .unreq          OUTPUT_WIDTH
    .unreq          OUTPUT_ROW
    .unreq          INPUT_BUF
    .unreq          NUM_ROWS
    .unreq          OUTPUT_BUF0
    .unreq          OUTPUT_BUF1
    .unreq          OUTPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N

.purgem do_rgb_to_yuv
.purgem do_rgb_to_yuv_stage1
.purgem do_rgb_to_yuv_stage2
.purgem do_rgb_to_yuv_stage2_store_load_stage1

.endm
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * Instantiate the RGB -> YCbCr converter once per color id.  R/G/B are the
 * digits appended to "v1" to select the register holding that component
 * after do_load; "Fast LD3" = 1 yields jsimd_<id>_ycc_convert_neon.
 */
/*--------------------------------- id ----- bpp R  G  B  Fast LD3 */
shun-iwasawa 82a8f5
generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 1
shun-iwasawa 82a8f5
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 1
shun-iwasawa 82a8f5
generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2, 1
shun-iwasawa 82a8f5
generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0, 1
shun-iwasawa 82a8f5
generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1, 1
shun-iwasawa 82a8f5
generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3, 1
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * Second instantiation of the two 24-bpp ids with "Fast LD3" = 0; these are
 * emitted as jsimd_<id>_ycc_convert_neon_slowld3 and load full 8-pixel
 * groups with single-byte ld1 instead of ld3.
 */
generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 0
shun-iwasawa 82a8f5
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 0
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/* The load/store helpers of this section are no longer needed. */
.purgem do_load
shun-iwasawa 82a8f5
.purgem do_store
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*****************************************************************************/
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
shun-iwasawa 82a8f5
 * Load data into workspace, applying unsigned->signed conversion
shun-iwasawa 82a8f5
 *
shun-iwasawa 82a8f5
 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
shun-iwasawa 82a8f5
 *       rid of VST1.16 instructions
shun-iwasawa 82a8f5
 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * jsimd_convsamp_neon
 *
 * Register arguments:
 *   x0  SAMPLE_DATA  array of eight row pointers (read with four ldp)
 *   w1  START_COL    byte offset added to every row pointer; only the low
 *                    32 bits are used
 *   x2  WORKSPACE    destination for 8 x 8 16-bit values (128 bytes)
 *
 * For each of the eight rows, eight bytes are read at row + START_COL,
 * 128 is subtracted from each with a widening subtract, and the 16-bit
 * results are stored to WORKSPACE row after row.
 *
 * Modifies x0-x4, x9-x15, v0 and v16-v23.  Returns with "ret".
 */
asm_function jsimd_convsamp_neon
    SAMPLE_DATA     .req x0
    START_COL       .req x1
    WORKSPACE       .req x2
    TMP1            .req x9
    TMP2            .req x10
    TMP3            .req x11
    TMP4            .req x12
    TMP5            .req x13
    TMP6            .req x14
    TMP7            .req x15
    TMP8            .req x4
    TMPDUP          .req w3

    /* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x1 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw            x1, w1

    /* v0 = 128 in every byte lane.  Pointer fetches, address adjustment,
     * sample loads and subtractions are interleaved below. */
    mov             TMPDUP, #128
    ldp             TMP1, TMP2, [SAMPLE_DATA], 16
    ldp             TMP3, TMP4, [SAMPLE_DATA], 16
    dup             v0.8b, TMPDUP
    add             TMP1, TMP1, START_COL
    add             TMP2, TMP2, START_COL
    ldp             TMP5, TMP6, [SAMPLE_DATA], 16
    add             TMP3, TMP3, START_COL
    add             TMP4, TMP4, START_COL
    ldp             TMP7, TMP8, [SAMPLE_DATA], 16
    add             TMP5, TMP5, START_COL
    add             TMP6, TMP6, START_COL
    ld1             {v16.8b}, [TMP1]
    add             TMP7, TMP7, START_COL
    add             TMP8, TMP8, START_COL
    ld1             {v17.8b}, [TMP2]
    usubl           v16.8h, v16.8b, v0.8b
    ld1             {v18.8b}, [TMP3]
    usubl           v17.8h, v17.8b, v0.8b
    ld1             {v19.8b}, [TMP4]
    usubl           v18.8h, v18.8b, v0.8b
    ld1             {v20.8b}, [TMP5]
    usubl           v19.8h, v19.8b, v0.8b
    ld1             {v21.8b}, [TMP6]
    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64  /* rows 0-3 */
    usubl           v20.8h, v20.8b, v0.8b
    ld1             {v22.8b}, [TMP7]
    usubl           v21.8h, v21.8b, v0.8b
    ld1             {v23.8b}, [TMP8]
    usubl           v22.8h, v22.8b, v0.8b
    usubl           v23.8h, v23.8b, v0.8b
    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64  /* rows 4-7 */

    ret

    .unreq          SAMPLE_DATA
    .unreq          START_COL
    .unreq          WORKSPACE
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
    .unreq          TMP5
    .unreq          TMP6
    .unreq          TMP7
    .unreq          TMP8
    .unreq          TMPDUP
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*****************************************************************************/
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
shun-iwasawa 82a8f5
 * jsimd_fdct_islow_neon
shun-iwasawa 82a8f5
 *
shun-iwasawa 82a8f5
 * This file contains a slower but more accurate integer implementation of the
shun-iwasawa 82a8f5
 * forward DCT (Discrete Cosine Transform). The following code is based
shun-iwasawa 82a8f5
 * directly on the IJG's original jfdctint.c; see jfdctint.c for
shun-iwasawa 82a8f5
 * more details.
shun-iwasawa 82a8f5
 *
shun-iwasawa 82a8f5
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
shun-iwasawa 82a8f5
 *       rid of a bunch of VLD1.16 instructions
shun-iwasawa 82a8f5
 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * Fixed-point parameters of jsimd_fdct_islow_neon.  CONST_BITS and
 * PASS1_BITS only feed the two descale shift counts defined below.
 */
#define CONST_BITS  13
shun-iwasawa 82a8f5
#define PASS1_BITS  2
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * Right-shift counts for the narrowing shifts: DESCALE_P1 is used by the
 * first 1-D pass, whose outputs stay scaled up by PASS1_BITS, and
 * DESCALE_P2 by the second pass.
 */
#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
shun-iwasawa 82a8f5
#define DESCALE_P2  (CONST_BITS + PASS1_BITS)
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * Names for the 16-bit lanes of v0/v1, which the function fills from
 * Ljsimd_fdct_islow_neon_consts.  The digits abbreviate the FIX_x_xxx
 * constant of jfdctint.c quoted in the code comments; the N_ names are
 * paired there with the negated constant (e.g. -FIX_0_899976223), the P_
 * names with the constant as is.
 */
#define XFIX_P_0_298  v0.h[0]
shun-iwasawa 82a8f5
#define XFIX_N_0_390  v0.h[1]
shun-iwasawa 82a8f5
#define XFIX_P_0_541  v0.h[2]
shun-iwasawa 82a8f5
#define XFIX_P_0_765  v0.h[3]
shun-iwasawa 82a8f5
#define XFIX_N_0_899  v0.h[4]
shun-iwasawa 82a8f5
#define XFIX_P_1_175  v0.h[5]
shun-iwasawa 82a8f5
#define XFIX_P_1_501  v0.h[6]
shun-iwasawa 82a8f5
#define XFIX_N_1_847  v0.h[7]
shun-iwasawa 82a8f5
#define XFIX_N_1_961  v1.h[0]
shun-iwasawa 82a8f5
#define XFIX_P_2_053  v1.h[1]
shun-iwasawa 82a8f5
#define XFIX_N_2_562  v1.h[2]
shun-iwasawa 82a8f5
#define XFIX_P_3_072  v1.h[3]
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
asm_function jsimd_fdct_islow_neon
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    DATA            .req x0
shun-iwasawa 82a8f5
    TMP             .req x9
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Load constants */
shun-iwasawa 82a8f5
    get_symbol_loc  TMP, Ljsimd_fdct_islow_neon_consts
shun-iwasawa 82a8f5
    ld1             {v0.8h, v1.8h}, [TMP]
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Save Neon registers */
shun-iwasawa 82a8f5
    sub             sp, sp, #64
shun-iwasawa 82a8f5
    mov             x10, sp
shun-iwasawa 82a8f5
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32
shun-iwasawa 82a8f5
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Load all DATA into Neon registers with the following allocation:
shun-iwasawa 82a8f5
     *       0 1 2 3 | 4 5 6 7
shun-iwasawa 82a8f5
     *      ---------+--------
shun-iwasawa 82a8f5
     *   0 | d16     | d17    | v16.8h
shun-iwasawa 82a8f5
     *   1 | d18     | d19    | v17.8h
shun-iwasawa 82a8f5
     *   2 | d20     | d21    | v18.8h
shun-iwasawa 82a8f5
     *   3 | d22     | d23    | v19.8h
shun-iwasawa 82a8f5
     *   4 | d24     | d25    | v20.8h
shun-iwasawa 82a8f5
     *   5 | d26     | d27    | v21.8h
shun-iwasawa 82a8f5
     *   6 | d28     | d29    | v22.8h
shun-iwasawa 82a8f5
     *   7 | d30     | d31    | v23.8h
shun-iwasawa 82a8f5
     */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
shun-iwasawa 82a8f5
    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
shun-iwasawa 82a8f5
    sub             DATA, DATA, #64
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Transpose */
shun-iwasawa 82a8f5
    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
shun-iwasawa 82a8f5
    /* 1-D FDCT */
shun-iwasawa 82a8f5
    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
shun-iwasawa 82a8f5
    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
shun-iwasawa 82a8f5
    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
shun-iwasawa 82a8f5
    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
shun-iwasawa 82a8f5
    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
shun-iwasawa 82a8f5
    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
shun-iwasawa 82a8f5
    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
shun-iwasawa 82a8f5
    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* even part */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
shun-iwasawa 82a8f5
    sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
shun-iwasawa 82a8f5
    add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
shun-iwasawa 82a8f5
    sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    add             v16.8h, v8.8h, v10.8h  /* tmp10 + tmp11 */
shun-iwasawa 82a8f5
    sub             v20.8h, v8.8h, v10.8h  /* tmp10 - tmp11 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    add             v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    shl             v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
shun-iwasawa 82a8f5
    shl             v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
shun-iwasawa 82a8f5
    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
shun-iwasawa 82a8f5
    mov             v22.16b, v18.16b
shun-iwasawa 82a8f5
    mov             v25.16b, v24.16b
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
shun-iwasawa 82a8f5
    smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
shun-iwasawa 82a8f5
    smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
shun-iwasawa 82a8f5
    smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    rshrn           v18.4h, v18.4s, #DESCALE_P1
shun-iwasawa 82a8f5
    rshrn           v22.4h, v22.4s, #DESCALE_P1
shun-iwasawa 82a8f5
    rshrn2          v18.8h, v24.4s, #DESCALE_P1  /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
shun-iwasawa 82a8f5
    rshrn2          v22.8h, v25.4s, #DESCALE_P1  /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Odd part */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    add             v8.8h, v28.8h, v31.8h        /* z1 = tmp4 + tmp7; */
shun-iwasawa 82a8f5
    add             v9.8h, v29.8h, v30.8h        /* z2 = tmp5 + tmp6; */
shun-iwasawa 82a8f5
    add             v10.8h, v28.8h, v30.8h       /* z3 = tmp4 + tmp6; */
shun-iwasawa 82a8f5
    add             v11.8h, v29.8h, v31.8h       /* z4 = tmp5 + tmp7; */
shun-iwasawa 82a8f5
    smull           v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
shun-iwasawa 82a8f5
    smull2          v5.4s, v10.8h, XFIX_P_1_175
shun-iwasawa 82a8f5
    smlal           v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
shun-iwasawa 82a8f5
    smlal2          v5.4s, v11.8h, XFIX_P_1_175
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    smull2          v24.4s, v28.8h, XFIX_P_0_298
shun-iwasawa 82a8f5
    smull2          v25.4s, v29.8h, XFIX_P_2_053
shun-iwasawa 82a8f5
    smull2          v26.4s, v30.8h, XFIX_P_3_072
shun-iwasawa 82a8f5
    smull2          v27.4s, v31.8h, XFIX_P_1_501
shun-iwasawa 82a8f5
    smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
shun-iwasawa 82a8f5
    smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
shun-iwasawa 82a8f5
    smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
shun-iwasawa 82a8f5
    smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    smull2          v12.4s, v8.8h, XFIX_N_0_899
shun-iwasawa 82a8f5
    smull2          v13.4s, v9.8h, XFIX_N_2_562
shun-iwasawa 82a8f5
    smull2          v14.4s, v10.8h, XFIX_N_1_961
shun-iwasawa 82a8f5
    smull2          v15.4s, v11.8h, XFIX_N_0_390
shun-iwasawa 82a8f5
    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
shun-iwasawa 82a8f5
    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
shun-iwasawa 82a8f5
    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
shun-iwasawa 82a8f5
    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    add             v10.4s, v10.4s, v4.4s  /* z3 += z5 */
shun-iwasawa 82a8f5
    add             v14.4s, v14.4s, v5.4s
shun-iwasawa 82a8f5
    add             v11.4s, v11.4s, v4.4s  /* z4 += z5 */
shun-iwasawa 82a8f5
    add             v15.4s, v15.4s, v5.4s
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
shun-iwasawa 82a8f5
    add             v24.4s, v24.4s, v12.4s
shun-iwasawa 82a8f5
    add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
shun-iwasawa 82a8f5
    add             v25.4s, v25.4s, v13.4s
shun-iwasawa 82a8f5
    add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
shun-iwasawa 82a8f5
    add             v26.4s, v26.4s, v14.4s
shun-iwasawa 82a8f5
    add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
shun-iwasawa 82a8f5
    add             v27.4s, v27.4s, v15.4s
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
shun-iwasawa 82a8f5
    add             v24.4s, v24.4s, v14.4s
shun-iwasawa 82a8f5
    add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
shun-iwasawa 82a8f5
    add             v25.4s, v25.4s, v15.4s
shun-iwasawa 82a8f5
    add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
shun-iwasawa 82a8f5
    add             v26.4s, v26.4s, v13.4s
shun-iwasawa 82a8f5
    add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
shun-iwasawa 82a8f5
    add             v27.4s, v27.4s, v12.4s
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    rshrn           v23.4h, v28.4s, #DESCALE_P1
shun-iwasawa 82a8f5
    rshrn           v21.4h, v29.4s, #DESCALE_P1
shun-iwasawa 82a8f5
    rshrn           v19.4h, v30.4s, #DESCALE_P1
shun-iwasawa 82a8f5
    rshrn           v17.4h, v31.4s, #DESCALE_P1
shun-iwasawa 82a8f5
    rshrn2          v23.8h, v24.4s, #DESCALE_P1  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
shun-iwasawa 82a8f5
    rshrn2          v21.8h, v25.4s, #DESCALE_P1  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
shun-iwasawa 82a8f5
    rshrn2          v19.8h, v26.4s, #DESCALE_P1  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
shun-iwasawa 82a8f5
    rshrn2          v17.8h, v27.4s, #DESCALE_P1  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Transpose */
shun-iwasawa 82a8f5
    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* 1-D FDCT */
shun-iwasawa 82a8f5
    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
shun-iwasawa 82a8f5
    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
shun-iwasawa 82a8f5
    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
shun-iwasawa 82a8f5
    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
shun-iwasawa 82a8f5
    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
shun-iwasawa 82a8f5
    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
shun-iwasawa 82a8f5
    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
shun-iwasawa 82a8f5
    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* even part */
shun-iwasawa 82a8f5
    add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
shun-iwasawa 82a8f5
    sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
shun-iwasawa 82a8f5
    add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
shun-iwasawa 82a8f5
    sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    add             v16.8h, v8.8h, v10.8h  /* tmp10 + tmp11 */
shun-iwasawa 82a8f5
    sub             v20.8h, v8.8h, v10.8h  /* tmp10 - tmp11 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    add             v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    srshr           v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS); */
shun-iwasawa 82a8f5
    srshr           v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS); */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
shun-iwasawa 82a8f5
    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
shun-iwasawa 82a8f5
    mov             v22.16b, v18.16b
shun-iwasawa 82a8f5
    mov             v25.16b, v24.16b
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
shun-iwasawa 82a8f5
    smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
shun-iwasawa 82a8f5
    smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
shun-iwasawa 82a8f5
    smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    rshrn           v18.4h, v18.4s, #DESCALE_P2
shun-iwasawa 82a8f5
    rshrn           v22.4h, v22.4s, #DESCALE_P2
shun-iwasawa 82a8f5
    rshrn2          v18.8h, v24.4s, #DESCALE_P2  /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
shun-iwasawa 82a8f5
    rshrn2          v22.8h, v25.4s, #DESCALE_P2  /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Odd part */
shun-iwasawa 82a8f5
    add             v8.8h, v28.8h, v31.8h   /* z1 = tmp4 + tmp7; */
shun-iwasawa 82a8f5
    add             v9.8h, v29.8h, v30.8h   /* z2 = tmp5 + tmp6; */
shun-iwasawa 82a8f5
    add             v10.8h, v28.8h, v30.8h  /* z3 = tmp4 + tmp6; */
shun-iwasawa 82a8f5
    add             v11.8h, v29.8h, v31.8h  /* z4 = tmp5 + tmp7; */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    smull           v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
shun-iwasawa 82a8f5
    smull2          v5.4s, v10.8h, XFIX_P_1_175
shun-iwasawa 82a8f5
    smlal           v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
shun-iwasawa 82a8f5
    smlal2          v5.4s, v11.8h, XFIX_P_1_175
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    smull2          v24.4s, v28.8h, XFIX_P_0_298
shun-iwasawa 82a8f5
    smull2          v25.4s, v29.8h, XFIX_P_2_053
shun-iwasawa 82a8f5
    smull2          v26.4s, v30.8h, XFIX_P_3_072
shun-iwasawa 82a8f5
    smull2          v27.4s, v31.8h, XFIX_P_1_501
shun-iwasawa 82a8f5
    smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
shun-iwasawa 82a8f5
    smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
shun-iwasawa 82a8f5
    smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
shun-iwasawa 82a8f5
    smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    smull2          v12.4s, v8.8h, XFIX_N_0_899
shun-iwasawa 82a8f5
    smull2          v13.4s, v9.8h, XFIX_N_2_562
shun-iwasawa 82a8f5
    smull2          v14.4s, v10.8h, XFIX_N_1_961
shun-iwasawa 82a8f5
    smull2          v15.4s, v11.8h, XFIX_N_0_390
shun-iwasawa 82a8f5
    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
shun-iwasawa 82a8f5
    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
shun-iwasawa 82a8f5
    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
shun-iwasawa 82a8f5
    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    add             v10.4s, v10.4s, v4.4s
shun-iwasawa 82a8f5
    add             v14.4s, v14.4s, v5.4s
shun-iwasawa 82a8f5
    add             v11.4s, v11.4s, v4.4s
shun-iwasawa 82a8f5
    add             v15.4s, v15.4s, v5.4s
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
shun-iwasawa 82a8f5
    add             v24.4s, v24.4s, v12.4s
shun-iwasawa 82a8f5
    add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
shun-iwasawa 82a8f5
    add             v25.4s, v25.4s, v13.4s
shun-iwasawa 82a8f5
    add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
shun-iwasawa 82a8f5
    add             v26.4s, v26.4s, v14.4s
shun-iwasawa 82a8f5
    add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
shun-iwasawa 82a8f5
    add             v27.4s, v27.4s, v15.4s
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
shun-iwasawa 82a8f5
    add             v24.4s, v24.4s, v14.4s
shun-iwasawa 82a8f5
    add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
shun-iwasawa 82a8f5
    add             v25.4s, v25.4s, v15.4s
shun-iwasawa 82a8f5
    add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
shun-iwasawa 82a8f5
    add             v26.4s, v26.4s, v13.4s
shun-iwasawa 82a8f5
    add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
shun-iwasawa 82a8f5
    add             v27.4s, v27.4s, v12.4s
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    rshrn           v23.4h, v28.4s, #DESCALE_P2
shun-iwasawa 82a8f5
    rshrn           v21.4h, v29.4s, #DESCALE_P2
shun-iwasawa 82a8f5
    rshrn           v19.4h, v30.4s, #DESCALE_P2
shun-iwasawa 82a8f5
    rshrn           v17.4h, v31.4s, #DESCALE_P2
shun-iwasawa 82a8f5
    rshrn2          v23.8h, v24.4s, #DESCALE_P2  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
shun-iwasawa 82a8f5
    rshrn2          v21.8h, v25.4s, #DESCALE_P2  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
shun-iwasawa 82a8f5
    rshrn2          v19.8h, v26.4s, #DESCALE_P2  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
shun-iwasawa 82a8f5
    rshrn2          v17.8h, v27.4s, #DESCALE_P2  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* store results */
shun-iwasawa 82a8f5
    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
shun-iwasawa 82a8f5
    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Restore Neon registers */
shun-iwasawa 82a8f5
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
shun-iwasawa 82a8f5
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    br              x30
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    .unreq          DATA
shun-iwasawa 82a8f5
    .unreq          TMP
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
#undef XFIX_P_0_298
shun-iwasawa 82a8f5
#undef XFIX_N_0_390
shun-iwasawa 82a8f5
#undef XFIX_P_0_541
shun-iwasawa 82a8f5
#undef XFIX_P_0_765
shun-iwasawa 82a8f5
#undef XFIX_N_0_899
shun-iwasawa 82a8f5
#undef XFIX_P_1_175
shun-iwasawa 82a8f5
#undef XFIX_P_1_501
shun-iwasawa 82a8f5
#undef XFIX_N_1_847
shun-iwasawa 82a8f5
#undef XFIX_N_1_961
shun-iwasawa 82a8f5
#undef XFIX_P_2_053
shun-iwasawa 82a8f5
#undef XFIX_N_2_562
shun-iwasawa 82a8f5
#undef XFIX_P_3_072
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*****************************************************************************/
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * jsimd_fdct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the forward DCT (Discrete Cosine Transform). It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
 * function from jfdctfst.c
 *
 * In:    x0 (DATA) = address of an 8x8 block of 16-bit elements (128 bytes).
 *        The block is loaded, transformed and stored back in place.
 * Out:   nothing is returned; x0 is left pointing 64 bytes into the block.
 * Uses:  x9, v0-v7, v16-v23, v28, v29 and the condition flags, plus whatever
 *        the get_symbol_loc and transpose_8x8 macros touch internally.
 *        No callee-saved register is written directly by this code.
 *
 * The routine performs two identical passes (transpose followed by a 1-D
 * FDCT across the eight row registers), so the second pass operates on the
 * other dimension of the block.
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
 *       rid of a bunch of VLD1.16 instructions
 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
#undef XFIX_0_541196100
shun-iwasawa 82a8f5
#define XFIX_0_382683433  v0.h[0]
shun-iwasawa 82a8f5
#define XFIX_0_541196100  v0.h[1]
shun-iwasawa 82a8f5
#define XFIX_0_707106781  v0.h[2]
shun-iwasawa 82a8f5
#define XFIX_1_306562965  v0.h[3]
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
asm_function jsimd_fdct_ifast_neon
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    DATA            .req x0
shun-iwasawa 82a8f5
    TMP             .req x9
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Load constants (four 16-bit multipliers into the low half of v0) */
shun-iwasawa 82a8f5
    get_symbol_loc  TMP, Ljsimd_fdct_ifast_neon_consts
shun-iwasawa 82a8f5
    ld1             {v0.4h}, [TMP]
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Load all DATA into Neon registers, one 8-element row of 16-bit values
     * per register:
     *
     *   row 0 -> v16.8h      row 4 -> v20.8h
     *   row 1 -> v17.8h      row 5 -> v21.8h
     *   row 2 -> v18.8h      row 6 -> v22.8h
     *   row 3 -> v19.8h      row 7 -> v23.8h
     */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
shun-iwasawa 82a8f5
    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
shun-iwasawa 82a8f5
    mov             TMP, #2                 /* pass counter: two 1-D passes */
shun-iwasawa 82a8f5
    sub             DATA, DATA, #64         /* rewind to the start of the block */
shun-iwasawa 82a8f5
1:
shun-iwasawa 82a8f5
    /* Transpose (v1-v4 are handed to the macro as scratch registers) */
shun-iwasawa 82a8f5
    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4
shun-iwasawa 82a8f5
    /* Flags set here are consumed by the b.ne at the bottom of the pass;
     * none of the vector instructions in between write NZCV. */
    subs            TMP, TMP, #1
shun-iwasawa 82a8f5
    /* 1-D FDCT; names below follow jpeg_fdct_ifast, dN = register v(16+N) */
shun-iwasawa 82a8f5
    add             v4.8h, v19.8h, v20.8h   /* tmp3 = d3 + d4 */
shun-iwasawa 82a8f5
    sub             v20.8h, v19.8h, v20.8h  /* tmp4 = d3 - d4 */
shun-iwasawa 82a8f5
    sub             v28.8h, v18.8h, v21.8h  /* tmp5 = d2 - d5 */
shun-iwasawa 82a8f5
    add             v18.8h, v18.8h, v21.8h  /* tmp2 = d2 + d5 */
shun-iwasawa 82a8f5
    sub             v29.8h, v17.8h, v22.8h  /* tmp6 = d1 - d6 */
shun-iwasawa 82a8f5
    add             v17.8h, v17.8h, v22.8h  /* tmp1 = d1 + d6 */
shun-iwasawa 82a8f5
    sub             v21.8h, v16.8h, v23.8h  /* tmp7 = d0 - d7 */
shun-iwasawa 82a8f5
    add             v16.8h, v16.8h, v23.8h  /* tmp0 = d0 + d7 */
shun-iwasawa 82a8f5
    sub             v6.8h, v17.8h, v18.8h   /* tmp12 = tmp1 - tmp2 */
shun-iwasawa 82a8f5
    sub             v7.8h, v16.8h, v4.8h    /* tmp13 = tmp0 - tmp3 */
shun-iwasawa 82a8f5
    add             v5.8h, v17.8h, v18.8h   /* tmp11 = tmp1 + tmp2 */
shun-iwasawa 82a8f5
    add             v6.8h, v6.8h, v7.8h     /* tmp12 + tmp13 */
shun-iwasawa 82a8f5
    add             v4.8h, v16.8h, v4.8h    /* tmp10 = tmp0 + tmp3 */
shun-iwasawa 82a8f5
    sqdmulh         v6.8h, v6.8h, XFIX_0_707106781  /* z1 = (tmp12 + tmp13) * c */
shun-iwasawa 82a8f5
    add             v19.8h, v20.8h, v28.8h  /* odd tmp10 = tmp4 + tmp5 */
shun-iwasawa 82a8f5
    add             v16.8h, v4.8h, v5.8h    /* out[0] = tmp10 + tmp11 */
shun-iwasawa 82a8f5
    sub             v20.8h, v4.8h, v5.8h    /* out[4] = tmp10 - tmp11 */
shun-iwasawa 82a8f5
    add             v5.8h, v28.8h, v29.8h   /* odd tmp11 = tmp5 + tmp6 */
shun-iwasawa 82a8f5
    add             v29.8h, v29.8h, v21.8h  /* odd tmp12 = tmp6 + tmp7 */
shun-iwasawa 82a8f5
    sqdmulh         v5.8h, v5.8h, XFIX_0_707106781  /* z3 = odd tmp11 * c */
shun-iwasawa 82a8f5
    sub             v28.8h, v19.8h, v29.8h  /* odd tmp10 - odd tmp12 */
shun-iwasawa 82a8f5
    add             v18.8h, v7.8h, v6.8h    /* out[2] = tmp13 + z1 */
shun-iwasawa 82a8f5
    sqdmulh         v28.8h, v28.8h, XFIX_0_382683433  /* z5 */
shun-iwasawa 82a8f5
    sub             v22.8h, v7.8h, v6.8h    /* out[6] = tmp13 - z1 */
shun-iwasawa 82a8f5
    sqdmulh         v19.8h, v19.8h, XFIX_0_541196100  /* odd tmp10 * c */
shun-iwasawa 82a8f5
    /* NOTE(review): odd tmp12 itself is added to this product below, so the
     * table entry presumably holds only the fractional part of 1.3065...;
     * confirm against Ljsimd_fdct_ifast_neon_consts. */
    sqdmulh         v7.8h, v29.8h, XFIX_1_306562965
shun-iwasawa 82a8f5
    add             v6.8h, v21.8h, v5.8h    /* z11 = tmp7 + z3 */
shun-iwasawa 82a8f5
    sub             v5.8h, v21.8h, v5.8h    /* z13 = tmp7 - z3 */
shun-iwasawa 82a8f5
    add             v29.8h, v29.8h, v28.8h  /* odd tmp12 + z5 */
shun-iwasawa 82a8f5
    add             v19.8h, v19.8h, v28.8h  /* z2 = odd tmp10 * c + z5 */
shun-iwasawa 82a8f5
    add             v29.8h, v29.8h, v7.8h   /* z4 = odd tmp12 + product + z5 */
shun-iwasawa 82a8f5
    add             v21.8h, v5.8h, v19.8h   /* out[5] = z13 + z2 */
shun-iwasawa 82a8f5
    sub             v19.8h, v5.8h, v19.8h   /* out[3] = z13 - z2 */
shun-iwasawa 82a8f5
    add             v17.8h, v6.8h, v29.8h   /* out[1] = z11 + z4 */
shun-iwasawa 82a8f5
    sub             v23.8h, v6.8h, v29.8h   /* out[7] = z11 - z4 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    b.ne            1b                      /* run the second pass */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* store results */
shun-iwasawa 82a8f5
    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
shun-iwasawa 82a8f5
    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Use RET (implicitly x30) rather than BR x30: RET carries the
     * function-return hint for the branch predictor and is the form that
     * remains valid when branch target identification is enabled. */
    ret
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    .unreq          DATA
shun-iwasawa 82a8f5
    .unreq          TMP
shun-iwasawa 82a8f5
#undef XFIX_0_382683433
shun-iwasawa 82a8f5
#undef XFIX_0_541196100
shun-iwasawa 82a8f5
#undef XFIX_0_707106781
shun-iwasawa 82a8f5
#undef XFIX_1_306562965
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*****************************************************************************/
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * GLOBAL(void)
 * jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
 *                     DCTELEM *workspace);
 *
 * Quantizes the 64 16-bit values at 'workspace' and stores the 64 16-bit
 * results at 'coef_block'.  For every element x the code computes
 *
 *     q = (((|x| + correction) * reciprocal) >> 16) >> shift
 *
 * using unsigned arithmetic and then gives q the sign of x.
 *
 * Three 64-element tables of 16-bit values are read relative to 'divisors':
 *     reciprocal  at byte offset 0
 *     correction  at byte offset 64 * 2
 *     shift       at byte offset 64 * 6
 *
 * In:    x0 = coef_block, x1 = divisors, x2 = workspace
 * Out:   nothing is returned
 * Uses:  x0-x2 (post-incremented), x9-x11, v0-v7, v16-v31, condition flags
 */
shun-iwasawa 82a8f5
asm_function jsimd_quantize_neon
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    COEF_BLOCK      .req x0
shun-iwasawa 82a8f5
    DIVISORS        .req x1
shun-iwasawa 82a8f5
    WORKSPACE       .req x2
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* RECIPROCAL shares x1 with DIVISORS: the reciprocal table sits at
     * offset 0, so the loop's post-increment walks x1 itself.  The other
     * two table pointers are therefore derived before the loop starts. */
    RECIPROCAL      .req DIVISORS
shun-iwasawa 82a8f5
    CORRECTION      .req x9
shun-iwasawa 82a8f5
    SHIFT           .req x10
shun-iwasawa 82a8f5
    LOOP_COUNT      .req x11
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    mov             LOOP_COUNT, #2          /* 2 iterations x 32 elements */
shun-iwasawa 82a8f5
    add             CORRECTION, DIVISORS, #(64 * 2)
shun-iwasawa 82a8f5
    add             SHIFT, DIVISORS, #(64 * 6)
shun-iwasawa 82a8f5
1:
shun-iwasawa 82a8f5
    /* Flags from this subs feed the b.ne at the bottom of the loop; the
     * vector instructions in between do not write NZCV. */
    subs            LOOP_COUNT, LOOP_COUNT, #1
shun-iwasawa 82a8f5
    ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64
shun-iwasawa 82a8f5
    ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64
shun-iwasawa 82a8f5
    abs             v20.8h, v0.8h           /* |x| */
shun-iwasawa 82a8f5
    abs             v21.8h, v1.8h
shun-iwasawa 82a8f5
    abs             v22.8h, v2.8h
shun-iwasawa 82a8f5
    abs             v23.8h, v3.8h
shun-iwasawa 82a8f5
    ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64
shun-iwasawa 82a8f5
    add             v20.8h, v20.8h, v4.8h  /* add correction */
shun-iwasawa 82a8f5
    add             v21.8h, v21.8h, v5.8h
shun-iwasawa 82a8f5
    add             v22.8h, v22.8h, v6.8h
shun-iwasawa 82a8f5
    add             v23.8h, v23.8h, v7.8h
shun-iwasawa 82a8f5
    /* Widening unsigned multiply: low four lanes go to v4-v7, high four
     * lanes to v16-v19, each as 32-bit products. */
    umull           v4.4s, v20.4h, v28.4h  /* multiply by reciprocal */
shun-iwasawa 82a8f5
    umull2          v16.4s, v20.8h, v28.8h
shun-iwasawa 82a8f5
    umull           v5.4s, v21.4h, v29.4h
shun-iwasawa 82a8f5
    umull2          v17.4s, v21.8h, v29.8h
shun-iwasawa 82a8f5
    umull           v6.4s, v22.4h, v30.4h  /* multiply by reciprocal */
shun-iwasawa 82a8f5
    umull2          v18.4s, v22.8h, v30.8h
shun-iwasawa 82a8f5
    umull           v7.4s, v23.4h, v31.4h
shun-iwasawa 82a8f5
    umull2          v19.4s, v23.8h, v31.8h
shun-iwasawa 82a8f5
    ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64
shun-iwasawa 82a8f5
    /* Keep the upper 16 bits of every 32-bit product and repack the two
     * halves into one 8-lane register per row. */
    shrn            v4.4h, v4.4s, #16
shun-iwasawa 82a8f5
    shrn            v5.4h, v5.4s, #16
shun-iwasawa 82a8f5
    shrn            v6.4h, v6.4s, #16
shun-iwasawa 82a8f5
    shrn            v7.4h, v7.4s, #16
shun-iwasawa 82a8f5
    shrn2           v4.8h, v16.4s, #16
shun-iwasawa 82a8f5
    shrn2           v5.8h, v17.4s, #16
shun-iwasawa 82a8f5
    shrn2           v6.8h, v18.4s, #16
shun-iwasawa 82a8f5
    shrn2           v7.8h, v19.4s, #16
shun-iwasawa 82a8f5
    /* USHL shifts left by a per-lane register amount; negating the amount
     * turns it into the per-lane logical right shift needed here. */
    neg             v24.8h, v24.8h
shun-iwasawa 82a8f5
    neg             v25.8h, v25.8h
shun-iwasawa 82a8f5
    neg             v26.8h, v26.8h
shun-iwasawa 82a8f5
    neg             v27.8h, v27.8h
shun-iwasawa 82a8f5
    /* sign = x >> 15 (arithmetic): 0 for x >= 0, all ones for x < 0 */
    sshr            v0.8h, v0.8h, #15  /* extract sign */
shun-iwasawa 82a8f5
    sshr            v1.8h, v1.8h, #15
shun-iwasawa 82a8f5
    sshr            v2.8h, v2.8h, #15
shun-iwasawa 82a8f5
    sshr            v3.8h, v3.8h, #15
shun-iwasawa 82a8f5
    ushl            v4.8h, v4.8h, v24.8h  /* shift */
shun-iwasawa 82a8f5
    ushl            v5.8h, v5.8h, v25.8h
shun-iwasawa 82a8f5
    ushl            v6.8h, v6.8h, v26.8h
shun-iwasawa 82a8f5
    ushl            v7.8h, v7.8h, v27.8h
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* (q ^ sign) - sign leaves q unchanged when sign == 0 and negates it
     * (two's complement) when sign == -1. */
    eor             v4.16b, v4.16b, v0.16b  /* restore sign */
shun-iwasawa 82a8f5
    eor             v5.16b, v5.16b, v1.16b
shun-iwasawa 82a8f5
    eor             v6.16b, v6.16b, v2.16b
shun-iwasawa 82a8f5
    eor             v7.16b, v7.16b, v3.16b
shun-iwasawa 82a8f5
    sub             v4.8h, v4.8h, v0.8h
shun-iwasawa 82a8f5
    sub             v5.8h, v5.8h, v1.8h
shun-iwasawa 82a8f5
    sub             v6.8h, v6.8h, v2.8h
shun-iwasawa 82a8f5
    sub             v7.8h, v7.8h, v3.8h
shun-iwasawa 82a8f5
    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    b.ne            1b
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* RET (implicitly x30) instead of BR x30: it carries the return hint
     * for the branch predictor and stays valid under branch target
     * identification. */
    ret  /* return */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    .unreq          COEF_BLOCK
shun-iwasawa 82a8f5
    .unreq          DIVISORS
shun-iwasawa 82a8f5
    .unreq          WORKSPACE
shun-iwasawa 82a8f5
    .unreq          RECIPROCAL
shun-iwasawa 82a8f5
    .unreq          CORRECTION
shun-iwasawa 82a8f5
    .unreq          SHIFT
shun-iwasawa 82a8f5
    .unreq          LOOP_COUNT
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*****************************************************************************/
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * Downsample pixel values of a single component.
 * This version handles the common case of 2:1 horizontal and 1:1 vertical,
 * without smoothing.
 *
 * GLOBAL(void)
 * jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
 *                            JDIMENSION v_samp_factor,
 *                            JDIMENSION width_in_blocks,
 *                            JSAMPARRAY input_data, JSAMPARRAY output_data);
 *
 * For each of v_samp_factor rows, every pair of adjacent input bytes (a, b)
 * produces one output byte (a + b + bias) >> 1, where bias alternates
 * 0, 1, 0, 1, ... along the row.  Each row is processed as width_in_blocks
 * chunks of 16 input bytes / 8 output bytes; the final chunk is first passed
 * through a byte-permute table chosen by (16 * width_in_blocks - image_width).
 *
 * In:    w0 = image_width, w1 = max_v_samp_factor (unused),
 *        w2 = v_samp_factor, w3 = width_in_blocks,
 *        x4 = input_data (array of row pointers),
 *        x5 = output_data (array of row pointers)
 * Uses:  x0, x2-x5, x9-x13, w15, v0, v2, v4, v6, v16, v18, condition flags,
 *        plus whatever get_symbol_loc touches internally.
 * Note:  the row loop is do-while shaped, so v_samp_factor and
 *        width_in_blocks are assumed to be at least 1 - TODO confirm callers.
 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
asm_function jsimd_h2v1_downsample_neon
shun-iwasawa 82a8f5
    IMAGE_WIDTH     .req x0
shun-iwasawa 82a8f5
    MAX_V_SAMP      .req x1
shun-iwasawa 82a8f5
    V_SAMP          .req x2
shun-iwasawa 82a8f5
    BLOCK_WIDTH     .req x3
shun-iwasawa 82a8f5
    INPUT_DATA      .req x4
shun-iwasawa 82a8f5
    OUTPUT_DATA     .req x5
shun-iwasawa 82a8f5
    OUTPTR          .req x9
shun-iwasawa 82a8f5
    INPTR           .req x10
shun-iwasawa 82a8f5
    TMP1            .req x11
shun-iwasawa 82a8f5
    TMP2            .req x12
shun-iwasawa 82a8f5
    TMP3            .req x13
shun-iwasawa 82a8f5
    TMPDUP          .req w15
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* The three JDIMENSION arguments are 32-bit values.  AAPCS64 leaves the
     * upper 32 bits of their registers unspecified and makes the callee
     * responsible for narrowing, yet the code below consumes them as full
     * 64-bit X registers.  Writing a W register zeroes bits [63:32]. */
    mov             w0, w0                  /* zero-extend IMAGE_WIDTH */
    mov             w2, w2                  /* zero-extend V_SAMP */
    mov             w3, w3                  /* zero-extend BLOCK_WIDTH */
    mov             TMPDUP, #0x10000        /* 16-bit lanes: 0, 1, 0, 1, ... */
shun-iwasawa 82a8f5
    lsl             TMP2, BLOCK_WIDTH, #4
shun-iwasawa 82a8f5
    sub             TMP2, TMP2, IMAGE_WIDTH /* 16 * BLOCK_WIDTH - IMAGE_WIDTH */
shun-iwasawa 82a8f5
    get_symbol_loc  TMP3, Ljsimd_h2_downsample_neon_consts
shun-iwasawa 82a8f5
    add             TMP3, TMP3, TMP2, lsl #4  /* 16-byte table entries */
shun-iwasawa 82a8f5
    dup             v16.4s, TMPDUP          /* v16 = rounding bias pattern */
shun-iwasawa 82a8f5
    /* v18 = byte indices for the tbl in the last-chunk path; presumably they
     * replicate the last valid pixel into the padding - verify against the
     * consts table. */
    ld1             {v18.16b}, [TMP3]
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
1:  /* row loop */
shun-iwasawa 82a8f5
    ldr             INPTR, [INPUT_DATA], #8
shun-iwasawa 82a8f5
    ldr             OUTPTR, [OUTPUT_DATA], #8
shun-iwasawa 82a8f5
    subs            TMP1, BLOCK_WIDTH, #1   /* chunks before the last one */
shun-iwasawa 82a8f5
    b.eq            3f
shun-iwasawa 82a8f5
2:  /* columns */
shun-iwasawa 82a8f5
    ld1             {v0.16b}, [INPTR], #16
shun-iwasawa 82a8f5
    mov             v4.16b, v16.16b         /* start from the bias */
shun-iwasawa 82a8f5
    subs            TMP1, TMP1, #1          /* flags used by b.ne 2b below */
shun-iwasawa 82a8f5
    uadalp          v4.8h, v0.16b           /* += sum of each adjacent byte pair */
shun-iwasawa 82a8f5
    shrn            v6.8b, v4.8h, #1
shun-iwasawa 82a8f5
    st1             {v6.8b}, [OUTPTR], #8
shun-iwasawa 82a8f5
    b.ne            2b
shun-iwasawa 82a8f5
3:  /* last columns */
shun-iwasawa 82a8f5
    ld1             {v0.16b}, [INPTR]
shun-iwasawa 82a8f5
    mov             v4.16b, v16.16b
shun-iwasawa 82a8f5
    subs            V_SAMP, V_SAMP, #1      /* flags used by b.ne 1b below */
shun-iwasawa 82a8f5
    /* expand right */
shun-iwasawa 82a8f5
    tbl             v2.16b, {v0.16b}, v18.16b
shun-iwasawa 82a8f5
    uadalp          v4.8h, v2.16b
shun-iwasawa 82a8f5
    shrn            v6.8b, v4.8h, #1
shun-iwasawa 82a8f5
    st1             {v6.8b}, [OUTPTR], #8
shun-iwasawa 82a8f5
    b.ne            1b
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* RET (implicitly x30) instead of BR x30: it carries the return hint
     * for the branch predictor and stays valid under branch target
     * identification. */
    ret
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    .unreq          IMAGE_WIDTH
shun-iwasawa 82a8f5
    .unreq          MAX_V_SAMP
shun-iwasawa 82a8f5
    .unreq          V_SAMP
shun-iwasawa 82a8f5
    .unreq          BLOCK_WIDTH
shun-iwasawa 82a8f5
    .unreq          INPUT_DATA
shun-iwasawa 82a8f5
    .unreq          OUTPUT_DATA
shun-iwasawa 82a8f5
    .unreq          OUTPTR
shun-iwasawa 82a8f5
    .unreq          INPTR
shun-iwasawa 82a8f5
    .unreq          TMP1
shun-iwasawa 82a8f5
    .unreq          TMP2
shun-iwasawa 82a8f5
    .unreq          TMP3
shun-iwasawa 82a8f5
    .unreq          TMPDUP
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*****************************************************************************/
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * Downsample pixel values of a single component.
 * This version handles the common case of 2:1 horizontal and 2:1 vertical,
 * without smoothing.
 *
 * GLOBAL(void)
 * jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
 *                            JDIMENSION v_samp_factor,
 *                            JDIMENSION width_in_blocks,
 *                            JSAMPARRAY input_data, JSAMPARRAY output_data);
 *
 * Each output row is built from two consecutive input rows.  Every 2x2 group
 * of input bytes (a, b / c, d) produces one output byte
 * (a + b + c + d + bias) >> 2, where bias alternates 1, 2, 1, 2, ... along
 * the row.  Each output row is processed as width_in_blocks chunks of
 * 16 input bytes per row / 8 output bytes; the final chunk of both input
 * rows is first passed through a byte-permute table chosen by
 * (16 * width_in_blocks - image_width).
 *
 * In:    w0 = image_width, w1 = max_v_samp_factor (unused),
 *        w2 = v_samp_factor (number of output rows),
 *        w3 = width_in_blocks,
 *        x4 = input_data (array of row pointers, two consumed per output row),
 *        x5 = output_data (array of row pointers)
 * Uses:  x0, x2-x5, x9-x14, w15, v0-v4, v6, v16, v18, condition flags,
 *        plus whatever get_symbol_loc touches internally.
 * Note:  the row loop is do-while shaped, so v_samp_factor and
 *        width_in_blocks are assumed to be at least 1 - TODO confirm callers.
 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
.balign 16
shun-iwasawa 82a8f5
asm_function jsimd_h2v2_downsample_neon
shun-iwasawa 82a8f5
    IMAGE_WIDTH     .req x0
shun-iwasawa 82a8f5
    MAX_V_SAMP      .req x1
shun-iwasawa 82a8f5
    V_SAMP          .req x2
shun-iwasawa 82a8f5
    BLOCK_WIDTH     .req x3
shun-iwasawa 82a8f5
    INPUT_DATA      .req x4
shun-iwasawa 82a8f5
    OUTPUT_DATA     .req x5
shun-iwasawa 82a8f5
    OUTPTR          .req x9
shun-iwasawa 82a8f5
    INPTR0          .req x10
shun-iwasawa 82a8f5
    INPTR1          .req x14
shun-iwasawa 82a8f5
    TMP1            .req x11
shun-iwasawa 82a8f5
    TMP2            .req x12
shun-iwasawa 82a8f5
    TMP3            .req x13
shun-iwasawa 82a8f5
    TMPDUP          .req w15
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* The three JDIMENSION arguments are 32-bit values.  AAPCS64 leaves the
     * upper 32 bits of their registers unspecified and makes the callee
     * responsible for narrowing, yet the code below consumes them as full
     * 64-bit X registers.  Writing a W register zeroes bits [63:32]. */
    mov             w0, w0                  /* zero-extend IMAGE_WIDTH */
    mov             w2, w2                  /* zero-extend V_SAMP */
    mov             w3, w3                  /* zero-extend BLOCK_WIDTH */
    mov             TMPDUP, #1
shun-iwasawa 82a8f5
    lsl             TMP2, BLOCK_WIDTH, #4
shun-iwasawa 82a8f5
    lsl             TMPDUP, TMPDUP, #17
shun-iwasawa 82a8f5
    sub             TMP2, TMP2, IMAGE_WIDTH /* 16 * BLOCK_WIDTH - IMAGE_WIDTH */
shun-iwasawa 82a8f5
    get_symbol_loc  TMP3, Ljsimd_h2_downsample_neon_consts
shun-iwasawa 82a8f5
    orr             TMPDUP, TMPDUP, #1      /* 0x00020001: 16-bit lanes 1, 2, ... */
shun-iwasawa 82a8f5
    add             TMP3, TMP3, TMP2, lsl #4  /* 16-byte table entries */
shun-iwasawa 82a8f5
    dup             v16.4s, TMPDUP          /* v16 = rounding bias pattern */
shun-iwasawa 82a8f5
    /* v18 = byte indices for the tbl in the last-chunk path; presumably they
     * replicate the last valid pixel into the padding - verify against the
     * consts table. */
    ld1             {v18.16b}, [TMP3]
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
1:  /* row loop */
shun-iwasawa 82a8f5
    ldr             INPTR0, [INPUT_DATA], #8
shun-iwasawa 82a8f5
    ldr             OUTPTR, [OUTPUT_DATA], #8
shun-iwasawa 82a8f5
    ldr             INPTR1, [INPUT_DATA], #8
shun-iwasawa 82a8f5
    subs            TMP1, BLOCK_WIDTH, #1   /* chunks before the last one */
shun-iwasawa 82a8f5
    b.eq            3f
shun-iwasawa 82a8f5
2:  /* columns */
shun-iwasawa 82a8f5
    ld1             {v0.16b}, [INPTR0], #16
shun-iwasawa 82a8f5
    ld1             {v1.16b}, [INPTR1], #16
shun-iwasawa 82a8f5
    mov             v4.16b, v16.16b         /* start from the bias */
shun-iwasawa 82a8f5
    subs            TMP1, TMP1, #1          /* flags used by b.ne 2b below */
shun-iwasawa 82a8f5
    uadalp          v4.8h, v0.16b           /* += byte pairs of the upper row */
shun-iwasawa 82a8f5
    uadalp          v4.8h, v1.16b           /* += byte pairs of the lower row */
shun-iwasawa 82a8f5
    shrn            v6.8b, v4.8h, #2
shun-iwasawa 82a8f5
    st1             {v6.8b}, [OUTPTR], #8
shun-iwasawa 82a8f5
    b.ne            2b
shun-iwasawa 82a8f5
3:  /* last columns */
shun-iwasawa 82a8f5
    ld1             {v0.16b}, [INPTR0], #16
shun-iwasawa 82a8f5
    ld1             {v1.16b}, [INPTR1], #16
shun-iwasawa 82a8f5
    mov             v4.16b, v16.16b
shun-iwasawa 82a8f5
    subs            V_SAMP, V_SAMP, #1      /* flags used by b.ne 1b below */
shun-iwasawa 82a8f5
    /* expand right */
shun-iwasawa 82a8f5
    tbl             v2.16b, {v0.16b}, v18.16b
shun-iwasawa 82a8f5
    tbl             v3.16b, {v1.16b}, v18.16b
shun-iwasawa 82a8f5
    uadalp          v4.8h, v2.16b
shun-iwasawa 82a8f5
    uadalp          v4.8h, v3.16b
shun-iwasawa 82a8f5
    shrn            v6.8b, v4.8h, #2
shun-iwasawa 82a8f5
    st1             {v6.8b}, [OUTPTR], #8
shun-iwasawa 82a8f5
    b.ne            1b
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* RET (implicitly x30) instead of BR x30: it carries the return hint
     * for the branch predictor and stays valid under branch target
     * identification. */
    ret
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    .unreq          IMAGE_WIDTH
shun-iwasawa 82a8f5
    .unreq          MAX_V_SAMP
shun-iwasawa 82a8f5
    .unreq          V_SAMP
shun-iwasawa 82a8f5
    .unreq          BLOCK_WIDTH
shun-iwasawa 82a8f5
    .unreq          INPUT_DATA
shun-iwasawa 82a8f5
    .unreq          OUTPUT_DATA
shun-iwasawa 82a8f5
    .unreq          OUTPTR
shun-iwasawa 82a8f5
    .unreq          INPTR0
shun-iwasawa 82a8f5
    .unreq          INPTR1
shun-iwasawa 82a8f5
    .unreq          TMP1
shun-iwasawa 82a8f5
    .unreq          TMP2
shun-iwasawa 82a8f5
    .unreq          TMP3
shun-iwasawa 82a8f5
    .unreq          TMPDUP
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*****************************************************************************/
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
shun-iwasawa 82a8f5
 * GLOBAL(JOCTET *)
shun-iwasawa 82a8f5
 * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
shun-iwasawa 82a8f5
 *                             JCOEFPTR block, int last_dc_val,
shun-iwasawa 82a8f5
 *                             c_derived_tbl *dctbl, c_derived_tbl *actbl)
shun-iwasawa 82a8f5
 *
shun-iwasawa 82a8f5
 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Register aliases shared by the bit-emission macros below.
     *   BUFFER     (x1) output byte pointer, used with pre-increment stores
     *   PUT_BUFFER (x6) 64-bit accumulator holding bits not yet written out
     *   PUT_BITS   (x7) number of pending bits in PUT_BUFFER
     *   PUT_BITSw  (w7) 32-bit view of the same register as PUT_BITS
     */
    BUFFER          .req x1
shun-iwasawa 82a8f5
    PUT_BUFFER      .req x6
shun-iwasawa 82a8f5
    PUT_BITS        .req x7
shun-iwasawa 82a8f5
    PUT_BITSw       .req w7
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * emit_byte: write the oldest pending byte of PUT_BUFFER to the output.
 *
 * PUT_BITS is reduced by 8, the byte located at bit position PUT_BITS of
 * PUT_BUFFER is stored at BUFFER + 1, and BUFFER is advanced to that address
 * (pre-increment with write-back, so BUFFER always addresses the last byte
 * written).  If the byte is 0xff, an extra zero byte is stored after it.
 *
 * Clobbers: x19 and the condition flags.
 * The numeric label 14 is local; each expansion's "14f" resolves to the
 * copy of the label that follows it.
 */
.macro emit_byte
shun-iwasawa 82a8f5
    sub             PUT_BITS, PUT_BITS, #0x8
shun-iwasawa 82a8f5
    lsr             x19, PUT_BUFFER, PUT_BITS   /* bring the byte down to bits 7:0 */
shun-iwasawa 82a8f5
    uxtb            w19, w19                    /* keep only that byte for the compare */
shun-iwasawa 82a8f5
    strb            w19, [BUFFER, #1]!
shun-iwasawa 82a8f5
    cmp             w19, #0xff
shun-iwasawa 82a8f5
    b.ne            14f
shun-iwasawa 82a8f5
    strb            wzr, [BUFFER, #1]!          /* zero byte after 0xff */
shun-iwasawa 82a8f5
14:
shun-iwasawa 82a8f5
.endm
shun-iwasawa 82a8f5
/*
 * put_bits CODE, SIZE: append SIZE bits to the pending-bit accumulator.
 *
 * PUT_BUFFER is shifted left by SIZE, CODE is ORed into the vacated low
 * bits, and PUT_BITS grows by SIZE.  Nothing is written to memory here.
 * CODE is ORed in unmasked, so it is assumed to have no bits set at or
 * above bit SIZE - TODO confirm at the call sites.
 */
.macro put_bits CODE, SIZE
shun-iwasawa 82a8f5
    lsl             PUT_BUFFER, PUT_BUFFER, \SIZE
shun-iwasawa 82a8f5
    add             PUT_BITS, PUT_BITS, \SIZE
shun-iwasawa 82a8f5
    orr             PUT_BUFFER, PUT_BUFFER, \CODE
shun-iwasawa 82a8f5
.endm
shun-iwasawa 82a8f5
/*
 * checkbuf31: if at least 32 bits are pending (PUT_BITS >= 0x20, signed
 * compare), flush four bytes with emit_byte; otherwise do nothing.
 *
 * Clobbers: the condition flags, plus x19 when bytes are emitted.
 */
.macro checkbuf31
shun-iwasawa 82a8f5
    cmp             PUT_BITS, #0x20
shun-iwasawa 82a8f5
    b.lt            31f
shun-iwasawa 82a8f5
    emit_byte
shun-iwasawa 82a8f5
    emit_byte
shun-iwasawa 82a8f5
    emit_byte
shun-iwasawa 82a8f5
    emit_byte
shun-iwasawa 82a8f5
31:
shun-iwasawa 82a8f5
.endm
shun-iwasawa 82a8f5
/*
 * checkbuf47: if at least 48 bits are pending (PUT_BITS >= 0x30, signed
 * compare), flush six bytes with emit_byte; otherwise do nothing.
 *
 * Clobbers: the condition flags, plus x19 when bytes are emitted.
 */
.macro checkbuf47
shun-iwasawa 82a8f5
    cmp             PUT_BITS, #0x30
shun-iwasawa 82a8f5
    b.lt            47f
shun-iwasawa 82a8f5
    emit_byte
shun-iwasawa 82a8f5
    emit_byte
shun-iwasawa 82a8f5
    emit_byte
shun-iwasawa 82a8f5
    emit_byte
shun-iwasawa 82a8f5
    emit_byte
shun-iwasawa 82a8f5
    emit_byte
shun-iwasawa 82a8f5
47:
shun-iwasawa 82a8f5
.endm
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
.macro generate_jsimd_huff_encode_one_block fast_tbl
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
.if \fast_tbl == 1
shun-iwasawa 82a8f5
asm_function jsimd_huff_encode_one_block_neon
shun-iwasawa 82a8f5
.else
shun-iwasawa 82a8f5
asm_function jsimd_huff_encode_one_block_neon_slowtbl
shun-iwasawa 82a8f5
.endif
shun-iwasawa 82a8f5
    sub             sp, sp, 272
shun-iwasawa 82a8f5
    sub             BUFFER, BUFFER, #0x1    /* BUFFER=buffer-- */
shun-iwasawa 82a8f5
    /* Save Arm registers */
shun-iwasawa 82a8f5
    stp             x19, x20, [sp]
shun-iwasawa 82a8f5
    get_symbol_loc  x15, Ljsimd_huff_encode_one_block_neon_consts
shun-iwasawa 82a8f5
    ldr             PUT_BUFFER, [x0, #0x10]
shun-iwasawa 82a8f5
    ldr             PUT_BITSw, [x0, #0x18]
shun-iwasawa 82a8f5
    ldrsh           w12, [x2]               /* load DC coeff in w12 */
shun-iwasawa 82a8f5
    /* prepare data */
shun-iwasawa 82a8f5
.if \fast_tbl == 1
shun-iwasawa 82a8f5
    ld1             {v23.16b}, [x15], #16
shun-iwasawa 82a8f5
    ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64
shun-iwasawa 82a8f5
    ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64
shun-iwasawa 82a8f5
    ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64
shun-iwasawa 82a8f5
    ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64
shun-iwasawa 82a8f5
    ld1             {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64
shun-iwasawa 82a8f5
    sub             w12, w12, w3      /* last_dc_val, not used afterwards */
shun-iwasawa 82a8f5
    /* ZigZag 8x8 */
shun-iwasawa 82a8f5
    tbl             v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b
shun-iwasawa 82a8f5
    tbl             v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b
shun-iwasawa 82a8f5
    tbl             v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b
shun-iwasawa 82a8f5
    tbl             v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b
shun-iwasawa 82a8f5
    tbl             v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b
shun-iwasawa 82a8f5
    tbl             v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b
shun-iwasawa 82a8f5
    tbl             v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b
shun-iwasawa 82a8f5
    tbl             v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b
shun-iwasawa 82a8f5
    ins             v0.h[0], w12
shun-iwasawa 82a8f5
    tbx             v1.16b, {v28.16b}, v16.16b
shun-iwasawa 82a8f5
    tbx             v2.16b, {v29.16b, v30.16b}, v17.16b
shun-iwasawa 82a8f5
    tbx             v5.16b, {v29.16b, v30.16b}, v18.16b
shun-iwasawa 82a8f5
    tbx             v6.16b, {v31.16b}, v19.16b
shun-iwasawa 82a8f5
.else
shun-iwasawa 82a8f5
      add             x13, x2, #0x22
shun-iwasawa 82a8f5
      sub             w12, w12, w3    /* last_dc_val, not used afterwards */
shun-iwasawa 82a8f5
    ld1             {v23.16b}, [x15]
shun-iwasawa 82a8f5
      add             x14, x2, #0x18
shun-iwasawa 82a8f5
      add             x3, x2, #0x36
shun-iwasawa 82a8f5
    ins             v0.h[0], w12
shun-iwasawa 82a8f5
      add             x9, x2, #0x2
shun-iwasawa 82a8f5
    ld1             {v1.h}[0], [x13]
shun-iwasawa 82a8f5
      add             x15, x2, #0x30
shun-iwasawa 82a8f5
    ld1             {v2.h}[0], [x14]
shun-iwasawa 82a8f5
      add             x19, x2, #0x26
shun-iwasawa 82a8f5
    ld1             {v3.h}[0], [x3]
shun-iwasawa 82a8f5
      add             x20, x2, #0x28
shun-iwasawa 82a8f5
    ld1             {v0.h}[1], [x9]
shun-iwasawa 82a8f5
      add             x12, x2, #0x10
shun-iwasawa 82a8f5
    ld1             {v1.h}[1], [x15]
shun-iwasawa 82a8f5
      add             x13, x2, #0x40
shun-iwasawa 82a8f5
    ld1             {v2.h}[1], [x19]
shun-iwasawa 82a8f5
      add             x14, x2, #0x34
shun-iwasawa 82a8f5
    ld1             {v3.h}[1], [x20]
shun-iwasawa 82a8f5
      add             x3, x2, #0x1a
shun-iwasawa 82a8f5
    ld1             {v0.h}[2], [x12]
shun-iwasawa 82a8f5
      add             x9, x2, #0x20
shun-iwasawa 82a8f5
    ld1             {v1.h}[2], [x13]
shun-iwasawa 82a8f5
      add             x15, x2, #0x32
shun-iwasawa 82a8f5
    ld1             {v2.h}[2], [x14]
shun-iwasawa 82a8f5
      add             x19, x2, #0x42
shun-iwasawa 82a8f5
    ld1             {v3.h}[2], [x3]
shun-iwasawa 82a8f5
      add             x20, x2, #0xc
shun-iwasawa 82a8f5
    ld1             {v0.h}[3], [x9]
shun-iwasawa 82a8f5
      add             x12, x2, #0x12
shun-iwasawa 82a8f5
    ld1             {v1.h}[3], [x15]
shun-iwasawa 82a8f5
      add             x13, x2, #0x24
shun-iwasawa 82a8f5
    ld1             {v2.h}[3], [x19]
shun-iwasawa 82a8f5
      add             x14, x2, #0x50
shun-iwasawa 82a8f5
    ld1             {v3.h}[3], [x20]
shun-iwasawa 82a8f5
      add             x3, x2, #0xe
shun-iwasawa 82a8f5
    ld1             {v0.h}[4], [x12]
shun-iwasawa 82a8f5
      add             x9, x2, #0x4
shun-iwasawa 82a8f5
    ld1             {v1.h}[4], [x13]
shun-iwasawa 82a8f5
      add             x15, x2, #0x16
shun-iwasawa 82a8f5
    ld1             {v2.h}[4], [x14]
shun-iwasawa 82a8f5
      add             x19, x2, #0x60
shun-iwasawa 82a8f5
    ld1             {v3.h}[4], [x3]
shun-iwasawa 82a8f5
      add             x20, x2, #0x1c
shun-iwasawa 82a8f5
    ld1             {v0.h}[5], [x9]
shun-iwasawa 82a8f5
      add             x12, x2, #0x6
shun-iwasawa 82a8f5
    ld1             {v1.h}[5], [x15]
shun-iwasawa 82a8f5
      add             x13, x2, #0x8
shun-iwasawa 82a8f5
    ld1             {v2.h}[5], [x19]
shun-iwasawa 82a8f5
      add             x14, x2, #0x52
shun-iwasawa 82a8f5
    ld1             {v3.h}[5], [x20]
shun-iwasawa 82a8f5
      add             x3, x2, #0x2a
shun-iwasawa 82a8f5
    ld1             {v0.h}[6], [x12]
shun-iwasawa 82a8f5
      add             x9, x2, #0x14
shun-iwasawa 82a8f5
    ld1             {v1.h}[6], [x13]
shun-iwasawa 82a8f5
      add             x15, x2, #0xa
shun-iwasawa 82a8f5
    ld1             {v2.h}[6], [x14]
shun-iwasawa 82a8f5
      add             x19, x2, #0x44
shun-iwasawa 82a8f5
    ld1             {v3.h}[6], [x3]
shun-iwasawa 82a8f5
      add             x20, x2, #0x38
shun-iwasawa 82a8f5
    ld1             {v0.h}[7], [x9]
shun-iwasawa 82a8f5
      add             x12, x2, #0x46
shun-iwasawa 82a8f5
    ld1             {v1.h}[7], [x15]
shun-iwasawa 82a8f5
      add             x13, x2, #0x3a
shun-iwasawa 82a8f5
    ld1             {v2.h}[7], [x19]
shun-iwasawa 82a8f5
      add             x14, x2, #0x74
shun-iwasawa 82a8f5
    ld1             {v3.h}[7], [x20]
shun-iwasawa 82a8f5
      add             x3, x2, #0x6a
shun-iwasawa 82a8f5
    ld1             {v4.h}[0], [x12]
shun-iwasawa 82a8f5
      add             x9, x2, #0x54
shun-iwasawa 82a8f5
    ld1             {v5.h}[0], [x13]
shun-iwasawa 82a8f5
      add             x15, x2, #0x2c
shun-iwasawa 82a8f5
    ld1             {v6.h}[0], [x14]
shun-iwasawa 82a8f5
      add             x19, x2, #0x76
shun-iwasawa 82a8f5
    ld1             {v7.h}[0], [x3]
shun-iwasawa 82a8f5
      add             x20, x2, #0x78
shun-iwasawa 82a8f5
    ld1             {v4.h}[1], [x9]
shun-iwasawa 82a8f5
      add             x12, x2, #0x62
shun-iwasawa 82a8f5
    ld1             {v5.h}[1], [x15]
shun-iwasawa 82a8f5
      add             x13, x2, #0x1e
shun-iwasawa 82a8f5
    ld1             {v6.h}[1], [x19]
shun-iwasawa 82a8f5
      add             x14, x2, #0x68
shun-iwasawa 82a8f5
    ld1             {v7.h}[1], [x20]
shun-iwasawa 82a8f5
      add             x3, x2, #0x7a
shun-iwasawa 82a8f5
    ld1             {v4.h}[2], [x12]
shun-iwasawa 82a8f5
      add             x9, x2, #0x70
shun-iwasawa 82a8f5
    ld1             {v5.h}[2], [x13]
shun-iwasawa 82a8f5
      add             x15, x2, #0x2e
shun-iwasawa 82a8f5
    ld1             {v6.h}[2], [x14]
shun-iwasawa 82a8f5
      add             x19, x2, #0x5a
shun-iwasawa 82a8f5
    ld1             {v7.h}[2], [x3]
shun-iwasawa 82a8f5
      add             x20, x2, #0x6c
shun-iwasawa 82a8f5
    ld1             {v4.h}[3], [x9]
shun-iwasawa 82a8f5
      add             x12, x2, #0x72
shun-iwasawa 82a8f5
    ld1             {v5.h}[3], [x15]
shun-iwasawa 82a8f5
      add             x13, x2, #0x3c
shun-iwasawa 82a8f5
    ld1             {v6.h}[3], [x19]
shun-iwasawa 82a8f5
      add             x14, x2, #0x4c
shun-iwasawa 82a8f5
    ld1             {v7.h}[3], [x20]
shun-iwasawa 82a8f5
      add             x3, x2, #0x5e
shun-iwasawa 82a8f5
    ld1             {v4.h}[4], [x12]
shun-iwasawa 82a8f5
      add             x9, x2, #0x64
shun-iwasawa 82a8f5
    ld1             {v5.h}[4], [x13]
shun-iwasawa 82a8f5
      add             x15, x2, #0x4a
shun-iwasawa 82a8f5
    ld1             {v6.h}[4], [x14]
shun-iwasawa 82a8f5
      add             x19, x2, #0x3e
shun-iwasawa 82a8f5
    ld1             {v7.h}[4], [x3]
shun-iwasawa 82a8f5
      add             x20, x2, #0x6e
shun-iwasawa 82a8f5
    ld1             {v4.h}[5], [x9]
shun-iwasawa 82a8f5
      add             x12, x2, #0x56
shun-iwasawa 82a8f5
    ld1             {v5.h}[5], [x15]
shun-iwasawa 82a8f5
      add             x13, x2, #0x58
shun-iwasawa 82a8f5
    ld1             {v6.h}[5], [x19]
shun-iwasawa 82a8f5
      add             x14, x2, #0x4e
shun-iwasawa 82a8f5
    ld1             {v7.h}[5], [x20]
shun-iwasawa 82a8f5
      add             x3, x2, #0x7c
shun-iwasawa 82a8f5
    ld1             {v4.h}[6], [x12]
shun-iwasawa 82a8f5
      add             x9, x2, #0x48
shun-iwasawa 82a8f5
    ld1             {v5.h}[6], [x13]
shun-iwasawa 82a8f5
      add             x15, x2, #0x66
shun-iwasawa 82a8f5
    ld1             {v6.h}[6], [x14]
shun-iwasawa 82a8f5
      add             x19, x2, #0x5c
shun-iwasawa 82a8f5
    ld1             {v7.h}[6], [x3]
shun-iwasawa 82a8f5
      add             x20, x2, #0x7e
shun-iwasawa 82a8f5
    ld1             {v4.h}[7], [x9]
shun-iwasawa 82a8f5
    ld1             {v5.h}[7], [x15]
shun-iwasawa 82a8f5
    ld1             {v6.h}[7], [x19]
shun-iwasawa 82a8f5
    ld1             {v7.h}[7], [x20]
shun-iwasawa 82a8f5
.endif
shun-iwasawa 82a8f5
    cmlt            v24.8h, v0.8h, #0
shun-iwasawa 82a8f5
    cmlt            v25.8h, v1.8h, #0
shun-iwasawa 82a8f5
    cmlt            v26.8h, v2.8h, #0
shun-iwasawa 82a8f5
    cmlt            v27.8h, v3.8h, #0
shun-iwasawa 82a8f5
    cmlt            v28.8h, v4.8h, #0
shun-iwasawa 82a8f5
    cmlt            v29.8h, v5.8h, #0
shun-iwasawa 82a8f5
    cmlt            v30.8h, v6.8h, #0
shun-iwasawa 82a8f5
    cmlt            v31.8h, v7.8h, #0
shun-iwasawa 82a8f5
    abs             v0.8h, v0.8h
shun-iwasawa 82a8f5
    abs             v1.8h, v1.8h
shun-iwasawa 82a8f5
    abs             v2.8h, v2.8h
shun-iwasawa 82a8f5
    abs             v3.8h, v3.8h
shun-iwasawa 82a8f5
    abs             v4.8h, v4.8h
shun-iwasawa 82a8f5
    abs             v5.8h, v5.8h
shun-iwasawa 82a8f5
    abs             v6.8h, v6.8h
shun-iwasawa 82a8f5
    abs             v7.8h, v7.8h
shun-iwasawa 82a8f5
    eor             v24.16b, v24.16b, v0.16b
shun-iwasawa 82a8f5
    eor             v25.16b, v25.16b, v1.16b
shun-iwasawa 82a8f5
    eor             v26.16b, v26.16b, v2.16b
shun-iwasawa 82a8f5
    eor             v27.16b, v27.16b, v3.16b
shun-iwasawa 82a8f5
    eor             v28.16b, v28.16b, v4.16b
shun-iwasawa 82a8f5
    eor             v29.16b, v29.16b, v5.16b
shun-iwasawa 82a8f5
    eor             v30.16b, v30.16b, v6.16b
shun-iwasawa 82a8f5
    eor             v31.16b, v31.16b, v7.16b
shun-iwasawa 82a8f5
    cmeq            v16.8h, v0.8h, #0
shun-iwasawa 82a8f5
    cmeq            v17.8h, v1.8h, #0
shun-iwasawa 82a8f5
    cmeq            v18.8h, v2.8h, #0
shun-iwasawa 82a8f5
    cmeq            v19.8h, v3.8h, #0
shun-iwasawa 82a8f5
    cmeq            v20.8h, v4.8h, #0
shun-iwasawa 82a8f5
    cmeq            v21.8h, v5.8h, #0
shun-iwasawa 82a8f5
    cmeq            v22.8h, v6.8h, #0
shun-iwasawa 82a8f5
    xtn             v16.8b, v16.8h
shun-iwasawa 82a8f5
    xtn             v18.8b, v18.8h
shun-iwasawa 82a8f5
    xtn             v20.8b, v20.8h
shun-iwasawa 82a8f5
    xtn             v22.8b, v22.8h
shun-iwasawa 82a8f5
      umov            w14, v0.h[0]
shun-iwasawa 82a8f5
    xtn2            v16.16b, v17.8h
shun-iwasawa 82a8f5
      umov            w13, v24.h[0]
shun-iwasawa 82a8f5
    xtn2            v18.16b, v19.8h
shun-iwasawa 82a8f5
      clz             w14, w14
shun-iwasawa 82a8f5
    xtn2            v20.16b, v21.8h
shun-iwasawa 82a8f5
      lsl             w13, w13, w14
shun-iwasawa 82a8f5
    cmeq            v17.8h, v7.8h, #0
shun-iwasawa 82a8f5
      sub             w12, w14, #32
shun-iwasawa 82a8f5
    xtn2            v22.16b, v17.8h
shun-iwasawa 82a8f5
      lsr             w13, w13, w14
shun-iwasawa 82a8f5
    and             v16.16b, v16.16b, v23.16b
shun-iwasawa 82a8f5
      neg             w12, w12
shun-iwasawa 82a8f5
    and             v18.16b, v18.16b, v23.16b
shun-iwasawa 82a8f5
      add             x3, x4, #0x400           /* r1 = dctbl->ehufsi */
shun-iwasawa 82a8f5
    and             v20.16b, v20.16b, v23.16b
shun-iwasawa 82a8f5
      add             x15, sp, #0x90           /* x15 = t2 */
shun-iwasawa 82a8f5
    and             v22.16b, v22.16b, v23.16b
shun-iwasawa 82a8f5
      ldr             w10, [x4, x12, lsl #2]
shun-iwasawa 82a8f5
    addp            v16.16b, v16.16b, v18.16b
shun-iwasawa 82a8f5
      ldrb            w11, [x3, x12]
shun-iwasawa 82a8f5
    addp            v20.16b, v20.16b, v22.16b
shun-iwasawa 82a8f5
      checkbuf47
shun-iwasawa 82a8f5
    addp            v16.16b, v16.16b, v20.16b
shun-iwasawa 82a8f5
      put_bits        x10, x11
shun-iwasawa 82a8f5
    addp            v16.16b, v16.16b, v18.16b
shun-iwasawa 82a8f5
      checkbuf47
shun-iwasawa 82a8f5
    umov            x9, v16.D[0]
shun-iwasawa 82a8f5
      put_bits        x13, x12
shun-iwasawa 82a8f5
    cnt             v17.8b, v16.8b
shun-iwasawa 82a8f5
      mvn             x9, x9
shun-iwasawa 82a8f5
    addv            B18, v17.8b
shun-iwasawa 82a8f5
      add             x4, x5, #0x400   /* x4 = actbl->ehufsi */
shun-iwasawa 82a8f5
    umov            w12, v18.b[0]
shun-iwasawa 82a8f5
      lsr             x9, x9, #0x1     /* clear AC coeff */
shun-iwasawa 82a8f5
    ldr             w13, [x5, #0x3c0]  /* x13 = actbl->ehufco[0xf0] */
shun-iwasawa 82a8f5
    rbit            x9, x9             /* x9 = index0 */
shun-iwasawa 82a8f5
    ldrb            w14, [x4, #0xf0]   /* x14 = actbl->ehufsi[0xf0] */
shun-iwasawa 82a8f5
    cmp             w12, #(64-8)
shun-iwasawa 82a8f5
    add             x11, sp, #16
shun-iwasawa 82a8f5
    b.lt            4f
shun-iwasawa 82a8f5
    cbz             x9, 6f
shun-iwasawa 82a8f5
    st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
shun-iwasawa 82a8f5
    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
shun-iwasawa 82a8f5
    st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
shun-iwasawa 82a8f5
    st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
shun-iwasawa 82a8f5
1:
shun-iwasawa 82a8f5
    clz             x2, x9
shun-iwasawa 82a8f5
    add             x15, x15, x2, lsl #1
shun-iwasawa 82a8f5
    lsl             x9, x9, x2
shun-iwasawa 82a8f5
    ldrh            w20, [x15, #-126]
shun-iwasawa 82a8f5
2:
shun-iwasawa 82a8f5
    cmp             x2, #0x10
shun-iwasawa 82a8f5
    b.lt            3f
shun-iwasawa 82a8f5
    sub             x2, x2, #0x10
shun-iwasawa 82a8f5
    checkbuf47
shun-iwasawa 82a8f5
    put_bits        x13, x14
shun-iwasawa 82a8f5
    b               2b
shun-iwasawa 82a8f5
3:
shun-iwasawa 82a8f5
    clz             w20, w20
shun-iwasawa 82a8f5
    ldrh            w3, [x15, #2]!
shun-iwasawa 82a8f5
    sub             w11, w20, #32
shun-iwasawa 82a8f5
    lsl             w3, w3, w20
shun-iwasawa 82a8f5
    neg             w11, w11
shun-iwasawa 82a8f5
    lsr             w3, w3, w20
shun-iwasawa 82a8f5
    add             x2, x11, x2, lsl #4
shun-iwasawa 82a8f5
    lsl             x9, x9, #0x1
shun-iwasawa 82a8f5
    ldr             w12, [x5, x2, lsl #2]
shun-iwasawa 82a8f5
    ldrb            w10, [x4, x2]
shun-iwasawa 82a8f5
    checkbuf31
shun-iwasawa 82a8f5
    put_bits        x12, x10
shun-iwasawa 82a8f5
    put_bits        x3, x11
shun-iwasawa 82a8f5
    cbnz            x9, 1b
shun-iwasawa 82a8f5
    b               6f
shun-iwasawa 82a8f5
4:
shun-iwasawa 82a8f5
    movi            v21.8h, #0x0010
shun-iwasawa 82a8f5
    clz             v0.8h, v0.8h
shun-iwasawa 82a8f5
    clz             v1.8h, v1.8h
shun-iwasawa 82a8f5
    clz             v2.8h, v2.8h
shun-iwasawa 82a8f5
    clz             v3.8h, v3.8h
shun-iwasawa 82a8f5
    clz             v4.8h, v4.8h
shun-iwasawa 82a8f5
    clz             v5.8h, v5.8h
shun-iwasawa 82a8f5
    clz             v6.8h, v6.8h
shun-iwasawa 82a8f5
    clz             v7.8h, v7.8h
shun-iwasawa 82a8f5
    ushl            v24.8h, v24.8h, v0.8h
shun-iwasawa 82a8f5
    ushl            v25.8h, v25.8h, v1.8h
shun-iwasawa 82a8f5
    ushl            v26.8h, v26.8h, v2.8h
shun-iwasawa 82a8f5
    ushl            v27.8h, v27.8h, v3.8h
shun-iwasawa 82a8f5
    ushl            v28.8h, v28.8h, v4.8h
shun-iwasawa 82a8f5
    ushl            v29.8h, v29.8h, v5.8h
shun-iwasawa 82a8f5
    ushl            v30.8h, v30.8h, v6.8h
shun-iwasawa 82a8f5
    ushl            v31.8h, v31.8h, v7.8h
shun-iwasawa 82a8f5
    neg             v0.8h, v0.8h
shun-iwasawa 82a8f5
    neg             v1.8h, v1.8h
shun-iwasawa 82a8f5
    neg             v2.8h, v2.8h
shun-iwasawa 82a8f5
    neg             v3.8h, v3.8h
shun-iwasawa 82a8f5
    neg             v4.8h, v4.8h
shun-iwasawa 82a8f5
    neg             v5.8h, v5.8h
shun-iwasawa 82a8f5
    neg             v6.8h, v6.8h
shun-iwasawa 82a8f5
    neg             v7.8h, v7.8h
shun-iwasawa 82a8f5
    ushl            v24.8h, v24.8h, v0.8h
shun-iwasawa 82a8f5
    ushl            v25.8h, v25.8h, v1.8h
shun-iwasawa 82a8f5
    ushl            v26.8h, v26.8h, v2.8h
shun-iwasawa 82a8f5
    ushl            v27.8h, v27.8h, v3.8h
shun-iwasawa 82a8f5
    ushl            v28.8h, v28.8h, v4.8h
shun-iwasawa 82a8f5
    ushl            v29.8h, v29.8h, v5.8h
shun-iwasawa 82a8f5
    ushl            v30.8h, v30.8h, v6.8h
shun-iwasawa 82a8f5
    ushl            v31.8h, v31.8h, v7.8h
shun-iwasawa 82a8f5
    add             v0.8h, v21.8h, v0.8h
shun-iwasawa 82a8f5
    add             v1.8h, v21.8h, v1.8h
shun-iwasawa 82a8f5
    add             v2.8h, v21.8h, v2.8h
shun-iwasawa 82a8f5
    add             v3.8h, v21.8h, v3.8h
shun-iwasawa 82a8f5
    add             v4.8h, v21.8h, v4.8h
shun-iwasawa 82a8f5
    add             v5.8h, v21.8h, v5.8h
shun-iwasawa 82a8f5
    add             v6.8h, v21.8h, v6.8h
shun-iwasawa 82a8f5
    add             v7.8h, v21.8h, v7.8h
shun-iwasawa 82a8f5
    st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
shun-iwasawa 82a8f5
    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
shun-iwasawa 82a8f5
    st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
shun-iwasawa 82a8f5
    st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
shun-iwasawa 82a8f5
1:
shun-iwasawa 82a8f5
    clz             x2, x9
shun-iwasawa 82a8f5
    add             x15, x15, x2, lsl #1
shun-iwasawa 82a8f5
    lsl             x9, x9, x2
shun-iwasawa 82a8f5
    ldrh            w11, [x15, #-126]
shun-iwasawa 82a8f5
2:
shun-iwasawa 82a8f5
    cmp             x2, #0x10
shun-iwasawa 82a8f5
    b.lt            3f
shun-iwasawa 82a8f5
    sub             x2, x2, #0x10
shun-iwasawa 82a8f5
    checkbuf47
shun-iwasawa 82a8f5
    put_bits        x13, x14
shun-iwasawa 82a8f5
    b               2b
shun-iwasawa 82a8f5
3:
shun-iwasawa 82a8f5
    ldrh            w3, [x15, #2]!
shun-iwasawa 82a8f5
    add             x2, x11, x2, lsl #4
shun-iwasawa 82a8f5
    lsl             x9, x9, #0x1
shun-iwasawa 82a8f5
    ldr             w12, [x5, x2, lsl #2]
shun-iwasawa 82a8f5
    ldrb            w10, [x4, x2]
shun-iwasawa 82a8f5
    checkbuf31
shun-iwasawa 82a8f5
    put_bits        x12, x10
shun-iwasawa 82a8f5
    put_bits        x3, x11
shun-iwasawa 82a8f5
    cbnz            x9, 1b
shun-iwasawa 82a8f5
6:
shun-iwasawa 82a8f5
    add             x13, sp, #0x10e
shun-iwasawa 82a8f5
    cmp             x15, x13
shun-iwasawa 82a8f5
    b.hs            1f
shun-iwasawa 82a8f5
    ldr             w12, [x5]
shun-iwasawa 82a8f5
    ldrb            w14, [x4]
shun-iwasawa 82a8f5
    checkbuf47
shun-iwasawa 82a8f5
    put_bits        x12, x14
shun-iwasawa 82a8f5
1:
shun-iwasawa 82a8f5
    str             PUT_BUFFER, [x0, #0x10]
shun-iwasawa 82a8f5
    str             PUT_BITSw, [x0, #0x18]
shun-iwasawa 82a8f5
    ldp             x19, x20, [sp], 16
shun-iwasawa 82a8f5
    add             x0, BUFFER, #0x1
shun-iwasawa 82a8f5
    add             sp, sp, 256
shun-iwasawa 82a8f5
    br              x30
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
.endm
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * Expand the Huffman block-encoder macro twice, once with argument 1 and
 * once with argument 0.  The macro body tests `\fast_tbl == 1`: when true it
 * reorders the coefficients ("ZigZag 8x8") with tbl/tbx table lookups,
 * otherwise it gathers them into v0-v7 with individual per-lane ld1 loads.
 * NOTE(review): the macro header is outside this excerpt; presumably the
 * single argument passed here is that fast_tbl parameter -- confirm.
 */
generate_jsimd_huff_encode_one_block 1
shun-iwasawa 82a8f5
generate_jsimd_huff_encode_one_block 0
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    /* Release the register aliases that the code above refers to. */
    .unreq          BUFFER
shun-iwasawa 82a8f5
    .unreq          PUT_BUFFER
shun-iwasawa 82a8f5
    .unreq          PUT_BITS
shun-iwasawa 82a8f5
    .unreq          PUT_BITSw
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/*
 * Undefine the helper macros now that both expansions are done
 * (put_bits, checkbuf31 and checkbuf47 are invoked by the macro body above).
 */
.purgem emit_byte
shun-iwasawa 82a8f5
.purgem put_bits
shun-iwasawa 82a8f5
.purgem checkbuf31
shun-iwasawa 82a8f5
.purgem checkbuf47