/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */

#include "jsimd_altivec.h"

/* NOTE: The address will either be aligned or offset by 8 bytes, so we can
 * always get the data we want by using a single vector load (although we may
 * have to permute the result.)
 */
#if __BIG_ENDIAN__

/* Big endian: vec_ld() truncates the address to a 16-byte boundary, so an
 * 8-byte-offset load must be rotated into place with vec_perm()/vec_lvsl().
 */
#define LOAD_ROW(row) { \
  elemptr = sample_data[row] + start_col; \
  in##row = vec_ld(0, elemptr); \
  if ((size_t)elemptr & 15) \
    in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr)); \
}

#else

/* Little endian (POWER with VSX): vec_vsx_ld() handles unaligned addresses
 * directly, so no permute is needed.
 */
#define LOAD_ROW(row) { \
  elemptr = sample_data[row] + start_col; \
  in##row = vec_vsx_ld(0, elemptr); \
}

#endif

void jsimd_convsamp_altivec(JSAMPARRAY sample_data, JDIMENSION start_col,
shun-iwasawa 82a8f5
                            DCTELEM *workspace)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  JSAMPROW elemptr;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  __vector unsigned char in0, in1, in2, in3, in4, in5, in6, in7;
shun-iwasawa 82a8f5
  __vector short out0, out1, out2, out3, out4, out5, out6, out7;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  /* Constants */
shun-iwasawa 82a8f5
  __vector short pw_centerjsamp = { __8X(CENTERJSAMPLE) };
shun-iwasawa 82a8f5
  __vector unsigned char pb_zero = { __16X(0) };
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  LOAD_ROW(0);
shun-iwasawa 82a8f5
  LOAD_ROW(1);
shun-iwasawa 82a8f5
  LOAD_ROW(2);
shun-iwasawa 82a8f5
  LOAD_ROW(3);
shun-iwasawa 82a8f5
  LOAD_ROW(4);
shun-iwasawa 82a8f5
  LOAD_ROW(5);
shun-iwasawa 82a8f5
  LOAD_ROW(6);
shun-iwasawa 82a8f5
  LOAD_ROW(7);
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  out0 = (__vector short)VEC_UNPACKHU(in0);
shun-iwasawa 82a8f5
  out1 = (__vector short)VEC_UNPACKHU(in1);
shun-iwasawa 82a8f5
  out2 = (__vector short)VEC_UNPACKHU(in2);
shun-iwasawa 82a8f5
  out3 = (__vector short)VEC_UNPACKHU(in3);
shun-iwasawa 82a8f5
  out4 = (__vector short)VEC_UNPACKHU(in4);
shun-iwasawa 82a8f5
  out5 = (__vector short)VEC_UNPACKHU(in5);
shun-iwasawa 82a8f5
  out6 = (__vector short)VEC_UNPACKHU(in6);
shun-iwasawa 82a8f5
  out7 = (__vector short)VEC_UNPACKHU(in7);
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  out0 = vec_sub(out0, pw_centerjsamp);
shun-iwasawa 82a8f5
  out1 = vec_sub(out1, pw_centerjsamp);
shun-iwasawa 82a8f5
  out2 = vec_sub(out2, pw_centerjsamp);
shun-iwasawa 82a8f5
  out3 = vec_sub(out3, pw_centerjsamp);
shun-iwasawa 82a8f5
  out4 = vec_sub(out4, pw_centerjsamp);
shun-iwasawa 82a8f5
  out5 = vec_sub(out5, pw_centerjsamp);
shun-iwasawa 82a8f5
  out6 = vec_sub(out6, pw_centerjsamp);
shun-iwasawa 82a8f5
  out7 = vec_sub(out7, pw_centerjsamp);
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  vec_st(out0, 0, workspace);
shun-iwasawa 82a8f5
  vec_st(out1, 16, workspace);
shun-iwasawa 82a8f5
  vec_st(out2, 32, workspace);
shun-iwasawa 82a8f5
  vec_st(out3, 48, workspace);
shun-iwasawa 82a8f5
  vec_st(out4, 64, workspace);
shun-iwasawa 82a8f5
  vec_st(out5, 80, workspace);
shun-iwasawa 82a8f5
  vec_st(out6, 96, workspace);
shun-iwasawa 82a8f5
  vec_st(out7, 112, workspace);
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
#define WORD_BIT  16

/* There is no AltiVec 16-bit unsigned multiply instruction, hence this.
 * We basically need an unsigned equivalent of vec_madds(): multiply two
 * vectors of unsigned 16-bit values and keep the high 16 bits of each
 * 32-bit product.  vec_mule()/vec_mulo() produce the even/odd 32-bit
 * products, and vec_perm() with shift_pack_index (defined per endianness
 * in the caller) interleaves their high halves back into lane order.
 * Relies on tmpe, tmpo, and shift_pack_index being in scope at the call
 * site, and expands each argument more than once (do not pass expressions
 * with side effects).
 */
#define MULTIPLY(vs0, vs1, out) { \
  tmpe = vec_mule((__vector unsigned short)vs0, \
                  (__vector unsigned short)vs1); \
  tmpo = vec_mulo((__vector unsigned short)vs0, \
                  (__vector unsigned short)vs1); \
  out = (__vector short)vec_perm((__vector unsigned short)tmpe, \
                                 (__vector unsigned short)tmpo, \
                                 shift_pack_index); \
}

/* Quantize the 64 DCT coefficients in workspace and store the result in
 * coef_block.  The divisors table is read at three byte offsets:
 * reciprocals at 0, correction terms at DCTSIZE2 * 2, and scale factors at
 * DCTSIZE2 * 4 (presumably the table also holds shift counts beyond that,
 * as in the scalar SIMD quantizers -- not used here).  Each coefficient is
 * processed as |c|, rounded via the correction term, multiplied by the
 * reciprocal and scale (keeping high words), then the sign is restored.
 * workspace, divisors, and coef_block must be 16-byte-aligned.
 */
void jsimd_quantize_altivec(JCOEFPTR coef_block, DCTELEM *divisors,
                            DCTELEM *workspace)
{
  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s,
    corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7,
    recip0, recip1, recip2, recip3, recip4, recip5, recip6, recip7,
    scale0, scale1, scale2, scale3, scale4, scale5, scale6, scale7;
  __vector unsigned int tmpe, tmpo;

  /* Constants */
  __vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) };
#if __BIG_ENDIAN__
  __vector unsigned char shift_pack_index =
    {  0,  1, 16, 17,  4,  5, 20, 21,  8,  9, 24, 25, 12, 13, 28, 29 };
#else
  __vector unsigned char shift_pack_index =
    {  2,  3, 18, 19,  6,  7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31 };
#endif

  row0 = vec_ld(0, workspace);
  row1 = vec_ld(16, workspace);
  row2 = vec_ld(32, workspace);
  row3 = vec_ld(48, workspace);
  row4 = vec_ld(64, workspace);
  row5 = vec_ld(80, workspace);
  row6 = vec_ld(96, workspace);
  row7 = vec_ld(112, workspace);

  /* Branch-less absolute value: rowNs = all-ones if rowN is negative, else
   * all-zeros; then rowN = (rowN ^ rowNs) - rowNs == |rowN|.  rowNs is kept
   * so the same trick can restore the sign after the multiplies.
   */
  row0s = vec_sra(row0, pw_word_bit_m1);
  row1s = vec_sra(row1, pw_word_bit_m1);
  row2s = vec_sra(row2, pw_word_bit_m1);
  row3s = vec_sra(row3, pw_word_bit_m1);
  row4s = vec_sra(row4, pw_word_bit_m1);
  row5s = vec_sra(row5, pw_word_bit_m1);
  row6s = vec_sra(row6, pw_word_bit_m1);
  row7s = vec_sra(row7, pw_word_bit_m1);
  row0 = vec_xor(row0, row0s);
  row1 = vec_xor(row1, row1s);
  row2 = vec_xor(row2, row2s);
  row3 = vec_xor(row3, row3s);
  row4 = vec_xor(row4, row4s);
  row5 = vec_xor(row5, row5s);
  row6 = vec_xor(row6, row6s);
  row7 = vec_xor(row7, row7s);
  row0 = vec_sub(row0, row0s);
  row1 = vec_sub(row1, row1s);
  row2 = vec_sub(row2, row2s);
  row3 = vec_sub(row3, row3s);
  row4 = vec_sub(row4, row4s);
  row5 = vec_sub(row5, row5s);
  row6 = vec_sub(row6, row6s);
  row7 = vec_sub(row7, row7s);

  /* Add the pre-computed rounding correction terms. */
  corr0 = vec_ld(DCTSIZE2 * 2, divisors);
  corr1 = vec_ld(DCTSIZE2 * 2 + 16, divisors);
  corr2 = vec_ld(DCTSIZE2 * 2 + 32, divisors);
  corr3 = vec_ld(DCTSIZE2 * 2 + 48, divisors);
  corr4 = vec_ld(DCTSIZE2 * 2 + 64, divisors);
  corr5 = vec_ld(DCTSIZE2 * 2 + 80, divisors);
  corr6 = vec_ld(DCTSIZE2 * 2 + 96, divisors);
  corr7 = vec_ld(DCTSIZE2 * 2 + 112, divisors);

  row0 = vec_add(row0, corr0);
  row1 = vec_add(row1, corr1);
  row2 = vec_add(row2, corr2);
  row3 = vec_add(row3, corr3);
  row4 = vec_add(row4, corr4);
  row5 = vec_add(row5, corr5);
  row6 = vec_add(row6, corr6);
  row7 = vec_add(row7, corr7);

  /* Multiply by the reciprocals (division by reciprocal multiplication). */
  recip0 = vec_ld(0, divisors);
  recip1 = vec_ld(16, divisors);
  recip2 = vec_ld(32, divisors);
  recip3 = vec_ld(48, divisors);
  recip4 = vec_ld(64, divisors);
  recip5 = vec_ld(80, divisors);
  recip6 = vec_ld(96, divisors);
  recip7 = vec_ld(112, divisors);

  MULTIPLY(row0, recip0, row0);
  MULTIPLY(row1, recip1, row1);
  MULTIPLY(row2, recip2, row2);
  MULTIPLY(row3, recip3, row3);
  MULTIPLY(row4, recip4, row4);
  MULTIPLY(row5, recip5, row5);
  MULTIPLY(row6, recip6, row6);
  MULTIPLY(row7, recip7, row7);

  /* Multiply by the scale factors. */
  scale0 = vec_ld(DCTSIZE2 * 4, divisors);
  scale1 = vec_ld(DCTSIZE2 * 4 + 16, divisors);
  scale2 = vec_ld(DCTSIZE2 * 4 + 32, divisors);
  scale3 = vec_ld(DCTSIZE2 * 4 + 48, divisors);
  scale4 = vec_ld(DCTSIZE2 * 4 + 64, divisors);
  scale5 = vec_ld(DCTSIZE2 * 4 + 80, divisors);
  scale6 = vec_ld(DCTSIZE2 * 4 + 96, divisors);
  scale7 = vec_ld(DCTSIZE2 * 4 + 112, divisors);

  MULTIPLY(row0, scale0, row0);
  MULTIPLY(row1, scale1, row1);
  MULTIPLY(row2, scale2, row2);
  MULTIPLY(row3, scale3, row3);
  MULTIPLY(row4, scale4, row4);
  MULTIPLY(row5, scale5, row5);
  MULTIPLY(row6, scale6, row6);
  MULTIPLY(row7, scale7, row7);

  /* Restore the original signs: rowN = (rowN ^ rowNs) - rowNs. */
  row0 = vec_xor(row0, row0s);
  row1 = vec_xor(row1, row1s);
  row2 = vec_xor(row2, row2s);
  row3 = vec_xor(row3, row3s);
  row4 = vec_xor(row4, row4s);
  row5 = vec_xor(row5, row5s);
  row6 = vec_xor(row6, row6s);
  row7 = vec_xor(row7, row7s);
  row0 = vec_sub(row0, row0s);
  row1 = vec_sub(row1, row1s);
  row2 = vec_sub(row2, row2s);
  row3 = vec_sub(row3, row3s);
  row4 = vec_sub(row4, row4s);
  row5 = vec_sub(row5, row5s);
  row6 = vec_sub(row6, row6s);
  row7 = vec_sub(row7, row7s);

  vec_st(row0, 0, coef_block);
  vec_st(row1, 16, coef_block);
  vec_st(row2, 32, coef_block);
  vec_st(row3, 48, coef_block);
  vec_st(row4, 64, coef_block);
  vec_st(row5, 80, coef_block);
  vec_st(row6, 96, coef_block);
  vec_st(row7, 112, coef_block);
}