/*
 * Loongson MMI optimizations for libjpeg-turbo
 *
 * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
 *                          All Rights Reserved.
 * Copyright (C) 2019, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
#ifndef __LOONGSON_MMINTRIN_H__
shun-iwasawa 82a8f5
#define __LOONGSON_MMINTRIN_H__
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
#include <stdint.h>
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/* Attribute set attached to the intrinsic wrappers below: GNU inline
   semantics, forced inlining, and the "artificial" marker. */
#define FUNCTION_ATTRIBS \
  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/* Vectors are stored in 64-bit floating-point registers. */
shun-iwasawa 82a8f5
/* 64-bit vector handle; the inline asm in this file binds __m64 operands with
   the "f" (floating-point register) constraint. */
typedef double __m64;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/* Having a 32-bit datatype allows us to use 32-bit loads in places like
   load8888. */
shun-iwasawa 82a8f5
/* 32-bit counterpart of __m64; also bound with the "f" constraint where it is
   used as an asm operand. */
typedef float __m32;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/********** Set Operations **********/
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_setzero_si64(void)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  return 0.0;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_set_pi8(uint8_t __b7, uint8_t __b6, uint8_t __b5, uint8_t __b4,
shun-iwasawa 82a8f5
            uint8_t __b3, uint8_t __b2, uint8_t __b1, uint8_t __b0)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
  uint32_t lo = ((uint32_t)__b6 << 24) |
shun-iwasawa 82a8f5
                ((uint32_t)__b4 << 16) |
shun-iwasawa 82a8f5
                ((uint32_t)__b2 << 8) |
shun-iwasawa 82a8f5
                (uint32_t)__b0;
shun-iwasawa 82a8f5
  uint32_t hi = ((uint32_t)__b7 << 24) |
shun-iwasawa 82a8f5
                ((uint32_t)__b5 << 16) |
shun-iwasawa 82a8f5
                ((uint32_t)__b3 << 8) |
shun-iwasawa 82a8f5
                (uint32_t)__b1;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("mtc1      %1, %0\n\t"
shun-iwasawa 82a8f5
      "mtc1      %2, $f0\n\t"
shun-iwasawa 82a8f5
      "punpcklbh %0, %0, $f0\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "r" (lo), "r" (hi)
shun-iwasawa 82a8f5
      : "$f0"
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_set_pi16(uint16_t __h3, uint16_t __h2, uint16_t __h1, uint16_t __h0)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
  uint32_t lo = ((uint32_t)__h2 << 16) | (uint32_t)__h0;
shun-iwasawa 82a8f5
  uint32_t hi = ((uint32_t)__h3 << 16) | (uint32_t)__h1;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("mtc1      %1, %0\n\t"
shun-iwasawa 82a8f5
      "mtc1      %2, $f0\n\t"
shun-iwasawa 82a8f5
      "punpcklhw %0, %0, $f0\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "r" (lo), "r" (hi)
shun-iwasawa 82a8f5
      : "$f0"
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/* Packs four 2-bit selectors into one immediate: fp0 occupies bits 1:0,
   fp1 bits 3:2, fp2 bits 5:4 and fp3 bits 7:6. */
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
  ((fp0) | ((fp1) << 2) | ((fp2) << 4) | ((fp3) << 6))
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_set_pi32(uint32_t __i1, uint32_t __i0)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  if (__builtin_constant_p(__i1) && __builtin_constant_p(__i0)) {
shun-iwasawa 82a8f5
    uint64_t val = ((uint64_t)__i1 << 32) |
shun-iwasawa 82a8f5
                   ((uint64_t)__i0 <<  0);
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    return *(__m64 *)&val;
shun-iwasawa 82a8f5
  } else if (__i1 == __i0) {
shun-iwasawa 82a8f5
    uint64_t imm = _MM_SHUFFLE(1, 0, 1, 0);
shun-iwasawa 82a8f5
    __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    asm("pshufh %0, %1, %2\n\t"
shun-iwasawa 82a8f5
        : "=f" (ret)
shun-iwasawa 82a8f5
        : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm)
shun-iwasawa 82a8f5
       );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    return ret;
shun-iwasawa 82a8f5
  } else {
shun-iwasawa 82a8f5
    uint64_t val = ((uint64_t)__i1 << 32) |
shun-iwasawa 82a8f5
                   ((uint64_t)__i0 <<  0);
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    return *(__m64 *)&val;
shun-iwasawa 82a8f5
  }
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_set1_pi8(uint8_t __b0)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("sll    $8, %1, 8\n\t"
shun-iwasawa 82a8f5
      "or     %1, %1, $8\n\t"
shun-iwasawa 82a8f5
      "mtc1   %1, %0\n\t"
shun-iwasawa 82a8f5
      "mtc1   $0, $f0\n\t"
shun-iwasawa 82a8f5
      "pshufh %0, %0, $f0\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "r" (__b0)
shun-iwasawa 82a8f5
      : "$8", "$f0"
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_set1_pi16(uint16_t __h0)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("mtc1   %1, %0\n\t"
shun-iwasawa 82a8f5
      "mtc1   $0, $f0\n\t"
shun-iwasawa 82a8f5
      "pshufh %0, %0, $f0\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "r" (__h0)
shun-iwasawa 82a8f5
      : "$8", "$f0"
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_set1_pi32(unsigned __i0)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  return _mm_set_pi32(__i0, __i0);
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_setr_pi8(uint8_t __h0, uint8_t __h1, uint8_t __h2, uint8_t __h3,
shun-iwasawa 82a8f5
             uint8_t __h4, uint8_t __h5, uint8_t __h6, uint8_t __h7)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  return _mm_set_pi8(__h7, __h6, __h5, __h4,
shun-iwasawa 82a8f5
                     __h3, __h2, __h1, __h0);
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_setr_pi16(uint16_t __w0, uint16_t __w1, uint16_t __w2, uint16_t __w3)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  return _mm_set_pi16(__w3, __w2, __w1, __w0);
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_setr_pi32(uint32_t __i0, uint32_t __i1)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  return _mm_set_pi32(__i1, __i0);
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/********** Arithmetic Operations **********/
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_add_pi8(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("paddb %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_add_pi16(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("paddh %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_add_pi32(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("paddw %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_add_si64(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("paddd %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_adds_pi8(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("paddsb %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_adds_pi16(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("paddsh %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_adds_pu8(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("paddusb %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_adds_pu16(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("paddush %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_avg_pu8(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("pavgb %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_avg_pu16(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("pavgh %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_madd_pi16(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("pmaddhw %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_max_pi16(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("pmaxsh %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_max_pu8(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("pmaxub %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_min_pi16(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("pminsh %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_min_pu8(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("pminub %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline int FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_movemask_pi8(__m64 __m1)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  int ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("pmovmskb %0, %1\n\t"
shun-iwasawa 82a8f5
      : "=r" (ret)
shun-iwasawa 82a8f5
      : "y" (__m1)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("pmulhh %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_mulhi_pu16(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("pmulhuh %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_mullo_pi16(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("pmullh %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_mul_pu32(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("pmuluw %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_sad_pu8(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("psadbh %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_asub_pu8(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("pasubub %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_biadd_pu8(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("biadd %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_sub_pi8(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("psubb %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_sub_pi16(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("psubh %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_sub_pi32(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("psubw %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_sub_si64(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("psubd %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_subs_pi8(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("psubsb %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_subs_pi16(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("psubsh %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_subs_pu8(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("psubusb %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_subs_pu16(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("psubush %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/********** Logical Operations **********/
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_and_si64(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("and %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_andnot_si64(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("andn %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_or_si32(__m32 __m1, __m32 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m32 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("or %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_or_si64(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("or %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_xor_si64(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("xor %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/********** Shift Operations **********/
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_slli_pi16(__m64 __m, int64_t __count)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("psllh  %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m), "f" (*(__m64 *)&__count)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_slli_pi32(__m64 __m, int64_t __count)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("psllw %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m), "f" (*(__m64 *)&__count)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_slli_si64(__m64 __m, int64_t __count)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("dsll  %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m), "f" (*(__m64 *)&__count)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_srli_pi16(__m64 __m, int64_t __count)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("psrlh %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m), "f" (*(__m64 *)&__count)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_srli_pi32(__m64 __m, int64_t __count)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("psrlw %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m), "f" (*(__m64 *)&__count)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_srli_si64(__m64 __m, int64_t __count)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("dsrl  %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m), "f" (*(__m64 *)&__count)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_srai_pi16(__m64 __m, int64_t __count)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("psrah %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m), "f" (*(__m64 *)&__count)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_srai_pi32(__m64 __m, int64_t __count)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("psraw %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m), "f" (*(__m64 *)&__count)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_srai_si64(__m64 __m, int64_t __count)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("dsra %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m), "f" (*(__m64 *)&__count)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/********** Conversion Intrinsics **********/
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
to_m64(uint64_t x)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  return *(__m64 *)&x;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline uint64_t FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
to_uint64(__m64 x)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  return *(uint64_t *)&x;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/********** Comparison Intrinsics **********/
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("pcmpeqb %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("pcmpeqh %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("pcmpeqw %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("pcmpgtb %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("pcmpgth %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("pcmpgtw %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_cmplt_pi8(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("pcmpltb %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_cmplt_pi16(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("pcmplth %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_cmplt_pi32(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("pcmpltw %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/********** Miscellaneous Operations **********/
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_packs_pi16(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("packsshb %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_packs_pi32(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("packsswh %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_packs_pi32_f(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("packsswh %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_packs_pu16(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("packushb %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_extract_pi16(__m64 __m, int64_t __pos)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("pextrh %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m), "f" (*(__m64 *)&__pos)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_insert_pi16(__m64 __m1, __m64 __m2, int64_t __pos)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  switch (__pos) {
shun-iwasawa 82a8f5
  case 0:
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    asm("pinsrh_0 %0, %1, %2\n\t"
shun-iwasawa 82a8f5
        : "=f" (ret)
shun-iwasawa 82a8f5
        : "f" (__m1), "f" (__m2), "i" (__pos)
shun-iwasawa 82a8f5
       );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    break;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  case 1:
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    asm("pinsrh_1 %0, %1, %2\n\t"
shun-iwasawa 82a8f5
        : "=f" (ret)
shun-iwasawa 82a8f5
        : "f" (__m1), "f" (__m2), "i" (__pos)
shun-iwasawa 82a8f5
       );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    break;
shun-iwasawa 82a8f5
  case 2:
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    asm("pinsrh_2 %0, %1, %2\n\t"
shun-iwasawa 82a8f5
        : "=f" (ret)
shun-iwasawa 82a8f5
        : "f" (__m1), "f" (__m2), "i" (__pos)
shun-iwasawa 82a8f5
       );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    break;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  case 3:
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    asm("pinsrh_3 %0, %1, %2\n\t"
shun-iwasawa 82a8f5
        : "=f" (ret)
shun-iwasawa 82a8f5
        : "f" (__m1), "f" (__m2), "i" (__pos)
shun-iwasawa 82a8f5
       );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
    break;
shun-iwasawa 82a8f5
  }
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_shuffle_pi16(__m64 __m, int64_t __n)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("pshufh %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m), "f" (*(__m64 *)&__n)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("punpckhbh %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_unpackhi_pi8_f(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("punpckhbh %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("punpckhhw %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_unpackhi_pi16_f(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("punpckhhw %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("punpckhwd %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("punpcklbh %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/* Since punpcklbh cares about the high 32 bits, we use the __m64 datatype,
   which preserves the data. */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_unpacklo_pi8_f64(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("punpcklbh %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
/* Since punpcklbh doesn't care about the high 32 bits, we use the __m32
   datatype, which allows load8888 to use 32-bit loads. */
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_unpacklo_pi8_f(__m32 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("punpcklbh %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("punpcklhw %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_unpacklo_pi16_f(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("punpcklhw %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("punpcklwd %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_unpacklo_pi32_f(__m64 __m1, __m64 __m2)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("punpcklwd %0, %1, %2\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "f" (__m1), "f" (__m2)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline void FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_store_pi32(__m32 *dest, __m64 src)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  src = _mm_packs_pu16(src, _mm_setzero_si64());
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("swc1 %1, %0\n\t"
shun-iwasawa 82a8f5
      : "=m" (*dest)
shun-iwasawa 82a8f5
      : "f" (src)
shun-iwasawa 82a8f5
      : "memory"
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline void FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_store_si64(__m64 *dest, __m64 src)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  asm("gssdlc1 %1, 7+%0\n\t"
shun-iwasawa 82a8f5
      "gssdrc1 %1, %0\n\t"
shun-iwasawa 82a8f5
      : "=m" (*dest)
shun-iwasawa 82a8f5
      : "f" (src)
shun-iwasawa 82a8f5
      : "memory"
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_load_si32(const __m32 *src)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m32 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("lwc1 %0, %1\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "m" (*src)
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_load_si64(const __m64 *src)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("ldc1 %0, %1\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "m" (*src)
shun-iwasawa 82a8f5
      : "memory"
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_loadu_si64(const __m64 *src)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  __m64 ret;
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  asm("gsldlc1 %0,  7(%1)\n\t"
shun-iwasawa 82a8f5
      "gsldrc1 %0,  0(%1)\n\t"
shun-iwasawa 82a8f5
      : "=f" (ret)
shun-iwasawa 82a8f5
      : "r" (src)
shun-iwasawa 82a8f5
      : "memory"
shun-iwasawa 82a8f5
     );
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
  return ret;
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_loadlo_pi8(const uint32_t *src)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  return _mm_unpacklo_pi8_f(*(__m32 *)src, _mm_setzero_si64());
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_loadlo_pi8_f(__m64 src)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  return _mm_unpacklo_pi8_f64(src, _mm_setzero_si64());
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_loadhi_pi8_f(__m64 src)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  return _mm_unpackhi_pi8_f(src, _mm_setzero_si64());
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_loadlo_pi16(__m64 src)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  return _mm_unpacklo_pi16(src, _mm_setzero_si64());
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_loadlo_pi16_f(__m64 src)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  return _mm_unpacklo_pi16_f(_mm_setzero_si64(), src);
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_loadhi_pi16(__m64 src)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  return _mm_unpackhi_pi16(src, _mm_setzero_si64());
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_loadhi_pi16_f(__m64 src)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  return _mm_unpackhi_pi16_f(_mm_setzero_si64(), src);
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_expand_alpha(__m64 pixel)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(3, 3, 3, 3));
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
extern __inline __m64 FUNCTION_ATTRIBS
shun-iwasawa 82a8f5
_mm_expand_alpha_rev(__m64 pixel)
shun-iwasawa 82a8f5
{
shun-iwasawa 82a8f5
  return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(0, 0, 0, 0));
shun-iwasawa 82a8f5
}
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
#endif  /* __LOONGSON_MMINTRIN_H__ */