Toshihiro Shimizu 890ddd
Toshihiro Shimizu 890ddd
Toshihiro Shimizu 890ddd
#include "traster.h"
Toshihiro Shimizu 890ddd
#include "trop.h"
Toshihiro Shimizu 890ddd
#include "tpixelgr.h"
322341
322341
#if defined(_WIN32) && defined(x64)
322341
#define USE_SSE2
322341
#endif
322341
322341
#ifdef USE_SSE2
Toshihiro Shimizu 890ddd
#include <emmintrin.h></emmintrin.h>
Rozhuk Ivan 823a31
#include <stdlib.h></stdlib.h>
Toshihiro Shimizu 890ddd
#endif
Toshihiro Shimizu 890ddd
322341
Shinya Kitaoka 120a6e
namespace {
Toshihiro Shimizu 890ddd
Shinya Kitaoka 9f5a1b
#ifdef _WIN32
Toshihiro Shimizu 890ddd
// Four-channel accumulator pixel used by the blur passes (_WIN32 variant).
// The members are declared in b, g, r, m order. blur_code_SSE2 loads and
// stores a whole BlurPixel<float> as four packed floats, so the declaration
// order is significant there.
// NOTE(review): presumably this order mirrors the in-memory channel order of
// the 32-bit pixel type on Windows - confirm against tpixel.h.
template <class T>
struct BlurPixel {
  T b;
  T g;
  T r;
  T m;
};
Toshihiro Shimizu 890ddd
Toshihiro Shimizu 890ddd
#else
Toshihiro Shimizu 890ddd
// Four-channel accumulator pixel used by the blur passes (non-Windows
// variant). The members are declared in r, g, b, m order.
// NOTE(review): presumably this order mirrors the in-memory channel order of
// the 32-bit pixel type on these platforms - confirm against tpixel.h.
template <class T>
struct BlurPixel {
  T r;
  T g;
  T b;
  T m;
};
Toshihiro Shimizu 890ddd
Toshihiro Shimizu 890ddd
#endif
Toshihiro Shimizu 890ddd
Toshihiro Shimizu 890ddd
//===================================================================
Toshihiro Shimizu 890ddd
Shinya Kitaoka 120a6e
// LOAD_COL_CODE: copies column `x` of a row-major buffer into a contiguous
// column array and pads both ends by replicating the border values.
//
// Names that must be in scope at the expansion site:
//   buffer            pointer to the source buffer (advanced by the macro)
//   col               pointer to the destination column (decremented by the
//                     macro; needs `brad` writable elements before col[0])
//   lx, ly            source row stride (in elements) and number of rows
//   x                 index of the column to extract
//   brad              number of replicated samples written on each side
//   by1, by2          elements skipped before / after the copied samples
//   i, pix, left_val, right_val   scratch variables
//
// Steps:
//   1. col[by1 .. by1 + ly - 1] receive buffer[x], buffer[x + lx], ...
//   2. `pix` is moved `by2` elements past the last copied sample.
//   3. col[-1 .. -brad] are filled with col[0]; the `brad` elements starting
//      at `pix` are filled with the element just before `pix`.
//
// NOTE(review): col[0] and *(pix - 1) are only written by step 1 when by1 == 0
// and by2 == 0 respectively; otherwise the replicated values come from
// whatever the caller already stored there - confirm this is intended.
#define LOAD_COL_CODE                                                          \
                                                                               \
  buffer += x;                                                                 \
  pix = col + by1;                                                             \
                                                                               \
  for (i = by1; i < ly + by1; i++) {                                           \
    *pix++ = *buffer;                                                          \
    buffer += lx;                                                              \
  }                                                                            \
                                                                               \
  pix += by2;                                                                  \
  left_val  = col[0];                                                          \
  right_val = *(pix - 1);                                                      \
  col--;                                                                       \
                                                                               \
  for (i = 0; i < brad; i++) {                                                 \
    *col-- = left_val;                                                         \
    *pix++ = right_val;                                                        \
  }
Toshihiro Shimizu 890ddd
Toshihiro Shimizu 890ddd
//-------------------------------------------------------------------
Toshihiro Shimizu 890ddd
Shinya Kitaoka 120a6e
// BLUR_CODE(round_fac, channel_type): one-dimensional blur of `length`
// samples from row1 into row2, one r/g/b/m channel at a time.
//
// Names that must be in scope at the expansion site:
//   row1, row2                 source / destination pointers (row2 advances)
//   pix1, pix2, pix3, pix4     scratch pointers of row1's type
//   sigma1, sigma2, sigma3, desigma   accumulators with r, g, b, m members
//   rsum, gsum, bsum, msum     running output values
//   i, length, brad, coeff, coeffq, diff
//
// First output sample:
//   sigma1 = sum of row1[0 .. brad-1]
//   sigma2 = sum of row1[-1 .. -(brad-1)]
//   sigma3 = sum over k in [1, brad) of k * (row1[k] + row1[-k])
//   out    = (sigma1 + sigma2) * coeff - sigma3 * coeffq + round_fac
// i.e. the sample at offset k is weighted by (coeff - |k| * coeffq).
//
// Remaining samples are produced incrementally: `desigma` is updated with
// pix1 - 2 * pix2 + pix3 (pointers at +brad, 0, -brad relative to the
// previous output position) and the running sum grows by
// (desigma + diff * (pix1 - pix4)) * coeffq, where pix4 is at -brad + 1.
//
// The macro reads row1[-brad] through row1[brad + length - 2], so the caller
// must provide `brad` valid samples of padding on each side.
#define BLUR_CODE(round_fac, channel_type)                                     \
  pix1 = row1;                                                                 \
  pix2 = row1 - 1;                                                             \
                                                                               \
  sigma1.r = pix1->r;                                                          \
  sigma1.g = pix1->g;                                                          \
  sigma1.b = pix1->b;                                                          \
  sigma1.m = pix1->m;                                                          \
  pix1++;                                                                      \
                                                                               \
  sigma2.r = sigma2.g = sigma2.b = sigma2.m = 0.0;                             \
  sigma3.r = sigma3.g = sigma3.b = sigma3.m = 0.0;                             \
                                                                               \
  for (i = 1; i < brad; i++) {                                                 \
    sigma1.r += pix1->r;                                                       \
    sigma1.g += pix1->g;                                                       \
    sigma1.b += pix1->b;                                                       \
    sigma1.m += pix1->m;                                                       \
                                                                               \
    sigma2.r += pix2->r;                                                       \
    sigma2.g += pix2->g;                                                       \
    sigma2.b += pix2->b;                                                       \
    sigma2.m += pix2->m;                                                       \
                                                                               \
    sigma3.r += i * (pix1->r + pix2->r);                                       \
    sigma3.g += i * (pix1->g + pix2->g);                                       \
    sigma3.b += i * (pix1->b + pix2->b);                                       \
    sigma3.m += i * (pix1->m + pix2->m);                                       \
                                                                               \
    pix1++;                                                                    \
    pix2--;                                                                    \
  }                                                                            \
                                                                               \
  rsum = (sigma1.r + sigma2.r) * coeff - sigma3.r * coeffq + (round_fac);      \
  gsum = (sigma1.g + sigma2.g) * coeff - sigma3.g * coeffq + (round_fac);      \
  bsum = (sigma1.b + sigma2.b) * coeff - sigma3.b * coeffq + (round_fac);      \
  msum = (sigma1.m + sigma2.m) * coeff - sigma3.m * coeffq + (round_fac);      \
                                                                               \
  row2->r = (channel_type)(rsum);                                              \
  row2->g = (channel_type)(gsum);                                              \
  row2->b = (channel_type)(bsum);                                              \
  row2->m = (channel_type)(msum);                                              \
  row2++;                                                                      \
                                                                               \
  sigma2.r += row1[-brad].r;                                                   \
  sigma2.g += row1[-brad].g;                                                   \
  sigma2.b += row1[-brad].b;                                                   \
  sigma2.m += row1[-brad].m;                                                   \
                                                                               \
  pix1 = row1 + brad;                                                          \
  pix2 = row1;                                                                 \
  pix3 = row1 - brad;                                                          \
  pix4 = row1 - brad + 1;                                                      \
                                                                               \
  desigma.r = sigma1.r - sigma2.r;                                             \
  desigma.g = sigma1.g - sigma2.g;                                             \
  desigma.b = sigma1.b - sigma2.b;                                             \
  desigma.m = sigma1.m - sigma2.m;                                             \
                                                                               \
  for (i = 1; i < length; i++) {                                               \
    desigma.r += pix1->r - 2 * pix2->r + pix3->r;                              \
    desigma.g += pix1->g - 2 * pix2->g + pix3->g;                              \
    desigma.b += pix1->b - 2 * pix2->b + pix3->b;                              \
    desigma.m += pix1->m - 2 * pix2->m + pix3->m;                              \
                                                                               \
    rsum += (desigma.r + diff * (pix1->r - pix4->r)) * coeffq;                 \
    gsum += (desigma.g + diff * (pix1->g - pix4->g)) * coeffq;                 \
    bsum += (desigma.b + diff * (pix1->b - pix4->b)) * coeffq;                 \
    msum += (desigma.m + diff * (pix1->m - pix4->m)) * coeffq;                 \
                                                                               \
    row2->r = (channel_type)(rsum);                                            \
    row2->g = (channel_type)(gsum);                                            \
    row2->b = (channel_type)(bsum);                                            \
    row2->m = (channel_type)(msum);                                            \
    row2++;                                                                    \
    pix1++, pix2++, pix3++, pix4++;                                            \
  }
Toshihiro Shimizu 890ddd
Toshihiro Shimizu 890ddd
//-------------------------------------------------------------------
Toshihiro Shimizu 890ddd
Toshihiro Shimizu 890ddd
template <typename pixel_dst,="" pixel_src,="" t="" typename=""></typename>
Shinya Kitaoka 120a6e
inline void blur_code(PIXEL_SRC *row1, PIXEL_DST *row2, int length, float coeff,
Shinya Kitaoka 120a6e
                      float coeffq, int brad, float diff, float round_fac) {
Shinya Kitaoka 120a6e
  int i;
Shinya Kitaoka 120a6e
  T rsum, gsum, bsum, msum;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  BlurPixel<t> sigma1, sigma2, sigma3, desigma;</t>
Shinya Kitaoka 120a6e
  PIXEL_SRC *pix1, *pix2, *pix3, *pix4;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  pix1 = row1;
Shinya Kitaoka 120a6e
  pix2 = row1 - 1;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  sigma1.r = pix1->r;
Shinya Kitaoka 120a6e
  sigma1.g = pix1->g;
Shinya Kitaoka 120a6e
  sigma1.b = pix1->b;
Shinya Kitaoka 120a6e
  sigma1.m = pix1->m;
Shinya Kitaoka 120a6e
  pix1++;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  sigma2.r = sigma2.g = sigma2.b = sigma2.m = 0.0;
Shinya Kitaoka 120a6e
  sigma3.r = sigma3.g = sigma3.b = sigma3.m = 0.0;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  for (i = 1; i < brad; i++) {
Shinya Kitaoka 120a6e
    sigma1.r += pix1->r;
Shinya Kitaoka 120a6e
    sigma1.g += pix1->g;
Shinya Kitaoka 120a6e
    sigma1.b += pix1->b;
Shinya Kitaoka 120a6e
    sigma1.m += pix1->m;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
    sigma2.r += pix2->r;
Shinya Kitaoka 120a6e
    sigma2.g += pix2->g;
Shinya Kitaoka 120a6e
    sigma2.b += pix2->b;
Shinya Kitaoka 120a6e
    sigma2.m += pix2->m;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
    sigma3.r += i * (pix1->r + pix2->r);
Shinya Kitaoka 120a6e
    sigma3.g += i * (pix1->g + pix2->g);
Shinya Kitaoka 120a6e
    sigma3.b += i * (pix1->b + pix2->b);
Shinya Kitaoka 120a6e
    sigma3.m += i * (pix1->m + pix2->m);
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
    pix1++;
Shinya Kitaoka 120a6e
    pix2--;
Shinya Kitaoka 120a6e
  }
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  rsum = (sigma1.r + sigma2.r) * coeff - sigma3.r * coeffq + (round_fac);
Shinya Kitaoka 120a6e
  gsum = (sigma1.g + sigma2.g) * coeff - sigma3.g * coeffq + (round_fac);
Shinya Kitaoka 120a6e
  bsum = (sigma1.b + sigma2.b) * coeff - sigma3.b * coeffq + (round_fac);
Shinya Kitaoka 120a6e
  msum = (sigma1.m + sigma2.m) * coeff - sigma3.m * coeffq + (round_fac);
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  row2->r = rsum;
Shinya Kitaoka 120a6e
  row2->g = gsum;
Shinya Kitaoka 120a6e
  row2->b = bsum;
Shinya Kitaoka 120a6e
  row2->m = msum;
Shinya Kitaoka 120a6e
  row2++;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  sigma2.r += row1[-brad].r;
Shinya Kitaoka 120a6e
  sigma2.g += row1[-brad].g;
Shinya Kitaoka 120a6e
  sigma2.b += row1[-brad].b;
Shinya Kitaoka 120a6e
  sigma2.m += row1[-brad].m;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  pix1 = row1 + brad;
Shinya Kitaoka 120a6e
  pix2 = row1;
Shinya Kitaoka 120a6e
  pix3 = row1 - brad;
Shinya Kitaoka 120a6e
  pix4 = row1 - brad + 1;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  desigma.r = sigma1.r - sigma2.r;
Shinya Kitaoka 120a6e
  desigma.g = sigma1.g - sigma2.g;
Shinya Kitaoka 120a6e
  desigma.b = sigma1.b - sigma2.b;
Shinya Kitaoka 120a6e
  desigma.m = sigma1.m - sigma2.m;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  for (i = 1; i < length; i++) {
Shinya Kitaoka 120a6e
    desigma.r += pix1->r - 2 * pix2->r + pix3->r;
Shinya Kitaoka 120a6e
    desigma.g += pix1->g - 2 * pix2->g + pix3->g;
Shinya Kitaoka 120a6e
    desigma.b += pix1->b - 2 * pix2->b + pix3->b;
Shinya Kitaoka 120a6e
    desigma.m += pix1->m - 2 * pix2->m + pix3->m;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
    rsum += (desigma.r + diff * (pix1->r - pix4->r)) * coeffq;
Shinya Kitaoka 120a6e
    gsum += (desigma.g + diff * (pix1->g - pix4->g)) * coeffq;
Shinya Kitaoka 120a6e
    bsum += (desigma.b + diff * (pix1->b - pix4->b)) * coeffq;
Shinya Kitaoka 120a6e
    msum += (desigma.m + diff * (pix1->m - pix4->m)) * coeffq;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
    row2->r = rsum;
Shinya Kitaoka 120a6e
    row2->g = gsum;
Shinya Kitaoka 120a6e
    row2->b = bsum;
Shinya Kitaoka 120a6e
    row2->m = msum;
Shinya Kitaoka 120a6e
    row2++;
Shinya Kitaoka 120a6e
    pix1++, pix2++, pix3++, pix4++;
Shinya Kitaoka 120a6e
  }
Toshihiro Shimizu 890ddd
}
Toshihiro Shimizu 890ddd
Toshihiro Shimizu 890ddd
//-------------------------------------------------------------------
Toshihiro Shimizu 890ddd
322341
#ifdef USE_SSE2
Toshihiro Shimizu 890ddd
Toshihiro Shimizu 890ddd
//-------------------------------------------------------------------
Toshihiro Shimizu 890ddd
template <class class="" p="" t,=""></class>
Shinya Kitaoka 120a6e
inline void blur_code_SSE2(T *row1, BlurPixel

*row2, int length, float coeff,

Shinya Kitaoka 120a6e
                           float coeffq, int brad, float diff,
Shinya Kitaoka 120a6e
                           float round_fac) {
Shinya Kitaoka 120a6e
  static float two     = 2;
Shinya Kitaoka 120a6e
  static __m128i zeros = _mm_setzero_si128();
Shinya Kitaoka 120a6e
  static __m128 twos   = _mm_load_ps1(&two);
Toshihiro Shimizu 890ddd
Shinya Kitaoka 120a6e
  int i;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  __m128 sigma1, sigma2, sigma3, desigma;
Shinya Kitaoka 120a6e
  T *pix1, *pix2, *pix3, *pix4;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  pix1 = row1;
Shinya Kitaoka 120a6e
  pix2 = row1 - 1;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  //
Shinya Kitaoka 120a6e
  __m128i piPix1 = _mm_cvtsi32_si128(*(DWORD *)pix1);
Shinya Kitaoka 120a6e
  __m128i piPix2 = _mm_cvtsi32_si128(*(DWORD *)pix2);
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  piPix1 = _mm_unpacklo_epi8(piPix1, zeros);
Shinya Kitaoka 120a6e
  piPix2 = _mm_unpacklo_epi8(piPix2, zeros);
Shinya Kitaoka 120a6e
  piPix1 = _mm_unpacklo_epi16(piPix1, zeros);
Shinya Kitaoka 120a6e
  piPix2 = _mm_unpacklo_epi16(piPix2, zeros);
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  sigma1 = _mm_cvtepi32_ps(piPix1);
Shinya Kitaoka 120a6e
  //
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  pix1++;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  float zero = 0;
Shinya Kitaoka 120a6e
  sigma2     = _mm_load1_ps(&zero);
Shinya Kitaoka 120a6e
  sigma3     = _mm_load1_ps(&zero);
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  for (i = 1; i < brad; i++) {
Shinya Kitaoka 120a6e
    piPix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(DWORD *)pix1), zeros);
Shinya Kitaoka 120a6e
    piPix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(DWORD *)pix2), zeros);
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
    __m128 pPix1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(piPix1, zeros));
Shinya Kitaoka 120a6e
    __m128 pPix2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(piPix2, zeros));
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
    sigma1 = _mm_add_ps(sigma1, pPix1);
Shinya Kitaoka 120a6e
    sigma2 = _mm_add_ps(sigma2, pPix2);
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
    __m128i pii = _mm_unpacklo_epi8(_mm_cvtsi32_si128(i), zeros);
Shinya Kitaoka 120a6e
    __m128 pi   = _mm_cvtepi32_ps(_mm_unpacklo_epi16(pii, zeros));
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
    pPix1  = _mm_add_ps(pPix1, pPix2);
Shinya Kitaoka 120a6e
    pPix1  = _mm_mul_ps(pi, pPix1);      // i*(pix1 + pix2)
Shinya Kitaoka 120a6e
    sigma3 = _mm_add_ps(sigma3, pPix1);  // sigma3 += i*(pix1 + pix2)
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
    pix1++;
Shinya Kitaoka 120a6e
    pix2--;
Shinya Kitaoka 120a6e
  }
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  __m128 pCoeff    = _mm_load1_ps(&coeff);
Shinya Kitaoka 120a6e
  __m128 pCoeffq   = _mm_load1_ps(&coeffq);
Shinya Kitaoka 120a6e
  __m128 pRoundFac = _mm_load1_ps(&round_fac);
Shinya Kitaoka 120a6e
  __m128 pDiff     = _mm_load1_ps(&diff);
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  // sum = (sigma1 + sigma2)*coeff - sigma3*coeffq + round_fac
Shinya Kitaoka 120a6e
  __m128 sum  = _mm_add_ps(sigma1, sigma2);
Shinya Kitaoka 120a6e
  sum         = _mm_mul_ps(sum, pCoeff);
Shinya Kitaoka 120a6e
  __m128 sum2 = _mm_mul_ps(sigma3, pCoeffq);
Shinya Kitaoka 120a6e
  sum2        = _mm_add_ps(sum2, pRoundFac);
Shinya Kitaoka 120a6e
  sum         = _mm_sub_ps(sum, sum2);
Shinya Kitaoka 120a6e
  /*
Shinya Kitaoka 120a6e
  __m128i isum = _mm_cvtps_epi32(sum);
Shinya Kitaoka 120a6e
isum = _mm_packs_epi32(isum, zeros);
Shinya Kitaoka 120a6e
isum = _mm_packs_epi16(isum, zeros);
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
*(DWORD*)row2 = _mm_cvtsi128_si32(isum);
Shinya Kitaoka 120a6e
*/
Shinya Kitaoka 120a6e
  _mm_store_ps((float *)row2, sum);
Shinya Kitaoka 120a6e
  row2++;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  __m128i piPixMin =
Shinya Kitaoka 120a6e
      _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(DWORD *)(row1 - brad)), zeros);
Shinya Kitaoka 120a6e
  __m128 pPixMin = _mm_cvtepi32_ps(_mm_unpacklo_epi16(piPixMin, zeros));
Shinya Kitaoka 120a6e
  sigma2         = _mm_add_ps(sigma2, pPixMin);
Shinya Kitaoka 120a6e
  /*
Shinya Kitaoka 120a6e
sigma2.r += row1[-brad].r;
Shinya Kitaoka 120a6e
sigma2.g += row1[-brad].g;
Shinya Kitaoka 120a6e
sigma2.b += row1[-brad].b;
Shinya Kitaoka 120a6e
sigma2.m += row1[-brad].m;
Toshihiro Shimizu 890ddd
*/
Toshihiro Shimizu 890ddd
Shinya Kitaoka 120a6e
  pix1 = row1 + brad;
Shinya Kitaoka 120a6e
  pix2 = row1;
Shinya Kitaoka 120a6e
  pix3 = row1 - brad;
Shinya Kitaoka 120a6e
  pix4 = row1 - brad + 1;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  desigma = _mm_sub_ps(sigma1, sigma2);
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  for (i = 1; i < length; i++) {
Shinya Kitaoka 120a6e
    piPix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(DWORD *)pix1), zeros);
Shinya Kitaoka 120a6e
    piPix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(DWORD *)pix2), zeros);
Shinya Kitaoka 120a6e
    __m128i piPix3 =
Shinya Kitaoka 120a6e
        _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(DWORD *)pix3), zeros);
Shinya Kitaoka 120a6e
    __m128i piPix4 =
Shinya Kitaoka 120a6e
        _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(DWORD *)pix4), zeros);
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
    __m128 pPix1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(piPix1, zeros));
Shinya Kitaoka 120a6e
    __m128 pPix2 =
Shinya Kitaoka 120a6e
        _mm_cvtepi32_ps(_mm_slli_epi32(_mm_unpacklo_epi16(piPix2, zeros), 1));
Shinya Kitaoka 120a6e
    __m128 pPix3 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(piPix3, zeros));
Shinya Kitaoka 120a6e
    __m128 pPix4 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(piPix4, zeros));
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
    // desigma += pix1 - 2*pix2 + pix3
Shinya Kitaoka 120a6e
    __m128 tmp = _mm_sub_ps(pPix3, pPix2);
Shinya Kitaoka 120a6e
    tmp        = _mm_add_ps(tmp, pPix1);
Shinya Kitaoka 120a6e
    desigma    = _mm_add_ps(desigma, tmp);
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
    // sum += (desigma + diff*(pix1 - pix4))*coeffq
Shinya Kitaoka 120a6e
    tmp = _mm_sub_ps(pPix1, pPix4);
Shinya Kitaoka 120a6e
    tmp = _mm_mul_ps(tmp, pDiff);
Shinya Kitaoka 120a6e
    tmp = _mm_add_ps(desigma, tmp);
Shinya Kitaoka 120a6e
    tmp = _mm_mul_ps(tmp, pCoeffq);
Shinya Kitaoka 120a6e
    sum = _mm_add_ps(sum, tmp);
Shinya Kitaoka 120a6e
    /*
Shinya Kitaoka 120a6e
isum = _mm_cvtps_epi32(sum);
Shinya Kitaoka 120a6e
isum = _mm_packs_epi32(isum, zeros);
Shinya Kitaoka 120a6e
isum = _mm_packs_epi16(isum, zeros);
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
*(DWORD*)row2 = _mm_cvtsi128_si32(isum);
Toshihiro Shimizu 890ddd
*/
Shinya Kitaoka 120a6e
    _mm_store_ps((float *)row2, sum);
Toshihiro Shimizu 890ddd
Shinya Kitaoka 120a6e
    row2++;
Shinya Kitaoka 120a6e
    pix1++, pix2++, pix3++, pix4++;
Shinya Kitaoka 120a6e
  }
Toshihiro Shimizu 890ddd
}
Toshihiro Shimizu 890ddd
Toshihiro Shimizu 890ddd
//-------------------------------------------------------------------
Toshihiro Shimizu 890ddd
template <class class="" p="" t,=""></class>
Shinya Kitaoka 120a6e
inline void blur_code_SSE2(BlurPixel

*row1, T *row2, int length, float coeff,

Shinya Kitaoka 120a6e
                           float coeffq, int brad, float diff,
Shinya Kitaoka 120a6e
                           float round_fac) {
Shinya Kitaoka 120a6e
  int i;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  float two     = 2;
Shinya Kitaoka 120a6e
  __m128i zeros = _mm_setzero_si128();
Shinya Kitaoka 120a6e
  __m128 twos   = _mm_load_ps1(&two);
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  __m128 sigma1, sigma2, sigma3, desigma;
Shinya Kitaoka 120a6e
  BlurPixel

*pix1, *pix2, *pix3, *pix4;

Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  pix1 = row1;
Shinya Kitaoka 120a6e
  pix2 = row1 - 1;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  __m128 pPix1 = _mm_load_ps((float *)pix1);
Shinya Kitaoka 120a6e
  __m128 pPix2 = _mm_load_ps((float *)pix2);
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  // sigma1 = *pix1
Shinya Kitaoka 120a6e
  sigma1 = pPix1;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  pix1++;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  float zero = 0;
Shinya Kitaoka 120a6e
  sigma2     = _mm_load1_ps(&zero);
Shinya Kitaoka 120a6e
  sigma3     = _mm_load1_ps(&zero);
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  for (i = 1; i < brad; i++) {
Shinya Kitaoka 120a6e
    pPix1 = _mm_load_ps((float *)pix1);
Shinya Kitaoka 120a6e
    pPix2 = _mm_load_ps((float *)pix2);
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
    sigma1 = _mm_add_ps(sigma1, pPix1);
Shinya Kitaoka 120a6e
    sigma2 = _mm_add_ps(sigma2, pPix2);
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
    __m128i pii = _mm_unpacklo_epi8(_mm_cvtsi32_si128(i), zeros);
Shinya Kitaoka 120a6e
    __m128 pi   = _mm_cvtepi32_ps(_mm_unpacklo_epi16(pii, zeros));
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
    pPix1  = _mm_add_ps(pPix1, pPix2);
Shinya Kitaoka 120a6e
    pPix1  = _mm_mul_ps(pi, pPix1);      // i*(pix1 + pix2)
Shinya Kitaoka 120a6e
    sigma3 = _mm_add_ps(sigma3, pPix1);  // sigma3 += i*(pix1 + pix2)
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
    pix1++;
Shinya Kitaoka 120a6e
    pix2--;
Shinya Kitaoka 120a6e
  }
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  __m128 pCoeff  = _mm_load1_ps(&coeff);
Shinya Kitaoka 120a6e
  __m128 pCoeffq = _mm_load1_ps(&coeffq);
Shinya Kitaoka 120a6e
  //  __m128 pRoundFac = _mm_load1_ps(&round_fac);
Shinya Kitaoka 120a6e
  __m128 pDiff = _mm_load1_ps(&diff);
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  // sum = (sigma1 + sigma2)*coeff - sigma3*coeffq + round_fac
Shinya Kitaoka 120a6e
  __m128 sum  = _mm_add_ps(sigma1, sigma2);
Shinya Kitaoka 120a6e
  sum         = _mm_mul_ps(sum, pCoeff);
Shinya Kitaoka 120a6e
  __m128 sum2 = _mm_mul_ps(sigma3, pCoeffq);
Shinya Kitaoka 120a6e
  // sum2 = _mm_add_ps(sum2, pRoundFac);
Shinya Kitaoka 120a6e
  sum = _mm_sub_ps(sum, sum2);
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  // converte i canali da float a char
Shinya Kitaoka 120a6e
  __m128i isum = _mm_cvtps_epi32(sum);
Shinya Kitaoka 120a6e
  isum         = _mm_packs_epi32(isum, zeros);
Shinya Kitaoka 120a6e
  // isum = _mm_packs_epi16(isum, zeros);
Shinya Kitaoka 120a6e
  isum           = _mm_packus_epi16(isum, zeros);
Shinya Kitaoka 120a6e
  *(DWORD *)row2 = _mm_cvtsi128_si32(isum);
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  row2++;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  // sigma2 += row1[-brad]
Shinya Kitaoka 120a6e
  __m128 pPixMin = _mm_load_ps((float *)(row1 - brad));
Shinya Kitaoka 120a6e
  sigma2         = _mm_add_ps(sigma2, pPixMin);
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  pix1 = row1 + brad;
Shinya Kitaoka 120a6e
  pix2 = row1;
Shinya Kitaoka 120a6e
  pix3 = row1 - brad;
Shinya Kitaoka 120a6e
  pix4 = row1 - brad + 1;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  desigma = _mm_sub_ps(sigma1, sigma2);
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  for (i = 1; i < length; i++) {
Shinya Kitaoka 120a6e
    __m128 pPix1 = _mm_load_ps((float *)pix1);
Shinya Kitaoka 120a6e
    __m128 pPix2 = _mm_load_ps((float *)pix2);
Shinya Kitaoka 120a6e
    __m128 pPix3 = _mm_load_ps((float *)pix3);
Shinya Kitaoka 120a6e
    __m128 pPix4 = _mm_load_ps((float *)pix4);
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
    pPix2 = _mm_mul_ps(pPix2, twos);
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
    // desigma += pix1 - 2*pix2 + pix3
Shinya Kitaoka 120a6e
    __m128 tmp = _mm_sub_ps(pPix3, pPix2);
Shinya Kitaoka 120a6e
    tmp        = _mm_add_ps(tmp, pPix1);
Shinya Kitaoka 120a6e
    desigma    = _mm_add_ps(desigma, tmp);
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
    // sum += (desigma + diff*(pix1 - pix4))*coeffq
Shinya Kitaoka 120a6e
    tmp = _mm_sub_ps(pPix1, pPix4);
Shinya Kitaoka 120a6e
    tmp = _mm_mul_ps(tmp, pDiff);
Shinya Kitaoka 120a6e
    tmp = _mm_add_ps(desigma, tmp);
Shinya Kitaoka 120a6e
    tmp = _mm_mul_ps(tmp, pCoeffq);
Shinya Kitaoka 120a6e
    sum = _mm_add_ps(sum, tmp);
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
    // converte i canali da float a char
Shinya Kitaoka 120a6e
    __m128i isum = _mm_cvtps_epi32(sum);
Shinya Kitaoka 120a6e
    isum         = _mm_packs_epi32(isum, zeros);
Shinya Kitaoka 120a6e
    // isum = _mm_packs_epi16(isum, zeros);  // QUESTA RIGA E' SBAGLIATA
Shinya Kitaoka 120a6e
    // assert(false);
Shinya Kitaoka 120a6e
    isum           = _mm_packus_epi16(isum, zeros);
Shinya Kitaoka 120a6e
    *(DWORD *)row2 = _mm_cvtsi128_si32(isum);
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
    row2++;
Shinya Kitaoka 120a6e
    pix1++, pix2++, pix3++, pix4++;
Shinya Kitaoka 120a6e
  }
Toshihiro Shimizu 890ddd
}
Toshihiro Shimizu 890ddd
Shinya Kitaoka 120a6e
#endif  // USE_SSE2
Toshihiro Shimizu 890ddd
Toshihiro Shimizu 890ddd
//-------------------------------------------------------------------
Toshihiro Shimizu 890ddd
Shinya Kitaoka 120a6e
// STORE_COL_CODE(crop_val): writes a processed column back into column `x`
// of a destination buffer whose rows are `wrap` elements apart.
//
// Names that must be in scope at the expansion site:
//   buffer        destination pointer (advanced by the macro)
//   col           source column, elements with r, g, b, m members
//   wrap          destination row stride in elements
//   x             destination column index
//   ly, r_ly, dy  the rows copied are col[i] for
//                 i in [max(0, -dy), min(ly, r_ly - dy))
//   backlit       when non-zero every channel is multiplied by
//                 ampl = 1 + blur / 15, rounded with troundp() and limited
//                 to at most crop_val; otherwise pixels are copied unchanged
//   blur          used only to compute ampl
//
// The first copied element always lands on buffer[x]; the macro does not
// offset the destination by the starting value of i.
#define STORE_COL_CODE(crop_val)                                               \
  {                                                                            \
    int i, val;                                                                \
    double ampl;                                                               \
    buffer += x;                                                               \
                                                                               \
    ampl = 1.0 + blur / 15.0;                                                  \
                                                                               \
    if (backlit)                                                               \
      for (i = ((dy >= 0) ? 0 : -dy); i < std::min(ly, r_ly - dy); i++) {      \
        val       = troundp(col[i].r * ampl);                                  \
        buffer->r = (val > crop_val) ? crop_val : val;                         \
        val       = troundp(col[i].g * ampl);                                  \
        buffer->g = (val > crop_val) ? crop_val : val;                         \
        val       = troundp(col[i].b * ampl);                                  \
        buffer->b = (val > crop_val) ? crop_val : val;                         \
        val       = troundp(col[i].m * ampl);                                  \
        buffer->m = (val > crop_val) ? crop_val : val;                         \
        buffer += wrap;                                                        \
      }                                                                        \
    else                                                                       \
      for (i = ((dy >= 0) ? 0 : -dy); i < std::min(ly, r_ly - dy); i++) {      \
        *buffer = col[i];                                                      \
        buffer += wrap;                                                        \
      }                                                                        \
  }
Toshihiro Shimizu 890ddd
Toshihiro Shimizu 890ddd
//-------------------------------------------------------------------
Toshihiro Shimizu 890ddd
template <class t=""></class>
Shinya Kitaoka 120a6e
void store_colRgb(T *buffer, int wrap, int r_ly, T *col, int ly, int x, int dy,
Shinya Kitaoka 120a6e
                  int backlit, double blur) {
Shinya Kitaoka 120a6e
  int val = T::maxChannelValue;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  if (val == 255)
Shinya Kitaoka 120a6e
    STORE_COL_CODE(204)
Shinya Kitaoka 120a6e
  else if (val == 65535)
Shinya Kitaoka 120a6e
    STORE_COL_CODE(204 * 257)
Shinya Kitaoka 120a6e
  else
Shinya Kitaoka 120a6e
    assert(false);
Toshihiro Shimizu 890ddd
}
Toshihiro Shimizu 890ddd
Toshihiro Shimizu 890ddd
//-------------------------------------------------------------------
Toshihiro Shimizu 890ddd
// Writes one filtered grayscale column into the destination buffer.
//
//   buffer : first destination row to write (the caller has already applied
//            any positive vertical offset); column x of it is written
//   wrap   : distance, in pixels, between two consecutive destination rows
//   r_ly   : height of the destination raster
//   col    : filtered column to copy from
//   ly     : number of pixels in col
//   x      : destination column index
//   dy     : vertical offset of the source with respect to the destination;
//            when negative, the first -dy pixels of col are skipped
//   backlit, blur : unused here; kept so the signature parallels store_colRgb
//
// Pixels col[first .. last-1] are copied, where first = max(0, -dy) and
// last = min(ly, r_ly - dy), so nothing is written past the destination.
template <class T>
void store_colGray(T *buffer, int wrap, int r_ly, T *col, int ly, int x, int dy,
                   int backlit, double blur) {
  (void)backlit;
  (void)blur;

  const int first = (dy >= 0) ? 0 : -dy;
  const int last  = std::min(ly, r_ly - dy);

  buffer += x;

  for (int i = first; i < last; i++) {
    *buffer = col[i];
    buffer += wrap;
  }
}
Toshihiro Shimizu 890ddd
Toshihiro Shimizu 890ddd
//-------------------------------------------------------------------
Toshihiro Shimizu 890ddd
template <class p=""></class>
Shinya Kitaoka 120a6e
void load_colRgb(BlurPixel

*buffer, BlurPixel

*col, int lx, int ly, int x,

Shinya Kitaoka 120a6e
                 int brad, int by1, int by2) {
Shinya Kitaoka 120a6e
  int i;
Shinya Kitaoka 120a6e
  BlurPixel

*pix, left_val, right_val;

Toshihiro Shimizu 890ddd
Shinya Kitaoka 120a6e
  LOAD_COL_CODE
Toshihiro Shimizu 890ddd
}
Toshihiro Shimizu 890ddd
Toshihiro Shimizu 890ddd
//-------------------------------------------------------------------
Toshihiro Shimizu 890ddd
Shinya Kitaoka 120a6e
void load_channel_col32(float *buffer, float *col, int lx, int ly, int x,
Shinya Kitaoka 120a6e
                        int brad, int by1, int by2) {
Shinya Kitaoka 120a6e
  int i;
Shinya Kitaoka 120a6e
  float *pix, left_val, right_val;
Toshihiro Shimizu 890ddd
Shinya Kitaoka 120a6e
  LOAD_COL_CODE
Toshihiro Shimizu 890ddd
}
Toshihiro Shimizu 890ddd
Toshihiro Shimizu 890ddd
//-------------------------------------------------------------------
Toshihiro Shimizu 890ddd
template <class class="" p="" q,="" t,=""></class>
Shinya Kitaoka 120a6e
void do_filtering_chan(BlurPixel

*row1, T *row2, int length, float coeff,

Shinya Kitaoka 120a6e
                       float coeffq, int brad, float diff, bool useSSE) {
322341
#ifdef USE_SSE2
Shinya Kitaoka 120a6e
  if (useSSE && T::maxChannelValue == 255)
Shinya Kitaoka 120a6e
    blur_code_SSE2<t, p="">(row1, row2, length, coeff, coeffq, brad, diff, 0.5);</t,>
Shinya Kitaoka 120a6e
  else
Toshihiro Shimizu 890ddd
#endif
Shinya Kitaoka 120a6e
  {
Shinya Kitaoka 120a6e
    int i;
Shinya Kitaoka 120a6e
    P rsum, gsum, bsum, msum;
Shinya Kitaoka 120a6e
    BlurPixel

sigma1, sigma2, sigma3, desigma;

Shinya Kitaoka 120a6e
    BlurPixel

*pix1, *pix2, *pix3, *pix4;

Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
    BLUR_CODE((P)0.5, Q)
Shinya Kitaoka 120a6e
  }
Toshihiro Shimizu 890ddd
}
Toshihiro Shimizu 890ddd
Toshihiro Shimizu 890ddd
//-------------------------------------------------------------------
Toshihiro Shimizu 890ddd
Toshihiro Shimizu 890ddd
// Horizontal filtering pass for single-channel rasters.
//
// Reads pixels through their `value` member from row1 and writes `length`
// filtered float samples to row2. The first output sample is computed directly
// from the running sums; every following sample is obtained incrementally from
// the previous one, so the cost per sample does not depend on brad.
//
//   row1   : input row; must be readable on [-brad, length + brad - 2]
//   row2   : output row; row2[0 .. length-1] is written
//   coeff  : weight applied to the plain sums of the neighbourhood
//   coeffq : weight applied to the distance-weighted sum and to each
//            incremental update
//   brad   : filter radius in pixels
//   diff   : weight of the (pix1 - pix4) correction term in each update
template <class T>
void do_filtering_channel_float(T *row1, float *row2, int length, float coeff,
                                float coeffq, int brad, float diff) {
  int i;
  float sum;
  float sigma1, sigma2, sigma3, desigma;
  T *pix1, *pix2, *pix3, *pix4;

  // --- first output sample: explicit sums over the neighbourhood ---------
  pix1 = row1;      // walks right from the centre
  pix2 = row1 - 1;  // walks left from the centre

  sigma1 = pix1->value;  // centre + right side
  pix1++;

  sigma2 = 0.0;  // left side
  sigma3 = 0.0;  // distance-weighted sum of both sides

  for (i = 1; i < brad; i++) {
    sigma1 += pix1->value;
    sigma2 += pix2->value;
    sigma3 += i * (pix1->value + pix2->value);
    pix1++;
    pix2--;
  }

  sum = (sigma1 + sigma2) * coeff - sigma3 * coeffq;

  *row2 = sum;
  row2++;

  // --- remaining samples: incremental update ------------------------------
  sigma2 += row1[-brad].value;

  pix1 = row1 + brad;
  pix2 = row1;
  pix3 = row1 - brad;
  pix4 = row1 - brad + 1;

  desigma = sigma1 - sigma2;

  for (i = 1; i < length; i++) {
    desigma += pix1->value - 2 * pix2->value + pix3->value;

    sum += (desigma + diff * (pix1->value - pix4->value)) * coeffq;

    *row2 = sum;
    row2++;
    pix1++, pix2++, pix3++, pix4++;
  }
}
Toshihiro Shimizu 890ddd
Toshihiro Shimizu 890ddd
//-------------------------------------------------------------------
Toshihiro Shimizu 890ddd
// Vertical filtering pass for single-channel rasters.
//
// Reads float samples from row1 and stores `length` filtered values into row2
// through T::setValue(int). A 0.5 bias is added once to the running sum so
// that the truncating int conversion rounds to nearest; because every later
// sample is derived incrementally from that sum, the bias carries over.
//
//   row1   : input column; must be readable on [-brad, length + brad - 2]
//   row2   : output column; row2[0 .. length-1] is written
//   coeff  : weight applied to the plain sums of the neighbourhood
//   coeffq : weight applied to the distance-weighted sum and to each
//            incremental update
//   brad   : filter radius in pixels
//   diff   : weight of the (pix1 - pix4) correction term in each update
template <class T>
void do_filtering_channel_gray(float *row1, T *row2, int length, float coeff,
                               float coeffq, int brad, float diff) {
  int i;
  float sum;
  float sigma1, sigma2, sigma3, desigma;
  float *pix1, *pix2, *pix3, *pix4;

  // --- first output sample: explicit sums over the neighbourhood ---------
  pix1 = row1;      // walks right from the centre
  pix2 = row1 - 1;  // walks left from the centre

  sigma1 = *pix1;  // centre + right side
  pix1++;

  sigma2 = 0.0;  // left side
  sigma3 = 0.0;  // distance-weighted sum of both sides

  for (i = 1; i < brad; i++) {
    sigma1 += *pix1;
    sigma2 += *pix2;
    sigma3 += i * (*pix1 + *pix2);
    pix1++;
    pix2--;
  }

  sum = (sigma1 + sigma2) * coeff - sigma3 * coeffq + 0.5F;

  row2->setValue((int)sum);
  row2++;

  // --- remaining samples: incremental update ------------------------------
  sigma2 += row1[-brad];

  pix1 = row1 + brad;
  pix2 = row1;
  pix3 = row1 - brad;
  pix4 = row1 - brad + 1;

  desigma = sigma1 - sigma2;

  for (i = 1; i < length; i++) {
    desigma += *pix1 - 2 * (*pix2) + (*pix3);

    sum += (desigma + diff * (*pix1 - *pix4)) * coeffq;

    row2->setValue((int)sum);
    row2++;
    pix1++, pix2++, pix3++, pix4++;
  }
}
Toshihiro Shimizu 890ddd
Toshihiro Shimizu 890ddd
//-------------------------------------------------------------------
Toshihiro Shimizu 890ddd
template <class t=""></class>
Shinya Kitaoka 120a6e
void load_rowRgb(TRasterPT<t> &rin, T *row, int lx, int y, int brad, int bx1,</t>
Shinya Kitaoka 120a6e
                 int bx2) {
Shinya Kitaoka 120a6e
  int i;
Shinya Kitaoka 120a6e
  T *buf32, *pix;
Shinya Kitaoka 120a6e
  T left_val, right_val;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  pix = row + bx1;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  {
Shinya Kitaoka 120a6e
    rin->lock();
Shinya Kitaoka 120a6e
    buf32 = rin->pixels(y);
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
    for (i = 0; i < lx; i++) *pix++ = *buf32++;
Shinya Kitaoka 120a6e
    rin->unlock();
Shinya Kitaoka 120a6e
  }
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  pix += bx2;
Shinya Kitaoka 120a6e
  left_val  = *row;
Shinya Kitaoka 120a6e
  right_val = *(pix - 1);
Shinya Kitaoka 120a6e
  row--;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  for (i = 0; i < brad;
Shinya Kitaoka 120a6e
       i++) /* pixels equal to the ones of border of image are added   */
Shinya Kitaoka 120a6e
  {         /* to avoid a black blur to get into the picture.          */
Shinya Kitaoka 120a6e
    *row-- = left_val;
Shinya Kitaoka 120a6e
    *pix++ = right_val;
Shinya Kitaoka 120a6e
  }
Toshihiro Shimizu 890ddd
}
Toshihiro Shimizu 890ddd
Toshihiro Shimizu 890ddd
//-------------------------------------------------------------------
Toshihiro Shimizu 890ddd
template <class t=""></class>
Shinya Kitaoka 120a6e
void load_rowGray(TRasterPT<t> &rin, T *row, int lx, int y, int brad, int bx1,</t>
Shinya Kitaoka 120a6e
                  int bx2) {
Shinya Kitaoka 120a6e
  int i;
Shinya Kitaoka 120a6e
  T *buf8, *pix;
Shinya Kitaoka 120a6e
  T left_val, right_val;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  pix  = row + bx1;
Shinya Kitaoka 120a6e
  buf8 = (T *)(rin->pixels(y));
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  for (i = 0; i < lx; i++) *pix++ = *buf8++;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  pix += bx2;
Shinya Kitaoka 120a6e
  left_val  = *row;
Shinya Kitaoka 120a6e
  right_val = *(pix - 1);
Shinya Kitaoka 120a6e
  row--;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  for (i = 0; i < brad;
Shinya Kitaoka 120a6e
       i++) /* pixels equal to the ones of border of image are added   */
Shinya Kitaoka 120a6e
  {         /* to avoid a black blur to get into the picture.          */
Shinya Kitaoka 120a6e
    *row-- = left_val;
Shinya Kitaoka 120a6e
    *pix++ = right_val;
Shinya Kitaoka 120a6e
  }
Toshihiro Shimizu 890ddd
}
Toshihiro Shimizu 890ddd
Toshihiro Shimizu 890ddd
//-------------------------------------------------------------------
Toshihiro Shimizu 890ddd
template <class class="" p="" t,=""></class>
Shinya Kitaoka 120a6e
void do_filtering_floatRgb(T *row1, BlurPixel

*row2, int length, float coeff,

Shinya Kitaoka 120a6e
                           float coeffq, int brad, float diff, bool useSSE) {
Toshihiro Shimizu 890ddd
/*
Toshihiro Shimizu 890ddd
  int i;
Toshihiro Shimizu 890ddd
  float rsum, gsum, bsum,  msum;
Toshihiro Shimizu 890ddd
  CASM_FPIXEL sigma1, sigma2, sigma3, desigma;
Toshihiro Shimizu 890ddd
  TPixel32 *pix1, *pix2, *pix3, *pix4;
Toshihiro Shimizu 890ddd
Toshihiro Shimizu 890ddd
  BLUR_CODE(0, unsigned char)
Toshihiro Shimizu 890ddd
*/
Toshihiro Shimizu 890ddd
322341
#ifdef USE_SSE2
Shinya Kitaoka 120a6e
  if (useSSE)
Shinya Kitaoka 120a6e
    blur_code_SSE2<t, p="">(row1, row2, length, coeff, coeffq, brad, diff, 0);</t,>
Shinya Kitaoka 120a6e
  else
Toshihiro Shimizu 890ddd
#endif
Shinya Kitaoka 120a6e
    blur_code<t, blurpixel<p="">, P>(row1, row2, length, coeff, coeffq, brad, diff,</t,>
Shinya Kitaoka 120a6e
                                  0);
Toshihiro Shimizu 890ddd
}
Toshihiro Shimizu 890ddd
Toshihiro Shimizu 890ddd
//-------------------------------------------------------------------
Toshihiro Shimizu 890ddd
template <class class="" p="" q,="" t,=""></class>
Shinya Kitaoka 120a6e
void doBlurRgb(TRasterPT<t> &dstRas, TRasterPT<t> &srcRas, double blur, int dx,</t></t>
Shinya Kitaoka 120a6e
               int dy, bool useSSE) {
Shinya Kitaoka 120a6e
  int i, lx, ly, llx, lly, brad;
Shinya Kitaoka 120a6e
  float coeff, coeffq, diff;
Shinya Kitaoka 120a6e
  int bx1 = 0, by1 = 0, bx2 = 0, by2 = 0;
Toshihiro Shimizu 890ddd
Shinya Kitaoka 120a6e
  brad = (int)ceil(blur); /* number of pixels involved in the filtering */
Toshihiro Shimizu 890ddd
Shinya Kitaoka 120a6e
  // int border = brad*2; // per sicurezza
Toshihiro Shimizu 890ddd
Shinya Kitaoka 120a6e
  coeff = (float)(blur /
Shinya Kitaoka 120a6e
                  (brad - brad * brad +
Shinya Kitaoka 120a6e
                   blur * (2 * brad -
Shinya Kitaoka 120a6e
                           1))); /*sum of the weights of triangolar filter. */
Shinya Kitaoka 120a6e
  coeffq = (float)(coeff / blur);
Shinya Kitaoka 120a6e
  diff   = (float)(blur - brad);
Shinya Kitaoka 120a6e
  lx     = srcRas->getLx();
Shinya Kitaoka 120a6e
  ly     = srcRas->getLy();
Toshihiro Shimizu 890ddd
Shinya Kitaoka 120a6e
  if ((lx == 0) || (ly == 0)) return;
Toshihiro Shimizu 890ddd
Shinya Kitaoka 120a6e
  llx = lx + bx1 + bx2;
Shinya Kitaoka 120a6e
  lly = ly + by1 + by2;
Toshihiro Shimizu 890ddd
Shinya Kitaoka 120a6e
  T *row1, *col2, *buffer;
Shinya Kitaoka 120a6e
  BlurPixel

*row2, *col1, *fbuffer;

Shinya Kitaoka 120a6e
  TRasterGR8P r1;
Toshihiro Shimizu 890ddd
Shinya Kitaoka 9f5a1b
#ifdef _WIN32
Shinya Kitaoka 120a6e
  if (useSSE) {
Shinya Kitaoka 120a6e
    fbuffer =
Shinya Kitaoka 120a6e
        (BlurPixel

*)_aligned_malloc(llx * ly * sizeof(BlurPixel

), 16);

Shinya Kitaoka 120a6e
    row1 = (T *)_aligned_malloc((llx + 2 * brad) * sizeof(T), 16);
Shinya Kitaoka 120a6e
    col1 = (BlurPixel

*)_aligned_malloc(

Shinya Kitaoka 120a6e
        (lly + 2 * brad) * sizeof(BlurPixel

), 16);

Shinya Kitaoka 120a6e
    col2 = (T *)_aligned_malloc(lly * sizeof(T), 16);
Shinya Kitaoka 120a6e
  } else
Toshihiro Shimizu 890ddd
#endif
Shinya Kitaoka 120a6e
  {
Shinya Kitaoka 120a6e
    TRasterGR8P raux(llx * sizeof(BlurPixel

), ly);

Shinya Kitaoka 120a6e
    r1 = raux;
Shinya Kitaoka 120a6e
    r1->lock();
Shinya Kitaoka 120a6e
    fbuffer = (BlurPixel

*)r1->getRawData(); // new CASM_FPIXEL [llx *ly];

Shinya Kitaoka 120a6e
    row1    = new T[llx + 2 * brad];
Shinya Kitaoka 6fa9ac
    col1    = new BlurPixel

[ lly + 2 * brad ];

Shinya Kitaoka 120a6e
    col2    = new T[lly];
Shinya Kitaoka 120a6e
  }
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  if ((!fbuffer) || (!row1) || (!col1) || (!col2)) {
Shinya Kitaoka 120a6e
    if (!useSSE) r1->unlock();
Michał Janiszewski 50e38f
#ifdef _WIN32
Shinya Kitaoka 120a6e
    if (useSSE) {
Shinya Kitaoka 120a6e
      _aligned_free(col2);
Shinya Kitaoka 120a6e
      _aligned_free(col1);
Shinya Kitaoka 120a6e
      _aligned_free(row1);
Shinya Kitaoka 120a6e
      _aligned_free(fbuffer);
Shinya Kitaoka 120a6e
    } else
Michał Janiszewski 50e38f
#endif
Shinya Kitaoka 120a6e
    {
Shinya Kitaoka 120a6e
      delete[] col2;
Shinya Kitaoka 120a6e
      delete[] col1;
Shinya Kitaoka 120a6e
      delete[] row1;
Shinya Kitaoka 120a6e
    }
Shinya Kitaoka 120a6e
    return;
Shinya Kitaoka 120a6e
  }
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  row2 = fbuffer;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  try {
Shinya Kitaoka 120a6e
    for (i = 0; i < ly; i++) {
Shinya Kitaoka 120a6e
      load_rowRgb<t>(srcRas, row1 + brad, lx, i, brad, bx1, bx2);</t>
Shinya Kitaoka 120a6e
      do_filtering_floatRgb<t>(row1 + brad, row2, llx, coeff, coeffq, brad,</t>
Shinya Kitaoka 120a6e
                               diff, useSSE);
Shinya Kitaoka 120a6e
      row2 += llx;
Shinya Kitaoka 120a6e
    }
Shinya Kitaoka 120a6e
    dstRas->lock();
Shinya Kitaoka 120a6e
    buffer = (T *)dstRas->getRawData();
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
    if (dy >= 0) buffer += (dstRas->getWrap()) * dy;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
    for (i = (dx >= 0) ? 0 : -dx; i < std::min(llx, dstRas->getLx() - dx);
Shinya Kitaoka 120a6e
         i++) {
Shinya Kitaoka 120a6e
      load_colRgb

(fbuffer, col1 + brad, llx, ly, i, brad, by1, by2);

Shinya Kitaoka 120a6e
      do_filtering_chan<t, p="" q,="">(col1 + brad, col2, lly, coeff, coeffq, brad,</t,>
Shinya Kitaoka 120a6e
                                 diff, useSSE);
Shinya Kitaoka 120a6e
      store_colRgb<t>(buffer, dstRas->getWrap(), dstRas->getLy(), col2, lly,</t>
Shinya Kitaoka 120a6e
                      i + dx, dy, 0, blur);
Shinya Kitaoka 120a6e
    }
Shinya Kitaoka 120a6e
    dstRas->unlock();
Shinya Kitaoka 120a6e
  } catch (...) {
Shinya Kitaoka 120a6e
    dstRas->clear();
Shinya Kitaoka 120a6e
  }
Toshihiro Shimizu 890ddd
Shinya Kitaoka 9f5a1b
#ifdef _WIN32
Shinya Kitaoka 120a6e
  if (useSSE) {
Shinya Kitaoka 120a6e
    _aligned_free(col2);
Shinya Kitaoka 120a6e
    _aligned_free(col1);
Shinya Kitaoka 120a6e
    _aligned_free(row1);
Shinya Kitaoka 120a6e
    _aligned_free(fbuffer);
Shinya Kitaoka 120a6e
  } else
Toshihiro Shimizu 890ddd
#endif
Shinya Kitaoka 120a6e
  {
Shinya Kitaoka 120a6e
    delete[] col2;
Shinya Kitaoka 120a6e
    delete[] col1;
Shinya Kitaoka 120a6e
    delete[] row1;
Shinya Kitaoka 120a6e
    r1->unlock();
Shinya Kitaoka 120a6e
  }
Toshihiro Shimizu 890ddd
}
Toshihiro Shimizu 890ddd
Toshihiro Shimizu 890ddd
//-------------------------------------------------------------------
Toshihiro Shimizu 890ddd
Toshihiro Shimizu 890ddd
template <class t=""></class>
Shinya Kitaoka 120a6e
void doBlurGray(TRasterPT<t> &dstRas, TRasterPT<t> &srcRas, double blur, int dx,</t></t>
Shinya Kitaoka 120a6e
                int dy) {
Shinya Kitaoka 120a6e
  int i, lx, ly, llx, lly, brad;
Shinya Kitaoka 120a6e
  float coeff, coeffq, diff;
Shinya Kitaoka 120a6e
  int bx1 = 0, by1 = 0, bx2 = 0, by2 = 0;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  brad  = (int)ceil(blur); /* number of pixels involved in the filtering */
Shinya Kitaoka 120a6e
  coeff = (float)(blur /
Shinya Kitaoka 120a6e
                  (brad - brad * brad +
Shinya Kitaoka 120a6e
                   blur * (2 * brad -
Shinya Kitaoka 120a6e
                           1))); /*sum of the weights of triangolar filter. */
Shinya Kitaoka 120a6e
  coeffq = (float)(coeff / blur);
Shinya Kitaoka 120a6e
  diff   = (float)(blur - brad);
Shinya Kitaoka 120a6e
  lx     = srcRas->getLx();
Shinya Kitaoka 120a6e
  ly     = srcRas->getLy();
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  if ((lx == 0) || (ly == 0)) return;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  llx = lx + bx1 + bx2;
Shinya Kitaoka 120a6e
  lly = ly + by1 + by2;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  T *row1, *col2, *buffer;
Shinya Kitaoka 120a6e
  float *row2, *col1, *fbuffer;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  TRasterGR8P r1(llx * sizeof(float), ly);
Shinya Kitaoka 120a6e
  r1->lock();
Shinya Kitaoka 120a6e
  fbuffer = (float *)r1->getRawData();  // new float[llx *ly];
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  row1 = new T[llx + 2 * brad];
Shinya Kitaoka 120a6e
  col1 = new float[lly + 2 * brad];
Shinya Kitaoka 120a6e
  col2 = new T[lly];
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  if ((!fbuffer) || (!row1) || (!col1) || (!col2)) {
Shinya Kitaoka 120a6e
    delete[] row1;
Shinya Kitaoka 120a6e
    delete[] col1;
Shinya Kitaoka 120a6e
    delete[] col2;
Shinya Kitaoka 120a6e
    return;
Shinya Kitaoka 120a6e
  }
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  row2 = fbuffer;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  for (i = 0; i < ly; i++) {
Shinya Kitaoka 120a6e
    load_rowGray<t>(srcRas, row1 + brad, lx, i, brad, bx1, bx2);</t>
Shinya Kitaoka 120a6e
    do_filtering_channel_float<t>(row1 + brad, row2, llx, coeff, coeffq, brad,</t>
Shinya Kitaoka 120a6e
                                  diff);
Shinya Kitaoka 120a6e
    row2 += llx;
Shinya Kitaoka 120a6e
  }
Shinya Kitaoka 120a6e
  dstRas->lock();
Shinya Kitaoka 120a6e
  buffer = (T *)dstRas->getRawData();
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  if (dy >= 0) buffer += (dstRas->getWrap()) * dy;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  for (i = (dx >= 0) ? 0 : -dx; i < std::min(llx, dstRas->getLx() - dx); i++) {
Shinya Kitaoka 120a6e
    load_channel_col32(fbuffer, col1 + brad, llx, ly, i, brad, by1, by2);
Shinya Kitaoka 120a6e
    do_filtering_channel_gray<t>(col1 + brad, col2, lly, coeff, coeffq, brad,</t>
Shinya Kitaoka 120a6e
                                 diff);
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
    int backlit = 0;
Shinya Kitaoka 120a6e
    store_colGray<t>(buffer, dstRas->getWrap(), dstRas->getLy(), col2, lly,</t>
Shinya Kitaoka 120a6e
                     i + dx, dy, backlit, blur);
Shinya Kitaoka 120a6e
  }
Shinya Kitaoka 120a6e
  dstRas->unlock();
Shinya Kitaoka 120a6e
  delete[] col2;
Shinya Kitaoka 120a6e
  delete[] col1;
Shinya Kitaoka 120a6e
  delete[] row1;
Shinya Kitaoka 120a6e
  r1->unlock();  // delete[]fbuffer;
Toshihiro Shimizu 890ddd
}
Toshihiro Shimizu 890ddd
Shinya Kitaoka 120a6e
};  // namespace
Toshihiro Shimizu 890ddd
Toshihiro Shimizu 890ddd
//====================================================================
Toshihiro Shimizu 890ddd
Shinya Kitaoka 120a6e
// Returns the margin, in pixels, to reserve around a raster that is going to
// be blurred with the given radius: twice the number of pixels involved in
// the filtering (the factor of two is a safety margin).
int TRop::getBlurBorder(double blur) {
  int brad = (int)ceil(blur); /* number of pixels involved in the filtering */

  int border = brad * 2;  // doubled to be on the safe side
  return border;
}
Toshihiro Shimizu 890ddd
Toshihiro Shimizu 890ddd
//--------------------------------------------------------------------
Toshihiro Shimizu 890ddd
Shinya Kitaoka 120a6e
void TRop::blur(const TRasterP &dstRas, const TRasterP &srcRas, double blur,
Shinya Kitaoka 120a6e
                int dx, int dy, bool useSSE) {
Shinya Kitaoka 120a6e
  TRaster32P dstRas32 = dstRas;
Shinya Kitaoka 120a6e
  TRaster32P srcRas32 = srcRas;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
  if (dstRas32 && srcRas32)
Shinya Kitaoka 120a6e
    doBlurRgb<tpixel32, float="" uchar,="">(dstRas32, srcRas32, blur, dx, dy, useSSE);</tpixel32,>
Shinya Kitaoka 120a6e
  else {
Shinya Kitaoka 120a6e
    TRaster64P dstRas64 = dstRas;
Shinya Kitaoka 120a6e
    TRaster64P srcRas64 = srcRas;
Shinya Kitaoka 120a6e
    if (dstRas64 && srcRas64)
Shinya Kitaoka 120a6e
      doBlurRgb<tpixel64, double="" ushort,="">(dstRas64, srcRas64, blur, dx, dy,</tpixel64,>
Shinya Kitaoka 120a6e
                                          useSSE);
Shinya Kitaoka 120a6e
    else {
Shinya Kitaoka 120a6e
      TRasterGR8P dstRasGR8 = dstRas;
Shinya Kitaoka 120a6e
      TRasterGR8P srcRasGR8 = srcRas;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
      if (dstRasGR8 && srcRasGR8)
Shinya Kitaoka 120a6e
        doBlurGray<tpixelgr8>(dstRasGR8, srcRasGR8, blur, dx, dy);</tpixelgr8>
Shinya Kitaoka 120a6e
      else {
Shinya Kitaoka 120a6e
        TRasterGR16P dstRasGR16 = dstRas;
Shinya Kitaoka 120a6e
        TRasterGR16P srcRasGR16 = srcRas;
Shinya Kitaoka 120a6e
Shinya Kitaoka 120a6e
        if (dstRasGR16 && srcRasGR16)
Shinya Kitaoka 120a6e
          doBlurGray<tpixelgr16>(dstRasGR16, srcRasGR16, blur, dx, dy);</tpixelgr16>
Shinya Kitaoka 120a6e
        else
Shinya Kitaoka 120a6e
          throw TException("TRop::blur unsupported pixel type");
Shinya Kitaoka 120a6e
      }
Shinya Kitaoka 120a6e
    }
Shinya Kitaoka 120a6e
  }
Toshihiro Shimizu 890ddd
}