// projects/neural/benchmark.inc.cpp

#ifndef BENCHMARK_INC_CPP
#define BENCHMARK_INC_CPP


#include "common.inc.cpp"
#include "layer.conv.inc.cpp"


class Benchmark: public ThreadControl {
e4740d
private:
e4740d
  typedef int Int;
e4740d
  typedef double Float;
e4740d
  
e4740d
  int repeats;
e4740d
  int mode;
e4740d
  Layout pl, cl;
e4740d
  Kernel k;
e4740d
  std::vector<float> pvalues;</float>
e4740d
  std::vector<float> cvalues;</float>
e4740d
  std::vector<float> weights;</float>
e4740d
  
e4740d
  
e4740d
  __attribute__((always_inline))
e4740d
  void threadFuncXYCP(Barrier &barrier, int k_sx, int k_sy, int c_d, int p_d) {
e4740d
    Layout pl = this->pl;
e4740d
    Layout cl = this->cl;
e4740d
    Kernel k = this->k;
e4740d
    
e4740d
    assert(k.sx == k_sx);
e4740d
    assert(k.sy == k_sy);
e4740d
    assert(cl.getD() == c_d);
e4740d
    assert(pl.getD() == p_d);
e4740d
    
e4740d
    Int tid = barrier.tid;
e4740d
    Int ts = barrier.threads;
e4740d
e4740d
    Int c_w = cl.getW();
e4740d
    Int c_h = cl.getH();
e4740d
    Int c_wd = c_w*c_d;
e4740d
    Int c_hw = c_h*c_w;
e4740d
    Int c_hwd = c_h*c_wd;
e4740d
    Int c_sxz = cl.sx*cl.sz;
e4740d
    
e4740d
    Int p_sxz = pl.sx*pl.sz;
e4740d
    Int p_szk = pl.sz*k.dx;
e4740d
    Int p_sxzk = p_sxz*k.dy;
e4740d
e4740d
    //Int k_sxd = k_sx*p_d;
e4740d
    Int k_syx = k_sx*k_sy;
e4740d
    Int k_syxd = k_syx*p_d;
e4740d
    
e4740d
    Float *cvalues = this->cvalues.data() + cl.y0*c_sxz + cl.x0*cl.sz + cl.z0;
e4740d
    Float *pvalues = this->pvalues.data() + (pl.y0 + k.oy)*p_sxz + (pl.x0 + k.ox)*pl.sz + pl.z0;
e4740d
    Float *weights = this->weights.data();
e4740d
    
e4740d
    if (mode == 0) {
e4740d
      for(Int i = repeats; i; --i) {
e4740d
        for(Int i = tid; i < c_hwd; i += ts) {
e4740d
          Int cy = i/c_wd;
e4740d
          Int cx = i%c_wd/c_d;
e4740d
          Int cz = i%c_d;
e4740d
          
e4740d
          Float *ic = &cvalues[ cy*c_sxz + cx*cl.sz + cz ];
e4740d
          Float *ip = &pvalues[ cy*p_sxzk + cx*p_szk ];
e4740d
          Float *iw = &weights[ cz*k_syxd ];
e4740d
e4740d
          Float a = 0;
e4740d
          
e4740d
          for(Int i = 0; i < p_d; ++i, ++ip, ++iw)
e4740d
          for(Int i = 0; i < k_syx; ++i) {
e4740d
            Int ky = i/k_sx;
e4740d
            Int kx = i%k_sx;
e4740d
            a += iw[i] * ip[ ky*p_sxz + kx*pl.sz ];
e4740d
          }
e4740d
          
e4740d
          *ic = a;
e4740d
        }
e4740d
        barrier.wait2();
e4740d
      }
e4740d
    } else
e4740d
    if (mode == 1) {
e4740d
      if (c_w > 1 || c_h > 1 || pl.sx != pl.getW() || pl.sz != p_d) {
e4740d
        for(Int i = repeats; i; --i)
e4740d
        for(Int i = 0; i < k_syx; ++i) {
e4740d
          Int ky = i/k_sx;
e4740d
          Int kx = i%k_sx;
e4740d
          //Int pi = ky*p_sxz + kx*pl.sz;
e4740d
          //Int wi = i*p_d;
e4740d
          Float *ip = &pvalues[ ky*p_sxz + kx*pl.sz ];
e4740d
          Float *iw = &weights[ i*p_d ];
e4740d
          for(Int i = tid; i < c_hw; i += ts) {
e4740d
            Int cy = i/c_w;
e4740d
            Int cx = i%c_w;
e4740d
            Float *iip = &ip[ cy*p_sxzk + cx*p_szk ];
e4740d
            Float *iic = &cvalues[ cy*c_sxz + cx*cl.sz ];
e4740d
            
e4740d
            for(Int cz = 0; cz < c_d; ++cz) {
e4740d
              //Int pii = pi + cy*p_sxzk + cx*p_szk;
e4740d
              //Int wii = wi + cz*k_syxd;
e4740d
              Float *iiw = &iw[ cz*k_syxd ];
e4740d
              Float a = iic[cz];
e4740d
              
e4740d
              for(Int i = 0; i < p_d; ++i)
e4740d
                iip[i] += iiw[i] * a;
e4740d
                //pvalues[pii + i] += weights[wii + i] * a;
e4740d
            }
e4740d
          }
e4740d
          barrier.wait2();
e4740d
        }
e4740d
      } else {
e4740d
        //Int c_dd = c_d*p_d;
e4740d
        //Int c_wdd = c_wd*p_d;
e4740d
        //Int c_hwdd = c_hwd*p_d;
e4740d
        Int cnt = k_syxd/ts;
e4740d
        for(Int i = repeats; i; --i) {
e4740d
          for(Int i = cnt*tid; i < cnt; ++i) {
e4740d
            //Int ky = i/k_sxd;
e4740d
            //Int kx = i%k_sxd/p_d;
e4740d
            //Int pz = i%p_d;
e4740d
            //Float *ip = &pvalues[ i ];//ky*p_sxz + kx*pl.sz + pz ];
e4740d
            Float *iw = &weights[ i*c_d ];
e4740d
            Float a = 0;
e4740d
            for(Int cz = 0; cz < c_d; ++cz) {
e4740d
              a += iw[ cz ] * cvalues[ cz ]; //*k_syxd
e4740d
            }
e4740d
            pvalues[ i ] = a;
e4740d
            //*ip = a;
e4740d
          }
e4740d
          barrier.wait2();
e4740d
        }
e4740d
      }
e4740d
    }
e4740d
  }
e4740d
e4740d
  __attribute__((always_inline))
e4740d
  void threadFuncXYC(Barrier &barrier, int k_sx, int k_sy, int c_d, int p_d) {
e4740d
    switch(p_d) {
e4740d
    case   3: return threadFuncXYCP( barrier, k_sx, k_sy, c_d,  3 );
e4740d
    case   4: return threadFuncXYCP( barrier, k_sx, k_sy, c_d,  4 );
e4740d
    case  24: return threadFuncXYCP( barrier, k_sx, k_sy, c_d, 24 );
e4740d
    case  48: return threadFuncXYCP( barrier, k_sx, k_sy, c_d, 28 );
e4740d
    }
e4740d
    threadFuncXYCP( barrier, k_sx, k_sy, c_d, p_d );
e4740d
  }
e4740d
e4740d
  __attribute__((always_inline))
e4740d
  void threadFuncXY(Barrier &barrier, int k_sx, int k_sy, int c_d, int p_d) {
e4740d
    switch(c_d) {
e4740d
    case   3: return threadFuncXYC( barrier, k_sx, k_sy,  3, p_d );
e4740d
    case   4: return threadFuncXYC( barrier, k_sx, k_sy,  4, p_d );
e4740d
    case  24: return threadFuncXYC( barrier, k_sx, k_sy, 24, p_d );
e4740d
    case  48: return threadFuncXYC( barrier, k_sx, k_sy, 48, p_d );
e4740d
    }
e4740d
    threadFuncXYC( barrier, k_sx, k_sy, c_d, p_d );
e4740d
  }
e4740d
e4740d
  void threadFunc(Barrier &barrier) override {
e4740d
    Int k_sx = k.sx;
e4740d
    Int k_sy = k.sy;
e4740d
    Int c_d = cl.getD();
e4740d
    Int p_d = pl.getD();
e4740d
e4740d
    if (k_sy == k_sx) switch(k_sx) {
e4740d
    case 4: return threadFuncXY( barrier, 4, 4, c_d, p_d );
e4740d
    }
e4740d
    threadFuncXY( barrier, k_sx, k_sy, c_d, p_d );
e4740d
  }
e4740d
  
e4740d
e4740d
  void init(int mode, Layout pl, Layout cl, Kernel k, long long totalLinks) {
e4740d
    assert(pl);
e4740d
    assert(cl);
e4740d
    assert(k);
e4740d
    assert(totalLinks > 0);
e4740d
    assert(0 <= pl.x0 + k.ox && (cl.getW()-1)*k.dx + k.ox + k.sx <= pl.sx);
e4740d
    assert(0 <= pl.y0 + k.oy && (cl.getH()-1)*k.dy + k.oy + k.sy <= pl.sy);
e4740d
    
e4740d
    this->mode = mode;
e4740d
    this->pl = pl;
e4740d
    this->cl = cl;
e4740d
    this->k = k;
e4740d
    
e4740d
    pvalues.resize(pl.getCount());
e4740d
    cvalues.resize(cl.getCount());
e4740d
    weights.resize(cl.getD()*k.sx*k.sy*pl.getD());
e4740d
    
e4740d
    for(int i = 0; i < (int)pvalues.size(); ++i) pvalues[i] = rand();
e4740d
    for(int i = 0; i < (int)cvalues.size(); ++i) cvalues[i] = rand();
e4740d
    for(int i = 0; i < (int)weights.size(); ++i) weights[i] = rand();
e4740d
    
e4740d
    long long links = weights.size() * cl.getW() * cl.getH();
e4740d
    repeats = (totalLinks - 1)/links + 1;
e4740d
    
e4740d
    //printf( "benchmark init: prev %lld, curr %lld, links %lld, repeats %d, total links: %lld\n",
e4740d
    //  (long long)pvalues.size(), (long long)cvalues.size(), links, repeats, links*repeats );
e4740d
  }
e4740d
e4740d
  
e4740d
  void run(const char *name, int threadsCount, int mode, Layout pl, Layout cl, Kernel k, long long totalLinks) {
e4740d
    init(mode, pl, cl, k, totalLinks);
e4740d
    
e4740d
    volatile long long t0 = timeUs();
e4740d
    runThreads(threadsCount);
e4740d
    volatile long long t1 = timeUs();
e4740d
e4740d
    Float sum = 0;
e4740d
    for(int i = 0; i < (int)pvalues.size(); ++i) sum += pvalues[i];
e4740d
    for(int i = 0; i < (int)cvalues.size(); ++i) sum += cvalues[i];
e4740d
    for(int i = 0; i < (int)weights.size(); ++i) sum += weights[i];
e4740d
    printf("%s %d: %f, %lld\n", name, mode, (t1 - t0)*1e-6, (long long)sum);
e4740d
  }
e4740d
e4740d
  
e4740d
  void run(const char *name, int threadsCount, Layout pl, Layout cl, Kernel k, long long totalLinks) {
e4740d
    for(int mode = 0; mode < 2; ++mode)
e4740d
      run(name, threadsCount, mode, pl, cl, k, totalLinks);
e4740d
  }
e4740d
e4740d
e4740d
public:
e4740d
  void run(int threadsCount = 1) {
e4740d
    //printf("run benchmark: %d\n", threadsCount);
e4740d
    /*
e4740d
    run( "514x3  -> 258x24", threadsCount,
e4740d
         Layout(514, 514,  3).expandXY(2),
e4740d
         Layout(258, 258, 24).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );
e4740d
    run( "258x24 -> 130x48", threadsCount,
e4740d
         Layout(258, 258, 24).expandXY(2),
e4740d
         Layout(130, 130, 48).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );
e4740d
    run( "130x48 -> 66x96 ", threadsCount,
e4740d
         Layout(130, 130, 48).expandXY(2),
e4740d
         Layout( 66,  66, 96).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );
e4740d
    run( "66x96  -> 34x144", threadsCount,
e4740d
         Layout( 66,  66, 96).expandXY(2),
e4740d
         Layout(34, 34,  144).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );
e4740d
    run( "34x144 -> 18x216", threadsCount,
e4740d
         Layout(34, 34,  144).expandXY(2),
e4740d
         Layout(18, 18,  216).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );
e4740d
    run( "18x216 -> 10x324", threadsCount,
e4740d
         Layout(18, 18,  216).expandXY(2),
e4740d
         Layout(10, 10,  324).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );
e4740d
    run( "10x324 -> 6x486 ", threadsCount,
e4740d
         Layout(10, 10,  324).expandXY(2),
e4740d
         Layout( 6,  6,  486).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );
e4740d
    run( "6x486 -> 4x729  ", threadsCount,
e4740d
         Layout( 6,  6,  486).expandXY(2),
e4740d
         Layout( 4,  4,  729).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );
e4740d
    */
e4740d
    run( "4x768 -> 1x1093 ", threadsCount,
e4740d
         Layout( 4,  4,  768).expandXY(0),
e4740d
         Layout( 1,  1, 1093).expandXY(0), Kernel(4, 2,  0), 10ll*1000*1000*1000 );
e4740d
  }
e4740d
};



#endif