// simple/neural/nnlayer3.mt.inc.cpp

#ifndef NNLAYER3_MT_INC_CPP
#define NNLAYER3_MT_INC_CPP


#include "nnlayer3.inc.cpp"


#include <atomic>
#include <cassert>
#include <cmath>
#include <functional>
#include <thread>
#include <vector>


/// Busy-waiting rendezvous point for a fixed group of threads.
///
/// All participants share one atomic arrival counter, while `next` is a plain
/// per-object field, so every thread must use its own Barrier instance bound
/// to the same counter and constructed with the same `threads` value.
class Barrier {
private:
  std::atomic<unsigned int> &counter; // arrival counter shared by all participants
  const unsigned int threads;         // number of participants per round
  unsigned int next;                  // counter value that completes this object's current round

  // Wrap-safe form of "value >= target": the unsigned difference is taken
  // modulo 2^N, so the test stays correct after the counter overflows, as long
  // as both values are less than half of the unsigned range apart.
  static inline bool reached(unsigned int value, unsigned int target)
    { return value - target <= ~0u/2; }

public:
  /// Binds the barrier to the shared `counter` for a group of `threads` participants.
  inline Barrier(std::atomic<unsigned int> &counter, unsigned int threads): counter(counter), threads(threads), next() { }

  /// Registers the caller's arrival and spins until all `threads` participants
  /// of the current round have arrived.
  inline void wait() { next += threads; ++counter; while(!reached(counter.load(), next)); }

  /// Spins until the shared counter has reached `next + tid` without registering
  /// an arrival itself.
  /// NOTE(review): presumably used to let threads run a section one after
  /// another in tid order before a following wait() - confirm against callers.
  inline void subwait(int tid) { while(!reached(counter.load(), next + tid)); }
};


class TrainMT {
53488e
private:
53488e
  struct LDesc {
53488e
    int y0, y1;
53488e
    double sumQ;
53488e
    LDesc(): y0(), y1(), sumQ() { }
53488e
  };
53488e
53488e
public:
53488e
  Layer *layer;
53488e
  const unsigned char *dataX;
53488e
  const unsigned char *dataY;
53488e
  int strideX;
53488e
  int strideY;
53488e
  int *shuffle;
53488e
  int count;
53488e
  Real trainRatio;
53488e
53488e
  TrainMT():
53488e
    layer(),
53488e
    dataX(),
53488e
    dataY(),
53488e
    strideX(),
53488e
    strideY(),
53488e
    shuffle(),
53488e
    count(),
53488e
    trainRatio() { }
53488e
53488e
private:
53488e
  void trainFunc(int tid, int threads, std::atomic<unsigned int=""> &barrierCounter, LDesc *ldescs) {</unsigned>
53488e
    Barrier barrier(barrierCounter, threads);
53488e
53488e
    Layer &fl = *layer;
53488e
    Layer &bl = layer->back();
53488e
    int layersCount = fl.totalLayers();
53488e
    LDesc *fld = ldescs, *bld = fld + layersCount - 1;
53488e
53488e
    Real trainRatio = this->trainRatio;
53488e
53488e
    int fsxz = fl.sx*fl.sz;
53488e
    int bsxz = bl.sx*bl.sz;
53488e
53488e
    //barrier.subwait(tid);
53488e
    //for(LDesc *ld = fld; ld <= bld; ++ld)
53488e
    //  printf("t%d %d %d %d\n", tid, (int)(ld-fld), ld->y0, ld->y1);
53488e
    //barrier.wait();
53488e
53488e
    const unsigned char *dataX = this->dataX + fsxz*fld->y0;
53488e
    const unsigned char *dataY = this->dataY + bsxz*bld->y0;
53488e
53488e
    double sumQ = 0;
53488e
    for(int i = 0; i < count; ++i) {
53488e
      int ii = shuffle[i];
53488e
      const unsigned char *curX = dataX + strideX*ii;
53488e
      const unsigned char *curY = dataY + strideY*ii;
53488e
53488e
      barrier.wait();
53488e
      const unsigned char *px = curX;
53488e
      for(Neuron *in = fl.neurons + fsxz*fld->y0, *e = fl.neurons + fsxz*fld->y1; in < e; ++in, ++px)
53488e
        in->v = Real(*px)*Real(1/255.0);
53488e
53488e
      LDesc *ld = fld + 1;
53488e
      for(Layer *l = fl.next; l; l = l->next, ++ld) {
53488e
        barrier.wait();
53488e
        l->pass(ld->y0, ld->y1);
53488e
      }
53488e
53488e
      double q = 0;
53488e
      const unsigned char *py = curY;
53488e
      for(Neuron *in = bl.neurons + bsxz*bld->y0, *e = bl.neurons + bsxz*bld->y1; in < e; ++in, ++py) {
53488e
        Real v = (in->v - 0.25)*2;
53488e
        Real d = Real(*py)*Real(1/255.0) - v;
53488e
        in->d *= d * trainRatio;
53488e
        d *= d;
53488e
        q += d*d;
53488e
      }
53488e
      sumQ += q;
53488e
53488e
      if (trainRatio > 0) {
53488e
        ld = bld;
53488e
        for(Layer *l = &bl; l->prev; l = l->prev, --ld) {
53488e
          if (!l->prev->prev) {
53488e
            barrier.wait();
53488e
            l->backpassWeights(ld->y0, ld->y1);
53488e
            break;
53488e
          } else
53488e
          if (l->next) {
53488e
            barrier.wait();
53488e
            l->backpassTpl<true>(ld->y0, ld->y1);</true>
53488e
            //l->backpassTpl<false>(ld->y0, ld->y1);</false>
53488e
            //barrier.wait();
53488e
            //l->next->backpassWeights(ld[1].y0, ld[1].y1);
53488e
          }
53488e
        }
53488e
      }
53488e
53488e
      //if (!tid) printf(" - %d, %f, %f\n", i, q, sumQ);
53488e
    }
53488e
53488e
    ldescs->sumQ = sumQ;
53488e
  }
53488e
53488e
public:
53488e
  double train(int threads) {
53488e
    assert(threads > 0);
53488e
    assert(layer && !layer->prev);
53488e
    assert(dataX && dataY && shuffle);
53488e
    assert(count > 0);
53488e
    assert(trainRatio >= 0);
53488e
53488e
    int layersCount = layer->totalLayers();
53488e
    assert(layersCount > 0);
53488e
    std::vector<ldesc> ldescs( threads*layersCount );</ldesc>
53488e
53488e
    int layerId = 0;
53488e
    for(Layer *l = layer; l; l = l->next, ++layerId) {
53488e
      assert(layerId < layersCount);
53488e
      int tsy = l->sy/threads;
53488e
      for(int tid = 0; tid < threads; ++tid) {
53488e
        LDesc &desc = ldescs[tid*layersCount + layerId];
53488e
        desc.y0 = tid*tsy;
53488e
        desc.y1 = desc.y0 + tsy;
53488e
        if (tid == threads-1) desc.y1 = l->sy;
53488e
      }
53488e
    }
53488e
    assert(layerId == layersCount);
53488e
53488e
    std::atomic<unsigned int=""> barrierCounter(0);</unsigned>
53488e
    std::vector<std::thread*> t(threads - 1);</std::thread*>
53488e
    for(int i = 1; i < threads; ++i)
53488e
      t[i-1] = new std::thread(&TrainMT::trainFunc, this, i, threads, std::ref(barrierCounter), &ldescs[i*layersCount]);
53488e
    trainFunc(0, threads, barrierCounter, &ldescs[0]);
53488e
53488e
    double result = ldescs[0].sumQ;
53488e
    for(int i = 1; i < threads; ++i)
53488e
      { t[i-1]->join(); delete t[i-1]; result += ldescs[i*layersCount].sumQ; }
53488e
53488e
    return sqrt(sqrt( result/(count * layer->back().countNeurons()) ));
53488e
  }
53488e
};


#endif