// simple/neural/nnlayer2.mt.inc.cpp

#ifndef NNLAYER2_MT_INC_CPP
#define NNLAYER2_MT_INC_CPP


#include "nnlayer2.inc.cpp"

#include <atomic>
#include <cassert> // for assert() in train(); may also arrive via nnlayer2.inc.cpp
#include <thread>
#include <vector>


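// Spin barrier over a shared atomic counter. Each of the `threads`
// participants increments `counter` once per wait() and busy-waits until all
// peers have arrived (counter reaches next = threads * number of waits so
// far). No sleeping: waiting threads burn CPU, which keeps the per-layer
// synchronization below cheap when slices are small.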
class Barrier {
private:
  std::atomic<unsigned int> &counter;
  const unsigned int threads;
  unsigned int next;
public:
  inline Barrier(std::atomic<unsigned int> &counter, unsigned int threads): counter(counter), threads(threads), next() { }
  inline void wait() { next += threads; ++counter; while(counter < next); }
};


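// Multi-threaded trainer: one trainFunc() worker per thread walks the same
// layer chain in lockstep, each thread owning a slice of every layer.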
class TrainMT {
private:
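  // Per-thread, per-layer work ranges: neurons [nb, ne) for pass(),
  // links [lb, le) for backpass(); sumQ collects the thread's squared
  // error (stored in its first LDesc slot).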
  struct LDesc {
    int nb, ne, lb, le;
    double sumQ;
    LDesc(): nb(), ne(), lb(), le(), sumQ() { }
  };

public:
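  // Training configuration: layer points at the head of the chain;
  // dataX/dataY are byte-packed samples and targets, advanced by
  // strideX/strideY per sample; shuffle holds `count` sample indices;
  // trainRatio is the learning rate (0 = evaluate only).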
  Layer *layer;
  const unsigned char *dataX;
  const unsigned char *dataY;
  int strideX;
  int strideY;
  int *shuffle;
  int count;
  Real trainRatio;

  TrainMT():
    layer(),
    dataX(),
    dataY(),
    strideX(),
    strideY(),
    shuffle(),
    count(),
    trainRatio() { }

private:
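  // Worker body, executed by every thread (tid 0 is the calling thread);
  // ldescs points at this thread's row of layer descriptors.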
  void trainFunc(int tid, int threads, std::atomic<unsigned int> &barrierCounter, LDesc *ldescs) {
    Barrier barrier(barrierCounter, threads);

    Layer &fl = *layer;
    Layer &bl = layer->back();
    int layersCount = fl.totalLayers();
    LDesc *fld = ldescs, *bld = fld + layersCount - 1;

    Real trainRatio = this->trainRatio;

    double sumQ = 0;
    for(int i = 0; i < count; ++i) {
      int ii = shuffle[i];
      const unsigned char *curX = dataX + strideX*ii;
      const unsigned char *curY = dataY + strideY*ii;

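      // Load this thread's slice of the input layer, scaling bytes to [0, 1].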
      const unsigned char *px = curX;
      for(Neuron *in = fl.neurons + fld->nb, *e = fl.neurons + fld->ne; in < e; ++in, ++px)
        in->v = Real(*px)*Real(1/255.0);

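      // Forward pass: one barrier per layer, so layer l-1 is complete in all
      // threads before any thread starts on layer l.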
      LDesc *ld = fld + 1;
      for(Layer *l = fl.next; l; l = l->next, ++ld) {
        barrier.wait();
        l->pass(ld->nb, ld->ne);
      }

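      // Output error on this thread's slice: d is the per-neuron residual.
      // Note in->d is multiplied rather than assigned, presumably because
      // pass() leaves the activation derivative there; the product seeds the
      // backward pass.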
      double q = 0;
      const unsigned char *py = curY;
      for(Neuron *in = bl.neurons + bld->nb, *e = bl.neurons + bld->ne; in < e; ++in, ++py) {
        Real d = Real(*py)*Real(1/255.0) - in->v;
        in->d *= d * trainRatio;
        q += d*d;
      }
      sumQ += q;

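      // Backward pass (skipped when trainRatio == 0, i.e. evaluation only),
      // again with one barrier per layer.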
      if (trainRatio > 0) {
        ld = bld - 1;
        for(Layer *l = bl.prev; l; l = l->prev, --ld) {
          barrier.wait();
          l->backpass(ld->lb, ld->le);
        }
      }

      //if (!tid) printf(" - %d, %f, %f\n", i, q, sumQ);
    }

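    // Publish this thread's accumulated squared error in its first LDesc slot.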
    ldescs->sumQ = sumQ;
  }

public:
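  // Partitions work across `threads` workers, runs one epoch over all
  // `count` samples, and returns the mean squared error per output value.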
  double train(int threads) {
    assert(threads > 0);
    assert(layer && !layer->prev);
    assert(dataX && dataY && shuffle);
    assert(count > 0);
    assert(trainRatio >= 0);

    int layersCount = layer->totalLayers();
    assert(layersCount > 0);
    std::vector<LDesc> ldescs( threads*layersCount );

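    // Split each layer's neurons into even contiguous slices, one per
    // thread; the last thread also takes the remainder.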
    int layerId = 0;
    for(Layer *l = layer; l; l = l->next, ++layerId) {
      assert(layerId < layersCount);
      int tsize = l->size/threads;
      for(int tid = 0; tid < threads; ++tid) {
        LDesc &desc = ldescs[tid*layersCount + layerId];
        desc.nb = tid*tsize;
        desc.ne = desc.nb + tsize;
        if (tid == threads-1) desc.ne = l->size;
      }

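      // Split the layer's link chain into roughly equal ranges for
      // backpass(). Cuts are only made where nprev changes, so all links
      // feeding one previous-layer neuron stay on a single thread --
      // presumably so backpass() can accumulate into that neuron's delta
      // without racing other threads.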
      if (int lsize = l->size*l->lsize) {
        int tlsize = lsize/threads;
        int ipn = l->links[ l->lfirst ].nprev;
        int tid = 0;
        int count = 0;

        ldescs[tid*layersCount + layerId].lb = l->lfirst;
        if (threads > 1) {
          for(int il = l->lfirst; il != lsize; il = l->links[il].lnext, ++count) {
            Link &link = l->links[il];
            if (ipn != link.nprev) {
              if (count >= tlsize) {
                ldescs[tid*layersCount + layerId].le = il;
                ++tid;
                count -= tlsize;
                ldescs[tid*layersCount + layerId].lb = il;
                if (tid == threads - 1) break;
              }
              ipn = link.nprev;
            }
          }
        }
        ldescs[tid*layersCount + layerId].le = lsize;
      }
    }
    assert(layerId == layersCount);

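    // Spawn threads-1 workers; the calling thread doubles as worker 0.
    // All workers share one barrier counter and index into the same ldescs.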
    std::atomic<unsigned int> barrierCounter(0);
    std::vector<std::thread*> t(threads - 1);
    for(int i = 1; i < threads; ++i)
      t[i-1] = new std::thread(&TrainMT::trainFunc, this, i, threads, std::ref(barrierCounter), &ldescs[i*layersCount]);
    trainFunc(0, threads, barrierCounter, &ldescs[0]);

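    // Join the workers and sum the per-thread error totals.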
    double result = ldescs[0].sumQ;
    for(int i = 1; i < threads; ++i)
      { t[i-1]->join(); delete t[i-1]; result += ldescs[i*layersCount].sumQ; }

    return result/(count * layer->back().size);
  }
};

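// Usage sketch (hypothetical; the actual Layer setup comes from
// nnlayer2.inc.cpp, and these variable names are illustrative only):
//
//   TrainMT trainer;
//   trainer.layer      = &inputLayer;            // head of the layer chain
//   trainer.dataX      = trainImages;            // count * strideX bytes
//   trainer.dataY      = trainLabels;            // count * strideY bytes
//   trainer.strideX    = inputLayer.size;
//   trainer.strideY    = inputLayer.back().size;
//   trainer.shuffle    = shuffledIndices;        // `count` sample indices
//   trainer.count      = sampleCount;
//   trainer.trainRatio = 0.1;                    // 0 would evaluate only
//   double mse = trainer.train(4);               // one epoch on 4 threads
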
#endif