From 8e5348c557b3104a62756f182bd9d46933448abb Mon Sep 17 00:00:00 2001
From: Ivan Mahonin
Date: Mar 17 2023 14:03:54 +0000
Subject: neural: convolution shared

---

diff --git a/projects/neural/build-view-digits.sh b/projects/neural/build-view-digits.sh
new file mode 100755
index 0000000..9615903
--- /dev/null
+++ b/projects/neural/build-view-digits.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+set -e
+
+export PKG_CONFIG_PATH="$HOME/opt/helianthus-release/lib/pkgconfig:/usr/local/lib/pkgconfig:/usr/lib/pkgconfig:/usr/lib/x86_64-linux-gnu/pkgconfig"
+
+if [ "$1" == "debug" ]; then
+  c++ -Wall `pkg-config --cflags --libs helianthus` -g -O0 -pthread view-digits.cpp -lm -o view-digits-dbg
+  echo done debug
+else
+  c++ -Wall `pkg-config --cflags --libs helianthus` -DNDEBUG -O3 -pthread view-digits.cpp -lm -o view-digits
+  echo done release
+fi
+
diff --git a/projects/neural/layer.conv.inc.cpp b/projects/neural/layer.conv.inc.cpp
index 02ffb6c..efd72ac 100644
--- a/projects/neural/layer.conv.inc.cpp
+++ b/projects/neural/layer.conv.inc.cpp
@@ -293,7 +293,7 @@ public:
       static inline void init(Neuron &n, AccumType &a) { a.v = n.v; }
       static inline void iter(Neuron &n, Weight &w, AccumType &a) { w.w += n.d * a.v; }
     };
-    iterateConvolution<I>(mtPrevLayouts[barrier.tid], prev->layout, layout, kernel, neurons, prev->neurons, weights);
+    iterateConvolution<I>(mtPrevLayouts[barrier.tid], layout, prev->layout, kernel, prev->neurons, neurons, weights);
   }
 
@@ -303,7 +303,7 @@ public:
       static inline void iter(Neuron &n, Weight &w, AccumType &a) { a.v += n.d * w.w; }
       static inline void done(Neuron &n, AccumType &a) { n.d *= a.v; }
     };
-    iterateConvolution<I>(mtPrevLayouts[barrier.tid], prev->layout, layout, kernel, neurons, prev->neurons, weights);
+    iterateConvolution<I>(mtPrevLayouts[barrier.tid], layout, prev->layout, kernel, prev->neurons, neurons, weights);
   }
 
diff --git a/projects/neural/layer.conv.shared.inc.cpp b/projects/neural/layer.conv.shared.inc.cpp
new file mode 100644
index 0000000..dee1861
--- /dev/null
+++ b/projects/neural/layer.conv.shared.inc.cpp
@@ -0,0 +1,333 @@
+#ifndef LAYER_CONV_SHARED_INC_CPP
+#define LAYER_CONV_SHARED_INC_CPP
+
+
+
+#include "layer.conv.inc.cpp"
+
+
+
+template<class Iter>
+void iterateTestConvolutionShared(Layout cl, Layout pl, Kernel k, Neuron *c_neurons, Neuron *p_neurons, Weight *weights) {
+  if (!cl) return;
+  assert(pl);
+  assert(k);
+  assert(c_neurons);
+  assert(p_neurons);
+  assert(weights);
+  assert(pl.x0 + k.ox >= 0 && pl.x0 + (cl.getW()-1)*k.dx + k.ox + k.sx <= pl.sx);
+  assert(pl.y0 + k.oy >= 0 && pl.y0 + (cl.getH()-1)*k.dy + k.oy + k.sy <= pl.sy);
+
+  for(int cy = cl.y0; cy < cl.y1; ++cy)
+  for(int cx = cl.x0; cx < cl.x1; ++cx)
+  for(int cz = cl.z0; cz < cl.z1; ++cz) {
+    int ci = (cy*cl.sx + cx)*cl.sz + cz;
+    Neuron &cn = c_neurons[ci];
+    typename Iter::AccumType a = {};
+    Iter::init(cn, a);
+
+    for(int ky = 0; ky < k.sy; ++ky)
+    for(int kx = 0; kx < k.sx; ++kx)
+    for(int pz = pl.z0; pz < pl.z1; ++pz) {
+      int wi = (ky*k.sx + kx)*pl.getD() + pz - pl.z0;
+      Weight &w = weights[wi];
+
+      int px = pl.x0 + (cx - cl.x0)*k.dx + k.ox + kx;
+      int py = pl.y0 + (cy - cl.y0)*k.dy + k.oy + ky;
+      int pi = (py*pl.sx + px)*pl.sz + pz;
+      Neuron &pn = p_neurons[pi];
+
+      Iter::iter(pn, w, a);
+    }
+
+    Iter::done(cn, a);
+  }
+}
+
+
+template<class Iter>
+void iterateConvolutionShared(Layout cl, Layout pl, Layout wl, Kernel k, Neuron *c_neurons, Neuron *p_neurons, Weight *weights) {
+  if (!cl) return;
+  assert(pl);
+  assert(wl);
+  assert(k);
+  assert(c_neurons);
+  assert(p_neurons);
+  assert(weights);
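+  // cl is one thread's sub-block of the current layer, wl the full layout
+  // of that layer: window positions in pl are computed relative to wl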
+  assert(cl.isSubLayoutOf(wl));
+  assert(pl.x0 + k.ox >= 0 && pl.x0 + (wl.getW()-1)*k.dx + k.ox + k.sx <= pl.sx);
+  assert(pl.y0 + k.oy >= 0 && pl.y0 + (wl.getH()-1)*k.dy + k.oy + k.sy <= pl.sy);
+
+  int c_h = cl.getH();
+  int c_w = cl.getW();
+  int c_d = cl.getD();
+  int c_swz = c_w*cl.sz;
+  int c_shxz = c_h*cl.sx*cl.sz;
+  int c_dx = cl.sz - c_d;
+  int c_dy = (cl.sx - c_w)*cl.sz;
+
+  int p_d = pl.getD();
+  int p_dx = k.dx*pl.sz;
+  int p_dy = k.dy*pl.sx*pl.sz - c_w*p_dx;
+
+  int k_sxd = k.sx*p_d;
+  int p_ddy = (pl.sx - k.sx)*pl.sz;
+  int p_ddx = pl.sz - p_d;
+
+  Neuron *icn = c_neurons + (cl.y0*cl.sx + cl.x0)*cl.sz + cl.z0;
+  Neuron *ipn = p_neurons + ((pl.y0 + (cl.y0 - wl.y0)*k.dy + k.oy)*pl.sx + pl.x0 + (cl.x0 - wl.x0)*k.dx + k.ox)*pl.sz + pl.z0;
+  Weight *ew = weights + k.sy*k_sxd;
+
+  for(Neuron *e = icn + c_shxz; icn < e; icn += c_dy, ipn += p_dy)
+  for(Neuron *e = icn + c_swz; icn < e; icn += c_dx, ipn += p_dx)
+  for(Neuron *e = icn + c_d; icn < e; ++icn) {
+    typename Iter::AccumType a;
+    Iter::init(*icn, a);
+
+    Neuron *iipn = ipn;
+    for(Weight *iw = weights; iw < ew; iipn += p_ddy)
+    for(Weight *e = iw + k_sxd; iw < e; iipn += p_ddx)
+    for(Weight *e = iw + p_d; iw < e; ++iw, ++iipn)
+      Iter::iter(*iipn, *iw, a);
+
+    Iter::done(*icn, a);
+  }
+}
+
+
+template<class Iter>
+void iterateConvolutionSharedPoint(Layout cl, Layout pl, Layout wl, Kernel k, int kx, int ky, Neuron *c_neurons, Neuron *p_neurons, Weight *weights) {
+  if (!cl) return;
+  assert(pl);
+  assert(wl);
+  assert(k);
+  assert(c_neurons);
+  assert(p_neurons);
+  assert(weights);
+  assert(cl.isSubLayoutOf(wl));
+  assert(kx >= 0 && kx < k.sx);
+  assert(ky >= 0 && ky < k.sy);
+  assert(pl.x0 + k.ox >= 0 && pl.x0 + (wl.getW()-1)*k.dx + k.ox + k.sx <= pl.sx);
+  assert(pl.y0 + k.oy >= 0 && pl.y0 + (wl.getH()-1)*k.dy + k.oy + k.sy <= pl.sy);
+
+  int c_h = cl.getH();
+  int c_w = cl.getW();
+  int c_d = cl.getD();
+  int c_swz = c_w*cl.sz;
+  int c_shxz = c_h*cl.sx*cl.sz;
+  int c_dx = cl.sz - c_d;
+  int c_dy = (cl.sx - c_w)*cl.sz;
+
+  int p_d = pl.getD();
+  int p_dx = k.dx*pl.sz;
+  int p_dy = k.dy*pl.sx*pl.sz - c_w*p_dx;
+
+  Neuron *icn = c_neurons + (cl.y0*cl.sx + cl.x0)*cl.sz + cl.z0;
+  Neuron *ipn = p_neurons + ((pl.y0 + (cl.y0 - wl.y0)*k.dy + k.oy + ky)*pl.sx + pl.x0 + (cl.x0 - wl.x0)*k.dx + k.ox + kx)*pl.sz + pl.z0;
+  weights += (ky*k.sx + kx)*p_d;
+  Weight *ew = weights + p_d;
+
+  for(Neuron *e = icn + c_shxz; icn < e; icn += c_dy, ipn += p_dy)
+  for(Neuron *e = icn + c_swz; icn < e; icn += c_dx, ipn += p_dx)
+  for(Neuron *e = icn + c_d; icn < e; ++icn, ipn -= p_d)
+  for(Weight *iw = weights; iw < ew; ++ipn, ++iw)
+    Iter::iter2(*icn, *ipn, *iw);
+}
+
+
+
+
+class LayerConvSharedBase: public Layer {
+public:
+  std::vector<Weight> mtWeights;
+
+  using Layer::Layer;
+
+
+  void split(int threadsCount) override {
+    Layer::split(threadsCount);
+    Weight w = {};
+    mtWeights.clear();
+    mtWeights.resize(threadsCount*weightsCount, w);
+  }
+
+
+  inline void sumWeights(int tid, int threads) {
+    int wc = weightsCount;
+    Weight *iw = weights + tid;
+    Weight *ia = mtWeights.data() + tid;
+    Weight *ea = mtWeights.data() + threads*wc;
+    for(Weight *ew = weights + wc; iw < ew; iw += threads, ia += threads) {
+      WeightReal w = iw->w;
+      for(Weight *iia = ia; iia < ea; iia += wc)
+        w += iia->w, iia->w = 0;
+      iw->w = w;
+    }
+  }
+};
+
+
+
+template<void func(Neuron&, AccumReal)>
+class LayerConvShared: public LayerConvSharedBase {
+public:
+  Kernel kernel;
+
+
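+  // a single kernel (kernel.sx*kernel.sy weights per input channel) is
+  // shared by every output neuron, hence weightsCount below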
+  LayerConvShared(Layer &prev, const Layout &layout, const Kernel &kernel, Weight *weights = nullptr):
+    LayerConvSharedBase(&prev, layout, kernel.sx*kernel.sy*prev.back().layout.getD(), weights),
+    kernel(kernel)
+  {
+    assert(kernel);
+    stat.links = weightsCount*neuronsCount;
+    if (ownWeights) fillWeights(-1, 1);
+  }
+
+
+  void pass(Barrier &barrier) override {
+    struct I: public Iter {
+      static inline void init(Neuron&, AccumType &a) { a.v = 0; }
+      static inline void iter(Neuron &n, Weight &w, AccumType &a) { a.v += n.v * w.w; }
+      static inline void done(Neuron &n, AccumType &a) { func(n, a.v); }
+    };
+    iterateConvolutionShared<I>(mtLayouts[barrier.tid], prev->layout, layout, kernel, neurons, prev->neurons, weights);
+  }
+
+
+  void backpassWeights(Barrier &barrier) override {
+    struct I: public Iter {
+      static inline void init(Neuron &n, AccumType &a) { a.v = n.d; }
+      static inline void iter(Neuron &n, Weight &w, AccumType &a) { w.w += n.v * a.v; }
+    };
+    iterateConvolutionShared<I>(mtLayouts[barrier.tid], prev->layout, layout, kernel, neurons, prev->neurons, &mtWeights[barrier.tid * weightsCount]);
+    barrier.wait();
+    sumWeights(barrier.tid, barrier.threads);
+  }
+
+
+  void backpassDeltas(Barrier &barrier) override {
+    struct I: public Iter {
+      static inline void iter2(Neuron &cn, Neuron &pn, Weight &w) { pn.a.v += cn.d * w.w; }
+      static inline void iter3(Neuron &n) { n.d *= n.a.v; n.a.v = 0; }
+    };
+    int ksx = kernel.sx, ksy = kernel.sy;
+    for(int kx = 0; kx < ksx; ++kx)
+    for(int ky = 0; ky < ksy; ++ky) {
+      iterateConvolutionSharedPoint<I>(mtLayouts[barrier.tid], prev->layout, layout, kernel, kx, ky, neurons, prev->neurons, weights);
+      barrier.wait();
+    }
+    iterateNeurons<I>(mtPrevLayouts[barrier.tid], prev->neurons);
+  }
+
+
+  void testPass() override {
+    struct I: public Iter {
+      static inline void init(Neuron&, AccumType &a) { a.v = 0; }
+      static inline void iter(Neuron &n, Weight &w, AccumType &a) { a.v += n.v * w.w; }
+      static inline void done(Neuron &n, AccumType &a) { func(n, a.v); }
+    };
+    iterateTestConvolutionShared<I>(layout, prev->layout, kernel, neurons, prev->neurons, weights);
+  }
+
+
+  void testBackpass() override {
+    struct I: public Iter {
+      static inline void init(Neuron &n, AccumType &a) { a.v = n.d; }
+      static inline void iter(Neuron &n, Weight &w, AccumType &a) { n.a.v += a.v * w.w; }
+      static inline void iter3(Neuron &n) { n.d *= n.a.v; n.a.v = 0; }
+    };
+    struct IW: public Iter {
+      static inline void init(Neuron &n, AccumType &a) { a.v = n.d; }
+      static inline void iter(Neuron &n, Weight &w, AccumType &a) { w.w += a.v * n.v; }
+    };
+    clearAccum();
+    iterateTestConvolutionShared<I>(layout, prev->layout, kernel, neurons, prev->neurons, weights);
+    iterateTestConvolutionShared<IW>(layout, prev->layout, kernel, neurons, prev->neurons, weights);
+    iterateNeurons<I>(prev->layout, prev->neurons);
+    clearAccum();
+  }
+};
+
+
+
+template<void func(Neuron&, AccumReal)>
+class LayerDeconvShared: public LayerConvSharedBase {
+public:
+  Kernel kernel;
+
+
+  LayerDeconvShared(Layer &prev, const Layout &layout, const Kernel &kernel, Weight *weights = nullptr):
+    LayerConvSharedBase(&prev, layout, kernel.sx*kernel.sy*layout.getD(), weights),
+    kernel(kernel)
+  {
+    assert(kernel);
+    stat.links = weightsCount*neuronsCount;
+    if (ownWeights) fillWeights(-1, 1);
+  }
+
+
+  void pass(Barrier &barrier) override {
+    struct I: public Iter {
+      static inline void iter2(Neuron &cn, Neuron &pn, Weight &w) { pn.a.v += cn.v * w.w; }
+      static inline void iter3(Neuron &n) { func(n, n.a.v); n.a.v = 0; }
+    };
+    int k_sx = kernel.sx, k_sy = kernel.sy;
+    for(int kx = 0; kx < k_sx; ++kx)
+    for(int ky = 0; ky < k_sy; ++ky) {
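+      // scatter one kernel point at a time; the barrier below keeps
+      // threads from accumulating into the same target neuron at once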
+      iterateConvolutionSharedPoint<I>(mtPrevLayouts[barrier.tid], layout, prev->layout, kernel, kx, ky, prev->neurons, neurons, weights);
+      barrier.wait();
+    }
+    iterateNeurons<I>(mtLayouts[barrier.tid], neurons);
+  }
+
+
+  void backpassWeights(Barrier &barrier) override {
+    struct I: public Iter {
+      static inline void init(Neuron &n, AccumType &a) { a.v = n.v; }
+      static inline void iter(Neuron &n, Weight &w, AccumType &a) { w.w += n.d * a.v; }
+    };
+    iterateConvolutionShared<I>(mtPrevLayouts[barrier.tid], layout, prev->layout, kernel, prev->neurons, neurons, &mtWeights[barrier.tid * weightsCount]);
+    barrier.wait();
+    sumWeights(barrier.tid, barrier.threads);
+  }
+
+
+  void backpassDeltas(Barrier &barrier) override {
+    struct I: public Iter {
+      static inline void init(Neuron&, AccumType &a) { a.v = 0; }
+      static inline void iter(Neuron &n, Weight &w, AccumType &a) { a.v += n.d * w.w; }
+      static inline void done(Neuron &n, AccumType &a) { n.d *= a.v; }
+    };
+    iterateConvolutionShared<I>(mtPrevLayouts[barrier.tid], layout, prev->layout, kernel, prev->neurons, neurons, weights);
+  }
+
+
+  void testPass() override {
+    struct I: public Iter {
+      static inline void init(Neuron &n, AccumType &a) { a.v = n.v; }
+      static inline void iter(Neuron &n, Weight &w, AccumType &a) { n.a.v += a.v * w.w; }
+      static inline void iter3(Neuron &n) { func(n, n.a.v); n.a.v = 0; }
+    };
+    clearAccum();
+    iterateTestConvolutionShared<I>(prev->layout, layout, kernel, prev->neurons, neurons, weights);
+    iterateNeurons<I>(layout, neurons);
+    clearAccum();
+  }
+
+
+  void testBackpass() override {
+    struct I: public Iter {
+      static inline void init(Neuron &n, AccumType &a) { a.v = 0; }
+      static inline void iter(Neuron &n, Weight &w, AccumType &a) { a.v += n.d * w.w; }
+      static inline void done(Neuron &n, AccumType &a) { n.d *= a.v; }
+    };
+    struct IW: public Iter {
+      static inline void init(Neuron &n, AccumType &a) { a.v = n.v; }
+      static inline void iter(Neuron &n, Weight &w, AccumType &a) { w.w += n.d * a.v; }
+    };
+    iterateTestConvolutionShared<I>(prev->layout, layout, kernel, prev->neurons, neurons, weights);
+    iterateTestConvolutionShared<IW>(prev->layout, layout, kernel, prev->neurons, neurons, weights);
+  }
+};
+
+#endif
diff --git a/projects/neural/layer.conv.test.inc.cpp b/projects/neural/layer.conv.test.inc.cpp
index 7124b30..c0b4be8 100644
--- a/projects/neural/layer.conv.test.inc.cpp
+++ b/projects/neural/layer.conv.test.inc.cpp
@@ -5,6 +5,7 @@
 
 #include "layer.test.inc.cpp"
 #include "layer.conv.inc.cpp"
+#include "layer.conv.shared.inc.cpp"
 
 
 class ConvTest: public Test {
@@ -147,11 +148,23 @@ public:
     }
 
     {
-      Layer l(nullptr, pl);
-      new LayerConv(l, cl, k);
+      Layer l(nullptr, cl);
+      new LayerDeconv(l, pl, k);
       Test::testLayer("LayerDeconv", l);
     }
 
+    {
+      Layer l(nullptr, pl);
+      new LayerConvShared(l, cl, k);
+      Test::testLayer("LayerConvShared", l);
+    }
+
+    {
+      Layer l(nullptr, cl);
+      new LayerDeconvShared(l, pl, k);
+      Test::testLayer("LayerDeconvShared", l);
+    }
+
     return st;
   }
 
diff --git a/projects/neural/layer.simple.inc.cpp b/projects/neural/layer.simple.inc.cpp
index 1f2a7b7..1da0312 100644
--- a/projects/neural/layer.simple.inc.cpp
+++ b/projects/neural/layer.simple.inc.cpp
@@ -14,6 +14,17 @@ inline void funcSigmoidExp(Neuron &n, AccumReal s) {
 }
 
 
+inline void funcSigmoidExp2(Neuron &n, AccumReal s) {
+  if (s > 5) s = 5; else if (s < -5) s = -5;
+  AccumReal ss = 1/(1 + std::exp(-s)); n.v = ss; n.d = 0;//ss * (1-ss) * 0.1;
+}
+
+
+inline void funcReLU(Neuron &n, AccumReal s)
+  { n.v = s > 0 ? s : 0; n.d = s > 0; }
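+// n.d holds the ReLU derivative (1 for s > 0, else 0) consumed by the
+// backward pass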
+
+
+
 template<class Iter>
 inline void iterateNeurons(const Layout &l, Neuron *neurons) {
   if (!l) return;
diff --git a/projects/neural/train.digit.inc.cpp b/projects/neural/train.digit.inc.cpp
index 9b3bc9e..8277f42 100644
--- a/projects/neural/train.digit.inc.cpp
+++ b/projects/neural/train.digit.inc.cpp
@@ -34,6 +34,21 @@ public:
     fclose(f);
     return true;
   }
+
+  static void printSymbol(const unsigned char *data, int w, int h, int index = -1) {
+    if (index >= 0) printf("\nsymbol %d (%d):\n", (int)data[w*h], index);
+    else printf("\nsymbol %d:\n", (int)data[w*h]);
+    for(int i = 0; i < h; ++i) {
+      for(int j = 0; j < w; ++j) printf("%s", data[i*w+j] > 128u ? "#" : " ");
+      printf("\n");
+    }
+    printf("\n");
+  }
+
+  void printSymbol(int index) {
+    const Layout &l = layer->layout;
+    printSymbol(&data[(l.getActiveCount()+1)*index], l.getW(), l.getH(), index);
+  }
 
 protected:
   bool prepare() override {
@@ -71,22 +86,38 @@ protected:
   }
 
 
-  AccumReal verifyDataMain(int, int iter) override {
+  Quality verifyData(Barrier &barrier, int, int iter) override {
+    Quality q = {};
+    if (barrier.tid) return q;
+
     struct I: public Iter {
       typedef int DataType;
-      struct DataAccumType { int ri, mi; NeuronReal m; };
+      struct DataAccumType { int ri, mi; NeuronReal m, ratio, q; };
       static inline void iter4(Neuron &n, DataType d, DataAccumType &a) {
         NeuronReal v1 = d == a.ri;
         NeuronReal v0 = n.v;
-        n.d *= v1 - v0;
+        NeuronReal diff = v1 - v0;
+        n.d *= diff*a.ratio;
+        a.q += diff*diff;
         if (a.m < v0) { a.m = v0; a.mi = d; }
       }
     };
-    I::DataAccumType a = { data[ (shuffle[iter%count] + 1)*stride - 1 ], 0, 0 };
+    int index = shuffle[iter%count];
+    if (index == 59915) {
+      ++skipBackpass;
+      return q;
+    }
+
+    I::DataAccumType a = { data[ (index + 1)*stride - 1 ], 0, 0, ratio };
     iterateNeurons2<I>(obl, obl, bl->neurons, 0, 1, &a);
-    return a.mi != a.ri;
+    q.train = sqrt(a.q/obl.getActiveCount());
+    q.human = a.mi != a.ri;
+    //if (!q.human && q.train < 0.01) ++skipBackpass;
+    //if (!q.human) ++skipBackpass;
+    //if (q.human) printSymbol(index);
+    return q;
   }
 };
 
diff --git a/projects/neural/train.inc.cpp b/projects/neural/train.inc.cpp
index a99b9f3..8582aa4 100644
--- a/projects/neural/train.inc.cpp
+++ b/projects/neural/train.inc.cpp
@@ -15,10 +15,34 @@ long long timeUs() {
 }
 
 
+struct Quality {
+  AccumReal train;
+  AccumReal human;
+
+  inline Quality& operator+=(const Quality &b) {
+    train += b.train;
+    human += b.human;
+    return *this;
+  }
+
+  inline Quality& operator*=(AccumReal x) {
+    train *= x;
+    human *= x;
+    return *this;
+  }
+
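+  // order by the human-readable error first; the RMS training error
+  // only breaks ties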
+  inline bool operator<(const Quality &b) const {
+    return human < b.human ? true
+         : b.human < human ? false
+         : train < b.train;
+  }
+};
+
+
 class Trainer {
 private:
   std::atomic<int> barrierCounter;
-  std::vector<AccumReal> qualities;
+  std::vector<Quality> qualities;
 
 public:
   Layer *layer;
@@ -30,7 +54,7 @@ public:
   AccumReal qmin;
 
 protected:
-  volatile bool doBackpassAtThisIteration;
+  std::atomic<int> skipBackpass;
   Layer *fl;
   Layer *bl;
@@ -40,21 +64,16 @@ protected:
 
   virtual void finish() { }
   virtual void loadData(Barrier &barrier, int block, int iter) { }
-  virtual AccumReal verifyData(Barrier &barrier, int block, int iter) { return 0; }
-
-  virtual void loadDataMain(int block, int iter) { };
-  virtual AccumReal verifyDataMain(int block, int iter) { return 0; };
+  virtual Quality verifyData(Barrier &barrier, int block, int iter) { return Quality{}; }
 
 private:
   void threadFunc(int tid, int block) {
     Barrier barrier(barrierCounter, tid, threadsCount);
-    volatile AccumReal &sumQ = qualities[tid] = 0;
+    Quality sumQ = {};
 
     for(int i = 0; i < itersPerBlock; ++i) {
       barrier.wait();
       loadData(barrier, block, i);
-      barrier.wait();
-      if (!tid) loadDataMain(block, i);
 
       for(Layer *l = fl->next; l; l = l->next) {
         barrier.wait();
@@ -62,15 +81,14 @@ private:
       }
 
       barrier.wait();
+      skipBackpass = 0;
       sumQ += verifyData(barrier, block, i);
+      barrier.wait();
 
-      if (!tid) {
-        doBackpassAtThisIteration = true;
-        sumQ += verifyDataMain(block, i);
-      }
+      bool skipBp = skipBackpass;
       barrier.wait();
 
-      if (ratio > 0 && doBackpassAtThisIteration) {
+      if (ratio > 0 && !skipBp) {
         for(Layer *l = bl; l->prev && l->prev->prev; l = l->prev) {
           barrier.wait();
           l->backpassDeltas(barrier);
@@ -81,20 +99,21 @@ private:
         }
       }
     }
+    qualities[tid] = sumQ;
  }
 
 
-  AccumReal runThreads(int block) {
+  Quality runThreads(int block) {
     barrierCounter = 0;
     std::vector<std::thread*> t(threadsCount, nullptr);
     for(int i = 1; i < threadsCount; ++i)
       t[i] = new std::thread(&Trainer::threadFunc, this, i, block);
     threadFunc(0, block);
-    AccumReal result = qualities[0];
+    Quality result = qualities[0];
     for(int i = 1; i < threadsCount; ++i) {
       t[i]->join();
       delete t[i];
       result += qualities[i];
     }
-    return result / itersPerBlock;
+    return result *= 1/(AccumReal)itersPerBlock;
   }
 
@@ -108,7 +127,7 @@ public:
     blocksPerSaving(),
     blocksCount(1000),
     qmin(),
-    doBackpassAtThisIteration(),
+    skipBackpass(0),
     fl(),
     bl() { }
 
@@ -133,31 +152,40 @@ public:
   }
 
 
-  AccumReal run() {
+  Quality run() {
     assert(layer && !layer->prev && layer->next);
     assert(threadsCount > 0);
     assert(itersPerBlock > 0);
 
+    Quality bad = {INFINITY, INFINITY};
+
     printf("training: threads %d, itersPerBlock %d, ratio: %lf\n", threadsCount, itersPerBlock, ratio);
+    fflush(stdout);
 
     fl = layer;
     bl = &layer->back();
 
     qualities.clear();
-    qualities.resize(threadsCount, 0);
+    qualities.resize(threadsCount, Quality{});
     for(Layer *l = layer; l; l = l->next)
       l->split(threadsCount);
 
     if (!prepare())
-      return printf("cannot prepare\n"), bad;
+      return printf("cannot prepare\n"), bad;
 
-    AccumReal result = -1;
+
+
+    AccumReal ratioCopy = ratio;
+    Quality result = bad, best = result, saved = result;
     long long fullTimeStartUs = timeUs();
+    ratio = 0;
     int i = 0;
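+    // ratio is restored at the bottom of the loop, so only the first
+    // block runs with backpropagation disabled (baseline measurement)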
+    int bps = blocksPerSaving > 0 ? blocksPerSaving : 1;
+    int nextSave = i + bps;
     while(true) {
       if (!prepareBlock()) {
         printf("cannot prepare block\n");
-        result = -1;
+        result = bad;
         break;
       };
@@ -172,18 +200,33 @@
       fullTimeStartUs = t;
 
       ++i;
-      printf("%4d, total %7d, avg.result %f, time: %f / %f\n", i, i*itersPerBlock, result, runTimeUs*0.000001, fullTimeUs*0.000001);
-
-      bool done = (blocksCount > 0 && i >= blocksCount) || result <= qmin;
-
-      if (ratio > 0 && (blocksPerSaving <= 0 || i%blocksPerSaving == 0 || done) && !layer->save()) {
-        printf("saving failed\n");
-        result = -1;
-        break;
+      if (i == 1) saved = result;
+      bool good = result < best;
+      bool done = (blocksCount > 0 && i >= blocksCount) || result.human <= qmin;
+      bool saving = ratio > 0 && (i >= nextSave || done) && result < saved;
+      if (good) best = result;
+
+      printf("%4d, total %7d, avg.result %f (%f), best %f (%f), time: %f / %f%s\n",
+        i, i*itersPerBlock,
+        result.human, result.train, best.human, best.train,
+        runTimeUs*0.000001, fullTimeUs*0.000001,
+        (saving ? ", saving" : "" ) );
+      fflush(stdout);
+
+      if (saving) {
+        if (!layer->save()) {
+          printf("saving failed\n");
+          result = bad;
+          break;
+        }
+        saved = result;
+        nextSave += bps;
       }
 
       if (done) break;
+      ratio = ratioCopy;
     }
+    ratio = ratioCopy;
 
     finish();
 
diff --git a/projects/neural/trainer.cpp b/projects/neural/trainer.cpp
index d8f8424..e14f3e6 100644
--- a/projects/neural/trainer.cpp
+++ b/projects/neural/trainer.cpp
@@ -16,31 +16,33 @@ bool runTests() {
 
 int main() {
   srand(time(NULL));
-  return !runTests();
+  //return !runTests();
 
-  #define FILENAME "data/output/weights.bin" // 28x28
+  //#define FILENAME "data/output/weights-digit.bin"
+  #define FILENAME "data/output/weights-digit-conv.bin"
 
   printf("create neural network\n");
-  Layer l(nullptr, Layout(28, 28)); l.filename = FILENAME "1";
-  //(new LayerSimple(l, Layout(256)))->filename = FILENAME "2";
-  //(new LayerSimple(l, Layout(64)))->filename = FILENAME "3";
-  //(new LayerSimple(l, Layout(128)))->filename = FILENAME "4";
-  //(new LayerSimple(l, Layout(64)))->filename = FILENAME "5";
-  //(new LayerSimple(l, Layout(128)))->filename = FILENAME "5";
-  (new LayerSimple(l, Layout(32)))->filename = FILENAME "6";
-  //(new LayerSimple(l, Layout(16)))->filename = FILENAME "7";
-  (new LayerSimple(l, Layout(10)))->filename = FILENAME "8";
+  //Layer l( nullptr, Layout(28, 28) );
+  //(new LayerSimple( l, Layout(256) ))->filename = FILENAME "1";
+  //(new LayerSimple( l, Layout(64) ))->filename = FILENAME "2";
+  //(new LayerSimple( l, Layout(10) ))->filename = FILENAME "3";
+
+  Layer l(nullptr, Layout(28, 28));
+  (new LayerConvShared(l, Layout(11, 11, 16), Kernel(6, 2, 0)))->filename = FILENAME "1";
+  (new LayerSimple(l, Layout(64)))->filename = FILENAME "2";
+  (new LayerSimple(l, Layout(10)))->filename = FILENAME "3";
 
   l.sumStat().print();
 
   printf("load training data\n");
   TrainerDigit t;
-  if (!t.loadSymbolMap("data/symbols-data.bin")) return 1;
+  if (!t.loadSymbolMap("data/symbols-data.bin")) return 1; // 28x28
 
   //printf("try load previously saved network\n");
   l.load();
 
   printf("train\n");
-  t.configure(l, 0.5, 4, 1000000, 0, 0, 0.0000001).run();
+  //t.configure(l, 0.5, 8, 70000, 0, 0, 0.00001).run();
+  t.configure(l, 0.1, 8, 70000, 0, 0, 0.00001).run();
 
   return 0;
 }
 
diff --git a/projects/neural/view-digits.cpp b/projects/neural/view-digits.cpp
new file mode 100644
index 0000000..525ff90
--- /dev/null
+++ b/projects/neural/view-digits.cpp
@@ -0,0 +1,154 @@
+
+#include <helianthus.h>
+
+
+#include "layer.all.inc.cpp"
+
+
+Layer *nl;
+Framebuffer fb, fbMin;
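+// fb is the 512x512 canvas the user draws on; fbMin holds the 28x28
+// downscaled copy that is fed into the network input layer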
+Animation fbAnim, fbMinAnim;
+
+int wasPressed;
+double prevX, prevY;
+
+
+
+void prepareImage() {
+  int w, h;
+  unsigned char *pixels = NULL;
+
+  saveState();
+  target(fb);
+  imageFromViewport(&w, &h, &pixels);
+  restoreState();
+  if (!pixels) return;
+
+  int x0 = w, y0 = h, x1 = 0, y1 = 0;
+  for(int y = 0; y < h; ++y) {
+    for(int x = 0; x < w; ++x) {
+      if (imageGetPixel(w, h, pixels, x, y) != 0x000000ff) {
+        if (x0 > x) x0 = x;
+        if (x1 < x) x1 = x;
+        if (y0 > y) y0 = y;
+        if (y1 < y) y1 = y;
+      }
+    }
+  }
+  free(pixels);
+  pixels = NULL;
+
+  if (x1 < x0 || y1 < y0) return;
+
+  int fw = framebufferGetWidth(fbMin);
+  int fh = framebufferGetHeight(fbMin);
+
+  double wx = x1 - x0 + 1;
+  double wy = y1 - y0 + 1;
+  double s = (fw - 4)/(double)(wx > wy ? wx : wy);
+  double cx = (x0 + x1)/2.0;
+  double cy = (y0 + y1)/2.0;
+
+  double xx = fw/2 - s*cx;
+  double yy = fh/2 - s*cy;
+  double ww = s*w;
+  double hh = s*h;
+
+  saveState();
+  target(fbMin);
+  noStroke();
+  rectTextured(fbAnim, xx, yy, ww, hh);
+  imageFromViewport(&w, &h, &pixels);
+  restoreState();
+
+  if (!pixels) return;
+  Neuron *in = nl->neurons;
+  for(int y = 0; y < h; ++y)
+  for(int x = 0; x < w; ++x)
+    (in++)->v = colorGetValue(imageGetPixel(w, h, pixels, x, y));
+  free(pixels); // imageFromViewport allocates a fresh buffer each call
+}
+
+
+void init() {
+  background(COLOR_BLACK);
+  stroke(COLOR_WHITE);
+  fb = createFramebufferEx(512, 512, NULL, FALSE, FALSE, TRUE);
+  fbMin = createFramebufferEx(28, 28, NULL, FALSE, FALSE, TRUE);
+  fbAnim = createAnimationFromFramebuffer(fb);
+  fbMinAnim = createAnimationFromFramebuffer(fbMin);
+
+  saveState();
+  target(fb);
+  clear();
+  target(fbMin);
+  clear();
+  restoreState();
+
+  #define FILENAME "data/weights-digit.bin"
+  nl = new Layer( nullptr, Layout(28, 28));
+  (new LayerSimple ( *nl, Layout(256) ))->filename = FILENAME "1";
+  (new LayerSimple ( *nl, Layout( 64) ))->filename = FILENAME "2";
+  (new LayerSimple ( *nl, Layout( 10) ))->filename = FILENAME "3";
+  nl->load();
+}
+
+
+void draw() {
+  saveState();
+
+  if (mouseDown("left")) {
+    double x = mouseX(), y = mouseY();
+    if (!wasPressed) prevX = x, prevY = y;
+
+    saveState();
+    strokeWidth(32);
+    target(fb);
+    line(prevX, prevY, x, y);
+    restoreState();
+
+    prevX = x, prevY = y;
+    wasPressed = TRUE;
+  } else {
+    wasPressed = FALSE;
+  }
+
+  if (keyWentDown("space")) {
+    prepareImage();
+    for(Layer *l = nl->next; l; l = l->next)
+      l->testPass();
+    saveState();
+    target(fb);
+    clear();
+    restoreState();
+  }
+
+  noStroke();
+  rectTextured(fbAnim, 0, 0, 512, 512);
+
+  stroke(COLOR_WHITE);
+  rectTextured(fbMinAnim, 16, 16, 28, 28);
+
+  noFill();
+
+  Layer &nlb = nl->back();
+  textSize(8);
+  int res = 0;
+  for(int i = 0; i < 10; ++i) {
+    if (nlb.neurons[i].v > nlb.neurons[res].v) res = i;
+    textf(16, 90+8*i, "%d: %lf", i, nlb.neurons[i].v);
+  }
+  textSize(16);
+  textf(16, 60, "%d", res);
+
+  restoreState();
+}
+
+
+int main(int largc, char **largv) {
+  windowSetVariableFrameRate();
+  windowSetInit(&init);
+  windowSetDraw(&draw);
+  windowRun();
+  return 0;
+}