From e865c950724aa517cd4c7d814b2451fb1c07e847 Mon Sep 17 00:00:00 2001
From: Ivan Mahonin
Date: Mar 16 2023 06:13:46 +0000
Subject: neural project

---

diff --git a/projects/neural/build-trainer.sh b/projects/neural/build-trainer.sh
new file mode 100755
index 0000000..72d6fa9
--- /dev/null
+++ b/projects/neural/build-trainer.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+set -e
+
+
+if [ "$1" == "debug" ]; then
+  c++ -Wall -g -O0 -pthread trainer.cpp -lm -o trainer-dbg
+  echo done debug
+else
+  c++ -Wall -DNDEBUG -O3 -pthread trainer.cpp -lm -o trainer
+  echo done release
+fi
+
diff --git a/projects/neural/data b/projects/neural/data
new file mode 120000
index 0000000..c5a9c9a
--- /dev/null
+++ b/projects/neural/data
@@ -0,0 +1 @@
+../../simple/neural/data
\ No newline at end of file
diff --git a/projects/neural/layer.all.inc.cpp b/projects/neural/layer.all.inc.cpp
new file mode 100644
index 0000000..2ccae82
--- /dev/null
+++ b/projects/neural/layer.all.inc.cpp
@@ -0,0 +1,9 @@
+#ifndef LAYER_ALL_INC_CPP
+#define LAYER_ALL_INC_CPP
+
+
+#include "layer.simple.inc.cpp"
+#include "layer.conv.inc.cpp"
+
+
+#endif
diff --git a/projects/neural/layer.all.test.inc.cpp b/projects/neural/layer.all.test.inc.cpp
new file mode 100644
index 0000000..8d42289
--- /dev/null
+++ b/projects/neural/layer.all.test.inc.cpp
@@ -0,0 +1,21 @@
+#ifndef LAYER_ALL_TEST_INC_CPP
+#define LAYER_ALL_TEST_INC_CPP
+
+
+
+#include "layer.simple.test.inc.cpp"
+#include "layer.conv.test.inc.cpp"
+
+
+class AllTest: public Test {
+public:
+  static bool test(const char *name = "all") {
+    Stage st(name);
+    SimpleTest::test();
+    ConvTest::test();
+    return st;
+  }
+};
+
+
+#endif
diff --git a/projects/neural/layer.conv.inc.cpp b/projects/neural/layer.conv.inc.cpp
new file mode 100644
index 0000000..c0b6e82
--- /dev/null
+++ b/projects/neural/layer.conv.inc.cpp
@@ -0,0 +1,334 @@
+#ifndef LAYER_CONV_INC_CPP
+#define LAYER_CONV_INC_CPP
+
+
+
+#include "layer.simple.inc.cpp"
+
+
+
+struct Kernel {
+  int sx, sy;
+  int dx, dy;
+  int ox, oy;
+
+  inline Kernel():
+    sx(), sy(), dx(), dy(), ox(), oy() { }
+  inline Kernel(int sx, int sy, int dx, int dy, int ox, int oy):
+    sx(sx), sy(sy), dx(dx), dy(dy), ox(ox), oy(oy) { }
+  inline Kernel(int s, int d, int o):
+    sx(s), sy(s), dx(d), dy(d), ox(o), oy(o) { }
+  inline operator bool() const
+    { return sx > 0 && sy > 0 && dx > 0 && dy > 0; }
+
+
+  void print(const char *prefix = nullptr) const {
+    if (prefix && *prefix) printf("%s: ", prefix);
+    printf("x(sdo): %d %d %d, y(sdo): %d %d %d\n", sx, dx, ox, sy, dy, oy);
+  }
+  void printYX(const char *prefix = nullptr) const {
+    if (prefix && *prefix) printf("%s: ", prefix);
+    printf("y(sdo): %d %d %d, x(sdo): %d %d %d\n", sy, dy, oy, sx, dx, ox);
+  }
+};
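+
+// A Kernel maps output cell (cx, cy) of the current layout to an input
+// window of sx*sy cells whose top-left corner is at
+// (pl.x0 + (cx - cl.x0)*dx + ox, pl.y0 + (cy - cl.y0)*dy + oy).
+// The guarded sketch below is illustrative only (not used by the project):
+// it prints the window read by each output cell for Kernel(5, 2, -2) over
+// an input padded with expandXY(2), matching ConvTest's "square" case.
+#if 0
+#include <cstdio>
+int main() {
+  Kernel k(5, 2, -2);                    // 5x5 window, stride 2, offset -2
+  int plx0 = 2, ply0 = 2;                // active origin after expandXY(2)
+  for(int cy = 0; cy < 2; ++cy)
+  for(int cx = 0; cx < 2; ++cx) {
+    int px = plx0 + cx*k.dx + k.ox;      // left edge of the input window
+    int py = ply0 + cy*k.dy + k.oy;      // top edge of the input window
+    printf("out(%d,%d) <- in[%d..%d)x[%d..%d)\n",
+           cx, cy, px, px + k.sx, py, py + k.sy);
+  }
+  return 0;
+}
+#endif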
+
+
+template<class Iter>
+void iterateTestConvolution(Layout cl, Layout pl, Kernel k, Neuron *c_neurons, Neuron *p_neurons, Weight *weights) {
+  if (!cl) return;
+  assert(pl);
+  assert(k);
+  assert(c_neurons);
+  assert(p_neurons);
+  assert(weights);
+  assert(pl.x0 + k.ox >= 0 && pl.x0 + (cl.getW()-1)*k.dx + k.ox + k.sx <= pl.sx);
+  assert(pl.y0 + k.oy >= 0 && pl.y0 + (cl.getH()-1)*k.dy + k.oy + k.sy <= pl.sy);
+
+  for(int cy = cl.y0; cy < cl.y1; ++cy)
+  for(int cx = cl.x0; cx < cl.x1; ++cx)
+  for(int cz = cl.z0; cz < cl.z1; ++cz) {
+    int ci = (cy*cl.sx + cx)*cl.sz + cz;
+    Neuron &cn = c_neurons[ci];
+    typename Iter::AccumType a = {};
+    Iter::init(cn, a);
+
+    for(int ky = 0; ky < k.sy; ++ky)
+    for(int kx = 0; kx < k.sx; ++kx)
+    for(int pz = pl.z0; pz < pl.z1; ++pz) {
+      int wi = ((cy - cl.y0)*cl.getW() + cx - cl.x0)*cl.getD() + cz - cl.z0;
+      wi = ((wi*k.sy + ky)*k.sx + kx)*pl.getD() + pz - pl.z0;
+      Weight &w = weights[wi];
+
+      int px = pl.x0 + (cx - cl.x0)*k.dx + k.ox + kx;
+      int py = pl.y0 + (cy - cl.y0)*k.dy + k.oy + ky;
+      int pi = (py*pl.sx + px)*pl.sz + pz;
+      Neuron &pn = p_neurons[pi];
+
+      Iter::iter(pn, w, a);
+    }
+
+    Iter::done(cn, a);
+  }
+}
+
+
+
+
+template<class Iter>
+void iterateConvolution(Layout cl, Layout pl, Layout wl, Kernel k, Neuron *c_neurons, Neuron *p_neurons, Weight *weights) {
+  if (!cl) return;
+  assert(pl);
+  assert(wl);
+  assert(k);
+  assert(c_neurons);
+  assert(p_neurons);
+  assert(weights);
+  assert(cl.isSubLayoutOf(wl));
+  assert(pl.x0 + k.ox >= 0 && pl.x0 + (wl.getW()-1)*k.dx + k.ox + k.sx <= pl.sx);
+  assert(pl.y0 + k.oy >= 0 && pl.y0 + (wl.getH()-1)*k.dy + k.oy + k.sy <= pl.sy);
+
+  int c_h = cl.getH();
+  int c_w = cl.getW();
+  int c_d = cl.getD();
+  int c_swz = c_w*cl.sz;
+  int c_shxz = c_h*cl.sx*cl.sz;
+  int c_dx = cl.sz - c_d;
+  int c_dy = (cl.sx - c_w)*cl.sz;
+
+  int p_d = pl.getD();
+  int p_dx = k.dx*pl.sz;
+  int p_dy = k.dy*pl.sx*pl.sz - c_w*p_dx;
+
+  int k_sxd = k.sx*p_d;
+  int k_syxd = k.sy*k_sxd;
+  int p_ddy = (pl.sx - k.sx)*pl.sz;
+  int p_ddx = pl.sz - p_d;
+
+  int w_w = wl.getW();
+  int w_d = wl.getD();
+  int w_dx = (w_d - c_d)*k_syxd;
+  int w_dy = (w_w - c_w)*w_d*k_syxd;
+
+  int cx0 = cl.x0 - wl.x0;
+  int cy0 = cl.y0 - wl.y0;
+  int cz0 = cl.z0 - wl.z0;
+
+  Neuron *icn = c_neurons + (cl.y0*cl.sx + cl.x0)*cl.sz + cl.z0;
+  Neuron *ipn = p_neurons + ((pl.y0 + cy0*k.dy + k.oy)*pl.sx + pl.x0 + cx0*k.dx + k.ox)*pl.sz + pl.z0;
+  Weight *iw = weights + ((cy0*w_w + cx0)*w_d + cz0)*k_syxd;
+
+  for(Neuron *e = icn + c_shxz; icn < e; icn += c_dy, ipn += p_dy, iw += w_dy)
+  for(Neuron *e = icn + c_swz; icn < e; icn += c_dx, ipn += p_dx, iw += w_dx)
+  for(Neuron *e = icn + c_d; icn < e; ++icn) {
+    typename Iter::AccumType a;
+    Iter::init(*icn, a);
+
+    Neuron *iipn = ipn;
+    for(Weight *e = iw + k_syxd; iw < e; iipn += p_ddy)
+    for(Weight *e = iw + k_sxd; iw < e; iipn += p_ddx)
+    for(Weight *e = iw + p_d; iw < e; ++iw, ++iipn)
+      Iter::iter(*iipn, *iw, a);
+
+    Iter::done(*icn, a);
+  }
+}
+
+template<class Iter>
+void iterateConvolutionPoint(Layout cl, Layout pl, Layout wl, Kernel k, int kx, int ky, Neuron *c_neurons, Neuron *p_neurons, Weight *weights) {
+  if (!cl) return;
+  assert(pl);
+  assert(wl);
+  assert(k);
+  assert(c_neurons);
+  assert(p_neurons);
+  assert(weights);
+  assert(cl.isSubLayoutOf(wl));
+  assert(kx >= 0 && kx < k.sx);
+  assert(ky >= 0 && ky < k.sy);
+  assert(pl.x0 + k.ox >= 0 && pl.x0 + (wl.getW()-1)*k.dx + k.ox + k.sx <= pl.sx);
+  assert(pl.y0 + k.oy >= 0 && pl.y0 + (wl.getH()-1)*k.dy + k.oy + k.sy <= pl.sy);
+
+  int c_h = cl.getH();
+  int c_w = cl.getW();
+  int c_d = cl.getD();
+  int c_swz = c_w*cl.sz;
+  int c_shxz = c_h*cl.sx*cl.sz;
+  int c_dx = cl.sz - c_d;
+  int c_dy = (cl.sx - c_w)*cl.sz;
+
+  int p_d = pl.getD();
+  int p_dx = k.dx*pl.sz;
+  int p_dy = k.dy*pl.sx*pl.sz - c_w*p_dx;
+
+  int k_sxd = k.sx*p_d;
+  int k_syxd = k.sy*k_sxd;
+
+  int w_w = wl.getW();
+  int w_d = wl.getD();
+  int w_dz = k_syxd - p_d;
+  int w_dx = (w_d - c_d)*k_syxd;
+  int w_dy = (w_w - c_w)*w_d*k_syxd;
+
+  int cx0 = cl.x0 - wl.x0;
+  int cy0 = cl.y0 - wl.y0;
+  int cz0 = cl.z0 - wl.z0;
+
+  Neuron *icn = c_neurons + (cl.y0*cl.sx + cl.x0)*cl.sz + cl.z0;
+  Neuron *ipn = p_neurons + ((pl.y0 + cy0*k.dy + k.oy + ky)*pl.sx + pl.x0 + cx0*k.dx + k.ox + kx)*pl.sz + pl.z0;
+  Weight *iw = weights + ((cy0*w_w + cx0)*w_d + cz0)*k_syxd + ky*k_sxd + kx*p_d;
+
+  for(Neuron *e = icn + c_shxz; icn < e; icn += c_dy, ipn += p_dy, iw += w_dy)
+  for(Neuron *e = icn + c_swz; icn < e; icn += c_dx, ipn += p_dx, iw += w_dx)
+  for(Neuron *e = icn + c_d; icn < e; ++icn, ipn -= p_d, iw += w_dz)
+  for(Neuron *e = ipn + p_d; ipn < e; ++ipn, ++iw)
+    Iter::iter2(*icn, *ipn, *iw);
+}
+
+
+
+template<Func func>
+class LayerConv: public Layer {
+public:
+  Kernel kernel;
+
+  LayerConv(Layer &prev, const Layout &layout, const Kernel &kernel, Weight *weights = nullptr):
+    Layer(&prev, layout, layout.getActiveCount()*kernel.sx*kernel.sy*prev.back().layout.getD(), weights),
+    kernel(kernel)
+  {
+    assert(kernel);
+    if (ownWeights) fillWeights(-1, 1);
+  }
+
+
+  void pass(Barrier &barrier) override {
+    struct I: public Iter {
+      static inline void init(Neuron&, AccumType &a) { a.v = 0; }
+      static inline void iter(Neuron &n, Weight &w, AccumType &a) { a.v += n.v * w.w; }
+      static inline void done(Neuron &n, AccumType &a) { func(n, a.v); }
+    };
+    iterateConvolution<I>(mtLayouts[barrier.tid], prev->layout, layout, kernel, neurons, prev->neurons, weights);
+  }
+
+
+  void backpassWeights(Barrier &barrier) override {
+    struct I: public Iter {
+      static inline void init(Neuron &n, AccumType &a) { a.v = n.d; }
+      static inline void iter(Neuron &n, Weight &w, AccumType &a) { w.w += n.v * a.v; }
+    };
+    iterateConvolution<I>(mtLayouts[barrier.tid], prev->layout, layout, kernel, neurons, prev->neurons, weights);
+  }
+
+
+  void backpassDeltas(Barrier &barrier) override {
+    struct I: public Iter {
+      static inline void iter2(Neuron &cn, Neuron &pn, Weight &w) { pn.a.v += cn.d * w.w; }
+      static inline void iter3(Neuron &n) { n.d *= n.a.v; n.a.v = 0; }
+    };
+    int ksx = kernel.sx, ksy = kernel.sy;
+    for(int kx = 0; kx < ksx; ++kx)
+    for(int ky = 0; ky < ksy; ++ky) {
+      iterateConvolutionPoint<I>(mtLayouts[barrier.tid], prev->layout, layout, kernel, kx, ky, neurons, prev->neurons, weights);
+      barrier.wait();
+    }
+    iterateNeurons<I>(prev->mtLayouts[barrier.tid], prev->neurons);
+  }
+
+
+  void testPass() override {
+    struct I: public Iter {
+      static inline void init(Neuron&, AccumType &a) { a.v = 0; }
+      static inline void iter(Neuron &n, Weight &w, AccumType &a) { a.v += n.v * w.w; }
+      static inline void done(Neuron &n, AccumType &a) { func(n, a.v); }
+    };
+    iterateTestConvolution<I>(layout, prev->layout, kernel, neurons, prev->neurons, weights);
+  }
+
+
+  void testBackpass() override {
+    struct I: public Iter {
+      static inline void init(Neuron &n, AccumType &a) { a.v = n.d; }
+      static inline void iter(Neuron &n, Weight &w, AccumType &a) { n.a.v += a.v * w.w; w.w += a.v * n.v; }
+      static inline void iter3(Neuron &n) { n.d *= n.a.v; n.a.v = 0; }
+    };
+    clearAccum();
+    iterateTestConvolution<I>(layout, prev->layout, kernel, neurons, prev->neurons, weights);
+    iterateNeurons<I>(prev->layout, prev->neurons);
+    clearAccum();
+  }
+};
+
+
+
+template<Func func>
+class LayerDeconv: public Layer {
+public:
+  Kernel kernel;
+
+  LayerDeconv(Layer &prev, const Layout &layout, const Kernel &kernel, Weight *weights = nullptr):
+    Layer(&prev, layout, prev.back().layout.getActiveCount()*kernel.sx*kernel.sy*layout.getD(), weights),
+    kernel(kernel)
+  {
+    assert(kernel);
+    if (ownWeights) fillWeights(-1, 1);
+  }
+
+
+  void pass(Barrier &barrier) override {
+    struct I: public Iter {
+      static inline void iter2(Neuron &cn, Neuron &pn, Weight &w) { pn.a.v += cn.v * w.w; }
+      static inline void iter3(Neuron &n) { func(n, n.a.v); n.a.v = 0; }
+    };
+    int k_sx = kernel.sx, k_sy = kernel.sy;
+    for(int kx = 0; kx < k_sx; ++kx)
+    for(int ky = 0; ky < k_sy; ++ky) {
+      iterateConvolutionPoint<I>(prev->mtLayouts[barrier.tid], layout, prev->layout, kernel, kx, ky, prev->neurons, neurons, weights);
+      barrier.wait();
+    }
+    iterateNeurons<I>(mtLayouts[barrier.tid], neurons);
+  }
+
+
+  void backpassWeights(Barrier &barrier) override {
+    struct I: public Iter {
+      static inline void init(Neuron &n, AccumType &a) { a.v = n.v; }
+      static inline void iter(Neuron &n, Weight &w, AccumType &a) { w.w += n.d * a.v; }
+    };
+    // same geometry as pass(): prev is the convolution-output side here
+    iterateConvolution<I>(prev->mtLayouts[barrier.tid], layout, prev->layout, kernel, prev->neurons, neurons, weights);
+  }
+
+
+  void backpassDeltas(Barrier &barrier) override {
+    struct I: public Iter {
+      static inline void init(Neuron&, AccumType &a) { a.v = 0; }
+      static inline void iter(Neuron &n, Weight &w, AccumType &a) { a.v += n.d * w.w; }
+      static inline void done(Neuron &n, AccumType &a) { n.d *= a.v; }
+    };
+    iterateConvolution<I>(prev->mtLayouts[barrier.tid], layout, prev->layout, kernel, prev->neurons, neurons, weights);
+  }
+
+
+  void testPass() override {
+    struct I: public Iter {
+      static inline void init(Neuron &n, AccumType &a) { a.v = n.v; }
+      static inline void iter(Neuron &n, Weight &w, AccumType &a) { n.a.v += a.v * w.w; }
+      static inline void iter3(Neuron &n) { func(n, n.a.v); n.a.v = 0; }
+    };
+    clearAccum();
+    iterateTestConvolution<I>(prev->layout, layout, kernel, prev->neurons, neurons, weights);
+    iterateNeurons<I>(layout, neurons);
+    clearAccum();
+  }
+
+
+  void testBackpass() override {
+    struct I: public Iter {
+      struct AccumType: public Accum { NeuronReal vv; };
+      static inline void init(Neuron &n, AccumType &a) { a.v = 0; a.vv = n.v; }
+      static inline void iter(Neuron &n, Weight &w, AccumType &a) { a.v += n.d * w.w; w.w += n.d * a.vv; }
+      static inline void done(Neuron &n, AccumType &a) { n.d *= a.v; }
+    };
+    iterateTestConvolution<I>(prev->layout, layout, kernel, prev->neurons, neurons, weights);
+  }
+};
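+
+// The iterate* functions are templates over an "Iter" policy: the static
+// init/iter/done hooks are inlined into the innermost loops, so every pass
+// above compiles into a specialized loop with no virtual dispatch. A sketch
+// of a custom policy under the same contract (illustrative, kept disabled;
+// SumAbsWeights is not part of the project):
+#if 0
+struct SumAbsWeights: public Iter {
+  static AccumReal total;  // single-threaded accumulation, for brevity
+  static inline void init(Neuron&, AccumType &a) { a.v = 0; }
+  static inline void iter(Neuron&, Weight &w, AccumType &a) { a.v += w.w < 0 ? -w.w : w.w; }
+  static inline void done(Neuron&, AccumType &a) { total += a.v; }
+};
+AccumReal SumAbsWeights::total = 0;
+
+// usage, with a LayerConv-like geometry:
+//   iterateConvolution<SumAbsWeights>(cl, pl, cl, kernel, neurons, prevNeurons, weights);
+#endif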
+
+#endif
diff --git a/projects/neural/layer.conv.test.inc.cpp b/projects/neural/layer.conv.test.inc.cpp
new file mode 100644
index 0000000..7124b30
--- /dev/null
+++ b/projects/neural/layer.conv.test.inc.cpp
@@ -0,0 +1,171 @@
+#ifndef LAYER_CONV_TEST_INC_CPP
+#define LAYER_CONV_TEST_INC_CPP
+
+
+
+#include "layer.test.inc.cpp"
+#include "layer.conv.inc.cpp"
+
+
+class ConvTest: public Test {
+public:
+  static void init(const Layout &cl, const Layout &pl, const Kernel &k, bool shared = false)
+    { Test::init(cl.getCount(), pl.getCount(), (shared ? 1 : cl.getActiveCount())*k.sx*k.sy*pl.getD()); }
+
+
+  static bool verifyWeights(const char *name, const Layout &cl, const Layout &pl, const Kernel &k) {
+    Stage st(name);
+    for(int cy = cl.y0; cy < cl.y1; ++cy)
+    for(int cx = cl.x0; cx < cl.x1; ++cx)
+    for(int cz = cl.z0; cz < cl.z1; ++cz) {
+      int ci = (cy*cl.sx + cx)*cl.sz + cz;
+      for(int ky = 0; ky < k.sy; ++ky)
+      for(int kx = 0; kx < k.sx; ++kx)
+      for(int pz = pl.z0; pz < pl.z1; ++pz) {
+        int wi = ((cy - cl.y0)*cl.getW() + cx - cl.x0)*cl.getD() + cz - cl.z0;
+        wi = ((wi*k.sy + ky)*k.sx + kx)*pl.getD() + pz - pl.z0;
+
+        int px = pl.x0 + (cx - cl.x0)*k.dx + k.ox + kx;
+        int py = pl.y0 + (cy - cl.y0)*k.dy + k.oy + ky;
+        if ( px < pl.x0 || px >= pl.x1
+          || py < pl.y0 || py >= pl.y1 ) continue;
+
+        int pi = (py*pl.sx + px)*pl.sz + pz;
+
+        int s = (int)p_neurons.size();
+        int w = weights[wi].i;
+        int i = ci*s + pi + 1;
+
+        if (w != i) {
+          int ww = w;
+          int wpz = ww%pl.sz; ww /= pl.sz;
+          int wpx = ww%pl.sx; ww /= pl.sx;
+          int wpy = ww%pl.sy; ww /= pl.sy;
+          int wcz = ww%cl.sz; ww /= cl.sz;
+          int wcx = ww%cl.sx; ww /= cl.sx;
+          int wcy = ww;
+
+          printf(
+            "wrong index: %d = ((%d*%d + %d)*%d + %d)*%d + (%d*%d + %d)*%d + %d + 1,\n"
+            "expected: %d = ((%d*%d + %d)*%d + %d)*%d + (%d*%d + %d)*%d + %d + 1\n"
+            "wi %d, ky %d, kx %d \n",
+            w,
+            wcy, cl.sx, wcx, cl.sz, wcz, s,
+            wpy, pl.sx, wpx, pl.sz, wpz,
+            i,
+            cy, cl.sx, cx, cl.sz, cz, s,
+            py, pl.sx, px, pl.sz, pz,
+            wi, ky, kx );
+          pl.printYXZ("prev layout");
+          cl.printYXZ("curr layout");
+          k.printYX("kernel");
+          ++errors;
+          return st;
+        }
+      }
+    }
+    return st;
+  }
+
+  static bool testIterators(const char *name, const Layout &cl, const Layout &pl, const Kernel &k, int threads) {
+    Stage st(name);
+
+    assert(cl && pl && k && threads > 0);
+    Layout::List clist, plist;
+    cl.split(clist, threads);
+    pl.split(plist, threads);
+
+    struct I: public Iter {
+      static inline void init(Neuron &n, Iter::AccumType &a) { ++n.a.i; a.i = (AccumInt)(&n - c_neurons.data()); }
+      static inline void iter(Neuron &n, Weight &w, Iter::AccumType &a) {
+        if (w.i)
+          ++errors;
+        w.i = (WeightInt)(&n - p_neurons.data() + a.i*p_neurons.size() + 1);
+      }
+      static inline void iter2(Neuron &cn, Neuron &pn, Weight &w) {
+        if (w.i)
+          ++errors;
+        w.i = (WeightInt)((&cn - c_neurons.data())*p_neurons.size() + &pn - p_neurons.data() + 1);
+        pn.v = pn.v + 1;
+      }
+    };
+
+    if (threads == 1) {
+      Stage st("iterateTestConvolution");
+      init(cl, pl, k);
+      iterateTestConvolution<I>(cl, pl, k, c_neurons.data(), p_neurons.data(), weights.data());
+      verifyNeurons("conv-neurons", cl, c_neurons.data());
+      verifyWeights("conv-weights", cl, pl, k);
+    }
+
+    {
+      Stage st("iterateConvolution");
+      init(cl, pl, k);
+      for(int i = 0; i < threads; ++i)
+        iterateConvolution<I>(clist[i], pl, cl, k, c_neurons.data(), p_neurons.data(), weights.data());
+      verifyNeurons("conv-neurons", cl, c_neurons.data());
+      verifyWeights("conv-weights", cl, pl, k);
+    }
+
+    {
+      Stage st("iterateConvolutionPoint");
+      init(cl, pl, k);
+      int e = errors;
+      for(int ky = 0; ky < k.sy && errors == e; ++ky)
+      for(int kx = 0; kx < k.sx && errors == e; ++kx) {
+        for(int i = 0; i < threads; ++i)
+          iterateConvolutionPoint<I>(clist[i], pl, cl, k, kx, ky, c_neurons.data(), p_neurons.data(), weights.data());
+        if (!verifyNeuronsAccum(pl, p_neurons.data(), cl.getD(), true))
+          printf("kx: %d, ky: %d\n", kx, ky), k.printYX("kernel");
+      }
+      verifyNeurons("conv-neurons", pl, p_neurons.data(), true);
+      verifyWeights("conv-weights", cl, pl, k);
+    }
+
+    return st;
+  }
+
+  static bool testIterators(const char *name, const Layout &cl, const Layout &pl, const Kernel &k) {
+    Stage st(name);
+    testIterators( "single-thread", cl, pl, k, 1 );
+    testIterators( "2-threads",     cl, pl, k, 2 );
+    testIterators( "7-threads",     cl, pl, k, 7 );
+    testIterators( "8-threads",     cl, pl, k, 8 );
+    testIterators( "512-threads",   cl, pl, k, 512 );
+    return st;
+  }
+
+  static bool test(const char *name, const Layout &cl, const Layout &pl, const Kernel &k) {
+    Stage st(name);
+
+    testIterators("iterators", cl, pl, k);
+
+    {
+      Layer l(nullptr, pl);
+      new LayerConv<funcSigmoidExp>(l, cl, k);
+      Test::testLayer("LayerConv", l);
+    }
+
+    {
+      // deconvolution maps the small layout onto the large one,
+      // so the roles of cl and pl swap here
+      Layer l(nullptr, cl);
+      new LayerDeconv<funcSigmoidExp>(l, pl, k);
+      Test::testLayer("LayerDeconv", l);
+    }
+
+    return st;
+  }
+
+  static bool test(const char *name = "convolution") {
+    Stage st(name);
+    test( "square", Layout(64, 64, 4), Layout(128, 128, 4).expandXY(2), Kernel(5, 2, -2) );
+    test( "rect1",  Layout(63, 43, 5), Layout( 63, 85, 3).expandX(2).expandY(3),       Kernel(5, 7, 1, 2, -2, -3) );
+    test( "rect2",  Layout(43, 63, 3), Layout( 85, 63, 5).expandX(3).expandY(3, 2),    Kernel(7, 5, 2, 1, -3, -2) );
+    test( "rect3",  Layout(64, 48, 5), Layout( 64, 96, 3).expandX(1, 2).expandY(3, 1), Kernel(4, 6, 1, 2, -1, -3) );
+    test( "pad",    Layout(64, 48, 5).expandX(3, 4).expandY(4, 3).expandZ(5, 4),
+                    Layout(64, 96, 3).expandX(6, 5).expandY(7, 6).expandZ(0, 1), Kernel(4, 6, 1, 2, -1, -3) );
+    return st;
+  }
+};
+
+
+#endif
diff --git a/projects/neural/layer.inc.cpp b/projects/neural/layer.inc.cpp
new file mode 100644
index 0000000..ad7c516
--- /dev/null
+++ b/projects/neural/layer.inc.cpp
@@ -0,0 +1,223 @@
+#ifndef LAYER_INC_CPP
+#define LAYER_INC_CPP
+
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <cmath>
+#include <cassert>
+
+#include <atomic>
+#include <thread>
+#include <vector>
+
+
+#include "layout.inc.cpp"
+
+
+
+typedef double WeightReal;
+typedef double NeuronReal;
+typedef double AccumReal;
+
+typedef int WeightInt;
+typedef int AccumInt;
+
+
+#define RANDOM_MAX 0x7fffffff
+inline unsigned int randomNext(unsigned int prev)
+  { return (1103515245*prev + 12345) & RANDOM_MAX; }
+
+
+struct Accum {
+  union { AccumReal v; AccumInt i; };
+};
+
+
+struct Neuron {
+  NeuronReal v, d;
+  Accum a;
+};
+
+
+struct Weight {
+  union { WeightReal w; WeightInt i; };
+};
+
+
+struct Iter {
+  typedef Accum AccumType;
+  typedef NeuronReal* DataType;
+  typedef AccumType DataAccumType;
+  static inline void init(Neuron&, AccumType&) { }
+  static inline void iter(Neuron&, Weight&, AccumType&) { }
+  static inline void done(Neuron&, AccumType&) { }
+  static inline void iter2(Neuron&, Neuron&, Weight&) { }
+  static inline void iter3(Neuron&) { }
+  static inline void iter4(Neuron&, DataType, DataAccumType&) { }
+};
+
+
+class Barrier {
+private:
+  std::atomic<unsigned int> &counter;
+  unsigned int next;
+public:
+  const unsigned int tid;
+  const unsigned int threads;
+
+  Barrier(const Barrier&) = delete;
+  inline Barrier(std::atomic<unsigned int> &counter, unsigned int tid, unsigned int threads):
+    counter(counter), next(), tid(tid), threads(threads) { assert(tid < threads); }
+  inline void wait() { next += threads; ++counter; while(counter < next); }
+  inline void subwait() { while(counter < next + tid); }
+};
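+
+// Barrier is a monotonic spin barrier: all threads share one atomic counter,
+// each wait() raises the thread-local target by `threads` and spins until
+// every peer has incremented the counter. A standalone usage sketch
+// (illustrative only, kept disabled):
+#if 0
+void worker(std::atomic<unsigned int> *counter, unsigned int tid, unsigned int threads) {
+  Barrier barrier(*counter, tid, threads);
+  // phase 1: each thread works on its own slice here
+  barrier.wait();                       // no thread enters phase 2 early
+  if (!tid) printf("phase 1 done\n");   // single-threaded step, like applyDelta()
+  barrier.wait();
+  // phase 2 ...
+}
+int main() {
+  std::atomic<unsigned int> counter(0);
+  std::vector<std::thread> pool;
+  for(unsigned int i = 1; i < 4; ++i) pool.emplace_back(worker, &counter, i, 4u);
+  worker(&counter, 0, 4);
+  for(std::thread &t: pool) t.join();
+  return 0;
+}
+#endif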
+
+
+struct Stat {
+  int neurons;
+  int activeNeurons;
+  int weights;
+  int links;
+  size_t memsize;
+
+  Stat(): neurons(), activeNeurons(), weights(), links(), memsize() { }
+
+  Stat& operator+= (const Stat &b) {
+    neurons += b.neurons;
+    activeNeurons += b.activeNeurons;
+    weights += b.weights;
+    links += b.links;
+    memsize += b.memsize;
+    return *this;
+  }
+
+  void print(const char *prefix = nullptr) const {
+    if (prefix && *prefix) printf("%s: ", prefix);
+    printf("neurons: %d / %d, links %d / %d, memSize: %llu\n", activeNeurons, neurons, weights, links, (unsigned long long)memsize);
+  }
+};
+
+
+class Layer {
+public:
+  Layer *prev, *next;
+
+  Layout layout;
+
+  Neuron *neurons;
+  int neuronsCount;
+
+  Weight *weights;
+  int weightsCount;
+  bool ownWeights;
+
+  const char *filename;
+
+  Stat stat;
+
+  Layout::List mtLayouts;
+
+
+  Layer(Layer *prev, const Layout &layout, int weightsCount = 0, Weight *weights = nullptr):
+    prev(prev ? &prev->back() : nullptr),
+    next(),
+    layout(layout),
+    neurons(),
+    neuronsCount(layout.getCount()),
+    weights(weights),
+    weightsCount(weightsCount),
+    ownWeights(!weights && weightsCount),
+    filename()
+  {
+    assert(layout);
+    assert(neuronsCount > 0);
+    assert(weightsCount >= 0);
+    assert(!prev == !weightsCount);
+
+    if (this->prev) this->prev->next = this;
+    if (neuronsCount) {
+      neurons = new Neuron[neuronsCount];
+      memset(neurons, 0, sizeof(*neurons)*neuronsCount);
+    }
+    if (ownWeights) {
+      this->weights = new Weight[weightsCount];
+      memset(this->weights, 0, sizeof(*this->weights)*weightsCount);
+    }
+
+    stat.neurons = neuronsCount;
+    stat.activeNeurons = layout.getActiveCount();
+    stat.weights = weightsCount;
+    stat.links = weightsCount;
+    stat.memsize = neuronsCount*sizeof(*neurons);
+    if (ownWeights) stat.memsize += weightsCount*sizeof(*weights);
+  }
+
+
+  virtual ~Layer() {
+    if (next) delete next;
+    if (neurons) delete[] neurons;
+    if (ownWeights) delete[] weights;
+  }
+
+
+  inline Layer& front()
+    { Layer *l = this; while(l->prev) l = l->prev; return *l; }
+  inline Layer& back()
+    { Layer *l = this; while(l->next) l = l->next; return *l; }
+  inline Stat sumStat() const
+    { Stat s; for(const Layer *l = this; l; l = l->next) s += l->stat; return s; }
+
+  bool save() const {
+    if (filename && weightsCount) {
+      FILE *f = fopen(filename, "wb");
+      if (!f)
+        return printf("cannot open file for write: %s\n", filename), false;
+      if (!fwrite(weights, sizeof(*weights)*weightsCount, 1, f))
+        return fclose(f), printf("cannot write to file: %s\n", filename), false;
+      fclose(f);
+    }
+    return !next || next->save();
+  }
+
+
+  bool load() {
+    if (filename && weightsCount) {
+      FILE *f = fopen(filename, "rb");
+      if (!f)
+        return printf("cannot open file for read: %s\n", filename), false;
+      if (!fread(weights, sizeof(*weights)*weightsCount, 1, f))
+        return fclose(f), printf("cannot read from file: %s\n", filename), false;
+      fclose(f);
+    }
+    return !next || next->load();
+  }
+
+
+  void clearAccum() {
+    Accum a = {};
+    for(Neuron *in = neurons, *e = in + neuronsCount; in < e; ++in)
+      in->a = a;
+  }
+
+
+  void fillWeights(WeightReal wmin, WeightReal wmax) {
+    WeightReal k = (wmax - wmin)/RAND_MAX;
+    for(Weight *iw = weights, *e = iw + weightsCount; iw < e; ++iw)
+      iw->w = rand()*k + wmin;
+  }
+
+
+  virtual void split(int threadsCount)
+    { layout.split(mtLayouts, threadsCount); }
+  virtual void pass(Barrier &barrier) { }
+  virtual void backpassWeights(Barrier &barrier) { }
+  virtual void backpassDeltas(Barrier &barrier) { }
+
+  virtual void testPass() { }
+  virtual void testBackpass() { }
+};
+
+
+#endif
diff --git a/projects/neural/layer.simple.inc.cpp b/projects/neural/layer.simple.inc.cpp
new file mode 100644
index 0000000..1f2a7b7
--- /dev/null
+++ b/projects/neural/layer.simple.inc.cpp
@@ -0,0 +1,309 @@
+#ifndef LAYER_SIMPLE_INC_CPP
+#define LAYER_SIMPLE_INC_CPP
+
+
+#include "layer.inc.cpp"
+
+
+typedef void Func(Neuron &n, AccumReal s);
+
+
+inline void funcSigmoidExp(Neuron &n, AccumReal s) {
+  //if (s > 5) s = 5; else if (s < -5) s = -5;
+  AccumReal ss = 1/(1 + std::exp(-s)); n.v = ss; n.d = ss * (1-ss);
+}
+
+
+template<class Iter>
+inline void iterateNeurons(const Layout &l, Neuron *neurons) {
+  if (!l) return;
+  assert(neurons);
+
+  int h = l.y1 - l.y0;
+  int w = l.x1 - l.x0;
+  int d = l.z1 - l.z0;
+  int sz = l.sz;
+  int sxz = l.sx*sz;
+  int swz = w*sz;
+  int shxz = h*sxz;
+  int dy = sxz - swz;
+  int dx = sz - d;
+
+  Neuron *in = neurons + l.y0*sxz + l.x0*sz + l.z0;
+
+  for(Neuron *e = in + shxz; in < e; in += dy)
+  for(Neuron *e = in + swz; in < e; in += dx)
+  for(Neuron *e = in + d; in < e; ++in)
+    Iter::iter3(*in);
+}
+
+
+template<class Iter>
+inline void iterateNeurons2(Layout l, Layout dl, Neuron *neurons, typename Iter::DataType data, int stride = 1, typename Iter::DataAccumType *accum = nullptr) {
+  if (!l) return;
+  assert(dl);
+  assert(neurons);
+  assert(l.isSubLayoutOf(dl));
+
+  // guard against the default accum so that *accum below stays valid
+  typename Iter::DataAccumType dummyAccum = {};
+  if (!accum) accum = &dummyAccum;
+
+  int h = l.getH();
+  int w = l.getW();
+  int d = l.getD();
+  int sxz = l.sx*l.sz;
+  int swz = w*l.sz;
+  int shxz = h*sxz;
+  int dy = sxz - swz;
+  int dx = l.sz - d;
+
+  int d_w = dl.getW();
+  int d_d = dl.getD();
+  int d_dx = (d_d - d)*stride;
+  int d_dy = (d_w - w)*d_d*stride;
+
+  Neuron *in = neurons + l.y0*sxz + l.x0*l.sz + l.z0;
+  data += (((l.y0 - dl.y0)*d_w + l.x0 - dl.x0)*d_d + l.z0 - dl.z0)*stride;
+
+  for(Neuron *e = in + shxz; in < e; in += dy, data += d_dy)
+  for(Neuron *e = in + swz; in < e; in += dx, data += d_dx)
+  for(Neuron *e = in + d; in < e; ++in, data += stride)
+    Iter::iter4(*in, data, *accum);
+}
+
+
+template<class Iter>
+inline void iterateSimple(Layout cl, Layout pl, Layout wl, Neuron *c_neurons, Neuron *p_neurons, Weight *weights) {
+  if (!cl) return;
+  assert(pl);
+  assert(wl);
+  assert(c_neurons);
+  assert(p_neurons);
+  assert(weights);
+  assert(cl.isSubLayoutOf(wl));
+
+  int c_h = cl.getH();
+  int c_w = cl.getW();
+  int c_d = cl.getD();
+  int c_sxz = cl.sx*cl.sz;
+  int c_swz = c_w*cl.sz;
+  int c_shxz = c_h*c_sxz;
+  int c_dy = c_sxz - c_swz;
+  int c_dx = cl.sz - c_d;
+
+  int p_h = pl.getH();
+  int p_w = pl.getW();
+  int p_d = pl.getD();
+  int p_sxz = pl.sx*pl.sz;
+  int p_swz = p_w*pl.sz;
+  int p_shxz = p_h*p_sxz;
+  int p_dy = p_sxz - p_swz;
+  int p_dx = pl.sz - p_d;
+
+  int w_w = wl.getW();
+  int w_d = wl.getD();
+  int w_dz = p_h*p_w*p_d;
+  int w_dx = (w_d - c_d)*w_dz;
+  int w_dy = (w_w - c_w)*w_d*w_dz;
+
+  Neuron *icn = c_neurons + (cl.y0*c_sxz + cl.x0*cl.sz + cl.z0);
+  p_neurons += pl.y0*p_sxz + pl.x0*pl.sz + pl.z0;
+
+  Weight *iw = weights + (((cl.y0 - wl.y0)*w_w + cl.x0 - wl.x0)*w_d + cl.z0 - wl.z0)*w_dz;
+
+  for(Neuron *e = icn + c_shxz; icn < e; icn += c_dy, iw += w_dy)
+  for(Neuron *e = icn + c_swz; icn < e; icn += c_dx, iw += w_dx)
+  for(Neuron *e = icn + c_d; icn < e; ++icn) {
+    typename Iter::AccumType a;
+    Iter::init(*icn, a);
+
+    Neuron *ipn = p_neurons;
+    for(Neuron *e = ipn + p_shxz; ipn < e; ipn += p_dy)
+    for(Neuron *e = ipn + p_swz; ipn < e; ipn += p_dx)
+    for(Neuron *e = ipn + p_d; ipn < e; ++ipn, ++iw)
+      Iter::iter(*ipn, *iw, a);
+
+    Iter::done(*icn, a);
+  }
+}
+
+
+template<class Iter>
+void iterateSimpleInv(Layout cl, Layout pl, Layout wl, Neuron *c_neurons, Neuron *p_neurons, Weight *weights) {
+  if (!cl) return;
+  assert(pl);
+  assert(wl);
+  assert(c_neurons);
+  assert(p_neurons);
+  assert(weights);
+  assert(cl.isSubLayoutOf(wl));
+
+  int c_h = cl.getH();
+  int c_w = cl.getW();
+  int c_d = cl.getD();
+  int c_sxz = cl.sx*cl.sz;
+  int c_swz = c_w*cl.sz;
+  int c_shxz = c_h*c_sxz;
+  int c_dy = c_sxz - c_swz;
+  int c_dx = cl.sz - c_d;
+
+  int p_h = pl.getH();
+  int p_w = pl.getW();
+  int p_d = pl.getD();
+  int p_sxz = pl.sx*pl.sz;
+  int p_swz = p_w*pl.sz;
+  int p_shxz = p_h*p_sxz;
+  int p_dy = p_sxz - p_swz;
+  int p_dx = pl.sz - p_d;
+
+  int w_w = wl.getW();
+  int w_h = wl.getH();
+  int w_d = wl.getD();
+  int w_ddz = w_h*w_w*w_d;
+  int w_dx = w_d - c_d;
+  int w_dy = (w_w - c_w)*w_d;
+
+  Neuron *icn = c_neurons + (cl.y0*c_sxz + cl.x0*cl.sz + cl.z0);
+  p_neurons += pl.y0*p_sxz + pl.x0*pl.sz + pl.z0;
+
+  Weight *iw = weights + ((cl.y0 - wl.y0)*w_w + cl.x0 - wl.x0)*w_d + cl.z0 - wl.z0;
+
+  for(Neuron *e = icn + c_shxz; icn < e; icn += c_dy, iw += w_dy)
+  for(Neuron *e = icn + c_swz; icn < e; icn += c_dx, iw += w_dx)
+  for(Neuron *e = icn + c_d; icn < e; ++icn, ++iw) {
+    typename Iter::AccumType a;
+    Iter::init(*icn, a);
+
+    Weight *iiw = iw;
+    Neuron *ipn = p_neurons;
+    for(Neuron *e = ipn + p_shxz; ipn < e; ipn += p_dy)
+    for(Neuron *e = ipn + p_swz; ipn < e; ipn += p_dx)
+    for(Neuron *e = ipn + p_d; ipn < e; ++ipn, iiw += w_ddz)
+      Iter::iter(*ipn, *iiw, a);
+
+    Iter::done(*icn, a);
+  }
+}
+
+
+Layout optimizeLayoutSimple(const Layout &layout) {
+  Layout l = layout;
+  if (l.x0 == 0 && l.x1 == l.sx)
+    { l.x0 = l.y0*l.sx; l.x1 *= l.y1; l.sx *= l.sy; l.y0 = 0; l.y1 = l.sy = 1; }
+  if (l.z0 == 0 && l.z1 == l.sz)
+    { l.z0 = l.x0*l.sz; l.z1 *= l.x1; l.sz *= l.sx; l.x0 = 0; l.x1 = l.sx = 1; }
+  return l;
+}
+
+
+template<Func func>
+class LayerSimple: public Layer {
+public:
+  Layout optLayout;
+  Layout prevOptLayout;
+  Layout::List mtOptLayouts;
+  Layout::List mtPrevOptLayouts;
+
+
+  LayerSimple(Layer &prev, const Layout &layout, Weight *weights = nullptr):
+    Layer(&prev, layout, layout.getActiveCount() * prev.back().layout.getActiveCount(), weights),
+    optLayout(optimizeLayoutSimple(layout)),
+    prevOptLayout(optimizeLayoutSimple(this->prev->layout))
+  {
+    if (ownWeights) fillWeights(-1, 1);
+  }
+
+
+  void split(int threadsCount) override {
+    Layer::split(threadsCount);
+    optLayout.split(mtOptLayouts, threadsCount);
+    prevOptLayout.split(mtPrevOptLayouts, threadsCount);
+  }
+
+
+  void pass(Barrier &barrier) override {
+    struct I: public Iter {
+      static inline void init(Neuron&, AccumType &a) { a.v = 0; }
+      static inline void iter(Neuron &n, Weight &w, AccumType &a) { a.v += n.v * w.w; }
+      static inline void done(Neuron &n, AccumType &a) { func(n, a.v); }
+    };
+    iterateSimple<I>(mtOptLayouts[barrier.tid], prevOptLayout, optLayout, neurons, prev->neurons, weights);
+  }
+
+
+  void backpassWeights(Barrier &barrier) override {
+    struct I: public Iter {
+      static inline void init(Neuron &n, AccumType &a) { a.v = n.d; }
+      static inline void iter(Neuron &n, Weight &w, AccumType &a) { w.w += n.v * a.v; }
+    };
+    iterateSimple<I>(mtOptLayouts[barrier.tid], prevOptLayout, optLayout, neurons, prev->neurons, weights);
+  }
+
+
+  void backpassDeltas(Barrier &barrier) override {
+    struct I: public Iter {
+      static inline void init(Neuron&, AccumType &a) { a.v = 0; }
+      static inline void iter(Neuron &n, Weight &w, AccumType &a) { a.v += n.d * w.w; }
+      static inline void done(Neuron &n, AccumType &a) { n.d *= a.v; }
+    };
+    iterateSimpleInv<I>(mtPrevOptLayouts[barrier.tid], optLayout, prevOptLayout, prev->neurons, neurons, weights);
+  }
+
+
+  void testPass() override {
+    Layout cl = layout;
+    Layout pl = prev->layout;
+
+    for(int cy = cl.y0; cy < cl.y1; ++cy)
+    for(int cx = cl.x0; cx < cl.x1; ++cx)
+    for(int cz = cl.z0; cz < cl.z1; ++cz) {
+      AccumReal a = 0;
+      Neuron &cn = neurons[ (cy*cl.sx + cx)*cl.sz + cz ];
+      int wi = ((cy-cl.y0)*cl.getW() + cx-cl.x0)*cl.getD() + cz-cl.z0;
+
+      for(int py = pl.y0; py < pl.y1; ++py)
+      for(int px = pl.x0; px < pl.x1; ++px)
+      for(int pz = pl.z0; pz < pl.z1; ++pz) {
+        Neuron &pn = prev->neurons[ (py*pl.sx + px)*pl.sz + pz ];
+        int wii = ((wi*pl.getH() + py-pl.y0)*pl.getW() + px-pl.x0)*pl.getD() + pz-pl.z0;
+        Weight &w = weights[wii];
+        a += pn.v * w.w;
+      }
+
+      func(cn, a);
+    }
+  }
+
+
+  void testBackpass() override {
+    Layout cl = layout;
+    Layout pl = prev->layout;
+
+    for(int i = 0; i < prev->neuronsCount; ++i)
+      prev->neurons[i].a.v = 0;
+
+    for(int cy = cl.y0; cy < cl.y1; ++cy)
+    for(int cx = cl.x0; cx < cl.x1; ++cx)
+    for(int cz = cl.z0; cz < cl.z1; ++cz) {
+      Neuron &cn = neurons[ (cy*cl.sx + cx)*cl.sz + cz ];
+      int wi = ((cy-cl.y0)*cl.getW() + cx-cl.x0)*cl.getD() + cz-cl.z0;
+
+      for(int py = pl.y0; py < pl.y1; ++py)
+      for(int px = pl.x0; px < pl.x1; ++px)
+      for(int pz = pl.z0; pz < pl.z1; ++pz) {
+        Neuron &pn = prev->neurons[ (py*pl.sx + px)*pl.sz + pz ];
+        int wii = ((wi*pl.getH() + py-pl.y0)*pl.getW() + px-pl.x0)*pl.getD() + pz-pl.z0;
+        Weight &w = weights[wii];
+
+        pn.a.v += w.w * cn.d;
+        w.w += pn.v * cn.d;
+      }
+    }
+
+    for(int i = 0; i < prev->neuronsCount; ++i) {
+      Neuron &pn = prev->neurons[i];
+      pn.d *= pn.a.v;
+      pn.a.v = 0;
+    }
+  }
+};
+
+
+#endif
diff --git a/projects/neural/layer.simple.test.inc.cpp b/projects/neural/layer.simple.test.inc.cpp
new file mode 100644
index 0000000..7fef70e
--- /dev/null
+++ b/projects/neural/layer.simple.test.inc.cpp
@@ -0,0 +1,192 @@
+#ifndef LAYER_SIMPLE_TEST_INC_CPP
+#define LAYER_SIMPLE_TEST_INC_CPP
+
+
+#include "layer.test.inc.cpp"
+#include "layer.simple.inc.cpp"
+
+
+class SimpleTest: public Test {
+public:
+  static void init(const Layout &cl, const Layout &pl = Layout())
+    { Test::init(cl.getCount(), pl.getCount(), cl.getActiveCount()*pl.getActiveCount()); }
+
+
+  static bool verifyWeights(const char *name, const Layout &cl, const Layout &pl) {
+    Stage st(name);
+    for(int cy = cl.y0; cy < cl.y1; ++cy)
+    for(int cx = cl.x0; cx < cl.x1; ++cx)
+    for(int cz = cl.z0; cz < cl.z1; ++cz) {
+      int ci = (cy*cl.sx + cx)*cl.sz + cz;
+
+      for(int py = pl.y0; py < pl.y1; ++py)
+      for(int px = pl.x0; px < pl.x1; ++px)
+      for(int pz = pl.z0; pz < pl.z1; ++pz) {
+        int pi = (py*pl.sx + px)*pl.sz + pz;
+
+        int wi = ((cy - cl.y0)*cl.getW() + cx - cl.x0)*cl.getD() + cz - cl.z0;
+        wi = ((wi*pl.getH() + py - pl.y0)*pl.getW() + px - pl.x0)*pl.getD() + pz - pl.z0;
+
+        int s = (int)p_neurons.size();
+        int w = weights[wi].i;
+        int i = ci*s + pi + 1;
+
+        if (w != i) {
+          int ww = w;
+          int wpz = ww%pl.sz; ww /= pl.sz;
+          int wpx = ww%pl.sx; ww /= pl.sx;
+          int wpy = ww%pl.sy; ww /= pl.sy;
+          int wcz = ww%cl.sz; ww /= cl.sz;
+          int wcx = ww%cl.sx; ww /= cl.sx;
+          int wcy = ww;
+
+          printf(
+            "wrong index: %d = ((%d*%d + %d)*%d + %d)*%d + (%d*%d + %d)*%d + %d + 1,\n"
+            "expected: %d = ((%d*%d + %d)*%d + %d)*%d + (%d*%d + %d)*%d + %d + 1\n"
+            "wi = %d\n",
+            w,
+            wcy, cl.sx, wcx, cl.sz, wcz, s,
+            wpy, pl.sx, wpx, pl.sz, wpz,
+            i,
+            cy, cl.sx, cx, cl.sz, cz, s,
+            py, pl.sx, px, pl.sz, pz,
+            wi );
+          pl.printYXZ("prev layout");
+          cl.printYXZ("curr layout");
+          ++errors;
+          return st;
+        }
+      }
+    }
+    return st;
+  }
+
+
+  static bool testIterators(const char *name, const Layout &cl, const Layout &pl, const Layout &ocl, const Layout &opl, int threads) {
+    Stage st(name);
+
+    assert(cl && pl && ocl && opl && threads > 0);
+    Layout::List oclist, oplist;
+    ocl.split(oclist, threads);
+    opl.split(oplist, threads);
+
+    struct I: public Iter {
+      typedef int DataType;
+      typedef int DataAccumType;
+      static inline void init(Neuron &n, Iter::AccumType &a) { ++n.a.i; a.i = (AccumInt)(&n - c_neurons.data()); }
+      static inline void iter(Neuron &n, Weight &w, Iter::AccumType &a) {
+        if (w.i)
+          ++errors;
+        w.i = (WeightInt)(&n - p_neurons.data() + a.i*p_neurons.size() + 1);
+      }
+      static inline void iter3(Neuron &n) { ++n.a.i; }
+      static inline void iter4(Neuron &n, DataType d, DataAccumType &a) { n.a.i = d; ++a; }
+    };
+
+    struct IB: public Iter {
+      static inline void init(Neuron &n, Iter::AccumType &a) { ++n.a.i; a.i = (AccumInt)(&n - p_neurons.data()); }
+      static inline void iter(Neuron &n, Weight &w, Iter::AccumType &a) {
+        if (w.i)
+          ++errors;
+        w.i = (WeightInt)((&n - c_neurons.data())*p_neurons.size() + a.i + 1);
+      }
+    };
+
+    {
+      Stage st("iterateNeurons");
+      init(cl);
+      for(int i = 0; i < threads; ++i)
+        iterateNeurons<I>(oclist[i], c_neurons.data());
+      verifyNeurons("check-neurons", cl, c_neurons.data());
+    }
+
+    {
+      Stage st("iterateNeurons2");
+      init(cl);
+      for(int i = 0; i < threads; ++i) {
+        int a = 5, aa = a + oclist[i].getActiveCount();
+        iterateNeurons2<I>(oclist[i], ocl, c_neurons.data(), 5, 3, &a);
+        if (a != aa) {
+          printf("wrong accum value %d, expected %d, tid: %d/%d\n", a, aa, i, threads);
+          oclist[i].printYXZ("sub layout");
+          ocl.printYXZ("orig layout");
+        }
+      }
+      verifyNeuronIndices("check-neuron-indices", cl, c_neurons.data(), 5, 3);
+    }
+
+    {
+      Stage st("iterateSimple");
+      init(cl, pl);
+      for(int i = 0; i < threads; ++i)
+        iterateSimple<I>(oclist[i], opl, ocl, c_neurons.data(), p_neurons.data(), weights.data());
+      verifyNeurons("check-neurons", cl, c_neurons.data());
+      verifyWeights("check-weights", cl, pl);
+    }
+
+    {
+      Stage st("iterateSimpleInv");
+      init(cl, pl);
+      for(int i = 0; i < threads; ++i)
+        iterateSimpleInv<IB>(oplist[i], ocl, opl, p_neurons.data(), c_neurons.data(), weights.data());
+      verifyNeurons("check-neurons", pl, p_neurons.data());
+      verifyWeights("check-weights", cl, pl);
+    }
+
+    return st;
+  }
+
+
+  static bool testIterators(const char *name, const Layout &cl, const Layout &pl) {
+    Stage st(name);
+
+    {
+      Stage st("plain");
+      testIterators( "single-thread", cl, pl, cl, pl, 1 );
+      testIterators( "2-threads",     cl, pl, cl, pl, 2 );
+      testIterators( "7-threads",     cl, pl, cl, pl, 7 );
+      testIterators( "8-threads",     cl, pl, cl, pl, 8 );
+      testIterators( "512-threads",   cl, pl, cl, pl, 512 );
+    }
+
+    {
+      Stage st("optimized");
+      Layout ocl = optimizeLayoutSimple(cl);
+      Layout opl = optimizeLayoutSimple(pl);
+      testIterators( "single-thread", cl, pl, ocl, opl, 1 );
+      testIterators( "2-threads",     cl, pl, ocl, opl, 2 );
+      testIterators( "7-threads",     cl, pl, ocl, opl, 7 );
+      testIterators( "8-threads",     cl, pl, ocl, opl, 8 );
+      testIterators( "512-threads",   cl, pl, ocl, opl, 512 );
+    }
+
+    return st;
+  }
+
+
+  static bool test(const char *name, const Layout &cl, const Layout &pl) {
+    Stage st(name);
+
+    testIterators(name, cl, pl);
+
+    {
+      Layer l(nullptr, pl);
+      new LayerSimple<funcSigmoidExp>(l, cl);
+      Test::testLayer("LayerSimple", l);
+    }
+
+    return st;
+  }
+
+
+  static bool test(const char *name = "simple") {
+    Stage st(name);
+    test("square-16x8", Layout(8, 8, 4), Layout(16, 16, 3));
+    test("random-rect", Layout( 7, 4, 3).expandX(1, 2).expandY(3, 1).expandZ(5, 8),
+                        Layout(13, 9, 4).expandX(2, 0).expandY(5, 3).expandZ(3, 1) );
+    return st;
+  }
+};
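+
+// Both test harnesses tag every weight with the 1-based pair index
+// w.i = ci*p_count + pi + 1, where ci/pi are flat neuron indices; 0 means
+// "never visited", so double visits and skipped links are both detectable,
+// and a bad tag can be decoded back into its (current, prev) pair. A tiny
+// worked round trip of that encoding (illustrative, kept disabled):
+#if 0
+#include <cstdio>
+int main() {
+  int p_count = 7;               // hypothetical prev-neuron count
+  int ci = 3, pi = 5;            // one (current, prev) neuron pair
+  int w = ci*p_count + pi + 1;   // tag stored in Weight::i -> 27
+  printf("tag %d -> ci %d, pi %d\n", w, (w - 1)/p_count, (w - 1)%p_count);
+  return 0;
+}
+#endif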
+
+
+#endif
diff --git a/projects/neural/layer.test.inc.cpp b/projects/neural/layer.test.inc.cpp
new file mode 100644
index 0000000..83327c2
--- /dev/null
+++ b/projects/neural/layer.test.inc.cpp
@@ -0,0 +1,253 @@
+#ifndef LAYER_TEST_INC_CPP
+#define LAYER_TEST_INC_CPP
+
+
+#include <thread>
+
+#include "layer.inc.cpp"
+
+
+
+class Test {
+public:
+  class Stage {
+  public:
+    const int errors;
+    inline explicit Stage(const char *name): errors(Test::errors) {
+      for(int i = 0; i < level; ++i) printf("- ");
+      printf("%s\n", name);
+      fflush(stdout);
+      ++level;
+    }
+    inline ~Stage() {
+      --level;
+      if (!*this) {
+        for(int i = 0; i < level; ++i) printf("- ");
+        printf("FAILED\n");
+      }
+      fflush(stdout);
+    }
+    operator bool() { return Test::errors == errors; }
+  };
+
+private:
+  static int level;
+
+protected:
+  static std::vector<Neuron> c_neurons;
+  static std::vector<Neuron> p_neurons;
+  static std::vector<Weight> weights;
+
+public:
+  static int errors;
+
+
+  static void init(int c_count, int p_count, int w_count) {
+    Neuron n = {};
+    Weight w = {};
+
+    c_neurons.clear();
+    p_neurons.clear();
+    weights.clear();
+    c_neurons.resize(c_count, n);
+    p_neurons.resize(p_count, n);
+    weights.resize(w_count, w);
+  }
+
+
+  static bool verifyNeurons(const char *name, const Layout &l, const Neuron *neurons, bool ignorePadded = false) {
+    Stage st(name);
+    for(int y = 0; y < l.sy; ++y)
+    for(int x = 0; x < l.sx; ++x)
+    for(int z = 0; z < l.sz; ++z) {
+      int n = neurons[ (y*l.sx + x)*l.sz + z ].a.i;
+      int i = x >= l.x0 && x < l.x1
+           && y >= l.y0 && y < l.y1
+           && z >= l.z0 && z < l.z1;
+      if (ignorePadded ? i && n != i : n != i) {
+        printf(
+          "wrong neuron mark %d, expected %d (%d, %d, %d)\n",
+          n, i, y, x, z );
+        l.printYXZ("layout");
+        ++errors;
+        return st;
+      }
+    }
+    return st;
+  }
+
+
+  static bool verifyNeuronIndices(const char *name, const Layout &l, const Neuron *neurons, int base = 1, int stride = 1) {
+    Stage st(name);
+    for(int y = 0; y < l.sy; ++y)
+    for(int x = 0; x < l.sx; ++x)
+    for(int z = 0; z < l.sz; ++z) {
+      bool active = x >= l.x0 && x < l.x1
+                 && y >= l.y0 && y < l.y1
+                 && z >= l.z0 && z < l.z1;
+
+      int n = neurons[ (y*l.sx + x)*l.sz + z ].a.i;
+      int i = (((y - l.y0)*l.getW() + x - l.x0)*l.getD() + z - l.z0)*stride + base;
+
+      if (!active) i = 0;
+
+      if (n != i) {
+        printf(
+          "wrong neuron mark %d, expected %d (%d, %d, %d)\n",
+          n, i, y, x, z );
+        l.printYXZ("layout");
+        ++errors;
+        return st;
+      }
+    }
+    return st;
+  }
+
+
+  static bool verifyNeuronsAccum(const Layout &l, Neuron *neurons, int accum = 1, bool ignoreBounds = false) {
+    for(int y = 0; y < l.sy; ++y)
+    for(int x = 0; x < l.sx; ++x)
+    for(int z = 0; z < l.sz; ++z) {
+      Neuron &n = neurons[ (y*l.sx + x)*l.sz + z ];
+      int i = ( x >= l.x0 && x < l.x1
+             && y >= l.y0 && y < l.y1
+             && z >= l.z0 && z < l.z1 )*accum;
+      if (ignoreBounds) i = accum;
+      if (n.v != 0 && n.v != i) {
+        printf(
+          "wrong neuron mark %g, expected 0 or %d (%d, %d, %d)\n",
+          n.v, i, y, x, z );
+        l.printYXZ("layout");
+        ++errors;
+        return false;
+      }
+      if (n.v) n.a.i = 1;
+      n.v = 0;
+    }
+    return true;
+  }
+
+
+  static bool testLayer(const char *name, Layer &l) {
+    Stage st(name);
+
+    assert(l.next);
+    Layer &p = l;
+    Layer &c = *l.next;
+
+
+    struct H {
+      Layer &p;
+      Layer &c;
+
+      std::vector<std::thread*> threads;
+      std::atomic<unsigned int> counter;
+
+      H(Layer &p, Layer &c): p(p), c(c), counter(0) { }
+
+      void prepareData() {
+        memcpy(c.neurons, c_neurons.data(), c.neuronsCount*sizeof(Neuron));
+        memcpy(p.neurons, p_neurons.data(), p.neuronsCount*sizeof(Neuron));
+        memcpy(c.weights, weights.data(), c.weightsCount*sizeof(Weight));
+      }
+
+      void applyDelta() {
+        for(int i = 0; i < c.neuronsCount; ++i)
+          c.neurons[i].d *= c_neurons[i].v - c.neurons[i].v;
+      }
+
+      void func(int tid) {
+        Barrier barrier(counter, tid, threads.size());
+        c.pass(barrier);
+        barrier.wait();
+        if (!tid) applyDelta();
+        barrier.wait();
+        c.backpassDeltas(barrier);
+        barrier.wait();
+        c.backpassWeights(barrier);
+      }
+
+      bool test(const char *name, int threadsCount) {
+        Stage st(name);
+
+        assert(threadsCount > 0);
+
+        counter = 0;
+        threads.clear();
+        threads.resize(threadsCount, nullptr);
+
+        prepareData();
+
+        p.split(threadsCount);
+        c.split(threadsCount);
+        for(int i = 1; i < threadsCount; ++i) threads[i] = new std::thread(&H::func, this, i);
+        func(0);
+        for(int i = 1; i < threadsCount; ++i) { threads[i]->join(); delete threads[i]; }
+        threads.clear();
+
+        for(int i = 0; i < c.neuronsCount; ++i) {
+          NeuronReal a = c.neurons[i].v;
+          NeuronReal b = c_neurons[i + c.neuronsCount].v;
+          if (fabs(a - b) > 1e-6)
+            { printf("result differs at neuron %d, was %g, expected %g\n", i, a, b); ++errors; break; }
+        }
+
+        for(int i = 0; i < p.neuronsCount; ++i) {
+          NeuronReal a = p.neurons[i].d;
+          NeuronReal b = p_neurons[i + p.neuronsCount].d;
+          if (fabs(a - b) > 1e-6)
+            { printf("delta differs at neuron %d, was %g, expected %g\n", i, a, b); ++errors; break; }
+        }
+
+        for(int i = 0; i < c.weightsCount; ++i) {
+          WeightReal a = c.weights[i].w;
+          WeightReal b = weights[i + c.weightsCount].w;
+          if (fabs(a - b) > 1e-6)
+            { printf("weight differs at %d, was %g, expected %g\n", i, a, b); ++errors; break; }
+        }
+
+        if (!st) {
+          p.layout.printYXZ("prev layout");
+          c.layout.printYXZ("curr layout");
+        }
+
+        return st;
+      }
+    } h(p, c);
+
+    // make base data
+
+    init(c.neuronsCount*2, p.neuronsCount*2, c.weightsCount*2);
+    for(int i = 0; i < c.neuronsCount; ++i) c_neurons[i].v = rand()/(NeuronReal)RAND_MAX;
+    for(int i = 0; i < p.neuronsCount; ++i) p_neurons[i].v = rand()/(NeuronReal)RAND_MAX;
+    memcpy(weights.data(), c.weights, c.weightsCount*sizeof(Weight));
+
+    h.prepareData();
+    c.testPass();
+    h.applyDelta();
+    c.testBackpass();
+
+    memcpy(&c_neurons[c.neuronsCount], c.neurons, c.neuronsCount*sizeof(Neuron));
+    memcpy(&p_neurons[p.neuronsCount], p.neurons, p.neuronsCount*sizeof(Neuron));
+    memcpy(&weights[c.weightsCount], c.weights, c.weightsCount*sizeof(Weight));
+
+    h.test("single-thread", 1);
+    h.test("2-threads", 2);
+    h.test("7-threads", 7);
+    h.test("8-threads", 8);
+    //h.test("512-threads", 512);
+
+    return st;
+  }
+};
+
+
+int Test::level = 0;
+std::vector<Neuron> Test::c_neurons;
+std::vector<Neuron> Test::p_neurons;
+std::vector<Weight> Test::weights;
+int Test::errors = 0;
+
+
+
+#endif
diff --git a/projects/neural/layout.inc.cpp b/projects/neural/layout.inc.cpp
new file mode 100644
index 0000000..83a78a0
--- /dev/null
+++ b/projects/neural/layout.inc.cpp
@@ -0,0 +1,129 @@
+#ifndef LAYOUT_INC_CPP
+#define LAYOUT_INC_CPP
+
+
+#include <cstdio>
+#include <cassert>
+
+#include <vector>
+#include <algorithm>
+
+
+struct Layout {
+  typedef std::vector<Layout> List;
+
+  int sx, sy, sz;
+  int x0, x1;
+  int y0, y1;
+  int z0, z1;
+
+  inline Layout(): sx(), sy(), sz(), x0(), x1(), y0(), y1(), z0(), z1() { }
+
+  explicit inline Layout(int sx, int sy = 1, int sz = 1):
+    sx(sx), sy(sy), sz(sz), x0(), x1(sx), y0(), y1(sy), z0(), z1(sz) { }
+
+
+  inline Layout& expandX  (int e0, int e1) { return sx += e0+e1, x0 += e0, x1 += e0, *this; }
+  inline Layout& expandY  (int e0, int e1) { return sy += e0+e1, y0 += e0, y1 += e0, *this; }
+  inline Layout& expandZ  (int e0, int e1) { return sz += e0+e1, z0 += e0, z1 += e0, *this; }
+  inline Layout& expandXY (int e0, int e1) { return expandX (e0, e1).expandY(e0, e1); }
+  inline Layout& expandXYZ(int e0, int e1) { return expandXY(e0, e1).expandZ(e0, e1); }
+  inline Layout& expandX  (int e) { return expandX  (e, e); }
+  inline Layout& expandY  (int e) { return expandY  (e, e); }
+  inline Layout& expandZ  (int e) { return expandZ  (e, e); }
+  inline Layout& expandXY (int e) { return expandXY (e, e); }
+  inline Layout& expandXYZ(int e) { return expandXYZ(e, e); }
+
+  inline Layout& padX  (int p0, int p1) { return x0 += p0, x1 -= p1, *this; }
+  inline Layout& padY  (int p0, int p1) { return y0 += p0, y1 -= p1, *this; }
+  inline Layout& padZ  (int p0, int p1) { return z0 += p0, z1 -= p1, *this; }
+  inline Layout& padXY (int p0, int p1) { return padX (p0, p1).padY(p0, p1); }
+  inline Layout& padXYZ(int p0, int p1) { return padXY(p0, p1).padZ(p0, p1); }
+  inline Layout& padX  (int p) { return padX  (p, p); }
+  inline Layout& padY  (int p) { return padY  (p, p); }
+  inline Layout& padZ  (int p) { return padZ  (p, p); }
+  inline Layout& padXY (int p) { return padXY (p, p); }
+  inline Layout& padXYZ(int p) { return padXYZ(p, p); }
+
+
+  inline int getW() const { return x1 - x0; }
+  inline int getH() const { return y1 - y0; }
+  inline int getD() const { return z1 - z0; }
+
+  inline int getCount() const { return sx*sy*sz; }
+  inline int getActiveCount() const { return getW()*getH()*getD(); }
+
+
+  inline operator bool() const {
+    return x0 >= 0 && x0 < x1 && x1 <= sx
+        && y0 >= 0 && y0 < y1 && y1 <= sy
+        && z0 >= 0 && z0 < z1 && z1 <= sz;
+  }
+
+
+  inline bool isSameSizeWith(const Layout &b) const
+    { return sx == b.sx && sy == b.sy && sz == b.sz; }
+  inline bool isSameActiveSizeWith(const Layout &b) const
+    { return getW() == b.getW() && getH() == b.getH() && getD() == b.getD(); }
+
+  inline bool isSubLayoutOf(const Layout &b) const
+    { return isSameSizeWith(b) && b.x0 <= x0 && x0 < x1 && x1 <= b.x1
+                               && b.y0 <= y0 && y0 < y1 && y1 <= b.y1
+                               && b.z0 <= z0 && z0 < z1 && z1 <= b.z1; }
+  inline bool isParentLayoutOf(const Layout &b) const
+    { return b.isSubLayoutOf(*this); }
+
+
+  void splitX(List &list, int count) const {
+    if (count <= 0) return list.clear();
+    list.resize(count);
+    int v = x0, s = x1 - v;
+    for(int i = 0; i < count; ++i) {
+      Layout &l = list[i] = *this;
+      l.x0 = v;
+      l.x1 = (v += s/count + (i < s%count));
+    }
+  }
+
+  void splitY(List &list, int count) const {
+    if (count <= 0) return list.clear();
+    list.resize(count);
+    int v = y0, s = y1 - v;
+    for(int i = 0; i < count; ++i) {
+      Layout &l = list[i] = *this;
+      l.y0 = v;
+      l.y1 = (v += s/count + (i < s%count));
+    }
+  }
+
+  void splitZ(List &list, int count) const {
+    if (count <= 0) return list.clear();
+    list.resize(count);
+    int v = z0, s = z1 - v;
+    for(int i = 0; i < count; ++i) {
+      Layout &l = list[i] = *this;
+      l.z0 = v;
+      l.z1 = (v += s/count + (i < s%count));
+    }
+  }
+
+  void split(List &list, int count) const {
+    int h = getH(), w = getW(), d = getD();
+    if (h >= w && h >= d) splitY(list, count); else
+    if (w >= d)           splitX(list, count); else
+                          splitZ(list, count);
+  }
+
+  void print(const char *prefix = nullptr) const {
+    if (prefix && *prefix) printf("%s: ", prefix);
+    printf("x: %d (%d-%d), y: %d (%d-%d), z: %d (%d-%d)\n", sx, x0, x1, sy, y0, y1, sz, z0, z1);
+  }
+  void printYXZ(const char *prefix = nullptr) const {
+    if (prefix && *prefix) printf("%s: ", prefix);
+    printf("y: %d (%d-%d), x: %d (%d-%d), z: %d (%d-%d)\n", sy, y0, y1, sx, x0, x1, sz, z0, z1);
+  }
+};
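+
+// A Layout is a padded box: (sx, sy, sz) is the allocated size and
+// [x0,x1)x[y0,y1)x[z0,z1) the active region. expand* grows the allocation
+// around the active cells; pad* shrinks the active region inside a fixed
+// allocation. A small worked sketch (illustrative, kept disabled):
+#if 0
+#include <cstdio>
+int main() {
+  Layout l = Layout(128, 128, 4).expandXY(2);
+  l.print("padded");  // x: 132 (2-130), y: 132 (2-130), z: 4 (0-4)
+  printf("allocated %d, active %d\n", l.getCount(), l.getActiveCount());
+  // allocated 132*132*4 = 69696, active 128*128*4 = 65536
+  return 0;
+}
+#endif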
+
+
+
+
+#endif
diff --git a/projects/neural/train.digit.inc.cpp b/projects/neural/train.digit.inc.cpp
new file mode 100644
index 0000000..9b3bc9e
--- /dev/null
+++ b/projects/neural/train.digit.inc.cpp
@@ -0,0 +1,94 @@
+#ifndef TRAIN_DIGIT_INC_CPP
+#define TRAIN_DIGIT_INC_CPP
+
+
+#include "train.inc.cpp"
+#include "layer.simple.inc.cpp"
+
+
+class TrainerDigit: public Trainer {
+protected:
+  std::vector<unsigned char> data;
+  std::vector<int> shuffle;
+  Layout ofl, obl;
+  Layout::List oflist, oblist;
+  int stride, count;
+
+public:
+  TrainerDigit(): stride(), count() { }
+
+  bool loadSymbolMap(const char *filename) {
+    data.clear();
+
+    FILE *f = fopen(filename, "rb");
+    if (!f)
+      return printf("cannot open file for read: %s\n", filename), false;
+    fseek(f, 0, SEEK_END);
+    size_t fs = ftello(f);
+    fseek(f, 0, SEEK_SET);
+
+    data.resize(fs, 0);
+    if (!fread(data.data(), fs, 1, f))
+      return printf("cannot read from file: %s\n", filename), fclose(f), data.clear(), false;
+
+    fclose(f);
+    return true;
+  }
+
+protected:
+  bool prepare() override {
+    ofl = optimizeLayoutSimple(fl->layout);
+    obl = optimizeLayoutSimple(bl->layout);
+    ofl.split(oflist, threadsCount);
+    obl.split(oblist, threadsCount);
+    stride = ofl.getActiveCount() + 1;
+    count = data.size()/stride;
+    if (count <= 0) return false;
+    shuffle.resize(count);
+    for(int i = 0; i < count; ++i)
+      shuffle[i] = i;
+    return true;
+  }
+
+
+  bool prepareBlock() override {
+    int cnt = itersPerBlock > count ? count : itersPerBlock;
+    for(int i = 0; i < cnt; ++i) {
+      int j = rand()%count;
+      if (i != j) std::swap(shuffle[i], shuffle[j]);
+    }
+    return true;
+  }
+
+
+  void loadData(Barrier &barrier, int, int iter) override {
+    struct I: public Iter {
+      typedef const unsigned char* DataType;
+      static inline void iter4(Neuron &n, DataType d, DataAccumType&) { n.v = *d/(NeuronReal)255; }
+    };
+    const unsigned char *id = data.data() + shuffle[iter%count]*stride;
+    iterateNeurons2<I>(oflist[barrier.tid], ofl, fl->neurons, id);
+  }
+
+
+  AccumReal verifyDataMain(int, int iter) override {
+    struct I: public Iter {
+      typedef int DataType;
+      struct DataAccumType { int ri, mi; NeuronReal m; };
+      static inline void iter4(Neuron &n, DataType d, DataAccumType &a) {
+        NeuronReal v1 = d == a.ri;
+        NeuronReal v0 = n.v;
+        n.d *= v1 - v0;
+        if (a.m < v0) { a.m = v0; a.mi = d; }
+      }
+    };
+
+    I::DataAccumType a = { data[ (shuffle[iter%count] + 1)*stride - 1 ], 0, 0 };
+    iterateNeurons2<I>(obl, obl, bl->neurons, 0, 1, &a);
+
+    return a.mi != a.ri;
+  }
+};
+
+
+#endif
diff --git a/projects/neural/train.image.inc.cpp b/projects/neural/train.image.inc.cpp
new file mode 100644
index 0000000..d7dd9a2
--- /dev/null
+++ b/projects/neural/train.image.inc.cpp
@@ -0,0 +1,204 @@
+#ifndef TRAIN_IMAGE_INC_CPP
+#define TRAIN_IMAGE_INC_CPP
+
+
+#include "train.inc.cpp"
+#include "layer.simple.inc.cpp"
+
+
+class TrainerImage: public Trainer {
+protected:
+  std::vector<unsigned char> data;
+  std::vector<int> shuffle;
+  const char *datafile;
+  const char *outfile;
+  Layout ofl, obl;
+  Layout::List oflist, oblist;
+  int stride, count;
+
+public:
+  TrainerImage(): datafile(), outfile(), stride(), count() { }
+
+  bool configure(const char *datafile, const char *outfile) {
+    this->datafile = datafile;
+    this->outfile = outfile;
+    return true;
+  }
+
+  // The original patch lost this function's header; the body matches
+  // TrainerDigit::loadSymbolMap, so it is restored here under an assumed name.
+  bool loadImageData(const char *filename) {
+    data.clear();
+
+    FILE *f = fopen(filename, "rb");
+    if (!f)
+      return printf("cannot open file for read: %s\n", filename), false;
+    fseek(f, 0, SEEK_END);
+    size_t fs = ftello(f);
+    fseek(f, 0, SEEK_SET);
+
+    data.resize(fs, 0);
+    if (!fread(data.data(), fs, 1, f))
+      return printf("cannot read from file: %s\n", filename), fclose(f), data.clear(), false;
+
+    fclose(f);
+    return true;
+  }
+
+
+  // Legacy single-threaded trainer kept for reference: it targets an older
+  // Layer interface (l.size, l.a, bl.da, l.pass(), bl.backpass(), l.save(file))
+  // and a tgaSave() helper that are not part of this patch, so it is disabled.
+#if 0
+  void imgTrain(Layer &l, const char *datafile, int size, const char *outfile, double trainRatio, int count) {
+    Layer &bl = l.back();
+
+    assert(!l.prev);
+    assert(datafile);
+    assert(count > 0 && size > 0);
+    assert(l.size == size);
+    assert(bl.size == size);
+
+    int blockSize = 1000; //1024*1024*1024/size;
+    assert(blockSize > 0);
+
+    FILE *f = fopen(datafile, "rb");
+    if (!f)
+      { printf("cannot open file: %s\n", datafile); return; }
+    fseeko64(f, 0, SEEK_END);
+    long long fsize = ftello64(f);
+    int xCount = (int)(fsize/size);
+    if (xCount <= 0)
+      { printf("no tests in file: %s\n", datafile); return; }
+
+    int *block = new int[blockSize*2];
+    int *shuffle = block + blockSize;
+    double *results = new double[blockSize];
+    unsigned char *blockData = new unsigned char[(blockSize + 1)*size];
+    unsigned char *blockResData = blockData + blockSize*size;
+    bool err = false;
+
+    for(int j = 0; j < blockSize; ++j)
+      { shuffle[j] = j; results[j] = 0; }
+
+    int blocksCount = (count - 1)/blockSize + 1;
+
+    printf("training %d (%d x %d blocks), tests: %d, ratio: %f:\n", blocksCount*blockSize, blocksCount, blockSize, xCount, trainRatio);
+
+    double avgSum = 0;
+    for(int i = 0; i < blocksCount; ++i) {
+      for(int j = 0; j < blockSize; ++j) {
+        block[j] = rand()%xCount;
+        std::swap(shuffle[i], shuffle[rand()%blockSize]);
+      }
+      std::sort(block, block + blockSize);
+
+      for(int j = 0; j < blockSize; ++j) {
+        fseeko64(f, block[j]*(long long)size, SEEK_SET);
+        if (!fread(blockData + j*size, size, 1, f))
+          { printf("cannot read data from file: %s\n", datafile); err = true; break; }
+      }
+      if (err) break;
+
+      printf("  next data block loaded\n");
+
+      double sumQ = 0;
+      for(int j = 0; j < blockSize; ++j) {
+        unsigned char *data = blockData + shuffle[j]*size;
+        for(double *ia = l.a, *e = ia + l.size; ia < e; ++ia, ++data)
+          *ia = *data/255.0;
+
+        double firstQ = 0, q = 0;
+        for(int repeat = 0; repeat < 1; ++repeat) {
+          l.pass();
+
+          for(double *ia = l.a, *iba = bl.a, *ibda = bl.da, *e = ia + l.size; ia < e; ++ia, ++iba, ++ibda) {
+            double d = *ia - *iba;
+            *ibda = d;
+            q += d*d;
+          }
+          q /= size;
+          if (!repeat) firstQ = q;
+
+          bl.backpass(trainRatio);
+        }
+
+        sumQ += firstQ;
+        avgSum += firstQ - results[j];
+        results[j] = firstQ;
+        int avgCnt = i ? blockSize : j + 1;
+        printf("  %4d: total: %6d, avg result: %f, last result: %f -> %f\n", j+1, i*blockSize+j+1, avgSum/avgCnt, firstQ, q);
+      }
+
+      printf("%4d: total: %6d, avg result: %f\n", i+1, (i+1)*blockSize, sumQ/blockSize);
+
+      if (outfile && !l.save(outfile))
+        { printf("cannot save neural network weights to file: %s\n", outfile); err = true; break; }
+
+      unsigned char *data = blockResData;
+      for(double *iba = bl.a, *e = iba + bl.size; iba < e; ++iba, ++data)
+        *data = (unsigned char)(*iba*255.999);
+      tgaSave("data/output/sampleX.tga", blockData + shuffle[blockSize-1]*size, 256, 256, 3);
+      tgaSave("data/output/sampleY.tga", blockResData, 256, 256, 3);
+    }
+
+    delete[] block;
+    delete[] results;
+    delete[] blockData;
+
+    printf("finished\n");
+  }
+#endif
+
+
+protected:
+  bool prepare() override {
+    ofl = optimizeLayoutSimple(fl->layout);
+    obl = optimizeLayoutSimple(bl->layout);
+    assert(ofl && obl);
+    assert(ofl.getActiveCount() == obl.getActiveCount());
+
+    ofl.split(oflist, threadsCount);
+    obl.split(oblist, threadsCount);
+    stride = ofl.getActiveCount() + 1;
+    count = data.size()/stride;
+    if (count <= 0) return false;
+    shuffle.resize(count);
+    for(int i = 0; i < count; ++i)
+      shuffle[i] = i;
+    return true;
+  }
+
+
+  bool prepareBlock() override {
+    int cnt = itersPerBlock > count ? count : itersPerBlock;
+    for(int i = 0; i < cnt; ++i) {
+      int j = rand()%count;
+      if (i != j) std::swap(shuffle[i], shuffle[j]);
+    }
+    return true;
+  }
+
+
+  void loadData(Barrier &barrier, int, int iter) override {
+    struct I: public Iter {
+      typedef const unsigned char* DataType;
+      static inline void iter4(Neuron &n, DataType d, DataAccumType&) { n.v = *d/(NeuronReal)255; }
+    };
+    const unsigned char *id = data.data() + shuffle[iter%count]*stride;
+    iterateNeurons2<I>(oflist[barrier.tid], ofl, fl->neurons, id);
+  }
+
+
+  AccumReal verifyDataMain(int, int iter) override {
+    struct I: public Iter {
+      typedef int DataType;
+      struct DataAccumType { int ri, mi; NeuronReal m; };
+      static inline void iter4(Neuron &n, DataType d, DataAccumType &a) {
+        NeuronReal v1 = d == a.ri;
+        NeuronReal v0 = n.v;
+        n.d *= v1 - v0;
+        if (a.m < v0) { a.m = v0; a.mi = d; }
+      }
+    };
+
+    I::DataAccumType a = { data[ (shuffle[iter%count] + 1)*stride - 1 ], 0, 0 };
+    iterateNeurons2<I>(obl, obl, bl->neurons, 0, 1, &a);
+
+    return a.mi != a.ri;
+  }
+};
+
+
+#endif
diff --git a/projects/neural/train.inc.cpp b/projects/neural/train.inc.cpp
new file mode 100644
index 0000000..8806713
--- /dev/null
+++ b/projects/neural/train.inc.cpp
@@ -0,0 +1,195 @@
+#ifndef NNTRAIN_INC_CPP
+#define NNTRAIN_INC_CPP
+
+
+#include <chrono>
+#include <thread>
+
+
+#include "layer.inc.cpp"
+
+
+long long timeUs() {
+  static std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
+  return (long long)std::chrono::duration_cast<std::chrono::microseconds>( std::chrono::steady_clock::now() - begin ).count();
+}
+
+
+class Trainer {
+private:
+  std::atomic<unsigned int> barrierCounter;
+  std::vector<AccumReal> qualities;
+
+public:
+  Layer *layer;
+  AccumReal ratio;
+  int threadsCount;
+  int itersPerBlock;
+  int blocksPerSaving;
+  int blocksCount;
+  AccumReal qmin;
+
+protected:
+  volatile bool doBackpassAtThisIteration;
+  Layer *fl;
+  Layer *bl;
+
+  virtual bool prepare() { return true; }
+  virtual bool prepareBlock() { return true; }
+  virtual void finishBlock() { }
+  virtual void finish() { }
+
+  virtual void loadData(Barrier &barrier, int block, int iter) { }
+  virtual AccumReal verifyData(Barrier &barrier, int block, int iter) { return 0; }
+
+  virtual void loadDataMain(int block, int iter) { }
+  virtual AccumReal verifyDataMain(int block, int iter) { return 0; }
+
+private:
+  void threadFunc(int tid, int block) {
+    Barrier barrier(barrierCounter, tid, threadsCount);
+
+    volatile AccumReal &sumQ = qualities[tid] = 0;
+    for(int i = 0; i < itersPerBlock; ++i) {
+      barrier.wait();
+      loadData(barrier, block, i);
+      barrier.wait();
+      if (!tid) loadDataMain(block, i);
+
+      for(Layer *l = fl->next; l; l = l->next) {
+        barrier.wait();
+        l->pass(barrier);
+      }
+
+      barrier.wait();
+      sumQ += verifyData(barrier, block, i);
+      barrier.wait();
+      if (!tid) {
+        doBackpassAtThisIteration = true;
+        sumQ += verifyDataMain(block, i);
+      }
+
+      barrier.wait();
+      if (ratio > 0 && doBackpassAtThisIteration) {
+        for(Layer *l = bl; l->prev && l->prev->prev; l = l->prev) {
+          barrier.wait();
+          l->backpassDeltas(barrier);
+        }
+        for(Layer *l = bl; l->prev; l = l->prev) {
+          barrier.wait();
+          l->backpassWeights(barrier);
+        }
+      }
+    }
+  }
+
+
+  AccumReal runThreads(int block) {
+    barrierCounter = 0;
+    std::vector<std::thread*> t(threadsCount, nullptr);
+    for(int i = 1; i < threadsCount; ++i)
+      t[i] = new std::thread(&Trainer::threadFunc, this, i, block);
+    threadFunc(0, block);
+
+    AccumReal result = qualities[0];
+    for(int i = 1; i < threadsCount; ++i)
+      { t[i]->join(); delete t[i]; result += qualities[i]; }
+    return result / itersPerBlock;
+  }
+
+
+public:
+  Trainer():
+    barrierCounter(0),
+    layer(),
+    ratio(),
+    threadsCount(1),
+    itersPerBlock(100),
+    blocksPerSaving(),
+    blocksCount(1000),
+    qmin(),
+    doBackpassAtThisIteration(),
+    fl(),
+    bl() { }
+
+
+  Trainer& configure(
+    Layer &layer,
+    AccumReal ratio,
+    int threadsCount,
+    int itersPerBlock,
+    int blocksPerSaving,
+    int blocksCount,
+    AccumReal qmin )
+  {
+    this->layer = &layer;
+    this->ratio = ratio;
+    this->threadsCount = threadsCount;
+    this->itersPerBlock = itersPerBlock;
+    this->blocksPerSaving = blocksPerSaving;
+    this->blocksCount = blocksCount;
+    this->qmin = qmin;
+    return *this;
+  }
+
+
+  AccumReal run() {
+    assert(layer && !layer->prev && layer->next);
+    assert(threadsCount > 0);
+    assert(itersPerBlock > 0);
+
+    printf("training: threads %d, itersPerBlock %d, ratio: %lf\n", threadsCount, itersPerBlock, ratio);
+
+    fl = layer;
+    bl = &layer->back();
+
+    qualities.clear();
+    qualities.resize(threadsCount, 0);
+    for(Layer *l = layer; l; l = l->next)
+      l->split(threadsCount);
+
+    if (!prepare())
+      return printf("cannot prepare\n"), -1;
+
+    AccumReal result = -1;
+    long long fullTimeStartUs = timeUs();
+    int i = 0;
+    while(true) {
+      if (!prepareBlock()) {
+        printf("cannot prepare block\n");
+        result = -1;
+        break;
+      }
+
+      long long runTimeUs = timeUs();
+      result = runThreads(i);
+      runTimeUs = timeUs() - runTimeUs;
+
+      finishBlock();
+
+      long long t = timeUs();
+      long long fullTimeUs = t - fullTimeStartUs;
+      fullTimeStartUs = t;
+      ++i;
+
+      printf("%4d, total %7d, avg.result %f, time: %f / %f\n", i, i*itersPerBlock, result, runTimeUs*0.000001, fullTimeUs*0.000001);
+
+      bool done = (blocksCount > 0 && i >= blocksCount) || result <= qmin;
+
+      if (ratio > 0 && (blocksPerSaving <= 0 || i%blocksPerSaving == 0 || done) && !layer->save()) {
+        printf("saving failed\n");
+        result = -1;
+        break;
+      }
+
+      if (done) break;
+    }
+
+    finish();
+
+    return result;
+  }
+};
+
+
+#endif
diff --git a/projects/neural/trainer.cpp b/projects/neural/trainer.cpp
new file mode 100644
index 0000000..51149f5
--- /dev/null
+++ b/projects/neural/trainer.cpp
@@ -0,0 +1,47 @@
+
+
+#include <ctime>
+
+#include "layer.all.inc.cpp"
+#include "layer.all.test.inc.cpp"
+#include "train.digit.inc.cpp"
+
+
+bool runTests() {
+  if (!AllTest::test()) return false;
+  return printf("success\n"), true;
+}
+
+
+int main() {
+  srand(time(NULL));
+
+  //return !runTests();
+
+  #define FILENAME "data/output/weights.bin" // 28x28
+
+  printf("create neural network\n");
+  Layer l(nullptr, Layout(28, 28)); l.filename = FILENAME "1";
+  //(new LayerSimple<funcSigmoidExp>(l, Layout(256)))->filename = FILENAME "2";
+  //(new LayerSimple<funcSigmoidExp>(l, Layout(64)))->filename = FILENAME "3";
+  //(new LayerSimple<funcSigmoidExp>(l, Layout(128)))->filename = FILENAME "4";
+  //(new LayerSimple<funcSigmoidExp>(l, Layout(64)))->filename = FILENAME "5";
+  //(new LayerSimple<funcSigmoidExp>(l, Layout(128)))->filename = FILENAME "5";
+  (new LayerSimple<funcSigmoidExp>(l, Layout(32)))->filename = FILENAME "6";
+  //(new LayerSimple<funcSigmoidExp>(l, Layout(16)))->filename = FILENAME "7";
+  (new LayerSimple<funcSigmoidExp>(l, Layout(10)))->filename = FILENAME "8";
+
+  l.sumStat().print();
+
+  printf("load training data\n");
+  TrainerDigit t;
+  if (!t.loadSymbolMap("data/symbols-data.bin")) return 1;
+
+  //printf("try load previously saved network\n"); l.load();
+
+  printf("train\n");
+  t.configure(l, 0.5, 4, 1000000, 0, 0, 0.0000001).run();
+
+  return 0;
+}
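+
+
+// A convolutional variant of the network above, built from the same pieces;
+// the feature-map size and kernel are illustrative and chosen to satisfy the
+// geometry asserts in iterateConvolution ((12-1)*2 + 0 + 5 = 27 <= 28).
+// Kept disabled: it would replace main() above.
+#if 0
+int main() {
+  srand(time(NULL));
+
+  Layer l(nullptr, Layout(28, 28));
+  // 12x12x4 feature map; Kernel(5, 2, 0) = 5x5 window, stride 2, no offset
+  new LayerConv<funcSigmoidExp>(l, Layout(12, 12, 4), Kernel(5, 2, 0));
+  new LayerSimple<funcSigmoidExp>(l, Layout(10));
+  l.sumStat().print();
+
+  TrainerDigit t;
+  if (!t.loadSymbolMap("data/symbols-data.bin")) return 1;
+  t.configure(l, 0.5, 4, 1000, 0, 1000, 0.0000001).run();
+  return 0;
+}
+#endif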