From e4740de5c46356293ceb8931c5ca0fb8574e564b Mon Sep 17 00:00:00 2001
From: Ivan Mahonin
Date: Apr 01 2023 12:41:10 +0000
Subject: benchmark

---

diff --git a/projects/neural/benchmark.inc.cpp b/projects/neural/benchmark.inc.cpp
new file mode 100644
index 0000000..8cffeb1
--- /dev/null
+++ b/projects/neural/benchmark.inc.cpp
@@ -0,0 +1,257 @@
+#ifndef BENCHMARK_INC_CPP
+#define BENCHMARK_INC_CPP
+
+
+#include "common.inc.cpp"
+#include "layer.conv.inc.cpp"
+
+
+class Benchmark: public ThreadControl {
+private:
+  typedef int Int;
+  typedef double Float;
+
+  int repeats;
+  int mode;
+  Layout pl, cl;
+  Kernel k;
+  std::vector<Float> pvalues;
+  std::vector<Float> cvalues;
+  std::vector<Float> weights;
+
+
+  __attribute__((always_inline))
+  void threadFuncXYCP(Barrier &barrier, int k_sx, int k_sy, int c_d, int p_d) {
+    Layout pl = this->pl;
+    Layout cl = this->cl;
+    Kernel k = this->k;
+
+    assert(k.sx == k_sx);
+    assert(k.sy == k_sy);
+    assert(cl.getD() == c_d);
+    assert(pl.getD() == p_d);
+
+    Int tid = barrier.tid;
+    Int ts = barrier.threads;
+
+    Int c_w = cl.getW();
+    Int c_h = cl.getH();
+    Int c_wd = c_w*c_d;
+    Int c_hw = c_h*c_w;
+    Int c_hwd = c_h*c_wd;
+    Int c_sxz = cl.sx*cl.sz;
+
+    Int p_sxz = pl.sx*pl.sz;
+    Int p_szk = pl.sz*k.dx;
+    Int p_sxzk = p_sxz*k.dy;
+
+    //Int k_sxd = k_sx*p_d;
+    Int k_syx = k_sx*k_sy;
+    Int k_syxd = k_syx*p_d;
+
+    Float *cvalues = this->cvalues.data() + cl.y0*c_sxz + cl.x0*cl.sz + cl.z0;
+    Float *pvalues = this->pvalues.data() + (pl.y0 + k.oy)*p_sxz + (pl.x0 + k.ox)*pl.sz + pl.z0;
+    Float *weights = this->weights.data();
+
+    if (mode == 0) {
+      for(Int i = repeats; i; --i) {
+        for(Int i = tid; i < c_hwd; i += ts) {
+          Int cy = i/c_wd;
+          Int cx = i%c_wd/c_d;
+          Int cz = i%c_d;
+
+          Float *ic = &cvalues[ cy*c_sxz + cx*cl.sz + cz ];
+          Float *ip = &pvalues[ cy*p_sxzk + cx*p_szk ];
+          Float *iw = &weights[ cz*k_syxd ];
+
+          Float a = 0;
+
+          for(Int i = 0; i < p_d; ++i, ++ip, ++iw)
+          for(Int i = 0; i < k_syx; ++i) {
+            Int ky = i/k_sx;
+            Int kx = i%k_sx;
+            a += iw[i] * ip[ ky*p_sxz + kx*pl.sz ];
+          }
+
+          *ic = a;
+        }
+        barrier.wait2();
+      }
+    } else
+    if (mode == 1) {
+      if (c_w > 1 || c_h > 1 || pl.sx != pl.getW() || pl.sz != p_d) {
+        for(Int i = repeats; i; --i)
+        for(Int i = 0; i < k_syx; ++i) {
+          Int ky = i/k_sx;
+          Int kx = i%k_sx;
+          //Int pi = ky*p_sxz + kx*pl.sz;
+          //Int wi = i*p_d;
+          Float *ip = &pvalues[ ky*p_sxz + kx*pl.sz ];
+          Float *iw = &weights[ i*p_d ];
+          for(Int i = tid; i < c_hw; i += ts) {
+            Int cy = i/c_w;
+            Int cx = i%c_w;
+            Float *iip = &ip[ cy*p_sxzk + cx*p_szk ];
+            Float *iic = &cvalues[ cy*c_sxz + cx*cl.sz ];
+
+            for(Int cz = 0; cz < c_d; ++cz) {
+              //Int pii = pi + cy*p_sxzk + cx*p_szk;
+              //Int wii = wi + cz*k_syxd;
+              Float *iiw = &iw[ cz*k_syxd ];
+              Float a = iic[cz];
+
+              for(Int i = 0; i < p_d; ++i)
+                iip[i] += iiw[i] * a;
+                //pvalues[pii + i] += weights[wii + i] * a;
+            }
+          }
+          barrier.wait2();
+        }
+      } else {
+        //Int c_dd = c_d*p_d;
+        //Int c_wdd = c_wd*p_d;
+        //Int c_hwdd = c_hwd*p_d;
+        Int cnt = k_syxd/ts;
+        for(Int i = repeats; i; --i) {
+          for(Int i = cnt*tid, e = cnt*(tid+1); i < e; ++i) { // this thread's slice of k_syxd
+            //Int ky = i/k_sxd;
+            //Int kx = i%k_sxd/p_d;
+            //Int pz = i%p_d;
+            //Float *ip = &pvalues[ i ];//ky*p_sxz + kx*pl.sz + pz ];
+            Float *iw = &weights[ i*c_d ];
+            Float a = 0;
+            for(Int cz = 0; cz < c_d; ++cz) {
+              a += iw[ cz ] * cvalues[ cz ]; //*k_syxd
+            }
+            pvalues[ i ] = a;
+            //*ip = a;
+          }
+          barrier.wait2();
+        }
+      }
+    }
+  }
+
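+  // The dispatchers below re-invoke the always_inline worker with common depths
+  // as compile-time constants (the case lists mirror the sizes used in run()),
+  // letting the compiler emit specialized, unrollable inner loops; unlisted
+  // sizes fall back to the generic call with runtime values.
+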
+  __attribute__((always_inline))
+  void threadFuncXYC(Barrier &barrier, int k_sx, int k_sy, int c_d, int p_d) {
+    switch(p_d) {
+    case  3: return threadFuncXYCP( barrier, k_sx, k_sy, c_d,  3 );
+    case  4: return threadFuncXYCP( barrier, k_sx, k_sy, c_d,  4 );
+    case 24: return threadFuncXYCP( barrier, k_sx, k_sy, c_d, 24 );
+    case 48: return threadFuncXYCP( barrier, k_sx, k_sy, c_d, 48 );
+    }
+    threadFuncXYCP( barrier, k_sx, k_sy, c_d, p_d );
+  }
+
+  __attribute__((always_inline))
+  void threadFuncXY(Barrier &barrier, int k_sx, int k_sy, int c_d, int p_d) {
+    switch(c_d) {
+    case  3: return threadFuncXYC( barrier, k_sx, k_sy,  3, p_d );
+    case  4: return threadFuncXYC( barrier, k_sx, k_sy,  4, p_d );
+    case 24: return threadFuncXYC( barrier, k_sx, k_sy, 24, p_d );
+    case 48: return threadFuncXYC( barrier, k_sx, k_sy, 48, p_d );
+    }
+    threadFuncXYC( barrier, k_sx, k_sy, c_d, p_d );
+  }
+
+  void threadFunc(Barrier &barrier) override {
+    Int k_sx = k.sx;
+    Int k_sy = k.sy;
+    Int c_d = cl.getD();
+    Int p_d = pl.getD();
+
+    if (k_sy == k_sx) switch(k_sx) {
+    case 4: return threadFuncXY( barrier, 4, 4, c_d, p_d );
+    }
+    threadFuncXY( barrier, k_sx, k_sy, c_d, p_d );
+  }
+
+
+  void init(int mode, Layout pl, Layout cl, Kernel k, long long totalLinks) {
+    assert(pl);
+    assert(cl);
+    assert(k);
+    assert(totalLinks > 0);
+    assert(0 <= pl.x0 + k.ox && (cl.getW()-1)*k.dx + k.ox + k.sx <= pl.sx);
+    assert(0 <= pl.y0 + k.oy && (cl.getH()-1)*k.dy + k.oy + k.sy <= pl.sy);
+
+    this->mode = mode;
+    this->pl = pl;
+    this->cl = cl;
+    this->k = k;
+
+    pvalues.resize(pl.getCount());
+    cvalues.resize(cl.getCount());
+    weights.resize(cl.getD()*k.sx*k.sy*pl.getD());
+
+    for(int i = 0; i < (int)pvalues.size(); ++i) pvalues[i] = rand();
+    for(int i = 0; i < (int)cvalues.size(); ++i) cvalues[i] = rand();
+    for(int i = 0; i < (int)weights.size(); ++i) weights[i] = rand();
+
+    long long links = weights.size() * cl.getW() * cl.getH();
+    repeats = (totalLinks - 1)/links + 1; // ceil(totalLinks/links)
+
+    //printf( "benchmark init: prev %lld, curr %lld, links %lld, repeats %d, total links: %lld\n",
+    //  (long long)pvalues.size(), (long long)cvalues.size(), links, repeats, links*repeats );
+  }
+
+
+  void run(const char *name, int threadsCount, int mode, Layout pl, Layout cl, Kernel k, long long totalLinks) {
+    init(mode, pl, cl, k, totalLinks);
+
+    volatile long long t0 = timeUs();
+    runThreads(threadsCount);
+    volatile long long t1 = timeUs();
+
+    Float sum = 0;
+    for(int i = 0; i < (int)pvalues.size(); ++i) sum += pvalues[i];
+    for(int i = 0; i < (int)cvalues.size(); ++i) sum += cvalues[i];
+    for(int i = 0; i < (int)weights.size(); ++i) sum += weights[i];
+    printf("%s %d: %f, %lld\n", name, mode, (t1 - t0)*1e-6, (long long)sum);
+  }
+
+
+  void run(const char *name, int threadsCount, Layout pl, Layout cl, Kernel k, long long totalLinks) {
+    for(int mode = 0; mode < 2; ++mode)
+      run(name, threadsCount, mode, pl, cl, k, totalLinks);
+  }
+
+
+public:
+  void run(int threadsCount = 1) {
+    //printf("run benchmark: %d\n", threadsCount);
+    /*
+    run( "514x3 -> 258x24", threadsCount,
+         Layout(514, 514, 3).expandXY(2),
+         Layout(258, 258, 24).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );
+    run( "258x24 -> 130x48", threadsCount,
+         Layout(258, 258, 24).expandXY(2),
+         Layout(130, 130, 48).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );
+    run( "130x48 -> 66x96 ", threadsCount,
+         Layout(130, 130, 48).expandXY(2),
+         Layout( 66, 66, 96).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );
+    run( "66x96 -> 34x144", threadsCount,
+         Layout( 66, 66, 96).expandXY(2),
+         Layout(34, 34, 144).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );
+    run( "34x144 -> 18x216", threadsCount,
+         Layout(34, 34, 144).expandXY(2),
+         Layout(18, 18, 216).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );
+    run( "18x216 -> 10x324", threadsCount,
+         Layout(18, 18, 216).expandXY(2),
+         Layout(10, 10, 324).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );
+    run( "10x324 -> 6x486 ", threadsCount,
+         Layout(10, 10, 324).expandXY(2),
+         Layout( 6, 6, 486).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );
+    run( "6x486 -> 4x729 ", threadsCount,
+         Layout( 6, 6, 486).expandXY(2),
+         Layout( 4, 4, 729).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );
+    */
+    run( "4x768 -> 1x1093 ", threadsCount,
+         Layout( 4, 4, 768).expandXY(0),
+         Layout( 1, 1, 1093).expandXY(0), Kernel(4, 2, 0), 10ll*1000*1000*1000 );
+  }
+};
+
+
+
+#endif
diff --git a/projects/neural/benchmark.segment.inc.cpp b/projects/neural/benchmark.segment.inc.cpp
new file mode 100644
index 0000000..713a0cb
--- /dev/null
+++ b/projects/neural/benchmark.segment.inc.cpp
@@ -0,0 +1,90 @@
+
+#include "segment.inc.cpp"
+#include "segment.test.inc.cpp"
+#include "segment.cx4.inc.cpp"
+
+
+class BenchmarkSegment {
+public:
+  static std::vector<NeuronReal> values;
+  static std::vector<Weight> weights;
+
+  static void runSegment(const char *name, Segment &s, int threads, int repeats, bool backpass) {
+    class H: public ThreadControl {
+    public:
+      Segment &s;
+      int repeats;
+      NeuronReal ratio;
+      std::vector<Quality> qualities;
+
+      H(Segment &s, int repeats, NeuronReal ratio): s(s), repeats(repeats), ratio(ratio) { }
+
+      void threadFunc(Barrier &barrier) override {
+        Segment &s = this->s;
+        Layout l = s.layout;
+        NeuronReal r = ratio;
+        int sx = l.getW() - s.sx + 1;
+        int sy = l.getH() - s.sy + 1;
+        int sz = l.getD() - s.sz + 1;
+
+        Quality q;
+        for(int i = 0; i < repeats; ++i) {
+          int x = l.x0 + barrier.commonRand()%sx;
+          int y = l.y0 + barrier.commonRand()%sy;
+          int z = l.z0 + barrier.commonRand()%sz;
+          q += s.pass(barrier, x, y, z, r);
+          barrier.wait();
+        }
+        qualities[barrier.tid] = q;
+      }
+    } h(s, repeats, backpass ? 1 : 0);
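+    // ratio 1 makes each pass() run the training stages too; ratio 0 stops
+    // after the forward pass (Segment::pass skips work when trainRatio <= 0)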
+
+    int cnt = s.layout.getCount();
+    values.resize(cnt);
+    weights.resize(s.weightsCount);
+    s.f_values = values.data();
+    s.weights = weights.data();
+
+    for(int i = 0; i < cnt; ++i)
+      s.f_values[i] = rand()/(NeuronReal)RAND_MAX;
+    for(int i = 0; i < s.weightsCount; ++i)
+      s.weights[i].w = rand()/(WeightReal)RAND_MAX;
+    h.qualities.resize(threads);
+
+    volatile long long t0 = timeUs();
+    h.runThreads(threads);
+    volatile long long t1 = timeUs();
+
+    AccumReal sum = 0;
+    for(int i = 0; i < threads; ++i) sum += h.qualities[i].train + h.qualities[i].human;
+    for(int i = 0; i < s.weightsCount; ++i) sum += s.weights[i].w;
+
+    printf("%s %d: %f, %lld\n", name, (int)backpass, (t1 - t0)*1e-6, (long long)sum);
+  }
+
+
+  static void runSegment(const char *name, Segment &s, int threads, int repeats) {
+    runSegment(name, s, threads, repeats, false);
+    runSegment(name, s, threads, repeats, true);
+  }
+
+
+  static void runSegment(const char *name, Segment &s, int threads) {
+    int repeats = (int)( (1ll*1000*1000*1000 + s.effectiveLinks/2)/s.effectiveLinks );
+    if (!SegmentTest::testSegment(name, s)) return;
+    runSegment(name, s, threads, repeats, false);
+    runSegment(name, s, threads, repeats, true);
+  }
+
+
+  static void run(int threads) {
+    { SegmentCx4 s(3, 24); s.layout = Layout(514, 514, 3).expandXY(2);
+      runSegment("cx4-3x24", s, threads); }
+  }
+};
+
+
+std::vector<NeuronReal> BenchmarkSegment::values;
+std::vector<Weight> BenchmarkSegment::weights;
+
diff --git a/projects/neural/common.inc.cpp b/projects/neural/common.inc.cpp
index 40efc35..03f43af 100644
--- a/projects/neural/common.inc.cpp
+++ b/projects/neural/common.inc.cpp
@@ -12,9 +12,12 @@
 #include 
 #include 
 #include 
-#include 
 #include 
+#include <thread>
+#include <mutex>
+#include <condition_variable>
+
 
 #include "layout.inc.cpp"
@@ -36,6 +39,10 @@ inline unsigned int randomBranch(unsigned int seed)
 
 inline void busyloop(unsigned int count)
   { while(count--) __asm__ __volatile__(""); }
+inline void sleep()
+  { std::this_thread::sleep_for(std::chrono::nanoseconds(0)); }
+inline void sleepUs(long long us)
+  { std::this_thread::sleep_for(std::chrono::microseconds(us)); }
 
 inline long long timeUs()
 {
@@ -75,29 +82,82 @@ struct Iter {
 
 
 
+class Barrier;
+
+class ThreadControl {
+private:
+  friend class Barrier;
+
+  std::mutex mutex;
+  std::condition_variable cond;
+  std::atomic<unsigned int> counter;
+  std::vector<std::thread*> threads;
+  unsigned int commonSeed;
+
+  void runSingleThread(unsigned int tid, unsigned int seed);
+
+protected:
+  virtual void threadFunc(Barrier&) { }
+
+public:
+  ThreadControl(): counter(0), commonSeed() { }
+
+  void runThreads(unsigned int threadsCount = 1) {
+    assert(threadsCount);
+    counter = 0;
+    threads.clear();
+    threads.resize(threadsCount);
+    commonSeed = rand();
+    for(unsigned int i = 1; i < threadsCount; ++i)
+      threads[i] = new std::thread(&ThreadControl::runSingleThread, this, i, rand());
+    runSingleThread(0, rand());
+    for(unsigned int i = 1; i < threadsCount; ++i)
+      { threads[i]->join(); delete threads[i]; }
+    threads.clear();
+  }
+};
+
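+// Typical use, as in Benchmark above and the test/train helpers in this patch:
+//   class H: public ThreadControl {
+//     void threadFunc(Barrier &barrier) override { ...parallel work... }
+//   };
+//   H(...).runThreads(threadsCount);
+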
 
 class Barrier {
 private:
-  std::atomic<unsigned int> &counter;
+  ThreadControl &owner;
   unsigned int next;
   unsigned int busyseed;
 public:
   const unsigned int tid;
   const unsigned int threads;
   unsigned int seed;
+  unsigned int commonSeed;
 
   Barrier(const Barrier&) = delete;
-  inline Barrier(std::atomic<unsigned int> &counter, unsigned int tid, unsigned int threads, unsigned int seed):
-    counter(counter), next(), busyseed(randomBranch(seed)), tid(tid), threads(threads), seed(seed) { assert(tid < threads); }
+  inline Barrier(ThreadControl &owner, unsigned int tid, unsigned int seed, unsigned int commonSeed):
+    owner(owner), next(), busyseed(randomBranch(seed)), tid(tid), threads(owner.threads.size()), seed(seed), commonSeed(commonSeed)
+    { assert(tid < threads); }
 
   //inline void busyloop() { }
   inline void busyloop(unsigned int maxCycles = 4096)
     { ::busyloop( (busyseed = randomNext(busyseed))%maxCycles ); }
   inline unsigned int rand()
     { return seed = randomNext(seed); }
-  inline void wait() { next += threads; ++counter; while(counter < next) busyloop(); }
-  inline void subwait() { while(counter < next + tid) busyloop(); }
+  inline unsigned int commonRand()
+    { return commonSeed = randomNext(commonSeed); }
+
+  inline void wait() { next += threads; ++owner.counter; while(owner.counter < next) busyloop(); }
+  inline void subwait() { while(owner.counter < next + tid) busyloop(); }
+
+  // sleeping barrier: blocks on a condition variable instead of spinning
+  inline void wait2() {
+    next += threads;
+    std::unique_lock<std::mutex> lock(owner.mutex);
+    if (++owner.counter == next) owner.cond.notify_all(); else
+      while(owner.counter < next) owner.cond.wait(lock);
+  }
+  // spinning barrier that yields to the scheduler between polls
+  inline void wait3() { next += threads; ++owner.counter; while(owner.counter < next) sleepUs(1); }
 };
 
 
+void ThreadControl::runSingleThread(unsigned int tid, unsigned int seed) {
+  Barrier barrier(*this, tid, seed, commonSeed);
+  threadFunc(barrier);
+}
+
+
 
 struct Stat {
   int neurons;
   int activeNeurons;
diff --git a/projects/neural/layer.inc.cpp b/projects/neural/layer.inc.cpp
index fe357de..ebdc838 100644
--- a/projects/neural/layer.inc.cpp
+++ b/projects/neural/layer.inc.cpp
@@ -152,31 +152,22 @@ public:
 
 
   void passFull(const Layer *last = nullptr, int threadsCount = 1) {
-    struct H {
+    class H: public ThreadControl {
+    public:
       Layer &layer;
       const Layer *last;
-      std::atomic<unsigned int> barrierCounter;
-      std::vector<std::thread*> threads;
 
-      H(Layer &layer, const Layer *last, int threadsCount): layer(layer), last(last), barrierCounter(0), threads(threadsCount) { }
+      H(Layer &layer, const Layer *last): layer(layer), last(last) { }
 
-      void func(int tid, unsigned int seed) {
-        Barrier barrier(barrierCounter, tid, threads.size(), seed);
+      void threadFunc(Barrier &barrier) override {
         for(Layer *l = layer.next; l; l = l->next) {
           l->pass(barrier);
           if (l == last || !l->next) break;
           barrier.wait();
         }
       }
-    } h(*this, last, threadsCount);
-
-    for(Layer *l = this; l; l = l->next)
-      l->split(threadsCount);
-    for(int i = 1; i < threadsCount; ++i)
-      h.threads[i] = new std::thread(&H::func, &h, i, rand());
-    h.func(0, rand());
-    for(int i = 1; i < threadsCount; ++i)
-      { h.threads[i]->join(); delete h.threads[i]; }
+    };
+    H(*this, last).runThreads(threadsCount);
   }
 };
diff --git a/projects/neural/layer.test.inc.cpp b/projects/neural/layer.test.inc.cpp
index a12c154..c99db49 100644
--- a/projects/neural/layer.test.inc.cpp
+++ b/projects/neural/layer.test.inc.cpp
@@ -3,6 +3,7 @@
 
 
 #include "test.inc.cpp"
+#include "layer.inc.cpp"
 
 
 
@@ -16,14 +17,12 @@ public:
 
     Layer &c = *l.next;
 
-    struct H {
+    class H: public ThreadControl {
+    public:
       Layer &p;
       Layer &c;
 
-      std::vector<std::thread*> threads;
-      std::atomic<unsigned int> counter;
-
-      H(Layer &p, Layer &c): p(p), c(c), counter(0) { }
+      H(Layer &p, Layer &c): p(p), c(c) { }
 
       void fillLayout(Layout l, Neuron *neurons) {
         for(int y = 0; y < l.sy; ++y)
@@ -53,11 +52,10 @@ public:
         c.neurons[i].d *= c_neurons[i].v - c.neurons[i].v;
       }
 
-      void func(int tid, unsigned int seed) {
-        Barrier barrier(counter, tid, threads.size(), seed);
+      void threadFunc(Barrier &barrier) override {
         c.pass(barrier);
         barrier.wait();
-        if (!tid) applyDelta();
+        if (!barrier.tid) applyDelta();
         barrier.wait();
         c.backpassDeltas(barrier);
         barrier.wait();
@@ -69,18 +67,11 @@ public:
       assert(threadsCount > 0);
 
-      counter = 0;
-      threads.clear();
-      threads.resize(threadsCount, nullptr);
-
       prepareData();
       p.split(threadsCount);
       c.split(threadsCount);
 
-      for(int i = 1; i < threadsCount; ++i) threads[i] = new std::thread(&H::func, this, i, rand());
-      func(0, rand());
-      for(int i = 1; i < threadsCount; ++i) { threads[i]->join(); delete threads[i]; }
-      threads.clear();
+      runThreads(threadsCount);
 
       for(int i = 0; i < c.neuronsCount; ++i) {
         NeuronReal a = c.neurons[i].v;
diff --git a/projects/neural/segment.conv.inc.cpp b/projects/neural/segment.conv.inc.cpp
new file mode 100644
index 0000000..40b114a
--- /dev/null
+++ b/projects/neural/segment.conv.inc.cpp
@@ -0,0 +1,297 @@
+#ifndef SEGMENT_CONV_INC_CPP
+#define SEGMENT_CONV_INC_CPP
+
+
+#include "segment.inc.cpp"
+#include "func.inc.cpp"
+#include "layer.conv.inc.cpp"
+
+
+class SegmentConv: public Segment {
+public:
+  enum {
+    KSX = 4,
+    KSY = 4,
+  };
+
+  const int msx, msy, msz;
+
+  NeuronReal *m_values;
+  NeuronReal *b_values;
+
+  SegmentConv(int sx, int sy, int sz, int msz, Weight *weights = nullptr):
+    Segment(sx, sy, sz, msz*KSY*KSX*sz, weights), msx((sx - KSX)/2 + 1), msy((sy - KSY)/2 + 1), msz(msz)
+  {
+    assert(msx > 0);
+    assert(msy > 0);
+    assert(msz > 0);
+    m_values = new NeuronReal[msx*msy*msz + sx*sy*sz];
+    b_values = m_values + msx*msy*msz;
+    clear();
+  }
+  ~SegmentConv()
+    { delete[] m_values; }
+
+
+  void clear() override
+    { memset(m_values, 0, sizeof(*m_values)*(msx*msy*msz + sx*sy*sz)); }
+
+
+  inline void check(int x, int y, int z) {
+    Segment::check(x, y, z);
+    assert(layout.getD() == sz);
+  }
+
+
+
+  Quality pass(Barrier &barrier, int x, int y, int z, NeuronReal trainRatio) override {
+    check(x, y, z);
+    Layout l = layout;
+    NeuronReal *f_values = this->f_values + (y*l.sx + x)*l.sz + z;
+    return pass(barrier, f_values, trainRatio);
+  }
+
+  __attribute__((always_inline))
+  inline Quality pass(Barrier &barrier, NeuronReal *f_values, NeuronReal trainRatio) {
+    Layout l = layout;
+    int tid = barrier.tid;
+    int threads = barrier.threads;
+
+    int sx = this->sx;
+    //int sy = this->sy;
+    int sz = this->sz;
+    int msx = this->msx;
+    int msy = this->msy;
+    int msz = this->msz;
+    int msxz = msx*msz;
+
+    int ksxz = KSX*sz;
+    int ksyxz = KSY*ksxz;
+
+    int fv_dkx = l.sz - sz;
+    int fv_dky = (l.sx - KSX)*l.sz;
+    int fv_dmx = 2*l.sz;
+    int fv_dmy = 2*(l.sx - msx)*l.sz;
+
+    int mn_dtz = threads - msx*msy*msz;
+
+    // stage 1: pass from front to mid
+
+    int f_sxz = l.sx*l.sz;
+    int f_sz2 = l.sz*2;
+    int f_sxz2 = l.sx*f_sz2;
+
+    int m_cnt = msx*msy*msz;
+    int mi0 = m_cnt*tid/threads;
+    int mi1 = m_cnt*(tid+1)/threads;
+
+    for(int i = mi0; i < mi1; ++i) {
+      int my = i/msxz;
+      int mx = i%msxz/msz;
+      int mz = i%msz;
+
+      AccumReal a = 0;
+      int wi = mz*ksyxz; // weights are indexed by mz only, matching testPass below
+      int fvi = my*f_sxz2 + mx*f_sz2;
+      for(int ky = 0; ky < KSY; ++ky, fvi += f_sxz, wi += ksxz) {
+        Weight *iw = &weights[wi];
+        NeuronReal *ifv = &f_values[fvi];
+        for(int i = 0; i < ksxz; ++i)
+          a += ifv[i]*iw[i].w;
+      }
+
+      m_values[i] = a > 0 ? a : 0;
+    }
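+    // every thread must finish writing its m_values slice before stage 2 reads them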
+
+    barrier.wait();
+
+    // stage 2: pass from mid to back and verify
+
+    AccumReal qa = 0;
+    for(int by = 2 + tid; by < 10; by += threads)
+    for(int bx = 2; bx < 10; ++bx)
+    for(int bz = 0; bz < sz; ++bz) {
+      AccumReal a = 0;
+      Neuron &bn = b_neurons[ (by*sx + bx)*sz + bz ];
+
+      for(int ky = by%2; ky < KSY; ky += 2)
+      for(int kx = bx%2; kx < KSX; kx += 2) {
+        int mx = (bx - kx)/2;
+        int my = (by - ky)/2;
+        assert(mx >= 0 && mx < msx && (bx - kx)%2 == 0);
+        assert(my >= 0 && my < msy && (by - ky)%2 == 0);
+        for(int mz = 0; mz < msz; ++mz) {
+          Neuron &mn = m_neurons[ (my*msx + mx)*msz + mz ];
+          Weight &w = weights[ ((mz*KSY + ky)*KSX + kx)*sz + bz ];
+          a += mn.v * w.w;
+        }
+      }
+
+      if (a > 0) bn.v = a, bn.d = 1; else bn.v = bn.d = 0;
+
+      NeuronReal fn = f_values[ (by*l.sx + bx)*l.sz + bz ];
+      NeuronReal d = fn - bn.v;
+      bn.d *= d*trainRatio;
+      qa += d*d;
+    }
+    Quality q(qa/(64*sz));
+
+    if (trainRatio <= 0) return q;
+
+    barrier.wait();
+
+    // stage 3: backpass deltas
+
+    for(int mz = tid; mz < msz; mz += threads)
+    for(int my = 1; my < 4; ++my)
+    for(int mx = 1; mx < 4; ++mx) {
+      AccumReal a = 0;
+      Neuron &mn = m_neurons[ (my*msx + mx)*msz + mz ];
+
+      for(int ky = 0; ky < KSY; ++ky)
+      for(int kx = 0; kx < KSX; ++kx)
+      for(int kz = 0; kz < sz; ++kz) {
+        int bx = mx*2 + kx;
+        int by = my*2 + ky;
+        Neuron &bn = b_neurons[ (by*sx + bx)*sz + kz ];
+        Weight &w = weights[ ((mz*KSY + ky)*KSX + kx)*sz + kz ];
+        a += bn.d * w.w;
+      }
+      mn.d *= a;
+    }
+
+    barrier.wait();
+
+    // stage 4: update weights
+
+    for(int mz = tid; mz < msz; mz += threads)
+    for(int by = 4; by < 8; ++by)
+    for(int bx = 4; bx < 8; ++bx)
+    for(int bz = 0; bz < sz; ++bz) {
+      Neuron &bn = b_neurons[ (by*sx + bx)*sz + bz ];
+      NeuronReal fv = f_values[ (by*l.sx + bx)*l.sz + bz ];
+
+      for(int ky = by%2; ky < KSY; ky += 2)
+      for(int kx = bx%2; kx < KSX; kx += 2) {
+        int mx = (bx - kx)/2;
+        int my = (by - ky)/2;
+        assert(mx >= 1 && mx < 4 && (bx - kx)%2 == 0);
+        assert(my >= 1 && my < 4 && (by - ky)%2 == 0);
+        Neuron &mn = m_neurons[ (my*msx + mx)*msz + mz ];
+        Weight &w = weights[ ((mz*KSY + ky)*KSX + kx)*sz + bz ];
+        w.w += bn.d*mn.v + mn.d*fv;
+      }
+    }
+
+    return q;
+  }
+
+
+
+  Quality testPass(int x, int y, int z, NeuronReal trainRatio) override {
+    check(x, y, z);
+
+    Layout l = layout;
+
+    // stage 1: pass
+
+    clear();
+
+    for(int my = 0; my < msy; ++my)
+    for(int mx = 0; mx < msx; ++mx)
+    for(int mz = 0; mz < msz; ++mz) {
+      AccumReal a = 0;
+      for(int ky = 0; ky < KSY; ++ky)
+      for(int kx = 0; kx < KSX; ++kx)
+      for(int kz = 0; kz < sz; ++kz) {
+        int fx = x + mx*2 + kx;
+        int fy = y + my*2 + ky;
+        int fz = z + kz;
+        NeuronReal fv = f_values[ (fy*l.sx + fx)*l.sz + fz ];
+        Weight &w = weights[ ((mz*KSY + ky)*KSX + kx)*sz + kz ];
+        a += fv * w.w;
+      }
+
+      NeuronReal &mv = m_values[ (my*msx + mx)*msz + mz ];
+      if (a < 0) { mv = 0; continue; }
+      mv = a;
+
+      for(int ky = 0; ky < KSY; ++ky)
+      for(int kx = 0; kx < KSX; ++kx)
+      for(int kz = 0; kz < sz; ++kz) {
+        int bx = mx*2 + kx;
+        int by = my*2 + ky;
+        int bz = kz;
+        NeuronReal &bv = b_values[ (by*sx + bx)*sz + bz ];
+        Weight &w = weights[ ((mz*KSY + ky)*KSX + kx)*sz + kz ];
+        bv += a * w.w;
+      }
+    }
+
+    // stage 2: finalize values and verify
+
+    AccumReal qa = 0;
+    for(int by = 0; by < sy; ++by)
+    for(int bx = 0; bx < sx; ++bx)
+    for(int bz = 0; bz < sz; ++bz) {
+      NeuronReal fn = f_values[ ((y + by)*l.sx + x + bx)*l.sz + z + bz ];
+      NeuronReal &bv = b_values[ (by*sx + bx)*sz + bz ];
+      if (bv > 0) {
+        NeuronReal d = fn - bv;
+        bv = d*trainRatio;
+        qa += d*d;
+      } else {
+        bv = 0;
+        qa += fn*fn;
+      }
+    }
+    Quality q(qa/(KSX*KSY*sz));
+
+    if (trainRatio <= 0) return q;
+
+    // stage 3: backpass deltas and update weights
+
+    for(int my = 0; my < msy; ++my)
+    for(int mx = 0; mx < msx; ++mx)
+    for(int mz = 0; mz < msz; ++mz) {
+      NeuronReal mv = m_values[ (my*msx + mx)*msz + mz ];
+      if (!mv) continue;
+
+      AccumReal a = 0;
+      for(int ky = 0; ky < KSY; ++ky)
+      for(int kx = 0; kx < KSX; ++kx)
+      for(int kz = 0; kz < sz; ++kz) {
+        int bx = mx*2 + kx;
+        int by = my*2 + ky;
+        int bz = kz;
+        NeuronReal bv = b_values[ (by*sx + bx)*sz + bz ];
+        Weight &w = weights[ ((mz*KSY + ky)*KSX + kx)*sz + kz ];
+        a += bv * w.w;
+      }
+
+      for(int ky = 0; ky < KSY; ++ky)
+      for(int kx = 0; kx < KSX; ++kx)
+      for(int kz = 0; kz < sz; ++kz) {
+        int bx = mx*2 + kx;
+        int by = my*2 + ky;
+        int bz = kz;
+        NeuronReal fv = f_values[ ((y + by)*l.sx + x + bx)*l.sz + z + bz ];
+        NeuronReal bv = b_values[ (by*sx + bx)*sz + bz ];
+        Weight &w = weights[ ((mz*KSY + ky)*KSX + kx)*sz + kz ];
+        w.w += bv*mv + fv*a;
+      }
+    }
+
+    return q;
+  }
+
+
+  bool saveDemo() override
+    { return !filename || saveConvDemoImage(filename, msz, KSX, KSY, sz, weights); }
+};
+
+
+
+
+#endif
+
diff --git a/projects/neural/segment.cx4.inc.cpp b/projects/neural/segment.cx4.inc.cpp
index 61234ba..517688b 100644
--- a/projects/neural/segment.cx4.inc.cpp
+++ b/projects/neural/segment.cx4.inc.cpp
@@ -54,15 +54,20 @@ public:
     int threads = barrier.threads;
 
     int sx = this->sx;
-    int sy = this->sy;
+    //int sy = this->sy;
    int sz = this->sz;
     int msx = this->msx;
     int msy = this->msy;
     int msz = this->msz;
 
     int ksxyz = ksx*ksy*sz;
+
     int fv_dkx = l.sz - sz;
     int fv_dky = (l.sx - ksx)*l.sz;
+    int fv_dmx = 2*l.sz;
+    int fv_dmy = 2*(l.sx - MSX)*l.sz;
+
+    int mn_dtz = threads - msx*msy*msz;
 
     NeuronReal *f_values = this->f_values + (y*l.sx + x)*l.sz + z;
 
@@ -72,9 +77,9 @@ public:
     Neuron *imn = m_neurons + tid;
     NeuronReal *ifv = f_values;
 
-    for(int mz = tid; mz < msz; mz += threads, iw += threads*ksxyz, imn += threads - msx*msy*msz, ifv = f_values)
-    for(int my = 0; my < MSY; ++my, ifv += 2*(l.sx - MSX)*l.sz)
-    for(int mx = 0; mx < MSX; ++mx, imn += msz, ifv += 2*l.sz) {
+    for(int mz = tid; mz < msz; mz += threads, iw += threads*ksxyz, imn += mn_dtz, ifv = f_values)
+    for(int my = 0; my < MSY; ++my, ifv += fv_dmy)
+    for(int mx = 0; mx < MSX; ++mx, imn += msz, ifv += fv_dmx) {
       AccumReal a = 0;
 
       Weight *iiw = iw;
diff --git a/projects/neural/segment.inc.cpp b/projects/neural/segment.inc.cpp
index ad4e3ea..badd35a 100644
--- a/projects/neural/segment.inc.cpp
+++ b/projects/neural/segment.inc.cpp
@@ -9,11 +9,12 @@ class Segment: public WeightHolder {
 public:
   const int sx, sy, sz;
+  int effectiveLinks; // links touched by one pass; benchmarks size repeat counts from it
 
   Layout layout;
   NeuronReal *f_values;
 
   Segment(int sx, int sy, int sz, int weightsCount, Weight *weights = nullptr):
-    WeightHolder(weightsCount, weights), sx(sx), sy(sy), sz(sz), f_values() { }
+    WeightHolder(weightsCount, weights), sx(sx), sy(sy), sz(sz), effectiveLinks(weightsCount), f_values() { }
 
   virtual ~Segment() { }
diff --git a/projects/neural/segment.layer.inc.cpp b/projects/neural/segment.layer.inc.cpp
new file mode 100644
index 0000000..0dc5d55
--- /dev/null
+++ b/projects/neural/segment.layer.inc.cpp
@@ -0,0 +1,73 @@
+#ifndef SEGMENT_LAYER_INC_CPP
+#define SEGMENT_LAYER_INC_CPP
+
+
+#include "segment.inc.cpp"
+#include "layer.inc.cpp"
+
+
+class SegmentLayer: public Segment {
+public:
+  Layer *fl;
+  Layer *bl;
+
+  SegmentLayer(Layer &fl, Layer &bl):
+    Segment(fl.layout.getW(), fl.layout.getH(), fl.layout.getD(), fl.weightsCount, fl.weights),
+    fl(&fl), bl(&bl) // members must be bound here; pass() dereferences them
+  {
+    assert(fl.layout.getW() == bl.layout.getW());
+    assert(fl.layout.getH() == bl.layout.getH());
+    assert(fl.layout.getD() == bl.layout.getD());
+    filename = fl.filename;
+  }
+
+  SegmentLayer(Layer &layer): SegmentLayer(layer, layer.back()) { }
+
+
+  Quality pass(Barrier &barrier, int x, int y, int z, NeuronReal trainRatio) override {
+    check(x, y, z);
+
+    // copy values
+
+    // pass
+
+    Layer *ffl = fl;
+    for(Layer *l = fl->next; l; l = l->next) {
+      if (!l->skipTrain) ffl = l;
+      barrier.wait();
+      l->pass(barrier);
+      if (l == bl) break;
+      if (!l->next) return Quality::nan();
+    }
+
+    // verify
+
+    Quality q;
+
+    if (trainRatio <= 0) return q;
+
+    // back pass deltas
+
+    for(Layer *l = bl; l != ffl; l = l->next) {
+      barrier.wait();
+      l->backpassDeltas(barrier);
+    }
+
+    // back pass weights
+
+    for(Layer *l = bl; l; l = l->next) {
+      if (!l->skipTrain) {
+        barrier.wait();
+        l->backpassDeltas(barrier);
+      }
+    }
+
+    return q;
+  }
+};
+
+
+
+
+#endif
+
diff --git a/projects/neural/segment.test.inc.cpp b/projects/neural/segment.test.inc.cpp
index 1b7da72..93151c5 100644
--- a/projects/neural/segment.test.inc.cpp
+++ b/projects/neural/segment.test.inc.cpp
@@ -12,45 +12,36 @@ public:
   static bool testSegment(const char *name, Segment &segment, Layout l, int x, int y, int z, NeuronReal trainRatio) {
     Stage st(name);
 
-    struct H {
+    class H: public ThreadControl {
+    public:
       Segment &segment;
       int x, y, z;
       Quality testQ;
       NeuronReal ratio;
-
-      std::vector<std::thread*> threads;
       std::vector<Quality> qualities;
-      std::atomic<unsigned int> counter;
 
-      H(Segment &segment, int x, int y, int z, NeuronReal ratio): segment(segment), x(x), y(y), z(z), ratio(ratio), counter(0) { }
+      H(Segment &segment, int x, int y, int z, NeuronReal ratio): segment(segment), x(x), y(y), z(z), ratio(ratio) { }
 
       void prepareData()
         { memcpy(segment.weights, weights.data(), segment.weightsCount*sizeof(Weight)); }
 
-      void func(int tid, unsigned int seed) {
-        Barrier barrier(counter, tid, threads.size(), seed);
-        qualities[tid] = segment.pass(barrier, x, y, z, ratio);
-      }
+      void threadFunc(Barrier &barrier) override
+        { qualities[barrier.tid] = segment.pass(barrier, x, y, z, ratio); }
 
       bool test(const char *name, int threadsCount) {
         Stage st(name);
         assert(threadsCount > 0);
 
-        counter = 0;
-        threads.clear();
         qualities.clear();
-        threads.resize(threadsCount, nullptr);
         qualities.resize(threadsCount);
 
         prepareData();
         segment.split(threadsCount);
-        for(int i = 1; i < threadsCount; ++i) threads[i] = new std::thread(&H::func, this, i, rand());
-        func(0, rand());
+        runThreads(threadsCount);
 
-        Quality q = qualities[0];
-        for(int i = 1; i < threadsCount; ++i) { threads[i]->join(); delete threads[i]; q += qualities[i]; }
-        threads.clear();
+        Quality q;
+        for(int i = 0; i < threadsCount; ++i) q += qualities[i];
 
         if ( fabs(q.train - testQ.train) > 1e-10
           || fabs(q.human - testQ.human) > 1e-10 )
@@ -103,6 +94,15 @@ public:
 
     return st;
   }
+
+
+  static bool testSegment(const char *name, Segment &segment) {
+    Layout l = segment.layout;
+    int x = l.x0 + rand()%(l.getW() - segment.sx + 1);
+    int y = l.y0 + rand()%(l.getH() - segment.sy + 1);
+    int z = l.z0 + rand()%(l.getD() - segment.sz + 1);
+    return testSegment(name, segment, l, x, y, z, 0.1);
+  }
 };
diff --git a/projects/neural/test.all.inc.cpp b/projects/neural/test.all.inc.cpp
index 227f226..a3d22e2 100644
--- a/projects/neural/test.all.inc.cpp
+++ b/projects/neural/test.all.inc.cpp
@@ -12,8 +12,8 @@ class AllTest: public Test {
 public:
   static bool test(const char *name = "all") {
     Stage st(name);
-    //SimpleTest::test();
-    //ConvTest::test();
+    SimpleTest::test();
+    ConvTest::test();
     Cx4Test::test();
     return st;
   }
diff --git a/projects/neural/train.cx4.inc.cpp b/projects/neural/train.cx4.inc.cpp
index 54831ad..c9060c7 100644
--- a/projects/neural/train.cx4.inc.cpp
+++ b/projects/neural/train.cx4.inc.cpp
@@ -147,6 +147,8 @@ protected:
     if (imagesInFile < 1) return fclose(f), f = nullptr, false;
     imagesInMemory = loadImagesCount > imagesInFile ? imagesInFile : loadImagesCount;
 
+    for(Layer *l = layerFull; l; l = l->next)
+      l->split(threadsCount);
 
     Layout l = layerPre ? layerPre->layout : layerFull->layout;
     assert(l.getW() >= segment->sx);
diff --git a/projects/neural/train.image.inc.cpp b/projects/neural/train.image.inc.cpp
index 18c5c46..47001e8 100644
--- a/projects/neural/train.image.inc.cpp
+++ b/projects/neural/train.image.inc.cpp
@@ -34,10 +34,12 @@ protected:
     assert(datafile);
     assert(fl->layout.getD() == 3);
 
+    #ifndef NDEBUG
     Layer *dl = dataLayer ? dataLayer : fl;
     assert(dl->layout.getW() == bl->layout.getW());
     assert(dl->layout.getH() == bl->layout.getH());
     assert(dl->layout.getD() == bl->layout.getD());
+    #endif
 
     imgsize = fl->layout.getActiveCount();
     fl->layout.split(flist, threadsCount);
diff --git a/projects/neural/train.inc.cpp b/projects/neural/train.inc.cpp
index 41fd23a..6e58719 100644
--- a/projects/neural/train.inc.cpp
+++ b/projects/neural/train.inc.cpp
@@ -5,9 +5,8 @@
 #include "layer.inc.cpp"
 
 
-class Trainer {
+class Trainer: public ThreadControl {
 private:
-  std::atomic<unsigned int> barrierCounter;
   std::vector<Quality> qualities;
 
 public:
@@ -24,6 +23,7 @@ protected:
   Layer *fl;
   Layer *bl;
   Layer *ffl;
+  int currentBlock;
 
   virtual bool prepare() { return true; }
   virtual bool prepareBlock() { return true; }
@@ -34,9 +34,8 @@ protected:
   virtual Quality verifyData(Barrier &barrier, int block, int iter) { return Quality{}; }
 
 private:
-  void threadFunc(int tid, unsigned int seed, int block) {
-    Barrier barrier(barrierCounter, tid, threadsCount, seed);
-
+  void threadFunc(Barrier &barrier) override {
+    int block = currentBlock;
     Quality sumQ;
     for(int i = 0; i < itersPerBlock; ++i) {
       barrier.wait();
@@ -68,27 +67,21 @@ private:
         }
       }
     }
-    qualities[tid] = sumQ;
+    qualities[barrier.tid] = sumQ;
   }
 
   Quality runThreads(int block) {
-    barrierCounter = 0;
-    std::vector<std::thread*> t(threadsCount, nullptr);
-    for(int i = 1; i < threadsCount; ++i)
-      t[i] = new std::thread(&Trainer::threadFunc, this, i, rand(), block);
-    threadFunc(0, rand(), block);
-
-    Quality result = qualities[0];
-    for(int i = 1; i < threadsCount; ++i)
-      { t[i]->join(); delete t[i]; result += qualities[i]; }
-    return result *= 1/(AccumReal)itersPerBlock;
+    currentBlock = block;
+    ThreadControl::runThreads(threadsCount);
+    Quality q;
+    for(int i = 0; i < threadsCount; ++i) q += qualities[i];
+    return q *= 1/(AccumReal)itersPerBlock;
   }
 
 public:
   Trainer():
-    barrierCounter(0),
     layer(),
     ratio(),
     threadsCount(1),
diff --git a/projects/neural/train.segment.inc.cpp b/projects/neural/train.segment.inc.cpp
index 836ab3e..4ee8701 100644
--- a/projects/neural/train.segment.inc.cpp
+++ b/projects/neural/train.segment.inc.cpp
@@ -6,11 +6,10 @@
 #include "layer.inc.cpp"
 
 
-class TrainerSegment {
+class TrainerSegment: public ThreadControl {
 private:
-  std::atomic<unsigned int> barrierCounter;
   std::vector<QualityPair> qualities;
-
+
 public:
   Segment *segment;
   AccumReal ratio;
@@ -25,6 +24,8 @@ public:
 
 protected:
   volatile int x, y, z;
+  int currentBlock;
+  bool currentBlockMeasureOnly;
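+  // set by runThreads(block, measureOnly) before the workers start; threadFunc()
+  // reads them now that its signature is fixed by ThreadControl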
 
   virtual bool prepare() { return true; }
   virtual bool prepareBlock(int block, bool measureOnly) { return true; }
   virtual void loadData(Barrier &barrier, int block, int iter, bool measure) { }
 
 private:
-  void threadFunc(int tid, unsigned int seed, int block, bool measureOnly) {
-    Barrier barrier(barrierCounter, tid, threadsCount, seed);
+  void threadFunc(Barrier &barrier) override {
+    int block = currentBlock;
+    bool measureOnly = currentBlockMeasureOnly;
 
     QualityPair q;
     if (!measureOnly) {
@@ -54,21 +56,16 @@ private:
       q.measure += segment->pass(barrier, x, y, z, 0);
     }
 
-    qualities[tid] = q;
+    qualities[barrier.tid] = q;
   }
 
   QualityPair runThreads(int block, bool measureOnly) {
-    barrierCounter = 0;
-    std::vector<std::thread*> t(threadsCount, nullptr);
-    for(int i = 1; i < threadsCount; ++i)
-      t[i] = new std::thread(&TrainerSegment::threadFunc, this, i, rand(), block, measureOnly);
-    threadFunc(0, rand(), block, measureOnly);
-
-    QualityPair q = qualities[0];
-    for(int i = 1; i < threadsCount; ++i)
-      { t[i]->join(); delete t[i]; q += qualities[i]; }
-
+    currentBlock = block;
+    currentBlockMeasureOnly = measureOnly;
+    ThreadControl::runThreads(threadsCount);
+    QualityPair q;
+    for(int i = 0; i < threadsCount; ++i) q += qualities[i];
     q.measure *= 1/(AccumReal)measuresPerBlock;
     q.train *= 1/(AccumReal)trainsPerBlock;
     return q;
@@ -77,7 +74,6 @@ private:
 
 public:
   TrainerSegment():
-    barrierCounter(0),
     segment(),
     ratio(),
     threadsCount(1),
diff --git a/projects/neural/trainer.cpp b/projects/neural/trainer.cpp
index 820f172..e639553 100644
--- a/projects/neural/trainer.cpp
+++ b/projects/neural/trainer.cpp
@@ -7,6 +7,8 @@
 #include "train.digit.inc.cpp"
 #include "train.image.inc.cpp"
 #include "train.cx4.inc.cpp"
+#include "benchmark.inc.cpp"
+#include "benchmark.segment.inc.cpp"
 
 
 bool runTests() {
@@ -115,7 +117,7 @@ bool trainCx4() {
   int cnt = 1;
   fl[cnt] = new LayerConvShared(l, Layout(257, 257, 24).expandXY(3), Kernel(4, 2, -2)); fl[cnt]->filename = FILENAME "1"; ++cnt;
   fl[cnt] = new LayerConvShared(l, Layout(130, 130, 48), Kernel(4, 2, -2)); fl[cnt]->filename = FILENAME "2"; ++cnt;
-  fl[cnt] = new LayerConvShared(l, Layout( 66, 66, 96), Kernel(4, 2, -2)); fl[cnt]->filename = FILENAME "3"; ++cnt;
+  //fl[cnt] = new LayerConvShared(l, Layout( 66, 66, 96), Kernel(4, 2, -2)); fl[cnt]->filename = FILENAME "3"; ++cnt;
   //fl[cnt] = new LayerConvShared(l, Layout( 6, 6, 48), Kernel(4, 2, 0)); fl[cnt]->filename = FILENAME "4"; ++cnt;
   //fl[cnt] = new LayerConvShared(l, Layout( 2, 2, 96), Kernel(4, 2, 0)); fl[cnt]->filename = FILENAME "5"; ++cnt;
   for(int i = cnt-1; i > 0; --i) {
@@ -137,9 +139,9 @@ bool trainCx4() {
   t.ratio = 0.000001;
   t.threadsCount = 8;
   t.measuresPerBlock = 1000;
-  t.trainsPerBlock = 10000;
+  t.trainsPerBlock = 100000;
   t.loadImagesCount = 100;
-  t.blocksPerLoading = 10;
+  t.blocksPerLoading = 1;
   t.qmin = 0.00001;
   t.infile = "data/img512-data.bin";
   t.outfile = FILENAME ".test";
@@ -154,6 +156,8 @@ bool trainCx4() {
 
 int main() {
   srand(time(NULL));
 
+  //while(1) BenchmarkSegment::run(8);
+  //while(1) Benchmark().run(8);
   //return !runTests();
   //return !trainDigits();
   //return !trainDigitsConv();