From b579b360d819a65c1901f7256583ec5a7f462a6d Mon Sep 17 00:00:00 2001
From: Ivan Mahonin
Date: Mar 21 2023 13:44:43 +0000
Subject: neural: segment

---

diff --git a/projects/neural/common.inc.cpp b/projects/neural/common.inc.cpp
new file mode 100644
index 0000000..40efc35
--- /dev/null
+++ b/projects/neural/common.inc.cpp
@@ -0,0 +1,167 @@
+#ifndef COMMON_INC_CPP
+#define COMMON_INC_CPP
+
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include <atomic>
+#include <chrono>
+#include <string>
+#include <thread>
+#include <vector>
+#include <algorithm>
+
+
+#include "layout.inc.cpp"
+
+
+typedef double WeightReal;
+typedef double NeuronReal;
+typedef double AccumReal;
+
+typedef int WeightInt;
+typedef int AccumInt;
+
+
+
+#define RANDOM_MAX 0x7fffffff
+inline unsigned int randomNext(unsigned int prev)
+  { return (1103515245*prev + 12345) & RANDOM_MAX; }
+inline unsigned int randomBranch(unsigned int seed)
+  { return randomNext(seed + 1); }
+
+inline void busyloop(unsigned int count)
+  { while(count--) __asm__ __volatile__(""); }
+
+
+inline long long timeUs() {
+  static std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
+  return (long long)std::chrono::duration_cast<std::chrono::microseconds>( std::chrono::steady_clock::now() - begin ).count();
+}
+
+
+
+struct Accum {
+  union { AccumReal v; AccumInt i; };
+};
+
+
+struct Neuron {
+  NeuronReal v, d;
+  Accum a;
+};
+
+
+struct Weight {
+  union { WeightReal w; WeightInt i; };
+};
+
+
+struct Iter {
+  typedef Accum AccumType;
+  typedef NeuronReal* DataType;
+  typedef AccumType DataAccumType;
+  static inline void init(Neuron&, AccumType&) { }
+  static inline void iter(Neuron&, Weight&, AccumType&) { }
+  static inline void done(Neuron&, AccumType&) { }
+  static inline void iter2(Neuron&, Neuron&, Weight&) { }
+  static inline void iter3(Neuron&) { }
+  static inline void iter4(Neuron&, DataType, DataAccumType&) { }
+};
+
+
+
+class Barrier {
+private:
+  std::atomic<unsigned int> &counter;
+  unsigned int next;
+  unsigned int busyseed;
+public:
+  const unsigned int tid;
+  const unsigned int threads;
+  unsigned int seed;
+
+  Barrier(const Barrier&) = delete;
+  inline Barrier(std::atomic<unsigned int> &counter, unsigned int tid, unsigned int threads, unsigned int seed):
+    counter(counter), next(), busyseed(randomBranch(seed)), tid(tid), threads(threads), seed(seed) { assert(tid < threads); }
+
+  //inline void busyloop() { }
+  inline void busyloop(unsigned int maxCycles = 4096) { ::busyloop( (busyseed = randomNext(busyseed))%maxCycles ); }
+  inline unsigned int rand() { return seed = randomNext(seed); }
+  inline void wait() { next += threads; ++counter; while(counter < next) busyloop(); }
+  inline void subwait() { while(counter < next + tid) busyloop(); }
+
+};
+
+
+struct Stat {
+  int neurons;
+  int activeNeurons;
+  int weights;
+  int links;
+  size_t memsize;
+
+  Stat(): neurons(), activeNeurons(), weights(), links(), memsize() { }
+
+  Stat& operator+= (const Stat &b) {
+    neurons += b.neurons;
+    activeNeurons += b.activeNeurons;
+    weights += b.weights;
+    links += b.links;
+    memsize += b.memsize;
+    return *this;
+  }
+
+  void print(const char *prefix = nullptr) const {
+    if (prefix && *prefix) printf("%s: ", prefix);
+    printf("neurons: %d / %d, links %d / %d, memSize: %llu\n", activeNeurons, neurons, weights, links, (unsigned long long)memsize);
+  }
+};
+
+
+struct Quality {
+  AccumReal train;
+  AccumReal human;
+
+  inline Quality(AccumReal train, AccumReal human): train(train), human(human) {}
+  inline explicit Quality(AccumReal train = 0): Quality(train, train) {}
+  inline static Quality nan() { return Quality(NAN); }
+  inline static Quality bad() { return Quality(INFINITY); }
+
+  inline Quality& operator+=(const Quality &b)
+    { train += b.train; human += b.human; return *this; }
+  inline Quality& operator*=(AccumReal x)
+    { train *= x; human *= x; return *this; }
+  inline bool operator<(const Quality &b) const {
+    return human < b.human ? true
+         : b.human < human ? false
+         : train < b.train;
+  }
+};
+
+
+struct QualityPair {
+  Quality measure;
+  Quality train;
+
+  inline explicit QualityPair(const Quality &measure = Quality(), const Quality &train = Quality()):
+    measure(measure), train(train) { }
+
+  inline QualityPair& operator+=(const QualityPair &b)
+    { measure += b.measure; train += b.train; return *this; }
+  inline QualityPair& operator*=(AccumReal x)
+    { measure *= x; train *= x; return *this; }
+  inline bool operator<(const QualityPair &b) const {
+    return measure < b.measure ? true
+         : b.measure < measure ? false
+         : train < b.train;
+  }
+};
+
+
+#endif
+
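Editor's note on the Barrier above: it is the only synchronization primitive the whole patch relies on. Every worker bumps one shared atomic counter and spins (with a randomized busyloop to decorrelate the spinning) until all threads of the current step have arrived. A minimal usage sketch, assuming common.inc.cpp is included; thread count and seeds are arbitrary:

    // Sketch: two-phase parallel work driven by Barrier.
    void worker(std::atomic<unsigned int> &counter, unsigned int tid, unsigned int threads) {
      Barrier barrier(counter, tid, threads, /*seed*/ 12345 + tid);
      // ... phase 1: work on this thread's slice ...
      barrier.wait();   // nobody proceeds until every thread finished phase 1
      // ... phase 2: may now read the phase-1 results of other threads ...
    }

    int main() {
      std::atomic<unsigned int> counter(0);
      const unsigned int threads = 4;
      std::vector<std::thread> pool;
      for(unsigned int tid = 1; tid < threads; ++tid)
        pool.emplace_back(worker, std::ref(counter), tid, threads);
      worker(counter, 0, threads);
      for(std::thread &t: pool) t.join();
    }

The scheme only works when all participants call wait() the same number of times, which is exactly the contract the layer passes below maintain.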
diff --git a/projects/neural/font.data.inc.cpp b/projects/neural/font.data.inc.cpp
new file mode 100644
index 0000000..30256af
--- /dev/null
+++ b/projects/neural/font.data.inc.cpp
@@ -0,0 +1,157 @@
+#ifndef FONT_DATA_INC_CPP
+#define FONT_DATA_INC_CPP
+
+/**
+ * 8x8 monochrome bitmap fonts for rendering
+ * Author: Daniel Hepper <daniel@hepper.net>
+ *
+ * License: Public Domain
+ *
+ * Based on:
+ * // Summary: font8x8.h
+ * // 8x8 monochrome bitmap fonts for rendering
+ * //
+ * // Author:
+ * //     Marcel Sondaar
+ * //     International Business Machines (public domain VGA fonts)
+ * //
+ * // License:
+ * //     Public Domain
+ *
+ * Fetched from: http://dimensionalrift.homelinux.net/combuster/mos3/?p=viewsource&file=/modules/gfx/font8_8.asm
+ **/
+
+// Constant: font8x8_basic
+// Contains an 8x8 font map for unicode points U+0000 - U+007F (basic latin)
+const unsigned char font8x8data[128][8] = {
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+0000 (nul)
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+0001
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+0002
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+0003
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+0004
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+0005
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+0006
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+0007
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+0008
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+0009
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+000A
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+000B
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+000C
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+000D
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+000E
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+000F
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+0010
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+0011
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+0012
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+0013
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+0014
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+0015
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+0016
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+0017
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+0018
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+0019
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+001A
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+001B
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+001C
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+001D
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+001E
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+001F
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+0020 (space)
+  { 0x18, 0x3C, 0x3C, 0x18, 0x18, 0x00, 0x18, 0x00},   // U+0021 (!)
+  { 0x36, 0x36, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+0022 (")
+  { 0x36, 0x36, 0x7F, 0x36, 0x7F, 0x36, 0x36, 0x00},   // U+0023 (#)
+  { 0x0C, 0x3E, 0x03, 0x1E, 0x30, 0x1F, 0x0C, 0x00},   // U+0024 ($)
+  { 0x00, 0x63, 0x33, 0x18, 0x0C, 0x66, 0x63, 0x00},   // U+0025 (%)
+  { 0x1C, 0x36, 0x1C, 0x6E, 0x3B, 0x33, 0x6E, 0x00},   // U+0026 (&)
+  { 0x06, 0x06, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+0027 (')
+  { 0x18, 0x0C, 0x06, 0x06, 0x06, 0x0C, 0x18, 0x00},   // U+0028 (()
+  { 0x06, 0x0C, 0x18, 0x18, 0x18, 0x0C, 0x06, 0x00},   // U+0029 ())
+  { 0x00, 0x66, 0x3C, 0xFF, 0x3C, 0x66, 0x00, 0x00},   // U+002A (*)
+  { 0x00, 0x0C, 0x0C, 0x3F, 0x0C, 0x0C, 0x00, 0x00},   // U+002B (+)
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x0C, 0x0C, 0x06},   // U+002C (,)
+  { 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x00},   // U+002D (-)
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x0C, 0x0C, 0x00},   // U+002E (.)
+  { 0x60, 0x30, 0x18, 0x0C, 0x06, 0x03, 0x01, 0x00},   // U+002F (/)
+  { 0x3E, 0x63, 0x73, 0x7B, 0x6F, 0x67, 0x3E, 0x00},   // U+0030 (0)
+  { 0x0C, 0x0E, 0x0C, 0x0C, 0x0C, 0x0C, 0x3F, 0x00},   // U+0031 (1)
+  { 0x1E, 0x33, 0x30, 0x1C, 0x06, 0x33, 0x3F, 0x00},   // U+0032 (2)
+  { 0x1E, 0x33, 0x30, 0x1C, 0x30, 0x33, 0x1E, 0x00},   // U+0033 (3)
+  { 0x38, 0x3C, 0x36, 0x33, 0x7F, 0x30, 0x78, 0x00},   // U+0034 (4)
+  { 0x3F, 0x03, 0x1F, 0x30, 0x30, 0x33, 0x1E, 0x00},   // U+0035 (5)
+  { 0x1C, 0x06, 0x03, 0x1F, 0x33, 0x33, 0x1E, 0x00},   // U+0036 (6)
+  { 0x3F, 0x33, 0x30, 0x18, 0x0C, 0x0C, 0x0C, 0x00},   // U+0037 (7)
+  { 0x1E, 0x33, 0x33, 0x1E, 0x33, 0x33, 0x1E, 0x00},   // U+0038 (8)
+  { 0x1E, 0x33, 0x33, 0x3E, 0x30, 0x18, 0x0E, 0x00},   // U+0039 (9)
+  { 0x00, 0x0C, 0x0C, 0x00, 0x00, 0x0C, 0x0C, 0x00},   // U+003A (:)
+  { 0x00, 0x0C, 0x0C, 0x00, 0x00, 0x0C, 0x0C, 0x06},   // U+003B (;)
+  { 0x18, 0x0C, 0x06, 0x03, 0x06, 0x0C, 0x18, 0x00},   // U+003C (<)
+  { 0x00, 0x00, 0x3F, 0x00, 0x00, 0x3F, 0x00, 0x00},   // U+003D (=)
+  { 0x06, 0x0C, 0x18, 0x30, 0x18, 0x0C, 0x06, 0x00},   // U+003E (>)
+  { 0x1E, 0x33, 0x30, 0x18, 0x0C, 0x00, 0x0C, 0x00},   // U+003F (?)
+  { 0x3E, 0x63, 0x7B, 0x7B, 0x7B, 0x03, 0x1E, 0x00},   // U+0040 (@)
+  { 0x0C, 0x1E, 0x33, 0x33, 0x3F, 0x33, 0x33, 0x00},   // U+0041 (A)
+  { 0x3F, 0x66, 0x66, 0x3E, 0x66, 0x66, 0x3F, 0x00},   // U+0042 (B)
+  { 0x3C, 0x66, 0x03, 0x03, 0x03, 0x66, 0x3C, 0x00},   // U+0043 (C)
+  { 0x1F, 0x36, 0x66, 0x66, 0x66, 0x36, 0x1F, 0x00},   // U+0044 (D)
+  { 0x7F, 0x46, 0x16, 0x1E, 0x16, 0x46, 0x7F, 0x00},   // U+0045 (E)
+  { 0x7F, 0x46, 0x16, 0x1E, 0x16, 0x06, 0x0F, 0x00},   // U+0046 (F)
+  { 0x3C, 0x66, 0x03, 0x03, 0x73, 0x66, 0x7C, 0x00},   // U+0047 (G)
+  { 0x33, 0x33, 0x33, 0x3F, 0x33, 0x33, 0x33, 0x00},   // U+0048 (H)
+  { 0x1E, 0x0C, 0x0C, 0x0C, 0x0C, 0x0C, 0x1E, 0x00},   // U+0049 (I)
+  { 0x78, 0x30, 0x30, 0x30, 0x33, 0x33, 0x1E, 0x00},   // U+004A (J)
+  { 0x67, 0x66, 0x36, 0x1E, 0x36, 0x66, 0x67, 0x00},   // U+004B (K)
+  { 0x0F, 0x06, 0x06, 0x06, 0x46, 0x66, 0x7F, 0x00},   // U+004C (L)
+  { 0x63, 0x77, 0x7F, 0x7F, 0x6B, 0x63, 0x63, 0x00},   // U+004D (M)
+  { 0x63, 0x67, 0x6F, 0x7B, 0x73, 0x63, 0x63, 0x00},   // U+004E (N)
+  { 0x1C, 0x36, 0x63, 0x63, 0x63, 0x36, 0x1C, 0x00},   // U+004F (O)
+  { 0x3F, 0x66, 0x66, 0x3E, 0x06, 0x06, 0x0F, 0x00},   // U+0050 (P)
+  { 0x1E, 0x33, 0x33, 0x33, 0x3B, 0x1E, 0x38, 0x00},   // U+0051 (Q)
+  { 0x3F, 0x66, 0x66, 0x3E, 0x36, 0x66, 0x67, 0x00},   // U+0052 (R)
+  { 0x1E, 0x33, 0x07, 0x0E, 0x38, 0x33, 0x1E, 0x00},   // U+0053 (S)
+  { 0x3F, 0x2D, 0x0C, 0x0C, 0x0C, 0x0C, 0x1E, 0x00},   // U+0054 (T)
+  { 0x33, 0x33, 0x33, 0x33, 0x33, 0x33, 0x3F, 0x00},   // U+0055 (U)
+  { 0x33, 0x33, 0x33, 0x33, 0x33, 0x1E, 0x0C, 0x00},   // U+0056 (V)
+  { 0x63, 0x63, 0x63, 0x6B, 0x7F, 0x77, 0x63, 0x00},   // U+0057 (W)
+  { 0x63, 0x63, 0x36, 0x1C, 0x1C, 0x36, 0x63, 0x00},   // U+0058 (X)
+  { 0x33, 0x33, 0x33, 0x1E, 0x0C, 0x0C, 0x1E, 0x00},   // U+0059 (Y)
+  { 0x7F, 0x63, 0x31, 0x18, 0x4C, 0x66, 0x7F, 0x00},   // U+005A (Z)
+  { 0x1E, 0x06, 0x06, 0x06, 0x06, 0x06, 0x1E, 0x00},   // U+005B ([)
+  { 0x03, 0x06, 0x0C, 0x18, 0x30, 0x60, 0x40, 0x00},   // U+005C (\)
+  { 0x1E, 0x18, 0x18, 0x18, 0x18, 0x18, 0x1E, 0x00},   // U+005D (])
+  { 0x08, 0x1C, 0x36, 0x63, 0x00, 0x00, 0x00, 0x00},   // U+005E (^)
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF},   // U+005F (_)
+  { 0x0C, 0x0C, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+0060 (`)
+  { 0x00, 0x00, 0x1E, 0x30, 0x3E, 0x33, 0x6E, 0x00},   // U+0061 (a)
+  { 0x07, 0x06, 0x06, 0x3E, 0x66, 0x66, 0x3B, 0x00},   // U+0062 (b)
+  { 0x00, 0x00, 0x1E, 0x33, 0x03, 0x33, 0x1E, 0x00},   // U+0063 (c)
+  { 0x38, 0x30, 0x30, 0x3e, 0x33, 0x33, 0x6E, 0x00},   // U+0064 (d)
+  { 0x00, 0x00, 0x1E, 0x33, 0x3f, 0x03, 0x1E, 0x00},   // U+0065 (e)
+  { 0x1C, 0x36, 0x06, 0x0f, 0x06, 0x06, 0x0F, 0x00},   // U+0066 (f)
+  { 0x00, 0x00, 0x6E, 0x33, 0x33, 0x3E, 0x30, 0x1F},   // U+0067 (g)
+  { 0x07, 0x06, 0x36, 0x6E, 0x66, 0x66, 0x67, 0x00},   // U+0068 (h)
+  { 0x0C, 0x00, 0x0E, 0x0C, 0x0C, 0x0C, 0x1E, 0x00},   // U+0069 (i)
+  { 0x30, 0x00, 0x30, 0x30, 0x30, 0x33, 0x33, 0x1E},   // U+006A (j)
+  { 0x07, 0x06, 0x66, 0x36, 0x1E, 0x36, 0x67, 0x00},   // U+006B (k)
+  { 0x0E, 0x0C, 0x0C, 0x0C, 0x0C, 0x0C, 0x1E, 0x00},   // U+006C (l)
+  { 0x00, 0x00, 0x33, 0x7F, 0x7F, 0x6B, 0x63, 0x00},   // U+006D (m)
+  { 0x00, 0x00, 0x1F, 0x33, 0x33, 0x33, 0x33, 0x00},   // U+006E (n)
+  { 0x00, 0x00, 0x1E, 0x33, 0x33, 0x33, 0x1E, 0x00},   // U+006F (o)
+  { 0x00, 0x00, 0x3B, 0x66, 0x66, 0x3E, 0x06, 0x0F},   // U+0070 (p)
+  { 0x00, 0x00, 0x6E, 0x33, 0x33, 0x3E, 0x30, 0x78},   // U+0071 (q)
+  { 0x00, 0x00, 0x3B, 0x6E, 0x66, 0x06, 0x0F, 0x00},   // U+0072 (r)
+  { 0x00, 0x00, 0x3E, 0x03, 0x1E, 0x30, 0x1F, 0x00},   // U+0073 (s)
+  { 0x08, 0x0C, 0x3E, 0x0C, 0x0C, 0x2C, 0x18, 0x00},   // U+0074 (t)
+  { 0x00, 0x00, 0x33, 0x33, 0x33, 0x33, 0x6E, 0x00},   // U+0075 (u)
+  { 0x00, 0x00, 0x33, 0x33, 0x33, 0x1E, 0x0C, 0x00},   // U+0076 (v)
+  { 0x00, 0x00, 0x63, 0x6B, 0x7F, 0x7F, 0x36, 0x00},   // U+0077 (w)
+  { 0x00, 0x00, 0x63, 0x36, 0x1C, 0x36, 0x63, 0x00},   // U+0078 (x)
+  { 0x00, 0x00, 0x33, 0x33, 0x33, 0x3E, 0x30, 0x1F},   // U+0079 (y)
+  { 0x00, 0x00, 0x3F, 0x19, 0x0C, 0x26, 0x3F, 0x00},   // U+007A (z)
+  { 0x38, 0x0C, 0x0C, 0x07, 0x0C, 0x0C, 0x38, 0x00},   // U+007B ({)
+  { 0x18, 0x18, 0x18, 0x00, 0x18, 0x18, 0x18, 0x00},   // U+007C (|)
+  { 0x07, 0x0C, 0x0C, 0x38, 0x0C, 0x0C, 0x07, 0x00},   // U+007D (})
+  { 0x6E, 0x3B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+007E (~)
+  { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}    // U+007F
+};
+
+#endif
diff --git a/projects/neural/font.inc.cpp b/projects/neural/font.inc.cpp
new file mode 100644
index 0000000..b8f6aa8
--- /dev/null
+++ b/projects/neural/font.inc.cpp
@@ -0,0 +1,42 @@
+#ifndef FONT_INC_CPP
+#define FONT_INC_CPP
+
+#include <cstdio>
+#include <cstring>
+#include <cstdarg>
+
+#include "font.data.inc.cpp"
+
+
+
+void imgPrint(unsigned char *data, int w, int h, int ch, int x, int y, const unsigned char *color, const char *text) {
+  int x0 = x;
+  while(unsigned char c = (unsigned char)*text++) {
+    if (c == '\n') { x = x0; y += 8; continue; }
+    const unsigned char *sym = font8x8data[c];
+    for(int yy = y, ey = y + 8; yy < ey; ++yy, ++sym) {
+      if (yy >= 0 && yy < h) {
+        unsigned char row = *sym;
+        for(int xx = x; row; ++xx, row >>= 1)
+          if ((row & 1) && xx >= 0 && xx < w)   // xx < w keeps the glyph clipped to the image width
+            memcpy(data + (yy*w + xx)*ch, color, ch);
+      }
+    }
+    x += 8;
+  }
+}
+
+
+void imgPrintf(unsigned char *data, int w, int h, int ch, int x, int y, const unsigned char *color, const char *format, ...) {
+  char buf[1024] = {};
+  va_list args;
+  va_start(args, format);
+  vsnprintf(buf, sizeof(buf), format, args);
+  va_end(args);
+  imgPrint(data, w, h, ch, x, y, color, buf);
+}
+
+
+
+#endif
diff --git a/projects/neural/func.inc.cpp b/projects/neural/func.inc.cpp
new file mode 100644
index 0000000..b5c4cc3
--- /dev/null
+++ b/projects/neural/func.inc.cpp
@@ -0,0 +1,29 @@
+#ifndef FUNC_INC_CPP
+#define FUNC_INC_CPP
+
+
+#include "common.inc.cpp"
+
+
+
+typedef void Func(Neuron &n, AccumReal s);
+
+
+inline void funcSigmoidExp(Neuron &n, AccumReal s) {
+  //if (s > 5) s = 5; else if (s < -5) s = -5;
+  AccumReal ss = 1/(1 + std::exp(-s)); n.v = ss; n.d = ss * (1-ss);
+}
+
+
+inline void funcSigmoidExp2(Neuron &n, AccumReal s) {
+  //if (s > 5) s = 5; else if (s < -5) s = -5;
+  AccumReal ss = 1/(1 + std::exp(-s)); n.v = ss; n.d = 0; //ss * (1-ss) * 0.1;
+}
+
+
+inline void funcReLU(Neuron &n, AccumReal s)
+  { n.v = s > 0 ? s : 0; n.d = s > 0; }
+
+
+
+#endif
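Editor's note on the Func contract just defined: a Func receives the accumulated weighted sum s and must fill in both the activation n.v and its derivative n.d, which the backpass later multiplies into the delta. A tiny sketch of how a layer applies one, illustrative only and not part of the patch (applyActivation is an invented name):

    // Sketch: applying an activation Func after accumulation.
    inline void applyActivation(Neuron &n, AccumReal sum, Func *f) {
      f(n, sum);   // fills n.v (value) and n.d (derivative at sum)
    }
    // e.g. applyActivation(neuron, s, &funcReLU);
    // funcSigmoidExp stores d = v*(1-v), the usual sigmoid shortcut;
    // funcSigmoidExp2 deliberately zeroes d, freezing training through it.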
diff --git a/projects/neural/layer.all.test.inc.cpp b/projects/neural/layer.all.test.inc.cpp
deleted file mode 100644
index 8d42289..0000000
--- a/projects/neural/layer.all.test.inc.cpp
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef LAYER_ALL_TEST_INC_CPP
-#define LAYER_ALL_TEST_INC_CPP
-
-
-
-#include "layer.simple.test.inc.cpp"
-#include "layer.conv.test.inc.cpp"
-
-
-class AllTest: public Test {
-public:
-  static bool test(const char *name = "all") {
-    Stage st(name);
-    SimpleTest::test();
-    ConvTest::test();
-    return st;
-  }
-};
-
-
-#endif
diff --git a/projects/neural/layer.conv.inc.cpp b/projects/neural/layer.conv.inc.cpp
index efd72ac..a9aa454 100644
--- a/projects/neural/layer.conv.inc.cpp
+++ b/projects/neural/layer.conv.inc.cpp
@@ -3,6 +3,8 @@
 
 
 
+#include "tga.inc.cpp"
+#include "font.inc.cpp"
 #include "layer.simple.inc.cpp"
 
 
@@ -186,6 +188,71 @@ void iterateConvolutionPoint(Layout cl, Layout pl, Layout wl, Kernel k, int kx,
 
 
 
+bool saveConvDemoImage(const char *filename, int count, int ksx, int ksy, int ksz, const Weight *weights) {
+  int cols = count;
+  int rows = ksz + 1;
+  int w = 1 + cols*(ksx + 1);
+  int h = 10 + rows*(ksy + 1);
+  std::vector<unsigned char> pixels(w*h*3, 0);
+
+  WeightReal range = 0;
+  for(const Weight *iw = weights, *e = iw + count*ksx*ksy*ksz; iw < e; ++iw) {
+    WeightReal r = fabs(iw->w);
+    if (range < r) range = r;
+  }
+
+  const unsigned char white[] = { 255, 255, 255 };
+  imgPrintf(pixels.data(), w, h, 3, 1, 1, white, "%f", range);
+
+  // rgb row
+
+  for(int i = 0; i < count; ++i)
+  for(int ky = 0; ky < ksy; ++ky)
+  for(int kx = 0; kx < ksx; ++kx) {
+    int y0 = 10;
+    int x0 = i*(ksx + 1) + 1;
+    unsigned char *p = &pixels[ ((y0 + ky)*w + x0 + kx)*3 ];
+
+    for(int kz = 0; kz < 3; ++kz) {
+      if (kz < ksz) {
+        WeightReal x = weights[ ((i*ksy + ky)*ksx + kx)*ksz + kz ].w;   // stride is ksz, matching the actual weight layout
+        x /= range;
+        x = (x + 0.5)*256;
+        unsigned char c = x < 0 ? 0 : x > 255 ? 255 : (unsigned char)x;
+        p[kz] = c;
+      } else {
+        p[kz] = 0;
+      }
+    }
+  }
+
+  // gray rows
+
+  for(int i = 0; i < count; ++i)
+  for(int kz = 0; kz < ksz; ++kz)
+  for(int ky = 0; ky < ksy; ++ky)
+  for(int kx = 0; kx < ksx; ++kx) {
+    WeightReal x = weights[ ((i*ksy + ky)*ksx + kx)*ksz + kz ].w;
+    x /= range;
+    x = (x + 0.5)*256;
+    unsigned char c = x < 0 ? 0 : x > 255 ? 255 : (unsigned char)x;
+
+    int y0 = (kz + 1)*(ksy + 1) + 10;
+    int x0 = i*(ksx + 1) + 1;
+    unsigned char *p = &pixels[ ((y0 + ky)*w + x0 + kx)*3 ];
+    p[0] = p[1] = p[2] = c;
+
+    //if (c == 0) p[0] = p[1] = 0;   // blue on underflow
+    //if (c == 255) p[1] = p[2] = 0; // red on overflow
+  }
+
+  std::string fn(filename);
+  fn += ".tga";
+  return tgaSave(fn.c_str(), pixels.data(), w, h, 3);
+}
+
+
+
 template<Func func>
 class LayerConv: public Layer {
 public:
diff --git a/projects/neural/layer.conv.shared.inc.cpp b/projects/neural/layer.conv.shared.inc.cpp
index 366093f..f729bc4 100644
--- a/projects/neural/layer.conv.shared.inc.cpp
+++ b/projects/neural/layer.conv.shared.inc.cpp
@@ -29,7 +29,7 @@ void iterateTestConvolutionShared(Layout cl, Layout pl, Kernel k, Neuron *c_neur
   for(int ky = 0; ky < k.sy; ++ky)
   for(int kx = 0; kx < k.sx; ++kx)
   for(int pz = pl.z0; pz < pl.z1; ++pz) {
-    int wi = (ky*k.sx + kx)*pl.getD() + pz - pl.z0;
+    int wi = (((cz - cl.z0)*k.sy + ky)*k.sx + kx)*pl.getD() + pz - pl.z0;
     Weight &w = weights[wi];
 
     int px = pl.x0 + (cx - cl.x0)*k.dx + k.ox + kx;
@@ -46,7 +46,7 @@ void iterateTestConvolutionShared(Layout cl, Layout pl, Kernel k, Neuron *c_neur
 
 
 template<class Iter>
-void iterateConvolutionSharedDyn(Layout cl, Layout pl, Layout wl, Kernel k, Neuron *c_neurons, Neuron *p_neurons, Weight *weights) {
+void iterateConvolutionShared(Layout cl, Layout pl, Layout wl, Kernel k, Neuron *c_neurons, Neuron *p_neurons, Weight *weights) {
   if (!cl) return;
   assert(pl);
   assert(wl);
@@ -71,122 +71,32 @@ void iterateConvolutionSharedDyn(Layout cl, Layout pl, Layout wl, Kernel k, Neur
   int p_dy = k.dy*pl.sx*pl.sz - c_w*p_dx;
 
   int k_sxd = k.sx*p_d;
+  int k_syxd = k.sy*k_sxd;
 
   int p_ddy = (pl.sx - k.sx)*pl.sz;
   int p_ddx = pl.sz - p_d;
 
   Neuron *icn = c_neurons + (cl.y0*cl.sx + cl.x0)*cl.sz + cl.z0;
   Neuron *ipn = p_neurons + ((pl.y0 + (cl.y0 - wl.y0)*k.dy + k.oy)*pl.sx + pl.x0 + (cl.x0 - wl.x0)*k.dx + k.ox)*pl.sz + pl.z0;
-  Weight *ew = weights + k.sy*k_sxd;
-
-  for(Neuron *e = icn + c_shxz; icn < e; icn += c_dy, ipn += p_dy)
-  for(Neuron *e = icn + c_swz; icn < e; icn += c_dx, ipn += p_dx)
-  for(Neuron *e = icn + c_d; icn < e; ++icn) {
-    typename Iter::AccumType a;
-    Iter::init(*icn, a);
-
-    Neuron *iipn = ipn;
-    for(Weight *iw = weights; iw < ew; iipn += p_ddy)
-    for(Weight *e = iw + k_sxd; iw < e; iipn += p_ddx)
-    for(Weight *e = iw + p_d; iw < e; ++iw, ++iipn)
-      Iter::iter(*iipn, *iw, a);
-
-    Iter::done(*icn, a);
-  }
-}
-
-
-template<int KSX, int KSY, int PD, class Iter>
-void iterateConvolutionSharedXYD(Layout cl, Layout pl, Layout wl, Kernel k, Neuron *c_neurons, Neuron *p_neurons, Weight *weights) {
-  if (!cl) return;
-  assert(pl);
-  assert(wl);
-  assert(k);
-  assert(c_neurons);
-  assert(p_neurons);
-  assert(weights);
-  assert(cl.isSubLayoutOf(wl));
-  assert(pl.x0 + k.ox >= 0 && pl.x0 + (wl.getW()-1)*k.dx + k.ox + k.sx <= pl.sx);
-  assert(pl.y0 + k.oy >= 0 && pl.y0 + (wl.getH()-1)*k.dy + k.oy + k.sy <= pl.sy);
-  assert(KSX == k.sx);
-  assert(KSY == k.sy);
-  assert(PD == pl.getD());
-
-  int c_h = cl.getH();
-  int c_w = cl.getW();
-  int c_d = cl.getD();
-  int c_swz = c_w*cl.sz;
-  int c_shxz = c_h*cl.sx*cl.sz;
-  int c_dx = cl.sz - c_d;
-  int c_dy = (cl.sx - c_w)*cl.sz;
-
-  int p_dx = k.dx*pl.sz;
-  int p_dy = k.dy*pl.sx*pl.sz - c_w*p_dx;
-
-  int p_ddy = (pl.sx - KSX)*pl.sz;
-  int p_ddx = pl.sz - PD;
-
-  Neuron *icn = c_neurons + (cl.y0*cl.sx + cl.x0)*cl.sz + cl.z0;
-  Neuron *ipn = p_neurons + ((pl.y0 + (cl.y0 - wl.y0)*k.dy + k.oy)*pl.sx + pl.x0 + (cl.x0 - wl.x0)*k.dx + k.ox)*pl.sz + pl.z0;
+  weights += (cl.z0 - wl.z0)*k_syxd;
+  Weight *iw = weights;
 
   for(Neuron *e = icn + c_shxz; icn < e; icn += c_dy, ipn += p_dy)
-  for(Neuron *e = icn + c_swz; icn < e; icn += c_dx, ipn += p_dx)
+  for(Neuron *e = icn + c_swz; icn < e; icn += c_dx, ipn += p_dx, iw = weights)
   for(Neuron *e = icn + c_d; icn < e; ++icn) {
     typename Iter::AccumType a;
     Iter::init(*icn, a);
 
     Neuron *iipn = ipn;
-    Weight *iw = weights;
-    for(int i = 0; i < KSY; ++i, iipn += p_ddy)
-    for(int i = 0; i < KSX; ++i, iipn += p_ddx)
-    for(int i = 0; i < PD; ++i, ++iw, ++iipn)
+    for(Weight *e = iw + k_syxd; iw < e; iipn += p_ddy)
+    for(Weight *e = iw + k_sxd; iw < e; iipn += p_ddx)
+    for(Weight *e = iw + p_d; iw < e; ++iw, ++iipn)
       Iter::iter(*iipn, *iw, a);
 
     Iter::done(*icn, a);
   }
 }
 
-typedef void (*iterateConvolutionSharedFunc)(Layout, Layout, Layout, Kernel, Neuron*, Neuron*, Weight*);
-
-template<int KSX, int KSY, class Iter>
-iterateConvolutionSharedFunc getIterateConvolutionSharedFuncXY(int pd) {
-  if (pd <= 8) switch(pd) {
-    case 1: return &iterateConvolutionSharedXYD<KSX, KSY, 1, Iter>;
-    case 2: return &iterateConvolutionSharedXYD<KSX, KSY, 2, Iter>;
-    case 3: return &iterateConvolutionSharedXYD<KSX, KSY, 3, Iter>;
-    case 4: return &iterateConvolutionSharedXYD<KSX, KSY, 4, Iter>;
-    case 5: return &iterateConvolutionSharedXYD<KSX, KSY, 5, Iter>;
-    case 6: return &iterateConvolutionSharedXYD<KSX, KSY, 6, Iter>;
-    case 7: return &iterateConvolutionSharedXYD<KSX, KSY, 7, Iter>;
-    case 8: return &iterateConvolutionSharedXYD<KSX, KSY, 8, Iter>;
-  }
-  return &iterateConvolutionSharedDyn<Iter>;
-}
-
-
-template<class Iter>
-iterateConvolutionSharedFunc getIterateConvolutionSharedFunc(int ksx, int ksy, int pd) {
-  if (0 && ksx == ksy && pd <= 8) switch(ksx) {
-    case 1: return getIterateConvolutionSharedFuncXY<1, 1, Iter>(pd);
-    case 2: return getIterateConvolutionSharedFuncXY<2, 2, Iter>(pd);
-    case 3: return getIterateConvolutionSharedFuncXY<3, 3, Iter>(pd);
-    case 4: return getIterateConvolutionSharedFuncXY<4, 4, Iter>(pd);
-    case 5: return getIterateConvolutionSharedFuncXY<5, 5, Iter>(pd);
-    case 6: return getIterateConvolutionSharedFuncXY<6, 6, Iter>(pd);
-    case 7: return getIterateConvolutionSharedFuncXY<7, 7, Iter>(pd);
-    case 8: return getIterateConvolutionSharedFuncXY<8, 8, Iter>(pd);
-  }
-  return &iterateConvolutionSharedDyn<Iter>;
-}
-
-
-template<class Iter>
-void iterateConvolutionShared(const Layout &cl, const Layout &pl, const Layout &wl, const Kernel &k, Neuron *c_neurons, Neuron *p_neurons, Weight *weights) {
-  iterateConvolutionSharedFunc f = getIterateConvolutionSharedFunc<Iter>(k.sx, k.sy, pl.getD());
-  f(cl, pl, wl, k, c_neurons, p_neurons, weights);
-}
-
-
-
 template<class Iter>
 void iterateConvolutionSharedPoint(Layout cl, Layout pl, Layout wl, Kernel k, int kx, int ky, Neuron *c_neurons, Neuron *p_neurons, Weight *weights) {
@@ -214,29 +124,62 @@ void iterateConvolutionSharedPoint(Layout cl, Layout pl, Layout wl, Kernel k, in
   int p_d = pl.getD();
   int p_dx = k.dx*pl.sz;
   int p_dy = k.dy*pl.sx*pl.sz - c_w*p_dx;
+
+  int w_dz = (k.sy*k.sx - 1)*p_d;
 
   Neuron *icn = c_neurons + (cl.y0*cl.sx + cl.x0)*cl.sz + cl.z0;
   Neuron *ipn = p_neurons + ((pl.y0 + (cl.y0 - wl.y0)*k.dy + k.oy + ky)*pl.sx + pl.x0 + (cl.x0 - wl.x0)*k.dx + k.ox + kx)*pl.sz + pl.z0;
-  weights += (ky*k.sx + kx)*p_d;
-  Weight *ew = weights + p_d;
+  weights += (((cl.z0 - wl.z0)*k.sy + ky)*k.sx + kx)*p_d;
+  Weight *iw = weights;
 
   for(Neuron *e = icn + c_shxz; icn < e; icn += c_dy, ipn += p_dy)
-  for(Neuron *e = icn + c_swz; icn < e; icn += c_dx, ipn += p_dx)
-  for(Neuron *e = icn + c_d; icn < e; ++icn, ipn -= p_d)
-  for(Weight *iw = weights; iw < ew; ++ipn, ++iw)
+  for(Neuron *e = icn + c_swz; icn < e; icn += c_dx, ipn += p_dx, iw = weights)
+  for(Neuron *e = icn + c_d; icn < e; ++icn, ipn -= p_d, iw += w_dz)
+  for(Weight *e = iw + p_d; iw < e; ++ipn, ++iw)
     Iter::iter2(*icn, *ipn, *iw);
 }
 
 
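Editor's note: the substantive change in these hunks is the weight indexing. Shared kernels are now stored per output channel, so the flat index of a weight is (((cz*ksy + ky)*ksx + kx)*pd + pz). A small helper making that layout explicit, illustrative only (the function name is invented):

    // Sketch: flat index into the shared convolution weight array,
    // laid out as [outputChannel][kernelY][kernelX][inputChannel].
    inline int sharedWeightIndex(int cz, int ky, int kx, int pz,
                                 int ksy, int ksx, int pd) {
      return ((cz*ksy + ky)*ksx + kx)*pd + pz;
    }
    // Total size: outputChannels * ksy * ksx * pd, which is exactly the
    // weightsCount that LayerConvSharedBase computes below.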
+void fillConvolutionWeights(int kx, int ky, int kz, int count, Weight *weights) {
+  double kr = 1.5;
+  double sum = 0;
+
+  Weight *iw = weights;
+  for(int i = 0; i < count; ++i)
+  for(int y = 0; y < ky; ++y)
+  for(int x = 0; x < kx; ++x)
+  for(int z = 0; z < kz; ++z, ++iw) {
+    double dx = (2.0*x/(kx-1) - 1)*kr;
+    double dy = (2.0*y/(ky-1) - 1)*kr;
+    double e = exp( -dx*dx - dy*dy );
+    sum += e;
+    iw->w = (WeightReal)( (rand()/(double)RAND_MAX*2 - 1)*e );
+    //iw->w = (WeightReal)( rand()/(double)RAND_MAX*e );
+  }
+
+  WeightReal k = (WeightReal)(10*kz/sum);
+  Weight *ew = iw;
+  for(iw = weights; iw < ew; ++iw) iw->w *= k;
+}
+
+
+
 class LayerConvSharedBase: public Layer {
 public:
+  Kernel kernel;
   std::vector<Weight> mtWeights;
 
-  using Layer::Layer;
-
+  LayerConvSharedBase(Layer &prev, const Layout &layout, const Kernel &kernel, Weight *weights = nullptr):
+    Layer(&prev, layout, kernel.sx * kernel.sy * layout.getD() * prev.back().layout.getD(), weights),
+    kernel(kernel)
+  {
+    assert(kernel);
+    stat.links = weightsCount * layout.getW() * layout.getH();
+    if (ownWeights) fillWeights(-1, 1);
+  }
+
+
   void split(int threadsCount) override {
     Layer::split(threadsCount);
     Weight w = {};
@@ -264,16 +207,11 @@ public:
 template<Func func>
 class LayerConvShared: public LayerConvSharedBase {
 public:
-  Kernel kernel;
-
   LayerConvShared(Layer &prev, const Layout &layout, const Kernel &kernel, Weight *weights = nullptr):
-    LayerConvSharedBase(&prev, layout, kernel.sx*kernel.sy*prev.back().layout.getD(), weights),
-    kernel(kernel)
+    LayerConvSharedBase(prev, layout, kernel, weights)
   {
-    assert(kernel);
-    stat.links = weightsCount*neuronsCount;
-    if (ownWeights) fillWeights(-1, 1);
+    stat.links = weightsCount * layout.getW() * layout.getH();
+    if (ownWeights) fillConvolutionWeights(kernel.sx, kernel.sy, this->prev->layout.getD(), layout.getD(), this->weights);
   }
 
@@ -339,6 +277,10 @@ public:
     iterateNeurons(prev->layout, prev->neurons);
     clearAccum();
   }
+
+
+  bool saveDemo() override
+    { return !filename || saveConvDemoImage( filename, layout.getD(), kernel.sx, kernel.sy, prev->layout.getD(), weights ); }
 };
 
 
@@ -346,16 +288,11 @@ public:
 template<Func func>
 class LayerDeconvShared: public LayerConvSharedBase {
 public:
-  Kernel kernel;
-
   LayerDeconvShared(Layer &prev, const Layout &layout, const Kernel &kernel, Weight *weights = nullptr):
-    LayerConvSharedBase(&prev, layout, kernel.sx*kernel.sy*layout.getD(), weights),
-    kernel(kernel)
+    LayerConvSharedBase(prev, layout, kernel, weights)
  {
-    assert(kernel);
-    stat.links = weightsCount*neuronsCount;
-    if (ownWeights) fillWeights(-1, 1);
+    stat.links = weightsCount * this->prev->layout.getW() * this->prev->layout.getH();
+    if (ownWeights) fillConvolutionWeights(kernel.sx, kernel.sy, layout.getD(), this->prev->layout.getD(), this->weights);
   }
 
@@ -421,6 +358,10 @@ public:
     iterateTestConvolutionShared(prev->layout, layout, kernel, prev->neurons, neurons, weights);
     iterateTestConvolutionShared(prev->layout, layout, kernel, prev->neurons, neurons, weights);
   }
+
+
+  bool saveDemo() override
+    { return !filename || saveConvDemoImage( filename, prev->layout.getD(), kernel.sx, kernel.sy, layout.getD(), weights ); }
 };
 
 #endif
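Editor's note: fillConvolutionWeights draws each weight uniformly at random and shapes it with a Gaussian envelope exp(-dx*dx - dy*dy) centered on the kernel, then rescales all weights so the summed envelope is fixed at 10*kz. A usage sketch with invented sizes (not part of the patch):

    // Sketch: initializing a bank of 8 shared 4x4 kernels over a
    // 3-channel previous layer. With kr = 1.5, the envelope is ~1 at
    // the kernel center and exp(-4.5) ~ 0.011 at a corner, so border
    // weights start out near zero.
    std::vector<Weight> w(8*4*4*3);
    fillConvolutionWeights(/*kx*/4, /*ky*/4, /*kz*/3, /*count*/8, w.data());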
diff --git a/projects/neural/layer.conv.test.inc.cpp b/projects/neural/layer.conv.test.inc.cpp
index c0b4be8..c4bd34f 100644
--- a/projects/neural/layer.conv.test.inc.cpp
+++ b/projects/neural/layer.conv.test.inc.cpp
@@ -8,7 +8,7 @@
 #include "layer.conv.shared.inc.cpp"
 
 
-class ConvTest: public Test {
+class ConvTest: public LayerTest {
 public:
   static void init(const Layout &cl, const Layout &pl, const Kernel &k, bool shared = false)
     { Test::init(cl.getCount(), pl.getCount(), (shared ? 1 : cl.getActiveCount())*k.sx*k.sy*pl.getD()); }
@@ -144,25 +144,25 @@ public:
   {
     Layer l(nullptr, pl);
     new LayerConv<funcSigmoidExp>(l, cl, k);
-    Test::testLayer("LayerConv", l);
+    testLayer("LayerConv", l);
   }
 
   {
     Layer l(nullptr, cl);
     new LayerDeconv<funcSigmoidExp>(l, pl, k);
-    Test::testLayer("LayerDeconv", l);
+    testLayer("LayerDeconv", l);
   }
 
   {
     Layer l(nullptr, pl);
     new LayerConvShared<funcSigmoidExp>(l, cl, k);
-    Test::testLayer("LayerConvShared", l);
+    testLayer("LayerConvShared", l);
   }
 
   {
     Layer l(nullptr, cl);
     new LayerDeconvShared<funcSigmoidExp>(l, pl, k);
-    Test::testLayer("LayerDeconvShared", l);
+    testLayer("LayerDeconvShared", l);
   }
 
   return st;
diff --git a/projects/neural/layer.convsub.shared.inc.cpp b/projects/neural/layer.convsub.shared.inc.cpp
new file mode 100644
index 0000000..3223870
--- /dev/null
+++ b/projects/neural/layer.convsub.shared.inc.cpp
@@ -0,0 +1,236 @@
+#ifndef LAYER_CONVSUB_SHARED_INC_CPP
+#define LAYER_CONVSUB_SHARED_INC_CPP
+
+
+#include "layer.conv.inc.cpp"
+
+
+template<class Iter>
+void iterateConvolutionShared2(Layout cl, Layout pl, Kernel k, Neuron *c_neurons, Neuron *p_neurons, Weight *weights) {
+  assert(cl);
+  assert(pl);
+  assert(k);
+  assert(c_neurons);
+  assert(p_neurons);
+  assert(weights);
+  assert(!cl.hasPadZ());
+  assert(!pl.hasPadZ());
+  assert(pl.x0 + k.ox >= 0 && pl.x0 + (cl.getW()-1)*k.dx + k.ox + k.sx <= pl.sx);
+  assert(pl.y0 + k.oy >= 0 && pl.y0 + (cl.getH()-1)*k.dy + k.oy + k.sy <= pl.sy);
+
+  int c_h = cl.getH();
+  int c_w = cl.getW();
+  int c_d = cl.getD();
+  int c_swz = c_w*cl.sz;
+  int c_shxz = c_h*cl.sx*cl.sz;
+  int c_dx = cl.sz - c_d;
+  int c_dy = (cl.sx - c_w)*cl.sz;
+
+  int p_d = pl.getD();
+  int p_dkx = (pl.sx - k.sx)*pl.sz;
+  int p_dx = k.dx*pl.sz;
+  int p_dy = k.dy*pl.sx*pl.sz - c_w*p_dx;
+
+  // NOTE: everything below is unfinished scaffolding copied from
+  // iterateConvolutionSharedPoint; wl, kx and ky are not defined in
+  // this scope and the ky/kx loop body is still empty.
+  c_neurons += (cl.y0*cl.sx + cl.x0)*cl.sz + cl.z0;
+  p_neurons += ((pl.y0 + (cl.y0 - wl.y0)*k.dy + k.oy)*pl.sx + pl.x0 + (cl.x0 - wl.x0)*k.dx + k.ox)*pl.sz + pl.z0;
+
+  for(int ky = 0; ky < k.sy; ++ky, p_neurons += p_dkx)
+  for(int kx = 0; kx < k.sx; ++kx, p_neurons += pl.sz) {
+  }
+
+
+  Neuron *icn = c_neurons + (cl.y0*cl.sx + cl.x0)*cl.sz + cl.z0;
+  Neuron *ipn = p_neurons + ((pl.y0 + (cl.y0 - wl.y0)*k.dy + k.oy + ky)*pl.sx + pl.x0 + (cl.x0 - wl.x0)*k.dx + k.ox + kx)*pl.sz + pl.z0;
+  weights += (ky*k.sx + kx)*p_d;
+  Weight *ew = weights + p_d;
+
+  for(Neuron *e = icn + c_shxz; icn < e; icn += c_dy, ipn += p_dy)
+  for(Neuron *e = icn + c_swz; icn < e; icn += c_dx, ipn += p_dx)
+  for(Neuron *e = icn + c_d; icn < e; ++icn, ipn -= p_d)
+  for(Weight *iw = weights; iw < ew; ++ipn, ++iw)
+    Iter::iter2(*icn, *ipn, *iw);
+}
+
+
+
+
+template<Func func>
+class LayerSub: public Layer {
+public:
+  Layout optLayout;
+  Layout::List mtOptLayouts;
+  std::vector<Neuron*> choosen;
+
+  LayerSub(Layer &prev, const Layout &layout):
+    Layer(&prev, layout),
+    optLayout(optimizeLayoutSimple(layout)),
+    choosen(layout.getActiveCount(), nullptr)
+  { }
+
+
+  void split(int threadsCount) override {
+    Layer::split(threadsCount);
+    optLayout.split(mtOptLayouts, threadsCount);
+  }
+
+
+  void pass(Barrier &barrier) override {
+    Layout cl = mtLayouts[barrier.tid];
+    Layout pl = prev->layout;
+    Layout wl = layout;
+    if (!cl) return;
+
+    assert(pl.getW() == wl.getW()*2);
+    assert(pl.getH() == wl.getH()*2);
+    assert(pl.getD() == wl.getD());
+    assert(cl.isSubLayoutOf(wl));
+
+    int c_h = cl.getH();
+    int c_w = cl.getW();
+    int c_d = cl.getD();
+    int c_sxz = cl.sx*cl.sz;
+    int c_swz = c_w*cl.sz;
+    int c_shxz = c_h*c_sxz;
+    int c_dy = c_sxz - c_swz;
+    int c_dx = cl.sz - c_d;
+
+    int w_d = wl.getD();
+    int w_w = wl.getW();
+    int w_dy = (w_w - c_w)*w_d;
+    int w_dx = w_d - c_d;
+
+    int p_dy = (pl.sx - c_w)*pl.sz*2;
+    int p_dx = pl.sz*2 - c_d;
+
+    int p_i1 = pl.sz;
+    int p_i2 = pl.sx*pl.sz;
+    int p_i3 = p_i1 + p_i2;
+
+    int cx0 = cl.x0 - wl.x0;
+    int cy0 = cl.y0 - wl.y0;
+    int cz0 = cl.z0 - wl.z0;
+
+    Neuron *icn = neurons + (cl.y0*c_sxz + cl.x0*cl.sz + cl.z0);
+    Neuron *ipn = prev->neurons + ((pl.y0 + cy0*2)*pl.sx + pl.x0 + cx0*2)*pl.sz + pl.z0 + cz0;
+    Neuron **icc = choosen.data() + (cy0*w_w + cx0)*w_d + cz0;
+
+    for(Neuron *e = icn + c_shxz; icn < e; icn += c_dy, ipn += p_dy, icc += w_dy)
+    for(Neuron *e = icn + c_swz; icn < e; icn += c_dx, ipn += p_dx, icc += w_dx)
+    for(Neuron *e = icn + c_d; icn < e; ++icn, ++ipn, ++icc) {
+      Neuron *iipn = ipn, *pn = iipn;
+      NeuronReal v = pn->v, d = pn->d;
+      pn->d = 0;
+
+      iipn = ipn + p_i1;
+      if (v < iipn->v) { v = iipn->v; d = iipn->d; pn = iipn; }
+      iipn->d = 0;
+
+      iipn = ipn + p_i2;
+      if (v < iipn->v) { v = iipn->v; d = iipn->d; pn = iipn; }
+      iipn->d = 0;
+
+      iipn = ipn + p_i3;
+      if (v < iipn->v) { v = iipn->v; d = iipn->d; pn = iipn; }
+      iipn->d = 0;
+
+      func(*icn, v);
+      icn->d *= d;
+      *icc = pn;
+    }
+  }
+
+
+  void backpassDeltas(Barrier &barrier) override {
+    Layout cl = mtOptLayouts[barrier.tid];
+    Layout wl = optLayout;
+    if (!cl) return;
+
+    int c_h = cl.getH();
+    int c_w = cl.getW();
+    int c_d = cl.getD();
+    int c_sxz = cl.sx*cl.sz;
+    int c_swz = c_w*cl.sz;
+    int c_shxz = c_h*c_sxz;
+    int c_dy = c_sxz - c_swz;
+    int c_dx = cl.sz - c_d;
+
+    int w_d = wl.getD();
+    int w_w = wl.getW();
+    int w_dy = (w_w - c_w)*w_d;
+    int w_dx = w_d - c_d;
+
+    Neuron *icn = neurons + (cl.y0*c_sxz + cl.x0*cl.sz + cl.z0);
+    Neuron **icc = choosen.data() + ((cl.y0 - wl.y0)*w_w + cl.x0 - wl.x0)*w_d + cl.z0 - wl.z0;
+
+    for(Neuron *e = icn + c_shxz; icn < e; icn += c_dy, icc += w_dy)
+    for(Neuron *e = icn + c_swz; icn < e; icn += c_dx, icc += w_dx)
+    for(Neuron *e = icn + c_d; icn < e; ++icn, ++icc) {
+      assert(*icc);
+      (*icc)->d = icn->d;
+    }
+  }
+
+
+  void testPass() override {
+    Layout cl = layout;
+    Layout pl = prev->layout;
+
+    assert(pl.getW() == cl.getW()*2);
+    assert(pl.getH() == cl.getH()*2);
+    assert(pl.getD() == cl.getD());
+
+    for(int cy = cl.y0; cy < cl.y1; ++cy)
+    for(int cx = cl.x0; cx < cl.x1; ++cx)
+    for(int cz = cl.z0; cz < cl.z1; ++cz) {
+      int ci = (cy*cl.sx + cx)*cl.sz + cz;
+      Neuron &cn = neurons[ci];
+
+      Neuron *c = nullptr;
+      NeuronReal v = 0, d = 0;
+
+      for(int ky = 0; ky < 2; ++ky)
+      for(int kx = 0; kx < 2; ++kx) {
+        int px = pl.x0 + (cx - cl.x0)*2 + kx;
+        int py = pl.y0 + (cy - cl.y0)*2 + ky;
+        int pz = pl.z0 + cz - cl.z0;
+
+        Neuron &pn = prev->neurons[ (py*pl.sx + px)*pl.sz + pz ];
+        if (!c || v < pn.v) { v = pn.v; d = pn.d; c = &pn; }
+        pn.d = 0;
+      }
+
+      assert(c);
+      c->d = d;
+      func(cn, v);
+    }
+  }
+
+
+  void testBackpass() override {
+    Layout cl = layout;
+    Layout pl = prev->layout;
+
+    assert(pl.getW() == cl.getW()*2);
+    assert(pl.getH() == cl.getH()*2);
+    assert(pl.getD() == cl.getD());
+
+    for(int cy = cl.y0; cy < cl.y1; ++cy)
+    for(int cx = cl.x0; cx < cl.x1; ++cx)
+    for(int cz = cl.z0; cz < cl.z1; ++cz) {
+      int ci = (cy*cl.sx + cx)*cl.sz + cz;
+      Neuron &cn = neurons[ci];
+
+      for(int ky = 0; ky < 2; ++ky)
+      for(int kx = 0; kx < 2; ++kx) {
+        int px = pl.x0 + (cx - cl.x0)*2 + kx;
+        int py = pl.y0 + (cy - cl.y0)*2 + ky;
+        int pz = pl.z0 + cz - cl.z0;
+
+        Neuron &pn = prev->neurons[ (py*pl.sx + px)*pl.sz + pz ];
+        pn.d *= cn.d;
+      }
+    }
+  }
+};
+
+
+#endif
diff --git a/projects/neural/layer.inc.cpp b/projects/neural/layer.inc.cpp
index 0d6447a..fe357de 100644
--- a/projects/neural/layer.inc.cpp
+++ b/projects/neural/layer.inc.cpp
@@ -2,118 +2,57 @@
 #define LAYER_INC_CPP
 
 
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
+#include "common.inc.cpp"
 
-#include <atomic>
-#include <string>
-#include <thread>
-#include <vector>
 
-#include "layout.inc.cpp"
-
-
-
-typedef double WeightReal;
-typedef double NeuronReal;
-typedef double AccumReal;
-
-typedef int WeightInt;
-typedef int AccumInt;
-
-
-#define RANDOM_MAX 0x7fffffff
-inline unsigned int randomNext(unsigned int prev)
-  { return (1103515245*prev + 12345) & RANDOM_MAX; }
-inline unsigned int randomBranch(unsigned int seed)
-  { return randomNext(seed + 1); }
-
-inline void busyloop(unsigned int count)
-  { while(count--) __asm__ __volatile__(""); }
-
-
-struct Accum {
-  union { AccumReal v; AccumInt i; };
-};
-
-
-struct Neuron {
-  NeuronReal v, d;
-  Accum a;
-};
-
-
-struct Weight {
-  union { WeightReal w; WeightInt i; };
-};
-
-
-struct Iter {
-  typedef Accum AccumType;
-  typedef NeuronReal* DataType;
-  typedef AccumType DataAccumType;
-  static inline void init(Neuron&, AccumType&) { }
-  static inline void iter(Neuron&, Weight&, AccumType&) { }
-  static inline void done(Neuron&, AccumType&) { }
-  static inline void iter2(Neuron&, Neuron&, Weight&) { }
-  static inline void iter3(Neuron&) { }
-  static inline void iter4(Neuron&, DataType, DataAccumType&) { }
-};
-
-
-class Barrier {
-private:
-  std::atomic<unsigned int> &counter;
-  unsigned int next;
-  unsigned int busyseed;
+class WeightHolder {
 public:
-  const unsigned int tid;
-  const unsigned int threads;
-  unsigned int seed;
+  const int weightsCount;
+  Weight *weights;
 
-  Barrier(const Barrier&) = delete;
-  inline Barrier(std::atomic<unsigned int> &counter, unsigned int tid, unsigned int threads, unsigned int seed):
-    counter(counter), next(), busyseed(randomBranch(seed)), tid(tid), threads(threads), seed(seed) { assert(tid < threads); }
-
-  //inline void busyloop() { }
-  inline void busyloop(unsigned int maxCycles = 4096) { ::busyloop( (busyseed = randomNext(busyseed))%maxCycles ); }
-  inline unsigned int rand() { return seed = randomNext(seed); }
-  inline void wait() { next += threads; ++counter; while(counter < next) busyloop(); }
-  inline void subwait() { while(counter < next + tid) busyloop(); }
-
-};
-
-
-struct Stat {
-  int neurons;
-  int activeNeurons;
-  int weights;
-  int links;
-  size_t memsize;
+  const char *filename;
+
+  explicit WeightHolder(int weightsCount = 0, Weight *weights = nullptr):
+    weightsCount(weightsCount), weights(weights), filename()
+  { assert(weightsCount >= 0); }
+
+
+  virtual ~WeightHolder() { }
+
+
+  bool save(bool demoOnly = false) {
+    if (filename && weightsCount && !demoOnly) {
+      FILE *f = fopen(filename, "wb");
+      if (!f)
+        return printf("cannot open file for write: %s\n", filename), false;
+      if (!fwrite(weights, sizeof(*weights)*weightsCount, 1, f))
+        return fclose(f), printf("cannot write to file: %s\n", filename), false;
+      fclose(f);
+    }
+    return saveDemo();
+  }
 
-  Stat(): neurons(), activeNeurons(), weights(), links(), memsize() { }
 
+  bool load() {
+    if (filename && weightsCount) {
+      FILE *f = fopen(filename, "rb");
+      if (!f)
+        return printf("cannot open file for read: %s\n", filename), false;
+      if (!fread(weights, sizeof(*weights)*weightsCount, 1, f))
+        return fclose(f), printf("cannot read from file: %s\n", filename), false;
+      fclose(f);
+    }
+    return true;
+  }
+
-  Stat& operator+= (const Stat &b) {
-    neurons += b.neurons;
-    activeNeurons += b.activeNeurons;
-    weights += b.weights;
-    links += b.links;
-    memsize += b.memsize;
-    return *this;
-  }
 
-  void print(const char *prefix = nullptr) const {
-    if (prefix && *prefix) printf("%s: ", prefix);
-    printf("neurons: %d / %d, links %d / %d, memSize: %llu\n", activeNeurons, neurons, weights, links, (unsigned long long)memsize);
-  }
+  virtual bool saveDemo() { return true; }
 };
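Editor's note: WeightHolder now owns the raw fwrite/fread snapshot logic that used to live in Layer::save/load, and the file format is nothing more than the Weight array written verbatim. A round trip therefore looks like this (a sketch; the .wgt path is invented):

    // Sketch: snapshotting weights through WeightHolder.
    WeightHolder holder(count, weightsPtr);   // count, weightsPtr: caller-owned storage
    holder.filename = "layer0.wgt";           // hypothetical path
    holder.save();   // writes count*sizeof(Weight) bytes, then calls saveDemo()
    holder.load();   // reads them back verbatim

Because Weight is a union of WeightReal and WeightInt, the snapshot is architecture-dependent (endianness and sizeof(double)), which is fine for the local experiments this project targets.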
 
 
-class Layer {
+
+class Layer: public WeightHolder {
 public:
   Layer *prev, *next;
 
@@ -122,11 +61,9 @@ public:
   Neuron *neurons;
   int neuronsCount;
 
-  Weight *weights;
-  int weightsCount;
   bool ownWeights;
 
-  const char *filename;
+  bool skipTrain;
 
   Stat stat;
 
@@ -135,15 +72,14 @@ public:
 
 
   Layer(Layer *prev, const Layout &layout, int weightsCount = 0, Weight *weights = nullptr):
+    WeightHolder(weightsCount, weights),
     prev(prev ? &prev->back() : nullptr),
     next(),
     layout(layout),
     neurons(),
     neuronsCount(layout.getCount()),
-    weights(weights),
-    weightsCount(weightsCount),
     ownWeights(!weights && weightsCount),
-    filename()
+    skipTrain()
   {
     assert(layout);
     assert(neuronsCount > 0);
@@ -169,7 +105,7 @@ public:
   }
 
 
-  virtual ~Layer() {
+  ~Layer() {
     if (next) delete next;
     if (neurons) delete[] neurons;
     if (ownWeights) delete[] weights;
@@ -183,32 +119,12 @@ public:
   }
 
 
   inline Stat sumStat() const
     { Stat s; for(const Layer *l = this; l; l = l->next) s += l->stat; return s; }
 
-  bool save() const {
-    if (filename && weightsCount) {
-      FILE *f = fopen(filename, "wb");
-      if (!f)
-        return printf("cannot open file for write: %s\n", filename), false;
-      if (!fwrite(weights, sizeof(*weights)*weightsCount, 1, f))
-        return fclose(f), printf("cannot write to file: %s\n", filename), false;
-      fclose(f);
-    }
-    return !next || next->save();
-  }
-
-
-  bool load() {
-    if (filename && weightsCount) {
-      FILE *f = fopen(filename, "rb");
-      if (!f)
-        return printf("cannot open file for read: %s\n", filename), false;
-      if (!fread(weights, sizeof(*weights)*weightsCount, 1, f))
-        return fclose(f), printf("cannot read from file: %s\n", filename), false;
-      fclose(f);
-    }
-    return !next || next->load();
-  }
+  bool save(bool demoOnly = false)
+    { return WeightHolder::save(demoOnly) && (!next || next->save(demoOnly)); }
+  bool load()
+    { return WeightHolder::load() && (!next || next->load()); }
 
 
   void clearAccum() {
     Accum a = {};
     for(Neuron *in = neurons, *e = in + neuronsCount; in < e; ++in)
@@ -221,8 +137,8 @@ public:
     for(Weight *iw = weights, *e = iw + weightsCount; iw < e; ++iw)
       iw->w = rand()*k + wmin;
   }
 
 
   virtual void split(int threadsCount) {
     layout.split(mtLayouts, threadsCount);
     if (prev) prev->layout.split(mtPrevLayouts, threadsCount);
@@ -235,10 +151,33 @@ public:
   virtual void testBackpass() { }
 
 
-  virtual void clGetThreadsData(std::vector &data) { }
-  virtual void clGetPassProgram(std::string &text) { }
-  virtual void clGetBackpassWeightsProgram(std::string &text) { }
-  virtual void clGetBackpassDeltasProgram(std::string &text) { }
+  void passFull(const Layer *last = nullptr, int threadsCount = 1) {
+    struct H {
+      Layer &layer;
+      const Layer *last;
+      std::atomic<unsigned int> barrierCounter;
+      std::vector<std::thread*> threads;
+
+      H(Layer &layer, const Layer *last, int threadsCount): layer(layer), last(last), barrierCounter(0), threads(threadsCount) { }
+
+      void func(int tid, unsigned int seed) {
+        Barrier barrier(barrierCounter, tid, threads.size(), seed);
+        for(Layer *l = layer.next; l; l = l->next) {
+          l->pass(barrier);
+          if (l == last || !l->next) break;
+          barrier.wait();
+        }
+      }
+    } h(*this, last, threadsCount);
+
+    for(Layer *l = this; l; l = l->next)
+      l->split(threadsCount);
+    for(int i = 1; i < threadsCount; ++i)
+      h.threads[i] = new std::thread(&H::func, &h, i, rand());
+    h.func(0, rand());
+    for(int i = 1; i < threadsCount; ++i)
+      { h.threads[i]->join(); delete h.threads[i]; }
+  }
 };
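Editor's note: passFull is the new convenience driver. It splits every layer for threadsCount workers, runs the forward pass layer by layer with a Barrier::wait() between layers, and joins the helper threads. A hypothetical usage, with layer types and sizes invented for the example:

    // Sketch: forward pass over a small chain with 4 worker threads.
    Layer input(nullptr, Layout(28, 28, 1));
    new LayerSimple<funcReLU>(input, Layout(10, 10, 8));  // links itself into the chain
    // ... fill input.neurons[..].v with the sample ...
    input.passFull(nullptr, 4);   // runs every layer after the input

Note that pass() itself never calls wait(); the inter-layer barrier lives in the driver, so a custom training loop can interleave extra per-layer work at the same points.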
diff --git a/projects/neural/layer.simple.inc.cpp b/projects/neural/layer.simple.inc.cpp
index 9b4d28f..847d1be 100644
--- a/projects/neural/layer.simple.inc.cpp
+++ b/projects/neural/layer.simple.inc.cpp
@@ -3,25 +3,9 @@
 
 
 #include "layer.inc.cpp"
+#include "func.inc.cpp"
 
 
-typedef void Func(Neuron &n, AccumReal s);
-
-
-inline void funcSigmoidExp(Neuron &n, AccumReal s) {
-  //if (s > 5) s = 5; else if (s < -5) s = -5;
-  AccumReal ss = 1/(1 + std::exp(-s)); n.v = ss; n.d = ss * (1-ss);
-}
-
-
-inline void funcSigmoidExp2(Neuron &n, AccumReal s) {
-  //if (s > 5) s = 5; else if (s < -5) s = -5;
-  AccumReal ss = 1/(1 + std::exp(-s)); n.v = ss; n.d = 0; //ss * (1-ss) * 0.1;
-}
-
-
-inline void funcReLU(Neuron &n, AccumReal s)
-  { n.v = s > 0 ? s : 0; n.d = s > 0; }
 
 
 
@@ -218,7 +202,7 @@ public:
     optLayout(optimizeLayoutSimple(layout)),
     prevOptLayout(optimizeLayoutSimple(this->prev->layout))
   {
-    if (ownWeights) fillWeights(-1, 1);
+    if (ownWeights) fillWeights(-0.5, 0.5);
   }
 
diff --git a/projects/neural/layer.simple.test.inc.cpp b/projects/neural/layer.simple.test.inc.cpp
index fc6add5..e025591 100644
--- a/projects/neural/layer.simple.test.inc.cpp
+++ b/projects/neural/layer.simple.test.inc.cpp
@@ -7,7 +7,7 @@
 #include "layer.sub.inc.cpp"
 
 
-class SimpleTest: public Test {
+class SimpleTest: public LayerTest {
 public:
   static void init(const Layout &cl, const Layout &pl = Layout())
     { Test::init(cl.getCount(), pl.getCount(), cl.getActiveCount()*pl.getActiveCount()); }
@@ -173,7 +173,7 @@ public:
   {
     Layer l(nullptr, pl);
     new LayerSimple<funcSigmoidExp>(l, cl);
-    Test::testLayer("LayerSimple", l);
+    testLayer("LayerSimple", l);
   }
 
   {
@@ -184,7 +184,7 @@ public:
 
     Layer l(nullptr, ppl);
     new LayerSub<funcSigmoidExp>(l, cl);
-    Test::testLayer("LayerSub", l);
+    testLayer("LayerSub", l);
   }
 
   return st;
diff --git a/projects/neural/layer.test.inc.cpp b/projects/neural/layer.test.inc.cpp
index ec5a7bf..a12c154 100644
--- a/projects/neural/layer.test.inc.cpp
+++ b/projects/neural/layer.test.inc.cpp
@@ -2,132 +2,12 @@
 #define LAYER_TEST_INC_CPP
 
 
-#include <vector>
+#include "test.inc.cpp"
 
-#include "layer.inc.cpp"
-
 
-class Test {
+class LayerTest: public Test {
 public:
-  class Stage {
-  public:
-    const int errors;
-    inline explicit Stage(const char *name): errors(Test::errors) {
-      for(int i = 0; i < level; ++i) printf("- ");
-      printf("%s\n", name);
-      fflush(stdout);
-      ++level;
-    }
-    inline ~Stage() {
-      --level;
-      if (!*this) {
-        for(int i = 0; i < level; ++i) printf("- ");
-        printf("FAILED\n");
-      }
-      fflush(stdout);
-    }
-    operator bool() { return Test::errors == errors; }
-  };
-
-private:
-  static int level;
-
-protected:
-  static std::vector<Neuron> c_neurons;
-  static std::vector<Neuron> p_neurons;
-  static std::vector<Weight> weights;
-
-public:
-  static int errors;
-
-
-  static void init(int c_count, int p_count, int w_count) {
-    Neuron n = {};
-    Weight w = {};
-
-    c_neurons.clear();
-    p_neurons.clear();
-    weights.clear();
-    c_neurons.resize(c_count, n);
-    p_neurons.resize(p_count, n);
-    weights.resize(w_count, w);
-  }
-
-
-  static bool verifyNeurons(const char *name, const Layout &l, const Neuron *neurons, bool ignorePadded = false) {
-    Stage st(name);
-    for(int y = 0; y < l.sy; ++y)
-    for(int x = 0; x < l.sx; ++x)
-    for(int z = 0; z < l.sz; ++z) {
-      int n = neurons[ (y*l.sx + x)*l.sz + z ].a.i;
-      int i = x >= l.x0 && x < l.x1
-           && y >= l.y0 && y < l.y1
-           && z >= l.z0 && z < l.z1;
-      if (ignorePadded ? i && n != i : n != i) {
-        printf(
-          "wrong neuron mark %d, expected %d (%d, %d, %d)\n",
-          n, i, y, x, z );
-        l.printYXZ("layout");
-        ++errors;
-        return st;
-      }
-    }
-    return st;
-  }
-
-
-  static bool verifyNeuronIndices(const char *name, const Layout &l, const Neuron *neurons, int base = 1, int stride = 1) {
-    Stage st(name);
-    for(int y = 0; y < l.sy; ++y)
-    for(int x = 0; x < l.sx; ++x)
-    for(int z = 0; z < l.sz; ++z) {
-      bool active = x >= l.x0 && x < l.x1
-                 && y >= l.y0 && y < l.y1
-                 && z >= l.z0 && z < l.z1;
-
-      int n = neurons[ (y*l.sx + x)*l.sz + z ].a.i;
-      int i = (((y - l.y0)*l.getW() + x - l.x0)*l.getD() + z - l.z0)*stride + base;
-
-      if (!active) i = 0;
-
-      if (n != i) {
-        printf(
-          "wrong neuron mark %d, expected %d (%d, %d, %d)\n",
-          n, i, y, x, z );
-        l.printYXZ("layout");
-        ++errors;
-        return st;
-      }
-    }
-    return st;
-  }
-
-
-  static bool verifyNeuronsAccum(const Layout &l, Neuron *neurons, int accum = 1, bool ignoreBounds = false) {
-    for(int y = 0; y < l.sy; ++y)
-    for(int x = 0; x < l.sx; ++x)
-    for(int z = 0; z < l.sz; ++z) {
-      Neuron &n = neurons[ (y*l.sx + x)*l.sz + z ];
-      int i = ( x >= l.x0 && x < l.x1
-             && y >= l.y0 && y < l.y1
-             && z >= l.z0 && z < l.z1 )*accum;
-      if (ignoreBounds) i = accum;
-      if (n.v != 0 && n.v != i) {
-        printf(
-          "wrong neuron mark %g, expected 0 or %d (%d, %d, %d)\n",
-          n.v, i, y, x, z );
-        l.printYXZ("layout");
-        ++errors;
-        return false;
-      }
-      if (n.v) n.a.i = 1;
-      n.v = 0;
-    }
-    return true;
-  }
-
-
   static bool testLayer(const char *name, Layer &l) {
     Stage st(name);
@@ -261,12 +141,4 @@ public:
 };
 
 
-int Test::level = 0;
-std::vector<Neuron> Test::c_neurons;
-std::vector<Neuron> Test::p_neurons;
-std::vector<Weight> Test::weights;
-int Test::errors = 0;
-
-
-
 #endif
diff --git a/projects/neural/layout.inc.cpp b/projects/neural/layout.inc.cpp
index 83a78a0..7f02b03 100644
--- a/projects/neural/layout.inc.cpp
+++ b/projects/neural/layout.inc.cpp
@@ -45,6 +45,10 @@ struct Layout {
   inline Layout& padXY (int p) { return padXY (p, p); }
   inline Layout& padXYZ(int p) { return padXYZ(p, p); }
 
+  inline bool hasPadX() const { return x0 > 0 || x1 < sx; }
+  inline bool hasPadY() const { return y0 > 0 || y1 < sy; }
+  inline bool hasPadZ() const { return z0 > 0 || z1 < sz; }
+  inline bool hasPad() const { return hasPadX() || hasPadY() || hasPadZ(); }
 
   inline int getW() const { return x1 - x0; }
   inline int getH() const { return y1 - y0; }
diff --git a/projects/neural/segment.cx4.inc.cpp b/projects/neural/segment.cx4.inc.cpp
new file mode 100644
index 0000000..61234ba
--- /dev/null
+++ b/projects/neural/segment.cx4.inc.cpp
@@ -0,0 +1,292 @@
+#ifndef SEGMENT_CX4_INC_CPP
+#define SEGMENT_CX4_INC_CPP
+
+
+#include "segment.inc.cpp"
+#include "func.inc.cpp"
+#include "layer.conv.inc.cpp"
+
+
+class SegmentCx4: public Segment {
+public:
+  enum {
+    KSX = 4,
+    KSY = 4,
+    SX = 12,
+    SY = 12,
+    MSX = 5,
+    MSY = 5,
+  };
+
+  const int msx, msy, msz;
+
+  Neuron *m_neurons;
+  Neuron *b_neurons;
+
+  SegmentCx4(int sz, int msz, Weight *weights = nullptr):
+    Segment(SX, SY, sz, msz*KSY*KSX*sz, weights), msx(MSX), msy(MSY), msz(msz)
+  {
+    m_neurons = new Neuron[msx*msy*msz + sx*sy*sz];
+    b_neurons = m_neurons + msx*msy*msz;
+    clear();
+  }
+  ~SegmentCx4()
+    { delete[] m_neurons; }
+
+
+  void clear() override
+    { memset(m_neurons, 0, sizeof(*m_neurons)*(msx*msy*msz + sx*sy*sz)); }
+
+
+  inline void check(int x, int y, int z) {
+    Segment::check(x, y, z);
+    assert(layout.getD() == sz);
+  }
+
+
+
+  Quality pass(Barrier &barrier, int x, int y, int z, NeuronReal trainRatio) override {
+    check(x, y, z);
+
+    Layout l = layout;
+    const int ksx = 4, ksy = 4;
+    int tid = barrier.tid;
+    int threads = barrier.threads;
+
+    int sx = this->sx;
+    int sy = this->sy;
+    int sz = this->sz;
+    int msx = this->msx;
+    int msy = this->msy;
+    int msz = this->msz;
+
+    int ksxyz = ksx*ksy*sz;
+    int fv_dkx = l.sz - sz;
+    int fv_dky = (l.sx - ksx)*l.sz;
+
+    NeuronReal *f_values = this->f_values + (y*l.sx + x)*l.sz + z;
+
+    // stage 1: pass from front to mid
+
+    Weight *iw = weights + tid*ksxyz;
+    Neuron *imn = m_neurons + tid;
+    NeuronReal *ifv = f_values;
+
+    for(int mz = tid; mz < msz; mz += threads, iw += threads*ksxyz, imn += threads - msx*msy*msz, ifv = f_values)
+    for(int my = 0; my < MSY; ++my, ifv += 2*(l.sx - MSX)*l.sz)
+    for(int mx = 0; mx < MSX; ++mx, imn += msz, ifv += 2*l.sz) {
+      AccumReal a = 0;
+
+      Weight *iiw = iw;
+      NeuronReal *iifv = ifv;
+
+      for(int ky = 0; ky < KSY; ++ky, iifv += fv_dky)
+      for(int kx = 0; kx < KSX; ++kx, iifv += fv_dkx)
+      for(Weight *e = iiw + sz; iiw < e; ++iiw, ++iifv)
+        a += *iifv * iiw->w;
+
+      if (a > 0) imn->v = a, imn->d = 1; else imn->v = imn->d = 0;
+    }
+
+    barrier.wait();
+
+    // stage 2: pass from mid to back and verify
+
+    AccumReal qa = 0;
+    for(int by = 2 + tid; by < 10; by += threads)
+    for(int bx = 2; bx < 10; ++bx)
+    for(int bz = 0; bz < sz; ++bz) {
+      AccumReal a = 0;
+      Neuron &bn = b_neurons[ (by*sx + bx)*sz + bz ];
+
+      for(int ky = by%2; ky < ksy; ky += 2)
+      for(int kx = bx%2; kx < ksx; kx += 2) {
+        int mx = (bx - kx)/2;
+        int my = (by - ky)/2;
+        assert(mx >= 0 && mx < msx && (bx - kx)%2 == 0);
+        assert(my >= 0 && my < msy && (by - ky)%2 == 0);
+        for(int mz = 0; mz < msz; ++mz) {
+          Neuron &mn = m_neurons[ (my*msx + mx)*msz + mz ];
+          Weight &w = weights[ ((mz*ksy + ky)*ksx + kx)*sz + bz ];
+          a += mn.v * w.w;
+        }
+      }
+
+      if (a > 0) bn.v = a, bn.d = 1; else bn.v = bn.d = 0;
+
+      NeuronReal fn = f_values[ (by*l.sx + bx)*l.sz + bz ];
+      NeuronReal d = fn - bn.v;
+      bn.d *= d*trainRatio;
+      qa += d*d;
+    }
+    Quality q(qa/(64*sz));
+
+    if (trainRatio <= 0) return q;
+
+    barrier.wait();
+
+    // stage 3: backpass deltas
+
+    for(int mz = tid; mz < msz; mz += threads)
+    for(int my = 1; my < 4; ++my)
+    for(int mx = 1; mx < 4; ++mx) {
+      AccumReal a = 0;
+      Neuron &mn = m_neurons[ (my*msx + mx)*msz + mz ];
+
+      for(int ky = 0; ky < ksy; ++ky)
+      for(int kx = 0; kx < ksx; ++kx)
+      for(int kz = 0; kz < sz; ++kz) {
+        int bx = mx*2 + kx;
+        int by = my*2 + ky;
+        Neuron &bn = b_neurons[ (by*sx + bx)*sz + kz ];
+        Weight &w = weights[ ((mz*ksy + ky)*ksx + kx)*sz + kz ];
+        a += bn.d * w.w;
+      }
+      mn.d *= a;
+    }
+
+    barrier.wait();
+
+    // stage 4: update weights
+
+    for(int mz = tid; mz < msz; mz += threads)
+    for(int by = 4; by < 8; ++by)
+    for(int bx = 4; bx < 8; ++bx)
+    for(int bz = 0; bz < sz; ++bz) {
+      Neuron &bn = b_neurons[ (by*sx + bx)*sz + bz ];
+      NeuronReal fv = f_values[ (by*l.sx + bx)*l.sz + bz ];
+
+      for(int ky = by%2; ky < ksy; ky += 2)
+      for(int kx = bx%2; kx < ksx; kx += 2) {
+        int mx = (bx - kx)/2;
+        int my = (by - ky)/2;
+        assert(mx >= 1 && mx < 4 && (bx - kx)%2 == 0);
+        assert(my >= 1 && my < 4 && (by - ky)%2 == 0);
+        Neuron &mn = m_neurons[ (my*msx + mx)*msz + mz ];
+        Weight &w = weights[ ((mz*ksy + ky)*ksx + kx)*sz + bz ];
+        w.w += bn.d*mn.v + mn.d*fv;
+      }
+    }
+
+    return q;
+  }
+
+
+
+  Quality testPass(int x, int y, int z, NeuronReal trainRatio) override {
+    check(x, y, z);
+
+    Layout l = layout;
+    const int ksx = 4, ksy = 4;
+
+    // stage 1: pass
+
+    clear();
+
+    for(int my = 0; my < msy; ++my)
+    for(int mx = 0; mx < msx; ++mx)
+    for(int mz = 0; mz < msz; ++mz) {
+      AccumReal a = 0;
+      Neuron &mn = m_neurons[ (my*msx + mx)*msz + mz ];
+
+      for(int ky = 0; ky < ksy; ++ky)
+      for(int kx = 0; kx < ksx; ++kx)
+      for(int kz = 0; kz < sz; ++kz) {
+        int fx = x + mx*2 + kx;
+        int fy = y + my*2 + ky;
+        int fz = z + kz;
+        NeuronReal fv = f_values[ (fy*l.sx + fx)*l.sz + fz ];
+        Weight &w = weights[ ((mz*ksy + ky)*ksx + kx)*sz + kz ];
+        a += fv * w.w;
+      }
+
+      if (a <= 0) { mn.v = mn.d = 0; continue; }   // <= matches the a > 0 test in pass()
+      mn.v = a; mn.d = 1;
+
+      for(int ky = 0; ky < ksy; ++ky)
+      for(int kx = 0; kx < ksx; ++kx)
+      for(int kz = 0; kz < sz; ++kz) {
+        int bx = mx*2 + kx;
+        int by = my*2 + ky;
+        int bz = kz;
+        Neuron &bn = b_neurons[ (by*sx + bx)*sz + bz ];
+        Weight &w = weights[ ((mz*ksy + ky)*ksx + kx)*sz + kz ];
+        bn.a.v += a * w.w;
+      }
+    }
+
+    // stage 2: finalize values and verify
+
+    AccumReal qa = 0;
+    for(int by = 2; by < 10; ++by)
+    for(int bx = 2; bx < 10; ++bx)
+    for(int bz = 0; bz < sz; ++bz) {
+      Neuron &bn = b_neurons[ (by*sx + bx)*sz + bz ];
+      if (bn.a.v > 0) bn.v = bn.a.v, bn.d = 1; else bn.v = bn.d = 0;
+      bn.a.v = 0;
+
+      NeuronReal fn = f_values[ ((y + by)*l.sx + x + bx)*l.sz + z + bz ];
+      NeuronReal d = fn - bn.v;
+      bn.d *= d*trainRatio;
+      qa += d*d;
+    }
+    Quality q(qa/(64*sz));
+
+    if (trainRatio <= 0) return q;
+
+    // stage 3: backpass deltas
+
+    for(int my = 0; my < msy; ++my)
+    for(int mx = 0; mx < msx; ++mx)
+    for(int mz = 0; mz < msz; ++mz) {
+      AccumReal a = 0;
+      Neuron &mn = m_neurons[ (my*msx + mx)*msz + mz ];
+
+      for(int ky = 0; ky < ksy; ++ky)
+      for(int kx = 0; kx < ksx; ++kx)
+      for(int kz = 0; kz < sz; ++kz) {
+        int bx = mx*2 + kx;
+        int by = my*2 + ky;
+        int bz = kz;
+        Neuron &bn = b_neurons[ (by*sx + bx)*sz + bz ];
+        Weight &w = weights[ ((mz*ksy + ky)*ksx + kx)*sz + kz ];
+        a += bn.d * w.w;
+      }
+      mn.d *= a;
+    }
+
+    // stage 4: update weights
+
+    for(int by = 4; by < 8; ++by)
+    for(int bx = 4; bx < 8; ++bx)
+    for(int bz = 0; bz < sz; ++bz) {
+      Neuron &bn = b_neurons[ (by*sx + bx)*sz + bz ];
+      NeuronReal fv = f_values[ ((y + by)*l.sx + x + bx)*l.sz + z + bz ];
+
+      for(int ky = by%2; ky < ksy; ky += 2)
+      for(int kx = bx%2; kx < ksx; kx += 2)
+      for(int mz = 0; mz < msz; ++mz) {
+        int mx = (bx - kx)/2;
+        int my = (by - ky)/2;
+        assert(mx >= 1 && mx < 4 && (bx - kx)%2 == 0);
+        assert(my >= 1 && my < 4 && (by - ky)%2 == 0);
+        Neuron &mn = m_neurons[ (my*msx + mx)*msz + mz ];
+        Weight &w = weights[ ((mz*ksy + ky)*ksx + kx)*sz + bz ];
+        w.w += bn.d*mn.v + mn.d*fv;
+      }
+    }
+
+    return q;
+  }
+
+
+  bool saveDemo() override
+    { return !filename || saveConvDemoImage(filename, msz, 4, 4, sz, weights); }
+};
+
+
+
+
+#endif
+
diff --git a/projects/neural/segment.cx4.test.inc.cpp b/projects/neural/segment.cx4.test.inc.cpp
new file mode 100644
index 0000000..6143900
--- /dev/null
+++ b/projects/neural/segment.cx4.test.inc.cpp
@@ -0,0 +1,32 @@
+#ifndef SEGMENT_CX4_TEST_INC_CPP
+#define SEGMENT_CX4_TEST_INC_CPP
+
+
+#include "segment.test.inc.cpp"
+#include "segment.cx4.inc.cpp"
+
+
+class Cx4Test: public SegmentTest {
+public:
+  static bool test(const char *name, const Layout &l, int msz, int x, int y, int z) {
+    Stage st(name);
+
+    {
+      SegmentCx4 s(l.getD(), msz);
+      testSegment("SegmentCx4", s, l, x, y, z, 0.001);
+    }
+
+    return st;
+  }
+
+
+  static bool test(const char *name = "cx4") {
+    Stage st(name);
+    test("square-16x3", Layout(16, 16, 3), 5, 1, 2, 0);
+    test("random-rect", Layout(23, 58, 4).expandX(2, 0).expandY(5, 3).expandZ(3, 1), 7, 4, 5, 3);
+    return st;
+  }
+};
+
+
+#endif
diff --git a/projects/neural/segment.inc.cpp b/projects/neural/segment.inc.cpp
new file mode 100644
0000000..ad4e3ea --- /dev/null +++ b/projects/neural/segment.inc.cpp @@ -0,0 +1,42 @@ +#ifndef SEGMENT_INC_CPP +#define SEGMENT_INC_CPP + + +#include "layer.inc.cpp" + + +class Segment: public WeightHolder { +public: + const int sx, sy, sz; + + Layout layout; + NeuronReal *f_values; + + Segment(int sx, int sy, int sz, int weightsCount, Weight *weights = nullptr): + WeightHolder(weightsCount, weights), sx(sx), sy(sy), sz(sz), f_values() { } + + virtual ~Segment() { } + + virtual void clear() { } + virtual void split(int threadsCount) { } + + virtual Quality pass(Barrier &barrier, int x, int y, int z, NeuronReal trainRatio) { return barrier.tid ? Quality() : testPass(x, y, z, trainRatio); } + virtual Quality testPass(int x, int y, int z, NeuronReal trainRatio) { return Quality::bad(); } + + + inline void check(int x, int y, int z) { + #ifndef NDEBUG + Layout l = layout; + assert(l); + assert(f_values); + assert(weights); + assert(l.x0 <= x && x+sx <= l.x1); + assert(l.y0 <= y && y+sy <= l.y1); + assert(l.z0 <= z && z+sz <= l.z1); + #endif + } +}; + + +#endif + diff --git a/projects/neural/segment.test.inc.cpp b/projects/neural/segment.test.inc.cpp new file mode 100644 index 0000000..1b7da72 --- /dev/null +++ b/projects/neural/segment.test.inc.cpp @@ -0,0 +1,109 @@ +#ifndef SEGMENT_TEST_INC_CPP +#define SEGMENT_TEST_INC_CPP + + +#include "test.inc.cpp" +#include "segment.inc.cpp" + + + +class SegmentTest: public Test { +public: + static bool testSegment(const char *name, Segment &segment, Layout l, int x, int y, int z, NeuronReal trainRatio) { + Stage st(name); + + struct H { + Segment &segment; + int x, y, z; + Quality testQ; + NeuronReal ratio; + + std::vector threads; + std::vector qualities; + std::atomic counter; + + H(Segment &segment, int x, int y, int z, NeuronReal ratio): segment(segment), x(x), y(y), z(z), ratio(ratio), counter(0) { } + + void prepareData() + { memcpy(segment.weights, weights.data(), segment.weightsCount*sizeof(Weight)); } + + void func(int tid, unsigned int seed) { + Barrier barrier(counter, tid, threads.size(), seed); + qualities[tid] = segment.pass(barrier, x, y, z, ratio); + } + + bool test(const char *name, int threadsCount) { + Stage st(name); + + assert(threadsCount > 0); + counter = 0; + threads.clear(); + qualities.clear(); + threads.resize(threadsCount, nullptr); + qualities.resize(threadsCount); + + prepareData(); + + segment.split(threadsCount); + for(int i = 1; i < threadsCount; ++i) threads[i] = new std::thread(&H::func, this, i, rand()); + func(0, rand()); + + Quality q = qualities[0]; + for(int i = 1; i < threadsCount; ++i) { threads[i]->join(); delete threads[i]; q += qualities[i]; } + threads.clear(); + + if ( fabs(q.train - testQ.train) > 1e-10 + || fabs(q.human - testQ.human) > 1e-10 ) + { + printf("qualities differs, was %g (%g), expected %g (%g)\n", + q.human, q.train, testQ.human, testQ.train ); + ++errors; + } + + for(int i = 0; i < segment.weightsCount; ++i) { + WeightReal a = segment.weights[i].w; + WeightReal b = weights[i + segment.weightsCount].w; + if (fabs(a - b) > 1e-10) { + printf("weights differs at %d, was %g, expected %g\n", i, a, b); + segment.layout.printYXZ("layout"); + ++errors; break; + } + } + + return st; + } + } h(segment, x, y, z, trainRatio); + + + assert(segment.weightsCount > 0); + + int valuesCount = l.getCount(); + init(0, 0, segment.weightsCount*3, valuesCount); + + for(int i = 0; i < valuesCount; ++i) + values[i] = rand()/(NeuronReal)RAND_MAX; + for(int i = 0; i < segment.weightsCount; ++i) + weights[i].w = 
(WeightReal)(2.0*rand()/RAND_MAX - 1); + + segment.layout = l; + segment.f_values = values.data(); + segment.weights = &weights[segment.weightsCount]; + segment.check(x, y, z); + + h.prepareData(); + h.testQ = segment.testPass(x, y, z, trainRatio); + segment.weights += segment.weightsCount; + + h.test("single-thread", 1); + h.test("single-thread-repeat", 1); + h.test("2-threads", 2); + h.test("7-threads", 7); + h.test("7-threads-repeat", 7); + h.test("8-threads", 8); + + return st; + } +}; + + +#endif diff --git a/projects/neural/test.all.inc.cpp b/projects/neural/test.all.inc.cpp new file mode 100644 index 0000000..227f226 --- /dev/null +++ b/projects/neural/test.all.inc.cpp @@ -0,0 +1,23 @@ +#ifndef TEST_ALL_INC_CPP +#define TEST_ALL_INC_CPP + + + +#include "layer.simple.test.inc.cpp" +#include "layer.conv.test.inc.cpp" +#include "segment.cx4.test.inc.cpp" + + +class AllTest: public Test { +public: + static bool test(const char *name = "all") { + Stage st(name); + //SimpleTest::test(); + //ConvTest::test(); + Cx4Test::test(); + return st; + } +}; + + +#endif diff --git a/projects/neural/test.inc.cpp b/projects/neural/test.inc.cpp new file mode 100644 index 0000000..d8cd8b6 --- /dev/null +++ b/projects/neural/test.inc.cpp @@ -0,0 +1,144 @@ +#ifndef TEST_INC_CPP +#define TEST_INC_CPP + + +#include "common.inc.cpp" + + + +class Test { +public: + class Stage { + public: + const int errors; + inline explicit Stage(const char *name): errors(Test::errors) { + for(int i = 0; i < level; ++i) printf("- "); + printf("%s\n", name); + fflush(stdout); + ++level; + } + inline ~Stage() { + --level; + if (!*this) { + for(int i = 0; i < level; ++i) printf("- "); + printf("FAILED\n"); + } + fflush(stdout); + } + operator bool() { return Test::errors == errors; } + }; + +private: + static int level; + +protected: + static std::vector c_neurons; + static std::vector p_neurons; + static std::vector weights; + static std::vector values; + +public: + static int errors; + + + static void init(int c_count, int p_count, int w_count, int v_count = 0) { + Neuron n = {}; + Weight w = {}; + + c_neurons.clear(); + p_neurons.clear(); + weights.clear(); + values.clear(); + + c_neurons.resize(c_count, n); + p_neurons.resize(p_count, n); + weights.resize(w_count, w); + values.resize(v_count, 0); + } + + + static bool verifyNeurons(const char *name, const Layout &l, const Neuron *neurons, bool ignorePadded = false) { + Stage st(name); + for(int y = 0; y < l.sy; ++y) + for(int x = 0; x < l.sx; ++x) + for(int z = 0; z < l.sz; ++z) { + int n = neurons[ (y*l.sx + x)*l.sz + z ].a.i; + int i = x >= l.x0 && x < l.x1 + && y >= l.y0 && y < l.y1 + && z >= l.z0 && z < l.z1; + if (ignorePadded ? 
i && n != i : n != i) { + printf( + "wrong neuron mark %d, expected %d (%d, %d, %d)\n", + n, i, y, x, z ); + l.printYXZ("layout"); + ++errors; + return st; + } + } + return st; + } + + + static bool verifyNeuronIndices(const char *name, const Layout &l, const Neuron *neurons, int base = 1, int stride = 1) { + Stage st(name); + for(int y = 0; y < l.sy; ++y) + for(int x = 0; x < l.sx; ++x) + for(int z = 0; z < l.sz; ++z) { + bool active = x >= l.x0 && x < l.x1 + && y >= l.y0 && y < l.y1 + && z >= l.z0 && z < l.z1; + + int n = neurons[ (y*l.sx + x)*l.sz + z ].a.i; + int i = (((y - l.y0)*l.getW() + x - l.x0)*l.getD() + z - l.z0)*stride + base; + + if (!active) i = 0; + + if (n != i) { + printf( + "wrong neuron mark %d, expected %d (%d, %d, %d)\n", + n, i, y, x, z ); + l.printYXZ("layout"); + ++errors; + return st; + } + } + return st; + } + + + static bool verifyNeuronsAccum(const Layout &l, Neuron *neurons, int accum = 1, bool ignoreBounds = false) { + for(int y = 0; y < l.sy; ++y) + for(int x = 0; x < l.sx; ++x) + for(int z = 0; z < l.sz; ++z) { + Neuron &n = neurons[ (y*l.sx + x)*l.sz + z ]; + int i = ( x >= l.x0 && x < l.x1 + && y >= l.y0 && y < l.y1 + && z >= l.z0 && z < l.z1 )*accum; + if (ignoreBounds) i = accum; + if (n.v != 0 && n.v != i) { + printf( + "wrong neuron mark %g, expected 0 or %d (%d, %d, %d)\n", + n.v, i, y, x, z ); + l.printYXZ("layout"); + ++errors; + return false; + } + if (n.v) n.a.i = 1; + n.v = 0; + } + return true; + } +}; + + +int Test::level = 0; +std::vector Test::c_neurons; +std::vector Test::p_neurons; +std::vector Test::weights; +std::vector Test::values; +int Test::errors = 0; + + + +#endif + diff --git a/projects/neural/tga.inc.cpp b/projects/neural/tga.inc.cpp new file mode 100644 index 0000000..a013b09 --- /dev/null +++ b/projects/neural/tga.inc.cpp @@ -0,0 +1,61 @@ +#ifndef TGA_INC_CPP +#define TGA_INC_CPP + + +#include + + +bool tgaSave(const char *filename, const unsigned char *data, int w, int h, int ch) { + if (!data || w <= 0 || h <= 0 || w > 0xffff || h > 0xffff || (ch != 3 && ch != 4)) { + printf("ERROR: cannot save image (bad image): %s\n", filename); + return false; + } + + FILE *f = fopen(filename, "wb"); + if (!f) { + printf("ERROR: cannot open file: %s\n", filename); + return false; + } + + #pragma pack(push,1) + struct Header { + unsigned char idLength; + unsigned char colormapType; + unsigned char imageType; + unsigned char colormapIndex[2]; + unsigned char colormapLength[2]; + unsigned char colormapSize; + unsigned char xOrigin[2]; + unsigned char yOrigin[2]; + unsigned char width[2]; + unsigned char height[2]; + unsigned char pixelSize; + unsigned char attributes; + }; + #pragma pack(pop) + Header header = {}; + header.imageType = 2; + header.width[0] = w; + header.width[1] = w >> 8; + header.height[0] = h; + header.height[1] = h >> 8; + header.pixelSize = ch == 4 ? 
32 : 24; + fwrite(&header, sizeof(header), 1, f); + + int rowSize = w*ch; + const unsigned char *row = data + h*rowSize; + for(unsigned short r = h; r; --r, row -= rowSize) { + for(const unsigned char *c = row - rowSize; c < row; c += ch) { + fputc(c[2], f); + fputc(c[1], f); + fputc(c[0], f); + if (ch == 4) fputc(c[3], f); + } + } + fclose(f); + + return true; +} + + +#endif diff --git a/projects/neural/train.cx4.inc.cpp b/projects/neural/train.cx4.inc.cpp new file mode 100644 index 0000000..54831ad --- /dev/null +++ b/projects/neural/train.cx4.inc.cpp @@ -0,0 +1,293 @@ +#ifndef TRAIN_CX4_INC_CPP +#define TRAIN_CX4_INC_CPP + + +#include "train.segment.inc.cpp" +#include "segment.cx4.inc.cpp" +#include "layer.inc.cpp" + + +class TrainerCx4: public TrainerSegment { +protected: + FILE *f; + std::vector data; + std::vector values; + std::vector valuesMeasure; + std::vector tmpdata; + std::vector shuffle; + + Layout trainLayout; + Layout measureLayout; + + size_t imageSize; + size_t preparedImageSize; + int imagesInFile; + int imagesInMemory; + + volatile unsigned int seed; + +public: + Layer *layerFull; + Layer *layerPre; + int loadImagesCount; + int blocksPerLoading; + + const char *infile; + const char *cachefile; + const char *outfile; + + TrainerCx4(): + f(), + imageSize(), + preparedImageSize(), + imagesInFile(), + imagesInMemory(), + seed(), + layerFull(), + layerPre(), + loadImagesCount(), + blocksPerLoading(1), + infile(), + cachefile(), + outfile() { } + +protected: + void preprocess(unsigned char *src, NeuronReal *dst) { + struct IL: public Iter { + typedef const unsigned char* DataType; + static inline void iter4(Neuron &n, DataType d, DataAccumType&) { n.v = *d/(NeuronReal)255; } + }; + struct IS: public Iter { + typedef NeuronReal* DataType; + static inline void iter4(Neuron &n, DataType d, DataAccumType&) { *d = n.v; } + }; + + Layer &fl = *layerFull; + Layer &bl = *layerPre; + + iterateNeurons2(fl.layout, fl.layout, fl.neurons, src); + fl.passFull(&bl, threadsCount); + iterateNeurons2(bl.layout, bl.layout, bl.neurons, dst); + } + + + bool loadImage(int fromIndex, int toIndex) { + unsigned char *src = data.data(); + if (!layerPre) src += toIndex*imageSize; + + fseeko64(f, fromIndex*imageSize, SEEK_SET); + if (!fread(src, imageSize, 1, f)) + return fclose(f), f = nullptr, false; + + if (layerPre) preprocess(src, values.data() + toIndex*preparedImageSize); + + return true; + } + + + bool loadImages() { + for(int i = 0; i < imagesInMemory; ++i) { + int j = rand()%imagesInFile; + if (i != j) std::swap(shuffle[i], shuffle[j]); + } + + typedef std::pair Pair; + typedef std::set Set; + Set set; + for(int i = 0; i < imagesInMemory; ++i) + set.insert(Pair(shuffle[i], i)); + for(Set::iterator i = set.begin(); i != set.end(); ++i) + loadImage(i->first, i->second); + + return true; + } + + + void prepareMeasure() { + if (measuresPerBlock <= 0) return; + int sy = segment->sy; + int sx = segment->sx; + int sz = segment->sz; + int sxz = sx*sz; + int w = (layerPre ? layerPre : layerFull)->layout.getW(); + int h = (layerPre ? 
layerPre : layerFull)->layout.getH(); + int rowstride = w*sz; + NeuronReal *dst = valuesMeasure.data(); + for(int i = 0; i < measuresPerBlock; ++i) { + int index = rand()%imagesInMemory; + int x = rand()%(w - sx + 1); + int y = rand()%(h - sy + 1); + if (layerPre) { + const NeuronReal *src = values.data() + index*preparedImageSize + y*rowstride + x*sz; + for(int j = 0; j < sy; ++j, src += rowstride, dst += sxz) + memcpy(dst, src, sxz*sizeof(*dst)); + } else { + // raw images are packed with stride imageSize (preparedImageSize stays zero when layerPre is unset) + const unsigned char *src = data.data() + index*imageSize + y*rowstride + x*sz; + for(int j = 0; j < sy; ++j, src += rowstride - sxz) + for(int k = 0; k < sxz; ++k, ++src, ++dst) + *dst = *src/(NeuronReal)255; + } + } + } + + + bool prepare() override { + assert(infile); + assert(layerFull); + assert(loadImagesCount > 0); + + Layer &fl = layerFull->front(); + Layer &bl = layerFull->back(); + + imageSize = fl.layout.getActiveCount(); + f = fopen(infile, "rb"); + if (!f) return false; + fseeko64(f, 0, SEEK_END); + imagesInFile = ftello64(f)/imageSize; + if (imagesInFile < 1) return fclose(f), f = nullptr, false; + imagesInMemory = loadImagesCount > imagesInFile ? imagesInFile : loadImagesCount; + + + Layout l = layerPre ? layerPre->layout : layerFull->layout; + assert(l.getW() >= segment->sx); + assert(l.getH() >= segment->sy); + assert(l.getD() == segment->sz); + + measureLayout = Layout(segment->sx, segment->sy, segment->sz); + valuesMeasure.resize(measuresPerBlock * measureLayout.getActiveCount()); + if (layerPre) { + assert(l); + preparedImageSize = layerPre->layout.getActiveCount(); + trainLayout = Layout(l.getW(), l.getH(), l.getD()); + data.resize(imageSize); + values.resize(imagesInMemory * preparedImageSize); + } else { + trainLayout = measureLayout; + data.resize(imagesInMemory * imageSize); + values.resize(segment->sx * segment->sy * segment->sz); + } + + segment->f_values = values.data(); + segment->layout = trainLayout; + tmpdata.resize(bl.layout.getActiveCount()); + if (tmpdata.size() < imageSize) tmpdata.resize(imageSize); + + size_t memsize = data.size()*sizeof(data.front()) + + values.size()*sizeof(values.front()) + + valuesMeasure.size()*sizeof(valuesMeasure.front()) + + tmpdata.size()*sizeof(tmpdata.front()); + printf("allocated size: %lld\n", (long long)(memsize)); + + shuffle.resize(imagesInFile); + for(int i = 0; i < imagesInFile; ++i) + shuffle[i] = i; + + if (!loadImages()) return false; + prepareMeasure(); + return true; + } + + + void finish() override + { if (f) fclose(f), f = nullptr; } + + + bool prepareBlock(int block, bool measureOnly) override { + if (block > 0 && blocksPerLoading > 0 && (block % blocksPerLoading) == 0 && !loadImages()) + return false; + seed = rand(); + return true; + } + + + void finishBlock(int block) override { + if (outfile) { + struct IL: public Iter { + typedef const unsigned char* DataType; + static inline void iter4(Neuron &n, DataType d, DataAccumType&) { n.v = *d/(NeuronReal)255; } + }; + struct IS: public Iter { + typedef unsigned char* DataType; + static inline void iter4(Neuron &n, DataType d, DataAccumType&) { *d = n.v < 0 ? 0 : n.v > 1 ?
255 : (unsigned char)(n.v*255.999); } + }; + + Layer &fl = *layerFull; + Layer &bl = fl.back(); + + std::string outfile0(outfile); + std::string outfile1 = outfile0 + ".1.tga"; + outfile0 += ".0.tga"; + + int index = rand()%imagesInFile; + fseeko64(f, index*imageSize, SEEK_SET); + fread(tmpdata.data(), imageSize, 1, f); + tgaSave(outfile0.c_str(), tmpdata.data(), fl.layout.getW(), fl.layout.getH(), fl.layout.getD()); + + iterateNeurons2(fl.layout, fl.layout, fl.neurons, tmpdata.data()); + fl.passFull(&bl, threadsCount); + + iterateNeurons2(bl.layout, bl.layout, bl.neurons, tmpdata.data()); + tgaSave(outfile1.c_str(), tmpdata.data(), bl.layout.getW(), bl.layout.getH(), bl.layout.getD()); + + segment->saveDemo(); + } + } + + + void loadData(Barrier &barrier, int block, int iter, bool measureOnly) override { + int tid = barrier.tid; + int threads = barrier.threads; + int sx = segment->sx; + int sy = segment->sy; + int sz = segment->sz; + int sxz = sx*sz; + + if (measureOnly) { + if (!tid) { + segment->layout = measureLayout; + segment->f_values = valuesMeasure.data() + iter*sy*sxz; + x = y = z = 0; + } + } else + if (layerPre) { + if (!tid) { + unsigned int s = randomNext(seed & iter); + int index = (s = randomNext(s))%imagesInMemory; + x = (s = randomNext(s)) % (layerPre->layout.getW() - sx + 1); + y = (s = randomNext(s)) % (layerPre->layout.getH() - sy + 1); + z = 0; + segment->layout = trainLayout; + segment->f_values = values.data() + index*preparedImageSize; + } + } else { + int w = layerFull->layout.getW(); + int h = layerFull->layout.getH(); + + unsigned int s = randomNext(seed & iter); + int index = (s = randomNext(s))%imagesInMemory; + int x0 = (s = randomNext(s))%(w - sx + 1); + int y0 = (s = randomNext(s))%(h - sy + 1); + + int rowstride = w*sz; + int dr = rowstride*threads - sxz; + int vdr = sxz*(threads - 1); + + const unsigned char *id0 = data.data() + index*imageSize + y0*rowstride + x0*sz; + const unsigned char *id = id0 + tid*rowstride; + NeuronReal *iv = values.data() + tid*sxz; + + for(const unsigned char *e = id0 + sy*rowstride; id < e; id += dr, iv += vdr) + for(const unsigned char *e = id + sxz; id < e; ++id, ++iv) + *iv = *id/(NeuronReal)255; + + if (!tid) { + segment->layout = trainLayout; + segment->f_values = values.data(); + x = 0, y = 0, z = 0; + } + } + } +}; + + +#endif diff --git a/projects/neural/train.digit.inc.cpp b/projects/neural/train.digit.inc.cpp index 8277f42..f868ca0 100644 --- a/projects/neural/train.digit.inc.cpp +++ b/projects/neural/train.digit.inc.cpp @@ -87,7 +87,7 @@ protected: Quality verifyData(Barrier &barrier, int, int iter) override { - Quality q = {}; + Quality q; if (barrier.tid) return q; struct I: public Iter { diff --git a/projects/neural/train.image.inc.cpp b/projects/neural/train.image.inc.cpp index d7dd9a2..18c5c46 100644 --- a/projects/neural/train.image.inc.cpp +++ b/projects/neural/train.image.inc.cpp @@ -2,6 +2,8 @@ #define TRAIN_IMAGE_INC_CPP +#include + #include "train.inc.cpp" #include "layer.simple.inc.cpp" @@ -9,194 +11,168 @@ class TrainerImage: public Trainer { protected: std::vector data; - std::vector shuffle; - const char *datafile; - const char *outfile; - Layout ofl, obl; - Layout::List oflist, oblist; - int stride, count; + std::vector tmpdata; + std::vector shuffle; + std::vector shuffle2; + Layout pbl; + Layout::List flist, blist; + FILE *f; + size_t imgsize; + int count; + int workCount; public: - TrainerImage(): stride(), count() { } + int pad; + const char *datafile; + const char *outfile; + Layer *dataLayer; + 
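+ // Shuffle scheme used below: 'shuffle' is a permutation of all 'count'
+ // images in 'datafile'; loadBlocks() partially re-shuffles it and loads the
+ // first 'workCount' entries into 'data', walking a sorted set of
+ // (fileIndex, slot) pairs so the reads advance through the file in order.
+ // 'shuffle2' is re-shuffled once per block in prepareBlock() and only
+ // permutes the presentation order of the images already in memory. 'pad'
+ // shrinks the verification layout 'pbl' (see prepare() and verifyData()),
+ // excluding a pad-wide border of the back layer from both the deltas and
+ // the reported quality, which verifyData() returns as an RMS error.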
+ TrainerImage(): f(), imgsize(), count(), workCount(), pad(), datafile(), outfile(), dataLayer() { } - bool configure(const char *datafile, const char *outfile) { - this->datafile = datafile; - this->outfile = outfile; - } +protected: + bool prepare() override { + assert(datafile); + assert(fl->layout.getD() == 3); + + Layer *dl = dataLayer ? dataLayer : fl; + assert(dl->layout.getW() == bl->layout.getW()); + assert(dl->layout.getH() == bl->layout.getH()); + assert(dl->layout.getD() == bl->layout.getD()); + + imgsize = fl->layout.getActiveCount(); + fl->layout.split(flist, threadsCount); + bl->layout.split(blist, threadsCount); + pbl = bl->layout; + pbl.padXY(pad); + + f = fopen(datafile, "rb"); + if (!f) return false; - data.clear(); + fseeko64(f, 0, SEEK_END); + long long size = ftello64(f); + count = size/imgsize; + if (count < 1) return fclose(f), f = nullptr, false; - FILE *f = fopen(filename, "rb"); - if (!f) - return printf("cannot open file for read: %s\n", filename), false; - fseek(f, 0, SEEK_END); - size_t fs = ftello(f); - fseek(f, 0, SEEK_SET); + workCount = itersPerBlock > count ? count : itersPerBlock; + printf("allocated size: %lld\n", (long long)(imgsize*workCount)); + data.resize(workCount*imgsize); - data.resize(fs, 0); - if (!fread(data.data(), fs, 1, f)) - return printf("cannot read from file: %s\n", filename), fclose(f), data.clear(), false; + shuffle.resize(count); + for(int i = 0; i < count; ++i) + shuffle[i] = i; - fclose(f); - return true; + shuffle2.resize(workCount); + for(int i = 0; i < workCount; ++i) + shuffle2[i] = i; + + return loadBlocks(); + //return true; } - -void imgTrain(Layer &l, const char *datafile, int size, const char *outfile, double trainRatio, int count) { - Layer &bl = l.back(); - - assert(!l.prev); - assert(datafile); - assert(count > 0 && size > 0); - assert(l.size == size); - assert(bl.size == size); - - int blockSize = 1000;//1024*1024*1024/size; - assert(blockSize > 0); - - FILE *f = fopen(datafile, "rb"); - if (!f) - { printf("cannot open file: %s\n", datafile); return; } - fseeko64(f, 0, SEEK_END); - long long fsize = ftello64(f); - int xCount = (int)(fsize/size); - if (xCount <= 0) - { printf("no tests in file: %s\n", datafile); return; } - - int *block = new int[blockSize*2]; - int *shuffle = block + blockSize; - double *results = new double[blockSize]; - unsigned char *blockData = new unsigned char[(blockSize + 1)*size]; - unsigned char *blockResData = blockData + blockSize*size; - bool err = false; - - for(int j = 0; j < blockSize; ++j) - { shuffle[j] = j; results[j] = 0; } - - int blocksCount = (count - 1)/blockSize + 1; - - printf("training %d (%d x %d blocks), tests: %d, ratio: %f:\n", blocksCount*blockSize, blocksCount, blockSize, xCount, trainRatio); - - double avgSum = 0; - for(int i = 0; i < blocksCount; ++i) { - for(int j = 0; j < blockSize; ++j) { - block[j] = rand()%xCount; - std::swap(shuffle[i], shuffle[rand()%blockSize]); - } - std::sort(block, block + blockSize); + + void finish() override + { if (f) fclose(f), f = nullptr; } - for(int j = 0; j < blockSize; ++j) { - fseeko64(f, block[j]*(long long)size, SEEK_SET); - if (!fread(blockData + j*size, size, 1, f)) - { printf("cannot read data from file: %s\n", datafile); err = true; break; } + + bool loadBlocks() { + for(int i = 0; i < workCount; ++i) { + int j = rand()%count; + if (i != j) std::swap(shuffle[i], shuffle[j]); } - if (err) break; - - printf(" next data block loaded\n"); - - double sumQ = 0; - for(int j = 0; j < blockSize; ++j) { - unsigned char *data = 
blockData + shuffle[j]*size; - for(double *ia = l.a, *e = ia + l.size; ia < e; ++ia, ++data) - *ia = *data/255.0; - - double firstQ = 0, q = 0; - for(int repeat = 0; repeat < 1; ++repeat) { - l.pass(); - - for(double *ia = l.a, *iba = bl.a, *ibda = bl.da, *e = ia + l.size; ia < e; ++ia, ++iba, ++ibda) { - double d = *ia - *iba; - *ibda = d; - q += d*d; - } - q /= size; - if (!repeat) firstQ = q; - - bl.backpass(trainRatio); - } - - sumQ += firstQ; - avgSum += firstQ - results[j]; - results[j] = firstQ; - int avgCnt = i ? blockSize : j + 1; - printf(" %4d: total: %6d, avg result: %f, last result: %f -> %f\n", j+1, i*blockSize+j+1, avgSum/avgCnt, firstQ, q); + + typedef std::pair Pair; + typedef std::set Set; + Set set; + for(int i = 0; i < workCount; ++i) + set.insert(Pair(shuffle[i], i)); + + for(Set::iterator i = set.begin(); i != set.end(); ++i) { + fseeko64(f, i->first*imgsize, SEEK_SET); + if (!fread(data.data() + i->second*imgsize, imgsize, 1, f)) + return fclose(f), f = nullptr, false; } - - printf("%4d: total: %6d, avg result: %f\n", i+1, (i+1)*blockSize, sumQ/blockSize); - - if (outfile && !l.save(outfile)) - { printf("cannot save neural network weights to file: %s\n", outfile); err = true; break; } - - unsigned char *data = blockResData; - for(double *iba = bl.a, *e = iba + bl.size; iba < e; ++iba, ++data) - *data = (unsigned char)(*iba*255.999); - tgaSave("data/output/sampleX.tga", blockData + shuffle[blockSize-1]*size, 256, 256, 3); - tgaSave("data/output/sampleY.tga", blockResData, 256, 256, 3); - } - - delete[] block; - delete[] results; - delete[] blockData; - - printf("finished\n"); -} - - -protected: - bool prepare() override { - ofl = optimizeLayoutSimple(fl->layout); - obl = optimizeLayoutSimple(bl->layout); - assert(ofl && obl); - assert(ofl.getActiveCount() == obl.getActiveCount()); - ofl.split(oflist, threadsCount); - obl.split(oblist, threadsCount); - stride = ofl.getActiveCount() + 1; - count = data.size()/stride; - if (count <= 0) return false; - shuffle.resize(count); - for(int i = 0; i < count; ++i) - shuffle[i] = i; return true; } - bool prepareBlock() override { - int cnt = itersPerBlock > count ? count : itersPerBlock; - for(int i = 0; i < cnt; ++i) { - int j = rand()%count; - if (i != j) std::swap(shuffle[i], shuffle[j]); + for(int i = 0; i < workCount; ++i) { + int j = rand()%workCount; + if (i != j) std::swap(shuffle2[i], shuffle2[j]); } + //return loadBlocks(); return true; } + + + void finishBlock() override { + if (outfile && !dataLayer) { + std::string outfile0(outfile); + std::string outfile1 = outfile0 + ".1.tga"; + outfile0 += ".0.tga"; + + unsigned char *id0 = data.data() + shuffle2[(itersPerBlock-1)%workCount]*imgsize; + tgaSave(outfile0.c_str(), id0, fl->layout.getW(), fl->layout.getH(), fl->layout.getD()); + + struct I: public Iter { + typedef unsigned char* DataType; + static inline void iter4(Neuron &n, DataType d, DataAccumType&) { *d = n.v < 0 ? 0 : n.v > 1 ? 
255 : (unsigned char)(n.v*255.999); } + }; + + tmpdata.resize(imgsize); + unsigned char *id1 = tmpdata.data(); + iterateNeurons2(bl->layout, bl->layout, bl->neurons, id1); + tgaSave(outfile1.c_str(), id1, bl->layout.getW(), bl->layout.getH(), bl->layout.getD()); + } + } void loadData(Barrier &barrier, int, int iter) override { struct I: public Iter { - typedef const unsigned char* Type; - static inline void iter4(Neuron &n, Type d, AccumType&) { n.v = *d/(NeuronReal)255; } + typedef const unsigned char* DataType; + static inline void iter4(Neuron &n, DataType d, DataAccumType&) { n.v = *d/(NeuronReal)255; } }; - const unsigned char *id = data.data() + shuffle[iter%count]*stride; - iterateNeurons2(oflist[barrier.tid], ofl, fl->neurons, id); + const unsigned char *id = data.data() + shuffle2[iter%workCount]*imgsize; + iterateNeurons2(flist[barrier.tid], fl->layout, fl->neurons, id); } - AccumReal verifyDataMain(int, int iter) override { - struct I: public Iter { - typedef int Type; - struct AccumType { int ri, mi; NeuronReal m; }; - static inline void iter4(Neuron &n, Type d, AccumType &a) { - NeuronReal v1 = d == a.ri; - NeuronReal v0 = n.v; - n.d *= v1 - v0; - if (a.m < v0) { a.m = v0; a.mi = d; } - } - }; + Quality verifyData(Barrier &barrier, int, int iter) override { + Layout l = blist[barrier.tid]; + Layout dl = bl->layout; + Layout pl = pbl; + + int d = l.getD(); + int w = l.getW(); + int dx = l.sz - d; + int dy = (l.sx - w)*l.sz; + int ddx = dl.getD(); + int ddy = (dl.getW() - w)*ddx; + + AccumReal aq = 0; + NeuronReal ratio = this->ratio; + Neuron *in = bl->neurons + (l.y0*l.sx + l.x0)*l.sz + l.z0; + const unsigned char *id = data.data() + shuffle2[iter%workCount]*imgsize + ((l.y0-dl.y0)*l.sx + l.x0-dl.x0)*l.sz + l.z0-dl.z0; - I::AccumType a = { data[ (shuffle[iter%count] + 1)*stride - 1 ], 0, 0 }; - iterateNeurons2(obl, obl, bl->neurons, 0, 1, &a); + for(int y = l.y0; y < l.y1; ++y, in += dy, id += ddy) { + bool outside = y < pl.y0 || y >= pl.y1; + for(int x = l.x0; x < l.x1; ++x, in += dx, id += ddx) { + if (outside || x < pl.x0 || x >= pl.x1) { + for(Neuron *e = in + d; in < e; ++in) in->d = 0; + } else { + const unsigned char *iid = id; + for(Neuron *e = in + d; in < e; ++in, ++iid) { + NeuronReal v1 = *iid/(NeuronReal)255; + NeuronReal v0 = in->v; + NeuronReal diff = v1 - v0; + in->d *= diff*ratio; + aq += diff*diff; + } + } + } + } - return a.mi != a.ri; + return Quality( sqrt(aq/pbl.getActiveCount()) ); } }; diff --git a/projects/neural/train.inc.cpp b/projects/neural/train.inc.cpp index 21a8a56..41fd23a 100644 --- a/projects/neural/train.inc.cpp +++ b/projects/neural/train.inc.cpp @@ -2,43 +2,9 @@ #define TRAIN_INC_CPP -#include -#include - - #include "layer.inc.cpp" -long long timeUs() { - static std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now(); - return (long long)std::chrono::duration_cast( std::chrono::steady_clock::now() - begin ).count(); -} - - -struct Quality { - AccumReal train; - AccumReal human; - - inline Quality& operator+=(const Quality &b) { - train += b.train; - human += b.human; - return *this; - } - - inline Quality& operator*=(AccumReal x) { - train *= x; - human *= x; - return *this; - } - - inline bool operator<(const Quality &b) const { - return human < b.human ? true - : b.human < human ? 
false - : train < b.train; - } -}; - - class Trainer { private: std::atomic barrierCounter; @@ -57,6 +23,7 @@ protected: std::atomic skipBackpass; Layer *fl; Layer *bl; + Layer *ffl; virtual bool prepare() { return true; } virtual bool prepareBlock() { return true; } @@ -70,7 +37,7 @@ private: void threadFunc(int tid, unsigned int seed, int block) { Barrier barrier(barrierCounter, tid, threadsCount, seed); - Quality sumQ = {}; + Quality sumQ; for(int i = 0; i < itersPerBlock; ++i) { barrier.wait(); loadData(barrier, block, i); @@ -88,14 +55,16 @@ private: bool skipBp = skipBackpass; barrier.wait(); - if (ratio > 0 && !skipBp) { - for(Layer *l = bl; l->prev && l->prev->prev; l = l->prev) { + if (ffl && ratio > 0 && !skipBp) { + for(Layer *l = bl; l != ffl; l = l->prev) { barrier.wait(); l->backpassDeltas(barrier); } for(Layer *l = bl; l->prev; l = l->prev) { - barrier.wait(); - l->backpassWeights(barrier); + if (!l->skipTrain) { + barrier.wait(); + l->backpassWeights(barrier); + } } } } @@ -107,8 +76,8 @@ private: barrierCounter = 0; std::vector t(threadsCount, nullptr); for(int i = 1; i < threadsCount; ++i) - t[i] = new std::thread(&Trainer::threadFunc, this, i, block, rand()); - threadFunc(0, block, rand()); + t[i] = new std::thread(&Trainer::threadFunc, this, i, rand(), block); + threadFunc(0, rand(), block); Quality result = qualities[0]; for(int i = 1; i < threadsCount; ++i) @@ -164,6 +133,8 @@ public: fl = layer; bl = &layer->back(); + ffl = fl->next; + while(ffl && ffl->skipTrain) ffl = ffl->next; qualities.clear(); qualities.resize(threadsCount, Quality{}); diff --git a/projects/neural/train.segment.inc.cpp b/projects/neural/train.segment.inc.cpp new file mode 100644 index 0000000..836ab3e --- /dev/null +++ b/projects/neural/train.segment.inc.cpp @@ -0,0 +1,178 @@ +#ifndef TRAIN_SEGMENT_INC_CPP +#define TRAIN_SEGMENT_INC_CPP + + +#include "segment.inc.cpp" +#include "layer.inc.cpp" + + +class TrainerSegment { +private: + std::atomic barrierCounter; + std::vector qualities; + +public: + Segment *segment; + AccumReal ratio; + int threadsCount; + + int measuresPerBlock; + int trainsPerBlock; + int blocksPerSaving; + + int blocksCount; + AccumReal qmin; + +protected: + volatile int x, y, z; + + virtual bool prepare() { return true; } + virtual bool prepareBlock(int block, bool measureOnly) { return true; } + virtual void finishBlock(int block) { } + virtual void finish() { } + + virtual void loadData(Barrier &barrier, int block, int iter, bool measure) { } + +private: + void threadFunc(int tid, unsigned int seed, int block, bool measureOnly) { + Barrier barrier(barrierCounter, tid, threadsCount, seed); + + QualityPair q; + if (!measureOnly) { + for(int i = 0; i < trainsPerBlock; ++i) { + barrier.wait(); + loadData(barrier, block, i, false); + barrier.wait(); + q.train += segment->pass(barrier, x, y, z, ratio); + } + } + + for(int i = 0; i < measuresPerBlock; ++i) { + barrier.wait(); + loadData(barrier, block, i, true); + barrier.wait(); + q.measure += segment->pass(barrier, x, y, z, 0); + } + + qualities[tid] = q; + } + + + QualityPair runThreads(int block, bool measureOnly) { + barrierCounter = 0; + std::vector t(threadsCount, nullptr); + for(int i = 1; i < threadsCount; ++i) + t[i] = new std::thread(&TrainerSegment::threadFunc, this, i, rand(), block, measureOnly); + threadFunc(0, rand(), block, measureOnly); + + QualityPair q = qualities[0]; + for(int i = 1; i < threadsCount; ++i) + { t[i]->join(); delete t[i]; q += qualities[i]; } + + q.measure *= 1/(AccumReal)measuresPerBlock; + 
q.train *= 1/(AccumReal)trainsPerBlock; + return q; + } + + +public: + TrainerSegment(): + barrierCounter(0), + segment(), + ratio(), + threadsCount(1), + measuresPerBlock(), + trainsPerBlock(), + blocksPerSaving(), + blocksCount(), + qmin(), + x(), y(), z() + { } + + + QualityPair run() { + int trainsPerBlock = ratio > 0 ? this->trainsPerBlock : 0; + int blocksCount = trainsPerBlock > 0 || this->blocksCount > 0 ? this->blocksCount : 1; + + assert(segment); + assert(threadsCount > 0); + assert(measuresPerBlock >= 0); + assert(trainsPerBlock >= 0); + assert(measuresPerBlock + trainsPerBlock > 0); + + QualityPair bad(Quality::bad(), Quality::bad()); + + printf("training segment: threads %d, trainsPerBlock %d, measuresPerBlock %d, ratio: %lf\n", threadsCount, trainsPerBlock, measuresPerBlock, ratio); + fflush(stdout); + + qualities.clear(); + qualities.resize(threadsCount); + segment->split(threadsCount); + + if (!prepare()) + return printf("cannot prepare\n"), bad; + + + QualityPair result = bad, best = result, saved = result; + long long fullTimeStartUs = timeUs(); + int i = 0; + int bps = blocksPerSaving > 0 ? blocksPerSaving : 1; + int nextSave = i + bps; + while(true) { + bool measureOnly = measuresPerBlock > 0 && (!i || trainsPerBlock <= 0); + + if (!prepareBlock(i, measureOnly)) { + printf("cannot prepare block\n"); + result = bad; + break; + }; + + long long runTimeUs = timeUs(); + result = runThreads(i, measureOnly); + runTimeUs = timeUs() - runTimeUs; + + finishBlock(i); + + long long t = timeUs(); + long long fullTimeUs = t - fullTimeStartUs; + fullTimeStartUs = t; + ++i; + + Quality q = measuresPerBlock > 0 ? result.measure : result.train; + + if (i == 1) saved = result; + bool good = result < best; + bool done = (blocksCount > 0 && i >= blocksCount) || q.human <= qmin; + bool saving = !measureOnly && ratio > 0 && (i >= nextSave || done) && result < saved; + if (good) best = result; + + Quality bq = measuresPerBlock > 0 ? best.measure : best.train; + + printf("%4d, total %7d, avg.result %12g (%12g), best %12g (%12g), time: %f / %f%s\n", + i, i*trainsPerBlock, + q.human, q.train, bq.human, bq.train, + runTimeUs*0.000001, fullTimeUs*0.000001, + (saving ? 
", saving" : "" ) ); + fflush(stdout); + + if (saving) { + if (!segment->save()) { + printf("saving failed\n"); + result = bad; + break; + } + saved = result; + nextSave += bps; + } + + if (done) break; + } + + finish(); + + return result; + } +}; + + +#endif diff --git a/projects/neural/trainer.cpp b/projects/neural/trainer.cpp index 6567a45..820f172 100644 --- a/projects/neural/trainer.cpp +++ b/projects/neural/trainer.cpp @@ -3,8 +3,10 @@ #include #include "layer.all.inc.cpp" -#include "layer.all.test.inc.cpp" +#include "test.all.inc.cpp" #include "train.digit.inc.cpp" +#include "train.image.inc.cpp" +#include "train.cx4.inc.cpp" bool runTests() { @@ -13,40 +15,149 @@ bool runTests() { } -int main() { - srand(time(NULL)); +bool trainDigits() { + #define FILENAME "data/output/weights-digit.bin" - //return !runTests(); + printf("create neural network\n"); + Layer l( nullptr, Layout(28, 28) ); + (new LayerSimple( l, Layout(256) ))->filename = FILENAME "1"; + (new LayerSimple( l, Layout(64) ))->filename = FILENAME "2"; + (new LayerSimple( l, Layout(10) ))->filename = FILENAME "3"; + l.sumStat().print(); - //#define FILENAME "data/output/weights-digit.bin" + #undef FILENAME + + printf("load training data\n"); + TrainerDigit t; + if (!t.loadSymbolMap("data/symbols-data.bin")) return 1; // 28x28 + + printf("try load previously saved network\n"); l.load(); + t.configure(l, 0.5, 8, 70000, 0, 0, 0.00001).run(); + + return true; +} + + +bool trainDigitsConv() { #define FILENAME "data/output/weights-digit-conv.bin" printf("create neural network\n"); - //Layer l( nullptr, Layout(28, 28) ); - //(new LayerSimple( l, Layout(256) ))->filename = FILENAME "1"; - //(new LayerSimple( l, Layout(64) ))->filename = FILENAME "2"; - //(new LayerSimple( l, Layout(10) ))->filename = FILENAME "3"; - Layer l(nullptr, Layout(28, 28)); - (new LayerConvShared(l, Layout(24, 24, 6), Kernel(5, 1, 0)))->filename = FILENAME "1"; - (new LayerSub(l, Layout(12, 12, 6)))->filename = FILENAME "2"; - (new LayerConvShared(l, Layout(8, 8, 48), Kernel(5, 1, 0)))->filename = FILENAME "3"; - (new LayerSub(l, Layout(4, 4, 48)))->filename = FILENAME "4"; - (new LayerSimple(l, Layout(64)))->filename = FILENAME "5"; - (new LayerSimple(l, Layout(10)))->filename = FILENAME "6"; + Layer *ll[10] = {}; + ll[1] = new LayerConvShared(l, Layout(12, 12, 6), Kernel(4, 2, 0)); ll[1]->filename = FILENAME "1"; + ll[2] = new LayerConvShared(l, Layout(4, 4, 12), Kernel(4, 2, 0)); ll[2]->filename = FILENAME "2"; + ll[3] = new LayerSimple(l, Layout(64)); ll[3]->filename = FILENAME "3"; + ll[4] = new LayerSimple(l, Layout(10)); ll[4]->filename = FILENAME "4"; + + #undef FILENAME l.sumStat().print(); printf("load training data\n"); TrainerDigit t; if (!t.loadSymbolMap("data/symbols-data.bin")) return 1; // 28x28 - //printf("try load previously saved network\n"); l.load(); + printf("try load previously saved network\n"); l.load(); - printf("train\n"); + //ll[1]->skipTrain = true; + //ll[2]->skipTrain = true; + + t.configure(l, 0.01, 8, 70000, 0, 0, 0.00001).run(); //t.configure(l, 0.5, 8, 70000, 0, 0, 0.00001).run(); - t.configure(l, 0.5, 8, 7000, 0, 0, 0.00001).run(); + + return true; +} + + +bool trainImage() { + #define FILENAME "data/output/weights-image.bin" + + printf("create neural network\n"); + Layer l(nullptr, Layout(128, 128, 3)); + Layer *ll[20] = {}; + ll[ 1] = new LayerConvShared(l, Layout(63, 63, 24), Kernel(4, 2, 0)); ll[1]->filename = FILENAME "1"; + ll[ 2] = new LayerConvShared(l, Layout(29, 29, 48), Kernel(5, 2, 0)); ll[2]->filename = 
FILENAME "2"; + //ll[ 3] = new LayerConvShared(l, Layout(14, 14, 24), Kernel(4, 2, 0)); ll[3]->filename = FILENAME "3"; + //ll[ 4] = new LayerConvShared(l, Layout( 6, 6, 48), Kernel(4, 2, 0)); ll[4]->filename = FILENAME "4"; + //ll[ 5] = new LayerConvShared(l, Layout( 2, 2, 96), Kernel(4, 2, 0)); ll[5]->filename = FILENAME "5"; + //ll[ 6] = new LayerDeconvShared(l, Layout( 6, 6, 48), Kernel(4, 2, 0), ll[5]->weights); + //ll[ 7] = new LayerDeconvShared(l, Layout( 14, 14, 24), Kernel(4, 2, 0), ll[4]->weights); + //ll[ 8] = new LayerDeconvShared(l, Layout( 30, 30, 12), Kernel(4, 2, 0), ll[3]->weights); + ll[ 9] = new LayerDeconvShared(l, Layout( 63, 63, 24), Kernel(5, 2, 0), ll[2]->weights); + ll[10] = new LayerDeconvShared(l, Layout(128, 128, 3), Kernel(4, 2, 0), ll[1]->weights); + + l.sumStat().print(); - return 0; + printf("try load previously saved network\n"); l.load(); + + ll[1]->skipTrain = true; + ll[10]->skipTrain = true; + + + TrainerImage t; + t.pad = 16; + t.datafile = "data/img128-data.bin"; + t.outfile = FILENAME ".test"; + + t.configure(l, 0.00001, 8, 1000, 0, 0, 0.00001).run(); + + #undef FILENAME + return true; +} + + +bool trainCx4() { + #define FILENAME "data/output/weights-cx4.bin" + + printf("create neural network\n"); + Layer l(nullptr, Layout(512, 512, 3).expandXY(2)); + Layer *fl[20] = { &l }; + int cnt = 1; + fl[cnt] = new LayerConvShared(l, Layout(257, 257, 24).expandXY(3), Kernel(4, 2, -2)); fl[cnt]->filename = FILENAME "1"; ++cnt; + fl[cnt] = new LayerConvShared(l, Layout(130, 130, 48), Kernel(4, 2, -2)); fl[cnt]->filename = FILENAME "2"; ++cnt; + fl[cnt] = new LayerConvShared(l, Layout( 66, 66, 96), Kernel(4, 2, -2)); fl[cnt]->filename = FILENAME "3"; ++cnt; + //fl[cnt] = new LayerConvShared(l, Layout( 6, 6, 48), Kernel(4, 2, 0)); fl[cnt]->filename = FILENAME "4"; ++cnt; + //fl[cnt] = new LayerConvShared(l, Layout( 2, 2, 96), Kernel(4, 2, 0)); fl[cnt]->filename = FILENAME "5"; ++cnt; + for(int i = cnt-1; i > 0; --i) { + Layer *bl = new LayerDeconvShared(l, fl[i-1]->layout, dynamic_cast*>(fl[i])->kernel, fl[i]->weights); + if (i < cnt-1) fl[i]->skipTrain = bl->skipTrain = true; + } + + l.sumStat().print(); + + printf("try load previously saved network\n"); l.load(); + + SegmentCx4 s(fl[cnt-2]->layout.getD(), fl[cnt-1]->layout.getD(), fl[cnt-1]->weights); + s.filename = fl[cnt-1]->filename; + + TrainerCx4 t; + t.layerFull = &l; + t.layerPre = cnt > 2 ? fl[cnt-2] : nullptr; + t.segment = &s; + t.ratio = 0.000001; + t.threadsCount = 8; + t.measuresPerBlock = 1000; + t.trainsPerBlock = 10000; + t.loadImagesCount = 100; + t.blocksPerLoading = 10; + t.qmin = 0.00001; + t.infile = "data/img512-data.bin"; + t.outfile = FILENAME ".test"; + + t.run(); + + #undef FILENAME + return true; +} + + +int main() { + srand(time(NULL)); + + //return !runTests(); + //return !trainDigits(); + //return !trainDigitsConv(); + //return !trainImage(); + return !trainCx4(); }