diff --git a/simple/neural/build-nn-trainer-img-pp.sh b/simple/neural/build-nn-trainer-img-pp.sh
new file mode 100755
index 0000000..b6c6c7d
--- /dev/null
+++ b/simple/neural/build-nn-trainer-img-pp.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+set -e
+
+c++ -Wall -DNDEBUG -O3 nn-trainer-img.cpp -lm -o nn-trainer-img-pp
+
+
+
+
diff --git a/simple/neural/build-nn-trainer2-pp.sh b/simple/neural/build-nn-trainer2-pp.sh
new file mode 100755
index 0000000..cc78b35
--- /dev/null
+++ b/simple/neural/build-nn-trainer2-pp.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+set -e
+
+c++ -Wall -DNDEBUG -O3 nn-trainer2.cpp -lm -o nn-trainer2-pp
+
+
+
+
diff --git a/simple/neural/build-nn-trainer3-img-pp.sh b/simple/neural/build-nn-trainer3-img-pp.sh
new file mode 100755
index 0000000..5605767
--- /dev/null
+++ b/simple/neural/build-nn-trainer3-img-pp.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+set -e
+
+c++ -Wall -DNDEBUG -O3 -pthread nn-trainer3-img.cpp -lm -o nn-trainer3-img-pp
+
+
+
+
diff --git a/simple/neural/build-nn-trainer3-pp.sh b/simple/neural/build-nn-trainer3-pp.sh
new file mode 100755
index 0000000..c777057
--- /dev/null
+++ b/simple/neural/build-nn-trainer3-pp.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+set -e
+
+c++ -Wall -DNDEBUG -O3 -pthread nn-trainer3.cpp -lm -o nn-trainer3-pp
+
+
+
+
diff --git a/simple/neural/nn-trainer-img.cpp b/simple/neural/nn-trainer-img.cpp
new file mode 100644
index 0000000..28312c5
--- /dev/null
+++ b/simple/neural/nn-trainer-img.cpp
@@ -0,0 +1,157 @@
+
+#include <cstdio>
+#include <cstdlib>
+#include <ctime>
+
+#include <algorithm>
+
+#include "nnlayer.lnk.inc.cpp"
+#include "tga.inc.cpp"
+
+
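+// Trains straight from a raw on-disk image dataset: each block draws
+// blockSize random records, sorts the file offsets so reads stay mostly
+// sequential, then presents the records in shuffled order; `results` holds
+// the previous error per slot so avgSum tracks a sliding block average.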
+void imgTrain(Layer &l, const char *datafile, int size, const char *outfile, double trainRatio, int count) {
+  Layer &bl = l.back();
+
+  assert(!l.prev);
+  assert(datafile);
+  assert(count > 0 && size > 0);
+  assert(l.size == size);
+  assert(bl.size == size);
+
+  int blockSize = 1000;//1024*1024*1024/size;
+  assert(blockSize > 0);
+
+  FILE *f = fopen(datafile, "rb");
+  if (!f)
+    { printf("cannot open file: %s\n", datafile); return; }
+  fseeko64(f, 0, SEEK_END);
+  long long fsize = ftello64(f);
+  int xCount = (int)(fsize/size);
+  if (xCount <= 0)
+    { printf("no tests in file: %s\n", datafile); fclose(f); return; }
+
+  int *block = new int[blockSize*2];
+  int *shuffle = block + blockSize;
+  double *results = new double[blockSize];
+  unsigned char *blockData = new unsigned char[(blockSize + 1)*size];
+  unsigned char *blockResData = blockData + blockSize*size;
+  bool err = false;
+
+  for(int j = 0; j < blockSize; ++j)
+    { shuffle[j] = j; results[j] = 0; }
+
+  int blocksCount = (count - 1)/blockSize + 1;
+
+  printf("training %d (%d x %d blocks), tests: %d, ratio: %f:\n", blocksCount*blockSize, blocksCount, blockSize, xCount, trainRatio);
+
+  double avgSum = 0;
+  for(int i = 0; i < blocksCount; ++i) {
+    for(int j = 0; j < blockSize; ++j) {
+      block[j] = rand()%xCount;
+      std::swap(shuffle[j], shuffle[rand()%blockSize]);
+    }
+    std::sort(block, block + blockSize);
+
+    for(int j = 0; j < blockSize; ++j) {
+      fseeko64(f, block[j]*(long long)size, SEEK_SET);
+      if (!fread(blockData + j*size, size, 1, f))
+        { printf("cannot read data from file: %s\n", datafile); err = true; break; }
+    }
+    if (err) break;
+
+    printf("  next data block loaded\n");
+
+    double sumQ = 0;
+    for(int j = 0; j < blockSize; ++j) {
+      unsigned char *data = blockData + shuffle[j]*size;
+      for(double *ia = l.a, *e = ia + l.size; ia < e; ++ia, ++data)
+        *ia = *data/255.0;
+
+      double firstQ = 0, q = 0;
+      for(int repeat = 0; repeat < 1; ++repeat) {
+        l.pass();
+
+        for(double *ia = l.a, *iba = bl.a, *ibda = bl.da, *e = ia + l.size; ia < e; ++ia, ++iba, ++ibda) {
+          double d = *ia - *iba;
+          *ibda = d;
+          q += d*d;
+        }
+        q /= size;
+        if (!repeat) firstQ = q;
+
+        bl.backpass(trainRatio);
+      }
+
+      sumQ += firstQ;
+      avgSum += firstQ - results[j];
+      results[j] = firstQ;
+      int avgCnt = i ? blockSize : j + 1;
+      printf("  %4d: total: %6d, avg result: %f, last result: %f -> %f\n", j+1, i*blockSize+j+1, avgSum/avgCnt, firstQ, q);
+    }
+
+    printf("%4d: total: %6d, avg result: %f\n", i+1, (i+1)*blockSize, sumQ/blockSize);
+
+    if (outfile && !l.save(outfile))
+      { printf("cannot save neural network weights to file: %s\n", outfile); err = true; break; }
+
+    unsigned char *data = blockResData;
+    for(double *iba = bl.a, *e = iba + bl.size; iba < e; ++iba, ++data)
+      *data = (unsigned char)(*iba*255.999);
+    tgaSave("data/output/sampleX.tga", blockData + shuffle[blockSize-1]*size, 256, 256, 3);
+    tgaSave("data/output/sampleY.tga", blockResData, 256, 256, 3);
+  }
+
+  fclose(f);
+  delete[] block;
+  delete[] results;
+  delete[] blockData;
+
+  printf("finished\n");
+}
+
+
+int main() {
+  srand(time(NULL));
+
+  //const char *datafile = "data/img512-data.bin";
+  //const char *outfile = "data/output/img512-weights.bin";
+  const char *datafile = "data/img256-data.bin";
+  const char *outfile = "data/output/img256-weights.bin";
+
+  printf("create neural network\n");
+  //Layer l(nullptr, 512*512*3);
+  //new LayerLinkConvolution(l, 512, 512, 3, 256, 256, 3, 32);
+  //new LayerLinkConvolution(l, 256, 256, 3, 128, 128, 3, 64);
+  //new LayerLinkConvolution(l, 128, 128, 3, 64, 64, 3, 128);
+  //new LayerLinkConvolution(l, 64, 64, 3, 32, 32, 3, 256);
+  //new LayerLinkConvolution(l, 32, 32, 3, 16, 16, 4, 256);
+  //new LayerLinkConvolution(l, 16, 16, 4, 16, 16, 4, 256);
+  //new LayerLinkConvolution(l, 16, 16, 4, 16, 16, 4, 256);
+  //new LayerLinkConvolution(l, 16, 16, 4, 32, 32, 3, 256);
+  //new LayerLinkConvolution(l, 32, 32, 3, 64, 64, 3, 128);
+  //new LayerLinkConvolution(l, 64, 64, 3, 128, 128, 3, 64);
+  //new LayerLinkConvolution(l, 128, 128, 3, 256, 256, 3, 32);
+  //new LayerLinkConvolution(l, 256, 256, 3, 512, 512, 3, 16);
+
+  Layer l(nullptr, 256*256*3);
+  new LayerLinkConvolution(l, 256, 256, 3, 128, 128, 3, 32);
+  new LayerLinkConvolution(l, 128, 128, 3, 64, 64, 3, 64);
+  new LayerLinkConvolution(l, 64, 64, 3, 32, 32, 3, 128);
+  new LayerLinkConvolution(l, 32, 32, 3, 16, 16, 4, 256);
+  new LayerLinkConvolution(l, 16, 16, 4, 16, 16, 4, 256);
+  new LayerLinkConvolution(l, 16, 16, 4, 16, 16, 4, 256);
+  new LayerLinkConvolution(l, 16, 16, 4, 32, 32, 3, 128);
+  new LayerLinkConvolution(l, 32, 32, 3, 64, 64, 3, 64);
+  new LayerLinkConvolution(l, 64, 64, 3, 128, 128, 3, 32);
+  new LayerLinkConvolution(l, 128, 128, 3, 256, 256, 3, 16);
+
+  printf("  neurons: %d, links %d, memSize: %llu\n", l.totalSize(), l.totalLinks(), (unsigned long long)l.totalMemSize());
+
+  printf("try load previously saved network\n");
+  l.load(outfile);
+
+  printf("train\n");
+  imgTrain(l, datafile, l.size, outfile, 0.1, 1000000);
+
+  return 0;
+}
+
diff --git a/simple/neural/nn-trainer.cpp b/simple/neural/nn-trainer.cpp
index 2465ed2..c6af313 100644
--- a/simple/neural/nn-trainer.cpp
+++ b/simple/neural/nn-trainer.cpp
@@ -39,12 +39,12 @@ int main() {
   //new LayerSimple(l, 10);
 
   Layer l(nullptr, 784);
-  new LayerLinkConvolution(l, 28, 28, 1, 22, 22, 1, 60, 1, 3);
-  new LayerLinkConvolution(l, 22, 22, 1, 14, 14, 1, 100, 1, 4);
-  new LayerLinkConvolution(l, 14, 14, 1, 4, 4, 1, 140, 1, 5);
+  new LayerLinkConvolution(l, 28, 28, 1, 22, 22, 1, 60);
+  new LayerLinkConvolution(l, 22, 22, 1, 14, 14, 1, 100);
+  new LayerLinkConvolution(l, 14, 14, 1, 4, 4, 1, 140);
   new LayerSimple(l, 10);
 
-  printf("  neurons: %d, links %d\n", l.totalSize(), l.totalLinks());
+  printf("  neurons: %d, links %d, memSize: %llu\n", l.totalSize(), l.totalLinks(), (unsigned long long)l.totalMemSize());
 
   //printf("try load previously saved network\n");
   //l.load(filename);
diff --git a/simple/neural/nn-trainer2.cpp b/simple/neural/nn-trainer2.cpp
new file mode 100644
index 0000000..152b452
--- /dev/null
+++ b/simple/neural/nn-trainer2.cpp
@@ -0,0 +1,96 @@
+
+#include <cstdio>
+#include <ctime>
+
+#include "nnlayer2.inc.cpp"
+#include "nnlayer2.conv.inc.cpp"
+#include "nnlayer2.mt.inc.cpp"
+
+
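+// Loads the whole training set into memory: each record is sizeX input bytes
+// plus one class-index byte, expanded in place into a one-hot row of sizeY
+// bytes (255 marks the correct class); training then runs on random
+// mini-batches of blockSize shuffled records.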
+bool train(const char *infile, const char *outfile, Layer &l, int blockSize, int totalCount, Real trainRatio) {
+  assert(blockSize > 0);
+  int blockCount = totalCount/blockSize;
+  assert(blockCount > 0);
+  assert(!l.prev);
+  assert(l.size && l.back().size);
+
+  printf("load training data\n");
+
+  FILE *f = fopen(infile, "rb");
+  if (!f)
+    return printf("cannot open file '%s' for read\n", infile), false;
+  fseek(f, 0, SEEK_END);
+  int fs = ftell(f);
+  fseek(f, 0, SEEK_SET);
+
+  int sizeX = l.size;
+  int sizeY = l.back().size;
+  int count = fs/(sizeX+1);
+  if (count < blockSize)
+    return printf("file '%s' is smaller than the minimal size\n", infile), fclose(f), false;
+
+  unsigned char *data = new unsigned char[(sizeX + sizeY)*count];
+  memset(data, 0, (sizeX + sizeY)*count);
+  for(int i = 0; i < count; ++i) {
+    unsigned char *d = data + (sizeX + sizeY)*i;
+    if (!fread(d, sizeX+1, 1, f) || d[sizeX] >= sizeY)
+      return printf("cannot read from file '%s'\n", infile), delete[] data, fclose(f), false;
+    d += sizeX;
+    unsigned char c = *d;
+    *d = 0;
+    d[c] = 255;
+  }
+  fclose(f);
+
+  printf("train %d x %d = %d, ratio: %f\n", blockCount, blockSize, blockCount*blockSize, trainRatio);
+
+  int *shuffle = new int[blockSize];
+  TrainMT tmt;
+  tmt.layer = &l;
+  tmt.dataX = data;
+  tmt.dataY = data + sizeX;
+  tmt.strideX = tmt.strideY = sizeX + sizeY;
+  tmt.shuffle = shuffle;
+  tmt.count = blockSize;
+  tmt.trainRatio = trainRatio;
+  for(int i = 0; i < blockCount; ++i) {
+    for(int j = 0; j < blockSize; ++j)
+      shuffle[j] = rand()%count;
+    Real res = tmt.train(8);
+    printf("%4d, total %7d, avg.result %f\n", i+1, (i+1)*blockSize, res);
+    if ( ((i+1)%10) == 0 || i+1 == blockCount ) {
+      if (!l.saveAll(outfile)) return delete[] data, delete[] shuffle, false;
+      printf("  saved\n");
+    }
+  }
+
+  delete[] shuffle;
+  delete[] data;
+  return true;
+}
+
+
+int main() {
+  srand(time(NULL));
+
+  const char *infile = "data/symbols-data.bin"; // 28x28
+  const char *outfile = "data/output/weights.bin";
+
+  printf("create neural network\n");
+  Layer l(nullptr, 784);
+  createConv(l, 28, 28, 1, 22, 22, 1, 60);
+  createConv(l, 22, 22, 1, 14, 14, 1, 100);
+  createConv(l, 14, 14, 1, 4, 4, 1, 140);
+  createConv(l, 4, 4, 1, 1, 1, 10, 16);
+
+  printf("  neurons: %d, links %d, memSize: %llu\n", l.totalNeurons(), l.totalLinks(), (unsigned long long)l.totalMemSize());
+
+  //printf("try load previously saved network\n");
+  //l.loadAll(outfile);
+
+  printf("train\n");
+  train(infile, outfile, l, 10000, 2000000, 0.1);
+
+  return 0;
+}
+
diff --git a/simple/neural/nn-trainer3-img.cpp b/simple/neural/nn-trainer3-img.cpp
new file mode 100644
index 0000000..3290622
--- /dev/null
+++ b/simple/neural/nn-trainer3-img.cpp
@@ -0,0 +1,146 @@
+
+#include <cstdio>
+#include <cstdlib>
+#include <ctime>
+
+#include <algorithm>
+#include <chrono>
+
+#include "nnlayer3.mt.inc.cpp"
+#include "tga.inc.cpp"
+
+
+long long timeUs() {
+  static std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
+  return (long long)std::chrono::duration_cast<std::chrono::microseconds>( std::chrono::steady_clock::now() - begin ).count();
+}
+
+
+void imgTrain(Layer &l, const char *datafile, int size, const char *outfile, int blockSize, int blocksCount, Real trainRatio, int threads) {
+  Layer &fl = l.front();
+  Layer &bl = l.back();
+
+  assert(!l.prev);
+  assert(datafile);
+  assert(size > 0);
+  assert(fl.countNeurons() == size);
+  assert(bl.countNeurons() == size);
+
+  assert(blockSize > 0);
+  assert(blocksCount > 0);
+  assert(trainRatio > 0);
+  assert(threads > 0);
+
+  FILE *f = fopen(datafile, "rb");
+  if (!f)
+    { printf("cannot open file: %s\n", datafile); return; }
+  fseeko64(f, 0, SEEK_END);
+  long long fsize = ftello64(f);
+  int xCount = (int)(fsize/size);
+  if (xCount <= 0)
+    { printf("no tests in file: %s\n", datafile); fclose(f); return; }
+
+  printf("allocate %lld bytes for tests\n", ((long long)blockSize + 1)*size);
+
+  int *block = new int[blockSize*2];
+  int *shuffle = block + blockSize;
+  unsigned char *blockData = new unsigned char[(blockSize + 1)*size];
+  unsigned char *blockResData = blockData + blockSize*size;
+  bool err = false;
+
+  for(int j = 0; j < blockSize; ++j)
+    shuffle[j] = j;
+
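+  // Autoencoder-style setup: input and target both point at the same image
+  // block, so the network is trained to reproduce its own input.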
+  TrainMT tmt;
+  tmt.layer = &fl;
+  tmt.dataX = blockData;
+  tmt.dataY = blockData;
+  tmt.strideX = tmt.strideY = size;
+  tmt.shuffle = shuffle;
+  tmt.count = blockSize;
+  tmt.trainRatio = trainRatio;
+
+  printf("training %d (%d x %d blocks), tests: %d, ratio: %f:\n", blocksCount*blockSize, blocksCount, blockSize, xCount, trainRatio);
+
+  long long t0 = timeUs();
+  for(int i = 0; i < blocksCount; ++i) {
+    for(int j = 0; j < blockSize; ++j) {
+      block[j] = rand()%xCount;
+      std::swap(shuffle[j], shuffle[rand()%blockSize]);
+    }
+    std::sort(block, block + blockSize);
+
+    for(int j = 0; j < blockSize; ++j) {
+      fseeko64(f, block[j]*(long long)size, SEEK_SET);
+      if (!fread(blockData + j*size, size, 1, f))
+        { printf("cannot read data from file: %s\n", datafile); err = true; break; }
+    }
+    if (err) break;
+
+    //printf("  next data block loaded\n");
+
+    long long t = timeUs();
+    double res = tmt.train(threads);
+    long long dt = timeUs() - t;
+
+    if (outfile && !fl.saveAll(outfile))
+      { printf("cannot save neural network weights to file: %s\n", outfile); err = true; break; }
+
+    unsigned char *data = blockResData;
+    for(Neuron *ibn = bl.neurons, *e = ibn + size; ibn < e; ++ibn, ++data) {
+      Real v = (ibn->v - 0.25)*2;
+      *data = v < 0 ? 0u : v > 1 ? 255u : (unsigned char)(v * 255.999);
+    }
+    tgaSave("data/output/sampleX.tga", blockData + shuffle[blockSize-1]*size, 256, 256, 3);
+    tgaSave("data/output/sampleY.tga", blockResData, 256, 256, 3);
+
+    long long t1 = timeUs();
+    long long dt0 = t1 - t0;
+    t0 = t1;
+
+    printf("%4d: total: %6d, avg result: %f, time: %f + %f = %f\n", i+1, (i+1)*blockSize, res, (dt0-dt)*0.000001, dt*0.000001, dt0*0.000001);
+  }
+
+  fclose(f);
+  delete[] block;
+  delete[] blockData;
+
+  printf("finished\n");
+}
+
+
+int main() {
+  srand(time(NULL));
+
+  const char *datafile = "data/img256-data.bin";
+  const char *outfile = "data/output/img256-weights.bin";
+
+  printf("create neural network\n");
+
+  Layer l(nullptr, 256, 256, 3);
+  //new Layer(&l, 128, 128, 3, 6);
+  //new Layer(&l, 64, 64, 3, 8);
+  //new Layer(&l, 32, 32, 3, 11);
+  //new Layer(&l, 16, 16, 4, 16);
+  //new Layer(&l, 16, 16, 4, 16);
+  //new Layer(&l, 16, 16, 4, 16);
+  //new Layer(&l, 32, 32, 3, 11);
+  //new Layer(&l, 64, 64, 3, 8);
+  //new Layer(&l, 128, 128, 3, 6);
+  new Layer(&l, 256, 256, 3, 4);
+  new Layer(&l, 256, 256, 3, 4);
+  new Layer(&l, 256, 256, 3, 4);
+
+  printf("  neurons: %d, links %d, memSize: %llu\n", l.totalNeurons(), l.totalLinks(), (unsigned long long)l.totalMemSize());
+
+  if (outfile) {
+    printf("try load previously saved network\n");
+    l.loadAll(outfile);
+  }
+
+  printf("train\n");
+  imgTrain(l, datafile, l.countNeurons(), outfile, 1000, 10000, 0.1, 4);
+
+  return 0;
+}
+
diff --git a/simple/neural/nn-trainer3.cpp b/simple/neural/nn-trainer3.cpp
new file mode 100644
index 0000000..327bfec
--- /dev/null
+++ b/simple/neural/nn-trainer3.cpp
@@ -0,0 +1,112 @@
+
+#include <cstdio>
+#include <ctime>
+
+#include <chrono>
+
+#include "nnlayer3.inc.cpp"
+#include "nnlayer3.mt.inc.cpp"
+
+
+long long timeUs() {
+  static std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
+  return (long long)std::chrono::duration_cast<std::chrono::microseconds>( std::chrono::steady_clock::now() - begin ).count();
+}
+
+
+bool train(const char *infile, const char *outfile, Layer &l, int blockSize, int totalCount, Real trainRatio) {
+  assert(blockSize > 0);
+  int blockCount = totalCount/blockSize;
+  assert(blockCount > 0);
+  assert(!l.prev);
+  assert(l.countNeurons() && l.back().countNeurons());
+
+  printf("load training data\n");
+
+  FILE *f = fopen(infile, "rb");
+  if (!f)
+    return printf("cannot open file '%s' for read\n", infile), false;
+  fseek(f, 0, SEEK_END);
+  int fs = ftell(f);
+  fseek(f, 0, SEEK_SET);
+
+  int sizeX = l.countNeurons();
+  int sizeY = l.back().countNeurons();
+  int count = fs/(sizeX+1);
+  if (count < blockSize)
+    return printf("file '%s' is smaller than the minimal size\n", infile), fclose(f), false;
+
+  unsigned char *data = new unsigned char[(sizeX + sizeY)*count];
+  memset(data, 0, (sizeX + sizeY)*count);
+  for(int i = 0; i < count; ++i) {
+    unsigned char *d = data + (sizeX + sizeY)*i;
+    if (!fread(d, sizeX+1, 1, f) || d[sizeX] >= sizeY)
+      return printf("cannot read from file '%s'\n", infile), delete[] data, fclose(f), false;
+    d += sizeX;
+    unsigned char c = *d;
+    *d = 0;
+    d[c] = 255;
+  }
+  fclose(f);
+
+  printf("train %d x %d = %d, ratio: %f\n", blockCount, blockSize, blockCount*blockSize, trainRatio);
+
+  int *shuffle = new int[blockSize];
+  TrainMT tmt;
+  tmt.layer = &l;
+  tmt.dataX = data;
+  tmt.dataY = data + sizeX;
+  tmt.strideX = tmt.strideY = sizeX + sizeY;
+  tmt.shuffle = shuffle;
+  tmt.count = blockSize;
+  tmt.trainRatio = trainRatio;
+
+  long long timeStartUs = timeUs();
+  for(int i = 0; i < blockCount; ++i) {
+    long long timeBlockStartUs = timeUs();
+
+    for(int j = 0; j < blockSize; ++j)
+      shuffle[j] = rand()%count;
+    double res = tmt.train(4);
+
+    long long dt = timeUs() - timeBlockStartUs;
+
+    printf("%4d, total %7d, avg.result %f, time: %f\n", i+1, (i+1)*blockSize, res, dt*0.000001);
+    if ( ((i+1)%100) == 0 || i+1 == blockCount ) {
+      if (!l.saveAll(outfile)) return delete[] data, delete[] shuffle, false;
+      printf("  saved\n");
+    }
+  }
+
+  long long dt = timeUs() - timeStartUs;
+  printf("finished in time: %f\n", dt*0.000001);
+
+  delete[] shuffle;
+  delete[] data;
+  return true;
+}
+
+
+int main() {
+  srand(time(NULL));
+
+  const char *infile = "data/symbols-data.bin"; // 28x28
+  const char *outfile = "data/output/weights.bin";
+
+  printf("create neural network\n");
+  Layer l(nullptr, 28, 28, 1);
+  new Layer(&l, 14, 14, 1, 28);
+  new Layer(&l, 7, 7, 1, 14);
+  new Layer(&l, 1, 1, 10, 7);
+
+  printf("  neurons: %d, links %d, memSize: %llu\n", l.totalNeurons(), l.totalLinks(), (unsigned long long)l.totalMemSize());
+
+  //printf("try load previously saved network\n");
+  //l.loadAll(outfile);
+
+  printf("train\n");
+  train(infile, outfile, l, 10000, 1000000, 0.01);
+
+  return 0;
+}
+
diff --git a/simple/neural/nnlayer.conv.inc.cpp b/simple/neural/nnlayer.conv.inc.cpp
index ff792a5..e05c8e6 100644
--- a/simple/neural/nnlayer.conv.inc.cpp
+++ b/simple/neural/nnlayer.conv.inc.cpp
@@ -26,6 +26,7 @@ public:
     double k = RaLU ? 1.0/(WW*psz*sz) : 1;
     for(double *iw = w, *e = iw + wsize; iw < e; ++iw)
       *iw = (rand()/(double)RAND_MAX*2 - 1)*k;
+    memsize += wsize*sizeof(double);
   }
 
   Layer& pass() override {
@@ -133,6 +134,7 @@ public:
     dw = w + wsize;
     for(double *iw = w, *e = iw + wsize; iw < e; ++iw)
       *iw = (rand()/(double)RAND_MAX*2 - 1)*1;
+    memsize += (wsize + WW*psz)*sizeof(double);
   }
 
   Layer& pass() override {
diff --git a/simple/neural/nnlayer.inc.cpp b/simple/neural/nnlayer.inc.cpp
index a3cef7e..3edf30b 100644
--- a/simple/neural/nnlayer.inc.cpp
+++ b/simple/neural/nnlayer.inc.cpp
@@ -13,17 +13,19 @@
 class Layer {
 public:
   Layer *prev, *next;
+  size_t memsize;
   int size, wsize, links;
   double *a, *da, *w;
 
   Layer(Layer *prev, int size):
-    prev(), next(), size(size), wsize(), links(), w()
+    prev(), next(), memsize(), size(size), wsize(), links(), w()
   {
     assert(size > 0);
     a = new double[size*2];
     da = a + size;
     memset(a, 0, sizeof(*a)*size*2);
     if (prev) (this->prev = &prev->back())->next = this;
+    memsize += size*2*sizeof(double);
   }
 
   virtual ~Layer() {
@@ -43,6 +45,8 @@ public:
 
   inline Layer& back()
     { Layer *l = this; while(l->next) l = l->next; return *l; }
+  inline size_t totalMemSize() const
+    { size_t s = 0; for(const Layer *l = this; l; l = l->next) s += l->memsize; return s; }
   inline int totalSize() const
     { int c = 0; for(const Layer *l = this; l; l = l->next) c += l->size; return c; }
   inline int totalLinks() const
@@ -111,6 +115,7 @@ public:
     double k = 1.0/this->prev->size;
     for(double *iw = w, *e = iw + wsize; iw < e; ++iw)
       *iw = (rand()/(double)RAND_MAX*2 - 1)*k;
+    memsize += wsize*sizeof(double);
   }
 
   Layer& pass() override {
diff --git a/simple/neural/nnlayer.lnk.inc.cpp b/simple/neural/nnlayer.lnk.inc.cpp
index 3efc334..d96a9c6 100644
--- a/simple/neural/nnlayer.lnk.inc.cpp
+++ b/simple/neural/nnlayer.lnk.inc.cpp
@@ -23,11 +23,20 @@ public:
     memset(wa, 0, sizeof(*wa)*wsize*2);
     for(double *iw = w, *e = iw + wsize; iw < e; ++iw)
       *iw = rand()/(double)RAND_MAX*2 - 1;
+    memsize += wsize*sizeof(double) + wsize*2*sizeof(double*);
   }
 
   ~LayerLink()
     { delete[] wa; }
 
+  bool selfCheck() const {
+    for(int i = 0; i < wsize; ++i)
+      if ( !wa[i] || wa[i] < prev->a || wa[i] >= prev->a + prev->size
+        || !wda[i] || wda[i] < prev->da || wda[i] >= prev->da + prev->size )
+        return false;
+    return true;
+  }
+
   Layer& pass() override {
     double *ia = a;
     double *iw = w;
@@ -72,69 +81,64 @@ public:
 
 class LayerLinkConvolution: public LayerLink {
 public:
-  LayerLinkConvolution(Layer &prev, int psx, int psy, int psz, int sx, int sy, int sz, int lsize, int step = 1, int pad = 0):
+  LayerLinkConvolution(Layer &prev, int psx, int psy, int psz, int sx, int sy, int sz, int lsize):
     LayerLink(prev, sx*sy*sz, lsize*psz)
   {
     assert(psx > 0 && psy > 0 && psz > 0);
     assert(sx > 0 && sy > 0 && sz > 0);
-    assert(step > 0 && pad >= 0);
     assert(psx*psy*psz == this->prev->size);
-    assert(pad + (sx-1)*step < psx);
-    assert(pad + (sy-1)*step < psy);
+    assert(lsize > 0 && lsize <= psx*psy);
 
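+    // Candidate offsets are ordered by jittered squared radius: rnd[] breaks
+    // ties pseudo-randomly, so each output neuron picks a roughly circular
+    // patch of lsize source cells around its mapped center.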
-    int hs = (int)ceil(sqrt(lsize)) + 1;
+    int hs = (int)sqrt(lsize*1.5) + 2;
     int s = hs*2 + 1;
+
     struct Point {
       int x, y, r;
-      inline bool operator< (const Point &p) const { return r < p.r; }
+      inline bool operator<(const Point &b) const
+        { return r < b.r; }
     } *points = new Point[s*s], *p = points;
-    for(int y = -hs; y <= hs; ++y) {
-      for(int x = -hs; x <= hs; ++x, ++p) {
-        int sector = x >= 0 && y > 0 ? 1
-                   : y >= 0 && x < 0 ? 2
-                   : x <= 0 && y < 0 ? 3
-                   : y <= 0 && x > 0 ? 4
-                   : 0;
-        int subsector = sector % 2 ? abs(x) >= abs(y) : abs(y) >= abs(x);
-        p->x = x;
-        p->y = y;
-        p->r = (x*x + y*y)*10 + sector*2 + subsector;
-      }
-    }
-    std::sort(points, points + s*s);
+    int r = 0;
+    static const int rnd[] = { 9, 12, 4, 6, 0, 15, 13, 8, 2, 3, 10, 1, 5, 11, 14, 7 };
+    for(int y = -hs; y <= hs; ++y)
+      for(int x = -hs; x <= hs; ++x, ++r, ++p)
+        { p->x = x, p->y = y, p->r = (x*x + y*y)*16 + rnd[r%16]; }
+    std::sort(points, p);
 
     int *order = new int[lsize];
-    for(int z = 0; z < sz; ++z) {
-      for(int y = 0; y < sy; ++y) {
-        for(int x = 0; x < sx; ++x) {
-          p = points;
-          int *io = order;
-          for(int i = 0; i < lsize; ++i, ++io) {
-            int xx, yy;
-            do {
-              xx = pad + x*step + p->x;
-              yy = pad + y*step + p->y;
-              ++p;
-            } while(xx < 0 || yy < 0 || xx >= psx || yy >= psy);
-            *io = yy*psx + xx;
-          }
-          std::sort(order, order + lsize);
-
-          double **iwa  = &wa [ (z*sx*sy + y*sx + x)*this->lsize ];
-          double **iwda = &wda[ (z*sx*sy + y*sx + x)*this->lsize ];
-          for(int pz = 0; pz < psz; ++pz) {
-            for(int i = 0; i < lsize; ++i, ++iwa, ++iwda) {
-              *iwa  = &this->prev->a [pz*psx*psy + order[i]];
-              *iwda = &this->prev->da[pz*psx*psy + order[i]];
+
+    for(int y = 0; y < sy; ++y) {
+      for(int x = 0; x < sx; ++x) {
+        int cx = (int)((x + 0.5)/(sx + 1)*(psx + 1));
+        int cy = (int)((y + 0.5)/(sy + 1)*(psy + 1));
+
+        p = points;
+        for(int l = 0; l < lsize; ++l) {
+          int px, py;
+          do { px = cx + p->x; py = cy + p->y; ++p; }
+          while(px < 0 || py < 0 || px >= psx || py >= psy);
+          order[l] = py*psx + px;
+        }
+        std::sort(order, order + lsize);
+
+        for(int z = 0; z < sz; ++z) {
+          for(int l = 0; l < lsize; ++l) {
+            for(int pz = 0; pz < psz; ++pz) {
+              int i = (((y*sx + x)*sz + z)*lsize + l)*psz + pz;
+              int pi = order[l]*psz + pz;
+              assert(i >= 0 && i < wsize);
+              assert(pi >= 0 && pi < this->prev->size);
+              wa[i]  = &this->prev->a[pi];
+              wda[i] = &this->prev->da[pi];
             }
           }
         }
       }
     }
-    delete[] order;
     delete[] points;
+    delete[] order;
+    assert(selfCheck());
   }
 };
diff --git a/simple/neural/nnlayer2.conv.inc.cpp b/simple/neural/nnlayer2.conv.inc.cpp
new file mode 100644
index 0000000..b5f8d20
--- /dev/null
+++ b/simple/neural/nnlayer2.conv.inc.cpp
@@ -0,0 +1,77 @@
+#ifndef NNLAYER2_CONV_INC_CPP
+#define NNLAYER2_CONV_INC_CPP
+
+
+#include "nnlayer2.inc.cpp"
+
+
+
+Layer* createConv(Layer &prev, int psx, int psy, int psz, int sx, int sy, int sz, int lsize) {
+  struct Point {
+    int x, y, r;
+    inline bool operator<(const Point &b) const { return r < b.r; }
+  };
+
+  static const int hs = 256;
+  static const int s = hs*2 + 1;
+  static const int rnd[] = { 9, 12, 4, 6, 0, 15, 13, 8, 2, 3, 10, 1, 5, 11, 14, 7 };
+  static std::vector<Point> points;
+
+  if (points.empty()) {
+    points.resize(s*s);
+    Point *p = points.data();
+    for(int y = -hs, r = 0; y <= hs; ++y)
+      for(int x = -hs; x <= hs; ++x, ++r, ++p)
+        { p->x = x, p->y = y, p->r = (x*x + y*y)*16 + rnd[r%16]; }
+    std::sort(points.begin(), points.end());
+  }
+
+  Layer &pl = prev.back();
+  assert(psx > 0 && psy > 0 && psz > 0);
+  assert(sx > 0 && sy > 0 && sz > 0);
+  assert(psx*psy*psz == pl.size);
+  assert(lsize > 0 && lsize <= psx*psy);
+
+  Layer &cl = *new Layer(&pl, sx*sy*sz, lsize*psz);
+  assert(cl.size && cl.lsize);
+
+  const Point *pb = points.data();
+  int *order = new int[lsize];
+
+  for(int y = 0; y < sy; ++y) {
+    for(int x = 0; x < sx; ++x) {
+      int cx = (int)((x + 0.5)/(sx + 1)*(psx + 1));
+      int cy = (int)((y + 0.5)/(sy + 1)*(psy + 1));
+
+      const Point *p = pb;
+      for(int l = 0; l < lsize; ++l) {
+        int px, py;
+        do { assert(p < pb + points.size()); px = cx + p->x; py = cy + p->y; ++p; }
+        while(px < 0 || py < 0 || px >= psx || py >= psy);
+        order[l] = py*psx + px;
+      }
+      std::sort(order, order + lsize);
+
+      for(int z = 0; z < sz; ++z) {
+        for(int l = 0; l < lsize; ++l) {
+          for(int pz = 0; pz < psz; ++pz) {
+            int i = (((y*sx + x)*sz + z)*lsize + l)*psz + pz;
+            int pi = order[l]*psz + pz;
+            assert(i >= 0 && i < cl.size*cl.lsize);
+            assert(pi >= 0 && pi < pl.size);
+            cl.links[i].nprev = pi;
+          }
+        }
+      }
+    }
+  }
+
+  delete[] order;
+  cl.prepareBackLinks();
+
+  return &cl;
+}
+
+
+
+#endif
diff --git a/simple/neural/nnlayer2.inc.cpp b/simple/neural/nnlayer2.inc.cpp
new file mode 100644
index 0000000..fb4a44e
--- /dev/null
+++ b/simple/neural/nnlayer2.inc.cpp
@@ -0,0 +1,258 @@
+#ifndef NNLAYER2_INC_CPP
+#define NNLAYER2_INC_CPP
+
+
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <cmath>
+
+#include <algorithm>
+#include <vector>
+
+
+typedef float Real;
+
+
+template<typename T>
+bool twrite(const T &x, FILE *f)
+  { return fwrite(&x, sizeof(x), 1, f); }
+template<typename T>
+bool tread(T &x, FILE *f)
+  { return fread(&x, sizeof(x), 1, f); }
+
+
+struct Link;
+struct Neuron {
+  Real v, d;
+  inline Neuron(): v(), d() { }
+};
+
+struct Link {
+  int nprev, lnext;
+  Real w;
+  inline Link(): nprev(), lnext(), w((Real)rand()/(Real)RAND_MAX*2 - 1) { }
+};
+
+
+class Layer {
+public:
+  Layer *prev, *next;
+
+  int size, lsize;
+  Neuron *neurons;
+  Link *links;
+  int lfirst;
+
+  explicit Layer(Layer *prev = nullptr, int size = 0, int lsize = 0):
+    prev(), next(), size(), lsize(), neurons(), links(), lfirst()
+  {
+    while(prev && prev->next) prev = prev->next;
+    if (prev) prev->next = this;
+    this->prev = prev;
+    init(size, lsize);
+  }
+
+  virtual ~Layer() {
+    if (next) delete next;
+    if (prev) prev->next = nullptr;
+    delete[] neurons;
+    if (links) delete[] links;
+  }
+
+  bool init(int size, int lsize = 0) {
+    clear();
+    if (size <= 0) return false;
+    if (lsize < 0) lsize = 0;
+
+    if (prev ? lsize <= 0 : lsize) return false;
+    if (size) neurons = new Neuron[size];
+    if (lsize) links = new Link[lsize*size];
+    this->size = size;
+    this->lsize = lsize;
+    return true;
+  }
+
+  void clear() {
+    if (neurons) delete[] neurons;
+    if (links) delete[] links;
+    this->size = lsize = 0;
+    neurons = nullptr;
+    links = nullptr;
+    lfirst = 0;
+  }
+
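+  // Threads all links into a single list (via lnext) ordered by source
+  // neuron nprev, so the backward pass can visit every link feeding the
+  // same previous-layer neuron consecutively; ordering is established by
+  // repeated adjacent swaps (a bubble sort over the list).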
+  void prepareBackLinks() {
+    if (!links || lsize <= 0) return;
+
+    lfirst = 0;
+    int lend = size*lsize;
+    for(Link *il = links, *e = il + lend; il < e; ++il) {
+      assert(prev && prev->neurons && il->nprev >= 0 && il->nprev < prev->size);
+      il->lnext = il - links + 1;
+    }
+
+    while(true) {
+      bool done = true;
+      for(int *il = &lfirst; links[*il].lnext != lend; il = &links[*il].lnext) {
+        int a = *il, b = links[a].lnext;
+        if (links[a].nprev > links[b].nprev) {
+          links[a].lnext = links[b].lnext;
+          links[b].lnext = a;
+          *il = b;
+          done = false;
+        }
+      }
+      if (done) break;
+    }
+
+    #ifndef NDEBUG
+    if (lsize) {
+      int next = 0;
+      for(int il = lfirst; il != lend; il = links[il].lnext) {
+        assert(links[il].nprev == next || links[il].nprev == next-1);
+        if (links[il].nprev == next) ++next;
+      }
+      assert(next == prev->size);
+    }
+    #endif
+  }
+
+  bool save(FILE *f) const {
+    if (!twrite(size, f) || !twrite(lsize, f))
+      return false;
+    for(Link *il = links, *e = il + lsize*size; il < e; ++il)
+      if (!twrite(il->nprev, f) || !twrite(il->w, f))
+        return false;
+    return true;
+  }
+
+  bool load(FILE *f) {
+    clear();
+    int size = 0, lsize = 0;
+    if (!tread(size, f) || !tread(lsize, f) || !init(size, lsize))
+      return false;
+    for(Link *il = links, *e = il + lsize*size; il < e; ++il)
+      if (!tread(il->nprev, f) || !tread(il->w, f) || !prev || !prev->neurons || il->nprev < 0 || il->nprev >= prev->size)
+        return false;
+    prepareBackLinks();
+    return true;
+  }
+
+  bool saveAll(FILE *f) const
+    { return save(f) && (!next || next->saveAll(f)); }
+  bool loadAll(FILE *f)
+    { return load(f) && (!next || next->loadAll(f)); }
+
+  bool saveAll(const char *filename) const {
+    assert(!prev);
+    FILE *f = fopen(filename, "wb");
+    if (!f)
+      { printf("cannot open file for write: %s\n", filename); return false; }
+    int count = totalLayers();
+    if (!twrite(count, f) || !saveAll(f))
+      { printf("cannot save to file: %s\n", filename); fclose(f); return false; }
+    fclose(f);
+    return true;
+  }
+
+  bool loadAll(const char *filename) {
+    assert(!prev);
+    FILE *f = fopen(filename, "rb");
+    if (!f)
+      { printf("cannot open file for read: %s\n", filename); return false; }
+    int count = 0;
+    if (!tread(count, f) || count <= 0)
+      { printf("cannot load from file: %s\n", filename); fclose(f); return false; }
+    if (next)
+      delete next;
+    while(--count)
+      new Layer(this);
+    if (!loadAll(f))
+      { printf("cannot load from file: %s\n", filename); fclose(f); return false; }
+    fclose(f);
+    return true;
+  }
+
+  int totalLayers() const
+    { int t = 0; for(const Layer *l = this; l; l = l->next) ++t; return t; }
+  int totalNeurons() const
+    { int t = 0; for(const Layer *l = this; l; l = l->next) t += l->size; return t; }
+  int totalLinks() const
+    { int t = 0; for(const Layer *l = this; l; l = l->next) t += l->size*l->lsize; return t; }
+  inline size_t totalMemSize() const
+    { return totalNeurons()*sizeof(Neuron) + totalLinks()*sizeof(Link); }
+
+  inline Layer& front()
+    { Layer *l = this; while(l->prev) l = l->prev; return *l; }
+  inline Layer& back()
+    { Layer *l = this; while(l->next) l = l->next; return *l; }
+
+  void pass(int nb, int ne) {
+    assert(prev);
+    int lsize = this->lsize;
+    Neuron *pn = prev->neurons;
+    Link *il = links + nb*lsize;
+    for(Neuron *in = neurons + nb, *e = neurons + ne; in < e; ++in) {
+      double s = 0;
+      for(Link *e = il + lsize; il < e; ++il)
+        s += pn[il->nprev].v * il->w;
+
+      // exp sigmoid
+      double ss = 1/(1 + exp(-s));
+      in->v = ss;
+      in->d = ss * (1-ss);
+
+      // 1/(x+1) sigmoid
+      //double ss = 1/(1+fabs(s));
+      //double ss2 = ss*0.5;
+      //in->v = s > 0 ? 1 - ss2 : ss2;
+      //in->d = ss2 * ss;
+    }
+  }
+
+  template<bool hasPrev>
+  void backpassTpl(int lb, int le) {
+    assert(hasPrev == (bool)prev);
+    if (lb == le) return;
+    Link *links = this->links;
+    Neuron *neurons = this->neurons;
+    Neuron *pneurons = prev->neurons;
+
+    double s = 0;
+    int ipn = links[lb].nprev;
+    for(int il = lb; il != le; ) {
+      Link &l = links[il];
+      if (hasPrev)
+        if (ipn != l.nprev)
+          { pneurons[ipn].d *= s; s = 0; ipn = l.nprev; }
+      Neuron &n = neurons[ il/lsize ];
+      Real d = n.d;
+      if (hasPrev) s += d * l.w;
+      l.w += d * pneurons[l.nprev].v;
+      il = l.lnext;
+    }
+    if (hasPrev)
+      pneurons[ipn].d *= s;
+
+    assert(le == size*lsize || ipn < links[le].nprev);
+  }
+
+  void backpass(int lb, int le) {
+    if (prev) backpassTpl<true>(lb, le); else backpassTpl<false>(lb, le);
+  }
+
+  void passAll() {
+    if (prev) pass(0, size);
+    if (next) next->passAll();
+  }
+
+  void backpassAll(Real k) {
+    backpass(lfirst, size*lsize);
+    if (prev) prev->backpassAll(k);
+  }
+};
+
+
+#endif
diff --git a/simple/neural/nnlayer2.mt.inc.cpp b/simple/neural/nnlayer2.mt.inc.cpp
new file mode 100644
index 0000000..23ea906
--- /dev/null
+++ b/simple/neural/nnlayer2.mt.inc.cpp
@@ -0,0 +1,167 @@
+#ifndef NNLAYER2_MT_INC_CPP
+#define NNLAYER2_MT_INC_CPP
+
+
+#include "nnlayer2.inc.cpp"
+
+
+#include <thread>
+#include <atomic>
+#include <vector>
+
+
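+// Minimal spinning barrier: every thread bumps the shared monotonic counter
+// once per wait() and spins until all `threads` increments of the current
+// round have arrived; `next` is this thread's expected counter value.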
+class Barrier {
+private:
+  std::atomic<unsigned int> &counter;
+  const unsigned int threads;
+  unsigned int next;
+public:
+  inline Barrier(std::atomic<unsigned int> &counter, unsigned int threads): counter(counter), threads(threads), next() { }
+  inline void wait() { next += threads; ++counter; while(counter < next); }
+};
+
+
+class TrainMT {
+private:
+  struct LDesc {
+    int nb, ne, lb, le;
+    double sumQ;
+    LDesc(): nb(), ne(), lb(), le(), sumQ() { }
+  };
+
+public:
+  Layer *layer;
+  const unsigned char *dataX;
+  const unsigned char *dataY;
+  int strideX;
+  int strideY;
+  int *shuffle;
+  int count;
+  Real trainRatio;
+
+  TrainMT():
+    layer(),
+    dataX(),
+    dataY(),
+    strideX(),
+    strideY(),
+    shuffle(),
+    count(),
+    trainRatio() { }
+
+private:
+  void trainFunc(int tid, int threads, std::atomic<unsigned int> &barrierCounter, LDesc *ldescs) {
+    Barrier barrier(barrierCounter, threads);
+
+    Layer &fl = *layer;
+    Layer &bl = layer->back();
+    int layersCount = fl.totalLayers();
+    LDesc *fld = ldescs, *bld = fld + layersCount - 1;
+
+    Real trainRatio = this->trainRatio;
+
+    double sumQ = 0;
+    for(int i = 0; i < count; ++i) {
+      int ii = shuffle[i];
+      const unsigned char *curX = dataX + strideX*ii;
+      const unsigned char *curY = dataY + strideY*ii;
+
+      const unsigned char *px = curX + fld->nb;
+      for(Neuron *in = fl.neurons + fld->nb, *e = fl.neurons + fld->ne; in < e; ++in, ++px)
+        in->v = Real(*px)*Real(1/255.0);
+
+      LDesc *ld = fld + 1;
+      for(Layer *l = fl.next; l; l = l->next, ++ld) {
+        barrier.wait();
+        l->pass(ld->nb, ld->ne);
+      }
+
+      double q = 0;
+      const unsigned char *py = curY + bld->nb;
+      for(Neuron *in = bl.neurons + bld->nb, *e = bl.neurons + bld->ne; in < e; ++in, ++py) {
+        Real d = Real(*py)*Real(1/255.0) - in->v;
+        in->d *= d * trainRatio;
+        q += d*d;
+      }
+      sumQ += q;
+
+      if (trainRatio > 0) {
+        ld = bld;
+        for(Layer *l = &bl; l->prev; l = l->prev, --ld) {
+          barrier.wait();
+          l->backpass(ld->lb, ld->le);
+        }
+      }
+
+      //if (!tid) printf(" - %d, %f, %f\n", i, q, sumQ);
+    }
+
+    ldescs->sumQ = sumQ;
+  }
+
+public:
+  double train(int threads) {
+    assert(threads > 0);
+    assert(layer && !layer->prev);
+    assert(dataX && dataY && shuffle);
+    assert(count > 0);
+    assert(trainRatio >= 0);
+
+    int layersCount = layer->totalLayers();
+    assert(layersCount > 0);
+    std::vector<LDesc> ldescs( threads*layersCount );
+
+    int layerId = 0;
+    for(Layer *l = layer; l; l = l->next, ++layerId) {
+      assert(layerId < layersCount);
+      int tsize = l->size/threads;
+      for(int tid = 0; tid < threads; ++tid) {
+        LDesc &desc = ldescs[tid*layersCount + layerId];
+        desc.nb = tid*tsize;
+        desc.ne = desc.nb + tsize;
+        if (tid == threads-1) desc.ne = l->size;
+      }
+
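+      // Partition each layer's link list between threads, cutting only where
+      // nprev changes, so no two threads ever update the same previous-layer
+      // neuron's delta during the backward pass.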
+      if (int lsize = l->size*l->lsize) {
+        int tlsize = lsize/threads;
+        int ipn = l->links[ l->lfirst ].nprev;
+        int tid = 0;
+        int count = 0;
+
+        ldescs[tid*layersCount + layerId].lb = l->lfirst;
+        if (threads > 1) {
+          for(int il = l->lfirst; il != lsize; il = l->links[il].lnext, ++count) {
+            Link &link = l->links[il];
+            if (ipn != link.nprev) {
+              if (count >= tlsize) {
+                ldescs[tid*layersCount + layerId].le = il;
+                ++tid;
+                count -= tlsize;
+                ldescs[tid*layersCount + layerId].lb = il;
+                if (tid == threads - 1) break;
+              }
+              ipn = link.nprev;
+            }
+          }
+        }
+        ldescs[tid*layersCount + layerId].le = lsize;
+      }
+    }
+    assert(layerId == layersCount);
+
+    std::atomic<unsigned int> barrierCounter(0);
+    std::vector<std::thread*> t(threads - 1);
+    for(int i = 1; i < threads; ++i)
+      t[i-1] = new std::thread(&TrainMT::trainFunc, this, i, threads, std::ref(barrierCounter), &ldescs[i*layersCount]);
+    trainFunc(0, threads, barrierCounter, &ldescs[0]);
+
+    double result = ldescs[0].sumQ;
+    for(int i = 1; i < threads; ++i)
+      { t[i-1]->join(); delete t[i-1]; result += ldescs[i*layersCount].sumQ; }
+
+    return result/(count * layer->back().size);
+  }
+};
+
+
+#endif
diff --git a/simple/neural/nnlayer3.inc.cpp b/simple/neural/nnlayer3.inc.cpp
new file mode 100644
index 0000000..4151893
--- /dev/null
+++ b/simple/neural/nnlayer3.inc.cpp
@@ -0,0 +1,370 @@
+#ifndef NNLAYER3_INC_CPP
+#define NNLAYER3_INC_CPP
+
+
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <cmath>
+
+#include <algorithm>
+#include <vector>
+
+
+typedef float Real;
+
+
+template<typename T>
+bool twrite(const T &x, FILE *f)
+  { return fwrite(&x, sizeof(T), 1, f); }
+template<typename T>
+bool tread(T &x, FILE *f)
+  { return fread(&x, sizeof(T), 1, f); }
+
+
+struct Neuron {
+  Real v, d;
+};
+
+
+class Layer {
+public:
+  Layer *prev, *next;
+
+  int sx, sy, sz, sl;
+  Neuron *neurons;
+  Real *weights;
+  int *invWeights;
+  int *invNeurons;
+
+
+  explicit Layer(Layer *prev = nullptr, int sx = 0, int sy = 0, int sz = 0, int sl = 0):
+    prev(), next(), sx(), sy(), sz(), sl(), neurons(), weights(), invWeights(), invNeurons()
+  {
+    while(prev && prev->next) prev = prev->next;
+    if (prev) prev->next = this;
+    this->prev = prev;
+    init(sx, sy, sz, sl);
+  }
+
+
+  virtual ~Layer() {
+    clear();
+    if (next) delete next;
+    if (prev) prev->next = nullptr;
+  }
+
+
+  void clear() {
+    if (next) next->clear();
+    if (neurons) delete[] neurons;
+    if (weights) delete[] weights;
+    if (invWeights) delete[] invWeights;
+    sx = sy = sz = sl = 0;
+    neurons = nullptr;
+    weights = nullptr;
+    invWeights = nullptr;
+    invNeurons = nullptr;
+  }
+
+
+  bool init(int sx, int sy, int sz, int sl = 0) {
+    clear();
+    if (prev && !prev->countNeurons()) return false;
+    if (sx <= 0 || sy <= 0 || sz <= 0) return false;
+    if (prev ? (sl <= 0 || sl > prev->sx || sl > prev->sy) : sl != 0) return false;
+    printf("init %d %d %d %d\n", sx, sy, sz, sl);
+
+    this->sx = sx;
+    this->sy = sy;
+    this->sz = sz;
+
+    int size = sx*sy*sz;
+    neurons = new Neuron[size];
+    memset(neurons, 0, sizeof(*neurons)*size);
+
+    if (prev) {
+      int psx = prev->sx;
+      int psy = prev->sy;
+      int psz = prev->sz;
+      int psize = psx*psy*psz;
+
+      this->sl = sl;
+      int wsize = size*sl*sl*psz;
+      weights = new Real[wsize];
+      double k = 1.0/(sl*sl*psz);
+      for(int i = 0; i < wsize; ++i)
+        weights[i] = Real( (rand()/(double)RAND_MAX*2 - 1)*k );
+
+      struct Link {
+        int n, w;
+        inline bool operator< (const Link &b) const
+          { return n < b.n ? true : (b.n < n ? false : w < b.w); }
+      } *links = new Link[wsize], *il = links;
+
+      int dx = sx > 1 ? sx - 1 : 1;
+      int dy = sy > 1 ? sy - 1 : 1;
+
+      for(int y = 0; y < sy; ++y)
+        for(int x = 0; x < sx; ++x) {
+          int py = y*(psy-sl)/dy;
+          int px = x*(psx-sl)/dx;
+          assert(py >= 0 && py <= psy-sl);
+          assert(px >= 0 && px <= psx-sl);
+
+          for(int z = 0; z < sz; ++z)
+            for(int ly = 0; ly < sl; ++ly)
+              for(int lx = 0; lx < sl; ++lx)
+                for(int pz = 0; pz < psz; ++pz, ++il) {
+                  assert(il < links + wsize);
+                  il->n = ((py+ly)*psx + px+lx)*psz + pz;
+                  il->w = il - links;
+                  assert(il->n >= 0 && il->n < psize);
+                }
+        }
+      assert(il == links + wsize);
+      std::sort(links, il);
+
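+      // Build a CSR-like inverse index: invNeurons[p]..invNeurons[p+1] bounds
+      // the range of invWeights entries listing every weight that reads
+      // previous-layer neuron p; the backward pass walks these ranges.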
+      invWeights = new int[wsize + psize + 1];
+      invNeurons = invWeights + wsize;
+      invNeurons[0] = 0;
+      int pni = 0;
+      for(int i = 0; i < wsize; ++i) {
+        assert(pni == links[i].n || pni == links[i].n + 1);
+        invWeights[i] = links[i].w;
+        if (pni == links[i].n)
+          invNeurons[pni++] = i;
+      }
+      assert(pni == psize);
+      invNeurons[psize] = wsize;
+
+      delete[] links;
+    }
+    return true;
+  }
+
+  bool save(FILE *f) const {
+    return twrite(sx, f) && twrite(sy, f) && twrite(sz, f) && twrite(sl, f)
+        && (!weights || fwrite(weights, sizeof(*weights)*sx*sy*sz*sl*sl*prev->sz, 1, f));
+  }
+
+  bool load(FILE *f) {
+    clear();
+    int sx = 0, sy = 0, sz = 0, sl = 0;
+    return tread(sx, f) && tread(sy, f) && tread(sz, f) && tread(sl, f) && init(sx, sy, sz, sl)
+        && (!weights || fread(weights, sizeof(*weights)*sx*sy*sz*sl*sl*prev->sz, 1, f));
+  }
+
+  bool saveAll(FILE *f) const
+    { return save(f) && (!next || next->saveAll(f)); }
+  bool loadAll(FILE *f)
+    { return load(f) && (!next || next->loadAll(f)); }
+
+  bool saveAll(const char *filename) const {
+    assert(!prev);
+    FILE *f = fopen(filename, "wb");
+    if (!f)
+      { printf("cannot open file for write: %s\n", filename); return false; }
+    int count = totalLayers();
+    if (!twrite(count, f) || !saveAll(f))
+      { printf("cannot save to file: %s\n", filename); fclose(f); return false; }
+    fclose(f);
+    return true;
+  }
+
+  bool loadAll(const char *filename) {
+    assert(!prev);
+    FILE *f = fopen(filename, "rb");
+    if (!f)
+      { printf("cannot open file for read: %s\n", filename); return false; }
+    int count = 0;
+    if (!tread(count, f) || count <= 0)
+      { printf("cannot load from file: %s\n", filename); fclose(f); return false; }
+    if (next)
+      delete next;
+    while(--count)
+      new Layer(this);
+    if (!loadAll(f))
+      { printf("cannot load from file: %s\n", filename); fclose(f); return false; }
+    fclose(f);
+    return true;
+  }
+
+  inline int countNeurons() const
+    { return sx*sy*sz; }
+  inline int linksPerNeuron() const
+    { return prev ? sl*sl*prev->sz : 0; }
+  inline int countLinks() const
+    { return countNeurons() * linksPerNeuron(); }
+  inline size_t memSize() const {
+    return sizeof(*neurons)*countNeurons()
+         + (sizeof(*weights) + sizeof(*invWeights))*countLinks()
+         + (prev ? sizeof(*invNeurons)*(prev->countNeurons() + 1) : 0);
+  }
+
+  int totalLayers() const
+    { int t = 0; for(const Layer *l = this; l; l = l->next) ++t; return t; }
+  int totalNeurons() const
+    { int t = 0; for(const Layer *l = this; l; l = l->next) t += l->countNeurons(); return t; }
+  int totalLinks() const
+    { int t = 0; for(const Layer *l = this; l; l = l->next) t += l->countLinks(); return t; }
+  size_t totalMemSize() const
+    { size_t t = 0; for(const Layer *l = this; l; l = l->next) t += l->memSize(); return t; }
+
+  inline Layer& front()
+    { Layer *l = this; while(l->prev) l = l->prev; return *l; }
+  inline Layer& back()
+    { Layer *l = this; while(l->next) l = l->next; return *l; }
+
+
+  void pass(int y0, int y1) {
+    if (!prev) return;
+
+    int sx = this->sx;
+    int sy = this->sy;
+    int sz = this->sz;
+    int sxz = sx*sz;
+
+    int psx = prev->sx;
+    int psy = prev->sy;
+    int psz = prev->sz;
+    int psxz = psx*psz;
+    Neuron *pneurons = prev->neurons;
+
+    int sl = this->sl;
+    int slpz = sl*psz;
+    int sllpz = sl*slpz;
+
+    int ldy = psxz - slpz;
+
+    int kpx = psx-sl, dpx = sx>1 ? sx-1 : 1;
+    int kpy = psy-sl, dpy = sy>1 ? sy-1 : 1;
+
+    Real *iw = weights + y0*sxz*sllpz;
+    Neuron *in = neurons + y0*sxz;
+    int fpy = y0*kpy;
+    for(Neuron *e = in + (y1-y0)*sxz; in < e; fpy += kpy) {
+      Neuron *pnrow = pneurons + fpy/dpy*psxz;
+
+      int fpx = 0;
+      for(Neuron *e = in + sxz; in < e; fpx += kpx) {
+        Neuron *pn = pnrow + fpx/dpx*psz;
+
+        for(Neuron *e = in + sz; in < e; ++in) {
+          double s = 0;
+
+          Neuron *ipn = pn;
+          for(Real *e = iw + sllpz; iw < e; ipn += ldy)
+            for(Real *e = iw + slpz; iw < e; ++iw, ++ipn)
+              s += *iw * ipn->v;
+
+          // exp sigmoid
+          //double ss = 1/(1 + exp(-s));
+          //in->v = ss;
+          //in->d = ss * (1-ss);
+
+          // 1/(x+1) sigmoid
+          double ss = 1/(1+fabs(s));
+          double ss2 = ss*0.5;
+          in->v = s > 0 ? 1 - ss2 : ss2;
+          in->d = ss2 * ss;
+        }
+      }
+    }
+  }
+
+
+  void backpassWeights(int y0, int y1) {
+    assert(prev);
+
+    int sx = this->sx;
+    int sy = this->sy;
+    int sz = this->sz;
+    int sxz = sx*sz;
+
+    int psx = prev->sx;
+    int psy = prev->sy;
+    int psz = prev->sz;
+    int psxz = psx*psz;
+    Neuron *pneurons = prev->neurons;
+
+    int sl = this->sl;
+    int slpz = sl*psz;
+    int sllpz = sl*slpz;
+
+    int ldy = psxz - slpz;
+
+    int kpx = psx-sl, dpx = sx>1 ? sx-1 : 1;
+    int kpy = psy-sl, dpy = sy>1 ? sy-1 : 1;
+
+    Real *iw = weights + y0*sxz*sllpz;
+    Neuron *in = neurons + y0*sxz;
+    int fpy = y0*kpy;
+    for(Neuron *e = in + (y1-y0)*sxz; in < e; fpy += kpy) {
+      Neuron *pnrow = pneurons + fpy/dpy*psxz;
+
+      int fpx = 0;
+      for(Neuron *e = in + sxz; in < e; fpx += kpx) {
+        Neuron *pn = pnrow + fpx/dpx*psz;
+
+        for(Neuron *e = in + sz; in < e; ++in) {
+          Real d = in->d;
+
+          Neuron *ipn = pn;
+          for(Real *e = iw + sllpz; iw < e; ipn += ldy)
+            for(Real *e = iw + slpz; iw < e; ++iw, ++ipn)
+              *iw += ipn->v * d;
+        }
+      }
+    }
+  }
+
+
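+  // Backward pass seen from the input side of the next layer: for each neuron
+  // of this layer, walk its inverse-index range over the next layer's weights,
+  // accumulate the error signal s and (if WithWeights) update those weights
+  // in the same sweep.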
+  template<bool WithWeights>
+  void backpassTpl(int y0, int y1) {
+    assert(next);
+
+    Neuron *nneurons = next->neurons;
+    Real *nweights = next->weights;
+    int *nInvWeights = next->invWeights;
+    int lpn = next->linksPerNeuron();
+
+    int sxz = sx*sz;
+
+    Neuron *in = neurons + y0*sxz;
+    int *inni = next->invNeurons + y0*sxz;
+    for(Neuron *e = in + (y1-y0)*sxz; in < e; ++in) {
+      Real v = in->v;
+      double s = 0;
+
+      int *iw = nInvWeights + *inni++;
+      for(int *e = nInvWeights + *inni; iw < e; ++iw) {
+        int wi = *iw;
+        Real d = nneurons[wi/lpn].d;
+        Real &nw = nweights[wi];
+        s += nw * d;
+        if (WithWeights) nw += d * v;
+      }
+
+      in->d *= s;
+    }
+  }
+
+  void passAll() {
+    if (prev) pass(0, sy);
+    if (next) next->passAll();
+  }
+
+  void backpassAll() {
+    if (!prev)
+      return;
+    if (next)
+      backpassTpl<true>(0, sy);
+    if (!prev->prev)
+      { backpassWeights(0, sy); return; }
+    return prev->backpassAll();
+  }
+};
+
+
+#endif
diff --git a/simple/neural/nnlayer3.mt.inc.cpp b/simple/neural/nnlayer3.mt.inc.cpp
new file mode 100644
index 0000000..584467a
--- /dev/null
+++ b/simple/neural/nnlayer3.mt.inc.cpp
@@ -0,0 +1,167 @@
+#ifndef NNLAYER3_MT_INC_CPP
+#define NNLAYER3_MT_INC_CPP
+
+
+#include "nnlayer3.inc.cpp"
+
+
+#include <thread>
+#include <atomic>
+#include <vector>
+
+
+class Barrier {
+private:
+  std::atomic<unsigned int> &counter;
+  const unsigned int threads;
+  unsigned int next;
+public:
+  inline Barrier(std::atomic<unsigned int> &counter, unsigned int threads): counter(counter), threads(threads), next() { }
+  inline void wait() { next += threads; ++counter; while(counter < next); }
+  inline void subwait(int tid) { while(counter < next + tid); }
+};
+
+
+class TrainMT {
+private:
+  struct LDesc {
+    int y0, y1;
+    double sumQ;
+    LDesc(): y0(), y1(), sumQ() { }
+  };
+
+public:
+  Layer *layer;
+  const unsigned char *dataX;
+  const unsigned char *dataY;
+  int strideX;
+  int strideY;
+  int *shuffle;
+  int count;
+  Real trainRatio;
+
+  TrainMT():
+    layer(),
+    dataX(),
+    dataY(),
+    strideX(),
+    strideY(),
+    shuffle(),
+    count(),
+    trainRatio() { }
+
+private:
+  void trainFunc(int tid, int threads, std::atomic<unsigned int> &barrierCounter, LDesc *ldescs) {
+    Barrier barrier(barrierCounter, threads);
+
+    Layer &fl = *layer;
+    Layer &bl = layer->back();
+    int layersCount = fl.totalLayers();
+    LDesc *fld = ldescs, *bld = fld + layersCount - 1;
+
+    Real trainRatio = this->trainRatio;
+
+    int fsxz = fl.sx*fl.sz;
+    int bsxz = bl.sx*bl.sz;
+
+    //barrier.subwait(tid);
+    //for(LDesc *ld = fld; ld <= bld; ++ld)
+    //  printf("t%d %d %d %d\n", tid, (int)(ld-fld), ld->y0, ld->y1);
+    //barrier.wait();
+
+    const unsigned char *dataX = this->dataX + fsxz*fld->y0;
+    const unsigned char *dataY = this->dataY + bsxz*bld->y0;
+
+    double sumQ = 0;
+    for(int i = 0; i < count; ++i) {
+      int ii = shuffle[i];
+      const unsigned char *curX = dataX + strideX*ii;
+      const unsigned char *curY = dataY + strideY*ii;
+
+      barrier.wait();
+      const unsigned char *px = curX;
+      for(Neuron *in = fl.neurons + fsxz*fld->y0, *e = fl.neurons + fsxz*fld->y1; in < e; ++in, ++px)
+        in->v = Real(*px)*Real(1/255.0);
+
+      LDesc *ld = fld + 1;
+      for(Layer *l = fl.next; l; l = l->next, ++ld) {
+        barrier.wait();
+        l->pass(ld->y0, ld->y1);
+      }
+
+      double q = 0;
+      const unsigned char *py = curY;
+      for(Neuron *in = bl.neurons + bsxz*bld->y0, *e = bl.neurons + bsxz*bld->y1; in < e; ++in, ++py) {
+        Real v = (in->v - 0.25)*2;
+        Real d = Real(*py)*Real(1/255.0) - v;
+        in->d *= d * trainRatio;
+        d *= d;
+        q += d*d;
+      }
+      sumQ += q;
+
+      if (trainRatio > 0) {
+        ld = bld;
+        for(Layer *l = &bl; l->prev; l = l->prev, --ld) {
+          if (l->next) {
+            barrier.wait();
+            l->backpassTpl<true>(ld->y0, ld->y1);
+            //l->backpassTpl<false>(ld->y0, ld->y1);
+            //barrier.wait();
+            //l->next->backpassWeights(ld[1].y0, ld[1].y1);
+          }
+          if (!l->prev->prev) {
+            barrier.wait();
+            l->backpassWeights(ld->y0, ld->y1);
+            break;
+          }
+        }
+      }
+
+      //if (!tid) printf(" - %d, %f, %f\n", i, q, sumQ);
+    }
+
+    ldescs->sumQ = sumQ;
+  }
+
+public:
+  double train(int threads) {
+    assert(threads > 0);
+    assert(layer && !layer->prev);
+    assert(dataX && dataY && shuffle);
+    assert(count > 0);
+    assert(trainRatio >= 0);
+
+    int layersCount = layer->totalLayers();
+    assert(layersCount > 0);
+    std::vector<LDesc> ldescs( threads*layersCount );
+
+    int layerId = 0;
+    for(Layer *l = layer; l; l = l->next, ++layerId) {
+      assert(layerId < layersCount);
+      int tsy = l->sy/threads;
+      for(int tid = 0; tid < threads; ++tid) {
+        LDesc &desc = ldescs[tid*layersCount + layerId];
+        desc.y0 = tid*tsy;
+        desc.y1 = desc.y0 + tsy;
+        if (tid == threads-1) desc.y1 = l->sy;
+      }
+    }
+    assert(layerId == layersCount);
+
+    std::atomic<unsigned int> barrierCounter(0);
+    std::vector<std::thread*> t(threads - 1);
+    for(int i = 1; i < threads; ++i)
+      t[i-1] = new std::thread(&TrainMT::trainFunc, this, i, threads, std::ref(barrierCounter), &ldescs[i*layersCount]);
+    trainFunc(0, threads, barrierCounter, &ldescs[0]);
+
+    double result = ldescs[0].sumQ;
+    for(int i = 1; i < threads; ++i)
+      { t[i-1]->join(); delete t[i-1]; result += ldescs[i*layersCount].sumQ; }
+
+    return sqrt(sqrt( result/(count * layer->back().countNeurons()) ));
+  }
+};
+
+
+#endif
diff --git a/simple/neural/tga.inc.cpp b/simple/neural/tga.inc.cpp
new file mode 100644
index 0000000..bd21033
--- /dev/null
+++ b/simple/neural/tga.inc.cpp
@@ -0,0 +1,62 @@
+#ifndef TGA_INC_CPP
+#define TGA_INC_CPP
+
+
+#include <cstdio>
+
+
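+// Writes an uncompressed true-color TGA (image type 2): an 18-byte header,
+// then pixels in BGR(A) order with rows emitted bottom-up to match TGA's
+// default bottom-left origin.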
+bool tgaSave(const char *filename, const unsigned char *data, int w, int h, int ch) {
+  if (!data || w <= 0 || h <= 0 || w > 0xffff || h > 0xffff || (ch != 3 && ch != 4)) {
+    printf("ERROR: cannot save image (bad image): %s\n", filename);
+    return false;
+  }
+
+  FILE *f = fopen(filename, "wb");
+  if (!f) {
+    printf("ERROR: cannot open file: %s\n", filename);
+    return false;
+  }
+
+  #pragma pack(push,1)
+  struct Header {
+    unsigned char idLength;
+    unsigned char colormapType;
+    unsigned char imageType;
+    unsigned char colormapIndex[2];
+    unsigned char colormapLength[2];
+    unsigned char colormapSize;
+    unsigned char xOrigin[2];
+    unsigned char yOrigin[2];
+    unsigned char width[2];
+    unsigned char height[2];
+    unsigned char pixelSize;
+    unsigned char attributes;
+  };
+  #pragma pack(pop)
+  Header header = {};
+  header.imageType = 2;
+  header.width[0] = w;
+  header.width[1] = w >> 8;
+  header.height[0] = h;
+  header.height[1] = h >> 8;
+  header.pixelSize = ch == 4 ? 32 : 24;
+  fwrite(&header, sizeof(header), 1, f);
+
+  int rowSize = w*ch;
+  int size = h*rowSize;
+  const unsigned char *row = data + size;
+  for(unsigned short r = h; r; --r, row -= rowSize) {
+    for(const unsigned char *c = row - rowSize; c < row; c += ch) {
+      fputc(c[2], f);
+      fputc(c[1], f);
+      fputc(c[0], f);
+      if (ch == 4) fputc(c[3], f);
+    }
+  }
+  fclose(f);
+
+  return true;
+}
+
+
+#endif