From 15c502761a925190c30c7039bedec92abcfde503 Mon Sep 17 00:00:00 2001
From: Ivan Mahonin
Date: Sat, 18 Mar 2023 07:15:05 +0000
Subject: neural: layer sub

Add LayerSub, a 2x2 max-pooling layer that remembers which input neuron
won each output cell so the backward pass can route deltas to it. Also
rename the generic shared-convolution iterator to
iterateConvolutionSharedDyn and add compile-time-specialized variants
behind a runtime dispatcher, give Barrier a per-thread random seed plus
a randomized busy-loop for spinning, and switch the trainer to a
conv/sub/conv/sub/fc/fc topology.

---
diff --git a/projects/neural/layer.all.inc.cpp b/projects/neural/layer.all.inc.cpp
index 2ccae82..3391d92 100644
--- a/projects/neural/layer.all.inc.cpp
+++ b/projects/neural/layer.all.inc.cpp
@@ -4,6 +4,7 @@
 
 #include "layer.simple.inc.cpp"
 #include "layer.conv.inc.cpp"
+#include "layer.sub.inc.cpp"
 
 
 #endif
diff --git a/projects/neural/layer.conv.shared.inc.cpp b/projects/neural/layer.conv.shared.inc.cpp
index dee1861..366093f 100644
--- a/projects/neural/layer.conv.shared.inc.cpp
+++ b/projects/neural/layer.conv.shared.inc.cpp
@@ -46,7 +46,7 @@ void iterateTestConvolutionShared(Layout cl, Layout pl, Kernel k, Neuron *c_neur
 
 
 template<class Iter>
-void iterateConvolutionShared(Layout cl, Layout pl, Layout wl, Kernel k, Neuron *c_neurons, Neuron *p_neurons, Weight *weights) {
+void iterateConvolutionSharedDyn(Layout cl, Layout pl, Layout wl, Kernel k, Neuron *c_neurons, Neuron *p_neurons, Weight *weights) {
   if (!cl) return;
   assert(pl);
   assert(wl);
@@ -95,6 +95,105 @@
 }
 
 
+template<class Iter, int KSX, int KSY, int PD>
+void iterateConvolutionSharedXYD(Layout cl, Layout pl, Layout wl, Kernel k, Neuron *c_neurons, Neuron *p_neurons, Weight *weights) {
+  if (!cl) return;
+  assert(pl);
+  assert(wl);
+  assert(k);
+  assert(c_neurons);
+  assert(p_neurons);
+  assert(weights);
+  assert(cl.isSubLayoutOf(wl));
+  assert(pl.x0 + k.ox >= 0 && pl.x0 + (wl.getW()-1)*k.dx + k.ox + k.sx <= pl.sx);
+  assert(pl.y0 + k.oy >= 0 && pl.y0 + (wl.getH()-1)*k.dy + k.oy + k.sy <= pl.sy);
+
+  assert(KSX == k.sx);
+  assert(KSY == k.sy);
+  assert(PD == pl.getD());
+
+  int c_h = cl.getH();
+  int c_w = cl.getW();
+  int c_d = cl.getD();
+  int c_swz = c_w*cl.sz;
+  int c_shxz = c_h*cl.sx*cl.sz;
+  int c_dx = cl.sz - c_d;
+  int c_dy = (cl.sx - c_w)*cl.sz;
+
+  int p_dx = k.dx*pl.sz;
+  int p_dy = k.dy*pl.sx*pl.sz - c_w*p_dx;
+
+  int p_ddy = (pl.sx - KSX)*pl.sz;
+  int p_ddx = pl.sz - PD;
+
+  Neuron *icn = c_neurons + (cl.y0*cl.sx + cl.x0)*cl.sz + cl.z0;
+  Neuron *ipn = p_neurons + ((pl.y0 + (cl.y0 - wl.y0)*k.dy + k.oy)*pl.sx + pl.x0 + (cl.x0 - wl.x0)*k.dx + k.ox)*pl.sz + pl.z0;
+
+  for(Neuron *e = icn + c_shxz; icn < e; icn += c_dy, ipn += p_dy)
+  for(Neuron *e = icn + c_swz; icn < e; icn += c_dx, ipn += p_dx)
+  for(Neuron *e = icn + c_d; icn < e; ++icn) {
+    typename Iter::AccumType a;
+    Iter::init(*icn, a);
+
+    Neuron *iipn = ipn;
+    Weight *iw = weights;
+    for(int i = 0; i < KSY; ++i, iipn += p_ddy)
+    for(int i = 0; i < KSX; ++i, iipn += p_ddx)
+    for(int i = 0; i < PD; ++i, ++iw, ++iipn)
+      Iter::iter(*iipn, *iw, a);
+
+    Iter::done(*icn, a);
+  }
+}
+
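+// Dispatch helpers: getIterateConvolutionSharedFunc<Iter>(ksx, ksy, pd) picks
+// a pointer to an instantiation with the kernel size and previous-layer depth
+// baked in (e.g. ksx == ksy == 5, pd == 1 would map to
+// iterateConvolutionSharedXYD<Iter, 5, 5, 1>, whose fixed-bound inner loops
+// the compiler can unroll). Note the "0 &&" guard below: it currently
+// short-circuits the switch, so iterateConvolutionSharedDyn is always chosen.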
+typedef void (*iterateConvolutionSharedFunc)(Layout, Layout, Layout, Kernel, Neuron*, Neuron*, Weight*);
+template<class Iter, int KSX, int KSY>
+iterateConvolutionSharedFunc getIterateConvolutionSharedFuncXY(int pd) {
+  if (pd <= 8) switch(pd) {
+    case 1: return &iterateConvolutionSharedXYD<Iter, KSX, KSY, 1>;
+    case 2: return &iterateConvolutionSharedXYD<Iter, KSX, KSY, 2>;
+    case 3: return &iterateConvolutionSharedXYD<Iter, KSX, KSY, 3>;
+    case 4: return &iterateConvolutionSharedXYD<Iter, KSX, KSY, 4>;
+    case 5: return &iterateConvolutionSharedXYD<Iter, KSX, KSY, 5>;
+    case 6: return &iterateConvolutionSharedXYD<Iter, KSX, KSY, 6>;
+    case 7: return &iterateConvolutionSharedXYD<Iter, KSX, KSY, 7>;
+    case 8: return &iterateConvolutionSharedXYD<Iter, KSX, KSY, 8>;
+  }
+  return &iterateConvolutionSharedDyn<Iter>;
+}
+
+
+template<class Iter>
+iterateConvolutionSharedFunc getIterateConvolutionSharedFunc(int ksx, int ksy, int pd) {
+  if (0 && ksx == ksy && pd <= 8) switch(ksx) {
+    case 1: return getIterateConvolutionSharedFuncXY<Iter, 1, 1>(pd);
+    case 2: return getIterateConvolutionSharedFuncXY<Iter, 2, 2>(pd);
+    case 3: return getIterateConvolutionSharedFuncXY<Iter, 3, 3>(pd);
+    case 4: return getIterateConvolutionSharedFuncXY<Iter, 4, 4>(pd);
+    case 5: return getIterateConvolutionSharedFuncXY<Iter, 5, 5>(pd);
+    case 6: return getIterateConvolutionSharedFuncXY<Iter, 6, 6>(pd);
+    case 7: return getIterateConvolutionSharedFuncXY<Iter, 7, 7>(pd);
+    case 8: return getIterateConvolutionSharedFuncXY<Iter, 8, 8>(pd);
+  }
+  return &iterateConvolutionSharedDyn<Iter>;
+}
+
+
+template<class Iter>
+void iterateConvolutionShared(const Layout &cl, const Layout &pl, const Layout &wl, const Kernel &k, Neuron *c_neurons, Neuron *p_neurons, Weight *weights) {
+  iterateConvolutionSharedFunc f = getIterateConvolutionSharedFunc<Iter>(k.sx, k.sy, pl.getD());
+  f(cl, pl, wl, k, c_neurons, p_neurons, weights);
+}
+
+
+
+
 template<class Iter>
 void iterateConvolutionSharedPoint(Layout cl, Layout pl, Layout wl, Kernel k, int kx, int ky, Neuron *c_neurons, Neuron *p_neurons, Weight *weights) {
   if (!cl) return;
diff --git a/projects/neural/layer.inc.cpp b/projects/neural/layer.inc.cpp
index 0e7b891..0d6447a 100644
--- a/projects/neural/layer.inc.cpp
+++ b/projects/neural/layer.inc.cpp
@@ -29,6 +29,11 @@ typedef int AccumInt;
 #define RANDOM_MAX 0x7fffffff
 inline unsigned int randomNext(unsigned int prev)
   { return (1103515245*prev + 12345) & RANDOM_MAX; }
+inline unsigned int randomBranch(unsigned int seed)
+  { return randomNext(seed + 1); }
+
+inline void busyloop(unsigned int count)
+  { while(count--) __asm__ __volatile__(""); }
 
 
 struct Accum {
@@ -64,15 +69,25 @@ class Barrier {
 private:
   std::atomic<unsigned int> &counter;
   unsigned int next;
+  unsigned int busyseed;
 
 public:
   const unsigned int tid;
   const unsigned int threads;
+  unsigned int seed;
 
   Barrier(const Barrier&) = delete;
-  inline Barrier(std::atomic<unsigned int> &counter, unsigned int tid, unsigned int threads):
-    counter(counter), next(), tid(tid), threads(threads) { assert(tid < threads); }
-  inline void wait() { next += threads; ++counter; while(counter < next); }
-  inline void subwait() { while(counter < next + tid); }
+  inline Barrier(std::atomic<unsigned int> &counter, unsigned int tid, unsigned int threads, unsigned int seed):
+    counter(counter), next(), busyseed(randomBranch(seed)), tid(tid), threads(threads), seed(seed) { assert(tid < threads); }
+
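+  // Randomized backoff for the spin loops below: each waiter burns a
+  // pseudo-random number of empty iterations (at most maxCycles) between
+  // checks of the shared counter, using the private busyseed stream.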
+  //inline void busyloop() { }
+  inline void busyloop(unsigned int maxCycles = 4096) { ::busyloop( (busyseed = randomNext(busyseed))%maxCycles ); }
+  inline unsigned int rand() { return seed = randomNext(seed); }
+  inline void wait() { next += threads; ++counter; while(counter < next) busyloop(); }
+  inline void subwait() { while(counter < next + tid) busyloop(); }
+
 };
@@ -136,7 +148,7 @@ public:
     assert(layout);
     assert(neuronsCount > 0);
     assert(weightsCount >= 0);
-    assert(!prev == !weightsCount);
+    assert(prev || !weightsCount);
 
     if (this->prev) this->prev->next = this;
     if (neuronsCount) {
diff --git a/projects/neural/layer.simple.inc.cpp b/projects/neural/layer.simple.inc.cpp
index 1da0312..9b4d28f 100644
--- a/projects/neural/layer.simple.inc.cpp
+++ b/projects/neural/layer.simple.inc.cpp
@@ -15,7 +15,7 @@ inline void funcSigmoidExp(Neuron &n, AccumReal s) {
 
 inline void funcSigmoidExp2(Neuron &n, AccumReal s) {
-  if (s > 5) s = 5; else if (s < -5) s = -5;
+  //if (s > 5) s = 5; else if (s < -5) s = -5;
   AccumReal ss = 1/(1 + std::exp(-s));
   n.v = ss;
   n.d = 0;//ss * (1-ss) * 0.1;
 }
diff --git a/projects/neural/layer.simple.test.inc.cpp b/projects/neural/layer.simple.test.inc.cpp
index 7fef70e..fc6add5 100644
--- a/projects/neural/layer.simple.test.inc.cpp
+++ b/projects/neural/layer.simple.test.inc.cpp
@@ -4,6 +4,7 @@
 
 #include "layer.test.inc.cpp"
 #include "layer.simple.inc.cpp"
+#include "layer.sub.inc.cpp"
 
 
 class SimpleTest: public Test {
@@ -175,6 +176,17 @@ public:
       Test::testLayer("LayerSimple", l);
     }
 
+    {
+      Layout ppl(cl.getW()*2, cl.getH()*2, cl.getD());
+      ppl.expandX( pl.x0, pl.sx - pl.x1 );
+      ppl.expandY( pl.y0, pl.sy - pl.y1 );
+      ppl.expandZ( pl.z0, pl.sz - pl.z1 );
+
+      Layer l(nullptr, ppl);
+      new LayerSub(l, cl);
+      Test::testLayer("LayerSub", l);
+    }
+
     return st;
   }
diff --git a/projects/neural/layer.sub.inc.cpp b/projects/neural/layer.sub.inc.cpp
new file mode 100644
index 0000000..f5699cf
--- /dev/null
+++ b/projects/neural/layer.sub.inc.cpp
@@ -0,0 +1,194 @@
+#ifndef LAYER_SUB_INC_CPP
+#define LAYER_SUB_INC_CPP
+
+
+#include "layer.simple.inc.cpp"
+
+
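+// LayerSub: 2x2 max-pooling layer. The forward pass takes, for every output
+// cell, the maximum over the matching 2x2 block of the previous layer (which
+// must be exactly twice as wide and tall), applies the activation function,
+// and stores the winning input neuron in `choosen`; the losers' deltas are
+// zeroed on the spot. The backward pass then routes each output delta back
+// to the recorded winner only.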
+template<Func func>
+class LayerSub: public Layer {
+public:
+  Layout optLayout;
+  Layout::List mtOptLayouts;
+  std::vector<Neuron*> choosen;
+
+  LayerSub(Layer &prev, const Layout &layout):
+    Layer(&prev, layout),
+    optLayout(optimizeLayoutSimple(layout)),
+    choosen(layout.getActiveCount(), nullptr)
+    { }
+
+
+  void split(int threadsCount) override {
+    Layer::split(threadsCount);
+    optLayout.split(mtOptLayouts, threadsCount);
+  }
+
+
+  void pass(Barrier &barrier) override {
+    Layout cl = mtLayouts[barrier.tid];
+    Layout pl = prev->layout;
+    Layout wl = layout;
+    if (!cl) return;
+
+    assert(pl.getW() == wl.getW()*2);
+    assert(pl.getH() == wl.getH()*2);
+    assert(pl.getD() == wl.getD());
+    assert(cl.isSubLayoutOf(wl));
+
+    int c_h = cl.getH();
+    int c_w = cl.getW();
+    int c_d = cl.getD();
+    int c_sxz = cl.sx*cl.sz;
+    int c_swz = c_w*cl.sz;
+    int c_shxz = c_h*c_sxz;
+    int c_dy = c_sxz - c_swz;
+    int c_dx = cl.sz - c_d;
+
+    int w_d = wl.getD();
+    int w_w = wl.getW();
+    int w_dy = (w_w - c_w)*w_d;
+    int w_dx = w_d - c_d;
+
+    int p_dy = (pl.sx - c_w)*pl.sz*2;
+    int p_dx = pl.sz*2 - c_d;
+
+    int p_i1 = pl.sz;
+    int p_i2 = pl.sx*pl.sz;
+    int p_i3 = p_i1 + p_i2;
+
+    int cx0 = cl.x0 - wl.x0;
+    int cy0 = cl.y0 - wl.y0;
+    int cz0 = cl.z0 - wl.z0;
+
+    Neuron *icn = neurons + (cl.y0*c_sxz + cl.x0*cl.sz + cl.z0);
+    Neuron *ipn = prev->neurons + ((pl.y0 + cy0*2)*pl.sx + pl.x0 + cx0*2)*pl.sz + pl.z0 + cz0;
+    Neuron **icc = choosen.data() + (cy0*w_w + cx0)*w_d + cz0;
+
+    for(Neuron *e = icn + c_shxz; icn < e; icn += c_dy, ipn += p_dy, icc += w_dy)
+    for(Neuron *e = icn + c_swz; icn < e; icn += c_dx, ipn += p_dx, icc += w_dx)
+    for(Neuron *e = icn + c_d; icn < e; ++icn, ++ipn, ++icc) {
+      Neuron *iipn = ipn, *pn = iipn;
+      NeuronReal v = pn->v, d = pn->d;
+      pn->d = 0;
+
+      iipn = ipn + p_i1;
+      if (v < iipn->v) { v = iipn->v; d = iipn->d; pn = iipn; }
+      iipn->d = 0;
+
+      iipn = ipn + p_i2;
+      if (v < iipn->v) { v = iipn->v; d = iipn->d; pn = iipn; }
+      iipn->d = 0;
+
+      iipn = ipn + p_i3;
+      if (v < iipn->v) { v = iipn->v; d = iipn->d; pn = iipn; }
+      iipn->d = 0;
+
+      func(*icn, v);
+      icn->d *= d;
+      *icc = pn;
+    }
+  }
+
+
+  void backpassDeltas(Barrier &barrier) override {
+    Layout cl = mtOptLayouts[barrier.tid];
+    Layout wl = optLayout;
+    if (!cl) return;
+
+    int c_h = cl.getH();
+    int c_w = cl.getW();
+    int c_d = cl.getD();
+    int c_sxz = cl.sx*cl.sz;
+    int c_swz = c_w*cl.sz;
+    int c_shxz = c_h*c_sxz;
+    int c_dy = c_sxz - c_swz;
+    int c_dx = cl.sz - c_d;
+
+    int w_d = wl.getD();
+    int w_w = wl.getW();
+    int w_dy = (w_w - c_w)*w_d;
+    int w_dx = w_d - c_d;
+
+    Neuron *icn = neurons + (cl.y0*c_sxz + cl.x0*cl.sz + cl.z0);
+    Neuron **icc = choosen.data() + ((cl.y0 - wl.y0)*w_w + cl.x0 - wl.x0)*w_d + cl.z0 - wl.z0;
+
+    for(Neuron *e = icn + c_shxz; icn < e; icn += c_dy, icc += w_dy)
+    for(Neuron *e = icn + c_swz; icn < e; icn += c_dx, icc += w_dx)
+    for(Neuron *e = icn + c_d; icn < e; ++icn, ++icc) {
+      assert(*icc);
+      (*icc)->d = icn->d;
+    }
+  }
+
+
+  void testPass() override {
+    Layout cl = layout;
+    Layout pl = prev->layout;
+
+    assert(pl.getW() == cl.getW()*2);
+    assert(pl.getH() == cl.getH()*2);
+    assert(pl.getD() == cl.getD());
+
+    for(int cy = cl.y0; cy < cl.y1; ++cy)
+    for(int cx = cl.x0; cx < cl.x1; ++cx)
+    for(int cz = cl.z0; cz < cl.z1; ++cz) {
+      int ci = (cy*cl.sx + cx)*cl.sz + cz;
+      Neuron &cn = neurons[ci];
+
+      Neuron *c = nullptr;
+      NeuronReal v = 0, d = 0;
+
+      for(int ky = 0; ky < 2; ++ky)
+      for(int kx = 0; kx < 2; ++kx) {
+        int px = pl.x0 + (cx - cl.x0)*2 + kx;
+        int py = pl.y0 + (cy - cl.y0)*2 + ky;
+        int pz = pl.z0 + cz - cl.z0;
+
+        Neuron &pn = prev->neurons[ (py*pl.sx + px)*pl.sz + pz ];
+        if (!c || v < pn.v) { v = pn.v; d = pn.d; c = &pn; }
+        pn.d = 0;
+      }
+
+      assert(c);
+      c->d = d;
+      func(cn, v);
+    }
+  }
+
+
+  void testBackpass() override {
+    Layout cl = layout;
+    Layout pl = prev->layout;
+
+    assert(pl.getW() == cl.getW()*2);
+    assert(pl.getH() == cl.getH()*2);
+    assert(pl.getD() == cl.getD());
+
+    for(int cy = cl.y0; cy < cl.y1; ++cy)
+    for(int cx = cl.x0; cx < cl.x1; ++cx)
+    for(int cz = cl.z0; cz < cl.z1; ++cz) {
+      int ci = (cy*cl.sx + cx)*cl.sz + cz;
+      Neuron &cn = neurons[ci];
+
+      for(int ky = 0; ky < 2; ++ky)
+      for(int kx = 0; kx < 2; ++kx) {
+        int px = pl.x0 + (cx - cl.x0)*2 + kx;
+        int py = pl.y0 + (cy - cl.y0)*2 + ky;
+        int pz = pl.z0 + cz - cl.z0;
+
+        Neuron &pn = prev->neurons[ (py*pl.sx + px)*pl.sz + pz ];
+        pn.d *= cn.d;
+      }
+    }
+  }
+};
+
+
+#endif
diff --git a/projects/neural/layer.test.inc.cpp b/projects/neural/layer.test.inc.cpp
index 83327c2..ec5a7bf 100644
--- a/projects/neural/layer.test.inc.cpp
+++ b/projects/neural/layer.test.inc.cpp
@@ -145,10 +145,30 @@ public:
   H(Layer &p, Layer &c): p(p), c(c), counter(0) { }
 
 
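+  // Fills a neuron buffer the way the layout describes it: cells outside the
+  // active box (the padding) stay zeroed and only active cells get random v/d,
+  // so a pass that wrongly touches padding shows up as a mismatch.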
+  void fillLayout(Layout l, Neuron *neurons) {
+    for(int y = 0; y < l.sy; ++y)
+    for(int x = 0; x < l.sx; ++x)
+    for(int z = 0; z < l.sz; ++z) {
+      Neuron &n = neurons[ (y*l.sx + x)*l.sz + z ];
+      n = Neuron{};
+      if ( x >= l.x0 && x < l.x1
+        && y >= l.y0 && y < l.y1
+        && z >= l.z0 && z < l.z1 )
+      {
+        n.v = rand()/(NeuronReal)RAND_MAX;
+        n.d = rand()/(NeuronReal)RAND_MAX;
+      }
+    }
+  }
+
   void prepareData() {
     memcpy(c.neurons, c_neurons.data(), c.neuronsCount*sizeof(Neuron));
     memcpy(p.neurons, p_neurons.data(), p.neuronsCount*sizeof(Neuron));
-    memcpy(c.weights, weights.data(), c.weightsCount*sizeof(Weight));
+    if (c.weightsCount)
+      memcpy(c.weights, weights.data(), c.weightsCount*sizeof(Weight));
   }
 
   void applyDelta() {
@@ -156,8 +173,8 @@ public:
       c.neurons[i].d *= c_neurons[i].v - c.neurons[i].v;
   }
 
-  void func(int tid) {
-    Barrier barrier(counter, tid, threads.size());
+  void func(int tid, unsigned int seed) {
+    Barrier barrier(counter, tid, threads.size(), seed);
     c.pass(barrier);
     barrier.wait();
     if (!tid) applyDelta();
@@ -180,8 +197,8 @@ public:
     p.split(threadsCount);
     c.split(threadsCount);
 
-    for(int i = 1; i < threadsCount; ++i) threads[i] = new std::thread(&H::func, this, i);
-    func(0);
+    for(int i = 1; i < threadsCount; ++i) threads[i] = new std::thread(&H::func, this, i, rand());
+    func(0, rand());
     for(int i = 1; i < threadsCount; ++i) { threads[i]->join(); delete threads[i]; }
     threads.clear();
 
@@ -218,9 +235,10 @@ public:
 
     // make base data
    init(c.neuronsCount*2, p.neuronsCount*2, c.weightsCount*2);
-    for(int i = 0; i < c.neuronsCount; ++i) c_neurons[i].v = rand()/(NeuronReal)RAND_MAX;
-    for(int i = 0; i < p.neuronsCount; ++i) p_neurons[i].v = rand()/(NeuronReal)RAND_MAX;
-    memcpy(weights.data(), c.weights, c.weightsCount*sizeof(Weight));
+    h.fillLayout(c.layout, c_neurons.data());
+    h.fillLayout(p.layout, p_neurons.data());
+    if (c.weightsCount)
+      memcpy(weights.data(), c.weights, c.weightsCount*sizeof(Weight));
     h.prepareData();
 
     c.testPass();
@@ -229,7 +247,8 @@
 
     memcpy(&c_neurons[c.neuronsCount], c.neurons, c.neuronsCount*sizeof(Neuron));
     memcpy(&p_neurons[p.neuronsCount], p.neurons, p.neuronsCount*sizeof(Neuron));
-    memcpy(&weights[c.weightsCount], c.weights, c.weightsCount*sizeof(Weight));
+    if (c.weightsCount)
+      memcpy(&weights[c.weightsCount], c.weights, c.weightsCount*sizeof(Weight));
 
     h.test("single-thread", 1);
     h.test("2-threads", 2);
diff --git a/projects/neural/train.inc.cpp b/projects/neural/train.inc.cpp
index 8582aa4..21a8a56 100644
--- a/projects/neural/train.inc.cpp
+++ b/projects/neural/train.inc.cpp
@@ -67,8 +67,8 @@ protected:
   virtual Quality verifyData(Barrier &barrier, int block, int iter) { return Quality{}; }
 
 private:
-  void threadFunc(int tid, int block) {
-    Barrier barrier(barrierCounter, tid, threadsCount);
+  void threadFunc(int tid, unsigned int seed, int block) {
+    Barrier barrier(barrierCounter, tid, threadsCount, seed);
     Quality sumQ = {};
 
     for(int i = 0; i < itersPerBlock; ++i) {
@@ -107,8 +107,8 @@
     barrierCounter = 0;
     std::vector<std::thread*> t(threadsCount, nullptr);
     for(int i = 1; i < threadsCount; ++i)
-      t[i] = new std::thread(&Trainer::threadFunc, this, i, block);
-    threadFunc(0, block);
+      t[i] = new std::thread(&Trainer::threadFunc, this, i, rand(), block);
+    threadFunc(0, rand(), block);
 
     Quality result = qualities[0];
     for(int i = 1; i < threadsCount; ++i)
diff --git a/projects/neural/trainer.cpp b/projects/neural/trainer.cpp
index e14f3e6..6567a45 100644
--- a/projects/neural/trainer.cpp
+++ b/projects/neural/trainer.cpp
@@ -28,9 +28,12 @@ int main() {
   //(new LayerSimple( l, Layout(10) ))->filename = FILENAME "3";
 
   Layer l(nullptr, Layout(28, 28));
-  (new LayerConvShared(l, Layout(11, 11, 16), Kernel(6, 2, 0)))->filename = FILENAME "1";
-  (new LayerSimple(l, Layout(64)))->filename = FILENAME "2";
-  (new LayerSimple(l, Layout(10)))->filename = FILENAME "3";
+  (new LayerConvShared(l, Layout(24, 24, 6), Kernel(5, 1, 0)))->filename = FILENAME "1";
+  (new LayerSub(l, Layout(12, 12, 6)))->filename = FILENAME "2";
+  (new LayerConvShared(l, Layout(8, 8, 48), Kernel(5, 1, 0)))->filename = FILENAME "3";
+  (new LayerSub(l, Layout(4, 4, 48)))->filename = FILENAME "4";
+  (new LayerSimple(l, Layout(64)))->filename = FILENAME "5";
+  (new LayerSimple(l, Layout(10)))->filename = FILENAME "6";
 
   l.sumStat().print();
 
@@ -42,7 +45,7 @@
 
   printf("train\n");
   //t.configure(l, 0.5, 8, 70000, 0, 0, 0.00001).run();
-  t.configure(l, 0.1, 8, 70000, 0, 0, 0.00001).run();
+  t.configure(l, 0.5, 8, 7000, 0, 0, 0.00001).run();
 
   return 0;
 }