From 15c502761a925190c30c7039bedec92abcfde503 Mon Sep 17 00:00:00 2001
From: Ivan Mahonin
Date: Sat, 18 Mar 2023 07:15:05 +0000
Subject: neural: layer sub

Add LayerSub, a 2x2 max-pooling layer that remembers which input neuron
won each output cell so the backward pass can route deltas to it. Also
rename the generic shared-convolution iterator to
iterateConvolutionSharedDyn and add compile-time-specialized variants
behind a runtime dispatcher, give Barrier a per-thread random seed plus
a randomized busy-loop for spinning, and switch the trainer to a
conv/sub/conv/sub/fc/fc topology.

---
diff --git a/projects/neural/layer.all.inc.cpp b/projects/neural/layer.all.inc.cpp
index 2ccae82..3391d92 100644
--- a/projects/neural/layer.all.inc.cpp
+++ b/projects/neural/layer.all.inc.cpp
@@ -4,6 +4,7 @@
 
 #include "layer.simple.inc.cpp"
 #include "layer.conv.inc.cpp"
+#include "layer.sub.inc.cpp"
 
 
 #endif
diff --git a/projects/neural/layer.conv.shared.inc.cpp b/projects/neural/layer.conv.shared.inc.cpp
index dee1861..366093f 100644
--- a/projects/neural/layer.conv.shared.inc.cpp
+++ b/projects/neural/layer.conv.shared.inc.cpp
@@ -46,7 +46,7 @@ void iterateTestConvolutionShared(Layout cl, Layout pl, Kernel k, Neuron *c_neur
 
 
 template<class Iter>
-void iterateConvolutionShared(Layout cl, Layout pl, Layout wl, Kernel k, Neuron *c_neurons, Neuron *p_neurons, Weight *weights) {
+void iterateConvolutionSharedDyn(Layout cl, Layout pl, Layout wl, Kernel k, Neuron *c_neurons, Neuron *p_neurons, Weight *weights) {
   if (!cl) return;
   assert(pl);
   assert(wl);
@@ -95,6 +95,105 @@
 }
 
 
+template<class Iter, int KSX, int KSY, int PD>
+void iterateConvolutionSharedXYD(Layout cl, Layout pl, Layout wl, Kernel k, Neuron *c_neurons, Neuron *p_neurons, Weight *weights) {
+  if (!cl) return;
+  assert(pl);
+  assert(wl);
+  assert(k);
+  assert(c_neurons);
+  assert(p_neurons);
+  assert(weights);
+  assert(cl.isSubLayoutOf(wl));
+  assert(pl.x0 + k.ox >= 0 && pl.x0 + (wl.getW()-1)*k.dx + k.ox + k.sx <= pl.sx);
+  assert(pl.y0 + k.oy >= 0 && pl.y0 + (wl.getH()-1)*k.dy + k.oy + k.sy <= pl.sy);
+
+  assert(KSX == k.sx);
+  assert(KSY == k.sy);
+  assert(PD == pl.getD());
+
+  int c_h = cl.getH();
+  int c_w = cl.getW();
+  int c_d = cl.getD();
+  int c_swz = c_w*cl.sz;
+  int c_shxz = c_h*cl.sx*cl.sz;
+  int c_dx = cl.sz - c_d;
+  int c_dy = (cl.sx - c_w)*cl.sz;
+
+  int p_dx = k.dx*pl.sz;
+  int p_dy = k.dy*pl.sx*pl.sz - c_w*p_dx;
+
+  int p_ddy = (pl.sx - KSX)*pl.sz;
+  int p_ddx = pl.sz - PD;
+
+  Neuron *icn = c_neurons + (cl.y0*cl.sx + cl.x0)*cl.sz + cl.z0;
+  Neuron *ipn = p_neurons + ((pl.y0 + (cl.y0 - wl.y0)*k.dy + k.oy)*pl.sx + pl.x0 + (cl.x0 - wl.x0)*k.dx + k.ox)*pl.sz + pl.z0;
+
+  for(Neuron *e = icn + c_shxz; icn < e; icn += c_dy, ipn += p_dy)
+  for(Neuron *e = icn + c_swz; icn < e; icn += c_dx, ipn += p_dx)
+  for(Neuron *e = icn + c_d; icn < e; ++icn) {
+    typename Iter::AccumType a;
+    Iter::init(*icn, a);
+
+    Neuron *iipn = ipn;
+    Weight *iw = weights;
+    for(int i = 0; i < KSY; ++i, iipn += p_ddy)
+    for(int i = 0; i < KSX; ++i, iipn += p_ddx)
+    for(int i = 0; i < PD; ++i, ++iw, ++iipn)
+      Iter::iter(*iipn, *iw, a);
+
+    Iter::done(*icn, a);
+  }
+}
+
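+// Dispatch helpers: getIterateConvolutionSharedFunc<Iter>(ksx, ksy, pd) picks
+// a pointer to an instantiation with the kernel size and previous-layer depth
+// baked in (e.g. ksx == ksy == 5, pd == 1 would map to
+// iterateConvolutionSharedXYD<Iter, 5, 5, 1>, whose fixed-bound inner loops
+// the compiler can unroll). Note the "0 &&" guard below: it currently
+// short-circuits the switch, so iterateConvolutionSharedDyn is always chosen.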
+typedef void (*iterateConvolutionSharedFunc)(Layout, Layout, Layout, Kernel, Neuron*, Neuron*, Weight*);
+template<class Iter, int KSX, int KSY>
+iterateConvolutionSharedFunc getIterateConvolutionSharedFuncXY(int pd) {
+  if (pd <= 8) switch(pd) {
+    case 1: return &iterateConvolutionSharedXYD<Iter, KSX, KSY, 1>;
+    case 2: return &iterateConvolutionSharedXYD<Iter, KSX, KSY, 2>;
+    case 3: return &iterateConvolutionSharedXYD<Iter, KSX, KSY, 3>;
+    case 4: return &iterateConvolutionSharedXYD<Iter, KSX, KSY, 4>;
+    case 5: return &iterateConvolutionSharedXYD<Iter, KSX, KSY, 5>;
+    case 6: return &iterateConvolutionSharedXYD<Iter, KSX, KSY, 6>;
+    case 7: return &iterateConvolutionSharedXYD<Iter, KSX, KSY, 7>;
+    case 8: return &iterateConvolutionSharedXYD<Iter, KSX, KSY, 8>;
+  }
+  return &iterateConvolutionSharedDyn<Iter>;
+}
+
+
+template<class Iter>
+iterateConvolutionSharedFunc getIterateConvolutionSharedFunc(int ksx, int ksy, int pd) {
+  if (0 && ksx == ksy && pd <= 8) switch(ksx) {
+    case 1: return getIterateConvolutionSharedFuncXY<Iter, 1, 1>(pd);
+    case 2: return getIterateConvolutionSharedFuncXY<Iter, 2, 2>(pd);
+    case 3: return getIterateConvolutionSharedFuncXY<Iter, 3, 3>(pd);
+    case 4: return getIterateConvolutionSharedFuncXY<Iter, 4, 4>(pd);
+    case 5: return getIterateConvolutionSharedFuncXY<Iter, 5, 5>(pd);
+    case 6: return getIterateConvolutionSharedFuncXY<Iter, 6, 6>(pd);
+    case 7: return getIterateConvolutionSharedFuncXY<Iter, 7, 7>(pd);
+    case 8: return getIterateConvolutionSharedFuncXY<Iter, 8, 8>(pd);
+  }
+  return &iterateConvolutionSharedDyn<Iter>;
+}
+
+
+template<class Iter>
+void iterateConvolutionShared(const Layout &cl, const Layout &pl, const Layout &wl, const Kernel &k, Neuron *c_neurons, Neuron *p_neurons, Weight *weights) {
+  iterateConvolutionSharedFunc f = getIterateConvolutionSharedFunc<Iter>(k.sx, k.sy, pl.getD());
+  f(cl, pl, wl, k, c_neurons, p_neurons, weights);
+}
+
+
+
+
 template<class Iter>
 void iterateConvolutionSharedPoint(Layout cl, Layout pl, Layout wl, Kernel k, int kx, int ky, Neuron *c_neurons, Neuron *p_neurons, Weight *weights) {
   if (!cl) return;
diff --git a/projects/neural/layer.inc.cpp b/projects/neural/layer.inc.cpp
index 0e7b891..0d6447a 100644
--- a/projects/neural/layer.inc.cpp
+++ b/projects/neural/layer.inc.cpp
@@ -29,6 +29,11 @@ typedef int AccumInt;
 #define RANDOM_MAX 0x7fffffff
 inline unsigned int randomNext(unsigned int prev)
   { return (1103515245*prev + 12345) & RANDOM_MAX; }
+inline unsigned int randomBranch(unsigned int seed)
+  { return randomNext(seed + 1); }
+
+inline void busyloop(unsigned int count)
+  { while(count--) __asm__ __volatile__(""); }
 
 
 struct Accum {
@@ -64,15 +69,25 @@ class Barrier {
 private:
   std::atomic<unsigned int> &counter;
   unsigned int next;
+  unsigned int busyseed;
 
 public:
   const unsigned int tid;
   const unsigned int threads;
+  unsigned int seed;
 
   Barrier(const Barrier&) = delete;
-  inline Barrier(std::atomic<unsigned int> &counter, unsigned int tid, unsigned int threads):
-    counter(counter), next(), tid(tid), threads(threads) { assert(tid < threads); }
-  inline void wait() { next += threads; ++counter; while(counter < next); }
-  inline void subwait() { while(counter < next + tid); }
+  inline Barrier(std::atomic<unsigned int> &counter, unsigned int tid, unsigned int threads, unsigned int seed):
+    counter(counter), next(), busyseed(randomBranch(seed)), tid(tid), threads(threads), seed(seed) { assert(tid < threads); }
+
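+  // Randomized backoff for the spin loops below: each waiter burns a
+  // pseudo-random number of empty iterations (at most maxCycles) between
+  // checks of the shared counter, using the private busyseed stream.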
+  //inline void busyloop() { }
+  inline void busyloop(unsigned int maxCycles = 4096) { ::busyloop( (busyseed = randomNext(busyseed))%maxCycles ); }
+  inline unsigned int rand() { return seed = randomNext(seed); }
+  inline void wait() { next += threads; ++counter; while(counter < next) busyloop(); }
+  inline void subwait() { while(counter < next + tid) busyloop(); }
+
 };
@@ -136,7 +148,7 @@ public:
     assert(layout);
     assert(neuronsCount > 0);
     assert(weightsCount >= 0);
-    assert(!prev == !weightsCount);
+    assert(prev || !weightsCount);
 
     if (this->prev) this->prev->next = this;
     if (neuronsCount) {
diff --git a/projects/neural/layer.simple.inc.cpp b/projects/neural/layer.simple.inc.cpp
index 1da0312..9b4d28f 100644
--- a/projects/neural/layer.simple.inc.cpp
+++ b/projects/neural/layer.simple.inc.cpp
@@ -15,7 +15,7 @@ inline void funcSigmoidExp(Neuron &n, AccumReal s) {
 
 inline void funcSigmoidExp2(Neuron &n, AccumReal s) {
-  if (s > 5) s = 5; else if (s < -5) s = -5;
+  //if (s > 5) s = 5; else if (s < -5) s = -5;
   AccumReal ss = 1/(1 + std::exp(-s));
   n.v = ss;
   n.d = 0;//ss * (1-ss) * 0.1;
 }
diff --git a/projects/neural/layer.simple.test.inc.cpp b/projects/neural/layer.simple.test.inc.cpp
index 7fef70e..fc6add5 100644
--- a/projects/neural/layer.simple.test.inc.cpp
+++ b/projects/neural/layer.simple.test.inc.cpp
@@ -4,6 +4,7 @@
 
 #include "layer.test.inc.cpp"
 #include "layer.simple.inc.cpp"
+#include "layer.sub.inc.cpp"
 
 
 class SimpleTest: public Test {
@@ -175,6 +176,17 @@ public:
       Test::testLayer("LayerSimple", l);
     }
 
+    {
+      Layout ppl(cl.getW()*2, cl.getH()*2, cl.getD());
+      ppl.expandX( pl.x0, pl.sx - pl.x1 );
+      ppl.expandY( pl.y0, pl.sy - pl.y1 );
+      ppl.expandZ( pl.z0, pl.sz - pl.z1 );
+
+      Layer l(nullptr, ppl);
+      new LayerSub(l, cl);
+      Test::testLayer("LayerSub", l);
+    }
+
     return st;
   }
diff --git a/projects/neural/layer.sub.inc.cpp b/projects/neural/layer.sub.inc.cpp
new file mode 100644
index 0000000..f5699cf
--- /dev/null
+++ b/projects/neural/layer.sub.inc.cpp
@@ -0,0 +1,194 @@
+#ifndef LAYER_SUB_INC_CPP
+#define LAYER_SUB_INC_CPP
+
+
+#include "layer.simple.inc.cpp"
+
+
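+// LayerSub: 2x2 max-pooling layer. The forward pass takes, for every output
+// cell, the maximum over the matching 2x2 block of the previous layer (which
+// must be exactly twice as wide and tall), applies the activation function,
+// and stores the winning input neuron in `choosen`; the losers' deltas are
+// zeroed on the spot. The backward pass then routes each output delta back
+// to the recorded winner only.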
+template<Func func>
+class LayerSub: public Layer {
+public:
+  Layout optLayout;
+  Layout::List mtOptLayouts;
+  std::vector<Neuron*> choosen;
+
+  LayerSub(Layer &prev, const Layout &layout):
+    Layer(&prev, layout),
+    optLayout(optimizeLayoutSimple(layout)),
+    choosen(layout.getActiveCount(), nullptr)
+    { }
+
+
+  void split(int threadsCount) override {
+    Layer::split(threadsCount);
+    optLayout.split(mtOptLayouts, threadsCount);
+  }
+
+
+  void pass(Barrier &barrier) override {
+    Layout cl = mtLayouts[barrier.tid];
+    Layout pl = prev->layout;
+    Layout wl = layout;
+    if (!cl) return;
+
+    assert(pl.getW() == wl.getW()*2);
+    assert(pl.getH() == wl.getH()*2);
+    assert(pl.getD() == wl.getD());
+    assert(cl.isSubLayoutOf(wl));
+
+    int c_h = cl.getH();
+    int c_w = cl.getW();
+    int c_d = cl.getD();
+    int c_sxz = cl.sx*cl.sz;
+    int c_swz = c_w*cl.sz;
+    int c_shxz = c_h*c_sxz;
+    int c_dy = c_sxz - c_swz;
+    int c_dx = cl.sz - c_d;
+
+    int w_d = wl.getD();
+    int w_w = wl.getW();
+    int w_dy = (w_w - c_w)*w_d;
+    int w_dx = w_d - c_d;
+
+    int p_dy = (pl.sx - c_w)*pl.sz*2;
+    int p_dx = pl.sz*2 - c_d;
+
+    int p_i1 = pl.sz;
+    int p_i2 = pl.sx*pl.sz;
+    int p_i3 = p_i1 + p_i2;
+
+    int cx0 = cl.x0 - wl.x0;
+    int cy0 = cl.y0 - wl.y0;
+    int cz0 = cl.z0 - wl.z0;
+
+    Neuron *icn = neurons + (cl.y0*c_sxz + cl.x0*cl.sz + cl.z0);
+    Neuron *ipn = prev->neurons + ((pl.y0 + cy0*2)*pl.sx + pl.x0 + cx0*2)*pl.sz + pl.z0 + cz0;
+    Neuron **icc = choosen.data() + (cy0*w_w + cx0)*w_d + cz0;
+
+    for(Neuron *e = icn + c_shxz; icn < e; icn += c_dy, ipn += p_dy, icc += w_dy)
+    for(Neuron *e = icn + c_swz; icn < e; icn += c_dx, ipn += p_dx, icc += w_dx)
+    for(Neuron *e = icn + c_d; icn < e; ++icn, ++ipn, ++icc) {
+      Neuron *iipn = ipn, *pn = iipn;
+      NeuronReal v = pn->v, d = pn->d;
+      pn->d = 0;
+
+      iipn = ipn + p_i1;
+      if (v < iipn->v) { v = iipn->v; d = iipn->d; pn = iipn; }
+      iipn->d = 0;
+
+      iipn = ipn + p_i2;
+      if (v < iipn->v) { v = iipn->v; d = iipn->d; pn = iipn; }
+      iipn->d = 0;
+
+      iipn = ipn + p_i3;
+      if (v < iipn->v) { v = iipn->v; d = iipn->d; pn = iipn; }
+      iipn->d = 0;
+
+      func(*icn, v);
+      icn->d *= d;
+      *icc = pn;
+    }
+  }
+
+
+  void backpassDeltas(Barrier &barrier) override {
+    Layout cl = mtOptLayouts[barrier.tid];
+    Layout wl = optLayout;
+    if (!cl) return;
+
+    int c_h = cl.getH();
+    int c_w = cl.getW();
+    int c_d = cl.getD();
+    int c_sxz = cl.sx*cl.sz;
+    int c_swz = c_w*cl.sz;
+    int c_shxz = c_h*c_sxz;
+    int c_dy = c_sxz - c_swz;
+    int c_dx = cl.sz - c_d;
+
+    int w_d = wl.getD();
+    int w_w = wl.getW();
+    int w_dy = (w_w - c_w)*w_d;
+    int w_dx = w_d - c_d;
+
+    Neuron *icn = neurons + (cl.y0*c_sxz + cl.x0*cl.sz + cl.z0);
+    Neuron **icc = choosen.data() + ((cl.y0 - wl.y0)*w_w + cl.x0 - wl.x0)*w_d + cl.z0 - wl.z0;
+
+    for(Neuron *e = icn + c_shxz; icn < e; icn += c_dy, icc += w_dy)
+    for(Neuron *e = icn + c_swz; icn < e; icn += c_dx, icc += w_dx)
+    for(Neuron *e = icn + c_d; icn < e; ++icn, ++icc) {
+      assert(*icc);
+      (*icc)->d = icn->d;
+    }
+  }
+
+
+  void testPass() override {
+    Layout cl = layout;
+    Layout pl = prev->layout;
+
+    assert(pl.getW() == cl.getW()*2);
+    assert(pl.getH() == cl.getH()*2);
+    assert(pl.getD() == cl.getD());
+
+    for(int cy = cl.y0; cy < cl.y1; ++cy)
+    for(int cx = cl.x0; cx < cl.x1; ++cx)
+    for(int cz = cl.z0; cz < cl.z1; ++cz) {
+      int ci = (cy*cl.sx + cx)*cl.sz + cz;
+      Neuron &cn = neurons[ci];
+
+      Neuron *c = nullptr;
+      NeuronReal v = 0, d = 0;
+
+      for(int ky = 0; ky < 2; ++ky)
+      for(int kx = 0; kx < 2; ++kx) {
+        int px = pl.x0 + (cx - cl.x0)*2 + kx;
+        int py = pl.y0 + (cy - cl.y0)*2 + ky;
+        int pz = pl.z0 + cz - cl.z0;
+
+        Neuron &pn = prev->neurons[ (py*pl.sx + px)*pl.sz + pz ];
+        if (!c || v < pn.v) { v = pn.v; d = pn.d; c = &pn; }
+        pn.d = 0;
+      }
+
+      assert(c);
+      c->d = d;
+      func(cn, v);
+    }
+  }
+
+
+  void testBackpass() override {
+    Layout cl = layout;
+    Layout pl = prev->layout;
+
+    assert(pl.getW() == cl.getW()*2);
+    assert(pl.getH() == cl.getH()*2);
+    assert(pl.getD() == cl.getD());
+
+    for(int cy = cl.y0; cy < cl.y1; ++cy)
+    for(int cx = cl.x0; cx < cl.x1; ++cx)
+    for(int cz = cl.z0; cz < cl.z1; ++cz) {
+      int ci = (cy*cl.sx + cx)*cl.sz + cz;
+      Neuron &cn = neurons[ci];
+
+      for(int ky = 0; ky < 2; ++ky)
+      for(int kx = 0; kx < 2; ++kx) {
+        int px = pl.x0 + (cx - cl.x0)*2 + kx;
+        int py = pl.y0 + (cy - cl.y0)*2 + ky;
+        int pz = pl.z0 + cz - cl.z0;
+
+        Neuron &pn = prev->neurons[ (py*pl.sx + px)*pl.sz + pz ];
+        pn.d *= cn.d;
+      }
+    }
+  }
+};
+
+
+#endif
diff --git a/projects/neural/layer.test.inc.cpp b/projects/neural/layer.test.inc.cpp
index 83327c2..ec5a7bf 100644
--- a/projects/neural/layer.test.inc.cpp
+++ b/projects/neural/layer.test.inc.cpp
@@ -145,10 +145,30 @@ public:
   H(Layer &p, Layer &c): p(p), c(c), counter(0) { }
 
 
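+  // Fills a neuron buffer the way the layout describes it: cells outside the
+  // active box (the padding) stay zeroed and only active cells get random v/d,
+  // so a pass that wrongly touches padding shows up as a mismatch.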
+  void fillLayout(Layout l, Neuron *neurons) {
+    for(int y = 0; y < l.sy; ++y)
+    for(int x = 0; x < l.sx; ++x)
+    for(int z = 0; z < l.sz; ++z) {
+      Neuron &n = neurons[ (y*l.sx + x)*l.sz + z ];
+      n = Neuron{};
+      if ( x >= l.x0 && x < l.x1
+        && y >= l.y0 && y < l.y1
+        && z >= l.z0 && z < l.z1 )
+      {
+        n.v = rand()/(NeuronReal)RAND_MAX;
+        n.d = rand()/(NeuronReal)RAND_MAX;
+      }
+    }
+  }
+
   void prepareData() {
     memcpy(c.neurons, c_neurons.data(), c.neuronsCount*sizeof(Neuron));
     memcpy(p.neurons, p_neurons.data(), p.neuronsCount*sizeof(Neuron));
-    memcpy(c.weights, weights.data(), c.weightsCount*sizeof(Weight));
+    if (c.weightsCount)
+      memcpy(c.weights, weights.data(), c.weightsCount*sizeof(Weight));
   }
 
   void applyDelta() {
@@ -156,8 +173,8 @@ public:
       c.neurons[i].d *= c_neurons[i].v - c.neurons[i].v;
   }
 
-  void func(int tid) {
-    Barrier barrier(counter, tid, threads.size());
+  void func(int tid, unsigned int seed) {
+    Barrier barrier(counter, tid, threads.size(), seed);
     c.pass(barrier);
     barrier.wait();
     if (!tid) applyDelta();
@@ -180,8 +197,8 @@ public:
     p.split(threadsCount);
     c.split(threadsCount);
 
-    for(int i = 1; i < threadsCount; ++i) threads[i] = new std::thread(&H::func, this, i);
-    func(0);
+    for(int i = 1; i < threadsCount; ++i) threads[i] = new std::thread(&H::func, this, i, rand());
+    func(0, rand());
     for(int i = 1; i < threadsCount; ++i) { threads[i]->join(); delete threads[i]; }
     threads.clear();
 
@@ -218,9 +235,10 @@ public:
 
     // make base data
    init(c.neuronsCount*2, p.neuronsCount*2, c.weightsCount*2);
-    for(int i = 0; i < c.neuronsCount; ++i) c_neurons[i].v = rand()/(NeuronReal)RAND_MAX;
-    for(int i = 0; i < p.neuronsCount; ++i) p_neurons[i].v = rand()/(NeuronReal)RAND_MAX;
-    memcpy(weights.data(), c.weights, c.weightsCount*sizeof(Weight));
+    h.fillLayout(c.layout, c_neurons.data());
+    h.fillLayout(p.layout, p_neurons.data());
+    if (c.weightsCount)
+      memcpy(weights.data(), c.weights, c.weightsCount*sizeof(Weight));
     h.prepareData();
 
     c.testPass();
@@ -229,7 +247,8 @@
 
     memcpy(&c_neurons[c.neuronsCount], c.neurons, c.neuronsCount*sizeof(Neuron));
     memcpy(&p_neurons[p.neuronsCount], p.neurons, p.neuronsCount*sizeof(Neuron));
-    memcpy(&weights[c.weightsCount], c.weights, c.weightsCount*sizeof(Weight));
+    if (c.weightsCount)
+      memcpy(&weights[c.weightsCount], c.weights, c.weightsCount*sizeof(Weight));
 
     h.test("single-thread", 1);
     h.test("2-threads", 2);
diff --git a/projects/neural/train.inc.cpp b/projects/neural/train.inc.cpp
index 8582aa4..21a8a56 100644
--- a/projects/neural/train.inc.cpp
+++ b/projects/neural/train.inc.cpp
@@ -67,8 +67,8 @@ protected:
   virtual Quality verifyData(Barrier &barrier, int block, int iter) { return Quality{}; }
 
 private:
-  void threadFunc(int tid, int block) {
-    Barrier barrier(barrierCounter, tid, threadsCount);
+  void threadFunc(int tid, unsigned int seed, int block) {
+    Barrier barrier(barrierCounter, tid, threadsCount, seed);
     Quality sumQ = {};
 
     for(int i = 0; i < itersPerBlock; ++i) {
@@ -107,8 +107,8 @@
     barrierCounter = 0;
     std::vector<std::thread*> t(threadsCount, nullptr);
     for(int i = 1; i < threadsCount; ++i)
-      t[i] = new std::thread(&Trainer::threadFunc, this, i, block);
-    threadFunc(0, block);
+      t[i] = new std::thread(&Trainer::threadFunc, this, i, rand(), block);
+    threadFunc(0, rand(), block);
 
     Quality result = qualities[0];
     for(int i = 1; i < threadsCount; ++i)
diff --git a/projects/neural/trainer.cpp b/projects/neural/trainer.cpp
index e14f3e6..6567a45 100644
--- a/projects/neural/trainer.cpp
+++ b/projects/neural/trainer.cpp
@@ -28,9 +28,12 @@ int main() {
   //(new LayerSimple( l, Layout(10) ))->filename = FILENAME "3";
 
   Layer l(nullptr, Layout(28, 28));
-  (new LayerConvShared(l, Layout(11, 11, 16), Kernel(6, 2, 0)))->filename = FILENAME "1";
-  (new LayerSimple(l, Layout(64)))->filename = FILENAME "2";
-  (new LayerSimple(l, Layout(10)))->filename = FILENAME "3";
+  (new LayerConvShared(l, Layout(24, 24, 6), Kernel(5, 1, 0)))->filename = FILENAME "1";
+  (new LayerSub(l, Layout(12, 12, 6)))->filename = FILENAME "2";
+  (new LayerConvShared(l, Layout(8, 8, 48), Kernel(5, 1, 0)))->filename = FILENAME "3";
+  (new LayerSub(l, Layout(4, 4, 48)))->filename = FILENAME "4";
+  (new LayerSimple(l, Layout(64)))->filename = FILENAME "5";
+  (new LayerSimple(l, Layout(10)))->filename = FILENAME "6";
 
   l.sumStat().print();
 
@@ -42,7 +45,7 @@
 
   printf("train\n");
   //t.configure(l, 0.5, 8, 70000, 0, 0, 0.00001).run();
-  t.configure(l, 0.1, 8, 70000, 0, 0, 0.00001).run();
+  t.configure(l, 0.5, 8, 7000, 0, 0, 0.00001).run();
 
   return 0;
 }