From e4740de5c46356293ceb8931c5ca0fb8574e564b Mon Sep 17 00:00:00 2001
From: Ivan Mahonin
Date: Apr 01 2023 12:41:10 +0000
Subject: benchmark

---

diff --git a/projects/neural/benchmark.inc.cpp b/projects/neural/benchmark.inc.cpp
new file mode 100644
index 0000000..8cffeb1
--- /dev/null
+++ b/projects/neural/benchmark.inc.cpp
@@ -0,0 +1,257 @@
+#ifndef BENCHMARK_INC_CPP
+#define BENCHMARK_INC_CPP
+
+
+#include "common.inc.cpp"
+#include "layer.conv.inc.cpp"
+
+
+class Benchmark: public ThreadControl {
+private:
+  typedef int Int;
+  typedef double Float;
+
+  int repeats;
+  int mode;
+  Layout pl, cl;
+  Kernel k;
+  std::vector<Float> pvalues;
+  std::vector<Float> cvalues;
+  std::vector<Float> weights;
+
+
+  __attribute__((always_inline))
+  void threadFuncXYCP(Barrier &barrier, int k_sx, int k_sy, int c_d, int p_d) {
+    Layout pl = this->pl;
+    Layout cl = this->cl;
+    Kernel k = this->k;
+
+    assert(k.sx == k_sx);
+    assert(k.sy == k_sy);
+    assert(cl.getD() == c_d);
+    assert(pl.getD() == p_d);
+
+    Int tid = barrier.tid;
+    Int ts = barrier.threads;
+
+    Int c_w = cl.getW();
+    Int c_h = cl.getH();
+    Int c_wd = c_w*c_d;
+    Int c_hw = c_h*c_w;
+    Int c_hwd = c_h*c_wd;
+    Int c_sxz = cl.sx*cl.sz;
+
+    Int p_sxz = pl.sx*pl.sz;
+    Int p_szk = pl.sz*k.dx;
+    Int p_sxzk = p_sxz*k.dy;
+
+    //Int k_sxd = k_sx*p_d;
+    Int k_syx = k_sx*k_sy;
+    Int k_syxd = k_syx*p_d;
+
+    Float *cvalues = this->cvalues.data() + cl.y0*c_sxz + cl.x0*cl.sz + cl.z0;
+    Float *pvalues = this->pvalues.data() + (pl.y0 + k.oy)*p_sxz + (pl.x0 + k.ox)*pl.sz + pl.z0;
+    Float *weights = this->weights.data();
+
+    if (mode == 0) {
+      for(Int i = repeats; i; --i) {
+        for(Int i = tid; i < c_hwd; i += ts) {
+          Int cy = i/c_wd;
+          Int cx = i%c_wd/c_d;
+          Int cz = i%c_d;
+
+          Float *ic = &cvalues[ cy*c_sxz + cx*cl.sz + cz ];
+          Float *ip = &pvalues[ cy*p_sxzk + cx*p_szk ];
+          Float *iw = &weights[ cz*k_syxd ];
+
+          Float a = 0;
+
+          for(Int i = 0; i < p_d; ++i, ++ip, ++iw)
+          for(Int i = 0; i < k_syx; ++i) {
+            Int ky = i/k_sx;
+            Int kx = i%k_sx;
+            a += iw[i] * ip[ ky*p_sxz + kx*pl.sz ];
+          }
+
+          *ic = a;
+        }
+        barrier.wait2();
+      }
+    } else
+    if (mode == 1) {
+      if (c_w > 1 || c_h > 1 || pl.sx != pl.getW() || pl.sz != p_d) {
+        for(Int i = repeats; i; --i)
+        for(Int i = 0; i < k_syx; ++i) {
+          Int ky = i/k_sx;
+          Int kx = i%k_sx;
+          //Int pi = ky*p_sxz + kx*pl.sz;
+          //Int wi = i*p_d;
+          Float *ip = &pvalues[ ky*p_sxz + kx*pl.sz ];
+          Float *iw = &weights[ i*p_d ];
+          for(Int i = tid; i < c_hw; i += ts) {
+            Int cy = i/c_w;
+            Int cx = i%c_w;
+            Float *iip = &ip[ cy*p_sxzk + cx*p_szk ];
+            Float *iic = &cvalues[ cy*c_sxz + cx*cl.sz ];
+
+            for(Int cz = 0; cz < c_d; ++cz) {
+              //Int pii = pi + cy*p_sxzk + cx*p_szk;
+              //Int wii = wi + cz*k_syxd;
+              Float *iiw = &iw[ cz*k_syxd ];
+              Float a = iic[cz];
+
+              for(Int i = 0; i < p_d; ++i)
+                iip[i] += iiw[i] * a;
+                //pvalues[pii + i] += weights[wii + i] * a;
+            }
+          }
+          barrier.wait2();
+        }
+      } else {
+        //Int c_dd = c_d*p_d;
+        //Int c_wdd = c_wd*p_d;
+        //Int c_hwdd = c_hwd*p_d;
+        Int cnt = k_syxd/ts;
+        for(Int i = repeats; i; --i) {
+          for(Int i = cnt*tid, e = cnt*(tid+1); i < e; ++i) { // this thread's slice of k_syxd
+            //Int ky = i/k_sxd;
+            //Int kx = i%k_sxd/p_d;
+            //Int pz = i%p_d;
+            //Float *ip = &pvalues[ i ];//ky*p_sxz + kx*pl.sz + pz ];
+            Float *iw = &weights[ i*c_d ];
+            Float a = 0;
+            for(Int cz = 0; cz < c_d; ++cz) {
+              a += iw[ cz ] * cvalues[ cz ]; //*k_syxd
+            }
+            pvalues[ i ] = a;
+            //*ip = a;
+          }
+          barrier.wait2();
+        }
+      }
+    }
+  }
+
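+  // The dispatchers below re-invoke the always_inline worker with common depths
+  // as compile-time constants (the case lists mirror the sizes used in run()),
+  // letting the compiler emit specialized, unrollable inner loops; unlisted
+  // sizes fall back to the generic call with runtime values.
+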
+  __attribute__((always_inline))
+  void threadFuncXYC(Barrier &barrier, int k_sx, int k_sy, int c_d, int p_d) {
+    switch(p_d) {
+    case  3: return threadFuncXYCP( barrier, k_sx, k_sy, c_d,  3 );
+    case  4: return threadFuncXYCP( barrier, k_sx, k_sy, c_d,  4 );
+    case 24: return threadFuncXYCP( barrier, k_sx, k_sy, c_d, 24 );
+    case 48: return threadFuncXYCP( barrier, k_sx, k_sy, c_d, 48 );
+    }
+    threadFuncXYCP( barrier, k_sx, k_sy, c_d, p_d );
+  }
+
+  __attribute__((always_inline))
+  void threadFuncXY(Barrier &barrier, int k_sx, int k_sy, int c_d, int p_d) {
+    switch(c_d) {
+    case  3: return threadFuncXYC( barrier, k_sx, k_sy,  3, p_d );
+    case  4: return threadFuncXYC( barrier, k_sx, k_sy,  4, p_d );
+    case 24: return threadFuncXYC( barrier, k_sx, k_sy, 24, p_d );
+    case 48: return threadFuncXYC( barrier, k_sx, k_sy, 48, p_d );
+    }
+    threadFuncXYC( barrier, k_sx, k_sy, c_d, p_d );
+  }
+
+  void threadFunc(Barrier &barrier) override {
+    Int k_sx = k.sx;
+    Int k_sy = k.sy;
+    Int c_d = cl.getD();
+    Int p_d = pl.getD();
+
+    if (k_sy == k_sx) switch(k_sx) {
+    case 4: return threadFuncXY( barrier, 4, 4, c_d, p_d );
+    }
+    threadFuncXY( barrier, k_sx, k_sy, c_d, p_d );
+  }
+
+
+  void init(int mode, Layout pl, Layout cl, Kernel k, long long totalLinks) {
+    assert(pl);
+    assert(cl);
+    assert(k);
+    assert(totalLinks > 0);
+    assert(0 <= pl.x0 + k.ox && (cl.getW()-1)*k.dx + k.ox + k.sx <= pl.sx);
+    assert(0 <= pl.y0 + k.oy && (cl.getH()-1)*k.dy + k.oy + k.sy <= pl.sy);
+
+    this->mode = mode;
+    this->pl = pl;
+    this->cl = cl;
+    this->k = k;
+
+    pvalues.resize(pl.getCount());
+    cvalues.resize(cl.getCount());
+    weights.resize(cl.getD()*k.sx*k.sy*pl.getD());
+
+    for(int i = 0; i < (int)pvalues.size(); ++i) pvalues[i] = rand();
+    for(int i = 0; i < (int)cvalues.size(); ++i) cvalues[i] = rand();
+    for(int i = 0; i < (int)weights.size(); ++i) weights[i] = rand();
+
+    long long links = weights.size() * cl.getW() * cl.getH();
+    repeats = (totalLinks - 1)/links + 1; // ceil(totalLinks/links)
+
+    //printf( "benchmark init: prev %lld, curr %lld, links %lld, repeats %d, total links: %lld\n",
+    //  (long long)pvalues.size(), (long long)cvalues.size(), links, repeats, links*repeats );
+  }
+
+
+  void run(const char *name, int threadsCount, int mode, Layout pl, Layout cl, Kernel k, long long totalLinks) {
+    init(mode, pl, cl, k, totalLinks);
+
+    volatile long long t0 = timeUs();
+    runThreads(threadsCount);
+    volatile long long t1 = timeUs();
+
+    Float sum = 0;
+    for(int i = 0; i < (int)pvalues.size(); ++i) sum += pvalues[i];
+    for(int i = 0; i < (int)cvalues.size(); ++i) sum += cvalues[i];
+    for(int i = 0; i < (int)weights.size(); ++i) sum += weights[i];
+    printf("%s %d: %f, %lld\n", name, mode, (t1 - t0)*1e-6, (long long)sum);
+  }
+
+
+  void run(const char *name, int threadsCount, Layout pl, Layout cl, Kernel k, long long totalLinks) {
+    for(int mode = 0; mode < 2; ++mode)
+      run(name, threadsCount, mode, pl, cl, k, totalLinks);
+  }
+
+
+public:
+  void run(int threadsCount = 1) {
+    //printf("run benchmark: %d\n", threadsCount);
+    /*
+    run( "514x3 -> 258x24", threadsCount,
+         Layout(514, 514, 3).expandXY(2),
+         Layout(258, 258, 24).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );
+    run( "258x24 -> 130x48", threadsCount,
+         Layout(258, 258, 24).expandXY(2),
+         Layout(130, 130, 48).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );
+    run( "130x48 -> 66x96 ", threadsCount,
+         Layout(130, 130, 48).expandXY(2),
+         Layout( 66, 66, 96).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );
+    run( "66x96 -> 34x144", threadsCount,
+         Layout( 66, 66, 96).expandXY(2),
+         Layout(34, 34, 144).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );
+    run( "34x144 -> 18x216", threadsCount,
+         Layout(34, 34, 144).expandXY(2),
+         Layout(18, 18, 216).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );
+    run( "18x216 -> 10x324", threadsCount,
+         Layout(18, 18, 216).expandXY(2),
+         Layout(10, 10, 324).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );
+    run( "10x324 -> 6x486 ", threadsCount,
+         Layout(10, 10, 324).expandXY(2),
+         Layout( 6, 6, 486).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );
+    run( "6x486 -> 4x729 ", threadsCount,
+         Layout( 6, 6, 486).expandXY(2),
+         Layout( 4, 4, 729).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );
+    */
+    run( "4x768 -> 1x1093 ", threadsCount,
+         Layout( 4, 4, 768).expandXY(0),
+         Layout( 1, 1, 1093).expandXY(0), Kernel(4, 2, 0), 10ll*1000*1000*1000 );
+  }
+};
+
+
+
+#endif
diff --git a/projects/neural/benchmark.segment.inc.cpp b/projects/neural/benchmark.segment.inc.cpp
new file mode 100644
index 0000000..713a0cb
--- /dev/null
+++ b/projects/neural/benchmark.segment.inc.cpp
@@ -0,0 +1,90 @@
+
+#include "segment.inc.cpp"
+#include "segment.test.inc.cpp"
+#include "segment.cx4.inc.cpp"
+
+
+class BenchmarkSegment {
+public:
+  static std::vector<NeuronReal> values;
+  static std::vector<Weight> weights;
+
+  static void runSegment(const char *name, Segment &s, int threads, int repeats, bool backpass) {
+    class H: public ThreadControl {
+    public:
+      Segment &s;
+      int repeats;
+      NeuronReal ratio;
+      std::vector<Quality> qualities;
+
+      H(Segment &s, int repeats, NeuronReal ratio): s(s), repeats(repeats), ratio(ratio) { }
+
+      void threadFunc(Barrier &barrier) override {
+        Segment &s = this->s;
+        Layout l = s.layout;
+        NeuronReal r = ratio;
+        int sx = l.getW() - s.sx + 1;
+        int sy = l.getH() - s.sy + 1;
+        int sz = l.getD() - s.sz + 1;
+
+        Quality q;
+        for(int i = 0; i < repeats; ++i) {
+          int x = l.x0 + barrier.commonRand()%sx;
+          int y = l.y0 + barrier.commonRand()%sy;
+          int z = l.z0 + barrier.commonRand()%sz;
+          q += s.pass(barrier, x, y, z, r);
+          barrier.wait();
+        }
+        qualities[barrier.tid] = q;
+      }
+    } h(s, repeats, backpass ? 1 : 0);
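+    // ratio 1 makes each pass() run the training stages too; ratio 0 stops
+    // after the forward pass (Segment::pass skips work when trainRatio <= 0)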
+
+    int cnt = s.layout.getCount();
+    values.resize(cnt);
+    weights.resize(s.weightsCount);
+    s.f_values = values.data();
+    s.weights = weights.data();
+
+    for(int i = 0; i < cnt; ++i)
+      s.f_values[i] = rand()/(NeuronReal)RAND_MAX;
+    for(int i = 0; i < s.weightsCount; ++i)
+      s.weights[i].w = rand()/(WeightReal)RAND_MAX;
+    h.qualities.resize(threads);
+
+    volatile long long t0 = timeUs();
+    h.runThreads(threads);
+    volatile long long t1 = timeUs();
+
+    AccumReal sum = 0;
+    for(int i = 0; i < threads; ++i) sum += h.qualities[i].train + h.qualities[i].human;
+    for(int i = 0; i < s.weightsCount; ++i) sum += s.weights[i].w;
+
+    printf("%s %d: %f, %lld\n", name, (int)backpass, (t1 - t0)*1e-6, (long long)sum);
+  }
+
+
+  static void runSegment(const char *name, Segment &s, int threads, int repeats) {
+    runSegment(name, s, threads, repeats, false);
+    runSegment(name, s, threads, repeats, true);
+  }
+
+
+  static void runSegment(const char *name, Segment &s, int threads) {
+    int repeats = (int)( (1ll*1000*1000*1000 + s.effectiveLinks/2)/s.effectiveLinks );
+    if (!SegmentTest::testSegment(name, s)) return;
+    runSegment(name, s, threads, repeats, false);
+    runSegment(name, s, threads, repeats, true);
+  }
+
+
+  static void run(int threads) {
+    { SegmentCx4 s(3, 24); s.layout = Layout(514, 514, 3).expandXY(2);
+      runSegment("cx4-3x24", s, threads); }
+  }
+};
+
+
+std::vector<NeuronReal> BenchmarkSegment::values;
+std::vector<Weight> BenchmarkSegment::weights;
+
diff --git a/projects/neural/common.inc.cpp b/projects/neural/common.inc.cpp
index 40efc35..03f43af 100644
--- a/projects/neural/common.inc.cpp
+++ b/projects/neural/common.inc.cpp
@@ -12,9 +12,12 @@
 #include 
 #include 
 #include 
-#include 
 #include 
+#include <thread>
+#include <mutex>
+#include <condition_variable>
+
 
 #include "layout.inc.cpp"
@@ -36,6 +39,10 @@ inline unsigned int randomBranch(unsigned int seed)
 
 inline void busyloop(unsigned int count)
   { while(count--) __asm__ __volatile__(""); }
+inline void sleep()
+  { std::this_thread::sleep_for(std::chrono::nanoseconds(0)); }
+inline void sleepUs(long long us)
+  { std::this_thread::sleep_for(std::chrono::microseconds(us)); }
 
 inline long long timeUs()
 {
@@ -75,29 +82,82 @@ struct Iter {
 
 
 
+class Barrier;
+
+class ThreadControl {
+private:
+  friend class Barrier;
+
+  std::mutex mutex;
+  std::condition_variable cond;
+  std::atomic<unsigned int> counter;
+  std::vector<std::thread*> threads;
+  unsigned int commonSeed;
+
+  void runSingleThread(unsigned int tid, unsigned int seed);
+
+protected:
+  virtual void threadFunc(Barrier&) { }
+
+public:
+  ThreadControl(): counter(0), commonSeed() { }
+
+  void runThreads(unsigned int threadsCount = 1) {
+    assert(threadsCount);
+    counter = 0;
+    threads.clear();
+    threads.resize(threadsCount);
+    commonSeed = rand();
+    for(unsigned int i = 1; i < threadsCount; ++i)
+      threads[i] = new std::thread(&ThreadControl::runSingleThread, this, i, rand());
+    runSingleThread(0, rand());
+    for(unsigned int i = 1; i < threadsCount; ++i)
+      { threads[i]->join(); delete threads[i]; }
+    threads.clear();
+  }
+};
+
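+// Typical use, as in Benchmark above and the test/train helpers in this patch:
+//   class H: public ThreadControl {
+//     void threadFunc(Barrier &barrier) override { ...parallel work... }
+//   };
+//   H(...).runThreads(threadsCount);
+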
 
 class Barrier {
 private:
-  std::atomic<unsigned int> &counter;
+  ThreadControl &owner;
   unsigned int next;
   unsigned int busyseed;
 public:
   const unsigned int tid;
   const unsigned int threads;
   unsigned int seed;
+  unsigned int commonSeed;
 
   Barrier(const Barrier&) = delete;
-  inline Barrier(std::atomic<unsigned int> &counter, unsigned int tid, unsigned int threads, unsigned int seed):
-    counter(counter), next(), busyseed(randomBranch(seed)), tid(tid), threads(threads), seed(seed) { assert(tid < threads); }
+  inline Barrier(ThreadControl &owner, unsigned int tid, unsigned int seed, unsigned int commonSeed):
+    owner(owner), next(), busyseed(randomBranch(seed)), tid(tid), threads(owner.threads.size()), seed(seed), commonSeed(commonSeed)
+    { assert(tid < threads); }
 
   //inline void busyloop() { }
   inline void busyloop(unsigned int maxCycles = 4096)
     { ::busyloop( (busyseed = randomNext(busyseed))%maxCycles ); }
   inline unsigned int rand()
     { return seed = randomNext(seed); }
-  inline void wait() { next += threads; ++counter; while(counter < next) busyloop(); }
-  inline void subwait() { while(counter < next + tid) busyloop(); }
+  inline unsigned int commonRand()
+    { return commonSeed = randomNext(commonSeed); }
+
+  inline void wait() { next += threads; ++owner.counter; while(owner.counter < next) busyloop(); }
+  inline void subwait() { while(owner.counter < next + tid) busyloop(); }
+
+  // sleeping barrier: blocks on a condition variable instead of spinning
+  inline void wait2() {
+    next += threads;
+    std::unique_lock<std::mutex> lock(owner.mutex);
+    if (++owner.counter == next) owner.cond.notify_all(); else
+      while(owner.counter < next) owner.cond.wait(lock);
+  }
+  // spinning barrier that yields to the scheduler between polls
+  inline void wait3() { next += threads; ++owner.counter; while(owner.counter < next) sleepUs(1); }
 };
 
 
+void ThreadControl::runSingleThread(unsigned int tid, unsigned int seed) {
+  Barrier barrier(*this, tid, seed, commonSeed);
+  threadFunc(barrier);
+}
+
+
 
 struct Stat {
   int neurons;
   int activeNeurons;
diff --git a/projects/neural/layer.inc.cpp b/projects/neural/layer.inc.cpp
index fe357de..ebdc838 100644
--- a/projects/neural/layer.inc.cpp
+++ b/projects/neural/layer.inc.cpp
@@ -152,31 +152,22 @@ public:
 
 
   void passFull(const Layer *last = nullptr, int threadsCount = 1) {
-    struct H {
+    class H: public ThreadControl {
+    public:
       Layer &layer;
       const Layer *last;
-      std::atomic<unsigned int> barrierCounter;
-      std::vector<std::thread*> threads;
 
-      H(Layer &layer, const Layer *last, int threadsCount): layer(layer), last(last), barrierCounter(0), threads(threadsCount) { }
+      H(Layer &layer, const Layer *last): layer(layer), last(last) { }
 
-      void func(int tid, unsigned int seed) {
-        Barrier barrier(barrierCounter, tid, threads.size(), seed);
+      void threadFunc(Barrier &barrier) override {
         for(Layer *l = layer.next; l; l = l->next) {
           l->pass(barrier);
           if (l == last || !l->next) break;
           barrier.wait();
         }
       }
-    } h(*this, last, threadsCount);
-
-    for(Layer *l = this; l; l = l->next)
-      l->split(threadsCount);
-    for(int i = 1; i < threadsCount; ++i)
-      h.threads[i] = new std::thread(&H::func, &h, i, rand());
-    h.func(0, rand());
-    for(int i = 1; i < threadsCount; ++i)
-      { h.threads[i]->join(); delete h.threads[i]; }
+    };
+    H(*this, last).runThreads(threadsCount);
   }
 };
diff --git a/projects/neural/layer.test.inc.cpp b/projects/neural/layer.test.inc.cpp
index a12c154..c99db49 100644
--- a/projects/neural/layer.test.inc.cpp
+++ b/projects/neural/layer.test.inc.cpp
@@ -3,6 +3,7 @@
 
 
 #include "test.inc.cpp"
+#include "layer.inc.cpp"
 
 
 
@@ -16,14 +17,12 @@ public:
 
     Layer &c = *l.next;
 
-    struct H {
+    class H: public ThreadControl {
+    public:
       Layer &p;
       Layer &c;
 
-      std::vector<std::thread*> threads;
-      std::atomic<unsigned int> counter;
-
-      H(Layer &p, Layer &c): p(p), c(c), counter(0) { }
+      H(Layer &p, Layer &c): p(p), c(c) { }
 
       void fillLayout(Layout l, Neuron *neurons) {
         for(int y = 0; y < l.sy; ++y)
@@ -53,11 +52,10 @@ public:
         c.neurons[i].d *= c_neurons[i].v - c.neurons[i].v;
       }
 
-      void func(int tid, unsigned int seed) {
-        Barrier barrier(counter, tid, threads.size(), seed);
+      void threadFunc(Barrier &barrier) override {
         c.pass(barrier);
         barrier.wait();
-        if (!tid) applyDelta();
+        if (!barrier.tid) applyDelta();
         barrier.wait();
         c.backpassDeltas(barrier);
         barrier.wait();
@@ -69,18 +67,11 @@ public:
       assert(threadsCount > 0);
 
-      counter = 0;
-      threads.clear();
-      threads.resize(threadsCount, nullptr);
-
       prepareData();
       p.split(threadsCount);
       c.split(threadsCount);
 
-      for(int i = 1; i < threadsCount; ++i) threads[i] = new std::thread(&H::func, this, i, rand());
-      func(0, rand());
-      for(int i = 1; i < threadsCount; ++i) { threads[i]->join(); delete threads[i]; }
-      threads.clear();
+      runThreads(threadsCount);
 
       for(int i = 0; i < c.neuronsCount; ++i) {
         NeuronReal a = c.neurons[i].v;
diff --git a/projects/neural/segment.conv.inc.cpp b/projects/neural/segment.conv.inc.cpp
new file mode 100644
index 0000000..40b114a
--- /dev/null
+++ b/projects/neural/segment.conv.inc.cpp
@@ -0,0 +1,297 @@
+#ifndef SEGMENT_CONV_INC_CPP
+#define SEGMENT_CONV_INC_CPP
+
+
+#include "segment.inc.cpp"
+#include "func.inc.cpp"
+#include "layer.conv.inc.cpp"
+
+
+class SegmentConv: public Segment {
+public:
+  enum {
+    KSX = 4,
+    KSY = 4,
+  };
+
+  const int msx, msy, msz;
+
+  NeuronReal *m_values;
+  NeuronReal *b_values;
+
+  SegmentConv(int sx, int sy, int sz, int msz, Weight *weights = nullptr):
+    Segment(sx, sy, sz, msz*KSY*KSX*sz, weights), msx((sx - KSX)/2 + 1), msy((sy - KSY)/2 + 1), msz(msz)
+  {
+    assert(msx > 0);
+    assert(msy > 0);
+    assert(msz > 0);
+    m_values = new NeuronReal[msx*msy*msz + sx*sy*sz];
+    b_values = m_values + msx*msy*msz;
+    clear();
+  }
+  ~SegmentConv()
+    { delete[] m_values; }
+
+
+  void clear() override
+    { memset(m_values, 0, sizeof(*m_values)*(msx*msy*msz + sx*sy*sz)); }
+
+
+  inline void check(int x, int y, int z) {
+    Segment::check(x, y, z);
+    assert(layout.getD() == sz);
+  }
+
+
+
+  Quality pass(Barrier &barrier, int x, int y, int z, NeuronReal trainRatio) override {
+    check(x, y, z);
+    Layout l = layout;
+    NeuronReal *f_values = this->f_values + (y*l.sx + x)*l.sz + z;
+    return pass(barrier, f_values, trainRatio);
+  }
+
+  __attribute__((always_inline))
+  inline Quality pass(Barrier &barrier, NeuronReal *f_values, NeuronReal trainRatio) {
+    Layout l = layout;
+    int tid = barrier.tid;
+    int threads = barrier.threads;
+
+    int sx = this->sx;
+    //int sy = this->sy;
+    int sz = this->sz;
+    int msx = this->msx;
+    int msy = this->msy;
+    int msz = this->msz;
+    int msxz = msx*msz;
+
+    int ksxz = KSX*sz;
+    int ksyxz = KSY*ksxz;
+
+    int fv_dkx = l.sz - sz;
+    int fv_dky = (l.sx - KSX)*l.sz;
+    int fv_dmx = 2*l.sz;
+    int fv_dmy = 2*(l.sx - msx)*l.sz;
+
+    int mn_dtz = threads - msx*msy*msz;
+
+    // stage 1: pass from front to mid
+
+    int f_sxz = l.sx*l.sz;
+    int f_sz2 = l.sz*2;
+    int f_sxz2 = l.sx*f_sz2;
+
+    int m_cnt = msx*msy*msz;
+    int mi0 = m_cnt*tid/threads;
+    int mi1 = m_cnt*(tid+1)/threads;
+
+    for(int i = mi0; i < mi1; ++i) {
+      int my = i/msxz;
+      int mx = i%msxz/msz;
+      int mz = i%msz;
+
+      AccumReal a = 0;
+      int wi = mz*ksyxz; // weights are indexed by mz only, matching testPass below
+      int fvi = my*f_sxz2 + mx*f_sz2;
+      for(int ky = 0; ky < KSY; ++ky, fvi += f_sxz, wi += ksxz) {
+        Weight *iw = &weights[wi];
+        NeuronReal *ifv = &f_values[fvi];
+        for(int i = 0; i < ksxz; ++i)
+          a += ifv[i]*iw[i].w;
+      }
+
+      m_values[i] = a > 0 ? a : 0;
+    }
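+    // every thread must finish writing its m_values slice before stage 2 reads them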
+
+    barrier.wait();
+
+    // stage 2: pass from mid to back and verify
+
+    AccumReal qa = 0;
+    for(int by = 2 + tid; by < 10; by += threads)
+    for(int bx = 2; bx < 10; ++bx)
+    for(int bz = 0; bz < sz; ++bz) {
+      AccumReal a = 0;
+      Neuron &bn = b_neurons[ (by*sx + bx)*sz + bz ];
+
+      for(int ky = by%2; ky < KSY; ky += 2)
+      for(int kx = bx%2; kx < KSX; kx += 2) {
+        int mx = (bx - kx)/2;
+        int my = (by - ky)/2;
+        assert(mx >= 0 && mx < msx && (bx - kx)%2 == 0);
+        assert(my >= 0 && my < msy && (by - ky)%2 == 0);
+        for(int mz = 0; mz < msz; ++mz) {
+          Neuron &mn = m_neurons[ (my*msx + mx)*msz + mz ];
+          Weight &w = weights[ ((mz*KSY + ky)*KSX + kx)*sz + bz ];
+          a += mn.v * w.w;
+        }
+      }
+
+      if (a > 0) bn.v = a, bn.d = 1; else bn.v = bn.d = 0;
+
+      NeuronReal fn = f_values[ (by*l.sx + bx)*l.sz + bz ];
+      NeuronReal d = fn - bn.v;
+      bn.d *= d*trainRatio;
+      qa += d*d;
+    }
+    Quality q(qa/(64*sz));
+
+    if (trainRatio <= 0) return q;
+
+    barrier.wait();
+
+    // stage 3: backpass deltas
+
+    for(int mz = tid; mz < msz; mz += threads)
+    for(int my = 1; my < 4; ++my)
+    for(int mx = 1; mx < 4; ++mx) {
+      AccumReal a = 0;
+      Neuron &mn = m_neurons[ (my*msx + mx)*msz + mz ];
+
+      for(int ky = 0; ky < KSY; ++ky)
+      for(int kx = 0; kx < KSX; ++kx)
+      for(int kz = 0; kz < sz; ++kz) {
+        int bx = mx*2 + kx;
+        int by = my*2 + ky;
+        Neuron &bn = b_neurons[ (by*sx + bx)*sz + kz ];
+        Weight &w = weights[ ((mz*KSY + ky)*KSX + kx)*sz + kz ];
+        a += bn.d * w.w;
+      }
+      mn.d *= a;
+    }
+
+    barrier.wait();
+
+    // stage 4: update weights
+
+    for(int mz = tid; mz < msz; mz += threads)
+    for(int by = 4; by < 8; ++by)
+    for(int bx = 4; bx < 8; ++bx)
+    for(int bz = 0; bz < sz; ++bz) {
+      Neuron &bn = b_neurons[ (by*sx + bx)*sz + bz ];
+      NeuronReal fv = f_values[ (by*l.sx + bx)*l.sz + bz ];
+
+      for(int ky = by%2; ky < KSY; ky += 2)
+      for(int kx = bx%2; kx < KSX; kx += 2) {
+        int mx = (bx - kx)/2;
+        int my = (by - ky)/2;
+        assert(mx >= 1 && mx < 4 && (bx - kx)%2 == 0);
+        assert(my >= 1 && my < 4 && (by - ky)%2 == 0);
+        Neuron &mn = m_neurons[ (my*msx + mx)*msz + mz ];
+        Weight &w = weights[ ((mz*KSY + ky)*KSX + kx)*sz + bz ];
+        w.w += bn.d*mn.v + mn.d*fv;
+      }
+    }
+
+    return q;
+  }
+
+
+
+  Quality testPass(int x, int y, int z, NeuronReal trainRatio) override {
+    check(x, y, z);
+
+    Layout l = layout;
+
+    // stage 1: pass
+
+    clear();
+
+    for(int my = 0; my < msy; ++my)
+    for(int mx = 0; mx < msx; ++mx)
+    for(int mz = 0; mz < msz; ++mz) {
+      AccumReal a = 0;
+      for(int ky = 0; ky < KSY; ++ky)
+      for(int kx = 0; kx < KSX; ++kx)
+      for(int kz = 0; kz < sz; ++kz) {
+        int fx = x + mx*2 + kx;
+        int fy = y + my*2 + ky;
+        int fz = z + kz;
+        NeuronReal fv = f_values[ (fy*l.sx + fx)*l.sz + fz ];
+        Weight &w = weights[ ((mz*KSY + ky)*KSX + kx)*sz + kz ];
+        a += fv * w.w;
+      }
+
+      NeuronReal &mv = m_values[ (my*msx + mx)*msz + mz ];
+      if (a < 0) { mv = 0; continue; }
+      mv = a;
+
+      for(int ky = 0; ky < KSY; ++ky)
+      for(int kx = 0; kx < KSX; ++kx)
+      for(int kz = 0; kz < sz; ++kz) {
+        int bx = mx*2 + kx;
+        int by = my*2 + ky;
+        int bz = kz;
+        NeuronReal &bv = b_values[ (by*sx + bx)*sz + bz ];
+        Weight &w = weights[ ((mz*KSY + ky)*KSX + kx)*sz + kz ];
+        bv += a * w.w;
+      }
+    }
+
+    // stage 2: finalize values and verify
+
+    AccumReal qa = 0;
+    for(int by = 0; by < sy; ++by)
+    for(int bx = 0; bx < sx; ++bx)
+    for(int bz = 0; bz < sz; ++bz) {
+      NeuronReal fn = f_values[ ((y + by)*l.sx + x + bx)*l.sz + z + bz ];
+      NeuronReal &bv = b_values[ (by*sx + bx)*sz + bz ];
+      if (bv > 0) {
+        NeuronReal d = fn - bv;
+        bv = d*trainRatio;
+        qa += d*d;
+      } else {
+        bv = 0;
+        qa += fn*fn;
+      }
+    }
+    Quality q(qa/(KSX*KSY*sz));
+
+    if (trainRatio <= 0) return q;
+
+    // stage 3: backpass deltas and update weights
+
+    for(int my = 0; my < msy; ++my)
+    for(int mx = 0; mx < msx; ++mx)
+    for(int mz = 0; mz < msz; ++mz) {
+      NeuronReal mv = m_values[ (my*msx + mx)*msz + mz ];
+      if (!mv) continue;
+
+      AccumReal a = 0;
+      for(int ky = 0; ky < KSY; ++ky)
+      for(int kx = 0; kx < KSX; ++kx)
+      for(int kz = 0; kz < sz; ++kz) {
+        int bx = mx*2 + kx;
+        int by = my*2 + ky;
+        int bz = kz;
+        NeuronReal bv = b_values[ (by*sx + bx)*sz + bz ];
+        Weight &w = weights[ ((mz*KSY + ky)*KSX + kx)*sz + kz ];
+        a += bv * w.w;
+      }
+
+      for(int ky = 0; ky < KSY; ++ky)
+      for(int kx = 0; kx < KSX; ++kx)
+      for(int kz = 0; kz < sz; ++kz) {
+        int bx = mx*2 + kx;
+        int by = my*2 + ky;
+        int bz = kz;
+        NeuronReal fv = f_values[ ((y + by)*l.sx + x + bx)*l.sz + z + bz ];
+        NeuronReal bv = b_values[ (by*sx + bx)*sz + bz ];
+        Weight &w = weights[ ((mz*KSY + ky)*KSX + kx)*sz + kz ];
+        w.w += bv*mv + fv*a;
+      }
+    }
+
+    return q;
+  }
+
+
+  bool saveDemo() override
+    { return !filename || saveConvDemoImage(filename, msz, KSX, KSY, sz, weights); }
+};
+
+
+
+
+#endif
+
diff --git a/projects/neural/segment.cx4.inc.cpp b/projects/neural/segment.cx4.inc.cpp
index 61234ba..517688b 100644
--- a/projects/neural/segment.cx4.inc.cpp
+++ b/projects/neural/segment.cx4.inc.cpp
@@ -54,15 +54,20 @@ public:
     int threads = barrier.threads;
 
     int sx = this->sx;
-    int sy = this->sy;
+    //int sy = this->sy;
    int sz = this->sz;
     int msx = this->msx;
     int msy = this->msy;
     int msz = this->msz;
 
     int ksxyz = ksx*ksy*sz;
+
     int fv_dkx = l.sz - sz;
     int fv_dky = (l.sx - ksx)*l.sz;
+    int fv_dmx = 2*l.sz;
+    int fv_dmy = 2*(l.sx - MSX)*l.sz;
+
+    int mn_dtz = threads - msx*msy*msz;
 
     NeuronReal *f_values = this->f_values + (y*l.sx + x)*l.sz + z;
 
@@ -72,9 +77,9 @@ public:
     Neuron *imn = m_neurons + tid;
     NeuronReal *ifv = f_values;
 
-    for(int mz = tid; mz < msz; mz += threads, iw += threads*ksxyz, imn += threads - msx*msy*msz, ifv = f_values)
-    for(int my = 0; my < MSY; ++my, ifv += 2*(l.sx - MSX)*l.sz)
-    for(int mx = 0; mx < MSX; ++mx, imn += msz, ifv += 2*l.sz) {
+    for(int mz = tid; mz < msz; mz += threads, iw += threads*ksxyz, imn += mn_dtz, ifv = f_values)
+    for(int my = 0; my < MSY; ++my, ifv += fv_dmy)
+    for(int mx = 0; mx < MSX; ++mx, imn += msz, ifv += fv_dmx) {
       AccumReal a = 0;
 
       Weight *iiw = iw;
diff --git a/projects/neural/segment.inc.cpp b/projects/neural/segment.inc.cpp
index ad4e3ea..badd35a 100644
--- a/projects/neural/segment.inc.cpp
+++ b/projects/neural/segment.inc.cpp
@@ -9,11 +9,12 @@ class Segment: public WeightHolder {
 public:
   const int sx, sy, sz;
+  int effectiveLinks; // links touched by one pass; benchmarks size repeat counts from it
 
   Layout layout;
   NeuronReal *f_values;
 
   Segment(int sx, int sy, int sz, int weightsCount, Weight *weights = nullptr):
-    WeightHolder(weightsCount, weights), sx(sx), sy(sy), sz(sz), f_values() { }
+    WeightHolder(weightsCount, weights), sx(sx), sy(sy), sz(sz), effectiveLinks(weightsCount), f_values() { }
 
   virtual ~Segment() { }
diff --git a/projects/neural/segment.layer.inc.cpp b/projects/neural/segment.layer.inc.cpp
new file mode 100644
index 0000000..0dc5d55
--- /dev/null
+++ b/projects/neural/segment.layer.inc.cpp
@@ -0,0 +1,73 @@
+#ifndef SEGMENT_LAYER_INC_CPP
+#define SEGMENT_LAYER_INC_CPP
+
+
+#include "segment.inc.cpp"
+#include "layer.inc.cpp"
+
+
+class SegmentLayer: public Segment {
+public:
+  Layer *fl;
+  Layer *bl;
+
+  SegmentLayer(Layer &fl, Layer &bl):
+    Segment(fl.layout.getW(), fl.layout.getH(), fl.layout.getD(), fl.weightsCount, fl.weights),
+    fl(&fl), bl(&bl) // members must be bound here; pass() dereferences them
+  {
+    assert(fl.layout.getW() == bl.layout.getW());
+    assert(fl.layout.getH() == bl.layout.getH());
+    assert(fl.layout.getD() == bl.layout.getD());
+    filename = fl.filename;
+  }
+
+  SegmentLayer(Layer &layer): SegmentLayer(layer, layer.back()) { }
+
+
+  Quality pass(Barrier &barrier, int x, int y, int z, NeuronReal trainRatio) override {
+    check(x, y, z);
+
+    // copy values
+
+    // pass
+
+    Layer *ffl = fl;
+    for(Layer *l = fl->next; l; l = l->next) {
+      if (!l->skipTrain) ffl = l;
+      barrier.wait();
+      l->pass(barrier);
+      if (l == bl) break;
+      if (!l->next) return Quality::nan();
+    }
+
+    // verify
+
+    Quality q;
+
+    if (trainRatio <= 0) return q;
+
+    // back pass deltas
+
+    for(Layer *l = bl; l != ffl; l = l->next) {
+      barrier.wait();
+      l->backpassDeltas(barrier);
+    }
+
+    // back pass weights
+
+    for(Layer *l = bl; l; l = l->next) {
+      if (!l->skipTrain) {
+        barrier.wait();
+        l->backpassDeltas(barrier);
+      }
+    }
+
+    return q;
+  }
+};
+
+
+
+
+#endif
+
diff --git a/projects/neural/segment.test.inc.cpp b/projects/neural/segment.test.inc.cpp
index 1b7da72..93151c5 100644
--- a/projects/neural/segment.test.inc.cpp
+++ b/projects/neural/segment.test.inc.cpp
@@ -12,45 +12,36 @@ public:
   static bool testSegment(const char *name, Segment &segment, Layout l, int x, int y, int z, NeuronReal trainRatio) {
     Stage st(name);
 
-    struct H {
+    class H: public ThreadControl {
+    public:
       Segment &segment;
       int x, y, z;
       Quality testQ;
       NeuronReal ratio;
-
-      std::vector<std::thread*> threads;
       std::vector<Quality> qualities;
-      std::atomic<unsigned int> counter;
 
-      H(Segment &segment, int x, int y, int z, NeuronReal ratio): segment(segment), x(x), y(y), z(z), ratio(ratio), counter(0) { }
+      H(Segment &segment, int x, int y, int z, NeuronReal ratio): segment(segment), x(x), y(y), z(z), ratio(ratio) { }
 
       void prepareData()
         { memcpy(segment.weights, weights.data(), segment.weightsCount*sizeof(Weight)); }
 
-      void func(int tid, unsigned int seed) {
-        Barrier barrier(counter, tid, threads.size(), seed);
-        qualities[tid] = segment.pass(barrier, x, y, z, ratio);
-      }
+      void threadFunc(Barrier &barrier) override
+        { qualities[barrier.tid] = segment.pass(barrier, x, y, z, ratio); }
 
       bool test(const char *name, int threadsCount) {
         Stage st(name);
         assert(threadsCount > 0);
 
-        counter = 0;
-        threads.clear();
         qualities.clear();
-        threads.resize(threadsCount, nullptr);
         qualities.resize(threadsCount);
 
         prepareData();
         segment.split(threadsCount);
-        for(int i = 1; i < threadsCount; ++i) threads[i] = new std::thread(&H::func, this, i, rand());
-        func(0, rand());
+        runThreads(threadsCount);
 
-        Quality q = qualities[0];
-        for(int i = 1; i < threadsCount; ++i) { threads[i]->join(); delete threads[i]; q += qualities[i]; }
-        threads.clear();
+        Quality q;
+        for(int i = 0; i < threadsCount; ++i) q += qualities[i];
 
         if ( fabs(q.train - testQ.train) > 1e-10
           || fabs(q.human - testQ.human) > 1e-10 )
@@ -103,6 +94,15 @@ public:
 
     return st;
   }
+
+
+  static bool testSegment(const char *name, Segment &segment) {
+    Layout l = segment.layout;
+    int x = l.x0 + rand()%(l.getW() - segment.sx + 1);
+    int y = l.y0 + rand()%(l.getH() - segment.sy + 1);
+    int z = l.z0 + rand()%(l.getD() - segment.sz + 1);
+    return testSegment(name, segment, l, x, y, z, 0.1);
+  }
 };
diff --git a/projects/neural/test.all.inc.cpp b/projects/neural/test.all.inc.cpp
index 227f226..a3d22e2 100644
--- a/projects/neural/test.all.inc.cpp
+++ b/projects/neural/test.all.inc.cpp
@@ -12,8 +12,8 @@ class AllTest: public Test {
 public:
   static bool test(const char *name = "all") {
     Stage st(name);
-    //SimpleTest::test();
-    //ConvTest::test();
+    SimpleTest::test();
+    ConvTest::test();
     Cx4Test::test();
     return st;
   }
diff --git a/projects/neural/train.cx4.inc.cpp b/projects/neural/train.cx4.inc.cpp
index 54831ad..c9060c7 100644
--- a/projects/neural/train.cx4.inc.cpp
+++ b/projects/neural/train.cx4.inc.cpp
@@ -147,6 +147,8 @@ protected:
     if (imagesInFile < 1) return fclose(f), f = nullptr, false;
     imagesInMemory = loadImagesCount > imagesInFile ? imagesInFile : loadImagesCount;
 
+    for(Layer *l = layerFull; l; l = l->next)
+      l->split(threadsCount);
 
     Layout l = layerPre ? layerPre->layout : layerFull->layout;
     assert(l.getW() >= segment->sx);
diff --git a/projects/neural/train.image.inc.cpp b/projects/neural/train.image.inc.cpp
index 18c5c46..47001e8 100644
--- a/projects/neural/train.image.inc.cpp
+++ b/projects/neural/train.image.inc.cpp
@@ -34,10 +34,12 @@ protected:
     assert(datafile);
     assert(fl->layout.getD() == 3);
 
+    #ifndef NDEBUG
     Layer *dl = dataLayer ? dataLayer : fl;
     assert(dl->layout.getW() == bl->layout.getW());
     assert(dl->layout.getH() == bl->layout.getH());
     assert(dl->layout.getD() == bl->layout.getD());
+    #endif
 
     imgsize = fl->layout.getActiveCount();
     fl->layout.split(flist, threadsCount);
diff --git a/projects/neural/train.inc.cpp b/projects/neural/train.inc.cpp
index 41fd23a..6e58719 100644
--- a/projects/neural/train.inc.cpp
+++ b/projects/neural/train.inc.cpp
@@ -5,9 +5,8 @@
 #include "layer.inc.cpp"
 
 
-class Trainer {
+class Trainer: public ThreadControl {
 private:
-  std::atomic<unsigned int> barrierCounter;
   std::vector<Quality> qualities;
 
 public:
@@ -24,6 +23,7 @@ protected:
   Layer *fl;
   Layer *bl;
   Layer *ffl;
+  int currentBlock;
 
   virtual bool prepare() { return true; }
   virtual bool prepareBlock() { return true; }
@@ -34,9 +34,8 @@ protected:
   virtual Quality verifyData(Barrier &barrier, int block, int iter) { return Quality{}; }
 
 private:
-  void threadFunc(int tid, unsigned int seed, int block) {
-    Barrier barrier(barrierCounter, tid, threadsCount, seed);
-
+  void threadFunc(Barrier &barrier) override {
+    int block = currentBlock;
     Quality sumQ;
     for(int i = 0; i < itersPerBlock; ++i) {
       barrier.wait();
@@ -68,27 +67,21 @@ private:
         }
       }
     }
-    qualities[tid] = sumQ;
+    qualities[barrier.tid] = sumQ;
   }
 
   Quality runThreads(int block) {
-    barrierCounter = 0;
-    std::vector<std::thread*> t(threadsCount, nullptr);
-    for(int i = 1; i < threadsCount; ++i)
-      t[i] = new std::thread(&Trainer::threadFunc, this, i, rand(), block);
-    threadFunc(0, rand(), block);
-
-    Quality result = qualities[0];
-    for(int i = 1; i < threadsCount; ++i)
-      { t[i]->join(); delete t[i]; result += qualities[i]; }
-    return result *= 1/(AccumReal)itersPerBlock;
+    currentBlock = block;
+    ThreadControl::runThreads(threadsCount);
+    Quality q;
+    for(int i = 0; i < threadsCount; ++i) q += qualities[i];
+    return q *= 1/(AccumReal)itersPerBlock;
   }
 
 public:
   Trainer():
-    barrierCounter(0),
     layer(),
     ratio(),
     threadsCount(1),
diff --git a/projects/neural/train.segment.inc.cpp b/projects/neural/train.segment.inc.cpp
index 836ab3e..4ee8701 100644
--- a/projects/neural/train.segment.inc.cpp
+++ b/projects/neural/train.segment.inc.cpp
@@ -6,11 +6,10 @@
 #include "layer.inc.cpp"
 
 
-class TrainerSegment {
+class TrainerSegment: public ThreadControl {
 private:
-  std::atomic<unsigned int> barrierCounter;
   std::vector<QualityPair> qualities;
-
+
 public:
   Segment *segment;
   AccumReal ratio;
@@ -25,6 +24,8 @@ public:
 
 protected:
   volatile int x, y, z;
+  int currentBlock;
+  bool currentBlockMeasureOnly;
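+  // set by runThreads(block, measureOnly) before the workers start; threadFunc()
+  // reads them now that its signature is fixed by ThreadControl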
 
   virtual bool prepare() { return true; }
   virtual bool prepareBlock(int block, bool measureOnly) { return true; }
   virtual void loadData(Barrier &barrier, int block, int iter, bool measure) { }
 
 private:
-  void threadFunc(int tid, unsigned int seed, int block, bool measureOnly) {
-    Barrier barrier(barrierCounter, tid, threadsCount, seed);
+  void threadFunc(Barrier &barrier) override {
+    int block = currentBlock;
+    bool measureOnly = currentBlockMeasureOnly;
 
     QualityPair q;
     if (!measureOnly) {
@@ -54,21 +56,16 @@ private:
       q.measure += segment->pass(barrier, x, y, z, 0);
     }
 
-    qualities[tid] = q;
+    qualities[barrier.tid] = q;
   }
 
   QualityPair runThreads(int block, bool measureOnly) {
-    barrierCounter = 0;
-    std::vector<std::thread*> t(threadsCount, nullptr);
-    for(int i = 1; i < threadsCount; ++i)
-      t[i] = new std::thread(&TrainerSegment::threadFunc, this, i, rand(), block, measureOnly);
-    threadFunc(0, rand(), block, measureOnly);
-
-    QualityPair q = qualities[0];
-    for(int i = 1; i < threadsCount; ++i)
-      { t[i]->join(); delete t[i]; q += qualities[i]; }
-
+    currentBlock = block;
+    currentBlockMeasureOnly = measureOnly;
+    ThreadControl::runThreads(threadsCount);
+    QualityPair q;
+    for(int i = 0; i < threadsCount; ++i) q += qualities[i];
     q.measure *= 1/(AccumReal)measuresPerBlock;
     q.train *= 1/(AccumReal)trainsPerBlock;
     return q;
@@ -77,7 +74,6 @@ private:
 
 public:
   TrainerSegment():
-    barrierCounter(0),
     segment(),
     ratio(),
     threadsCount(1),
diff --git a/projects/neural/trainer.cpp b/projects/neural/trainer.cpp
index 820f172..e639553 100644
--- a/projects/neural/trainer.cpp
+++ b/projects/neural/trainer.cpp
@@ -7,6 +7,8 @@
 #include "train.digit.inc.cpp"
 #include "train.image.inc.cpp"
 #include "train.cx4.inc.cpp"
+#include "benchmark.inc.cpp"
+#include "benchmark.segment.inc.cpp"
 
 
 bool runTests() {
@@ -115,7 +117,7 @@ bool trainCx4() {
   int cnt = 1;
   fl[cnt] = new LayerConvShared(l, Layout(257, 257, 24).expandXY(3), Kernel(4, 2, -2)); fl[cnt]->filename = FILENAME "1"; ++cnt;
   fl[cnt] = new LayerConvShared(l, Layout(130, 130, 48), Kernel(4, 2, -2)); fl[cnt]->filename = FILENAME "2"; ++cnt;
-  fl[cnt] = new LayerConvShared(l, Layout( 66, 66, 96), Kernel(4, 2, -2)); fl[cnt]->filename = FILENAME "3"; ++cnt;
+  //fl[cnt] = new LayerConvShared(l, Layout( 66, 66, 96), Kernel(4, 2, -2)); fl[cnt]->filename = FILENAME "3"; ++cnt;
   //fl[cnt] = new LayerConvShared(l, Layout( 6, 6, 48), Kernel(4, 2, 0)); fl[cnt]->filename = FILENAME "4"; ++cnt;
   //fl[cnt] = new LayerConvShared(l, Layout( 2, 2, 96), Kernel(4, 2, 0)); fl[cnt]->filename = FILENAME "5"; ++cnt;
   for(int i = cnt-1; i > 0; --i) {
@@ -137,9 +139,9 @@ bool trainCx4() {
   t.ratio = 0.000001;
   t.threadsCount = 8;
   t.measuresPerBlock = 1000;
-  t.trainsPerBlock = 10000;
+  t.trainsPerBlock = 100000;
   t.loadImagesCount = 100;
-  t.blocksPerLoading = 10;
+  t.blocksPerLoading = 1;
   t.qmin = 0.00001;
   t.infile = "data/img512-data.bin";
   t.outfile = FILENAME ".test";
@@ -154,6 +156,8 @@ bool trainCx4() {
 
 int main() {
   srand(time(NULL));
 
+  //while(1) BenchmarkSegment::run(8);
+  //while(1) Benchmark().run(8);
   //return !runTests();
   //return !trainDigits();
   //return !trainDigitsConv();