#ifndef NNLAYER3_MT_INC_CPP
#define NNLAYER3_MT_INC_CPP
#include "nnlayer3.inc.cpp"
#include <atomic>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <functional>
#include <thread>
#include <vector>
/// Busy-waiting barrier built on a shared atomic arrival counter.
///
/// Every participant constructs its own Barrier object around the same
/// counter; the object only remembers, in `next`, the counter value the
/// participant has to wait for. The comparison is a plain unsigned `<`,
/// so the scheme is not wraparound-safe once the counter overflows.
class Barrier {
private:
    std::atomic<unsigned int> &counter; // arrival counter shared by all participants
    const unsigned int threads;         // amount added to the target on every wait()
    unsigned int next;                  // counter value this participant waits for

public:
    /// Binds the barrier to `counter`; the target starts at zero.
    inline Barrier(std::atomic<unsigned int> &counter, unsigned int threads):
        counter(counter),
        threads(threads),
        next(0)
    { }

    /// Advances the target by `threads`, registers one arrival on the shared
    /// counter and spins until the counter has reached the target.
    inline void wait() {
        next += threads;
        counter.fetch_add(1);
        while(counter.load() < next) { }
    }

    /// Spins until the shared counter has reached `next + tid`.
    /// Does not register an arrival.
    inline void subwait(int tid) {
        const unsigned int target = next + tid;
        while(counter.load() < target) { }
    }
};
class TrainMT {
private:
struct LDesc {
int y0, y1;
double sumQ;
LDesc(): y0(), y1(), sumQ() { }
};
public:
Layer *layer;
const unsigned char *dataX;
const unsigned char *dataY;
int strideX;
int strideY;
int *shuffle;
int count;
Real trainRatio;
TrainMT():
layer(),
dataX(),
dataY(),
strideX(),
strideY(),
shuffle(),
count(),
trainRatio() { }
private:
void trainFunc(int tid, int threads, std::atomic<unsigned int> &barrierCounter, LDesc *ldescs) {
Barrier barrier(barrierCounter, threads);
Layer &fl = *layer;
Layer &bl = layer->back();
int layersCount = fl.totalLayers();
LDesc *fld = ldescs, *bld = fld + layersCount - 1;
Real trainRatio = this->trainRatio;
int fsxz = fl.sx*fl.sz;
int bsxz = bl.sx*bl.sz;
//barrier.subwait(tid);
//for(LDesc *ld = fld; ld <= bld; ++ld)
// printf("t%d %d %d %d\n", tid, (int)(ld-fld), ld->y0, ld->y1);
//barrier.wait();
const unsigned char *dataX = this->dataX + fsxz*fld->y0;
const unsigned char *dataY = this->dataY + bsxz*bld->y0;
double sumQ = 0;
for(int i = 0; i < count; ++i) {
int ii = shuffle[i];
const unsigned char *curX = dataX + strideX*ii;
const unsigned char *curY = dataY + strideY*ii;
barrier.wait();
const unsigned char *px = curX;
for(Neuron *in = fl.neurons + fsxz*fld->y0, *e = fl.neurons + fsxz*fld->y1; in < e; ++in, ++px)
in->v = Real(*px)*Real(1/255.0);
LDesc *ld = fld + 1;
for(Layer *l = fl.next; l; l = l->next, ++ld) {
barrier.wait();
l->pass(ld->y0, ld->y1);
}
double q = 0;
const unsigned char *py = curY;
for(Neuron *in = bl.neurons + bsxz*bld->y0, *e = bl.neurons + bsxz*bld->y1; in < e; ++in, ++py) {
Real v = (in->v - 0.25)*2;
Real d = Real(*py)*Real(1/255.0) - v;
in->d *= d * trainRatio;
d *= d;
q += d*d;
}
sumQ += q;
if (trainRatio > 0) {
ld = bld;
for(Layer *l = &bl; l->prev; l = l->prev, --ld) {
if (!l->prev->prev) {
barrier.wait();
l->backpassWeights(ld->y0, ld->y1);
break;
} else
if (l->next) {
barrier.wait();
l->backpassTpl<true>(ld->y0, ld->y1);
//l->backpassTpl<false>(ld->y0, ld->y1);
//barrier.wait();
//l->next->backpassWeights(ld[1].y0, ld[1].y1);
}
}
}
//if (!tid) printf(" - %d, %f, %f\n", i, q, sumQ);
}
ldescs->sumQ = sumQ;
}
public:
double train(int threads) {
assert(threads > 0);
assert(layer && !layer->prev);
assert(dataX && dataY && shuffle);
assert(count > 0);
assert(trainRatio >= 0);
int layersCount = layer->totalLayers();
assert(layersCount > 0);
std::vector<LDesc> ldescs( threads*layersCount );
int layerId = 0;
for(Layer *l = layer; l; l = l->next, ++layerId) {
assert(layerId < layersCount);
int tsy = l->sy/threads;
for(int tid = 0; tid < threads; ++tid) {
LDesc &desc = ldescs[tid*layersCount + layerId];
desc.y0 = tid*tsy;
desc.y1 = desc.y0 + tsy;
if (tid == threads-1) desc.y1 = l->sy;
}
}
assert(layerId == layersCount);
std::atomic<unsigned int> barrierCounter(0);
std::vector<std::thread*> t(threads - 1);
for(int i = 1; i < threads; ++i)
t[i-1] = new std::thread(&TrainMT::trainFunc, this, i, threads, std::ref(barrierCounter), &ldescs[i*layersCount]);
trainFunc(0, threads, barrierCounter, &ldescs[0]);
double result = ldescs[0].sumQ;
for(int i = 1; i < threads; ++i)
{ t[i-1]->join(); delete t[i-1]; result += ldescs[i*layersCount].sumQ; }
return sqrt(sqrt( result/(count * layer->back().countNeurons()) ));
}
};
#endif