#ifndef BENCHMARK_INC_CPP
#define BENCHMARK_INC_CPP


#include "common.inc.cpp"
#include "layer.conv.inc.cpp"


class Benchmark: public ThreadControl {
|
|
|
e4740d |
private:
|
|
|
e4740d |
typedef int Int;
|
|
|
e4740d |
typedef double Float;
|
|
|
e4740d |
|
|
|
e4740d |
int repeats;
|
|
|
e4740d |
int mode;
|
|
|
e4740d |
Layout pl, cl;
|
|
|
e4740d |
Kernel k;
|
|
|
e4740d |
std::vector<float> pvalues;</float>
|
|
|
e4740d |
std::vector<float> cvalues;</float>
|
|
|
e4740d |
std::vector<float> weights;</float>
|
|
|
e4740d |
|
|
|
e4740d |
|
|
|
e4740d |
__attribute__((always_inline))
|
|
|
e4740d |
void threadFuncXYCP(Barrier &barrier, int k_sx, int k_sy, int c_d, int p_d) {
|
|
|
e4740d |
Layout pl = this->pl;
|
|
|
e4740d |
Layout cl = this->cl;
|
|
|
e4740d |
Kernel k = this->k;
|
|
|
e4740d |
|
|
|
e4740d |
assert(k.sx == k_sx);
|
|
|
e4740d |
assert(k.sy == k_sy);
|
|
|
e4740d |
assert(cl.getD() == c_d);
|
|
|
e4740d |
assert(pl.getD() == p_d);
|
|
|
e4740d |
|
|
|
e4740d |
Int tid = barrier.tid;
|
|
|
e4740d |
Int ts = barrier.threads;
|
|
|
e4740d |
|
|
|
e4740d |
Int c_w = cl.getW();
|
|
|
e4740d |
Int c_h = cl.getH();
|
|
|
e4740d |
Int c_wd = c_w*c_d;
|
|
|
e4740d |
Int c_hw = c_h*c_w;
|
|
|
e4740d |
Int c_hwd = c_h*c_wd;
|
|
|
e4740d |
Int c_sxz = cl.sx*cl.sz;
|
|
|
e4740d |
|
|
|
e4740d |
Int p_sxz = pl.sx*pl.sz;
|
|
|
e4740d |
Int p_szk = pl.sz*k.dx;
|
|
|
e4740d |
Int p_sxzk = p_sxz*k.dy;
|
|
|
e4740d |
|
|
|
e4740d |
//Int k_sxd = k_sx*p_d;
|
|
|
e4740d |
Int k_syx = k_sx*k_sy;
|
|
|
e4740d |
Int k_syxd = k_syx*p_d;
|
|
|
e4740d |
|
|
|
e4740d |
Float *cvalues = this->cvalues.data() + cl.y0*c_sxz + cl.x0*cl.sz + cl.z0;
|
|
|
e4740d |
Float *pvalues = this->pvalues.data() + (pl.y0 + k.oy)*p_sxz + (pl.x0 + k.ox)*pl.sz + pl.z0;
|
|
|
e4740d |
Float *weights = this->weights.data();
|
|
|
e4740d |
|
|
|
e4740d |
if (mode == 0) {
|
|
|
e4740d |
for(Int i = repeats; i; --i) {
|
|
|
e4740d |
for(Int i = tid; i < c_hwd; i += ts) {
|
|
|
e4740d |
Int cy = i/c_wd;
|
|
|
e4740d |
Int cx = i%c_wd/c_d;
|
|
|
e4740d |
Int cz = i%c_d;
|
|
|
e4740d |
|
|
|
e4740d |
Float *ic = &cvalues[ cy*c_sxz + cx*cl.sz + cz ];
|
|
|
e4740d |
Float *ip = &pvalues[ cy*p_sxzk + cx*p_szk ];
|
|
|
e4740d |
Float *iw = &weights[ cz*k_syxd ];
|
|
|
e4740d |
|
|
|
e4740d |
Float a = 0;
|
|
|
e4740d |
|
|
|
e4740d |
for(Int i = 0; i < p_d; ++i, ++ip, ++iw)
|
|
|
e4740d |
for(Int i = 0; i < k_syx; ++i) {
|
|
|
e4740d |
Int ky = i/k_sx;
|
|
|
e4740d |
Int kx = i%k_sx;
|
|
|
e4740d |
a += iw[i] * ip[ ky*p_sxz + kx*pl.sz ];
|
|
|
e4740d |
}
|
|
|
e4740d |
|
|
|
e4740d |
*ic = a;
|
|
|
e4740d |
}
|
|
|
e4740d |
barrier.wait2();
|
|
|
e4740d |
}
|
|
|
e4740d |
} else
|
|
|
e4740d |
if (mode == 1) {
|
|
|
e4740d |
if (c_w > 1 || c_h > 1 || pl.sx != pl.getW() || pl.sz != p_d) {
|
|
|
e4740d |
for(Int i = repeats; i; --i)
|
|
|
e4740d |
for(Int i = 0; i < k_syx; ++i) {
|
|
|
e4740d |
Int ky = i/k_sx;
|
|
|
e4740d |
Int kx = i%k_sx;
|
|
|
e4740d |
//Int pi = ky*p_sxz + kx*pl.sz;
|
|
|
e4740d |
//Int wi = i*p_d;
|
|
|
e4740d |
Float *ip = &pvalues[ ky*p_sxz + kx*pl.sz ];
|
|
|
e4740d |
Float *iw = &weights[ i*p_d ];
|
|
|
e4740d |
for(Int i = tid; i < c_hw; i += ts) {
|
|
|
e4740d |
Int cy = i/c_w;
|
|
|
e4740d |
Int cx = i%c_w;
|
|
|
e4740d |
Float *iip = &ip[ cy*p_sxzk + cx*p_szk ];
|
|
|
e4740d |
Float *iic = &cvalues[ cy*c_sxz + cx*cl.sz ];
|
|
|
e4740d |
|
|
|
e4740d |
for(Int cz = 0; cz < c_d; ++cz) {
|
|
|
e4740d |
//Int pii = pi + cy*p_sxzk + cx*p_szk;
|
|
|
e4740d |
//Int wii = wi + cz*k_syxd;
|
|
|
e4740d |
Float *iiw = &iw[ cz*k_syxd ];
|
|
|
e4740d |
Float a = iic[cz];
|
|
|
e4740d |
|
|
|
e4740d |
for(Int i = 0; i < p_d; ++i)
|
|
|
e4740d |
iip[i] += iiw[i] * a;
|
|
|
e4740d |
//pvalues[pii + i] += weights[wii + i] * a;
|
|
|
e4740d |
}
|
|
|
e4740d |
}
|
|
|
e4740d |
barrier.wait2();
|
|
|
e4740d |
}
|
|
|
e4740d |
} else {
|
|
|
e4740d |
//Int c_dd = c_d*p_d;
|
|
|
e4740d |
//Int c_wdd = c_wd*p_d;
|
|
|
e4740d |
//Int c_hwdd = c_hwd*p_d;
|
|
|
e4740d |
Int cnt = k_syxd/ts;
|
|
|
e4740d |
for(Int i = repeats; i; --i) {
|
|
|
e4740d |
for(Int i = cnt*tid; i < cnt; ++i) {
|
|
|
e4740d |
//Int ky = i/k_sxd;
|
|
|
e4740d |
//Int kx = i%k_sxd/p_d;
|
|
|
e4740d |
//Int pz = i%p_d;
|
|
|
e4740d |
//Float *ip = &pvalues[ i ];//ky*p_sxz + kx*pl.sz + pz ];
|
|
|
e4740d |
Float *iw = &weights[ i*c_d ];
|
|
|
e4740d |
Float a = 0;
|
|
|
e4740d |
for(Int cz = 0; cz < c_d; ++cz) {
|
|
|
e4740d |
a += iw[ cz ] * cvalues[ cz ]; //*k_syxd
|
|
|
e4740d |
}
|
|
|
e4740d |
pvalues[ i ] = a;
|
|
|
e4740d |
//*ip = a;
|
|
|
e4740d |
}
|
|
|
e4740d |
barrier.wait2();
|
|
|
e4740d |
}
|
|
|
e4740d |
}
|
|
|
e4740d |
}
|
|
|
e4740d |
}
|
|
|
e4740d |
|
|
|
e4740d |
__attribute__((always_inline))
|
|
|
e4740d |
void threadFuncXYC(Barrier &barrier, int k_sx, int k_sy, int c_d, int p_d) {
|
|
|
e4740d |
switch(p_d) {
|
|
|
e4740d |
case 3: return threadFuncXYCP( barrier, k_sx, k_sy, c_d, 3 );
|
|
|
e4740d |
case 4: return threadFuncXYCP( barrier, k_sx, k_sy, c_d, 4 );
|
|
|
e4740d |
case 24: return threadFuncXYCP( barrier, k_sx, k_sy, c_d, 24 );
|
|
|
e4740d |
case 48: return threadFuncXYCP( barrier, k_sx, k_sy, c_d, 28 );
|
|
|
e4740d |
}
|
|
|
e4740d |
threadFuncXYCP( barrier, k_sx, k_sy, c_d, p_d );
|
|
|
e4740d |
}
|
|
|
e4740d |
|
|
|
e4740d |
__attribute__((always_inline))
|
|
|
e4740d |
void threadFuncXY(Barrier &barrier, int k_sx, int k_sy, int c_d, int p_d) {
|
|
|
e4740d |
switch(c_d) {
|
|
|
e4740d |
case 3: return threadFuncXYC( barrier, k_sx, k_sy, 3, p_d );
|
|
|
e4740d |
case 4: return threadFuncXYC( barrier, k_sx, k_sy, 4, p_d );
|
|
|
e4740d |
case 24: return threadFuncXYC( barrier, k_sx, k_sy, 24, p_d );
|
|
|
e4740d |
case 48: return threadFuncXYC( barrier, k_sx, k_sy, 48, p_d );
|
|
|
e4740d |
}
|
|
|
e4740d |
threadFuncXYC( barrier, k_sx, k_sy, c_d, p_d );
|
|
|
e4740d |
}
|
|
|
e4740d |
|
|
|
e4740d |
void threadFunc(Barrier &barrier) override {
|
|
|
e4740d |
Int k_sx = k.sx;
|
|
|
e4740d |
Int k_sy = k.sy;
|
|
|
e4740d |
Int c_d = cl.getD();
|
|
|
e4740d |
Int p_d = pl.getD();
|
|
|
e4740d |
|
|
|
e4740d |
if (k_sy == k_sx) switch(k_sx) {
|
|
|
e4740d |
case 4: return threadFuncXY( barrier, 4, 4, c_d, p_d );
|
|
|
e4740d |
}
|
|
|
e4740d |
threadFuncXY( barrier, k_sx, k_sy, c_d, p_d );
|
|
|
e4740d |
}
|
|
|
e4740d |
|
|
|
e4740d |
|
|
|
e4740d |
void init(int mode, Layout pl, Layout cl, Kernel k, long long totalLinks) {
|
|
|
e4740d |
assert(pl);
|
|
|
e4740d |
assert(cl);
|
|
|
e4740d |
assert(k);
|
|
|
e4740d |
assert(totalLinks > 0);
|
|
|
e4740d |
assert(0 <= pl.x0 + k.ox && (cl.getW()-1)*k.dx + k.ox + k.sx <= pl.sx);
|
|
|
e4740d |
assert(0 <= pl.y0 + k.oy && (cl.getH()-1)*k.dy + k.oy + k.sy <= pl.sy);
|
|
|
e4740d |
|
|
|
e4740d |
this->mode = mode;
|
|
|
e4740d |
this->pl = pl;
|
|
|
e4740d |
this->cl = cl;
|
|
|
e4740d |
this->k = k;
|
|
|
e4740d |
|
|
|
e4740d |
pvalues.resize(pl.getCount());
|
|
|
e4740d |
cvalues.resize(cl.getCount());
|
|
|
e4740d |
weights.resize(cl.getD()*k.sx*k.sy*pl.getD());
|
|
|
e4740d |
|
|
|
e4740d |
for(int i = 0; i < (int)pvalues.size(); ++i) pvalues[i] = rand();
|
|
|
e4740d |
for(int i = 0; i < (int)cvalues.size(); ++i) cvalues[i] = rand();
|
|
|
e4740d |
for(int i = 0; i < (int)weights.size(); ++i) weights[i] = rand();
|
|
|
e4740d |
|
|
|
e4740d |
long long links = weights.size() * cl.getW() * cl.getH();
|
|
|
e4740d |
repeats = (totalLinks - 1)/links + 1;
|
|
|
e4740d |
|
|
|
e4740d |
//printf( "benchmark init: prev %lld, curr %lld, links %lld, repeats %d, total links: %lld\n",
|
|
|
e4740d |
// (long long)pvalues.size(), (long long)cvalues.size(), links, repeats, links*repeats );
|
|
|
e4740d |
}
|
|
|
e4740d |
|
|
|
e4740d |
|
|
|
e4740d |
void run(const char *name, int threadsCount, int mode, Layout pl, Layout cl, Kernel k, long long totalLinks) {
|
|
|
e4740d |
init(mode, pl, cl, k, totalLinks);
|
|
|
e4740d |
|
|
|
e4740d |
volatile long long t0 = timeUs();
|
|
|
e4740d |
runThreads(threadsCount);
|
|
|
e4740d |
volatile long long t1 = timeUs();
|
|
|
e4740d |
|
|
|
e4740d |
Float sum = 0;
|
|
|
e4740d |
for(int i = 0; i < (int)pvalues.size(); ++i) sum += pvalues[i];
|
|
|
e4740d |
for(int i = 0; i < (int)cvalues.size(); ++i) sum += cvalues[i];
|
|
|
e4740d |
for(int i = 0; i < (int)weights.size(); ++i) sum += weights[i];
|
|
|
e4740d |
printf("%s %d: %f, %lld\n", name, mode, (t1 - t0)*1e-6, (long long)sum);
|
|
|
e4740d |
}
|
|
|
e4740d |
|
|
|
e4740d |
|
|
|
e4740d |
void run(const char *name, int threadsCount, Layout pl, Layout cl, Kernel k, long long totalLinks) {
|
|
|
e4740d |
for(int mode = 0; mode < 2; ++mode)
|
|
|
e4740d |
run(name, threadsCount, mode, pl, cl, k, totalLinks);
|
|
|
e4740d |
}
|
|
|
e4740d |
|
|
|
e4740d |
|
|
|
e4740d |
public:
|
|
|
e4740d |
void run(int threadsCount = 1) {
|
|
|
e4740d |
//printf("run benchmark: %d\n", threadsCount);
|
|
|
e4740d |
/*
|
|
|
e4740d |
run( "514x3 -> 258x24", threadsCount,
|
|
|
e4740d |
Layout(514, 514, 3).expandXY(2),
|
|
|
e4740d |
Layout(258, 258, 24).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );
|
|
|
e4740d |
run( "258x24 -> 130x48", threadsCount,
|
|
|
e4740d |
Layout(258, 258, 24).expandXY(2),
|
|
|
e4740d |
Layout(130, 130, 48).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );
|
|
|
e4740d |
run( "130x48 -> 66x96 ", threadsCount,
|
|
|
e4740d |
Layout(130, 130, 48).expandXY(2),
|
|
|
e4740d |
Layout( 66, 66, 96).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );
|
|
|
e4740d |
run( "66x96 -> 34x144", threadsCount,
|
|
|
e4740d |
Layout( 66, 66, 96).expandXY(2),
|
|
|
e4740d |
Layout(34, 34, 144).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );
|
|
|
e4740d |
run( "34x144 -> 18x216", threadsCount,
|
|
|
e4740d |
Layout(34, 34, 144).expandXY(2),
|
|
|
e4740d |
Layout(18, 18, 216).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );
|
|
|
e4740d |
run( "18x216 -> 10x324", threadsCount,
|
|
|
e4740d |
Layout(18, 18, 216).expandXY(2),
|
|
|
e4740d |
Layout(10, 10, 324).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );
|
|
|
e4740d |
run( "10x324 -> 6x486 ", threadsCount,
|
|
|
e4740d |
Layout(10, 10, 324).expandXY(2),
|
|
|
e4740d |
Layout( 6, 6, 486).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );
|
|
|
e4740d |
run( "6x486 -> 4x729 ", threadsCount,
|
|
|
e4740d |
Layout( 6, 6, 486).expandXY(2),
|
|
|
e4740d |
Layout( 4, 4, 729).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );
|
|
|
e4740d |
*/
|
|
|
e4740d |
run( "4x768 -> 1x1093 ", threadsCount,
|
|
|
e4740d |
Layout( 4, 4, 768).expandXY(0),
|
|
|
e4740d |
Layout( 1, 1, 1093).expandXY(0), Kernel(4, 2, 0), 10ll*1000*1000*1000 );
|
|
|
e4740d |
}
|
|
|
e4740d |
};


#endif