Tree - bw/helianthuslab - Cool Bug Repo

bw / helianthuslab

Blame projects/neural/benchmark.inc.cpp

Blob Raw

		e4740d	`#ifndef BENCHMARK_INC_CPP`
		e4740d	`#define BENCHMARK_INC_CPP`
		e4740d
		e4740d
		e4740d	`#include "common.inc.cpp"`
		e4740d	`#include "layer.conv.inc.cpp"`
		e4740d
		e4740d
		e4740d	`class Benchmark: public ThreadControl {`
		e4740d	`private:`
		e4740d	`typedef int Int;`
		e4740d	`typedef double Float;`
		e4740d
		e4740d	`int repeats;`
		e4740d	`int mode;`
		e4740d	`Layout pl, cl;`
		e4740d	`Kernel k;`
		e4740d	`std::vector<Float> pvalues;`
		e4740d	`std::vector<Float> cvalues;`
		e4740d	`std::vector<Float> weights;`
		e4740d
		e4740d
		e4740d	`__attribute__((always_inline))`
		e4740d	`void threadFuncXYCP(Barrier &barrier, int k_sx, int k_sy, int c_d, int p_d) {`
		e4740d	`Layout pl = this->pl;`
		e4740d	`Layout cl = this->cl;`
		e4740d	`Kernel k = this->k;`
		e4740d
		e4740d	`assert(k.sx == k_sx);`
		e4740d	`assert(k.sy == k_sy);`
		e4740d	`assert(cl.getD() == c_d);`
		e4740d	`assert(pl.getD() == p_d);`
		e4740d
		e4740d	`Int tid = barrier.tid;`
		e4740d	`Int ts = barrier.threads;`
		e4740d
		e4740d	`Int c_w = cl.getW();`
		e4740d	`Int c_h = cl.getH();`
		e4740d	`Int c_wd = c_w*c_d;`
		e4740d	`Int c_hw = c_h*c_w;`
		e4740d	`Int c_hwd = c_h*c_wd;`
		e4740d	`Int c_sxz = cl.sx*cl.sz;`
		e4740d
		e4740d	`Int p_sxz = pl.sx*pl.sz;`
		e4740d	`Int p_szk = pl.sz*k.dx;`
		e4740d	`Int p_sxzk = p_sxz*k.dy;`
		e4740d
		e4740d	`//Int k_sxd = k_sx*p_d;`
		e4740d	`Int k_syx = k_sx*k_sy;`
		e4740d	`Int k_syxd = k_syx*p_d;`
		e4740d
		e4740d	`Float *cvalues = this->cvalues.data() + cl.y0*c_sxz + cl.x0*cl.sz + cl.z0;`
		e4740d	`Float *pvalues = this->pvalues.data() + (pl.y0 + k.oy)*p_sxz + (pl.x0 + k.ox)*pl.sz + pl.z0;`
		e4740d	`Float *weights = this->weights.data();`
		e4740d
		e4740d	`if (mode == 0) {`
		e4740d	`for(Int i = repeats; i; --i) {`
		e4740d	`for(Int i = tid; i < c_hwd; i += ts) {`
		e4740d	`Int cy = i/c_wd;`
		e4740d	`Int cx = i%c_wd/c_d;`
		e4740d	`Int cz = i%c_d;`
		e4740d
		e4740d	`Float *ic = &cvalues[ cy*c_sxz + cx*cl.sz + cz ];`
		e4740d	`Float *ip = &pvalues[ cy*p_sxzk + cx*p_szk ];`
		e4740d	`Float *iw = &weights[ cz*k_syxd ];`
		e4740d
		e4740d	`Float a = 0;`
		e4740d
		e4740d	`for(Int i = 0; i < p_d; ++i, ++ip, ++iw)`
		e4740d	`for(Int i = 0; i < k_syx; ++i) {`
		e4740d	`Int ky = i/k_sx;`
		e4740d	`Int kx = i%k_sx;`
		e4740d	`a += iw[i] * ip[ ky*p_sxz + kx*pl.sz ];`
		e4740d	`}`
		e4740d
		e4740d	`*ic = a;`
		e4740d	`}`
		e4740d	`barrier.wait2();`
		e4740d	`}`
		e4740d	`} else`
		e4740d	`if (mode == 1) {`
		e4740d	`if (c_w > 1 || c_h > 1 || pl.sx != pl.getW() || pl.sz != p_d) {`
		e4740d	`for(Int i = repeats; i; --i)`
		e4740d	`for(Int i = 0; i < k_syx; ++i) {`
		e4740d	`Int ky = i/k_sx;`
		e4740d	`Int kx = i%k_sx;`
		e4740d	`//Int pi = ky*p_sxz + kx*pl.sz;`
		e4740d	`//Int wi = i*p_d;`
		e4740d	`Float *ip = &pvalues[ ky*p_sxz + kx*pl.sz ];`
		e4740d	`Float *iw = &weights[ i*p_d ];`
		e4740d	`for(Int i = tid; i < c_hw; i += ts) {`
		e4740d	`Int cy = i/c_w;`
		e4740d	`Int cx = i%c_w;`
		e4740d	`Float *iip = &ip[ cy*p_sxzk + cx*p_szk ];`
		e4740d	`Float *iic = &cvalues[ cy*c_sxz + cx*cl.sz ];`
		e4740d
		e4740d	`for(Int cz = 0; cz < c_d; ++cz) {`
		e4740d	`//Int pii = pi + cy*p_sxzk + cx*p_szk;`
		e4740d	`//Int wii = wi + cz*k_syxd;`
		e4740d	`Float *iiw = &iw[ cz*k_syxd ];`
		e4740d	`Float a = iic[cz];`
		e4740d
		e4740d	`for(Int i = 0; i < p_d; ++i)`
		e4740d	`iip[i] += iiw[i] * a;`
		e4740d	`//pvalues[pii + i] += weights[wii + i] * a;`
		e4740d	`}`
		e4740d	`}`
		e4740d	`barrier.wait2();`
		e4740d	`}`
		e4740d	`} else {`
		e4740d	`//Int c_dd = c_d*p_d;`
		e4740d	`//Int c_wdd = c_wd*p_d;`
		e4740d	`//Int c_hwdd = c_hwd*p_d;`
		e4740d	`Int cnt = k_syxd/ts;`
		e4740d	`for(Int i = repeats; i; --i) {`
		e4740d	`for(Int i = cnt*tid; i < cnt; ++i) {`
		e4740d	`//Int ky = i/k_sxd;`
		e4740d	`//Int kx = i%k_sxd/p_d;`
		e4740d	`//Int pz = i%p_d;`
		e4740d	`//Float *ip = &pvalues[ i ];//ky*p_sxz + kx*pl.sz + pz ];`
		e4740d	`Float *iw = &weights[ i*c_d ];`
		e4740d	`Float a = 0;`
		e4740d	`for(Int cz = 0; cz < c_d; ++cz) {`
		e4740d	`a += iw[ cz ] * cvalues[ cz ]; //*k_syxd`
		e4740d	`}`
		e4740d	`pvalues[ i ] = a;`
		e4740d	`//*ip = a;`
		e4740d	`}`
		e4740d	`barrier.wait2();`
		e4740d	`}`
		e4740d	`}`
		e4740d	`}`
		e4740d	`}`
		e4740d
		e4740d	`__attribute__((always_inline))`
		e4740d	`void threadFuncXYC(Barrier &barrier, int k_sx, int k_sy, int c_d, int p_d) {`
		e4740d	`switch(p_d) {`
		e4740d	`case 3: return threadFuncXYCP( barrier, k_sx, k_sy, c_d, 3 );`
		e4740d	`case 4: return threadFuncXYCP( barrier, k_sx, k_sy, c_d, 4 );`
		e4740d	`case 24: return threadFuncXYCP( barrier, k_sx, k_sy, c_d, 24 );`
		e4740d	`case 48: return threadFuncXYCP( barrier, k_sx, k_sy, c_d, 28 );`
		e4740d	`}`
		e4740d	`threadFuncXYCP( barrier, k_sx, k_sy, c_d, p_d );`
		e4740d	`}`
		e4740d
		e4740d	`__attribute__((always_inline))`
		e4740d	`void threadFuncXY(Barrier &barrier, int k_sx, int k_sy, int c_d, int p_d) {`
		e4740d	`switch(c_d) {`
		e4740d	`case 3: return threadFuncXYC( barrier, k_sx, k_sy, 3, p_d );`
		e4740d	`case 4: return threadFuncXYC( barrier, k_sx, k_sy, 4, p_d );`
		e4740d	`case 24: return threadFuncXYC( barrier, k_sx, k_sy, 24, p_d );`
		e4740d	`case 48: return threadFuncXYC( barrier, k_sx, k_sy, 48, p_d );`
		e4740d	`}`
		e4740d	`threadFuncXYC( barrier, k_sx, k_sy, c_d, p_d );`
		e4740d	`}`
		e4740d
		e4740d	`void threadFunc(Barrier &barrier) override {`
		e4740d	`Int k_sx = k.sx;`
		e4740d	`Int k_sy = k.sy;`
		e4740d	`Int c_d = cl.getD();`
		e4740d	`Int p_d = pl.getD();`
		e4740d
		e4740d	`if (k_sy == k_sx) switch(k_sx) {`
		e4740d	`case 4: return threadFuncXY( barrier, 4, 4, c_d, p_d );`
		e4740d	`}`
		e4740d	`threadFuncXY( barrier, k_sx, k_sy, c_d, p_d );`
		e4740d	`}`
		e4740d
		e4740d
		e4740d	`void init(int mode, Layout pl, Layout cl, Kernel k, long long totalLinks) {`
		e4740d	`assert(pl);`
		e4740d	`assert(cl);`
		e4740d	`assert(k);`
		e4740d	`assert(totalLinks > 0);`
		e4740d	`assert(0 <= pl.x0 + k.ox && (cl.getW()-1)*k.dx + k.ox + k.sx <= pl.sx);`
		e4740d	`assert(0 <= pl.y0 + k.oy && (cl.getH()-1)*k.dy + k.oy + k.sy <= pl.sy);`
		e4740d
		e4740d	`this->mode = mode;`
		e4740d	`this->pl = pl;`
		e4740d	`this->cl = cl;`
		e4740d	`this->k = k;`
		e4740d
		e4740d	`pvalues.resize(pl.getCount());`
		e4740d	`cvalues.resize(cl.getCount());`
		e4740d	`weights.resize(cl.getD()*k.sx*k.sy*pl.getD());`
		e4740d
		e4740d	`for(int i = 0; i < (int)pvalues.size(); ++i) pvalues[i] = rand();`
		e4740d	`for(int i = 0; i < (int)cvalues.size(); ++i) cvalues[i] = rand();`
		e4740d	`for(int i = 0; i < (int)weights.size(); ++i) weights[i] = rand();`
		e4740d
		e4740d	`long long links = weights.size() * cl.getW() * cl.getH();`
		e4740d	`repeats = (totalLinks - 1)/links + 1;`
		e4740d
		e4740d	`//printf( "benchmark init: prev %lld, curr %lld, links %lld, repeats %d, total links: %lld\n",`
		e4740d	`// (long long)pvalues.size(), (long long)cvalues.size(), links, repeats, links*repeats );`
		e4740d	`}`
		e4740d
		e4740d
		e4740d	`void run(const char *name, int threadsCount, int mode, Layout pl, Layout cl, Kernel k, long long totalLinks) {`
		e4740d	`init(mode, pl, cl, k, totalLinks);`
		e4740d
		e4740d	`volatile long long t0 = timeUs();`
		e4740d	`runThreads(threadsCount);`
		e4740d	`volatile long long t1 = timeUs();`
		e4740d
		e4740d	`Float sum = 0;`
		e4740d	`for(int i = 0; i < (int)pvalues.size(); ++i) sum += pvalues[i];`
		e4740d	`for(int i = 0; i < (int)cvalues.size(); ++i) sum += cvalues[i];`
		e4740d	`for(int i = 0; i < (int)weights.size(); ++i) sum += weights[i];`
		e4740d	`printf("%s %d: %f, %lld\n", name, mode, (t1 - t0)*1e-6, (long long)sum);`
		e4740d	`}`
		e4740d
		e4740d
		e4740d	`void run(const char *name, int threadsCount, Layout pl, Layout cl, Kernel k, long long totalLinks) {`
		e4740d	`for(int mode = 0; mode < 2; ++mode)`
		e4740d	`run(name, threadsCount, mode, pl, cl, k, totalLinks);`
		e4740d	`}`
		e4740d
		e4740d
		e4740d	`public:`
		e4740d	`void run(int threadsCount = 1) {`
		e4740d	`//printf("run benchmark: %d\n", threadsCount);`
		e4740d	`/*`
		e4740d	`run( "514x3 -> 258x24", threadsCount,`
		e4740d	`Layout(514, 514, 3).expandXY(2),`
		e4740d	`Layout(258, 258, 24).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );`
		e4740d	`run( "258x24 -> 130x48", threadsCount,`
		e4740d	`Layout(258, 258, 24).expandXY(2),`
		e4740d	`Layout(130, 130, 48).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );`
		e4740d	`run( "130x48 -> 66x96 ", threadsCount,`
		e4740d	`Layout(130, 130, 48).expandXY(2),`
		e4740d	`Layout( 66, 66, 96).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );`
		e4740d	`run( "66x96 -> 34x144", threadsCount,`
		e4740d	`Layout( 66, 66, 96).expandXY(2),`
		e4740d	`Layout(34, 34, 144).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );`
		e4740d	`run( "34x144 -> 18x216", threadsCount,`
		e4740d	`Layout(34, 34, 144).expandXY(2),`
		e4740d	`Layout(18, 18, 216).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );`
		e4740d	`run( "18x216 -> 10x324", threadsCount,`
		e4740d	`Layout(18, 18, 216).expandXY(2),`
		e4740d	`Layout(10, 10, 324).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );`
		e4740d	`run( "10x324 -> 6x486 ", threadsCount,`
		e4740d	`Layout(10, 10, 324).expandXY(2),`
		e4740d	`Layout( 6, 6, 486).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );`
		e4740d	`run( "6x486 -> 4x729 ", threadsCount,`
		e4740d	`Layout( 6, 6, 486).expandXY(2),`
		e4740d	`Layout( 4, 4, 729).expandXY(2), Kernel(4, 2, -2), 10ll*1000*1000*1000 );`
		e4740d	`*/`
		e4740d	`run( "4x768 -> 1x1093 ", threadsCount,`
		e4740d	`Layout( 4, 4, 768).expandXY(0),`
		e4740d	`Layout( 1, 1, 1093).expandXY(0), Kernel(4, 2, 0), 10ll*1000*1000*1000 );`
		e4740d	`}`
		e4740d	`};`
		e4740d
		e4740d
		e4740d
		e4740d	`#endif`

bw / helianthuslab

Source Code

Blame projects/neural/benchmark.inc.cpp