From f14ea7090a5602ce63d3ea927ff5a58b0de84133 Mon Sep 17 00:00:00 2001 From: Ivan Mahonin Date: Jul 21 2018 15:02:48 +0000 Subject: contourgl: cl optimize3 --- diff --git a/c++/contourgl/cl/contour-base.cl b/c++/contourgl/cl/contour-base.cl index 4dd14ea..d359393 100644 --- a/c++/contourgl/cl/contour-base.cl +++ b/c++/contourgl/cl/contour-base.cl @@ -15,7 +15,6 @@ along with this program. If not, see . */ -#define PRECISION 1e-6f #define ONE 65536 #define TWO 131072 // (ONE)*2 #define HALF 32768 // (ONE)/2 @@ -31,8 +30,7 @@ kernel void clear( int id = get_global_id(0); if (id >= width*height) return; int c = id % width; - int4 v = { 0, 0, c | (c + 1), 0 }; - mark_buffer[id] = v; + mark_buffer[id] = (int4)(0, 0, c | (c + 1), 0); } kernel void path( @@ -41,7 +39,8 @@ kernel void path( global int *mark_buffer, global float2 *points, int begin, - int end ) + int end, + int4 bounds ) { int id = get_global_id(0); if (id >= end) return; @@ -57,8 +56,8 @@ kernel void path( if (flipx) { p0.x = s.x - p0.x; p1.x = s.x - p1.x; } if (flipy) { p0.y = s.y - p0.y; p1.y = s.y - p1.y; } float2 d = p1 - p0; - float kx = fabs(d.y) < PRECISION ? 1e10 : d.x/d.y; - float ky = fabs(d.x) < PRECISION ? 1e10 : d.y/d.x; + float kx = d.x/d.y; + float ky = d.y/d.x; global int *row, *mark; float2 px, py, pp1; @@ -67,32 +66,33 @@ kernel void path( while(p0.x != p1.x || p0.y != p1.y) { ix = (int)p0.x; - iy = (int)p0.y; - if (iy > h1) break; + iy = max((int)p0.y, 0); + if (iy > h1) return; px.x = (float)(ix + 1); px.y = p0.y + ky*(px.x - p0.x); - py.y = max((float)(iy + 1), 0.f); + py.y = (float)(iy + 1); py.x = p0.x + kx*(py.y - p0.y); pp1 = p1; if (pp1.x > px.x) pp1 = px; if (pp1.y > py.y) pp1 = py; - if (iy >= 0) { - cover = pp1.y - p0.y; - area = px.x - 0.5f*(p0.x + pp1.x); - if (flipx) { ix = w1 - ix; area = 1.f - area; } - if (flipy) { iy = h1 - iy; cover = -cover; } - ix = clamp(ix, 0, w1); - row = mark_buffer + 4*iy*width; - mark = row + 4*ix; - atomic_add(mark, (int)(area*cover*ONE_F)); - atomic_add(mark + 1, (int)(cover*ONE_F)); - iix = (ix & (ix + 1)) - 1; - while(iix >= 0) { - atomic_min(row + 4*iix + 2, ix); - iix = (iix & (iix + 1)) - 1; - } + cover = (pp1.y - p0.y)*ONE_F; + area = px.x - 0.5f*(p0.x + pp1.x); + if (flipx) { ix = w1 - ix; area = 1.f - area; } + if (flipy) { iy = h1 - iy; cover = -cover; } + ix = clamp(ix, 0, w1); + + row = mark_buffer + 4*iy*width; + mark = row + 4*ix; + atomic_add(mark, (int)(area*cover)); + atomic_add(mark + 1, (int)cover); + + row += 2; + iix = (ix & (ix + 1)) - 1; + while(iix >= bounds.s0) { + atomic_min(row + 4*iix, ix); + iix = (iix & (iix + 1)) - 1; } p0 = pp1; @@ -108,6 +108,7 @@ kernel void fill( global int4 *mark_buffer, global float4 *image, float4 color, + int4 bounds, int invert, int evenodd ) { @@ -118,15 +119,12 @@ kernel void fill( global float4 *image_row = image + id*width; global float4 *pixel; - int icover = 0; - //int ialpha; - int c0 = 0; - int c1 = 0; - int i = 0; - float alpha; int4 m; - while(c0 < width) { - c1 = min(c1, width); + float alpha; + //int ialpha; + int icover = 0, c0 = bounds.s0, c1 = bounds.s0; + while(c0 < bounds.s2) { + c1 = min(c1, bounds.s2); mark = &row[c1]; m = *mark; *mark = (int4)(0, 0, c1 | (c1 + 1), 0); @@ -139,7 +137,7 @@ kernel void fill( while(c0 < c1) image_row[c0++] = color; - if (c1 >= width) return; + if (c1 >= bounds.s2) return; //ialpha = abs(mark.x + icover); //ialpha = evenodd ? ONE - abs((ialpha % TWO) - ONE) diff --git a/c++/contourgl/clrender.cpp b/c++/contourgl/clrender.cpp index e72bc79..f28d6e1 100644 --- a/c++/contourgl/clrender.cpp +++ b/c++/contourgl/clrender.cpp @@ -573,20 +573,27 @@ void ClRender3::draw(const Path &path) { assert(surface); assert(points_buffer); - int miny = max(0, path.miny); - int maxy = min(surface->height, path.maxy); + ContextRect bounds; + bounds.minx = max(1, path.bounds.minx); + bounds.maxx = min(surface->width, path.bounds.maxx); + bounds.miny = max(0, path.bounds.miny); + bounds.maxy = min(surface->height, path.bounds.maxy); int invert_int = path.invert ? 1 : 0; int evenodd_int = path.evenodd ? 1 : 0; - if (miny >= maxy || path.begin >= path.end) return; + if ( bounds.minx >= bounds.maxx + || bounds.miny >= bounds.maxy + || path.begin >= path.end ) return; cl.err |= clSetKernelArg(contour_path_kernel, 4, sizeof(path.begin), &path.begin); cl.err |= clSetKernelArg(contour_path_kernel, 5, sizeof(path.end), &path.end); + cl.err |= clSetKernelArg(contour_path_kernel, 6, sizeof(bounds), &bounds); assert(!cl.err); - cl.err |= clSetKernelArg(contour_fill_kernel, 1, sizeof(maxy), &maxy); // restrict height + cl.err |= clSetKernelArg(contour_fill_kernel, 1, sizeof(bounds.maxy), &bounds.maxy); // restrict height cl.err |= clSetKernelArg(contour_fill_kernel, 4, sizeof(path.color), &path.color); - cl.err |= clSetKernelArg(contour_fill_kernel, 5, sizeof(invert_int), &invert_int); - cl.err |= clSetKernelArg(contour_fill_kernel, 6, sizeof(evenodd_int), &evenodd_int); + cl.err |= clSetKernelArg(contour_fill_kernel, 5, sizeof(bounds), &bounds); + cl.err |= clSetKernelArg(contour_fill_kernel, 6, sizeof(invert_int), &invert_int); + cl.err |= clSetKernelArg(contour_fill_kernel, 7, sizeof(evenodd_int), &evenodd_int); assert(!cl.err); @@ -608,8 +615,8 @@ void ClRender3::draw(const Path &path) { &path_event ); assert(!cl.err); - offset = miny; - count = ((maxy - miny - 1)/group_size + 1)*group_size; + offset = bounds.miny; + count = ((bounds.maxy - bounds.miny - 1)/group_size + 1)*group_size; cl.err |= clEnqueueNDRangeKernel( cl.queue, contour_fill_kernel, diff --git a/c++/contourgl/clrender.h b/c++/contourgl/clrender.h index f676fa2..b862853 100644 --- a/c++/contourgl/clrender.h +++ b/c++/contourgl/clrender.h @@ -102,8 +102,7 @@ public: class ClRender3 { public: struct Path { - int miny; - int maxy; + ContextRect bounds; int begin; int end; Color color; diff --git a/c++/contourgl/test.cpp b/c++/contourgl/test.cpp index 506ba21..b0434f3 100644 --- a/c++/contourgl/test.cpp +++ b/c++/contourgl/test.cpp @@ -364,18 +364,23 @@ void Test::test_cl3(Environment &e, Data &data, Surface &surface) { path.invert = i->invert; path.evenodd = i->evenodd; - path.miny = path.maxy = (int)floor(i->contour.get_chunks().front().p1.y); + path.bounds.minx = path.bounds.maxx = (int)floor(i->contour.get_chunks().front().p1.x); + path.bounds.miny = path.bounds.maxy = (int)floor(i->contour.get_chunks().front().p1.y); path.begin = (int)points.size(); points.reserve(points.size() + i->contour.get_chunks().size() + 1); for(Contour::ChunkList::const_iterator j = i->contour.get_chunks().begin(); j != i->contour.get_chunks().end(); ++j) { + int x = (int)floor(j->p1.x); int y = (int)floor(j->p1.y); - if (path.miny > y) path.miny = y; - if (path.maxy < y) path.maxy = y; + if (path.bounds.minx > x) path.bounds.minx = x; + if (path.bounds.maxx < x) path.bounds.maxx = x; + if (path.bounds.miny > y) path.bounds.miny = y; + if (path.bounds.maxy < y) path.bounds.maxy = y; points.push_back(vec2f(j->p1)); } path.end = (int)points.size(); points.push_back( points[path.begin] ); - ++path.maxy; + ++path.bounds.maxx; + ++path.bounds.maxy; paths.push_back(path); }