diff --git a/c++/contourgl/cl/contour-base.cl b/c++/contourgl/cl/contour-base.cl index b6e4eca..ff0322e 100644 --- a/c++/contourgl/cl/contour-base.cl +++ b/c++/contourgl/cl/contour-base.cl @@ -31,13 +31,13 @@ kernel void clear( { int id = get_global_id(0); int c = id % width; - marks[id] = (int4)(0, 0, c | (c + 1), 0); + marks[id] = (int4)(0, 0, 0, 0); } kernel void path( int width, int height, - global int *marks, + global long *marks, global float2 *points, int end, int minx ) @@ -49,51 +49,38 @@ kernel void path( bool flipx = p1.x < p0.x; bool flipy = p1.y < p0.y; - if (flipx) { p0.x = (float)width - p0.x; p1.x = (float)width - p1.x; } + if (flipx) { p0.x = (float)width - p0.x; p1.x = (float)width - p1.x; } if (flipy) { p0.y = (float)height - p0.y; p1.y = (float)height - p1.y; } float2 d = p1 - p0; - float kx = d.x/d.y; - float ky = d.y/d.x; int w1 = width - 1; int h1 = height - 1; - - global int *row; - float2 px, py, pp1; - float cover, area; - int ix, iy, iix; + float kx = d.x/d.y; + float ky = d.y/d.x; while(p0.x != p1.x || p0.y != p1.y) { - ix = (int)p0.x; - iy = max((int)p0.y, 0); + int iy = max((int)p0.y, 0); + int ix = (int)p0.x; if (iy > h1) return; + float2 px, py; px.x = (float)(ix + 1); - px.y = p0.y + ky*(px.x - p0.x); py.y = (float)(iy + 1); + ix = clamp(ix, 0, w1); + + px.y = p0.y + ky*(px.x - p0.x); py.x = p0.x + kx*(py.y - p0.y); - pp1 = p1; + + float2 pp1 = p1; if (pp1.x > px.x) pp1 = px; if (pp1.y > py.y) pp1 = py; - cover = (pp1.y - p0.y)*ONE_F; - area = px.x - 0.5f*(p0.x + pp1.x); - if (flipx) { ix = w1 - ix; area = 1.f - area; } + float cover = (pp1.y - p0.y)*ONE_F; + float area = px.x - 0.5f*(p0.x + pp1.x); if (flipy) { iy = h1 - iy; cover = -cover; } - ix = clamp(ix, 0, w1); - - row = marks + 4*iy*width; - atomic_add((global long*)(row + 4*ix), upsample((int)cover, (int)(area*cover))); - //atomic_add(row + 4*ix, (int)(area*cover)); - //atomic_add(row + 4*ix + 1, (int)cover); - - row += 2; - iix = (ix & (ix + 1)) - 1; - while(iix >= minx) { - atomic_min(row + 4*iix, ix); - iix = (iix & (iix + 1)) - 1; - } - + if (flipx) { ix = w1 - ix; area = 1.f - area; } p0 = pp1; + + atomic_add(marks + ix*height + iy, upsample((int)cover, (int)(area*cover))); } } @@ -101,50 +88,28 @@ kernel void path( // different implementations for: // antialiased, transparent, inverted, evenodd contours and combinations (total 16 implementations) kernel void fill( - int width, - global int4 *marks, + int height, + global int2 *marks, global float4 *image, float4 color, int4 bounds ) { if (get_global_id(0) >= bounds.s3) return; - int id = width*(int)get_global_id(0); + int id = (int)get_global_id(0) + bounds.s0*height; marks += id; image += id; - global int4 *mark; - global float4 *pixel; - - //prefetch(row + bounds.s0, bounds.s2 - bounds.s0); - //prefetch(image_row + bounds.s0, bounds.s2 - bounds.s0); - - int4 m; - float alpha; - //int ialpha; - int icover = 0, c0 = bounds.s0, c1 = bounds.s0; - while(c1 < bounds.s2) { - //ialpha = abs(icover); - //ialpha = evenodd ? ONE - abs((ialpha % TWO) - ONE) - // : min(ialpha, ONE); - //if (invert) ialpha = ONE - ialpha; - if (abs(icover) > HALF) - while(c0 < c1) - image[c0++] = color; - mark = &marks[c1]; - m = *mark; - *mark = (int4)(0, 0, c1 | (c1 + 1), 0); - - //ialpha = abs(mark.x + icover); - //ialpha = evenodd ? ONE - abs((ialpha % TWO) - ONE) - // : min(ialpha, ONE); - //if (invert) ialpha = ONE - ialpha; + int icover = 0; + while(true) { + int2 m = *marks; + *marks = (int2)(0, 0); + float alpha = (float)abs(m.x + icover)*color.w*DIV_ONE_F; + marks += height; - alpha = (float)abs(m.x + icover)*DIV_ONE_F; - pixel = &image[c1]; - *pixel = *pixel*(1.f - alpha) + color*alpha; - icover += m.y; - c0 = c1 + 1; - c1 = m.z; + *image = *image*(1.f - alpha) + color*alpha; + + if (++bounds.s0 >= bounds.s2) return; + image += height; } } diff --git a/c++/contourgl/clcontext.cpp b/c++/contourgl/clcontext.cpp index c8b31b9..f98d0eb 100644 --- a/c++/contourgl/clcontext.cpp +++ b/c++/contourgl/clcontext.cpp @@ -155,7 +155,7 @@ cl_program ClContext::load_program(const std::string &filename) { cl_program program = clCreateProgramWithSource(context, 1, &text_pointer, NULL, NULL); assert(program); - const char options[] = " -cl-fast-relaxed-math -Werror "; + const char options[] = " -Werror "; err = clBuildProgram(program, 1, &device, options, NULL, NULL); if (err) { diff --git a/c++/contourgl/clrender.cpp b/c++/contourgl/clrender.cpp index 587b703..9a058c6 100644 --- a/c++/contourgl/clrender.cpp +++ b/c++/contourgl/clrender.cpp @@ -511,7 +511,7 @@ void ClRender3::send_surface(Surface *surface) { cl.err |= clSetKernelArg(contour_path_kernel, 2, sizeof(mark_buffer), &mark_buffer); assert(!cl.err); - cl.err |= clSetKernelArg(contour_fill_kernel, 0, sizeof(surface->width), &surface->width); + cl.err |= clSetKernelArg(contour_fill_kernel, 0, sizeof(surface->height), &surface->height); cl.err |= clSetKernelArg(contour_fill_kernel, 1, sizeof(mark_buffer), &mark_buffer); cl.err |= clSetKernelArg(contour_fill_kernel, 2, sizeof(surface_image), &surface_image); assert(!cl.err); @@ -595,7 +595,7 @@ void ClRender3::draw(const Path &path) { offset = path.begin; count = path.end - path.begin - 1; - group_size = 8; + group_size = 128; count = ((count - 1)/group_size + 1)*group_size; cl.err |= clEnqueueNDRangeKernel( @@ -606,7 +606,7 @@ void ClRender3::draw(const Path &path) { offset = bounds.miny; count = bounds.maxy - bounds.miny; - group_size = 3; + group_size = 16; count = ((count - 1)/group_size + 1)*group_size; cl.err |= clEnqueueNDRangeKernel( diff --git a/c++/contourgl/test.cpp b/c++/contourgl/test.cpp index e32b29c..2416e07 100644 --- a/c++/contourgl/test.cpp +++ b/c++/contourgl/test.cpp @@ -381,6 +381,7 @@ void Test::test_cl2(Environment &e, Data &data, Surface &surface) { void Test::test_cl3(Environment &e, Data &data, Surface &surface) { // prepare data + int align = (1024 - 1)/sizeof(vec2f) + 1; vector paths; vector points; paths.reserve(data.size()); @@ -405,7 +406,7 @@ void Test::test_cl3(Environment &e, Data &data, Surface &surface) { points.push_back(vec2f(j->p1)); } path.end = (int)points.size(); - points.push_back( points[path.begin] ); + do { points.push_back( points[path.begin] ); } while(points.size() % align); ++path.bounds.maxx; ++path.bounds.maxy;