From 20cefb1a60ca4b2b9b44f4d81bdcb3558ca3d338 Mon Sep 17 00:00:00 2001 From: Ivan Mahonin Date: Jul 24 2018 17:01:00 +0000 Subject: contourgl: cl optimize6 --- diff --git a/c++/contourgl/cl/contour-base.cl b/c++/contourgl/cl/contour-base.cl index fa16b9c..6106fc6 100644 --- a/c++/contourgl/cl/contour-base.cl +++ b/c++/contourgl/cl/contour-base.cl @@ -1,5 +1,5 @@ /* - ......... 2015 Ivan Mahonin + ......... 2015-2018 Ivan Mahonin This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -27,39 +27,33 @@ kernel void clear( int width, int height, - global int4 *mark_buffer ) + global int4 *marks ) { int id = get_global_id(0); - if (id >= width*height) return; int c = id % width; - mark_buffer[id] = (int4)(0, 0, c | (c + 1), 0); + marks[id] = (int4)(0, 0, c | (c + 1), 0); } kernel void path( int width, int height, - global int *mark_buffer, + global int *marks, global float2 *points, - int begin, - int end, int4 bounds ) { int id = get_global_id(0); - if (id >= end) return; - - float2 s = { (float)width, (float)height }; - int w1 = width - 1; - int h1 = height - 1; - float2 p0 = points[id]; float2 p1 = points[id + 1]; + bool flipx = p1.x < p0.x; bool flipy = p1.y < p0.y; - if (flipx) { p0.x = s.x - p0.x; p1.x = s.x - p1.x; } - if (flipy) { p0.y = s.y - p0.y; p1.y = s.y - p1.y; } + if (flipx) { p0.x = (float)width - p0.x; p1.x = (float)width - p1.x; } + if (flipy) { p0.y = (float)height - p0.y; p1.y = (float)height - p1.y; } float2 d = p1 - p0; float kx = d.x/d.y; float ky = d.y/d.x; + int w1 = width - 1; + int h1 = height - 1; global int *row; float2 px, py, pp1; @@ -85,8 +79,10 @@ kernel void path( if (flipy) { iy = h1 - iy; cover = -cover; } ix = clamp(ix, 0, w1); - row = mark_buffer + 4*iy*width; + row = marks + 4*iy*width; atomic_add((global long*)(row + 4*ix), upsample((int)cover, (int)(area*cover))); + //atomic_add(row + 4*ix, (int)(area*cover)); + //atomic_add(row + 4*ix + 1, 
(int)cover); row += 2; iix = (ix & (ix + 1)) - 1; @@ -104,37 +100,34 @@ kernel void path( // antialiased, transparent, inverted, evenodd contours and combinations (total 16 implementations) kernel void fill( int width, - int height, - global int4 *mark_buffer, + global int4 *marks, global float4 *image, float4 color, - int4 bounds, - int invert, - int evenodd ) + int2 boundsx ) { - int id = get_global_id(0); - if (id >= height) return; - global int4 *row = mark_buffer + id*width; + int id = width*(int)get_global_id(0); + marks += id; + image += id; global int4 *mark; - global float4 *image_row = image + id*width; global float4 *pixel; + //prefetch(row + boundsx.s0, boundsx.s1 - boundsx.s0); + //prefetch(image_row + boundsx.s0, boundsx.s1 - boundsx.s0); + int4 m; float alpha; //int ialpha; - int icover = 0, c0 = bounds.s0, c1 = bounds.s0; - while(c0 < bounds.s2) { + int icover = 0, c0 = boundsx.s0, c1 = boundsx.s0; + while(c1 < boundsx.s1) { //ialpha = abs(icover); //ialpha = evenodd ? ONE - abs((ialpha % TWO) - ONE) // : min(ialpha, ONE); //if (invert) ialpha = ONE - ialpha; if (abs(icover) > HALF) while(c0 < c1) - image_row[c0++] = color; - - if (c1 >= bounds.s2) return; - - mark = &row[c1]; + image[c0++] = color; + + mark = &marks[c1]; m = *mark; *mark = (int4)(0, 0, c1 | (c1 + 1), 0); @@ -144,12 +137,11 @@ kernel void fill( //if (invert) ialpha = ONE - ialpha; alpha = (float)abs(m.x + icover)*DIV_ONE_F; - pixel = &image_row[c1]; - *pixel = (float4)( pixel->xyz*(1.f - alpha) + color.xyz*alpha, - min(pixel->w + alpha, 1.f) ); + pixel = &image[c1]; + *pixel = *pixel*(1.f - alpha) + color*alpha; - c0 = c1 + 1; - c1 = min(m.z, bounds.s2); icover += m.y; + c0 = c1 + 1; + c1 = m.z; } } diff --git a/c++/contourgl/clcontext.cpp b/c++/contourgl/clcontext.cpp index 5d1b7bb..c8b31b9 100644 --- a/c++/contourgl/clcontext.cpp +++ b/c++/contourgl/clcontext.cpp @@ -131,7 +131,7 @@ ClContext::ClContext(): // command queue cl_command_queue_properties props = 0 - | 
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE + //| CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE //| CL_QUEUE_PROFILING_ENABLE | 0; queue = clCreateCommandQueue( diff --git a/c++/contourgl/clrender.cpp b/c++/contourgl/clrender.cpp index f28d6e1..03f32a3 100644 --- a/c++/contourgl/clrender.cpp +++ b/c++/contourgl/clrender.cpp @@ -512,9 +512,8 @@ void ClRender3::send_surface(Surface *surface) { assert(!cl.err); cl.err |= clSetKernelArg(contour_fill_kernel, 0, sizeof(surface->width), &surface->width); - cl.err |= clSetKernelArg(contour_fill_kernel, 1, sizeof(surface->height), &surface->height); - cl.err |= clSetKernelArg(contour_fill_kernel, 2, sizeof(mark_buffer), &mark_buffer); - cl.err |= clSetKernelArg(contour_fill_kernel, 3, sizeof(surface_image), &surface_image); + cl.err |= clSetKernelArg(contour_fill_kernel, 1, sizeof(mark_buffer), &mark_buffer); + cl.err |= clSetKernelArg(contour_fill_kernel, 2, sizeof(surface_image), &surface_image); assert(!cl.err); wait(); @@ -578,61 +577,43 @@ void ClRender3::draw(const Path &path) { bounds.maxx = min(surface->width, path.bounds.maxx); bounds.miny = max(0, path.bounds.miny); bounds.maxy = min(surface->height, path.bounds.maxy); - int invert_int = path.invert ? 1 : 0; - int evenodd_int = path.evenodd ? 
1 : 0; if ( bounds.minx >= bounds.maxx || bounds.miny >= bounds.maxy || path.begin >= path.end ) return; - cl.err |= clSetKernelArg(contour_path_kernel, 4, sizeof(path.begin), &path.begin); - cl.err |= clSetKernelArg(contour_path_kernel, 5, sizeof(path.end), &path.end); - cl.err |= clSetKernelArg(contour_path_kernel, 6, sizeof(bounds), &bounds); - assert(!cl.err); + vec2i boundsx(bounds.minx, bounds.maxx); - cl.err |= clSetKernelArg(contour_fill_kernel, 1, sizeof(bounds.maxy), &bounds.maxy); // restrict height - cl.err |= clSetKernelArg(contour_fill_kernel, 4, sizeof(path.color), &path.color); - cl.err |= clSetKernelArg(contour_fill_kernel, 5, sizeof(bounds), &bounds); - cl.err |= clSetKernelArg(contour_fill_kernel, 6, sizeof(invert_int), &invert_int); - cl.err |= clSetKernelArg(contour_fill_kernel, 7, sizeof(evenodd_int), &evenodd_int); + cl.err |= clSetKernelArg(contour_path_kernel, 4, sizeof(bounds), &bounds); assert(!cl.err); - - cl_event path_event; - - size_t group_size = 1; + cl.err |= clSetKernelArg(contour_fill_kernel, 3, sizeof(path.color), &path.color); + cl.err |= clSetKernelArg(contour_fill_kernel, 4, sizeof(boundsx), &boundsx); + assert(!cl.err); size_t offset = path.begin; - size_t count = ((path.end - path.begin - 1)/group_size + 1)*group_size; + size_t count = path.end - path.begin; cl.err |= clEnqueueNDRangeKernel( - cl.queue, - contour_path_kernel, - 1, - &offset, - &count, - NULL,//&group_size, - prev_event ? 1 : 0, - prev_event ? 
&prev_event : NULL, - &path_event ); + cl.queue, contour_path_kernel, + 1, &offset, &count, NULL, + 0, NULL, NULL ); assert(!cl.err); offset = bounds.miny; - count = ((bounds.maxy - bounds.miny - 1)/group_size + 1)*group_size; + count = bounds.maxy - bounds.miny; cl.err |= clEnqueueNDRangeKernel( - cl.queue, - contour_fill_kernel, - 1, - &offset, - &count, - NULL,//&group_size, - 1, - &path_event, - &prev_event ); + cl.queue, contour_fill_kernel, + 1, &offset, &count, NULL, + 0, NULL, NULL ); assert(!cl.err); } void ClRender3::wait() { cl.err |= clFinish(cl.queue); assert(!cl.err); - prev_event = NULL; + if (prev_event) { + cl.err |= clReleaseEvent(prev_event); + assert(!cl.err); + prev_event = NULL; + } } diff --git a/c++/contourgl/geometry.h b/c++/contourgl/geometry.h index 88691cc..4df5f39 100644 --- a/c++/contourgl/geometry.h +++ b/c++/contourgl/geometry.h @@ -134,6 +134,10 @@ typedef vec2<float> vec2f; typedef line2<float> line2f; typedef rect<float> rectf; +typedef vec2<int> vec2i; +typedef line2<int> line2i; +typedef rect<int> recti; + class ContextRect { public: int minx, miny, maxx, maxy;