diff --git a/c++/contourgl/cl/contour-base.cl b/c++/contourgl/cl/contour-base.cl
new file mode 100644
index 0000000..d854ce3
--- /dev/null
+++ b/c++/contourgl/cl/contour-base.cl
@@ -0,0 +1,158 @@
+/*
+    ......... 2015 Ivan Mahonin
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+kernel void clear(
+	int width,
+	int height,
+	global int4 *mark_buffer )
+{
+	int id = get_global_id(0);
+	if (id >= width*height) return;
+	int c = id % width;
+	int4 v = { 0, 0, c | (c + 1), 0 };
+	mark_buffer[id] = v;
+}
+
+kernel void path(
+	int width,
+	int height,
+	global int *mark_buffer,
+	global float2 *points,
+	int begin,
+	int end )
+{
+	const float e = 1e-6f;
+	int id = get_global_id(0);
+	if (id >= end) return;
+	
+	float2 s = { (float)width, (float)height }; 
+	int w1 = width - 1;
+	int h1 = height - 1;
+
+	float2 p0 = points[id];
+	float2 p1 = points[id + 1];
+	bool flipx = p1.x < p0.x;
+	bool flipy = p1.y < p0.y;
+	if (flipx) { p0.x = s.x - p0.x; p1.x = s.x - p1.x; }
+	if (flipy) { p0.y = s.y - p0.y; p1.y = s.y - p1.y; }
+	float2 d = p1 - p0;
+	float kx = fabs(d.y) < e ? 1e10 : d.x/d.y;
+	float ky = fabs(d.x) < e ? 1e10 : d.y/d.x;
+	
+	while(p0.x != p1.x || p0.y != p1.y) {
+		int ix = (int)floor(p0.x + e);
+		int iy = (int)floor(p0.y + e);
+		if (iy > h1) break;
+
+		float2 px, py;
+		px.x = (float)(ix + 1);
+		px.y = p0.y + ky*(px.x - p0.x);
+		py.y = max((float)(iy + 1), 0.f);
+		py.x = p0.x + kx*(py.y - p0.y);
+		float2 pp1 = p1;
+		if (pp1.x > px.x) pp1 = px;
+		if (pp1.y > py.y) pp1 = py;
+		
+		if (iy >= 0) {
+			float cover = pp1.y - p0.y;
+			float area = px.x - 0.5f*(p0.x + pp1.x);
+			if (flipx) { ix = w1 - ix; area = 1.f - area; }
+			if (flipy) { iy = h1 - iy; cover = -cover; }
+			ix = clamp(ix, 0, w1);
+			global int *row = mark_buffer + 4*iy*width;
+			global int *mark = row + 4*ix;
+			atomic_add(mark, (int)round(area*cover*65536.f));
+			atomic_add(mark + 1, (int)round(cover*65536.f));
+			int iix = (ix & (ix + 1)) - 1;
+			while(iix > 0) {
+				atomic_min(row + 4*iix + 2, ix);
+				iix = (iix & (iix + 1)) - 1;
+			}
+		}
+		
+		p0 = pp1;
+	}
+}
+
+kernel void fill(
+	int width,
+	int height,
+	global int4 *mark_buffer,
+	global float4 *image,
+	float4 color,
+	int invert,
+	int evenodd )
+{
+	const int scale = 65536;
+	const int scale2 = 2*scale;
+	const int scale05 = scale/2;
+
+	int id = get_global_id(0);
+	if (id >= height) return;
+	int w1 = width - 1;
+	global int4 *row = mark_buffer + id*width;
+	global float4 *image_row = image + id*width;
+
+	int cover = 0;
+	int ialpha;
+	int2 c0 = { 0, id };
+	int2 c1 = c0;
+	int4 empty_mark = { 0, 0, 0, 0 };
+	while(c0.x < w1) {
+		int4 mark;
+		while(c1.x < width) {
+			mark = row[c1.x];
+			empty_mark.z = c1.x | (c1.x + 1);
+			row[c1.x] = empty_mark; 
+			if (mark.x || mark.y) break;
+			c1.x = min(mark.z, width);
+		}
+		
+		ialpha = abs(cover);
+		ialpha = evenodd ? scale - abs((ialpha % scale2) - scale)
+						 : min(ialpha, scale);
+		if (invert) ialpha = scale - ialpha;  
+		if (ialpha > scale05) {
+			while(c0.x < c1.x) {
+				image_row[c0.x] = color;
+				++c0.x;
+			}
+		}
+	
+		if (c1.x >= width) return;
+		
+		ialpha = abs(mark.x + cover);
+		ialpha = evenodd ? scale - abs((ialpha % scale2) - scale)
+						 : min(ialpha, scale);
+		if (invert) ialpha = scale - ialpha;  
+		if (ialpha > 4) {
+			float alpha = (float)ialpha/(float)scale;
+			float alpha_inv = 1.f - alpha;
+			global float4 *pixel = &image_row[c1.x];
+			float4 cl = *pixel;
+			cl.x = cl.x*alpha_inv + color.x*alpha;
+			cl.y = cl.y*alpha_inv + color.y*alpha;
+			cl.z = cl.z*alpha_inv + color.z*alpha;
+			cl.w = min(cl.w + alpha, 1.f);
+			*pixel = cl;
+		}
+		
+		c0.x = c1.x + 1;
+		c1.x = min(mark.z, width);
+		cover += mark.y;
+	}
+}
diff --git a/c++/contourgl/clrender.cpp b/c++/contourgl/clrender.cpp
index d4cda7a..bb079af 100644
--- a/c++/contourgl/clrender.cpp
+++ b/c++/contourgl/clrender.cpp
@@ -418,3 +418,213 @@ void ClRender2::wait() {
 }
 
 
+// ------------------------------------------------
+
+
+ClRender3::ClRender3(ClContext &cl):
+	cl(cl),
+	contour_program(),
+	contour_clear_kernel(),
+	contour_path_kernel(),
+	contour_fill_kernel(),
+	surface(),
+	points_buffer(),
+	mark_buffer(),
+	surface_image(),
+	prev_event()
+{
+	contour_program = cl.load_program("contour-base.cl");
+	assert(contour_program);
+
+	contour_clear_kernel = clCreateKernel(contour_program, "clear", &cl.err);
+	assert(!cl.err);
+	assert(contour_clear_kernel);
+
+	contour_path_kernel = clCreateKernel(contour_program, "path", &cl.err);
+	assert(!cl.err);
+	assert(contour_path_kernel);
+
+	contour_fill_kernel = clCreateKernel(contour_program, "fill", &cl.err);
+	assert(!cl.err);
+	assert(contour_fill_kernel);
+}
+
+ClRender3::~ClRender3() {
+	send_points(NULL, 0);
+	send_surface(NULL);
+
+	clReleaseKernel(contour_path_kernel);
+	clReleaseKernel(contour_fill_kernel);
+	clReleaseProgram(contour_program);
+}
+
+void ClRender3::send_surface(Surface *surface) {
+	if (this->surface) {
+		wait();
+		cl.err |= clReleaseMemObject(surface_image);
+		assert(!cl.err);
+		surface_image = NULL;
+	}
+
+	this->surface = surface;
+
+	if (this->surface) {
+		//Measure t("ClRender::send_surface");
+
+		int zero_mark[4] = { };
+
+		surface_image = clCreateBuffer(
+			cl.context, CL_MEM_READ_WRITE,
+			surface->count()*sizeof(Color), NULL,
+			&cl.err );
+		assert(!cl.err);
+		assert(surface_image);
+
+		mark_buffer = clCreateBuffer(
+			cl.context, CL_MEM_READ_WRITE,
+			surface->count()*sizeof(zero_mark), NULL,
+			&cl.err );
+		assert(!cl.err);
+		assert(mark_buffer);
+
+		cl.err |= clEnqueueWriteBuffer(
+			cl.queue, surface_image, false,
+			0, surface->count()*sizeof(Color), surface->data,
+			0, NULL, NULL );
+		assert(!cl.err);
+
+		cl.err |= clSetKernelArg(contour_clear_kernel, 0, sizeof(surface->width), &surface->width);
+		cl.err |= clSetKernelArg(contour_clear_kernel, 1, sizeof(surface->height), &surface->height);
+		cl.err |= clSetKernelArg(contour_clear_kernel, 2, sizeof(mark_buffer), &mark_buffer);
+		assert(!cl.err);
+
+		size_t count = surface->count();
+		cl.err |= clEnqueueNDRangeKernel(
+			cl.queue, contour_clear_kernel,
+			1, NULL, &count, NULL,
+			0, NULL, NULL );
+		assert(!cl.err);
+
+		cl.err |= clSetKernelArg(contour_path_kernel, 0, sizeof(surface->width), &surface->width);
+		cl.err |= clSetKernelArg(contour_path_kernel, 1, sizeof(surface->height), &surface->height);
+		cl.err |= clSetKernelArg(contour_path_kernel, 2, sizeof(mark_buffer), &mark_buffer);
+		assert(!cl.err);
+
+		cl.err |= clSetKernelArg(contour_fill_kernel, 0, sizeof(surface->width), &surface->width);
+		cl.err |= clSetKernelArg(contour_fill_kernel, 1, sizeof(surface->height), &surface->height);
+		cl.err |= clSetKernelArg(contour_fill_kernel, 2, sizeof(mark_buffer), &mark_buffer);
+		cl.err |= clSetKernelArg(contour_fill_kernel, 3, sizeof(surface_image), &surface_image);
+		assert(!cl.err);
+
+		wait();
+	}
+}
+
+Surface* ClRender3::receive_surface() {
+	if (surface) {
+		//Measure t("ClRender::receive_surface");
+
+		cl.err |= clEnqueueReadBuffer(
+			cl.queue, surface_image, CL_FALSE,
+			0, surface->count()*sizeof(Color), surface->data,
+			prev_event ? 1 : 0,
+			prev_event ? &prev_event : NULL,
+			NULL );
+		assert(!cl.err);
+
+		wait();
+	}
+	return surface;
+}
+
+void ClRender3::send_points(const vec2f *points, int count) {
+	if (points_buffer) {
+		wait();
+		cl.err |= clReleaseMemObject(points_buffer);
+		assert(!cl.err);
+		points_buffer = NULL;
+	}
+
+	if (points && count > 0) {
+		points_buffer = clCreateBuffer(
+			cl.context, CL_MEM_READ_ONLY,
+			count*sizeof(vec2f), NULL,
+			&cl.err );
+		assert(!cl.err);
+		assert(points_buffer);
+
+		cl.err |= clEnqueueWriteBuffer(
+			cl.queue, points_buffer, false,
+			0, count*sizeof(vec2f), points,
+			0, NULL, NULL );
+		assert(!cl.err);
+
+		cl.err |= clSetKernelArg(contour_path_kernel, 3, sizeof(points_buffer), &points_buffer);
+		assert(!cl.err);
+
+		wait();
+	}
+}
+
+void ClRender3::draw(const Path &path) {
+	//Measure t("ClRender::contour");
+
+	assert(surface);
+	assert(points_buffer);
+
+	int miny        = max(0, path.miny);
+	int maxy        = min(surface->height, path.maxy);
+	int invert_int  = path.invert  ? 1 : 0;
+	int evenodd_int = path.evenodd ? 1 : 0;
+	if (miny >= maxy || path.begin >= path.end) return;
+
+	cl.err |= clSetKernelArg(contour_path_kernel, 4, sizeof(path.begin), &path.begin);
+	cl.err |= clSetKernelArg(contour_path_kernel, 5, sizeof(path.begin), &path.end);
+	assert(!cl.err);
+
+	cl.err |= clSetKernelArg(contour_fill_kernel, 1, sizeof(maxy), &maxy); // restrict height
+	cl.err |= clSetKernelArg(contour_fill_kernel, 4, sizeof(path.color), &path.color);
+	cl.err |= clSetKernelArg(contour_fill_kernel, 5, sizeof(invert_int), &invert_int);
+	cl.err |= clSetKernelArg(contour_fill_kernel, 6, sizeof(evenodd_int), &evenodd_int);
+	assert(!cl.err);
+
+
+	cl_event path_event;
+
+	size_t group_size = 1;
+
+	size_t offset = path.begin;
+	size_t count = ((path.end - path.begin - 1)/group_size + 1)*group_size;
+	cl.err |= clEnqueueNDRangeKernel(
+		cl.queue,
+		contour_path_kernel,
+		1,
+		&offset,
+		&count,
+		NULL,//&group_size,
+		prev_event ? 1 : 0,
+		prev_event ? &prev_event : NULL,
+		&path_event );
+	assert(!cl.err);
+
+	offset = miny;
+	count = ((maxy - miny - 1)/group_size + 1)*group_size;
+	cl.err |= clEnqueueNDRangeKernel(
+		cl.queue,
+		contour_fill_kernel,
+		1,
+		&offset,
+		&count,
+		NULL,//&group_size,
+		1,
+		&path_event,
+		&prev_event );
+	assert(!cl.err);
+}
+
+void ClRender3::wait() {
+	cl.err |= clFinish(cl.queue);
+	assert(!cl.err);
+	prev_event = NULL;
+}
+
diff --git a/c++/contourgl/clrender.h b/c++/contourgl/clrender.h
index fd32aa5..f676fa2 100644
--- a/c++/contourgl/clrender.h
+++ b/c++/contourgl/clrender.h
@@ -99,4 +99,43 @@ public:
 };
 
 
+class ClRender3 {
+public:
+	struct Path {
+		int miny;
+		int maxy;
+		int begin;
+		int end;
+		Color color;
+		bool invert;
+		bool evenodd;
+	};
+
+private:
+	ClContext &cl;
+	cl_program contour_program;
+	cl_kernel contour_clear_kernel;
+	cl_kernel contour_path_kernel;
+	cl_kernel contour_fill_kernel;
+
+	Surface *surface;
+	cl_mem points_buffer;
+	cl_mem mark_buffer;
+	cl_mem surface_image;
+	cl_event prev_event;
+
+public:
+	ClRender3(ClContext &cl);
+	~ClRender3();
+
+	void send_surface(Surface *surface);
+	Surface* receive_surface();
+
+	void send_points(const vec2f *points, int count);
+
+	void draw(const Path &path);
+	void wait();
+};
+
+
 #endif
diff --git a/c++/contourgl/contourgl.cpp b/c++/contourgl/contourgl.cpp
index c1c1596..e2063dd 100644
--- a/c++/contourgl/contourgl.cpp
+++ b/c++/contourgl/contourgl.cpp
@@ -66,6 +66,9 @@ int main() {
 			{ Surface surface(width, height);
 			  Measure t("test_lines_cl2.tga", surface, true);
 			  Test::test_cl2(e, data, surface); }
+			{ Surface surface(width, height);
+			  Measure t("test_lines_cl3.tga", surface, true);
+			  Test::test_cl3(e, data, surface); }
 		}
 		{ Measure t("test_lines_downgrade", true); Test::downgrade(data, datalow); }
 
@@ -88,6 +91,9 @@ int main() {
 			{ Surface surface(width, height);
 			  Measure t("test_lineslow_cl2.tga", surface, true);
 			  Test::test_cl2(e, datalow, surface); }
+			{ Surface surface(width, height);
+			  Measure t("test_lineslow_cl3.tga", surface, true);
+			  Test::test_cl3(e, datalow, surface); }
 		}
 	}
 
diff --git a/c++/contourgl/test.cpp b/c++/contourgl/test.cpp
index bf221c6..634b79e 100644
--- a/c++/contourgl/test.cpp
+++ b/c++/contourgl/test.cpp
@@ -351,3 +351,66 @@ void Test::test_cl2(Environment &e, Data &data, Surface &surface) {
 	}
 	clr.receive_surface();
 }
+
+void Test::test_cl3(Environment &e, Data &data, Surface &surface) {
+	// prepare data
+	vector<ClRender3::Path> paths;
+	vector<vec2f> points;
+	paths.reserve(data.size());
+	for(Data::const_iterator i = data.begin(); i != data.end(); ++i) {
+		if (!i->contour.get_chunks().empty()) {
+			ClRender3::Path path = {};
+			path.color = i->color;
+			path.invert = i->invert;
+			path.evenodd = i->evenodd;
+
+			path.miny = path.maxy = (int)floor(i->contour.get_chunks().front().p1.y);
+			path.begin = (int)points.size();
+			points.reserve(points.size() + i->contour.get_chunks().size() + 1);
+			for(Contour::ChunkList::const_iterator j = i->contour.get_chunks().begin(); j != i->contour.get_chunks().end(); ++j) {
+				int y = (int)floor(j->p1.y);
+				if (path.miny > y) path.miny = y;
+				if (path.maxy < y) path.maxy = y;
+				points.push_back(vec2f(j->p1));
+			}
+			path.end = (int)points.size();
+			points.push_back( points[path.begin] );
+			++path.maxy;
+
+			paths.push_back(path);
+		}
+	}
+
+	// draw
+
+	ClRender3 clr(e.cl);
+
+	// warm-up
+	{
+		clr.send_surface(&surface); clr.send_points(&points.front(), (int)points.size());
+		for(int ii = 0; ii < 100; ++ii)
+			for(vector<ClRender3::Path>::const_iterator i = paths.begin(); i != paths.end(); ++i)
+				clr.draw(*i);
+		clr.wait(); clr.send_points(NULL, 0); clr.send_surface(NULL);
+	}
+
+	// measure
+	/*{
+		clr.send_surface(&surface); clr.send_points(&points.front(), (int)points.size());
+		for(int ii = 0; ii < 100; ++ii)
+			for(vector<ClRender3::Path>::const_iterator i = paths.begin(); i != paths.end(); ++i)
+				clr.draw(*i);
+		clr.wait(); clr.send_points(NULL, 0); clr.send_surface(NULL);
+	}*/
+
+	// actual task
+	clr.send_surface(&surface);
+	clr.send_points(&points.front(), (int)points.size());
+	{
+		Measure t("render");
+		for(vector<ClRender3::Path>::const_iterator i = paths.begin(); i != paths.end(); ++i)
+			clr.draw(*i);
+		clr.wait();
+	}
+	clr.receive_surface();
+}
diff --git a/c++/contourgl/test.h b/c++/contourgl/test.h
index b8437ff..6d3d3dd 100644
--- a/c++/contourgl/test.h
+++ b/c++/contourgl/test.h
@@ -55,6 +55,7 @@ public:
 	static void test_sw(Environment &e, Data &data, Surface &surface);
 	static void test_cl(Environment &e, Data &data, Surface &surface);
 	static void test_cl2(Environment &e, Data &data, Surface &surface);
+	static void test_cl3(Environment &e, Data &data, Surface &surface);
 };
 
 #endif