From 6fa0096079c5fc9d6f9d53451f443b733cd11a78 Mon Sep 17 00:00:00 2001
From: Ivan Mahonin <bh@icystar.com>
Date: Jul 26 2018 17:54:31 +0000
Subject: contourgl: CUDA implementation


---

diff --git a/c++/contourgl/Makefile b/c++/contourgl/Makefile
index 09ef1f2..c8d388c 100644
--- a/c++/contourgl/Makefile
+++ b/c++/contourgl/Makefile
@@ -43,7 +43,9 @@ SOURCES = \
 	utils.cpp
 
 ifdef CUDA
-	SOURCES += cudacontext.cpp
+	SOURCES += \
+		cudacontext.cpp \
+		cudarender.cpp
 	CUDA_SOURCES = \
 		cuda/contour.cu \
 		cuda/hello.cu
diff --git a/c++/contourgl/contourgl.cpp b/c++/contourgl/contourgl.cpp
index 06ddd49..9c24d70 100644
--- a/c++/contourgl/contourgl.cpp
+++ b/c++/contourgl/contourgl.cpp
@@ -98,10 +98,13 @@ int main() {
 			{ Surface surface(width, height);
 			  Measure t("test_lineslow_cl2.tga", surface, true);
 			  Test::test_cl2(e, datalow, surface); }
-			*/
 			{ Surface surface(width, height);
 			  Measure t("test_lineslow_cl3.tga", surface, true);
 			  Test::test_cl3(e, datalow, surface); }
+			*/
+			{ Surface surface(width, height);
+			  Measure t("test_lineslow_cu.tga", surface, true);
+			  Test::test_cu(e, datalow, surface); }
 		}
 	}
 
diff --git a/c++/contourgl/cuda/contour.cu b/c++/contourgl/cuda/contour.cu
index e69de29..db537b2 100644
--- a/c++/contourgl/cuda/contour.cu
+++ b/c++/contourgl/cuda/contour.cu
@@ -0,0 +1,151 @@
+/*
+    ......... 2018 Ivan Mahonin
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+extern "C" {
+
+#define ONE       65536
+#define TWO      131072               // (ONE)*2
+#define HALF      32768               // (ONE)/2
+#define ONE_F     65536.f             // (float)(ONE)
+#define DIV_ONE_F 0.0000152587890625f // 1.f/(ONE_F)
+
+
+__global__ void clear(
+	int width,
+	int height,
+	int4 *marks )
+{
+	int id = blockIdx.x*blockDim.x + threadIdx.x;
+	int c = id % width;
+	marks[id] = make_int4(0, 0, c | (c + 1), 0);
+}
+
+__global__ void path(
+	int width,
+	int height,
+	int *marks,
+	const float2 *points,
+	int begin,
+	int end,
+	int minx )
+{
+	int id = blockIdx.x*blockDim.x + threadIdx.x + begin;
+	if (id >= end) return;
+	float2 p0 = points[id];
+	float2 p1 = points[id + 1];
+	
+	bool flipx = p1.x < p0.x;
+	bool flipy = p1.y < p0.y;
+	if (flipx) { p0.x = (float)width - p0.x; p1.x = (float)width - p1.x; }
+	if (flipy) { p0.y = (float)height - p0.y; p1.y = (float)height - p1.y; }
+	
+	float2 d;
+	d.x = p1.x - p0.x;
+	d.y = p1.y - p0.y;
+	float kx = d.x/d.y;
+	float ky = d.y/d.x;
+	int w1 = width - 1;
+	int h1 = height - 1;
+	
+	while(p0.x != p1.x || p0.y != p1.y) {
+		int ix = (int)p0.x;
+		int iy = max((int)p0.y, 0);
+		if (iy > h1) return;
+
+		float2 px, py;
+		px.x = (float)(ix + 1);
+		py.y = (float)(iy + 1);
+		ix = max(0, min(w1, ix));
+		
+		px.y = p0.y + ky*(px.x - p0.x);
+		py.x = p0.x + kx*(py.y - p0.y);
+
+		float2 pp1 = p1;
+		if (pp1.x > px.x) pp1 = px;
+		if (pp1.y > py.y) pp1 = py;
+		
+		float cover = (pp1.y - p0.y)*ONE_F;
+		float area = px.x - 0.5f*(p0.x + pp1.x);
+		if (flipx) { ix = w1 - ix; area = 1.f - area; }
+		if (flipy) { iy = h1 - iy; cover = -cover; }
+		
+		int *row = marks + 4*iy*width;
+		atomicAdd(
+			(unsigned long long*)(row + 4*ix),
+			((unsigned long long)(unsigned int)(int)(cover) << 32)
+			| (unsigned long long)(unsigned int)((int)(area*cover)) );
+		//row[4*ix] += (int)(area*cover);
+		//row[4*ix + 1] += (int)(cover);
+		//atomicAdd(row + 4*ix, (int)(area*cover));
+		//atomicAdd(row + 4*ix + 1, (int)(cover));
+		
+		row += 2;
+		int iix = (ix & (ix + 1)) - 1;
+		while(iix >= minx) {
+			atomicMin(row + 4*iix, ix);
+			iix = (iix & (iix + 1)) - 1;
+		}
+		
+		p0 = pp1;
+	}
+}
+
+__global__ void fill(
+	int width,
+	int4 *marks,
+	float4 *image,
+	float4 color,
+	int4 bounds )
+{
+	int id = blockIdx.x*blockDim.x + threadIdx.x + bounds.y;
+	if (id >= bounds.w) return;
+	id *= width;
+	marks += id;
+	image += id;
+	
+	int4 *mark;
+	float4 *pixel;
+
+	int4 m;
+	int icover = 0, c0 = bounds.x, c1 = bounds.x;
+	while(c1 < bounds.z) {
+		if (abs(icover) > HALF)
+			while(c0 < c1)
+				image[c0++] = color;
+
+		mark = &marks[c1];
+		m = *mark;
+		*mark = make_int4(0, 0, c1 | (c1 + 1), 0); 
+		
+		float alpha = (float)abs(m.x + icover)*DIV_ONE_F;
+		float one_alpha = 1.f - alpha;
+		
+		pixel = &image[c1];
+		float4 p = *pixel;
+		p.x = p.x*one_alpha + color.x*alpha;
+		p.y = p.y*one_alpha + color.y*alpha;
+		p.z = p.z*one_alpha + color.z*alpha;
+		p.w = p.w*one_alpha + color.w*alpha;
+		*pixel = p;
+		
+		icover += m.y;
+		c0 = c1 + 1;
+		c1 = m.z;
+	}
+}
+
+}
\ No newline at end of file
diff --git a/c++/contourgl/cudacontext.cpp b/c++/contourgl/cudacontext.cpp
index 3901098..d9d978d 100644
--- a/c++/contourgl/cudacontext.cpp
+++ b/c++/contourgl/cudacontext.cpp
@@ -86,7 +86,7 @@ CudaContext::CudaContext():
     err = cuCtxCreate(&context, CU_CTX_SCHED_AUTO, device);
     assert(!err);
 
-    hello();
+    //hello();
 }
 
 CudaContext::~CudaContext() {
diff --git a/c++/contourgl/cudarender.cpp b/c++/contourgl/cudarender.cpp
new file mode 100644
index 0000000..70eff45
--- /dev/null
+++ b/c++/contourgl/cudarender.cpp
@@ -0,0 +1,193 @@
+/*
+    ......... 2018 Ivan Mahonin
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <cassert>
+
+#include <algorithm>
+#include <iostream>
+
+#include "cudarender.h"
+#include "measure.h"
+
+
+using namespace std;
+
+
+CudaRender::CudaRender(CudaContext &cu):
+	cu(cu),
+	contour_module(),
+	contour_clear_kernel(),
+	contour_path_kernel(),
+	contour_fill_kernel(),
+	surface(),
+	points_buffer(),
+	mark_buffer(),
+	surface_image()
+{
+	cu.err = cuModuleLoad(&contour_module, "cuda/contour.ptx");
+	assert(!cu.err);
+
+	cu.err = cuModuleGetFunction(&contour_clear_kernel, contour_module, "clear");
+	assert(!cu.err);
+
+	cu.err = cuModuleGetFunction(&contour_path_kernel, contour_module, "path");
+	assert(!cu.err);
+
+	cu.err = cuModuleGetFunction(&contour_fill_kernel, contour_module, "fill");
+	assert(!cu.err);
+}
+
+CudaRender::~CudaRender() {
+	send_points(NULL, 0);
+	send_surface(NULL);
+
+	cu.err = cuModuleUnload(contour_module);
+	assert(!cu.err);
+}
+
+void CudaRender::send_surface(Surface *surface) {
+	if (this->surface) {
+		wait();
+
+		cu.err = cuMemFree(surface_image);
+		assert(!cu.err);
+		surface_image = 0;
+
+		cu.err = cuMemFree(mark_buffer);
+		assert(!cu.err);
+		mark_buffer = 0;
+	}
+
+	this->surface = surface;
+
+	if (this->surface) {
+		int zero_mark[4] = { };
+
+		cu.err = cuMemAlloc(&surface_image, surface->data_size());
+		assert(!cu.err);
+
+		cu.err = cuMemcpyHtoD(surface_image, surface->data, surface->data_size());
+		assert(!cu.err);
+
+		cu.err = cuMemAlloc(&mark_buffer, surface->count()*sizeof(zero_mark));
+		assert(!cu.err);
+
+		size_t group_size = 32;
+		size_t count = surface->count();
+		cu.err = cuLaunchKernel(
+			contour_clear_kernel,
+			(count - 1)/group_size + 1, 1, 1,
+			group_size, 1, 1,
+			0, 0, 0,
+			CudaParams()
+				.add(surface->width)
+				.add(surface->height)
+				.add(mark_buffer)
+				.get_extra() );
+		assert(!cu.err);
+
+		wait();
+	}
+}
+
+Surface* CudaRender::receive_surface() {
+	if (surface) {
+		wait();
+		cu.err = cuMemcpyDtoH(surface->data, surface_image, surface->data_size());
+		assert(!cu.err);
+	}
+	return surface;
+}
+
+void CudaRender::send_points(const vec2f *points, int count) {
+	if (points_buffer) {
+		wait();
+		cu.err = cuMemFree(points_buffer);
+		assert(!cu.err);
+		points_buffer = 0;
+	}
+
+	if (points && count > 0) {
+		cu.err = cuMemAlloc(&points_buffer, count*sizeof(vec2f));
+		assert(!cu.err);
+
+		cu.err = cuMemcpyHtoD(points_buffer, points, count*sizeof(vec2f));
+		assert(!cu.err);
+	}
+}
+
+void CudaRender::draw(const Path &path) {
+	assert(surface);
+	assert(points_buffer);
+
+	ContextRect bounds;
+	bounds.minx = max(1, path.bounds.minx);
+	bounds.maxx = min(surface->width, path.bounds.maxx);
+	bounds.miny = max(0, path.bounds.miny);
+	bounds.maxy = min(surface->height, path.bounds.maxy);
+	if ( bounds.minx >= bounds.maxx
+	  || bounds.miny >= bounds.maxy
+	  || path.begin >= path.end ) return;
+
+	vec2i boundsx(bounds.minx, bounds.maxx);
+
+	size_t group_size, count;
+
+	count = path.end - path.begin;
+	group_size = 8;
+
+	count = (count - 1)/group_size + 1;
+	cu.err = cuLaunchKernel(
+		contour_path_kernel,
+		count, 1, 1,
+		group_size, 1, 1,
+		0, 0, 0,
+		CudaParams()
+			.add(surface->width)
+			.add(surface->height)
+			.add(mark_buffer)
+			.add(points_buffer)
+			.add(path.begin)
+			.add(path.end)
+			.add(bounds.minx)
+			.get_extra() );
+	assert(!cu.err);
+
+	count = bounds.maxy - bounds.miny;
+	group_size = 1;
+
+	count = (count - 1)/group_size + 1;
+	cu.err = cuLaunchKernel(
+		contour_fill_kernel,
+		count, 1, 1,
+		group_size, 1, 1,
+		0, 0, 0,
+		CudaParams()
+			.add(surface->width)
+			.add(mark_buffer)
+			.add(surface_image)
+			.add(path.color, 16)
+			.add(bounds, 16)
+			.get_extra() );
+	assert(!cu.err);
+}
+
+void CudaRender::wait() {
+	cu.err = cuStreamSynchronize(0);
+	assert(!cu.err);
+}
+
diff --git a/c++/contourgl/cudarender.h b/c++/contourgl/cudarender.h
new file mode 100644
index 0000000..7eae406
--- /dev/null
+++ b/c++/contourgl/cudarender.h
@@ -0,0 +1,66 @@
+/*
+    ......... 2018 Ivan Mahonin
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CUDARENDER_H_
+#define _CUDARENDER_H_
+
+#include <vector>
+
+#include "cudacontext.h"
+#include "geometry.h"
+#include "contour.h"
+#include "swrender.h"
+
+
+class CudaRender {
+public:
+	struct Path {
+		ContextRect bounds;
+		int begin;
+		int end;
+		Color color;
+		bool invert;
+		bool evenodd;
+	};
+
+private:
+	CudaContext &cu;
+	CUmodule contour_module;
+	CUfunction contour_clear_kernel;
+	CUfunction contour_path_kernel;
+	CUfunction contour_fill_kernel;
+
+	Surface *surface;
+	CUdeviceptr points_buffer;
+	CUdeviceptr mark_buffer;
+	CUdeviceptr surface_image;
+
+public:
+	CudaRender(CudaContext &cl);
+	~CudaRender();
+
+	void send_surface(Surface *surface);
+	Surface* receive_surface();
+
+	void send_points(const vec2f *points, int count);
+
+	void draw(const Path &path);
+	void wait();
+};
+
+
+#endif
diff --git a/c++/contourgl/test.cpp b/c++/contourgl/test.cpp
index fc75a99..e32b29c 100644
--- a/c++/contourgl/test.cpp
+++ b/c++/contourgl/test.cpp
@@ -26,6 +26,9 @@
 #include "utils.h"
 #include "clrender.h"
 
+#ifdef CUDA
+#include "cudarender.h"
+#endif
 
 using namespace std;
 
@@ -444,3 +447,74 @@ void Test::test_cl3(Environment &e, Data &data, Surface &surface) {
 	}
 	clr.receive_surface();
 }
+
+void Test::test_cu(Environment &e, Data &data, Surface &surface) {
+#ifdef CUDA
+	// prepare data
+	vector<CudaRender::Path> paths;
+	vector<vec2f> points;
+	paths.reserve(data.size());
+	for(Data::const_iterator i = data.begin(); i != data.end(); ++i) {
+		if (!i->contour.get_chunks().empty()) {
+			CudaRender::Path path = {};
+			path.color = i->color;
+			path.invert = i->invert;
+			path.evenodd = i->evenodd;
+
+			path.bounds.minx = path.bounds.maxx = (int)floor(i->contour.get_chunks().front().p1.x);
+			path.bounds.miny = path.bounds.maxy = (int)floor(i->contour.get_chunks().front().p1.y);
+			path.begin = (int)points.size();
+			points.reserve(points.size() + i->contour.get_chunks().size() + 1);
+			for(Contour::ChunkList::const_iterator j = i->contour.get_chunks().begin(); j != i->contour.get_chunks().end(); ++j) {
+				int x = (int)floor(j->p1.x);
+				int y = (int)floor(j->p1.y);
+				if (path.bounds.minx > x) path.bounds.minx = x;
+				if (path.bounds.maxx < x) path.bounds.maxx = x;
+				if (path.bounds.miny > y) path.bounds.miny = y;
+				if (path.bounds.maxy < y) path.bounds.maxy = y;
+				points.push_back(vec2f(j->p1));
+			}
+			path.end = (int)points.size();
+			points.push_back( points[path.begin] );
+			++path.bounds.maxx;
+			++path.bounds.maxy;
+
+			paths.push_back(path);
+		}
+	}
+
+	// draw
+
+	CudaRender cur(e.cu);
+
+	// warm-up
+	cur.send_surface(&surface);
+	cur.send_points(&points.front(), (int)points.size());
+	for(int ii = 0; ii < 1000; ++ii)
+		for(vector<CudaRender::Path>::const_iterator i = paths.begin(); i != paths.end(); ++i)
+			cur.draw(*i);
+	cur.wait();
+
+	// measure
+	{
+		for(int ii = 0; ii < 1000; ++ii) {
+			Measure t("render", false, true);
+			for(vector<CudaRender::Path>::const_iterator i = paths.begin(); i != paths.end(); ++i)
+				cur.draw(*i);
+			cur.wait();
+		}
+	}
+	cur.send_points(NULL, 0);
+	cur.send_surface(NULL);
+
+	// actual task
+	cur.send_surface(&surface);
+	cur.send_points(&points.front(), (int)points.size());
+	{
+		for(vector<CudaRender::Path>::const_iterator i = paths.begin(); i != paths.end(); ++i)
+			cur.draw(*i);
+		cur.wait();
+	}
+	cur.receive_surface();
+#endif
+}
diff --git a/c++/contourgl/test.h b/c++/contourgl/test.h
index 6d3d3dd..a96d7c8 100644
--- a/c++/contourgl/test.h
+++ b/c++/contourgl/test.h
@@ -56,6 +56,7 @@ public:
 	static void test_cl(Environment &e, Data &data, Surface &surface);
 	static void test_cl2(Environment &e, Data &data, Surface &surface);
 	static void test_cl3(Environment &e, Data &data, Surface &surface);
+	static void test_cu(Environment &e, Data &data, Surface &surface);
 };
 
 #endif