From a7622f642993cc177afba8f159185a6c09b63468 Mon Sep 17 00:00:00 2001
From: Ivan Mahonin <bh@icystar.com>
Date: Jul 26 2018 19:08:17 +0000
Subject: contourgl: cuda simple code


---

diff --git a/c++/contourgl/Makefile b/c++/contourgl/Makefile
index c8d388c..d4df731 100644
--- a/c++/contourgl/Makefile
+++ b/c++/contourgl/Makefile
@@ -16,6 +16,7 @@ CXXFLAGS := $(CXXFLAGS) $(shell pkg-config --cflags $(DEPLIBS))
 LIBS := $(LIBS) $(shell pkg-config --libs $(DEPLIBS))
 
 ifdef CUDA
+	CUDA_FLAGS := -O3 -use_fast_math
 	CXXFLAGS := $(CXXFLAGS) -DCUDA $(shell PKG_CONFIG_PATH=$(CUDA_PKGCONFIG) pkg-config --cflags $(CUDA))
 	LIBS := $(LIBS) $(shell PKG_CONFIG_PATH=$(CUDA_PKGCONFIG) pkg-config --libs $(CUDA))
 endif
@@ -61,7 +62,7 @@ PTXS = $(CUDA_SOURCES:.cu=.ptx)
 # internal targets
 
 %.ptx: %.cu
-	$(CUDA_PATH)/bin/nvcc -ptx $< -o $@
+	$(CUDA_PATH)/bin/nvcc $(CUDA_FLAGS) -ptx $< -o $@
 
 # rule for make *.d files with include (.h) dependencies
 %.d: %.cpp
diff --git a/c++/contourgl/cl/contour-base.cl b/c++/contourgl/cl/contour-base.cl
index ff0322e..4d23607 100644
--- a/c++/contourgl/cl/contour-base.cl
+++ b/c++/contourgl/cl/contour-base.cl
@@ -24,16 +24,6 @@
 #define DIV_ONE_F 0.0000152587890625f // 1.f/(ONE_F)
 
 
-kernel void clear(
-	int width,
-	int height,
-	global int4 *marks )
-{
-	int id = get_global_id(0);
-	int c = id % width;
-	marks[id] = (int4)(0, 0, 0, 0);
-}
-
 kernel void path(
 	int width,
 	int height,
@@ -58,14 +48,14 @@ kernel void path(
 	float ky = d.y/d.x;
 	
 	while(p0.x != p1.x || p0.y != p1.y) {
-		int iy = max((int)p0.y, 0);
-		int ix = (int)p0.x;
-		if (iy > h1) return;
+		int ix = max((int)p0.x, 0);
+		int iy = (int)p0.y;
+		if (ix > w1) return;
 
 		float2 px, py;
 		px.x = (float)(ix + 1);
 		py.y = (float)(iy + 1);
-		ix = clamp(ix, 0, w1);
+		iy = clamp(iy, 0, h1);
 		
 		px.y = p0.y + ky*(px.x - p0.x);
 		py.x = p0.x + kx*(py.y - p0.y);
@@ -74,13 +64,13 @@ kernel void path(
 		if (pp1.x > px.x) pp1 = px;
 		if (pp1.y > py.y) pp1 = py;
 		
-		float cover = (pp1.y - p0.y)*ONE_F;
-		float area = px.x - 0.5f*(p0.x + pp1.x);
-		if (flipy) { iy = h1 - iy; cover = -cover; }
-		if (flipx) { ix = w1 - ix; area = 1.f - area; }
+		float cover = (pp1.x - p0.x)*ONE_F;
+		float area = py.y - 0.5f*(p0.y + pp1.y);
+		if (flipx) { ix = w1 - ix; cover = -cover; }
+		if (flipy) { iy = h1 - iy; area = 1.f - area; }
 		p0 = pp1;
 		
-		atomic_add(marks + ix*height + iy, upsample((int)cover, (int)(area*cover)));
+		atomic_add(marks + iy*width + ix, upsample((int)cover, (int)(area*cover)));
 	}
 }
 
@@ -88,14 +78,14 @@ kernel void path(
 // different implementations for:
 //   antialiased, transparent, inverted, evenodd contours and combinations (total 16 implementations)
 kernel void fill(
-	int height,
+	int width,
 	global int2 *marks,
 	global float4 *image,
 	float4 color,
 	int4 bounds )
 {
-	if (get_global_id(0) >= bounds.s3) return;
-	int id = (int)get_global_id(0) + bounds.s0*height;
+	if (get_global_id(0) >= bounds.s2) return;
+	int id = (int)get_global_id(0) + bounds.s1*width;
 	marks += id;
 	image += id;
 
@@ -104,12 +94,12 @@ kernel void fill(
 		int2 m = *marks;
 		*marks = (int2)(0, 0);
 		float alpha = (float)abs(m.x + icover)*color.w*DIV_ONE_F;
-		marks += height;
+		marks += width;
 
 		icover += m.y;
 		*image = *image*(1.f - alpha) + color*alpha;
 		
-		if (++bounds.s0 >= bounds.s2) return;
-		image += height;
+		if (++bounds.s1 >= bounds.s3) return;
+		image += width;
 	}
 }
diff --git a/c++/contourgl/clrender.cpp b/c++/contourgl/clrender.cpp
index 9a058c6..a2a8f8e 100644
--- a/c++/contourgl/clrender.cpp
+++ b/c++/contourgl/clrender.cpp
@@ -424,7 +424,6 @@ void ClRender2::wait() {
 ClRender3::ClRender3(ClContext &cl):
 	cl(cl),
 	contour_program(),
-	contour_clear_kernel(),
 	contour_path_kernel(),
 	contour_fill_kernel(),
 	surface(),
@@ -436,10 +435,6 @@ ClRender3::ClRender3(ClContext &cl):
 	contour_program = cl.load_program("contour-base.cl");
 	assert(contour_program);
 
-	contour_clear_kernel = clCreateKernel(contour_program, "clear", &cl.err);
-	assert(!cl.err);
-	assert(contour_clear_kernel);
-
 	contour_path_kernel = clCreateKernel(contour_program, "path", &cl.err);
 	assert(!cl.err);
 	assert(contour_path_kernel);
@@ -453,10 +448,10 @@ ClRender3::~ClRender3() {
 	send_points(NULL, 0);
 	send_surface(NULL);
 
-	clReleaseKernel(contour_path_kernel);
-	clReleaseKernel(contour_fill_kernel);
-	clReleaseKernel(contour_clear_kernel);
-	clReleaseProgram(contour_program);
+	cl.err |= clReleaseKernel(contour_path_kernel);
+	cl.err |= clReleaseKernel(contour_fill_kernel);
+	cl.err |= clReleaseProgram(contour_program);
+	assert(!cl.err);
 }
 
 void ClRender3::send_surface(Surface *surface) {
@@ -472,7 +467,7 @@ void ClRender3::send_surface(Surface *surface) {
 	if (this->surface) {
 		//Measure t("ClRender::send_surface");
 
-		int zero_mark[4] = { };
+		vec2i zero_mark;
 
 		surface_image = clCreateBuffer(
 			cl.context, CL_MEM_READ_WRITE,
@@ -494,15 +489,10 @@ void ClRender3::send_surface(Surface *surface) {
 			0, NULL, NULL );
 		assert(!cl.err);
 
-		cl.err |= clSetKernelArg(contour_clear_kernel, 0, sizeof(surface->width), &surface->width);
-		cl.err |= clSetKernelArg(contour_clear_kernel, 1, sizeof(surface->height), &surface->height);
-		cl.err |= clSetKernelArg(contour_clear_kernel, 2, sizeof(mark_buffer), &mark_buffer);
-		assert(!cl.err);
-
-		size_t count = surface->count();
-		cl.err |= clEnqueueNDRangeKernel(
-			cl.queue, contour_clear_kernel,
-			1, NULL, &count, NULL,
+		cl.err |= clEnqueueFillBuffer(
+			cl.queue, mark_buffer,
+			&zero_mark, sizeof(zero_mark),
+			0, surface->count()*sizeof(zero_mark),
 			0, NULL, NULL );
 		assert(!cl.err);
 
@@ -511,7 +501,7 @@ void ClRender3::send_surface(Surface *surface) {
 		cl.err |= clSetKernelArg(contour_path_kernel, 2, sizeof(mark_buffer), &mark_buffer);
 		assert(!cl.err);
 
-		cl.err |= clSetKernelArg(contour_fill_kernel, 0, sizeof(surface->height), &surface->height);
+		cl.err |= clSetKernelArg(contour_fill_kernel, 0, sizeof(surface->width), &surface->width);
 		cl.err |= clSetKernelArg(contour_fill_kernel, 1, sizeof(mark_buffer), &mark_buffer);
 		cl.err |= clSetKernelArg(contour_fill_kernel, 2, sizeof(surface_image), &surface_image);
 		assert(!cl.err);
@@ -604,8 +594,8 @@ void ClRender3::draw(const Path &path) {
 		0, NULL, NULL );
 	assert(!cl.err);
 
-	offset = bounds.miny;
-	count = bounds.maxy - bounds.miny;
+	offset = bounds.minx;
+	count = bounds.maxx - bounds.minx;
 	group_size = 16;
 
 	count = ((count - 1)/group_size + 1)*group_size;
diff --git a/c++/contourgl/clrender.h b/c++/contourgl/clrender.h
index b862853..64b14c4 100644
--- a/c++/contourgl/clrender.h
+++ b/c++/contourgl/clrender.h
@@ -113,7 +113,6 @@ public:
 private:
 	ClContext &cl;
 	cl_program contour_program;
-	cl_kernel contour_clear_kernel;
 	cl_kernel contour_path_kernel;
 	cl_kernel contour_fill_kernel;
 
diff --git a/c++/contourgl/contourgl.cpp b/c++/contourgl/contourgl.cpp
index 9c24d70..8c9b105 100644
--- a/c++/contourgl/contourgl.cpp
+++ b/c++/contourgl/contourgl.cpp
@@ -88,9 +88,9 @@ int main() {
 		*/
 		{
 			Environment e(width, height, false, false, 8);
-			//{ Surface surface(width, height);
-			//  Measure t("test_lineslow_sw.tga", surface, true);
-			//  Test::test_sw(e, datalow, surface); }
+			{ Surface surface(width, height);
+			  Measure t("test_lineslow_sw.tga", surface, true);
+			  Test::test_sw(e, datalow, surface); }
 			/*
 			{ Surface surface(width, height);
 			  Measure t("test_lineslow_cl.tga", surface, true);
@@ -98,10 +98,10 @@ int main() {
 			{ Surface surface(width, height);
 			  Measure t("test_lineslow_cl2.tga", surface, true);
 			  Test::test_cl2(e, datalow, surface); }
+			*/
 			{ Surface surface(width, height);
 			  Measure t("test_lineslow_cl3.tga", surface, true);
 			  Test::test_cl3(e, datalow, surface); }
-			*/
 			{ Surface surface(width, height);
 			  Measure t("test_lineslow_cu.tga", surface, true);
 			  Test::test_cu(e, datalow, surface); }
diff --git a/c++/contourgl/cuda/contour.cu b/c++/contourgl/cuda/contour.cu
index db537b2..ef737c0 100644
--- a/c++/contourgl/cuda/contour.cu
+++ b/c++/contourgl/cuda/contour.cu
@@ -15,7 +15,7 @@
     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
-extern "C" {
+//extern "C" {
 
 #define ONE       65536
 #define TWO      131072               // (ONE)*2
@@ -24,16 +24,6 @@ extern "C" {
 #define DIV_ONE_F 0.0000152587890625f // 1.f/(ONE_F)
 
 
-__global__ void clear(
-	int width,
-	int height,
-	int4 *marks )
-{
-	int id = blockIdx.x*blockDim.x + threadIdx.x;
-	int c = id % width;
-	marks[id] = make_int4(0, 0, c | (c + 1), 0);
-}
-
 __global__ void path(
 	int width,
 	int height,
@@ -50,102 +40,82 @@ __global__ void path(
 	
 	bool flipx = p1.x < p0.x;
 	bool flipy = p1.y < p0.y;
-	if (flipx) { p0.x = (float)width - p0.x; p1.x = (float)width - p1.x; }
+	if (flipx) { p0.x = (float)width  - p0.x; p1.x = (float)width  - p1.x; }
 	if (flipy) { p0.y = (float)height - p0.y; p1.y = (float)height - p1.y; }
-	
 	float2 d;
 	d.x = p1.x - p0.x;
 	d.y = p1.y - p0.y;
-	float kx = d.x/d.y;
-	float ky = d.y/d.x;
 	int w1 = width - 1;
 	int h1 = height - 1;
+	float kx = d.x/d.y;
+	float ky = d.y/d.x;
 	
 	while(p0.x != p1.x || p0.y != p1.y) {
-		int ix = (int)p0.x;
-		int iy = max((int)p0.y, 0);
-		if (iy > h1) return;
+		int ix = max((int)p0.x, 0);
+		int iy = (int)p0.y;
+		if (ix > w1) return;
 
 		float2 px, py;
 		px.x = (float)(ix + 1);
 		py.y = (float)(iy + 1);
-		ix = max(0, min(w1, ix));
+		iy = max(0, min(h1, iy));
 		
 		px.y = p0.y + ky*(px.x - p0.x);
 		py.x = p0.x + kx*(py.y - p0.y);
-
+		
 		float2 pp1 = p1;
 		if (pp1.x > px.x) pp1 = px;
 		if (pp1.y > py.y) pp1 = py;
 		
-		float cover = (pp1.y - p0.y)*ONE_F;
-		float area = px.x - 0.5f*(p0.x + pp1.x);
-		if (flipx) { ix = w1 - ix; area = 1.f - area; }
-		if (flipy) { iy = h1 - iy; cover = -cover; }
+		float cover = (pp1.x - p0.x)*ONE_F;
+		float area = py.y - 0.5f*(p0.y + pp1.y);
+		if (flipx) { ix = w1 - ix; cover = -cover; }
+		if (flipy) { iy = h1 - iy; area = 1.f - area; }
+		p0 = pp1;
 		
-		int *row = marks + 4*iy*width;
 		atomicAdd(
-			(unsigned long long*)(row + 4*ix),
+			(unsigned long long*)(marks + 2*(iy*width + ix)),
 			((unsigned long long)(unsigned int)(int)(cover) << 32)
 			| (unsigned long long)(unsigned int)((int)(area*cover)) );
-		//row[4*ix] += (int)(area*cover);
-		//row[4*ix + 1] += (int)(cover);
-		//atomicAdd(row + 4*ix, (int)(area*cover));
-		//atomicAdd(row + 4*ix + 1, (int)(cover));
-		
-		row += 2;
-		int iix = (ix & (ix + 1)) - 1;
-		while(iix >= minx) {
-			atomicMin(row + 4*iix, ix);
-			iix = (iix & (iix + 1)) - 1;
-		}
-		
-		p0 = pp1;
+		//int *mark = marks + ((iy*width + ix) << 1);
+		//atomicAdd(mark, (int)(area*cover));
+		//atomicAdd(mark + 1, (int)(cover));
 	}
 }
 
 __global__ void fill(
 	int width,
-	int4 *marks,
+	int2 *marks,
 	float4 *image,
 	float4 color,
 	int4 bounds )
 {
-	int id = blockIdx.x*blockDim.x + threadIdx.x + bounds.y;
-	if (id >= bounds.w) return;
-	id *= width;
+	int id = blockIdx.x*blockDim.x + threadIdx.x + bounds.x;
+	if (id >= bounds.z) return;
+	id += bounds.y*width;
 	marks += id;
 	image += id;
-	
-	int4 *mark;
-	float4 *pixel;
 
-	int4 m;
-	int icover = 0, c0 = bounds.x, c1 = bounds.x;
-	while(c1 < bounds.z) {
-		if (abs(icover) > HALF)
-			while(c0 < c1)
-				image[c0++] = color;
+	int icover = 0;
+	while(true) {
+		int2 m = *marks;
+		*marks = make_int2(0, 0);
+		float alpha = (float)abs(m.x + icover)*color.w*DIV_ONE_F;
+		marks += width;
 
-		mark = &marks[c1];
-		m = *mark;
-		*mark = make_int4(0, 0, c1 | (c1 + 1), 0); 
-		
-		float alpha = (float)abs(m.x + icover)*DIV_ONE_F;
+		icover += m.y;
 		float one_alpha = 1.f - alpha;
 		
-		pixel = &image[c1];
-		float4 p = *pixel;
+		float4 p = *image;
 		p.x = p.x*one_alpha + color.x*alpha;
 		p.y = p.y*one_alpha + color.y*alpha;
 		p.z = p.z*one_alpha + color.z*alpha;
 		p.w = p.w*one_alpha + color.w*alpha;
-		*pixel = p;
+		*image = p;
 		
-		icover += m.y;
-		c0 = c1 + 1;
-		c1 = m.z;
+		if (++bounds.y >= bounds.w) return;
+		image += width;
 	}
 }
 
-}
\ No newline at end of file
+//}
\ No newline at end of file
diff --git a/c++/contourgl/cudarender.cpp b/c++/contourgl/cudarender.cpp
index 70eff45..7668a90 100644
--- a/c++/contourgl/cudarender.cpp
+++ b/c++/contourgl/cudarender.cpp
@@ -30,7 +30,6 @@ using namespace std;
 CudaRender::CudaRender(CudaContext &cu):
 	cu(cu),
 	contour_module(),
-	contour_clear_kernel(),
 	contour_path_kernel(),
 	contour_fill_kernel(),
 	surface(),
@@ -41,13 +40,12 @@ CudaRender::CudaRender(CudaContext &cu):
 	cu.err = cuModuleLoad(&contour_module, "cuda/contour.ptx");
 	assert(!cu.err);
 
-	cu.err = cuModuleGetFunction(&contour_clear_kernel, contour_module, "clear");
+	//cu.err = cuModuleGetFunction(&contour_path_kernel, contour_module, "path");
+	cu.err = cuModuleGetFunction(&contour_path_kernel, contour_module, "_Z4pathiiPiPK6float2iii");
 	assert(!cu.err);
 
-	cu.err = cuModuleGetFunction(&contour_path_kernel, contour_module, "path");
-	assert(!cu.err);
-
-	cu.err = cuModuleGetFunction(&contour_fill_kernel, contour_module, "fill");
+	//cu.err = cuModuleGetFunction(&contour_fill_kernel, contour_module, "fill");
+	cu.err = cuModuleGetFunction(&contour_fill_kernel, contour_module, "_Z4filliP4int2P6float4S1_4int4");
 	assert(!cu.err);
 }
 
@@ -75,32 +73,17 @@ void CudaRender::send_surface(Surface *surface) {
 	this->surface = surface;
 
 	if (this->surface) {
-		int zero_mark[4] = { };
-
 		cu.err = cuMemAlloc(&surface_image, surface->data_size());
 		assert(!cu.err);
 
 		cu.err = cuMemcpyHtoD(surface_image, surface->data, surface->data_size());
 		assert(!cu.err);
 
-		cu.err = cuMemAlloc(&mark_buffer, surface->count()*sizeof(zero_mark));
+		cu.err = cuMemAlloc(&mark_buffer, surface->count()*sizeof(vec2i));
 		assert(!cu.err);
 
-		size_t group_size = 32;
-		size_t count = surface->count();
-		cu.err = cuLaunchKernel(
-			contour_clear_kernel,
-			(count - 1)/group_size + 1, 1, 1,
-			group_size, 1, 1,
-			0, 0, 0,
-			CudaParams()
-				.add(surface->width)
-				.add(surface->height)
-				.add(mark_buffer)
-				.get_extra() );
+		cu.err = cuMemsetD32(mark_buffer, 0, 2*surface->count());
 		assert(!cu.err);
-
-		wait();
 	}
 }
 
@@ -148,7 +131,7 @@ void CudaRender::draw(const Path &path) {
 	size_t group_size, count;
 
 	count = path.end - path.begin;
-	group_size = 8;
+	group_size = 128;
 
 	count = (count - 1)/group_size + 1;
 	cu.err = cuLaunchKernel(
@@ -167,8 +150,8 @@ void CudaRender::draw(const Path &path) {
 			.get_extra() );
 	assert(!cu.err);
 
-	count = bounds.maxy - bounds.miny;
-	group_size = 1;
+	count = bounds.maxx - bounds.minx;
+	group_size = 16;
 
 	count = (count - 1)/group_size + 1;
 	cu.err = cuLaunchKernel(
diff --git a/c++/contourgl/cudarender.h b/c++/contourgl/cudarender.h
index 7eae406..90ff746 100644
--- a/c++/contourgl/cudarender.h
+++ b/c++/contourgl/cudarender.h
@@ -40,7 +40,6 @@ public:
 private:
 	CudaContext &cu;
 	CUmodule contour_module;
-	CUfunction contour_clear_kernel;
 	CUfunction contour_path_kernel;
 	CUfunction contour_fill_kernel;
 
diff --git a/c++/contourgl/test.cpp b/c++/contourgl/test.cpp
index 2416e07..5e7a58d 100644
--- a/c++/contourgl/test.cpp
+++ b/c++/contourgl/test.cpp
@@ -1,5 +1,5 @@
 /*
-    ......... 2015 Ivan Mahonin
+    ......... 2015-2018 Ivan Mahonin
 
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by