From a7622f642993cc177afba8f159185a6c09b63468 Mon Sep 17 00:00:00 2001 From: Ivan Mahonin Date: Jul 26 2018 19:08:17 +0000 Subject: contourgl: cuda simple code --- diff --git a/c++/contourgl/Makefile b/c++/contourgl/Makefile index c8d388c..d4df731 100644 --- a/c++/contourgl/Makefile +++ b/c++/contourgl/Makefile @@ -16,6 +16,7 @@ CXXFLAGS := $(CXXFLAGS) $(shell pkg-config --cflags $(DEPLIBS)) LIBS := $(LIBS) $(shell pkg-config --libs $(DEPLIBS)) ifdef CUDA + CUDA_FLAGS := -O3 -use_fast_math CXXFLAGS := $(CXXFLAGS) -DCUDA $(shell PKG_CONFIG_PATH=$(CUDA_PKGCONFIG) pkg-config --cflags $(CUDA)) LIBS := $(LIBS) $(shell PKG_CONFIG_PATH=$(CUDA_PKGCONFIG) pkg-config --libs $(CUDA)) endif @@ -61,7 +62,7 @@ PTXS = $(CUDA_SOURCES:.cu=.ptx) # internal targets %.ptx: %.cu - $(CUDA_PATH)/bin/nvcc -ptx $< -o $@ + $(CUDA_PATH)/bin/nvcc $(CUDA_FLAGS) -ptx $< -o $@ # rule for make *.d files with include (.h) dependencies %.d: %.cpp diff --git a/c++/contourgl/cl/contour-base.cl b/c++/contourgl/cl/contour-base.cl index ff0322e..4d23607 100644 --- a/c++/contourgl/cl/contour-base.cl +++ b/c++/contourgl/cl/contour-base.cl @@ -24,16 +24,6 @@ #define DIV_ONE_F 0.0000152587890625f // 1.f/(ONE_F) -kernel void clear( - int width, - int height, - global int4 *marks ) -{ - int id = get_global_id(0); - int c = id % width; - marks[id] = (int4)(0, 0, 0, 0); -} - kernel void path( int width, int height, @@ -58,14 +48,14 @@ kernel void path( float ky = d.y/d.x; while(p0.x != p1.x || p0.y != p1.y) { - int iy = max((int)p0.y, 0); - int ix = (int)p0.x; - if (iy > h1) return; + int ix = max((int)p0.x, 0); + int iy = (int)p0.y; + if (ix > w1) return; float2 px, py; px.x = (float)(ix + 1); py.y = (float)(iy + 1); - ix = clamp(ix, 0, w1); + iy = clamp(iy, 0, h1); px.y = p0.y + ky*(px.x - p0.x); py.x = p0.x + kx*(py.y - p0.y); @@ -74,13 +64,13 @@ kernel void path( if (pp1.x > px.x) pp1 = px; if (pp1.y > py.y) pp1 = py; - float cover = (pp1.y - p0.y)*ONE_F; - float area = px.x - 0.5f*(p0.x + pp1.x); - if (flipy) { iy = h1 - iy; cover = -cover; } - if (flipx) { ix = w1 - ix; area = 1.f - area; } + float cover = (pp1.x - p0.x)*ONE_F; + float area = py.y - 0.5f*(p0.y + pp1.y); + if (flipx) { ix = w1 - ix; cover = -cover; } + if (flipy) { iy = h1 - iy; area = 1.f - area; } p0 = pp1; - atomic_add(marks + ix*height + iy, upsample((int)cover, (int)(area*cover))); + atomic_add(marks + iy*width + ix, upsample((int)cover, (int)(area*cover))); } } @@ -88,14 +78,14 @@ kernel void path( // different implementations for: // antialiased, transparent, inverted, evenodd contours and combinations (total 16 implementations) kernel void fill( - int height, + int width, global int2 *marks, global float4 *image, float4 color, int4 bounds ) { - if (get_global_id(0) >= bounds.s3) return; - int id = (int)get_global_id(0) + bounds.s0*height; + if (get_global_id(0) >= bounds.s2) return; + int id = (int)get_global_id(0) + bounds.s1*width; marks += id; image += id; @@ -104,12 +94,12 @@ kernel void fill( int2 m = *marks; *marks = (int2)(0, 0); float alpha = (float)abs(m.x + icover)*color.w*DIV_ONE_F; - marks += height; + marks += width; icover += m.y; *image = *image*(1.f - alpha) + color*alpha; - if (++bounds.s0 >= bounds.s2) return; - image += height; + if (++bounds.s1 >= bounds.s3) return; + image += width; } } diff --git a/c++/contourgl/clrender.cpp b/c++/contourgl/clrender.cpp index 9a058c6..a2a8f8e 100644 --- a/c++/contourgl/clrender.cpp +++ b/c++/contourgl/clrender.cpp @@ -424,7 +424,6 @@ void ClRender2::wait() { ClRender3::ClRender3(ClContext &cl): cl(cl), contour_program(), - contour_clear_kernel(), contour_path_kernel(), contour_fill_kernel(), surface(), @@ -436,10 +435,6 @@ ClRender3::ClRender3(ClContext &cl): contour_program = cl.load_program("contour-base.cl"); assert(contour_program); - contour_clear_kernel = clCreateKernel(contour_program, "clear", &cl.err); - assert(!cl.err); - assert(contour_clear_kernel); - contour_path_kernel = clCreateKernel(contour_program, "path", &cl.err); assert(!cl.err); assert(contour_path_kernel); @@ -453,10 +448,10 @@ ClRender3::~ClRender3() { send_points(NULL, 0); send_surface(NULL); - clReleaseKernel(contour_path_kernel); - clReleaseKernel(contour_fill_kernel); - clReleaseKernel(contour_clear_kernel); - clReleaseProgram(contour_program); + cl.err |= clReleaseKernel(contour_path_kernel); + cl.err |= clReleaseKernel(contour_fill_kernel); + cl.err |= clReleaseProgram(contour_program); + assert(!cl.err); } void ClRender3::send_surface(Surface *surface) { @@ -472,7 +467,7 @@ void ClRender3::send_surface(Surface *surface) { if (this->surface) { //Measure t("ClRender::send_surface"); - int zero_mark[4] = { }; + vec2i zero_mark; surface_image = clCreateBuffer( cl.context, CL_MEM_READ_WRITE, @@ -494,15 +489,10 @@ void ClRender3::send_surface(Surface *surface) { 0, NULL, NULL ); assert(!cl.err); - cl.err |= clSetKernelArg(contour_clear_kernel, 0, sizeof(surface->width), &surface->width); - cl.err |= clSetKernelArg(contour_clear_kernel, 1, sizeof(surface->height), &surface->height); - cl.err |= clSetKernelArg(contour_clear_kernel, 2, sizeof(mark_buffer), &mark_buffer); - assert(!cl.err); - - size_t count = surface->count(); - cl.err |= clEnqueueNDRangeKernel( - cl.queue, contour_clear_kernel, - 1, NULL, &count, NULL, + cl.err |= clEnqueueFillBuffer( + cl.queue, mark_buffer, + &zero_mark, sizeof(zero_mark), + 0, surface->count()*sizeof(zero_mark), 0, NULL, NULL ); assert(!cl.err); @@ -511,7 +501,7 @@ void ClRender3::send_surface(Surface *surface) { cl.err |= clSetKernelArg(contour_path_kernel, 2, sizeof(mark_buffer), &mark_buffer); assert(!cl.err); - cl.err |= clSetKernelArg(contour_fill_kernel, 0, sizeof(surface->height), &surface->height); + cl.err |= clSetKernelArg(contour_fill_kernel, 0, sizeof(surface->width), &surface->width); cl.err |= clSetKernelArg(contour_fill_kernel, 1, sizeof(mark_buffer), &mark_buffer); cl.err |= clSetKernelArg(contour_fill_kernel, 2, sizeof(surface_image), &surface_image); assert(!cl.err); @@ -604,8 +594,8 @@ void ClRender3::draw(const Path &path) { 0, NULL, NULL ); assert(!cl.err); - offset = bounds.miny; - count = bounds.maxy - bounds.miny; + offset = bounds.minx; + count = bounds.maxx - bounds.minx; group_size = 16; count = ((count - 1)/group_size + 1)*group_size; diff --git a/c++/contourgl/clrender.h b/c++/contourgl/clrender.h index b862853..64b14c4 100644 --- a/c++/contourgl/clrender.h +++ b/c++/contourgl/clrender.h @@ -113,7 +113,6 @@ public: private: ClContext &cl; cl_program contour_program; - cl_kernel contour_clear_kernel; cl_kernel contour_path_kernel; cl_kernel contour_fill_kernel; diff --git a/c++/contourgl/contourgl.cpp b/c++/contourgl/contourgl.cpp index 9c24d70..8c9b105 100644 --- a/c++/contourgl/contourgl.cpp +++ b/c++/contourgl/contourgl.cpp @@ -88,9 +88,9 @@ int main() { */ { Environment e(width, height, false, false, 8); - //{ Surface surface(width, height); - // Measure t("test_lineslow_sw.tga", surface, true); - // Test::test_sw(e, datalow, surface); } + { Surface surface(width, height); + Measure t("test_lineslow_sw.tga", surface, true); + Test::test_sw(e, datalow, surface); } /* { Surface surface(width, height); Measure t("test_lineslow_cl.tga", surface, true); @@ -98,10 +98,10 @@ int main() { { Surface surface(width, height); Measure t("test_lineslow_cl2.tga", surface, true); Test::test_cl2(e, datalow, surface); } + */ { Surface surface(width, height); Measure t("test_lineslow_cl3.tga", surface, true); Test::test_cl3(e, datalow, surface); } - */ { Surface surface(width, height); Measure t("test_lineslow_cu.tga", surface, true); Test::test_cu(e, datalow, surface); } diff --git a/c++/contourgl/cuda/contour.cu b/c++/contourgl/cuda/contour.cu index db537b2..ef737c0 100644 --- a/c++/contourgl/cuda/contour.cu +++ b/c++/contourgl/cuda/contour.cu @@ -15,7 +15,7 @@ along with this program. If not, see . */ -extern "C" { +//extern "C" { #define ONE 65536 #define TWO 131072 // (ONE)*2 @@ -24,16 +24,6 @@ extern "C" { #define DIV_ONE_F 0.0000152587890625f // 1.f/(ONE_F) -__global__ void clear( - int width, - int height, - int4 *marks ) -{ - int id = blockIdx.x*blockDim.x + threadIdx.x; - int c = id % width; - marks[id] = make_int4(0, 0, c | (c + 1), 0); -} - __global__ void path( int width, int height, @@ -50,102 +40,82 @@ __global__ void path( bool flipx = p1.x < p0.x; bool flipy = p1.y < p0.y; - if (flipx) { p0.x = (float)width - p0.x; p1.x = (float)width - p1.x; } + if (flipx) { p0.x = (float)width - p0.x; p1.x = (float)width - p1.x; } if (flipy) { p0.y = (float)height - p0.y; p1.y = (float)height - p1.y; } - float2 d; d.x = p1.x - p0.x; d.y = p1.y - p0.y; - float kx = d.x/d.y; - float ky = d.y/d.x; int w1 = width - 1; int h1 = height - 1; + float kx = d.x/d.y; + float ky = d.y/d.x; while(p0.x != p1.x || p0.y != p1.y) { - int ix = (int)p0.x; - int iy = max((int)p0.y, 0); - if (iy > h1) return; + int ix = max((int)p0.x, 0); + int iy = (int)p0.y; + if (ix > w1) return; float2 px, py; px.x = (float)(ix + 1); py.y = (float)(iy + 1); - ix = max(0, min(w1, ix)); + iy = max(0, min(h1, iy)); px.y = p0.y + ky*(px.x - p0.x); py.x = p0.x + kx*(py.y - p0.y); - + float2 pp1 = p1; if (pp1.x > px.x) pp1 = px; if (pp1.y > py.y) pp1 = py; - float cover = (pp1.y - p0.y)*ONE_F; - float area = px.x - 0.5f*(p0.x + pp1.x); - if (flipx) { ix = w1 - ix; area = 1.f - area; } - if (flipy) { iy = h1 - iy; cover = -cover; } + float cover = (pp1.x - p0.x)*ONE_F; + float area = py.y - 0.5f*(p0.y + pp1.y); + if (flipx) { ix = w1 - ix; cover = -cover; } + if (flipy) { iy = h1 - iy; area = 1.f - area; } + p0 = pp1; - int *row = marks + 4*iy*width; atomicAdd( - (unsigned long long*)(row + 4*ix), + (unsigned long long*)(marks + 2*(iy*width + ix)), ((unsigned long long)(unsigned int)(int)(cover) << 32) | (unsigned long long)(unsigned int)((int)(area*cover)) ); - //row[4*ix] += (int)(area*cover); - //row[4*ix + 1] += (int)(cover); - //atomicAdd(row + 4*ix, (int)(area*cover)); - //atomicAdd(row + 4*ix + 1, (int)(cover)); - - row += 2; - int iix = (ix & (ix + 1)) - 1; - while(iix >= minx) { - atomicMin(row + 4*iix, ix); - iix = (iix & (iix + 1)) - 1; - } - - p0 = pp1; + //int *mark = marks + ((iy*width + ix) << 1); + //atomicAdd(mark, (int)(area*cover)); + //atomicAdd(mark + 1, (int)(cover)); } } __global__ void fill( int width, - int4 *marks, + int2 *marks, float4 *image, float4 color, int4 bounds ) { - int id = blockIdx.x*blockDim.x + threadIdx.x + bounds.y; - if (id >= bounds.w) return; - id *= width; + int id = blockIdx.x*blockDim.x + threadIdx.x + bounds.x; + if (id >= bounds.z) return; + id += bounds.y*width; marks += id; image += id; - - int4 *mark; - float4 *pixel; - int4 m; - int icover = 0, c0 = bounds.x, c1 = bounds.x; - while(c1 < bounds.z) { - if (abs(icover) > HALF) - while(c0 < c1) - image[c0++] = color; + int icover = 0; + while(true) { + int2 m = *marks; + *marks = make_int2(0, 0); + float alpha = (float)abs(m.x + icover)*color.w*DIV_ONE_F; + marks += width; - mark = &marks[c1]; - m = *mark; - *mark = make_int4(0, 0, c1 | (c1 + 1), 0); - - float alpha = (float)abs(m.x + icover)*DIV_ONE_F; + icover += m.y; float one_alpha = 1.f - alpha; - pixel = &image[c1]; - float4 p = *pixel; + float4 p = *image; p.x = p.x*one_alpha + color.x*alpha; p.y = p.y*one_alpha + color.y*alpha; p.z = p.z*one_alpha + color.z*alpha; p.w = p.w*one_alpha + color.w*alpha; - *pixel = p; + *image = p; - icover += m.y; - c0 = c1 + 1; - c1 = m.z; + if (++bounds.y >= bounds.w) return; + image += width; } } -} \ No newline at end of file +//} \ No newline at end of file diff --git a/c++/contourgl/cudarender.cpp b/c++/contourgl/cudarender.cpp index 70eff45..7668a90 100644 --- a/c++/contourgl/cudarender.cpp +++ b/c++/contourgl/cudarender.cpp @@ -30,7 +30,6 @@ using namespace std; CudaRender::CudaRender(CudaContext &cu): cu(cu), contour_module(), - contour_clear_kernel(), contour_path_kernel(), contour_fill_kernel(), surface(), @@ -41,13 +40,12 @@ CudaRender::CudaRender(CudaContext &cu): cu.err = cuModuleLoad(&contour_module, "cuda/contour.ptx"); assert(!cu.err); - cu.err = cuModuleGetFunction(&contour_clear_kernel, contour_module, "clear"); + //cu.err = cuModuleGetFunction(&contour_path_kernel, contour_module, "path"); + cu.err = cuModuleGetFunction(&contour_path_kernel, contour_module, "_Z4pathiiPiPK6float2iii"); assert(!cu.err); - cu.err = cuModuleGetFunction(&contour_path_kernel, contour_module, "path"); - assert(!cu.err); - - cu.err = cuModuleGetFunction(&contour_fill_kernel, contour_module, "fill"); + //cu.err = cuModuleGetFunction(&contour_fill_kernel, contour_module, "fill"); + cu.err = cuModuleGetFunction(&contour_fill_kernel, contour_module, "_Z4filliP4int2P6float4S1_4int4"); assert(!cu.err); } @@ -75,32 +73,17 @@ void CudaRender::send_surface(Surface *surface) { this->surface = surface; if (this->surface) { - int zero_mark[4] = { }; - cu.err = cuMemAlloc(&surface_image, surface->data_size()); assert(!cu.err); cu.err = cuMemcpyHtoD(surface_image, surface->data, surface->data_size()); assert(!cu.err); - cu.err = cuMemAlloc(&mark_buffer, surface->count()*sizeof(zero_mark)); + cu.err = cuMemAlloc(&mark_buffer, surface->count()*sizeof(vec2i)); assert(!cu.err); - size_t group_size = 32; - size_t count = surface->count(); - cu.err = cuLaunchKernel( - contour_clear_kernel, - (count - 1)/group_size + 1, 1, 1, - group_size, 1, 1, - 0, 0, 0, - CudaParams() - .add(surface->width) - .add(surface->height) - .add(mark_buffer) - .get_extra() ); + cu.err = cuMemsetD32(mark_buffer, 0, 2*surface->count()); assert(!cu.err); - - wait(); } } @@ -148,7 +131,7 @@ void CudaRender::draw(const Path &path) { size_t group_size, count; count = path.end - path.begin; - group_size = 8; + group_size = 128; count = (count - 1)/group_size + 1; cu.err = cuLaunchKernel( @@ -167,8 +150,8 @@ void CudaRender::draw(const Path &path) { .get_extra() ); assert(!cu.err); - count = bounds.maxy - bounds.miny; - group_size = 1; + count = bounds.maxx - bounds.minx; + group_size = 16; count = (count - 1)/group_size + 1; cu.err = cuLaunchKernel( diff --git a/c++/contourgl/cudarender.h b/c++/contourgl/cudarender.h index 7eae406..90ff746 100644 --- a/c++/contourgl/cudarender.h +++ b/c++/contourgl/cudarender.h @@ -40,7 +40,6 @@ public: private: CudaContext &cu; CUmodule contour_module; - CUfunction contour_clear_kernel; CUfunction contour_path_kernel; CUfunction contour_fill_kernel; diff --git a/c++/contourgl/test.cpp b/c++/contourgl/test.cpp index 2416e07..5e7a58d 100644 --- a/c++/contourgl/test.cpp +++ b/c++/contourgl/test.cpp @@ -1,5 +1,5 @@ /* - ......... 2015 Ivan Mahonin + ......... 2015-2018 Ivan Mahonin This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by