From 6fa0096079c5fc9d6f9d53451f443b733cd11a78 Mon Sep 17 00:00:00 2001 From: Ivan Mahonin Date: Jul 26 2018 17:54:31 +0000 Subject: contourgl: CUDA implementation --- diff --git a/c++/contourgl/Makefile b/c++/contourgl/Makefile index 09ef1f2..c8d388c 100644 --- a/c++/contourgl/Makefile +++ b/c++/contourgl/Makefile @@ -43,7 +43,9 @@ SOURCES = \ utils.cpp ifdef CUDA - SOURCES += cudacontext.cpp + SOURCES += \ + cudacontext.cpp \ + cudarender.cpp CUDA_SOURCES = \ cuda/contour.cu \ cuda/hello.cu diff --git a/c++/contourgl/contourgl.cpp b/c++/contourgl/contourgl.cpp index 06ddd49..9c24d70 100644 --- a/c++/contourgl/contourgl.cpp +++ b/c++/contourgl/contourgl.cpp @@ -98,10 +98,13 @@ int main() { { Surface surface(width, height); Measure t("test_lineslow_cl2.tga", surface, true); Test::test_cl2(e, datalow, surface); } - */ { Surface surface(width, height); Measure t("test_lineslow_cl3.tga", surface, true); Test::test_cl3(e, datalow, surface); } + */ + { Surface surface(width, height); + Measure t("test_lineslow_cu.tga", surface, true); + Test::test_cu(e, datalow, surface); } } } diff --git a/c++/contourgl/cuda/contour.cu b/c++/contourgl/cuda/contour.cu index e69de29..db537b2 100644 --- a/c++/contourgl/cuda/contour.cu +++ b/c++/contourgl/cuda/contour.cu @@ -0,0 +1,151 @@ +/* + ......... 2018 Ivan Mahonin + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +extern "C" { + +#define ONE 65536 +#define TWO 131072 // (ONE)*2 +#define HALF 32768 // (ONE)/2 +#define ONE_F 65536.f // (float)(ONE) +#define DIV_ONE_F 0.0000152587890625f // 1.f/(ONE_F) + + +__global__ void clear( + int width, + int height, + int4 *marks ) +{ + int id = blockIdx.x*blockDim.x + threadIdx.x; + int c = id % width; + marks[id] = make_int4(0, 0, c | (c + 1), 0); +} + +__global__ void path( + int width, + int height, + int *marks, + const float2 *points, + int begin, + int end, + int minx ) +{ + int id = blockIdx.x*blockDim.x + threadIdx.x + begin; + if (id >= end) return; + float2 p0 = points[id]; + float2 p1 = points[id + 1]; + + bool flipx = p1.x < p0.x; + bool flipy = p1.y < p0.y; + if (flipx) { p0.x = (float)width - p0.x; p1.x = (float)width - p1.x; } + if (flipy) { p0.y = (float)height - p0.y; p1.y = (float)height - p1.y; } + + float2 d; + d.x = p1.x - p0.x; + d.y = p1.y - p0.y; + float kx = d.x/d.y; + float ky = d.y/d.x; + int w1 = width - 1; + int h1 = height - 1; + + while(p0.x != p1.x || p0.y != p1.y) { + int ix = (int)p0.x; + int iy = max((int)p0.y, 0); + if (iy > h1) return; + + float2 px, py; + px.x = (float)(ix + 1); + py.y = (float)(iy + 1); + ix = max(0, min(w1, ix)); + + px.y = p0.y + ky*(px.x - p0.x); + py.x = p0.x + kx*(py.y - p0.y); + + float2 pp1 = p1; + if (pp1.x > px.x) pp1 = px; + if (pp1.y > py.y) pp1 = py; + + float cover = (pp1.y - p0.y)*ONE_F; + float area = px.x - 0.5f*(p0.x + pp1.x); + if (flipx) { ix = w1 - ix; area = 1.f - area; } + if (flipy) { iy = h1 - iy; cover = -cover; } + + int *row = marks + 4*iy*width; + atomicAdd( + (unsigned long long*)(row + 4*ix), + ((unsigned long long)(unsigned int)(int)(cover) << 32) + | (unsigned long long)(unsigned int)((int)(area*cover)) ); + //row[4*ix] += (int)(area*cover); + //row[4*ix + 1] += (int)(cover); + //atomicAdd(row + 4*ix, (int)(area*cover)); + //atomicAdd(row + 4*ix + 1, (int)(cover)); + + row += 2; + int iix = (ix & (ix + 1)) - 1; + while(iix >= minx) { + atomicMin(row + 4*iix, ix); + iix = (iix & (iix + 1)) - 1; + } + + p0 = pp1; + } +} + +__global__ void fill( + int width, + int4 *marks, + float4 *image, + float4 color, + int4 bounds ) +{ + int id = blockIdx.x*blockDim.x + threadIdx.x + bounds.y; + if (id >= bounds.w) return; + id *= width; + marks += id; + image += id; + + int4 *mark; + float4 *pixel; + + int4 m; + int icover = 0, c0 = bounds.x, c1 = bounds.x; + while(c1 < bounds.z) { + if (abs(icover) > HALF) + while(c0 < c1) + image[c0++] = color; + + mark = &marks[c1]; + m = *mark; + *mark = make_int4(0, 0, c1 | (c1 + 1), 0); + + float alpha = (float)abs(m.x + icover)*DIV_ONE_F; + float one_alpha = 1.f - alpha; + + pixel = &image[c1]; + float4 p = *pixel; + p.x = p.x*one_alpha + color.x*alpha; + p.y = p.y*one_alpha + color.y*alpha; + p.z = p.z*one_alpha + color.z*alpha; + p.w = p.w*one_alpha + color.w*alpha; + *pixel = p; + + icover += m.y; + c0 = c1 + 1; + c1 = m.z; + } +} + +} \ No newline at end of file diff --git a/c++/contourgl/cudacontext.cpp b/c++/contourgl/cudacontext.cpp index 3901098..d9d978d 100644 --- a/c++/contourgl/cudacontext.cpp +++ b/c++/contourgl/cudacontext.cpp @@ -86,7 +86,7 @@ CudaContext::CudaContext(): err = cuCtxCreate(&context, CU_CTX_SCHED_AUTO, device); assert(!err); - hello(); + //hello(); } CudaContext::~CudaContext() { diff --git a/c++/contourgl/cudarender.cpp b/c++/contourgl/cudarender.cpp new file mode 100644 index 0000000..70eff45 --- /dev/null +++ b/c++/contourgl/cudarender.cpp @@ -0,0 +1,193 @@ +/* + ......... 2018 Ivan Mahonin + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include + +#include +#include + +#include "cudarender.h" +#include "measure.h" + + +using namespace std; + + +CudaRender::CudaRender(CudaContext &cu): + cu(cu), + contour_module(), + contour_clear_kernel(), + contour_path_kernel(), + contour_fill_kernel(), + surface(), + points_buffer(), + mark_buffer(), + surface_image() +{ + cu.err = cuModuleLoad(&contour_module, "cuda/contour.ptx"); + assert(!cu.err); + + cu.err = cuModuleGetFunction(&contour_clear_kernel, contour_module, "clear"); + assert(!cu.err); + + cu.err = cuModuleGetFunction(&contour_path_kernel, contour_module, "path"); + assert(!cu.err); + + cu.err = cuModuleGetFunction(&contour_fill_kernel, contour_module, "fill"); + assert(!cu.err); +} + +CudaRender::~CudaRender() { + send_points(NULL, 0); + send_surface(NULL); + + cu.err = cuModuleUnload(contour_module); + assert(!cu.err); +} + +void CudaRender::send_surface(Surface *surface) { + if (this->surface) { + wait(); + + cu.err = cuMemFree(surface_image); + assert(!cu.err); + surface_image = 0; + + cu.err = cuMemFree(mark_buffer); + assert(!cu.err); + mark_buffer = 0; + } + + this->surface = surface; + + if (this->surface) { + int zero_mark[4] = { }; + + cu.err = cuMemAlloc(&surface_image, surface->data_size()); + assert(!cu.err); + + cu.err = cuMemcpyHtoD(surface_image, surface->data, surface->data_size()); + assert(!cu.err); + + cu.err = cuMemAlloc(&mark_buffer, surface->count()*sizeof(zero_mark)); + assert(!cu.err); + + size_t group_size = 32; + size_t count = surface->count(); + cu.err = cuLaunchKernel( + contour_clear_kernel, + (count - 1)/group_size + 1, 1, 1, + group_size, 1, 1, + 0, 0, 0, + CudaParams() + .add(surface->width) + .add(surface->height) + .add(mark_buffer) + .get_extra() ); + assert(!cu.err); + + wait(); + } +} + +Surface* CudaRender::receive_surface() { + if (surface) { + wait(); + cu.err = cuMemcpyDtoH(surface->data, surface_image, surface->data_size()); + assert(!cu.err); + } + return surface; +} + +void CudaRender::send_points(const vec2f *points, int count) { + if (points_buffer) { + wait(); + cu.err = cuMemFree(points_buffer); + assert(!cu.err); + points_buffer = 0; + } + + if (points && count > 0) { + cu.err = cuMemAlloc(&points_buffer, count*sizeof(vec2f)); + assert(!cu.err); + + cu.err = cuMemcpyHtoD(points_buffer, points, count*sizeof(vec2f)); + assert(!cu.err); + } +} + +void CudaRender::draw(const Path &path) { + assert(surface); + assert(points_buffer); + + ContextRect bounds; + bounds.minx = max(1, path.bounds.minx); + bounds.maxx = min(surface->width, path.bounds.maxx); + bounds.miny = max(0, path.bounds.miny); + bounds.maxy = min(surface->height, path.bounds.maxy); + if ( bounds.minx >= bounds.maxx + || bounds.miny >= bounds.maxy + || path.begin >= path.end ) return; + + vec2i boundsx(bounds.minx, bounds.maxx); + + size_t group_size, count; + + count = path.end - path.begin; + group_size = 8; + + count = (count - 1)/group_size + 1; + cu.err = cuLaunchKernel( + contour_path_kernel, + count, 1, 1, + group_size, 1, 1, + 0, 0, 0, + CudaParams() + .add(surface->width) + .add(surface->height) + .add(mark_buffer) + .add(points_buffer) + .add(path.begin) + .add(path.end) + .add(bounds.minx) + .get_extra() ); + assert(!cu.err); + + count = bounds.maxy - bounds.miny; + group_size = 1; + + count = (count - 1)/group_size + 1; + cu.err = cuLaunchKernel( + contour_fill_kernel, + count, 1, 1, + group_size, 1, 1, + 0, 0, 0, + CudaParams() + .add(surface->width) + .add(mark_buffer) + .add(surface_image) + .add(path.color, 16) + .add(bounds, 16) + .get_extra() ); + assert(!cu.err); +} + +void CudaRender::wait() { + cu.err = cuStreamSynchronize(0); + assert(!cu.err); +} + diff --git a/c++/contourgl/cudarender.h b/c++/contourgl/cudarender.h new file mode 100644 index 0000000..7eae406 --- /dev/null +++ b/c++/contourgl/cudarender.h @@ -0,0 +1,66 @@ +/* + ......... 2018 Ivan Mahonin + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef _CUDARENDER_H_ +#define _CUDARENDER_H_ + +#include + +#include "cudacontext.h" +#include "geometry.h" +#include "contour.h" +#include "swrender.h" + + +class CudaRender { +public: + struct Path { + ContextRect bounds; + int begin; + int end; + Color color; + bool invert; + bool evenodd; + }; + +private: + CudaContext &cu; + CUmodule contour_module; + CUfunction contour_clear_kernel; + CUfunction contour_path_kernel; + CUfunction contour_fill_kernel; + + Surface *surface; + CUdeviceptr points_buffer; + CUdeviceptr mark_buffer; + CUdeviceptr surface_image; + +public: + CudaRender(CudaContext &cl); + ~CudaRender(); + + void send_surface(Surface *surface); + Surface* receive_surface(); + + void send_points(const vec2f *points, int count); + + void draw(const Path &path); + void wait(); +}; + + +#endif diff --git a/c++/contourgl/test.cpp b/c++/contourgl/test.cpp index fc75a99..e32b29c 100644 --- a/c++/contourgl/test.cpp +++ b/c++/contourgl/test.cpp @@ -26,6 +26,9 @@ #include "utils.h" #include "clrender.h" +#ifdef CUDA +#include "cudarender.h" +#endif using namespace std; @@ -444,3 +447,74 @@ void Test::test_cl3(Environment &e, Data &data, Surface &surface) { } clr.receive_surface(); } + +void Test::test_cu(Environment &e, Data &data, Surface &surface) { +#ifdef CUDA + // prepare data + vector paths; + vector points; + paths.reserve(data.size()); + for(Data::const_iterator i = data.begin(); i != data.end(); ++i) { + if (!i->contour.get_chunks().empty()) { + CudaRender::Path path = {}; + path.color = i->color; + path.invert = i->invert; + path.evenodd = i->evenodd; + + path.bounds.minx = path.bounds.maxx = (int)floor(i->contour.get_chunks().front().p1.x); + path.bounds.miny = path.bounds.maxy = (int)floor(i->contour.get_chunks().front().p1.y); + path.begin = (int)points.size(); + points.reserve(points.size() + i->contour.get_chunks().size() + 1); + for(Contour::ChunkList::const_iterator j = i->contour.get_chunks().begin(); j != i->contour.get_chunks().end(); ++j) { + int x = (int)floor(j->p1.x); + int y = (int)floor(j->p1.y); + if (path.bounds.minx > x) path.bounds.minx = x; + if (path.bounds.maxx < x) path.bounds.maxx = x; + if (path.bounds.miny > y) path.bounds.miny = y; + if (path.bounds.maxy < y) path.bounds.maxy = y; + points.push_back(vec2f(j->p1)); + } + path.end = (int)points.size(); + points.push_back( points[path.begin] ); + ++path.bounds.maxx; + ++path.bounds.maxy; + + paths.push_back(path); + } + } + + // draw + + CudaRender cur(e.cu); + + // warm-up + cur.send_surface(&surface); + cur.send_points(&points.front(), (int)points.size()); + for(int ii = 0; ii < 1000; ++ii) + for(vector::const_iterator i = paths.begin(); i != paths.end(); ++i) + cur.draw(*i); + cur.wait(); + + // measure + { + for(int ii = 0; ii < 1000; ++ii) { + Measure t("render", false, true); + for(vector::const_iterator i = paths.begin(); i != paths.end(); ++i) + cur.draw(*i); + cur.wait(); + } + } + cur.send_points(NULL, 0); + cur.send_surface(NULL); + + // actual task + cur.send_surface(&surface); + cur.send_points(&points.front(), (int)points.size()); + { + for(vector::const_iterator i = paths.begin(); i != paths.end(); ++i) + cur.draw(*i); + cur.wait(); + } + cur.receive_surface(); +#endif +} diff --git a/c++/contourgl/test.h b/c++/contourgl/test.h index 6d3d3dd..a96d7c8 100644 --- a/c++/contourgl/test.h +++ b/c++/contourgl/test.h @@ -56,6 +56,7 @@ public: static void test_cl(Environment &e, Data &data, Surface &surface); static void test_cl2(Environment &e, Data &data, Surface &surface); static void test_cl3(Environment &e, Data &data, Surface &surface); + static void test_cu(Environment &e, Data &data, Surface &surface); }; #endif