From e62ce05db5636f401d19b072bc99861c3781e9df Mon Sep 17 00:00:00 2001
From: Jeremy Cline
Date: Oct 18 2016 14:45:09 +0000
Subject: Add some encoding/decoding utilities for git files


Users have regularly noticed that some files are being returned with an
incorrect character encoding. This is because the files were not pure
ASCII but did not bear any definitive markers of the UTF family of
encodings. In those cases chardet tends to fall back to ISO-8859-2
(Latin-2) or Windows-1250.

This adds some helpful utilities that use chardet to guess the encoding,
but heavily skew the results in favor of UTF-8 if the encoding is in
doubt. It also modifies the raw file view and the highlighted code view
to use the new utilities.

Fixes https://pagure.io/pagure/issue/891

Signed-off-by: Jeremy Cline

---

diff --git a/pagure/lib/encoding_utils.py b/pagure/lib/encoding_utils.py
new file mode 100644
index 0000000..7f6d0a8
--- /dev/null
+++ b/pagure/lib/encoding_utils.py
@@ -0,0 +1,100 @@
+# -*- coding: utf-8 -*-
+"""
+(c) 2016 - Copyright Red Hat Inc
+
+Authors:
+    Jeremy Cline
+
+This module contains utilities to deal with character encoding. Git blobs are
+just binary data and do not have a character encoding associated with them, so
+the repetitive task of identifying the character encoding and decoding the
+content to unicode is implemented here.
+"""
+
+from __future__ import unicode_literals, division, absolute_import
+from collections import namedtuple
+import logging
+
+from chardet import universaldetector
+
+
+_log = logging.getLogger(__name__)
+
+Guess = namedtuple('Guess', ['encoding', 'confidence'])
+
+
+def guess_encoding(data):
+    """
+    Attempt to guess the text encoding used for the given data.
+
+    This uses chardet to guess the encoding, but biases the results towards
+    UTF-8. There are cases where chardet cannot know the encoding and
+    therefore is occasionally wrong. In those cases it was decided that it
+    would be better to err on the side of UTF-8 rather than ISO-8859-*.
+    However, it is important to be aware that this also guesses and _will_
+    misclassify ISO-8859-* encoded text as UTF-8 in some cases.
+
+    The discussion that led to this decision can be found at
+    https://pagure.io/pagure/issue/891.
+
+    :param data: An array of bytes to treat as text data
+    :type data: bytes
+    """
+    encodings = detect_encodings(data)
+
+    # Boost utf-8 confidence to skew heavily in favor of utf-8. chardet
+    # confidence is between 0.0 and 1.0 (inclusive), so this boost remains
+    # within the expected range from chardet. This requires chardet to be very
+    # unconfident in utf-8 and very confident in something else for utf-8 to
+    # not be selected.
+    if 'utf-8' in encodings and encodings['utf-8'] > 0.0:
+        encodings['utf-8'] = (encodings['utf-8'] + 2.0) / 3.0
+    encodings = [Guess(encoding, confidence)
+                 for encoding, confidence in encodings.items()]
+    sorted_encodings = sorted(
+        encodings, key=lambda guess: guess.confidence, reverse=True)
+
+    _log.debug('Possible encodings: ' + str(sorted_encodings))
+    return sorted_encodings[0].encoding
+
+
+def detect_encodings(data):
+    """
+    Analyze the provided data for possible character encodings.
+
+    This simply wraps chardet and extracts all the potential encodings it
+    considered before deciding on a particular result.
+
+    :param data: An array of bytes to treat as text data
+    :type data: bytes
+
+    :return: A dictionary mapping possible encodings to confidence levels
+    :rtype: dict
+    """
+    # We can't use ``chardet.detect`` because we want to dig into the
+    # internals of the detector to bias the utf-8 result.
+    detector = universaldetector.UniversalDetector()
+    detector.reset()
+    detector.feed(data)
+    result = detector.close()
+    encodings = {result['encoding']: result['confidence']}
+    for prober in detector._mCharSetProbers:
+        if prober:
+            encodings[prober.get_charset_name()] = prober.get_confidence()
+
+    return encodings
+
+
+def decode(data):
+    """
+    Guesses the encoding using ``guess_encoding`` and decodes the data.
+
+    :param data: An array of bytes to treat as text data
+    :type data: bytes
+
+    :return: A unicode string that has been decoded using the encoding
+        provided by ``guess_encoding``
+    :rtype: unicode str
+    """
+    encoding = guess_encoding(data)
+    return data.decode(encoding)
diff --git a/pagure/ui/issues.py b/pagure/ui/issues.py
index 3ea48a8..39f50d4 100644
--- a/pagure/ui/issues.py
+++ b/pagure/ui/issues.py
@@ -24,7 +24,6 @@ import pygit2
 import werkzeug.datastructures
 from sqlalchemy.exc import SQLAlchemyError
 
-import chardet
 import kitchen.text.converters as ktc
 import mimetypes
 
@@ -1104,7 +1103,7 @@ def view_issue_raw_file(
         headers['Content-Disposition'] = 'attachment'
 
     if mimetype.startswith('text/') and not encoding:
-        encoding = chardet.detect(ktc.to_bytes(data))['encoding']
+        encoding = pagure.lib.encoding_utils.guess_encoding(ktc.to_bytes(data))
 
     if encoding:
         mimetype += '; charset={encoding}'.format(encoding=encoding)
diff --git a/pagure/ui/repo.py b/pagure/ui/repo.py
index e718616..2d6c4d1 100644
--- a/pagure/ui/repo.py
+++ b/pagure/ui/repo.py
@@ -37,7 +37,6 @@ from pygments.util import ClassNotFound
 from sqlalchemy.exc import SQLAlchemyError
 
 import mimetypes
-import chardet
 
 from binaryornot.helpers import is_binary_string
 
@@ -50,6 +49,7 @@ import pagure
 import pagure.ui.plugins
 from pagure import (APP, SESSION, LOG, __get_file_in_tree, login_required,
                     admin_session_timedout)
+from pagure.lib import encoding_utils
 
 
 @APP.route('/<repo>.git')
@@ -512,8 +512,7 @@ def view_file(repo, identifier, filename, username=None, namespace=None):
             content, safe = pagure.doc_utils.convert_readme(content.data, ext)
             output_type = 'markup'
         elif not is_binary_string(content.data):
-            encoding = chardet.detect(ktc.to_bytes(content.data))['encoding']
-            file_content = content.data
+            file_content = encoding_utils.decode(ktc.to_bytes(content.data))
             try:
                 lexer = guess_lexer_for_filename(
                     filename,
@@ -648,7 +647,7 @@ def view_raw_file(
         headers['Content-Disposition'] = 'attachment'
 
     if mimetype.startswith('text/') and not encoding:
-        encoding = chardet.detect(ktc.to_bytes(data))['encoding']
+        encoding = encoding_utils.guess_encoding(ktc.to_bytes(data))
 
     if encoding:
         mimetype += '; charset={encoding}'.format(encoding=encoding)
diff --git a/tests/test_pagure_lib_encoding_utils.py b/tests/test_pagure_lib_encoding_utils.py
new file mode 100644
index 0000000..0060595
--- /dev/null
+++ b/tests/test_pagure_lib_encoding_utils.py
@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for :module:`pagure.lib.encoding_utils`.
+"""
+
+import chardet
+import unittest
+
+from pagure.lib import encoding_utils
+
+
+class TestGuessEncoding(unittest.TestCase):
+
+    def test_guess_encoding_ascii(self):
+        """
+        Assert that when ascii-only data is provided, ascii is the guessed encoding.
+        """
+        data = u'Twas bryllyg, and the slythy toves did gyre and gymble'
+        result = encoding_utils.guess_encoding(data.encode('ascii'))
+        self.assertEqual(result, 'ascii')
+
+    def test_guess_encoding_favor_utf_8(self):
+        """
+        Test that strings that could be UTF-8 or ISO-8859-2 result in UTF-8.
+        """
+        data = u'Šabata'.encode('utf-8')
+        result = encoding_utils.guess_encoding(data)
+        chardet_result = chardet.detect(data)
+        self.assertEqual(result, 'utf-8')
+        self.assertEqual(chardet_result['encoding'], 'ISO-8859-2')
+
+    def test_decode(self):
+        data = u'Šabata'
+        self.assertEqual(data, encoding_utils.decode(data.encode('utf-8')))
+
+
+if __name__ == '__main__':
+    unittest.main()
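
For reference, the snippet below is a minimal sketch (not part of the patch) of how the new helpers are expected to be used, based on the module and tests above. The module path and function names come from the patch itself; the standalone script and its sample strings are illustrative only and assume pagure and chardet are importable.

    # -*- coding: utf-8 -*-
    # Minimal usage sketch for pagure.lib.encoding_utils (illustrative only).
    from __future__ import print_function

    from pagure.lib import encoding_utils

    # Bytes that are valid UTF-8 but that plain chardet reports as ISO-8859-2;
    # the UTF-8 confidence boost makes guess_encoding() prefer utf-8 here.
    ambiguous = u'Šabata'.encode('utf-8')
    print(encoding_utils.guess_encoding(ambiguous))   # expected: 'utf-8'
    print(encoding_utils.decode(ambiguous))           # expected: u'Šabata'

    # Pure ASCII input is still reported as ascii.
    print(encoding_utils.guess_encoding(b'plain old ascii'))  # expected: 'ascii'

    # detect_encodings() exposes every candidate encoding with its confidence.
    print(encoding_utils.detect_encodings(ambiguous))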