diff --git a/pagure/lib/encoding_utils.py b/pagure/lib/encoding_utils.py
new file mode 100644
index 0000000..7f6d0a8
--- /dev/null
+++ b/pagure/lib/encoding_utils.py
@@ -0,0 +1,100 @@
+# -*- coding: utf-8 -*-
+"""
+(c) 2016 - Copyright Red Hat Inc
+
+Authors:
+  Jeremy Cline
+
+This module contains utilities to deal with character encoding. Git blobs are
+just binary data and do not have a character encoding associated with them, so
+the repetitive task of identifying the character encoding and decoding the
+content to unicode is implemented here.
+"""
+
+from __future__ import unicode_literals, division, absolute_import
+from collections import namedtuple
+import logging
+
+from chardet import universaldetector
+
+
+_log = logging.getLogger(__name__)
+
+Guess = namedtuple('Guess', ['encoding', 'confidence'])
+
+
+def guess_encoding(data):
+    """
+    Attempt to guess the text encoding used for the given data.
+
+    This uses chardet to guess the encoding, but biases the result towards
+    UTF-8. There are cases where chardet cannot know the encoding and is
+    therefore occasionally wrong. In those cases it was decided that it would
+    be better to err on the side of UTF-8 rather than ISO-8859-*. However, be
+    aware that this is still a guess and it _will_ misclassify ISO-8859-*
+    encoded text as UTF-8 in some cases.
+
+    The discussion that led to this decision can be found at
+    https://pagure.io/pagure/issue/891.
+
+    :param data: An array of bytes to treat as text data
+    :type data: bytes
+    """
+    encodings = detect_encodings(data)
+
+    # Boost the UTF-8 confidence to heavily skew the result towards UTF-8.
+    # chardet confidence is between 0 and 1.0 (inclusive), so this boost
+    # keeps the value within the range chardet uses. chardet has to be very
+    # unconfident in UTF-8 and very confident in something else for UTF-8 to
+    # not be selected.
+    if 'utf-8' in encodings and encodings['utf-8'] > 0.0:
+        encodings['utf-8'] = (encodings['utf-8'] + 2.0) / 3.0
+    encodings = [Guess(encoding, confidence)
+                 for encoding, confidence in encodings.items()]
+    sorted_encodings = sorted(
+        encodings, key=lambda guess: guess.confidence, reverse=True)
+
+    _log.debug('Possible encodings: ' + str(sorted_encodings))
+    return sorted_encodings[0].encoding
+
+
+def detect_encodings(data):
+    """
+    Analyze the provided data for possible character encodings.
+
+    This simply wraps chardet and extracts all the potential encodings it
+    considered before deciding on a particular result.
+
+    :param data: An array of bytes to treat as text data
+    :type data: bytes
+
+    :return: A dictionary mapping possible encodings to confidence levels
+    :rtype: dict
+    """
+    # We can't use ``chardet.detect`` because we want to dig into the
+    # internals of the detector to bias the UTF-8 result.
+    detector = universaldetector.UniversalDetector()
+    detector.reset()
+    detector.feed(data)
+    result = detector.close()
+    encodings = {result['encoding']: result['confidence']}
+    for prober in detector._mCharSetProbers:
+        if prober:
+            encodings[prober.get_charset_name()] = prober.get_confidence()
+
+    return encodings
+
+
+def decode(data):
+    """
+    Guesses the encoding using ``guess_encoding`` and decodes the data.
+
+    :param data: An array of bytes to treat as text data
+    :type data: bytes
+
+    :return: A unicode string that has been decoded using the encoding
+        provided by ``guess_encoding``
+    :rtype: unicode str
+    """
+    encoding = guess_encoding(data)
+    return data.decode(encoding)
diff --git a/pagure/ui/issues.py b/pagure/ui/issues.py
index 3ea48a8..39f50d4 100644
--- a/pagure/ui/issues.py
+++ b/pagure/ui/issues.py
@@ -24,7 +24,6 @@ import pygit2
 import werkzeug.datastructures
 from sqlalchemy.exc import SQLAlchemyError
 
-import chardet
 import kitchen.text.converters as ktc
 import mimetypes
 
@@ -1104,7 +1103,7 @@ def view_issue_raw_file(
         headers['Content-Disposition'] = 'attachment'
 
     if mimetype.startswith('text/') and not encoding:
-        encoding = chardet.detect(ktc.to_bytes(data))['encoding']
+        encoding = pagure.lib.encoding_utils.guess_encoding(ktc.to_bytes(data))
 
     if encoding:
         mimetype += '; charset={encoding}'.format(encoding=encoding)
diff --git a/pagure/ui/repo.py b/pagure/ui/repo.py
index e718616..2d6c4d1 100644
--- a/pagure/ui/repo.py
+++ b/pagure/ui/repo.py
@@ -37,7 +37,6 @@ from pygments.util import ClassNotFound
 from sqlalchemy.exc import SQLAlchemyError
 
 import mimetypes
-import chardet
 from binaryornot.helpers import is_binary_string
 
 
@@ -50,6 +49,7 @@ import pagure
 import pagure.ui.plugins
 from pagure import (APP, SESSION, LOG, __get_file_in_tree, login_required,
                     admin_session_timedout)
+from pagure.lib import encoding_utils
 
 
 @APP.route('/<repo>.git')
@@ -512,8 +512,7 @@ def view_file(repo, identifier, filename, username=None, namespace=None):
         content, safe = pagure.doc_utils.convert_readme(content.data, ext)
         output_type = 'markup'
     elif not is_binary_string(content.data):
-        encoding = chardet.detect(ktc.to_bytes(content.data))['encoding']
-        file_content = content.data
+        file_content = encoding_utils.decode(ktc.to_bytes(content.data))
         try:
             lexer = guess_lexer_for_filename(
                 filename,
@@ -648,7 +647,7 @@ def view_raw_file(
         headers['Content-Disposition'] = 'attachment'
 
     if mimetype.startswith('text/') and not encoding:
-        encoding = chardet.detect(ktc.to_bytes(data))['encoding']
+        encoding = encoding_utils.guess_encoding(ktc.to_bytes(data))
 
     if encoding:
         mimetype += '; charset={encoding}'.format(encoding=encoding)
diff --git a/tests/test_pagure_lib_encoding_utils.py b/tests/test_pagure_lib_encoding_utils.py
new file mode 100644
index 0000000..0060595
--- /dev/null
+++ b/tests/test_pagure_lib_encoding_utils.py
@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for :module:`pagure.lib.encoding_utils`.
+"""
+
+import chardet
+import unittest
+
+from pagure.lib import encoding_utils
+
+
+class TestGuessEncoding(unittest.TestCase):
+
+    def test_guess_encoding_ascii(self):
+        """
+        Assert ascii is the guessed encoding when ascii-only data is provided.
+        """
+        data = u'Twas bryllyg, and the slythy toves did gyre and gymble'
+        result = encoding_utils.guess_encoding(data.encode('ascii'))
+        self.assertEqual(result, 'ascii')
+
+    def test_guess_encoding_favor_utf_8(self):
+        """
+        Test that strings that could be UTF-8 or ISO-8859-2 result in UTF-8.
+        """
+        data = u'Šabata'.encode('utf-8')
+        result = encoding_utils.guess_encoding(data)
+        chardet_result = chardet.detect(data)
+        self.assertEqual(result, 'utf-8')
+        self.assertEqual(chardet_result['encoding'], 'ISO-8859-2')
+
+    def test_decode(self):
+        data = u'Šabata'
+        self.assertEqual(data, encoding_utils.decode(data.encode('utf-8')))
+
+
+if __name__ == '__main__':
+    unittest.main()
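
A rough interactive sketch of how the new helpers are expected to behave,
mirroring the assertions in the test file above and assuming chardet resolves
the sample bytes the same way there as it does here:

    >>> import chardet
    >>> from pagure.lib import encoding_utils
    >>> data = u'Šabata'.encode('utf-8')
    >>> chardet.detect(data)['encoding']     # chardet alone leans towards ISO-8859-2
    'ISO-8859-2'
    >>> encoding_utils.guess_encoding(data)  # the confidence boost tips the guess to utf-8
    'utf-8'
    >>> encoding_utils.decode(data)          # decode() is guess_encoding() plus bytes.decode()
    u'Šabata'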