Blame pagure/lib/encoding_utils.py

Jeremy Cline e62ce0
# -*- coding: utf-8 -*-
Jeremy Cline e62ce0
"""
Jeremy Cline e62ce0
(c) 2016 - Copyright Red Hat Inc
Jeremy Cline e62ce0
Jeremy Cline e62ce0
Authors:
Jeremy Cline e62ce0
    Jeremy Cline <jeremy@jcline.org>
Jeremy Cline e62ce0
Jeremy Cline e62ce0
This module contains utilities to deal with character encoding. Git blobs are
Jeremy Cline e62ce0
just binary data and do not have a character encoding associated with them, so
Jeremy Cline e62ce0
the repetitive task of identifying the character encoding and decoding the
Jeremy Cline e62ce0
content to unicode is implemented here.
Jeremy Cline e62ce0
"""
Jeremy Cline e62ce0
Jeremy Cline e62ce0
from __future__ import unicode_literals, division, absolute_import
Jeremy Cline e62ce0
from collections import namedtuple
Jeremy Cline e62ce0
import logging
Jeremy Cline e62ce0
Pierre-Yves Chibon 8ac14c
from chardet import universaldetector, __version__ as ch_version
Jeremy Cline e62ce0
Pierre-Yves Chibon 867097
from pagure.exceptions import PagureEncodingException
Pierre-Yves Chibon 65d40a
Jeremy Cline e62ce0
Jeremy Cline e62ce0
# Module-level logger, named after this module.
_log = logging.getLogger(__name__)
Jeremy Cline e62ce0
Pierre-Yves Chibon 9c2953
# A candidate encoding name paired with the confidence attached to it.
Guess = namedtuple("Guess", ["encoding", "confidence"])
Jeremy Cline e62ce0
Jeremy Cline e62ce0
Pierre-Yves Chibon 29a8fc
def detect_encodings(data):
    """
    Analyze the provided data for possible character encodings.

    This simply wraps chardet and extracts all the potential encodings it
    considered before deciding on a particular result.

    :param data: An array of bytes to treat as text data
    :type  data: bytes
    :return: A dictionary mapping possible encodings to confidence levels.
        The encoding reported by chardet may be ``None`` when it could not
        settle on a result, in which case ``None`` appears as a key.
    :rtype:  dict

    """
    if not data:
        # It's an empty string so we can safely say it's ascii
        return {"ascii": 1.0}

    # We can't use ``chardet.detect`` because we want to dig in the internals
    # of the detector to bias the utf-8 result.
    detector = universaldetector.UniversalDetector()
    detector.reset()
    detector.feed(data)
    result = detector.close()
    if not result:
        return {"utf-8": 1.0}
    encodings = {result["encoding"]: result["confidence"]}

    # The list of probers is a private attribute whose name changed between
    # chardet releases. Look for the attribute itself rather than comparing
    # the version string, so that releases newer than 3.x keep working
    # instead of falling through to the legacy attribute name.
    if hasattr(detector, "_charset_probers"):
        for prober in detector._charset_probers:
            if hasattr(prober, "probers"):
                # Group prober: record each of the probers it wraps.
                candidates = prober.probers
            else:
                candidates = [prober]
            for candidate in candidates:
                encodings[candidate.charset_name] = candidate.get_confidence()
    else:
        # Legacy chardet API.
        for prober in getattr(detector, "_mCharSetProbers", ()):
            if prober:
                encodings[prober.get_charset_name()] = prober.get_confidence()

    return encodings
Pierre-Yves Chibon 29a8fc
Pierre-Yves Chibon 2eba27
Pierre-Yves Chibon 2eba27
def guess_encodings(data):
    """
    List all the possible encodings found for the given data.

    This uses chardet to guess the encoding, but biases the results towards
    UTF-8. There are cases where chardet cannot know the encoding and
    therefore is occasionally wrong. In those cases it was decided that it
    would be better to err on the side of UTF-8 rather than ISO-8859-*.
    However, it is important to be aware that this also guesses and _will_
    misclassify ISO-8859-* encoded text as UTF-8 in some cases.

    The discussion that led to this decision can be found at
    https://pagure.io/pagure/issue/891.

    :param data: An array of bytes to treat as text data
    :type  data: bytes
    :return: A list of ``Guess(encoding, confidence)`` tuples, sorted from
        the highest confidence to the lowest
    :rtype:  list

    """
    encodings = detect_encodings(data)

    # Boost utf-8 confidence to heavily skew on the side of utf-8. chardet
    # confidence is between 1.0 and 0 (inclusive), so this boost remains within
    # the expected range from chardet. This requires chardet to be very
    # unconfident in utf-8 and very confident in something else for utf-8 to
    # not be selected.
    if "utf-8" in encodings and encodings["utf-8"] > 0.0:
        encodings["utf-8"] = (encodings["utf-8"] + 2.0) / 3.0

    guesses = [
        Guess(encoding, confidence)
        for encoding, confidence in encodings.items()
    ]
    sorted_encodings = sorted(
        guesses, key=lambda guess: guess.confidence, reverse=True
    )

    # Pass the list as a logger argument so the message is only formatted
    # when debug logging is actually enabled.
    _log.debug("Possible encodings: %s", sorted_encodings)
    return sorted_encodings
Pierre-Yves Chibon 2eba27
Pierre-Yves Chibon 2eba27
Pierre-Yves Chibon 2eba27
def guess_encoding(data):
    """
    Attempt to guess the text encoding used for the given data.

    This uses chardet to guess the encoding, but biases the results towards
    UTF-8. There are cases where chardet cannot know the encoding and
    therefore is occasionally wrong. In those cases it was decided that it
    would be better to err on the side of UTF-8 rather than ISO-8859-*.
    However, it is important to be aware that this also guesses and _will_
    misclassify ISO-8859-* encoded text as UTF-8 in some cases.

    The discussion that led to this decision can be found at
    https://pagure.io/pagure/issue/891.

    :param data: An array of bytes to treat as text data
    :type  data: bytes
    :return: A string of the best encoding found
    :rtype: str
    :raises PagureEncodingException: if no encoding was found that the data
        could be decoded into

    """
    for guess in guess_encodings(data):
        _log.debug("Trying encoding: %s", guess)
        if guess.encoding is None:
            # No encoding name was attached to this candidate, so there is
            # nothing to try.
            continue
        try:
            data.decode(guess.encoding)
        except (UnicodeDecodeError, TypeError, LookupError):
            # UnicodeDecodeError: the data is not valid in that encoding.
            # TypeError: the encoding name is not a usable string.
            # LookupError: Python has no codec registered under that name,
            # so move on to the next candidate instead of crashing.
            continue
        return guess.encoding
    raise PagureEncodingException("No encoding could be guessed for this file")
Jeremy Cline e62ce0
Jeremy Cline e62ce0
Jeremy Cline e62ce0
def decode(data):
    """
    Decode the data using the encoding picked by ``guess_encoding``.

    :param data: An array of bytes to treat as text data
    :type  data: bytes

    :return: A unicode string that has been decoded using the encoding
             provided by ``guess_encoding``
    :rtype: unicode str
    """
    return data.decode(guess_encoding(data))