# -*- coding: utf-8 -*-
"""
(c) 2016 - Copyright Red Hat Inc
Authors:
Jeremy Cline <jeremy@jcline.org>
This module contains utilities to deal with character encoding. Git blobs are
just binary data and do not have a character encoding associated with them, so
the repetitive task of identifying the character encoding and decoding the
content to unicode is implemented here.
"""
from __future__ import unicode_literals, division, absolute_import
from collections import namedtuple
import logging
from chardet import universaldetector, __version__ as ch_version
from pagure.exceptions import PagureEncodingException
_log = logging.getLogger(__name__)
Guess = namedtuple("Guess", ["encoding", "confidence"])
def detect_encodings(data):
    """
    Analyze the provided data for possible character encodings.

    This simply wraps chardet and extracts all the potential encodings it
    considered before deciding on a particular result.

    :param data: An array of bytes to treat as text data
    :type data: bytes
    :return: A dictionary mapping possible encodings to confidence levels
    :rtype: dict
    """
    if not data:
        # It's an empty string so we can safely say it's ascii
        return {"ascii": 1.0}

    # We can't use ``chardet.detect`` because we want to dig in the internals
    # of the detector to bias the utf-8 result.
    detector = universaldetector.UniversalDetector()
    detector.reset()
    detector.feed(data)
    result = detector.close()
    if not result:
        return {"utf-8": 1.0}

    # Start with the detector's own pick, then add every prober it consulted.
    encodings = {result["encoding"]: result["confidence"]}
    if ch_version[0] == "3":
        # chardet 3.x: some probers are group probers holding sub-probers one
        # level deep. Use a distinct name for the inner loop so it does not
        # shadow the outer loop variable.
        for prober in detector._charset_probers:
            if hasattr(prober, "probers"):
                for sub_prober in prober.probers:
                    encodings[sub_prober.charset_name] = (
                        sub_prober.get_confidence()
                    )
            else:
                encodings[prober.charset_name] = prober.get_confidence()
    else:
        # chardet 2.x: flat prober list (mixedCase API) that may contain None.
        for prober in detector._mCharSetProbers:
            if prober:
                encodings[prober.get_charset_name()] = prober.get_confidence()
    return encodings
def guess_encodings(data):
    """
    List all the possible encodings found for the given data.

    This uses chardet to guess the encoding, but biases the results towards
    UTF-8. There are cases where chardet cannot know the encoding and
    therefore is occasionally wrong. In those cases it was decided that it
    would be better to err on the side of UTF-8 rather than ISO-8859-*.
    However, it is important to be aware that this also guesses and _will_
    misclassify ISO-8859-* encoded text as UTF-8 in some cases.

    The discussion that lead to this decision can be found at
    https://pagure.io/pagure/issue/891.

    :param data: An array of bytes to treat as text data
    :type data: bytes
    :return: A list of ``Guess`` namedtuples sorted by descending confidence
    :rtype: list
    """
    encodings = detect_encodings(data)

    # Boost utf-8 confidence to heavily skew on the side of utf-8. chardet
    # confidence is between 1.0 and 0 (inclusive), so this boost remains within
    # the expected range from chardet. This requires chardet to be very
    # unconfident in utf-8 and very confident in something else for utf-8 to
    # not be selected.
    if "utf-8" in encodings and encodings["utf-8"] > 0.0:
        encodings["utf-8"] = (encodings["utf-8"] + 2.0) / 3.0

    guesses = [
        Guess(encoding, confidence)
        for encoding, confidence in encodings.items()
    ]
    sorted_encodings = sorted(
        guesses, key=lambda guess: guess.confidence, reverse=True
    )
    # Lazy %-args: only formatted if debug logging is actually enabled.
    _log.debug("Possible encodings: %s", sorted_encodings)
    return sorted_encodings
def guess_encoding(data):
    """
    Attempt to guess the text encoding used for the given data.

    This uses chardet to guess the encoding, but biases the results towards
    UTF-8. There are cases where chardet cannot know the encoding and
    therefore is occasionally wrong. In those cases it was decided that it
    would be better to err on the side of UTF-8 rather than ISO-8859-*.
    However, it is important to be aware that this also guesses and _will_
    misclassify ISO-8859-* encoded text as UTF-8 in some cases.

    The discussion that lead to this decision can be found at
    https://pagure.io/pagure/issue/891.

    :param data: An array of bytes to treat as text data
    :type data: bytes
    :return: A string of the best encoding found
    :rtype: str
    :raises PagureException: if no encoding was found that the data could
        be decoded into
    """
    for guess in guess_encodings(data):
        _log.debug("Trying encoding: %s", guess)
        try:
            data.decode(guess.encoding)
        except (UnicodeDecodeError, TypeError):
            # UnicodeDecodeError: the data is not valid in this encoding.
            # TypeError: guess.encoding was None.
            continue
        return guess.encoding
    raise PagureEncodingException("No encoding could be guessed for this file")
def decode(data):
    """
    Guesses the encoding using ``guess_encoding`` and decodes the data.

    :param data: An array of bytes to treat as text data
    :type data: bytes
    :return: A unicode string that has been decoded using the encoding
        provided by ``guess_encoding``
    :rtype: unicode str
    """
    return data.decode(guess_encoding(data))