|
Jeremy Cline |
e62ce0 |
# -*- coding: utf-8 -*-
|
|
Jeremy Cline |
e62ce0 |
"""
|
|
Jeremy Cline |
e62ce0 |
(c) 2016 - Copyright Red Hat Inc
|
|
Jeremy Cline |
e62ce0 |
|
|
Jeremy Cline |
e62ce0 |
Authors:
|
|
Jeremy Cline |
e62ce0 |
Jeremy Cline <jeremy@jcline.org>
|
|
Jeremy Cline |
e62ce0 |
|
|
Jeremy Cline |
e62ce0 |
This module contains utilities to deal with character encoding. Git blobs are
|
|
Jeremy Cline |
e62ce0 |
just binary data and do not have a character encoding associated with them, so
|
|
Jeremy Cline |
e62ce0 |
the repetitive task of identifying the character encoding and decoding the
|
|
Jeremy Cline |
e62ce0 |
content to unicode is implemented here.
|
|
Jeremy Cline |
e62ce0 |
"""
|
|
Jeremy Cline |
e62ce0 |
|
|
Jeremy Cline |
e62ce0 |
from __future__ import unicode_literals, division, absolute_import
|
|
Jeremy Cline |
e62ce0 |
from collections import namedtuple
|
|
Jeremy Cline |
e62ce0 |
import logging
|
|
Jeremy Cline |
e62ce0 |
|
|
Pierre-Yves Chibon |
8ac14c |
from chardet import universaldetector, __version__ as ch_version
|
|
Jeremy Cline |
e62ce0 |
|
|
Pierre-Yves Chibon |
867097 |
from pagure.exceptions import PagureEncodingException
|
|
Pierre-Yves Chibon |
65d40a |
|
|
Jeremy Cline |
e62ce0 |
|
|
Jeremy Cline |
e62ce0 |
_log = logging.getLogger(__name__)
|
|
Jeremy Cline |
e62ce0 |
|
|
Pierre-Yves Chibon |
9c2953 |
Guess = namedtuple("Guess", ["encoding", "confidence"])
|
|
Jeremy Cline |
e62ce0 |
|
|
Jeremy Cline |
e62ce0 |
|
|
Pierre-Yves Chibon |
29a8fc |
def detect_encodings(data):
    """
    Analyze the provided data for possible character encodings.

    This simply wraps chardet and extracts all the potential encodings it
    considered before deciding on a particular result.

    :param data: An array of bytes to treat as text data
    :type data: bytes
    :return: A dictionary mapping possible encodings to confidence levels
    :rtype: dict

    """
    if not data:
        # It's an empty string so we can safely say it's ascii
        return {"ascii": 1.0}

    # We can't use ``chardet.detect`` because we want to dig in the internals
    # of the detector to bias the utf-8 result.
    detector = universaldetector.UniversalDetector()
    detector.reset()
    detector.feed(data)
    result = detector.close()
    if not result:
        return {"utf-8": 1.0}

    # Start from chardet's own verdict, then add every candidate the
    # individual probers considered.
    encodings = {result["encoding"]: result["confidence"]}

    # Look at the attribute the detector actually exposes rather than at the
    # chardet version string: comparing the major version to "3" routed newer
    # chardet releases to the legacy attribute, which they do not have.
    if hasattr(detector, "_charset_probers"):
        for prober in detector._charset_probers:
            # Group probers carry their sub-probers in ``probers``; plain
            # probers are recorded directly.
            for candidate in getattr(prober, "probers", [prober]):
                encodings[candidate.charset_name] = candidate.get_confidence()
    else:
        # Legacy chardet API; entries of this list may be empty, skip those.
        for prober in getattr(detector, "_mCharSetProbers", ()):
            if prober:
                encodings[prober.get_charset_name()] = prober.get_confidence()

    return encodings
|
|
Pierre-Yves Chibon |
29a8fc |
|
|
Pierre-Yves Chibon |
2eba27 |
|
|
Pierre-Yves Chibon |
2eba27 |
def guess_encodings(data):
    """
    List all the possible encodings found for the given data.

    This uses chardet to guess the encoding, but biases the results towards
    UTF-8. There are cases where chardet cannot know the encoding and
    therefore is occasionally wrong. In those cases it was decided that it
    would be better to err on the side of UTF-8 rather than ISO-8859-*.
    However, it is important to be aware that this also guesses and _will_
    misclassify ISO-8859-* encoded text as UTF-8 in some cases.

    The discussion that led to this decision can be found at
    https://pagure.io/pagure/issue/891.

    :param data: An array of bytes to treat as text data
    :type data: bytes
    :return: A list of ``Guess`` named tuples (``encoding``, ``confidence``)
        ordered from the highest to the lowest confidence
    :rtype: list

    """
    confidences = detect_encodings(data)

    # Boost utf-8 confidence to heavily skew on the side of utf-8. chardet
    # confidence is between 1.0 and 0 (inclusive), so this boost remains within
    # the expected range from chardet. This requires chardet to be very
    # unconfident in utf-8 and very confident in something else for utf-8 to
    # not be selected.
    if "utf-8" in confidences and confidences["utf-8"] > 0.0:
        confidences["utf-8"] = (confidences["utf-8"] + 2.0) / 3.0

    guesses = [
        Guess(encoding, confidence)
        for encoding, confidence in confidences.items()
    ]
    sorted_encodings = sorted(
        guesses, key=lambda guess: guess.confidence, reverse=True
    )

    # Pass the list as a logging argument so it is only formatted when debug
    # logging is enabled.
    _log.debug("Possible encodings: %s", sorted_encodings)
    return sorted_encodings
|
|
Pierre-Yves Chibon |
2eba27 |
|
|
Pierre-Yves Chibon |
2eba27 |
|
|
Pierre-Yves Chibon |
2eba27 |
def guess_encoding(data):
    """
    Attempt to guess the text encoding used for the given data.

    This uses chardet to guess the encoding, but biases the results towards
    UTF-8. There are cases where chardet cannot know the encoding and
    therefore is occasionally wrong. In those cases it was decided that it
    would be better to err on the side of UTF-8 rather than ISO-8859-*.
    However, it is important to be aware that this also guesses and _will_
    misclassify ISO-8859-* encoded text as UTF-8 in some cases.

    The discussion that led to this decision can be found at
    https://pagure.io/pagure/issue/891.

    :param data: An array of bytes to treat as text data
    :type data: bytes
    :return: A string of the best encoding found
    :rtype: str
    :raises PagureEncodingException: if no encoding was found that the data
        could be decoded into

    """
    for guess in guess_encodings(data):
        _log.debug("Trying encoding: %s", guess)
        try:
            data.decode(guess.encoding)
        except (UnicodeDecodeError, LookupError, TypeError):
            # UnicodeDecodeError: the data is not valid in that encoding.
            # LookupError: the reported encoding name is not a codec known to
            # Python, so move on to the next candidate instead of crashing.
            # TypeError: the guess carried ``None`` as its encoding.
            continue
        return guess.encoding

    raise PagureEncodingException("No encoding could be guessed for this file")
|
|
Jeremy Cline |
e62ce0 |
|
|
Jeremy Cline |
e62ce0 |
|
|
Jeremy Cline |
e62ce0 |
def decode(data):
    """
    Decode the data to text using the encoding ``guess_encoding`` selects.

    :param data: An array of bytes to treat as text data
    :type data: bytes

    :return: A unicode string that has been decoded using the encoding
        provided by ``guess_encoding``
    :rtype: unicode str
    """
    return data.decode(guess_encoding(data))
|