diff --git a/pagure/lib/encoding_utils.py b/pagure/lib/encoding_utils.py index 00c47bb..60c1493 100644 --- a/pagure/lib/encoding_utils.py +++ b/pagure/lib/encoding_utils.py @@ -15,7 +15,7 @@ from __future__ import unicode_literals, division, absolute_import from collections import namedtuple import logging -from chardet import universaldetector +from chardet import universaldetector, __version__ as ch_version from pagure.exceptions import PagureEncodingException @@ -51,9 +51,17 @@ def detect_encodings(data): if not result: return {'utf-8': 1.0} encodings = {result['encoding']: result['confidence']} - for prober in detector._mCharSetProbers: - if prober: - encodings[prober.get_charset_name()] = prober.get_confidence() + if int(ch_version.split('.')[0]) >= 3: + for prober in detector._charset_probers: + if hasattr(prober, 'probers'): + for prober in prober.probers: + encodings[prober.charset_name] = prober.get_confidence() + else: + encodings[prober.charset_name] = prober.get_confidence() + else: + for prober in detector._mCharSetProbers: + if prober: + encodings[prober.get_charset_name()] = prober.get_confidence() return encodings diff --git a/requirements.txt b/requirements.txt index 217112b..551256f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ binaryornot < 0.4.3 bleach blinker celery -chardet < 3.0.0 +chardet docutils enum34 flask diff --git a/tests/test_pagure_lib_encoding_utils.py b/tests/test_pagure_lib_encoding_utils.py index 67fdb68..5c3d07d 100644 --- a/tests/test_pagure_lib_encoding_utils.py +++ b/tests/test_pagure_lib_encoding_utils.py @@ -26,13 +26,19 @@ class TestGuessEncoding(unittest.TestCase): def test_guess_encoding_favor_utf_8(self): """ - Test that strings that could be UTF-8 or ISO-8859-2 result in UTF-8. + Test that strings that could be UTF-8 or ISO-8859-* result in UTF-8. 
+ + python-chardet-3.0.4-2.fc27.noarch detects it as ISO-8859-9 + python-chardet-2.2.1-1.el7_1.noarch detects it as ISO-8859-2 """ data = u'Šabata'.encode('utf-8') result = encoding_utils.guess_encoding(data) chardet_result = chardet.detect(data) self.assertEqual(result, 'utf-8') - self.assertEqual(chardet_result['encoding'], 'ISO-8859-2') + if chardet.__version__[0] == '3': + self.assertEqual(chardet_result['encoding'], 'ISO-8859-9') + else: + self.assertEqual(chardet_result['encoding'], 'ISO-8859-2') def test_guess_encoding_no_data(self): """ Test encoding_utils.guess_encoding() with an empty string """ @@ -47,10 +53,20 @@ class TestGuessEncodings(unittest.TestCase): data = u'Šabata'.encode('utf-8') result = encoding_utils.guess_encodings(data) chardet_result = chardet.detect(data) - self.assertEqual( - [encoding.encoding for encoding in result], - ['utf-8', 'ISO-8859-2', 'windows-1252']) - self.assertEqual(chardet_result['encoding'], 'ISO-8859-2') + if chardet.__version__[0] == '3': + self.assertEqual( + [encoding.encoding for encoding in result], + ['utf-8', 'ISO-8859-9', 'ISO-8859-1', 'MacCyrillic', + 'IBM866', 'TIS-620', 'EUC-JP', 'EUC-KR', 'GB2312', 'KOI8-R', + 'Big5', 'IBM855', 'ISO-8859-7', 'SHIFT_JIS', 'windows-1253', + 'CP949', 'EUC-TW', 'ISO-8859-5', 'windows-1251', + 'windows-1255']) + self.assertEqual(chardet_result['encoding'], 'ISO-8859-9') + else: + self.assertEqual( + [encoding.encoding for encoding in result], + ['utf-8', 'ISO-8859-2', 'windows-1252']) + self.assertEqual(chardet_result['encoding'], 'ISO-8859-2') def test_guess_encodings_no_data(self): """ Test encoding_utils.guess_encodings() with an emtpy string """