From 65d40ace236c71447712cd7baf876ea4e57e32f4 Mon Sep 17 00:00:00 2001 From: Pierre-Yves Chibon Date: Dec 14 2016 09:48:35 +0000 Subject: Let guess_encoding try all the possible encodings before making its choice This way, if by any chance the first one isn't the right one, we can still try the others. If none of the candidate encodings can decode the data, we can then properly bail by raising a PagureException and leave it up to the caller to handle the situation. --- diff --git a/pagure/lib/encoding_utils.py b/pagure/lib/encoding_utils.py index 4a9136a..267ef35 100644 --- a/pagure/lib/encoding_utils.py +++ b/pagure/lib/encoding_utils.py @@ -17,6 +17,8 @@ import logging from chardet import universaldetector +from pagure.exceptions import PagureException + _log = logging.getLogger(__name__) @@ -55,7 +57,14 @@ def guess_encoding(data): encodings, key=lambda guess: guess.confidence, reverse=True) _log.debug('Possible encodings: ' + str(sorted_encodings)) - return sorted_encodings[0].encoding + for encoding in sorted_encodings: + _log.debug('Trying encoding: %s', str(encoding)) + try: + data.decode(encoding.encoding) + return encoding.encoding + except UnicodeDecodeError: + pass + raise PagureException('No encoding could be guessed for this file') def detect_encodings(data): diff --git a/pagure/ui/issues.py b/pagure/ui/issues.py index b4a3c7f..43b5014 100644 --- a/pagure/ui/issues.py +++ b/pagure/ui/issues.py @@ -1175,8 +1175,12 @@ def view_issue_raw_file( headers['Content-Disposition'] = 'attachment' if mimetype.startswith('text/') and not encoding: - encoding = pagure.lib.encoding_utils.guess_encoding( - ktc.to_bytes(data)) + try: + encoding = pagure.lib.encoding_utils.guess_encoding( + ktc.to_bytes(data)) + except pagure.exceptions.PagureException: + # We cannot decode the file, so bail but warn the admins + LOG.exception('File could not be decoded') if encoding: mimetype += '; charset={encoding}'.format(encoding=encoding) diff --git a/pagure/ui/repo.py b/pagure/ui/repo.py index 096aa97..e36f5a6 100644 --- a/pagure/ui/repo.py 
+++ b/pagure/ui/repo.py @@ -533,7 +533,13 @@ def view_file(repo, identifier, filename, username=None, namespace=None): content, safe = pagure.doc_utils.convert_readme(content.data, ext) output_type = 'markup' elif not is_binary_string(content.data): - file_content = encoding_utils.decode(ktc.to_bytes(content.data)) + try: + file_content = encoding_utils.decode(ktc.to_bytes(content.data)) + except pagure.exceptions.PagureException: + # We cannot decode the file, so let's pretend it's a binary + # file and let the user download it instead of displaying + # it. + output_type = 'binary' try: lexer = guess_lexer_for_filename( filename, @@ -670,7 +676,11 @@ def view_raw_file( headers['Content-Disposition'] = 'attachment' if mimetype.startswith('text/') and not encoding: - encoding = encoding_utils.guess_encoding(ktc.to_bytes(data)) + try: + encoding = encoding_utils.guess_encoding(ktc.to_bytes(data)) + except pagure.exceptions.PagureException: + # We cannot decode the file, so bail but warn the admins + LOG.exception('File could not be decoded') if encoding: mimetype += '; charset={encoding}'.format(encoding=encoding) @@ -708,7 +718,13 @@ def view_blame_file(repo, filename, username=None, namespace=None): if is_binary_string(content.data): flask.abort(400, 'Binary files cannot be blamed') - content = encoding_utils.decode(content.data) + try: + content = encoding_utils.decode(content.data) + except pagure.exceptions.PagureException: + # We cannot decode the file, so bail but warn the admins + LOG.exception('File could not be decoded') + flask.abort(400, 'File could not be decoded') + blame = repo_obj.blame(filename) return flask.render_template(