From 5f76d9285c9fef1b068a5132fe9f09cd477d54db Mon Sep 17 00:00:00 2001 From: Philipp Wolfer Date: Tue, 30 Apr 2024 07:38:02 +0200 Subject: [PATCH] Always detect_file_encoding ASCII as UTF-8 This ensures partially analyzed documents that are detected as ASCII but contain UTF-8 encoded characters later in the file still get loaded. Also it standardizes charset detection between chardet and charset_normalizer, which previously differed on the \00 test case. --- picard/util/__init__.py | 9 +++++++-- test/test_utils.py | 4 +++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/picard/util/__init__.py b/picard/util/__init__.py index 377a54f1c..9a2bfb0a8 100644 --- a/picard/util/__init__.py +++ b/picard/util/__init__.py @@ -1208,6 +1208,11 @@ def detect_file_encoding(path, max_bytes_to_read=1024*256): result = detect(f.read(max_bytes_to_read)) if result['encoding'] is None: log.warning("Couldn't detect encoding for file %r", path) - result['encoding'] = 'UTF-8' + encoding = 'utf-8' + elif result['encoding'].lower() == 'ascii': + # Treat ASCII as UTF-8 (an ASCII document is also valid UTF-8) + encoding = 'utf-8' + else: + encoding = result['encoding'].lower() - return result['encoding'].lower() + return encoding diff --git a/test/test_utils.py b/test/test_utils.py index 019726936..ef3d7e4ac 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -941,6 +941,7 @@ class IgnoreUpdatesContextTest(PicardTestCase): class DetectUnicodeEncodingTest(PicardTestCase): + @unittest.skipUnless(charset_detect, "test requires charset_normalizer or chardet package") def test_detect_file_encoding_bom(self): boms = { b'\xff\xfe': 'utf-16-le', @@ -950,6 +951,7 @@ class DetectUnicodeEncodingTest(PicardTestCase): b'\xef\xbb\xbf': 'utf-8-sig', b'': 'utf-8', b'\00': 'utf-8', + b'no BOM, only ASCII': 'utf-8', } for bom, expected_encoding in boms.items(): try: @@ -973,7 +975,7 @@ class DetectUnicodeEncodingTest(PicardTestCase): file_path = get_test_data_path('eac-utf32le.log') 
self.assertEqual(expected_encoding, detect_file_encoding(file_path)) - @unittest.skipUnless(charset_detect, "test requires charset-normalizer or chardet package") + @unittest.skipUnless(charset_detect, "test requires charset_normalizer or chardet package") def test_detect_file_encoding_eac_windows_1251(self): expected_encoding = 'windows-1251' file_path = get_test_data_path('eac-windows1251.log')