detect_file_encoding: always report ASCII as UTF-8

This ensures that partially analyzed documents which are detected as ASCII but contain UTF-8 encoded characters later in the file still get loaded. It also standardizes charset detection between chardet and charset_normalizer, which previously differed on the \00 test case.
2025-12-23 01:37:47 +00:00 · 2024-04-30 07:38:02 +02:00
parent 43f7603c01
commit 01a3ee39b5
2 changed files with 10 additions and 3 deletions
--- a/picard/util/__init__.py
+++ b/picard/util/__init__.py
@@ -1219,6 +1219,11 @@ def detect_file_encoding(path, max_bytes_to_read=1024*256):
        result = detect(f.read(max_bytes_to_read))
        if result['encoding'] is None:
            log.warning("Couldn't detect encoding for file %r", path)
-            result['encoding'] = 'UTF-8'
+            encoding = 'utf-8'
        elif result['encoding'].lower() == 'ascii':
            # Treat ASCII as UTF-8 (an ASCII document is also valid UTF-8)
            encoding = 'utf-8'
        else:
            encoding = result['encoding'].lower()
-        return result['encoding'].lower()
+        return encoding
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -992,6 +992,7 @@ class IgnoreUpdatesContextTest(PicardTestCase):
 class DetectUnicodeEncodingTest(PicardTestCase):
    @unittest.skipUnless(charset_detect, "test requires charset_normalizer or chardet package")
    def test_detect_file_encoding_bom(self):
        boms = {
            b'\xff\xfe': 'utf-16-le',
@@ -1001,6 +1002,7 @@ class DetectUnicodeEncodingTest(PicardTestCase):
            b'\xef\xbb\xbf': 'utf-8-sig',
            b'': 'utf-8',
            b'\00': 'utf-8',
            b'no BOM, only ASCII': 'utf-8',
        }
        for bom, expected_encoding in boms.items():
            try:
@@ -1024,7 +1026,7 @@ class DetectUnicodeEncodingTest(PicardTestCase):
        file_path = get_test_data_path('eac-utf32le.log')
        self.assertEqual(expected_encoding, detect_file_encoding(file_path))
-    @unittest.skipUnless(charset_detect, "test requires charset-normalizer or chardet package")
+    @unittest.skipUnless(charset_detect, "test requires charset_normalizer or chardet package")
    def test_detect_file_encoding_eac_windows_1251(self):
        expected_encoding = 'windows-1251'
        file_path = get_test_data_path('eac-windows1251.log')