From 5f76d9285c9fef1b068a5132fe9f09cd477d54db Mon Sep 17 00:00:00 2001
From: Philipp Wolfer <ph.wolfer@gmail.com>
Date: Tue, 30 Apr 2024 07:38:02 +0200
Subject: [PATCH] Always detect_file_encoding ASCII as UTF-8

This ensures that documents which are only partially analyzed and therefore
detected as ASCII, but which contain UTF-8 encoded characters later in the
file, still get loaded. It also standardizes charset detection between chardet
and charset_normalizer, which previously differed on the \00 test case.
---
 picard/util/__init__.py | 9 +++++++--
 test/test_utils.py      | 4 +++-
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/picard/util/__init__.py b/picard/util/__init__.py
index 377a54f1c..9a2bfb0a8 100644
--- a/picard/util/__init__.py
+++ b/picard/util/__init__.py
@@ -1208,6 +1208,11 @@ def detect_file_encoding(path, max_bytes_to_read=1024*256):
         result = detect(f.read(max_bytes_to_read))
         if result['encoding'] is None:
             log.warning("Couldn't detect encoding for file %r", path)
-            result['encoding'] = 'UTF-8'
+            encoding = 'utf-8'
+        elif result['encoding'].lower() == 'ascii':
+            # Treat ASCII as UTF-8 (an ASCII document is also valid UTF-8)
+            encoding = 'utf-8'
+        else:
+            encoding = result['encoding'].lower()
 
-        return result['encoding'].lower()
+        return encoding
diff --git a/test/test_utils.py b/test/test_utils.py
index 019726936..ef3d7e4ac 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -941,6 +941,7 @@ class IgnoreUpdatesContextTest(PicardTestCase):
 
 class DetectUnicodeEncodingTest(PicardTestCase):
 
+    @unittest.skipUnless(charset_detect, "test requires charset_normalizer or chardet package")
     def test_detect_file_encoding_bom(self):
         boms = {
             b'\xff\xfe': 'utf-16-le',
@@ -950,6 +951,7 @@ class DetectUnicodeEncodingTest(PicardTestCase):
             b'\xef\xbb\xbf': 'utf-8-sig',
             b'': 'utf-8',
             b'\00': 'utf-8',
+            b'no BOM, only ASCII': 'utf-8',
         }
         for bom, expected_encoding in boms.items():
             try:
@@ -973,7 +975,7 @@ class DetectUnicodeEncodingTest(PicardTestCase):
         file_path = get_test_data_path('eac-utf32le.log')
         self.assertEqual(expected_encoding, detect_file_encoding(file_path))
 
-    @unittest.skipUnless(charset_detect, "test requires charset-normalizer or chardet package")
+    @unittest.skipUnless(charset_detect, "test requires charset_normalizer or chardet package")
     def test_detect_file_encoding_eac_windows_1251(self):
         expected_encoding = 'windows-1251'
         file_path = get_test_data_path('eac-windows1251.log')