diff --git a/picard/util/__init__.py b/picard/util/__init__.py
index 377a54f1c..9a2bfb0a8 100644
--- a/picard/util/__init__.py
+++ b/picard/util/__init__.py
@@ -1208,6 +1208,11 @@ def detect_file_encoding(path, max_bytes_to_read=1024*256):
         result = detect(f.read(max_bytes_to_read))
         if result['encoding'] is None:
             log.warning("Couldn't detect encoding for file %r", path)
-            result['encoding'] = 'UTF-8'
+            encoding = 'utf-8'
+        elif result['encoding'].lower() == 'ascii':
+            # Treat ASCII as UTF-8 (an ASCII document is also valid UTF-8)
+            encoding = 'utf-8'
+        else:
+            encoding = result['encoding'].lower()
 
-    return result['encoding'].lower()
+    return encoding
diff --git a/test/test_utils.py b/test/test_utils.py
index 019726936..ef3d7e4ac 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -941,6 +941,7 @@ class IgnoreUpdatesContextTest(PicardTestCase):
 
 class DetectUnicodeEncodingTest(PicardTestCase):
 
+    @unittest.skipUnless(charset_detect, "test requires charset_normalizer or chardet package")
     def test_detect_file_encoding_bom(self):
         boms = {
             b'\xff\xfe': 'utf-16-le',
@@ -950,6 +951,7 @@ class DetectUnicodeEncodingTest(PicardTestCase):
             b'\xef\xbb\xbf': 'utf-8-sig',
             b'': 'utf-8',
             b'\00': 'utf-8',
+            b'no BOM, only ASCII': 'utf-8',
         }
         for bom, expected_encoding in boms.items():
             try:
@@ -973,7 +975,7 @@ class DetectUnicodeEncodingTest(PicardTestCase):
         file_path = get_test_data_path('eac-utf32le.log')
         self.assertEqual(expected_encoding, detect_file_encoding(file_path))
 
-    @unittest.skipUnless(charset_detect, "test requires charset-normalizer or chardet package")
+    @unittest.skipUnless(charset_detect, "test requires charset_normalizer or chardet package")
    def test_detect_file_encoding_eac_windows_1251(self):
        expected_encoding = 'windows-1251'
        file_path = get_test_data_path('eac-windows1251.log')
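
For context, the sketch below is a minimal, self-contained approximation of the patched `detect_file_encoding` logic, so the ASCII-to-UTF-8 normalization from the first hunk can be exercised outside Picard. It assumes `charset-normalizer` (or `chardet`) is installed and aliases its `detect` function the way `picard.util` does; the BOM handling that the tests exercise is omitted here. It is an illustration, not the actual Picard module.

```python
# Minimal sketch of the patched logic, not the actual Picard module.
# Assumes charset-normalizer or chardet is installed; picard.util aliases
# whichever is available as `detect` (mirrored by the try/except below).
try:
    from charset_normalizer import detect
except ImportError:
    from chardet import detect


def detect_file_encoding(path, max_bytes_to_read=1024 * 256):
    """Guess a file's encoding, normalizing ASCII and failures to UTF-8."""
    with open(path, 'rb') as f:
        result = detect(f.read(max_bytes_to_read))
    if result['encoding'] is None:
        # Detection failed entirely; fall back to UTF-8 as before the patch.
        encoding = 'utf-8'
    elif result['encoding'].lower() == 'ascii':
        # Treat ASCII as UTF-8 (an ASCII document is also valid UTF-8)
        encoding = 'utf-8'
    else:
        encoding = result['encoding'].lower()
    return encoding


if __name__ == '__main__':
    import tempfile

    # A pure-ASCII file now reports 'utf-8' rather than 'ascii'.
    with tempfile.NamedTemporaryFile(delete=False) as f:
        f.write(b'no BOM, only ASCII')
    print(detect_file_encoding(f.name))  # utf-8
```

Normalizing at the return site keeps every caller on a lowercase encoding name and means downstream `open(..., encoding=...)` calls never receive the narrower `ascii` label for files that may later gain non-ASCII bytes, which is what the new `b'no BOM, only ASCII'` test case pins down.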