Always detect_file_encoding ASCII as UTF-8

This ensures partially analyzed documents that are detected as ASCII
but contain UTF-8 encoded characters later in the file still get loaded.
It also standardizes charset detection between the chardet and
charset_normalizer backends, which previously differed on the \00 test case.
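
To make the intent concrete, here is a minimal sketch of the normalization
described above. This is not Picard's actual implementation: the function
name, the chunk size, and the charset_normalizer-based detection are
illustrative assumptions.

# Minimal sketch (not Picard's code) of normalizing an ASCII detection
# result to UTF-8. Every ASCII byte sequence is also valid UTF-8, so the
# substitution is always safe, and it protects files whose analyzed head
# is pure ASCII but which contain UTF-8 sequences further in.
from charset_normalizer import from_bytes


def detect_file_encoding_sketch(path, chunk_size=8192):
    # chunk_size is an illustrative assumption; only part of the file
    # may be analyzed, which is why an ASCII result can be misleading.
    with open(path, 'rb') as f:
        head = f.read(chunk_size)
    best = from_bytes(head).best()
    encoding = best.encoding if best else 'utf-8'
    if encoding == 'ascii':
        encoding = 'utf-8'  # ASCII is a strict subset of UTF-8
    return encoding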
Philipp Wolfer
2024-04-30 07:38:02 +02:00
parent 43f7603c01
commit 01a3ee39b5
2 changed files with 10 additions and 3 deletions

@@ -992,6 +992,7 @@ class IgnoreUpdatesContextTest(PicardTestCase):
 
 class DetectUnicodeEncodingTest(PicardTestCase):
 
+    @unittest.skipUnless(charset_detect, "test requires charset_normalizer or chardet package")
     def test_detect_file_encoding_bom(self):
         boms = {
             b'\xff\xfe': 'utf-16-le',
@@ -1001,6 +1002,7 @@ class DetectUnicodeEncodingTest(PicardTestCase):
             b'\xef\xbb\xbf': 'utf-8-sig',
             b'': 'utf-8',
             b'\00': 'utf-8',
+            b'no BOM, only ASCII': 'utf-8',
         }
         for bom, expected_encoding in boms.items():
             try:
@@ -1024,7 +1026,7 @@ class DetectUnicodeEncodingTest(PicardTestCase):
         file_path = get_test_data_path('eac-utf32le.log')
         self.assertEqual(expected_encoding, detect_file_encoding(file_path))
 
-    @unittest.skipUnless(charset_detect, "test requires charset-normalizer or chardet package")
+    @unittest.skipUnless(charset_detect, "test requires charset_normalizer or chardet package")
     def test_detect_file_encoding_eac_windows_1251(self):
         expected_encoding = 'windows-1251'
         file_path = get_test_data_path('eac-windows1251.log')
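
For reference, the backend disagreement the commit message cites can be
observed directly against the two detection libraries. This is an
illustrative comparison, not part of Picard's test suite, and it assumes
both chardet and charset-normalizer are installed; the printed values are
whatever each backend reports and are not guaranteed here.

# Compare the raw backends on the byte sequences from the test above.
import chardet
from charset_normalizer import from_bytes

for data in (b'\00', b'no BOM, only ASCII'):
    best = from_bytes(data).best()
    print(data,
          chardet.detect(data)['encoding'],
          best.encoding if best else None)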