Fixed utf-32 detection from BOM

- the utf-32-le BOM was in the wrong order
- longer BOMs need to be checked first
- added BOM for utf-8-sig
This commit is contained in:
Philipp Wolfer
2024-03-22 07:48:38 +01:00
parent 1ffd583b00
commit 06ae9d7de9
3 changed files with 10 additions and 4 deletions

View File

@@ -949,8 +949,8 @@ class DetectUnicodeEncodingTest(PicardTestCase):
boms = {
b'\xff\xfe': 'utf-16-le',
b'\xfe\xff': 'utf-16-be',
b'\00\00\xff\xfe': 'utf-32-le',
b'\00\00\xfe\xff': 'utf-32-be',
b'\xff\xfe\x00\x00': 'utf-32-le',
b'\x00\x00\xfe\xff': 'utf-32-be',
b'\xef\xbb\xbf': 'utf-8-sig',
b'': 'utf-8',
b'\00': 'utf-8',
@@ -970,6 +970,11 @@ class DetectUnicodeEncodingTest(PicardTestCase):
file_path = get_test_data_path('eac-utf16le.log')
self.assertEqual(expected_encoding, detect_file_encoding(file_path))
def test_detect_file_encoding_eac_utf_32_le(self):
    # Run encoding detection on the 'eac-utf32le.log' test data file and
    # require that it is reported as 'utf-32-le'.
    log_path = get_test_data_path('eac-utf32le.log')
    detected = detect_file_encoding(log_path)
    self.assertEqual('utf-32-le', detected)
def test_detect_file_encoding_eac_windows_1251(self):
expected_encoding = 'windows-1251'
file_path = get_test_data_path('eac-windows1251.log')