mirror of
https://github.com/fergalmoran/picard.git
synced 2025-12-23 01:37:47 +00:00
detect_file_encoding: always treat ASCII as UTF-8
This ensures that partially analyzed documents which are detected as ASCII but contain UTF-8 encoded characters later in the file still get loaded. It also standardizes charset detection between chardet and charset_normalizer, which previously differed on the \00 test case.
This commit is contained in:
@@ -1219,6 +1219,11 @@ def detect_file_encoding(path, max_bytes_to_read=1024*256):
|
|||||||
result = detect(f.read(max_bytes_to_read))
|
result = detect(f.read(max_bytes_to_read))
|
||||||
if result['encoding'] is None:
|
if result['encoding'] is None:
|
||||||
log.warning("Couldn't detect encoding for file %r", path)
|
log.warning("Couldn't detect encoding for file %r", path)
|
||||||
result['encoding'] = 'UTF-8'
|
encoding = 'utf-8'
|
||||||
|
elif result['encoding'].lower() == 'ascii':
|
||||||
|
# Treat ASCII as UTF-8 (an ASCII document is also valid UTF-8)
|
||||||
|
encoding = 'utf-8'
|
||||||
|
else:
|
||||||
|
encoding = result['encoding'].lower()
|
||||||
|
|
||||||
return result['encoding'].lower()
|
return encoding
|
||||||
|
|||||||
@@ -992,6 +992,7 @@ class IgnoreUpdatesContextTest(PicardTestCase):
|
|||||||
|
|
||||||
class DetectUnicodeEncodingTest(PicardTestCase):
|
class DetectUnicodeEncodingTest(PicardTestCase):
|
||||||
|
|
||||||
|
@unittest.skipUnless(charset_detect, "test requires charset_normalizer or chardet package")
|
||||||
def test_detect_file_encoding_bom(self):
|
def test_detect_file_encoding_bom(self):
|
||||||
boms = {
|
boms = {
|
||||||
b'\xff\xfe': 'utf-16-le',
|
b'\xff\xfe': 'utf-16-le',
|
||||||
@@ -1001,6 +1002,7 @@ class DetectUnicodeEncodingTest(PicardTestCase):
|
|||||||
b'\xef\xbb\xbf': 'utf-8-sig',
|
b'\xef\xbb\xbf': 'utf-8-sig',
|
||||||
b'': 'utf-8',
|
b'': 'utf-8',
|
||||||
b'\00': 'utf-8',
|
b'\00': 'utf-8',
|
||||||
|
b'no BOM, only ASCII': 'utf-8',
|
||||||
}
|
}
|
||||||
for bom, expected_encoding in boms.items():
|
for bom, expected_encoding in boms.items():
|
||||||
try:
|
try:
|
||||||
@@ -1024,7 +1026,7 @@ class DetectUnicodeEncodingTest(PicardTestCase):
|
|||||||
file_path = get_test_data_path('eac-utf32le.log')
|
file_path = get_test_data_path('eac-utf32le.log')
|
||||||
self.assertEqual(expected_encoding, detect_file_encoding(file_path))
|
self.assertEqual(expected_encoding, detect_file_encoding(file_path))
|
||||||
|
|
||||||
@unittest.skipUnless(charset_detect, "test requires charset-normalizer or chardet package")
|
@unittest.skipUnless(charset_detect, "test requires charset_normalizer or chardet package")
|
||||||
def test_detect_file_encoding_eac_windows_1251(self):
|
def test_detect_file_encoding_eac_windows_1251(self):
|
||||||
expected_encoding = 'windows-1251'
|
expected_encoding = 'windows-1251'
|
||||||
file_path = get_test_data_path('eac-windows1251.log')
|
file_path = get_test_data_path('eac-windows1251.log')
|
||||||
|
|||||||
Reference in New Issue
Block a user