Move detect_unicode_encoding function to picard.util

Extract the function from ripper log file handling and make it available in picard.util.
Philipp Wolfer
2022-09-25 12:14:51 +02:00
parent 593728e616
commit bee8b569e3
4 changed files with 56 additions and 36 deletions


@@ -25,6 +25,7 @@ from picard.disc.utils import (
     TocEntry,
     calculate_mb_toc_numbers,
 )
+from picard.util import detect_unicode_encoding
 
 
 RE_TOC_ENTRY = re.compile(
@@ -46,25 +47,8 @@ def filter_toc_entries(lines):
         yield TocEntry(track_num, int(m['start_sector']), int(m['end_sector'])-1)
 
 
-ENCODING_BOMS = {
-    b'\xff\xfe\x00\x00': 'utf-32-le',
-    b'\x00\x00\xfe\xff': 'utf-32-be',
-    b'\xff\xfe': 'utf-16-le',
-    b'\xfe\xff': 'utf-16-be',
-}
-
-
-def _detect_encoding(path):
-    with open(path, 'rb') as f:
-        first_bytes = f.read(4)
-    for bom, encoding in ENCODING_BOMS.items():
-        if first_bytes.startswith(bom):
-            return encoding
-    return 'utf-8'
-
-
 def toc_from_file(path):
     """Reads dBpoweramp log files, generates MusicBrainz disc TOC listing for use as discid."""
-    encoding = _detect_encoding(path)
+    encoding = detect_unicode_encoding(path)
     with open(path, 'r', encoding=encoding) as f:
         return calculate_mb_toc_numbers(filter_toc_entries(f))
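
For illustration, a minimal usage sketch of the reader after this change. The module path picard.disc.dbpoweramplog and the log file name are assumptions (file names are not shown in this view), not part of the diff:

from picard.disc.dbpoweramplog import toc_from_file  # module path assumed

# Parse a dBpoweramp rip log; per the docstring above, this yields the
# MusicBrainz disc TOC listing for use as discid.
toc = toc_from_file('dbpoweramp.log')  # example file name
print(toc)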


@@ -29,6 +29,7 @@ from picard.disc.utils import (
     TocEntry,
     calculate_mb_toc_numbers,
 )
+from picard.util import detect_unicode_encoding
 
 
 RE_TOC_TABLE_HEADER = re.compile(r""" \s*
@@ -74,28 +75,11 @@ def filter_toc_entries(lines):
         yield TocEntry(int(m['num']), int(m['start_sector']), int(m['end_sector']))
 
 
-ENCODING_BOMS = {
-    b'\xff\xfe\x00\x00': 'utf-32-le',
-    b'\x00\x00\xfe\xff': 'utf-32-be',
-    b'\xff\xfe': 'utf-16-le',
-    b'\xfe\xff': 'utf-16-be',
-}
-
-
-def _detect_encoding(path):
-    with open(path, 'rb') as f:
-        first_bytes = f.read(4)
-    for bom, encoding in ENCODING_BOMS.items():
-        if first_bytes.startswith(bom):
-            return encoding
-    return 'utf-8'
-
-
 def toc_from_file(path):
     """Reads EAC / XLD log files, generates MusicBrainz disc TOC listing for use as discid.
 
     Warning: may produce wrong results for discs containing data tracks,
     and may also fail in other non-standard cases."""
-    encoding = _detect_encoding(path)
+    encoding = detect_unicode_encoding(path)
     with open(path, 'r', encoding=encoding) as f:
         return calculate_mb_toc_numbers(filter_toc_entries(f))


@@ -1100,3 +1100,29 @@ def strxfrm(string):
     except (OSError, ValueError) as err:
         log.warning('strxfrm(%r) failed: %r', string, err)
         return string.lower()
+
+
+ENCODING_BOMS = {  # longest BOMs first: the UTF-32-LE BOM starts with the UTF-16-LE one
+    b'\xff\xfe\x00\x00': 'utf-32-le',
+    b'\x00\x00\xfe\xff': 'utf-32-be',
+    b'\xff\xfe': 'utf-16-le',
+    b'\xfe\xff': 'utf-16-be',
+}
+
+
+def detect_unicode_encoding(path):
+    """Attempts to guess the unicode encoding of a file based on the BOM.
+
+    Assumes UTF-8 by default if there is no BOM.
+
+    Args:
+        path: The path to the file
+
+    Returns: The encoding as a string, e.g. "utf-16-le" or "utf-8"
+    """
+    with open(path, 'rb') as f:
+        first_bytes = f.read(4)
+    for bom, encoding in ENCODING_BOMS.items():
+        if first_bytes.startswith(bom):
+            return encoding
+    return 'utf-8'
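
A quick sketch of the new helper in use (the log file name here is made up): detect the codec first, then open the file in text mode with it:

from picard.util import detect_unicode_encoding

log_path = 'rip.log'  # made-up example path
encoding = detect_unicode_encoding(log_path)  # e.g. 'utf-16-le', or 'utf-8' if no BOM
with open(log_path, 'r', encoding=encoding) as f:
    first_line = f.readline()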


@@ -34,6 +34,7 @@ import builtins
 from collections import namedtuple
 from collections.abc import Iterator
 from locale import strxfrm as system_strxfrm
+import os
 import re
 import subprocess  # nosec: B404
 from tempfile import NamedTemporaryFile
@@ -56,6 +57,7 @@ from picard.util import (
     album_artist_from_path,
     any_exception_isinstance,
     build_qurl,
+    detect_unicode_encoding,
     extract_year_from_date,
     find_best_match,
     is_absolute_path,
@@ -878,3 +880,27 @@ class IgnoreUpdatesContextTest(PicardTestCase):
         self.assertTrue(context)
         self.assertTrue(context)
         self.assertFalse(context)
+
+
+class DetectUnicodeEncodingTest(PicardTestCase):
+
+    def test_detect_encoding(self):
+        boms = {
+            b'\xff\xfe': 'utf-16-le',
+            b'\xfe\xff': 'utf-16-be',
+            b'\xff\xfe\x00\x00': 'utf-32-le',
+            b'\x00\x00\xfe\xff': 'utf-32-be',
+            b'\xef\xbb\xbf': 'utf-8',  # UTF-8 BOM is not in ENCODING_BOMS, falls through to the default
+            b'': 'utf-8',
+            b'\x00': 'utf-8',
+        }
+        for bom, expected_encoding in boms.items():
+            f = NamedTemporaryFile(delete=False)  # delete=False so the file can be reopened by name
+            try:
+                f.write(bom)
+                f.close()
+                self.assertEqual(expected_encoding, detect_unicode_encoding(f.name))
+            finally:
+                f.close()
+                os.remove(f.name)
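
Outside the test suite, the same behaviour can be exercised with a throwaway file; a hypothetical snippet, not part of this commit:

import os
from tempfile import NamedTemporaryFile

from picard.util import detect_unicode_encoding

f = NamedTemporaryFile(delete=False)
f.write('\ufeffTrack 01'.encode('utf-16-le'))  # encoding U+FEFF emits the FF FE byte order mark
f.close()
print(detect_unicode_encoding(f.name))  # prints: utf-16-le
os.remove(f.name)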