diff --git a/picard/disc/dbpoweramplog.py b/picard/disc/dbpoweramplog.py
index 81588e1cd..08f97560e 100644
--- a/picard/disc/dbpoweramplog.py
+++ b/picard/disc/dbpoweramplog.py
@@ -25,6 +25,7 @@ from picard.disc.utils import (
     TocEntry,
     calculate_mb_toc_numbers,
 )
+from picard.util import detect_unicode_encoding
 
 
 RE_TOC_ENTRY = re.compile(
@@ -46,25 +47,8 @@ def filter_toc_entries(lines):
         yield TocEntry(track_num, int(m['start_sector']), int(m['end_sector'])-1)
 
 
-ENCODING_BOMS = {
-    b'\xff\xfe': 'utf-16-le',
-    b'\xfe\xff': 'utf-16-be',
-    b'\00\00\xff\xfe': 'utf-32-le',
-    b'\00\00\xfe\xff': 'utf-32-be',
-}
-
-
-def _detect_encoding(path):
-    with open(path, 'rb') as f:
-        first_bytes = f.read(4)
-        for bom, encoding in ENCODING_BOMS.items():
-            if first_bytes.startswith(bom):
-                return encoding
-    return 'utf-8'
-
-
 def toc_from_file(path):
     """Reads dBpoweramp log files, generates MusicBrainz disc TOC listing for use as discid."""
-    encoding = _detect_encoding(path)
+    encoding = detect_unicode_encoding(path)
     with open(path, 'r', encoding=encoding) as f:
         return calculate_mb_toc_numbers(filter_toc_entries(f))
diff --git a/picard/disc/eaclog.py b/picard/disc/eaclog.py
index 7facaf785..4433fd4c7 100644
--- a/picard/disc/eaclog.py
+++ b/picard/disc/eaclog.py
@@ -29,6 +29,7 @@ from picard.disc.utils import (
     TocEntry,
     calculate_mb_toc_numbers,
 )
+from picard.util import detect_unicode_encoding
 
 RE_TOC_TABLE_HEADER = re.compile(r"""
     \s*
@@ -74,28 +75,11 @@ def filter_toc_entries(lines):
         yield TocEntry(int(m['num']), int(m['start_sector']), int(m['end_sector']))
 
 
-ENCODING_BOMS = {
-    b'\xff\xfe': 'utf-16-le',
-    b'\xfe\xff': 'utf-16-be',
-    b'\00\00\xff\xfe': 'utf-32-le',
-    b'\00\00\xfe\xff': 'utf-32-be',
-}
-
-
-def _detect_encoding(path):
-    with open(path, 'rb') as f:
-        first_bytes = f.read(4)
-        for bom, encoding in ENCODING_BOMS.items():
-            if first_bytes.startswith(bom):
-                return encoding
-    return 'utf-8'
-
-
 def toc_from_file(path):
     """Reads EAC / XLD log files, generates
     MusicBrainz disc TOC listing for use as discid.
     Warning: may work wrong for discs having data tracks.
     May generate wrong results on other non-standard cases."""
-    encoding = _detect_encoding(path)
+    encoding = detect_unicode_encoding(path)
     with open(path, 'r', encoding=encoding) as f:
         return calculate_mb_toc_numbers(filter_toc_entries(f))
diff --git a/picard/util/__init__.py b/picard/util/__init__.py
index 225ab0a5b..c5ba1e8f5 100644
--- a/picard/util/__init__.py
+++ b/picard/util/__init__.py
@@ -1100,3 +1100,35 @@ def strxfrm(string):
     except (OSError, ValueError) as err:
         log.warning('strxfrm(%r) failed: %r', string, err)
     return string.lower()
+
+
+# BOMs mapped to codec names, longest first: the UTF-32-LE BOM begins with
+# the UTF-16-LE BOM (b'\xff\xfe'), so the 4-byte marks must be checked
+# before the 2-byte ones or UTF-32 files would be misdetected as UTF-16.
+ENCODING_BOMS = {
+    b'\xff\xfe\x00\x00': 'utf-32-le',
+    b'\x00\x00\xfe\xff': 'utf-32-be',
+    b'\xff\xfe': 'utf-16-le',
+    b'\xfe\xff': 'utf-16-be',
+}
+
+
+def detect_unicode_encoding(path):
+    """Attempts to guess the unicode encoding of a file based on the BOM.
+
+    Assumes UTF-8 by default if there is no BOM. A file starting with the
+    UTF-8 BOM (bytes EF BB BF) also yields 'utf-8', so callers may see a
+    leading U+FEFF after decoding ('utf-8-sig' would strip it).
+
+    Args:
+        path: The path to the file
+
+    Returns: The encoding as a string, e.g. "utf-16-le" or "utf-8"
+    """
+    # 4 bytes cover the longest BOM; close the file before the lookup.
+    with open(path, 'rb') as f:
+        first_bytes = f.read(4)
+    for bom, encoding in ENCODING_BOMS.items():
+        if first_bytes.startswith(bom):
+            return encoding
+    return 'utf-8'
diff --git a/test/test_utils.py b/test/test_utils.py
index 4347aa39c..b0123c072 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -34,6 +34,7 @@ import builtins
 from collections import namedtuple
 from collections.abc import Iterator
 from locale import strxfrm as system_strxfrm
+import os
 import re
 import subprocess  # nosec: B404
 from tempfile import NamedTemporaryFile
@@ -56,6 +57,7 @@ from picard.util import (
     album_artist_from_path,
     any_exception_isinstance,
     build_qurl,
+    detect_unicode_encoding,
     extract_year_from_date,
     find_best_match,
     is_absolute_path,
@@ -878,3 +880,26 @@ class IgnoreUpdatesContextTest(PicardTestCase):
         self.assertTrue(context)
         self.assertTrue(context)
         self.assertFalse(context)
+
+
+class DetectUnicodeEncodingTest(PicardTestCase):
+
+    def test_detect_encoding(self):
+        boms = {
+            b'\xff\xfe': 'utf-16-le',
+            b'\xfe\xff': 'utf-16-be',
+            b'\xff\xfe\x00\x00': 'utf-32-le',
+            b'\x00\x00\xfe\xff': 'utf-32-be',
+            b'\xef\xbb\xbf': 'utf-8',
+            b'': 'utf-8',
+            b'\x00': 'utf-8',
+        }
+        for bom, expected_encoding in boms.items():
+            f = NamedTemporaryFile(delete=False)
+            try:
+                f.write(bom)
+                f.close()
+                self.assertEqual(expected_encoding, detect_unicode_encoding(f.name))
+            finally:
+                f.close()
+                os.remove(f.name)