Move detect_unicode_encoding function to picard.util

Extract the function from ripper log file handling and make it available in picard.util.
Philipp Wolfer
2022-09-25 12:14:51 +02:00
parent 593728e616
commit bee8b569e3
4 changed files with 56 additions and 36 deletions


@@ -25,6 +25,7 @@ from picard.disc.utils import (
     TocEntry,
     calculate_mb_toc_numbers,
 )
+from picard.util import detect_unicode_encoding
 
 
 RE_TOC_ENTRY = re.compile(
@@ -46,25 +47,8 @@ def filter_toc_entries(lines):
         yield TocEntry(track_num, int(m['start_sector']), int(m['end_sector'])-1)
 
 
-ENCODING_BOMS = {
-    b'\xff\xfe\x00\x00': 'utf-32-le',
-    b'\x00\x00\xfe\xff': 'utf-32-be',
-    b'\xff\xfe': 'utf-16-le',
-    b'\xfe\xff': 'utf-16-be',
-}
-
-
-def _detect_encoding(path):
-    with open(path, 'rb') as f:
-        first_bytes = f.read(4)
-    for bom, encoding in ENCODING_BOMS.items():
-        if first_bytes.startswith(bom):
-            return encoding
-    return 'utf-8'
-
-
 def toc_from_file(path):
     """Reads dBpoweramp log files, generates MusicBrainz disc TOC listing for use as discid."""
-    encoding = _detect_encoding(path)
+    encoding = detect_unicode_encoding(path)
     with open(path, 'r', encoding=encoding) as f:
         return calculate_mb_toc_numbers(filter_toc_entries(f))
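
For illustration, a minimal usage sketch of the reader after this change. The module path picard.disc.dbpoweramplog and the log file name are assumptions (file names are not shown in this view), not part of the diff:

from picard.disc.dbpoweramplog import toc_from_file  # module path assumed

# Parse a dBpoweramp rip log; per the docstring above, this yields the
# MusicBrainz disc TOC listing for use as discid.
toc = toc_from_file('dbpoweramp.log')  # example file name
print(toc)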


@@ -29,6 +29,7 @@ from picard.disc.utils import (
     TocEntry,
     calculate_mb_toc_numbers,
 )
+from picard.util import detect_unicode_encoding
 
 
 RE_TOC_TABLE_HEADER = re.compile(r""" \s*
@@ -74,28 +75,11 @@ def filter_toc_entries(lines):
         yield TocEntry(int(m['num']), int(m['start_sector']), int(m['end_sector']))
 
 
-ENCODING_BOMS = {
-    b'\xff\xfe\x00\x00': 'utf-32-le',
-    b'\x00\x00\xfe\xff': 'utf-32-be',
-    b'\xff\xfe': 'utf-16-le',
-    b'\xfe\xff': 'utf-16-be',
-}
-
-
-def _detect_encoding(path):
-    with open(path, 'rb') as f:
-        first_bytes = f.read(4)
-    for bom, encoding in ENCODING_BOMS.items():
-        if first_bytes.startswith(bom):
-            return encoding
-    return 'utf-8'
-
-
 def toc_from_file(path):
     """Reads EAC / XLD log files, generates MusicBrainz disc TOC listing for use as discid.
 
     Warning: may produce wrong results for discs containing data tracks,
     and may also fail in other non-standard cases."""
-    encoding = _detect_encoding(path)
+    encoding = detect_unicode_encoding(path)
     with open(path, 'r', encoding=encoding) as f:
         return calculate_mb_toc_numbers(filter_toc_entries(f))


@@ -1100,3 +1100,29 @@ def strxfrm(string):
     except (OSError, ValueError) as err:
         log.warning('strxfrm(%r) failed: %r', string, err)
         return string.lower()
+
+
+ENCODING_BOMS = {  # longest BOMs first: the UTF-32-LE BOM starts with the UTF-16-LE one
+    b'\xff\xfe\x00\x00': 'utf-32-le',
+    b'\x00\x00\xfe\xff': 'utf-32-be',
+    b'\xff\xfe': 'utf-16-le',
+    b'\xfe\xff': 'utf-16-be',
+}
+
+
+def detect_unicode_encoding(path):
+    """Attempts to guess the unicode encoding of a file based on the BOM.
+
+    Assumes UTF-8 by default if there is no BOM.
+
+    Args:
+        path: The path to the file
+
+    Returns: The encoding as a string, e.g. "utf-16-le" or "utf-8"
+    """
+    with open(path, 'rb') as f:
+        first_bytes = f.read(4)
+    for bom, encoding in ENCODING_BOMS.items():
+        if first_bytes.startswith(bom):
+            return encoding
+    return 'utf-8'
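
A quick sketch of the new helper in use (the log file name here is made up): detect the codec first, then open the file in text mode with it:

from picard.util import detect_unicode_encoding

log_path = 'rip.log'  # made-up example path
encoding = detect_unicode_encoding(log_path)  # e.g. 'utf-16-le', or 'utf-8' if no BOM
with open(log_path, 'r', encoding=encoding) as f:
    first_line = f.readline()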


@@ -34,6 +34,7 @@ import builtins
 from collections import namedtuple
 from collections.abc import Iterator
 from locale import strxfrm as system_strxfrm
+import os
 import re
 import subprocess  # nosec: B404
 from tempfile import NamedTemporaryFile
@@ -56,6 +57,7 @@ from picard.util import (
     album_artist_from_path,
     any_exception_isinstance,
     build_qurl,
+    detect_unicode_encoding,
     extract_year_from_date,
     find_best_match,
     is_absolute_path,
@@ -878,3 +880,27 @@ class IgnoreUpdatesContextTest(PicardTestCase):
         self.assertTrue(context)
         self.assertTrue(context)
         self.assertFalse(context)
+
+
+class DetectUnicodeEncodingTest(PicardTestCase):
+
+    def test_detect_encoding(self):
+        boms = {
+            b'\xff\xfe': 'utf-16-le',
+            b'\xfe\xff': 'utf-16-be',
+            b'\xff\xfe\x00\x00': 'utf-32-le',
+            b'\x00\x00\xfe\xff': 'utf-32-be',
+            b'\xef\xbb\xbf': 'utf-8',  # UTF-8 BOM is not in ENCODING_BOMS, falls through to the default
+            b'': 'utf-8',
+            b'\x00': 'utf-8',
+        }
+        for bom, expected_encoding in boms.items():
+            f = NamedTemporaryFile(delete=False)  # delete=False so the file can be reopened by name
+            try:
+                f.write(bom)
+                f.close()
+                self.assertEqual(expected_encoding, detect_unicode_encoding(f.name))
+            finally:
+                f.close()
+                os.remove(f.name)
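
Outside the test suite, the same behaviour can be exercised with a throwaway file; a hypothetical snippet, not part of this commit:

import os
from tempfile import NamedTemporaryFile

from picard.util import detect_unicode_encoding

f = NamedTemporaryFile(delete=False)
f.write('\ufeffTrack 01'.encode('utf-16-le'))  # encoding U+FEFF emits the FF FE byte order mark
f.close()
print(detect_unicode_encoding(f.name))  # prints: utf-16-le
os.remove(f.name)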