Mirror of https://github.com/fergalmoran/picard.git, synced 2026-01-06 08:34:01 +00:00
Move detect_unicode_encoding function to picard.util
Extract the function from ripper log file handling and make it available in picard.util.
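A minimal sketch of the calling pattern the two toc_from_file functions below now share; the log path here is a hypothetical placeholder, not a file from this change:

    from picard.util import detect_unicode_encoding

    log_path = 'rip.log'  # hypothetical ripper log path
    # Guess the encoding from the file's BOM (falling back to UTF-8),
    # then reopen the log as text with that encoding.
    encoding = detect_unicode_encoding(log_path)
    with open(log_path, 'r', encoding=encoding) as f:
        lines = f.readlines()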
@@ -25,6 +25,7 @@ from picard.disc.utils import (
     TocEntry,
     calculate_mb_toc_numbers,
 )
+from picard.util import detect_unicode_encoding


 RE_TOC_ENTRY = re.compile(
@@ -46,25 +47,8 @@ def filter_toc_entries(lines):
             yield TocEntry(track_num, int(m['start_sector']), int(m['end_sector'])-1)


-ENCODING_BOMS = {
-    b'\xff\xfe': 'utf-16-le',
-    b'\xfe\xff': 'utf-16-be',
-    b'\00\00\xff\xfe': 'utf-32-le',
-    b'\00\00\xfe\xff': 'utf-32-be',
-}
-
-
-def _detect_encoding(path):
-    with open(path, 'rb') as f:
-        first_bytes = f.read(4)
-        for bom, encoding in ENCODING_BOMS.items():
-            if first_bytes.startswith(bom):
-                return encoding
-        return 'utf-8'
-
-
 def toc_from_file(path):
     """Reads dBpoweramp log files, generates MusicBrainz disc TOC listing for use as discid."""
-    encoding = _detect_encoding(path)
+    encoding = detect_unicode_encoding(path)
     with open(path, 'r', encoding=encoding) as f:
         return calculate_mb_toc_numbers(filter_toc_entries(f))
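For context, a hedged example of calling the dBpoweramp reader after this change; the module path picard.disc.dbpoweramplog and the log filename are assumptions for illustration, not taken from this diff:

    # Assumed location of the dBpoweramp log reader patched above.
    from picard.disc.dbpoweramplog import toc_from_file

    # Works the same whether the log carries a UTF-16/UTF-32 BOM or is plain UTF-8,
    # since the encoding is now guessed via picard.util.detect_unicode_encoding.
    toc = toc_from_file('dbpoweramp_rip.log')  # hypothetical log file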
@@ -29,6 +29,7 @@ from picard.disc.utils import (
     TocEntry,
     calculate_mb_toc_numbers,
 )
+from picard.util import detect_unicode_encoding


 RE_TOC_TABLE_HEADER = re.compile(r""" \s*
@@ -74,28 +75,11 @@ def filter_toc_entries(lines):
             yield TocEntry(int(m['num']), int(m['start_sector']), int(m['end_sector']))


-ENCODING_BOMS = {
-    b'\xff\xfe': 'utf-16-le',
-    b'\xfe\xff': 'utf-16-be',
-    b'\00\00\xff\xfe': 'utf-32-le',
-    b'\00\00\xfe\xff': 'utf-32-be',
-}
-
-
-def _detect_encoding(path):
-    with open(path, 'rb') as f:
-        first_bytes = f.read(4)
-        for bom, encoding in ENCODING_BOMS.items():
-            if first_bytes.startswith(bom):
-                return encoding
-        return 'utf-8'
-
-
 def toc_from_file(path):
     """Reads EAC / XLD log files, generates MusicBrainz disc TOC listing for use as discid.

     Warning: may work wrong for discs having data tracks. May generate wrong
     results on other non-standard cases."""
-    encoding = _detect_encoding(path)
+    encoding = detect_unicode_encoding(path)
     with open(path, 'r', encoding=encoding) as f:
         return calculate_mb_toc_numbers(filter_toc_entries(f))
@@ -1100,3 +1100,29 @@ def strxfrm(string):
     except (OSError, ValueError) as err:
         log.warning('strxfrm(%r) failed: %r', string, err)
     return string.lower()
+
+
+ENCODING_BOMS = {
+    b'\xff\xfe': 'utf-16-le',
+    b'\xfe\xff': 'utf-16-be',
+    b'\00\00\xff\xfe': 'utf-32-le',
+    b'\00\00\xfe\xff': 'utf-32-be',
+}
+
+
+def detect_unicode_encoding(path):
+    """Attempts to guess the unicode encoding of a file based on the BOM.
+
+    Assumes UTF-8 by default if there is no BOM.
+
+    Args:
+        path: The path to the file
+
+    Returns: The encoding as a string, e.g. "utf-16-le" or "utf-8"
+    """
+    with open(path, 'rb') as f:
+        first_bytes = f.read(4)
+        for bom, encoding in ENCODING_BOMS.items():
+            if first_bytes.startswith(bom):
+                return encoding
+        return 'utf-8'
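A self-contained sketch of what the new helper does for a BOM-prefixed file; the temporary file and sample text are illustrative only:

    import os
    import tempfile

    from picard.util import detect_unicode_encoding

    # Build a small file that starts with the UTF-16-LE BOM, the case the helper targets.
    text = 'Track 1'
    with tempfile.NamedTemporaryFile(delete=False) as f:
        f.write(b'\xff\xfe' + text.encode('utf-16-le'))
        path = f.name

    try:
        encoding = detect_unicode_encoding(path)   # -> 'utf-16-le'
        with open(path, 'r', encoding=encoding) as fh:
            content = fh.read()
        # The BOM decodes to U+FEFF and stays at the start of the text; that is
        # harmless for the regex-based log parsers, which only yield matching lines.
        assert content == '\ufeff' + text
    finally:
        os.remove(path)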
@@ -34,6 +34,7 @@ import builtins
 from collections import namedtuple
 from collections.abc import Iterator
 from locale import strxfrm as system_strxfrm
+import os
 import re
 import subprocess  # nosec: B404
 from tempfile import NamedTemporaryFile
@@ -56,6 +57,7 @@ from picard.util import (
     album_artist_from_path,
     any_exception_isinstance,
     build_qurl,
+    detect_unicode_encoding,
     extract_year_from_date,
     find_best_match,
     is_absolute_path,
@@ -878,3 +880,27 @@ class IgnoreUpdatesContextTest(PicardTestCase):
             self.assertTrue(context)
         self.assertTrue(context)
         self.assertFalse(context)
+
+
+class DetectUnicodeEncodingTest(PicardTestCase):
+
+    def test_detect_encoding(self):
+        boms = {
+            b'\xff\xfe': 'utf-16-le',
+            b'\xfe\xff': 'utf-16-be',
+            b'\00\00\xff\xfe': 'utf-32-le',
+            b'\00\00\xfe\xff': 'utf-32-be',
+            b'\00\00\xfe\xff': 'utf-32-be',
+            b'\xef\xbb\xbf': 'utf-8',
+            b'': 'utf-8',
+            b'\00': 'utf-8',
+        }
+        for bom, expected_encoding in boms.items():
+            try:
+                f = NamedTemporaryFile(delete=False)
+                f.write(bom)
+                f.close()
+                self.assertEqual(expected_encoding, detect_unicode_encoding(f.name))
+            finally:
+                f.close()
+                os.remove(f.name)
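The committed test closes the NamedTemporaryFile before reopening it by name, which is what lets the check work on platforms (notably Windows) where a still-open temporary file cannot be reopened. As a hedged alternative sketch, the same check can be written with a temporary directory so no manual cleanup is needed:

    import os
    import tempfile

    from picard.util import detect_unicode_encoding

    def check_bom(bom: bytes, expected: str) -> None:
        # Write only the BOM bytes to a throwaway file; the directory and the
        # file are removed automatically when the with-block exits.
        with tempfile.TemporaryDirectory() as tmp:
            path = os.path.join(tmp, 'bom_sample')
            with open(path, 'wb') as f:
                f.write(bom)
            assert detect_unicode_encoding(path) == expected

    check_bom(b'\xff\xfe', 'utf-16-le')
    check_bom(b'', 'utf-8')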