diff --git a/picard/disc/dbpoweramplog.py b/picard/disc/dbpoweramplog.py index b243df57c..42726ffc0 100644 --- a/picard/disc/dbpoweramplog.py +++ b/picard/disc/dbpoweramplog.py @@ -26,7 +26,7 @@ from picard.disc.utils import ( TocEntry, calculate_mb_toc_numbers, ) -from picard.util import detect_unicode_encoding +from picard.util import detect_file_encoding RE_TOC_ENTRY = re.compile( @@ -50,6 +50,6 @@ def filter_toc_entries(lines): def toc_from_file(path): """Reads dBpoweramp log files, generates MusicBrainz disc TOC listing for use as discid.""" - encoding = detect_unicode_encoding(path) + encoding = detect_file_encoding(path) with open(path, 'r', encoding=encoding) as f: return calculate_mb_toc_numbers(filter_toc_entries(f)) diff --git a/picard/disc/eaclog.py b/picard/disc/eaclog.py index 5d0256719..dbbd6627a 100644 --- a/picard/disc/eaclog.py +++ b/picard/disc/eaclog.py @@ -31,7 +31,7 @@ from picard.disc.utils import ( TocEntry, calculate_mb_toc_numbers, ) -from picard.util import detect_unicode_encoding +from picard.util import detect_file_encoding RE_TOC_TABLE_HEADER = re.compile(r""" \s* @@ -82,6 +82,6 @@ def toc_from_file(path): Warning: may work wrong for discs having data tracks. May generate wrong results on other non-standard cases.""" - encoding = detect_unicode_encoding(path) + encoding = detect_file_encoding(path) with open(path, 'r', encoding=encoding) as f: return calculate_mb_toc_numbers(filter_toc_entries(f)) diff --git a/picard/util/__init__.py b/picard/util/__init__.py index 275e0f261..3090fbac3 100644 --- a/picard/util/__init__.py +++ b/picard/util/__init__.py @@ -40,7 +40,13 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
- +try: + from charset_normalizer import detect +except ImportError: + try: + from chardet import detect + except ImportError: + detect = None from collections import ( defaultdict, namedtuple, @@ -1186,13 +1192,16 @@ ENCODING_BOMS = { } -def detect_unicode_encoding(path): - """Attempts to guess the unicode encoding of a file based on the BOM. +def detect_file_encoding(path, max_bytes_to_read=1024*256): + """Attempts to guess the unicode encoding of a file based on the BOM, and + depending on availability, using a charset detection method. - Assumes UTF-8 by default if there is no BOM. + Assumes UTF-8 by default if no other encoding is detected. Args: path: The path to the file + max_bytes_to_read: Maximum bytes to read from the file during encoding + detection. Returns: The encoding as a string, e.g. "utf-16-le" or "utf-8" """ @@ -1201,4 +1210,14 @@ def detect_unicode_encoding(path): for bom, encoding in ENCODING_BOMS.items(): if first_bytes.startswith(bom): return encoding - return 'utf-8' + + if detect is None: + return 'utf-8' + + f.seek(0) + result = detect(f.read(max_bytes_to_read)) + if result['encoding'] is None: + log.warning("Couldn't detect encoding for file %r", path) + result['encoding'] = 'UTF-8' + + return result['encoding'].lower() diff --git a/requirements-macos-10.12.txt b/requirements-macos-10.12.txt index e22a63eda..8fb0d54c1 100644 --- a/requirements-macos-10.12.txt +++ b/requirements-macos-10.12.txt @@ -8,3 +8,4 @@ pyobjc-framework-Cocoa==9.0.1 PyQt5==5.13.1 python-dateutil==2.8.2 PyYAML==6.0.1 +charset-normalizer==3.3.2 diff --git a/requirements-macos-10.14.txt b/requirements-macos-10.14.txt index f44578fc3..5c2593f8b 100644 --- a/requirements-macos-10.14.txt +++ b/requirements-macos-10.14.txt @@ -8,3 +8,4 @@ pyobjc-framework-Cocoa==9.1.1 PyQt5==5.15.10 python-dateutil==2.8.2 PyYAML==6.0.1 +charset-normalizer==3.3.2 diff --git a/requirements-win.txt b/requirements-win.txt index f0b3ed5e5..5f5a75fe9 100644 --- a/requirements-win.txt +++ 
b/requirements-win.txt @@ -7,3 +7,4 @@ PyQt5==5.15.10 python-dateutil==2.8.2 pywin32==306 PyYAML==6.0.1 +charset-normalizer==3.3.2 diff --git a/requirements.txt b/requirements.txt index 0a58b0818..09fd93e35 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,3 +10,4 @@ PyQt5~=5.11 python-dateutil~=2.7 pywin32; sys_platform == 'win32' PyYAML>=5.1, <7 +charset-normalizer~=3.3.2 diff --git a/test/data/eac-windows1251.log b/test/data/eac-windows1251.log new file mode 100644 index 000000000..f69d999ad --- /dev/null +++ b/test/data/eac-windows1251.log @@ -0,0 +1,143 @@ +Åüàöò Àóäèî Öîïé Â1.6 ôðîì 23. Îöòîáåð 2020 + +ÅÀÖ åüòðàöòèîí ëîãôèëå ôðîì 24. Æàíóàðé 2022, 21:33 + +ïîðíîïõîíèqóå / Áðàâå Íåú Úîðëä + +Óñåä äðèâå : ÒÑÑÒöîðïÖÄÄÂÄÚ ÑÅ-218ÁÁ Àäàïòåð: 1 ÈÄ: 1 + +Ðåàä ìîäå : Áóðñò + +Ðåàä îôôñåò öîððåöòèîí : 6 +Îâåððåàä èíòî Ëåàä-Èí àíä Ëåàä-Îóò : Íî +Ôèëë óï ìèññèíã îôôñåò ñàìïëåñ úèòõ ñèëåíöå : Éåñ +Äåëåòå ëåàäèíã àíä òðàèëèíã ñèëåíò áëîöêñ : Íî +Íóëë ñàìïëåñ óñåä èí ÖÐÖ öàëöóëàòèîíñ : Éåñ +Óñåä èíòåðôàöå : Íàòèâå Úèí32 èíòåðôàöå ôîð Úèí ÍÒ & 2000 +Ãàï õàíäëèíã : Íîò äåòåöòåä, òõóñ àïïåíäåä òî ïðåâèîóñ òðàöê + +Óñåä îóòïóò ôîðìàò : Óñåð Äåôèíåä Åíöîäåð +Ñåëåöòåä áèòðàòå : 768 êÁèò/ñ +Qóàëèòé : Õèãõ +Àää ÈÄ3 òàã : Íî +Öîììàíä ëèíå öîìïðåññîð : Ö:\Ïðîãðàì Ôèëåñ (ü86)\Åüàöò Àóäèî Öîïé\ÔËÀÖ\ÔËÀÖ.ÅÜÅ +Àääèòèîíàë öîììàíä ëèíå îïòèîíñ : -6 - -Ò "ÀÐÒÈÑÒ=#1072;ðòèñò%" -Ò "ÒÈÒËÅ=#1090;èòëå%" -Ò "ÀËÁÓÌ=#1072;ëáóìòèòëå%" -Ò "ÄÀÒÅ=#1081;åàð%" -Ò "ÒÐÀÖÊÍÓÌÁÅÐ=#1090;ðàöêíð%" -Ò "ÃÅÍÐÅ=#1075;åíðå%" -Ò "ÖÎÌÌÅÍÒ=#1094;îììåíò%" -Ò "ÁÀÍÄ=#1072;ëáóìèíòåðïðåò%" -Ò "ÀËÁÓÌÀÐÒÈÑÒ=#1072;ëáóìèíòåðïðåò%" -Ò "ÖÎÌÏÎÑÅÐ=#1094;îìïîñåð%" %õàñëéðèöñ%--òàã-ôðîì-ôèëå=1051;ÉÐÈÖÑ=ëéðèöñôèëå%"%õàñëéðèöñ% -Ò "ÄÈÑÖÍÓÌÁÅÐ=#1094;äíóìáåð%" -Ò "ÒÎÒÀËÄÈÑÖÑ=#1090;îòàëöäñ%" -Ò "ÒÎÒÀËÒÐÀÖÊÑ=#1085;óìòðàöêñ%" %õàñöîâåð%--ïèöòóðå=öîâåðôèëå%"%õàñöîâåð% %ñîóðöå% -î %äåñò% + + +ÒÎÖ îô òõå åüòðàöòåä ÖÄ + + Òðàöê | Ñòàðò | Ëåíãòõ | Ñòàðò ñåöòîð | Åíä ñåöòîð + --------------------------------------------------------- + 1 
| 0:00.00 | 5:32.14 | 0 | 24913 + 2 | 5:32.14 | 4:07.22 | 24914 | 43460 + 3 | 9:39.36 | 3:50.29 | 43461 | 60739 + 4 | 13:29.65 | 4:56.00 | 60740 | 82939 + 5 | 18:25.65 | 3:45.35 | 82940 | 99849 + 6 | 22:11.25 | 3:20.57 | 99850 | 114906 + 7 | 25:32.07 | 4:33.26 | 114907 | 135407 + 8 | 30:05.33 | 3:03.40 | 135408 | 149172 + + +Òðàöê 1 + + Ôèëåíàìå Ö:\Óñåðñ\Äåâåëîïåð\Ìóñèö\ÅÀÖ\Áðàâå Íåú Úîðëä\01 Öîìèíã Õîìå.úàâ + + Ïåàê ëåâåë 94.8 % + Åüòðàöòèîí ñïýä 9.0 Ü + Öîïé ÖÐÖ Å924ÔÁ20 + Àööóðàòåëé ðèïïåä (öîíôèäåíöå 2) [3182EB6A] (ÀÐ â2) + Öîïé ÎÊ + +Òðàöê 2 + + Ôèëåíàìå Ö:\Óñåðñ\Äåâåëîïåð\Ìóñèö\ÅÀÖ\Áðàâå Íåú Úîðëä\02 Ñàâå Ãàìå.úàâ + + Ïåàê ëåâåë 98.1 % + Åüòðàöòèîí ñïýä 12.0 Ü + Öîïé ÖÐÖ Ô19À6ÀÔÖ + Àööóðàòåëé ðèïïåä (öîíôèäåíöå 2) [79A374E7] (ÀÐ â2) + Öîïé ÎÊ + +Òðàöê 3 + + Ôèëåíàìå Ö:\Óñåðñ\Äåâåëîïåð\Ìóñèö\ÅÀÖ\Áðàâå Íåú Úîðëä\03 Âîèöåñ èí Ìé Õåàä.úàâ + + Ïåàê ëåâåë 97.2 % + Åüòðàöòèîí ñïýä 13.0 Ü + Öîïé ÖÐÖ 300ÄÔ15Ö + Àööóðàòåëé ðèïïåä (öîíôèäåíöå 2) [77DE3AFE] (ÀÐ â2) + Öîïé ÎÊ + +Òðàöê 4 + + Ôèëåíàìå Ö:\Óñåðñ\Äåâåëîïåð\Ìóñèö\ÅÀÖ\Áðàâå Íåú Úîðëä\04 Òõå Ñîíãñ Úå Ñàíã Òîãåòõåð.úàâ + + Ïåàê ëåâåë 98.8 % + Åüòðàöòèîí ñïýä 14.0 Ü + Öîïé ÖÐÖ 1657126Ô + Àööóðàòåëé ðèïïåä (öîíôèäåíöå 2) [4D824E2F] (ÀÐ â2) + Öîïé ÎÊ + +Òðàöê 5 + + Ôèëåíàìå Ö:\Óñåðñ\Äåâåëîïåð\Ìóñèö\ÅÀÖ\Áðàâå Íåú Úîðëä\05 Íèãõò Úèëë Ôàëë.úàâ + + Ïåàê ëåâåë 92.8 % + Åüòðàöòèîí ñïýä 14.9 Ü + Öîïé ÖÐÖ 66ÅÔÅÀ59 + Àööóðàòåëé ðèïïåä (öîíôèäåíöå 2) [D0D39055] (ÀÐ â2) + Öîïé ÎÊ + +Òðàöê 6 + + Ôèëåíàìå Ö:\Óñåðñ\Äåâåëîïåð\Ìóñèö\ÅÀÖ\Áðàâå Íåú Úîðëä\06 Áðàâå Íåú Úîðëä.úàâ + + Ïåàê ëåâåë 96.5 % + Åüòðàöòèîí ñïýä 15.6 Ü + Öîïé ÖÐÖ 5Ä7Á712Á + Àööóðàòåëé ðèïïåä (öîíôèäåíöå 2) [CD69B7FC] (ÀÐ â2) + Öîïé ÎÊ + +Òðàöê 7 + + Ôèëåíàìå Ö:\Óñåðñ\Äåâåëîïåð\Ìóñèö\ÅÀÖ\Áðàâå Íåú Úîðëä\07 Úàâå Àôòåð Úàâå.úàâ + + Ïåàê ëåâåë 97.5 % + Åüòðàöòèîí ñïýä 16.4 Ü + Öîïé ÖÐÖ 694ÖÄÄ79 + Àööóðàòåëé ðèïïåä (öîíôèäåíöå 2) [FE6DBCBF] (ÀÐ â2) + Öîïé ÎÊ + +Òðàöê 8 + + Ôèëåíàìå Ö:\Óñåðñ\Äåâåëîïåð\Ìóñèö\ÅÀÖ\Áðàâå Íåú Úîðëä\08 Àúàêåíèíã.úàâ + + Ïåàê ëåâåë 98.4 % + 
Åüòðàöòèîí ñïýä 17.0 Ü + Öîïé ÖÐÖ 629ÖÖ273 + Àööóðàòåëé ðèïïåä (öîíôèäåíöå 2) [7EB7F715] (ÀÐ â2) + Öîïé ÎÊ + + +Àëë òðàöêñ àööóðàòåëé ðèïïåä + +Íî åððîðñ îööóððåä + +Åíä îô ñòàòóñ ðåïîðò + +---- ÖÓÅÒîîëñ ÄÁ Ïëóãèí Â2.1.6 + +[CTDB TOCID: nj4ubZdFWgR4oA9uNKS.7OHP30U-] ôîóíä +Ñóáìèò ðåñóëò: àëðåàäé ñóáìèòòåä +Òðàöê | ÖÒÄÁ Ñòàòóñ + 1 | (8/8) Àööóðàòåëé ðèïïåä + 2 | (8/8) Àööóðàòåëé ðèïïåä + 3 | (8/8) Àööóðàòåëé ðèïïåä + 4 | (8/8) Àööóðàòåëé ðèïïåä + 5 | (8/8) Àööóðàòåëé ðèïïåä + 6 | (8/8) Àööóðàòåëé ðèïïåä + 7 | (8/8) Àööóðàòåëé ðèïïåä + 8 | (8/8) Àööóðàòåëé ðèïïåä + + +==#1051;îã ÷åöêñóì Ô97Ý7Ö5Å011ÔÔ7520Á7ÀÖ437Ä42Á9ÁÖÖÄ5Ä80929Ô8213ÅÄÔ90656Ý39Á95Ô8À == \ No newline at end of file diff --git a/test/test_disc_eaclog.py b/test/test_disc_eaclog.py index 886f7334c..f86de1b78 100644 --- a/test/test_disc_eaclog.py +++ b/test/test_disc_eaclog.py @@ -72,6 +72,9 @@ class TestTocFromFile(PicardTestCase): toc = toc_from_file(test_log) self.assertEqual((1, 8, 149323, 150, 25064, 43611, 60890, 83090, 100000, 115057, 135558), toc) + def test_toc_from_file_eac_windows1251(self): + self._test_toc_from_file('eac-windows1251.log') + def test_toc_from_file_eac_utf8(self): self._test_toc_from_file('eac-utf8.log') diff --git a/test/test_utils.py b/test/test_utils.py index f005150fd..c6cda31ae 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -44,7 +44,10 @@ from unittest.mock import ( patch, ) -from test.picardtestcase import PicardTestCase +from test.picardtestcase import ( + PicardTestCase, + get_test_data_path, +) from picard import util from picard.const import MUSICBRAINZ_SERVERS @@ -57,7 +60,7 @@ from picard.util import ( album_artist_from_path, any_exception_isinstance, build_qurl, - detect_unicode_encoding, + detect_file_encoding, encoded_queryargs, extract_year_from_date, find_best_match, @@ -988,13 +991,13 @@ class IgnoreUpdatesContextTest(PicardTestCase): class DetectUnicodeEncodingTest(PicardTestCase): - def test_detect_encoding(self): + def 
test_detect_file_encoding_bom(self): boms = { b'\xff\xfe': 'utf-16-le', b'\xfe\xff': 'utf-16-be', b'\00\00\xff\xfe': 'utf-32-le', b'\00\00\xfe\xff': 'utf-32-be', - b'\xef\xbb\xbf': 'utf-8', + b'\xef\xbb\xbf': 'utf-8-sig', b'': 'utf-8', b'\00': 'utf-8', } @@ -1003,7 +1006,17 @@ class DetectUnicodeEncodingTest(PicardTestCase): f = NamedTemporaryFile(delete=False) f.write(bom) f.close() - self.assertEqual(expected_encoding, detect_unicode_encoding(f.name)) + self.assertEqual(expected_encoding, detect_file_encoding(f.name)) finally: f.close() os.remove(f.name) + + def test_detect_file_encoding_eac_utf_16_le(self): + expected_encoding = 'utf-16-le' + file_path = get_test_data_path('eac-utf16le.log') + self.assertEqual(expected_encoding, detect_file_encoding(file_path)) + + def test_detect_file_encoding_eac_windows_1251(self): + expected_encoding = 'windows-1251' + file_path = get_test_data_path('eac-windows1251.log') + self.assertEqual(expected_encoding, detect_file_encoding(file_path))