From cd559a101181ad8650f3e94c3f133bf048683255 Mon Sep 17 00:00:00 2001 From: Philipp Wolfer Date: Thu, 14 Oct 2021 22:53:22 +0200 Subject: [PATCH] Extend wildcard to regex syntax with [...] set matching --- picard/util/__init__.py | 36 +++++++++++++++++++++++++++++++++--- test/test_utils.py | 24 +++++++++++++++++++++--- 2 files changed, 54 insertions(+), 6 deletions(-) diff --git a/picard/util/__init__.py b/picard/util/__init__.py index 5210dfa58..5af57d9d1 100644 --- a/picard/util/__init__.py +++ b/picard/util/__init__.py @@ -791,7 +791,9 @@ def pattern_as_regex(pattern, allow_wildcards=False, flags=0): Wildcard matching currently supports these characters: - `*`: Matches an arbitrary number of characters or none, e.g. `fo*` matches "foo" or "foot". - `?`: Matches exactly one character, e.g. `fo?` matches "foo" or "for". - - `?`, `*` and `\\` can be escaped with a backslash \\ to match the literal character, e.g. `fo\\?` matches "fo?". + - `[...]`: Matches any character in the set, e.g. `[fo?]` matches all of "f", "o" and "?". + - `?`, `*`, `[`, `]` and `\\` can be escaped with a backslash \\ to match the literal + character, e.g. `fo\\?` matches "fo?". Args: pattern: The pattern as a string @@ -823,6 +825,7 @@ def wildcards_to_regex_pattern(pattern): The following syntax is supported: - `*`: Matches an arbitrary number of characters or none, e.g. `fo*` matches "foo" or "foot". - `?`: Matches exactly one character, e.g. `fo?` matches "foo" or "for". + - `[...]`: Matches any character in the set, e.g. `[fo?]` matches all of "f", "o" and "?". - `?`, `*` and `\\` can be escaped with a backslash \\ to match the literal character, e.g. `fo\\?` matches "fo?". Args: @@ -831,10 +834,28 @@ def wildcards_to_regex_pattern(pattern): Returns: A string with a valid regular expression. 
""" regex = [] + group = None escape = False for c in pattern: - if escape: - if c in ('*', '?', '\\'): + if group is not None: + if escape: + if c in ('\\', '[', ']'): + c = '\\' + c + else: + group.append('\\\\') + escape = False + if c == ']': + group.append(c) + part = ''.join(group) + group = None + elif c == '\\': + escape = True + continue + else: + group.append(c) + continue + elif escape: + if c in ('*', '?', '\\', '[', ']'): part = '\\' + c else: part = re.escape('\\' + c) @@ -842,6 +863,9 @@ def wildcards_to_regex_pattern(pattern): elif c == '\\': escape = True continue + elif c == '[': + group = ['['] + continue elif c == '*': part = '.*' elif c == '?': @@ -849,4 +873,10 @@ def wildcards_to_regex_pattern(pattern): else: part = re.escape(c) regex.append(part) + + # There might be an unclosed character group. Interpret the starting + # bracket of the group as a literal bracket and re-evaluate the rest. + if group is not None: + regex.append('\\[') + regex.append(wildcards_to_regex_pattern(''.join(group[1:]))) return ''.join(regex) diff --git a/test/test_utils.py b/test/test_utils.py index 0ee6eb4c2..4f3b148ff 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -602,13 +602,31 @@ class WildcardsToRegexPatternTest(PicardTestCase): re.compile(regex) def test_escape(self): - pattern = 'f\\?o\\*o?o*' + pattern = 'f\\?o\\*o?o*\\[o' regex = wildcards_to_regex_pattern(pattern) - self.assertEqual('f\\?o\\*o.o.*', regex) + self.assertEqual('f\\?o\\*o.o.*\\[o', regex) + re.compile(regex) + + def test_character_group(self): + pattern = '[abc*?xyz]]' + regex = wildcards_to_regex_pattern(pattern) + self.assertEqual('[abc*?xyz]\\]', regex) + re.compile(regex) + + def test_character_group_escape_square_brackets(self): + pattern = '[a[b\\]c]' + regex = wildcards_to_regex_pattern(pattern) + self.assertEqual('[a[b\\]c]', regex) + re.compile(regex) + + def test_open_character_group(self): + pattern = '[abc*?xyz[' + regex = wildcards_to_regex_pattern(pattern) + 
self.assertEqual('\\[abc.*.xyz\\[', regex) re.compile(regex) def test_special_chars(self): - pattern = '[]()\\^$|' + pattern = ']()\\^$|' regex = wildcards_to_regex_pattern(pattern) self.assertEqual(re.escape(pattern), regex) re.compile(regex)