LibRegex+Everywhere: Make LibRegex more unicode-aware

This commit makes LibRegex (mostly) capable of operating on any of the three main string views:
- StringView for raw strings
- Utf8View for UTF-8 encoded strings
- Utf32View for raw Unicode strings

As a result, regexps with Unicode strings should be able to properly handle UTF-8 and not stop in the middle of a code point. A future commit will update LibJS to use the correct type of string depending on the flags.
2026-01-06 16:45:03 +00:00 · 2021-07-18 05:07:01 +04:30
parent e5af15a6e9
commit f364fcec5d
8 changed files with 310 additions and 207 deletions
--- a/Tests/LibRegex/Regex.cpp
+++ b/Tests/LibRegex/Regex.cpp
@@ -249,7 +249,7 @@ TEST_CASE(char_utf8)
    Regex<PosixExtended> re("😀");
    RegexResult result;

-    EXPECT_EQ((result = match("Привет, мир! 😀 γειά σου κόσμος 😀 こんにちは世界", re, PosixFlags::Global)).success, true);
+    EXPECT_EQ((result = match(Utf8View { "Привет, мир! 😀 γειά σου κόσμος 😀 こんにちは世界" }, re, PosixFlags::Global)).success, true);
    EXPECT_EQ(result.count, 2u);
 }

@@ -312,7 +312,6 @@ TEST_CASE(match_all_character_class)
    EXPECT_EQ(result.matches.at(0).view, "W");
    EXPECT_EQ(result.matches.at(1).view, "i");
    EXPECT_EQ(result.matches.at(2).view, "n");
-    EXPECT(&result.matches.at(0).view.characters_without_null_termination()[0] != &str.view().characters_without_null_termination()[1]);
 }

 TEST_CASE(match_character_class_with_assertion)