mirror of
https://github.com/fergalmoran/ladybird.git
synced 2025-12-22 09:19:03 +00:00
LibRegex: Don't repeat the same fork again
If some state has already been tried, skip over it as it would never
lead to a match regardless.
This fixes performance/memory issues in cases like
/(a+)+b/.exec("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")
or
/(a|a?)+b/...
Fixes #2622.
This commit is contained in:
committed by
Andreas Kling
parent
7ceeb85ba7
commit
cce000d57c
@@ -561,6 +561,33 @@ struct MatchState {
|
|||||||
COWVector<Vector<Match>> capture_group_matches;
|
COWVector<Vector<Match>> capture_group_matches;
|
||||||
COWVector<u64> repetition_marks;
|
COWVector<u64> repetition_marks;
|
||||||
Vector<u64, 64> checkpoints;
|
Vector<u64, 64> checkpoints;
|
||||||
|
|
||||||
|
// For size_t in {0..100}, ips in {0..500} and repetitions in {0..30}, there are zero collisions.
|
||||||
|
// For the full range, zero collisions were found in 8 million random samples.
|
||||||
|
u64 u64_hash() const
|
||||||
|
{
|
||||||
|
u64 hash = 0xcbf29ce484222325;
|
||||||
|
auto combine = [&hash](auto value) {
|
||||||
|
hash ^= value + 0x9e3779b97f4a7c15 + (hash << 6) + (hash >> 2);
|
||||||
|
};
|
||||||
|
auto combine_vector = [&hash](auto const& vector) {
|
||||||
|
for (auto& value : vector) {
|
||||||
|
hash ^= value;
|
||||||
|
hash *= 0x100000001b3;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
combine(string_position_before_match);
|
||||||
|
combine(string_position);
|
||||||
|
combine(string_position_in_code_units);
|
||||||
|
combine(instruction_position);
|
||||||
|
combine(fork_at_position);
|
||||||
|
combine(initiating_fork.value_or(0) + initiating_fork.has_value());
|
||||||
|
combine_vector(repetition_marks);
|
||||||
|
combine_vector(checkpoints);
|
||||||
|
|
||||||
|
return hash;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -468,6 +468,7 @@ bool Matcher<Parser>::execute(MatchInput const& input, MatchState& state, size_t
|
|||||||
}
|
}
|
||||||
|
|
||||||
BumpAllocatedLinkedList<MatchState> states_to_try_next;
|
BumpAllocatedLinkedList<MatchState> states_to_try_next;
|
||||||
|
HashTable<u64> seen_state_hashes;
|
||||||
#if REGEX_DEBUG
|
#if REGEX_DEBUG
|
||||||
size_t recursion_level = 0;
|
size_t recursion_level = 0;
|
||||||
#endif
|
#endif
|
||||||
@@ -545,17 +546,34 @@ bool Matcher<Parser>::execute(MatchInput const& input, MatchState& state, size_t
|
|||||||
continue;
|
continue;
|
||||||
case ExecutionResult::Succeeded:
|
case ExecutionResult::Succeeded:
|
||||||
return true;
|
return true;
|
||||||
case ExecutionResult::Failed:
|
case ExecutionResult::Failed: {
|
||||||
if (!states_to_try_next.is_empty()) {
|
bool found = false;
|
||||||
|
while (!states_to_try_next.is_empty()) {
|
||||||
state = states_to_try_next.take_last();
|
state = states_to_try_next.take_last();
|
||||||
|
if (auto hash = state.u64_hash(); seen_state_hashes.set(hash) != HashSetResult::InsertedNewEntry) {
|
||||||
|
dbgln_if(REGEX_DEBUG, "Already seen state, skipping: {}", hash);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
found = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (found)
|
||||||
continue;
|
continue;
|
||||||
}
|
|
||||||
return false;
|
return false;
|
||||||
|
}
|
||||||
case ExecutionResult::Failed_ExecuteLowPrioForks: {
|
case ExecutionResult::Failed_ExecuteLowPrioForks: {
|
||||||
if (states_to_try_next.is_empty()) {
|
bool found = false;
|
||||||
return false;
|
while (!states_to_try_next.is_empty()) {
|
||||||
|
state = states_to_try_next.take_last();
|
||||||
|
if (auto hash = state.u64_hash(); seen_state_hashes.set(hash) != HashSetResult::InsertedNewEntry) {
|
||||||
|
dbgln_if(REGEX_DEBUG, "Already seen state, skipping: {}", hash);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
found = true;
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
state = states_to_try_next.take_last();
|
if (!found)
|
||||||
|
return false;
|
||||||
#if REGEX_DEBUG
|
#if REGEX_DEBUG
|
||||||
++recursion_level;
|
++recursion_level;
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -1004,9 +1004,21 @@ static auto g_lots_of_a_s = ByteString::repeated('a', 10'000'000);
|
|||||||
|
|
||||||
BENCHMARK_CASE(fork_performance)
|
BENCHMARK_CASE(fork_performance)
|
||||||
{
|
{
|
||||||
Regex<ECMA262> re("(?:aa)*");
|
{
|
||||||
auto result = re.match(g_lots_of_a_s);
|
Regex<ECMA262> re("(?:aa)*");
|
||||||
EXPECT_EQ(result.success, true);
|
auto result = re.match(g_lots_of_a_s);
|
||||||
|
EXPECT_EQ(result.success, true);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
Regex<ECMA262> re("(a+)+b");
|
||||||
|
auto result = re.match(g_lots_of_a_s.substring_view(0, 100));
|
||||||
|
EXPECT_EQ(result.success, false);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
Regex<ECMA262> re("^(a|a?)+$");
|
||||||
|
auto result = re.match(ByteString::formatted("{}b", g_lots_of_a_s.substring_view(0, 100)));
|
||||||
|
EXPECT_EQ(result.success, false);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
BENCHMARK_CASE(anchor_performance)
|
BENCHMARK_CASE(anchor_performance)
|
||||||
|
|||||||
Reference in New Issue
Block a user