diff --git a/AK/Utf8View.cpp b/AK/Utf8View.cpp
index 14eec8294d..05d67cfa20 100644
--- a/AK/Utf8View.cpp
+++ b/AK/Utf8View.cpp
@@ -76,6 +76,10 @@ Utf8View Utf8View::unicode_substring_view(size_t code_point_offset, size_t code_
 
 size_t Utf8View::calculate_length() const
 {
+    // FIXME: The CPU-specific implementations behave differently on null inputs. Until that is resolved, we treat a null view as an empty string and report a length of 0 without calling into them.
+    if (is_empty())
+        return 0;
+
     // FIXME: simdutf's code point length method assumes valid UTF-8, whereas Utf8View uses U+FFFD as a replacement
     // for invalid code points. If we change Utf8View to only accept valid encodings as an invariant, we can
     // remove this branch.
@@ -155,6 +159,12 @@ Utf8View Utf8View::trim(Utf8View const& characters, TrimMode mode) const
 
 bool Utf8View::validate(size_t& valid_bytes, AllowSurrogates allow_surrogates) const
 {
+    // FIXME: The CPU-specific implementations behave differently on null inputs. Until that is resolved, we treat a null view as an empty string: it is reported as valid with zero valid bytes.
+    if (is_empty()) {
+        valid_bytes = 0;
+        return true;
+    }
+
     auto result = simdutf::validate_utf8_with_errors(m_string.characters_without_null_termination(), m_string.length());
     valid_bytes = result.count;
 
diff --git a/Tests/AK/TestUtf8.cpp b/Tests/AK/TestUtf8.cpp
index 726d79dca0..988707c41d 100644
--- a/Tests/AK/TestUtf8.cpp
+++ b/Tests/AK/TestUtf8.cpp
@@ -48,6 +48,18 @@ TEST_CASE(decode_utf8)
     EXPECT_EQ(i, expected_size);
 }
 
+TEST_CASE(null_view)
+{
+    Utf8View view;
+    EXPECT(view.validate(Utf8View::AllowSurrogates::No));
+    EXPECT(view.validate(Utf8View::AllowSurrogates::Yes));
+    EXPECT_EQ(view.byte_length(), 0zu);
+    EXPECT_EQ(view.length(), 0zu);
+
+    for ([[maybe_unused]] auto it : view)
+        FAIL("Iterating a null UTF-8 string should not produce any values");
+}
+
 TEST_CASE(validate_invalid_ut8)
 {
     size_t valid_bytes;