mirror of
https://github.com/fergalmoran/ladybird.git
synced 2026-01-06 16:45:03 +00:00
AK: Use simdutf when appending UTF-16 to StringBuilder
Adds a fast path for valid UTF-16 using `simdutf`, and falls back to the slow path for unmatched surrogates.
This commit is contained in:
committed by
Andreas Kling
parent
ff6020c207
commit
04920d06f0
@@ -78,4 +78,50 @@ template<FallibleFunction<char> Callback>
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute the maximum number of UTF-8 bytes needed to store a given UTF-16 string, accounting for unmatched UTF-16 surrogates.
|
||||
* This function will overcount by at most 33%; 2 bytes for every valid UTF-16 codepoint between U+100000 and U+10FFFF.
|
||||
*/
|
||||
[[nodiscard]] static inline size_t maximum_utf8_length_from_utf16(ReadonlySpan<u16> code_units)
|
||||
{
|
||||
// # UTF-8 code point -> no. UTF-8 bytes needed
|
||||
// U+0000 - U+007F => 1 UTF-8 bytes
|
||||
// U+0080 - U+07FF => 2 UTF-8 bytes
|
||||
// U+0800 - U+FFFF => 3 UTF-8 bytes
|
||||
// U+010000 - U+10FFFF => 4 UTF-8 bytes
|
||||
|
||||
// # UTF-16 code unit -> no. UTF-8 bytes needed
|
||||
// 0x0000 - 0x007f [U+000000 - U+00007F] = 1 UTF-8 bytes
|
||||
// 0x0080 - 0x07ff [U+000080 - U+0007FF] = 2 UTF-8 bytes
|
||||
// 0x0800 - 0xd7ff [U+000800 - U+00FFFF] = 3 UTF-8 bytes
|
||||
// 0xd800 - 0xdbff [U+010000 - U+10FFFF] = 4 UTF-8 bytes to encode valid UTF-16 code units,
|
||||
// or 3 UTF-8 bytes to encode the unmatched surrogate code unit.
|
||||
// 0xdc00 - 0xdfff [U+010000 - U+10FFFF] = 0 UTF-8 bytes to encode valid UTF-16 code units (because it is already accounted for in 0xdc00 - 0xdfff),
|
||||
// or 3 UTF-8 bytes to encode the unmatched surrogate code unit.
|
||||
// 0xe000 - 0xffff [U+00E000 - U+00FFFF] = 3 UTF-8 bytes
|
||||
|
||||
// # UTF-16 code unit -> actual length added.
|
||||
// 0x0000 - 0x007f = 1
|
||||
// 0x0080 - 0x07ff = 2
|
||||
// 0x0800 - 0xd7ff = 3
|
||||
// 0xd800 - 0xdbff = 3
|
||||
// ^ If the next code unit is 0xdc00 - 0xdfff, they will combined sum to 6, which is greater than the 4 required.
|
||||
// Otherwise, 3 bytes are needed to encode U+D800 - U+DBFF.
|
||||
// 0xdc00 - 0xdfff = 3
|
||||
// ^ If the previous code unit was, 0xd800 - 0xdbff, this will ensure that the combined sum is greater than 4.
|
||||
// Otherwise, 3 bytes are needed to encode U+DC00 - U+DFFF.
|
||||
// 0xe000 - 0xffff = 3
|
||||
|
||||
size_t maximum_utf8_length = 0;
|
||||
|
||||
// NOTE: This loop is designed to be easy to vectorize.
|
||||
for (auto code_unit : code_units) {
|
||||
maximum_utf8_length += 1;
|
||||
maximum_utf8_length += code_unit > 0x007f;
|
||||
maximum_utf8_length += code_unit > 0x07ff;
|
||||
}
|
||||
|
||||
return maximum_utf8_length;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user