Files
ladybird/Meta/Lagom/Tools/CodeGenerators/LibLocale/GenerateNumberFormatData.cpp
Timothy Flynn 67f3de2320 LibJS+LibLocale: Begin replacing number formatting with ICU
This uses ICU for the Intl.NumberFormat `format` and `formatToParts`
prototypes. It does not yet port the range formatter prototypes.

Most of the new code in LibLocale/NumberFormat is simply mapping from
ECMA-402 types to ICU types. Beyond that, the only algorithmic change is
that we have to mutate the output from ICU for `formatToParts` to match
what is expected by ECMA-402. This is explained in NumberFormat.cpp in
`flatten_partitions`.

This lets us remove most data from our number format generator. All that
remains are numbering system digits and symbols, which are relied upon
still for other interfaces (e.g. Intl.DateTimeFormat). So they will be
removed in a future patch.

Note: All of the changes to the test files in this patch are now aligned
with both Chrome and Safari.
2024-06-10 13:51:51 +02:00

437 lines
16 KiB
C++

/*
* Copyright (c) 2021-2023, Tim Flynn <trflynn89@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include "../LibUnicode/GeneratorUtil.h" // FIXME: Move this somewhere common.
#include <AK/Array.h>
#include <AK/ByteString.h>
#include <AK/Format.h>
#include <AK/HashFunctions.h>
#include <AK/HashMap.h>
#include <AK/JsonObject.h>
#include <AK/JsonParser.h>
#include <AK/JsonValue.h>
#include <AK/LexicalPath.h>
#include <AK/QuickSort.h>
#include <AK/SourceGenerator.h>
#include <AK/StringBuilder.h>
#include <AK/Traits.h>
#include <AK/Utf8View.h>
#include <LibCore/ArgsParser.h>
#include <LibCore/Directory.h>
#include <LibFileSystem/FileSystem.h>
#include <LibLocale/NumberFormat.h>
#include <math.h>
using NumericSymbolList = Vector<size_t>;
struct NumberSystem {
unsigned hash() const { return int_hash(symbols); }
bool operator==(NumberSystem const& other) const { return (symbols == other.symbols); }
size_t symbols { 0 };
};
template<>
struct AK::Formatter<NumberSystem> : Formatter<FormatString> {
ErrorOr<void> format(FormatBuilder& builder, NumberSystem const& system)
{
return Formatter<FormatString>::format(builder, "{{ {} }}"sv, system.symbols);
}
};
template<>
struct AK::Traits<NumberSystem> : public DefaultTraits<NumberSystem> {
static unsigned hash(NumberSystem const& s) { return s.hash(); }
};
struct LocaleData {
Vector<size_t> number_systems;
};
struct CLDR {
UniqueStringStorage unique_strings;
UniqueStorage<NumericSymbolList> unique_symbols;
UniqueStorage<NumberSystem> unique_systems;
HashMap<ByteString, Array<u32, 10>> number_system_digits;
Vector<ByteString> number_systems;
HashMap<ByteString, LocaleData> locales;
};
static ErrorOr<void> parse_number_system_digits(ByteString core_supplemental_path, CLDR& cldr)
{
LexicalPath number_systems_path(move(core_supplemental_path));
number_systems_path = number_systems_path.append("numberingSystems.json"sv);
auto number_systems = TRY(read_json_file(number_systems_path.string()));
auto const& supplemental_object = number_systems.as_object().get_object("supplemental"sv).value();
auto const& number_systems_object = supplemental_object.get_object("numberingSystems"sv).value();
number_systems_object.for_each_member([&](auto const& number_system, auto const& digits_object) {
auto type = digits_object.as_object().get_byte_string("_type"sv).value();
if (type != "numeric"sv)
return;
auto digits = digits_object.as_object().get_byte_string("_digits"sv).value();
Utf8View utf8_digits { digits };
VERIFY(utf8_digits.length() == 10);
auto& number_system_digits = cldr.number_system_digits.ensure(number_system);
size_t index = 0;
for (u32 digit : utf8_digits)
number_system_digits[index++] = digit;
if (!cldr.number_systems.contains_slow(number_system))
cldr.number_systems.append(number_system);
});
return {};
}
static ErrorOr<void> parse_number_systems(ByteString locale_numbers_path, CLDR& cldr, LocaleData& locale)
{
LexicalPath numbers_path(move(locale_numbers_path));
numbers_path = numbers_path.append("numbers.json"sv);
auto numbers = TRY(read_json_file(numbers_path.string()));
auto const& main_object = numbers.as_object().get_object("main"sv).value();
auto const& locale_object = main_object.get_object(numbers_path.parent().basename()).value();
auto const& locale_numbers_object = locale_object.get_object("numbers"sv).value();
auto const& minimum_grouping_digits = locale_numbers_object.get_byte_string("minimumGroupingDigits"sv).value();
Vector<Optional<NumberSystem>> number_systems;
number_systems.resize(cldr.number_systems.size());
auto ensure_number_system = [&](auto const& system) -> NumberSystem& {
auto system_index = cldr.number_systems.find_first_index(system).value();
VERIFY(system_index < number_systems.size());
auto& number_system = number_systems.at(system_index);
if (!number_system.has_value())
number_system = NumberSystem {};
return number_system.value();
};
auto numeric_symbol_from_string = [&](StringView numeric_symbol) -> Optional<Locale::NumericSymbol> {
if (numeric_symbol == "approximatelySign"sv)
return Locale::NumericSymbol::ApproximatelySign;
if (numeric_symbol == "decimal"sv)
return Locale::NumericSymbol::Decimal;
if (numeric_symbol == "exponential"sv)
return Locale::NumericSymbol::Exponential;
if (numeric_symbol == "group"sv)
return Locale::NumericSymbol::Group;
if (numeric_symbol == "infinity"sv)
return Locale::NumericSymbol::Infinity;
if (numeric_symbol == "minusSign"sv)
return Locale::NumericSymbol::MinusSign;
if (numeric_symbol == "nan"sv)
return Locale::NumericSymbol::NaN;
if (numeric_symbol == "percentSign"sv)
return Locale::NumericSymbol::PercentSign;
if (numeric_symbol == "plusSign"sv)
return Locale::NumericSymbol::PlusSign;
if (numeric_symbol == "timeSeparator"sv)
return Locale::NumericSymbol::TimeSeparator;
return {};
};
locale_numbers_object.for_each_member([&](auto const& key, JsonValue const& value) {
constexpr auto symbols_prefix = "symbols-numberSystem-"sv;
constexpr auto misc_patterns_prefix = "miscPatterns-numberSystem-"sv;
if (key.starts_with(symbols_prefix)) {
auto system = key.substring(symbols_prefix.length());
auto& number_system = ensure_number_system(system);
NumericSymbolList symbols;
value.as_object().for_each_member([&](auto const& symbol, JsonValue const& localization) {
auto numeric_symbol = numeric_symbol_from_string(symbol);
if (!numeric_symbol.has_value())
return;
if (to_underlying(*numeric_symbol) >= symbols.size())
symbols.resize(to_underlying(*numeric_symbol) + 1);
auto symbol_index = cldr.unique_strings.ensure(localization.as_string());
symbols[to_underlying(*numeric_symbol)] = symbol_index;
});
// The range separator does not appear in the symbols list, we have to extract it from
// the range pattern.
auto misc_patterns_key = ByteString::formatted("{}{}", misc_patterns_prefix, system);
auto misc_patterns = locale_numbers_object.get_object(misc_patterns_key).value();
auto range_separator = misc_patterns.get_byte_string("range"sv).value();
auto begin_index = range_separator.find("{0}"sv).value() + "{0}"sv.length();
auto end_index = range_separator.find("{1}"sv).value();
range_separator = range_separator.substring(begin_index, end_index - begin_index);
if (to_underlying(Locale::NumericSymbol::RangeSeparator) >= symbols.size())
symbols.resize(to_underlying(Locale::NumericSymbol::RangeSeparator) + 1);
auto symbol_index = cldr.unique_strings.ensure(move(range_separator));
symbols[to_underlying(Locale::NumericSymbol::RangeSeparator)] = symbol_index;
number_system.symbols = cldr.unique_symbols.ensure(move(symbols));
}
});
locale.number_systems.ensure_capacity(number_systems.size());
for (auto& number_system : number_systems) {
size_t system_index = 0;
if (number_system.has_value())
system_index = cldr.unique_systems.ensure(number_system.release_value());
locale.number_systems.append(system_index);
}
// locale.minimum_grouping_digits = minimum_grouping_digits.template to_number<u8>().value();
return {};
}
static ErrorOr<void> parse_all_locales(ByteString core_path, ByteString numbers_path, CLDR& cldr)
{
LexicalPath core_supplemental_path(move(core_path));
core_supplemental_path = core_supplemental_path.append("supplemental"sv);
VERIFY(FileSystem::is_directory(core_supplemental_path.string()));
TRY(parse_number_system_digits(core_supplemental_path.string(), cldr));
auto remove_variants_from_path = [&](ByteString path) -> ErrorOr<ByteString> {
auto parsed_locale = TRY(CanonicalLanguageID::parse(cldr.unique_strings, LexicalPath::basename(path)));
StringBuilder builder;
builder.append(cldr.unique_strings.get(parsed_locale.language));
if (auto script = cldr.unique_strings.get(parsed_locale.script); !script.is_empty())
builder.appendff("-{}", script);
if (auto region = cldr.unique_strings.get(parsed_locale.region); !region.is_empty())
builder.appendff("-{}", region);
return builder.to_byte_string();
};
TRY(Core::Directory::for_each_entry(TRY(String::formatted("{}/main", numbers_path)), Core::DirIterator::SkipParentAndBaseDir, [&](auto& entry, auto& directory) -> ErrorOr<IterationDecision> {
auto numbers_path = LexicalPath::join(directory.path().string(), entry.name).string();
auto language = TRY(remove_variants_from_path(numbers_path));
auto& locale = cldr.locales.ensure(language);
TRY(parse_number_systems(numbers_path, cldr, locale));
return IterationDecision::Continue;
}));
return {};
}
static ByteString format_identifier(StringView, ByteString const& identifier)
{
return identifier.to_titlecase();
}
static ErrorOr<void> generate_unicode_locale_header(Core::InputBufferedFile& file, CLDR& cldr)
{
StringBuilder builder;
SourceGenerator generator { builder };
generator.append(R"~~~(
#pragma once
#include <AK/Types.h>
namespace Locale {
)~~~");
generate_enum(generator, format_identifier, "NumberSystem"sv, {}, cldr.number_systems);
generator.append(R"~~~(
}
)~~~");
TRY(file.write_until_depleted(generator.as_string_view().bytes()));
return {};
}
static ErrorOr<void> generate_unicode_locale_implementation(Core::InputBufferedFile& file, CLDR& cldr)
{
StringBuilder builder;
SourceGenerator generator { builder };
generator.set("string_index_type"sv, cldr.unique_strings.type_that_fits());
generator.set("numeric_symbol_list_index_type"sv, cldr.unique_symbols.type_that_fits());
generator.append(R"~~~(
#include <AK/Array.h>
#include <AK/Optional.h>
#include <AK/Span.h>
#include <AK/StringView.h>
#include <LibLocale/Locale.h>
#include <LibLocale/LocaleData.h>
#include <LibLocale/NumberFormat.h>
#include <LibLocale/NumberFormatData.h>
namespace Locale {
)~~~");
cldr.unique_strings.generate(generator);
generator.append(R"~~~(
struct NumberSystemData {
@numeric_symbol_list_index_type@ symbols { 0 };
};
)~~~");
cldr.unique_symbols.generate(generator, cldr.unique_strings.type_that_fits(), "s_numeric_symbol_lists"sv);
cldr.unique_systems.generate(generator, "NumberSystemData"sv, "s_number_systems"sv, 10);
auto locales = cldr.locales.keys();
quick_sort(locales);
auto append_map = [&](ByteString name, auto type, auto const& map) {
generator.set("name", move(name));
generator.set("type", type);
generator.set("size", ByteString::number(map.size()));
generator.append(R"~~~(
static constexpr Array<@type@, @size@> @name@ { {)~~~");
bool first = true;
for (auto const& item : map) {
generator.append(first ? " "sv : ", "sv);
if constexpr (requires { item.value; })
generator.append(ByteString::number(item.value));
else
generator.append(ByteString::number(item));
first = false;
}
generator.append(" } };");
};
generate_mapping(generator, cldr.number_system_digits, "u32"sv, "s_number_systems_digits"sv, "s_number_systems_digits_{}"sv, nullptr, [&](auto const& name, auto const& value) { append_map(name, "u32"sv, value); });
generate_mapping(generator, cldr.locales, cldr.unique_systems.type_that_fits(), "s_locale_number_systems"sv, "s_number_systems_{}"sv, nullptr, [&](auto const& name, auto const& value) { append_map(name, cldr.unique_systems.type_that_fits(), value.number_systems); });
generator.append(R"~~~(
static Optional<NumberSystem> keyword_to_number_system(KeywordNumbers keyword)
{
switch (keyword) {)~~~");
for (auto const& number_system : cldr.number_systems) {
generator.set("name"sv, format_identifier({}, number_system));
generator.append(R"~~~(
case KeywordNumbers::@name@:
return NumberSystem::@name@;)~~~");
}
generator.append(R"~~~(
default:
return {};
}
}
Optional<ReadonlySpan<u32>> get_digits_for_number_system(StringView system)
{
auto number_system_keyword = keyword_nu_from_string(system);
if (!number_system_keyword.has_value())
return {};
auto number_system_value = keyword_to_number_system(*number_system_keyword);
if (!number_system_value.has_value())
return {};
auto number_system_index = to_underlying(*number_system_value);
return s_number_systems_digits[number_system_index];
}
static NumberSystemData const* find_number_system(StringView locale, StringView system)
{
auto locale_value = locale_from_string(locale);
if (!locale_value.has_value())
return nullptr;
auto locale_index = to_underlying(*locale_value) - 1; // Subtract 1 because 0 == Locale::None.
auto const& number_systems = s_locale_number_systems.at(locale_index);
auto lookup_number_system = [&](auto number_system) -> NumberSystemData const* {
auto number_system_keyword = keyword_nu_from_string(number_system);
if (!number_system_keyword.has_value())
return nullptr;
auto number_system_value = keyword_to_number_system(*number_system_keyword);
if (!number_system_value.has_value())
return nullptr;
auto number_system_index = to_underlying(*number_system_value);
number_system_index = number_systems.at(number_system_index);
if (number_system_index == 0)
return nullptr;
return &s_number_systems.at(number_system_index);
};
if (auto const* number_system = lookup_number_system(system))
return number_system;
auto default_number_system = get_preferred_keyword_value_for_locale(locale, "nu"sv);
if (!default_number_system.has_value())
return nullptr;
return lookup_number_system(*default_number_system);
}
Optional<StringView> get_number_system_symbol(StringView locale, StringView system, NumericSymbol symbol)
{
if (auto const* number_system = find_number_system(locale, system); number_system != nullptr) {
auto symbols = s_numeric_symbol_lists.at(number_system->symbols);
auto symbol_index = to_underlying(symbol);
if (symbol_index >= symbols.size())
return {};
return decode_string(symbols[symbol_index]);
}
return {};
}
}
)~~~");
TRY(file.write_until_depleted(generator.as_string_view().bytes()));
return {};
}
ErrorOr<int> serenity_main(Main::Arguments arguments)
{
StringView generated_header_path;
StringView generated_implementation_path;
StringView core_path;
StringView numbers_path;
Core::ArgsParser args_parser;
args_parser.add_option(generated_header_path, "Path to the Unicode locale header file to generate", "generated-header-path", 'h', "generated-header-path");
args_parser.add_option(generated_implementation_path, "Path to the Unicode locale implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path");
args_parser.add_option(core_path, "Path to cldr-core directory", "core-path", 'r', "core-path");
args_parser.add_option(numbers_path, "Path to cldr-numbers directory", "numbers-path", 'n', "numbers-path");
args_parser.parse(arguments);
auto generated_header_file = TRY(open_file(generated_header_path, Core::File::OpenMode::Write));
auto generated_implementation_file = TRY(open_file(generated_implementation_path, Core::File::OpenMode::Write));
CLDR cldr;
TRY(parse_all_locales(core_path, numbers_path, cldr));
TRY(generate_unicode_locale_header(*generated_header_file, cldr));
TRY(generate_unicode_locale_implementation(*generated_implementation_file, cldr));
return 0;
}