BCP47: add function for finding the first variant not matching its prefixes

Part of the implementation of #3307.
This commit is contained in:
Moritz Bunkus 2022-03-29 13:01:53 +02:00
parent e6bcf00a6d
commit 3048c61d92
No known key found for this signature in database
GPG Key ID: 74AF00ADF2E32C85
3 changed files with 117 additions and 11 deletions

View File

@ -379,16 +379,9 @@ language_c::matches_prefix(language_c const &prefix,
}
bool
language_c::validate_extlang() {
if (m_extended_language_subtag.empty())
return true;
auto extlang = mtx::iana::language_subtag_registry::look_up_extlang(m_extended_language_subtag);
if (!extlang) // Should not happen as the parsing checks this already.
return false;
if (extlang->prefixes.empty())
language_c::validate_prefixes(std::vector<std::string> const &prefixes)
const noexcept {
if (prefixes.empty())
return true;
prefix_restrictions_t restrictions;
@ -399,7 +392,7 @@ language_c::validate_extlang() {
value = true;
};
for (auto const &prefix : extlang->prefixes) {
for (auto const &prefix : prefixes) {
parsed_prefixes.emplace_back(parse(prefix));
auto const &tag = parsed_prefixes.back();
@ -414,6 +407,44 @@ language_c::validate_extlang() {
if (matches_prefix(parsed_prefix, restrictions))
return true;
return false;
}
/** Finds the first variant subtag whose registered prefixes reject this tag.

   Walks \c m_variants in order. Variants that cannot be found in the
   registry or that list no prefixes are skipped; for the remaining ones
   \c validate_prefixes() decides whether this tag matches.

   \returns the first offending variant, or an empty string if there are
     no variants or none of them fails the prefix check.
*/
std::string
language_c::get_first_variant_not_matching_prefixes()
  const noexcept {
  for (auto const &candidate : m_variants) {
    auto entry = mtx::iana::language_subtag_registry::look_up_variant(candidate);

    // A failed lookup should not happen as the parsing checks this
    // already; an entry without prefixes has nothing to validate.
    if (!entry || entry->prefixes.empty())
      continue;

    if (!validate_prefixes(entry->prefixes))
      return candidate;
  }

  return {};
}
bool
language_c::validate_extlang() {
if (m_extended_language_subtag.empty())
return true;
auto extlang = mtx::iana::language_subtag_registry::look_up_extlang(m_extended_language_subtag);
if (!extlang) // Should not happen as the parsing checks this already.
return false;
if (validate_prefixes(extlang->prefixes))
return true;
auto message = Y("The extended language subtag '{}' must only be used with one of the following prefixes: {}.");
m_parser_error = fmt::format(message, m_extended_language_subtag, fmt::join(extlang->prefixes, ", "));

View File

@ -111,6 +111,8 @@ public:
std::vector<std::string> const &get_private_use() const noexcept;
std::string const &get_grandfathered() const noexcept;
std::string get_first_variant_not_matching_prefixes() const noexcept;
protected:
std::string format_internal(bool force) const noexcept;
@ -124,6 +126,7 @@ protected:
bool validate_extensions();
bool validate_extlang();
bool validate_variants();
bool validate_prefixes(std::vector<std::string> const &prefixes) const noexcept;
bool matches_prefix(language_c const &prefix, prefix_restrictions_t const &restrictions) const noexcept;
language_c &canonicalize_preferred_values();

View File

@ -567,4 +567,76 @@ TEST(BCP47LanguageTags, NormalizationForDCNCTags) {
EXPECT_EQ("pt-BR"s, language_c::parse("QBP"s).format());
}
// Exercises language_c::get_first_variant_not_matching_prefixes(): the
// expected result is the empty string when the tag has no variant, the
// variant has no prefixes, or the prefixes match, and the offending
// variant subtag otherwise. Every tag below is still expected to report
// is_valid() == true, even when a variant is returned.
// NOTE(review): normalization is set to none, presumably so that parsing
// does not rewrite the tags; the conversions are applied explicitly via
// to_canonical_form()/to_extlang_form() where needed — confirm.
TEST(BCP47LanguageTags, VariantPrefixValidation) {
language_c::set_normalization_mode(norm_e::none);
auto l = language_c::parse("pt-BR"s);
EXPECT_TRUE(l.is_valid());
EXPECT_EQ(""s, l.get_first_variant_not_matching_prefixes()); // no variant
l = language_c::parse("da-DK-fonipa"s);
EXPECT_TRUE(l.is_valid());
EXPECT_EQ(""s, l.get_first_variant_not_matching_prefixes()); // variant without prefixes
l = language_c::parse("de-1996"s);
EXPECT_TRUE(l.is_valid());
EXPECT_EQ(""s, l.get_first_variant_not_matching_prefixes()); // prefixes valid
l = language_c::parse("de-DE-1996"s);
EXPECT_TRUE(l.is_valid());
EXPECT_EQ(""s, l.get_first_variant_not_matching_prefixes()); // prefixes valid
l = language_c::parse("fr-1996"s);
EXPECT_TRUE(l.is_valid());
EXPECT_EQ("1996"s, l.get_first_variant_not_matching_prefixes()); // prefixes invalid: language code doesn't match
l = language_c::parse("pt-BR-abl1943"s);
EXPECT_TRUE(l.is_valid());
EXPECT_EQ(""s, l.get_first_variant_not_matching_prefixes()); // prefixes valid: pt-BR matches
l = language_c::parse("pt-abl1943"s);
EXPECT_TRUE(l.is_valid());
EXPECT_EQ("abl1943"s, l.get_first_variant_not_matching_prefixes()); // prefixes invalid: pt-BR is valid, pt isn't
l = language_c::parse("zh-cmn-Latn-tongyong"s);
EXPECT_TRUE(l.is_valid());
EXPECT_EQ(""s, l.get_first_variant_not_matching_prefixes()); // prefixes valid
l = language_c::parse("yue-jyutping"s);
EXPECT_TRUE(l.is_valid());
EXPECT_EQ(""s, l.get_first_variant_not_matching_prefixes()); // prefixes valid
l = language_c::parse("zh-yue-jyutping"s);
EXPECT_TRUE(l.is_valid());
EXPECT_EQ("jyutping"s, l.get_first_variant_not_matching_prefixes()); // prefixes invalid: yue would be a valid prefix but zh-yue isn't
// Same object, re-checked after conversion to canonical form.
l.to_canonical_form();
EXPECT_TRUE(l.is_valid());
EXPECT_EQ(""s, l.get_first_variant_not_matching_prefixes()); // prefixes valid: zh-yue-jyutping canonicals to yue-jyutping & yue is a valid prefix
l = language_c::parse("cmn-pinyin"s);
EXPECT_TRUE(l.is_valid());
EXPECT_EQ("pinyin"s, l.get_first_variant_not_matching_prefixes()); // prefixes invalid: missing Latn
l = language_c::parse("zh-cmn-pinyin"s);
EXPECT_TRUE(l.is_valid());
EXPECT_EQ("pinyin"s, l.get_first_variant_not_matching_prefixes()); // prefixes invalid: missing Latn
l = language_c::parse("cmn-Latn-pinyin"s);
EXPECT_TRUE(l.is_valid());
EXPECT_EQ("pinyin"s, l.get_first_variant_not_matching_prefixes()); // prefixes invalid: only valid for zh-Latn, not cmn-Latn
// Same object, re-checked after conversion to extlang form.
l.to_extlang_form();
EXPECT_TRUE(l.is_valid());
EXPECT_EQ(""s, l.get_first_variant_not_matching_prefixes()); // prefixes valid now as extlang form is zh-cmn-Latn-pinyin
l = language_c::parse("zh-cmn-Latn-pinyin"s);
EXPECT_TRUE(l.is_valid());
EXPECT_EQ(""s, l.get_first_variant_not_matching_prefixes()); // prefixes valid (directly)
l = language_c::parse("zh-cmn-Hans-pinyin"s);
EXPECT_TRUE(l.is_valid());
EXPECT_EQ("pinyin"s, l.get_first_variant_not_matching_prefixes()); // prefixes invalid: script not Latn
}
}