From 3048c61d920fb9ac19539a4c8197c976c125fe97 Mon Sep 17 00:00:00 2001 From: Moritz Bunkus Date: Tue, 29 Mar 2022 13:01:53 +0200 Subject: [PATCH] BCP47: add function for finding the first variant not matching its prefixes Part of the implementation of #3307. --- src/common/bcp47.cpp | 53 +++++++++++++++++++++------ src/common/bcp47.h | 3 ++ tests/unit/common/bcp47.cpp | 72 +++++++++++++++++++++++++++++++++++++ 3 files changed, 117 insertions(+), 11 deletions(-) diff --git a/src/common/bcp47.cpp b/src/common/bcp47.cpp index 85282d963..06e86868d 100644 --- a/src/common/bcp47.cpp +++ b/src/common/bcp47.cpp @@ -379,16 +379,9 @@ language_c::matches_prefix(language_c const &prefix, } bool -language_c::validate_extlang() { - if (m_extended_language_subtag.empty()) - return true; - - auto extlang = mtx::iana::language_subtag_registry::look_up_extlang(m_extended_language_subtag); - - if (!extlang) // Should not happen as the parsing checks this already. - return false; - - if (extlang->prefixes.empty()) +language_c::validate_prefixes(std::vector<std::string> const &prefixes) + const noexcept { + if (prefixes.empty()) return true; prefix_restrictions_t restrictions; @@ -399,7 +392,7 @@ language_c::validate_extlang() { value = true; }; - for (auto const &prefix : extlang->prefixes) { + for (auto const &prefix : prefixes) { parsed_prefixes.emplace_back(parse(prefix)); auto const &tag = parsed_prefixes.back(); @@ -414,6 +407,44 @@ language_c::validate_extlang() { if (matches_prefix(parsed_prefix, restrictions)) return true; + return false; +} + +std::string +language_c::get_first_variant_not_matching_prefixes() + const noexcept { + if (m_variants.empty()) + return {}; + + for (auto const &variant_str : m_variants) { + auto variant = mtx::iana::language_subtag_registry::look_up_variant(variant_str); + + if (!variant) // Should not happen as the parsing checks this already. 
+ continue; + + if (variant->prefixes.empty()) + continue; + + if (!validate_prefixes(variant->prefixes)) + return variant_str; + } + + return {}; +} + +bool +language_c::validate_extlang() { + if (m_extended_language_subtag.empty()) + return true; + + auto extlang = mtx::iana::language_subtag_registry::look_up_extlang(m_extended_language_subtag); + + if (!extlang) // Should not happen as the parsing checks this already. + return false; + + if (validate_prefixes(extlang->prefixes)) + return true; + auto message = Y("The extended language subtag '{}' must only be used with one of the following prefixes: {}."); m_parser_error = fmt::format(message, m_extended_language_subtag, fmt::join(extlang->prefixes, ", ")); diff --git a/src/common/bcp47.h b/src/common/bcp47.h index 735c652e7..c8f8488a5 100644 --- a/src/common/bcp47.h +++ b/src/common/bcp47.h @@ -111,6 +111,8 @@ public: std::vector<std::string> const &get_private_use() const noexcept; std::string const &get_grandfathered() const noexcept; + std::string get_first_variant_not_matching_prefixes() const noexcept; + protected: std::string format_internal(bool force) const noexcept; @@ -124,6 +126,7 @@ protected: bool validate_extensions(); bool validate_extlang(); bool validate_variants(); + bool validate_prefixes(std::vector<std::string> const &prefixes) const noexcept; bool matches_prefix(language_c const &prefix, prefix_restrictions_t const &restrictions) const noexcept; language_c &canonicalize_preferred_values(); diff --git a/tests/unit/common/bcp47.cpp b/tests/unit/common/bcp47.cpp index 41816020c..9775563c8 100644 --- a/tests/unit/common/bcp47.cpp +++ b/tests/unit/common/bcp47.cpp @@ -567,4 +567,76 @@ TEST(BCP47LanguageTags, NormalizationForDCNCTags) { EXPECT_EQ("pt-BR"s, language_c::parse("QBP"s).format()); } +TEST(BCP47LanguageTags, VariantPrefixValidation) { + language_c::set_normalization_mode(norm_e::none); + + auto l = language_c::parse("pt-BR"s); + EXPECT_TRUE(l.is_valid()); + EXPECT_EQ(""s, 
l.get_first_variant_not_matching_prefixes()); // no variant + + l = language_c::parse("da-DK-fonipa"s); + EXPECT_TRUE(l.is_valid()); + EXPECT_EQ(""s, l.get_first_variant_not_matching_prefixes()); // variant without prefixes + + l = language_c::parse("de-1996"s); + EXPECT_TRUE(l.is_valid()); + EXPECT_EQ(""s, l.get_first_variant_not_matching_prefixes()); // prefixes valid + + l = language_c::parse("de-DE-1996"s); + EXPECT_TRUE(l.is_valid()); + EXPECT_EQ(""s, l.get_first_variant_not_matching_prefixes()); // prefixes valid + + l = language_c::parse("fr-1996"s); + EXPECT_TRUE(l.is_valid()); + EXPECT_EQ("1996"s, l.get_first_variant_not_matching_prefixes()); // prefixes invalid: language code doesn't match + + l = language_c::parse("pt-BR-abl1943"s); + EXPECT_TRUE(l.is_valid()); + EXPECT_EQ(""s, l.get_first_variant_not_matching_prefixes()); // prefixes valid + + l = language_c::parse("pt-abl1943"s); + EXPECT_TRUE(l.is_valid()); + EXPECT_EQ("abl1943"s, l.get_first_variant_not_matching_prefixes()); // prefixes invalid: pt-BR is valid, pt isn't + + l = language_c::parse("zh-cmn-Latn-tongyong"s); + EXPECT_TRUE(l.is_valid()); + EXPECT_EQ(""s, l.get_first_variant_not_matching_prefixes()); // prefixes valid + + l = language_c::parse("yue-jyutping"s); + EXPECT_TRUE(l.is_valid()); + EXPECT_EQ(""s, l.get_first_variant_not_matching_prefixes()); // prefixes valid + + l = language_c::parse("zh-yue-jyutping"s); + EXPECT_TRUE(l.is_valid()); + EXPECT_EQ("jyutping"s, l.get_first_variant_not_matching_prefixes()); // prefixes invalid: yue would be a valid prefix but zh-yue isn't + + l.to_canonical_form(); + EXPECT_TRUE(l.is_valid()); + EXPECT_EQ(""s, l.get_first_variant_not_matching_prefixes()); // prefixes valid: zh-yue-jyutping canonicalizes to yue-jyutping & yue is a valid prefix + + l = language_c::parse("cmn-pinyin"s); + EXPECT_TRUE(l.is_valid()); + EXPECT_EQ("pinyin"s, l.get_first_variant_not_matching_prefixes()); // prefixes invalid: missing Latn + + l = 
language_c::parse("zh-cmn-pinyin"s); + EXPECT_TRUE(l.is_valid()); + EXPECT_EQ("pinyin"s, l.get_first_variant_not_matching_prefixes()); // prefixes invalid: missing Latn + + l = language_c::parse("cmn-Latn-pinyin"s); + EXPECT_TRUE(l.is_valid()); + EXPECT_EQ("pinyin"s, l.get_first_variant_not_matching_prefixes()); // prefixes invalid: only valid for zh-Latn, not cmn-Latn + + l.to_extlang_form(); + EXPECT_TRUE(l.is_valid()); + EXPECT_EQ(""s, l.get_first_variant_not_matching_prefixes()); // prefixes valid now as extlang form is zh-cmn-Latn-pinyin + + l = language_c::parse("zh-cmn-Latn-pinyin"s); + EXPECT_TRUE(l.is_valid()); + EXPECT_EQ(""s, l.get_first_variant_not_matching_prefixes()); // prefixes valid (directly) + + l = language_c::parse("zh-cmn-Hans-pinyin"s); + EXPECT_TRUE(l.is_valid()); + EXPECT_EQ("pinyin"s, l.get_first_variant_not_matching_prefixes()); // prefixes invalid: script not Latn +} + }