diff --git a/src/common/bcp47.cpp b/src/common/bcp47.cpp index 27040feaa..4eb330173 100644 --- a/src/common/bcp47.cpp +++ b/src/common/bcp47.cpp @@ -783,6 +783,79 @@ language_c::find_best_match(std::vector const &potential_matches) return best_match; } +/* Rewrites this tag in place using the first entry of g_preferred_values for which matches(entry.first) is true, and returns *this. The object is returned unchanged when no entry matches. */ language_c & +language_c::canonicalize_preferred_values() { + auto &preferred_values = mtx::iana::language_subtag_registry::g_preferred_values; + auto idx = std::find_if(preferred_values.begin(), preferred_values.end(), + [this](auto const &pair) { + return matches(pair.first); + }); /* find_if stops at the first hit, so at most one entry is applied per call */ + + if (idx == preferred_values.end()) + return *this; + + // mxdebug(fmt::format("found one! I am {0} first is {1} second {2}\n", dump(), idx->first.dump(), idx->second.dump())); + + auto const &[match, preferred] = *idx; /* match: the registry entry that was hit; preferred: the value that replaces it */ + + if (!preferred.m_language.empty()) { /* only when the replacement supplies a language: clear every component that is non-empty in the matched entry */ + if (!match.m_language.empty()) + m_language.clear(); + + if (!match.m_extended_language_subtags.empty()) + m_extended_language_subtags.clear(); + + if (!match.m_script.empty()) + m_script.clear(); + + if (!match.m_region.empty()) + m_region.clear(); + + if (!match.m_variants.empty()) + m_variants.clear(); + + if (!match.m_extensions.empty()) + m_extensions.clear(); + + if (!match.m_private_use.empty()) + m_private_use.clear(); + + if (!match.m_grandfathered.empty()) + m_grandfathered.clear(); + } + /* Copy every non-empty component of the replacement over the current value; components the replacement leaves empty keep whatever value they have at this point. */ + if (!preferred.m_language.empty()) + m_language = preferred.m_language; + + if (!preferred.m_extended_language_subtags.empty()) + m_extended_language_subtags = preferred.m_extended_language_subtags; + + if (!preferred.m_script.empty()) + m_script = preferred.m_script; + + if (!preferred.m_region.empty()) + m_region = preferred.m_region; + + if (!preferred.m_variants.empty()) + m_variants = preferred.m_variants; /* NOTE(review): this replaces the whole variant list, not only the variants named by the matched entry - confirm that dropping any additional variants is intended */ + + if (!preferred.m_extensions.empty()) + m_extensions = preferred.m_extensions; + + if (!preferred.m_private_use.empty()) + m_private_use = preferred.m_private_use; + + if (!preferred.m_grandfathered.empty()) + m_grandfathered = preferred.m_grandfathered; + + return 
*this; +} + +/* Entry point for canonicalization, declared in the public section of the header; at present it only applies the preferred-value rewrite and returns its result. */ language_c & +language_c::to_canonical_form() { + return canonicalize_preferred_values(); +} + void language_c::disable() { ms_disabled = true; diff --git a/src/common/bcp47.h b/src/common/bcp47.h index fe01f814b..af4f20359 100644 --- a/src/common/bcp47.h +++ b/src/common/bcp47.h @@ -77,6 +77,8 @@ public: bool matches(language_c const &match) const noexcept; language_c find_best_match(std::vector const &potential_matches) const noexcept; + language_c &to_canonical_form(); /* modifies this object in place and returns a reference to it */ + language_c &set_valid(bool valid); language_c &set_language(std::string const &language); language_c &set_extended_language_subtags(std::vector const &extended_language_subtags); @@ -113,6 +115,8 @@ protected: bool validate_one_extlang(std::size_t extlang_index); bool matches_prefix(language_c const &prefix, std::size_t extlang_index, prefix_restrictions_t const &restrictions) const noexcept; + language_c &canonicalize_preferred_values(); /* helper called by to_canonical_form() */ + public: static language_c parse(std::string const &language); diff --git a/tests/unit/common/bcp47.cpp b/tests/unit/common/bcp47.cpp index 4edd439d9..9e9032166 100644 --- a/tests/unit/common/bcp47.cpp +++ b/tests/unit/common/bcp47.cpp @@ -423,4 +423,26 @@ TEST(BCP47LanguageTags, Grandfathered) { EXPECT_EQ("i-KLINGON"s, l.get_grandfathered()); } +/* Checks the format() output of parsed tags after to_canonical_form() has been applied. */ TEST(BCP47LanguageTags, ToCanonicalForm) { + // No changes as they're already normalized. + EXPECT_EQ("sgn"s, language_c::parse("sgn"s).to_canonical_form().format()); + EXPECT_EQ("nsi"s, language_c::parse("nsi"s).to_canonical_form().format()); + + // No changes as even though they're listed as redundant, they don't have preferred values. + EXPECT_EQ("az-Arab"s, language_c::parse("az-Arab"s).to_canonical_form().format()); + + // For the following there are changes. 
+ EXPECT_EQ("nsi"s, language_c::parse("sgn-nsi"s).to_canonical_form().format()); + EXPECT_EQ("ja-Latn-alalc97"s, language_c::parse("ja-Latn-hepburn-heploc"s).to_canonical_form().format()); /* only the trailing subtags change here; the leading ja-Latn part is kept */ + EXPECT_EQ("jbo"s, language_c::parse("art-lojban"s).to_canonical_form().format()); + EXPECT_EQ("jsl"s, language_c::parse("sgn-JP"s).to_canonical_form().format()); + EXPECT_EQ("cmn"s, language_c::parse("zh-cmn"s).to_canonical_form().format()); + EXPECT_EQ("cmn-CN"s, language_c::parse("zh-cmn-CN"s).to_canonical_form().format()); /* parts the rewrite does not name (here CN) are expected to survive */ + EXPECT_EQ("cmn-Hans"s, language_c::parse("zh-cmn-Hans"s).to_canonical_form().format()); + EXPECT_EQ("cmn"s, language_c::parse("zh-guoyu"s).to_canonical_form().format()); + EXPECT_EQ("hak"s, language_c::parse("zh-hakka"s).to_canonical_form().format()); + EXPECT_EQ("hak"s, language_c::parse("i-hak"s).to_canonical_form().format()); + EXPECT_EQ("yue-jyutping"s, language_c::parse("zh-yue-jyutping"s).to_canonical_form().format()); +} + }