BCP47: add function to convert to canonical form

Part of the implementation of #3307.
This commit is contained in:
Moritz Bunkus 2022-03-26 00:22:09 +01:00
parent 37d48d5d2f
commit 5b6aab5f32
No known key found for this signature in database
GPG Key ID: 74AF00ADF2E32C85
3 changed files with 99 additions and 0 deletions

View File

@ -783,6 +783,79 @@ language_c::find_best_match(std::vector<language_c> const &potential_matches)
return best_match;
}
// Rewrites this tag using the table of preferred values
// (mtx::iana::language_subtag_registry::g_preferred_values).
//
// The table is searched front to back and only the first entry whose key
// this tag matches (as decided by matches()) is applied. If no entry
// matches, the tag is left untouched.
//
// For the entry that is applied, each member is handled on its own:
//   * if the preferred value names a language and the matched key has
//     content in that member, this tag's member is emptied first;
//   * if the preferred value has content in that member, it is copied
//     over this tag's member.
//
// Returns a reference to this object.
language_c &
language_c::canonicalize_preferred_values() {
  auto &registry = mtx::iana::language_subtag_registry::g_preferred_values;
  auto const hit = std::find_if(registry.begin(), registry.end(), [this](auto const &entry) { return matches(entry.first); });

  if (registry.end() == hit)
    return *this;

  auto const &[matched, replacement] = *hit;

  // Members named by the matched key are only dropped when the replacement
  // carries a language of its own.
  bool const drop_matched = !replacement.m_language.empty();

  // Applies the clear-then-assign rule described above to a single member.
  auto const substitute = [drop_matched](auto &own, auto const &matched_part, auto const &replacement_part) {
    if (drop_matched && !matched_part.empty())
      own.clear();

    if (!replacement_part.empty())
      own = replacement_part;
  };

  substitute(m_language,                  matched.m_language,                  replacement.m_language);
  substitute(m_extended_language_subtags, matched.m_extended_language_subtags, replacement.m_extended_language_subtags);
  substitute(m_script,                    matched.m_script,                    replacement.m_script);
  substitute(m_region,                    matched.m_region,                    replacement.m_region);
  substitute(m_variants,                  matched.m_variants,                  replacement.m_variants);
  substitute(m_extensions,                matched.m_extensions,                replacement.m_extensions);
  substitute(m_private_use,               matched.m_private_use,               replacement.m_private_use);
  substitute(m_grandfathered,             matched.m_grandfathered,             replacement.m_grandfathered);

  return *this;
}
// Converts this language tag to its canonical form in place.
//
// At present the only step performed is canonicalize_preferred_values(),
// whose result (a reference to this object) is passed through to the caller.
language_c &
language_c::to_canonical_form() {
return canonicalize_preferred_values();
}
void
language_c::disable() {
ms_disabled = true;

View File

@ -77,6 +77,8 @@ public:
bool matches(language_c const &match) const noexcept;
language_c find_best_match(std::vector<language_c> const &potential_matches) const noexcept;
// Converts this tag to its canonical form in place and returns a reference to it.
language_c &to_canonical_form();
language_c &set_valid(bool valid);
language_c &set_language(std::string const &language);
language_c &set_extended_language_subtags(std::vector<std::string> const &extended_language_subtags);
@ -113,6 +115,8 @@ protected:
bool validate_one_extlang(std::size_t extlang_index);
bool matches_prefix(language_c const &prefix, std::size_t extlang_index, prefix_restrictions_t const &restrictions) const noexcept;
// Applies the first matching entry of the preferred-values table to this tag
// in place and returns a reference to it; used by to_canonical_form().
language_c &canonicalize_preferred_values();
public:
static language_c parse(std::string const &language);

View File

@ -423,4 +423,26 @@ TEST(BCP47LanguageTags, Grandfathered) {
EXPECT_EQ("i-KLINGON"s, l.get_grandfathered());
}
// Verifies the formatted output of to_canonical_form() for tags that must
// stay as they are and for tags that must be rewritten.
TEST(BCP47LanguageTags, ToCanonicalForm) {
  // Parses the given tag, canonicalizes it and returns its formatted form.
  auto const canonical = [](std::string const &tag) {
    return language_c::parse(tag).to_canonical_form().format();
  };

  // Already normalized: expected to come back unchanged.
  EXPECT_EQ("sgn"s, canonical("sgn"s));
  EXPECT_EQ("nsi"s, canonical("nsi"s));

  // Listed as redundant but without a preferred value: expected to come back unchanged.
  EXPECT_EQ("az-Arab"s, canonical("az-Arab"s));

  // Tags that are expected to be rewritten.
  EXPECT_EQ("nsi"s,             canonical("sgn-nsi"s));
  EXPECT_EQ("ja-Latn-alalc97"s, canonical("ja-Latn-hepburn-heploc"s));
  EXPECT_EQ("jbo"s,             canonical("art-lojban"s));
  EXPECT_EQ("jsl"s,             canonical("sgn-JP"s));
  EXPECT_EQ("cmn"s,             canonical("zh-cmn"s));
  EXPECT_EQ("cmn-CN"s,          canonical("zh-cmn-CN"s));
  EXPECT_EQ("cmn-Hans"s,        canonical("zh-cmn-Hans"s));
  EXPECT_EQ("cmn"s,             canonical("zh-guoyu"s));
  EXPECT_EQ("hak"s,             canonical("zh-hakka"s));
  EXPECT_EQ("hak"s,             canonical("i-hak"s));
  EXPECT_EQ("yue-jyutping"s,    canonical("zh-yue-jyutping"s));
}
}