BCP 47: add function for getting the closest ISO 639-2 code for a tag

There are several languages that aren't part of ISO 639-2 but are part
of ISO 639-3 or ISO 639-5. For those languages the legacy Matroska
language elements cannot be set to the ISO 639 alpha 3 code of the
BCP 47 language tag.

However, there are a lot of such languages whose ISO 639 alpha 3 code
is a valid extlang subtag of a BCP 47 tag. For example: the language
"Yue Chinese" has an ISO 639 alpha 3 code of `yue` but isn't part of
ISO 639-2. However, `yue` is also a valid extlang.

As each extlang must have a prefix for which it is valid (in the case
of `yue` it's `zh`) and as that prefix must in turn be an ISO 639 code
itself, that prefix language's ISO 639-2 code is the closest
representation.

Part of the implementation of #3307.
This commit is contained in:
Moritz Bunkus 2022-03-23 22:15:05 +01:00
parent bc357697bc
commit 1015808193
No known key found for this signature in database
GPG Key ID: 74AF00ADF2E32C85
3 changed files with 54 additions and 0 deletions

View File

@ -532,6 +532,31 @@ language_c::get_iso639_2_alpha_3_code_or(std::string const &value_if_invalid)
return value_if_invalid;
}
// Returns the ISO 639-2 alpha 3 code that most closely represents this
// tag's language.
//
// Resolution order:
//   1. If the tag's own language has an ISO 639 entry that is part of
//      ISO 639-2, that entry's alpha 3 code is returned.
//   2. Otherwise the language's alpha 3 code is looked up as an extlang;
//      if found, its first prefix is looked up as an ISO 639 language
//      and, if that one is part of ISO 639-2, its alpha 3 code is
//      returned.
//   3. In every other case (invalid tag, empty language, unknown
//      language, not an extlang, extlang without prefixes, prefix not
//      part of ISO 639-2) the result is "und".
std::string
language_c::get_closest_iso639_2_alpha_3_code()
  const noexcept {
  if (m_valid && !m_language.empty()) {
    auto own_entry = mtx::iso639::look_up(m_language);

    if (own_entry) {
      // Direct hit: the language itself is part of ISO 639-2.
      if (own_entry->is_part_of_iso639_2)
        return own_entry->alpha_3_code;

      // Indirect route: treat the language's alpha 3 code as an extlang
      // and fall back to the language named by its first prefix.
      auto extlang_entry = mtx::iana::language_subtag_registry::look_up_extlang(own_entry->alpha_3_code);

      if (extlang_entry && !extlang_entry->prefixes.empty()) {
        auto parent_entry = mtx::iso639::look_up(extlang_entry->prefixes.front());

        if (parent_entry && parent_entry->is_part_of_iso639_2)
          return parent_entry->alpha_3_code;
      }
    }
  }

  // Nothing suitable found.
  return "und"s;
}
language_c &
language_c::set_valid(bool valid) {
m_valid = valid;

View File

@ -59,6 +59,7 @@ public:
bool has_valid_iso639_2_code() const noexcept;
std::string get_iso639_alpha_3_code() const noexcept;
std::string get_iso639_2_alpha_3_code_or(std::string const &value_if_invalid) const noexcept;
// Returns the alpha 3 code of this tag's language if that language is
// part of ISO 639-2. Otherwise, if the language's alpha 3 code is a
// registered extlang, returns the alpha 3 code of the extlang's first
// prefix language provided that one is part of ISO 639-2. Returns "und"
// in all other cases (including invalid tags and tags without a language).
std::string get_closest_iso639_2_alpha_3_code() const noexcept;
bool has_valid_iso3166_1_alpha_2_or_top_level_domain_country_code() const noexcept;
std::string get_iso3166_1_alpha_2_code() const noexcept;

View File

@ -394,4 +394,32 @@ TEST(BCP47LanguageTags, ISO3166_1_Alpha2Codes) {
EXPECT_EQ("uk"s, language_c::parse("en-GB").get_top_level_domain_country_code());
}
TEST(BCP47LanguageTags, ClosestISO639_2_Alpha3Code) {
  // Shorthand: parse a tag and query its closest ISO 639-2 alpha 3 code.
  auto const closest = [](char const *tag) {
    return language_c::parse(tag).get_closest_iso639_2_alpha_3_code();
  };

  // Cases in which the fallback value "und" is expected.

  // A default-constructed (empty) entry:
  EXPECT_EQ("und"s, language_c{}.get_closest_iso639_2_alpha_3_code());
  // A tag that is not valid:
  EXPECT_EQ("und"s, closest("moocow"));
  // A valid tag that carries no ISO 639 code at all:
  EXPECT_EQ("und"s, closest("x-muh-to-the-kuh"));
  // A valid language without an ISO 639-2 code that isn't an extlang either:
  EXPECT_EQ("und"s, closest("aiw"));

  // One more "und" case would be "valid, is an extlang, prefix is an
  // ISO 639 code but not an ISO 639-2 one". It is not covered here
  // because as of 2022-03-23 no such entry exists: all current
  // extlang prefixes do have ISO 639-2 codes.

  // Languages that are directly part of ISO 639-2.
  EXPECT_EQ("fre"s, closest("fr-FR"));
  EXPECT_EQ("ger"s, closest("de"));
  EXPECT_EQ("ger"s, closest("deu"));
  EXPECT_EQ("ger"s, closest("ger"));

  // The interesting cases: languages without an ISO 639-2 code of
  // their own that are extlangs whose prefix does have one.
  //   `yue` = "Yue Chinese"           → prefix `zh` = "Chinese"               → `chi`
  //   `bfi` = "British Sign Language" →                 "Sign Languages"        → `sgn`
  //   `zsm` = "Standard Malay"        →                 "Malay (macrolanguage)" → `may`
  EXPECT_EQ("chi"s, closest("yue"));
  EXPECT_EQ("sgn"s, closest("bfi"));
  EXPECT_EQ("may"s, closest("zsm"));
}
}