mirror of
https://gitlab.com/mbunkus/mkvtoolnix.git
synced 2024-12-24 11:54:01 +00:00
BCP 47: add function for getting the closest ISO 639-2 code for a tag
There are several languages that aren't part of ISO 639-2 but are part of ISO 639-3 or ISO 639-5. For those languages the legacy Matroska language elements cannot be set to the ISO 639 alpha-3 code of the BCP 47 language tag. However, for a lot of such languages the ISO 639 alpha-3 code is also a valid extlang subtag of a BCP 47 tag. For example: the language "Yue Chinese" has an ISO 639 alpha-3 code of `yue` but isn't part of ISO 639-2. However, `yue` is also a valid extlang. Each extlang must have a prefix for which it is valid (in the case of `yue` it's `zh`), and that prefix must in turn be an ISO 639 code itself; therefore the prefix language's ISO 639-2 code is the closest representation. Part of the implementation of #3307.
This commit is contained in:
parent
bc357697bc
commit
1015808193
@ -532,6 +532,31 @@ language_c::get_iso639_2_alpha_3_code_or(std::string const &value_if_invalid)
|
||||
return value_if_invalid;
|
||||
}
|
||||
|
||||
std::string
|
||||
language_c::get_closest_iso639_2_alpha_3_code()
|
||||
const noexcept {
|
||||
if (!m_valid || m_language.empty())
|
||||
return "und"s;
|
||||
|
||||
auto language = mtx::iso639::look_up(m_language);
|
||||
if (!language)
|
||||
return "und"s;
|
||||
|
||||
if (language->is_part_of_iso639_2)
|
||||
return language->alpha_3_code;
|
||||
|
||||
auto extlang = mtx::iana::language_subtag_registry::look_up_extlang(language->alpha_3_code);
|
||||
if (!extlang || extlang->prefixes.empty())
|
||||
return "und"s;
|
||||
|
||||
auto prefix_language = mtx::iso639::look_up(extlang->prefixes.front());
|
||||
|
||||
if (prefix_language && prefix_language->is_part_of_iso639_2)
|
||||
return prefix_language->alpha_3_code;
|
||||
|
||||
return "und"s;
|
||||
}
|
||||
|
||||
language_c &
|
||||
language_c::set_valid(bool valid) {
|
||||
m_valid = valid;
|
||||
|
@ -59,6 +59,7 @@ public:
|
||||
bool has_valid_iso639_2_code() const noexcept;
|
||||
std::string get_iso639_alpha_3_code() const noexcept;
|
||||
std::string get_iso639_2_alpha_3_code_or(std::string const &value_if_invalid) const noexcept;
|
||||
// Returns the language's own ISO 639-2 alpha 3 code if it has one,
// otherwise the ISO 639-2 code of the prefix language of its extlang
// entry, and "und" if neither exists or the tag is invalid.
std::string get_closest_iso639_2_alpha_3_code() const noexcept;
|
||||
|
||||
bool has_valid_iso3166_1_alpha_2_or_top_level_domain_country_code() const noexcept;
|
||||
std::string get_iso3166_1_alpha_2_code() const noexcept;
|
||||
|
@ -394,4 +394,32 @@ TEST(BCP47LanguageTags, ISO3166_1_Alpha2Codes) {
|
||||
EXPECT_EQ("uk"s, language_c::parse("en-GB").get_top_level_domain_country_code());
|
||||
}
|
||||
|
||||
TEST(BCP47LanguageTags, ClosestISO639_2_Alpha3Code) {
  // Parses a tag and queries the closest ISO 639-2 alpha 3 code in one go.
  auto const closest_code_for = [](std::string const &tag) {
    return language_c::parse(tag).get_closest_iso639_2_alpha_3_code();
  };

  // Cases in which the fallback value "und" is expected.
  EXPECT_EQ("und"s, language_c{}.get_closest_iso639_2_alpha_3_code()); // default-constructed, empty entry
  EXPECT_EQ("und"s, closest_code_for("moocow"));                       // invalid entry
  EXPECT_EQ("und"s, closest_code_for("x-muh-to-the-kuh"));             // valid, but carries no ISO 639 code
  EXPECT_EQ("und"s, closest_code_for("aiw"));                          // valid, not in ISO 639-2 and not an extlang

  // One more "und" case would be "valid, is an extlang, prefix is an
  // ISO 639 code that is not part of ISO 639-2". It cannot be tested:
  // as of 2022-03-23 no such entry exists, as every prefix currently
  // used by an extlang does have an ISO 639-2 code.

  // Languages that are part of ISO 639-2 themselves.
  EXPECT_EQ("fre"s, closest_code_for("fr-FR"));
  EXPECT_EQ("ger"s, closest_code_for("de"));
  EXPECT_EQ("ger"s, closest_code_for("deu"));
  EXPECT_EQ("ger"s, closest_code_for("ger"));

  // The interesting cases: languages without an ISO 639-2 code that
  // are extlangs whose prefix language does have one.
  //   `yue` = "Yue Chinese"           → prefix `zh`, ISO 639-2 `chi` = "Chinese"
  //   `bfi` = "British Sign Language" → `sgn` = "Sign Languages"
  //   `zsm` = "Standard Malay"        → `may` = "Malay (macrolanguage)"
  EXPECT_EQ("chi"s, closest_code_for("yue"));
  EXPECT_EQ("sgn"s, closest_code_for("bfi"));
  EXPECT_EQ("may"s, closest_code_for("zsm"));
}
|
||||
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user