From 101580819376ec70a86b00ad3361794a9569b87e Mon Sep 17 00:00:00 2001 From: Moritz Bunkus Date: Wed, 23 Mar 2022 22:15:05 +0100 Subject: [PATCH] BCP 47: add function for getting the closest ISO 639-2 code for a tag There are several languages that aren't part of ISO 639-2 but are part of the 639-3 or 639-5. For those languages the legacy Matroska language elements cannot be set to the ISO 639 alpha 3 code of the BCP 47 language tag. However, there are a lot of such languages whose ISO 639 alpha 3 code is a valid extlang subtag of a BCP 47 tag. For example: the language "Yue Chinese" has an ISO 639 alpha 3 code of `yue` but isn't part of ISO 639-2. However, `yue` is also a valid extlang. As each extlang must have a prefix for which it is valid (in the case of `yue` it's `zh`) and as that prefix must in turn be an ISO 639 code itself, that prefix language's ISO 639-2 code is the closest representation. Part of the implementation of #3307. --- src/common/bcp47.cpp | 25 +++++++++++++++++++++++++ src/common/bcp47.h | 1 + tests/unit/common/bcp47.cpp | 28 ++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+) diff --git a/src/common/bcp47.cpp b/src/common/bcp47.cpp index e9c35476e..5fd440b60 100644 --- a/src/common/bcp47.cpp +++ b/src/common/bcp47.cpp @@ -532,6 +532,31 @@ language_c::get_iso639_2_alpha_3_code_or(std::string const &value_if_invalid) return value_if_invalid; } +std::string +language_c::get_closest_iso639_2_alpha_3_code() + const noexcept { + if (!m_valid || m_language.empty()) + return "und"s; + + auto language = mtx::iso639::look_up(m_language); + if (!language) + return "und"s; + + if (language->is_part_of_iso639_2) + return language->alpha_3_code; + + auto extlang = mtx::iana::language_subtag_registry::look_up_extlang(language->alpha_3_code); + if (!extlang || extlang->prefixes.empty()) + return "und"s; + + auto prefix_language = mtx::iso639::look_up(extlang->prefixes.front()); + + if (prefix_language && prefix_language->is_part_of_iso639_2) + 
return prefix_language->alpha_3_code; + + return "und"s; +} + language_c & language_c::set_valid(bool valid) { m_valid = valid; diff --git a/src/common/bcp47.h b/src/common/bcp47.h index e36a2d374..2f6c4fc71 100644 --- a/src/common/bcp47.h +++ b/src/common/bcp47.h @@ -59,6 +59,7 @@ public: bool has_valid_iso639_2_code() const noexcept; std::string get_iso639_alpha_3_code() const noexcept; std::string get_iso639_2_alpha_3_code_or(std::string const &value_if_invalid) const noexcept; + std::string get_closest_iso639_2_alpha_3_code() const noexcept; bool has_valid_iso3166_1_alpha_2_or_top_level_domain_country_code() const noexcept; std::string get_iso3166_1_alpha_2_code() const noexcept; diff --git a/tests/unit/common/bcp47.cpp b/tests/unit/common/bcp47.cpp index c6e4e5237..b709423a3 100644 --- a/tests/unit/common/bcp47.cpp +++ b/tests/unit/common/bcp47.cpp @@ -394,4 +394,32 @@ TEST(BCP47LanguageTags, ISO3166_1_Alpha2Codes) { EXPECT_EQ("uk"s, language_c::parse("en-GB").get_top_level_domain_country_code()); } +TEST(BCP47LanguageTags, ClosestISO639_2_Alpha3Code) { + // default value returned in different cases + EXPECT_EQ("und"s, language_c{}.get_closest_iso639_2_alpha_3_code()); // empty entry + EXPECT_EQ("und"s, language_c::parse("moocow").get_closest_iso639_2_alpha_3_code()); // invalid entry + EXPECT_EQ("und"s, language_c::parse("x-muh-to-the-kuh").get_closest_iso639_2_alpha_3_code()); // valid but no ISO 639 code + EXPECT_EQ("und"s, language_c::parse("aiw").get_closest_iso639_2_alpha_3_code()); // valid but no 639-2 code & not an extlang + + // "Valid, is extlang, prefix is ISO 639 code but not ISO 639-2" + // would be another case when "und" should be returned, but as of + // 2022-03-23 there's no such entry. All current prefixes for + // extlangs do have ISO 639-2 codes. + + // Now some valid cases. 
+ EXPECT_EQ("fre"s, language_c::parse("fr-FR").get_closest_iso639_2_alpha_3_code()); + EXPECT_EQ("ger"s, language_c::parse("de").get_closest_iso639_2_alpha_3_code()); + EXPECT_EQ("ger"s, language_c::parse("deu").get_closest_iso639_2_alpha_3_code()); + EXPECT_EQ("ger"s, language_c::parse("ger").get_closest_iso639_2_alpha_3_code()); + + // Last the interesting cases: `yue` = "Yue Chinese" doesn't have an + // ISO 639-2 code, but it is an extlang with a prefix of `zh` = + // "Chinese" for which there is an ISO 639-2 code: `chi`. Similarly + // for the other two examples (`bfi` = "British Sign Language" → `sgn` = "Sign + // Languages"; `zsm` = "Standard Malay" → `may` = "Malay (macrolanguage)") + EXPECT_EQ("chi"s, language_c::parse("yue").get_closest_iso639_2_alpha_3_code()); + EXPECT_EQ("sgn"s, language_c::parse("bfi").get_closest_iso639_2_alpha_3_code()); + EXPECT_EQ("may"s, language_c::parse("zsm").get_closest_iso639_2_alpha_3_code()); +} + }