Warn about scripts that should be suppressed in BCP 47 language tags (#3307).

Summary of this patch:
  * NEWS.md: announces the new GUI warning.
  * rake.d/iana_language_subtag_registry.rb: new generator step
    format_suppress_scripts, which collects (tag or subtag, suppress_script)
    pairs from the "language" and "extlang" registry entries and emits the
    reserve() and insert_or_assign() calls for g_suppress_scripts.
  * src/common/iana_language_subtag_registry{.h,_list.cpp}: declares and
    fills the g_suppress_scripts map (134 entries).
  * src/common/bcp47.{h,cpp}: new language_c::should_script_be_suppressed(),
    which is true when the tag's script equals (ASCII case-insensitively) the
    registered script of its language or of its extended language subtag.
  * src/mkvtoolnix-gui/util/language_dialog.cpp: adds the warning.
  * tests/unit/common/bcp47.cpp: unit test for the new method.

NOTE(review): indentation and column alignment were reconstructed with
two-space indentation and single spaces; the original horizontal whitespace
could not be recovered. If context lines fail to match, apply with
"git apply --ignore-whitespace" or "patch -l". Confirm against the tree.

diff --git a/NEWS.md b/NEWS.md
index a2d80e404..471ba9883 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -66,6 +66,9 @@
   variant's list of suitable prefixes. It'll also say if the corresponding
   canonical/extlang forms would have a suitable prefix. Part of the
   implementation of #3307.
+* MKVToolNix GUI: IETF BCP 47/RFC 5646 language tags: the language dialog now
+  shows a warning if a script is used with a language for which it should be
+  suppressed. Part of the implementation of #3307.
 
 ## Bug fixes
 
diff --git a/rake.d/iana_language_subtag_registry.rb b/rake.d/iana_language_subtag_registry.rb
index b38698ca7..8facc2dc2 100644
--- a/rake.d/iana_language_subtag_registry.rb
+++ b/rake.d/iana_language_subtag_registry.rb
@@ -26,6 +26,7 @@ namespace mtx::iana::language_subtag_registry {
 
 std::vector<entry_t> g_extlangs, g_variants, g_grandfathered;
 std::vector<std::pair<std::string, std::string>> g_preferred_values;
+std::unordered_map<std::string, std::string> g_suppress_scripts;
 
 using VS = std::vector<std::string>;
 
@@ -207,6 +208,19 @@ EOERB
     format_table(rows, :column_suffix => ',', :row_prefix => "  g_preferred_values.emplace_back(", :row_suffix => ");").join("\n")
   end
 
+  def self.format_suppress_scripts entries
+    name = "g_suppress_scripts"
+    rows = (entries["language"] + entries["extlang"]).
+      select { |e| !e[:suppress_script].blank? }.
+      map { |e| [ e[:tag] || e[:subtag], e[:suppress_script] ] }.
+      sort.
+      uniq.
+      map { |p| p.map(&:to_cpp_string) }
+
+    "  #{name}.reserve(#{rows.size});\n\n" +
+      format_table(rows, :column_suffix => ",", :row_prefix => "  #{name}.insert_or_assign(", :row_suffix => ");").join("\n")
+  end
+
   def self.do_create_cpp entries, isdcf_entries
     cpp_file_name = "src/common/iana_language_subtag_registry_list.cpp"
 
@@ -214,6 +228,7 @@ EOERB
     content_of[:init] = [
       self.format_extlangs_variants(entries, "extlang", "extlangs"), "",
       self.format_extlangs_variants(entries, "variant", "variants"), "",
+      self.format_suppress_scripts(entries), "",
       self.format_grandfathered(entries),
     ].join("\n")
     content_of[:init_preferred_values] = self.format_preferred_values(entries, isdcf_entries)
diff --git a/src/common/bcp47.cpp b/src/common/bcp47.cpp
index 06e86868d..2cbe5f6a1 100644
--- a/src/common/bcp47.cpp
+++ b/src/common/bcp47.cpp
@@ -918,6 +918,32 @@ language_c::to_extlang_form() {
   return *this;
 }
 
+bool
+language_c::should_script_be_suppressed()
+  const noexcept {
+  if (m_script.empty())
+    return false;
+
+  auto check = [this](std::string const &code) -> bool {
+    if (code.empty())
+      return false;
+
+    auto language = mtx::iso639::look_up(code);
+    if (!language)
+      return false;
+
+    auto const &suppressions = mtx::iana::language_subtag_registry::g_suppress_scripts;
+    auto itr = suppressions.find(language->alpha_3_code);
+
+    if ((itr == suppressions.end()) && !language->alpha_2_code.empty())
+      itr = suppressions.find(language->alpha_2_code);
+
+    return (itr != suppressions.end()) && (mtx::string::to_lower_ascii(itr->second) == mtx::string::to_lower_ascii(m_script));
+  };
+
+  return check(m_language) || check(m_extended_language_subtag);
+}
+
 void
 language_c::disable() {
   ms_disabled = true;
diff --git a/src/common/bcp47.h b/src/common/bcp47.h
index c8f8488a5..ae9668d39 100644
--- a/src/common/bcp47.h
+++ b/src/common/bcp47.h
@@ -112,6 +112,7 @@ public:
   std::string const &get_grandfathered() const noexcept;
 
   std::string get_first_variant_not_matching_prefixes() const noexcept;
+  bool should_script_be_suppressed() const noexcept;
 
 protected:
   std::string format_internal(bool force) const noexcept;
diff --git a/src/common/iana_language_subtag_registry.h b/src/common/iana_language_subtag_registry.h
index 460b43c0d..cb63c586b 100644
--- a/src/common/iana_language_subtag_registry.h
+++ b/src/common/iana_language_subtag_registry.h
@@ -37,6 +37,7 @@ struct entry_t {
 
 extern std::vector<entry_t> g_extlangs, g_variants, g_grandfathered;
 extern std::vector< std::pair<std::string, std::string> > g_preferred_values;
+extern std::unordered_map<std::string, std::string> g_suppress_scripts;
 
 void init();
 void init_preferred_values();
diff --git a/src/common/iana_language_subtag_registry_list.cpp b/src/common/iana_language_subtag_registry_list.cpp
index 807f4b28a..6c2e741f5 100644
--- a/src/common/iana_language_subtag_registry_list.cpp
+++ b/src/common/iana_language_subtag_registry_list.cpp
@@ -24,6 +24,7 @@ namespace mtx::iana::language_subtag_registry {
 
 std::vector<entry_t> g_extlangs, g_variants, g_grandfathered;
 std::vector<std::pair<std::string, std::string>> g_preferred_values;
+std::unordered_map<std::string, std::string> g_suppress_scripts;
 
 using VS = std::vector<std::string>;
 
@@ -395,6 +396,143 @@ init() {
   g_variants.emplace_back("wadegile"s, u8"Wade-Giles romanization"s, VS{ "zh-Latn"s }, false);
   g_variants.emplace_back("xsistemo"s, u8"Standard X-system orthographic fallback for spelling Esperanto"s, VS{ "eo"s }, false);
 
+  g_suppress_scripts.reserve(134);
+
+  g_suppress_scripts.insert_or_assign("ab"s, "Cyrl"s);
+  g_suppress_scripts.insert_or_assign("af"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("am"s, "Ethi"s);
+  g_suppress_scripts.insert_or_assign("ar"s, "Arab"s);
+  g_suppress_scripts.insert_or_assign("as"s, "Beng"s);
+  g_suppress_scripts.insert_or_assign("ay"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("be"s, "Cyrl"s);
+  g_suppress_scripts.insert_or_assign("bg"s, "Cyrl"s);
+  g_suppress_scripts.insert_or_assign("bn"s, "Beng"s);
+  g_suppress_scripts.insert_or_assign("bs"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("ca"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("ch"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("cs"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("cy"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("da"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("de"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("dsb"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("dv"s, "Thaa"s);
+  g_suppress_scripts.insert_or_assign("dz"s, "Tibt"s);
+  g_suppress_scripts.insert_or_assign("el"s, "Grek"s);
+  g_suppress_scripts.insert_or_assign("en"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("eo"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("es"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("et"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("eu"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("fa"s, "Arab"s);
+  g_suppress_scripts.insert_or_assign("fi"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("fj"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("fo"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("fr"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("frr"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("frs"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("fy"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("ga"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("gl"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("gn"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("gsw"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("gu"s, "Gujr"s);
+  g_suppress_scripts.insert_or_assign("gv"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("he"s, "Hebr"s);
+  g_suppress_scripts.insert_or_assign("hi"s, "Deva"s);
+  g_suppress_scripts.insert_or_assign("hr"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("hsb"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("ht"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("hu"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("hy"s, "Armn"s);
+  g_suppress_scripts.insert_or_assign("id"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("in"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("is"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("it"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("iw"s, "Hebr"s);
+  g_suppress_scripts.insert_or_assign("ja"s, "Jpan"s);
+  g_suppress_scripts.insert_or_assign("ka"s, "Geor"s);
+  g_suppress_scripts.insert_or_assign("kk"s, "Cyrl"s);
+  g_suppress_scripts.insert_or_assign("kl"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("km"s, "Khmr"s);
+  g_suppress_scripts.insert_or_assign("kn"s, "Knda"s);
+  g_suppress_scripts.insert_or_assign("ko"s, "Kore"s);
+  g_suppress_scripts.insert_or_assign("kok"s, "Deva"s);
+  g_suppress_scripts.insert_or_assign("la"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("lb"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("ln"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("lo"s, "Laoo"s);
+  g_suppress_scripts.insert_or_assign("lt"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("lv"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("mai"s, "Deva"s);
+  g_suppress_scripts.insert_or_assign("men"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("mg"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("mh"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("mk"s, "Cyrl"s);
+  g_suppress_scripts.insert_or_assign("ml"s, "Mlym"s);
+  g_suppress_scripts.insert_or_assign("mo"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("mr"s, "Deva"s);
+  g_suppress_scripts.insert_or_assign("ms"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("mt"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("my"s, "Mymr"s);
+  g_suppress_scripts.insert_or_assign("na"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("nb"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("nd"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("nds"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("ne"s, "Deva"s);
+  g_suppress_scripts.insert_or_assign("niu"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("nl"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("nn"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("no"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("nqo"s, "Nkoo"s);
+  g_suppress_scripts.insert_or_assign("nr"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("nso"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("ny"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("om"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("or"s, "Orya"s);
+  g_suppress_scripts.insert_or_assign("pa"s, "Guru"s);
+  g_suppress_scripts.insert_or_assign("pl"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("ps"s, "Arab"s);
+  g_suppress_scripts.insert_or_assign("pt"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("qu"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("rm"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("rn"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("ro"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("ru"s, "Cyrl"s);
+  g_suppress_scripts.insert_or_assign("rw"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("sg"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("si"s, "Sinh"s);
+  g_suppress_scripts.insert_or_assign("sk"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("sl"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("sm"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("so"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("sq"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("ss"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("st"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("sv"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("sw"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("ta"s, "Taml"s);
+  g_suppress_scripts.insert_or_assign("te"s, "Telu"s);
+  g_suppress_scripts.insert_or_assign("tem"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("th"s, "Thai"s);
+  g_suppress_scripts.insert_or_assign("ti"s, "Ethi"s);
+  g_suppress_scripts.insert_or_assign("tkl"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("tl"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("tmh"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("tn"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("to"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("tpi"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("tr"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("ts"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("tvl"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("uk"s, "Cyrl"s);
+  g_suppress_scripts.insert_or_assign("ur"s, "Arab"s);
+  g_suppress_scripts.insert_or_assign("ve"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("vi"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("xh"s, "Latn"s);
+  g_suppress_scripts.insert_or_assign("yi"s, "Hebr"s);
+  g_suppress_scripts.insert_or_assign("zbl"s, "Blis"s);
+  g_suppress_scripts.insert_or_assign("zu"s, "Latn"s);
+
   g_grandfathered.reserve(26);
 
   g_grandfathered.emplace_back("art-lojban"s, u8"Lojban"s, VS{}, true);
diff --git a/src/mkvtoolnix-gui/util/language_dialog.cpp b/src/mkvtoolnix-gui/util/language_dialog.cpp
index 91293ae44..2f8eea10b 100644
--- a/src/mkvtoolnix-gui/util/language_dialog.cpp
+++ b/src/mkvtoolnix-gui/util/language_dialog.cpp
@@ -456,6 +456,10 @@ LanguageDialog::determineInfoAndWarningsFor(mtx::bcp47::language_c const &tag) {
     lists.second << QY("The script '%1' is deprecated.").arg(Q(tag.get_script()));
   }
 
+  if (tag.should_script_be_suppressed())
+    lists.second << QY("The script '%1' should not be used for the language '%2' as it is the script the overwhelming majority of documents for this language is written in.")
+      .arg(Q(tag.get_script())).arg(Q(tag.get_language().empty() ? tag.get_extended_language_subtag() : tag.get_language()));
+
   if (!tag.get_region().empty()) {
     auto region = mtx::iso3166::look_up(tag.get_region());
     if (region && region->is_deprecated)
diff --git a/tests/unit/common/bcp47.cpp b/tests/unit/common/bcp47.cpp
index 6682d6fee..6c1cb060e 100644
--- a/tests/unit/common/bcp47.cpp
+++ b/tests/unit/common/bcp47.cpp
@@ -659,4 +659,14 @@ TEST(BCP47LanguageTags, VariantPrefixValidation) {
   EXPECT_EQ("ijekavsk"s, l.get_first_variant_not_matching_prefixes());
 }
 
+TEST(BCP47LanguageTags, ShouldScriptBeSuppressed) {
+  EXPECT_FALSE(language_c::parse("de").should_script_be_suppressed()); // no script to suppress
+  EXPECT_FALSE(language_c::parse("de-CH").should_script_be_suppressed()); // no script to suppress
+  EXPECT_TRUE(language_c::parse("de-Latn").should_script_be_suppressed());
+  EXPECT_TRUE(language_c::parse("de-Latn-CH").should_script_be_suppressed());
+  EXPECT_TRUE(language_c::parse("de-lATN-CH").should_script_be_suppressed());
+  EXPECT_TRUE(language_c::parse("deu-LAtN-Ch").should_script_be_suppressed());
+  EXPECT_TRUE(language_c::parse("ger-latn-ch").should_script_be_suppressed());
+}
+
 }