GUI: BCP47: show warning if script should be suppressed

Part of the implementation of #3307.
This commit is contained in:
Moritz Bunkus 2022-03-29 21:15:28 +02:00
parent 3c52a6d26d
commit c50e582fa4
No known key found for this signature in database
GPG Key ID: 74AF00ADF2E32C85
8 changed files with 198 additions and 0 deletions

View File

@ -66,6 +66,9 @@
variant's list of suitable prefixes. It'll also say if the corresponding
canonical/extlang forms would have a suitable prefix. Part of the
implementation of #3307.
* MKVToolNix GUI: IETF BCP 47/RFC 5646 language tags: the language dialog now
shows a warning if a script is used with a language for which it should be
suppressed. Part of the implementation of #3307.
## Bug fixes

View File

@ -26,6 +26,7 @@ namespace mtx::iana::language_subtag_registry {
std::vector<entry_t> g_extlangs, g_variants, g_grandfathered;
std::vector<std::pair<mtx::bcp47::language_c, mtx::bcp47::language_c>> g_preferred_values;
std::unordered_map<std::string, std::string> g_suppress_scripts;
using VS = std::vector<std::string>;
@ -207,6 +208,19 @@ EOERB
format_table(rows, :column_suffix => ',', :row_prefix => " g_preferred_values.emplace_back(", :row_suffix => ");").join("\n")
end
def self.format_suppress_scripts entries
  # Builds the C++ statements filling the suppress-script table: a single
  # "reserve" call followed by one "insert_or_assign" call for each
  # language or extlang entry that carries a suppress script.
  name       = "g_suppress_scripts"
  candidates = entries["language"] + entries["extlang"]

  # Each pair is keyed by the entry's :tag if present, by its :subtag otherwise.
  pairs = candidates.
    reject  { |entry| entry[:suppress_script].blank? }.
    collect { |entry| [ entry[:tag] || entry[:subtag], entry[:suppress_script] ] }

  # Sort and de-duplicate before converting both columns to C++ string literals.
  rows = pairs.sort.uniq.collect { |pair| pair.collect(&:to_cpp_string) }

  reservation = " #{name}.reserve(#{rows.size});\n\n"
  insertions  = format_table(rows, :column_suffix => ",", :row_prefix => " #{name}.insert_or_assign(", :row_suffix => ");")

  reservation + insertions.join("\n")
end
def self.do_create_cpp entries, isdcf_entries
cpp_file_name = "src/common/iana_language_subtag_registry_list.cpp"
@ -214,6 +228,7 @@ EOERB
content_of[:init] = [
self.format_extlangs_variants(entries, "extlang", "extlangs"), "",
self.format_extlangs_variants(entries, "variant", "variants"), "",
self.format_suppress_scripts(entries), "",
self.format_grandfathered(entries),
].join("\n")
content_of[:init_preferred_values] = self.format_preferred_values(entries, isdcf_entries)

View File

@ -918,6 +918,32 @@ language_c::to_extlang_form() {
return *this;
}
bool
language_c::should_script_be_suppressed()
const noexcept {
if (m_script.empty())
return false;
auto check = [this](std::string const &code) -> bool {
if (code.empty())
return false;
auto language = mtx::iso639::look_up(code);
if (!language)
return false;
auto const &suppressions = mtx::iana::language_subtag_registry::g_suppress_scripts;
auto itr = suppressions.find(language->alpha_3_code);
if ((itr == suppressions.end()) && !language->alpha_2_code.empty())
itr = suppressions.find(language->alpha_2_code);
return (itr != suppressions.end()) && (mtx::string::to_lower_ascii(itr->second) == mtx::string::to_lower_ascii(m_script));
};
return check(m_language) || check(m_extended_language_subtag);
}
void
language_c::disable() {
ms_disabled = true;

View File

@ -112,6 +112,7 @@ public:
std::string const &get_grandfathered() const noexcept;
std::string get_first_variant_not_matching_prefixes() const noexcept;
// Returns true if a script is set and it matches (case-insensitively) the
// suppress-script registered for the tag's language or extended language
// subtag; returns false if no script is set.
bool should_script_be_suppressed() const noexcept;
protected:
std::string format_internal(bool force) const noexcept;

View File

@ -37,6 +37,7 @@ struct entry_t {
extern std::vector<entry_t> g_extlangs, g_variants, g_grandfathered;
extern std::vector< std::pair<mtx::bcp47::language_c, mtx::bcp47::language_c> > g_preferred_values;
// Maps a language or extlang subtag to the script that should be suppressed
// for it; populated by init().
extern std::unordered_map<std::string, std::string> g_suppress_scripts;
void init();
void init_preferred_values();

View File

@ -24,6 +24,7 @@ namespace mtx::iana::language_subtag_registry {
std::vector<entry_t> g_extlangs, g_variants, g_grandfathered;
std::vector<std::pair<mtx::bcp47::language_c, mtx::bcp47::language_c>> g_preferred_values;
std::unordered_map<std::string, std::string> g_suppress_scripts;
using VS = std::vector<std::string>;
@ -395,6 +396,143 @@ init() {
g_variants.emplace_back("wadegile"s, u8"Wade-Giles romanization"s, VS{ "zh-Latn"s }, false);
g_variants.emplace_back("xsistemo"s, u8"Standard X-system orthographic fallback for spelling Esperanto"s, VS{ "eo"s }, false);
g_suppress_scripts.reserve(134);
g_suppress_scripts.insert_or_assign("ab"s, "Cyrl"s);
g_suppress_scripts.insert_or_assign("af"s, "Latn"s);
g_suppress_scripts.insert_or_assign("am"s, "Ethi"s);
g_suppress_scripts.insert_or_assign("ar"s, "Arab"s);
g_suppress_scripts.insert_or_assign("as"s, "Beng"s);
g_suppress_scripts.insert_or_assign("ay"s, "Latn"s);
g_suppress_scripts.insert_or_assign("be"s, "Cyrl"s);
g_suppress_scripts.insert_or_assign("bg"s, "Cyrl"s);
g_suppress_scripts.insert_or_assign("bn"s, "Beng"s);
g_suppress_scripts.insert_or_assign("bs"s, "Latn"s);
g_suppress_scripts.insert_or_assign("ca"s, "Latn"s);
g_suppress_scripts.insert_or_assign("ch"s, "Latn"s);
g_suppress_scripts.insert_or_assign("cs"s, "Latn"s);
g_suppress_scripts.insert_or_assign("cy"s, "Latn"s);
g_suppress_scripts.insert_or_assign("da"s, "Latn"s);
g_suppress_scripts.insert_or_assign("de"s, "Latn"s);
g_suppress_scripts.insert_or_assign("dsb"s, "Latn"s);
g_suppress_scripts.insert_or_assign("dv"s, "Thaa"s);
g_suppress_scripts.insert_or_assign("dz"s, "Tibt"s);
g_suppress_scripts.insert_or_assign("el"s, "Grek"s);
g_suppress_scripts.insert_or_assign("en"s, "Latn"s);
g_suppress_scripts.insert_or_assign("eo"s, "Latn"s);
g_suppress_scripts.insert_or_assign("es"s, "Latn"s);
g_suppress_scripts.insert_or_assign("et"s, "Latn"s);
g_suppress_scripts.insert_or_assign("eu"s, "Latn"s);
g_suppress_scripts.insert_or_assign("fa"s, "Arab"s);
g_suppress_scripts.insert_or_assign("fi"s, "Latn"s);
g_suppress_scripts.insert_or_assign("fj"s, "Latn"s);
g_suppress_scripts.insert_or_assign("fo"s, "Latn"s);
g_suppress_scripts.insert_or_assign("fr"s, "Latn"s);
g_suppress_scripts.insert_or_assign("frr"s, "Latn"s);
g_suppress_scripts.insert_or_assign("frs"s, "Latn"s);
g_suppress_scripts.insert_or_assign("fy"s, "Latn"s);
g_suppress_scripts.insert_or_assign("ga"s, "Latn"s);
g_suppress_scripts.insert_or_assign("gl"s, "Latn"s);
g_suppress_scripts.insert_or_assign("gn"s, "Latn"s);
g_suppress_scripts.insert_or_assign("gsw"s, "Latn"s);
g_suppress_scripts.insert_or_assign("gu"s, "Gujr"s);
g_suppress_scripts.insert_or_assign("gv"s, "Latn"s);
g_suppress_scripts.insert_or_assign("he"s, "Hebr"s);
g_suppress_scripts.insert_or_assign("hi"s, "Deva"s);
g_suppress_scripts.insert_or_assign("hr"s, "Latn"s);
g_suppress_scripts.insert_or_assign("hsb"s, "Latn"s);
g_suppress_scripts.insert_or_assign("ht"s, "Latn"s);
g_suppress_scripts.insert_or_assign("hu"s, "Latn"s);
g_suppress_scripts.insert_or_assign("hy"s, "Armn"s);
g_suppress_scripts.insert_or_assign("id"s, "Latn"s);
g_suppress_scripts.insert_or_assign("in"s, "Latn"s);
g_suppress_scripts.insert_or_assign("is"s, "Latn"s);
g_suppress_scripts.insert_or_assign("it"s, "Latn"s);
g_suppress_scripts.insert_or_assign("iw"s, "Hebr"s);
g_suppress_scripts.insert_or_assign("ja"s, "Jpan"s);
g_suppress_scripts.insert_or_assign("ka"s, "Geor"s);
g_suppress_scripts.insert_or_assign("kk"s, "Cyrl"s);
g_suppress_scripts.insert_or_assign("kl"s, "Latn"s);
g_suppress_scripts.insert_or_assign("km"s, "Khmr"s);
g_suppress_scripts.insert_or_assign("kn"s, "Knda"s);
g_suppress_scripts.insert_or_assign("ko"s, "Kore"s);
g_suppress_scripts.insert_or_assign("kok"s, "Deva"s);
g_suppress_scripts.insert_or_assign("la"s, "Latn"s);
g_suppress_scripts.insert_or_assign("lb"s, "Latn"s);
g_suppress_scripts.insert_or_assign("ln"s, "Latn"s);
g_suppress_scripts.insert_or_assign("lo"s, "Laoo"s);
g_suppress_scripts.insert_or_assign("lt"s, "Latn"s);
g_suppress_scripts.insert_or_assign("lv"s, "Latn"s);
g_suppress_scripts.insert_or_assign("mai"s, "Deva"s);
g_suppress_scripts.insert_or_assign("men"s, "Latn"s);
g_suppress_scripts.insert_or_assign("mg"s, "Latn"s);
g_suppress_scripts.insert_or_assign("mh"s, "Latn"s);
g_suppress_scripts.insert_or_assign("mk"s, "Cyrl"s);
g_suppress_scripts.insert_or_assign("ml"s, "Mlym"s);
g_suppress_scripts.insert_or_assign("mo"s, "Latn"s);
g_suppress_scripts.insert_or_assign("mr"s, "Deva"s);
g_suppress_scripts.insert_or_assign("ms"s, "Latn"s);
g_suppress_scripts.insert_or_assign("mt"s, "Latn"s);
g_suppress_scripts.insert_or_assign("my"s, "Mymr"s);
g_suppress_scripts.insert_or_assign("na"s, "Latn"s);
g_suppress_scripts.insert_or_assign("nb"s, "Latn"s);
g_suppress_scripts.insert_or_assign("nd"s, "Latn"s);
g_suppress_scripts.insert_or_assign("nds"s, "Latn"s);
g_suppress_scripts.insert_or_assign("ne"s, "Deva"s);
g_suppress_scripts.insert_or_assign("niu"s, "Latn"s);
g_suppress_scripts.insert_or_assign("nl"s, "Latn"s);
g_suppress_scripts.insert_or_assign("nn"s, "Latn"s);
g_suppress_scripts.insert_or_assign("no"s, "Latn"s);
g_suppress_scripts.insert_or_assign("nqo"s, "Nkoo"s);
g_suppress_scripts.insert_or_assign("nr"s, "Latn"s);
g_suppress_scripts.insert_or_assign("nso"s, "Latn"s);
g_suppress_scripts.insert_or_assign("ny"s, "Latn"s);
g_suppress_scripts.insert_or_assign("om"s, "Latn"s);
g_suppress_scripts.insert_or_assign("or"s, "Orya"s);
g_suppress_scripts.insert_or_assign("pa"s, "Guru"s);
g_suppress_scripts.insert_or_assign("pl"s, "Latn"s);
g_suppress_scripts.insert_or_assign("ps"s, "Arab"s);
g_suppress_scripts.insert_or_assign("pt"s, "Latn"s);
g_suppress_scripts.insert_or_assign("qu"s, "Latn"s);
g_suppress_scripts.insert_or_assign("rm"s, "Latn"s);
g_suppress_scripts.insert_or_assign("rn"s, "Latn"s);
g_suppress_scripts.insert_or_assign("ro"s, "Latn"s);
g_suppress_scripts.insert_or_assign("ru"s, "Cyrl"s);
g_suppress_scripts.insert_or_assign("rw"s, "Latn"s);
g_suppress_scripts.insert_or_assign("sg"s, "Latn"s);
g_suppress_scripts.insert_or_assign("si"s, "Sinh"s);
g_suppress_scripts.insert_or_assign("sk"s, "Latn"s);
g_suppress_scripts.insert_or_assign("sl"s, "Latn"s);
g_suppress_scripts.insert_or_assign("sm"s, "Latn"s);
g_suppress_scripts.insert_or_assign("so"s, "Latn"s);
g_suppress_scripts.insert_or_assign("sq"s, "Latn"s);
g_suppress_scripts.insert_or_assign("ss"s, "Latn"s);
g_suppress_scripts.insert_or_assign("st"s, "Latn"s);
g_suppress_scripts.insert_or_assign("sv"s, "Latn"s);
g_suppress_scripts.insert_or_assign("sw"s, "Latn"s);
g_suppress_scripts.insert_or_assign("ta"s, "Taml"s);
g_suppress_scripts.insert_or_assign("te"s, "Telu"s);
g_suppress_scripts.insert_or_assign("tem"s, "Latn"s);
g_suppress_scripts.insert_or_assign("th"s, "Thai"s);
g_suppress_scripts.insert_or_assign("ti"s, "Ethi"s);
g_suppress_scripts.insert_or_assign("tkl"s, "Latn"s);
g_suppress_scripts.insert_or_assign("tl"s, "Latn"s);
g_suppress_scripts.insert_or_assign("tmh"s, "Latn"s);
g_suppress_scripts.insert_or_assign("tn"s, "Latn"s);
g_suppress_scripts.insert_or_assign("to"s, "Latn"s);
g_suppress_scripts.insert_or_assign("tpi"s, "Latn"s);
g_suppress_scripts.insert_or_assign("tr"s, "Latn"s);
g_suppress_scripts.insert_or_assign("ts"s, "Latn"s);
g_suppress_scripts.insert_or_assign("tvl"s, "Latn"s);
g_suppress_scripts.insert_or_assign("uk"s, "Cyrl"s);
g_suppress_scripts.insert_or_assign("ur"s, "Arab"s);
g_suppress_scripts.insert_or_assign("ve"s, "Latn"s);
g_suppress_scripts.insert_or_assign("vi"s, "Latn"s);
g_suppress_scripts.insert_or_assign("xh"s, "Latn"s);
g_suppress_scripts.insert_or_assign("yi"s, "Hebr"s);
g_suppress_scripts.insert_or_assign("zbl"s, "Blis"s);
g_suppress_scripts.insert_or_assign("zu"s, "Latn"s);
g_grandfathered.reserve(26);
g_grandfathered.emplace_back("art-lojban"s, u8"Lojban"s, VS{}, true);

View File

@ -456,6 +456,10 @@ LanguageDialog::determineInfoAndWarningsFor(mtx::bcp47::language_c const &tag) {
lists.second << QY("The script '%1' is deprecated.").arg(Q(tag.get_script()));
}
if (tag.should_script_be_suppressed())
lists.second << QY("The script '%1' should not be used for the language '%2' as it is the script the overwhelming majority of documents for this language is written in.")
.arg(Q(tag.get_script())).arg(Q(tag.get_language().empty() ? tag.get_extended_language_subtag() : tag.get_language()));
if (!tag.get_region().empty()) {
auto region = mtx::iso3166::look_up(tag.get_region());
if (region && region->is_deprecated)

View File

@ -659,4 +659,14 @@ TEST(BCP47LanguageTags, VariantPrefixValidation) {
EXPECT_EQ("ijekavsk"s, l.get_first_variant_not_matching_prefixes());
}
// Verifies language_c::should_script_be_suppressed() for tags without a
// script, for tags whose script equals the language's suppress-script and
// for tags whose script must not be reported.
TEST(BCP47LanguageTags, ShouldScriptBeSuppressed) {
  EXPECT_FALSE(language_c::parse("de").should_script_be_suppressed());          // no script to suppress
  EXPECT_FALSE(language_c::parse("de-CH").should_script_be_suppressed());       // no script to suppress

  // Script equals the suppress-script; letter case and the form of the
  // language code (two or three letters) must not matter.
  EXPECT_TRUE(language_c::parse("de-Latn").should_script_be_suppressed());
  EXPECT_TRUE(language_c::parse("de-Latn-CH").should_script_be_suppressed());
  EXPECT_TRUE(language_c::parse("de-lATN-CH").should_script_be_suppressed());
  EXPECT_TRUE(language_c::parse("deu-LAtN-Ch").should_script_be_suppressed());
  EXPECT_TRUE(language_c::parse("ger-latn-ch").should_script_be_suppressed());

  // A suppress-script other than "Latn".
  EXPECT_TRUE(language_c::parse("ru-Cyrl").should_script_be_suppressed());

  // A script is present but differs from the language's suppress-script.
  EXPECT_FALSE(language_c::parse("de-Cyrl").should_script_be_suppressed());
  EXPECT_FALSE(language_c::parse("ru-Latn").should_script_be_suppressed());

  // "sr" has no suppress-script entry, therefore no script is reported for it.
  EXPECT_FALSE(language_c::parse("sr-Latn").should_script_be_suppressed());
  EXPECT_FALSE(language_c::parse("sr-Cyrl").should_script_be_suppressed());
}
}