mirror of
https://gitlab.com/mbunkus/mkvtoolnix.git
synced 2024-10-22 07:21:21 +00:00
BCP 47: don't enforce prefixes for variants; enforce uniqueness of variants
BCP 47's verbiage is pretty lax wrt. variants & their prefixes. It states[1]:

> Variant subtag records in the Language Subtag Registry MAY include
> one or more 'Prefix' (Section 3.1.8) fields. Each 'Prefix'
> indicates a suitable sequence of subtags for forming (with other
> subtags, as appropriate) a language tag when using the variant.

Therefore a hard check whether a variant is used with only the listed prefixes is inappropriate.

Furthermore there are other semi-normative sources stating the same. For example, the W3C[2] says:

> Check the context and ordering for variant subtags. Most variant
> subtag records in the registry have one or more Prefix fields. The
> prefixes indicate with which subtags it is usually appropriate to
> use this variant.
> …
> If you have a good reason, you could use a variant subtag with
> different subtags, eg. cmn-Latn-pinyin would be a perfectly legal
> way to say Mandarin Chinese written with pinyin.

And `pinyin` lists neither `cmn` nor `cmn-Latn` as a prefix.

BCP 47 goes on to state that "Most variants that share a prefix are mutually exclusive", but there's actually no way to identify the variants for which this holds true automatically. Therefore this property isn't enforced either.

Lastly BCP 47 does have one hard requirement on variants in [1]:

> 5. The same variant subtag MUST NOT be used more than once within a
> language tag.

This is now enforced.

Part of the implementation/fix of #3307.

[1] https://www.rfc-editor.org/rfc/rfc5646.html#section-2.2.5
[2] https://www.w3.org/International/questions/qa-choosing-language-tags#variants
This commit is contained in:
parent
f7f71ea288
commit
a73c424e5e
5
NEWS.md
5
NEWS.md
@ -32,6 +32,11 @@
|
||||
enough for files that contain a lot of comments at the start like the
|
||||
included `example-chapters-2.xml`. The detection range was extended to 10
|
||||
KB. Fixes #3302.
|
||||
* all: IETF BCP 47/RFC 5646 language tags: variants aren't validated wrt.
|
||||
prefixes anymore as BCP 47 doesn't actually pose restrictions on them,
|
||||
saying only that prefixes "are suitable sequences" for use with the
|
||||
variants. What is now verified, though, is that no variant is used multiple
|
||||
times within the same language tag. Part of the implementation/fix of #3307.
|
||||
|
||||
|
||||
# Version 66.0.0 "Josie" 2022-03-13
|
||||
|
@ -316,12 +316,10 @@ language_c::parse_extensions(std::string const &str) {
|
||||
|
||||
bool
|
||||
language_c::matches_prefix(language_c const &prefix,
|
||||
std::size_t extlang_or_variant_index,
|
||||
bool is_extlang,
|
||||
std::size_t extlang_index,
|
||||
prefix_restrictions_t const &restrictions)
|
||||
const noexcept {
|
||||
if ( ( is_extlang && !m_extended_language_subtags.empty() && (extlang_or_variant_index > (prefix.m_extended_language_subtags.size())))
|
||||
|| (!is_extlang && !m_variants .empty() && (extlang_or_variant_index > (prefix.m_variants .size()))))
|
||||
if (!m_extended_language_subtags.empty() && (extlang_index > (prefix.m_extended_language_subtags.size())))
|
||||
return false;
|
||||
|
||||
if ( (restrictions.language && prefix.m_language .empty() && !m_language .empty())
|
||||
@ -360,17 +358,14 @@ language_c::matches_prefix(language_c const &prefix,
|
||||
}
|
||||
|
||||
bool
|
||||
language_c::validate_one_extlang_or_variant(std::size_t extlang_or_variant_index,
|
||||
bool is_extlang) {
|
||||
auto const &extlang_or_variant_code = is_extlang ? m_extended_language_subtags[extlang_or_variant_index]
|
||||
: m_variants[extlang_or_variant_index];
|
||||
auto extlang_or_variant = is_extlang ? mtx::iana::language_subtag_registry::look_up_extlang(extlang_or_variant_code)
|
||||
: mtx::iana::language_subtag_registry::look_up_variant(extlang_or_variant_code);
|
||||
language_c::validate_one_extlang(std::size_t extlang_index) {
|
||||
auto const &extlang_code = m_extended_language_subtags[extlang_index];
|
||||
auto extlang = mtx::iana::language_subtag_registry::look_up_extlang(extlang_code);
|
||||
|
||||
if (!extlang_or_variant) // Should not happen as the parsing checks this already.
|
||||
if (!extlang) // Should not happen as the parsing checks this already.
|
||||
return false;
|
||||
|
||||
if (extlang_or_variant->prefixes.empty())
|
||||
if (extlang->prefixes.empty())
|
||||
return true;
|
||||
|
||||
prefix_restrictions_t restrictions;
|
||||
@ -381,7 +376,7 @@ language_c::validate_one_extlang_or_variant(std::size_t extlang_or_variant_index
|
||||
value = true;
|
||||
};
|
||||
|
||||
for (auto const &prefix : extlang_or_variant->prefixes) {
|
||||
for (auto const &prefix : extlang->prefixes) {
|
||||
parsed_prefixes.emplace_back(parse(prefix));
|
||||
auto const &tag = parsed_prefixes.back();
|
||||
|
||||
@ -393,27 +388,40 @@ language_c::validate_one_extlang_or_variant(std::size_t extlang_or_variant_index
|
||||
}
|
||||
|
||||
for (auto const &parsed_prefix : parsed_prefixes)
|
||||
if (matches_prefix(parsed_prefix, extlang_or_variant_index, is_extlang, restrictions))
|
||||
if (matches_prefix(parsed_prefix, extlang_index, restrictions))
|
||||
return true;
|
||||
|
||||
auto message = is_extlang ? Y("The extended language subtag '{}' must only be used with one of the following prefixes: {}.")
|
||||
: Y("The variant '{}' must only be used with one of the following prefixes: {}.");
|
||||
m_parser_error = fmt::format(message, extlang_or_variant_code, fmt::join(extlang_or_variant->prefixes, ", "));
|
||||
auto message = Y("The extended language subtag '{}' must only be used with one of the following prefixes: {}.");
|
||||
m_parser_error = fmt::format(message, extlang_code, fmt::join(extlang->prefixes, ", "));
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
bool
|
||||
language_c::validate_extlangs_or_variants(bool is_extlangs) {
|
||||
auto const &extlangs_or_variants = is_extlangs ? m_extended_language_subtags : m_variants;
|
||||
|
||||
for (int idx = 0, num_entries = extlangs_or_variants.size(); idx < num_entries; ++idx)
|
||||
if (!validate_one_extlang_or_variant(idx, is_extlangs))
|
||||
language_c::validate_extlangs() {
|
||||
for (int idx = 0, num_entries = m_extended_language_subtags.size(); idx < num_entries; ++idx)
|
||||
if (!validate_one_extlang(idx))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
language_c::validate_variants() {
|
||||
std::map<std::string, bool> variants_seen;
|
||||
|
||||
for (auto const &variant : m_variants) {
|
||||
if (variants_seen[variant]) {
|
||||
m_parser_error = fmt::format(Y("The variant '{}' occurs more than once."), variant);
|
||||
return false;
|
||||
}
|
||||
|
||||
variants_seen[variant] = true;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
language_c::validate_extensions() {
|
||||
if (m_extensions.empty())
|
||||
@ -497,8 +505,7 @@ language_c::parse(std::string const &language) {
|
||||
if (matches.capturedLength(9))
|
||||
l.m_private_use = mtx::string::split(to_utf8(matches.captured(9)).substr(1), "-");
|
||||
|
||||
if ( !l.validate_extlangs_or_variants(true)
|
||||
|| !l.validate_extlangs_or_variants(false))
|
||||
if (!l.validate_extlangs() || !l.validate_variants())
|
||||
return l;
|
||||
|
||||
l.m_valid = true;
|
||||
|
@ -105,9 +105,10 @@ protected:
|
||||
bool parse_extlangs_or_variants(std::string const &str, bool is_extlangs);
|
||||
|
||||
bool validate_extensions();
|
||||
bool validate_extlangs_or_variants(bool is_extlangs);
|
||||
bool validate_one_extlang_or_variant(std::size_t extlang_or_variant_index, bool is_extlang);
|
||||
bool matches_prefix(language_c const &prefix, std::size_t extlang_or_variant_index, bool is_extlang, prefix_restrictions_t const &restrictions) const noexcept;
|
||||
bool validate_extlangs();
|
||||
bool validate_variants();
|
||||
bool validate_one_extlang(std::size_t extlang_index);
|
||||
bool matches_prefix(language_c const &prefix, std::size_t extlang_index, prefix_restrictions_t const &restrictions) const noexcept;
|
||||
|
||||
public:
|
||||
static language_c parse(std::string const &language);
|
||||
|
@ -35,7 +35,6 @@ TEST(BCP47LanguageTags, ParsingInvalid) {
|
||||
EXPECT_FALSE(language_c::parse("zh-min").is_valid()); // invalid (min not allowed with zh)
|
||||
EXPECT_FALSE(language_c::parse("gonzo").is_valid()); // invalid
|
||||
EXPECT_FALSE(language_c::parse("de-aao-Latn-DZ").is_valid()); // invalid (aoo not valid with de)
|
||||
EXPECT_FALSE(language_c::parse("de-ekavsk").is_valid()); // invalid (ekavsk not valid with de)
|
||||
EXPECT_FALSE(language_c::parse("es-0").is_valid()); // invalid (no such region)
|
||||
}
|
||||
|
||||
@ -184,19 +183,13 @@ TEST(BCP47LanguageTags, PrefixValidation) {
|
||||
EXPECT_TRUE(language_c::parse("en-GB-scotland").is_valid());
|
||||
EXPECT_TRUE(language_c::parse("zh-Latn-CN-pinyin").is_valid());
|
||||
|
||||
EXPECT_FALSE(language_c::parse("sr-biske").is_valid());
|
||||
EXPECT_FALSE(language_c::parse("tr-rozaj").is_valid());
|
||||
|
||||
EXPECT_TRUE(language_c::parse("sl-rozaj").is_valid());
|
||||
EXPECT_TRUE(language_c::parse("sl-rozaj-biske").is_valid());
|
||||
EXPECT_TRUE(language_c::parse("sl-rozaj-1994").is_valid());
|
||||
EXPECT_TRUE(language_c::parse("sl-rozaj-biske-1994").is_valid());
|
||||
EXPECT_FALSE(language_c::parse("sl-1994").is_valid());
|
||||
EXPECT_FALSE(language_c::parse("sl-biske-rozaj").is_valid());
|
||||
|
||||
EXPECT_TRUE(language_c::parse("de-1901").is_valid());
|
||||
EXPECT_TRUE(language_c::parse("de-1996").is_valid());
|
||||
EXPECT_FALSE(language_c::parse("de-1901-1996").is_valid());
|
||||
|
||||
EXPECT_TRUE(language_c::parse("zh-cmn").is_valid());
|
||||
EXPECT_TRUE(language_c::parse("zh-yue").is_valid());
|
||||
@ -204,12 +197,14 @@ TEST(BCP47LanguageTags, PrefixValidation) {
|
||||
|
||||
EXPECT_TRUE(language_c::parse("hy-arevela").is_valid());
|
||||
EXPECT_TRUE(language_c::parse("hy-arevmda").is_valid());
|
||||
EXPECT_FALSE(language_c::parse("hy-arevela-arevmda").is_valid());
|
||||
|
||||
EXPECT_TRUE(language_c::parse("ja-Latn-hepburn").is_valid());
|
||||
EXPECT_TRUE(language_c::parse("ja-Latn-hepburn-heploc").is_valid());
|
||||
EXPECT_FALSE(language_c::parse("ja-Latn-heploc").is_valid());
|
||||
|
||||
EXPECT_FALSE(language_c::parse("de-1996-1996").is_valid());
|
||||
EXPECT_TRUE(language_c::parse("cmn-Latn-pinyin").is_valid());
|
||||
EXPECT_TRUE(language_c::parse("zh-cmn-Latn-tongyong").is_valid());
|
||||
EXPECT_TRUE(language_c::parse("zh-yue-jyutping").is_valid());
|
||||
}
|
||||
|
||||
TEST(BCP47LanguageTags, RFC4646AssortedValid) {
|
||||
@ -218,10 +213,6 @@ TEST(BCP47LanguageTags, RFC4646AssortedValid) {
|
||||
EXPECT_TRUE(language_c::parse("de-CH-1996").is_valid()); // section 3.1
|
||||
}
|
||||
|
||||
TEST(BCP47LanguageTags, RFC4646AssortedInvalid) {
|
||||
EXPECT_FALSE(language_c::parse("fr-1996").is_valid()); // section 3.1
|
||||
}
|
||||
|
||||
TEST(BCP47LanguageTags, RFC4646AppendixBValid) {
|
||||
// Simple language subtag:
|
||||
EXPECT_TRUE(language_c::parse("de").is_valid()); // (German)
|
||||
@ -293,8 +284,6 @@ TEST(BCP47LanguageTags, OnlyCertainScriptsAllowedOrNoScriptAtAll) {
|
||||
EXPECT_TRUE(language_c::parse("sr-Latn-ekavsk").is_valid());
|
||||
|
||||
EXPECT_TRUE(language_c::parse("sr-Latn-RS-ekavsk").is_valid());
|
||||
|
||||
EXPECT_FALSE(language_c::parse("sr-Bali-ekavsk").is_valid());
|
||||
}
|
||||
|
||||
TEST(BCP47LanguageTags, ExtensionsBasics) {
|
||||
|
Loading…
Reference in New Issue
Block a user