BCP 47: don't enforce prefixes for variants; enforce uniqueness of variants

BCP 47's verbiage is pretty lax wrt. variants & their prefixes. It
states[1]:

> Variant subtag records in the Language Subtag Registry MAY include
> one or more 'Prefix' (Section 3.1.8) fields.  Each 'Prefix'
> indicates a suitable sequence of subtags for forming (with other
> subtags, as appropriate) a language tag when using the variant.

Therefore a hard check whether a variant is used with only the listed
prefixes is inappropriate.

Furthermore there are other semi-normative sources stating the
same. For example, the W3C[2] says:

> Check the context and ordering for variant subtags. Most variant
> subtag records in the registry have one or more Prefix fields. The
> prefixes indicate with which subtags it is usually appropriate to
> use this variant.

…

> If you have a good reason, you could use a variant subtag with
> different subtags, eg. cmn-Latn-pinyin would be a perfectly legal
> way to say Mandarin Chinese written with pinyin.

And `pinyin` lists neither `cmn` nor `cmn-Latn` as a prefix.

BCP 47 goes on to state that "Most variants that share a prefix are
mutually exclusive", but there's actually no way to identify the
variants for which this holds true automatically. Therefore this
property isn't enforced either.

Lastly BCP 47 does have one hard requirement on variants in [1]:

>  5. The same variant subtag MUST NOT be used more than once within a
>     language tag.

This is now enforced.

Part of the implementation/fix of #3307.

[1]  https://www.rfc-editor.org/rfc/rfc5646.html#section-2.2.5
[2]  https://www.w3.org/International/questions/qa-choosing-language-tags#variants
This commit is contained in:
Moritz Bunkus 2022-03-24 21:26:24 +01:00
parent f7f71ea288
commit a73c424e5e
No known key found for this signature in database
GPG Key ID: 74AF00ADF2E32C85
4 changed files with 44 additions and 42 deletions

View File

@ -32,6 +32,11 @@
enough for files that contain a lot of comments at the start like the
included `example-chapters-2.xml`. The detection range was extended to 10
KB. Fixes #3302.
* all: IETF BCP 47/RFC 5646 language tags: variants aren't validated with respect to
prefixes anymore as BCP 47 doesn't actually pose restrictions on them,
saying only that prefixes "are suitable sequences" for use with the
variants. What is now verified, though, is that no variant is used multiple
times within the same language tag. Part of the implementation/fix of #3307.
# Version 66.0.0 "Josie" 2022-03-13

View File

@ -316,12 +316,10 @@ language_c::parse_extensions(std::string const &str) {
bool
language_c::matches_prefix(language_c const &prefix,
std::size_t extlang_or_variant_index,
bool is_extlang,
std::size_t extlang_index,
prefix_restrictions_t const &restrictions)
const noexcept {
if ( ( is_extlang && !m_extended_language_subtags.empty() && (extlang_or_variant_index > (prefix.m_extended_language_subtags.size())))
|| (!is_extlang && !m_variants .empty() && (extlang_or_variant_index > (prefix.m_variants .size()))))
if (!m_extended_language_subtags.empty() && (extlang_index > (prefix.m_extended_language_subtags.size())))
return false;
if ( (restrictions.language && prefix.m_language .empty() && !m_language .empty())
@ -360,17 +358,14 @@ language_c::matches_prefix(language_c const &prefix,
}
bool
language_c::validate_one_extlang_or_variant(std::size_t extlang_or_variant_index,
bool is_extlang) {
auto const &extlang_or_variant_code = is_extlang ? m_extended_language_subtags[extlang_or_variant_index]
: m_variants[extlang_or_variant_index];
auto extlang_or_variant = is_extlang ? mtx::iana::language_subtag_registry::look_up_extlang(extlang_or_variant_code)
: mtx::iana::language_subtag_registry::look_up_variant(extlang_or_variant_code);
language_c::validate_one_extlang(std::size_t extlang_index) {
auto const &extlang_code = m_extended_language_subtags[extlang_index];
auto extlang = mtx::iana::language_subtag_registry::look_up_extlang(extlang_code);
if (!extlang_or_variant) // Should not happen as the parsing checks this already.
if (!extlang) // Should not happen as the parsing checks this already.
return false;
if (extlang_or_variant->prefixes.empty())
if (extlang->prefixes.empty())
return true;
prefix_restrictions_t restrictions;
@ -381,7 +376,7 @@ language_c::validate_one_extlang_or_variant(std::size_t extlang_or_variant_index
value = true;
};
for (auto const &prefix : extlang_or_variant->prefixes) {
for (auto const &prefix : extlang->prefixes) {
parsed_prefixes.emplace_back(parse(prefix));
auto const &tag = parsed_prefixes.back();
@ -393,27 +388,40 @@ language_c::validate_one_extlang_or_variant(std::size_t extlang_or_variant_index
}
for (auto const &parsed_prefix : parsed_prefixes)
if (matches_prefix(parsed_prefix, extlang_or_variant_index, is_extlang, restrictions))
if (matches_prefix(parsed_prefix, extlang_index, restrictions))
return true;
auto message = is_extlang ? Y("The extended language subtag '{}' must only be used with one of the following prefixes: {}.")
: Y("The variant '{}' must only be used with one of the following prefixes: {}.");
m_parser_error = fmt::format(message, extlang_or_variant_code, fmt::join(extlang_or_variant->prefixes, ", "));
auto message = Y("The extended language subtag '{}' must only be used with one of the following prefixes: {}.");
m_parser_error = fmt::format(message, extlang_code, fmt::join(extlang->prefixes, ", "));
return false;
}
bool
language_c::validate_extlangs_or_variants(bool is_extlangs) {
auto const &extlangs_or_variants = is_extlangs ? m_extended_language_subtags : m_variants;
for (int idx = 0, num_entries = extlangs_or_variants.size(); idx < num_entries; ++idx)
if (!validate_one_extlang_or_variant(idx, is_extlangs))
language_c::validate_extlangs() {
for (int idx = 0, num_entries = m_extended_language_subtags.size(); idx < num_entries; ++idx)
if (!validate_one_extlang(idx))
return false;
return true;
}
bool
language_c::validate_variants() {
std::map<std::string, bool> variants_seen;
for (auto const &variant : m_variants) {
if (variants_seen[variant]) {
m_parser_error = fmt::format(Y("The variant '{}' occurs more than once."), variant);
return false;
}
variants_seen[variant] = true;
}
return true;
}
bool
language_c::validate_extensions() {
if (m_extensions.empty())
@ -497,8 +505,7 @@ language_c::parse(std::string const &language) {
if (matches.capturedLength(9))
l.m_private_use = mtx::string::split(to_utf8(matches.captured(9)).substr(1), "-");
if ( !l.validate_extlangs_or_variants(true)
|| !l.validate_extlangs_or_variants(false))
if (!l.validate_extlangs() || !l.validate_variants())
return l;
l.m_valid = true;

View File

@ -105,9 +105,10 @@ protected:
bool parse_extlangs_or_variants(std::string const &str, bool is_extlangs);
bool validate_extensions();
bool validate_extlangs_or_variants(bool is_extlangs);
bool validate_one_extlang_or_variant(std::size_t extlang_or_variant_index, bool is_extlang);
bool matches_prefix(language_c const &prefix, std::size_t extlang_or_variant_index, bool is_extlang, prefix_restrictions_t const &restrictions) const noexcept;
bool validate_extlangs();
bool validate_variants();
bool validate_one_extlang(std::size_t extlang_index);
bool matches_prefix(language_c const &prefix, std::size_t extlang_index, prefix_restrictions_t const &restrictions) const noexcept;
public:
static language_c parse(std::string const &language);

View File

@ -35,7 +35,6 @@ TEST(BCP47LanguageTags, ParsingInvalid) {
EXPECT_FALSE(language_c::parse("zh-min").is_valid()); // invalid (min not allowed with zh)
EXPECT_FALSE(language_c::parse("gonzo").is_valid()); // invalid
EXPECT_FALSE(language_c::parse("de-aao-Latn-DZ").is_valid()); // invalid (aoo not valid with de)
EXPECT_FALSE(language_c::parse("de-ekavsk").is_valid()); // invalid (ekavsk not valid with de)
EXPECT_FALSE(language_c::parse("es-0").is_valid()); // invalid (no such region)
}
@ -184,19 +183,13 @@ TEST(BCP47LanguageTags, PrefixValidation) {
EXPECT_TRUE(language_c::parse("en-GB-scotland").is_valid());
EXPECT_TRUE(language_c::parse("zh-Latn-CN-pinyin").is_valid());
EXPECT_FALSE(language_c::parse("sr-biske").is_valid());
EXPECT_FALSE(language_c::parse("tr-rozaj").is_valid());
EXPECT_TRUE(language_c::parse("sl-rozaj").is_valid());
EXPECT_TRUE(language_c::parse("sl-rozaj-biske").is_valid());
EXPECT_TRUE(language_c::parse("sl-rozaj-1994").is_valid());
EXPECT_TRUE(language_c::parse("sl-rozaj-biske-1994").is_valid());
EXPECT_FALSE(language_c::parse("sl-1994").is_valid());
EXPECT_FALSE(language_c::parse("sl-biske-rozaj").is_valid());
EXPECT_TRUE(language_c::parse("de-1901").is_valid());
EXPECT_TRUE(language_c::parse("de-1996").is_valid());
EXPECT_FALSE(language_c::parse("de-1901-1996").is_valid());
EXPECT_TRUE(language_c::parse("zh-cmn").is_valid());
EXPECT_TRUE(language_c::parse("zh-yue").is_valid());
@ -204,12 +197,14 @@ TEST(BCP47LanguageTags, PrefixValidation) {
EXPECT_TRUE(language_c::parse("hy-arevela").is_valid());
EXPECT_TRUE(language_c::parse("hy-arevmda").is_valid());
EXPECT_FALSE(language_c::parse("hy-arevela-arevmda").is_valid());
EXPECT_TRUE(language_c::parse("ja-Latn-hepburn").is_valid());
EXPECT_TRUE(language_c::parse("ja-Latn-hepburn-heploc").is_valid());
EXPECT_FALSE(language_c::parse("ja-Latn-heploc").is_valid());
EXPECT_FALSE(language_c::parse("de-1996-1996").is_valid());
EXPECT_TRUE(language_c::parse("cmn-Latn-pinyin").is_valid());
EXPECT_TRUE(language_c::parse("zh-cmn-Latn-tongyong").is_valid());
EXPECT_TRUE(language_c::parse("zh-yue-jyutping").is_valid());
}
TEST(BCP47LanguageTags, RFC4646AssortedValid) {
@ -218,10 +213,6 @@ TEST(BCP47LanguageTags, RFC4646AssortedValid) {
EXPECT_TRUE(language_c::parse("de-CH-1996").is_valid()); // section 3.1
}
TEST(BCP47LanguageTags, RFC4646AssortedInvalid) {
EXPECT_FALSE(language_c::parse("fr-1996").is_valid()); // section 3.1
}
TEST(BCP47LanguageTags, RFC4646AppendixBValid) {
// Simple language subtag:
EXPECT_TRUE(language_c::parse("de").is_valid()); // (German)
@ -293,8 +284,6 @@ TEST(BCP47LanguageTags, OnlyCertainScriptsAllowedOrNoScriptAtAll) {
EXPECT_TRUE(language_c::parse("sr-Latn-ekavsk").is_valid());
EXPECT_TRUE(language_c::parse("sr-Latn-RS-ekavsk").is_valid());
EXPECT_FALSE(language_c::parse("sr-Bali-ekavsk").is_valid());
}
TEST(BCP47LanguageTags, ExtensionsBasics) {