mirror of
https://gitlab.com/mbunkus/mkvtoolnix.git
synced 2024-12-24 11:54:01 +00:00
track selection: use language tag matching instead of verbatim equality
When using language tags for selecting which tracks to keep or discard, mkvmerge has so far been comparing the given language tag with the ones in the file verbatim (after normalizing each). This meant that in order to always discard all Spanish tracks but keep the others, `--stracks !es` would not work reliably as a track in the file might be specified as `es-ES` — and verbatim comparison simply didn't treat `es` and `es-ES` as the same. For users this is somewhat counterintuitive. The idea behind allowing languages for track selection has always been to provide an easy to remember, easy to use way to select tracks for human beings without having to look through file identification first. Verbatim comparison worked fine until support for IETF BCP 47 language tags came along, as until that point languages in Matroska files only ever contained a language component but not e.g. a region or a variant. This commit changes the selection to use a matching algorithm similar to how IETF BCP 47 describes language tag matching. Basically it takes a track's existing language, normalizes it & splits it into its components. Then the same is done with all the languages mentioned with the track selection option currently evaluated. For each language listed in the track selection all components that are actually set are compared with the track's language's corresponding components. If all of them are equal, the track is considered to be matched. Components set in the track's language but not in the selection's language are simply ignored. This means that specifying `--stracks !es` in the example above will now match all tracks whose language is some kind of Spanish, no matter if the track's language tag contains a region, variants or whatever (e.g. it would drop tracks marked as `es`, `es-MX`, `es-Latn-ES` etc.).
This commit is contained in:
parent
2928636c95
commit
0394a674bd
11
NEWS.md
11
NEWS.md
@ -21,6 +21,17 @@
|
||||
* HEVC dumper development tool: the tool has been renamed to `xvc_dump` and
|
||||
extended to be able to dump AVC/H.264 bitstreams, too. It now also detects
|
||||
the type of bitstream framing (ISO 14496-15 vs. ITU-T H.264/H.265 Annex B).
|
||||
* mkvmerge: track selection: when using language tags for selecting which
|
||||
tracks to keep, mkvmerge will now use component-based language tag matching
|
||||
instead of comparing them verbatim. This means that only those components
|
||||
(language, region etc.) the user specified will be required to exist and be
|
||||
equal. For example, if a file contains three subtitle tracks with languages
|
||||
`es` (generic Spanish), `es-MX` (Spanish as spoken in Mexico) and `es-ES`
|
||||
(Spanish as spoken in Spain), the user can use `--stracks es` to match all
|
||||
three tracks or be more specific with e.g. `--stracks es-MX` which would
|
||||
only match one track. Similarly inverting the selection with e.g. `--stracks
|
||||
!es` would get rid of all three tracks, not just the one for the generic
|
||||
Spanish.
|
||||
|
||||
## Bug fixes
|
||||
|
||||
|
@ -77,17 +77,19 @@ generic_reader_c::demuxing_requested(char type,
|
||||
int64_t id,
|
||||
mtx::bcp47::language_c const &language)
|
||||
const {
|
||||
auto const *tracks = 'v' == type ? &m_ti.m_vtracks
|
||||
: 'a' == type ? &m_ti.m_atracks
|
||||
: 's' == type ? &m_ti.m_stracks
|
||||
: 'b' == type ? &m_ti.m_btracks
|
||||
: 'T' == type ? &m_ti.m_track_tags
|
||||
: nullptr;
|
||||
static debugging_option_c s_debug{"demuxing_requested"};
|
||||
|
||||
if (!tracks)
|
||||
mxerror(fmt::format("generic_reader_c::demuxing_requested: {1}", fmt::format(Y("Invalid track type {0}."), type)));
|
||||
auto const &tracks = 'v' == type ? m_ti.m_vtracks
|
||||
: 'a' == type ? m_ti.m_atracks
|
||||
: 's' == type ? m_ti.m_stracks
|
||||
: 'b' == type ? m_ti.m_btracks
|
||||
: m_ti.m_track_tags;
|
||||
|
||||
return tracks->selected(id, language);
|
||||
auto result = tracks.selected(id, language);
|
||||
|
||||
mxdebug_if(s_debug, fmt::format("demuxing_requested? {4} type {0} id {1} language {2} item_selector {3}\n", type, id, language, tracks, result ? "yes" : "no"));
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
attach_mode_e
|
||||
|
@ -32,6 +32,17 @@ public:
|
||||
{
|
||||
}
|
||||
|
||||
// Determines which of the languages registered in m_language_items is the
// best match for the given language, delegating the actual matching to
// language.find_best_match().
//
// NOTE(review): callers test the returned value with is_valid(), so
// presumably an invalid language_c is returned when nothing matches —
// confirm against find_best_match().
mtx::bcp47::language_c
|
||||
best_language_match(mtx::bcp47::language_c const &language)
|
||||
const {
|
||||
std::vector<mtx::bcp47::language_c> potential_matches;
|
||||
|
||||
// The candidates are the keys (languages) of m_language_items.
for (auto const &pair : m_language_items)
|
||||
potential_matches.emplace_back(pair.first);
|
||||
|
||||
return language.find_best_match(potential_matches);
|
||||
}
|
||||
|
||||
bool
|
||||
selected(int64_t item,
|
||||
mtx::bcp47::language_c const &language_item = {})
|
||||
@ -42,8 +53,9 @@ public:
|
||||
if (m_items.empty() && m_language_items.empty())
|
||||
return !m_reversed;
|
||||
|
||||
auto included = ( !m_items.empty() && mtx::includes(m_items, item))
|
||||
|| (language_item.is_valid() && !m_language_items.empty() && mtx::includes(m_language_items, language_item));
|
||||
auto matched_language = best_language_match(language_item);
|
||||
auto included = ( !m_items.empty() && mtx::includes(m_items, item))
|
||||
|| (language_item.is_valid() && matched_language.is_valid() && mtx::includes(m_language_items, matched_language));
|
||||
return m_reversed ? !included : included;
|
||||
}
|
||||
|
||||
@ -54,8 +66,13 @@ public:
|
||||
if (!selected(item, language_item))
|
||||
return m_default_value;
|
||||
|
||||
if (!m_language_items.empty())
|
||||
return language_item.is_valid() && mtx::includes(m_language_items, language_item) ? m_language_items.at(language_item) : m_default_value;
|
||||
if (!m_language_items.empty()) {
|
||||
auto matched_language = best_language_match(language_item);
|
||||
if (matched_language.is_valid() && mtx::includes(m_language_items, matched_language))
|
||||
return m_language_items.at(matched_language);
|
||||
|
||||
return m_default_value;
|
||||
}
|
||||
|
||||
return mtx::includes(m_items, item) ? m_items.at(item) : m_default_value;
|
||||
}
|
||||
@ -89,3 +106,36 @@ public:
|
||||
return m_items.empty() && m_language_items.empty();
|
||||
}
|
||||
};
|
||||
|
||||
// Writes a one-line textual dump of an item selector to the stream, in the
// form
//   <def:DEFAULT none:NONE reversed:REVERSED items:[K:V K:V] lang_items:[K:V K:V]>
// where the entries of m_items and m_language_items are printed as
// space-separated "key:value" pairs. Returns the stream to allow chaining.
template<typename T>
|
||||
std::ostream &
|
||||
operator <<(std::ostream &out,
|
||||
item_selector_c<T> const &selector) {
|
||||
out << "<def:" << selector.m_default_value << " none:" << selector.m_none << " reversed:" << selector.m_reversed << " items:[";
|
||||
|
||||
// Tracks whether a separating space is needed before the next entry.
auto first = true;
|
||||
|
||||
for (auto const &item : selector.m_items) {
|
||||
if (!first)
|
||||
out << " ";
|
||||
first = false;
|
||||
|
||||
out << item.first << ":" << item.second;
|
||||
}
|
||||
|
||||
out << "] lang_items:[";
|
||||
|
||||
// Reset the separator state for the second list.
first = true;
|
||||
|
||||
for (auto const &item : selector.m_language_items) {
|
||||
if (!first)
|
||||
out << " ";
|
||||
first = false;
|
||||
|
||||
out << item.first << ":" << item.second;
|
||||
}
|
||||
|
||||
out << "]>";
|
||||
|
||||
return out;
|
||||
}
|
||||
|
@ -547,7 +547,7 @@ T_0699default_track_forced_off_vs_default_by_type:b4b2e1178e105c09bb2b6be7863110
|
||||
T_0700X_usf:1dac49aebe86754fa619c001725eb418-46001697d31d3a435ac65008cd37ea5a:passed:20200712-180338:0.092373405
|
||||
T_0701subtitles_duration_rounding:d727b6c1522504da6e3187e027e69145:passed:20200801-150217:0.047938163
|
||||
T_0702block_addition_mapping:21a1140e1363f557f332f123966c0293-165a4660aab1881872f2be9db37ac868:passed:20200801-175839:1.382622914
|
||||
T_0703bcp47_mkvmerge_tracks:649c3b23ddd919260aacfc3843d2dee0-ok-6cea988df7bc3f16fc29a9eedbcb24dd-ok-2efc761eef0ab7b1611eab03ccfa1e73-ok-4142b618710c418f5f6b990debfb81dd-ok-1cddfc894acaaf967841284adc68d95d-ok-8a924fac03de11cd7e9c3148fa8d0de4-ok-6803f90aae01a5188a4b423f5663019b-ok-26253739e9b8bf8388d196406d26a637-ok-080774d207e6b0145490ef9462ee91d0-ok-0e8434cc7cc1ba21b365fef2fd1d17fa-ok-ok-b856b48371435e5db128cb76df8234cd-ok-605073d366cfcfe925aa539fb51e098b-ok-635db9fdd14824a6482035249e7721a1-ok-34c5d55b43d935ff2c35681173c6d789-ok-b856b48371435e5db128cb76df8234cd-ok-05c550764fac8bd16b4c0e677d708b60-ok-6803f90aae01a5188a4b423f5663019b-3b3e46a20511894f386d0b4d55790692-ok-6803f90aae01a5188a4b423f5663019b-1372aed480ce909f3c03a7d9716f49a5-ok-6803f90aae01a5188a4b423f5663019b-3b3e46a20511894f386d0b4d55790692-ok-6803f90aae01a5188a4b423f5663019b-5dcb7e7d5e26a5496bb610f4eaf130a5-ok-6803f90aae01a5188a4b423f5663019b-5dcb7e7d5e26a5496bb610f4eaf130a5-ok-6803f90aae01a5188a4b423f5663019b-3b3e46a20511894f386d0b4d55790692-ok:passed:20200818-174121:10.476005861
|
||||
T_0703bcp47_mkvmerge_tracks:649c3b23ddd919260aacfc3843d2dee0-ok-6cea988df7bc3f16fc29a9eedbcb24dd-ok-2efc761eef0ab7b1611eab03ccfa1e73-ok-4142b618710c418f5f6b990debfb81dd-ok-1cddfc894acaaf967841284adc68d95d-ok-8a924fac03de11cd7e9c3148fa8d0de4-ok-6803f90aae01a5188a4b423f5663019b-ok-26253739e9b8bf8388d196406d26a637-ok-080774d207e6b0145490ef9462ee91d0-ok-0e8434cc7cc1ba21b365fef2fd1d17fa-ok-ok-b856b48371435e5db128cb76df8234cd-ok-605073d366cfcfe925aa539fb51e098b-ok-635db9fdd14824a6482035249e7721a1-ok-34c5d55b43d935ff2c35681173c6d789-ok-b856b48371435e5db128cb76df8234cd-ok-05c550764fac8bd16b4c0e677d708b60-ok-6803f90aae01a5188a4b423f5663019b-3b3e46a20511894f386d0b4d55790692-ok-6803f90aae01a5188a4b423f5663019b-1372aed480ce909f3c03a7d9716f49a5-ok-6803f90aae01a5188a4b423f5663019b-3b3e46a20511894f386d0b4d55790692-ok-6803f90aae01a5188a4b423f5663019b-3b3e46a20511894f386d0b4d55790692-ok-6803f90aae01a5188a4b423f5663019b-5dcb7e7d5e26a5496bb610f4eaf130a5-ok-6803f90aae01a5188a4b423f5663019b-5dcb7e7d5e26a5496bb610f4eaf130a5-ok:passed:20200818-174121:10.476005861
|
||||
T_0704bcp47_mkvmerge_chapters:3a427d08de294be5b9b01cb655b6e913-ok-e4ec6dbdcd7fdeef92b43a8a78dbb607-ok-472d719ce168ebdc9544ad0020f1609e-ok-4a7e1a1e5859fc8fae27e87afec436e7-ok-4a7e1a1e5859fc8fae27e87afec436e7-ok-4a7e1a1e5859fc8fae27e87afec436e7-ok-e1ed73dc53b2d05ee75e70ad6f3b47bf-ok-96e8031d45ad9e182734f52e3f2c7f46-ok-fb5d5e2d83287d45d3294b59d4d1dea0-ok-21602626c592dff8872219ed31197b1f-ok-21602626c592dff8872219ed31197b1f-ok-21602626c592dff8872219ed31197b1f-ok:passed:20200818-180751:1.70604079
|
||||
T_0705bcp47_propedit_language_ietf:649c3b23ddd919260aacfc3843d2dee0-und+und+ok+ger+de_CH+ok+ger+pt_BR+ok+ger++ok+spa+es_MX+ok+eng++ok:passed:20200822-114509:0.769998857
|
||||
T_0706bcp47_mkvmerge_tracks_disable_language_ietf:ebccd4b17fa7c048fdccd07719e0afa7-ok-649c3b23ddd919260aacfc3843d2dee0-ok-435baee92bcd3a3ebbb74c4a36819b84-ok-6cea988df7bc3f16fc29a9eedbcb24dd-435baee92bcd3a3ebbb74c4a36819b84-071937d6ab3739ea219992e473e7c5a9-ok-ok-ok-ebccd4b17fa7c048fdccd07719e0afa7-ok-649c3b23ddd919260aacfc3843d2dee0-ok-435baee92bcd3a3ebbb74c4a36819b84-ok-6cea988df7bc3f16fc29a9eedbcb24dd-435baee92bcd3a3ebbb74c4a36819b84-071937d6ab3739ea219992e473e7c5a9-ok-ok-ok:passed:20200829-101345:1.381911961
|
||||
@ -574,3 +574,4 @@ T_0726hevc_in_matroska_appending:5545f336516d5d39c00c3e87f487da88:passed:2021072
|
||||
T_0727chapters_ietf_language_without_639_2:20a0b25186a983c45a2e71d6bdc14ac6-20a0b25186a983c45a2e71d6bdc14ac6:passed:20210801-145831:0.051880945
|
||||
T_0728chapters_keep_languages_unique:e7585147ca7f2fcbbfdb9d2b176b60f1:passed:20210801-150405:0.0
|
||||
T_0729ssa_ass_appending_and_frame_numbers:40c468e2da86af42b233065bba2ee7b8:passed:20210804-152918:0.063901758
|
||||
T_0730track_selection_by_language_matching:bad3162548c6939a63270284d165ea36-true-bad3162548c6939a63270284d165ea36-true-3f833bb0cb664723542e104d4df21b9f-true-8096f4281c38bf0d9edaac957be98a24-true-76845720d174071e5502dc9955b008cb-true-89a8966b5350b4276dbe38ebc5011312-true-ea4ea7c6ae2dc8c33a19ec87980fa2eb-true-bad3162548c6939a63270284d165ea36-true:passed:20210829-170532:0.0
|
||||
|
@ -106,7 +106,7 @@ compare_languages "#{tmp}-2", %w{ger de-Latn-DE}, %w{por pt-BR}
|
||||
|
||||
test_merge src2, :keep_tmp => true, :args => "--language 0:de-latn-de --language 1:pt-br"
|
||||
test_merge tmp, :keep_tmp => true, :args => "--atracks pt", :output => "#{tmp}-2"
|
||||
compare_languages "#{tmp}-2", %w{ger de-Latn-DE}
|
||||
compare_languages "#{tmp}-2", %w{ger de-Latn-DE}, %w{por pt-BR}
|
||||
|
||||
test_merge src2, :keep_tmp => true, :args => "--language 0:de-latn-de --language 1:pt-br"
|
||||
test_merge tmp, :keep_tmp => true, :args => "--atracks !pt-br", :output => "#{tmp}-2"
|
||||
@ -114,4 +114,4 @@ compare_languages "#{tmp}-2", %w{ger de-Latn-DE}
|
||||
|
||||
test_merge src2, :keep_tmp => true, :args => "--language 0:de-latn-de --language 1:pt-br"
|
||||
test_merge tmp, :keep_tmp => true, :args => "--atracks !pt", :output => "#{tmp}-2"
|
||||
compare_languages "#{tmp}-2", %w{ger de-Latn-DE}, %w{por pt-BR}
|
||||
compare_languages "#{tmp}-2", %w{ger de-Latn-DE}
|
||||
|
34
tests/test-0730track_selection_by_language_matching.rb
Executable file
34
tests/test-0730track_selection_by_language_matching.rb
Executable file
@ -0,0 +1,34 @@
|
||||
#!/usr/bin/ruby -w
|
||||
|
||||
# T_730track_selection_by_language_matching
|
||||
describe "mkvmerge / track selection by language matching"
|
||||
|
||||
# Returns the "language_ietf" property of every track that identify_json
# reports for the given file, in the order the tracks are listed.
def track_languages file
|
||||
identify_json(file)["tracks"].map { |track| track["properties"]["language_ietf"] }
|
||||
end
|
||||
|
||||
# Remuxes src with "--stracks" set to the comma-joined languages_args and
# then checks the languages of the tracks in the resulting file.
#
# src              - file to remux
# languages_args   - list of language selectors passed to --stracks
# languages_result - expected track languages of the output; when nil the
#                    output is expected to contain exactly languages_args
def test_remux src, languages_args, languages_result = nil
|
||||
out = "#{tmp}-2"
|
||||
args = languages_args.join(',')
|
||||
|
||||
test_merge "--stracks #{args} #{src}", :output => out, :keep_tmp => true
|
||||
test("languages #{args}") { track_languages(out) == (languages_result || languages_args) }
|
||||
end
|
||||
|
||||
# Build the test source: the same SRT file is added once per entry in
# all_languages, each time with a different "--language 0:..." setting.
src1 = "data/subtitles/srt/ven.srt"
|
||||
src2 = "#{tmp}-1"
|
||||
all_languages = %w{de de-CH es es-ES es-MX es-US}
|
||||
args = all_languages.
|
||||
map { |language| "--language 0:#{language} #{src1}" }.
|
||||
join(" ")
|
||||
|
||||
test_merge args, :output => src2, :keep_tmp => true
|
||||
# Sanity check: the generated source carries the languages in the given order.
test("languages orig") { track_languages(src2) == all_languages }
|
||||
|
||||
# Selectors that name a region are expected to keep only those exact tracks.
test_remux src2, all_languages
|
||||
test_remux src2, %w{es-ES es-US}
|
||||
test_remux src2, %w{es-MX}
|
||||
# A negated selector is expected to drop only the matching track.
test_remux src2, %w{!es-MX}, %w{de de-CH es es-ES es-US}
|
||||
# Selectors consisting of only a language are expected to keep all regional
# variants of that language as well.
test_remux src2, %w{es}, %w{es es-ES es-MX es-US}
|
||||
test_remux src2, %w{de}, %w{de de-CH}
|
||||
test_remux src2, %w{es de}, all_languages
|
@ -1,6 +1,5 @@
|
||||
#include "common/common_pch.h"
|
||||
|
||||
#include "common/bcp47.h"
|
||||
#include "merge/item_selector.h"
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
@ -166,4 +165,33 @@ TEST(ItemSelector, ReversedIDsAndLanguages) {
|
||||
EXPECT_FALSE(is.selected(54, mtx::bcp47::language_c::parse("eng")));
|
||||
}
|
||||
|
||||
// Verifies language-based selection and value lookup when the selector
// contains both plain languages ("en", "es") and languages with a region
// ("en-US", "es-MX" etc.).
TEST(ItemSelector, LanguageMatching) {
|
||||
auto is = item_selector_c<int>{};
|
||||
|
||||
// The English entries are added without an explicit value; the Spanish ones
// carry distinct values so that get() reveals which entry was matched.
is.add(mtx::bcp47::language_c::parse("en"));
|
||||
is.add(mtx::bcp47::language_c::parse("en-US"));
|
||||
is.add(mtx::bcp47::language_c::parse("en-AU"));
|
||||
is.add(mtx::bcp47::language_c::parse("es"), 1);
|
||||
is.add(mtx::bcp47::language_c::parse("es-ES"), 2);
|
||||
is.add(mtx::bcp47::language_c::parse("es-MX"), 3);
|
||||
|
||||
// "de" was never added and must not be selected.
EXPECT_FALSE(is.selected(42, mtx::bcp47::language_c::parse("de")));
|
||||
|
||||
EXPECT_TRUE(is.selected(42, mtx::bcp47::language_c::parse("en")));
|
||||
EXPECT_TRUE(is.selected(42, mtx::bcp47::language_c::parse("en-US")));
|
||||
EXPECT_TRUE(is.selected(42, mtx::bcp47::language_c::parse("en-AU")));
|
||||
|
||||
// "es-US" has no entry of its own but is still expected to be selected.
EXPECT_TRUE(is.selected(42, mtx::bcp47::language_c::parse("es")));
|
||||
EXPECT_TRUE(is.selected(42, mtx::bcp47::language_c::parse("es-ES")));
|
||||
EXPECT_TRUE(is.selected(42, mtx::bcp47::language_c::parse("es-MX")));
|
||||
EXPECT_TRUE(is.selected(42, mtx::bcp47::language_c::parse("es-US")));
|
||||
|
||||
// An unselected language is expected to yield 0 here.
EXPECT_EQ(0, is.get(42, mtx::bcp47::language_c::parse("de")));
|
||||
|
||||
// Exact entries yield their own value; "es-US" is expected to yield the
// value registered for plain "es".
EXPECT_EQ(1, is.get(42, mtx::bcp47::language_c::parse("es")));
|
||||
EXPECT_EQ(2, is.get(42, mtx::bcp47::language_c::parse("es-ES")));
|
||||
EXPECT_EQ(3, is.get(42, mtx::bcp47::language_c::parse("es-MX")));
|
||||
EXPECT_EQ(1, is.get(42, mtx::bcp47::language_c::parse("es-US")));
|
||||
}
|
||||
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user