From 8ca1160de08bd281cf707cda75417dea1d80ec51 Mon Sep 17 00:00:00 2001 From: Moritz Bunkus Date: Wed, 9 Mar 2016 20:15:15 +0100 Subject: [PATCH] SRT, SSA readers: ignore --sub-charset for files containing a BOM Fixes #1620. --- ChangeLog | 6 ++++++ src/input/r_ssa.cpp | 4 ++-- src/output/p_textsubs.cpp | 9 ++++----- src/output/p_textsubs.h | 1 - tests/results.txt | 1 + tests/test-537srt_bom_precedence_over_sub_charset.rb | 9 +++++++++ 6 files changed, 22 insertions(+), 8 deletions(-) create mode 100755 tests/test-537srt_bom_precedence_over_sub_charset.rb diff --git a/ChangeLog b/ChangeLog index 23a53a094..8c3dbf4b2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +2016-03-09 Moritz Bunkus + + * mkvmerge: bug fix: the --sub-charset option is now ignored for + text subtitle files that start with a byte-order mark (BOM) + bringing the behavior in line with the documentation. Fixes #1620. + 2016-03-07 Moritz Bunkus * mkvextract: new feature: implemented the extraction of Big diff --git a/src/input/r_ssa.cpp b/src/input/r_ssa.cpp index ca5fdaf40..3cdc78671 100644 --- a/src/input/r_ssa.cpp +++ b/src/input/r_ssa.cpp @@ -48,9 +48,9 @@ ssa_reader_c::read_headers() { if (!ssa_reader_c::probe_file(text_in.get(), 0)) throw mtx::input::invalid_format_x(); - charset_converter_cptr cc_utf8 = mtx::includes(m_ti.m_sub_charsets, 0) ? charset_converter_c::init(m_ti.m_sub_charsets[ 0]) + charset_converter_cptr cc_utf8 = text_in->get_byte_order() != BO_NONE ? charset_converter_c::init("UTF-8") + : mtx::includes(m_ti.m_sub_charsets, 0) ? charset_converter_c::init(m_ti.m_sub_charsets[ 0]) : mtx::includes(m_ti.m_sub_charsets, -1) ? charset_converter_c::init(m_ti.m_sub_charsets[-1]) - : text_in->get_byte_order() != BO_NONE ? charset_converter_c::init("UTF-8") : g_cc_local_utf8; m_ti.m_id = 0; diff --git a/src/output/p_textsubs.cpp b/src/output/p_textsubs.cpp index 4f39fc4e9..46fbee014 100644 --- a/src/output/p_textsubs.cpp +++ b/src/output/p_textsubs.cpp @@ -36,11 +36,10 @@ textsubs_packetizer_c::textsubs_packetizer_c(generic_reader_c *p_reader, bool is_utf8) : generic_packetizer_c(p_reader, p_ti) , m_packetno{} - , m_codec_id(codec_id) - , m_recode(recode) + , m_codec_id{codec_id} { - if (m_recode) - m_cc_utf8 = charset_converter_c::init((m_ti.m_sub_charset != "") || !is_utf8 ? m_ti.m_sub_charset : "UTF-8"); + if (recode && !is_utf8 && !m_ti.m_sub_charset.empty()) + m_cc_utf8 = charset_converter_c::init(m_ti.m_sub_charset); set_track_type(track_subtitle); if (m_codec_id == MKV_S_TEXTUSF) @@ -93,7 +92,7 @@ textsubs_packetizer_c::process(packet_cptr packet) { subs = boost::regex_replace(subs, s_re_remove_trailing_nl, "", boost::match_default | boost::match_single_line); subs = boost::regex_replace(subs, s_re_translate_nl, "\r\n", boost::match_default | boost::match_single_line); - if (m_recode) + if (m_cc_utf8) subs = m_cc_utf8->utf8(subs); packet->data = memory_cptr(new memory_c((unsigned char *)subs.c_str(), subs.length(), false)); diff --git a/src/output/p_textsubs.h b/src/output/p_textsubs.h index d94f064bc..4bf9416a3 100644 --- a/src/output/p_textsubs.h +++ b/src/output/p_textsubs.h @@ -24,7 +24,6 @@ private: boost::optional m_force_rerender_track_headers_on_packetno; charset_converter_cptr m_cc_utf8; std::string m_codec_id; - bool m_recode; public: textsubs_packetizer_c(generic_reader_c *p_reader, track_info_c &p_ti, const char *codec_id, bool recode, bool is_utf8); diff --git a/tests/results.txt b/tests/results.txt index 0d9b0578a..5acdbdb90 100644 --- a/tests/results.txt +++ b/tests/results.txt @@ -382,3 +382,4 @@ T_533chapter_generation_interval:63486951fe0717eec1e93cb8fadaed92-5a57214bb210f5 T_534chapter_generation_when_appending_audio_only:000e4bfced4fae128abbb741545768c8-39028cc1508e5a37f879b3a459f3dbd2-fb20e8f516702d6a0fb9299120e6b508+e66dbfdf351112c62aa65faccdce8ee1+c4dc3cd790ee8397d0a6bdca84091981+3d716d93d0178aeabc66111b4dc10d9a+4fe4b6a15803e0c8d400ce07b4a9d48e+8b5ccf0e1b9fcba119d40727b9b7e8e4+f3f355cb0549efabdd9954f4448fc48d+ok-e668981666602602e2ed5a1b8d47f6db:passed:20160302-130929:0.645667954 T_535chapter_generation_interval_audio_only:4231b50be18c4320584d7e18c611431d-7149e4b581f601145db038035aba3ec4-7b37e30bfede450fec2a5e7b1e982b35+bb66d81871625190fbd7b15b02f6ca57+1285f17cbdb1259606bd7174e993b3f1+c0877adb7bf88212aae2212ab819cf57+78fcd2002d1a48752c5a8e4730cc2158+b2a259375cb03683b7fe6ffe95a53e21+f3f355cb0549efabdd9954f4448fc48d+ok-8f16b1baaedec4b22df0f566b39a105e:passed:20160302-131002:0.64531153 T_536extract_big_endian_pcm:8e57291db3e924e9bb45acb306426a0a:passed:20160307-190156:0.015845204 +T_537srt_bom_precedence_over_sub_charset:9687bc3195f16a852b88c599c17a9f5c-9687bc3195f16a852b88c599c17a9f5c-9687bc3195f16a852b88c599c17a9f5c-32eaa074a254eab81b90bd97be50c425:passed:20160309-180444:0.036282259 diff --git a/tests/test-537srt_bom_precedence_over_sub_charset.rb b/tests/test-537srt_bom_precedence_over_sub_charset.rb new file mode 100755 index 000000000..283cdc5b5 --- /dev/null +++ b/tests/test-537srt_bom_precedence_over_sub_charset.rb @@ -0,0 +1,9 @@ +#!/usr/bin/ruby -w + +# T_537srt_bom_precedence_over_sub_charset +describe "mkvmerge / SRT: BOMs have precedence over --sub-charset" + +test_merge "--sub-charset 0:ISO-8859-15 data/subtitles/srt/vde-utf-8-bom.srt" +test_merge " data/subtitles/srt/vde-utf-8-bom.srt" +test_merge "--sub-charset 0:ISO-8859-15 data/subtitles/srt/vde.srt" +test_merge " data/subtitles/srt/vde.srt"