From 6e2a738e82676c0875d3ab59495fed428bf5ff56 Mon Sep 17 00:00:00 2001 From: Moritz Bunkus Date: Wed, 16 Mar 2016 14:40:05 +0100 Subject: [PATCH] teletext subtitles: collapse multiple entries with same content into single entry The sample file in question contains many teletext packets that show the same content. The old code would convert those to separate Matroska frames. Additionally each frame's duration was calculated from the wrong timestamp. A teletext page can be drawn over the space of several packets with different timestamps. Therefore the timestamp of the first packet rendering a line of a teletext page must be used as the start timestamp, and the timestamp of the first packet rendering a line of the following teletext page as its end. The old code was using a wrong packet for the end, though: the timestamp of packet containing the end-of-page block. Fixes #1623. --- ChangeLog | 8 + .../teletext_to_srt_packet_converter.cpp | 140 ++++++++++++------ src/input/teletext_to_srt_packet_converter.h | 10 +- tests/results.txt | 1 + ...t-538teletext_many_packets_same_content.rb | 5 + 5 files changed, 118 insertions(+), 46 deletions(-) create mode 100755 tests/test-538teletext_many_packets_same_content.rb diff --git a/ChangeLog b/ChangeLog index 622ca8803..c1f79e84a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +2016-03-16 Moritz Bunkus + + * mkvmerge: bug fix: fixed two issues in the conversion of + teletext subtitles to SRT subtitles: 1. the handling of streams in + which consecutive teletext packets contain the same content and + 2. the calculation of a packet's duration for certain + situations. Fixes #1623. + 2016-03-14 Moritz Bunkus * MKVToolNix GUI: merge tool (playlist selection dialog) diff --git a/src/input/teletext_to_srt_packet_converter.cpp b/src/input/teletext_to_srt_packet_converter.cpp index e3207957d..b9c70cdc0 100644 --- a/src/input/teletext_to_srt_packet_converter.cpp +++ b/src/input/teletext_to_srt_packet_converter.cpp @@ -13,6 +13,7 @@ #include "common/common_pch.h" +#include "common/at_scope_exit.h" #include "common/strings/formatting.h" #include "input/teletext_to_srt_packet_converter.h" #include "merge/generic_packetizer.h" @@ -109,7 +110,6 @@ teletext_to_srt_packet_converter_c::teletext_to_srt_packet_converter_c(generic_p , m_pos{} , m_data_length{} , m_buf{} - , m_previous_timecode{-1} , m_page_re1{" *\\n[ \\n]+", boost::regex::perl} , m_page_re2{" +", boost::regex::perl} , m_page_re3{"^[ \\n]+|[ \\n]+$", boost::regex::perl} @@ -213,6 +213,66 @@ teletext_to_srt_packet_converter_c::decode_line(unsigned char const *buffer, } } +void +teletext_to_srt_packet_converter_c::process_single_row(int row_number, + timestamp_c const &packet_timestamp) { + remove_parity(&m_buf[m_pos + 6], m_data_length + 2 - 6); + decode_line(&m_buf[m_pos + 6], row_number); + + m_page_changed = true; + if (!m_current_timestamp.valid()) + m_current_timestamp = packet_timestamp; +} + +void +teletext_to_srt_packet_converter_c::decode_page_data(unsigned char ttx_header_magazine) { + unsigned char ttx_packet_0_header[4]; + unham(&m_buf[m_pos + 6], ttx_packet_0_header, 8); + + if (ttx_packet_0_header[0] != 0xff) { + m_ttx_page_data.page = ttx_to_page(ttx_packet_0_header[0]); + m_ttx_page_data.page += 100 * ttx_header_magazine; + } + + m_ttx_page_data.subpage = ((ttx_packet_0_header[2] << 8) | (ttx_packet_0_header[1])) & 0x3f7f; + m_ttx_page_data.subpage = ttx_to_page(m_ttx_page_data.subpage); + + m_ttx_page_data.flags = (ttx_packet_0_header[1] & 0x80) + | ((ttx_packet_0_header[3] << 4) & 0x10) + | ((ttx_packet_0_header[3] << 2) & 0x08) + | ((ttx_packet_0_header[3] >> 0) & 0x04) + | ((ttx_packet_0_header[3] >> 1) & 0x02) + | ((ttx_packet_0_header[3] >> 4) & 0x01); + + m_ttx_page_data.erase_flag = !!(m_ttx_page_data.flags & 0x80); + m_ttx_page_data.national_set = (m_ttx_page_data.flags >> 21) & 0x07; + + mxdebug_if(m_debug, + boost::format(" ttx page %1% subpage %2% erase? %3% national set %4% changed? %5%\n") + % m_ttx_page_data.page % m_ttx_page_data.subpage % m_ttx_page_data.erase_flag % m_ttx_page_data.national_set % m_page_changed); +} + +void +teletext_to_srt_packet_converter_c::deliver_queued_content(timestamp_c const &packet_timestamp) { + if (!m_page_changed && !m_ttx_page_data.erase_flag) + return; + + if ( !m_queued_timestamp.valid() + || m_queued_content.empty() + || (!m_ttx_page_data.erase_flag && (m_queued_content == m_current_content))) + return; + + auto duration = ((m_current_timestamp.valid() ? m_current_timestamp : packet_timestamp) - m_queued_timestamp).abs(); + auto new_packet = std::make_shared(memory_c::clone(m_queued_content), m_queued_timestamp.to_ns(), duration.to_ns()); + + mxdebug_if(m_debug, boost::format(" delivering packet at %1% duration %2% content %3%\n") % format_timestamp(m_queued_timestamp) % format_timestamp(new_packet->duration) % m_queued_content); + + m_ptzr->process(new_packet); + + m_queued_timestamp.reset(); + m_queued_content.clear(); +} + void teletext_to_srt_packet_converter_c::process_ttx_packet(packet_cptr const &packet) { auto data_unit_id = m_buf[m_pos]; // 0x02 = teletext, 0x03 = subtitling, 0xff = stuffing @@ -239,60 +299,55 @@ teletext_to_srt_packet_converter_c::process_ttx_packet(packet_cptr const &packet mxdebug_if(m_debug, boost::format(" m_pos %1% packet_id/row_number %2%\n") % m_pos % static_cast(row_number)); if ((row_number > 0) && (row_number <= TTX_PAGE_ROW_SIZE)) { - remove_parity(&m_buf[m_pos + 6], m_data_length + 2 - 6); - decode_line(&m_buf[m_pos + 6], row_number); - + process_single_row(row_number, timestamp_c::ns(packet->timecode)); return; } if (row_number != 0) return; - unsigned char ttx_packet_0_header[4]; - unham(&m_buf[m_pos + 6], ttx_packet_0_header, 8); + at_scope_exit_c cleanup([&] { + m_ttx_page_data.reset(); + m_page_changed = false; + m_current_timestamp.reset(); + }); - if (ttx_packet_0_header[0] != 0xff) { - m_ttx_page_data.page = ttx_to_page(ttx_packet_0_header[0]); - m_ttx_page_data.page += 100 * ttx_header_magazine; + decode_page_data(ttx_header_magazine); + + if (m_ttx_page_data.erase_flag) + deliver_queued_content(timestamp_c::ns(packet->timecode)); + + if ( m_wanted_page + && ( (m_wanted_page->first != m_ttx_page_data.page) + || (m_wanted_page->second != m_ttx_page_data.subpage))) { + return; } - m_ttx_page_data.subpage = ((ttx_packet_0_header[2] << 8) | (ttx_packet_0_header[1])) & 0x3f7f; - m_ttx_page_data.subpage = ttx_to_page(m_ttx_page_data.subpage); - - m_ttx_page_data.flags = (ttx_packet_0_header[1] & 0x80) - | ((ttx_packet_0_header[3] << 4) & 0x10) - | ((ttx_packet_0_header[3] << 2) & 0x08) - | ((ttx_packet_0_header[3] >> 0) & 0x04) - | ((ttx_packet_0_header[3] >> 1) & 0x02) - | ((ttx_packet_0_header[3] >> 4) & 0x01); - - m_ttx_page_data.erase_flag = !!(m_ttx_page_data.flags & 0x80); - m_ttx_page_data.national_set = (m_ttx_page_data.flags >> 21) & 0x07; - - mxdebug_if(m_debug, boost::format(" ttx page %1% subpage %2% erase? %3% national set %4%\n") % m_ttx_page_data.page % m_ttx_page_data.subpage % m_ttx_page_data.erase_flag % m_ttx_page_data.national_set); - - auto current_content = page_to_string(); + m_current_content = page_to_string(); mxdebug_if(m_debug, - boost::format(" case !packet_id. page content before clearing it at timecode %2% (prev %3%): %1% PREV content: %4%\n") - % current_content % format_timestamp(packet->timecode) % format_timestamp(m_previous_timecode) % m_previous_content); + boost::format(" page complete; sub page %5%; current content at timecode %2% (queued %3%): %1% PREV content: %4%\n") + % m_current_content % format_timestamp(packet->timecode) % format_timestamp(m_queued_timestamp) % m_queued_content % m_ttx_page_data.subpage); - if (!m_previous_content.empty()) { - m_previous_timecode = std::max(m_previous_timecode, 0); - auto new_packet = std::make_shared(memory_c::clone(m_previous_content), m_previous_timecode, std::abs(packet->timecode - m_previous_timecode)); - - mxdebug_if(m_debug, boost::format(" WILL DELIVER at %1% duration %2% content %3%\n") % format_timestamp(m_previous_timecode) % format_timestamp(new_packet->duration) % m_previous_content); - - m_ptzr->process(new_packet); - - m_previous_timecode = -1; - // packet->timecode = -1; + if (!m_wanted_page && !m_current_content.empty()) { + m_wanted_page.reset({ m_ttx_page_data.page, m_ttx_page_data.subpage }); + mxdebug_if(m_debug, boost::format(" First page/subpage with content is %1%/%2%.\n") % m_wanted_page->first % m_wanted_page->second); } - if (!current_content.empty() && (m_previous_timecode == -1)) - m_previous_timecode = packet->timecode; + if (!m_wanted_page) { + return; + } - m_ttx_page_data.reset(); - m_previous_content = current_content; + deliver_queued_content(timestamp_c::ns(packet->timecode)); + + if (m_current_content.empty()) + return; + + m_queued_content = m_current_content; + if (!m_queued_timestamp.valid()) + // m_queued_timestamp = timestamp_c::ns(packet->timecode); + m_queued_timestamp = m_current_timestamp.valid() ? m_current_timestamp : timestamp_c::ns(packet->timecode); + + mxdebug_if(m_debug, boost::format(" queueing page content at %1% content %2%\n") % format_timestamp(m_queued_timestamp) % m_queued_content); } std::string @@ -341,8 +396,5 @@ teletext_to_srt_packet_converter_c::convert(packet_cptr const &packet) { m_pos += 2 + m_data_length; } - if ((-1 == m_previous_timecode) && (-1 != packet->timecode) && !page_to_string().empty()) - m_previous_timecode = packet->timecode; - return true; } diff --git a/src/input/teletext_to_srt_packet_converter.h b/src/input/teletext_to_srt_packet_converter.h index 3d1b65f02..f3d2383ed 100644 --- a/src/input/teletext_to_srt_packet_converter.h +++ b/src/input/teletext_to_srt_packet_converter.h @@ -17,6 +17,7 @@ #include "common/common_pch.h" #include "common/debugging.h" +#include "common/timestamp.h" #include "input/packet_converter.h" #define TTX_PAGE_ROW_SIZE 24 @@ -39,8 +40,10 @@ public: protected: size_t m_in_size, m_pos, m_data_length; unsigned char *m_buf; - int64_t m_previous_timecode; - std::string m_previous_content; + timestamp_c m_queued_timestamp, m_current_timestamp; + std::string m_queued_content, m_current_content; + boost::optional> m_wanted_page; + bool m_page_changed{}; ttx_page_data_t m_ttx_page_data; @@ -58,6 +61,9 @@ protected: void process_ttx_packet(packet_cptr const &packet); std::string page_to_string() const; void decode_line(unsigned char const *buffer, unsigned int row); + void process_single_row(int row_number, timestamp_c const &packet_timestamp); + void decode_page_data(unsigned char ttx_header_magazine); + void deliver_queued_content(timestamp_c const &packet_timestamp); protected: static int ttx_to_page(int ttx); diff --git a/tests/results.txt b/tests/results.txt index 6a039cae9..ce10b6de3 100644 --- a/tests/results.txt +++ b/tests/results.txt @@ -383,3 +383,4 @@ T_534chapter_generation_when_appending_audio_only:000e4bfced4fae128abbb741545768 T_535chapter_generation_interval_audio_only:4231b50be18c4320584d7e18c611431d-7149e4b581f601145db038035aba3ec4-7b37e30bfede450fec2a5e7b1e982b35+bb66d81871625190fbd7b15b02f6ca57+1285f17cbdb1259606bd7174e993b3f1+c0877adb7bf88212aae2212ab819cf57+78fcd2002d1a48752c5a8e4730cc2158+b2a259375cb03683b7fe6ffe95a53e21+f3f355cb0549efabdd9954f4448fc48d+ok-8f16b1baaedec4b22df0f566b39a105e:passed:20160302-131002:0.64531153 T_536extract_big_endian_pcm:8e57291db3e924e9bb45acb306426a0a:passed:20160307-190156:0.015845204 T_537srt_bom_precedence_over_sub_charset:9687bc3195f16a852b88c599c17a9f5c-9687bc3195f16a852b88c599c17a9f5c-9687bc3195f16a852b88c599c17a9f5c-32eaa074a254eab81b90bd97be50c425:passed:20160309-180444:0.036282259 +T_538teletext_many_packets_same_content:409dc709d9530f7fe85066af8fc07b7c:passed:20160316-143840:0.650010976 diff --git a/tests/test-538teletext_many_packets_same_content.rb b/tests/test-538teletext_many_packets_same_content.rb new file mode 100755 index 000000000..6a7ab9621 --- /dev/null +++ b/tests/test-538teletext_many_packets_same_content.rb @@ -0,0 +1,5 @@ +#!/usr/bin/ruby -w + +# T_538teletext_many_packets_same_content +describe "mkvmerge / MPEG TS with teletext subtitles with many packets having the same content" +test_merge "data/ts/teletext_subs_many_packets_same_content.ts", args: "--no-audio --no-video"