teletext subtitles: collapse multiple entries with same content into single entry

The sample file in question contains many teletext packets that show the
same content. The old code would convert those to separate Matroska
frames.

Additionally each frame's duration was calculated from the wrong
timestamp. A teletext page can be drawn over the space of several
packets with different timestamps. Therefore the timestamp of the first
packet rendering a line of a teletext page must be used as the start
timestamp, and the timestamp of the first packet rendering a line of the
following teletext page as its end.

The old code was using a wrong packet for the end, though: the timestamp
of packet containing the end-of-page block.

Fixes #1623.
This commit is contained in:
Moritz Bunkus 2016-03-16 14:40:05 +01:00
parent c3ee7d3d90
commit 6e2a738e82
5 changed files with 118 additions and 46 deletions

View File

@ -1,3 +1,11 @@
2016-03-16 Moritz Bunkus <moritz@bunkus.org>
* mkvmerge: bug fix: fixed two issues in the conversion of
teletext subtitles to SRT subtitles: 1. the handling of streams in
which consecutive teletext packets contain the same content and
2. the calculation of a packet's duration for certain
situations. Fixes #1623.
2016-03-14 Moritz Bunkus <moritz@bunkus.org>
* MKVToolNix GUI: merge tool (playlist selection dialog)

View File

@ -13,6 +13,7 @@
#include "common/common_pch.h"
#include "common/at_scope_exit.h"
#include "common/strings/formatting.h"
#include "input/teletext_to_srt_packet_converter.h"
#include "merge/generic_packetizer.h"
@ -109,7 +110,6 @@ teletext_to_srt_packet_converter_c::teletext_to_srt_packet_converter_c(generic_p
, m_pos{}
, m_data_length{}
, m_buf{}
, m_previous_timecode{-1}
, m_page_re1{" *\\n[ \\n]+", boost::regex::perl}
, m_page_re2{" +", boost::regex::perl}
, m_page_re3{"^[ \\n]+|[ \\n]+$", boost::regex::perl}
@ -213,6 +213,66 @@ teletext_to_srt_packet_converter_c::decode_line(unsigned char const *buffer,
}
}
void
teletext_to_srt_packet_converter_c::process_single_row(int row_number,
timestamp_c const &packet_timestamp) {
remove_parity(&m_buf[m_pos + 6], m_data_length + 2 - 6);
decode_line(&m_buf[m_pos + 6], row_number);
m_page_changed = true;
if (!m_current_timestamp.valid())
m_current_timestamp = packet_timestamp;
}
void
teletext_to_srt_packet_converter_c::decode_page_data(unsigned char ttx_header_magazine) {
unsigned char ttx_packet_0_header[4];
unham(&m_buf[m_pos + 6], ttx_packet_0_header, 8);
if (ttx_packet_0_header[0] != 0xff) {
m_ttx_page_data.page = ttx_to_page(ttx_packet_0_header[0]);
m_ttx_page_data.page += 100 * ttx_header_magazine;
}
m_ttx_page_data.subpage = ((ttx_packet_0_header[2] << 8) | (ttx_packet_0_header[1])) & 0x3f7f;
m_ttx_page_data.subpage = ttx_to_page(m_ttx_page_data.subpage);
m_ttx_page_data.flags = (ttx_packet_0_header[1] & 0x80)
| ((ttx_packet_0_header[3] << 4) & 0x10)
| ((ttx_packet_0_header[3] << 2) & 0x08)
| ((ttx_packet_0_header[3] >> 0) & 0x04)
| ((ttx_packet_0_header[3] >> 1) & 0x02)
| ((ttx_packet_0_header[3] >> 4) & 0x01);
m_ttx_page_data.erase_flag = !!(m_ttx_page_data.flags & 0x80);
m_ttx_page_data.national_set = (m_ttx_page_data.flags >> 21) & 0x07;
mxdebug_if(m_debug,
boost::format(" ttx page %1% subpage %2% erase? %3% national set %4% changed? %5%\n")
% m_ttx_page_data.page % m_ttx_page_data.subpage % m_ttx_page_data.erase_flag % m_ttx_page_data.national_set % m_page_changed);
}
void
teletext_to_srt_packet_converter_c::deliver_queued_content(timestamp_c const &packet_timestamp) {
if (!m_page_changed && !m_ttx_page_data.erase_flag)
return;
if ( !m_queued_timestamp.valid()
|| m_queued_content.empty()
|| (!m_ttx_page_data.erase_flag && (m_queued_content == m_current_content)))
return;
auto duration = ((m_current_timestamp.valid() ? m_current_timestamp : packet_timestamp) - m_queued_timestamp).abs();
auto new_packet = std::make_shared<packet_t>(memory_c::clone(m_queued_content), m_queued_timestamp.to_ns(), duration.to_ns());
mxdebug_if(m_debug, boost::format(" delivering packet at %1% duration %2% content %3%\n") % format_timestamp(m_queued_timestamp) % format_timestamp(new_packet->duration) % m_queued_content);
m_ptzr->process(new_packet);
m_queued_timestamp.reset();
m_queued_content.clear();
}
void
teletext_to_srt_packet_converter_c::process_ttx_packet(packet_cptr const &packet) {
auto data_unit_id = m_buf[m_pos]; // 0x02 = teletext, 0x03 = subtitling, 0xff = stuffing
@ -239,60 +299,55 @@ teletext_to_srt_packet_converter_c::process_ttx_packet(packet_cptr const &packet
mxdebug_if(m_debug, boost::format(" m_pos %1% packet_id/row_number %2%\n") % m_pos % static_cast<unsigned int>(row_number));
if ((row_number > 0) && (row_number <= TTX_PAGE_ROW_SIZE)) {
remove_parity(&m_buf[m_pos + 6], m_data_length + 2 - 6);
decode_line(&m_buf[m_pos + 6], row_number);
process_single_row(row_number, timestamp_c::ns(packet->timecode));
return;
}
if (row_number != 0)
return;
unsigned char ttx_packet_0_header[4];
unham(&m_buf[m_pos + 6], ttx_packet_0_header, 8);
at_scope_exit_c cleanup([&] {
m_ttx_page_data.reset();
m_page_changed = false;
m_current_timestamp.reset();
});
if (ttx_packet_0_header[0] != 0xff) {
m_ttx_page_data.page = ttx_to_page(ttx_packet_0_header[0]);
m_ttx_page_data.page += 100 * ttx_header_magazine;
decode_page_data(ttx_header_magazine);
if (m_ttx_page_data.erase_flag)
deliver_queued_content(timestamp_c::ns(packet->timecode));
if ( m_wanted_page
&& ( (m_wanted_page->first != m_ttx_page_data.page)
|| (m_wanted_page->second != m_ttx_page_data.subpage))) {
return;
}
m_ttx_page_data.subpage = ((ttx_packet_0_header[2] << 8) | (ttx_packet_0_header[1])) & 0x3f7f;
m_ttx_page_data.subpage = ttx_to_page(m_ttx_page_data.subpage);
m_ttx_page_data.flags = (ttx_packet_0_header[1] & 0x80)
| ((ttx_packet_0_header[3] << 4) & 0x10)
| ((ttx_packet_0_header[3] << 2) & 0x08)
| ((ttx_packet_0_header[3] >> 0) & 0x04)
| ((ttx_packet_0_header[3] >> 1) & 0x02)
| ((ttx_packet_0_header[3] >> 4) & 0x01);
m_ttx_page_data.erase_flag = !!(m_ttx_page_data.flags & 0x80);
m_ttx_page_data.national_set = (m_ttx_page_data.flags >> 21) & 0x07;
mxdebug_if(m_debug, boost::format(" ttx page %1% subpage %2% erase? %3% national set %4%\n") % m_ttx_page_data.page % m_ttx_page_data.subpage % m_ttx_page_data.erase_flag % m_ttx_page_data.national_set);
auto current_content = page_to_string();
m_current_content = page_to_string();
mxdebug_if(m_debug,
boost::format(" case !packet_id. page content before clearing it at timecode %2% (prev %3%): %1% PREV content: %4%\n")
% current_content % format_timestamp(packet->timecode) % format_timestamp(m_previous_timecode) % m_previous_content);
boost::format(" page complete; sub page %5%; current content at timecode %2% (queued %3%): %1% PREV content: %4%\n")
% m_current_content % format_timestamp(packet->timecode) % format_timestamp(m_queued_timestamp) % m_queued_content % m_ttx_page_data.subpage);
if (!m_previous_content.empty()) {
m_previous_timecode = std::max<int64_t>(m_previous_timecode, 0);
auto new_packet = std::make_shared<packet_t>(memory_c::clone(m_previous_content), m_previous_timecode, std::abs(packet->timecode - m_previous_timecode));
mxdebug_if(m_debug, boost::format(" WILL DELIVER at %1% duration %2% content %3%\n") % format_timestamp(m_previous_timecode) % format_timestamp(new_packet->duration) % m_previous_content);
m_ptzr->process(new_packet);
m_previous_timecode = -1;
// packet->timecode = -1;
if (!m_wanted_page && !m_current_content.empty()) {
m_wanted_page.reset({ m_ttx_page_data.page, m_ttx_page_data.subpage });
mxdebug_if(m_debug, boost::format(" First page/subpage with content is %1%/%2%.\n") % m_wanted_page->first % m_wanted_page->second);
}
if (!current_content.empty() && (m_previous_timecode == -1))
m_previous_timecode = packet->timecode;
if (!m_wanted_page) {
return;
}
m_ttx_page_data.reset();
m_previous_content = current_content;
deliver_queued_content(timestamp_c::ns(packet->timecode));
if (m_current_content.empty())
return;
m_queued_content = m_current_content;
if (!m_queued_timestamp.valid())
// m_queued_timestamp = timestamp_c::ns(packet->timecode);
m_queued_timestamp = m_current_timestamp.valid() ? m_current_timestamp : timestamp_c::ns(packet->timecode);
mxdebug_if(m_debug, boost::format(" queueing page content at %1% content %2%\n") % format_timestamp(m_queued_timestamp) % m_queued_content);
}
std::string
@ -341,8 +396,5 @@ teletext_to_srt_packet_converter_c::convert(packet_cptr const &packet) {
m_pos += 2 + m_data_length;
}
if ((-1 == m_previous_timecode) && (-1 != packet->timecode) && !page_to_string().empty())
m_previous_timecode = packet->timecode;
return true;
}

View File

@ -17,6 +17,7 @@
#include "common/common_pch.h"
#include "common/debugging.h"
#include "common/timestamp.h"
#include "input/packet_converter.h"
#define TTX_PAGE_ROW_SIZE 24
@ -39,8 +40,10 @@ public:
protected:
size_t m_in_size, m_pos, m_data_length;
unsigned char *m_buf;
int64_t m_previous_timecode;
std::string m_previous_content;
timestamp_c m_queued_timestamp, m_current_timestamp;
std::string m_queued_content, m_current_content;
boost::optional<std::pair<int, int>> m_wanted_page;
bool m_page_changed{};
ttx_page_data_t m_ttx_page_data;
@ -58,6 +61,9 @@ protected:
void process_ttx_packet(packet_cptr const &packet);
std::string page_to_string() const;
void decode_line(unsigned char const *buffer, unsigned int row);
void process_single_row(int row_number, timestamp_c const &packet_timestamp);
void decode_page_data(unsigned char ttx_header_magazine);
void deliver_queued_content(timestamp_c const &packet_timestamp);
protected:
static int ttx_to_page(int ttx);

View File

@ -383,3 +383,4 @@ T_534chapter_generation_when_appending_audio_only:000e4bfced4fae128abbb741545768
T_535chapter_generation_interval_audio_only:4231b50be18c4320584d7e18c611431d-7149e4b581f601145db038035aba3ec4-7b37e30bfede450fec2a5e7b1e982b35+bb66d81871625190fbd7b15b02f6ca57+1285f17cbdb1259606bd7174e993b3f1+c0877adb7bf88212aae2212ab819cf57+78fcd2002d1a48752c5a8e4730cc2158+b2a259375cb03683b7fe6ffe95a53e21+f3f355cb0549efabdd9954f4448fc48d+ok-8f16b1baaedec4b22df0f566b39a105e:passed:20160302-131002:0.64531153
T_536extract_big_endian_pcm:8e57291db3e924e9bb45acb306426a0a:passed:20160307-190156:0.015845204
T_537srt_bom_precedence_over_sub_charset:9687bc3195f16a852b88c599c17a9f5c-9687bc3195f16a852b88c599c17a9f5c-9687bc3195f16a852b88c599c17a9f5c-32eaa074a254eab81b90bd97be50c425:passed:20160309-180444:0.036282259
T_538teletext_many_packets_same_content:409dc709d9530f7fe85066af8fc07b7c:passed:20160316-143840:0.650010976

View File

@ -0,0 +1,5 @@
#!/usr/bin/ruby -w
# T_538teletext_many_packets_same_content
describe "mkvmerge / MPEG TS with teletext subtitles with many packets having the same content"
test_merge "data/ts/teletext_subs_many_packets_same_content.ts", args: "--no-audio --no-video"