From 95092247aa51922fa0130f828755f8b979f12494 Mon Sep 17 00:00:00 2001 From: Moritz Bunkus Date: Sun, 23 Jul 2017 09:22:31 +0200 Subject: [PATCH] mkvmerge: identify track's "encoding" if known and fixed This is preparation for making the GUI present the encoding detected by e.g. a byte order mark to the user so that she doesn't have to (and cannot) choose a subtitle character set herself. Implements mkvmerge's part of #2053. --- NEWS.md | 4 ++++ src/input/r_avi.cpp | 4 ++++ src/input/r_avi.h | 1 + src/input/r_matroska.cpp | 4 +++- src/input/r_ogm.cpp | 4 +++- src/input/r_srt.cpp | 6 +++++- src/input/r_ssa.cpp | 8 ++++++-- src/input/r_ssa.h | 1 + src/input/r_webvtt.cpp | 1 + tests/results.txt | 10 +++++----- 10 files changed, 33 insertions(+), 10 deletions(-) diff --git a/NEWS.md b/NEWS.md index 4ca35c781..2082ac9f3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -26,6 +26,10 @@ removing all tag elements not supported by the WebM spec. * translations: added a Romanian translation of the programs by Daniel (see AUTHORS). +* mkvmerge: identification: if the encoding/character set of a text subtitle + track is known (e.g. because a byte order mark is present in the file), then + it will be output during identification as the `encoding` + property. Implements mkvmerge's part of #2053. ## Bug fixes diff --git a/src/input/r_avi.cpp b/src/input/r_avi.cpp index ee9e334a3..8841185a2 100644 --- a/src/input/r_avi.cpp +++ b/src/input/r_avi.cpp @@ -167,6 +167,7 @@ avi_reader_c::parse_subtitle_chunks() { = srt_parser_c::probe(&text_io) ? avi_subs_demuxer_t::TYPE_SRT : ssa_parser_c::probe(&text_io) ? 
avi_subs_demuxer_t::TYPE_SSA : avi_subs_demuxer_t::TYPE_UNKNOWN; + demuxer.m_encoding = text_io.get_encoding(); if (avi_subs_demuxer_t::TYPE_UNKNOWN != demuxer.m_type) m_subtitle_demuxers.push_back(demuxer); @@ -894,6 +895,9 @@ avi_reader_c::identify_subtitles() { || (avi_subs_demuxer_t::TYPE_SSA == m_subtitle_demuxers[i].m_type)) info.add(mtx::id::text_subtitles, true); + if (m_subtitle_demuxers[i].m_encoding) + info.add(mtx::id::encoding, *m_subtitle_demuxers[i].m_encoding); + id_result_track(1 + AVI_audio_tracks(m_avi) + i, ID_RESULT_TRACK_SUBTITLES, avi_subs_demuxer_t::TYPE_SRT == m_subtitle_demuxers[i].m_type ? codec_c::get_name(codec_c::type_e::S_SRT, "SRT") : avi_subs_demuxer_t::TYPE_SSA == m_subtitle_demuxers[i].m_type ? codec_c::get_name(codec_c::type_e::S_SSA_ASS, "SSA/ASS") diff --git a/src/input/r_avi.h b/src/input/r_avi.h index 6e1ef9966..085d5a363 100644 --- a/src/input/r_avi.h +++ b/src/input/r_avi.h @@ -48,6 +48,7 @@ struct avi_subs_demuxer_t { mm_text_io_cptr m_text_io; subtitles_cptr m_subs; + boost::optional m_encoding; }; class avi_reader_c: public generic_reader_c { diff --git a/src/input/r_matroska.cpp b/src/input/r_matroska.cpp index 18ab35181..4b0410fd9 100644 --- a/src/input/r_matroska.cpp +++ b/src/input/r_matroska.cpp @@ -2640,8 +2640,10 @@ kax_reader_c::identify() { info.add(mtx::id::audio_bits_per_sample, track->a_bps); } else if ('s' == track->type) { - if (track->codec.is(codec_c::type_e::S_SRT) || track->codec.is(codec_c::type_e::S_SSA_ASS) || track->codec.is(codec_c::type_e::S_KATE)) + if (track->codec.is(codec_c::type_e::S_SRT) || track->codec.is(codec_c::type_e::S_SSA_ASS) || track->codec.is(codec_c::type_e::S_KATE)) { info.add(mtx::id::text_subtitles, true); + info.add(mtx::id::encoding, "UTF-8"); + } } if (track->content_decoder.has_encodings()) diff --git a/src/input/r_ogm.cpp b/src/input/r_ogm.cpp index 52bdd93b7..fc420e033 100644 --- a/src/input/r_ogm.cpp +++ b/src/input/r_ogm.cpp @@ -734,8 +734,10 @@ ogm_reader_c::identify() 
{ if ((0 != sdemuxers[i]->display_width) && (0 != sdemuxers[i]->display_height)) info.add(mtx::id::display_dimensions, boost::format("%1%x%2%") % sdemuxers[i]->display_width % sdemuxers[i]->display_height); - if (dynamic_cast(sdemuxers[i].get()) || dynamic_cast(sdemuxers[i].get())) + if (dynamic_cast(sdemuxers[i].get()) || dynamic_cast(sdemuxers[i].get())) { info.add(mtx::id::text_subtitles, true); + info.add(mtx::id::encoding, "UTF-8"); + } auto pixel_dimensions = sdemuxers[i]->get_pixel_dimensions(); if (pixel_dimensions.first && pixel_dimensions.second) diff --git a/src/input/r_srt.cpp b/src/input/r_srt.cpp index 34adf2c4e..0c4699b08 100644 --- a/src/input/r_srt.cpp +++ b/src/input/r_srt.cpp @@ -82,8 +82,12 @@ srt_reader_c::get_progress() { void srt_reader_c::identify() { - auto info = mtx::id::info_c{}; + auto info = mtx::id::info_c{}; + auto encoding = m_text_in->get_encoding(); + info.add(mtx::id::text_subtitles, true); + if (encoding) + info.add(mtx::id::encoding, *encoding); id_result_container(); id_result_track(0, ID_RESULT_TRACK_SUBTITLES, codec_c::get_name(codec_c::type_e::S_SRT, "SRT"), info.get()); diff --git a/src/input/r_ssa.cpp b/src/input/r_ssa.cpp index 3cdc78671..889c9bdcd 100644 --- a/src/input/r_ssa.cpp +++ b/src/input/r_ssa.cpp @@ -53,8 +53,9 @@ ssa_reader_c::read_headers() { : mtx::includes(m_ti.m_sub_charsets, -1) ? 
charset_converter_c::init(m_ti.m_sub_charsets[-1]) : g_cc_local_utf8; - m_ti.m_id = 0; - m_subs = ssa_parser_cptr(new ssa_parser_c(this, text_in.get(), m_ti.m_fname, 0)); + m_ti.m_id = 0; + m_subs = ssa_parser_cptr(new ssa_parser_c(this, text_in.get(), m_ti.m_fname, 0)); + m_encoding = text_in->get_encoding(); m_subs->set_charset_converter(cc_utf8); m_subs->parse(); @@ -94,7 +95,10 @@ ssa_reader_c::get_progress() { void ssa_reader_c::identify() { auto info = mtx::id::info_c{}; + info.add(mtx::id::text_subtitles, true); + if (m_encoding) + info.add(mtx::id::encoding, *m_encoding); id_result_container(); id_result_track(0, ID_RESULT_TRACK_SUBTITLES, codec_c::get_name(codec_c::type_e::S_SSA_ASS, "SSA/ASS"), info.get()); diff --git a/src/input/r_ssa.h b/src/input/r_ssa.h index 14d9c5075..f62df4e03 100644 --- a/src/input/r_ssa.h +++ b/src/input/r_ssa.h @@ -23,6 +23,7 @@ class ssa_reader_c: public generic_reader_c { private: ssa_parser_cptr m_subs; + boost::optional m_encoding; public: ssa_reader_c(const track_info_c &ti, const mm_io_cptr &in); diff --git a/src/input/r_webvtt.cpp b/src/input/r_webvtt.cpp index a9be4f084..2e3081432 100644 --- a/src/input/r_webvtt.cpp +++ b/src/input/r_webvtt.cpp @@ -108,6 +108,7 @@ void webvtt_reader_c::identify() { auto info = mtx::id::info_c{}; info.add(mtx::id::text_subtitles, true); + info.add(mtx::id::encoding, "UTF-8"); id_result_container(); id_result_track(0, ID_RESULT_TRACK_SUBTITLES, codec_c::get_name(codec_c::type_e::S_WEBVTT, "WebVTT"), info.get()); diff --git a/tests/results.txt b/tests/results.txt index 74175573c..83682bac4 100644 --- a/tests/results.txt +++ b/tests/results.txt @@ -63,7 +63,7 @@ T_213mp4_broken_pixel_dimensions:82cd619cff1f3ec446b9be16204c844c:passed:2005091 T_214one_frame_avi:683d26d5c30a903e4bfba86448fc3ec2:passed:20051004-192755:0.039489971 T_215X_codec_extradata_avi:e8c0c4dad4908f20062be3c1f3b9ad85-74ac799ad899f703cbb6c6654e5f9f51:passed:20051004-194707:0.052219855 
T_216mp4_editlists:bea53d8f0e626436e97fada22ce6ac0d:passed:20051118-191453:0.106975045 -T_217file_identification:7a2a506954a56f21739e7912897e07ad-903514f1cd84055d9b06ecff5e8d1ea1-07bcdabfac85ad1ec8daffbd3ec4d6b8-066048d39c245bbda03d357bec5b9593-279167d14c30edc4788cc066f22c941f-ccd6c289299801382b09a9bc13326092-44256b48783ae449ceca65e8419d72e8-cd6039be3553f8bf857c77c8bb99ad1b-55543e6744a979d4a3ffbf0e0bcc855c-d2389a900373adc4864fb3534e145fb0-7febbb46072c2b256735786471c02df4-e2853a83b5964834faa6aa34318aad9c-734e635a1b254319593e24ba89e2434c-1c31748e6eaabc9b84eb737124f457d3-f4bf52b0bf7c773ec337a9d29043b84c-ab8604871b63846cd7df9d5282db3f35-2b05e8b45ff5b8568be5f4aed1bd18bc-4efb23ae937e58377bf4cc00d93f85d0-3b9e8de7136f2fffa4e2bbf1b4aa38e0-db05f705a1e059b29f4db1ea3ee9d59e-e38a8502cd0c407d8ce517913a2db8c0-588d6dd39935990b73cbb2158cf960fe:passed:20051209-180815:1.882517588 +T_217file_identification:7a2a506954a56f21739e7912897e07ad-903514f1cd84055d9b06ecff5e8d1ea1-c9b4cd2f0a85926db88e6f32e91b642c-066048d39c245bbda03d357bec5b9593-279167d14c30edc4788cc066f22c941f-ccd6c289299801382b09a9bc13326092-44256b48783ae449ceca65e8419d72e8-cd6039be3553f8bf857c77c8bb99ad1b-55543e6744a979d4a3ffbf0e0bcc855c-d2389a900373adc4864fb3534e145fb0-7febbb46072c2b256735786471c02df4-e2853a83b5964834faa6aa34318aad9c-734e635a1b254319593e24ba89e2434c-1c31748e6eaabc9b84eb737124f457d3-f4bf52b0bf7c773ec337a9d29043b84c-ab8604871b63846cd7df9d5282db3f35-2b05e8b45ff5b8568be5f4aed1bd18bc-4efb23ae937e58377bf4cc00d93f85d0-3b9e8de7136f2fffa4e2bbf1b4aa38e0-db05f705a1e059b29f4db1ea3ee9d59e-e38a8502cd0c407d8ce517913a2db8c0-588d6dd39935990b73cbb2158cf960fe:passed:20051209-180815:1.882517588 T_218theora:049c4b011a7269d1f9682f0eb673c451-6aa1d73668db13ac875f4325d8797c3f:passed:20060428-105054:0.392912102 T_219srt_short_timecodes:4d58c1d5ddab6368080d54a7585b0f83:passed:20060926-112658:0.117747192 T_220ass_with_comments_at_start:30926355189808086b52edf95c8f49d0:passed:20060926-120101:0.382410266 @@ -251,7 +251,7 @@ 
T_402opus_output_order:35ddcb9621bce14b9d3ad1b5def65b60:passed:20130705-115856:0 T_403opus_remux_final:f0bed02ce77c7500626d1fa853180d1c:passed:20130705-135811:0.068533558 T_404opus_extraction:0aba264a50870d5cd62d8d12543898bd:passed:20130915-201931:0.050758351 T_405packet_ordering_and_default_duration:4e777a2b2516d47230e04010dc6d2c21:passed:20130916-211719:0.258475566 -T_406ogm_chapters_ansi_encoded:27fff755c51440ca09cce5c3ff9a885a-9671aceb413291015ba250c8b88cce63-8ce95f8788df9adcff26d86a43ded89f:passed:20131002-230255:0.275447568 +T_406ogm_chapters_ansi_encoded:b43e32d47438eca76ac6eb17767d370a-9671aceb413291015ba250c8b88cce63-8ce95f8788df9adcff26d86a43ded89f:passed:20131002-230255:0.275447568 T_407empty_tag_and_chapter_files:error-error-error:passed:20131018-202312:0.131204626 T_408utf_encodings_with_bom:9687bc3195f16a852b88c599c17a9f5c-9687bc3195f16a852b88c599c17a9f5c-9687bc3195f16a852b88c599c17a9f5c-9687bc3195f16a852b88c599c17a9f5c-9687bc3195f16a852b88c599c17a9f5c:passed:20131019-155216:0.215939391 T_409mux_vp9:a0ec1c19b50d2222712bd1046a89b917-6dc39ff738ebc616901de3831fbf3a46:passed:20131019-195820:0.071892447 @@ -273,7 +273,7 @@ T_424avc_recover_point_sei_before_second_field:15ef9998f82f4f554c6a16db01791eaa: T_425mpeg_ts_timestamp_outlier:7f20a9e9d6e7e9e3e0a7d0d4db89ac32:passed:20140305-203603:2.509694471 T_426extract_write_bom_only_once:a9255d40de93e2731aaead0a746e582f-a9255d40de93e2731aaead0a746e582f:passed:20140310-195606:0.0 T_427ui_locale_pt_BR:8719aedc77a0435129c79e3a061642bf-344b51e9ae6fe2d8ce60fef18ee0e7d1:passed:20140418-103113:0.143370167 -T_428mkv_misdetected_as_ass:240ccc1dd272e8785b9a417f3753d86c:passed:20140518-155446:0.033341203 +T_428mkv_misdetected_as_ass:0eb27fde5224e65f6ea69a2b426acd8e:passed:20140518-155446:0.033341203 T_429track_statistics_tags:f262df87ee15d60bbbe30ec5e4dea073-ca4823172c0e22ca0f92d9290d20109b-fd2bc7d3deba0ef9c08238d09714e60d-20659b6bcd8e1b37ef5362507adf4ca3:passed:20140524-194544:0.635343822 
T_430cues_multiple_blocks_same_timecode:f1ab5c927064537eb59ab0f5195d6a1d:passed:20140525-173642:0.033316759 T_431ssa_comments_exclamation_mark:3caa9ad1716134cc1f3e229b88ff94ea:passed:20140618-232324:0.072735677 @@ -357,7 +357,7 @@ T_508splitting_by_parts_with_segment_linking:existence0-true-true-true-existence T_509rerender_track_headers_chapters_attachments:aca9879facd444a739b8ea9ff0c471dc:passed:20151115-230226:0.287840782 T_510propedit_add_attachments_without_meta_seek_present:770103c238a0f502c9ec55f0599d8544:passed:20151121-101043:0.070892905 T_511propedit_ensure_seek_head_exists_at_front:20f53afd94e39f5bbf3f1091eefbe31d:passed:20151129-194025:0.152563199 -T_512json_identification:e2c1bd814ea805d9711a3875e646a51d+ok-d815a6390e25b861d21093ef66f17191+ok-27fd82d5208da05a2ab71d685aac666a+ok-a86b43982a842c6b9c8572534e40dcca+ok-9167cad47e1f91998e3622e6d3aef1c1+ok-e6e342718925ef84c9c65288d477ac76+ok-59c47f40f7c35d90a4a8ba0f15aebf5b+ok-92edb5e757f553b344192b8cf9961834+ok-63ac054cbe3d9eee3ac47d41f33186e3+ok-01f6792cb05fb5dd4823a5919a5565b4+ok-3f9fd90d34f591fa205eb235b1efe36c+ok-684882d2b787328dc582a7676a8baa67+ok-7c6ccc4d705a480fdba2c6c73047cd5c+ok-b137e5af83e68f2c52af9bfe3976a977+ok-c8b2bd3f66486fc47ad98d7b6a06f713+ok-e824d350b4502a9897275820fced46bf+ok-dc49b24056e793c2f61f129b80417e1a+ok:passed:20151207-223859:1.325386646 
+T_512json_identification:e2c1bd814ea805d9711a3875e646a51d+ok-d815a6390e25b861d21093ef66f17191+ok-26c87eebc7778cb59dee0d0bb328e8f4+ok-a86b43982a842c6b9c8572534e40dcca+ok-9167cad47e1f91998e3622e6d3aef1c1+ok-e6e342718925ef84c9c65288d477ac76+ok-59c47f40f7c35d90a4a8ba0f15aebf5b+ok-02bf83f6c3e2e80871c5c4bcd3a85551+ok-63ac054cbe3d9eee3ac47d41f33186e3+ok-01f6792cb05fb5dd4823a5919a5565b4+ok-3f9fd90d34f591fa205eb235b1efe36c+ok-684882d2b787328dc582a7676a8baa67+ok-7c6ccc4d705a480fdba2c6c73047cd5c+ok-b137e5af83e68f2c52af9bfe3976a977+ok-c8b2bd3f66486fc47ad98d7b6a06f713+ok-e824d350b4502a9897275820fced46bf+ok-dc49b24056e793c2f61f129b80417e1a+ok:passed:20151207-223859:1.325386646 T_513vp9_10bit_key_frame_detection:3bdaa369dc5af73ced610d978f3bd53d:passed:20151208-224613:0.267556245 T_514remove_track_statistics_tags_during_remux:f262df87ee15d60bbbe30ec5e4dea073-4342871017061370ac0989a9bb71e5c6-75205f286329069b201e4d5745f2cae4:passed:20151215-134129:1.426290351 T_515aac_sampling_frequency_8000_is_not_sbr:545f3eae0c4163d31de81b3bf921e639:passed:20151219-130357:0.066237884 @@ -435,7 +435,7 @@ T_586h265_invalid_default_display_window_in_sps_vui:f49d79d17235b95a154b5d951e48 T_587X_ssa_ass_shorter_non_standard_event_format:8a247e76b55536c66e8f0c6b03b14de7:passed:20170320-133053:0.011749409 T_588h265_must_copy_bitstream_restriction_info_in_vui_parameters:73adcf66e93a909ca150885fd5f1eb4d:passed:20170330-194958:0.579648793 T_589h264_forcing_default_duration_in_fields_with_source_matroska:57ec2c6f2b5f526a0bc0ae4b7d58a7fc-40000000+40000000+true-21388568453e93db9c2d8f57af26eb0b-20000000+20000000+true-6122f1005d1f64f9958f275220251ed9-30000000+30000000+true-6ee9a2e29ac6cd451bb971daf7b3a41e-60000000+60000000+true-205658c0f0458073f94f06a6b300ebb0-20000000+20000000+true:passed:20170331-165013:0.244234938 -T_590invalid_track_language_elements:39273b8f3b06c67c444c698d4c206847-fbb7ad6e65f0d85cacefd7765605757a-5193400637dc65f4028be97aea853e4b:passed:20170404-191832:0.036965692 
+T_590invalid_track_language_elements:39273b8f3b06c67c444c698d4c206847-fbb7ad6e65f0d85cacefd7765605757a-4c72a48d41670a0afa856d30c4d1d820:passed:20170404-191832:0.036965692 T_591hevc_wrong_number_of_parameter_sets:74d06ae9a994edf652865fb12e75f09c:passed:20170412-165246:0.341087191 T_592mpeg_ts_aac_wrong_track_parameters_detected:25116993128e73fe9251dc7161ae8030:passed:20170412-225238:0.044257474 T_593flac_with_picture_metadata:c5779b653e274bbb49d3ddf0a274c63c-c58da16285f056972ce09e617e0bd19e-998802dac83743b286c37a681742f296-7cda56d8aceb15753fc915338f1c0fbb-0e4d2b364f8e535d64286ea154948709:passed:20170415-182414:0.302331784