From 5934eddc758d3ea4ee0a1fecd7f081312c0f2e9f Mon Sep 17 00:00:00 2001 From: Moritz Bunkus Date: Fri, 1 Dec 2017 14:37:16 +0100 Subject: [PATCH] mm_text_io_c: fix seeking in UTF-16/-32 encoded files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `getline` function tries to handle different line ending styles (carriage returns, new lines, a mix of both). For that it has to probe one more character after having found one of them. If that probed character is not one of them, then the previous position has to be restored — and that was wrongfully assuming that each character is only one byte long. Which it isn't. Fixes #2160. --- NEWS.md | 3 +++ src/common/mm_io.cpp | 8 +++++--- tests/results.txt | 1 + .../test-623text_files_utf16le_different_line_endings.rb | 5 +++++ 4 files changed, 14 insertions(+), 3 deletions(-) create mode 100755 tests/test-623text_files_utf16le_different_line_endings.rb diff --git a/NEWS.md b/NEWS.md index 97ad96f28..07c14f9eb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -19,6 +19,9 @@ * mkvmerge, GUI's multiplexer: MIME types: added the `font` top-level media types from RFC 8081. This means that the following new MIME types for fonts can be used: `font/ttf`, `font/otf`, `font/woff` and `font/woff2`. +* mkvmerge: fixed reading text files encoded in UTF-16 or UTF-32 that have + different forms of line endings (new lines, carriage returns or a mix of + both). Fixes #2160. 
# Version 18.0.0 "Apricity" 2017-11-18 diff --git a/src/common/mm_io.cpp b/src/common/mm_io.cpp index d34562794..c2a8d03dc 100644 --- a/src/common/mm_io.cpp +++ b/src/common/mm_io.cpp @@ -1086,13 +1086,15 @@ mm_text_io_c::getline(boost::optional max_chars) { while (1) { memset(utf8char, 0, 9); - int len = read_next_char(utf8char); + auto previous_pos = getFilePointer(); + auto len = read_next_char(utf8char); + if (0 == len) return s; if ((1 == len) && (utf8char[0] == '\r')) { if (previous_was_carriage_return && !m_uses_newlines) { - setFilePointer(-1, seek_current); + setFilePointer(previous_pos); return s; } @@ -1104,7 +1106,7 @@ mm_text_io_c::getline(boost::optional max_chars) { return s; if (previous_was_carriage_return) { - setFilePointer(-len, seek_current); + setFilePointer(previous_pos); return s; } diff --git a/tests/results.txt b/tests/results.txt index e5e837e1a..4140cdafe 100644 --- a/tests/results.txt +++ b/tests/results.txt @@ -468,3 +468,4 @@ T_619ac_3_misdetected_as_mpeg_ps_and_encrypted:795e9be4c1601e9853378a1fee1bfd01: T_620ac3_incomplete_frame_with_timestamp_from_matroska:b2fa8c28c5a45d40460905464e3a3d5f:passed:20171014-153427:0.397688103 T_621propedit_remove_date:fdfebfa48bbd5fc21088827b0ad8f616-ok:passed:20171101-180348:0.062479826 T_622aac_adts_8_channels_no_pce:76a81307fdd14e0c033ea8e9b42a2b78-ok:passed:20171117-190136:0.053130324 +T_623text_files_utf16le_different_line_endings:ed339cd48ef4350f1e1e52eb49af3543:passed:20171201-143509:0.014821508 diff --git a/tests/test-623text_files_utf16le_different_line_endings.rb b/tests/test-623text_files_utf16le_different_line_endings.rb new file mode 100755 index 000000000..e3275929b --- /dev/null +++ b/tests/test-623text_files_utf16le_different_line_endings.rb @@ -0,0 +1,5 @@ +#!/usr/bin/ruby -w + +# T_623text_files_utf16le_different_line_endings +describe "mkvmerge / text files encoded in UTF-16LE with different line ending styles" +test_merge "data/subtitles/srt/utf16le_different_line_ending_styles.srt"