JSON identification: replace invalid UTF-8 bytes with placeholder characters

Version 2.x of nlohmann's JSON library used to strip bytes that
aren't valid UTF-8 from strings (e.g. service/station names in MPEG
transport streams) before outputting the JSON data structures. With
MKVToolNix v24 the bundled library was updated to v3, which now throws
an exception on such invalid data.

Therefore mkvmerge now takes care of replacing invalid bytes with
placeholder characters itself before passing the strings to nlohmann's
JSON library.

Fixes #2327.
This commit is contained in:
Moritz Bunkus 2018-06-15 17:32:34 +02:00
parent 1ca6c7f8d9
commit 4eefc50110
No known key found for this signature in database
GPG Key ID: 74AF00ADF2E32C85
7 changed files with 42 additions and 1 deletions

View File

@ -20,6 +20,10 @@
MPLS's start and end timestamps against the transport stream's PTS instead
of its DTS. Otherwise the first key frame of a video track might be dropped
if it isn't the first in presentation order. Fixes #2321.
* mkvmerge: JSON identification: mkvmerge will ensure that all strings passed
to the JSON output modules are valid UTF-8 encoded strings by replacing
invalid bytes with placeholder characters. This avoids the JSON library
throwing an exception and mkvmerge aborting on such data. Fixes #2327.
# Version 24.0.0 "Beyond The Pale" 2018-06-10

View File

@ -16,6 +16,8 @@ std::vector<int> utf32result;
std::vector<unsigned char> utf8result;
utf8::utf8to32(s.begin(),s.end(),std::back_inserter(utf32result));
utf8::utf32to8(utf32result.begin(),utf32result.end(),std::back_inserter(utf8result));
std::string temp;
utf8::replace_invalid(s.begin(), s.end(), std::back_inserter(temp));
])],[ac_cv_utf8cpp=yes],[ac_cv_utf8cpp=no])
fi

View File

@ -21,6 +21,7 @@
#include "common/mm_mem_io.h"
#include "common/mm_proxy_io.h"
#include "common/mm_text_io.h"
#include "common/strings/utf8.h"
namespace mtx { namespace json {
@ -99,6 +100,21 @@ strip_comments(nlohmann::json::string_t const &data) {
return std::string{reinterpret_cast<char *>(out.get_buffer()), static_cast<std::string::size_type>(out.getFilePointer())};
}
// Walks the given JSON value in place and passes every string value found in
// it (directly, or nested inside arrays and objects) through
// fix_invalid_utf8(). Values of any other type are left untouched. Only the
// values of object members are visited; member names are not rewritten.
void
fix_invalid_utf8_recursively(nlohmann::json &json) {
  switch (json.type()) {
    case nlohmann::json::value_t::string:
      json = fix_invalid_utf8(json.get<std::string>());
      break;

    case nlohmann::json::value_t::array:
    case nlohmann::json::value_t::object:
      // Range-based iteration yields the elements of an array and the member
      // values of an object, so a single loop serves both container types.
      for (auto &child : json)
        fix_invalid_utf8_recursively(child);
      break;

    default:
      // Numbers, booleans, null etc. cannot carry invalid UTF-8.
      break;
  }
}
}
nlohmann::json
@ -116,7 +132,10 @@ dump(nlohmann::json const &json,
auto old_locale = std::string{::setlocale(LC_NUMERIC, "C")};
at_scope_exit_c restore_locale{ [&old_locale]() { ::setlocale(LC_NUMERIC, old_locale.c_str()); } };
return json.dump(indentation);
auto json_fixed = json;
fix_invalid_utf8_recursively(json_fixed);
return json_fixed.dump(indentation);
}
}} // namespace mtx::json

View File

@ -40,6 +40,14 @@ to_utf8(const std::wstring &source) {
return destination;
}
// Returns a copy of str that has been run through utf8::replace_invalid(),
// i.e. with byte sequences that are not valid UTF-8 substituted by that
// library's replacement character. The argument itself is not modified.
std::string
fix_invalid_utf8(std::string const &str) {
  auto sanitized = std::string{};
  ::utf8::replace_invalid(str.begin(), str.end(), std::back_inserter(sanitized));

  return sanitized;
}
size_t
get_width_in_em(const std::wstring &s) {
size_t width = 0;

View File

@ -83,5 +83,7 @@ to_utf8(::libebml::UTFstring const &source) {
return source.GetUTF8();
}
// Returns a copy of source in which invalid UTF-8 byte sequences have been
// replaced by placeholder characters.
std::string fix_invalid_utf8(std::string const &source);
// NOTE(review): presumably these return a display width measured in "em"
// units for a single wide character and for a whole wide string — confirm
// against the implementation.
size_t get_width_in_em(wchar_t c);
size_t get_width_in_em(const std::wstring &s);

View File

@ -488,3 +488,4 @@ T_639vobsub_missing_duration_in_matroska:1da346761ddede951b1f2f5f11cd3f69:passed
T_640no_date:20fbba5a79a436caa36582e7675c1038:passed:20180427-231324:0.011724657
T_641keep_display_unit:f3c2b4b8f52f2d29025052a6be092451-3+16x9-2392d8546623ab6265c1cdbc1cab4739-0+123x456:passed:20180609-123022:0.062045824
T_642avc_es_clear_internal_buffers_after_reading_headers:0aaf44ccf543a6a739fd79ab74eed9c3:passed:20180613-174520:0.02386315
T_643mpeg_ts_bad_utf8_in_service_names:f967b2bf3fb4265ec723f14eb667bb9a:passed:20180615-172956:0.019394331

View File

@ -0,0 +1,5 @@
#!/usr/bin/ruby -w
# T_643mpeg_ts_bad_utf8_in_service_names
#
# Regression test for identifying an MPEG transport stream whose service
# names contain bytes that are not valid UTF-8.
# NOTE(review): `describe` and `test_identify` come from the test harness and
# are not defined in this file; presumably `test_identify` runs mkvmerge's
# identification on the given file and checksums the output — confirm.
describe "mkvmerge / identification of MPEG transport streams with invalid UTF-8 strings for service names"
test_identify "data/ts/bad_utf-8_service_names.ts"