mirror of
https://gitlab.com/mbunkus/mkvtoolnix.git
synced 2024-12-21 18:45:49 +00:00
JSON identification: replace invalid UTF-8 bytes with placeholder characters
Version 2.x of nlohmann's JSON library used to strip bytes that aren't valid UTF-8 from strings (e.g. service/station names in MPEG transport streams) before outputting the JSON data structures. With MKVToolNix v24 the bundled library was updated to v3, which now throws an exception on such invalid data. Therefore mkvmerge now takes care of replacing invalid bytes with placeholder characters itself before passing the strings to nlohmann's JSON library. Fixes #2327.
This commit is contained in:
parent
1ca6c7f8d9
commit
4eefc50110
4
NEWS.md
4
NEWS.md
@ -20,6 +20,10 @@
|
||||
MPLS's start and end timestamps against the transport stream's PTS instead
|
||||
of its DTS. Otherwise the first key frame of a video track might be dropped
|
||||
if it isn't the first in presentation order. Fixes #2321.
|
||||
* mkvmerge: JSON identification: mkvmerge will ensure that all strings passed
|
||||
to the JSON output modules are valid UTF-8 encoded strings by replacing
|
||||
invalid bytes with placeholder characters. This avoids the JSON library
|
||||
throwing an exception and mkvmerge aborting on such data. Fixes #2327.
|
||||
|
||||
|
||||
# Version 24.0.0 "Beyond The Pale" 2018-06-10
|
||||
|
@ -16,6 +16,8 @@ std::vector<int> utf32result;
|
||||
std::vector<unsigned char> utf8result;
|
||||
utf8::utf8to32(s.begin(),s.end(),std::back_inserter(utf32result));
|
||||
utf8::utf32to8(utf32result.begin(),utf32result.end(),std::back_inserter(utf8result));
|
||||
std::string temp;
|
||||
utf8::replace_invalid(s.begin(), s.end(), std::back_inserter(temp));
|
||||
])],[ac_cv_utf8cpp=yes],[ac_cv_utf8cpp=no])
|
||||
fi
|
||||
|
||||
|
@ -21,6 +21,7 @@
|
||||
#include "common/mm_mem_io.h"
|
||||
#include "common/mm_proxy_io.h"
|
||||
#include "common/mm_text_io.h"
|
||||
#include "common/strings/utf8.h"
|
||||
|
||||
namespace mtx { namespace json {
|
||||
|
||||
@ -99,6 +100,21 @@ strip_comments(nlohmann::json::string_t const &data) {
|
||||
return std::string{reinterpret_cast<char *>(out.get_buffer()), static_cast<std::string::size_type>(out.getFilePointer())};
|
||||
}
|
||||
|
||||
void
|
||||
fix_invalid_utf8_recursively(nlohmann::json &json) {
|
||||
if (json.type() == nlohmann::json::value_t::string)
|
||||
json = fix_invalid_utf8(json.get<std::string>());
|
||||
|
||||
else if (json.type() == nlohmann::json::value_t::array) {
|
||||
for (auto &sub_json : json)
|
||||
fix_invalid_utf8_recursively(sub_json);
|
||||
|
||||
} else if (json.type() == nlohmann::json::value_t::object) {
|
||||
for (auto it = json.begin(), end = json.end(); it != end; ++it)
|
||||
fix_invalid_utf8_recursively(it.value());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
nlohmann::json
|
||||
@ -116,7 +132,10 @@ dump(nlohmann::json const &json,
|
||||
auto old_locale = std::string{::setlocale(LC_NUMERIC, "C")};
|
||||
at_scope_exit_c restore_locale{ [&old_locale]() { ::setlocale(LC_NUMERIC, old_locale.c_str()); } };
|
||||
|
||||
return json.dump(indentation);
|
||||
auto json_fixed = json;
|
||||
fix_invalid_utf8_recursively(json_fixed);
|
||||
|
||||
return json_fixed.dump(indentation);
|
||||
}
|
||||
|
||||
}} // namespace mtx::json
|
||||
|
@ -40,6 +40,14 @@ to_utf8(const std::wstring &source) {
|
||||
return destination;
|
||||
}
|
||||
|
||||
std::string
|
||||
fix_invalid_utf8(std::string const &str) {
|
||||
std::string temp;
|
||||
|
||||
::utf8::replace_invalid(str.begin(), str.end(), std::back_inserter(temp));
|
||||
return temp;
|
||||
}
|
||||
|
||||
size_t
|
||||
get_width_in_em(const std::wstring &s) {
|
||||
size_t width = 0;
|
||||
|
@ -83,5 +83,7 @@ to_utf8(::libebml::UTFstring const &source) {
|
||||
return source.GetUTF8();
|
||||
}
|
||||
|
||||
// Returns a copy of source with invalid UTF-8 byte sequences replaced by placeholder characters.
std::string fix_invalid_utf8(std::string const &source);
|
||||
|
||||
size_t get_width_in_em(wchar_t c);
|
||||
size_t get_width_in_em(const std::wstring &s);
|
||||
|
@ -488,3 +488,4 @@ T_639vobsub_missing_duration_in_matroska:1da346761ddede951b1f2f5f11cd3f69:passed
|
||||
T_640no_date:20fbba5a79a436caa36582e7675c1038:passed:20180427-231324:0.011724657
|
||||
T_641keep_display_unit:f3c2b4b8f52f2d29025052a6be092451-3+16x9-2392d8546623ab6265c1cdbc1cab4739-0+123x456:passed:20180609-123022:0.062045824
|
||||
T_642avc_es_clear_internal_buffers_after_reading_headers:0aaf44ccf543a6a739fd79ab74eed9c3:passed:20180613-174520:0.02386315
|
||||
T_643mpeg_ts_bad_utf8_in_service_names:f967b2bf3fb4265ec723f14eb667bb9a:passed:20180615-172956:0.019394331
|
||||
|
5
tests/test-643mpeg_ts_bad_utf8_in_service_names.rb
Executable file
5
tests/test-643mpeg_ts_bad_utf8_in_service_names.rb
Executable file
@ -0,0 +1,5 @@
|
||||
#!/usr/bin/ruby -w
|
||||
|
||||
# T_643mpeg_ts_bad_utf8_in_service_names
|
||||
describe "mkvmerge / identification of MPEG transport streams with invalid UTF-8 strings for service names"
|
||||
test_identify "data/ts/bad_utf-8_service_names.ts"
|
Loading…
Reference in New Issue
Block a user