diff --git a/ChangeLog b/ChangeLog
index ea6c564d6..51cad0c41 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,11 @@
 2003-09-16 Moritz Bunkus
 
+	* mkvmerge: bugfix: The SRT reader would abort if there was more
+	than one empty line between subtitle entries.
+
+	* mkvextract: bugfix: Proper BOMs are written according to the
+	desired charset when extracting text subtitles.
+
 	* Released v0.7.0.
 
 	* mkvextract: Add an UTF-8 BOM to extracted SSA/ASS and SRT
diff --git a/src/common.cpp b/src/common.cpp
index 90404188e..d4a8b62ea 100644
--- a/src/common.cpp
+++ b/src/common.cpp
@@ -531,6 +531,21 @@ char *to_utf8(int handle, const char *local) {
   return convert_charset(kax_convs[handle].ict_to_utf8, local);
 }
 
+/*
+ * In-place variant for strings: replaces the contents of 'local' with
+ * the result of the char * overload of to_utf8() and returns 'local'.
+ * NOTE(review): assumes that overload never returns NULL -- confirm.
+ */
+string &to_utf8(int handle, string &local) {
+  char *cutf8;
+
+  cutf8 = to_utf8(handle, local.c_str());
+  local = cutf8;
+  safefree(cutf8);
+
+  return local;
+}
+
 char *from_utf8(int handle, const char *utf8) {
   char *copy;
 
@@ -546,6 +561,21 @@ char *from_utf8(int handle, const char *utf8) {
   return convert_charset(kax_convs[handle].ict_from_utf8, utf8);
 }
 
+/*
+ * In-place variant for strings: replaces the contents of 'utf8' with
+ * the result of the char * overload of from_utf8() and returns 'utf8'.
+ * NOTE(review): assumes that overload never returns NULL -- confirm.
+ */
+string &from_utf8(int handle, string &utf8) {
+  char *clocal;
+
+  clocal = from_utf8(handle, utf8.c_str());
+  utf8 = clocal;
+  safefree(clocal);
+
+  return utf8;
+}
+
 /*
  * Random unique uint32_t numbers
  */
diff --git a/src/common.h b/src/common.h
index 6558f869d..48c9f3f09 100644
--- a/src/common.h
+++ b/src/common.h
@@ -116,6 +116,8 @@ int utf8_init(const char *charset);
 void utf8_done();
 char *to_utf8(int handle, const char *local);
 char *from_utf8(int handle, const char *utf8);
+string &to_utf8(int handle, string &local);
+string &from_utf8(int handle, string &utf8);
 
 void clear_list_of_unique_uint32();
 bool is_unique_uint32(uint32_t number);
diff --git a/src/mkvextract.cpp b/src/mkvextract.cpp
index f0b7152c8..7480b9f5a 100644
--- a/src/mkvextract.cpp
+++ b/src/mkvextract.cpp
@@ -154,11 +154,12 @@ static bool chapter_format_simple = false;
 
 void parse_args(int argc, char **argv, char *&file_name, int &mode) {
int i, conv_handle; - char *colon, *copy; + char *colon, *copy, *sub_charset; int64_t tid; kax_track_t track; file_name = NULL; + sub_charset = NULL; verbose = 0; if (argc < 2) { @@ -208,6 +209,7 @@ void parse_args(int argc, char **argv, char *&file_name, int &mode) { mxerror("-c lacks a charset.\n"); conv_handle = utf8_init(argv[i + 1]); + sub_charset = argv[i + 1]; i++; } else if (mode == MODE_TAGS) @@ -239,6 +241,7 @@ void parse_args(int argc, char **argv, char *&file_name, int &mode) { track.tid = tid; track.out_name = safestrdup(colon); track.conv_handle = conv_handle; + track.sub_charset = safestrdup(sub_charset); tracks.push_back(track); safefree(copy); } diff --git a/src/mkvextract.h b/src/mkvextract.h index 94a55ca53..baa04987b 100644 --- a/src/mkvextract.h +++ b/src/mkvextract.h @@ -70,6 +70,7 @@ typedef struct { int64_t default_duration; int srt_num; + char *sub_charset; int conv_handle; vector ssa_lines; bool warning_printed; diff --git a/src/mkvextract_tracks.cpp b/src/mkvextract_tracks.cpp index 9f1fcde0a..9cccffeb9 100644 --- a/src/mkvextract_tracks.cpp +++ b/src/mkvextract_tracks.cpp @@ -111,7 +111,6 @@ static void create_output_files() { bool something_to_do, is_ok; unsigned char *c; ogg_packet op; - const unsigned char utf8_bom[3] = {0xef, 0xbb, 0xbf}; something_to_do = false; @@ -393,24 +392,47 @@ static void create_output_files() { } else if (tracks[i].type == TYPESRT) { tracks[i].srt_num = 1; - tracks[i].out->write(utf8_bom, 3); + tracks[i].out->write_bom(tracks[i].sub_charset); } else if (tracks[i].type == TYPESSA) { char *s; unsigned char *pd; + int bom_len; + string sconv; + + pd = (unsigned char *)tracks[i].private_data; + bom_len = 0; + // Skip any BOM that might be present. 
+        if ((tracks[i].private_size > 3) &&
+            (pd[0] == 0xef) && (pd[1] == 0xbb) && (pd[2] == 0xbf))
+          bom_len = 3;
+        else if ((tracks[i].private_size > 4) &&
+                 (pd[0] == 0xff) && (pd[1] == 0xfe) &&
+                 (pd[2] == 0x00) && (pd[3] == 0x00))
+          bom_len = 4;
+        else if ((tracks[i].private_size > 4) &&
+                 (pd[0] == 0x00) && (pd[1] == 0x00) &&
+                 (pd[2] == 0xfe) && (pd[3] == 0xff))
+          bom_len = 4;
+        else if ((tracks[i].private_size > 2) &&
+                 (pd[0] == 0xff) && (pd[1] == 0xfe))
+          bom_len = 2;
+        else if ((tracks[i].private_size > 2) &&
+                 (pd[0] == 0xfe) && (pd[1] == 0xff))
+          bom_len = 2;
+        pd += bom_len;
+        tracks[i].private_size -= bom_len;
 
         s = (char *)safemalloc(tracks[i].private_size + 1);
-        memcpy(s, tracks[i].private_data, tracks[i].private_size);
+        memcpy(s, pd, tracks[i].private_size);
         s[tracks[i].private_size] = 0;
-        pd = (unsigned char *)tracks[i].private_data;
-        if ((pd[0] != 0x00) && (pd[0] != 0xef) && (pd[0] != 0xff))
-          tracks[i].out->write(utf8_bom, 3);
-        tracks[i].out->puts_unl(s);
-        tracks[i].out->puts_unl("\n[Events]\nFormat: Marked, Start, End, "
-                                "Style, Name, MarginL, MarginR, MarginV, "
-                                "Effect, Text\n");
-
+        sconv = s;
         safefree(s);
+        tracks[i].out->write_bom(tracks[i].sub_charset);
+        sconv += "\n[Events]\nFormat: Marked, Start, End, "
+          "Style, Name, MarginL, MarginR, MarginV, Effect, Text\n";
+        from_utf8(tracks[i].conv_handle, sconv);
+        tracks[i].out->puts_unl(sconv.c_str());
       }
     }
   }
@@ -583,9 +605,8 @@ static void handle_data(KaxBlock *block, int64_t block_duration,
           fields[7] + comma;    // Effect
 
         // Do the charset conversion.
-        s = from_utf8(tracks[i].conv_handle, fields[8].c_str());
-        line += string(s) + "\n";
-        safefree(s);
+        line += fields[8] + "\n";
+        from_utf8(tracks[i].conv_handle, line);
 
         // Now store that entry.
ssa_line.num = num; diff --git a/src/mm_io.cpp b/src/mm_io.cpp index 5110afe0a..a1b01a09c 100644 --- a/src/mm_io.cpp +++ b/src/mm_io.cpp @@ -374,6 +374,41 @@ bool mm_io_c::restore_pos() { return true; } +bool mm_io_c::write_bom(const char *charset) { + const unsigned char utf8_bom[3] = {0xef, 0xbb, 0xbf}; + const unsigned char utf16le_bom[2] = {0xff, 0xfe}; + const unsigned char utf16be_bom[2] = {0xfe, 0xff}; + const unsigned char utf32le_bom[4] = {0xff, 0xfe, 0x00, 0x00}; + const unsigned char utf32be_bom[4] = {0x00, 0x00, 0xff, 0xfe}; + const unsigned char *bom; + int bom_len; + + if (charset == NULL) + return false; + + if (!strcmp(charset, "UTF-8") || !strcmp(charset, "UTF8")) { + bom_len = 3; + bom = utf8_bom; + } else if (!strcmp(charset, "UTF-16") || !strcmp(charset, "UTF-16LE") || + !strcmp(charset, "UTF16") || !strcmp(charset, "UTF16LE")) { + bom_len = 2; + bom = utf16le_bom; + } else if (!strcmp(charset, "UTF-16BE") || !strcmp(charset, "UTF16BE")) { + bom_len = 2; + bom = utf16be_bom; + } else if (!strcmp(charset, "UTF-32") || !strcmp(charset, "UTF-32LE") || + !strcmp(charset, "UTF32") || !strcmp(charset, "UTF32LE")) { + bom_len = 4; + bom = utf32le_bom; + } else if (!strcmp(charset, "UTF-32BE") || !strcmp(charset, "UTF32BE")) { + bom_len = 4; + bom = utf32be_bom; + } else + return false; + + return (write(bom, bom_len) == bom_len); +} + /* * Dummy class for output to /dev/null. Needed for two pass stuff. 
*/ diff --git a/src/mm_io.h b/src/mm_io.h index eb86f3fea..1b2252412 100644 --- a/src/mm_io.h +++ b/src/mm_io.h @@ -66,6 +66,7 @@ public: virtual string getline(); virtual bool getline2(string &s); virtual size_t puts_unl(const char *s); + virtual bool write_bom(const char *charset); virtual void save_pos(int64_t new_pos = -1); virtual bool restore_pos(); diff --git a/src/r_srt.cpp b/src/r_srt.cpp index 4cc9eee2c..8238b921b 100644 --- a/src/r_srt.cpp +++ b/src/r_srt.cpp @@ -99,6 +99,9 @@ int srt_reader_c::read(generic_packetizer_c *) { while (1) { if (!mm_io->getline2(s)) break; + strip(s); + if (s.length() == 0) + continue; if (!mm_io->getline2(s)) break; if ((s.length() < 29) || !issrttimecode(s.c_str()))