Support for choosing the charset and language used in simple chapter files. Suggestion by Liisachan.

2024-12-23 19:31:44 +00:00 · 2003-08-28 09:10:45 +00:00 · 2003-08-28 09:10:45 +00:00 · 2d74dde71b
commit 2d74dde71b
parent 30831506f4
5 changed files with 102 additions and 12 deletions
--- a/5
+++ b/5
@ -1,7 +1,10 @@
 2003-08-28  Moritz Bunkus  <moritz@bunkus.org>

+        * mkvmerge: Support for choosing the charset and language used in
+        simple chapter files. Suggestion by Liisachan.
+
        * Rewrote the UTF-8 conversion routines. They should now handle
-        U+8000 characters correctly.
+        U+8000 characters correctly. Reported by Liisachan.

        * mkvmerge: Real reader: For RV40 the actual dimensions were also
        used for the aspect ratio/display dimensions. This has been fixed:
--- a/doc/mkvmerge.1
+++ b/doc/mkvmerge.1
@ -34,6 +34,16 @@ Write to the file '\fIout\fR'.
 \fB\-\-title\fR <\fItitle\fR>
 Sets the general title for the output file, e.g. the movie name.
 .TP
+\fB\-\-chapter\-language\fR <\fBlanguage\fR>
+Sets the ISO639-2 language code that is written for each chapter entry. Applies
+only to simple chapter files. Defaults to "eng". See the section about chapters
+below for details.
+.TP
+\fB\-\-chapter\-charset\fR <\fBcharset\fR>
+Sets the charset that is used for the conversion to UTF-8 for simple chapter
+files. Defaults to the current system locale. See the section about chapters
+below for details.
+.TP
 \fB\-\-chapters <\fIfile\fR>
 Read chapter information from the \fIfile\fR. See the section about chapters
 below for details.
@ -575,6 +585,14 @@ into one Matroska \fIChapterAtom\fR. It does not set any
 \fIChapterTrackNumber\fR which means that the chapters all apply to all
 tracks in the file.
 .LP
+The charset used in the file is assumed to be the same charset that the
+current system's locale returns. If this is not the case then the switch
+\fI\-\-chapter\-charset\fR should be used. If the file contains a valid
+BOM (byte order marker) then all UTF styles are converted automatically.
+In this case \fI\-\-chapter\-charset\fR is simply ignored. You can use
+\fBmkvinfo\fR or \fBmkvextract\fR to verify that the chapter names have
+been converted properly.
+.LP
 When splitting files \fBmkvmerge\fR will correctly adjust the chapters as
 well. This means that each file only includes the chapter entries that
 apply to it, and that the timecodes will be offset to match the new timecodes
--- a/src/chapters.cpp
+++ b/src/chapters.cpp
@ -127,14 +127,19 @@ static bool probe_simple_chapters(mm_text_io_c *in) {
 // CHAPTER01NAME=Hallo Welt

 static KaxChapters *parse_simple_chapters(mm_text_io_c *in, int64_t min_tc,
-                                          int64_t max_tc, int64_t offset) {
+                                          int64_t max_tc, int64_t offset,
+                                          const char *language,
+                                          const char *charset) {
  KaxChapters *chaps;
  KaxEditionEntry *edition;
  KaxChapterAtom *atom;
  KaxChapterDisplay *display;
  int64_t start, hour, minute, second, msecs;
  string name, line;
-  int mode, num;
+  int mode, num, cc_utf8;
+  bool do_convert;
+  char *recoded_string;
+  UTFstring wchar_string;

  in->setFilePointer(0);
  chaps = new KaxChapters;
@ -144,6 +149,16 @@ static KaxChapters *parse_simple_chapters(mm_text_io_c *in, int64_t min_tc,
  edition = NULL;
  num = 0;

+  if (in->get_byte_order() == BO_NONE) {
+    do_convert = true;
+    cc_utf8 = utf8_init(charset);
+
+  } else
+    do_convert = false;
+
+  if (language == NULL)
+    language = "eng";
+
  while (in->getline2(line)) {
    strip(line);
    if (line.length() == 0)
@ -182,14 +197,23 @@ static KaxChapters *parse_simple_chapters(mm_text_io_c *in, int64_t min_tc,

        *static_cast<EbmlUInteger *>(&GetChild<KaxChapterUID>(*atom)) =
          create_unique_uint32();
+
        *static_cast<EbmlUInteger *>(&GetChild<KaxChapterTimeStart>(*atom)) =
          (start - offset) * 1000000;
+
        display = &GetChild<KaxChapterDisplay>(*atom);
+
+        if (do_convert) {
+          recoded_string = to_utf8(cc_utf8, name.c_str());
+          wchar_string = cstrutf8_to_UTFstring(recoded_string);
+          safefree(recoded_string);
+        } else
+          wchar_string = cstrutf8_to_UTFstring(name.c_str());
        *static_cast<EbmlUnicodeString *>
-          (&GetChild<KaxChapterString>(*display)) =
-          cstr_to_UTFstring(name.c_str());
+          (&GetChild<KaxChapterString>(*display)) = wchar_string;
+
        *static_cast<EbmlString *>(&GetChild<KaxChapterLanguage>(*display)) =
-          "eng";
+          language;

        num++;
      }
@ -221,7 +245,8 @@ static KaxChapters *parse_xml_chapters(mm_text_io_c *, int64_t, int64_t,
 // }}}

 KaxChapters *parse_chapters(const char *file_name, int64_t min_tc,
-                            int64_t max_tc, int64_t offset) {
+                            int64_t max_tc, int64_t offset,
+                            const char *language, const char *charset) {
  mm_text_io_c *in;

  try {
@ -231,7 +256,8 @@ KaxChapters *parse_chapters(const char *file_name, int64_t min_tc,
  }

  if (probe_simple_chapters(in))
-    return parse_simple_chapters(in, min_tc, max_tc, offset);
+    return parse_simple_chapters(in, min_tc, max_tc, offset, language,
+                                 charset);

  if (probe_xml_chapters(in))
    return parse_xml_chapters(in, min_tc, max_tc, offset);
--- a/src/chapters.h
+++ b/src/chapters.h
@ -28,7 +28,9 @@
 using namespace libmatroska;

 KaxChapters *parse_chapters(const char *file_name, int64_t min_tc = 0,
-                            int64_t max_tc = -1, int64_t offset = 0);
+                            int64_t max_tc = -1, int64_t offset = 0,
+                            const char *language = NULL,
+                            const char *charset = NULL);

 void write_chapters_xml(KaxChapters *chapters, FILE *out);
 void write_chapters_simple(int &chapter_num, KaxChapters *chapters, FILE *out);
--- a/src/mkvmerge.cpp
+++ b/src/mkvmerge.cpp
@ -164,6 +164,8 @@ KaxChapters *kax_chapters = NULL;
 EbmlVoid *kax_chapters_void = NULL;

 char *chapter_file_name = NULL;
+char *chapter_language = NULL;
+char *chapter_charset = NULL;

 string title;

@ -226,6 +228,8 @@ static void usage() {
    "  -o, --output out         Write to the file 'out'.\n"
    "  --title <title>          Title for this output file.\n"
    "  --chapters <file>        Read chapter information from the file.\n"
+    "  --chapter-language <lng> Set the 'language' element in chapter entries."
+    "\n  --chapter-charset <cset> Charset for a simple chapter file.\n"
    "  --global-tags <file>     Read global tags from a XML file.\n"
    "\n General output control (still global, advanced options):\n"
    "  --cluster-length <n[ms]> Put at most n data blocks into each cluster.\n"
@ -1248,16 +1252,52 @@ static void parse_args(int argc, char **argv) {
      parse_and_add_tags(next_arg);
      i++;

+    } else if (!strcmp(this_arg, "--chapter-language")) {
+      if (next_arg == NULL)
+        mxerror("'--chapter-language' lacks the language.\n");
+
+      if (chapter_language != NULL)
+        mxerror("'--chapter-language' may only be given once in '"
+                "--chapter-language %s'.\n", next_arg);
+
+      if (chapter_file_name != NULL)
+        mxerror("'--chapter-language' must be given before '--chapters' in "
+                "'--chapter-language %s'.\n", next_arg);
+
+      if (!is_valid_iso639_2_code(next_arg))
+        mxerror("'%s' is not a valid ISO639-2 language code. Run "
+                "'mkvmerge --list-languages' for a complete list of language "
+                "codes.\n", next_arg);
+
+      chapter_language = safestrdup(next_arg);
+      i++;
+
+    } else if (!strcmp(this_arg, "--chapter-charset")) {
+      if (next_arg == NULL)
+        mxerror("'--chapter-charset' lacks the charset.\n");
+
+      if (chapter_charset != NULL)
+        mxerror("'--chapter-charset' may only be given once in '"
+                "--chapter-charset %s'.\n", next_arg);
+
+      if (chapter_file_name != NULL)
+        mxerror("'--chapter-charset' must be given before '--chapters' in "
+                "'--chapter-charset %s'.\n", next_arg);
+
+      chapter_charset = safestrdup(next_arg);
+      i++;
+
    } else if (!strcmp(this_arg, "--chapters")) {
      if (next_arg == NULL)
        mxerror("'--chapters' lacks the file name.\n");

-      if (kax_chapters != NULL)
+      if (chapter_file_name != NULL)
        mxerror("Only one chapter file allowed in '%s %s'.\n", this_arg,
                next_arg);

      chapter_file_name = safestrdup(next_arg);
-      kax_chapters = parse_chapters(next_arg);
+      kax_chapters = parse_chapters(next_arg, 0, -1, 0, chapter_language,
+                                    chapter_charset);
      i++;

    } else if (!strcmp(this_arg, "--dump-packets")) {
@ -1782,7 +1822,8 @@ void finish_file() {
    start = cluster_helper->get_first_timecode() + offset;
    end = cluster_helper->get_max_timecode() + offset;

-    chapters_here = parse_chapters(chapter_file_name, start, end, offset);
+    chapters_here = parse_chapters(chapter_file_name, start, end, offset,
+                                   chapter_language, chapter_charset);
    if (chapters_here != NULL)
      kax_chapters_void->ReplaceWith(*chapters_here, *out, true);
    delete kax_chapters_void;