Made the SSA reader more spec compliant. Made the charset recoding in the textsubs packetizer optional: it is now done only if the source is an SRT or OGM file, not if the source is a Matroska file (already UTF-8) or an SSA/ASS file (the reader will recode). Renamed the long writeline_unix_newlines() to puts_unl().

2024-12-24 20:01:53 +00:00 · 2003-06-21 23:24:07 +00:00 · 2003-06-21 23:24:07 +00:00 · 95a6f7f9d4
commit 95a6f7f9d4
parent d52e2f51bf
10 changed files with 115 additions and 58 deletions
--- a/mkvextract.cpp
+++ b/mkvextract.cpp
@ -510,7 +510,7 @@ void create_output_files() {
          s = (char *)safemalloc(tracks[i].private_size + 1);
          memcpy(s, tracks[i].private_data, tracks[i].private_size);
          s[tracks[i].private_size] = 0;
-          tracks[i].mm_io->writeline_unix_newlines(s);
+          tracks[i].mm_io->puts_unl(s);
          safefree(s);
        }
      }
@ -596,7 +596,7 @@ void handle_data(KaxBlock *block, int64_t block_duration, bool has_ref) {
        tracks[i].mm_io->write(buffer, strlen(buffer));

        // Print the text itself.
-        tracks[i].mm_io->writeline_unix_newlines(s);
+        tracks[i].mm_io->puts_unl(s);
        safefree(s);
        break;

@ -623,22 +623,22 @@ void handle_data(KaxBlock *block, int64_t block_duration, bool has_ref) {
        }

        // Print "Dialogue: "
-        tracks[i].mm_io->writeline_unix_newlines("Dialogue: ");
+        tracks[i].mm_io->puts_unl("Dialogue: ");
        *s2 = 0;
        s2++;
-        tracks[i].mm_io->writeline_unix_newlines(s);
-        tracks[i].mm_io->writeline_unix_newlines(",");
+        tracks[i].mm_io->puts_unl(s);
+        tracks[i].mm_io->puts_unl(",");
        sprintf(buffer, "%lld:%02lld:%02lld.%02lld",
                start / 1000 / 60 / 60, (start / 1000 / 60) % 60,
                (start / 1000) % 60, (start % 1000) / 10);
-        tracks[i].mm_io->writeline_unix_newlines(buffer);
-        tracks[i].mm_io->writeline_unix_newlines(",");
+        tracks[i].mm_io->puts_unl(buffer);
+        tracks[i].mm_io->puts_unl(",");
        sprintf(buffer, "%lld:%02lld:%02lld.%02lld",
                end / 1000 / 60 / 60, (end / 1000 / 60) % 60,
                (end / 1000) % 60, (end % 1000) / 10);
-        tracks[i].mm_io->writeline_unix_newlines(buffer);
-        tracks[i].mm_io->writeline_unix_newlines(",");
-        tracks[i].mm_io->writeline_unix_newlines(s2);
+        tracks[i].mm_io->puts_unl(buffer);
+        tracks[i].mm_io->puts_unl(",");
+        tracks[i].mm_io->puts_unl(s2);
        safefree(s);
        break;

--- a/mm_io.cpp
+++ b/mm_io.cpp
@ -272,7 +272,7 @@ string mm_io_c::getline() {

 #endif

-size_t mm_io_c::writeline_unix_newlines(const char *s) {
+size_t mm_io_c::puts_unl(const char *s) {
  int i;
  size_t bytes_written;

--- a/mm_io.h
+++ b/mm_io.h
@ -48,7 +48,7 @@ public:
  virtual bool eof();
  virtual char *gets(char *buffer, size_t max_size);
  virtual string getline();
-  virtual size_t writeline_unix_newlines(const char *s);
+  virtual size_t puts_unl(const char *s);
 };

 class mm_null_io_c: public mm_io_c {
--- a/p_textsubs.cpp
+++ b/p_textsubs.cpp
@ -33,11 +33,13 @@ using namespace LIBMATROSKA_NAMESPACE;
 textsubs_packetizer_c::textsubs_packetizer_c(generic_reader_c *nreader,
                                             const char *ncodec_id,
                                             const void *nglobal_data,
-                                             int nglobal_size,
+                                             int nglobal_size, bool nrecode,
                                             track_info_t *nti)
  throw (error_c): generic_packetizer_c(nreader, nti) {
  packetno = 0;
-  cc_utf8 = utf8_init(ti->sub_charset);
+  recode = nrecode;
+  if (recode)
+    cc_utf8 = utf8_init(ti->sub_charset);
  global_size = nglobal_size;
  global_data = safememdup(nglobal_data, global_size);
  codec_id = safestrdup(ncodec_id);
@ -118,10 +120,13 @@ int textsubs_packetizer_c::process(unsigned char *_subs, int, int64_t start,
  }
  *idx2 = 0;

-  utf8_subs = to_utf8(cc_utf8, subs);
-  add_packet((unsigned char *)utf8_subs, strlen(utf8_subs), start, length,
-             1, -1, -1);
-  safefree(utf8_subs);
+  if (recode) {
+    utf8_subs = to_utf8(cc_utf8, subs);
+    add_packet((unsigned char *)utf8_subs, strlen(utf8_subs), start, length,
+               1, -1, -1);
+    safefree(utf8_subs);
+  } else
+    add_packet((unsigned char *)subs, strlen(subs), start, length, 1, -1, -1);

  safefree(subs);

--- a/p_textsubs.h
+++ b/p_textsubs.h
@ -33,11 +33,12 @@ private:
  void *global_data;
  int global_size;
  char *codec_id;
+  bool recode;

 public:
  textsubs_packetizer_c(generic_reader_c *nreader, const char *ncodec_id,
                        const void *nglobal_data, int nglobal_size,
-                        track_info_t *nti) throw (error_c);
+                        bool nrecode, track_info_t *nti) throw (error_c);
  virtual ~textsubs_packetizer_c();

  virtual int  process(unsigned char *_subs, int size, int64_t start = -1,
--- a/r_matroska.cpp
+++ b/r_matroska.cpp
@ -1006,7 +1006,8 @@ void mkv_reader_c::create_packetizers() {
          nti.sub_charset = "UTF-8";
          t->packetizer = new textsubs_packetizer_c(this, t->codec_id,
                                                    t->private_data,
-                                                    t->private_size, &nti);
+                                                    t->private_size, false,
+                                                    &nti);
          if (verbose)
            fprintf(stdout, "Matroska demultiplexer (%s): using the text "
                    "subtitle output module for track ID %u.\n", ti->fname,
--- a/r_ogm.cpp
+++ b/r_ogm.cpp
@ -343,7 +343,7 @@ void ogm_reader_c::create_packetizers() {
      case OGM_STREAM_TYPE_TEXT:
        try {
          dmx->packetizer = new textsubs_packetizer_c(this, MKV_S_TEXTUTF8,
-                                                      NULL, 0, ti);
+                                                      NULL, 0, true, ti);
        } catch (error_c &error) {
          fprintf(stderr, "Error: ogm_reader: could not initialize the "
                  "text subtitles packetizer for stream id %d. Will try to "
--- a/r_srt.cpp
+++ b/r_srt.cpp
@ -75,7 +75,7 @@ srt_reader_c::srt_reader_c(track_info_t *nti) throw (error_c):
      throw error_c("srt_reader: Source is not a valid SRT file.");
    ti->id = 0;                 // ID for this track.
    textsubs_packetizer = new textsubs_packetizer_c(this, MKV_S_TEXTUTF8, NULL,
-                                                    0, ti);
+                                                    0, true, ti);
  } catch (exception &ex) {
    throw error_c("srt_reader: Could not open the source file.");
  }
--- a/r_ssa.cpp
+++ b/r_ssa.cpp
@ -38,6 +38,7 @@ class ssa_line_c {
 public:
  char *line;
  int64_t start, end;
+  int num;

  bool operator < (const ssa_line_c &cmp) const;
 };
@ -65,8 +66,11 @@ ssa_reader_c::ssa_reader_c(track_info_t *nti) throw (error_c):
  generic_reader_c(nti) {
  string line, global;
  int64_t old_pos;
+  char section;
  bool is_ass;

+  cc_utf8 = utf8_init(ti->sub_charset);
+
  is_ass = false;

  try {
@ -90,17 +94,33 @@ ssa_reader_c::ssa_reader_c(track_info_t *nti) throw (error_c):
      if (!strcasecmp(line.c_str(), "ScriptType: v4.00+") ||
          !strcasecmp(line.c_str(), "[V4+ Styles]"))
        is_ass = true;
+      else if (!strcasecmp(line.c_str(), "[Events]"))
+        section = 'e';
+      // Analyze the format string.
+      else if (!strncasecmp(line.c_str(), "Format: ", strlen("Format: ")) &&
+               (section == 'e')) {
+        format = split(&line.c_str()[strlen("Format: ")]);
+        strip(format);
+      }

      // Now just append the current line and some DOS style newlines.
-      global += "\r\n";
-      global += line;
+      // But not if we've already encountered the [Events] section.
+      if (section != 'e') {
+        global += "\r\n";
+        global += line;
+      }
    }

+    if (format.size() == 0)
+      throw error_c("ssa_reader: Invalid format. Could not find the "
+                    "\"Format\" line in the \"[Events]\" section.");
+
    textsubs_packetizer = new textsubs_packetizer_c(this, is_ass ? 
                                                    MKV_S_TEXTASS :
                                                    MKV_S_TEXTSSA,
                                                    global.c_str(),
-                                                    global.length(), ti);
+                                                    global.length(), false,
+                                                    ti);
  } catch (exception &ex) {
    throw error_c("ssa_reader: Could not open the source file.");
  }
@ -114,6 +134,16 @@ ssa_reader_c::~ssa_reader_c() {
    delete textsubs_packetizer;
 }

+// Returns the entry of 'fields' at the position where 'format' holds 'index',
+// or "" if 'index' is not in 'format' or 'fields' has no entry that far.
+string ssa_reader_c::get_element(const char *index, vector<string> &fields) {
+  // Bound by fields.size() too: a malformed line may yield fewer fields.
+  for (size_t i = 0; (i < format.size()) && (i < fields.size()); i++)
+    if (format[i] == index)
+      return fields[i];
+  return string("");
+}
+
 int64_t ssa_reader_c::parse_time(string &stime) {
  int64_t th, tm, ts, tds;
  int pos;
@ -152,13 +182,29 @@ int64_t ssa_reader_c::parse_time(string &stime) {
  return tds * 10 + ts * 1000 + tm * 60 * 1000 + th * 60 * 60 * 1000;
 }

+string ssa_reader_c::recode_text(vector<string> &fields) {
+  // Fetch the "Text" field and run it through to_utf8() with the
+  // conversion handle cc_utf8; the converted copy is returned.
+  // TODO: Handle \fe encoding changes.
+  string text = get_element("Text", fields);
+  char *converted = to_utf8(cc_utf8, text.c_str());
+
+  text = converted;
+  safefree(converted);
+
+  return text;
+}
+
 int ssa_reader_c::read() {
-  string line, stime, orig_line;
-  int pos1, pos2, i;
+  string line, stime, orig_line, comma;
+  int i, num;
  int64_t start, end;
  vector<ssa_line_c> clines;
+  vector<string> fields;
  ssa_line_c cline;

+  num = 1;
+
  do {
    line = mm_io->getline();
    orig_line = line;
@ -167,53 +213,45 @@ int ssa_reader_c::read() {

    line.erase(0, strlen("Dialogue: ")); // Trim the start.

-    pos1 = line.find(',');      // Find and parse the start time.
-    if (pos1 < 0) {
-      fprintf(stderr, "ssa_reader: Warning: Malformed line? (%s)\n",
-              orig_line.c_str());
-      continue;
-    }
-    pos2 = line.find(',', pos1 + 1);
-    if (pos2 < 0) {
-      fprintf(stderr, "ssa_reader: Warning: Malformed line? (%s)\n",
-              orig_line.c_str());
-      continue;
-    }
+    // Split the line into fields.
+    fields = split(line.c_str(), ",", format.size());

-    stime = line.substr(pos1 + 1, pos2 - pos1 - 1);
+    // Parse the start time.
+    stime = get_element("Start", fields);
    start = parse_time(stime);
    if (start < 0) {
      fprintf(stderr, "ssa_reader: Warning: Malformed line? (%s)\n",
              orig_line.c_str());
      continue;
    }
-    line.erase(pos1, pos2 - pos1);

-    pos1 = line.find(',');      // Find and parse the end time.
-    if (pos1 < 0) {
-      fprintf(stderr, "ssa_reader: Warning: Malformed line? (%s)\n",
-              orig_line.c_str());
-      continue;
-    }
-    pos2 = line.find(',', pos1 + 1);
-    if (pos2 < 0) {
-      fprintf(stderr, "ssa_reader: Warning: Malformed line? (%s)\n",
-              orig_line.c_str());
-      continue;
-    }
-
-    stime = line.substr(pos1 + 1, pos2 - pos1 - 1);
+    // Parse the end time.
+    stime = get_element("Start", fields);
    end = parse_time(stime);
-    if (end < 0) {
+    if (start < 0) {
      fprintf(stderr, "ssa_reader: Warning: Malformed line? (%s)\n",
              orig_line.c_str());
      continue;
    }
-    line.erase(pos1, pos2 - pos1);
+
+    // Specs say that the following fields are to put into the block:
+    // ReadOrder, Layer, Style, Name, MarginL, MarginR, MarginV, Effect, Text
+
+    comma = ",";
+    line = comma + get_element("Layer", fields) + comma +
+      get_element("Style", fields) + comma +
+      get_element("Name", fields) + comma + 
+      get_element("MarginL", fields) + comma +
+      get_element("MarginR", fields) + comma +
+      get_element("MarginV", fields) + comma +
+      get_element("Effect", fields) + comma +
+      recode_text(fields);

    cline.line = safestrdup(line.c_str());
    cline.start = start;
    cline.end = end;
+    cline.num = num;
+    num++;

    clines.push_back(cline);
  } while (!mm_io->eof());
@ -221,8 +259,11 @@ int ssa_reader_c::read() {
  stable_sort(clines.begin(), clines.end());

  for (i = 0; i < clines.size(); i++) {
+    char buffer[20];
    // Let the packetizer handle this line.
-    textsubs_packetizer->process((unsigned char *)clines[i].line, 0,
+    sprintf(buffer, "%d", clines[i].num);
+    line = string(buffer) + string(clines[i].line);
+    textsubs_packetizer->process((unsigned char *)line.c_str(), 0,
                                 clines[i].start,
                                 clines[i].end - clines[i].start);
    safefree(clines[i].line);
--- a/r_ssa.h
+++ b/r_ssa.h
@ -25,17 +25,24 @@

 #include <stdio.h>

+#include <string>
+#include <vector>
+
 #include "mm_io.h"
 #include "common.h"
 #include "pr_generic.h"

 #include "p_textsubs.h"

+using namespace std;
+
 class ssa_reader_c: public generic_reader_c {
 private:
  mm_io_c *mm_io;
  textsubs_packetizer_c *textsubs_packetizer;
  int act_wchar;
+  vector<string> format;
+  int cc_utf8;

 public:
  ssa_reader_c(track_info_t *nti) throw (error_c);
@ -53,6 +60,8 @@ public:

 protected:
  virtual int64_t parse_time(string &time);
+  virtual string get_element(const char *index, vector<string> &fields);
+  virtual string recode_text(vector<string> &fields);
 };

 #endif  // __R_SSA_H