mirror of
https://gitlab.com/mbunkus/mkvtoolnix.git
synced 2024-12-25 04:11:44 +00:00
Rewrote the UTF-8 conversion routines. They should now handle U+8000 characters correctly.
This commit is contained in:
parent
06c3bcdb98
commit
30831506f4
@ -1,5 +1,8 @@
|
|||||||
2003-08-28 Moritz Bunkus <moritz@bunkus.org>
|
2003-08-28 Moritz Bunkus <moritz@bunkus.org>
|
||||||
|
|
||||||
|
* Rewrote the UTF-8 conversion routines. They should now handle
|
||||||
|
U+8000 characters correctly.
|
||||||
|
|
||||||
* mkvmerge: Real reader: For RV40 the actual dimensions were also
|
* mkvmerge: Real reader: For RV40 the actual dimensions were also
|
||||||
used for the aspect ratio/display dimensions. This has been fixed:
|
used for the aspect ratio/display dimensions. This has been fixed:
|
||||||
the actual dimensions are used for PixelWidth/PixelHeight, the
|
the actual dimensions are used for PixelWidth/PixelHeight, the
|
||||||
|
176
src/common.cpp
176
src/common.cpp
@ -667,35 +667,104 @@ UTFstring cstr_to_UTFstring(const char *c) {
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int utf8_byte_length(unsigned char c) {
|
||||||
|
if (c < 0x80) // 0xxxxxxx
|
||||||
|
return 1;
|
||||||
|
else if (c < 0xc0) // 10xxxxxx
|
||||||
|
die("cstrutf8_to_UTFstring: Invalid UTF-8 sequence encountered. Please "
|
||||||
|
"contact moritz@bunkus.org and request that he implements a better "
|
||||||
|
"UTF-8 parser.");
|
||||||
|
else if (c < 0xe0) // 110xxxxx
|
||||||
|
return 2;
|
||||||
|
else if (c < 0xf0) // 1110xxxx
|
||||||
|
return 3;
|
||||||
|
else if (c < 0xf8) // 11110xxx
|
||||||
|
return 4;
|
||||||
|
else if (c < 0xfc) // 111110xx
|
||||||
|
return 5;
|
||||||
|
else if (c < 0xfe) // 1111110x
|
||||||
|
return 6;
|
||||||
|
else
|
||||||
|
die("cstrutf8_to_UTFstring: Invalid UTF-8 sequence encountered. Please "
|
||||||
|
"contact moritz@bunkus.org and request that he implements a better "
|
||||||
|
"UTF-8 parser.");
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int wchar_to_utf8_byte_length(uint32_t w) {
|
||||||
|
if (w < 0x00000080)
|
||||||
|
return 1;
|
||||||
|
else if (w < 0x00000800)
|
||||||
|
return 2;
|
||||||
|
else if (w < 0x00010000)
|
||||||
|
return 3;
|
||||||
|
else if (w < 0x00200000)
|
||||||
|
return 4;
|
||||||
|
else if (w < 0x04000000)
|
||||||
|
return 5;
|
||||||
|
else if (w < 0x80000000)
|
||||||
|
return 6;
|
||||||
|
else
|
||||||
|
die("UTFstring_to_cstrutf8: Invalid wide character. Please contact "
|
||||||
|
"moritz@bunkus.org if you think that this is not true.");
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
UTFstring cstrutf8_to_UTFstring(const char *c) {
|
UTFstring cstrutf8_to_UTFstring(const char *c) {
|
||||||
wchar_t *new_string;
|
wchar_t *new_string;
|
||||||
int slen, dlen, src, dst;
|
int slen, dlen, src, dst, clen;
|
||||||
UTFstring u;
|
UTFstring u;
|
||||||
|
|
||||||
slen = strlen(c);
|
slen = strlen(c);
|
||||||
dlen = 0;
|
dlen = 0;
|
||||||
for (src = 0; src < slen; dlen++) {
|
for (src = 0; src < slen; dlen++)
|
||||||
if ((c[src] & 0x80) == 0)
|
src += utf8_byte_length(c[src]);
|
||||||
src++;
|
|
||||||
else if ((c[src] & 0x20) == 0)
|
|
||||||
src += 2;
|
|
||||||
else if ((c[src] & 0x08) == 0)
|
|
||||||
src += 3;
|
|
||||||
}
|
|
||||||
|
|
||||||
new_string = (wchar_t *)safemalloc((dlen + 1) * sizeof(wchar_t));
|
new_string = (wchar_t *)safemalloc((dlen + 1) * sizeof(wchar_t));
|
||||||
for (src = 0, dst = 0; src < slen; dst++) {
|
for (src = 0, dst = 0; src < slen; dst++) {
|
||||||
if ((c[src] & 0x80) == 0) {
|
clen = utf8_byte_length(c[src]);
|
||||||
|
if ((src + clen) > slen)
|
||||||
|
die("cstrutf8_to_UTFstring: Invalid UTF-8 sequence encountered. Please "
|
||||||
|
"contact moritz@bunkus.org and request that he implements a better "
|
||||||
|
"UTF-8 parser.");
|
||||||
|
|
||||||
|
if (clen == 1)
|
||||||
new_string[dst] = c[src];
|
new_string[dst] = c[src];
|
||||||
src++;
|
|
||||||
} else if ((c[src] & 0x20) == 0) {
|
else if (clen == 2)
|
||||||
new_string[dst] = ((c[src] & 0x1f) << 6) + (c[src + 1] & 0x3f);
|
new_string[dst] =
|
||||||
src += 2;
|
((((uint32_t)c[src]) & 0x1f) << 6) |
|
||||||
} else if ((c[src] & 0x08) == 0) {
|
(((uint32_t)c[src + 1]) & 0x3f);
|
||||||
new_string[dst] = ((c[src] & 0x0f) << 12) +
|
else if (clen == 3)
|
||||||
((c[src + 1] & 0x3f) << 6) + (c[src + 2] & 0x3f);
|
new_string[dst] =
|
||||||
src += 3;
|
((((uint32_t)c[src]) & 0x0f) << 12) |
|
||||||
}
|
((((uint32_t)c[src + 1]) & 0x3f) << 6) |
|
||||||
|
(((uint32_t)c[src + 2]) & 0x3f);
|
||||||
|
else if (clen == 4)
|
||||||
|
new_string[dst] =
|
||||||
|
((((uint32_t)c[src]) & 0x07) << 18) |
|
||||||
|
((((uint32_t)c[src + 1]) & 0x3f) << 12) |
|
||||||
|
((((uint32_t)c[src + 2]) & 0x3f) << 6) |
|
||||||
|
(((uint32_t)c[src + 3]) & 0x3f);
|
||||||
|
else if (clen == 5)
|
||||||
|
new_string[dst] =
|
||||||
|
((((uint32_t)c[src]) & 0x07) << 24) |
|
||||||
|
((((uint32_t)c[src + 1]) & 0x3f) << 18) |
|
||||||
|
((((uint32_t)c[src + 2]) & 0x3f) << 12) |
|
||||||
|
((((uint32_t)c[src + 3]) & 0x3f) << 6) |
|
||||||
|
(((uint32_t)c[src + 4]) & 0x3f);
|
||||||
|
else if (clen == 6)
|
||||||
|
new_string[dst] =
|
||||||
|
((((uint32_t)c[src]) & 0x07) << 30) |
|
||||||
|
((((uint32_t)c[src + 1]) & 0x3f) << 24) |
|
||||||
|
((((uint32_t)c[src + 2]) & 0x3f) << 18) |
|
||||||
|
((((uint32_t)c[src + 3]) & 0x3f) << 12) |
|
||||||
|
((((uint32_t)c[src + 4]) & 0x3f) << 6) |
|
||||||
|
(((uint32_t)c[src + 5]) & 0x3f);
|
||||||
|
|
||||||
|
src += clen;
|
||||||
}
|
}
|
||||||
new_string[dst] = 0;
|
new_string[dst] = 0;
|
||||||
|
|
||||||
@ -737,40 +806,63 @@ char *UTFstring_to_cstr(const UTFstring &u) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
char *UTFstring_to_cstrutf8(const UTFstring &u) {
|
char *UTFstring_to_cstrutf8(const UTFstring &u) {
|
||||||
int src, dst, dlen, slen;
|
int src, dst, dlen, slen, clen;
|
||||||
char *new_string;
|
unsigned char *new_string;
|
||||||
|
uint32_t uc;
|
||||||
|
|
||||||
dlen = 0;
|
dlen = 0;
|
||||||
slen = u.length();
|
slen = u.length();
|
||||||
|
|
||||||
for (src = 0, dlen = 0; src < slen; src++)
|
for (src = 0, dlen = 0; src < slen; src++)
|
||||||
if (u[src] < 0x80)
|
dlen += wchar_to_utf8_byte_length((uint32_t)u[src]);
|
||||||
dlen++;
|
|
||||||
else if (u[src] < 0x800)
|
|
||||||
dlen += 2;
|
|
||||||
else if (u[src] < 0x10000)
|
|
||||||
dlen += 3;
|
|
||||||
|
|
||||||
new_string = (char *)malloc(dlen + 1);
|
new_string = (unsigned char *)malloc(dlen + 1);
|
||||||
|
|
||||||
for (src = 0, dst = 0; src < slen; src++)
|
for (src = 0, dst = 0; src < slen; src++) {
|
||||||
if (u[src] < 0x80) {
|
uc = (uint32_t)u[src];
|
||||||
new_string[dst] = u[src];
|
clen = wchar_to_utf8_byte_length(uc);
|
||||||
dst++;
|
|
||||||
} else if (u[src] < 0x800) {
|
if (clen == 1)
|
||||||
new_string[dst] = 0xc0 | (u[src] >> 6);
|
new_string[dst] = (unsigned char)uc;
|
||||||
new_string[dst + 1] = 0x80 | (u[src] & 0x3f);
|
|
||||||
dst += 2;
|
else if (clen == 2) {
|
||||||
} else if (u[src] < 0x10000) {
|
new_string[dst] = 0xc0 | ((uc >> 6) & 0x0000001f);
|
||||||
new_string[dst] = 0xe0 | (u[src] >> 12);
|
new_string[dst + 1] = 0x80 | (uc & 0x0000003f);
|
||||||
new_string[dst + 1] = 0x80 | ((u[src] >> 6) & 0x3f);
|
|
||||||
new_string[dst + 2] = 0x80 | (u[src] & 0x3f);
|
} else if (clen == 3) {
|
||||||
dst += 3;
|
new_string[dst] = 0xe0 | ((uc >> 12) & 0x0000000f);
|
||||||
|
new_string[dst + 1] = 0x80 | ((uc >> 6) & 0x0000003f);
|
||||||
|
new_string[dst + 2] = 0x80 | (uc & 0x0000003f);
|
||||||
|
|
||||||
|
} else if (clen == 4) {
|
||||||
|
new_string[dst] = 0xf0 | ((uc >> 18) & 0x00000007);
|
||||||
|
new_string[dst + 1] = 0x80 | ((uc >> 12) & 0x0000003f);
|
||||||
|
new_string[dst + 2] = 0x80 | ((uc >> 6) & 0x0000003f);
|
||||||
|
new_string[dst + 3] = 0x80 | (uc & 0x0000003f);
|
||||||
|
|
||||||
|
} else if (clen == 5) {
|
||||||
|
new_string[dst] = 0xf8 | ((uc >> 24) & 0x00000003);
|
||||||
|
new_string[dst + 1] = 0x80 | ((uc >> 18) & 0x0000003f);
|
||||||
|
new_string[dst + 2] = 0x80 | ((uc >> 12) & 0x0000003f);
|
||||||
|
new_string[dst + 3] = 0x80 | ((uc >> 6) & 0x0000003f);
|
||||||
|
new_string[dst + 4] = 0x80 | (uc & 0x0000003f);
|
||||||
|
|
||||||
|
} else {
|
||||||
|
new_string[dst] = 0xfc | ((uc >> 30) & 0x00000001);
|
||||||
|
new_string[dst + 1] = 0x80 | ((uc >> 24) & 0x0000003f);
|
||||||
|
new_string[dst + 2] = 0x80 | ((uc >> 18) & 0x0000003f);
|
||||||
|
new_string[dst + 3] = 0x80 | ((uc >> 12) & 0x0000003f);
|
||||||
|
new_string[dst + 4] = 0x80 | ((uc >> 6) & 0x0000003f);
|
||||||
|
new_string[dst + 5] = 0x80 | (uc & 0x0000003f);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
dst += clen;
|
||||||
}
|
}
|
||||||
|
|
||||||
new_string[dst] = 0;
|
new_string[dst] = 0;
|
||||||
|
|
||||||
return new_string;
|
return (char *)new_string;
|
||||||
}
|
}
|
||||||
|
|
||||||
vector<string> split(const char *src, const char *pattern, int max_num) {
|
vector<string> split(const char *src, const char *pattern, int max_num) {
|
||||||
|
Loading…
Reference in New Issue
Block a user