Rewrote the UTF-8 conversion routines. They should now handle U+8000 characters correctly.

This commit is contained in:
Moritz Bunkus 2003-08-28 08:33:30 +00:00
parent 06c3bcdb98
commit 30831506f4
2 changed files with 137 additions and 42 deletions

View File

@ -1,5 +1,8 @@
2003-08-28 Moritz Bunkus <moritz@bunkus.org> 2003-08-28 Moritz Bunkus <moritz@bunkus.org>
* Rewrote the UTF-8 conversion routines. They should now handle
U+8000 characters correctly.
* mkvmerge: Real reader: For RV40 the actual dimensions were also * mkvmerge: Real reader: For RV40 the actual dimensions were also
used for the aspect ratio/display dimensions. This has been fixed: used for the aspect ratio/display dimensions. This has been fixed:
the actual dimensions are used for PixelWidth/PixelHeight, the the actual dimensions are used for PixelWidth/PixelHeight, the

View File

@ -667,35 +667,104 @@ UTFstring cstr_to_UTFstring(const char *c) {
#endif #endif
} }
static int utf8_byte_length(unsigned char c) {
if (c < 0x80) // 0xxxxxxx
return 1;
else if (c < 0xc0) // 10xxxxxx
die("cstrutf8_to_UTFstring: Invalid UTF-8 sequence encountered. Please "
"contact moritz@bunkus.org and request that he implements a better "
"UTF-8 parser.");
else if (c < 0xe0) // 110xxxxx
return 2;
else if (c < 0xf0) // 1110xxxx
return 3;
else if (c < 0xf8) // 11110xxx
return 4;
else if (c < 0xfc) // 111110xx
return 5;
else if (c < 0xfe) // 1111110x
return 6;
else
die("cstrutf8_to_UTFstring: Invalid UTF-8 sequence encountered. Please "
"contact moritz@bunkus.org and request that he implements a better "
"UTF-8 parser.");
return 0;
}
static int wchar_to_utf8_byte_length(uint32_t w) {
if (w < 0x00000080)
return 1;
else if (w < 0x00000800)
return 2;
else if (w < 0x00010000)
return 3;
else if (w < 0x00200000)
return 4;
else if (w < 0x04000000)
return 5;
else if (w < 0x80000000)
return 6;
else
die("UTFstring_to_cstrutf8: Invalid wide character. Please contact "
"moritz@bunkus.org if you think that this is not true.");
return 0;
}
UTFstring cstrutf8_to_UTFstring(const char *c) { UTFstring cstrutf8_to_UTFstring(const char *c) {
wchar_t *new_string; wchar_t *new_string;
int slen, dlen, src, dst; int slen, dlen, src, dst, clen;
UTFstring u; UTFstring u;
slen = strlen(c); slen = strlen(c);
dlen = 0; dlen = 0;
for (src = 0; src < slen; dlen++) { for (src = 0; src < slen; dlen++)
if ((c[src] & 0x80) == 0) src += utf8_byte_length(c[src]);
src++;
else if ((c[src] & 0x20) == 0)
src += 2;
else if ((c[src] & 0x08) == 0)
src += 3;
}
new_string = (wchar_t *)safemalloc((dlen + 1) * sizeof(wchar_t)); new_string = (wchar_t *)safemalloc((dlen + 1) * sizeof(wchar_t));
for (src = 0, dst = 0; src < slen; dst++) { for (src = 0, dst = 0; src < slen; dst++) {
if ((c[src] & 0x80) == 0) { clen = utf8_byte_length(c[src]);
if ((src + clen) > slen)
die("cstrutf8_to_UTFstring: Invalid UTF-8 sequence encountered. Please "
"contact moritz@bunkus.org and request that he implements a better "
"UTF-8 parser.");
if (clen == 1)
new_string[dst] = c[src]; new_string[dst] = c[src];
src++;
} else if ((c[src] & 0x20) == 0) { else if (clen == 2)
new_string[dst] = ((c[src] & 0x1f) << 6) + (c[src + 1] & 0x3f); new_string[dst] =
src += 2; ((((uint32_t)c[src]) & 0x1f) << 6) |
} else if ((c[src] & 0x08) == 0) { (((uint32_t)c[src + 1]) & 0x3f);
new_string[dst] = ((c[src] & 0x0f) << 12) + else if (clen == 3)
((c[src + 1] & 0x3f) << 6) + (c[src + 2] & 0x3f); new_string[dst] =
src += 3; ((((uint32_t)c[src]) & 0x0f) << 12) |
} ((((uint32_t)c[src + 1]) & 0x3f) << 6) |
(((uint32_t)c[src + 2]) & 0x3f);
else if (clen == 4)
new_string[dst] =
((((uint32_t)c[src]) & 0x07) << 18) |
((((uint32_t)c[src + 1]) & 0x3f) << 12) |
((((uint32_t)c[src + 2]) & 0x3f) << 6) |
(((uint32_t)c[src + 3]) & 0x3f);
else if (clen == 5)
new_string[dst] =
((((uint32_t)c[src]) & 0x07) << 24) |
((((uint32_t)c[src + 1]) & 0x3f) << 18) |
((((uint32_t)c[src + 2]) & 0x3f) << 12) |
((((uint32_t)c[src + 3]) & 0x3f) << 6) |
(((uint32_t)c[src + 4]) & 0x3f);
else if (clen == 6)
new_string[dst] =
((((uint32_t)c[src]) & 0x07) << 30) |
((((uint32_t)c[src + 1]) & 0x3f) << 24) |
((((uint32_t)c[src + 2]) & 0x3f) << 18) |
((((uint32_t)c[src + 3]) & 0x3f) << 12) |
((((uint32_t)c[src + 4]) & 0x3f) << 6) |
(((uint32_t)c[src + 5]) & 0x3f);
src += clen;
} }
new_string[dst] = 0; new_string[dst] = 0;
@ -737,40 +806,63 @@ char *UTFstring_to_cstr(const UTFstring &u) {
} }
char *UTFstring_to_cstrutf8(const UTFstring &u) { char *UTFstring_to_cstrutf8(const UTFstring &u) {
int src, dst, dlen, slen; int src, dst, dlen, slen, clen;
char *new_string; unsigned char *new_string;
uint32_t uc;
dlen = 0; dlen = 0;
slen = u.length(); slen = u.length();
for (src = 0, dlen = 0; src < slen; src++) for (src = 0, dlen = 0; src < slen; src++)
if (u[src] < 0x80) dlen += wchar_to_utf8_byte_length((uint32_t)u[src]);
dlen++;
else if (u[src] < 0x800)
dlen += 2;
else if (u[src] < 0x10000)
dlen += 3;
new_string = (char *)malloc(dlen + 1); new_string = (unsigned char *)malloc(dlen + 1);
for (src = 0, dst = 0; src < slen; src++) for (src = 0, dst = 0; src < slen; src++) {
if (u[src] < 0x80) { uc = (uint32_t)u[src];
new_string[dst] = u[src]; clen = wchar_to_utf8_byte_length(uc);
dst++;
} else if (u[src] < 0x800) { if (clen == 1)
new_string[dst] = 0xc0 | (u[src] >> 6); new_string[dst] = (unsigned char)uc;
new_string[dst + 1] = 0x80 | (u[src] & 0x3f);
dst += 2; else if (clen == 2) {
} else if (u[src] < 0x10000) { new_string[dst] = 0xc0 | ((uc >> 6) & 0x0000001f);
new_string[dst] = 0xe0 | (u[src] >> 12); new_string[dst + 1] = 0x80 | (uc & 0x0000003f);
new_string[dst + 1] = 0x80 | ((u[src] >> 6) & 0x3f);
new_string[dst + 2] = 0x80 | (u[src] & 0x3f); } else if (clen == 3) {
dst += 3; new_string[dst] = 0xe0 | ((uc >> 12) & 0x0000000f);
new_string[dst + 1] = 0x80 | ((uc >> 6) & 0x0000003f);
new_string[dst + 2] = 0x80 | (uc & 0x0000003f);
} else if (clen == 4) {
new_string[dst] = 0xf0 | ((uc >> 18) & 0x00000007);
new_string[dst + 1] = 0x80 | ((uc >> 12) & 0x0000003f);
new_string[dst + 2] = 0x80 | ((uc >> 6) & 0x0000003f);
new_string[dst + 3] = 0x80 | (uc & 0x0000003f);
} else if (clen == 5) {
new_string[dst] = 0xf8 | ((uc >> 24) & 0x00000003);
new_string[dst + 1] = 0x80 | ((uc >> 18) & 0x0000003f);
new_string[dst + 2] = 0x80 | ((uc >> 12) & 0x0000003f);
new_string[dst + 3] = 0x80 | ((uc >> 6) & 0x0000003f);
new_string[dst + 4] = 0x80 | (uc & 0x0000003f);
} else {
new_string[dst] = 0xfc | ((uc >> 30) & 0x00000001);
new_string[dst + 1] = 0x80 | ((uc >> 24) & 0x0000003f);
new_string[dst + 2] = 0x80 | ((uc >> 18) & 0x0000003f);
new_string[dst + 3] = 0x80 | ((uc >> 12) & 0x0000003f);
new_string[dst + 4] = 0x80 | ((uc >> 6) & 0x0000003f);
new_string[dst + 5] = 0x80 | (uc & 0x0000003f);
}
dst += clen;
} }
new_string[dst] = 0; new_string[dst] = 0;
return new_string; return (char *)new_string;
} }
vector<string> split(const char *src, const char *pattern, int max_num) { vector<string> split(const char *src, const char *pattern, int max_num) {