3 * @brief Conversions between Unicode and local charsets, string
4 * manipulation functions that act on character types.
18 // there must be at least 4 bytes free, NOT CHECKED!
// Encodes the code point s as UTF-8 into the buffer d.
// The visible branches write the 2-, 3- and 4-byte forms; callers in this
// file use the return value as the count of bytes to copy out of the buffer.
// NOTE(review): the range tests selecting each branch, the single-byte case
// and the return statements are not visible in this excerpt.
19 int wctoutf8(char *d, ucs_t s)
// 2-byte form: 110xxxxx 10xxxxxx
28 d[0] = ( s >> 6) | 0xc0;
29 d[1] = ( s & 0x3f) | 0x80;
// 3-byte form: 1110xxxx 10xxxxxx 10xxxxxx
34 d[0] = ( s >> 12) | 0xe0;
35 d[1] = ((s >> 6) & 0x3f) | 0x80;
36 d[2] = ( s & 0x3f) | 0x80;
// 4-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
41 d[0] = ( s >> 18) | 0xf0;
42 d[1] = ((s >> 12) & 0x3f) | 0x80;
43 d[2] = ((s >> 6) & 0x3f) | 0x80;
44 d[3] = ( s & 0x3f) | 0x80;
47 // Invalid char marker (U+FFFD).
// Decodes one UTF-8 sequence starting at s into *d.
// Callers in this file add the return value to their read pointer and stop
// when it is 0, i.e. it is used as the number of bytes consumed.
// Visible handling, in order:
//  - a bare tail byte (10xxxxxx) is invalid; the run of consecutive tail
//    bytes is counted in `bad`;
//  - lead bytes 110xxxxx, 1110xxxx and 11110xxx are recognised; the tests
//    for the 5- and 6-byte lead patterns follow a block-comment opener and
//    so appear to be disabled;
//  - every following byte (index 1 .. cnt-1) must be a tail byte and
//    contributes its low 6 bits to c;
//  - a final check flags values below 0xA0, UTF-16 surrogates, overlong
//    3- and 4-byte encodings, and values above 0x10FFFF.
// NOTE(review): the statements run on each invalid path and the returns are
// not visible in this excerpt; presumably *d is set to U+FFFD -- confirm.
54 int utf8towc(ucs_t *d, const char *s)
// 0x80..0xBF can only appear after a lead byte.
66 if ((*s & 0xc0) == 0x80)
67 { // bare tail, invalid
70 do bad++; while ((s[bad] & 0xc0) == 0x80);
// Classify the lead byte by its high bits.
76 if ((*s & 0xe0) == 0xc0)
78 else if ((*s & 0xf0) == 0xe0)
80 else if ((*s & 0xf8) == 0xf0)
82 /* valid UTF-8, invalid Unicode
83 else if ((*s & 0xfc) == 0xf8)
85 else if ((*s & 0xfe) == 0xfc)
89 { // 0xfe or 0xff, invalid
94 for (int i = 1; i < cnt; i++)
96 if ((s[i] & 0xc0) != 0x80)
97 { // only tail characters are allowed here, invalid
101 c = (c << 6) | (s[i] & 0x3f);
104 if (c < 0xA0 // illegal characters
105 || (c >= 0xD800 && c <= 0xDFFF) // UTF-16 surrogates
106 || (cnt == 3 && c < 0x800) // overlong characters
107 || (cnt == 4 && c < 0x10000) // overlong characters
108 || c > 0x10FFFF) // outside Unicode
116 #ifdef TARGET_OS_WINDOWS
117 // don't pull in wstring templates on other systems
// Converts the UTF-8 text at s to a std::wstring of UTF-16 code units,
// decoding one code point per utf8towc() call until it returns 0.
118 std::wstring utf8_to_16(const char *s)
123 while (int l = utf8towc(&c, s))
// Emit a surrogate pair: high unit from the bits above the low 10, low unit
// from the low 10 bits.
// NOTE(review): this is only correct if 0x10000 has already been subtracted
// from c; that statement is not visible in this excerpt -- confirm.
129 d.push_back(0xD800 + (c >> 10));
130 d.push_back(0xDC00 + (c & 0x3FF));
// Converts the UTF-16 code units at s to a UTF-8 std::string.
// Unpaired surrogates are replaced by U+FFFD.
139 std::string utf16_to_8(const utf16_t *s)
// Leading (high) surrogate: valid only if a trailing (low) one follows.
146 if (*s >= 0xD800 && *s <= 0xDBFF)
147 if (s[1] >= 0xDC00 && s[1] <= 0xDFFF)
// Combine the pair. 0x35fdc00 == (0xD800 << 10) + 0xDC00 - 0x10000, so the
// result is 0x10000 + ((hi - 0xD800) << 10) + (lo - 0xDC00).
149 c = (((ucs_t)s[0]) << 10) + s[1] - 0x35fdc00;
153 c = 0xFFFD; // leading surrogate without its tail
154 else if (*s >= 0xDC00 && *s <= 0xDFFF)
155 c = 0xFFFD; // unpaired trailing surrogate
// Encode c as UTF-8; l is the number of bytes wctoutf8() reported.
161 int l = wctoutf8(buf, c);
162 for (int i = 0; i < l; i++)
// Converts the UTF-8 text at s to a multibyte string using wcrtomb(), i.e.
// in the encoding of the current C locale.
169 std::string utf8_to_mb(const char *s)
// A zeroed mbstate_t is the initial conversion state.
176 memset(&ps, 0, sizeof(ps));
177 while ((l = utf8towc(&c, s)))
// MB_LEN_MAX bounds the bytes wcrtomb() may write for one character.
181 char buf[MB_LEN_MAX];
// wcrtomb() returns (size_t)-1 on an unrepresentable character, which
// becomes a negative value once stored in an int.
182 int r = wcrtomb(buf, c, &ps);
185 for (int i = 0; i < r; i++)
// Fallback output. NOTE(review): presumably the branch taken when the
// conversion above failed; the condition is not visible in this excerpt.
189 d.push_back('?'); // TODO: try to transliterate
// Converts a multibyte string (decoded with mbrtowc(), i.e. in the encoding
// of the current C locale) to UTF-8.
194 std::string mb_to_utf8(const char *s)
// A zeroed mbstate_t is the initial conversion state.
201 memset(&ps, 0, sizeof(ps));
202 // the input is zero-terminated, so third argument doesn't matter
// mbrtowc() returns 0 once it decodes the terminating null character, which
// ends the loop.
203 while ((l = mbrtowc(&c, s, MB_LEN_MAX, &ps)))
208 { // invalid input, mark it and try to recover
// Re-encode the decoded character as UTF-8; r is the byte count reported.
214 int r = wctoutf8(buf, c);
215 for (int i = 0; i < r; i++)
// File-local helper: walks s one code point at a time with utf8towc() and
// passes each decoded value back through wctoutf8().
// NOTE(review): presumably the re-encoded bytes are appended to the returned
// string, so malformed input comes out as whatever utf8towc() substituted;
// the append and return are not visible in this excerpt.
221 static std::string utf8_validate(const char *s)
227 while ((l = utf8towc(&c, s)))
232 int r = wctoutf8(buf, c);
233 for (int i = 0; i < r; i++)
// File-local helper: reads bytes from f with fgetc() and compares them, in
// order, with the bytes pointed to by `bytes`.
// NOTE(review): the loop over `len` and the mismatch handling are not visible
// in this excerpt; a caller's "rewound" comment suggests the stream is
// repositioned on failure -- confirm.
239 static bool _check_trail(FILE *f, const char* bytes, int len)
// fgetc() yields the byte as an unsigned char converted to int, so the
// expected byte is cast the same way before comparing.
243 if (fgetc(f) != (unsigned char)*bytes++)
// Opens `name` for reading and probes the start of the file for a
// byte-order mark using _check_trail().
// NOTE(review): the read of the first byte, the dispatch on it and the
// encoding assignments are not visible in this excerpt. The trailing byte
// patterns below are consistent with the standard BOMs: EF BB BF (UTF-8),
// FE FF (UTF-16BE), FF FE 00 00 (UTF-32LE), FF FE (UTF-16LE) and
// 00 00 FE FF (UTF-32BE) -- confirm against the full source.
252 FileLineInput::FileLineInput(const char *name)
254 f = fopen_u(name, "r");
267 if (_check_trail(f, "\xBB\xBF", 2))
271 if (_check_trail(f, "\xFF", 1))
// The 4-byte pattern is tried before the 2-byte one because the latter is a
// prefix of the former.
275 if (_check_trail(f, "\xFE\x00\x00", 3))
277 else if (_check_trail(f, "\xFF\xFE", 2)) // rewound
281 if (_check_trail(f, "\x00\xFE\xFF", 3))
// Destructor. NOTE(review): the body is not visible in this excerpt; since
// the constructor opens f via fopen_u(), it presumably closes it -- confirm.
289 FileLineInput::~FileLineInput()
// Reads the next line from f and returns it as UTF-8.
// Six input paths are visible: two fgets()-based ones (results passed
// through mb_to_utf8() and utf8_validate() respectively), two that read
// 16-bit units (little- and big-endian, converted with utf16_to_8()), and
// two that read 32-bit units (little- and big-endian, encoded with
// wctoutf8()).
// NOTE(review): the selector choosing a path, the end-of-line/end-of-file
// handling of the fread() paths and the loop structure are not visible in
// this excerpt.
295 std::string FileLineInput::get_line()
298 std::vector<utf16_t> win;
// --- fgets() path whose result is converted with mb_to_utf8() ---
309 if (!fgets(buf, sizeof buf, f))
// NOTE(review): if `out` can be empty here (e.g. fgets() stored a line that
// begins with a NUL byte), out.length() - 1 wraps around -- worth a guard.
315 if (out[out.length() - 1] == '\n')
317 out.erase(out.length() - 1);
321 return mb_to_utf8(out.c_str());
// --- fgets() path whose result is passed through utf8_validate() ---
326 if (!fgets(buf, sizeof buf, f))
// NOTE(review): same empty-`out` concern as above.
332 if (out[out.length() - 1] == '\n')
334 out.erase(out.length() - 1);
338 return utf8_validate(out.c_str());
// --- 16-bit units, low byte first (little-endian) ---
343 if (fread(buf, 2, 1, f) != 1)
348 c = ((uint32_t)((unsigned char)buf[0]))
349 | ((uint32_t)((unsigned char)buf[1])) << 8;
// NOTE(review): &win[0] requires win to be non-empty; presumably a
// terminator has been pushed before this point -- confirm.
356 return utf16_to_8(&win[0]);
// --- 16-bit units, high byte first (big-endian) ---
361 if (fread(buf, 2, 1, f) != 1)
366 c = ((uint32_t)((unsigned char)buf[1]))
367 | ((uint32_t)((unsigned char)buf[0])) << 8;
374 return utf16_to_8(&win[0]);
// --- 32-bit units, least significant byte first (little-endian) ---
379 if (fread(buf, 4, 1, f) != 1)
384 c = ((uint32_t)((unsigned char)buf[0]))
385 | ((uint32_t)((unsigned char)buf[1])) << 8
386 | ((uint32_t)((unsigned char)buf[2])) << 16
387 | ((uint32_t)((unsigned char)buf[3])) << 24;
// The read buffer is reused as the UTF-8 output buffer for this code point.
390 len = wctoutf8(buf, c);
391 for (int i = 0; i < len; i++)
392 out.push_back(buf[i]);
// --- 32-bit units, most significant byte first (big-endian) ---
400 if (fread(buf, 4, 1, f) != 1)
405 c = ((uint32_t)((unsigned char)buf[0])) << 24
406 | ((uint32_t)((unsigned char)buf[1])) << 16
407 | ((uint32_t)((unsigned char)buf[2])) << 8
408 | ((uint32_t)((unsigned char)buf[3]));
411 len = wctoutf8(buf, c);
412 for (int i = 0; i < len; i++)
413 out.push_back(buf[i]);
// NOTE(review): presumably reached only when the path selector holds none of
// the handled values -- confirm.
419 die("memory got trampled");
// Opens `name` for reading via fopen_u() and stores the stream in f.
// NOTE(review): the rest of the constructor is not visible in this excerpt.
422 UTF8FileLineInput::UTF8FileLineInput(const char *name)
424 f = fopen_u(name, "r");
// Destructor. NOTE(review): the body is not visible in this excerpt; since
// the constructor opens f via fopen_u(), it presumably closes it -- confirm.
433 UTF8FileLineInput::~UTF8FileLineInput()
// Reads the next line from f with fgets(), strips one trailing '\n' if
// present, and returns the text after passing it through utf8_validate().
// NOTE(review): the accumulation into `out` and the end-of-file handling are
// not visible in this excerpt.
439 std::string UTF8FileLineInput::get_line()
447 if (!fgets(buf, sizeof buf, f))
// NOTE(review): if `out` can be empty here (e.g. fgets() stored a line that
// begins with a NUL byte), out.length() - 1 wraps around -- worth a guard.
453 if (out[out.length() - 1] == '\n')
455 out.erase(out.length() - 1);
459 return utf8_validate(out.c_str());
// Walks the UTF-8 string s one code point at a time (until utf8towc()
// returns 0) and, going by the name, totals its display width.
// NOTE(review): the computation of cw (presumably wcwidth(c)), the
// accumulation and the return are not visible in this excerpt.
462 int strwidth(const char *s)
467 while (int l = utf8towc(&c, s))
// A width of -1 is excluded by this test.
471 if (cw != -1) // shouldn't ever happen
// std::string convenience overload: forwards to strwidth(const char *).
478 int strwidth(const std::string &s)
480 return strwidth(s.c_str());
486 return wctoutf8(dummy, c);
// Moves backwards from s to the start of the previous glyph: the previous
// code point, continuing further back while the code point found has zero
// width according to wcwidth().
// NOTE(review): how `start` bounds the backward search and what is returned
// when it is reached are not visible in this excerpt.
489 char *prev_glyph(char *s, char *start)
494 // Find the start of the previous code point.
// UTF-8 tail bytes have the form 10xxxxxx; the loop continues while *s is one.
498 while ((*s & 0xc0) == 0x80);
499 // If a combining one, continue.
// wcwidth(c) == 0 is the test used for "combining".
501 } while (!wcwidth(c));
// Advances s past the current glyph: one code point plus any zero-width
// (combining) code points that follow it.
// NOTE(review): the return statement is not visible in this excerpt;
// presumably s_cur (the start of the first code point not skipped) -- confirm.
505 char *next_glyph(char *s)
509 // Skip at least one character.
510 s += utf8towc(&c, s);
// Decode the next code point, remembering where it starts in s_cur.
514 s += utf8towc(&c, s_cur = s);
515 // And any combining ones after it.
// Stops at the terminator (c == 0) or at the first code point whose
// wcwidth() is non-zero.
516 while (c && !wcwidth(c));
// Returns a leading part of the UTF-8 string s, cut according to the display
// width budget `width`; one return path appends `width` spaces to the kept
// prefix.
// NOTE(review): the computation of cw, the update of `width`, the advance of
// s and the test on `spaces` are not visible in this excerpt; presumably
// `width` holds the remaining budget when the padded result is built.
520 std::string chop_string(const char *s, int width, bool spaces)
525 while (int clen = utf8towc(&c, s))
528 // Due to combining chars, we can't stop at merely reaching the
529 // target width, the next character needs to exceed it.
530 if (cw > width) // note: a CJK character might leave one space left
532 if (cw >= 0) // should we assert on control chars instead?
// Kept prefix [s0, s) followed by `width` spaces.
538 return std::string(s0, s - s0) + std::string(width, ' ');
// Kept prefix [s0, s) only.
539 return std::string(s0, s - s0);;
// std::string convenience overload: forwards to
// chop_string(const char *, int, bool).
542 std::string chop_string(const std::string &s, int width, bool spaces)
544 return chop_string(s.c_str(), width, spaces);
// Table of 16-bit values, eight per row; going by the name and the values,
// it maps character codes of the VT100 graphics set to Unicode code points.
// NOTE(review): 20 rows are visible for a 128-entry array. The last two
// groups of four rows look like alternative versions of the same section
// (they differ only in a few entries), presumably selected by preprocessor
// lines that are not visible in this excerpt -- confirm.
547 unsigned short charset_vt100[128] =
// First twelve rows: mostly identity, with arrows, a full block and a
// no-break space substituted at a few positions.
549 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
550 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
551 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
552 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,
553 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
554 0x0028, 0x0029, 0x002a, 0x2192, 0x2190, 0x2191, 0x2193, 0x002f,
555 0x2588, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
556 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
557 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
558 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
559 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
560 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x00a0,
// First version of the final four rows (uses values 0xf800..0xf804).
562 0x25c6, 0x2592, 0x2409, 0x240c, 0x240d, 0x240a, 0x00b0, 0x00b1,
563 0x2591, 0x240b, 0x2518, 0x2510, 0x250c, 0x2514, 0x253c, 0xf800,
564 0xf801, 0x2500, 0xf803, 0xf804, 0x251c, 0x2524, 0x2534, 0x252c,
565 0x2502, 0x2264, 0x2265, 0x03c0, 0x2260, 0x00a3, 0x00b7, 0x007f,
// Second version of the final four rows (uses 0x23ba..0x23bd instead).
567 0x2666, 0x2592, 0x2409, 0x240c, 0x240d, 0x240a, 0x00b0, 0x00b1,
568 0x2424, 0x240b, 0x2518, 0x2510, 0x250c, 0x2514, 0x253c, 0x23ba,
569 0x23bb, 0x2500, 0x23bc, 0x23bd, 0x251c, 0x2524, 0x2534, 0x252c,
570 0x2502, 0x2264, 0x2265, 0x03c0, 0x2260, 0x00a3, 0x00b7, 0x0020,
// 256-entry table of 16-bit values, eight per row. Going by the name and the
// values (e.g. 0x263a at index 1, 0x00c7 at index 0x80, 0x00a0 at index
// 0xff), entry i is the Unicode code point for byte i of IBM code page 437,
// with the low control range mapped to the code page's graphical glyphs.
// Indices 0x20..0x7e are identity.
572 unsigned short charset_cp437[256] =
574 0x0000, 0x263a, 0x263b, 0x2665, 0x2666, 0x2663, 0x2660, 0x2022,
575 0x25d8, 0x25cb, 0x25d9, 0x2642, 0x2640, 0x266a, 0x266b, 0x263c,
576 0x25b6, 0x25c0, 0x2195, 0x203c, 0x00b6, 0x00a7, 0x25ac, 0x21a8,
577 0x2191, 0x2193, 0x2192, 0x2190, 0x221f, 0x2194, 0x25b2, 0x25bc,
578 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
579 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
580 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
581 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
582 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
583 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
584 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
585 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
586 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
587 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
588 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
589 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x2302,
590 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7,
591 0x00ea, 0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5,
592 0x00c9, 0x00e6, 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9,
593 0x00ff, 0x00d6, 0x00dc, 0x00a2, 0x00a3, 0x00a5, 0x20a7, 0x0192,
594 0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x00f1, 0x00d1, 0x00aa, 0x00ba,
595 0x00bf, 0x2310, 0x00ac, 0x00bd, 0x00bc, 0x00a1, 0x00ab, 0x00bb,
596 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556,
597 0x2555, 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b, 0x2510,
598 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f,
599 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567,
600 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b,
601 0x256a, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580,
602 0x03b1, 0x00df, 0x0393, 0x03c0, 0x03a3, 0x03c3, 0x00b5, 0x03c4,
603 0x03a6, 0x0398, 0x03a9, 0x03b4, 0x221e, 0x03c6, 0x03b5, 0x2229,
604 0x2261, 0x00b1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00f7, 0x2248,
605 0x00b0, 0x2219, 0x00b7, 0x221a, 0x207f, 0x00b2, 0x25a0, 0x00a0,