Commit 875f49cc authored by Brad King, committed by Kitware Robot

Merge topic 'utf8-fixes'

fb5de060 cm_utf8: reject codepoints above 0x10FFFF
a11e5e02 cm_utf8: reject UTF-16 surrogate half codepoints
7111873e testUTF8: add more test cases
bba2b1c8 testUTF8: comment why sequences are invalid
Acked-by: Kitware Robot <kwrobot@kitware.com>
Merge-request: !3097
parents f6cbb02b fb5de060
......@@ -71,6 +71,16 @@ const char* cm_utf8_decode_character(const char* first, const char* last,
return 0;
}
/* UTF-16 surrogate halves. */
if (0xD800 <= uc && uc <= 0xDFFF) {
return 0;
}
/* Invalid codepoints. */
if (0x10FFFF < uc) {
return 0;
}
*pc = uc;
return first;
}
......
......@@ -21,17 +21,29 @@ struct test_utf8_entry
};
static test_utf8_entry const good_entry[] = {
{ 1, "\x20\x00\x00\x00", 0x0020 }, /* Space. */
{ 2, "\xC2\xA9\x00\x00", 0x00A9 }, /* Copyright. */
{ 3, "\xE2\x80\x98\x00", 0x2018 }, /* Open-single-quote. */
{ 3, "\xE2\x80\x99\x00", 0x2019 }, /* Close-single-quote. */
{ 4, "\xF0\xA3\x8E\xB4", 0x233B4 }, /* Example from RFC 3629. */
{ 1, "\x20\x00\x00\x00", 0x0020 }, /* Space. */
{ 2, "\xC2\xA9\x00\x00", 0x00A9 }, /* Copyright. */
{ 3, "\xE2\x80\x98\x00", 0x2018 }, /* Open-single-quote. */
{ 3, "\xE2\x80\x99\x00", 0x2019 }, /* Close-single-quote. */
{ 4, "\xF0\xA3\x8E\xB4", 0x233B4 }, /* Example from RFC 3629. */
{ 3, "\xED\x80\x80\x00", 0xD000 }, /* Valid 0xED prefixed codepoint. */
{ 4, "\xF4\x8F\xBF\xBF", 0x10FFFF }, /* Highest valid RFC codepoint. */
{ 0, { 0, 0, 0, 0, 0 }, 0 }
};
static test_utf8_char const bad_chars[] = {
"\x80\x00\x00\x00", "\xC0\x00\x00\x00", "\xE0\x00\x00\x00",
"\xE0\x80\x80\x00", "\xF0\x80\x80\x80", { 0, 0, 0, 0, 0 }
"\x80\x00\x00\x00", /* Leading continuation byte. */
"\xC0\x80\x00\x00", /* Overlong encoding. */
"\xC1\x80\x00\x00", /* Overlong encoding. */
"\xC2\x00\x00\x00", /* Missing continuation byte. */
"\xE0\x00\x00\x00", /* Missing continuation bytes. */
"\xE0\x80\x80\x00", /* Overlong encoding. */
"\xF0\x80\x80\x80", /* Overlong encoding. */
"\xED\xA0\x80\x00", /* UTF-16 surrogate half. */
"\xED\xBF\xBF\x00", /* UTF-16 surrogate half. */
"\xF4\x90\x80\x80", /* Lowest out-of-range codepoint. */
"\xF5\x80\x80\x80", /* Prefix forces out-of-range codepoints. */
{ 0, 0, 0, 0, 0 }
};
static void report_good(bool passed, test_utf8_char const c)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment