|
|
@@ -22,6 +22,24 @@
|
|
|
*/
|
|
|
|
|
|
#include "charset.h"
|
|
|
+#include <assert.h>
|
|
|
+
|
|
|
+
|
|
|
+bool utf8_validate(const char *str, size_t length)
|
|
|
+{
|
|
|
+ const char *s = str;
|
|
|
+ const char *e = str + length;
|
|
|
+ int len;
|
|
|
+
|
|
|
+ for (; s < e; s += len) {
|
|
|
+ len = utf8_validate_char(s, e);
|
|
|
+ if (len == 0)
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+ assert(s == e);
|
|
|
+
|
|
|
+ return true;
|
|
|
+}
|
|
|
|
|
|
/*
|
|
|
* This function implements the syntax given in RFC3629, which is
|
|
|
@@ -37,68 +55,70 @@
|
|
|
* * The sixty-six Unicode "non-characters" are permitted
|
|
|
* (namely, U+FDD0..U+FDEF, U+xxFFFE, and U+xxFFFF).
|
|
|
*/
|
|
|
-bool utf8_validate(const char *str, size_t length)
|
|
|
+int utf8_validate_char(const char *s, const char *e)
|
|
|
{
|
|
|
- const unsigned char *s = (const unsigned char*)str;
|
|
|
- const unsigned char *e = s + length;
|
|
|
+ unsigned char c = *s++;
|
|
|
|
|
|
- while (s < e) {
|
|
|
- unsigned char c = *s++;
|
|
|
- unsigned char c2;
|
|
|
- int len_minus_two;
|
|
|
+ if (c <= 0x7F) { /* 00..7F */
|
|
|
+ return 1;
|
|
|
+ } else if (c <= 0xC1) { /* 80..C1 */
|
|
|
+ /* Disallow overlong 2-byte sequence. */
|
|
|
+ return 0;
|
|
|
+ } else if (c <= 0xDF) { /* C2..DF */
|
|
|
+ /* Make sure the character isn't clipped. */
|
|
|
+ if (e - s < 1)
|
|
|
+ return 0;
|
|
|
|
|
|
- /* Validate the first byte and determine the sequence length. */
|
|
|
- if (c <= 0x7F) /* 00..7F */
|
|
|
- continue;
|
|
|
- else if (c <= 0xC1) /* 80..C1 */
|
|
|
- return false;
|
|
|
- else if (c <= 0xDF) /* C2..DF */
|
|
|
- len_minus_two = 0;
|
|
|
- else if (c <= 0xEF) /* E0..EF */
|
|
|
- len_minus_two = 1;
|
|
|
- else if (c <= 0xF4) /* F0..F4 */
|
|
|
- len_minus_two = 2;
|
|
|
- else
|
|
|
- return false;
|
|
|
+ /* Make sure subsequent byte is in the range 0x80..0xBF. */
|
|
|
+ if (((unsigned char)*s++ & 0xC0) != 0x80)
|
|
|
+ return 0;
|
|
|
|
|
|
+ return 2;
|
|
|
+ } else if (c <= 0xEF) { /* E0..EF */
|
|
|
/* Make sure the character isn't clipped. */
|
|
|
- if (s + len_minus_two >= e)
|
|
|
- return false;
|
|
|
+ if (e - s < 2)
|
|
|
+ return 0;
|
|
|
+
|
|
|
+ /* Disallow overlong 3-byte sequence. */
|
|
|
+ if (c == 0xE0 && (unsigned char)*s < 0xA0)
|
|
|
+ return 0;
|
|
|
+
|
|
|
+ /* Disallow U+D800..U+DFFF. */
|
|
|
+ if (c == 0xED && (unsigned char)*s > 0x9F)
|
|
|
+ return 0;
|
|
|
+
|
|
|
+ /* Make sure subsequent bytes are in the range 0x80..0xBF. */
|
|
|
+ if (((unsigned char)*s++ & 0xC0) != 0x80)
|
|
|
+ return 0;
|
|
|
+ if (((unsigned char)*s++ & 0xC0) != 0x80)
|
|
|
+ return 0;
|
|
|
|
|
|
- c2 = *s;
|
|
|
+ return 3;
|
|
|
+ } else if (c <= 0xF4) { /* F0..F4 */
|
|
|
+ /* Make sure the character isn't clipped. */
|
|
|
+ if (e - s < 3)
|
|
|
+ return 0;
|
|
|
+
|
|
|
+ /* Disallow overlong 4-byte sequence. */
|
|
|
+ if (c == 0xF0 && (unsigned char)*s < 0x90)
|
|
|
+ return 0;
|
|
|
+
|
|
|
+ /* Disallow codepoints beyond U+10FFFF. */
|
|
|
+ if (c == 0xF4 && (unsigned char)*s > 0x8F)
|
|
|
+ return 0;
|
|
|
|
|
|
/* Make sure subsequent bytes are in the range 0x80..0xBF. */
|
|
|
- do {
|
|
|
- if ((*s++ & 0xC0) != 0x80)
|
|
|
- return false;
|
|
|
- } while (len_minus_two--);
|
|
|
+ if (((unsigned char)*s++ & 0xC0) != 0x80)
|
|
|
+ return 0;
|
|
|
+ if (((unsigned char)*s++ & 0xC0) != 0x80)
|
|
|
+ return 0;
|
|
|
+ if (((unsigned char)*s++ & 0xC0) != 0x80)
|
|
|
+ return 0;
|
|
|
|
|
|
- /* Handle special cases. */
|
|
|
- switch (c) {
|
|
|
- case 0xE0:
|
|
|
- /* Disallow overlong 3-byte sequence. */
|
|
|
- if (c2 < 0xA0)
|
|
|
- return false;
|
|
|
- break;
|
|
|
- case 0xED:
|
|
|
- /* Disallow U+D800..U+DFFF. */
|
|
|
- if (c2 > 0x9F)
|
|
|
- return false;
|
|
|
- break;
|
|
|
- case 0xF0:
|
|
|
- /* Disallow overlong 4-byte sequence. */
|
|
|
- if (c2 < 0x90)
|
|
|
- return false;
|
|
|
- break;
|
|
|
- case 0xF4:
|
|
|
- /* Disallow codepoints beyond U+10FFFF. */
|
|
|
- if (c2 > 0x8F)
|
|
|
- return false;
|
|
|
- break;
|
|
|
- }
|
|
|
+ return 4;
|
|
|
+ } else { /* F5..FF */
|
|
|
+ return 0;
|
|
|
}
|
|
|
-
|
|
|
- return true;
|
|
|
}
|
|
|
|
|
|
int utf8_read_char(const char *s, uchar_t *out)
|