Browse Source

charset: Rewrote utf8_validate, and added four new functions:

 * utf8_read_char
 * utf8_write_char
 * from_surrogate_pair
 * to_surrogate_pair
Joey Adams 14 years ago
parent
commit
06c4af3163

+ 135 - 24
ccan/charset/_info

@@ -5,40 +5,151 @@
 /**
  * charset - character set conversion and validation routines
  *
- * This module provides a collection (well, only one, at the moment) of
- * well-tested routines for dealing with character set nonsense.
- *
- * Validation functions:
- *  - bool utf8_validate(const char *str, size_t length);
+ * This module provides a collection of well-tested routines
+ * for dealing with character set nonsense.
  *
  * Example:
  *	#include <err.h>
  *	#include <stdio.h>
+ *	#include <stdlib.h>
  *	#include <string.h>
  *	#include <ccan/charset/charset.h>
  *	#include <ccan/grab_file/grab_file.h>
- *	#include <ccan/talloc/talloc.h>	// For talloc_free()
- *
- *	int main(int argc, char *argv[])
+ *	#include <ccan/talloc/talloc.h>
+ *	
+ *	static void print_json_string(const char *s);
+ *	static bool parse_hex16(const char **sp, unsigned int *out);
+ *	
+ *	// Take a JSON-encoded string on input and print its literal value.
+ *	int main(void)
  *	{
- *		size_t len;
- *		char *file;
- *		bool valid;
- *
- *		if (argc != 2)
- *			err(1, "Expected exactly one argument");
- *
- *		file = grab_file(NULL, argv[1], &len);
- *		if (!file)
- *			err(1, "Could not read file %s", argv[1]);
- *
- *		valid = utf8_validate(file, len);
- *		printf("File contents are %s UTF-8\n", valid ? "valid" : "invalid");
- *
- *		talloc_free(file);
- *
+ *		char *input;
+ *		size_t length;
+ *	
+ *		input = grab_file(NULL, NULL, &length);
+ *		if (!input)
+ *			err(1, "Error reading input");
+ *		if (!utf8_validate(input, length)) {
+ *			fprintf(stderr, "Input contains invalid UTF-8\n");
+ *			return 1;
+ *		}
+ *		if (strlen(input) != length) {
+ *			fprintf(stderr, "Input contains null characters\n");
+ *			return 1;
+ *		}
+ *		
+ *		print_json_string(input);
+ *		
+ *		talloc_free(input);
  *		return 0;
  *	}
+ *	
+ *	static void print_json_string(const char *s)
+ *	{
+ *		char output_buffer[4];
+ *		
+ *		// Skip leading whitespace
+ *		while (*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r')
+ *			s++;
+ *		
+ *		if (*s++ != '"') {
+ *			fprintf(stderr, "Expected JSON string literal surrounded by double quotes.\n");
+ *			exit(EXIT_FAILURE);
+ *		}
+ *		
+ *		while (*s != '"') {
+ *			unsigned char c = *s++;
+ *			char *b = output_buffer;
+ *			
+ *			if (c == '\\') {
+ *				c = *s++;
+ *				switch (c) {
+ *					case '"':
+ *					case '\\':
+ *					case '/':
+ *						*b++ = c;
+ *						break;
+ *					case 'b': *b++ = '\b'; break;
+ *					case 'f': *b++ = '\f'; break;
+ *					case 'n': *b++ = '\n'; break;
+ *					case 'r': *b++ = '\r'; break;
+ *					case 't': *b++ = '\t'; break;
+ *					case 'u': {
+ *						unsigned int uc, lc;
+ *						
+ *						if (!parse_hex16(&s, &uc))
+ *							goto syntax_error;
+ *						
+ *						if (uc >= 0xD800 && uc <= 0xDFFF) {
+ *							// Handle UTF-16 surrogate pair (e.g. "\uD834\uDD1E").
+ *							uchar_t unicode;
+ *							
+ *							if (*s++ != '\\' || *s++ != 'u' || !parse_hex16(&s, &lc))
+ *								goto syntax_error;
+ *							
+ *							unicode = from_surrogate_pair(uc, lc);
+ *							if (unicode == REPLACEMENT_CHARACTER) {
+ *								fprintf(stderr, "Invalid surrogate pair.\n");
+ *								exit(EXIT_FAILURE);
+ *							}
+ *							
+ *							b += utf8_write_char(unicode, b);
+ *						} else {
+ *							// Handle ordinary Unicode escape (e.g. "\u266B").
+ *							b += utf8_write_char(uc, b);
+ *						}
+ *						
+ *						break;
+ *					}
+ *					default:
+ *						goto syntax_error;
+ *				}
+ *			} else if (c <= 0x1F) {
+ *				// Control characters are not allowed in string literals.
+ *				goto syntax_error;
+ *			} else {
+ *				*b++ = c;
+ *			}
+ *			
+ *			fwrite(output_buffer, 1, b - output_buffer, stdout);
+ *		}
+ *		
+ *		putchar('\n');
+ *		return;
+ *		
+ *	syntax_error:
+ *		fprintf(stderr, "Syntax error in JSON string literal.\n");
+ *		exit(EXIT_FAILURE);
+ *	}
+ *	
+ *	static bool parse_hex16(const char **sp, unsigned int *out)
+ *	{
+ *		const char *s = *sp;
+ *		unsigned int ret = 0;
+ *		unsigned int i;
+ *		unsigned int tmp;
+ *		char		c;
+ *	
+ *		for (i = 0; i < 4; i++)
+ *		{
+ *			c = *s++;
+ *			if (c >= '0' && c <= '9')
+ *				tmp = c - '0';
+ *			else if (c >= 'A' && c <= 'F')
+ *				tmp = c - 'A' + 10;
+ *			else if (c >= 'a' && c <= 'f')
+ *				tmp = c - 'a' + 10;
+ *			else
+ *				return false;
+ *	
+ *			ret <<= 4;
+ *			ret += tmp;
+ *		}
+ *		
+ *		*out = ret;
+ *		*sp = s;
+ *		return true;
+ *	}
  *
  * Author: Joey Adams
  * License: MIT

+ 143 - 55
ccan/charset/charset.c

@@ -23,8 +23,20 @@
 
 #include "charset.h"
 
-bool utf8_allow_surrogates = false;
-
+/*
+ * This function implements the syntax given in RFC3629, which is
+ * the same as that given in The Unicode Standard, Version 6.0.
+ *
+ * It has the following properties:
+ *
+ *  * All codepoints U+0000..U+10FFFF may be encoded,
+ *    except for U+D800..U+DFFF, which are reserved
+ *    for UTF-16 surrogate pair encoding.
+ *  * UTF-8 byte sequences longer than 4 bytes are not permitted,
+ *    as they exceed the range of Unicode.
+ *  * The sixty-six Unicode "non-characters" are permitted
+ *    (namely, U+FDD0..U+FDEF, U+xxFFFE, and U+xxFFFF).
+ */
 bool utf8_validate(const char *str, size_t length)
 {
 	const unsigned char *s = (const unsigned char*)str;
@@ -32,69 +44,145 @@ bool utf8_validate(const char *str, size_t length)
 	
 	while (s < e) {
 		unsigned char c = *s++;
-		unsigned int len; /* number of bytes in sequence - 2 */
+		unsigned char c2;
+		int len_minus_two;
 		
-		/* If character is ASCII, move on. */
-		if (c < 0x80)
+		/* Validate the first byte and determine the sequence length. */
+		if (c <= 0x7F)          /* 00..7F */
 			continue;
+		else if (c <= 0xC1)     /* 80..C1 */
+			return false;
+		else if (c <= 0xDF)     /* C2..DF */
+			len_minus_two = 0;
+		else if (c <= 0xEF)     /* E0..EF */
+			len_minus_two = 1;
+		else if (c <= 0xF4)     /* F0..F4 */
+			len_minus_two = 2;
+		else
+			return false;
 		
-		if (s >= e)
-			return false; /* Missing bytes in sequence. */
-		
-		if (c < 0xE0) {
-			/* 2-byte sequence, U+0080 to U+07FF
-			   c must be 11000010 or higher
-			   s[0] must be 10xxxxxx */
-			len = 0;
-			if (c < 0xC2)
-				return false;
-		} else if (c < 0xF0) {
-			/* 3-byte sequence, U+0800 to U+FFFF
-			   Note that the surrogate range is U+D800 to U+DFFF,
-				  and that U+FFFE and U+FFFF are illegal characters.
-			   c must be >= 11100000 (which it is)
-			   If c is 11100000, then s[0] must be >= 10100000
-			   If the global parameter utf8_allow_surrogates is false:
-			      If c is 11101101 and s[0] is >= 10100000,
-			         then this is a surrogate and we should fail.
-			   If c is 11101111, s[0] is 10111111, and s[1] >= 10111110,
-				  then this is an illegal character and we should fail.
-			   s[0] and s[1] must be 10xxxxxx */
-			len = 1;
-			if (c == 0xE0 && *s < 0xA0)
-				return false;
-			if (!utf8_allow_surrogates && c == 0xED && *s >= 0xA0)
-				return false;
-			if (c == 0xEF && s[0] == 0xBF && (s+1 >= e || s[1] >= 0xBE))
-				return false;
-		} else {
-			/* 4-byte sequence, U+010000 to U+10FFFF
-			   c must be >= 11110000 (which it is) and <= 11110100
-			   If c is 11110000, then s[0] must be >= 10010000
-			   If c is 11110100, then s[0] must be < 10010000
-			   s[0], s[1], and s[2] must be 10xxxxxx */
-			len = 2;
-			if (c > 0xF4)
-				return false;
-			if (c == 0xF0 && *s < 0x90)
-				return false;
-			if (c == 0xF4 && *s >= 0x90)
-				return false;
-		}
+		/* Make sure the character isn't clipped. */
+		if (s + len_minus_two >= e)
+			return false;
 		
-		if (s + len >= e)
-			return false; /* Missing bytes in sequence. */
+		c2 = *s;
 		
+		/* Make sure subsequent bytes are in the range 0x80..0xBF. */
 		do {
 			if ((*s++ & 0xC0) != 0x80)
 				return false;
-		} while (len--);
+		} while (len_minus_two--);
+		
+		/* Handle special cases. */
+		switch (c) {
+			case 0xE0:
+				/* Disallow overlong 3-byte sequence. */
+				if (c2 < 0xA0)
+					return false;
+				break;
+			case 0xED:
+				/* Disallow U+D800..U+DFFF. */
+				if (c2 > 0x9F)
+					return false;
+				break;
+			case 0xF0:
+				/* Disallow overlong 4-byte sequence. */
+				if (c2 < 0x90)
+					return false;
+				break;
+			case 0xF4:
+				/* Disallow codepoints beyond U+10FFFF. */
+				if (c2 > 0x8F)
+					return false;
+				break;
+		}
 	}
 	
 	return true;
 }
 
-/*
-  Note to future contributors: These routines are currently all under the
-    MIT license.  It would be nice to keep it that way :)
-*/
+int utf8_read_char(const char *s, uchar_t *out)
+{
+	const unsigned char *c = (const unsigned char*) s;
+
+	if (c[0] <= 0x7F) {
+		/* 00..7F */
+		*out = c[0];
+		return 1;
+	} else if (c[0] <= 0xDF) {
+		/* C2..DF (unless input is invalid) */
+		*out = ((uchar_t)c[0] & 0x1F) << 6 |
+		       ((uchar_t)c[1] & 0x3F);
+		return 2;
+	} else if (c[0] <= 0xEF) {
+		/* E0..EF */
+		*out = ((uchar_t)c[0] &  0xF) << 12 |
+		       ((uchar_t)c[1] & 0x3F) << 6  |
+		       ((uchar_t)c[2] & 0x3F);
+		return 3;
+	} else {
+		/* F0..F4 (unless input is invalid) */
+		*out = ((uchar_t)c[0] &  0x7) << 18 |
+		       ((uchar_t)c[1] & 0x3F) << 12 |
+		       ((uchar_t)c[2] & 0x3F) << 6  |
+		       ((uchar_t)c[3] & 0x3F);
+		return 4;
+	}
+}
+
+int utf8_write_char(uchar_t unicode, char *out)
+{
+	unsigned char *o = (unsigned char*) out;
+
+	if (unicode <= 0x7F) {
+		/* U+0000..U+007F */
+		*o++ = unicode;
+		return 1;
+	} else if (unicode <= 0x7FF) {
+		/* U+0080..U+07FF */
+		*o++ = 0xC0 | unicode >> 6;
+		*o++ = 0x80 | (unicode & 0x3F);
+		return 2;
+	} else if (unicode <= 0xFFFF) {
+		/* U+0800..U+FFFF */
+		if (unicode >= 0xD800 && unicode <= 0xDFFF)
+			unicode = REPLACEMENT_CHARACTER;
+	three_byte_character:
+		*o++ = 0xE0 | unicode >> 12;
+		*o++ = 0x80 | (unicode >> 6 & 0x3F);
+		*o++ = 0x80 | (unicode & 0x3F);
+		return 3;
+	} else if (unicode <= 0x10FFFF) {
+		/* U+10000..U+10FFFF */
+		*o++ = 0xF0 | unicode >> 18;
+		*o++ = 0x80 | (unicode >> 12 & 0x3F);
+		*o++ = 0x80 | (unicode >> 6 & 0x3F);
+		*o++ = 0x80 | (unicode & 0x3F);
+		return 4;
+	} else {
+		/* U+110000... */
+		unicode = REPLACEMENT_CHARACTER;
+		goto three_byte_character;
+	}
+}
+
+uchar_t from_surrogate_pair(unsigned int uc, unsigned int lc)
+{
+	if (uc >= 0xD800 && uc <= 0xDBFF && lc >= 0xDC00 && lc <= 0xDFFF)
+		return 0x10000 + ((((uchar_t)uc & 0x3FF) << 10) | (lc & 0x3FF));
+	else
+		return REPLACEMENT_CHARACTER;
+}
+
+bool to_surrogate_pair(uchar_t unicode, unsigned int *uc, unsigned int *lc)
+{
+	if (unicode >= 0x10000 && unicode <= 0x10FFFF) {
+		uchar_t n = unicode - 0x10000;
+		*uc = ((n >> 10) & 0x3FF) | 0xD800;
+		*lc = (n & 0x3FF) | 0xDC00;
+		return true;
+	} else {
+		*uc = *lc = REPLACEMENT_CHARACTER;
+		return false;
+	}
+}

+ 47 - 9
ccan/charset/charset.h

@@ -26,19 +26,57 @@
 
 #include <stdbool.h>
 #include <stddef.h>
+#include <stdint.h>
+
+#define REPLACEMENT_CHARACTER 0xFFFD
 
 /*
- * Validate the given UTF-8 string.  If it contains '\0' characters,
- * it is still valid.
- *
- * By default, Unicode characters U+D800 thru U+DFFF will be considered
- * invalid UTF-8.  However, if you set utf8_allow_surrogates to true,
- * they will be allowed.  Allowing the surrogate range makes it possible
- * to losslessly encode malformed UTF-16.
+ * Type for Unicode codepoints.
+ * We need our own because wchar_t might be 16 bits.
+ */
+typedef uint32_t uchar_t;
+
+/*
+ * Validate the given UTF-8 string.
+ * If it contains '\0' characters, it is still valid.
  */
 bool utf8_validate(const char *str, size_t length);
 
-/* Default: false */
-extern bool utf8_allow_surrogates;
+/*
+ * Read a single UTF-8 character starting at @s,
+ * returning the length, in bytes, of the character read.
+ *
+ * This function assumes input is valid UTF-8, and that the complete
+ * character (up to 4 bytes) is readable starting at @s.
+ */
+int utf8_read_char(const char *s, uchar_t *out);
+
+/*
+ * Write a single UTF-8 character to @out,
+ * returning the length, in bytes, of the character written.
+ *
+ * @unicode should be U+0000..U+10FFFF, but not U+D800..U+DFFF.
+ * If @unicode is invalid, REPLACEMENT_CHARACTER will be emitted instead.
+ *
+ * This function will write up to 4 bytes to @out.
+ */
+int utf8_write_char(uchar_t unicode, char *out);
+
+/*
+ * Compute the Unicode codepoint of a UTF-16 surrogate pair.
+ *
+ * @uc should be 0xD800..0xDBFF, and @lc should be 0xDC00..0xDFFF.
+ * If they aren't, this function returns REPLACEMENT_CHARACTER.
+ */
+uchar_t from_surrogate_pair(unsigned int uc, unsigned int lc);
+
+/*
+ * Construct a UTF-16 surrogate pair given a Unicode codepoint.
+ *
+ * @unicode should be U+10000..U+10FFFF.
+ * If it's not, this function returns false,
+ * and sets *uc and *lc to REPLACEMENT_CHARACTER.
+ */
+bool to_surrogate_pair(uchar_t unicode, unsigned int *uc, unsigned int *lc);
 
 #endif

+ 27 - 0
ccan/charset/test/common.h

@@ -0,0 +1,27 @@
+#include <stdint.h>
+#include <stdlib.h>
+
+/*
+ * Returns a pseudorandom 32-bit number from 0 to 2^32-1.
+ * Uses the BCPL linear congruential generator method.
+ *
+ * Used instead of system RNG to ensure tests are consistent.
+ */
+static uint32_t rand32(void)
+{
+#if 0
+	/*
+	 * Tests should be run with a different random function
+	 * from time to time.  I've found that the method below
+	 * sometimes behaves poorly for testing purposes.
+	 * For example, rand32() % N might only return even numbers.
+	 */
+	assert(RAND_MAX == 2147483647);
+	return ((random() & 0xFFFF) << 16) | (random() & 0xFFFF);
+#else
+	static uint32_t rand32_state = 0;
+	rand32_state *= (uint32_t)0x7FF8A3ED;
+	rand32_state += (uint32_t)0x2AA01D31;
+	return rand32_state;
+#endif
+}

+ 135 - 0
ccan/charset/test/run-surrogate-pair.c

@@ -0,0 +1,135 @@
+#include <ccan/charset/charset.c>
+#include <ccan/tap/tap.h>
+
+#include <string.h>
+
+#include "common.h"
+
+/*
+ * Testing procedure for from_surrogate_pair and to_surrogate_pair:
+ *
+ *  * For each Unicode code point from 0x10000 to 0x10FFFF:
+ *    - Call to_surrogate_pair, and make sure that:
+ *      - It returns true.
+ *      - uc is 0xD800..0xDBFF
+ *      - lc is 0xDC00..0xDFFF
+ *    - Call from_surrogate_pair on the pair, and make sure that
+ *      it returns the original character.
+ *  * For various invalid arguments to to_surrogate_pair
+ *    (U+0000..U+FFFF and U+110000...):
+ *    - Call to_surrogate_pair, and make sure it:
+ *      - Returns false.
+ *      - Sets *uc and *lc to REPLACEMENT_CHARACTER.
+ *  * For various invalid arguments to from_surrogate_pair
+ *    (uc: not 0xD800..0xDBFF, lc: not 0xDC00..0xDFFF):
+ *    - Call from_surrogate_pair, and make sure
+ *      it returns REPLACEMENT_CHARACTER.
+ */
+
+#define INVALID_TRIAL_COUNT     10000
+
+#define range(r, lo, hi)  ((r) % ((hi)-(lo)+1) + (lo))
+
+static void test_valid(void)
+{
+	uchar_t unicode;
+	unsigned int uc, lc;
+	
+	for (unicode = 0x10000; unicode <= 0x10FFFF; unicode++) {
+		if (to_surrogate_pair(unicode, &uc, &lc) != true) {
+			fail("to_surrogate_pair did not return true on valid input.");
+			return;
+		}
+		if (!(uc >= 0xD800 && uc <= 0xDBFF)) {
+			fail("to_surrogate_pair: uc is out of range");
+			return;
+		}
+		if (!(lc >= 0xDC00 && lc <= 0xDFFF)) {
+			fail("to_surrogate_pair: lc is out of range");
+			return;
+		}
+		if (from_surrogate_pair(uc, lc) != unicode) {
+			fail("Surrogate pair conversion did not preserve original value (U+%04lX).", (unsigned long)unicode);
+			return;
+		}
+	}
+	
+	pass("to_surrogate_pair and from_surrogate_pair work for all valid arguments.");
+}
+
+static void test_invalid_to_surrogate_pair(void)
+{
+	long i;
+	uchar_t unicode;
+	unsigned int uc, lc;
+	
+	for (i = 1; i <= INVALID_TRIAL_COUNT; i++) {
+		if (rand32() % 2) {
+			unicode = range(rand32(), 0x0, 0xFFFF);
+		} else {
+			do {
+				unicode = rand32();
+			} while (unicode < 0x110000);
+		}
+		
+		if (to_surrogate_pair(unicode, &uc, &lc) != false) {
+			fail("to_surrogate_pair did not return false on invalid input.");
+			return;
+		}
+		if (uc != REPLACEMENT_CHARACTER || lc != REPLACEMENT_CHARACTER) {
+			fail("to_surrogate_pair did not set uc and lc to the replacement character on invalid input.");
+			return;
+		}
+	}
+	
+	pass("to_surrogate_pair seems to handle invalid argument values properly.");
+}
+
+static void test_invalid_from_surrogate_pair(void)
+{
+	long i;
+	unsigned int uc, lc;
+	
+	for (i = 1; i <= INVALID_TRIAL_COUNT; i++) {
+		switch (rand32() % 3) {
+			case 0:
+				uc = range(rand32(), 0x0, 0xD7FF);
+				break;
+			case 1:
+				uc = range(rand32(), 0xDC00, 0xDFFF);
+				break;
+			default:
+				uc = range(rand32(), 0xE000, 0xFFFF);
+				break;
+		}
+		switch (rand32() % 3) {
+			case 0:
+				lc = range(rand32(), 0x0, 0xD7FF);
+				break;
+			case 1:
+				lc = range(rand32(), 0xD800, 0xDBFF);
+				break;
+			default:
+				lc = range(rand32(), 0xE000, 0xFFFF);
+				break;
+		}
+		
+		if (from_surrogate_pair(uc, lc) != REPLACEMENT_CHARACTER) {
+			fail("from_surrogate_pair(0x%04X, 0x%04X) did not return the replacement character", uc, lc);
+			return;
+		}
+	}
+	
+	pass("from_surrogate_pair seems to handle invalid arguments properly.");
+}
+
+int main(void)
+{
+	plan_tests(3);
+	
+	test_valid();
+	test_invalid_to_surrogate_pair();
+	test_invalid_from_surrogate_pair();
+	
+	return exit_status();
+}

+ 150 - 0
ccan/charset/test/run-utf8-read-write.c

@@ -0,0 +1,150 @@
+#include <ccan/charset/charset.c>
+#include <ccan/tap/tap.h>
+
+#include <string.h>
+
+#include "common.h"
+
+/*
+ * Testing procedure for utf8_read_char and utf8_write_char:
+ *
+ *  * Generate N valid and invalid Unicode code points.
+ *  * Encode them with utf8_write_char.
+ *  * Copy the resulting string into a buffer sized exactly as big as
+ *    the string produced.  This way, Valgrind can catch buffer overflows
+ *    by utf8_validate and utf8_read_char.
+ *  * Validate the string with utf8_validate.
+ *  * Decode the string, ensuring that:
+ *    - Valid codepoints are read back.
+ *    - Invalid characters are read back, but replaced
+ *      with REPLACEMENT_CHARACTER.
+ *    - No extra characters are read back.
+ */
+
+#define TRIAL_COUNT             1000
+#define MAX_CHARS_PER_TRIAL     100
+
+#define range(r, lo, hi)  ((r) % ((hi)-(lo)+1) + (lo))
+
+int main(void)
+{
+	int trial;
+	
+	plan_tests(TRIAL_COUNT);
+	
+	for (trial = 1; trial <= TRIAL_COUNT; trial++) {
+		int i, count;
+		uchar_t codepoints[MAX_CHARS_PER_TRIAL];
+		uchar_t c;
+		bool c_valid;
+		
+		char write_buffer[MAX_CHARS_PER_TRIAL * 4];
+		char *o = write_buffer;
+		char *oe = write_buffer + sizeof(write_buffer);
+		
+		char *string;
+		const char *s;
+		const char *e;
+		
+		int len;
+		
+		count = rand32() % MAX_CHARS_PER_TRIAL + 1;
+		
+		for (i = 0; i < count; i++) {
+			if (o >= oe) {
+				fail("utf8_write_char: Buffer overflow (1)");
+				goto next_trial;
+			}
+			
+			switch (rand32() % 7) {
+				case 0:
+					c = range(rand32(), 0x0, 0x7F);
+					c_valid = true;
+					break;
+				case 1:
+					c = range(rand32(), 0x80, 0x7FF);
+					c_valid = true;
+					break;
+				case 2:
+					c = range(rand32(), 0x800, 0xD7FF);
+					c_valid = true;
+					break;
+				case 3:
+					c = range(rand32(), 0xD800, 0xDFFF);
+					c_valid = false;
+					break;
+				case 4:
+					c = range(rand32(), 0xE000, 0xFFFF);
+					c_valid = true;
+					break;
+				case 5:
+					c = range(rand32(), 0x10000, 0x10FFFF);
+					c_valid = true;
+					break;
+				default:
+					do {
+						c = rand32();
+					} while (c < 0x110000);
+					c_valid = false;
+					break;
+			}
+			
+			codepoints[i] = c_valid ? c : REPLACEMENT_CHARACTER;
+			
+			len = utf8_write_char(c, o);
+			if (len < 1 || len > 4) {
+				fail("utf8_write_char: Return value is not 1 thru 4.");
+				goto next_trial;
+			}
+			o += len;
+		}
+		if (o > oe) {
+			fail("utf8_write_char: Buffer overflow (2)");
+			goto next_trial;
+		}
+		
+		string = malloc(o - write_buffer);
+		memcpy(string, write_buffer, o - write_buffer);
+		s = string;
+		e = string + (o - write_buffer);
+		
+		if (!utf8_validate(s, e - s)) {
+			fail("Invalid string produced by utf8_write_char.");
+			goto next_trial_free_string;
+		}
+		
+		for (i = 0; i < count; i++) {
+			if (s >= e) {
+				fail("utf8_read_char: Buffer overflow (1)");
+				goto next_trial_free_string;
+			}
+			
+			len = utf8_read_char(s, &c);
+			if (len < 1 || len > 4) {
+				fail("utf8_read_char: Return value is not 1 thru 4.");
+				goto next_trial_free_string;
+			}
+			if (c != codepoints[i]) {
+				fail("utf8_read_char: Character read differs from that written.");
+				goto next_trial_free_string;
+			}
+			s += len;
+		}
+		if (s > e) {
+			fail("utf8_read_char: Buffer overflow (2)");
+			goto next_trial_free_string;
+		}
+		if (s < e) {
+			fail("utf8_read_char: Did not reach end of string.");
+			goto next_trial_free_string;
+		}
+		
+		pass("Trial %d: %d characters", trial, count);
+		
+	next_trial_free_string:
+		free(string);
+	next_trial:;
+	}
+	
+	return exit_status();
+}

+ 256 - 0
ccan/charset/test/run-utf8_validate.c

@@ -0,0 +1,256 @@
+#include <ccan/charset/charset.c>
+#include <ccan/tap/tap.h>
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "common.h"
+
+/* Make a valid or invalid Unicode character fitting in exactly @len UTF-8 bytes. */
+static uchar_t utf8_randcode(int len, bool valid, bool after_clipped)
+{
+	uint32_t r = rand32();
+	uchar_t ret;
+	
+	#define range(lo, hi)  ((r & 0x7FFFFFFF) % ((hi)-(lo)+1) + (lo))
+	#define high_bit_set() (!!(r & 0x80000000))
+	
+	switch (len) {
+		case 1:
+			if (valid) {
+				/* Generate a character U+0000..U+007F */
+				return r & 0x7F;
+			} else {
+				/*
+				 * Generate a raw byte 0x80..0xBF or 0xF8..0xFF.
+				 *
+				 * However, don't generate 0x80..0xBF (10xxxxxx) after a
+				 * clipped character, as that can inadvertently form a valid,
+				 * complete character.
+				 */
+				if (!after_clipped && high_bit_set())
+					return range(0x80, 0xBF);
+				else
+					return range(0xF8, 0xFF);
+			}
+		case 2:
+			if (valid) {
+				/* Generate a character U+0080..U+07FF */
+				return range(0x80, 0x7FF);
+			} else {
+				/* Generate a character U+0000..U+007F */
+				return r & 0x7F;
+			}
+		case 3:
+			if (valid) {
+				/* Generate a character U+0800..U+FFFF, but not U+D800..U+DFFF */
+				for (;;) {
+					ret = range(0x800, 0xFFFF);
+					if (ret >= 0xD800 && ret <= 0xDFFF) {
+						r = rand32();
+						continue;
+					} else {
+						break;
+					}
+				}
+				return ret;
+			} else {
+				/* Generate a character U+0000..U+07FF or U+D800..U+DFFF */
+				if (high_bit_set())
+					return r & 0x7FF;
+				else
+					return 0xD800 + (r & 0x7FF);
+			}
+		case 4:
+			if (valid) {
+				/* Generate a character U+10000..U+10FFFF */
+				return range(0x10000, 0x10FFFF);
+			} else {
+				/* Generate a character U+0000..U+FFFF or U+110000..U+1FFFFF */
+				if (high_bit_set())
+					return r & 0xFFFF;
+				else
+					return range(0x110000, 0x1FFFFF);
+			}
+		default:
+			assert(false);
+	}
+	
+	#undef range
+	#undef high_bit_set
+}
+
+/* Encode @uc as UTF-8 using exactly @len bytes.
+   @len should be 1 thru 4. */
+static void utf8_encode_raw(char *out, unsigned int uc, int len)
+{
+	switch (len) {
+		case 1:
+			assert(uc <= 0xC1 || (uc >= 0xF8 && uc <= 0xFF));
+			*out++ = uc;
+			break;
+		case 2:
+			assert(uc <= 0x7FF);
+			*out++ = 0xC0 | ((uc >> 6) & 0x1F);
+			*out++ = 0x80 | (uc & 0x3F);
+			break;
+		case 3:
+			assert(uc <= 0xFFFF);
+			*out++ = 0xE0 | ((uc >> 12) & 0x0F);
+			*out++ = 0x80 | ((uc >> 6) & 0x3F);
+			*out++ = 0x80 | (uc & 0x3F);
+			break;
+		case 4:
+			assert(uc <= 0x1FFFFF);
+			*out++ = 0xF0 | ((uc >> 18) & 0x07);
+			*out++ = 0x80 | ((uc >> 12) & 0x3F);
+			*out++ = 0x80 | ((uc >> 6) & 0x3F);
+			*out++ = 0x80 | (uc & 0x3F);
+			break;
+	}
+}
+
+#if COMPUTE_AVERAGE_LENGTH
+double total_averages;
+#endif
+
+/* Generate a UTF-8 string of the given byte length,
+   randomly deciding if it should be valid or not.
+   
+   Return true if it's valid, false if it's not. */
+static bool utf8_mktest(char *out, int len)
+{
+	double pf;
+	uint32_t pu;
+	int n;
+	bool valid = true;
+	bool v;
+	bool after_clipped = false;
+	
+	#if COMPUTE_AVERAGE_LENGTH
+	int n_total = 0;
+	int count = 0;
+	#endif
+	
+	/*
+	 * Probability that, per character, it should be valid.
+	 * The goal is to make utf8_mktest as a whole
+	 * have a 50% chance of generating a valid string.
+	 *
+	 * The equation being solved is:
+	 *
+	 *     p^n = 0.5
+	 *
+	 * where p is the probability that each character is valid,
+	 * and n is the number of characters in the string.
+	 *
+	 * 2.384 is the approximate average length of each character,
+	 * so len/2.384 is about how many characters this string
+	 * is expected to contain.
+	 */
+	pf = pow(0.5, 2.384/len);
+	
+	/* Convert to uint32_t to test against rand32. */
+	pu = pf * 4294967295.0;
+	
+	for (;len > 0; len -= n, out += n) {
+		v = rand32() <= pu;
+		
+		if (v) {
+			/* Generate a valid character. */
+			n = rand32() % (len < 4 ? len : 4) + 1;
+			utf8_encode_raw(out, utf8_randcode(n, true, after_clipped), n);
+			after_clipped = false;
+		} else if (rand32() % 5) {
+			/* Generate an invalid character. */
+			n = rand32() % (len < 4 ? len : 4) + 1;
+			utf8_encode_raw(out, utf8_randcode(n, false, after_clipped), n);
+			after_clipped = false;
+		} else {
+			/* Generate a clipped but otherwise valid character. */
+			char tmp[4];
+			n = rand32() % 3 + 2;
+			utf8_encode_raw(tmp, utf8_randcode(n, true, after_clipped), n);
+			n -= rand32() % (n-1) + 1;
+			if (n > len)
+				n = len;
+			assert(n >= 1 && n <= 3);
+			memcpy(out, tmp, n);
+			after_clipped = true;
+		}
+		
+		if (!v)
+			valid = false;
+		
+		#if COMPUTE_AVERAGE_LENGTH
+		n_total += n;
+		count++;
+		#endif
+	}
+	
+	#if COMPUTE_AVERAGE_LENGTH
+	if (count > 0)
+		total_averages += (double)n_total / count;
+	#endif
+	
+	return valid;
+}
+
+static void test_utf8_validate(void)
+{
+	char buffer[128];
+	int i;
+	int len;
+	bool valid;
+	int passed=0, p_valid=0, p_invalid=0, total=0;
+	int count;
+	
+	count = 100000;
+	
+	#if COMPUTE_AVERAGE_LENGTH
+	total_averages = 0.0;
+	#endif
+	
+	for (i=0; i<count; i++) {
+		len = rand32() % (sizeof(buffer) + 1);
+		valid = utf8_mktest(buffer, len);
+		if (utf8_validate(buffer, len) == valid) {
+			passed++;
+			if (valid)
+				p_valid++;
+			else
+				p_invalid++;
+		} else {
+			bool uvalid = utf8_validate(buffer, len);
+			printf("Failed: generated %s string, but utf8_validate returned %s\n",
+			       valid ? "valid" : "invalid",
+			       uvalid ? "true" : "false");
+		}
+		total++;
+	}
+	
+	if (passed == total)
+		pass("%d valid tests, %d invalid tests", p_valid, p_invalid);
+	else
+		fail("Passed only %d out of %d tests\n", passed, total);
+	
+	ok(p_valid > count/10 && p_invalid > count/10,
+	   "Valid and invalid should be balanced");
+	
+	#if COMPUTE_AVERAGE_LENGTH
+	printf("Average character length: %f\n", total_averages / count);
+	#endif
+}
+
+int main(void)
+{
+	/* This is how many tests you plan to run */
+	plan_tests(2);
+	
+	test_utf8_validate();
+
+	/* This exits depending on whether all tests passed */
+	return exit_status();
+}

+ 0 - 199
ccan/charset/test/run.c

@@ -1,199 +0,0 @@
-#include <ccan/charset/charset.h>
-#include <ccan/charset/charset.c>
-#include <ccan/tap/tap.h>
-
-#include <assert.h>
-#include <math.h>
-#include <stdint.h>
-#include <stdio.h>
-
-/*
- * Finds a pseudorandom 32-bit number from 0 to 2^32-1 .
- * Uses the BCPL linear congruential generator method.
- *
- * Used instead of system RNG to ensure tests are consistent.
- */
-static uint32_t rand32(void)
-{
-	static uint32_t rand32_state = 0;
-	rand32_state *= (uint32_t)0x7FF8A3ED;
-	rand32_state += (uint32_t)0x2AA01D31;
-	return rand32_state;
-}
-
-/*
- * Make a Unicode character requiring exactly @len UTF-8 bytes.
- *
- * Unless utf8_allow_surrogates is set,
- * do not return a value in the range U+D800 thru U+DFFF .
- *
- * If @len is not 1 thru 4, generate an out-of-range character.
- */
-static unsigned int utf8_randcode(int len)
-{
-	uint32_t r = rand32();
-	unsigned int ret;
-	
-	switch (len) {
-		case 1: return r % 0x80;
-		case 2: return r % (0x800-0x80) + 0x80;
-		case 3:
-			for (;;) {
-				ret = r % (0x10000-0x800) + 0x800;
-				if ((!utf8_allow_surrogates && ret >= 0xD800 && ret <= 0xDFFF)
-				|| ret >= 0xFFFE)
-				{
-					r = rand32();
-					continue;
-				} else {
-					break;
-				}
-			}
-			return ret;
-		case 4: return r % (0x110000-0x10000) + 0x10000;
-		default:
-			while (r < 0x110000)
-				r = rand32();
-			return r;
-	}
-}
-
-static unsigned int rand_surrogate(void)
-{
-	return rand32() % (0xE000 - 0xD800) + 0xD800;
-}
-
-/* Encode @uc as UTF-8 using exactly @len characters.
-   @len should be 1 thru 4.
-   @uc will be truncated to the bits it will go into.
-   If, after bit truncation, @uc is in the wrong range for its length,
-   an invalid character will be generated. */
-static void utf8_encode_raw(char *out, unsigned int uc, int len)
-{
-	switch (len) {
-		case 1:
-			*out++ = uc & 0x7F;
-			break;
-		case 2:
-			*out++ = 0xC0 | ((uc >> 6) & 0x1F);
-			*out++ = 0x80 | (uc & 0x3F);
-			break;
-		case 3:
-			*out++ = 0xE0 | ((uc >> 12) & 0x0F);
-			*out++ = 0x80 | ((uc >> 6) & 0x3F);
-			*out++ = 0x80 | (uc & 0x3F);
-			break;
-		case 4:
-			*out++ = 0xF0 | ((uc >> 18) & 0x07);
-			*out++ = 0x80 | ((uc >> 12) & 0x3F);
-			*out++ = 0x80 | ((uc >> 6) & 0x3F);
-			*out++ = 0x80 | (uc & 0x3F);
-			break;
-	}
-}
-
-/* Generate a UTF-8 string of the given byte length,
-   randomly deciding if it should be valid or not.
-   
-   Return true if it's valid, false if it's not. */
-static bool utf8_mktest(char *out, int len)
-{
-	int m, n;
-	bool valid = true;
-	bool v;
-	double pf;
-	uint32_t pu;
-	
-	/* Probability that, per character, it should be valid.
-	   The goal is to make utf8_mktest as a whole
-	   have a 50% chance of generating a valid string. */
-	pf = pow(0.5, 2.5/len);
-	
-	/* Convert to uint32_t to test against rand32. */
-	pu = pf * 4294967295.0;
-	
-	for (;len; len -= n) {
-		v = len == 1 || rand32() <= pu;
-		m = len < 4 ? len : 4;
-		
-		if (v) {
-			/* Generate a valid character. */
-			n = rand32() % m + 1;
-			utf8_encode_raw(out, utf8_randcode(n), n);
-		} else {
-			/* Generate an invalid character. */
-			assert(m >= 2);
-			n = rand32() % (m-1) + 2;
-			switch (n) {
-				case 2:
-					utf8_encode_raw(out, utf8_randcode(1), n);
-					break;
-				case 3:
-					if (!utf8_allow_surrogates && (rand32() & 1))
-						utf8_encode_raw(out, rand_surrogate(), n);
-					else
-						utf8_encode_raw(out, utf8_randcode(rand32() % (n-1) + 1), n);
-					break;
-				case 4:
-					utf8_encode_raw(out, utf8_randcode(rand32() % (n-1) + 1), n);
-					break;
-			}
-			valid = false;
-		}
-		out += n;
-	}
-	
-	return valid;
-}
-
-static void test_utf8_validate(bool allow_surrogates)
-{
-	char buffer[1024];
-	int i;
-	int len;
-	bool valid;
-	int passed=0, p_valid=0, p_invalid=0, total=0;
-	int count;
-	
-	count = 10000;
-	
-	utf8_allow_surrogates = allow_surrogates;
-	
-	for (i=0; i<count; i++) {
-		len = rand32() % (1024 + 1);
-		valid = utf8_mktest(buffer, len);
-		if (utf8_validate(buffer, len) == valid) {
-			passed++;
-			if (valid)
-				p_valid++;
-			else
-				p_invalid++;
-		}
-		total++;
-	}
-	
-	if (passed == total) {
-		printf("PASS:  %d valid tests, %d invalid tests\n",
-			p_valid, p_invalid);
-	} else {
-		printf("FAIL:  Passed %d out of %d tests\n", passed, total);
-	}
-	
-	ok(passed, "utf8_validate test passed%s",
-		!allow_surrogates ? " (surrogates disallowed)" : "");
-	
-	ok(p_valid > count/10 && p_invalid > count/10,
-		"   valid/invalid are balanced");
-}
-
-int main(void)
-{
-	/* This is how many tests you plan to run */
-	plan_tests(4);
-	
-	test_utf8_validate(false);
-	test_utf8_validate(true);
-
-	/* This exits depending on whether all tests passed */
-	return exit_status();
-}