15 years ago · 06c4af3163
--- a/ccan/charset/_info
+++ b/ccan/charset/_info
@@ -5,40 +5,151 @@
 
				 /**
			
 
				  * charset - character set conversion and validation routines
			
 
				  *
			
 
				- * This module provides a collection (well, only one, at the moment) of
			
 
				- * well-tested routines for dealing with character set nonsense.
			
 
				- *
			
 
				- * Validation functions:
			
 
				- *  - bool utf8_validate(const char *str, size_t length);
			
 
				+ * This module provides a collection of well-tested routines
			
 
				+ * for dealing with character set nonsense.
			
 
				  *
			
 
				  * Example:
			
 
				  *	#include <err.h>
			
 
				  *	#include <stdio.h>
			
 
				+ *	#include <stdlib.h>
			
 
				  *	#include <string.h>
			
 
				  *	#include <ccan/charset/charset.h>
			
 
				  *	#include <ccan/grab_file/grab_file.h>
			
 
				- *	#include <ccan/talloc/talloc.h>	// For talloc_free()
			
 
				- *
			
 
				- *	int main(int argc, char *argv[])
			
 
				+ *	#include <ccan/talloc/talloc.h>
			
 
				+ *	
			
 
				+ *	static void print_json_string(const char *s);
			
 
				+ *	static bool parse_hex16(const char **sp, unsigned int *out);
			
 
				+ *	
			
 
				+ *	// Take a JSON-encoded string on input and print its literal value.
			
 
				+ *	int main(void)
			
 
				  *	{
			
 
				- *		size_t len;
			
 
				- *		char *file;
			
 
				- *		bool valid;
			
 
				- *
			
 
				- *		if (argc != 2)
			
 
				- *			err(1, "Expected exactly one argument");
			
 
				- *
			
 
				- *		file = grab_file(NULL, argv[1], &len);
			
 
				- *		if (!file)
			
 
				- *			err(1, "Could not read file %s", argv[1]);
			
 
				- *
			
 
				- *		valid = utf8_validate(file, len);
			
 
				- *		printf("File contents are %s UTF-8\n", valid ? "valid" : "invalid");
			
 
				- *
			
 
				- *		talloc_free(file);
			
 
				- *
			
 
				+ *		char *input;
			
 
				+ *		size_t length;
			
 
				+ *	
			
 
				+ *		input = grab_file(NULL, NULL, &length);
			
 
				+ *		if (!input)
			
 
				+ *			err(1, "Error reading input");
			
 
				+ *		if (!utf8_validate(input, length)) {
			
 
				+ *			fprintf(stderr, "Input contains invalid UTF-8\n");
			
 
				+ *			return 1;
			
 
				+ *		}
			
 
				+ *		if (strlen(input) != length) {
			
 
				+ *			fprintf(stderr, "Input contains null characters\n");
			
 
				+ *			return 1;
			
 
				+ *		}
			
 
				+ *		
			
 
				+ *		print_json_string(input);
			
 
				+ *		
			
 
				+ *		talloc_free(input);
			
 
				  *		return 0;
			
 
				  *	}
			
 
				+ *	
			
 
				+ *	static void print_json_string(const char *s)
			
 
				+ *	{
			
 
				+ *		char output_buffer[4];
			
 
				+ *		
			
 
				+ *		// Skip leading whitespace
			
 
				+ *		while (*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r')
			
 
				+ *			s++;
			
 
				+ *		
			
 
				+ *		if (*s++ != '"') {
			
 
				+ *			fprintf(stderr, "Expected JSON string literal surrounded by double quotes.\n");
			
 
				+ *			exit(EXIT_FAILURE);
			
 
				+ *		}
			
 
				+ *		
			
 
				+ *		while (*s != '"') {
			
 
				+ *			unsigned char c = *s++;
			
 
				+ *			char *b = output_buffer;
			
 
				+ *			
			
 
				+ *			if (c == '\\') {
			
 
				+ *				c = *s++;
			
 
				+ *				switch (c) {
			
 
				+ *					case '"':
			
 
				+ *					case '\\':
			
 
				+ *					case '/':
			
 
				+ *						*b++ = c;
			
 
				+ *						break;
			
 
				+ *					case 'b': *b++ = '\b'; break;
			
 
				+ *					case 'f': *b++ = '\f'; break;
			
 
				+ *					case 'n': *b++ = '\n'; break;
			
 
				+ *					case 'r': *b++ = '\r'; break;
			
 
				+ *					case 't': *b++ = '\t'; break;
			
 
				+ *					case 'u': {
			
 
				+ *						unsigned int uc, lc;
			
 
				+ *						
			
 
				+ *						if (!parse_hex16(&s, &uc))
			
 
				+ *							goto syntax_error;
			
 
				+ *						
			
 
				+ *						if (uc >= 0xD800 && uc <= 0xDFFF) {
			
 
				+ *							// Handle UTF-16 surrogate pair (e.g. "\uD834\uDD1E").
			
 
				+ *							uchar_t unicode;
			
 
				+ *							
			
 
				+ *							if (*s++ != '\\' || *s++ != 'u' || !parse_hex16(&s, &lc))
			
 
				+ *								goto syntax_error;
			
 
				+ *							
			
 
				+ *							unicode = from_surrogate_pair(uc, lc);
			
 
				+ *							if (unicode == REPLACEMENT_CHARACTER) {
			
 
				+ *								fprintf(stderr, "Invalid surrogate pair.\n");
			
 
				+ *								exit(EXIT_FAILURE);
			
 
				+ *							}
			
 
				+ *							
			
 
				+ *							b += utf8_write_char(unicode, b);
			
 
				+ *						} else {
			
 
				+ *							// Handle ordinary Unicode escape (e.g. "\u266B").
			
 
				+ *							b += utf8_write_char(uc, b);
			
 
				+ *						}
			
 
				+ *						
			
 
				+ *						break;
			
 
				+ *					}
			
 
				+ *					default:
			
 
				+ *						goto syntax_error;
			
 
				+ *				}
			
 
				+ *			} else if (c <= 0x1F) {
			
 
				+ *				// Control characters are not allowed in string literals.
			
 
				+ *				goto syntax_error;
			
 
				+ *			} else {
			
 
				+ *				*b++ = c;
			
 
				+ *			}
			
 
				+ *			
			
 
				+ *			fwrite(output_buffer, 1, b - output_buffer, stdout);
			
 
				+ *		}
			
 
				+ *		
			
 
				+ *		putchar('\n');
			
 
				+ *		return;
			
 
				+ *		
			
 
				+ *	syntax_error:
			
 
				+ *		fprintf(stderr, "Syntax error in JSON string literal.\n");
			
 
				+ *		exit(EXIT_FAILURE);
			
 
				+ *	}
			
 
				+ *	
			
 
				+ *	static bool parse_hex16(const char **sp, unsigned int *out)
			
 
				+ *	{
			
 
				+ *		const char *s = *sp;
			
 
				+ *		unsigned int ret = 0;
			
 
				+ *		unsigned int i;
			
 
				+ *		unsigned int tmp;
			
 
				+ *		char		c;
			
 
				+ *	
			
 
				+ *		for (i = 0; i < 4; i++)
			
 
				+ *		{
			
 
				+ *			c = *s++;
			
 
				+ *			if (c >= '0' && c <= '9')
			
 
				+ *				tmp = c - '0';
			
 
				+ *			else if (c >= 'A' && c <= 'F')
			
 
				+ *				tmp = c - 'A' + 10;
			
 
				+ *			else if (c >= 'a' && c <= 'f')
			
 
				+ *				tmp = c - 'a' + 10;
			
 
				+ *			else
			
 
				+ *				return false;
			
 
				+ *	
			
 
				+ *			ret <<= 4;
			
 
				+ *			ret += tmp;
			
 
				+ *		}
			
 
				+ *		
			
 
				+ *		*out = ret;
			
 
				+ *		*sp = s;
			
 
				+ *		return true;
			
 
				+ *	}
			
 
				  *
			
 
				  * Author: Joey Adams
			
 
				  * License: MIT
			
--- a/ccan/charset/charset.c
+++ b/ccan/charset/charset.c
@@ -23,8 +23,20 @@
 
				 
			
 
				 #include "charset.h"
			
 
				 
			
 
				-bool utf8_allow_surrogates = false;
			
 
				-
			
 
				+/*
			
 
				+ * This function implements the syntax given in RFC3629, which is
			
 
				+ * the same as that given in The Unicode Standard, Version 6.0.
			
 
				+ *
			
 
				+ * It has the following properties:
			
 
				+ *
			
 
				+ *  * All codepoints U+0000..U+10FFFF may be encoded,
			
 
				+ *    except for U+D800..U+DFFF, which are reserved
			
 
				+ *    for UTF-16 surrogate pair encoding.
			
 
				+ *  * UTF-8 byte sequences longer than 4 bytes are not permitted,
			
 
				+ *    as they exceed the range of Unicode.
			
 
				+ *  * The sixty-six Unicode "non-characters" are permitted
			
 
				+ *    (namely, U+FDD0..U+FDEF, U+xxFFFE, and U+xxFFFF).
			
 
				+ */
			
 
				 bool utf8_validate(const char *str, size_t length)
			
 
				 {
			
 
				 	const unsigned char *s = (const unsigned char*)str;
			
@@ -32,69 +44,145 @@ bool utf8_validate(const char *str, size_t length)
 
				 	
			
 
				 	while (s < e) {
			
 
				 		unsigned char c = *s++;
			
 
				-		unsigned int len; /* number of bytes in sequence - 2 */
			
 
				+		unsigned char c2;
			
 
				+		int len_minus_two;
			
 
				 		
			
 
				-		/* If character is ASCII, move on. */
			
 
				-		if (c < 0x80)
			
 
				+		/* Validate the first byte and determine the sequence length. */
			
 
				+		if (c <= 0x7F)          /* 00..7F */
			
 
				 			continue;
			
 
				+		else if (c <= 0xC1)     /* 80..C1 */
			
 
				+			return false;
			
 
				+		else if (c <= 0xDF)     /* C2..DF */
			
 
				+			len_minus_two = 0;
			
 
				+		else if (c <= 0xEF)     /* E0..EF */
			
 
				+			len_minus_two = 1;
			
 
				+		else if (c <= 0xF4)     /* F0..F4 */
			
 
				+			len_minus_two = 2;
			
 
				+		else
			
 
				+			return false;
			
 
				 		
			
 
				-		if (s >= e)
			
 
				-			return false; /* Missing bytes in sequence. */
			
 
				-		
			
 
				-		if (c < 0xE0) {
			
 
				-			/* 2-byte sequence, U+0080 to U+07FF
			
 
				-			   c must be 11000010 or higher
			
 
				-			   s[0] must be 10xxxxxx */
			
 
				-			len = 0;
			
 
				-			if (c < 0xC2)
			
 
				-				return false;
			
 
				-		} else if (c < 0xF0) {
			
 
				-			/* 3-byte sequence, U+0800 to U+FFFF
			
 
				-			   Note that the surrogate range is U+D800 to U+DFFF,
			
 
				-				  and that U+FFFE and U+FFFF are illegal characters.
			
 
				-			   c must be >= 11100000 (which it is)
			
 
				-			   If c is 11100000, then s[0] must be >= 10100000
			
 
				-			   If the global parameter utf8_allow_surrogates is false:
			
 
				-			      If c is 11101101 and s[0] is >= 10100000,
			
 
				-			         then this is a surrogate and we should fail.
			
 
				-			   If c is 11101111, s[0] is 10111111, and s[1] >= 10111110,
			
 
				-				  then this is an illegal character and we should fail.
			
 
				-			   s[0] and s[1] must be 10xxxxxx */
			
 
				-			len = 1;
			
 
				-			if (c == 0xE0 && *s < 0xA0)
			
 
				-				return false;
			
 
				-			if (!utf8_allow_surrogates && c == 0xED && *s >= 0xA0)
			
 
				-				return false;
			
 
				-			if (c == 0xEF && s[0] == 0xBF && (s+1 >= e || s[1] >= 0xBE))
			
 
				-				return false;
			
 
				-		} else {
			
 
				-			/* 4-byte sequence, U+010000 to U+10FFFF
			
 
				-			   c must be >= 11110000 (which it is) and <= 11110100
			
 
				-			   If c is 11110000, then s[0] must be >= 10010000
			
 
				-			   If c is 11110100, then s[0] must be < 10010000
			
 
				-			   s[0], s[1], and s[2] must be 10xxxxxx */
			
 
				-			len = 2;
			
 
				-			if (c > 0xF4)
			
 
				-				return false;
			
 
				-			if (c == 0xF0 && *s < 0x90)
			
 
				-				return false;
			
 
				-			if (c == 0xF4 && *s >= 0x90)
			
 
				-				return false;
			
 
				-		}
			
 
				+		/* Make sure the character isn't clipped. */
			
 
				+		if (s + len_minus_two >= e)
			
 
				+			return false;
			
 
				 		
			
 
				-		if (s + len >= e)
			
 
				-			return false; /* Missing bytes in sequence. */
			
 
				+		c2 = *s;
			
 
				 		
			
 
				+		/* Make sure subsequent bytes are in the range 0x80..0xBF. */
			
 
				 		do {
			
 
				 			if ((*s++ & 0xC0) != 0x80)
			
 
				 				return false;
			
 
				-		} while (len--);
			
 
				+		} while (len_minus_two--);
			
 
				+		
			
 
				+		/* Handle special cases. */
			
 
				+		switch (c) {
			
 
				+			case 0xE0:
			
 
				+				/* Disallow overlong 3-byte sequence. */
			
 
				+				if (c2 < 0xA0)
			
 
				+					return false;
			
 
				+				break;
			
 
				+			case 0xED:
			
 
				+				/* Disallow U+D800..U+DFFF. */
			
 
				+				if (c2 > 0x9F)
			
 
				+					return false;
			
 
				+				break;
			
 
				+			case 0xF0:
			
 
				+				/* Disallow overlong 4-byte sequence. */
			
 
				+				if (c2 < 0x90)
			
 
				+					return false;
			
 
				+				break;
			
 
				+			case 0xF4:
			
 
				+				/* Disallow codepoints beyond U+10FFFF. */
			
 
				+				if (c2 > 0x8F)
			
 
				+					return false;
			
 
				+				break;
			
 
				+		}
			
 
				 	}
			
 
				 	
			
 
				 	return true;
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				-  Note to future contributors: These routines are currently all under the
			
 
				-    MIT license.  It would be nice to keep it that way :)
			
 
				-*/
			
 
				+int utf8_read_char(const char *s, uchar_t *out)
			
 
				+{
			
 
				+	const unsigned char *c = (const unsigned char*) s;
			
 
				+
			
 
				+	if (c[0] <= 0x7F) {
			
 
				+		/* 00..7F */
			
 
				+		*out = c[0];
			
 
				+		return 1;
			
 
				+	} else if (c[0] <= 0xDF) {
			
 
				+		/* C2..DF (unless input is invalid) */
			
 
				+		*out = ((uchar_t)c[0] & 0x1F) << 6 |
			
 
				+		       ((uchar_t)c[1] & 0x3F);
			
 
				+		return 2;
			
 
				+	} else if (c[0] <= 0xEF) {
			
 
				+		/* E0..EF */
			
 
				+		*out = ((uchar_t)c[0] &  0xF) << 12 |
			
 
				+		       ((uchar_t)c[1] & 0x3F) << 6  |
			
 
				+		       ((uchar_t)c[2] & 0x3F);
			
 
				+		return 3;
			
 
				+	} else {
			
 
				+		/* F0..F4 (unless input is invalid) */
			
 
				+		*out = ((uchar_t)c[0] &  0x7) << 18 |
			
 
				+		       ((uchar_t)c[1] & 0x3F) << 12 |
			
 
				+		       ((uchar_t)c[2] & 0x3F) << 6  |
			
 
				+		       ((uchar_t)c[3] & 0x3F);
			
 
				+		return 4;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+int utf8_write_char(uchar_t unicode, char *out)
			
 
				+{
			
 
				+	unsigned char *o = (unsigned char*) out;
			
 
				+
			
 
				+	if (unicode <= 0x7F) {
			
 
				+		/* U+0000..U+007F */
			
 
				+		*o++ = unicode;
			
 
				+		return 1;
			
 
				+	} else if (unicode <= 0x7FF) {
			
 
				+		/* U+0080..U+07FF */
			
 
				+		*o++ = 0xC0 | unicode >> 6;
			
 
				+		*o++ = 0x80 | (unicode & 0x3F);
			
 
				+		return 2;
			
 
				+	} else if (unicode <= 0xFFFF) {
			
 
				+		/* U+0800..U+FFFF */
			
 
				+		if (unicode >= 0xD800 && unicode <= 0xDFFF)
			
 
				+			unicode = REPLACEMENT_CHARACTER;
			
 
				+	three_byte_character:
			
 
				+		*o++ = 0xE0 | unicode >> 12;
			
 
				+		*o++ = 0x80 | (unicode >> 6 & 0x3F);
			
 
				+		*o++ = 0x80 | (unicode & 0x3F);
			
 
				+		return 3;
			
 
				+	} else if (unicode <= 0x10FFFF) {
			
 
				+		/* U+10000..U+10FFFF */
			
 
				+		*o++ = 0xF0 | unicode >> 18;
			
 
				+		*o++ = 0x80 | (unicode >> 12 & 0x3F);
			
 
				+		*o++ = 0x80 | (unicode >> 6 & 0x3F);
			
 
				+		*o++ = 0x80 | (unicode & 0x3F);
			
 
				+		return 4;
			
 
				+	} else {
			
 
				+		/* U+110000... */
			
 
				+		unicode = REPLACEMENT_CHARACTER;
			
 
				+		goto three_byte_character;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+uchar_t from_surrogate_pair(unsigned int uc, unsigned int lc)
			
 
				+{
			
 
				+	if (uc >= 0xD800 && uc <= 0xDBFF && lc >= 0xDC00 && lc <= 0xDFFF)
			
 
				+		return 0x10000 + ((((uchar_t)uc & 0x3FF) << 10) | (lc & 0x3FF));
			
 
				+	else
			
 
				+		return REPLACEMENT_CHARACTER;
			
 
				+}
			
 
				+
			
 
				+bool to_surrogate_pair(uchar_t unicode, unsigned int *uc, unsigned int *lc)
			
 
				+{
			
 
				+	if (unicode >= 0x10000 && unicode <= 0x10FFFF) {
			
 
				+		uchar_t n = unicode - 0x10000;
			
 
				+		*uc = ((n >> 10) & 0x3FF) | 0xD800;
			
 
				+		*lc = (n & 0x3FF) | 0xDC00;
			
 
				+		return true;
			
 
				+	} else {
			
 
				+		*uc = *lc = REPLACEMENT_CHARACTER;
			
 
				+		return false;
			
 
				+	}
			
 
				+}
			
--- a/ccan/charset/charset.h
+++ b/ccan/charset/charset.h
@@ -26,19 +26,57 @@
 
				 
			
 
				 #include <stdbool.h>
			
 
				 #include <stddef.h>
			
 
				+#include <stdint.h>
			
 
				+
			
 
				+#define REPLACEMENT_CHARACTER 0xFFFD
			
 
				 
			
 
				 /*
			
 
				- * Validate the given UTF-8 string.  If it contains '\0' characters,
			
 
				- * it is still valid.
			
 
				- *
			
 
				- * By default, Unicode characters U+D800 thru U+DFFF will be considered
			
 
				- * invalid UTF-8.  However, if you set utf8_allow_surrogates to true,
			
 
				- * they will be allowed.  Allowing the surrogate range makes it possible
			
 
				- * to losslessly encode malformed UTF-16.
			
 
				+ * Type for Unicode codepoints.
			
 
				+ * We need our own because wchar_t might be 16 bits.
			
 
				+ */
			
 
				+typedef uint32_t uchar_t;
			
 
				+
			
 
				+/*
			
 
				+ * Validate the given UTF-8 string.
			
 
				+ * If it contains '\0' characters, it is still valid.
			
 
				  */
			
 
				 bool utf8_validate(const char *str, size_t length);
			
 
				 
			
 
				-/* Default: false */
			
 
				-extern bool utf8_allow_surrogates;
			
 
				+/*
			
 
				+ * Read a single UTF-8 character starting at @s,
			
 
				+ * returning the length, in bytes, of the character read.
			
 
				+ *
			
 
				+ * This function assumes input is valid UTF-8,
			
 
				+ * and that the complete character is readable starting at @s.
			
 
				+ */
			
 
				+int utf8_read_char(const char *s, uchar_t *out);
			
 
				+
			
 
				+/*
			
 
				+ * Write a single UTF-8 character to @out,
			
 
				+ * returning the length, in bytes, of the character written.
			
 
				+ *
			
 
				+ * @unicode should be U+0000..U+10FFFF, but not U+D800..U+DFFF.
			
 
				+ * If @unicode is invalid, REPLACEMENT_CHARACTER will be emitted instead.
			
 
				+ *
			
 
				+ * This function will write up to 4 bytes to @out.
			
 
				+ */
			
 
				+int utf8_write_char(uchar_t unicode, char *out);
			
 
				+
			
 
				+/*
			
 
				+ * Compute the Unicode codepoint of a UTF-16 surrogate pair.
			
 
				+ *
			
 
				+ * @uc should be 0xD800..0xDBFF, and @lc should be 0xDC00..0xDFFF.
			
 
				+ * If they aren't, this function returns REPLACEMENT_CHARACTER.
			
 
				+ */
			
 
				+uchar_t from_surrogate_pair(unsigned int uc, unsigned int lc);
			
 
				+
			
 
				+/*
			
 
				+ * Construct a UTF-16 surrogate pair given a Unicode codepoint.
			
 
				+ *
			
 
				+ * @unicode should be U+10000..U+10FFFF.
			
 
				+ * If it's not, this function returns false,
			
 
				+ * and sets *uc and *lc to REPLACEMENT_CHARACTER.
			
 
				+ */
			
 
				+bool to_surrogate_pair(uchar_t unicode, unsigned int *uc, unsigned int *lc);
			
 
				 
			
 
				 #endif
			
--- a/ccan/charset/test/common.h
+++ b/ccan/charset/test/common.h
@@ -0,0 +1,27 @@
 
				+#include <stdint.h>
			
 
				+#include <stdlib.h>
			
 
				+
			
 
				/*
				 * Return the next value of a deterministic pseudorandom 32-bit sequence;
				 * results span 0 to 2^32-1.
				 *
				 * The internal state advances with the BCPL linear congruential
				 * generator.  Returning that state directly would expose its weak
				 * low-order bits: the multiplier and increment are both odd, so the
				 * lowest state bit simply alternates and rand32() % 2 would be fully
				 * predictable.  To avoid this, the high half of the state is XORed into
				 * the low half before the value is returned.
				 *
				 * Used instead of the system RNG so that test runs are reproducible.
				 */
				static uint32_t rand32(void)
				{
				#if 0
					/*
					 * Alternative source, disabled by default.  Tests should be run
					 * with a different random function from time to time as a
					 * cross-check on the generator below.
					 */
					assert(RAND_MAX == 2147483647);
					return ((random() & 0xFFFF) << 16) | (random() & 0xFFFF);
				#else
					static uint32_t rand32_state = 0;

					rand32_state *= (uint32_t)0x7FF8A3ED;
					rand32_state += (uint32_t)0x2AA01D31;

					/* Fold the well-mixed high bits into the short-period low bits. */
					return rand32_state ^ (rand32_state >> 16);
				#endif
				}
			
--- a/ccan/charset/test/run-surrogate-pair.c
+++ b/ccan/charset/test/run-surrogate-pair.c
@@ -0,0 +1,135 @@
 
				+#include <ccan/charset/charset.c>
			
 
				+#include <ccan/tap/tap.h>
			
 
				+
			
 
				+#include <string.h>
			
 
				+
			
 
				+#include "common.h"
			
 
				+
			
 
				+/*
			
 
				+ * Testing procedure for from_surrogate_pair and to_surrogate_pair:
			
 
				+ *
			
 
				+ *  * For each Unicode code point from 0x10000 to 0x10FFFF:
			
 
				+ *    - Call to_surrogate_pair, and make sure that:
			
 
				+ *      - It returns true.
			
 
				+ *      - uc is 0xD800..0xDBFF
			
 
				+ *      - lc is 0xDC00..0xDFFF
			
 
				+ *    - Call from_surrogate_pair on the pair, and make sure that
			
 
				+ *      it returns the original character.
			
 
				+ *  * For various invalid arguments to to_surrogate_pair
			
 
				+ *    (U+0000..U+FFFF and U+110000...):
			
 
				+ *    - Call to_surrogate_pair, and make sure it:
			
 
				+ *      - Returns false.
			
 
				+ *      - Sets *uc and *lc to REPLACEMENT_CHARACTER.
			
 
				+ *  * For various invalid arguments to from_surrogate_pair
			
 
				+ *    (uc: not 0xD800..0xDBFF, lc: not 0xDC00..0xDFFF):
			
 
				+ *    - Call from_surrogate_pair, and make sure
			
 
				+ *      it returns REPLACEMENT_CHARACTER.
			
 
				+ */
			
 
				+
			
 
				+#define INVALID_TRIAL_COUNT     10000
			
 
				+
			
 
				+#define range(r, lo, hi)  ((r) % ((hi)-(lo)+1) + (lo))
			
 
				+
			
 
				+static void test_valid(void)
			
 
				+{
			
 
				+	uchar_t unicode;
			
 
				+	unsigned int uc, lc;
			
 
				+	
			
 
				+	for (unicode = 0x10000; unicode <= 0x10FFFF; unicode++) {
			
 
				+		if (to_surrogate_pair(unicode, &uc, &lc) != true) {
			
 
				+			fail("to_surrogate_pair did not return true on valid input.");
			
 
				+			return;
			
 
				+		}
			
 
				+		if (!(uc >= 0xD800 && uc <= 0xDBFF)) {
			
 
				+			fail("to_surrogate_pair: uc is out of range");
			
 
				+			return;
			
 
				+		}
			
 
				+		if (!(lc >= 0xDC00 && lc <= 0xDFFF)) {
			
 
				+			fail("to_surrogate_pair: lc is out of range");
			
 
				+			return;
			
 
				+		}
			
 
				+		if (from_surrogate_pair(uc, lc) != unicode) {
			
 
				+			fail("Surrogate pair conversion did not preserve original value (U+%04lX).", (unsigned long)unicode);
			
 
				+			return;
			
 
				+		}
			
 
				+	}
			
 
				+	
			
 
				+	pass("to_surrogate_pair and from_surrogate_pair work for all valid arguments.");
			
 
				+}
			
 
				+
			
 
				+static void test_invalid_to_surrogate_pair(void)
			
 
				+{
			
 
				+	long i;
			
 
				+	uchar_t unicode;
			
 
				+	unsigned int uc, lc;
			
 
				+	
			
 
				+	for (i = 1; i <= INVALID_TRIAL_COUNT; i++) {
			
 
				+		if (rand32() % 2) {
			
 
				+			unicode = range(rand32(), 0x0, 0xFFFF);
			
 
				+		} else {
			
 
				+			do {
			
 
				+				unicode = rand32();
			
 
				+			} while (unicode < 0x110000);
			
 
				+		}
			
 
				+		
			
 
				+		if (to_surrogate_pair(unicode, &uc, &lc) != false) {
			
 
				+			fail("to_surrogate_pair did not return false on invalid input.");
			
 
				+			return;
			
 
				+		}
			
 
				+		if (uc != REPLACEMENT_CHARACTER || lc != REPLACEMENT_CHARACTER) {
			
 
				+			fail("to_surrogate_pair did not set uc and lc to the replacement character on invalid input.");
			
 
				+			return;
			
 
				+		}
			
 
				+	}
			
 
				+	
			
 
				+	pass("to_surrogate_pair seems to handle invalid argument values properly.");
			
 
				+}
			
 
				+
			
 
				+static void test_invalid_from_surrogate_pair(void)
			
 
				+{
			
 
				+	long i;
			
 
				+	unsigned int uc, lc;
			
 
				+	
			
 
				+	for (i = 1; i <= INVALID_TRIAL_COUNT; i++) {
			
 
				+		switch (rand32() % 3) {
			
 
				+			case 0:
			
 
				+				uc = range(rand32(), 0x0, 0xD7FF);
			
 
				+				break;
			
 
				+			case 1:
			
 
				+				uc = range(rand32(), 0xDC00, 0xDFFF);
			
 
				+				break;
			
 
				+			default:
			
 
				+				uc = range(rand32(), 0xE000, 0xFFFF);
			
 
				+				break;
			
 
				+		}
			
 
				+		switch (rand32() % 3) {
			
 
				+			case 0:
			
 
				+				lc = range(rand32(), 0x0, 0xD7FF);
			
 
				+				break;
			
 
				+			case 1:
			
 
				+				lc = range(rand32(), 0xD800, 0xDBFF);
			
 
				+				break;
			
 
				+			default:
			
 
				+				lc = range(rand32(), 0xE000, 0xFFFF);
			
 
				+				break;
			
 
				+		}
			
 
				+		
			
 
				+		if (from_surrogate_pair(uc, lc) != REPLACEMENT_CHARACTER) {
			
 
				+			fail("from_surrogate_pair(0x%04X, 0x%04X) did not return the replacement character", uc, lc);
			
 
				+			return;
			
 
				+		}
			
 
				+	}
			
 
				+	
			
 
				+	pass("from_surrogate_pair seems to handle invalid arguments properly.");
			
 
				+}
			
 
				+
			
 
				+int main(void)
			
 
				+{
			
 
				+	plan_tests(3);
			
 
				+	
			
 
				+	test_valid();
			
 
				+	test_invalid_to_surrogate_pair();
			
 
				+	test_invalid_from_surrogate_pair();
			
 
				+	
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/charset/test/run-utf8-read-write.c
+++ b/ccan/charset/test/run-utf8-read-write.c
@@ -0,0 +1,150 @@
 
				+#include <ccan/charset/charset.c>
			
 
				+#include <ccan/tap/tap.h>
			
 
				+
			
 
				+#include <string.h>
			
 
				+
			
 
				+#include "common.h"
			
 
				+
			
 
				+/*
			
 
				+ * Testing procedure for utf8_read_char and utf8_write_char:
			
 
				+ *
			
 
				+ *  * Generate N valid and invalid Unicode code points.
			
 
				+ *  * Encode them with utf8_write_char.
			
 
				+ *  * Copy the resulting string into a buffer sized exactly as big as
			
 
				+ *    the string produced.  This way, Valgrind can catch buffer overflows
			
 
				+ *    by utf8_validate and utf8_read_char.
			
 
				+ *  * Validate the string with utf8_validate.
			
 
				+ *  * Decode the string, ensuring that:
			
 
				+ *    - Valid codepoints are read back.
			
 
				+ *    - Invalid characters are read back, but replaced
			
 
				+ *      with REPLACEMENT_CHARACTER.
			
 
				+ *    - No extra characters are read back.
			
 
				+ */
			
 
				+
			
 
				+#define TRIAL_COUNT             1000
			
 
				+#define MAX_CHARS_PER_TRIAL     100
			
 
				+
			
 
				+#define range(r, lo, hi)  ((r) % ((hi)-(lo)+1) + (lo))
			
 
				+
			
 
				+int main(void)
			
 
				+{
			
 
				+	int trial;
			
 
				+	
			
 
				+	plan_tests(TRIAL_COUNT);
			
 
				+	
			
 
				+	for (trial = 1; trial <= TRIAL_COUNT; trial++) {
			
 
				+		int i, count;
			
 
				+		uchar_t codepoints[MAX_CHARS_PER_TRIAL];
			
 
				+		uchar_t c;
			
 
				+		bool c_valid;
			
 
				+		
			
 
				+		char write_buffer[MAX_CHARS_PER_TRIAL * 4];
			
 
				+		char *o = write_buffer;
			
 
				+		char *oe = write_buffer + sizeof(write_buffer);
			
 
				+		
			
 
				+		char *string;
			
 
				+		const char *s;
			
 
				+		const char *e;
			
 
				+		
			
 
				+		int len;
			
 
				+		
			
 
				+		count = rand32() % MAX_CHARS_PER_TRIAL + 1;
			
 
				+		
			
 
				+		for (i = 0; i < count; i++) {
			
 
				+			if (o >= oe) {
			
 
				+				fail("utf8_write_char: Buffer overflow (1)");
			
 
				+				goto next_trial;
			
 
				+			}
			
 
				+			
			
 
				+			switch (rand32() % 7) {
			
 
				+				case 0:
			
 
				+					c = range(rand32(), 0x0, 0x7F);
			
 
				+					c_valid = true;
			
 
				+					break;
			
 
				+				case 1:
			
 
				+					c = range(rand32(), 0x80, 0x7FF);
			
 
				+					c_valid = true;
			
 
				+					break;
			
 
				+				case 2:
			
 
				+					c = range(rand32(), 0x800, 0xD7FF);
			
 
				+					c_valid = true;
			
 
				+					break;
			
 
				+				case 3:
			
 
				+					c = range(rand32(), 0xD800, 0xDFFF);
			
 
				+					c_valid = false;
			
 
				+					break;
			
 
				+				case 4:
			
 
				+					c = range(rand32(), 0xE000, 0xFFFF);
			
 
				+					c_valid = true;
			
 
				+					break;
			
 
				+				case 5:
			
 
				+					c = range(rand32(), 0x10000, 0x10FFFF);
			
 
				+					c_valid = true;
			
 
				+					break;
			
 
				+				default:
			
 
				+					do {
			
 
				+						c = rand32();
			
 
				+					} while (c < 0x110000);
			
 
				+					c_valid = false;
			
 
				+					break;
			
 
				+			}
			
 
				+			
			
 
				+			codepoints[i] = c_valid ? c : REPLACEMENT_CHARACTER;
			
 
				+			
			
 
				+			len = utf8_write_char(c, o);
			
 
				+			if (len < 1 || len > 4) {
			
 
				+				fail("utf8_write_char: Return value is not 1 thru 4.");
			
 
				+				goto next_trial;
			
 
				+			}
			
 
				+			o += len;
			
 
				+		}
			
 
				+		if (o > oe) {
			
 
				+			fail("utf8_write_char: Buffer overflow (2)");
			
 
				+			goto next_trial;
			
 
				+		}
			
 
				+		
			
 
				+		string = malloc(o - write_buffer);
			
 
				+		memcpy(string, write_buffer, o - write_buffer);
			
 
				+		s = string;
			
 
				+		e = string + (o - write_buffer);
			
 
				+		
			
 
				+		if (!utf8_validate(s, e - s)) {
			
 
				+			fail("Invalid string produced by utf8_write_char.");
			
 
				+			goto next_trial_free_string;
			
 
				+		}
			
 
				+		
			
 
				+		for (i = 0; i < count; i++) {
			
 
				+			if (s >= e) {
			
 
				+				fail("utf8_read_char: Buffer overflow (1)");
			
 
				+				goto next_trial_free_string;
			
 
				+			}
			
 
				+			
			
 
				+			len = utf8_read_char(s, &c);
			
 
				+			if (len < 1 || len > 4) {
			
 
				+				fail("utf8_read_char: Return value is not 1 thru 4.");
			
 
				+				goto next_trial_free_string;
			
 
				+			}
			
 
				+			if (c != codepoints[i]) {
			
 
				+				fail("utf8_read_char: Character read differs from that written.");
			
 
				+				goto next_trial_free_string;
			
 
				+			}
			
 
				+			s += len;
			
 
				+		}
			
 
				+		if (s > e) {
			
 
				+			fail("utf8_read_char: Buffer overflow (2)");
			
 
				+			goto next_trial_free_string;
			
 
				+		}
			
 
				+		if (s < e) {
			
 
				+			fail("utf8_read_char: Did not reach end of string.");
			
 
				+			goto next_trial_free_string;
			
 
				+		}
			
 
				+		
			
 
				+		pass("Trial %d: %d characters", trial, count);
			
 
				+		
			
 
				+	next_trial_free_string:
			
 
				+		free(string);
			
 
				+	next_trial:;
			
 
				+	}
			
 
				+	
			
 
				+	return exit_status();
			
 
				+}
			
--- a/ccan/charset/test/run-utf8_validate.c
+++ b/ccan/charset/test/run-utf8_validate.c
@@ -0,0 +1,256 @@
 
				+#include <ccan/charset/charset.c>
			
 
				+#include <ccan/tap/tap.h>
			
 
				+
			
 
				+#include <assert.h>
			
 
				+#include <math.h>
			
 
				+#include <stdio.h>
			
 
				+#include <string.h>
			
 
				+
			
 
				+#include "common.h"
			
 
				+
			
 
				+/* Make a valid or invalid Unicode character fitting in exactly @len UTF-8 bytes. */
			
 
				+static uchar_t utf8_randcode(int len, bool valid, bool after_clipped)
			
 
				+{
			
 
				+	uint32_t r = rand32();
			
 
				+	uchar_t ret;
			
 
				+	
			
 
				+	#define range(lo, hi)  ((r & 0x7FFFFFFF) % ((hi)-(lo)+1) + (lo))
			
 
				+	#define high_bit_set() (!!(r & 0x80000000))
			
 
				+	
			
 
				+	switch (len) {
			
 
				+		case 1:
			
 
				+			if (valid) {
			
 
				+				/* Generate a character U+0000..U+007F */
			
 
				+				return r & 0x7F;
			
 
				+			} else {
			
 
				+				/*
			
 
				+				 * Generate a character U+0080..U+00BF or U+00F8..U+00FF.
			
 
				+				 *
			
 
				+				 * However, don't generate U+0080..U+00BF (10xxxxxx) after a
			
 
				+				 * clipped character, as that can inadvertently form a valid,
			
 
				+				 * complete character.
			
 
				+				 */
			
 
				+				if (!after_clipped && high_bit_set())
			
 
				+					return range(0x80, 0xBF);
			
 
				+				else
			
 
				+					return range(0xF8, 0xFF);
			
 
				+			}
			
 
				+		case 2:
			
 
				+			if (valid) {
			
 
				+				/* Generate a character U+0080..U+07FF */
			
 
				+				return range(0x80, 0x7FF);
			
 
				+			} else {
			
 
				+				/* Generate a character U+0000..U+007F */
			
 
				+				return r & 0x7F;
			
 
				+			}
			
 
				+		case 3:
			
 
				+			if (valid) {
			
 
				+				/* Generate a character U+0800..U+FFFF, but not U+D800..U+DFFF */
			
 
				+				for (;;) {
			
 
				+					ret = range(0x800, 0xFFFF);
			
 
				+					if (ret >= 0xD800 && ret <= 0xDFFF) {
			
 
				+						r = rand32();
			
 
				+						continue;
			
 
				+					} else {
			
 
				+						break;
			
 
				+					}
			
 
				+				}
			
 
				+				return ret;
			
 
				+			} else {
			
 
				+				/* Generate a character U+0000..U+07FF or U+D800..U+DFFF */
			
 
				+				if (high_bit_set())
			
 
				+					return r & 0x7FF;
			
 
				+				else
			
 
				+					return 0xD800 + (r & 0x7FF);
			
 
				+			}
			
 
				+		case 4:
			
 
				+			if (valid) {
			
 
				+				/* Generate a character U+10000..U+10FFFF */
			
 
				+				return range(0x10000, 0x10FFFF);
			
 
				+			} else {
			
 
				+				/* Generate a character U+0000..0xFFFF or U+110000..U+1FFFFF */
			
 
				+				if (high_bit_set())
			
 
				+					return r & 0xFFFF;
			
 
				+				else
			
 
				+					return range(0x110000, 0x1FFFFF);
			
 
				+			}
			
 
				+		default:
			
 
				+			assert(false);
			
 
				+	}
			
 
				+	
			
 
				+	#undef range
			
 
				+	#undef high_bit_set
			
 
				+}
			
 
				+
			
 
				/*
				 * Encode @uc into @out as a UTF-8 style sequence of exactly @len bytes.
				 *
				 * The bit pattern is produced mechanically, so overlong forms, surrogates
				 * and values past U+10FFFF can be emitted on purpose.  The assertions only
				 * check that @uc fits the payload bits available for @len (for @len == 1,
				 * that it is at most 0xC1 or lies within 0xF8..0xFF).
				 *
				 * @len should be 1 thru 4; nothing is written for any other value.
				 * @out is not NUL-terminated.
				 */
				static void utf8_encode_raw(char *out, unsigned int uc, int len)
				{
					if (len == 1) {
						assert(uc <= 0xC1 || (uc >= 0xF8 && uc <= 0xFF));
						out[0] = uc;
					} else if (len == 2) {
						assert(uc <= 0x7FF);
						out[0] = 0xC0 | ((uc >> 6) & 0x1F);
						out[1] = 0x80 | (uc & 0x3F);
					} else if (len == 3) {
						assert(uc <= 0xFFFF);
						out[0] = 0xE0 | ((uc >> 12) & 0x0F);
						out[1] = 0x80 | ((uc >> 6) & 0x3F);
						out[2] = 0x80 | (uc & 0x3F);
					} else if (len == 4) {
						assert(uc <= 0x1FFFFF);
						out[0] = 0xF0 | ((uc >> 18) & 0x07);
						out[1] = 0x80 | ((uc >> 12) & 0x3F);
						out[2] = 0x80 | ((uc >> 6) & 0x3F);
						out[3] = 0x80 | (uc & 0x3F);
					}
				}
			
 
				+
			
 
				#if COMPUTE_AVERAGE_LENGTH
				/*
				 * Running sum, over utf8_mktest() calls, of each call's mean number of
				 * bytes per generated character.  Only built when COMPUTE_AVERAGE_LENGTH
				 * is nonzero.
				 */
				double total_averages;
				#endif
			
 
				+
			
 
				/*
				 * Fill @out with exactly @len bytes of random test data, made of
				 * well-formed characters, deliberately malformed characters, and
				 * characters cut short ("clipped").
				 *
				 * Returns true if every piece generated was a well-formed character,
				 * false if at least one malformed or clipped piece was emitted.
				 * Nothing is written, and true is returned, when @len is not positive.
				 */
				static bool utf8_mktest(char *out, int len)
				{
					bool all_valid = true;
					bool after_clipped = false;
					double p_valid;
					uint32_t threshold;
					int n;

					#if COMPUTE_AVERAGE_LENGTH
					int n_total = 0;
					int count = 0;
					#endif

					/*
					 * Per-character probability of generating a valid character,
					 * chosen so that the string as a whole is valid about half of
					 * the time.  It solves
					 *
					 *     p^n = 0.5
					 *
					 * for p, where n is the number of characters in the string.
					 * 2.384 is the approximate average character length in bytes,
					 * so n is estimated as len/2.384.
					 */
					p_valid = pow(0.5, 2.384/len);

					/* Scale to the uint32_t range so it can be compared against rand32(). */
					threshold = p_valid * 4294967295.0;

					while (len > 0) {
						const bool want_valid = rand32() <= threshold;

						if (!want_valid && rand32() % 5 == 0) {
							/*
							 * One in five of the non-valid pieces: a valid 2- to
							 * 4-byte character with at least its last byte
							 * dropped.
							 */
							char tmp[4];

							n = rand32() % 3 + 2;
							utf8_encode_raw(tmp, utf8_randcode(n, true, after_clipped), n);
							n -= rand32() % (n-1) + 1;
							if (n > len)
								n = len;
							assert(n >= 1 && n <= 3);
							memcpy(out, tmp, n);
							after_clipped = true;
						} else {
							/*
							 * A complete character, valid or invalid as decided
							 * above, no longer than the space left.
							 */
							n = rand32() % (len < 4 ? len : 4) + 1;
							utf8_encode_raw(out, utf8_randcode(n, want_valid, after_clipped), n);
							after_clipped = false;
						}

						all_valid = all_valid && want_valid;

						#if COMPUTE_AVERAGE_LENGTH
						n_total += n;
						count++;
						#endif

						len -= n;
						out += n;
					}

					#if COMPUTE_AVERAGE_LENGTH
					if (count > 0)
						total_averages += (double)n_total / count;
					#endif

					return all_valid;
				}
			
 
				+
			
 
				/*
				 * Run utf8_validate() against 100000 strings built by utf8_mktest(), each
				 * with a random length of 0 to sizeof(buffer) bytes, and report two TAP
				 * results:
				 *   1. utf8_validate() agreed with the generator on every string;
				 *   2. more than a tenth of the trials passed as valid and more than a
				 *      tenth passed as invalid.
				 *
				 * Each disagreement is also printed to stdout as it occurs.
				 */
				static void test_utf8_validate(void)
				{
					char buffer[128];
					int i;
					int len;
					bool valid;
					bool uvalid;
					int passed=0, p_valid=0, p_invalid=0, total=0;
					int count;

					count = 100000;

					#if COMPUTE_AVERAGE_LENGTH
					total_averages = 0.0;
					#endif

					for (i=0; i<count; i++) {
						len = rand32() % (sizeof(buffer) + 1);
						valid = utf8_mktest(buffer, len);

						/* Validate once and reuse the verdict for both the check and the report. */
						uvalid = utf8_validate(buffer, len);
						if (uvalid == valid) {
							passed++;
							if (valid)
								p_valid++;
							else
								p_invalid++;
						} else {
							printf("Failed: generated %s string, but utf8_validate returned %s\n",
							       valid ? "valid" : "invalid",
							       uvalid ? "true" : "false");
						}
						total++;
					}

					if (passed == total)
						pass("%d valid tests, %d invalid tests", p_valid, p_invalid);
					else
						/* No trailing newline: the description must stay on the result line. */
						fail("Passed only %d out of %d tests", passed, total);

					ok(p_valid > count/10 && p_invalid > count/10,
					   "Valid and invalid should be balanced");

					#if COMPUTE_AVERAGE_LENGTH
					printf("Average character length: %f\n", total_averages / count);
					#endif
				}
			
 
				+
			
 
				/* Test entry point: announce the plan, run the suite, report via the exit code. */
				int main(void)
				{
					/* test_utf8_validate() reports two results. */
					plan_tests(2);

					test_utf8_validate();

					/* The exit status reflects whether all tests passed. */
					return exit_status();
				}
			
--- a/ccan/charset/test/run.c
+++ b/ccan/charset/test/run.c
@@ -1,199 +0,0 @@
 
				-#include <ccan/charset/charset.h>
			
 
				-#include <ccan/charset/charset.c>
			
 
				-#include <ccan/tap/tap.h>
			
 
				-
			
 
				-#include <assert.h>
			
 
				-#include <math.h>
			
 
				-#include <stdint.h>
			
 
				-#include <stdio.h>
			
 
				-
			
 
				/*
				 * Return the next pseudorandom 32-bit number, from 0 to 2^32-1, using the
				 * BCPL linear congruential generator method.
				 *
				 * The state starts at 0 and is never reseeded, so every run yields the
				 * same sequence; this is used instead of the system RNG to keep the tests
				 * consistent.
				 */
				static uint32_t rand32(void)
				{
					static uint32_t state = 0;

					state = state * (uint32_t)0x7FF8A3ED + (uint32_t)0x2AA01D31;
					return state;
				}
			
 
				-
			
 
				-/*
			
 
				- * Make a Unicode character requiring exactly @len UTF-8 bytes.
			
 
				- *
			
 
				- * Unless utf8_allow_surrogates is set,
			
 
				- * do not return a value in the range U+D800 thru U+DFFF .
			
 
				- *
			
 
				- * If @len is not 1 thru 4, generate an out-of-range character.
			
 
				- */
			
 
				-static unsigned int utf8_randcode(int len)
			
 
				-{
			
 
				-	uint32_t r = rand32();
			
 
				-	unsigned int ret;
			
 
				-	
			
 
				-	switch (len) {
			
 
				-		case 1: return r % 0x80;
			
 
				-		case 2: return r % (0x800-0x80) + 0x80;
			
 
				-		case 3:
			
 
				-			for (;;) {
			
 
				-				ret = r % (0x10000-0x800) + 0x800;
			
 
				-				if ((!utf8_allow_surrogates && ret >= 0xD800 && ret <= 0xDFFF)
			
 
				-				|| ret >= 0xFFFE)
			
 
				-				{
			
 
				-					r = rand32();
			
 
				-					continue;
			
 
				-				} else {
			
 
				-					break;
			
 
				-				}
			
 
				-			}
			
 
				-			return ret;
			
 
				-		case 4: return r % (0x110000-0x10000) + 0x10000;
			
 
				-		default:
			
 
				-			while (r < 0x110000)
			
 
				-				r = rand32();
			
 
				-			return r;
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				/* Return a random surrogate code point, U+D800 thru U+DFFF. */
				static unsigned int rand_surrogate(void)
				{
					const unsigned int first = 0xD800;
					const unsigned int span = 0xE000 - 0xD800;

					return first + rand32() % span;
				}
			
 
				-
			
 
				/*
				 * Encode @uc as UTF-8 using exactly @len bytes.
				 *
				 * @len should be 1 thru 4; nothing is written for any other value.
				 * @uc is truncated to the bits that fit in a @len-byte sequence.
				 * If, after truncation, @uc is in the wrong range for its length,
				 * the bytes written form an invalid character.
				 */
				static void utf8_encode_raw(char *out, unsigned int uc, int len)
				{
					/* Indexed by @len: marker bits and payload mask of the first byte. */
					static const unsigned char lead_bits[] = { 0, 0x00, 0xC0, 0xE0, 0xF0 };
					static const unsigned char lead_mask[] = { 0, 0x7F, 0x1F, 0x0F, 0x07 };
					int i;

					if (len < 1 || len > 4)
						return;

					/* Continuation bytes take six payload bits each, last byte first. */
					for (i = len - 1; i > 0; i--) {
						out[i] = 0x80 | (uc & 0x3F);
						uc >>= 6;
					}
					out[0] = lead_bits[len] | (uc & lead_mask[len]);
				}
			
 
				-
			
 
				-/* Generate a UTF-8 string of the given byte length,
			
 
				-   randomly deciding if it should be valid or not.
			
 
				-   
			
 
				-   Return true if it's valid, false if it's not. */
			
 
				-static bool utf8_mktest(char *out, int len)
			
 
				-{
			
 
				-	int m, n;
			
 
				-	bool valid = true;
			
 
				-	bool v;
			
 
				-	double pf;
			
 
				-	uint32_t pu;
			
 
				-	
			
 
				-	/* Probability that, per character, it should be valid.
			
 
				-	   The goal is to make utf8_mktest as a whole
			
 
				-	   have a 50% chance of generating a valid string. */
			
 
				-	pf = pow(0.5, 2.5/len);
			
 
				-	
			
 
				-	/* Convert to uint32_t to test against rand32. */
			
 
				-	pu = pf * 4294967295.0;
			
 
				-	
			
 
				-	for (;len; len -= n) {
			
 
				-		v = len == 1 || rand32() <= pu;
			
 
				-		m = len < 4 ? len : 4;
			
 
				-		
			
 
				-		if (v) {
			
 
				-			/* Generate a valid character. */
			
 
				-			n = rand32() % m + 1;
			
 
				-			utf8_encode_raw(out, utf8_randcode(n), n);
			
 
				-		} else {
			
 
				-			/* Generate an invalid character. */
			
 
				-			assert(m >= 2);
			
 
				-			n = rand32() % (m-1) + 2;
			
 
				-			switch (n) {
			
 
				-				case 2:
			
 
				-					utf8_encode_raw(out, utf8_randcode(1), n);
			
 
				-					break;
			
 
				-				case 3:
			
 
				-					if (!utf8_allow_surrogates && (rand32() & 1))
			
 
				-						utf8_encode_raw(out, rand_surrogate(), n);
			
 
				-					else
			
 
				-						utf8_encode_raw(out, utf8_randcode(rand32() % (n-1) + 1), n);
			
 
				-					break;
			
 
				-				case 4:
			
 
				-					utf8_encode_raw(out, utf8_randcode(rand32() % (n-1) + 1), n);
			
 
				-					break;
			
 
				-			}
			
 
				-			valid = false;
			
 
				-		}
			
 
				-		out += n;
			
 
				-	}
			
 
				-	
			
 
				-	return valid;
			
 
				-}
			
 
				-
			
 
				-static void test_utf8_validate(bool allow_surrogates)
			
 
				-{
			
 
				-	char buffer[1024];
			
 
				-	int i;
			
 
				-	int len;
			
 
				-	bool valid;
			
 
				-	int passed=0, p_valid=0, p_invalid=0, total=0;
			
 
				-	int count;
			
 
				-	
			
 
				-	count = 10000;
			
 
				-	
			
 
				-	utf8_allow_surrogates = allow_surrogates;
			
 
				-	
			
 
				-	for (i=0; i<count; i++) {
			
 
				-		len = rand32() % (1024 + 1);
			
 
				-		valid = utf8_mktest(buffer, len);
			
 
				-		if (utf8_validate(buffer, len) == valid) {
			
 
				-			passed++;
			
 
				-			if (valid)
			
 
				-				p_valid++;
			
 
				-			else
			
 
				-				p_invalid++;
			
 
				-		}
			
 
				-		total++;
			
 
				-	}
			
 
				-	
			
 
				-	if (passed == total) {
			
 
				-		printf("PASS:  %d valid tests, %d invalid tests\n",
			
 
				-			p_valid, p_invalid);
			
 
				-	} else {
			
 
				-		printf("FAIL:  Passed %d out of %d tests\n", passed, total);
			
 
				-	}
			
 
				-	
			
 
				-	ok(passed, "utf8_validate test passed%s",
			
 
				-		!allow_surrogates ? " (surrogates disallowed)" : "");
			
 
				-	
			
 
				-	ok(p_valid > count/10 && p_invalid > count/10,
			
 
				-		"   valid/invalid are balanced");
			
 
				-}
			
 
				-
			
 
				-int main(void)
			
 
				-{
			
 
				-	/* This is how many tests you plan to run */
			
 
				-	plan_tests(4);
			
 
				-	
			
 
				-	test_utf8_validate(false);
			
 
				-	test_utf8_validate(true);
			
 
				-
			
 
				-	/* This exits depending on whether all tests passed */
			
 
				-	return exit_status();
			
 
				-}