16 years ago · c8c69dc687
--- a/ccan/charset/_info
+++ b/ccan/charset/_info
@@ -0,0 +1,63 @@
 
															+#include <stdio.h>
														
 
															+#include <string.h>
														
 
															+#include "config.h"
														
 
															+
														
 
															+/**
														
 
															+ * charset - character set conversion and validation routines
														
 
															+ *
														
 
															+ * This module provides a collection (well, only one, at the moment) of
														
 
															+ * well-tested routines for dealing with character set nonsense.
														
 
															+ *
														
 
															+ * Validation functions:
														
 
															+ *  - bool utf8_validate(const char *str, size_t length);
														
 
															+ *
														
 
															+ * Example:
														
 
															+ *	#include <err.h>
														
 
															+ *	#include <stdio.h>
														
 
															+ *	#include <string.h>
														
 
															+ *	#include <ccan/charset/charset.h>
														
 
															+ *	#include <ccan/grab_file/grab_file.h>
														
 
															+ *	#include <ccan/talloc/talloc.h>	// For talloc_free()
														
 
															+ *
														
 
															+ *	int main(int argc, char *argv[])
														
 
															+ *	{
														
 
															+ *		size_t len;
														
 
															+ *		char *file;
														
 
															+ *		bool valid;
														
 
															+ *
														
 
															+ *		if (argc != 2)
														
 
															+ *			errx(1, "Expected exactly one argument");
														
 
															+ *
														
 
															+ *		file = grab_file(NULL, argv[1], &len);
														
 
															+ *		if (!file)
														
 
															+ *			err(1, "Could not read file %s", argv[1]);
														
 
															+ *
														
 
															+ *		valid = utf8_validate(file, len);
														
 
															+ *		printf("File contents are %s UTF-8\n", valid ? "valid" : "invalid");
														
 
															+ *
														
 
															+ *		talloc_free(file);
														
 
															+ *
														
 
															+ *		return 0;
														
 
															+ *	}
														
 
															+ *
														
 
															+ * Author: Joey Adams
														
 
															+ * Licence: MIT
														
 
															+ */
														
 
															+int main(int argc, char *argv[])
															+{
															+	const char *query;
															+
															+	/* Exactly one query word is required; anything else exits 1. */
															+	if (argc != 2)
															+		return 1;
															+	query = argv[1];
															+
															+	/* "depends": nothing is printed. */
															+	if (!strcmp(query, "depends"))
															+		return 0;
															+
															+	/* "libs": libm is needed for the pow() invocation in run.c */
															+	if (!strcmp(query, "libs")) {
															+		printf("m\n");
															+		return 0;
															+	}
															+
															+	/* Any other query word is reported as a failure. */
															+	return 1;
															+}
														
--- a/ccan/charset/charset.c
+++ b/ccan/charset/charset.c
@@ -0,0 +1,95 @@
 
															+/*
														
 
															+  Copyright (C) 2010 Joseph A. Adams (joeyadams3.14159@gmail.com)
														
 
															+  All rights reserved.
														
 
															+
														
 
															+  Permission is hereby granted, free of charge, to any person obtaining a copy
														
 
															+  of this software and associated documentation files (the "Software"), to deal
														
 
															+  in the Software without restriction, including without limitation the rights
														
 
															+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
														
 
															+  copies of the Software, and to permit persons to whom the Software is
														
 
															+  furnished to do so, subject to the following conditions:
														
 
															+
														
 
															+  The above copyright notice and this permission notice shall be included in
														
 
															+  all copies or substantial portions of the Software.
														
 
															+
														
 
															+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
														
 
															+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
														
 
															+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
														
 
															+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
														
 
															+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
														
 
															+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
														
 
															+  THE SOFTWARE.
														
 
															+*/
														
 
															+
														
 
															+#include "charset.h"
														
 
															+
														
 
															+bool utf8_allow_surrogates = false;
															+
															+/*
															+ * Return true if the @length bytes starting at @str are well-formed UTF-8.
															+ *
															+ * Rejected: stray continuation bytes, truncated sequences, overlong
															+ * encodings (lead bytes 0xC0/0xC1, 0xE0 followed by < 0xA0, 0xF0 followed
															+ * by < 0x90), lead bytes above 0xF4, 0xF4 followed by >= 0x90 (beyond
															+ * U+10FFFF) and - unless utf8_allow_surrogates is true - 0xED followed
															+ * by >= 0xA0 (the surrogate range U+D800 to U+DFFF).
															+ *
															+ * '\0' bytes are treated like any other ASCII byte; only @length decides
															+ * where the input ends.
															+ */
															+bool utf8_validate(const char *str, size_t length)
															+{
															+	const unsigned char *s = (const unsigned char*)str;
															+	const unsigned char *e;
															+	
															+	/* Nothing to check; also avoids arithmetic on a possibly null @str. */
															+	if (length == 0)
															+		return true;
															+	
															+	e = s + length;
															+	
															+	while (s < e) {
															+		unsigned char c = *s++;
															+		unsigned int len; /* number of bytes in sequence - 2 */
															+		
															+		/* If character is ASCII, move on. */
															+		if (c < 0x80)
															+			continue;
															+		
															+		/* The range checks below read *s, so one more byte must exist. */
															+		if (s >= e)
															+			return false; /* Missing bytes in sequence. */
															+		
															+		if (c < 0xE0) {
															+			/* 2-byte sequence, U+0080 to U+07FF
															+			   c must be 11000010 or higher
															+			   (this also rejects a stray continuation byte 10xxxxxx)
															+			   s[0] must be 10xxxxxx */
															+			len = 0;
															+			if (c < 0xC2)
															+				return false;
															+		} else if (c < 0xF0) {
															+			/* 3-byte sequence, U+0800 to U+FFFF
															+			   Note that the surrogate range is U+D800 to U+DFFF
															+			   c must be >= 11100000 (which it is)
															+			   If c is 11100000, then s[0] must be >= 10100000
															+			   If the global parameter utf8_allow_surrogates is false:
															+			      If c is 11101101 and s[0] is >= 10100000,
															+			         then this is a surrogate and we should fail.
															+			   s[0] and s[1] must be 10xxxxxx */
															+			len = 1;
															+			if (c == 0xE0 && *s < 0xA0)
															+				return false;
															+			if (!utf8_allow_surrogates && c == 0xED && *s >= 0xA0)
															+				return false;
															+		} else {
															+			/* 4-byte sequence, U+010000 to U+10FFFF
															+			   c must be >= 11110000 (which it is) and <= 11110100
															+			   If c is 11110000, then s[0] must be >= 10010000
															+			   If c is 11110100, then s[0] must be < 10010000
															+			   s[0], s[1], and s[2] must be 10xxxxxx */
															+			len = 2;
															+			if (c > 0xF4)
															+				return false;
															+			if (c == 0xF0 && *s < 0x90)
															+				return false;
															+			if (c == 0xF4 && *s >= 0x90)
															+				return false;
															+		}
															+		
															+		/* len + 1 continuation bytes are required.  Compare byte counts
															+		   instead of forming s + len, which could point more than one
															+		   element past the end of the buffer when the sequence is
															+		   truncated. */
															+		if ((size_t)(e - s) <= len)
															+			return false; /* Missing bytes in sequence. */
															+		
															+		/* Runs len + 1 times: every remaining byte must be 10xxxxxx. */
															+		do {
															+			if ((*s++ & 0xC0) != 0x80)
															+				return false;
															+		} while (len--);
															+	}
															+	
															+	return true;
															+}
														
 
															+
														
 
															+/*
														
 
															+  Note to future contributors: These routines are currently all under the
														
 
															+    MIT license.  It would be nice to keep it that way :)
														
 
															+*/
														
--- a/ccan/charset/charset.h
+++ b/ccan/charset/charset.h
@@ -0,0 +1,44 @@
 
															+/*
														
 
															+  Copyright (C) 2010 Joseph A. Adams (joeyadams3.14159@gmail.com)
														
 
															+  All rights reserved.
														
 
															+
														
 
															+  Permission is hereby granted, free of charge, to any person obtaining a copy
														
 
															+  of this software and associated documentation files (the "Software"), to deal
														
 
															+  in the Software without restriction, including without limitation the rights
														
 
															+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
														
 
															+  copies of the Software, and to permit persons to whom the Software is
														
 
															+  furnished to do so, subject to the following conditions:
														
 
															+
														
 
															+  The above copyright notice and this permission notice shall be included in
														
 
															+  all copies or substantial portions of the Software.
														
 
															+
														
 
															+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
														
 
															+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
														
 
															+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
														
 
															+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
														
 
															+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
														
 
															+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
														
 
															+  THE SOFTWARE.
														
 
															+*/
														
 
															+
														
 
															+#ifndef CCAN_CHARSET_H
														
 
															+#define CCAN_CHARSET_H
														
 
															+
														
 
															+#include <stdbool.h>
														
 
															+#include <stddef.h>
														
 
															+
														
 
															+/*
															+ * Validate the given UTF-8 string.  If it contains '\0' characters,
															+ * it is still valid.
															+ *
															+ * @str points to the bytes to check and @length is the number of bytes
															+ * to examine; the end of the input is decided by @length alone, so the
															+ * buffer does not need to be '\0'-terminated.
															+ *
															+ * Returns true if the whole buffer is valid UTF-8, false otherwise.
															+ *
															+ * By default, Unicode characters U+D800 thru U+DFFF will be considered
															+ * invalid UTF-8.  However, if you set utf8_allow_surrogates to true,
															+ * they will be allowed.  Allowing the surrogate range makes it possible
															+ * to losslessly encode malformed UTF-16.
															+ */
															+bool utf8_validate(const char *str, size_t length);
															+
															+/* Default: false
															+   This is a program-wide global read by utf8_validate(), not a per-call
															+   parameter, so changing it affects every later call. */
															+extern bool utf8_allow_surrogates;
														
 
															+
														
 
															+#endif
														
--- a/ccan/charset/test/run.c
+++ b/ccan/charset/test/run.c
@@ -0,0 +1,198 @@
 
															+#include <ccan/charset/charset.h>
														
 
															+#include <ccan/charset/charset.c>
														
 
															+#include <ccan/tap/tap.h>
														
 
															+
														
 
															+#include <assert.h>
														
 
															+#include <math.h>
														
 
															+#include <stdint.h>
														
 
															+#include <stdio.h>
														
 
															+
														
 
															+/*
															+ * Deterministic source of pseudorandom 32-bit values (0 to 2^32-1).
															+ *
															+ * Each call advances a linear congruential generator using the BCPL
															+ * multiplier and increment, starting from state 0, and returns the new
															+ * state.  The system RNG is avoided so every run of the tests sees the
															+ * same sequence.
															+ */
															+static uint32_t rand32(void)
															+{
															+	static uint32_t state = 0;
															+
															+	state = state * (uint32_t)0x7FF8A3ED + (uint32_t)0x2AA01D31;
															+	return state;
															+}
														
 
															+
														
 
															+/*
															+ * Pick a pseudorandom code point whose UTF-8 encoding needs exactly
															+ * @len bytes:
															+ *   1: U+0000 to U+007F
															+ *   2: U+0080 to U+07FF
															+ *   3: U+0800 to U+FFFF (re-drawn while it lands in U+D800 thru U+DFFF,
															+ *      unless utf8_allow_surrogates is set)
															+ *   4: U+10000 to U+10FFFF
															+ *
															+ * Any other @len yields a value of at least 0x110000, i.e. outside
															+ * the Unicode range.
															+ */
															+static unsigned int utf8_randcode(int len)
															+{
															+	/* One draw is always consumed up front, whatever @len is. */
															+	uint32_t r = rand32();
															+	unsigned int code;
															+
															+	if (len == 1)
															+		return r % 0x80;
															+
															+	if (len == 2)
															+		return r % (0x800-0x80) + 0x80;
															+
															+	if (len == 3) {
															+		code = r % (0x10000-0x800) + 0x800;
															+		/* Draw again for as long as a forbidden surrogate comes up. */
															+		while (!utf8_allow_surrogates && code >= 0xD800 && code <= 0xDFFF) {
															+			r = rand32();
															+			code = r % (0x10000-0x800) + 0x800;
															+		}
															+		return code;
															+	}
															+
															+	if (len == 4)
															+		return r % (0x110000-0x10000) + 0x10000;
															+
															+	/* Out-of-range request: keep drawing until past U+10FFFF. */
															+	while (r < 0x110000)
															+		r = rand32();
															+	return r;
															+}
														
 
															+
														
 
															+/* Pick a pseudorandom code point in the surrogate range
															+   U+D800 thru U+DFFF, using one rand32() draw. */
															+static unsigned int rand_surrogate(void)
															+{
															+	const unsigned int first = 0xD800;
															+	const unsigned int past_last = 0xE000;
															+
															+	return first + rand32() % (past_last - first);
															+}
														
 
															+
														
 
															+/* Write @uc to @out as a UTF-8 style sequence of exactly @len bytes.
															+   @len should be 1 thru 4; for any other value nothing is written.
															+   Only the low bits of @uc that fit into a @len-byte sequence are used
															+   (7, 11, 16 or 21 bits); higher bits are dropped.
															+   No range check is made, so a @uc that does not belong in a @len-byte
															+   sequence (e.g. an ASCII value with @len 2) yields invalid UTF-8.
															+   No terminator is appended. */
															+static void utf8_encode_raw(char *out, unsigned int uc, int len)
															+{
															+	/* Lead-byte marker and payload mask, indexed by sequence length. */
															+	static const unsigned char lead_tag[5]  = { 0, 0x00, 0xC0, 0xE0, 0xF0 };
															+	static const unsigned char lead_mask[5] = { 0, 0x7F, 0x1F, 0x0F, 0x07 };
															+	int i;
															+
															+	if (len < 1 || len > 4)
															+		return;
															+
															+	/* Continuation bytes carry 6 bits each; fill them from the tail. */
															+	for (i = len - 1; i > 0; i--) {
															+		out[i] = 0x80 | (uc & 0x3F);
															+		uc >>= 6;
															+	}
															+
															+	/* Whatever is left goes into the lead byte, truncated to fit. */
															+	out[0] = lead_tag[len] | (uc & lead_mask[len]);
															+}
														
 
															+
														
 
															+/* Generate a UTF-8 string of the given byte length,
															+   randomly deciding if it should be valid or not.
															+   
															+   Exactly @len bytes are written to @out (no terminator is added),
															+   one character at a time.  An invalid character is produced either
															+   by encoding a code point in more bytes than it needs (overlong), or,
															+   for 3-byte characters when utf8_allow_surrogates is false, by
															+   encoding a surrogate.
															+   
															+   The output depends on the order of the rand32() calls below.
															+   
															+   NOTE(review): with @len == 0 the exponent below is 2.5/0; the loop
															+   never runs and true is returned, but this assumes floating-point
															+   division by zero does not trap - confirm for the target platform.
															+   
															+   Return true if it's valid, false if it's not. */
															+static bool utf8_mktest(char *out, int len)
															+{
															+	int m, n; /* m: most bytes the next character may use; n: bytes it does use */
															+	bool valid = true;
															+	bool v; /* should the next character be valid? */
															+	double pf;
															+	uint32_t pu;
															+	
															+	/* Probability that, per character, it should be valid.
															+	   The goal is to make utf8_mktest as a whole
															+	   have a 50% chance of generating a valid string.
															+	   pf raised to the power len/2.5 is 0.5; presumably 2.5 stands for
															+	   the average number of bytes per generated character. */
															+	pf = pow(0.5, 2.5/len);
															+	
															+	/* Convert to uint32_t to test against rand32. */
															+	pu = pf * 4294967295.0;
															+	
															+	for (;len; len -= n) {
															+		v = len == 1 || rand32() <= pu; /* one byte left: must be valid, and no rand32() draw is made */
															+		m = len < 4 ? len : 4;
															+		
															+		if (v) {
															+			/* Generate a valid character. */
															+			n = rand32() % m + 1;
															+			utf8_encode_raw(out, utf8_randcode(n), n);
															+		} else {
															+			/* Generate an invalid character.
															+			   It takes 2 to m bytes, hence the need for m >= 2. */
															+			assert(m >= 2);
															+			n = rand32() % (m-1) + 2;
															+			switch (n) {
															+				case 2:
															+					/* An ASCII code point in 2 bytes is overlong. */
															+					utf8_encode_raw(out, utf8_randcode(1), n);
															+					break;
															+				case 3:
															+					/* Either a surrogate (only while those are invalid)
															+					   or a 1- or 2-byte code point, which is overlong. */
															+					if (!utf8_allow_surrogates && (rand32() & 1))
															+						utf8_encode_raw(out, rand_surrogate(), n);
															+					else
															+						utf8_encode_raw(out, utf8_randcode(rand32() % (n-1) + 1), n);
															+					break;
															+				case 4:
															+					/* A 1-, 2- or 3-byte code point, which is overlong. */
															+					utf8_encode_raw(out, utf8_randcode(rand32() % (n-1) + 1), n);
															+					break;
															+			}
															+			valid = false;
															+		}
															+		out += n;
															+	}
															+	
															+	return valid;
															+}
														
 
															+
														
 
															+/*
															+ * Run 10000 generated cases through utf8_validate() with
															+ * utf8_allow_surrogates set to @allow_surrogates, then report two TAP
															+ * results:
															+ *   1. utf8_validate() agreed with the generator on every case;
															+ *   2. more than a tenth of the cases were agreeing valid inputs and
															+ *      more than a tenth were agreeing invalid inputs.
															+ *
															+ * A PASS/FAIL summary line is also printed.  utf8_allow_surrogates is
															+ * left at @allow_surrogates on return.
															+ */
															+static void test_utf8_validate(bool allow_surrogates)
															+{
															+	char buffer[1024];
															+	int i;
															+	int len;
															+	bool valid;
															+	int passed=0, p_valid=0, p_invalid=0, total=0;
															+	int count;
															+	
															+	count = 10000;
															+	
															+	/* Read by both utf8_validate() and the test-case generator. */
															+	utf8_allow_surrogates = allow_surrogates;
															+	
															+	for (i=0; i<count; i++) {
															+		/* 0 to 1024 bytes: the longest case exactly fills buffer. */
															+		len = rand32() % (1024 + 1);
															+		valid = utf8_mktest(buffer, len);
															+		if (utf8_validate(buffer, len) == valid) {
															+			passed++;
															+			if (valid)
															+				p_valid++;
															+			else
															+				p_invalid++;
															+		}
															+		total++;
															+	}
															+	
															+	if (passed == total) {
															+		printf("PASS:  %d valid tests, %d invalid tests\n",
															+			p_valid, p_invalid);
															+	} else {
															+		printf("FAIL:  Passed %d out of %d tests\n", passed, total);
															+	}
															+	
															+	/* Every case has to agree.  Testing a bare `passed` would report
															+	   success as soon as a single case agreed. */
															+	ok(passed == total, "utf8_validate test passed%s",
															+		!allow_surrogates ? " (surrogates disallowed)" : "");
															+	
															+	ok(p_valid > count/10 && p_invalid > count/10,
															+		"   valid/invalid are balanced");
															+}
														
 
															+
														
 
															+int main(void)
															+{
															+	/* Surrogates disallowed first, then allowed.  The order matters
															+	   because both runs draw from the same rand32() sequence. */
															+	static const bool surrogate_modes[] = { false, true };
															+	unsigned int i;
															+
															+	/* This is how many tests you plan to run:
															+	   two ok() results per test_utf8_validate() call. */
															+	plan_tests(4);
															+
															+	for (i = 0; i < sizeof(surrogate_modes) / sizeof(surrogate_modes[0]); i++)
															+		test_utf8_validate(surrogate_modes[i]);
															+
															+	/* This exits depending on whether all tests passed */
															+	return exit_status();
															+}