Browse Source

strset: new module using critbit trees.

Not as fast as using htable, but simple and provides order and prefix
ops.
Rusty Russell 14 years ago
parent
commit
ab83de9537

+ 71 - 0
ccan/strset/_info

@@ -0,0 +1,71 @@
+#include <string.h>
+#include "config.h"
+
+/**
+ * strset - an ordered set of strings
+ *
+ * This code implements an ordered set of string as a critbit tree. See:
+ *
+ *  http://cr.yp.to/critbit.html
+ *  http://github.com/agl/critbit (which this code is based on)
+ *
+ * Note that ccan/htable is faster and uses less memory, but doesn't provide
+ * ordered or prefix operations.
+ *
+ * Example:
+ *	// Print all words in order.
+ *	#include <ccan/strset/strset.h>
+ *	#include <ccan/grab_file/grab_file.h>
+ *	#include <err.h>
+ *	#include <string.h>
+ *
+ *	static bool dump(const char *member, void *unused)
+ *	{
+ *		printf("%s ", member);
+ *		return false;
+ *	}
+ *
+ *	int main(void)
+ *	{
+ *		struct strset words;
+ *		char *file, *word;
+ *
+ *		strset_init(&words);
+ *		file = grab_fd(NULL, 0, NULL);
+ *		if (!file)
+ *			err(1, "Reading stdin");
+ *
+ *		for (word = strtok(file, " \t\r\n");
+ *		     word;
+ *		     word = strtok(NULL, " \t\r\n")) {
+ *			strset_set(&words, word);
+ *		}
+ *		strset_iterate(&words, dump, NULL);
+ *		printf("\n");
+ *		return 0;
+ *	}
+ *	// Given "foo bar" outputs "bar foo "
+ *	// Given "foo foo bar" outputs "bar foo "
+ *
+ * License: Public domain (but some dependencies are LGPL!)
+ * Author: Rusty Russell <rusty@rustcorp.com.au>
+ * Ccanlint:
+ *	license_depends_compat FAIL
+ */
+int main(int argc, char *argv[])
+{
+	/* Expect exactly one argument */
+	if (argc != 2)
+		return 1;
+
+	if (strcmp(argv[1], "depends") == 0) {
+		printf("ccan/ilog\n"
+		       "ccan/likely\n"
+		       "ccan/short_types\n"
+		       "ccan/str\n"
+		       "ccan/typesafe_cb\n");
+		return 0;
+	}
+
+	return 1;
+}

+ 298 - 0
ccan/strset/strset.c

@@ -0,0 +1,298 @@
+/* This code is based on the public domain code at
+ * http://github.com/agl/critbit writtem by Adam Langley
+ * <agl@imperialviolet.org>.
+ *
+ * Here are the main implementation differences:
+ * (1) We don't strdup the string on insert; we use the pointer we're given.
+ * (2) We use a straight bit number rather than a mask; it's simpler.
+ * (3) We don't use the bottom bit of the pointer, but instead use a leading
+ *     zero to distinguish nodes from strings.
+ * (4) The empty string (which would look like a node) is handled
+ *     using a special "empty node".
+ * (5) Delete returns the string, so you can free it if you want to.
+ * (6) Unions instead of void *, bool instead of int.
+ */
+#include <ccan/strset/strset.h>
+#include <ccan/short_types/short_types.h>
+#include <ccan/likely/likely.h>
+#include <ccan/str/str.h>
+#include <ccan/ilog/ilog.h>
+#include <assert.h>
+#include <stdlib.h>
+
+struct node {
+	/* To differentiate us from strings. */
+	char nul_byte;
+	/* The bit where these children differ. */
+	u8 bit_num;
+	/* The byte number where first bit differs (-1 == empty string node). */
+	size_t byte_num;
+	/* These point to strings or nodes. */
+	struct strset child[2];
+};
+
+/* Closest member to this in a non-empty set. */
+static const char *closest(struct strset n, const char *member)
+{
+	size_t len = strlen(member);
+	const u8 *bytes = (const u8 *)member;
+
+	/* Anything with first byte 0 is a node. */
+	while (!n.u.s[0]) {
+		u8 direction = 0;
+
+		/* Special node which represents the empty string. */
+		if (unlikely(n.u.n->byte_num == (size_t)-1)) {
+			n = n.u.n->child[0];
+			break;
+		}
+
+		if (n.u.n->byte_num < len) {
+			u8 c = bytes[n.u.n->byte_num];
+			direction = (c >> n.u.n->bit_num) & 1;
+		}
+		n = n.u.n->child[direction];
+	}
+	return n.u.s;
+}
+
+char *strset_test(const struct strset *set, const char *member)
+{
+	const char *str;
+
+	/* Empty set? */
+	if (!set->u.n)
+		return NULL;
+	str = closest(*set, member);
+	if (streq(member, str))
+		return (char *)str;
+	return NULL;
+}
+
+static bool set_string(struct strset *set,
+		       struct strset *n, const char *member)
+{
+	/* Substitute magic empty node if this is the empty string */
+	if (unlikely(!member[0])) {
+		n->u.n = malloc(sizeof(*n->u.n));
+		if (unlikely(!n->u.n))
+			return false;
+		n->u.n->nul_byte = '\0';
+		n->u.n->byte_num = (size_t)-1;
+		/* Attach the string to child[0] */
+		n = &n->u.n->child[0];
+	}
+	n->u.s = member;
+	return true;
+}
+
+bool strset_set(struct strset *set, const char *member)
+{
+	size_t len = strlen(member);
+	const u8 *bytes = (const u8 *)member;
+	struct strset *np;
+	const char *str;
+	struct node *newn;
+	size_t byte_num;
+	u8 bit_num, new_dir;
+
+	/* Empty set? */
+	if (!set->u.n) {
+		return set_string(set, set, member);
+	}
+
+	/* Find closest existing member. */
+	str = closest(*set, member);
+
+	/* Find where they differ. */
+	for (byte_num = 0; str[byte_num] == member[byte_num]; byte_num++) {
+		if (member[byte_num] == '\0') {
+			/* All identical! */
+			return false;
+		}
+	}
+
+	/* Find which bit differs (if we had ilog8, we'd use it) */
+	bit_num = ilog32_nz((u8)str[byte_num] ^ bytes[byte_num]) - 1;
+	assert(bit_num < CHAR_BIT);
+
+	/* Which direction do we go at this bit? */
+	new_dir = ((bytes[byte_num]) >> bit_num) & 1;
+
+	/* Allocate new node. */
+	newn = malloc(sizeof(*newn));
+	if (!newn) {
+		/* FIXME */
+		return false;
+	}
+	newn->nul_byte = '\0';
+	newn->byte_num = byte_num;
+	newn->bit_num = bit_num;
+	if (unlikely(!set_string(set, &newn->child[new_dir], member))) {
+		free(newn);
+		return false;
+	}
+
+	/* Find where to insert: not closest, but first which differs! */
+	np = set;
+	while (!np->u.s[0]) {
+		u8 direction = 0;
+
+		/* Special node which represents the empty string will
+		 * break here too! */
+		if (np->u.n->byte_num > byte_num)
+			break;
+		/* Subtle: bit numbers are "backwards" for comparison */
+		if (np->u.n->byte_num == byte_num && np->u.n->bit_num < bit_num)
+			break;
+
+		if (np->u.n->byte_num < len) {
+			u8 c = bytes[np->u.n->byte_num];
+			direction = (c >> np->u.n->bit_num) & 1;
+		}
+		np = &np->u.n->child[direction];
+	}
+
+	newn->child[!new_dir]= *np;
+	np->u.n = newn;
+	return true;
+}
+
+char *strset_clear(struct strset *set, const char *member)
+{
+	size_t len = strlen(member);
+	const u8 *bytes = (const u8 *)member;
+	struct strset *parent = NULL, *n;
+	const char *ret = NULL;
+	u8 direction = 0; /* prevent bogus gcc warning. */
+
+	/* Empty set? */
+	if (!set->u.n)
+		return NULL;
+
+	/* Find closest, but keep track of parent. */
+	n = set;
+	/* Anything with first byte 0 is a node. */
+	while (!n->u.s[0]) {
+		u8 c = 0;
+
+		/* Special node which represents the empty string. */
+		if (unlikely(n->u.n->byte_num == (size_t)-1)) {
+			const char *empty_str = n->u.n->child[0].u.s;
+
+			if (member[0])
+				return NULL;
+
+			/* Sew empty string back so remaining logic works */
+			free(n->u.n);
+			n->u.s = empty_str;
+			break;
+		}
+
+		parent = n;
+		if (n->u.n->byte_num < len) {
+			c = bytes[n->u.n->byte_num];
+			direction = (c >> n->u.n->bit_num) & 1;
+		} else
+			direction = 0;
+		n = &n->u.n->child[direction];
+	}
+
+	/* Did we find it? */
+	if (!streq(member, n->u.s))
+		return NULL;
+
+	ret = n->u.s;
+
+	if (!parent) {
+		/* We deleted last node. */
+		set->u.n = NULL;
+	} else {
+		struct node *old = parent->u.n;
+		/* Raise other node to parent. */
+		*parent = old->child[!direction];
+		free(old);
+	}
+
+	return (char *)ret;
+}
+
+static bool iterate(struct strset n,
+		    bool (*handle)(const char *, void *), void *data)
+{
+	if (n.u.s[0])
+		return handle(n.u.s, data);
+	if (unlikely(n.u.n->byte_num == (size_t)-1))
+		return handle(n.u.n->child[0].u.s, data);
+
+	return iterate(n.u.n->child[0], handle, data)
+		|| iterate(n.u.n->child[1], handle, data);
+}
+
+void strset_iterate_(const struct strset *set,
+		     bool (*handle)(const char *, void *), void *data)
+{
+	/* Empty set? */
+	if (!set->u.n)
+		return;
+
+	iterate(*set, handle, data);
+}
+
+const struct strset *strset_prefix(const struct strset *set, const char *prefix)
+{
+	const struct strset *n, *top;
+	size_t len = strlen(prefix);
+	const u8 *bytes = (const u8 *)prefix;
+
+	/* Empty set -> return empty set. */
+	if (!set->u.n)
+		return set;
+
+	top = n = set;
+
+	/* We walk to find the top, but keep going to check prefix matches. */
+	while (!n->u.s[0]) {
+		u8 c = 0, direction;
+
+		/* Special node which represents the empty string. */
+		if (unlikely(n->u.n->byte_num == (size_t)-1)) {
+			n = &n->u.n->child[0];
+			break;
+		}
+
+		if (n->u.n->byte_num < len)
+			c = bytes[n->u.n->byte_num];
+
+		direction = (c >> n->u.n->bit_num) & 1;
+		n = &n->u.n->child[direction];
+		if (c)
+			top = n;
+	}
+
+	if (!strstarts(n->u.s, prefix)) {
+		/* Convenient return for prefixes which do not appear in set. */
+		static const struct strset empty_set;
+		return &empty_set;
+	}
+
+	return top;
+}
+
+static void destroy(struct strset n)
+{
+	if (!n.u.s[0]) {
+		if (likely(n.u.n->byte_num != (size_t)-1)) {
+			destroy(n.u.n->child[0]);
+			destroy(n.u.n->child[1]);
+		}
+		free(n.u.n);
+	}
+}
+
+void strset_destroy(struct strset *set)
+{
+	if (set->u.n)
+		destroy(*set);
+	set->u.n = NULL;
+}

+ 164 - 0
ccan/strset/strset.h

@@ -0,0 +1,164 @@
+#ifndef CCAN_STRSET_H
+#define CCAN_STRSET_H
+#include "config.h"
+#include <ccan/typesafe_cb/typesafe_cb.h>
+#include <stdlib.h>
+#include <stdbool.h>
+
+/**
+ * struct strset - representation of a string set
+ *
+ * It's exposed here to allow you to embed it and so we can inline the
+ * trivial functions.
+ */
+struct strset {
+	union {
+		struct node *n;
+		const char *s;
+	} u;
+};
+
+/**
+ * strset_init - initialize a string set (empty)
+ *
+ * For completeness; if you've arranged for it to be NULL already you don't
+ * need this.
+ *
+ * Example:
+ *	struct strset set;
+ *
+ *	strset_init(&set);
+ */
+static inline void strset_init(struct strset *set)
+{
+	set->u.n = NULL;
+}
+
+/**
+ * strset_empty - is this string set empty?
+ * @set: the set.
+ *
+ * Example:
+ *	if (!strset_empty(&set))
+ *		abort();
+ */
+static inline bool strset_empty(const struct strset *set)
+{
+	return set->u.n == NULL;
+}
+
+/**
+ * strset_test - is this a member of this string set?
+ * @set: the set.
+ * @member: the string to search for.
+ *
+ * Returns the member, or NULL if it isn't in the set.
+ *
+ * Example:
+ *	if (strset_test(&set, "hello"))
+ *		printf("hello is in the set\n");
+ */
+char *strset_test(const struct strset *set, const char *member);
+
+/**
+ * strset_set - place a member in the string set.
+ * @set: the set.
+ * @member: the string to place in the set.
+ *
+ * This returns false if we run out of memory, or (more normally) if that
+ * string already appears in the set.
+ *
+ * Note that the pointer is placed in the set, the string is not copied.  If
+ * you want a copy in the set, use strdup().
+ *
+ * Example:
+ *	if (!strset_set(&set, "goodbye"))
+ *		printf("goodbye was already in the set\n");
+ */
+bool strset_set(struct strset *set, const char *member);
+
+/**
+ * strset_clear - remove a member from the string set.
+ * @set: the set.
+ * @member: the string to remove from the set.
+ *
+ * This returns the string which was passed to strset_set(), or NULL.
+ * This means that if you allocated a string (eg. using strdup()), you can
+ * free it here.
+ *
+ * Example:
+ *	if (!strset_clear(&set, "goodbye"))
+ *		printf("goodbye was not in the set?\n");
+ */
+char *strset_clear(struct strset *set, const char *member);
+
+/**
+ * strset_destroy - remove every member from the set.
+ * @set: the set.
+ *
+ * The set will be empty after this.
+ *
+ * Example:
+ *	strset_destroy(&set);
+ */
+void strset_destroy(struct strset *set);
+
+/**
+ * strset_iterate - ordered iteration over a set
+ * @set: the set.
+ * @handle: the function to call.
+ * @arg: the argument for the function (types should match).
+ *
+ * You should not alter the set within the @handle function!  If it returns
+ * true, the iteration will stop.
+ *
+ * Example:
+ *	static bool dump_some(const char *member, int *num)
+ *	{
+ *		// Only dump out num nodes.
+ *		if (*(num--) == 0)
+ *			return true;
+ *		printf("%s\n", member);
+ *		return false;
+ *	}
+ *
+ *	static void dump_set(const struct strset *set)
+ *	{
+ *		int max = 100;
+ *		strset_iterate(set, dump_some, &max);
+ *		if (max < 0)
+ *			printf("... (truncated to 100 entries)\n");
+ *	}
+ */
+#define strset_iterate(set, handle, arg)				\
+	strset_iterate_((set), typesafe_cb_preargs(bool, void *,	\
+						   (handle), (arg),	\
+						   const char *),	\
+			(arg))
+void strset_iterate_(const struct strset *set,
+		     bool (*handle)(const char *, void *), void *data);
+
+
+/**
+ * strset_prefix - return a subset matching a prefix
+ * @set: the set.
+ * @prefix: the prefix.
+ *
+ * This returns a pointer into @set, so don't alter @set while using
+ * the return value.  You can use strset_iterate(), strset_test() or
+ * strset_empty() on the returned pointer.
+ *
+ * Example:
+ *	static void dump_prefix(const struct strset *set, const char *prefix)
+ *	{
+ *		int max = 100;
+ *		printf("Nodes with prefix %s:\n", prefix);
+ *		strset_iterate(strset_prefix(set, prefix), dump_some, &max);
+ *		if (max < 0)
+ *			printf("... (truncated to 100 entries)\n");
+ *	}
+ */
+const struct strset *strset_prefix(const struct strset *set,
+				   const char *prefix);
+
+#endif /* CCAN_STRSET_H */

+ 82 - 0
ccan/strset/test/run-hibit.c

@@ -0,0 +1,82 @@
+/* Test high bit handling. */
+#include <ccan/strset/strset.h>
+#include <ccan/strset/strset.c>
+#include <ccan/tap/tap.h>
+
+#define NUM 1000
+
+static void encode(char template[3], unsigned int val)
+{
+	assert(val < 255 * 255);
+	template[0] = (val / 255) + 1;
+	template[1] = (val % 255) + 1;
+	template[2] = '\0';
+}
+
+static bool in_order(const char *value, unsigned int *count)
+{
+	char template[3];
+	encode(template, *count);
+	ok1(streq(value, template));
+	(*count)++;
+	return false;
+}
+
+static bool in_order_by_2(const char *value, unsigned int *count)
+{
+	char template[3];
+	encode(template, *count);
+	ok1(streq(value, template));
+	(*count) += 2;
+	return false;
+}
+
+static bool dump(const char *value, void *unused)
+{
+	diag("%s", value);
+	return false;
+}
+
+int main(void)
+{
+	struct strset set;
+	unsigned int i;
+	char *str[NUM];
+
+	plan_tests(NUM + 3 * NUM / 2);
+	strset_init(&set);
+
+	for (i = 0; i < NUM; i++) {
+		char template[3];
+		encode(template, i);
+		str[i] = strdup(template);
+	}
+
+	for (i = 0; i < NUM; i++)
+		strset_set(&set, str[i]);
+
+	strset_iterate(&set, dump, NULL);
+
+	/* Iterate. */
+	i = 0;
+	strset_iterate(&set, in_order, &i);
+
+	/* Preserve order after deletion. */
+	for (i = 0; i < NUM; i += 2)
+		ok1(strset_clear(&set, str[i]) == str[i]);
+
+	i = 1;
+	strset_iterate(&set, in_order_by_2, &i);
+
+	for (i = 1; i < NUM; i += 2)
+		ok1(strset_clear(&set, str[i]) == str[i]);
+
+	/* empty traverse. */
+	strset_iterate(&set, in_order_by_2, (unsigned int *)NULL);
+
+	for (i = 0; i < NUM; i++)
+		free(str[i]);
+
+	/* This exits depending on whether all tests passed */
+	return exit_status();
+}

+ 81 - 0
ccan/strset/test/run-order.c

@@ -0,0 +1,81 @@
+#include <ccan/strset/strset.h>
+#include <ccan/strset/strset.c>
+#include <ccan/tap/tap.h>
+#include <stdio.h>
+
+#define NUM 1000
+
+static bool in_order(const char *value, unsigned int *count)
+{
+	int i = atoi(value);
+	ok1(*count == i);
+	(*count)++;
+	return false;
+}
+
+static bool in_order_by_2(const char *value, unsigned int *count)
+{
+	int i = atoi(value);
+	ok1(*count == i);
+	(*count) += 2;
+	return false;
+}
+
+static bool dump(const char *value, void *unused)
+{
+	diag("%s", value);
+	return false;
+}
+
+int main(void)
+{
+	struct strset set;
+	unsigned int i;
+	char *str[NUM];
+
+	plan_tests(NUM * 2 + 3 * NUM / 2);
+	strset_init(&set);
+
+	for (i = 0; i < NUM; i++) {
+		char template[10];
+		sprintf(template, "%08u", i);
+		str[i] = strdup(template);
+	}
+
+	for (i = 0; i < NUM; i++)
+		strset_set(&set, str[i]);
+
+	strset_iterate(&set, dump, NULL);
+
+	/* Iterate. */
+	i = 0;
+	strset_iterate(&set, in_order, &i);
+
+	/* Preserve order after deletion. */
+	for (i = 0; i < NUM; i += 2)
+		ok1(strset_clear(&set, str[i]) == str[i]);
+
+	i = 1;
+	strset_iterate(&set, in_order_by_2, &i);
+
+	for (i = 1; i < NUM; i += 2)
+		ok1(strset_clear(&set, str[i]) == str[i]);
+
+	/* empty traverse. */
+	strset_iterate(&set, in_order_by_2, (unsigned int *)NULL);
+
+	/* insert backwards, should be fine. */
+	for (i = 0; i < NUM; i++)
+		strset_set(&set, str[NUM-1-i]);
+
+	i = 0;
+	strset_iterate(&set, in_order, &i);
+
+	strset_destroy(&set);
+
+	for (i = 0; i < NUM; i++)
+		free(str[i]);
+
+	/* This exits depending on whether all tests passed */
+	return exit_status();
+}

+ 87 - 0
ccan/strset/test/run-prefix.c

@@ -0,0 +1,87 @@
+#include <ccan/strset/strset.h>
+#include <ccan/strset/strset.c>
+#include <ccan/tap/tap.h>
+#include <stdio.h>
+
+/* Must be > 100, see below. */
+#define NUM 200
+
+static bool in_order(const char *value, unsigned int *count)
+{
+	int i = atoi(value);
+	ok1(*count == i);
+	(*count)++;
+	return false;
+}
+
+static bool find_empty(const char *value, char *empty)
+{
+	if (value == empty)
+		pass("Found empty entry!");
+	return false;
+}
+
+int main(void)
+{
+	struct strset set;
+	const struct strset *sub;
+	unsigned int i;
+	char *str[NUM], *empty;
+
+	plan_tests(7 + 1 + 10 + 100);
+	strset_init(&set);
+
+	for (i = 0; i < NUM; i++) {
+		char template[10];
+		sprintf(template, "%08u", i);
+		str[i] = strdup(template);
+	}
+
+	for (i = 0; i < NUM; i++)
+		strset_set(&set, str[i]);
+
+	/* Nothing */
+	sub = strset_prefix(&set, "a");
+	ok1(strset_empty(sub));
+
+	/* Everything */
+	sub = strset_prefix(&set, "0");
+	ok1(sub->u.n == set.u.n);
+	sub = strset_prefix(&set, "");
+	ok1(sub->u.n == set.u.n);
+
+	/* Singleton. */
+	sub = strset_prefix(&set, "00000000");
+	i = 0;
+	strset_iterate(sub, in_order, &i);
+	ok1(i == 1);
+
+	/* First 10. */
+	sub = strset_prefix(&set, "0000000");
+	i = 0;
+	strset_iterate(sub, in_order, &i);
+	ok1(i == 10);
+
+	/* First 100. */
+	sub = strset_prefix(&set, "000000");
+	i = 0;
+	strset_iterate(sub, in_order, &i);
+	ok1(i == 100);
+
+	/* Everything, *plus* empty string. */
+	empty = strdup("");
+	strset_set(&set, empty);
+
+	sub = strset_prefix(&set, "");
+	/* Check we get *our* empty string back! */
+	strset_iterate(sub, find_empty, empty);
+
+	strset_destroy(&set);
+
+	for (i = 0; i < NUM; i++)
+		free(str[i]);
+	free(empty);
+
+	/* This exits depending on whether all tests passed */
+	return exit_status();
+}

+ 56 - 0
ccan/strset/test/run.c

@@ -0,0 +1,56 @@
+#include <ccan/strset/strset.h>
+#include <ccan/strset/strset.c>
+#include <ccan/tap/tap.h>
+
+int main(void)
+{
+	struct strset set;
+	const char str[] = "hello";
+	const char none[] = "";
+	char *dup = strdup(str);
+
+	/* This is how many tests you plan to run */
+	plan_tests(24);
+
+	strset_init(&set);
+
+	ok1(!strset_test(&set, str));
+	ok1(!strset_test(&set, none));
+	ok1(!strset_clear(&set, str));
+	ok1(!strset_clear(&set, none));
+
+	ok1(strset_set(&set, str));
+	ok1(strset_test(&set, str));
+	/* We compare the string, not the pointer. */
+	ok1(strset_test(&set, dup));
+	ok1(!strset_test(&set, none));
+
+	/* Delete should return original string. */
+	ok1(strset_clear(&set, dup) == str);
+	ok1(!strset_test(&set, str));
+	ok1(!strset_test(&set, none));
+
+	/* Try insert and delete of empty string. */
+	ok1(strset_set(&set, none));
+	ok1(strset_test(&set, none));
+	ok1(!strset_test(&set, str));
+
+	/* Delete should return original string. */
+	ok1(strset_clear(&set, "") == none);
+	ok1(!strset_test(&set, str));
+	ok1(!strset_test(&set, none));
+
+	/* Both at once... */
+	ok1(strset_set(&set, none));
+	ok1(strset_set(&set, str));
+	ok1(strset_test(&set, str));
+	ok1(strset_test(&set, none));
+	ok1(strset_clear(&set, "") == none);
+	ok1(strset_clear(&set, dup) == str);
+
+	ok1(set.u.n == NULL);
+	free(dup);
+
+	/* This exits depending on whether all tests passed */
+	return exit_status();
+}