Browse Source

tdb2: initial commit (doesn't work, still writing tests)

Rusty Russell 15 years ago
parent
commit
39f01834db

+ 81 - 0
ccan/tdb2/_info

@@ -0,0 +1,81 @@
+#include <string.h>
+#include <stdio.h>
+
+/**
+ * tdb2 - [[WORK IN PROGRESS!]] The trivial (64bit transactional) database
+ *
+ * The tdb2 module provides an efficient key/data mapping (usually
+ * within a file).  It supports transactions, so the contents of the
+ * database are reliable even across crashes.
+ *
+ * Example:
+ *	#include <ccan/tdb2/tdb2.h>
+ *	#include <ccan/str/str.h>
+ *	#include <err.h>
+ *	#include <stdio.h>
+ *	
+ *	static void usage(void)
+ *	{
+ *		errx(1, "Usage: fetch <dbfile> <key>\n"
+ *		     "OR store <dbfile> <key> <data>");
+ *	}
+ *	
+ *	int main(int argc, char *argv[])
+ *	{
+ *		struct tdb_context *tdb;
+ *		TDB_DATA key, value;
+ *	
+ *		if (argc < 4)
+ *			usage();
+ *	
+ *		tdb = tdb_open(argv[2], 1024, TDB_DEFAULT, O_CREAT|O_RDWR,
+ *				0600);
+ *		if (!tdb)
+ *			err(1, "Opening %s", argv[2]);
+ *	
+ *		key.dptr = (void *)argv[3];
+ *		key.dsize = strlen(argv[3]);
+ *	
+ *		if (streq(argv[1], "fetch")) {
+ *			if (argc != 4)
+ *				usage();
+ *			value = tdb_fetch(tdb, key);
+ *			if (!value.dptr)
+ *				errx(1, "fetch %s: %s",
+ *				     argv[3], tdb_errorstr(tdb));
+ *			printf("%.*s\n", value.dsize, (char *)value.dptr);
+ *			free(value.dptr);
+ *		} else if (streq(argv[1], "store")) {
+ *			if (argc != 5)
+ *				usage();
+ *			value.dptr = (void *)argv[4];
+ *			value.dsize = strlen(argv[4]);
+ *			if (tdb_store(tdb, key, value, 0) != 0)
+ *				errx(1, "store %s: %s",
+ *				     argv[3], tdb_errorstr(tdb));
+ *		} else
+ *			usage();
+ *	
+ *		return 0;
+ *	}
+ *
+ * Maintainer: Rusty Russell <rusty@rustcorp.com.au>
+ *
+ * Author: Rusty Russell
+ *
+ * Licence: LGPLv3 (or later)
+ */
+int main(int argc, char *argv[])
+{
+	/* Exactly one argument is expected, and "depends" is the only
+	 * query this module answers. */
+	if (argc == 2 && strcmp(argv[1], "depends") == 0) {
+		/* One required ccan module per line. */
+		printf("ccan/hash\n"
+		       "ccan/likely\n"
+		       "ccan/asearch\n");
+		return 0;
+	}
+
+	/* Wrong argument count or unrecognised query. */
+	return 1;
+}

+ 411 - 0
ccan/tdb2/check.c

@@ -0,0 +1,411 @@
+ /* 
+   Trivial Database 2: database consistency checking
+   Copyright (C) Rusty Russell 2010
+   
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "private.h"
+#include <ccan/likely/likely.h>
+#include <ccan/asearch/asearch.h>
+
+/*
+ * We keep an ordered array of offsets: grow *arr by one element, store
+ * off in the new last slot and bump *num.  Returns false, leaving *arr
+ * and *num untouched, if the reallocation fails.
+ */
+static bool append(tdb_off_t **arr, size_t *num, tdb_off_t off)
+{
+	size_t count = *num;
+	tdb_off_t *grown;
+
+	grown = realloc(*arr, (count + 1) * sizeof(*grown));
+	if (grown == NULL)
+		return false;
+
+	grown[count] = off;
+	*arr = grown;
+	*num = count + 1;
+	return true;
+}
+
+/*
+ * Sanity-check the header fields held in tdb->header.
+ *
+ * Checks, in order: that hashing TDB_HASH_MAGIC with this context's hash
+ * function gives the value stored in the header; that the magic string
+ * matches TDB_MAGIC_FOOD; that hash_bits, zone_bits and free_buckets are
+ * not below their initial values (and that the bit counts are usable as
+ * shift counts); and that num_zones zones of 2^zone_bits bytes cover the
+ * whole mapped file.
+ *
+ * Logs the first problem found and returns false; returns true if every
+ * check passes.  hash_off and free_off are validated later by
+ * tdb_check().
+ */
+static bool check_header(struct tdb_context *tdb)
+{
+	uint64_t hash_test;
+
+	hash_test = TDB_HASH_MAGIC;
+	hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
+	if (tdb->header.hash_test != hash_test) {
+		/* Cast as everywhere else in this file: uint64_t need not
+		 * be the type %llu expects. */
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "check: hash test %llu should be %llu\n",
+			 (long long)tdb->header.hash_test,
+			 (long long)hash_test);
+		return false;
+	}
+	if (strcmp(tdb->header.magic_food, TDB_MAGIC_FOOD) != 0) {
+		/* The precision argument of %.*s must be an int, not the
+		 * size_t which sizeof yields. */
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "check: bad magic '%.*s'\n",
+			 (int)sizeof(tdb->header.magic_food),
+			 tdb->header.magic_food);
+		return false;
+	}
+	/* 64 or more bits can never be valid, and would make the 1ULL
+	 * shifts by these fields undefined. */
+	if (tdb->header.v.hash_bits < INITIAL_HASH_BITS
+	    || tdb->header.v.hash_bits >= 64) {
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "check: bad hash bits %llu\n",
+			 (long long)tdb->header.v.hash_bits);
+		return false;
+	}
+	if (tdb->header.v.zone_bits < INITIAL_ZONE_BITS
+	    || tdb->header.v.zone_bits >= 64) {
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "check: bad zone_bits %llu\n",
+			 (long long)tdb->header.v.zone_bits);
+		return false;
+	}
+	if (tdb->header.v.free_buckets < INITIAL_FREE_BUCKETS) {
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "check: bad free_buckets %llu\n",
+			 (long long)tdb->header.v.free_buckets);
+		return false;
+	}
+	if ((1ULL << tdb->header.v.zone_bits) * tdb->header.v.num_zones
+	    < tdb->map_size) {
+		/* Argument order follows the message: zone count first,
+		 * then the size of each zone, then the file size. */
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "check: %llu zones size %llu don't cover %llu\n",
+			 (long long)tdb->header.v.num_zones,
+			 (long long)(1ULL << tdb->header.v.zone_bits),
+			 (long long)tdb->map_size);
+		return false;
+	}
+
+	/* We check hash_off and free_off later. */
+
+	/* Don't check reserved: they *can* be used later. */
+	return true;
+}
+
+/*
+ * Three-way comparison of two offsets, used as the asearch() comparator
+ * over the ordered offset arrays.  Returns 1, -1 or 0.
+ */
+static int off_cmp(const tdb_off_t *a, const tdb_off_t *b)
+{
+	/* Compare the offsets themselves, not the pointers to them.  We
+	 * can't simply return *a - *b: that can overflow an int. */
+	return *a > *b ? 1
+		: *a < *b ? -1
+		: 0;
+}
+
+/*
+ * Check the hash table against used[], the ordered array of record
+ * offsets which must all be referenced from the hash.
+ *
+ * The record header just before hash_off must describe exactly the hash
+ * array (data length equal to the table size, zero key length, zero
+ * hash).  Every non-zero slot must then hold an offset present in
+ * used[], whose recomputed hash agrees with the hash bits stored in its
+ * record header, and which sits in a plausible slot for that hash.
+ * Finally the number of slots matched must equal num_used.
+ *
+ * Note that used[] is modified: each matched entry has its low bit
+ * flipped.
+ *
+ * Returns true if everything is consistent, false otherwise (the
+ * mismatches detected here are logged first).
+ */
+static bool check_hash_list(struct tdb_context *tdb,
+			    tdb_off_t used[],
+			    size_t num_used)
+{
+	struct tdb_used_record rec;
+	tdb_len_t hashlen, i, num_nonzero;
+	tdb_off_t h;
+	size_t num_found;
+
+	/* Size in bytes of the hash array: one offset per bucket. */
+	hashlen = sizeof(tdb_off_t) << tdb->header.v.hash_bits;
+
+	/* The table's own record header sits immediately before hash_off. */
+	if (tdb_read_convert(tdb, tdb->header.v.hash_off - sizeof(rec),
+			     &rec, sizeof(rec)) == -1)
+		return false;
+
+	if (rec_data_length(&rec) != hashlen) {
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_check: Bad hash table length %llu vs %llu\n",
+			 (long long)rec_data_length(&rec),
+			 (long long)hashlen);
+		return false;
+	}
+	if (rec_key_length(&rec) != 0) {
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_check: Bad hash table key length %llu\n",
+			 (long long)rec_key_length(&rec));
+		return false;
+	}
+	if (rec_hash(&rec) != 0) {
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_check: Bad hash table hash value %llu\n",
+			 (long long)rec_hash(&rec));
+		return false;
+	}
+
+	/* num_found counts matched slots; num_nonzero is the length of the
+	 * run of consecutive non-zero slots ending just before slot i. */
+	num_found = 0;
+	num_nonzero = 0;
+	for (i = 0, h = tdb->header.v.hash_off;
+	     i < (1ULL << tdb->header.v.hash_bits);
+	     i++, h += sizeof(tdb_off_t)) {
+		tdb_off_t off, *p, pos;
+		/* Shadows the outer rec: this one is the header of the
+		 * record the slot points at. */
+		struct tdb_used_record rec;
+		uint64_t hash;
+
+		off = tdb_read_off(tdb, h);
+		if (off == TDB_OFF_ERR)
+			return false;
+		if (!off) {
+			/* An empty slot ends the current run. */
+			num_nonzero = 0;
+			continue;
+		}
+		/* FIXME: Check hash bits */
+		p = asearch(&off, used, num_used, off_cmp);
+		if (!p) {
+			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+				 "tdb_check: Invalid offset %llu in hash\n",
+				 (long long)off);
+			return false;
+		}
+		/* Mark it invalid.  (Presumably so that a second slot
+		 * holding the same offset no longer matches -- confirm.) */
+		*p ^= 1;
+		num_found++;
+
+		if (tdb_read_convert(tdb, off, &rec, sizeof(rec)) == -1)
+			return false;
+
+		/* Check it is hashed correctly. */
+		hash = hash_record(tdb, off);
+
+		/* Top bits must match header (the top 11 of the 64). */
+		if (hash >> (64 - 11) != rec_hash(&rec)) {
+			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+				 "tdb_check: Bad hash magic at offset %llu"
+				 " (0x%llx vs 0x%llx)\n",
+				 (long long)off,
+				 (long long)hash, (long long)rec_hash(&rec));
+			return false;
+		}
+
+		/* It must be in the right place in hash array: the slot its
+		 * hash selects (pos) must lie within the current run, i.e.
+		 * between i - num_nonzero and i inclusive. */
+		pos = hash & ((1ULL << tdb->header.v.hash_bits)-1);
+		if (pos < i - num_nonzero || pos > i) {
+			/* Could be wrap from end of array?  FIXME: check? */
+			/* Only tolerated when the run started at slot 0. */
+			if (i != num_nonzero) {
+				tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+					 "tdb_check: Bad hash position %llu at"
+					 " offset %llu hash 0x%llx\n",
+					 (long long)i,
+					 (long long)off,
+					 (long long)hash);
+				return false;
+			}
+		}
+		num_nonzero++;
+	}
+
+	/* Every offset in used[] must have been referenced by some slot. */
+	if (num_found != num_used) {
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_check: Not all entries are in hash\n");
+		return false;
+	}
+	return true;
+}
+
+/*
+ * Validate a single free record, frec, found at offset off while walking
+ * the free list for the given zone and bucket.  prev is the offset of
+ * the record visited just before it on that list (0 for the first).
+ *
+ * The checks run in a fixed order and stop at the first failure: free
+ * magic, bounds, zone, bucket, then the back-pointer.  Every failure
+ * except the bounds check is logged here.  Returns true only if all
+ * checks pass.
+ */
+static bool check_free(struct tdb_context *tdb,
+		       tdb_off_t off,
+		       const struct tdb_free_record *frec,
+		       tdb_off_t prev,
+		       tdb_off_t zone, unsigned int bucket)
+{
+	bool ok = false;
+
+	if (frec->magic != TDB_FREE_MAGIC) {
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_check: offset %llu bad magic 0x%llx\n",
+			 (long long)off, (long long)frec->magic);
+	} else if (tdb->methods->oob(tdb,
+				     off + frec->data_len
+				     - sizeof(struct tdb_used_record),
+				     true)) {
+		/* Out of bounds: no message is logged from here. */
+	} else if (zone_of(tdb, off) != zone) {
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_check: offset %llu in wrong zone %llu vs %llu\n",
+			 (long long)off,
+			 (long long)zone, (long long)zone_of(tdb, off));
+	} else if (size_to_bucket(tdb, frec->data_len) != bucket) {
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_check: offset %llu in wrong bucket %u vs %u\n",
+			 (long long)off,
+			 bucket, size_to_bucket(tdb, frec->data_len));
+	} else if (prev != frec->prev) {
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_check: offset %llu bad prev %llu vs %llu\n",
+			 (long long)off,
+			 (long long)prev, (long long)frec->prev);
+	} else {
+		ok = true;
+	}
+
+	return ok;
+}
+		       
+/*
+ * Check the free table against free[], the ordered array of free record
+ * offsets.
+ *
+ * The record header just before free_off must describe exactly the free
+ * table (data length of num_zones * (free_buckets + 1) offsets, zero key
+ * length, zero hash).  Each list head in the table is then followed via
+ * the records' next pointers; every record on a list must pass
+ * check_free() for that zone and bucket and must be present in free[].
+ * Finally the number of records seen must equal num_free.
+ *
+ * Note that free[] is modified: each matched entry has its low bit
+ * flipped.
+ *
+ * Returns true if everything is consistent, false otherwise.
+ */
+static bool check_free_list(struct tdb_context *tdb,
+			    tdb_off_t free[],
+			    size_t num_free)
+{
+	struct tdb_used_record rec;
+	tdb_len_t freelen, i, j;
+	tdb_off_t h;
+	size_t num_found;
+
+	/* Size in bytes of the free table: one list head per bucket, with
+	 * free_buckets + 1 buckets in each zone. */
+	freelen = sizeof(tdb_off_t) * tdb->header.v.num_zones
+		* (tdb->header.v.free_buckets + 1);
+
+	/* The table's own record header sits immediately before free_off. */
+	if (tdb_read_convert(tdb, tdb->header.v.free_off - sizeof(rec),
+			     &rec, sizeof(rec)) == -1)
+		return false;
+
+	if (rec_data_length(&rec) != freelen) {
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_check: Bad free table length %llu vs %llu\n",
+			 (long long)rec_data_length(&rec),
+			 (long long)freelen);
+		return false;
+	}
+	if (rec_key_length(&rec) != 0) {
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_check: Bad free table key length %llu\n",
+			 (long long)rec_key_length(&rec));
+		return false;
+	}
+	if (rec_hash(&rec) != 0) {
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_check: Bad free table hash value %llu\n",
+			 (long long)rec_hash(&rec));
+		return false;
+	}
+
+	/* h steps through the list heads: zone i, bucket j. */
+	num_found = 0;
+	h = tdb->header.v.free_off;
+	for (i = 0; i < tdb->header.v.num_zones; i++) {
+		for (j = 0; j <= tdb->header.v.free_buckets;
+		     j++, h += sizeof(tdb_off_t)) {
+			tdb_off_t off, prev = 0, *p;
+			struct tdb_free_record f;
+
+			/* Walk this bucket's list until a zero offset. */
+			for (off = tdb_read_off(tdb, h); off; off = f.next) {
+				if (off == TDB_OFF_ERR)
+					return false;
+				if (tdb_read_convert(tdb, off, &f, sizeof(f)))
+					return false;
+				if (!check_free(tdb, off, &f, prev, i, j))
+					return false;
+
+				/* FIXME: Check hash bits */
+				p = asearch(&off, free, num_free, off_cmp);
+				if (!p) {
+					tdb->log(tdb, TDB_DEBUG_ERROR,
+						 tdb->log_priv,
+						 "tdb_check: Invalid offset"
+						 " %llu in free table\n",
+						 (long long)off);
+					return false;
+				}
+				/* Mark it invalid.  (Presumably so that a
+				 * record reached twice no longer matches --
+				 * confirm.) */
+				*p ^= 1;
+				num_found++;
+				/* The next record's prev must point here. */
+				prev = off;
+			}
+		}
+	}
+	/* Every offset in free[] must have been reached from some list. */
+	if (num_found != num_free) {
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_check: Not all entries are in free table\n");
+		return false;
+	}
+	return true;
+}
+
+/*
+ * Check the consistency of the whole database.
+ *
+ * Takes a read lock on all records, refreshes and validates the header,
+ * then scans the file linearly from just after the header, collecting
+ * the offsets of free records and of ordinary used records while
+ * bounds-checking each one.  The hash table and free table must each be
+ * found at the offset the header gives.  The collected offsets are then
+ * cross-checked against the hash table and the free table.
+ *
+ * Returns true if the database is consistent and false if a problem was
+ * found; returns -1 if the lock could not be taken.
+ * NOTE(review): -1 is also "true" to a caller testing truthiness, so
+ * these return values want unifying -- kept as-is here because the
+ * callers are not visible.
+ *
+ * FIXME: call check() function.
+ */
+int tdb_check(struct tdb_context *tdb,
+	      int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
+	      void *private_data)
+{
+	/* Named "fr", not "free": we need libc's free() for the cleanup. */
+	tdb_off_t *fr = NULL, *used = NULL, off;
+	tdb_len_t len;
+	size_t num_free = 0, num_used = 0;
+	bool hash_found = false, free_found = false;
+	int ret = false;
+
+	if (tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false) != 0)
+		return -1;
+
+	update_header(tdb);
+
+	if (!check_header(tdb))
+		goto out;
+
+	/* First we do a linear scan, checking all records. */
+	for (off = sizeof(struct tdb_header);
+	     off < tdb->map_size;
+	     off += len) {
+		union {
+			struct tdb_used_record u;
+			struct tdb_free_record f;
+		} pad, *p;
+		p = tdb_get(tdb, off, &pad, sizeof(pad));
+		if (!p)
+			goto out;
+		if (p->f.magic == TDB_FREE_MAGIC) {
+			/* This record is free! */
+			if (!append(&fr, &num_free, off))
+				goto out;
+			len = sizeof(p->u) + p->f.data_len;
+			if (tdb->methods->oob(tdb, off + len, false))
+				goto out;
+		} else {
+			uint64_t klen, dlen, extra;
+
+			/* This record is used! */
+			if (rec_magic(&p->u) != TDB_MAGIC) {
+				tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+					 "tdb_check: Bad magic 0x%llx"
+					 " at offset %llu\n",
+					 (long long)rec_magic(&p->u),
+					 (long long)off);
+				goto out;
+			}
+
+			klen = rec_key_length(&p->u);
+			dlen = rec_data_length(&p->u);
+			extra = rec_extra_padding(&p->u);
+
+			len = sizeof(p->u) + klen + dlen + extra;
+			if (tdb->methods->oob(tdb, off + len, false))
+				goto out;
+
+			if (off + sizeof(p->u) == tdb->header.v.hash_off) {
+				hash_found = true;
+			} else if (off + sizeof(p->u)
+				   == tdb->header.v.free_off) {
+				free_found = true;
+			} else {
+				/* Only ordinary records go in used[]:
+				 * check_hash_list() insists every entry is
+				 * referenced from the hash, which the two
+				 * tables themselves never are. */
+				if (!append(&used, &num_used, off))
+					goto out;
+			}
+		}
+	}
+
+	if (!hash_found) {
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_check: hash table not found at %llu\n",
+			 (long long)tdb->header.v.hash_off);
+		goto out;
+	}
+
+	if (!free_found) {
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_check: free table not found at %llu\n",
+			 (long long)tdb->header.v.free_off);
+		goto out;
+	}
+
+	/* FIXME: Check key uniqueness? */
+	if (!check_hash_list(tdb, used, num_used))
+		goto out;
+
+	if (!check_free_list(tdb, fr, num_free))
+		goto out;
+
+	ret = true;
+
+out:
+	/* Shared exit for success and failure: drop the lock and release
+	 * the offset arrays, which were previously leaked. */
+	tdb_allrecord_unlock(tdb, F_RDLCK);
+	free(fr);
+	free(used);
+	return ret;
+}

+ 1050 - 0
ccan/tdb2/doc/design-1.3.txt

@@ -0,0 +1,1050 @@
+TDB2: Redesigning The Trivial DataBase
+
+Rusty Russell, IBM Corporation
+
+27-April-2010
+
+Abstract
+
+The Trivial DataBase on-disk format is 32 bits; with usage cases 
+heading towards the 4G limit, that must change. This required 
+breakage provides an opportunity to revisit TDB's other design 
+decisions and reassess them.
+
+1 Introduction
+
+The Trivial DataBase was originally written by Andrew Tridgell as 
+a simple key/data pair storage system with the same API as dbm, 
+but allowing multiple readers and writers while being small 
+enough (< 1000 lines of C) to include in SAMBA. The simple design 
+created in 1999 has proven surprisingly robust and performant, 
+used in Samba versions 3 and 4 as well as numerous other 
+projects. Its useful life was greatly increased by the 
+(backwards-compatible!) addition of transaction support in 2005.
+
+The wider variety and greater demands of TDB-using code have led 
+to some organic growth of the API, as well as some compromises on 
+the implementation. None of these, by themselves, are seen as 
+show-stoppers, but the cumulative effect is a loss of elegance 
+over the initial, simple TDB implementation. Here is a table of 
+the approximate number of lines of implementation code and number 
+of API functions at the end of each year:
+
+
++-----------+----------------+--------------------------------+
+| Year End  | API Functions  | Lines of C Code Implementation |
++-----------+----------------+--------------------------------+
++-----------+----------------+--------------------------------+
+|   1999    |      13        |              1195              |
++-----------+----------------+--------------------------------+
+|   2000    |      24        |              1725              |
++-----------+----------------+--------------------------------+
+|   2001    |      32        |              2228              |
++-----------+----------------+--------------------------------+
+|   2002    |      35        |              2481              |
++-----------+----------------+--------------------------------+
+|   2003    |      35        |              2552              |
++-----------+----------------+--------------------------------+
+|   2004    |      40        |              2584              |
++-----------+----------------+--------------------------------+
+|   2005    |      38        |              2647              |
++-----------+----------------+--------------------------------+
+|   2006    |      52        |              3754              |
++-----------+----------------+--------------------------------+
+|   2007    |      66        |              4398              |
++-----------+----------------+--------------------------------+
+|   2008    |      71        |              4768              |
++-----------+----------------+--------------------------------+
+|   2009    |      73        |              5715              |
++-----------+----------------+--------------------------------+
+
+
+This review is an attempt to catalog and address all the known 
+issues with TDB and create solutions which address the problems 
+without significantly increasing complexity; all involved are far 
+too aware of the dangers of second system syndrome in rewriting a 
+successful project like this.
+
+2 API Issues
+
+2.1 tdb_open_ex Is Not Expandable
+
+The tdb_open() call was expanded to tdb_open_ex(), which added an 
+optional hashing function and an optional logging function 
+argument. Additional arguments to open would require the 
+introduction of a tdb_open_ex2 call etc.
+
+2.1.1 Proposed Solution
+
+tdb_open() will take a linked-list of attributes:
+
+enum tdb_attribute {
+
+    TDB_ATTRIBUTE_LOG = 0,
+
+    TDB_ATTRIBUTE_HASH = 1
+
+};
+
+struct tdb_attribute_base {
+
+    enum tdb_attribute attr;
+
+    union tdb_attribute *next;
+
+};
+
+struct tdb_attribute_log {
+
+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG 
+*/
+
+    tdb_log_func log_fn;
+
+    void *log_private;
+
+};
+
+struct tdb_attribute_hash {
+
+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH 
+*/
+
+    tdb_hash_func hash_fn;
+
+    void *hash_private;
+
+};
+
+union tdb_attribute {
+
+    struct tdb_attribute_base base;
+
+    struct tdb_attribute_log log;
+
+    struct tdb_attribute_hash hash;
+
+};
+
+This allows future attributes to be added, even if this expands 
+the size of the union.
+
+2.2 tdb_traverse Makes Impossible Guarantees
+
+tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, 
+and it was thought that it was important to guarantee that all 
+records which exist at the start and end of the traversal would 
+be included, and no record would be included twice.
+
+This adds complexity (see [Reliable-Traversal-Adds]) and does not 
+work anyway for records which are altered (in particular, those 
+which are expanded may be effectively deleted and re-added behind 
+the traversal).
+
+2.2.1 <traverse-Proposed-Solution>Proposed Solution
+
+Abandon the guarantee. You will see every record if no changes 
+occur during your traversal, otherwise you will see some subset. 
+You can prevent changes by using a transaction or the locking 
+API.
+
+2.3 Nesting of Transactions Is Fraught
+
+TDB has alternated between allowing nested transactions and not 
+allowing them. Various paths in the Samba codebase assume that 
+transactions will nest, and in a sense they can: the operation is 
+only committed to disk when the outer transaction is committed. 
+There are two problems, however:
+
+1. Canceling the inner transaction will cause the outer 
+  transaction commit to fail, and will not undo any operations 
+  since the inner transaction began. This problem is soluble with 
+  some additional internal code.
+
+2. An inner transaction commit can be cancelled by the outer 
+  transaction. This is desirable in the way which Samba's 
+  database initialization code uses transactions, but could be a 
+  surprise to any users expecting a successful transaction commit 
+  to expose changes to others.
+
+The current solution is to specify the behavior at tdb_open(), 
+with the default currently that nested transactions are allowed. 
+This flag can also be changed at runtime.
+
+2.3.1 Proposed Solution
+
+Given the usage patterns, it seems that the “least-surprise” 
+behavior of disallowing nested transactions should become the 
+default. Additionally, it seems the outer transaction is the only 
+code which knows whether inner transactions should be allowed, so 
+a flag to indicate this could be added to tdb_transaction_start. 
+However, this behavior can be simulated with a wrapper which uses 
+tdb_add_flags() and tdb_remove_flags(), so the API should not be 
+expanded for this relatively-obscure case.
+
+2.4 Incorrect Hash Function is Not Detected
+
+tdb_open_ex() allows the calling code to specify a different hash 
+function to use, but does not check that all other processes 
+accessing this tdb are using the same hash function. The result 
+is that records are missing from tdb_fetch().
+
+2.4.1 Proposed Solution
+
+The header should contain an example hash result (eg. the hash of 
+0xdeadbeef), and tdb_open_ex() should check that the given hash 
+function produces the same answer, or fail the tdb_open call.
+
+2.5 tdb_set_max_dead/TDB_VOLATILE Expose Implementation
+
+In response to scalability issues with the free list ([TDB-Freelist-Is]), 
+two API workarounds have been incorporated in TDB: 
+tdb_set_max_dead() and the TDB_VOLATILE flag to tdb_open. The 
+latter actually calls the former with an argument of “5”.
+
+This code allows deleted records to accumulate without putting 
+them in the free list. On delete we iterate through each chain 
+and free them in a batch if there are more than max_dead entries. 
+These are never otherwise recycled except as a side-effect of a 
+tdb_repack.
+
+2.5.1 Proposed Solution
+
+With the scalability problems of the freelist solved, this API 
+can be removed. The TDB_VOLATILE flag may still be useful as a 
+hint that store and delete of records will be at least as common 
+as fetch in order to allow some internal tuning, but initially 
+will become a no-op.
+
+2.6 <TDB-Files-Cannot>TDB Files Cannot Be Opened Multiple Times 
+  In The Same Process
+
+No process can open the same TDB twice; we check and disallow it. 
+This is an unfortunate side-effect of fcntl locks, which operate 
+on a per-file rather than per-file-descriptor basis, and do not 
+nest. Thus, closing any file descriptor on a file clears all the 
+locks obtained by this process, even if they were placed using a 
+different file descriptor!
+
+Note that even if this were solved, deadlock could occur if 
+operations were nested: this is a more manageable programming 
+error in most cases.
+
+2.6.1 Proposed Solution
+
+We could lobby POSIX to fix the perverse rules, or at least lobby 
+Linux to violate them so that the most common implementation does 
+not have this restriction. This would be a generally good idea 
+for other fcntl lock users.
+
+Samba uses a wrapper which hands out the same tdb_context to 
+multiple callers if this happens, and does simple reference 
+counting. We should do this inside the tdb library, which already 
+emulates lock nesting internally; it would need to recognize when 
+deadlock occurs within a single process. This would create a new 
+failure mode for tdb operations (while we currently handle 
+locking failures, they are impossible in normal use and a process 
+encountering them can do little but give up).
+
+I do not see benefit in an additional tdb_open flag to indicate 
+whether re-opening is allowed, although there may be some 
+benefit to adding a call to detect when a tdb_context is shared, 
+to allow others to create such an API.
+
+2.7 TDB API Is Not POSIX Thread-safe
+
+The TDB API uses an error code which can be queried after an 
+operation to determine what went wrong. This programming model 
+does not work with threads, unless specific additional guarantees 
+are given by the implementation. In addition, even 
+otherwise-independent threads cannot open the same TDB (as in 
+[TDB-Files-Cannot]).
+
+2.7.1 Proposed Solution
+
+Rearchitecting the API to include a tdb_errcode pointer would be a 
+great deal of churn; we are better to guarantee that the 
+tdb_errcode is per-thread so the current programming model can be 
+maintained.
+
+This requires dynamic per-thread allocations, which is awkward 
+with POSIX threads (pthread_key_create space is limited and we 
+cannot simply allocate a key for every TDB).
+
+Internal locking is required to make sure that fcntl locks do not 
+overlap between threads, and also that the global list of tdbs is 
+maintained.
+
+The aim is that building tdb with -DTDB_PTHREAD will result in a 
+pthread-safe version of the library, and otherwise no overhead 
+will exist.
+
+2.8 *_nonblock Functions And *_mark Functions Expose 
+  Implementation
+
+CTDB[footnote:
+Clustered TDB, see http://ctdb.samba.org
+] wishes to operate on TDB in a non-blocking manner. This is 
+currently done as follows:
+
+1. Call the _nonblock variant of an API function (eg. 
+  tdb_lockall_nonblock). If this fails:
+
+2. Fork a child process, and wait for it to call the normal 
+  variant (eg. tdb_lockall).
+
+3. If the child succeeds, call the _mark variant to indicate we 
+  already have the locks (eg. tdb_lockall_mark).
+
+4. Upon completion, tell the child to release the locks (eg. 
+  tdb_unlockall).
+
+5. Indicate to tdb that it should consider the locks removed (eg. 
+  tdb_unlockall_mark).
+
+There are several issues with this approach. Firstly, adding two 
+new variants of each function clutters the API for an obscure 
+use, and so not all functions have three variants. Secondly, it 
+assumes that all paths of the functions ask for the same locks, 
+otherwise the parent process will have to get a lock which the 
+child doesn't have under some circumstances. I don't believe this 
+is currently the case, but it constrains the implementation. 
+
+2.8.1 <Proposed-Solution-locking-hook>Proposed Solution
+
+Implement a hook for locking methods, so that the caller can 
+control the calls to create and remove fcntl locks. In this 
+scenario, ctdbd would operate as follows:
+
+1. Call the normal API function, eg tdb_lockall().
+
+2. When the lock callback comes in, check if the child has the 
+  lock. Initially, this is always false. If so, return 0. 
+  Otherwise, try to obtain it in non-blocking mode. If that 
+  fails, return EWOULDBLOCK.
+
+3. Release locks in the unlock callback as normal.
+
+4. If tdb_lockall() fails, see if we recorded a lock failure; if 
+  so, call the child to repeat the operation.
+
+5. The child records what locks it obtains, and returns that 
+  information to the parent.
+
+6. When the child has succeeded, goto 1.
+
+This is flexible enough to handle any potential locking scenario, 
+even when lock requirements change. It can be optimized so that 
+the parent does not release locks, just tells the child which 
+locks it doesn't need to obtain.
+
+It also keeps the complexity out of the API, and in ctdbd where 
+it is needed.
+
+2.9 tdb_chainlock Functions Expose Implementation
+
+tdb_chainlock locks some number of records, including the record 
+indicated by the given key. This gave atomicity guarantees; 
+no-one can start a transaction, alter, read or delete that key 
+while the lock is held.
+
+It also makes the same guarantee for any other key in the chain, 
+which is an internal implementation detail and potentially a 
+cause for deadlock.
+
+2.9.1 Proposed Solution
+
+None. It would be nice to have an explicit single entry lock 
+which affected no other keys. Unfortunately, this won't work for 
+an entry which doesn't exist. Thus while chainlock may be 
+implemented more efficiently for the existing case, it will still 
+have overlap issues with the non-existing case. So it is best to 
+keep the current (lack of) guarantee about which records will be 
+affected to avoid constraining our implementation.
+
+2.10 Signal Handling is Not Race-Free
+
+The tdb_setalarm_sigptr() call allows the caller's signal handler 
+to indicate that the tdb locking code should return with a 
+failure, rather than trying again when a signal is received (and 
+errno == EAGAIN). This is usually used to implement timeouts.
+
+Unfortunately, this does not work in the case where the signal is 
+received before the tdb code enters the fcntl() call to place the 
+lock: the code will sleep within the fcntl() code, unaware that 
+the signal wants it to exit. In the case of long timeouts, this 
+does not happen in practice.
+
+2.10.1 Proposed Solution
+
+The locking hooks proposed in [Proposed-Solution-locking-hook] 
+would allow the user to decide on whether to fail the lock 
+acquisition on a signal. This allows the caller to choose their 
+own compromise: they could narrow the race by checking 
+immediately before the fcntl call.[footnote:
+It may be possible to make this race-free in some implementations 
+by having the signal handler alter the struct flock to make it 
+invalid. This will cause the fcntl() lock call to fail with 
+EINVAL if the signal occurs before the kernel is entered, 
+otherwise EAGAIN.
+]
+
+2.11 The API Uses Gratuitous Typedefs, Capitals
+
+typedefs are useful for providing source compatibility when types 
+can differ across implementations, or arguably in the case of 
+function pointer definitions which are hard for humans to parse. 
+Otherwise it is simply obfuscation and pollutes the namespace.
+
+Capitalization is usually reserved for compile-time constants and 
+macros.
+
+  TDB_CONTEXT There is no reason to use this over 'struct 
+  tdb_context'; the definition isn't visible to the API user 
+  anyway.
+
+  TDB_DATA There is no reason to use this over struct TDB_DATA; 
+  the struct needs to be understood by the API user.
+
+  struct TDB_DATA This would normally be called 'struct 
+  tdb_data'.
+
+  enum TDB_ERROR Similarly, this would normally be enum 
+  tdb_error.
+
+2.11.1 Proposed Solution
+
+None. Introducing lower case variants would please pedants like 
+myself, but if it were done the existing ones should be kept. 
+There is little point forcing a purely cosmetic change upon tdb 
+users.
+
+2.12 <tdb_log_func-Doesnt-Take>tdb_log_func Doesn't Take The 
+  Private Pointer
+
+For API compatibility reasons, the logging function needs to call 
+tdb_get_logging_private() to retrieve the pointer registered by 
+the tdb_open_ex for logging.
+
+2.12.1 Proposed Solution
+
+It should simply take an extra argument, since we are prepared to 
+break the API/ABI.
+
+2.13 Various Callback Functions Are Not Typesafe
+
+The callback functions in tdb_set_logging_function (after 
+[tdb_log_func-Doesnt-Take] is resolved), tdb_parse_record, 
+tdb_traverse, tdb_traverse_read 
+and tdb_check all take void * and must internally convert it to 
+the argument type they were expecting.
+
+If this type changes, the compiler will not produce warnings on 
+the callers, since it only sees void *.
+
+2.13.1 Proposed Solution
+
+With careful use of macros, we can create callback functions 
+which give a warning when used on gcc and the types of the 
+callback and its private argument differ. Unsupported compilers 
+will not give a warning, which is no worse than now. In addition, 
+the callbacks become clearer, as they need not use void * for 
+their parameter.
+
+See CCAN's typesafe_cb module at 
+http://ccan.ozlabs.org/info/typesafe_cb.html
+
+2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, 
+  tdb_reopen_all Problematic
+
+The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB 
+file should be cleared if the caller discovers it is the only 
+process with the TDB open. However, if any caller does not 
+specify TDB_CLEAR_IF_FIRST it will not be detected, so will have 
+the TDB erased underneath them (usually resulting in a crash).
+
+There is a similar issue on fork(); if the parent exits (or 
+otherwise closes the tdb) before the child calls tdb_reopen_all() 
+to establish the lock used to indicate the TDB is opened by 
+someone, a TDB_CLEAR_IF_FIRST opener at that moment will believe 
+it alone has opened the TDB and will erase it.
+
+2.14.1 Proposed Solution
+
+Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but 
+see [TDB_CLEAR_IF_FIRST-Imposes-Performance].
+
+3 Performance And Scalability Issues
+
+3.1 <TDB_CLEAR_IF_FIRST-Imposes-Performance>TDB_CLEAR_IF_FIRST 
+  Imposes Performance Penalty
+
+When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is 
+placed at offset 4 (aka. the ACTIVE_LOCK). While these locks 
+never conflict in normal tdb usage, they do add substantial 
+overhead for most fcntl lock implementations when the kernel 
+scans to detect if a lock conflict exists. This is often a single 
+linked list, making the time to acquire and release a fcntl lock 
+O(N) where N is the number of processes with the TDB open, not 
+the number actually doing work.
+
+In a Samba server it is common to have huge numbers of clients 
+sitting idle, and thus they have weaned themselves off the 
+TDB_CLEAR_IF_FIRST flag.[footnote:
+There is a flag to tdb_reopen_all() which is used for this 
+optimization: if the parent process will outlive the child, the 
+child does not need the ACTIVE_LOCK. This is a workaround for 
+this very performance issue.
+]
+
+3.1.1 Proposed Solution
+
+Remove the flag. It was a neat idea, but even trivial servers 
+tend to know when they are initializing for the first time and 
+can simply unlink the old tdb at that point.
+
+3.2 TDB Files Have a 4G Limit
+
+This seems to be becoming an issue (so much for “trivial”!), 
+particularly for ldb.
+
+3.2.1 Proposed Solution
+
+A new, incompatible TDB format which uses 64 bit offsets 
+internally rather than 32 bit as now. For simplicity of endian 
+conversion (which TDB does on the fly if required), all values 
+will be 64 bit on disk. In practice, some upper bits may be used 
+for other purposes, but at least 56 bits will be available for 
+file offsets.
+
+tdb_open() will automatically detect the old version, and even 
+create them if TDB_VERSION6 is specified to tdb_open.
+
+32 bit processes will still be able to access TDBs larger than 4G 
+(assuming that their off_t allows them to seek to 64 bits); they 
+will gracefully fall back as they fail to mmap. This can happen 
+already with large TDBs.
+
+Old versions of tdb will fail to open the new TDB files (since 28 
+August 2009, commit 398d0c29290: prior to that any unrecognized 
+file format would be erased and initialized as a fresh tdb!)
+
+3.3 TDB Records Have a 4G Limit
+
+This has not been a reported problem, and the API uses size_t 
+which can be 64 bit on 64 bit platforms. However, other limits 
+may have made such an issue moot.
+
+3.3.1 Proposed Solution
+
+Record sizes will be 64 bit, with an error returned on 32 bit 
+platforms which try to access such records (the current 
+implementation would return TDB_ERR_OOM in a similar case). It 
+seems unlikely that 32 bit keys will be a limitation, so the 
+implementation may not support this (see [sub:Records-Incur-A]).
+
+3.4 Hash Size Is Determined At TDB Creation Time
+
+TDB contains a number of hash chains in the header; the number is 
+specified at creation time, and defaults to 131. This is such a 
+bottleneck on large databases (as each hash chain gets quite 
+long), that LDB uses 10,000 for this hash. In general it is 
+impossible to know what the 'right' answer is at database 
+creation time.
+
+3.4.1 Proposed Solution
+
+After comprehensive performance testing on various scalable hash 
+variants[footnote:
+http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 
+This was annoying because I was previously convinced that an 
+expanding tree of hashes would be very close to optimal.
+], it became clear that it is hard to beat a straight linear hash 
+table which doubles in size when it reaches saturation. There are 
+three details which become important:
+
+1. On encountering a full bucket, we use the next bucket.
+
+2. Extra hash bits are stored with the offset, to reduce 
+  comparisons.
+
+3. A marker entry is used on deleting an entry.
+
+The doubling of the table must be done under a transaction; we 
+will not reduce it on deletion, so it will be an unusual case. It 
+will be placed at the head (other entries will be moved 
+out of the way so we can expand). We could have a pointer in the 
+header to the current hashtable location, but that pointer would 
+have to be read frequently to check for hashtable moves.
+
+The locking for this is slightly more complex than the chained 
+case; we currently have one lock per bucket, and that means we 
+would need to expand the lock if we overflow to the next bucket. 
+The frequency of such collisions will affect our locking 
+heuristics: we can always lock more buckets than we need.
+
+One possible optimization is to only re-check the hash size on an 
+insert or a lookup miss.
+
+3.5 <TDB-Freelist-Is>TDB Freelist Is Highly Contended
+
+TDB uses a single linked list for the free list. Allocation 
+occurs as follows, using heuristics which have evolved over time:
+
+1. Get the free list lock for this whole operation.
+
+2. Multiply length by 1.25, so we always over-allocate by 25%.
+
+3. Set the slack multiplier to 1.
+
+4. Examine the current freelist entry: if it is > length but < 
+  the current best case, remember it as the best case.
+
+5. Multiply the slack multiplier by 1.05.
+
+6. If our best fit so far is less than length * slack multiplier, 
+  return it. The slack will be turned into a new free record if 
+  it's large enough.
+
+7. Otherwise, go onto the next freelist entry.
+
+Deleting a record occurs as follows:
+
+1. Lock the hash chain for this whole operation.
+
+2. Walk the chain to find the record, keeping the prev pointer 
+  offset.
+
+3. If max_dead is non-zero:
+
+  (a) Walk the hash chain again and count the dead records.
+
+  (b) If it's more than max_dead, bulk free all the dead ones 
+    (similar to steps 4 and below, but the lock is only obtained 
+    once).
+
+  (c) Simply mark this record as dead and return. 
+
+4. Get the free list lock for the remainder of this operation.
+
+5. <right-merging>Examine the following block to see if it is 
+  free; if so, enlarge the current block and remove that block 
+  from the free list. This was disabled, as removal from the free 
+  list was O(entries-in-free-list).
+
+6. Examine the preceding block to see if it is free: for this 
+  reason, each block has a 32-bit tailer which indicates its 
+  length. If it is free, expand it to cover our new block and 
+  return.
+
+7. Otherwise, prepend ourselves to the free list.
+
+Disabling right-merging (step [right-merging]) causes 
+fragmentation; the other heuristics proved insufficient to 
+address this, so the final answer to this was that when we expand 
+the TDB file inside a transaction commit, we repack the entire 
+tdb.
+
+The single list lock limits our allocation rate; due to the other 
+issues this is not currently seen as a bottleneck.
+
+3.5.1 Proposed Solution
+
+The first step is to remove all the current heuristics, as they 
+obviously interact, then examine them once the lock contention is 
+addressed.
+
+The free list must be split to reduce contention. Assuming 
+perfect free merging, we can at most have 1 free list entry for 
+each entry. This implies that the number of free lists is related 
+to the size of the hash table, but as it is rare to walk a large 
+number of free list entries we can use far fewer, say 1/32 of the 
+number of hash buckets.
+
+There are various benefits in using per-size free lists (see [sub:TDB-Becomes-Fragmented]
+) but it's not clear this would reduce contention in the common 
+case where all processes are allocating/freeing the same size. 
+Thus we almost certainly need to divide in other ways: the most 
+obvious is to divide the file into zones, and using a free list 
+(or set of free lists) for each. This approximates address 
+ordering.
+
+Note that this means we need to split the free lists when we 
+expand the file; this is probably acceptable when we double the 
+hash table size, since that is such an expensive operation 
+already. In the case of increasing the file size, there is an 
+optimization we can use: if we use M in the formula above as the 
+file size rounded up to the next power of 2, we only need 
+reshuffle free lists when the file size crosses a power of 2 
+boundary, and reshuffling the free lists is trivial: we simply 
+merge every consecutive pair of free lists.
+
+The basic algorithm is as follows. Freeing is simple:
+
+1. Identify the correct zone.
+
+2. Lock the corresponding list.
+
+3. Re-check the zone (we didn't have a lock, sizes could have 
+  changed): relock if necessary.
+
+4. Place the freed entry in the list for that zone.
+
+Allocation is a little more complicated, as we perform delayed 
+coalescing at this point:
+
+1. Pick a zone: either the zone we last freed into, or one based on a 
+  “random” number.
+
+2. Lock the corresponding list.
+
+3. Re-check the zone: relock if necessary.
+
+4. If the top entry is large enough, remove it from the list and 
+  return it.
+
+5. Otherwise, coalesce entries in the list.
+
+  (a) 
+
+  (b) 
+
+  (c) 
+
+  (d) 
+
+6. If there was no entry large enough, unlock the list and try 
+  the next zone.
+
+7. 
+
+8. 
+
+9. If no zone satisfies, expand the file.
+
+This optimizes rapid insert/delete of free list entries by not 
+coalescing them all the time. First-fit address 
+ordering seems to be fairly good for keeping fragmentation low 
+(see [sub:TDB-Becomes-Fragmented]). Note that address ordering 
+does not need a tailer to coalesce, though if we needed one we 
+could have one cheaply: see [sub:Records-Incur-A]. 
+
+
+
+I anticipate that the number of entries in each free zone would 
+be small, but it might be worth using one free entry to hold 
+pointers to the others for cache efficiency.
+
+3.6 <sub:TDB-Becomes-Fragmented>TDB Becomes Fragmented
+
+Much of this is a result of allocation strategy[footnote:
+The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995 
+ftp://ftp.cs.utexas.edu/pub/garbage/malloc/ismm98.ps
+] and deliberate hobbling of coalescing; internal fragmentation 
+(aka overallocation) is deliberately set at 25%, and external 
+fragmentation is only cured by the decision to repack the entire 
+db when a transaction commit needs to enlarge the file.
+
+3.6.1 Proposed Solution
+
+The 25% overhead on allocation works in practice for ldb because 
+indexes tend to expand by one record at a time. This internal 
+fragmentation can be resolved by having an “expanded” bit in the 
+header to note entries that have previously expanded, and 
+allocating more space for them.
+
+There is a spectrum of possible solutions for external 
+fragmentation: one is to use a fragmentation-avoiding allocation 
+strategy such as a best-fit address-order allocator. The other end 
+of the spectrum would be to use a bump allocator (very fast and 
+simple) and simply repack the file when we reach the end.
+
+There are three problems with efficient fragmentation-avoiding 
+allocators: they are non-trivial, they tend to use a single free 
+list for each size, and there's no evidence that tdb allocation 
+patterns will match those recorded for general allocators (though 
+it seems likely).
+
+Thus we don't spend too much effort on external fragmentation; we 
+will be no worse than the current code if we need to repack on 
+occasion. More effort is spent on reducing freelist contention, 
+and reducing overhead.
+
+3.7 <sub:Records-Incur-A>Records Incur A 28-Byte Overhead
+
+Each TDB record has a header as follows:
+
+struct tdb_record {
+
+        tdb_off_t next; /* offset of the next record in the list 
+*/
+
+        tdb_len_t rec_len; /* total byte length of record */
+
+        tdb_len_t key_len; /* byte length of key */
+
+        tdb_len_t data_len; /* byte length of data */
+
+        uint32_t full_hash; /* the full 32 bit hash of the key */
+
+        uint32_t magic;   /* try to catch errors */
+
+        /* the following union is implied:
+
+                union {
+
+                        char record[rec_len];
+
+                        struct {
+
+                                char key[key_len];
+
+                                char data[data_len];
+
+                        }
+
+                        uint32_t totalsize; (tailer)
+
+                }
+
+        */
+
+};
+
+Naively, this would double to a 56-byte overhead on a 64 bit 
+implementation.
+
+3.7.1 Proposed Solution
+
+We can use various techniques to reduce this for an allocated 
+block:
+
+1. The 'next' pointer is not required, as we are using a flat 
+  hash table.
+
+2. 'rec_len' can instead be expressed as an addition to key_len 
+  and data_len (it accounts for wasted or overallocated length in 
+  the record). Since the record length is always a multiple of 8, 
+  we can conveniently fit it in 32 bits (representing up to 35 
+  bits).
+
+3. 'key_len' and 'data_len' can be reduced. I'm unwilling to 
+  restrict 'data_len' to 32 bits, but instead we can combine the 
+  two into one 64-bit field and use a 5 bit value which 
+  indicates at what bit to divide the two. Keys are unlikely to 
+  scale as fast as data, so I'm assuming a maximum key size of 32 
+  bits.
+
+4. 'full_hash' is used to avoid a memcmp on the “miss” case, but 
+  this is diminishing returns after a handful of bits (at 10 
+  bits, it reduces 99.9% of false memcmp). As an aside, as the 
+  lower bits are already incorporated in the hash table 
+  resolution, the upper bits should be used here.
+
+5. 'magic' does not need to be enlarged: it currently reflects 
+  one of 5 values (used, free, dead, recovery, and 
+  unused_recovery). It is useful for quick sanity checking 
+  however, and should not be eliminated.
+
+6. 'tailer' is only used to coalesce free blocks (so a block to 
+  the right can find the header to check if this block is free). 
+  This can be replaced by a single 'free' bit in the header of 
+  the following block (and the tailer only exists in free 
+  blocks).[footnote:
+This technique from Thomas Standish. Data Structure Techniques. 
+Addison-Wesley, Reading, Massachusetts, 1980.
+] The current proposed coalescing algorithm doesn't need this, 
+  however.
+
+This produces a 16 byte used header like this:
+
+struct tdb_used_record {
+
+        uint32_t magic : 16,
+
+                 prev_is_free: 1,
+
+                 key_data_divide: 5,
+
+                 top_hash: 10;
+
+        uint32_t extra_octets;
+
+        uint64_t key_and_data_len;
+
+};
+
+And a free record like this:
+
+struct tdb_free_record {
+
+        uint32_t free_magic;
+
+        uint64_t total_length;
+
+        ...
+
+        uint64_t tailer;
+
+};
+
+
+
+3.8 Transaction Commit Requires 4 fdatasync
+
+The current transaction algorithm is:
+
+1. write_recovery_data();
+
+2. sync();
+
+3. write_recovery_header();
+
+4. sync();
+
+5. overwrite_with_new_data();
+
+6. sync();
+
+7. remove_recovery_header();
+
+8. sync(); 
+
+On current ext3, each sync flushes all data to disk, so the next 
+3 syncs are relatively inexpensive. But this could become a 
+3 syncs are relatively expensive. But this could become a 
+performance bottleneck on other filesystems such as ext4.
+
+3.8.1 Proposed Solution
+
+
+
+
+
+
+
+
+
+Neil Brown points out that this is overzealous, and only one sync 
+is needed:
+
+1. Bundle the recovery data, a transaction counter and a strong 
+  checksum of the new data.
+
+2. Strong checksum that whole bundle.
+
+3. Store the bundle in the database.
+
+4. Overwrite the oldest of the two recovery pointers in the 
+  header (identified using the transaction counter) with the 
+  offset of this bundle.
+
+5. sync.
+
+6. Write the new data to the file.
+
+Checking for recovery means identifying the latest bundle with a 
+valid checksum and using the new data checksum to ensure that it 
+has been applied. This is more expensive than the current check, 
+but need only be done at open. For running databases, a separate 
+header field can be used to indicate a transaction in progress; 
+we need only check for recovery if this is set.
+
+3.9 TDB Does Not Have Snapshot Support
+
+3.9.1 Proposed Solution
+
+None. At some point you say “use a real database”.
+
+But as a thought experiment, if we implemented transactions to 
+only overwrite free entries (this is tricky: there must not be a 
+header in each entry which indicates whether it is free; instead we 
+would rely on presence in metadata elsewhere), and a pointer to the hash 
+table, we could create an entirely new commit without destroying 
+existing data. Then it would be easy to implement snapshots in a 
+similar way.
+
+This would not allow arbitrary changes to the database, such as 
+tdb_repack does, and would require more space (since we have to 
+preserve the current and future entries at once). If we used hash 
+trees rather than one big hash table, we might only have to 
+rewrite some sections of the hash, too.
+
+We could then implement snapshots using a similar method, using 
+multiple different hash tables/free tables.
+
+3.10 Transactions Cannot Operate in Parallel
+
+This would be useless for ldb, as it hits the index records with 
+just about every update. It would add significant complexity in 
+resolving clashes, and cause all the transaction callers to write 
+their code to loop in the case where the transactions spuriously 
+failed.
+
+3.10.1 Proposed Solution
+
+We could solve a small part of the problem by providing read-only 
+transactions. These would allow one write transaction to begin, 
+but it could not commit until all r/o transactions are done. This 
+would require a new RO_TRANSACTION_LOCK, which would be upgraded 
+on commit.
+
+3.11 Default Hash Function Is Suboptimal
+
+The Knuth-inspired multiplicative hash used by tdb is fairly slow 
+(especially if we expand it to 64 bits), and works best when the 
+hash bucket size is a prime number (which also means a slow 
+modulus). In addition, it is highly predictable which could 
+potentially lead to a Denial of Service attack in some TDB uses.
+
+3.11.1 Proposed Solution
+
+The Jenkins lookup3 hash[footnote:
+http://burtleburtle.net/bob/c/lookup3.c
+] is a fast and superbly-mixing hash. It's used by the Linux 
+kernel and almost everything else. This has the particular 
+properties that it takes an initial seed, and produces two 32 bit 
+hash numbers, which we can combine into a 64-bit hash.
+
+The seed should be created at tdb-creation time from some random 
+source, and placed in the header. This is far from foolproof, but 
+adds a little bit of protection against hash bombing.
+
+3.12 <Reliable-Traversal-Adds>Reliable Traversal Adds Complexity
+
+We lock a record during traversal iteration, and try to grab that 
+lock in the delete code. If that grab on delete fails, we simply 
+mark it deleted and continue onwards; traversal checks for this 
+condition and does the delete when it moves off the record.
+
+If traversal terminates, the dead record may be left 
+indefinitely.
+
+3.12.1 Proposed Solution
+
+Remove reliability guarantees; see [traverse-Proposed-Solution].
+
+3.13 Fcntl Locking Adds Overhead
+
+Placing a fcntl lock means a system call, as does removing one. 
+This is actually one reason why transactions can be faster 
+(everything is locked once at transaction start). In the 
+uncontended case, this overhead can theoretically be eliminated.
+
+3.13.1 Proposed Solution
+
+None.
+
+We tried this before with spinlock support, in the early days of 
+TDB, and it didn't make much difference except in manufactured 
+benchmarks.
+
+We could use spinlocks (with futex kernel support under Linux), 
+but it means that we lose automatic cleanup when a process dies 
+with a lock. There is a method of auto-cleanup under Linux, but 
+it's not supported by other operating systems. We could 
+reintroduce a clear-if-first-style lock and sweep for dead 
+futexes on open, but that wouldn't help the normal case of one 
+concurrent opener dying. Increasingly elaborate repair schemes 
+could be considered, but they require an ABI change (everyone 
+must use them) anyway, so there's no need to do this at the same 
+time as everything else.
+

+ 2282 - 0
ccan/tdb2/doc/design.lyx

@@ -0,0 +1,2282 @@
+#LyX 1.6.5 created this file. For more info see http://www.lyx.org/
+\lyxformat 345
+\begin_document
+\begin_header
+\textclass article
+\use_default_options true
+\language english
+\inputencoding auto
+\font_roman default
+\font_sans default
+\font_typewriter default
+\font_default_family default
+\font_sc false
+\font_osf false
+\font_sf_scale 100
+\font_tt_scale 100
+
+\graphics default
+\paperfontsize default
+\use_hyperref false
+\papersize default
+\use_geometry false
+\use_amsmath 1
+\use_esint 1
+\cite_engine basic
+\use_bibtopic false
+\paperorientation portrait
+\secnumdepth 3
+\tocdepth 3
+\paragraph_separation indent
+\defskip medskip
+\quotes_language english
+\papercolumns 1
+\papersides 1
+\paperpagestyle default
+\tracking_changes true
+\output_changes true
+\author "" 
+\author "" 
+\end_header
+
+\begin_body
+
+\begin_layout Title
+TDB2: Redesigning The Trivial DataBase
+\end_layout
+
+\begin_layout Author
+Rusty Russell, IBM Corporation
+\end_layout
+
+\begin_layout Date
+26-July-2010
+\end_layout
+
+\begin_layout Abstract
+The Trivial DataBase on-disk format is 32 bits; with usage cases heading
+ towards the 4G limit, that must change.
+ This required breakage provides an opportunity to revisit TDB's other design
+ decisions and reassess them.
+\end_layout
+
+\begin_layout Section
+Introduction
+\end_layout
+
+\begin_layout Standard
+The Trivial DataBase was originally written by Andrew Tridgell as a simple
+ key/data pair storage system with the same API as dbm, but allowing multiple
+ readers and writers while being small enough (< 1000 lines of C) to include
+ in SAMBA.
+ The simple design created in 1999 has proven surprisingly robust and performant
+, used in Samba versions 3 and 4 as well as numerous other projects.
+ Its useful life was greatly increased by the (backwards-compatible!) addition
+ of transaction support in 2005.
+\end_layout
+
+\begin_layout Standard
+The wider variety and greater demands of TDB-using code has led to some
+ organic growth of the API, as well as some compromises on the implementation.
+ None of these, by themselves, are seen as show-stoppers, but the cumulative
+ effect is a loss of elegance over the initial, simple TDB implementation.
+ Here is a table of the approximate number of lines of implementation code
+ and number of API functions at the end of each year:
+\end_layout
+
+\begin_layout Standard
+\begin_inset Tabular
+<lyxtabular version="3" rows="12" columns="3">
+<features>
+<column alignment="center" valignment="top" width="0">
+<column alignment="center" valignment="top" width="0">
+<column alignment="center" valignment="top" width="0">
+<row>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Year End
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+API Functions
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Lines of C Code Implementation
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+1999
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+13
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+1195
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2000
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+24
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+1725
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2001
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+32
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2228
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2002
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+35
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2481
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2003
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+35
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2552
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2004
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+40
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2584
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2005
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+38
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2647
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2006
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+52
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+3754
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2007
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+66
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+4398
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2008
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+71
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+4768
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2009
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+73
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+5715
+\end_layout
+
+\end_inset
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+This review is an attempt to catalog and address all the known issues with
+ TDB and create solutions which address the problems without significantly
+ increasing complexity; all involved are far too aware of the dangers of
+ second system syndrome in rewriting a successful project like this.
+\end_layout
+
+\begin_layout Section
+API Issues
+\end_layout
+
+\begin_layout Subsection
+tdb_open_ex Is Not Expandable
+\end_layout
+
+\begin_layout Standard
+The tdb_open() call was expanded to tdb_open_ex(), which added an optional
+ hashing function and an optional logging function argument.
+ Additional arguments to open would require the introduction of a tdb_open_ex2
+ call etc.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+tdb_open() will take a linked-list of attributes:
+\end_layout
+
+\begin_layout LyX-Code
+enum tdb_attribute {
+\end_layout
+
+\begin_layout LyX-Code
+    TDB_ATTRIBUTE_LOG = 0,
+\end_layout
+
+\begin_layout LyX-Code
+    TDB_ATTRIBUTE_HASH = 1
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_attribute_base {
+\end_layout
+
+\begin_layout LyX-Code
+    enum tdb_attribute attr;
+\end_layout
+
+\begin_layout LyX-Code
+    union tdb_attribute *next;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_attribute_log {
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */
+\end_layout
+
+\begin_layout LyX-Code
+    tdb_log_func log_fn;
+\end_layout
+
+\begin_layout LyX-Code
+    void *log_private;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_attribute_hash {
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */
+\end_layout
+
+\begin_layout LyX-Code
+    tdb_hash_func hash_fn;
+\end_layout
+
+\begin_layout LyX-Code
+    void *hash_private;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+union tdb_attribute {
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_base base;
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_log log;
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_hash hash;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+This allows future attributes to be added, even if this expands the size
+ of the union.
+\end_layout
+
+\begin_layout Subsection
+tdb_traverse Makes Impossible Guarantees
+\end_layout
+
+\begin_layout Standard
+tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, and it
+ was thought that it was important to guarantee that all records which exist
+ at the start and end of the traversal would be included, and no record
+ would be included twice.
+\end_layout
+
+\begin_layout Standard
+This adds complexity (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "Reliable-Traversal-Adds"
+
+\end_inset
+
+) and does not work anyway for records which are altered (in particular,
+ those which are expanded may be effectively deleted and re-added behind
+ the traversal).
+\end_layout
+
+\begin_layout Subsubsection
+\begin_inset CommandInset label
+LatexCommand label
+name "traverse-Proposed-Solution"
+
+\end_inset
+
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Abandon the guarantee.
+ You will see every record if no changes occur during your traversal, otherwise
+ you will see some subset.
+ You can prevent changes by using a transaction or the locking API.
+\end_layout
+
+\begin_layout Subsection
+Nesting of Transactions Is Fraught
+\end_layout
+
+\begin_layout Standard
+TDB has alternated between allowing nested transactions and not allowing
+ them.
+ Various paths in the Samba codebase assume that transactions will nest,
+ and in a sense they can: the operation is only committed to disk when the
+ outer transaction is committed.
+ There are two problems, however:
+\end_layout
+
+\begin_layout Enumerate
+Canceling the inner transaction will cause the outer transaction commit
+ to fail, and will not undo any operations since the inner transaction began.
+ This problem is soluble with some additional internal code.
+\end_layout
+
+\begin_layout Enumerate
+An inner transaction commit can be cancelled by the outer transaction.
+ This is desirable in the way which Samba's database initialization code
+ uses transactions, but could be a surprise to any users expecting a successful
+ transaction commit to expose changes to others.
+\end_layout
+
+\begin_layout Standard
+The current solution is to specify the behavior at tdb_open(), with the
+ default currently that nested transactions are allowed.
+ This flag can also be changed at runtime.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Given the usage patterns, it seems that the 
+\begin_inset Quotes eld
+\end_inset
+
+least-surprise
+\begin_inset Quotes erd
+\end_inset
+
+ behavior of disallowing nested transactions should become the default.
+ Additionally, it seems the outer transaction is the only code which knows
+ whether inner transactions should be allowed, so a flag to indicate this
+ could be added to tdb_transaction_start.
+ However, this behavior can be simulated with a wrapper which uses tdb_add_flags
+() and tdb_remove_flags(), so the API should not be expanded for this relatively
+-obscure case.
+\end_layout
+
+\begin_layout Subsection
+Incorrect Hash Function is Not Detected
+\end_layout
+
+\begin_layout Standard
+tdb_open_ex() allows the calling code to specify a different hash function
+ to use, but does not check that all other processes accessing this tdb
+ are using the same hash function.
+ The result is that records are missing from tdb_fetch().
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The header should contain an example hash result (eg.
+ the hash of 0xdeadbeef), and tdb_open_ex() should check that the given
+ hash function produces the same answer, or fail the tdb_open call.
+\end_layout
+
+\begin_layout Subsection
+tdb_set_max_dead/TDB_VOLATILE Expose Implementation
+\end_layout
+
+\begin_layout Standard
+In response to scalability issues with the free list (
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "TDB-Freelist-Is"
+
+\end_inset
+
+) two API workarounds have been incorporated in TDB: tdb_set_max_dead()
+ and the TDB_VOLATILE flag to tdb_open.
+ The latter actually calls the former with an argument of 
+\begin_inset Quotes eld
+\end_inset
+
+5
+\begin_inset Quotes erd
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+This code allows deleted records to accumulate without putting them in the
+ free list.
+ On delete we iterate through each chain and free them in a batch if there
+ are more than max_dead entries.
+ These are never otherwise recycled except as a side-effect of a tdb_repack.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+With the scalability problems of the freelist solved, this API can be removed.
+ The TDB_VOLATILE flag may still be useful as a hint that store and delete
+ of records will be at least as common as fetch in order to allow some internal
+ tuning, but initially will become a no-op.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "TDB-Files-Cannot"
+
+\end_inset
+
+TDB Files Cannot Be Opened Multiple Times In The Same Process
+\end_layout
+
+\begin_layout Standard
+No process can open the same TDB twice; we check and disallow it.
+ This is an unfortunate side-effect of fcntl locks, which operate on a per-file
+ rather than per-file-descriptor basis, and do not nest.
+ Thus, closing any file descriptor on a file clears all the locks obtained
+ by this process, even if they were placed using a different file descriptor!
+\end_layout
+
+\begin_layout Standard
+Note that even if this were solved, deadlock could occur if operations were
+ nested: this is a more manageable programming error in most cases.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+We could lobby POSIX to fix the perverse rules, or at least lobby Linux
+ to violate them so that the most common implementation does not have this
+ restriction.
+ This would be a generally good idea for other fcntl lock users.
+\end_layout
+
+\begin_layout Standard
+Samba uses a wrapper which hands out the same tdb_context to multiple callers
+ if this happens, and does simple reference counting.
+ We should do this inside the tdb library, which already emulates lock nesting
+ internally; it would need to recognize when deadlock occurs within a single
+ process.
+ This would create a new failure mode for tdb operations (while we currently
+ handle locking failures, they are impossible in normal use and a process
+ encountering them can do little but give up).
+\end_layout
+
+\begin_layout Standard
+I do not see benefit in an additional tdb_open flag to indicate whether
+ re-opening is allowed, though there may be some benefit to adding a
+ call to detect when a tdb_context is shared, to allow others to create such
+ an API.
+\end_layout
+
+\begin_layout Subsection
+TDB API Is Not POSIX Thread-safe
+\end_layout
+
+\begin_layout Standard
+The TDB API uses an error code which can be queried after an operation to
+ determine what went wrong.
+ This programming model does not work with threads, unless specific additional
+ guarantees are given by the implementation.
+ In addition, even otherwise-independent threads cannot open the same TDB
+ (as in 
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "TDB-Files-Cannot"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Rearchitecting the API to include a tdb_errcode pointer would be a great
+ deal of churn; we are better to guarantee that the tdb_errcode is per-thread
+ so the current programming model can be maintained.
+\end_layout
+
+\begin_layout Standard
+This requires dynamic per-thread allocations, which is awkward with POSIX
+ threads (pthread_key_create space is limited and we cannot simply allocate
+ a key for every TDB).
+\end_layout
+
+\begin_layout Standard
+Internal locking is required to make sure that fcntl locks do not overlap
+ between threads, and also that the global list of tdbs is maintained.
+\end_layout
+
+\begin_layout Standard
+The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe
+ version of the library, and otherwise no overhead will exist.
+\end_layout
+
+\begin_layout Subsection
+*_nonblock Functions And *_mark Functions Expose Implementation
+\end_layout
+
+\begin_layout Standard
+CTDB
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+Clustered TDB, see http://ctdb.samba.org
+\end_layout
+
+\end_inset
+
+ wishes to operate on TDB in a non-blocking manner.
+ This is currently done as follows:
+\end_layout
+
+\begin_layout Enumerate
+Call the _nonblock variant of an API function (eg.
+ tdb_lockall_nonblock).
+ If this fails:
+\end_layout
+
+\begin_layout Enumerate
+Fork a child process, and wait for it to call the normal variant (eg.
+ tdb_lockall).
+\end_layout
+
+\begin_layout Enumerate
+If the child succeeds, call the _mark variant to indicate we already have
+ the locks (eg.
+ tdb_lockall_mark).
+\end_layout
+
+\begin_layout Enumerate
+Upon completion, tell the child to release the locks (eg.
+ tdb_unlockall).
+\end_layout
+
+\begin_layout Enumerate
+Indicate to tdb that it should consider the locks removed (eg.
+ tdb_unlockall_mark).
+\end_layout
+
+\begin_layout Standard
+There are several issues with this approach.
+ Firstly, adding two new variants of each function clutters the API for
+ an obscure use, and so not all functions have three variants.
+ Secondly, it assumes that all paths of the functions ask for the same locks,
+ otherwise the parent process will have to get a lock which the child doesn't
+ have under some circumstances.
+ I don't believe this is currently the case, but it constrains the implementatio
+n.
+ 
+\end_layout
+
+\begin_layout Subsubsection
+\begin_inset CommandInset label
+LatexCommand label
+name "Proposed-Solution-locking-hook"
+
+\end_inset
+
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Implement a hook for locking methods, so that the caller can control the
+ calls to create and remove fcntl locks.
+ In this scenario, ctdbd would operate as follows:
+\end_layout
+
+\begin_layout Enumerate
+Call the normal API function, eg tdb_lockall().
+\end_layout
+
+\begin_layout Enumerate
+When the lock callback comes in, check if the child has the lock.
+ Initially, this is always false.
+ If so, return 0.
+ Otherwise, try to obtain it in non-blocking mode.
+ If that fails, return EWOULDBLOCK.
+\end_layout
+
+\begin_layout Enumerate
+Release locks in the unlock callback as normal.
+\end_layout
+
+\begin_layout Enumerate
+If tdb_lockall() fails, see if we recorded a lock failure; if so, call the
+ child to repeat the operation.
+\end_layout
+
+\begin_layout Enumerate
+The child records what locks it obtains, and returns that information to
+ the parent.
+\end_layout
+
+\begin_layout Enumerate
+When the child has succeeded, goto 1.
+\end_layout
+
+\begin_layout Standard
+This is flexible enough to handle any potential locking scenario, even when
+ lock requirements change.
+ It can be optimized so that the parent does not release locks, just tells
+ the child which locks it doesn't need to obtain.
+\end_layout
+
+\begin_layout Standard
+It also keeps the complexity out of the API, and in ctdbd where it is needed.
+\end_layout
+
+\begin_layout Subsection
+tdb_chainlock Functions Expose Implementation
+\end_layout
+
+\begin_layout Standard
+tdb_chainlock locks some number of records, including the record indicated
+ by the given key.
+ This gave atomicity guarantees; no-one can start a transaction, alter,
+ read or delete that key while the lock is held.
+\end_layout
+
+\begin_layout Standard
+It also makes the same guarantee for any other key in the chain, which is
+ an internal implementation detail and potentially a cause for deadlock.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+ It would be nice to have an explicit single entry lock which affected no
+ other keys.
+ Unfortunately, this won't work for an entry which doesn't exist.
+ Thus while chainlock may be implemented more efficiently for the existing
+ case, it will still have overlap issues with the non-existing case.
+ So it is best to keep the current (lack of) guarantee about which records
+ will be affected to avoid constraining our implementation.
+\end_layout
+
+\begin_layout Subsection
+Signal Handling is Not Race-Free
+\end_layout
+
+\begin_layout Standard
+The tdb_setalarm_sigptr() call allows the caller's signal handler to indicate
+ that the tdb locking code should return with a failure, rather than trying
+ again when a signal is received (and errno == EAGAIN).
+ This is usually used to implement timeouts.
+\end_layout
+
+\begin_layout Standard
+Unfortunately, this does not work in the case where the signal is received
+ before the tdb code enters the fcntl() call to place the lock: the code
+ will sleep within the fcntl() code, unaware that the signal wants it to
+ exit.
+ In the case of long timeouts, this does not happen in practice.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The locking hooks proposed in
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "Proposed-Solution-locking-hook"
+
+\end_inset
+
+ would allow the user to decide on whether to fail the lock acquisition
+ on a signal.
+ This allows the caller to choose their own compromise: they could narrow
+ the race by checking immediately before the fcntl call.
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+It may be possible to make this race-free in some implementations by having
+ the signal handler alter the struct flock to make it invalid.
+ This will cause the fcntl() lock call to fail with EINVAL if the signal
+ occurs before the kernel is entered, otherwise EAGAIN.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsection
+The API Uses Gratuitous Typedefs, Capitals
+\end_layout
+
+\begin_layout Standard
+typedefs are useful for providing source compatibility when types can differ
+ across implementations, or arguably in the case of function pointer definitions
+ which are hard for humans to parse.
+ Otherwise it is simply obfuscation and pollutes the namespace.
+\end_layout
+
+\begin_layout Standard
+Capitalization is usually reserved for compile-time constants and macros.
+\end_layout
+
+\begin_layout Description
+TDB_CONTEXT There is no reason to use this over 'struct tdb_context'; the
+ definition isn't visible to the API user anyway.
+\end_layout
+
+\begin_layout Description
+TDB_DATA There is no reason to use this over struct TDB_DATA; the struct
+ needs to be understood by the API user.
+\end_layout
+
+\begin_layout Description
+struct
+\begin_inset space ~
+\end_inset
+
+TDB_DATA This would normally be called 'struct tdb_data'.
+\end_layout
+
+\begin_layout Description
+enum
+\begin_inset space ~
+\end_inset
+
+TDB_ERROR Similarly, this would normally be enum tdb_error.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+ Introducing lower case variants would please pedants like myself, but if
+ it were done the existing ones should be kept.
+ There is little point forcing a purely cosmetic change upon tdb users.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "tdb_log_func-Doesnt-Take"
+
+\end_inset
+
+tdb_log_func Doesn't Take The Private Pointer
+\end_layout
+
+\begin_layout Standard
+For API compatibility reasons, the logging function needs to call tdb_get_loggin
+g_private() to retrieve the pointer registered by the tdb_open_ex for logging.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+It should simply take an extra argument, since we are prepared to break
+ the API/ABI.
+\end_layout
+
+\begin_layout Subsection
+Various Callback Functions Are Not Typesafe
+\end_layout
+
+\begin_layout Standard
+The callback functions in tdb_set_logging_function (after 
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "tdb_log_func-Doesnt-Take"
+
+\end_inset
+
+ is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read and tdb_check
+ all take void * and must internally convert it to the argument type they
+ were expecting.
+\end_layout
+
+\begin_layout Standard
+If this type changes, the compiler will not produce warnings on the callers,
+ since it only sees void *.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+With careful use of macros, we can create callback functions which give
+ a warning when used on gcc and the types of the callback and its private
+ argument differ.
+ Unsupported compilers will not give a warning, which is no worse than now.
+ In addition, the callbacks become clearer, as they need not use void *
+ for their parameter.
+\end_layout
+
+\begin_layout Standard
+See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html
+\end_layout
+
+\begin_layout Subsection
+TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic
+\end_layout
+
+\begin_layout Standard
+The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB file should
+ be cleared if the caller discovers it is the only process with the TDB
+ open.
+ However, if any caller does not specify TDB_CLEAR_IF_FIRST it will not
+ be detected, so will have the TDB erased underneath them (usually resulting
+ in a crash).
+\end_layout
+
+\begin_layout Standard
+There is a similar issue on fork(); if the parent exits (or otherwise closes
+ the tdb) before the child calls tdb_reopen_all() to establish the lock
+ used to indicate the TDB is opened by someone, a TDB_CLEAR_IF_FIRST opener
+ at that moment will believe it alone has opened the TDB and will erase
+ it.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Remove TDB_CLEAR_IF_FIRST.
+ Other workarounds are possible, but see 
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "TDB_CLEAR_IF_FIRST-Imposes-Performance"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Section
+Performance And Scalability Issues
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "TDB_CLEAR_IF_FIRST-Imposes-Performance"
+
+\end_inset
+
+TDB_CLEAR_IF_FIRST Imposes Performance Penalty
+\end_layout
+
+\begin_layout Standard
+When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is placed at offset
+ 4 (aka.
+ the ACTIVE_LOCK).
+ While these locks never conflict in normal tdb usage, they do add substantial
+ overhead for most fcntl lock implementations when the kernel scans to detect
+ if a lock conflict exists.
+ This is often a single linked list, making the time to acquire and release
+ a fcntl lock O(N) where N is the number of processes with the TDB open,
+ not the number actually doing work.
+\end_layout
+
+\begin_layout Standard
+In a Samba server it is common to have huge numbers of clients sitting idle,
+ and thus they have weaned themselves off the TDB_CLEAR_IF_FIRST flag.
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+There is a flag to tdb_reopen_all() which is used for this optimization:
+ if the parent process will outlive the child, the child does not need the
+ ACTIVE_LOCK.
+ This is a workaround for this very performance issue.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Remove the flag.
+ It was a neat idea, but even trivial servers tend to know when they are
+ initializing for the first time and can simply unlink the old tdb at that
+ point.
+\end_layout
+
+\begin_layout Subsection
+TDB Files Have a 4G Limit
+\end_layout
+
+\begin_layout Standard
+This seems to be becoming an issue (so much for 
+\begin_inset Quotes eld
+\end_inset
+
+trivial
+\begin_inset Quotes erd
+\end_inset
+
+!), particularly for ldb.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+A new, incompatible TDB format which uses 64 bit offsets internally rather
+ than 32 bit as now.
+ For simplicity of endian conversion (which TDB does on the fly if required),
+ all values will be 64 bit on disk.
+ In practice, some upper bits may be used for other purposes, but at least
+ 56 bits will be available for file offsets.
+\end_layout
+
+\begin_layout Standard
+tdb_open() will automatically detect the old version, and even create old-version
+ files if TDB_VERSION6 is specified to tdb_open.
+\end_layout
+
+\begin_layout Standard
+32 bit processes will still be able to access TDBs larger than 4G (assuming
+ that their off_t allows them to seek to 64 bits); they will gracefully
+ fall back as they fail to mmap.
+ This can happen already with large TDBs.
+\end_layout
+
+\begin_layout Standard
+Old versions of tdb will fail to open the new TDB files (since 28 August
+ 2009, commit 398d0c29290: prior to that any unrecognized file format would
+ be erased and initialized as a fresh tdb!)
+\end_layout
+
+\begin_layout Subsection
+TDB Records Have a 4G Limit
+\end_layout
+
+\begin_layout Standard
+This has not been a reported problem, and the API uses size_t which can
+ be 64 bit on 64 bit platforms.
+ However, other limits may have made such an issue moot.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Record sizes will be 64 bit, with an error returned on 32 bit platforms
+ which try to access such records (the current implementation would return
+ TDB_ERR_OOM in a similar case).
+ It seems unlikely that 32 bit keys will be a limitation, so the implementation
+ may not support this (see 
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Records-Incur-A"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Subsection
+Hash Size Is Determined At TDB Creation Time
+\end_layout
+
+\begin_layout Standard
+TDB contains a number of hash chains in the header; the number is specified
+ at creation time, and defaults to 131.
+ This is such a bottleneck on large databases (as each hash chain gets quite
+ long), that LDB uses 10,000 for this hash.
+ In general it is impossible to know what the 'right' answer is at database
+ creation time.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+After comprehensive performance testing on various scalable hash variants
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoying
+ because I was previously convinced that an expanding tree of hashes would
+ be very close to optimal.
+\end_layout
+
+\end_inset
+
+, it became clear that it is hard to beat a straight linear hash table which
+ doubles in size when it reaches saturation.
+ There are three details which become important:
+\end_layout
+
+\begin_layout Enumerate
+On encountering a full bucket, we use the next bucket.
+\end_layout
+
+\begin_layout Enumerate
+Extra hash bits are stored with the offset, to reduce comparisons.
+\end_layout
+
+\begin_layout Enumerate
+A marker entry is used on deleting an entry.
+\end_layout
+
+\begin_layout Standard
+The doubling of the table must be done under a transaction; we will not
+ reduce it on deletion, so it will be an unusual case.
+ It will either be placed at the head (other entries will be moved out of the
+ way so we can expand), or be located via a pointer in the header to the
+ current hashtable location; but that pointer would have to be read frequently
+ to check for hashtable moves.
+\end_layout
+
+\begin_layout Standard
+The locking for this is slightly more complex than the chained case; we
+ currently have one lock per bucket, and that means we would need to expand
+ the lock if we overflow to the next bucket.
+ The frequency of such collisions will affect our locking heuristics: we
+ can always lock more buckets than we need.
+\end_layout
+
+\begin_layout Standard
+One possible optimization is to only re-check the hash size on an insert
+ or a lookup miss.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "TDB-Freelist-Is"
+
+\end_inset
+
+TDB Freelist Is Highly Contended
+\end_layout
+
+\begin_layout Standard
+TDB uses a single linked list for the free list.
+ Allocation occurs as follows, using heuristics which have evolved over
+ time:
+\end_layout
+
+\begin_layout Enumerate
+Get the free list lock for this whole operation.
+\end_layout
+
+\begin_layout Enumerate
+Multiply length by 1.25, so we always over-allocate by 25%.
+\end_layout
+
+\begin_layout Enumerate
+Set the slack multiplier to 1.
+\end_layout
+
+\begin_layout Enumerate
+Examine the current freelist entry: if it is > length but < the current
+ best case, remember it as the best case.
+\end_layout
+
+\begin_layout Enumerate
+Multiply the slack multiplier by 1.05.
+\end_layout
+
+\begin_layout Enumerate
+If our best fit so far is less than length * slack multiplier, return it.
+ The slack will be turned into a new free record if it's large enough.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise, go onto the next freelist entry.
+\end_layout
+
+\begin_layout Standard
+Deleting a record occurs as follows:
+\end_layout
+
+\begin_layout Enumerate
+Lock the hash chain for this whole operation.
+\end_layout
+
+\begin_layout Enumerate
+Walk the chain to find the record, keeping the prev pointer offset.
+\end_layout
+
+\begin_layout Enumerate
+If max_dead is non-zero:
+\end_layout
+
+\begin_deeper
+\begin_layout Enumerate
+Walk the hash chain again and count the dead records.
+\end_layout
+
+\begin_layout Enumerate
+If it's more than max_dead, bulk free all the dead ones (similar to steps
+ 4 and below, but the lock is only obtained once).
+\end_layout
+
+\begin_layout Enumerate
+Simply mark this record as dead and return.
+ 
+\end_layout
+
+\end_deeper
+\begin_layout Enumerate
+Get the free list lock for the remainder of this operation.
+\end_layout
+
+\begin_layout Enumerate
+\begin_inset CommandInset label
+LatexCommand label
+name "right-merging"
+
+\end_inset
+
+Examine the following block to see if it is free; if so, enlarge the current
+ block and remove that block from the free list.
+ This was disabled, as removal from the free list was O(entries-in-free-list).
+\end_layout
+
+\begin_layout Enumerate
+Examine the preceding block to see if it is free: for this reason, each
+ block has a 32-bit tailer which indicates its length.
+ If it is free, expand it to cover our new block and return.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise, prepend ourselves to the free list.
+\end_layout
+
+\begin_layout Standard
+Disabling right-merging (step 
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "right-merging"
+
+\end_inset
+
+) causes fragmentation; the other heuristics proved insufficient to address
+ this, so the final answer to this was that when we expand the TDB file
+ inside a transaction commit, we repack the entire tdb.
+\end_layout
+
+\begin_layout Standard
+The single list lock limits our allocation rate; due to the other issues
+ this is not currently seen as a bottleneck.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The first step is to remove all the current heuristics, as they obviously
+ interact, then examine them once the lock contention is addressed.
+\end_layout
+
+\begin_layout Standard
+The free list must be split to reduce contention.
+ Assuming perfect free merging, we can at most have 1 free list entry for
+ each entry.
+ This implies that the number of free lists is related to the size of the
+ hash table, but as it is rare to walk a large number of free list entries
+ we can use far fewer, say 1/32 of the number of hash buckets.
+\end_layout
+
+\begin_layout Standard
+There are various benefits in using per-size free lists (see 
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:TDB-Becomes-Fragmented"
+
+\end_inset
+
+) but it's not clear this would reduce contention in the common case where
+ all processes are allocating/freeing the same size.
+ Thus we almost certainly need to divide in other ways: the most obvious
+ is to divide the file into zones, and use a free list (or set of free
+ lists) for each.
+ This approximates address ordering.
+\end_layout
+
+\begin_layout Standard
+Note that this means we need to split the free lists when we expand the
+ file; this is probably acceptable when we double the hash table size, since
+ that is such an expensive operation already.
+ In the case of increasing the file size, there is an optimization we can
+ use: if we use M in the formula above as the file size rounded up to the
+ next power of 2, we only need reshuffle free lists when the file size crosses
+ a power of 2 boundary, 
+\emph on
+and 
+\emph default
+reshuffling the free lists is trivial: we simply merge every consecutive
+ pair of free lists.
+\end_layout
+
+\begin_layout Standard
+The basic algorithm is as follows.
+ Freeing is simple:
+\end_layout
+
+\begin_layout Enumerate
+Identify the correct zone.
+\end_layout
+
+\begin_layout Enumerate
+Lock the corresponding list.
+\end_layout
+
+\begin_layout Enumerate
+Re-check the zone (we didn't have a lock, sizes could have changed): relock
+ if necessary.
+\end_layout
+
+\begin_layout Enumerate
+Place the freed entry in the list for that zone.
+\end_layout
+
+\begin_layout Standard
+Allocation is a little more complicated, as we perform delayed coalescing
+ at this point:
+\end_layout
+
+\begin_layout Enumerate
+Pick a zone: either the zone we last freed into, or one based on a 
+\begin_inset Quotes eld
+\end_inset
+
+random
+\begin_inset Quotes erd
+\end_inset
+
+ number.
+\end_layout
+
+\begin_layout Enumerate
+Lock the corresponding list.
+\end_layout
+
+\begin_layout Enumerate
+Re-check the zone: relock if necessary.
+\end_layout
+
+\begin_layout Enumerate
+If the top entry is large enough, remove it from the list and return it.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise, coalesce entries in the list. If there was no entry large enough,
+ unlock the list and try the next zone.
+\end_layout
+
+\begin_layout Enumerate
+If no zone satisfies, expand the file.
+\end_layout
+
+\begin_layout Standard
+This optimizes rapid insert/delete of free list entries by not coalescing
+ them all the time.
+ First-fit address ordering seems to be fairly good for keeping
+ fragmentation low (see 
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:TDB-Becomes-Fragmented"
+
+\end_inset
+
+).
+ Note that address ordering does not need a tailer to coalesce, though if
+ we needed one we could have one cheaply: see 
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Records-Incur-A"
+
+\end_inset
+
+.
+ 
+\end_layout
+
+\begin_layout Standard
+I anticipate that the number of entries in each free zone would be small,
+ but it might be worth using one free entry to hold pointers to the others
+ for cache efficiency.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:TDB-Becomes-Fragmented"
+
+\end_inset
+
+TDB Becomes Fragmented
+\end_layout
+
+\begin_layout Standard
+Much of this is a result of allocation strategy
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995 ftp://ftp.cs.ute
+xas.edu/pub/garbage/malloc/ismm98.ps
+\end_layout
+
+\end_inset
+
+ and deliberate hobbling of coalescing; internal fragmentation (aka overallocati
+on) is deliberately set at 25%, and external fragmentation is only cured
+ by the decision to repack the entire db when a transaction commit needs
+ to enlarge the file.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The 25% overhead on allocation works in practice for ldb because indexes
+ tend to expand by one record at a time.
+ This internal fragmentation can be resolved by having an 
+\begin_inset Quotes eld
+\end_inset
+
+expanded
+\begin_inset Quotes erd
+\end_inset
+
+ bit in the header to note entries that have previously expanded, and allocating
+ more space for them.
+\end_layout
+
+\begin_layout Standard
+There is a spectrum of possible solutions for external fragmentation:
+ one is to use a fragmentation-avoiding allocation strategy such as best-fit
+ address-order allocator.
+ The other end of the spectrum would be to use a bump allocator (very fast
+ and simple) and simply repack the file when we reach the end.
+\end_layout
+
+\begin_layout Standard
+There are three problems with efficient fragmentation-avoiding allocators:
+ they are non-trivial, they tend to use a single free list for each size,
+ and there's no evidence that tdb allocation patterns will match those recorded
+ for general allocators (though it seems likely).
+\end_layout
+
+\begin_layout Standard
+Thus we don't spend too much effort on external fragmentation; we will be
+ no worse than the current code if we need to repack on occasion.
+ More effort is spent on reducing freelist contention, and reducing overhead.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:Records-Incur-A"
+
+\end_inset
+
+Records Incur A 28-Byte Overhead
+\end_layout
+
+\begin_layout Standard
+Each TDB record has a header as follows:
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_record {
+\end_layout
+
+\begin_layout LyX-Code
+        tdb_off_t next; /* offset of the next record in the list */
+\end_layout
+
+\begin_layout LyX-Code
+        tdb_len_t rec_len; /* total byte length of record */
+\end_layout
+
+\begin_layout LyX-Code
+        tdb_len_t key_len; /* byte length of key */
+\end_layout
+
+\begin_layout LyX-Code
+        tdb_len_t data_len; /* byte length of data */
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t full_hash; /* the full 32 bit hash of the key */
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t magic;   /* try to catch errors */
+\end_layout
+
+\begin_layout LyX-Code
+        /* the following union is implied:
+\end_layout
+
+\begin_layout LyX-Code
+                union {
+\end_layout
+
+\begin_layout LyX-Code
+                        char record[rec_len];
+\end_layout
+
+\begin_layout LyX-Code
+                        struct {
+\end_layout
+
+\begin_layout LyX-Code
+                                char key[key_len];
+\end_layout
+
+\begin_layout LyX-Code
+                                char data[data_len];
+\end_layout
+
+\begin_layout LyX-Code
+                        }
+\end_layout
+
+\begin_layout LyX-Code
+                        uint32_t totalsize; (tailer)
+\end_layout
+
+\begin_layout LyX-Code
+                }
+\end_layout
+
+\begin_layout LyX-Code
+        */
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+Naively, this would double to a 56-byte overhead on a 64 bit implementation.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+We can use various techniques to reduce this for an allocated block:
+\end_layout
+
+\begin_layout Enumerate
+The 'next' pointer is not required, as we are using a flat hash table.
+\end_layout
+
+\begin_layout Enumerate
+'rec_len' can instead be expressed as an addition to key_len and data_len
+ (it accounts for wasted or overallocated length in the record).
+ Since the record length is always a multiple of 8, we can conveniently
+ fit it in 32 bits (representing up to 35 bits).
+\end_layout
+
+\begin_layout Enumerate
+'key_len' and 'data_len' can be reduced.
+ I'm unwilling to restrict 'data_len' to 32 bits, but instead we can combine
+ the two into one 64-bit field and use a 5 bit value which indicates at
+ what bit to divide the two.
+ Keys are unlikely to scale as fast as data, so I'm assuming a maximum key
+ size of 32 bits.
+\end_layout
+
+\begin_layout Enumerate
+'full_hash' is used to avoid a memcmp on the 
+\begin_inset Quotes eld
+\end_inset
+
+miss
+\begin_inset Quotes erd
+\end_inset
+
+ case, but this is diminishing returns after a handful of bits (at 10 bits,
+ it reduces 99.9% of false memcmp).
+ As an aside, as the lower bits are already incorporated in the hash table
+ resolution, the upper bits should be used here.
+\end_layout
+
+\begin_layout Enumerate
+'magic' does not need to be enlarged: it currently reflects one of 5 values
+ (used, free, dead, recovery, and unused_recovery).
+ It is useful for quick sanity checking however, and should not be eliminated.
+\end_layout
+
+\begin_layout Enumerate
+'tailer' is only used to coalesce free blocks (so a block to the right can
+ find the header to check if this block is free).
+ This can be replaced by a single 'free' bit in the header of the following
+ block (and the tailer only exists in free blocks).
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+This technique from Thomas Standish.
+ Data Structure Techniques.
+ Addison-Wesley, Reading, Massachusetts, 1980.
+\end_layout
+
+\end_inset
+
+ The current proposed coalescing algorithm doesn't need this, however.
+\end_layout
+
+\begin_layout Standard
+This produces a 16 byte used header like this:
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_used_record {
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t magic : 16,
+\end_layout
+
+\begin_layout LyX-Code
+                 prev_is_free: 1,
+\end_layout
+
+\begin_layout LyX-Code
+                 key_data_divide: 5,
+\end_layout
+
+\begin_layout LyX-Code
+                 top_hash: 10;
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t extra_octets;
+\end_layout
+
+\begin_layout LyX-Code
+        uint64_t key_and_data_len;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+And a free record like this:
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_free_record {
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t free_magic;
+\end_layout
+
+\begin_layout LyX-Code
+        uint64_t total_length;
+\end_layout
+
+\begin_layout LyX-Code
+        ...
+\end_layout
+
+\begin_layout LyX-Code
+        uint64_t tailer;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+
+\end_layout
+
+\begin_layout Subsection
+Transaction Commit Requires 4 fdatasync
+\end_layout
+
+\begin_layout Standard
+The current transaction algorithm is:
+\end_layout
+
+\begin_layout Enumerate
+write_recovery_data();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Enumerate
+write_recovery_header();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Enumerate
+overwrite_with_new_data();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Enumerate
+remove_recovery_header();
+\end_layout
+
+\begin_layout Enumerate
+sync(); 
+\end_layout
+
+\begin_layout Standard
+On current ext3, each sync flushes all data to disk, so the next 3 syncs
+ are relatively inexpensive.
+ But this could become a performance bottleneck on other filesystems such
+ as ext4.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Neil Brown points out that this is overzealous, and only one sync is needed:
+\end_layout
+
+\begin_layout Enumerate
+Bundle the recovery data, a transaction counter and a strong checksum of
+ the new data.
+\end_layout
+
+\begin_layout Enumerate
+Strong checksum that whole bundle.
+\end_layout
+
+\begin_layout Enumerate
+Store the bundle in the database.
+\end_layout
+
+\begin_layout Enumerate
+Overwrite the oldest of the two recovery pointers in the header (identified
+ using the transaction counter) with the offset of this bundle.
+\end_layout
+
+\begin_layout Enumerate
+sync.
+\end_layout
+
+\begin_layout Enumerate
+Write the new data to the file.
+\end_layout
+
+\begin_layout Standard
+Checking for recovery means identifying the latest bundle with a valid checksum
+ and using the new data checksum to ensure that it has been applied.
+ This is more expensive than the current check, but need only be done at
+ open.
+ For running databases, a separate header field can be used to indicate
+ a transaction in progress; we need only check for recovery if this is set.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:TDB-Does-Not"
+
+\end_inset
+
+TDB Does Not Have Snapshot Support
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+ At some point you say 
+\begin_inset Quotes eld
+\end_inset
+
+use a real database
+\begin_inset Quotes erd
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+But as a thought experiment, if we implemented transactions to only overwrite
+ free entries (this is tricky: there must not be a header in each entry
+ which indicates whether it is free; instead, free status must be implied
+ by presence in metadata elsewhere), and had a pointer to the hash table,
+ we could create an entirely new commit without destroying existing data.
+ Then it would be easy to implement snapshots in a similar way.
+\end_layout
+
+\begin_layout Standard
+This would not allow arbitrary changes to the database, such as tdb_repack
+ does, and would require more space (since we have to preserve the current
+ and future entries at once).
+ If we used hash trees rather than one big hash table, we might only have
+ to rewrite some sections of the hash, too.
+\end_layout
+
+\begin_layout Standard
+We could then implement snapshots using a similar method, using multiple
+ different hash tables/free tables.
+\end_layout
+
+\begin_layout Subsection
+Transactions Cannot Operate in Parallel
+\end_layout
+
+\begin_layout Standard
+This would be useless for ldb, as it hits the index records with just about
+ every update.
+ It would add significant complexity in resolving clashes, and cause all
+ transaction callers to write their code to loop in the case where the
+ transactions spuriously failed.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+We could solve a small part of the problem by providing read-only transactions.
+ These would allow one write transaction to begin, but it could not commit
+ until all r/o transactions are done.
+ This would require a new RO_TRANSACTION_LOCK, which would be upgraded on
+ commit.
+\end_layout
+
+\begin_layout Subsection
+Default Hash Function Is Suboptimal
+\end_layout
+
+\begin_layout Standard
+The Knuth-inspired multiplicative hash used by tdb is fairly slow (especially
+ if we expand it to 64 bits), and works best when the hash bucket size is
+ a prime number (which also means a slow modulus).
+ In addition, it is highly predictable which could potentially lead to a
+ Denial of Service attack in some TDB uses.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The Jenkins lookup3 hash
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+http://burtleburtle.net/bob/c/lookup3.c
+\end_layout
+
+\end_inset
+
+ is a fast and superbly-mixing hash.
+ It's used by the Linux kernel and almost everything else.
+ This has the particular properties that it takes an initial seed, and produces
+ two 32 bit hash numbers, which we can combine into a 64-bit hash.
+\end_layout
+
+\begin_layout Standard
+The seed should be created at tdb-creation time from some random source,
+ and placed in the header.
+ This is far from foolproof, but adds a little bit of protection against
+ hash bombing.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "Reliable-Traversal-Adds"
+
+\end_inset
+
+Reliable Traversal Adds Complexity
+\end_layout
+
+\begin_layout Standard
+We lock a record during traversal iteration, and try to grab that lock in
+ the delete code.
+ If that grab on delete fails, we simply mark it deleted and continue onwards;
+ traversal checks for this condition and does the delete when it moves off
+ the record.
+\end_layout
+
+\begin_layout Standard
+If traversal terminates, the dead record may be left indefinitely.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Remove reliability guarantees; see 
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "traverse-Proposed-Solution"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Subsection
+Fcntl Locking Adds Overhead
+\end_layout
+
+\begin_layout Standard
+Placing a fcntl lock means a system call, as does removing one.
+ This is actually one reason why transactions can be faster (everything
+ is locked once at transaction start).
+ In the uncontended case, this overhead can theoretically be eliminated.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+\end_layout
+
+\begin_layout Standard
+We tried this before with spinlock support, in the early days of TDB, and
+ it didn't make much difference except in manufactured benchmarks.
+\end_layout
+
+\begin_layout Standard
+We could use spinlocks (with futex kernel support under Linux), but it means
+ that we lose automatic cleanup when a process dies with a lock.
+ There is a method of auto-cleanup under Linux, but it's not supported by
+ other operating systems.
+ We could reintroduce a clear-if-first-style lock and sweep for dead futexes
+ on open, but that wouldn't help the normal case of one concurrent opener
+ dying.
+ Increasingly elaborate repair schemes could be considered, but they require
+ an ABI change (everyone must use them) anyway, so there's no need to do
+ this at the same time as everything else.
+\end_layout
+
+\begin_layout Subsection
+Some Transactions Don't Require Durability
+\end_layout
+
+\begin_layout Standard
+Volker points out that gencache uses a CLEAR_IF_FIRST tdb for normal (fast)
+ usage, and occasionally empties the results into a transactional TDB.
+ This kind of usage prioritizes performance over durability: as long as
+ we are consistent, data can be lost.
+\end_layout
+
+\begin_layout Standard
+This would be more neatly implemented inside tdb: a 
+\begin_inset Quotes eld
+\end_inset
+
+soft
+\begin_inset Quotes erd
+\end_inset
+
+ transaction commit (ie.
+ syncless) which meant that data may be reverted on a crash.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+\end_layout
+
+\begin_layout Standard
+Unfortunately any transaction scheme which overwrites old data requires
+ a sync before that overwrite to avoid the possibility of corruption.
+\end_layout
+
+\begin_layout Standard
+It seems possible to use a scheme similar to that described in 
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:TDB-Does-Not"
+
+\end_inset
+
+, where transactions are committed without overwriting existing data, and
+ an array of top-level pointers were available in the header.
+ If the transaction is 
+\begin_inset Quotes eld
+\end_inset
+
+soft
+\begin_inset Quotes erd
+\end_inset
+
+ then we would not need a sync at all: existing processes would pick up
+ the new hash table and free list and work with that.
+\end_layout
+
+\begin_layout Standard
+At some later point, a sync would allow recovery of the old data into the
+ free lists (perhaps when the array of top-level pointers filled).
+ On crash, tdb_open() would examine the array of top levels, and apply the
+ transactions until it encountered an invalid checksum.
+\end_layout
+
+\end_body
+\end_document

+ 3106 - 0
ccan/tdb2/doc/design.lyx,v

@@ -0,0 +1,3106 @@
+head	1.6;
+access;
+symbols;
+locks; strict;
+comment	@# @;
+
+
+1.6
+date	2010.08.02.00.21.43;	author rusty;	state Exp;
+branches;
+next	1.5;
+
+1.5
+date	2010.08.02.00.21.16;	author rusty;	state Exp;
+branches;
+next	1.4;
+
+1.4
+date	2010.05.10.13.09.11;	author rusty;	state Exp;
+branches;
+next	1.3;
+
+1.3
+date	2010.05.10.11.58.37;	author rusty;	state Exp;
+branches;
+next	1.2;
+
+1.2
+date	2010.05.10.05.35.13;	author rusty;	state Exp;
+branches;
+next	1.1;
+
+1.1
+date	2010.05.04.02.29.16;	author rusty;	state Exp;
+branches;
+next	;
+
+
+desc
+@First draft
+@
+
+
+1.6
+log
+@Commit changes
+@
+text
+@#LyX 1.6.5 created this file. For more info see http://www.lyx.org/
+\lyxformat 345
+\begin_document
+\begin_header
+\textclass article
+\use_default_options true
+\language english
+\inputencoding auto
+\font_roman default
+\font_sans default
+\font_typewriter default
+\font_default_family default
+\font_sc false
+\font_osf false
+\font_sf_scale 100
+\font_tt_scale 100
+
+\graphics default
+\paperfontsize default
+\use_hyperref false
+\papersize default
+\use_geometry false
+\use_amsmath 1
+\use_esint 1
+\cite_engine basic
+\use_bibtopic false
+\paperorientation portrait
+\secnumdepth 3
+\tocdepth 3
+\paragraph_separation indent
+\defskip medskip
+\quotes_language english
+\papercolumns 1
+\papersides 1
+\paperpagestyle default
+\tracking_changes true
+\output_changes true
+\author "" 
+\author "" 
+\end_header
+
+\begin_body
+
+\begin_layout Title
+TDB2: Redesigning The Trivial DataBase
+\end_layout
+
+\begin_layout Author
+Rusty Russell, IBM Corporation
+\end_layout
+
+\begin_layout Date
+26-July-2010
+\end_layout
+
+\begin_layout Abstract
+The Trivial DataBase on-disk format is 32 bits; with usage cases heading
+ towards the 4G limit, that must change.
+ This required breakage provides an opportunity to revisit TDB's other design
+ decisions and reassess them.
+\end_layout
+
+\begin_layout Section
+Introduction
+\end_layout
+
+\begin_layout Standard
+The Trivial DataBase was originally written by Andrew Tridgell as a simple
+ key/data pair storage system with the same API as dbm, but allowing multiple
+ readers and writers while being small enough (< 1000 lines of C) to include
+ in SAMBA.
+ The simple design created in 1999 has proven surprisingly robust and performant
+, used in Samba versions 3 and 4 as well as numerous other projects.
+ Its useful life was greatly increased by the (backwards-compatible!) addition
+ of transaction support in 2005.
+\end_layout
+
+\begin_layout Standard
+The wider variety and greater demands of TDB-using code has led to some
+ organic growth of the API, as well as some compromises on the implementation.
+ None of these, by themselves, are seen as show-stoppers, but the cumulative
+ effect is a loss of elegance over the initial, simple TDB implementation.
+ Here is a table of the approximate number of lines of implementation code
+ and number of API functions at the end of each year:
+\end_layout
+
+\begin_layout Standard
+\begin_inset Tabular
+<lyxtabular version="3" rows="12" columns="3">
+<features>
+<column alignment="center" valignment="top" width="0">
+<column alignment="center" valignment="top" width="0">
+<column alignment="center" valignment="top" width="0">
+<row>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Year End
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+API Functions
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Lines of C Code Implementation
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+1999
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+13
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+1195
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2000
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+24
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+1725
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2001
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+32
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2228
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2002
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+35
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2481
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2003
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+35
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2552
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2004
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+40
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2584
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2005
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+38
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2647
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2006
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+52
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+3754
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2007
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+66
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+4398
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2008
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+71
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+4768
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2009
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+73
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+5715
+\end_layout
+
+\end_inset
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+This review is an attempt to catalog and address all the known issues with
+ TDB and create solutions which address the problems without significantly
+ increasing complexity; all involved are far too aware of the dangers of
+ second system syndrome in rewriting a successful project like this.
+\end_layout
+
+\begin_layout Section
+API Issues
+\end_layout
+
+\begin_layout Subsection
+tdb_open_ex Is Not Expandable
+\end_layout
+
+\begin_layout Standard
+The tdb_open() call was expanded to tdb_open_ex(), which added an optional
+ hashing function and an optional logging function argument.
+ Additional arguments to open would require the introduction of a tdb_open_ex2
+ call etc.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+tdb_open() will take a linked-list of attributes:
+\end_layout
+
+\begin_layout LyX-Code
+enum tdb_attribute {
+\end_layout
+
+\begin_layout LyX-Code
+    TDB_ATTRIBUTE_LOG = 0,
+\end_layout
+
+\begin_layout LyX-Code
+    TDB_ATTRIBUTE_HASH = 1
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_attribute_base {
+\end_layout
+
+\begin_layout LyX-Code
+    enum tdb_attribute attr;
+\end_layout
+
+\begin_layout LyX-Code
+    union tdb_attribute *next;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_attribute_log {
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */
+\end_layout
+
+\begin_layout LyX-Code
+    tdb_log_func log_fn;
+\end_layout
+
+\begin_layout LyX-Code
+    void *log_private;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_attribute_hash {
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */
+\end_layout
+
+\begin_layout LyX-Code
+    tdb_hash_func hash_fn;
+\end_layout
+
+\begin_layout LyX-Code
+    void *hash_private;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+union tdb_attribute {
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_base base;
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_log log;
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_hash hash;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+This allows future attributes to be added, even if this expands the size
+ of the union.
+\end_layout
+
+\begin_layout Subsection
+tdb_traverse Makes Impossible Guarantees
+\end_layout
+
+\begin_layout Standard
+tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, and it
+ was thought that it was important to guarantee that all records which exist
+ at the start and end of the traversal would be included, and no record
+ would be included twice.
+\end_layout
+
+\begin_layout Standard
+This adds complexity (see 
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "Reliable-Traversal-Adds"
+
+\end_inset
+
+) and does not work anyway for records which are altered (in particular,
+ those which are expanded may be effectively deleted and re-added behind
+ the traversal).
+\end_layout
+
+\begin_layout Subsubsection
+\begin_inset CommandInset label
+LatexCommand label
+name "traverse-Proposed-Solution"
+
+\end_inset
+
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Abandon the guarantee.
+ You will see every record if no changes occur during your traversal, otherwise
+ you will see some subset.
+ You can prevent changes by using a transaction or the locking API.
+\end_layout
+
+\begin_layout Subsection
+Nesting of Transactions Is Fraught
+\end_layout
+
+\begin_layout Standard
+TDB has alternated between allowing nested transactions and not allowing
+ them.
+ Various paths in the Samba codebase assume that transactions will nest,
+ and in a sense they can: the operation is only committed to disk when the
+ outer transaction is committed.
+ There are two problems, however:
+\end_layout
+
+\begin_layout Enumerate
+Canceling the inner transaction will cause the outer transaction commit
+ to fail, and will not undo any operations since the inner transaction began.
+ This problem is soluble with some additional internal code.
+\end_layout
+
+\begin_layout Enumerate
+An inner transaction commit can be cancelled by the outer transaction.
+ This is desirable in the way which Samba's database initialization code
+ uses transactions, but could be a surprise to any users expecting a successful
+ transaction commit to expose changes to others.
+\end_layout
+
+\begin_layout Standard
+The current solution is to specify the behavior at tdb_open(), with the
+ default currently that nested transactions are allowed.
+ This flag can also be changed at runtime.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Given the usage patterns, it seems that the 
+\begin_inset Quotes eld
+\end_inset
+
+least-surprise
+\begin_inset Quotes erd
+\end_inset
+
+ behavior of disallowing nested transactions should become the default.
+ Additionally, it seems the outer transaction is the only code which knows
+ whether inner transactions should be allowed, so a flag to indicate this
+ could be added to tdb_transaction_start.
+ However, this behavior can be simulated with a wrapper which uses tdb_add_flags
+() and tdb_remove_flags(), so the API should not be expanded for this relatively
+-obscure case.
+\end_layout
+
+\begin_layout Subsection
+Incorrect Hash Function is Not Detected
+\end_layout
+
+\begin_layout Standard
+tdb_open_ex() allows the calling code to specify a different hash function
+ to use, but does not check that all other processes accessing this tdb
+ are using the same hash function.
+ The result is that records are missing from tdb_fetch().
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The header should contain an example hash result (eg.
+ the hash of 0xdeadbeef), and tdb_open_ex() should check that the given
+ hash function produces the same answer, or fail the tdb_open call.
+\end_layout
+
+\begin_layout Subsection
+tdb_set_max_dead/TDB_VOLATILE Expose Implementation
+\end_layout
+
+\begin_layout Standard
+In response to scalability issues with the free list (
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "TDB-Freelist-Is"
+
+\end_inset
+
+) two API workarounds have been incorporated in TDB: tdb_set_max_dead()
+ and the TDB_VOLATILE flag to tdb_open.
+ The latter actually calls the former with an argument of 
+\begin_inset Quotes eld
+\end_inset
+
+5
+\begin_inset Quotes erd
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+This code allows deleted records to accumulate without putting them in the
+ free list.
+ On delete we iterate through each chain and free them in a batch if there
+ are more than max_dead entries.
+ These are never otherwise recycled except as a side-effect of a tdb_repack.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+With the scalability problems of the freelist solved, this API can be removed.
+ The TDB_VOLATILE flag may still be useful as a hint that store and delete
+ of records will be at least as common as fetch in order to allow some internal
+ tuning, but initially will become a no-op.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "TDB-Files-Cannot"
+
+\end_inset
+
+TDB Files Cannot Be Opened Multiple Times In The Same Process
+\end_layout
+
+\begin_layout Standard
+No process can open the same TDB twice; we check and disallow it.
+ This is an unfortunate side-effect of fcntl locks, which operate on a per-file
+ rather than per-file-descriptor basis, and do not nest.
+ Thus, closing any file descriptor on a file clears all the locks obtained
+ by this process, even if they were placed using a different file descriptor!
+\end_layout
+
+\begin_layout Standard
+Note that even if this were solved, deadlock could occur if operations were
+ nested: this is a more manageable programming error in most cases.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+We could lobby POSIX to fix the perverse rules, or at least lobby Linux
+ to violate them so that the most common implementation does not have this
+ restriction.
+ This would be a generally good idea for other fcntl lock users.
+\end_layout
+
+\begin_layout Standard
+Samba uses a wrapper which hands out the same tdb_context to multiple callers
+ if this happens, and does simple reference counting.
+ We should do this inside the tdb library, which already emulates lock nesting
+ internally; it would need to recognize when deadlock occurs within a single
+ process.
+ This would create a new failure mode for tdb operations (while we currently
+ handle locking failures, they are impossible in normal use and a process
+ encountering them can do little but give up).
+\end_layout
+
+\begin_layout Standard
+I do not see benefit in an additional tdb_open flag to indicate whether
+ re-opening is allowed, although there may be some benefit to adding a
+ call to detect when a tdb_context is shared, to allow others to create such
+ an API.
+\end_layout
+
+\begin_layout Subsection
+TDB API Is Not POSIX Thread-safe
+\end_layout
+
+\begin_layout Standard
+The TDB API uses an error code which can be queried after an operation to
+ determine what went wrong.
+ This programming model does not work with threads, unless specific additional
+ guarantees are given by the implementation.
+ In addition, even otherwise-independent threads cannot open the same TDB
+ (as in 
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "TDB-Files-Cannot"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Rearchitecting the API to include a tdb_errcode pointer would be a great
+ deal of churn; we are better to guarantee that the tdb_errcode is per-thread
+ so the current programming model can be maintained.
+\end_layout
+
+\begin_layout Standard
+This requires dynamic per-thread allocations, which is awkward with POSIX
+ threads (pthread_key_create space is limited and we cannot simply allocate
+ a key for every TDB).
+\end_layout
+
+\begin_layout Standard
+Internal locking is required to make sure that fcntl locks do not overlap
+ between threads, and also that the global list of tdbs is maintained.
+\end_layout
+
+\begin_layout Standard
+The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe
+ version of the library, and otherwise no overhead will exist.
+\end_layout
+
+\begin_layout Subsection
+*_nonblock Functions And *_mark Functions Expose Implementation
+\end_layout
+
+\begin_layout Standard
+CTDB
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+Clustered TDB, see http://ctdb.samba.org
+\end_layout
+
+\end_inset
+
+ wishes to operate on TDB in a non-blocking manner.
+ This is currently done as follows:
+\end_layout
+
+\begin_layout Enumerate
+Call the _nonblock variant of an API function (eg.
+ tdb_lockall_nonblock).
+ If this fails:
+\end_layout
+
+\begin_layout Enumerate
+Fork a child process, and wait for it to call the normal variant (eg.
+ tdb_lockall).
+\end_layout
+
+\begin_layout Enumerate
+If the child succeeds, call the _mark variant to indicate we already have
+ the locks (eg.
+ tdb_lockall_mark).
+\end_layout
+
+\begin_layout Enumerate
+Upon completion, tell the child to release the locks (eg.
+ tdb_unlockall).
+\end_layout
+
+\begin_layout Enumerate
+Indicate to tdb that it should consider the locks removed (eg.
+ tdb_unlockall_mark).
+\end_layout
+
+\begin_layout Standard
+There are several issues with this approach.
+ Firstly, adding two new variants of each function clutters the API for
+ an obscure use, and so not all functions have three variants.
+ Secondly, it assumes that all paths of the functions ask for the same locks,
+ otherwise the parent process will have to get a lock which the child doesn't
+ have under some circumstances.
+ I don't believe this is currently the case, but it constrains the implementatio
+n.
+ 
+\end_layout
+
+\begin_layout Subsubsection
+\begin_inset CommandInset label
+LatexCommand label
+name "Proposed-Solution-locking-hook"
+
+\end_inset
+
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Implement a hook for locking methods, so that the caller can control the
+ calls to create and remove fcntl locks.
+ In this scenario, ctdbd would operate as follows:
+\end_layout
+
+\begin_layout Enumerate
+Call the normal API function, eg tdb_lockall().
+\end_layout
+
+\begin_layout Enumerate
+When the lock callback comes in, check if the child has the lock.
+ Initially, this is always false.
+ If so, return 0.
+ Otherwise, try to obtain it in non-blocking mode.
+ If that fails, return EWOULDBLOCK.
+\end_layout
+
+\begin_layout Enumerate
+Release locks in the unlock callback as normal.
+\end_layout
+
+\begin_layout Enumerate
+If tdb_lockall() fails, see if we recorded a lock failure; if so, call the
+ child to repeat the operation.
+\end_layout
+
+\begin_layout Enumerate
+The child records what locks it obtains, and returns that information to
+ the parent.
+\end_layout
+
+\begin_layout Enumerate
+When the child has succeeded, goto 1.
+\end_layout
+
+\begin_layout Standard
+This is flexible enough to handle any potential locking scenario, even when
+ lock requirements change.
+ It can be optimized so that the parent does not release locks, just tells
+ the child which locks it doesn't need to obtain.
+\end_layout
+
+\begin_layout Standard
+It also keeps the complexity out of the API, and in ctdbd where it is needed.
+\end_layout
+
+\begin_layout Subsection
+tdb_chainlock Functions Expose Implementation
+\end_layout
+
+\begin_layout Standard
+tdb_chainlock locks some number of records, including the record indicated
+ by the given key.
+ This gave atomicity guarantees; no-one can start a transaction, alter,
+ read or delete that key while the lock is held.
+\end_layout
+
+\begin_layout Standard
+It also makes the same guarantee for any other key in the chain, which is
+ an internal implementation detail and potentially a cause for deadlock.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+ It would be nice to have an explicit single entry lock which affected no
+ other keys.
+ Unfortunately, this won't work for an entry which doesn't exist.
+ Thus while chainlock may be implemented more efficiently for the existing
+ case, it will still have overlap issues with the non-existing case.
+ So it is best to keep the current (lack of) guarantee about which records
+ will be affected to avoid constraining our implementation.
+\end_layout
+
+\begin_layout Subsection
+Signal Handling is Not Race-Free
+\end_layout
+
+\begin_layout Standard
+The tdb_setalarm_sigptr() call allows the caller's signal handler to indicate
+ that the tdb locking code should return with a failure, rather than trying
+ again when a signal is received (and errno == EAGAIN).
+ This is usually used to implement timeouts.
+\end_layout
+
+\begin_layout Standard
+Unfortunately, this does not work in the case where the signal is received
+ before the tdb code enters the fcntl() call to place the lock: the code
+ will sleep within the fcntl() code, unaware that the signal wants it to
+ exit.
+ In the case of long timeouts, this does not happen in practice.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The locking hooks proposed in
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "Proposed-Solution-locking-hook"
+
+\end_inset
+
+ would allow the user to decide on whether to fail the lock acquisition
+ on a signal.
+ This allows the caller to choose their own compromise: they could narrow
+ the race by checking immediately before the fcntl call.
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+It may be possible to make this race-free in some implementations by having
+ the signal handler alter the struct flock to make it invalid.
+ This will cause the fcntl() lock call to fail with EINVAL if the signal
+ occurs before the kernel is entered, otherwise EAGAIN.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsection
+The API Uses Gratuitous Typedefs, Capitals
+\end_layout
+
+\begin_layout Standard
+typedefs are useful for providing source compatibility when types can differ
+ across implementations, or arguably in the case of function pointer definitions
+ which are hard for humans to parse.
+ Otherwise it is simply obfuscation and pollutes the namespace.
+\end_layout
+
+\begin_layout Standard
+Capitalization is usually reserved for compile-time constants and macros.
+\end_layout
+
+\begin_layout Description
+TDB_CONTEXT There is no reason to use this over 'struct tdb_context'; the
+ definition isn't visible to the API user anyway.
+\end_layout
+
+\begin_layout Description
+TDB_DATA There is no reason to use this over struct TDB_DATA; the struct
+ needs to be understood by the API user.
+\end_layout
+
+\begin_layout Description
+struct
+\begin_inset space ~
+\end_inset
+
+TDB_DATA This would normally be called 'struct tdb_data'.
+\end_layout
+
+\begin_layout Description
+enum
+\begin_inset space ~
+\end_inset
+
+TDB_ERROR Similarly, this would normally be enum tdb_error.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+ Introducing lower case variants would please pedants like myself, but if
+ it were done the existing ones should be kept.
+ There is little point forcing a purely cosmetic change upon tdb users.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "tdb_log_func-Doesnt-Take"
+
+\end_inset
+
+tdb_log_func Doesn't Take The Private Pointer
+\end_layout
+
+\begin_layout Standard
+For API compatibility reasons, the logging function needs to call tdb_get_loggin
+g_private() to retrieve the pointer registered by the tdb_open_ex for logging.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+It should simply take an extra argument, since we are prepared to break
+ the API/ABI.
+\end_layout
+
+\begin_layout Subsection
+Various Callback Functions Are Not Typesafe
+\end_layout
+
+\begin_layout Standard
+The callback functions in tdb_set_logging_function (after 
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "tdb_log_func-Doesnt-Take"
+
+\end_inset
+
+ is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read and tdb_check
+ all take void * and must internally convert it to the argument type they
+ were expecting.
+\end_layout
+
+\begin_layout Standard
+If this type changes, the compiler will not produce warnings on the callers,
+ since it only sees void *.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+With careful use of macros, we can create callback functions which give
+ a warning when used on gcc and the types of the callback and its private
+ argument differ.
+ Unsupported compilers will not give a warning, which is no worse than now.
+ In addition, the callbacks become clearer, as they need not use void *
+ for their parameter.
+\end_layout
+
+\begin_layout Standard
+See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html
+\end_layout
+
+\begin_layout Subsection
+TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic
+\end_layout
+
+\begin_layout Standard
+The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB file should
+ be cleared if the caller discovers it is the only process with the TDB
+ open.
+ However, if any caller does not specify TDB_CLEAR_IF_FIRST it will not
+ be detected, so will have the TDB erased underneath them (usually resulting
+ in a crash).
+\end_layout
+
+\begin_layout Standard
+There is a similar issue on fork(); if the parent exits (or otherwise closes
+ the tdb) before the child calls tdb_reopen_all() to establish the lock
+ used to indicate the TDB is opened by someone, a TDB_CLEAR_IF_FIRST opener
+ at that moment will believe it alone has opened the TDB and will erase
+ it.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Remove TDB_CLEAR_IF_FIRST.
+ Other workarounds are possible, but see 
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "TDB_CLEAR_IF_FIRST-Imposes-Performance"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Section
+Performance And Scalability Issues
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "TDB_CLEAR_IF_FIRST-Imposes-Performance"
+
+\end_inset
+
+TDB_CLEAR_IF_FIRST Imposes Performance Penalty
+\end_layout
+
+\begin_layout Standard
+When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is placed at offset
+ 4 (aka.
+ the ACTIVE_LOCK).
+ While these locks never conflict in normal tdb usage, they do add substantial
+ overhead for most fcntl lock implementations when the kernel scans to detect
+ if a lock conflict exists.
+ This is often a single linked list, making the time to acquire and release
+ a fcntl lock O(N) where N is the number of processes with the TDB open,
+ not the number actually doing work.
+\end_layout
+
+\begin_layout Standard
+In a Samba server it is common to have huge numbers of clients sitting idle,
+ and thus they have weaned themselves off the TDB_CLEAR_IF_FIRST flag.
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+There is a flag to tdb_reopen_all() which is used for this optimization:
+ if the parent process will outlive the child, the child does not need the
+ ACTIVE_LOCK.
+ This is a workaround for this very performance issue.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Remove the flag.
+ It was a neat idea, but even trivial servers tend to know when they are
+ initializing for the first time and can simply unlink the old tdb at that
+ point.
+\end_layout
+
+\begin_layout Subsection
+TDB Files Have a 4G Limit
+\end_layout
+
+\begin_layout Standard
+This seems to be becoming an issue (so much for 
+\begin_inset Quotes eld
+\end_inset
+
+trivial
+\begin_inset Quotes erd
+\end_inset
+
+!), particularly for ldb.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+A new, incompatible TDB format which uses 64 bit offsets internally rather
+ than 32 bit as now.
+ For simplicity of endian conversion (which TDB does on the fly if required),
+ all values will be 64 bit on disk.
+ In practice, some upper bits may be used for other purposes, but at least
+ 56 bits will be available for file offsets.
+\end_layout
+
+\begin_layout Standard
+tdb_open() will automatically detect the old version, and even create it
+ if TDB_VERSION6 is specified to tdb_open.
+\end_layout
+
+\begin_layout Standard
+32 bit processes will still be able to access TDBs larger than 4G (assuming
+ that their off_t allows them to seek to 64 bits); they will gracefully
+ fall back as they fail to mmap.
+ This can happen already with large TDBs.
+\end_layout
+
+\begin_layout Standard
+Old versions of tdb will fail to open the new TDB files (since 28 August
+ 2009, commit 398d0c29290: prior to that any unrecognized file format would
+ be erased and initialized as a fresh tdb!)
+\end_layout
+
+\begin_layout Subsection
+TDB Records Have a 4G Limit
+\end_layout
+
+\begin_layout Standard
+This has not been a reported problem, and the API uses size_t which can
+ be 64 bit on 64 bit platforms.
+ However, other limits may have made such an issue moot.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Record sizes will be 64 bit, with an error returned on 32 bit platforms
+ which try to access such records (the current implementation would return
+ TDB_ERR_OOM in a similar case).
+ It seems unlikely that 32 bit keys will be a limitation, so the implementation
+ may not support this (see 
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Records-Incur-A"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Subsection
+Hash Size Is Determined At TDB Creation Time
+\end_layout
+
+\begin_layout Standard
+TDB contains a number of hash chains in the header; the number is specified
+ at creation time, and defaults to 131.
+ This is such a bottleneck on large databases (as each hash chain gets quite
+ long), that LDB uses 10,000 for this hash.
+ In general it is impossible to know what the 'right' answer is at database
+ creation time.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+After comprehensive performance testing on various scalable hash variants
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoying
+ because I was previously convinced that an expanding tree of hashes would
+ be very close to optimal.
+\end_layout
+
+\end_inset
+
+, it became clear that it is hard to beat a straight linear hash table which
+ doubles in size when it reaches saturation.
+ There are three details which become important:
+\end_layout
+
+\begin_layout Enumerate
+On encountering a full bucket, we use the next bucket.
+\end_layout
+
+\begin_layout Enumerate
+Extra hash bits are stored with the offset, to reduce comparisons.
+\end_layout
+
+\begin_layout Enumerate
+A marker entry is used on deleting an entry.
+\end_layout
+
+\begin_layout Standard
+The doubling of the table must be done under a transaction; we will not
+ reduce it on deletion, so it will be an unusual case.
+ It will either be placed at the head (other entries will be moved out the
+ way so we can expand).
+ We could have a pointer in the header to the current hashtable location,
+ but that pointer would have to be read frequently to check for hashtable
+ moves.
+\end_layout
+
+\begin_layout Standard
+The locking for this is slightly more complex than the chained case; we
+ currently have one lock per bucket, and that means we would need to expand
+ the lock if we overflow to the next bucket.
+ The frequency of such collisions will affect our locking heuristics: we
+ can always lock more buckets than we need.
+\end_layout
+
+\begin_layout Standard
+One possible optimization is to only re-check the hash size on an insert
+ or a lookup miss.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "TDB-Freelist-Is"
+
+\end_inset
+
+TDB Freelist Is Highly Contended
+\end_layout
+
+\begin_layout Standard
+TDB uses a single linked list for the free list.
+ Allocation occurs as follows, using heuristics which have evolved over
+ time:
+\end_layout
+
+\begin_layout Enumerate
+Get the free list lock for this whole operation.
+\end_layout
+
+\begin_layout Enumerate
+Multiply length by 1.25, so we always over-allocate by 25%.
+\end_layout
+
+\begin_layout Enumerate
+Set the slack multiplier to 1.
+\end_layout
+
+\begin_layout Enumerate
+Examine the current freelist entry: if it is > length but < the current
+ best case, remember it as the best case.
+\end_layout
+
+\begin_layout Enumerate
+Multiply the slack multiplier by 1.05.
+\end_layout
+
+\begin_layout Enumerate
+If our best fit so far is less than length * slack multiplier, return it.
+ The slack will be turned into a new free record if it's large enough.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise, go onto the next freelist entry.
+\end_layout
+
+\begin_layout Standard
+Deleting a record occurs as follows:
+\end_layout
+
+\begin_layout Enumerate
+Lock the hash chain for this whole operation.
+\end_layout
+
+\begin_layout Enumerate
+Walk the chain to find the record, keeping the prev pointer offset.
+\end_layout
+
+\begin_layout Enumerate
+If max_dead is non-zero:
+\end_layout
+
+\begin_deeper
+\begin_layout Enumerate
+Walk the hash chain again and count the dead records.
+\end_layout
+
+\begin_layout Enumerate
+If it's more than max_dead, bulk free all the dead ones (similar to steps
+ 4 and below, but the lock is only obtained once).
+\end_layout
+
+\begin_layout Enumerate
+Simply mark this record as dead and return.
+ 
+\end_layout
+
+\end_deeper
+\begin_layout Enumerate
+Get the free list lock for the remainder of this operation.
+\end_layout
+
+\begin_layout Enumerate
+\begin_inset CommandInset label
+LatexCommand label
+name "right-merging"
+
+\end_inset
+
+Examine the following block to see if it is free; if so, enlarge the current
+ block and remove that block from the free list.
+ This was disabled, as removal from the free list was O(entries-in-free-list).
+\end_layout
+
+\begin_layout Enumerate
+Examine the preceding block to see if it is free: for this reason, each
+ block has a 32-bit tailer which indicates its length.
+ If it is free, expand it to cover our new block and return.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise, prepend ourselves to the free list.
+\end_layout
+
+\begin_layout Standard
+Disabling right-merging (step 
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "right-merging"
+
+\end_inset
+
+) causes fragmentation; the other heuristics proved insufficient to address
+ this, so the final answer to this was that when we expand the TDB file
+ inside a transaction commit, we repack the entire tdb.
+\end_layout
+
+\begin_layout Standard
+The single list lock limits our allocation rate; due to the other issues
+ this is not currently seen as a bottleneck.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The first step is to remove all the current heuristics, as they obviously
+ interact, then examine them once the lock contention is addressed.
+\end_layout
+
+\begin_layout Standard
+The free list must be split to reduce contention.
+ Assuming perfect free merging, we can at most have 1 free list entry for
+ each entry.
+ This implies that the number of free lists is related to the size of the
+ hash table, but as it is rare to walk a large number of free list entries
+ we can use far fewer, say 1/32 of the number of hash buckets.
+\end_layout
+
+\begin_layout Standard
+There are various benefits in using per-size free lists (see 
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:TDB-Becomes-Fragmented"
+
+\end_inset
+
+) but it's not clear this would reduce contention in the common case where
+ all processes are allocating/freeing the same size.
+ Thus we almost certainly need to divide in other ways: the most obvious
+ is to divide the file into zones, and use a free list (or set of free
+ lists) for each.
+ This approximates address ordering.
+\end_layout
+
+\begin_layout Standard
+Note that this means we need to split the free lists when we expand the
+ file; this is probably acceptable when we double the hash table size, since
+ that is such an expensive operation already.
+ In the case of increasing the file size, there is an optimization we can
+ use: if we use M in the formula above as the file size rounded up to the
+ next power of 2, we only need reshuffle free lists when the file size crosses
+ a power of 2 boundary, 
+\emph on
+and 
+\emph default
+reshuffling the free lists is trivial: we simply merge every consecutive
+ pair of free lists.
+\end_layout
+
+\begin_layout Standard
+The basic algorithm is as follows.
+ Freeing is simple:
+\end_layout
+
+\begin_layout Enumerate
+Identify the correct zone.
+\end_layout
+
+\begin_layout Enumerate
+Lock the corresponding list.
+\end_layout
+
+\begin_layout Enumerate
+Re-check the zone (we didn't have a lock, sizes could have changed): relock
+ if necessary.
+\end_layout
+
+\begin_layout Enumerate
+Place the freed entry in the list for that zone.
+\end_layout
+
+\begin_layout Standard
+Allocation is a little more complicated, as we perform delayed coalescing
+ at this point:
+\end_layout
+
+\begin_layout Enumerate
+Pick a zone: either the zone we last freed into, or based on a 
+\begin_inset Quotes eld
+\end_inset
+
+random
+\begin_inset Quotes erd
+\end_inset
+
+ number.
+\end_layout
+
+\begin_layout Enumerate
+Lock the corresponding list.
+\end_layout
+
+\begin_layout Enumerate
+Re-check the zone: relock if necessary.
+\end_layout
+
+\begin_layout Enumerate
+If the top entry is large enough, remove it from the list and return it.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise, coalesce entries in the list. If there was no entry large enough,
+ unlock the list and try the next zone.
+\end_layout
+
+\begin_layout Enumerate
+If no zone satisfies, expand the file.
+\end_layout
+
+\begin_layout Standard
+This optimizes rapid insert/delete of free list entries by not coalescing
+ them all the time.
+ First-fit address ordering seems to be fairly good for keeping
+ fragmentation low (see 
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:TDB-Becomes-Fragmented"
+
+\end_inset
+
+).
+ Note that address ordering does not need a tailer to coalesce, though if
+ we needed one we could have one cheaply: see 
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Records-Incur-A"
+
+\end_inset
+
+.
+ 
+\end_layout
+
+\begin_layout Standard
+I anticipate that the number of entries in each free zone would be small,
+ but it might be worth using one free entry to hold pointers to the others
+ for cache efficiency.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:TDB-Becomes-Fragmented"
+
+\end_inset
+
+TDB Becomes Fragmented
+\end_layout
+
+\begin_layout Standard
+Much of this is a result of allocation strategy
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995 ftp://ftp.cs.ute
+xas.edu/pub/garbage/malloc/ismm98.ps
+\end_layout
+
+\end_inset
+
+ and deliberate hobbling of coalescing; internal fragmentation (aka overallocati
+on) is deliberately set at 25%, and external fragmentation is only cured
+ by the decision to repack the entire db when a transaction commit needs
+ to enlarge the file.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The 25% overhead on allocation works in practice for ldb because indexes
+ tend to expand by one record at a time.
+ This internal fragmentation can be resolved by having an 
+\begin_inset Quotes eld
+\end_inset
+
+expanded
+\begin_inset Quotes erd
+\end_inset
+
+ bit in the header to note entries that have previously expanded, and allocating
+ more space for them.
+\end_layout
+
+\begin_layout Standard
+There is a spectrum of possible solutions for external fragmentation:
+ one is to use a fragmentation-avoiding allocation strategy such as best-fit
+ address-order allocator.
+ The other end of the spectrum would be to use a bump allocator (very fast
+ and simple) and simply repack the file when we reach the end.
+\end_layout
+
+\begin_layout Standard
+There are three problems with efficient fragmentation-avoiding allocators:
+ they are non-trivial, they tend to use a single free list for each size,
+ and there's no evidence that tdb allocation patterns will match those recorded
+ for general allocators (though it seems likely).
+\end_layout
+
+\begin_layout Standard
+Thus we don't spend too much effort on external fragmentation; we will be
+ no worse than the current code if we need to repack on occasion.
+ More effort is spent on reducing freelist contention, and reducing overhead.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:Records-Incur-A"
+
+\end_inset
+
+Records Incur A 28-Byte Overhead
+\end_layout
+
+\begin_layout Standard
+Each TDB record has a header as follows:
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_record {
+\end_layout
+
+\begin_layout LyX-Code
+        tdb_off_t next; /* offset of the next record in the list */
+\end_layout
+
+\begin_layout LyX-Code
+        tdb_len_t rec_len; /* total byte length of record */
+\end_layout
+
+\begin_layout LyX-Code
+        tdb_len_t key_len; /* byte length of key */
+\end_layout
+
+\begin_layout LyX-Code
+        tdb_len_t data_len; /* byte length of data */
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t full_hash; /* the full 32 bit hash of the key */
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t magic;   /* try to catch errors */
+\end_layout
+
+\begin_layout LyX-Code
+        /* the following union is implied:
+\end_layout
+
+\begin_layout LyX-Code
+                union {
+\end_layout
+
+\begin_layout LyX-Code
+                        char record[rec_len];
+\end_layout
+
+\begin_layout LyX-Code
+                        struct {
+\end_layout
+
+\begin_layout LyX-Code
+                                char key[key_len];
+\end_layout
+
+\begin_layout LyX-Code
+                                char data[data_len];
+\end_layout
+
+\begin_layout LyX-Code
+                        }
+\end_layout
+
+\begin_layout LyX-Code
+                        uint32_t totalsize; (tailer)
+\end_layout
+
+\begin_layout LyX-Code
+                }
+\end_layout
+
+\begin_layout LyX-Code
+        */
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+Naively, this would double to a 56-byte overhead on a 64 bit implementation.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+We can use various techniques to reduce this for an allocated block:
+\end_layout
+
+\begin_layout Enumerate
+The 'next' pointer is not required, as we are using a flat hash table.
+\end_layout
+
+\begin_layout Enumerate
+'rec_len' can instead be expressed as an addition to key_len and data_len
+ (it accounts for wasted or overallocated length in the record).
+ Since the record length is always a multiple of 8, we can conveniently
+ fit it in 32 bits (representing up to 35 bits).
+\end_layout
+
+\begin_layout Enumerate
+'key_len' and 'data_len' can be reduced.
+ I'm unwilling to restrict 'data_len' to 32 bits, but instead we can combine
+ the two into one 64-bit field and use a 5 bit value which indicates at
+ what bit to divide the two.
+ Keys are unlikely to scale as fast as data, so I'm assuming a maximum key
+ size of 32 bits.
+\end_layout
+
+\begin_layout Enumerate
+'full_hash' is used to avoid a memcmp on the 
+\begin_inset Quotes eld
+\end_inset
+
+miss
+\begin_inset Quotes erd
+\end_inset
+
+ case, but this is diminishing returns after a handful of bits (at 10 bits,
+ it reduces 99.9% of false memcmp).
+ As an aside, as the lower bits are already incorporated in the hash table
+ resolution, the upper bits should be used here.
+\end_layout
+
+\begin_layout Enumerate
+'magic' does not need to be enlarged: it currently reflects one of 5 values
+ (used, free, dead, recovery, and unused_recovery).
+ It is useful for quick sanity checking however, and should not be eliminated.
+\end_layout
+
+\begin_layout Enumerate
+'tailer' is only used to coalesce free blocks (so a block to the right can
+ find the header to check if this block is free).
+ This can be replaced by a single 'free' bit in the header of the following
+ block (and the tailer only exists in free blocks).
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+This technique from Thomas Standish.
+ Data Structure Techniques.
+ Addison-Wesley, Reading, Massachusetts, 1980.
+\end_layout
+
+\end_inset
+
+ The current proposed coalescing algorithm doesn't need this, however.
+\end_layout
+
+\begin_layout Standard
+This produces a 16 byte used header like this:
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_used_record {
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t magic : 16,
+\end_layout
+
+\begin_layout LyX-Code
+                 prev_is_free: 1,
+\end_layout
+
+\begin_layout LyX-Code
+                 key_data_divide: 5,
+\end_layout
+
+\begin_layout LyX-Code
+                 top_hash: 10;
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t extra_octets;
+\end_layout
+
+\begin_layout LyX-Code
+        uint64_t key_and_data_len;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+And a free record like this:
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_free_record {
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t free_magic;
+\end_layout
+
+\begin_layout LyX-Code
+        uint64_t total_length;
+\end_layout
+
+\begin_layout LyX-Code
+        ...
+\end_layout
+
+\begin_layout LyX-Code
+        uint64_t tailer;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+
+\end_layout
+
+\begin_layout Subsection
+Transaction Commit Requires 4 fdatasync
+\end_layout
+
+\begin_layout Standard
+The current transaction algorithm is:
+\end_layout
+
+\begin_layout Enumerate
+write_recovery_data();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Enumerate
+write_recovery_header();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Enumerate
+overwrite_with_new_data();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Enumerate
+remove_recovery_header();
+\end_layout
+
+\begin_layout Enumerate
+sync(); 
+\end_layout
+
+\begin_layout Standard
+On current ext3, each sync flushes all data to disk, so the next 3 syncs
+ are relatively inexpensive.
+ But this could become a performance bottleneck on other filesystems such
+ as ext4.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Neil Brown points out that this is overzealous, and only one sync is needed:
+\end_layout
+
+\begin_layout Enumerate
+Bundle the recovery data, a transaction counter and a strong checksum of
+ the new data.
+\end_layout
+
+\begin_layout Enumerate
+Strong checksum that whole bundle.
+\end_layout
+
+\begin_layout Enumerate
+Store the bundle in the database.
+\end_layout
+
+\begin_layout Enumerate
+Overwrite the oldest of the two recovery pointers in the header (identified
+ using the transaction counter) with the offset of this bundle.
+\end_layout
+
+\begin_layout Enumerate
+sync.
+\end_layout
+
+\begin_layout Enumerate
+Write the new data to the file.
+\end_layout
+
+\begin_layout Standard
+Checking for recovery means identifying the latest bundle with a valid checksum
+ and using the new data checksum to ensure that it has been applied.
+ This is more expensive than the current check, but need only be done at
+ open.
+ For running databases, a separate header field can be used to indicate
+ a transaction in progress; we need only check for recovery if this is set.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:TDB-Does-Not"
+
+\end_inset
+
+TDB Does Not Have Snapshot Support
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+ At some point you say 
+\begin_inset Quotes eld
+\end_inset
+
+use a real database
+\begin_inset Quotes erd
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+But as a thought experiment, if we implemented transactions to only overwrite
+ free entries (this is tricky: there must not be a header in each entry
+ which indicates whether it is free; that must instead be derived from metadata elsewhere),
+ and had a pointer to the hash table, we could create an entirely new commit
+ without destroying existing data.
+ Then it would be easy to implement snapshots in a similar way.
+\end_layout
+
+\begin_layout Standard
+This would not allow arbitrary changes to the database, such as tdb_repack
+ does, and would require more space (since we have to preserve the current
+ and future entries at once).
+ If we used hash trees rather than one big hash table, we might only have
+ to rewrite some sections of the hash, too.
+\end_layout
+
+\begin_layout Standard
+We could then implement snapshots using a similar method, using multiple
+ different hash tables/free tables.
+\end_layout
+
+\begin_layout Subsection
+Transactions Cannot Operate in Parallel
+\end_layout
+
+\begin_layout Standard
+This would be useless for ldb, as it hits the index records with just about
+ every update.
+ It would add significant complexity in resolving clashes, and cause
+ all transaction callers to write their code to loop in the case where the
+ transactions spuriously failed.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+We could solve a small part of the problem by providing read-only transactions.
+ These would allow one write transaction to begin, but it could not commit
+ until all r/o transactions are done.
+ This would require a new RO_TRANSACTION_LOCK, which would be upgraded on
+ commit.
+\end_layout
+
+\begin_layout Subsection
+Default Hash Function Is Suboptimal
+\end_layout
+
+\begin_layout Standard
+The Knuth-inspired multiplicative hash used by tdb is fairly slow (especially
+ if we expand it to 64 bits), and works best when the hash bucket size is
+ a prime number (which also means a slow modulus).
+ In addition, it is highly predictable which could potentially lead to a
+ Denial of Service attack in some TDB uses.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The Jenkins lookup3 hash
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+http://burtleburtle.net/bob/c/lookup3.c
+\end_layout
+
+\end_inset
+
+ is a fast and superbly-mixing hash.
+ It's used by the Linux kernel and almost everything else.
+ This has the particular properties that it takes an initial seed, and produces
+ two 32 bit hash numbers, which we can combine into a 64-bit hash.
+\end_layout
+
+\begin_layout Standard
+The seed should be created at tdb-creation time from some random source,
+ and placed in the header.
+ This is far from foolproof, but adds a little bit of protection against
+ hash bombing.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "Reliable-Traversal-Adds"
+
+\end_inset
+
+Reliable Traversal Adds Complexity
+\end_layout
+
+\begin_layout Standard
+We lock a record during traversal iteration, and try to grab that lock in
+ the delete code.
+ If that grab on delete fails, we simply mark it deleted and continue onwards;
+ traversal checks for this condition and does the delete when it moves off
+ the record.
+\end_layout
+
+\begin_layout Standard
+If traversal terminates, the dead record may be left indefinitely.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Remove reliability guarantees; see 
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "traverse-Proposed-Solution"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Subsection
+Fcntl Locking Adds Overhead
+\end_layout
+
+\begin_layout Standard
+Placing a fcntl lock means a system call, as does removing one.
+ This is actually one reason why transactions can be faster (everything
+ is locked once at transaction start).
+ In the uncontended case, this overhead can theoretically be eliminated.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+\end_layout
+
+\begin_layout Standard
+We tried this before with spinlock support, in the early days of TDB, and
+ it didn't make much difference except in manufactured benchmarks.
+\end_layout
+
+\begin_layout Standard
+We could use spinlocks (with futex kernel support under Linux), but it means
+ that we lose automatic cleanup when a process dies with a lock.
+ There is a method of auto-cleanup under Linux, but it's not supported by
+ other operating systems.
+ We could reintroduce a clear-if-first-style lock and sweep for dead futexes
+ on open, but that wouldn't help the normal case of one concurrent opener
+ dying.
+ Increasingly elaborate repair schemes could be considered, but they require
+ an ABI change (everyone must use them) anyway, so there's no need to do
+ this at the same time as everything else.
+\end_layout
+
+\begin_layout Subsection
+Some Transactions Don't Require Durability
+\end_layout
+
+\begin_layout Standard
+Volker points out that gencache uses a CLEAR_IF_FIRST tdb for normal (fast)
+ usage, and occasionally empties the results into a transactional TDB.
+ This kind of usage prioritizes performance over durability: as long as
+ we are consistent, data can be lost.
+\end_layout
+
+\begin_layout Standard
+This would be more neatly implemented inside tdb: a 
+\begin_inset Quotes eld
+\end_inset
+
+soft
+\begin_inset Quotes erd
+\end_inset
+
+ transaction commit (i.e.
+ syncless) which meant that data may be reverted on a crash.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+\end_layout
+
+\begin_layout Standard
+Unfortunately any transaction scheme which overwrites old data requires
+ a sync before that overwrite to avoid the possibility of corruption.
+\end_layout
+
+\begin_layout Standard
+It seems possible to use a scheme similar to that described in 
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:TDB-Does-Not"
+
+\end_inset
+
+, where transactions are committed without overwriting existing data, and
+ an array of top-level pointers were available in the header.
+ If the transaction is 
+\begin_inset Quotes eld
+\end_inset
+
+soft
+\begin_inset Quotes erd
+\end_inset
+
+ then we would not need a sync at all: existing processes would pick up
+ the new hash table and free list and work with that.
+\end_layout
+
+\begin_layout Standard
+At some later point, a sync would allow recovery of the old data into the
+ free lists (perhaps when the array of top-level pointers filled).
+ On crash, tdb_open() would examine the array of top levels, and apply the
+ transactions until it encountered an invalid checksum.
+\end_layout
+
+\end_body
+\end_document
+@
+
+
+1.5
+log
+@Soft transaction commit
+@
+text
+@d38 1
+a38 1
+\author "Rusty Russell,,," 
+a52 4
+
+\change_deleted 0 1280141199
+10-May-2010
+\change_inserted 0 1280141202
+a53 2
+\change_unchanged
+
+a2028 2
+
+\change_inserted 0 1280140902
+a2034 2
+
+\change_unchanged
+a2212 2
+\change_inserted 0 1280140661
+
+a2215 2
+
+\change_inserted 0 1280140703
+a2219 2
+
+\change_inserted 0 1280708312
+a2226 2
+
+\change_inserted 0 1280708400
+a2239 2
+
+\change_inserted 0 1280140836
+a2243 2
+
+\change_inserted 0 1280708255
+a2247 2
+
+\change_inserted 0 1280708374
+a2252 2
+
+\change_inserted 0 1280141181
+a2274 2
+
+\change_inserted 0 1280141345
+@
+
+
+1.4
+log
+@Merge changes
+@
+text
+@d38 1
+a38 1
+\author "" 
+d53 2
+d56 4
+d2035 10
+d2223 84
+@
+
+
+1.3
+log
+@Transaction and freelist rethink.
+@
+text
+@d38 1
+a38 1
+\author "Rusty Russell,,," 
+d53 1
+a53 1
+27-April-2010
+d662 1
+a662 5
+ behavior of disallowing 
+\change_inserted 0 1272940179
+nested 
+\change_unchanged
+transactions should become the default.
+a1210 2
+\change_inserted 0 1272944650
+
+a1214 2
+
+\change_inserted 0 1272944763
+a1218 2
+\change_unchanged
+
+a1223 2
+\change_unchanged
+
+a1301 2
+
+\change_inserted 0 1273478114
+a1310 2
+\change_unchanged
+
+d1515 1
+a1515 11
+The free list 
+\change_deleted 0 1273469807
+should
+\change_inserted 0 1273469810
+must
+\change_unchanged
+ be split 
+\change_deleted 0 1273469815
+into multiple lists 
+\change_unchanged
+to reduce contention.
+a1520 2
+\change_inserted 0 1273470006
+
+a1523 2
+
+\change_inserted 0 1273492055
+a1539 2
+
+\change_inserted 0 1273483888
+a1551 2
+\change_unchanged
+
+a1554 8
+
+\change_deleted 0 1272942055
+There are various ways to organize these lisys, but because we want to be
+ able to quickly identify which free list an entry is in, and reduce the
+ number of locks required for merging, we will use zoning (eg.
+ each free list covers some fixed fraction of the file).
+ 
+\change_inserted 0 1273484187
+d1556 1
+a1556 7
+ 
+\change_deleted 0 1273484194
+The algorithm for f
+\change_inserted 0 1273484194
+F
+\change_unchanged
+reeing is simple:
+d1560 1
+a1560 7
+Identify the correct 
+\change_deleted 0 1273482856
+free list
+\change_inserted 0 1273482857
+zone
+\change_unchanged
+.
+d1564 1
+a1564 7
+Lock the 
+\change_inserted 0 1273482895
+corresponding 
+\change_unchanged
+list
+\change_inserted 0 1273482863
+.
+a1567 2
+
+\change_inserted 0 1273482909
+d1573 1
+a1573 13
+
+\change_deleted 0 1273482885
+, and p
+\change_inserted 0 1273482888
+P
+\change_unchanged
+lace the freed entry 
+\change_deleted 0 1273492415
+at the head
+\change_inserted 0 1273492415
+in the list for that zone
+\change_unchanged
+.
+d1577 2
+a1578 7
+Allocation is a little more complicated, as we 
+\change_deleted 0 1273483240
+merge entries as we walk the list:
+\change_inserted 0 1273484250
+perform delayed coalescing at this point:
+\change_unchanged
+
+d1582 1
+a1582 19
+Pick a 
+\change_deleted 0 1273482955
+free list;
+\change_inserted 0 1273482957
+zone
+\change_unchanged
+ either the 
+\change_deleted 0 1273482962
+list
+\change_inserted 0 1273482962
+zone
+\change_unchanged
+ we last freed 
+\change_deleted 0 1273482966
+o
+\change_inserted 0 1273482966
+i
+\change_unchanged
+nto, or based on a 
+d1594 1
+a1594 9
+Lock th
+\change_inserted 0 1273482980
+e corresponding
+\change_deleted 0 1273482973
+at
+\change_unchanged
+ list.
+\change_inserted 0 1273482982
+
+a1597 2
+
+\change_inserted 0 1273483084
+a1598 53
+\change_unchanged
+
+\end_layout
+
+\begin_layout Enumerate
+If the top entry is 
+\change_deleted 0 1273492155
+well-sized, 
+\change_inserted 0 1273492159
+-large enough, 
+\change_unchanged
+remove it from the list and return it.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise, 
+\change_inserted 0 1273492206
+coalesce entries in the list.
+\change_deleted 0 1273492200
+examine the entry to the right of it in the file.
+ If it is free:
+\end_layout
+
+\begin_deeper
+\begin_layout Enumerate
+
+\change_deleted 0 1273492200
+If that entry is in a different list, lock that list too.
+\end_layout
+
+\begin_layout Enumerate
+
+\change_deleted 0 1273492200
+If we had to place a new lock, re-check that the entry is free.
+\end_layout
+
+\begin_layout Enumerate
+
+\change_deleted 0 1273492200
+Remove that entry from its free list and expand this entry to cover it.
+\end_layout
+
+\begin_layout Enumerate
+
+\change_deleted 0 1273485554
+Goto step 3.
+\end_layout
+
+\end_deeper
+\begin_layout Enumerate
+
+\change_inserted 0 1273485311
+If there was no entry large enough, unlock the list and try the next zone.
+d1602 1
+a1602 5
+
+\change_deleted 0 1273483646
+Repeat step 3 with each entry in the list.
+\change_unchanged
+
+d1606 2
+a1607 5
+
+\change_deleted 0 1273483668
+Unlock the list and repeat step 2 with the next list.
+\change_unchanged
+
+d1611 1
+a1611 7
+If no 
+\change_deleted 0 1273483671
+list
+\change_inserted 0 1273483671
+zone
+\change_unchanged
+ satisfies, expand the file.
+d1615 2
+a1616 9
+This optimizes rapid insert/delete of free list entries
+\change_inserted 0 1273485794
+ by not coalescing them all the time.
+\change_deleted 0 1273483685
+, and allows us to get rid of the tailer altogether
+\change_unchanged
+.
+
+\change_inserted 0 1273492299
+a1638 39
+
+\change_deleted 0 1273476840
+The question of 
+\begin_inset Quotes eld
+\end_inset
+
+well-sized
+\begin_inset Quotes erd
+\end_inset
+
+ free entries is more difficult: the 25% overhead works in practice for
+ ldb because indexes tend to expand by one record at a time.
+ This can be resolved by having an 
+\begin_inset Quotes eld
+\end_inset
+
+expanded
+\begin_inset Quotes erd
+\end_inset
+
+ bit in the header to note entries that have previously expanded, and allocating
+ more space for them.
+ Whether the 
+\begin_inset Quotes eld
+\end_inset
+
+increasing slack
+\begin_inset Quotes erd
+\end_inset
+
+ algorithm should be implemented or first-fit used is still unknown: we
+ will determine this once these other ideas are implemented.
+\change_inserted 0 1273483750
+
+\end_layout
+
+\begin_layout Standard
+
+\change_inserted 0 1273492450
+a1644 2
+
+\change_inserted 0 1273470441
+a1654 2
+
+\change_inserted 0 1273476556
+a1659 2
+
+\change_inserted 0 1273470423
+a1661 2
+\change_unchanged
+
+a1672 2
+
+\change_inserted 0 1273476847
+a1676 2
+
+\change_inserted 0 1273476886
+a1691 2
+
+\change_inserted 0 1273477233
+a1699 2
+
+\change_inserted 0 1273477534
+a1706 2
+
+\change_inserted 0 1273482700
+a1712 2
+
+\change_inserted 0 1273478079
+a1722 2
+
+\change_inserted 0 1273477839
+a1726 2
+
+\change_inserted 0 1273477925
+a1730 2
+
+\change_inserted 0 1273477925
+a1734 2
+
+\change_inserted 0 1273477925
+a1738 2
+
+\change_inserted 0 1273477925
+a1742 2
+
+\change_inserted 0 1273477925
+a1746 2
+
+\change_inserted 0 1273477925
+a1750 2
+
+\change_inserted 0 1273477925
+a1754 2
+
+\change_inserted 0 1273477925
+a1758 2
+
+\change_inserted 0 1273477925
+a1762 2
+
+\change_inserted 0 1273477925
+a1766 2
+
+\change_inserted 0 1273477925
+a1770 2
+
+\change_inserted 0 1273477925
+a1774 2
+
+\change_inserted 0 1273477925
+a1778 2
+
+\change_inserted 0 1273477925
+a1782 2
+
+\change_inserted 0 1273477925
+a1786 2
+
+\change_inserted 0 1273477925
+a1790 2
+
+\change_inserted 0 1273477925
+a1794 2
+
+\change_inserted 0 1273477925
+a1798 2
+
+\change_inserted 0 1273492522
+a1802 2
+
+\change_inserted 0 1273492530
+a1806 2
+
+\change_inserted 0 1273492546
+a1810 2
+
+\change_inserted 0 1273478239
+a1814 2
+
+\change_inserted 0 1273479960
+a1821 2
+
+\change_inserted 0 1273480265
+a1830 2
+
+\change_inserted 0 1273480354
+a1845 2
+
+\change_inserted 0 1273478968
+a1851 2
+
+\change_inserted 0 1273492604
+a1859 2
+
+\change_inserted 0 1273479572
+a1862 2
+\change_unchanged
+
+a1870 2
+
+\change_inserted 0 1273480282
+a1874 2
+
+\change_inserted 0 1273478931
+a1878 2
+
+\change_inserted 0 1273481549
+a1882 2
+
+\change_inserted 0 1273481557
+a1886 2
+
+\change_inserted 0 1273480307
+a1890 2
+
+\change_inserted 0 1273480335
+a1894 2
+
+\change_inserted 0 1273479897
+a1898 2
+
+\change_inserted 0 1273479653
+a1902 2
+
+\change_inserted 0 1273480371
+a1906 2
+
+\change_inserted 0 1273480464
+a1910 2
+
+\change_inserted 0 1273480399
+a1914 2
+
+\change_inserted 0 1273480425
+a1918 2
+
+\change_inserted 0 1273480453
+a1922 2
+
+\change_inserted 0 1273480455
+a1926 2
+
+\change_inserted 0 1273480450
+a1930 2
+
+\change_inserted 0 1273480452
+a1935 2
+\change_inserted 0 1273478830
+
+a1942 5
+
+\change_deleted 0 1273481604
+In theory, we could get away with 2: one after we write the new data, and
+ one to somehow atomically change over to it.
+\change_inserted 0 1273481632
+a1946 2
+
+\change_inserted 0 1273481724
+a1950 2
+
+\change_inserted 0 1273481713
+a1954 2
+
+\change_inserted 0 1273481717
+a1958 2
+
+\change_inserted 0 1273481730
+a1962 2
+
+\change_inserted 0 1273481736
+a1966 2
+
+\change_inserted 0 1273481744
+a1970 2
+
+\change_inserted 0 1273481748
+a1974 2
+
+\change_inserted 0 1273482185
+a1978 2
+
+\change_inserted 0 1273482259
+a1989 50
+
+\change_deleted 0 1273481848
+None.
+ Trying to rewrite the transaction code is a separate experiment, which
+ I encourage someone else to do.
+ At some point you say 
+\begin_inset Quotes eld
+\end_inset
+
+use a real database
+\begin_inset Quotes erd
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1273481848
+But as a thought experiment:
+\change_unchanged
+
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1273481788
+Say there was a pointer in the header which said where the hash table and
+ free list tables were, and that no blocks were labeled with whether they
+ were free or not (it had to be derived from what list they were in).
+ We could create new hash table and free list in some free space, and populate
+ it as we want the post-committed state to look.
+ Then we sync, then we switch the offset in the header, then we sync again.
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1273481788
+This would not allow arbitrary changes to the database, such as tdb_repack
+ does, and would require more space (since we have to preserve the current
+ and future entries at once).
+ If we used hash trees rather than one big hash table, we might only have
+ to rewrite some sections of the hash, too.
+\change_inserted 0 1273481854
+
+\end_layout
+
+\begin_layout Standard
+
+\change_inserted 0 1273482102
+a1993 2
+
+\change_inserted 0 1273482061
+a1998 2
+
+\change_inserted 0 1273482063
+a2002 2
+
+\change_inserted 0 1273482072
+a2006 2
+
+\change_inserted 0 1273482139
+a2011 2
+
+\change_inserted 0 1273482364
+a2015 2
+
+\change_inserted 0 1273482163
+a2019 2
+
+\change_inserted 0 1273482493
+a2037 2
+
+\change_inserted 0 1273482536
+a2046 2
+\change_unchanged
+
+a2049 2
+
+\change_inserted 0 1273482641
+a2058 2
+
+\change_inserted 0 1273481827
+d2067 2
+a2068 11
+We could 
+\change_inserted 0 1273481829
+then 
+\change_unchanged
+implement snapshots using a similar method
+\change_deleted 0 1273481838
+ to the above, only
+\change_inserted 0 1273481840
+,
+\change_unchanged
+ using multiple different hash tables/free tables.
+@
+
+
+1.2
+log
+@After first feedback (Ronnie & Volker)
+@
+text
+@d1314 13
+d1531 11
+a1541 1
+The free list should be split into multiple lists to reduce contention.
+d1547 39
+d1596 7
+d1604 1
+a1604 1
+The algorithm for freeing is simple:
+d1608 7
+a1614 1
+Identify the correct free list.
+d1618 30
+a1647 1
+Lock the list, and place the freed entry at the head.
+d1651 7
+a1657 2
+Allocation is a little more complicated, as we merge entries as we walk
+ the list:
+d1661 19
+a1679 1
+Pick a free list; either the list we last freed onto, or based on a 
+d1691 17
+a1707 1
+Lock that list.
+d1711 7
+a1717 1
+If the top entry is well-sized, remove it from the list and return it.
+d1721 5
+a1725 1
+Otherwise, examine the entry to the right of it in the file.
+d1731 2
+d1737 2
+d1743 2
+d1749 2
+d1756 8
+d1765 2
+d1770 2
+d1773 2
+d1778 7
+a1784 1
+If no list satisfies, expand the file.
+d1788 28
+a1815 2
+This optimizes rapid insert/delete of free list entries, and allows us to
+ get rid of the tailer altogether.
+d1819 2
+d1851 1
+a1851 1
+\change_inserted 0 1272941474
+d1857 303
+a2159 18
+\change_inserted 0 1272942759
+There are various ways to organize these lists, but because we want to be
+ able to quickly identify which free list an entry is in, and reduce the
+ number of locks required for merging, we will use zoning (eg.
+ each of the N free lists in a tdb file of size M covers a fixed fraction
+ M/N).
+ Note that this means we need to reshuffle the free lists when we expand
+ the file; this is probably acceptable when we double the hash table size,
+ since that is such an expensive operation already.
+ In the case of increasing the file size, there is an optimization we can
+ use: if we use M in the formula above as the file size rounded up to the
+ next power of 2, we only need reshuffle free lists when the file size crosses
+ a power of 2 boundary, 
+\emph on
+and 
+\emph default
+reshuffling the free lists is trivial: we simply merge every consecutive
+ pair of free lists.
+d2164 107
+d2276 2
+d2280 59
+d2346 2
+d2363 2
+d2366 2
+d2371 2
+d2382 2
+d2389 57
+d2458 13
+d2474 32
+a2505 2
+We could implement snapshots using a similar method to the above, only using
+ multiple different hash tables/free tables.
+@
+
+
+1.1
+log
+@Initial revision
+@
+text
+@d1 1
+a1 1
+#LyX 1.6.4 created this file. For more info see http://www.lyx.org/
+d36 3
+a38 3
+\tracking_changes false
+\output_changes false
+\author "" 
+d662 5
+a666 1
+ behavior of disallowing transactions should become the default.
+d1215 21
+d1527 2
+d1533 3
+a1535 1
+ The algorithm for freeing is simple:
+d1642 26
+@

BIN
ccan/tdb2/doc/design.pdf


+ 1058 - 0
ccan/tdb2/doc/design.txt

@@ -0,0 +1,1058 @@
+TDB2: Redesigning The Trivial DataBase
+
+Rusty Russell, IBM Corporation
+
+26-July-2010
+
+Abstract
+
+The Trivial DataBase on-disk format is 32 bits; with usage cases 
+heading towards the 4G limit, that must change. This required 
+breakage provides an opportunity to revisit TDB's other design 
+decisions and reassess them.
+
+1 Introduction
+
+The Trivial DataBase was originally written by Andrew Tridgell as 
+a simple key/data pair storage system with the same API as dbm, 
+but allowing multiple readers and writers while being small 
+enough (< 1000 lines of C) to include in SAMBA. The simple design 
+created in 1999 has proven surprisingly robust and performant, 
+used in Samba versions 3 and 4 as well as numerous other 
+projects. Its useful life was greatly increased by the 
+(backwards-compatible!) addition of transaction support in 2005.
+
+The wider variety and greater demands of TDB-using code has led 
+to some organic growth of the API, as well as some compromises on 
+the implementation. None of these, by themselves, are seen as 
+show-stoppers, but the cumulative effect is a loss of elegance 
+over the initial, simple TDB implementation. Here is a table of 
+the approximate number of lines of implementation code and number 
+of API functions at the end of each year:
+
+
++-----------+----------------+--------------------------------+
+| Year End  | API Functions  | Lines of C Code Implementation |
++-----------+----------------+--------------------------------+
++-----------+----------------+--------------------------------+
+|   1999    |      13        |              1195              |
++-----------+----------------+--------------------------------+
+|   2000    |      24        |              1725              |
++-----------+----------------+--------------------------------+
+|   2001    |      32        |              2228              |
++-----------+----------------+--------------------------------+
+|   2002    |      35        |              2481              |
++-----------+----------------+--------------------------------+
+|   2003    |      35        |              2552              |
++-----------+----------------+--------------------------------+
+|   2004    |      40        |              2584              |
++-----------+----------------+--------------------------------+
+|   2005    |      38        |              2647              |
++-----------+----------------+--------------------------------+
+|   2006    |      52        |              3754              |
++-----------+----------------+--------------------------------+
+|   2007    |      66        |              4398              |
++-----------+----------------+--------------------------------+
+|   2008    |      71        |              4768              |
++-----------+----------------+--------------------------------+
+|   2009    |      73        |              5715              |
++-----------+----------------+--------------------------------+
+
+
+This review is an attempt to catalog and address all the known 
+issues with TDB and create solutions which address the problems 
+without significantly increasing complexity; all involved are far 
+too aware of the dangers of second system syndrome in rewriting a 
+successful project like this.
+
+2 API Issues
+
+2.1 tdb_open_ex Is Not Expandable
+
+The tdb_open() call was expanded to tdb_open_ex(), which added an 
+optional hashing function and an optional logging function 
+argument. Additional arguments to open would require the 
+introduction of a tdb_open_ex2 call etc.
+
+2.1.1 Proposed Solution
+
+tdb_open() will take a linked-list of attributes:
+
+enum tdb_attribute {
+
+    TDB_ATTRIBUTE_LOG = 0,
+
+    TDB_ATTRIBUTE_HASH = 1
+
+};
+
+struct tdb_attribute_base {
+
+    enum tdb_attribute attr;
+
+    union tdb_attribute *next;
+
+};
+
+struct tdb_attribute_log {
+
+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG 
+*/
+
+    tdb_log_func log_fn;
+
+    void *log_private;
+
+};
+
+struct tdb_attribute_hash {
+
+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH 
+*/
+
+    tdb_hash_func hash_fn;
+
+    void *hash_private;
+
+};
+
+union tdb_attribute {
+
+    struct tdb_attribute_base base;
+
+    struct tdb_attribute_log log;
+
+    struct tdb_attribute_hash hash;
+
+};
+
+This allows future attributes to be added, even if this expands 
+the size of the union.
+
+2.2 tdb_traverse Makes Impossible Guarantees
+
+tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, 
+and it was thought that it was important to guarantee that all 
+records which exist at the start and end of the traversal would 
+be included, and no record would be included twice.
+
+This adds complexity (see [Reliable-Traversal-Adds]) and does not 
+work anyway for records which are altered (in particular, those 
+which are expanded may be effectively deleted and re-added behind 
+the traversal).
+
+2.2.1 <traverse-Proposed-Solution>Proposed Solution
+
+Abandon the guarantee. You will see every record if no changes 
+occur during your traversal, otherwise you will see some subset. 
+You can prevent changes by using a transaction or the locking 
+API.
+
+2.3 Nesting of Transactions Is Fraught
+
+TDB has alternated between allowing nested transactions and not 
+allowing them. Various paths in the Samba codebase assume that 
+transactions will nest, and in a sense they can: the operation is 
+only committed to disk when the outer transaction is committed. 
+There are two problems, however:
+
+1. Canceling the inner transaction will cause the outer 
+  transaction commit to fail, and will not undo any operations 
+  since the inner transaction began. This problem is soluble with 
+  some additional internal code.
+
+2. An inner transaction commit can be cancelled by the outer 
+  transaction. This is desirable in the way which Samba's 
+  database initialization code uses transactions, but could be a 
+  surprise to any users expecting a successful transaction commit 
+  to expose changes to others.
+
+The current solution is to specify the behavior at tdb_open(), 
+with the default currently that nested transactions are allowed. 
+This flag can also be changed at runtime.
+
+2.3.1 Proposed Solution
+
+Given the usage patterns, it seems that the “least-surprise” 
+behavior of disallowing nested transactions should become the 
+default. Additionally, it seems the outer transaction is the only 
+code which knows whether inner transactions should be allowed, so 
+a flag to indicate this could be added to tdb_transaction_start. 
+However, this behavior can be simulated with a wrapper which uses 
+tdb_add_flags() and tdb_remove_flags(), so the API should not be 
+expanded for this relatively-obscure case.
+
+2.4 Incorrect Hash Function is Not Detected
+
+tdb_open_ex() allows the calling code to specify a different hash 
+function to use, but does not check that all other processes 
+accessing this tdb are using the same hash function. The result 
+is that records are missing from tdb_fetch().
+
+2.4.1 Proposed Solution
+
+The header should contain an example hash result (eg. the hash of 
+0xdeadbeef), and tdb_open_ex() should check that the given hash 
+function produces the same answer, or fail the tdb_open call.
+
+2.5 tdb_set_max_dead/TDB_VOLATILE Expose Implementation
+
+In response to scalability issues with the free list ([TDB-Freelist-Is]
+) two API workarounds have been incorporated in TDB: 
+tdb_set_max_dead() and the TDB_VOLATILE flag to tdb_open. The 
+latter actually calls the former with an argument of “5”.
+
+This code allows deleted records to accumulate without putting 
+them in the free list. On delete we iterate through each chain 
+and free them in a batch if there are more than max_dead entries. 
+These are never otherwise recycled except as a side-effect of a 
+tdb_repack.
+
+2.5.1 Proposed Solution
+
+With the scalability problems of the freelist solved, this API 
+can be removed. The TDB_VOLATILE flag may still be useful as a 
+hint that store and delete of records will be at least as common 
+as fetch in order to allow some internal tuning, but initially 
+will become a no-op.
+
+2.6 <TDB-Files-Cannot>TDB Files Cannot Be Opened Multiple Times 
+  In The Same Process
+
+No process can open the same TDB twice; we check and disallow it. 
+This is an unfortunate side-effect of fcntl locks, which operate 
+on a per-file rather than per-file-descriptor basis, and do not 
+nest. Thus, closing any file descriptor on a file clears all the 
+locks obtained by this process, even if they were placed using a 
+different file descriptor!
+
+Note that even if this were solved, deadlock could occur if 
+operations were nested: this is a more manageable programming 
+error in most cases.
+
+2.6.1 Proposed Solution
+
+We could lobby POSIX to fix the perverse rules, or at least lobby 
+Linux to violate them so that the most common implementation does 
+not have this restriction. This would be a generally good idea 
+for other fcntl lock users.
+
+Samba uses a wrapper which hands out the same tdb_context to 
+multiple callers if this happens, and does simple reference 
+counting. We should do this inside the tdb library, which already 
+emulates lock nesting internally; it would need to recognize when 
+deadlock occurs within a single process. This would create a new 
+failure mode for tdb operations (while we currently handle 
+locking failures, they are impossible in normal use and a process 
+encountering them can do little but give up).
+
+I do not see benefit in an additional tdb_open flag to indicate 
+whether re-opening is allowed, though there may be some 
+benefit to adding a call to detect when a tdb_context is shared, 
+to allow others to create such an API.
+
+2.7 TDB API Is Not POSIX Thread-safe
+
+The TDB API uses an error code which can be queried after an 
+operation to determine what went wrong. This programming model 
+does not work with threads, unless specific additional guarantees 
+are given by the implementation. In addition, even 
+otherwise-independent threads cannot open the same TDB (as in [TDB-Files-Cannot]
+).
+
+2.7.1 Proposed Solution
+
+Rearchitecting the API to include a tdb_errcode pointer would be a 
+great deal of churn; we are better to guarantee that the 
+tdb_errcode is per-thread so the current programming model can be 
+maintained.
+
+This requires dynamic per-thread allocations, which is awkward 
+with POSIX threads (pthread_key_create space is limited and we 
+cannot simply allocate a key for every TDB).
+
+Internal locking is required to make sure that fcntl locks do not 
+overlap between threads, and also that the global list of tdbs is 
+maintained.
+
+The aim is that building tdb with -DTDB_PTHREAD will result in a 
+pthread-safe version of the library, and otherwise no overhead 
+will exist.
+
+2.8 *_nonblock Functions And *_mark Functions Expose 
+  Implementation
+
+CTDB[footnote:
+Clustered TDB, see http://ctdb.samba.org
+] wishes to operate on TDB in a non-blocking manner. This is 
+currently done as follows:
+
+1. Call the _nonblock variant of an API function (eg. 
+  tdb_lockall_nonblock). If this fails:
+
+2. Fork a child process, and wait for it to call the normal 
+  variant (eg. tdb_lockall).
+
+3. If the child succeeds, call the _mark variant to indicate we 
+  already have the locks (eg. tdb_lockall_mark).
+
+4. Upon completion, tell the child to release the locks (eg. 
+  tdb_unlockall).
+
+5. Indicate to tdb that it should consider the locks removed (eg. 
+  tdb_unlockall_mark).
+
+There are several issues with this approach. Firstly, adding two 
+new variants of each function clutters the API for an obscure 
+use, and so not all functions have three variants. Secondly, it 
+assumes that all paths of the functions ask for the same locks, 
+otherwise the parent process will have to get a lock which the 
+child doesn't have under some circumstances. I don't believe this 
+is currently the case, but it constrains the implementation. 
+
+2.8.1 <Proposed-Solution-locking-hook>Proposed Solution
+
+Implement a hook for locking methods, so that the caller can 
+control the calls to create and remove fcntl locks. In this 
+scenario, ctdbd would operate as follows:
+
+1. Call the normal API function, eg tdb_lockall().
+
+2. When the lock callback comes in, check if the child has the 
+  lock. Initially, this is always false. If so, return 0. 
+  Otherwise, try to obtain it in non-blocking mode. If that 
+  fails, return EWOULDBLOCK.
+
+3. Release locks in the unlock callback as normal.
+
+4. If tdb_lockall() fails, see if we recorded a lock failure; if 
+  so, call the child to repeat the operation.
+
+5. The child records what locks it obtains, and returns that 
+  information to the parent.
+
+6. When the child has succeeded, goto 1.
+
+This is flexible enough to handle any potential locking scenario, 
+even when lock requirements change. It can be optimized so that 
+the parent does not release locks, just tells the child which 
+locks it doesn't need to obtain.
+
+It also keeps the complexity out of the API, and in ctdbd where 
+it is needed.
+
+2.9 tdb_chainlock Functions Expose Implementation
+
+tdb_chainlock locks some number of records, including the record 
+indicated by the given key. This gave atomicity guarantees; 
+no-one can start a transaction, alter, read or delete that key 
+while the lock is held.
+
+It also makes the same guarantee for any other key in the chain, 
+which is an internal implementation detail and potentially a 
+cause for deadlock.
+
+2.9.1 Proposed Solution
+
+None. It would be nice to have an explicit single entry lock 
+which affected no other keys. Unfortunately, this won't work for 
+an entry which doesn't exist. Thus while chainlock may be 
+implemented more efficiently for the existing case, it will still 
+have overlap issues with the non-existing case. So it is best to 
+keep the current (lack of) guarantee about which records will be 
+affected to avoid constraining our implementation.
+
+2.10 Signal Handling is Not Race-Free
+
+The tdb_setalarm_sigptr() call allows the caller's signal handler 
+to indicate that the tdb locking code should return with a 
+failure, rather than trying again when a signal is received (and 
+errno == EAGAIN). This is usually used to implement timeouts.
+
+Unfortunately, this does not work in the case where the signal is 
+received before the tdb code enters the fcntl() call to place the 
+lock: the code will sleep within the fcntl() code, unaware that 
+the signal wants it to exit. In the case of long timeouts, this 
+does not happen in practice.
+
+2.10.1 Proposed Solution
+
+The locking hooks proposed in [Proposed-Solution-locking-hook] 
+would allow the user to decide on whether to fail the lock 
+acquisition on a signal. This allows the caller to choose their 
+own compromise: they could narrow the race by checking 
+immediately before the fcntl call.[footnote:
+It may be possible to make this race-free in some implementations 
+by having the signal handler alter the struct flock to make it 
+invalid. This will cause the fcntl() lock call to fail with 
+EINVAL if the signal occurs before the kernel is entered, 
+otherwise EAGAIN.
+]
+
+2.11 The API Uses Gratuitous Typedefs, Capitals
+
+typedefs are useful for providing source compatibility when types 
+can differ across implementations, or arguably in the case of 
+function pointer definitions which are hard for humans to parse. 
+Otherwise it is simply obfuscation and pollutes the namespace.
+
+Capitalization is usually reserved for compile-time constants and 
+macros.
+
+  TDB_CONTEXT There is no reason to use this over 'struct 
+  tdb_context'; the definition isn't visible to the API user 
+  anyway.
+
+  TDB_DATA There is no reason to use this over struct TDB_DATA; 
+  the struct needs to be understood by the API user.
+
+  struct TDB_DATA This would normally be called 'struct 
+  tdb_data'.
+
+  enum TDB_ERROR Similarly, this would normally be enum 
+  tdb_error.
+
+2.11.1 Proposed Solution
+
+None. Introducing lower case variants would please pedants like 
+myself, but if it were done the existing ones should be kept. 
+There is little point forcing a purely cosmetic change upon tdb 
+users.
+
+2.12 <tdb_log_func-Doesnt-Take>tdb_log_func Doesn't Take The 
+  Private Pointer
+
+For API compatibility reasons, the logging function needs to call 
+tdb_get_logging_private() to retrieve the pointer registered by 
+the tdb_open_ex for logging.
+
+2.12.1 Proposed Solution
+
+It should simply take an extra argument, since we are prepared to 
+break the API/ABI.
+
+2.13 Various Callback Functions Are Not Typesafe
+
+The callback functions in tdb_set_logging_function (after [tdb_log_func-Doesnt-Take]
+ is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read 
+and tdb_check all take void * and must internally convert it to 
+the argument type they were expecting.
+
+If this type changes, the compiler will not produce warnings on 
+the callers, since it only sees void *.
+
+2.13.1 Proposed Solution
+
+With careful use of macros, we can create callback functions 
+which give a warning when used on gcc and the types of the 
+callback and its private argument differ. Unsupported compilers 
+will not give a warning, which is no worse than now. In addition, 
+the callbacks become clearer, as they need not use void * for 
+their parameter.
+
+See CCAN's typesafe_cb module at 
+http://ccan.ozlabs.org/info/typesafe_cb.html
+
+2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, 
+  tdb_reopen_all Problematic
+
+The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB 
+file should be cleared if the caller discovers it is the only 
+process with the TDB open. However, if any caller does not 
+specify TDB_CLEAR_IF_FIRST it will not be detected, so will have 
+the TDB erased underneath them (usually resulting in a crash).
+
+There is a similar issue on fork(); if the parent exits (or 
+otherwise closes the tdb) before the child calls tdb_reopen_all() 
+to establish the lock used to indicate the TDB is opened by 
+someone, a TDB_CLEAR_IF_FIRST opener at that moment will believe 
+it alone has opened the TDB and will erase it.
+
+2.14.1 Proposed Solution
+
+Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but 
+see [TDB_CLEAR_IF_FIRST-Imposes-Performance].
+
+3 Performance And Scalability Issues
+
+3.1 <TDB_CLEAR_IF_FIRST-Imposes-Performance>TDB_CLEAR_IF_FIRST 
+  Imposes Performance Penalty
+
+When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is 
+placed at offset 4 (aka. the ACTIVE_LOCK). While these locks 
+never conflict in normal tdb usage, they do add substantial 
+overhead for most fcntl lock implementations when the kernel 
+scans to detect if a lock conflict exists. This is often a single 
+linked list, making the time to acquire and release a fcntl lock 
+O(N) where N is the number of processes with the TDB open, not 
+the number actually doing work.
+
+In a Samba server it is common to have huge numbers of clients 
+sitting idle, and thus they have weaned themselves off the 
+TDB_CLEAR_IF_FIRST flag.[footnote:
+There is a flag to tdb_reopen_all() which is used for this 
+optimization: if the parent process will outlive the child, the 
+child does not need the ACTIVE_LOCK. This is a workaround for 
+this very performance issue.
+]
+
+3.1.1 Proposed Solution
+
+Remove the flag. It was a neat idea, but even trivial servers 
+tend to know when they are initializing for the first time and 
+can simply unlink the old tdb at that point.
+
+3.2 TDB Files Have a 4G Limit
+
+This seems to be becoming an issue (so much for “trivial”!), 
+particularly for ldb.
+
+3.2.1 Proposed Solution
+
+A new, incompatible TDB format which uses 64 bit offsets 
+internally rather than 32 bit as now. For simplicity of endian 
+conversion (which TDB does on the fly if required), all values 
+will be 64 bit on disk. In practice, some upper bits may be used 
+for other purposes, but at least 56 bits will be available for 
+file offsets.
+
+tdb_open() will automatically detect the old version, and even 
+create them if TDB_VERSION6 is specified to tdb_open.
+
+32 bit processes will still be able to access TDBs larger than 4G 
+(assuming that their off_t allows them to seek to 64 bits), they 
+will gracefully fall back as they fail to mmap. This can happen 
+already with large TDBs.
+
+Old versions of tdb will fail to open the new TDB files (since 28 
+August 2009, commit 398d0c29290: prior to that any unrecognized 
+file format would be erased and initialized as a fresh tdb!)
+
+3.3 TDB Records Have a 4G Limit
+
+This has not been a reported problem, and the API uses size_t 
+which can be 64 bit on 64 bit platforms. However, other limits 
+may have made such an issue moot.
+
+3.3.1 Proposed Solution
+
+Record sizes will be 64 bit, with an error returned on 32 bit 
+platforms which try to access such records (the current 
+implementation would return TDB_ERR_OOM in a similar case). It 
+seems unlikely that 32 bit keys will be a limitation, so the 
+implementation may not support this (see [sub:Records-Incur-A]).
+
+3.4 Hash Size Is Determined At TDB Creation Time
+
+TDB contains a number of hash chains in the header; the number is 
+specified at creation time, and defaults to 131. This is such a 
+bottleneck on large databases (as each hash chain gets quite 
+long), that LDB uses 10,000 for this hash. In general it is 
+impossible to know what the 'right' answer is at database 
+creation time.
+
+3.4.1 Proposed Solution
+
+After comprehensive performance testing on various scalable hash 
+variants[footnote:
+http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 
+This was annoying because I was previously convinced that an 
+expanding tree of hashes would be very close to optimal.
+], it became clear that it is hard to beat a straight linear hash 
+table which doubles in size when it reaches saturation. There are 
+three details which become important:
+
+1. On encountering a full bucket, we use the next bucket.
+
+2. Extra hash bits are stored with the offset, to reduce 
+  comparisons.
+
+3. A marker entry is used on deleting an entry.
+
+The doubling of the table must be done under a transaction; we 
+will not reduce it on deletion, so it will be an unusual case. It 
+will either be placed at the head (other entries will be moved 
+out of the way so we can expand), or we could have a pointer in 
+the header to the current hashtable location, but that pointer 
+would have to be read frequently to check for hashtable moves.
+
+The locking for this is slightly more complex than the chained 
+case; we currently have one lock per bucket, and that means we 
+would need to expand the lock if we overflow to the next bucket. 
+The frequency of such collisions will affect our locking 
+heuristics: we can always lock more buckets than we need.
+
+One possible optimization is to only re-check the hash size on an 
+insert or a lookup miss.
+
+3.5 <TDB-Freelist-Is>TDB Freelist Is Highly Contended
+
+TDB uses a single linked list for the free list. Allocation 
+occurs as follows, using heuristics which have evolved over time:
+
+1. Get the free list lock for this whole operation.
+
+2. Multiply length by 1.25, so we always over-allocate by 25%.
+
+3. Set the slack multiplier to 1.
+
+4. Examine the current freelist entry: if it is > length but < 
+  the current best case, remember it as the best case.
+
+5. Multiply the slack multiplier by 1.05.
+
+6. If our best fit so far is less than length * slack multiplier, 
+  return it. The slack will be turned into a new free record if 
+  it's large enough.
+
+7. Otherwise, go onto the next freelist entry.
+
+Deleting a record occurs as follows:
+
+1. Lock the hash chain for this whole operation.
+
+2. Walk the chain to find the record, keeping the prev pointer 
+  offset.
+
+3. If max_dead is non-zero:
+
+  (a) Walk the hash chain again and count the dead records.
+
+  (b) If it's more than max_dead, bulk free all the dead ones 
+    (similar to steps 4 and below, but the lock is only obtained 
+    once).
+
+  (c) Simply mark this record as dead and return. 
+
+4. Get the free list lock for the remainder of this operation.
+
+5. <right-merging>Examine the following block to see if it is 
+  free; if so, enlarge the current block and remove that block 
+  from the free list. This was disabled, as removal from the free 
+  list was O(entries-in-free-list).
+
+6. Examine the preceding block to see if it is free: for this 
+  reason, each block has a 32-bit tailer which indicates its 
+  length. If it is free, expand it to cover our new block and 
+  return.
+
+7. Otherwise, prepend ourselves to the free list.
+
+Disabling right-merging (step [right-merging]) causes 
+fragmentation; the other heuristics proved insufficient to 
+address this, so the final answer to this was that when we expand 
+the TDB file inside a transaction commit, we repack the entire 
+tdb.
+
+The single list lock limits our allocation rate; due to the other 
+issues this is not currently seen as a bottleneck.
+
+3.5.1 Proposed Solution
+
+The first step is to remove all the current heuristics, as they 
+obviously interact, then examine them once the lock contention is 
+addressed.
+
+The free list must be split to reduce contention. Assuming 
+perfect free merging, we can at most have 1 free list entry for 
+each entry. This implies that the number of free lists is related 
+to the size of the hash table, but as it is rare to walk a large 
+number of free list entries we can use far fewer, say 1/32 of the 
+number of hash buckets.
+
+There are various benefits in using per-size free lists (see [sub:TDB-Becomes-Fragmented]
+) but it's not clear this would reduce contention in the common 
+case where all processes are allocating/freeing the same size. 
+Thus we almost certainly need to divide in other ways: the most 
+obvious is to divide the file into zones, and using a free list 
+(or set of free lists) for each. This approximates address 
+ordering.
+
+Note that this means we need to split the free lists when we 
+expand the file; this is probably acceptable when we double the 
+hash table size, since that is such an expensive operation 
+already. In the case of increasing the file size, there is an 
+optimization we can use: if each of the N zones covers a fixed 
+fraction M/N of the file, where M is the file size rounded up to 
+the next power of 2, we only need to reshuffle free lists when 
+the file size crosses a power of 2 boundary, and reshuffling the 
+free lists is trivial: we simply merge every consecutive pair of 
+free lists.
+
+The basic algorithm is as follows. Freeing is simple:
+
+1. Identify the correct zone.
+
+2. Lock the corresponding list.
+
+3. Re-check the zone (we didn't have a lock, sizes could have 
+  changed): relock if necessary.
+
+4. Place the freed entry in the list for that zone.
+
+Allocation is a little more complicated, as we perform delayed 
+coalescing at this point:
+
+1. Pick a zone: either the zone we last freed into, or one based 
+  on a “random” number.
+
+2. Lock the corresponding list.
+
+3. Re-check the zone: relock if necessary.
+
+4. If the top entry is large enough, remove it from the list and 
+  return it.
+
+5. Otherwise, coalesce entries in the list. If there was no entry 
+  large enough, unlock the list and try the next zone.
+
+6. If no zone satisfies, expand the file.
+
+This optimizes rapid insert/delete of free list entries by not 
+coalescing them all the time. First-fit address ordering 
+seems to be fairly good for keeping fragmentation low 
+(see [sub:TDB-Becomes-Fragmented]). Note that address ordering 
+does not need a tailer to coalesce, though if we needed one we 
+could have one cheaply: see [sub:Records-Incur-A]. 
+
+I anticipate that the number of entries in each free zone would 
+be small, but it might be worth using one free entry to hold 
+pointers to the others for cache efficiency.
+
+3.6 <sub:TDB-Becomes-Fragmented>TDB Becomes Fragmented
+
+Much of this is a result of allocation strategy[footnote:
+The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995 
+ftp://ftp.cs.utexas.edu/pub/garbage/malloc/ismm98.ps
+] and deliberate hobbling of coalescing; internal fragmentation 
+(aka overallocation) is deliberately set at 25%, and external 
+fragmentation is only cured by the decision to repack the entire 
+db when a transaction commit needs to enlarge the file.
+
+3.6.1 Proposed Solution
+
+The 25% overhead on allocation works in practice for ldb because 
+indexes tend to expand by one record at a time. This internal 
+fragmentation can be resolved by having an “expanded” bit in the 
+header to note entries that have previously expanded, and 
+allocating more space for them.
+
+There is a spectrum of possible solutions for external 
+fragmentation: one is to use a fragmentation-avoiding allocation 
+strategy such as best-fit address-order allocator. The other end 
+of the spectrum would be to use a bump allocator (very fast and 
+simple) and simply repack the file when we reach the end.
+
+There are three problems with efficient fragmentation-avoiding 
+allocators: they are non-trivial, they tend to use a single free 
+list for each size, and there's no evidence that tdb allocation 
+patterns will match those recorded for general allocators (though 
+it seems likely).
+
+Thus we don't spend too much effort on external fragmentation; we 
+will be no worse than the current code if we need to repack on 
+occasion. More effort is spent on reducing freelist contention, 
+and reducing overhead.
+
+3.7 <sub:Records-Incur-A>Records Incur A 28-Byte Overhead
+
+Each TDB record has a header as follows:
+
+struct tdb_record {
+
+        tdb_off_t next; /* offset of the next record in the list 
+*/
+
+        tdb_len_t rec_len; /* total byte length of record */
+
+        tdb_len_t key_len; /* byte length of key */
+
+        tdb_len_t data_len; /* byte length of data */
+
+        uint32_t full_hash; /* the full 32 bit hash of the key */
+
+        uint32_t magic;   /* try to catch errors */
+
+        /* the following union is implied:
+
+                union {
+
+                        char record[rec_len];
+
+                        struct {
+
+                                char key[key_len];
+
+                                char data[data_len];
+
+                        }
+
+                        uint32_t totalsize; (tailer)
+
+                }
+
+        */
+
+};
+
+Naively, this would double to a 56-byte overhead on a 64 bit 
+implementation.
+
+3.7.1 Proposed Solution
+
+We can use various techniques to reduce this for an allocated 
+block:
+
+1. The 'next' pointer is not required, as we are using a flat 
+  hash table.
+
+2. 'rec_len' can instead be expressed as an addition to key_len 
+  and data_len (it accounts for wasted or overallocated length in 
+  the record). Since the record length is always a multiple of 8, 
+  we can conveniently fit it in 32 bits (representing up to 35 
+  bits).
+
+3. 'key_len' and 'data_len' can be reduced. I'm unwilling to 
+  restrict 'data_len' to 32 bits, but instead we can combine the 
+  two into one 64-bit field and use a 5 bit value which 
+  indicates at what bit to divide the two. Keys are unlikely to 
+  scale as fast as data, so I'm assuming a maximum key size of 32 
+  bits.
+
+4. 'full_hash' is used to avoid a memcmp on the “miss” case, but 
+  this is diminishing returns after a handful of bits (at 10 
+  bits, it reduces 99.9% of false memcmp). As an aside, as the 
+  lower bits are already incorporated in the hash table 
+  resolution, the upper bits should be used here.
+
+5. 'magic' does not need to be enlarged: it currently reflects 
+  one of 5 values (used, free, dead, recovery, and 
+  unused_recovery). It is useful for quick sanity checking 
+  however, and should not be eliminated.
+
+6. 'tailer' is only used to coalesce free blocks (so a block to 
+  the right can find the header to check if this block is free). 
+  This can be replaced by a single 'free' bit in the header of 
+  the following block (and the tailer only exists in free 
+  blocks).[footnote:
+This technique from Thomas Standish. Data Structure Techniques. 
+Addison-Wesley, Reading, Massachusetts, 1980.
+] The current proposed coalescing algorithm doesn't need this, 
+  however.
+
+This produces a 16 byte used header like this:
+
+struct tdb_used_record {
+
+        uint32_t magic : 16,
+
+                 prev_is_free: 1,
+
+                 key_data_divide: 5,
+
+                 top_hash: 10;
+
+        uint32_t extra_octets;
+
+        uint64_t key_and_data_len;
+
+};
+
+And a free record like this:
+
+struct tdb_free_record {
+
+        uint32_t free_magic;
+
+        uint64_t total_length;
+
+        ...
+
+        uint64_t tailer;
+
+};
+
+
+
+3.8 Transaction Commit Requires 4 fdatasync
+
+The current transaction algorithm is:
+
+1. write_recovery_data();
+
+2. sync();
+
+3. write_recovery_header();
+
+4. sync();
+
+5. overwrite_with_new_data();
+
+6. sync();
+
+7. remove_recovery_header();
+
+8. sync(); 
+
+On current ext3, each sync flushes all data to disk, so the next 
+3 syncs are relatively expensive. But this could become a 
+performance bottleneck on other filesystems such as ext4.
+
+3.8.1 Proposed Solution
+
+Neil Brown points out that this is overzealous, and only one sync 
+is needed:
+
+1. Bundle the recovery data, a transaction counter and a strong 
+  checksum of the new data.
+
+2. Strong checksum that whole bundle.
+
+3. Store the bundle in the database.
+
+4. Overwrite the oldest of the two recovery pointers in the 
+  header (identified using the transaction counter) with the 
+  offset of this bundle.
+
+5. sync.
+
+6. Write the new data to the file.
+
+Checking for recovery means identifying the latest bundle with a 
+valid checksum and using the new data checksum to ensure that it 
+has been applied. This is more expensive than the current check, 
+but need only be done at open. For running databases, a separate 
+header field can be used to indicate a transaction in progress; 
+we need only check for recovery if this is set.
+
+3.9 <sub:TDB-Does-Not>TDB Does Not Have Snapshot Support
+
+3.9.1 Proposed Solution
+
+None. At some point you say “use a real database”.
+
+But as a thought experiment, if we implemented transactions to 
+only overwrite free entries (this is tricky: there must not be a 
+header in each entry which indicates whether it is free; instead 
+freeness must be implied by presence in metadata elsewhere), and a pointer to the hash 
+table, we could create an entirely new commit without destroying 
+existing data. Then it would be easy to implement snapshots in a 
+similar way.
+
+This would not allow arbitrary changes to the database, such as 
+tdb_repack does, and would require more space (since we have to 
+preserve the current and future entries at once). If we used hash 
+trees rather than one big hash table, we might only have to 
+rewrite some sections of the hash, too.
+
+We could then implement snapshots using a similar method, using 
+multiple different hash tables/free tables.
+
+3.10 Transactions Cannot Operate in Parallel
+
+This would be useless for ldb, as it hits the index records with 
+just about every update. It would add significant complexity in 
+resolving clashes, and cause all transaction callers to write 
+their code to loop in the case where the transactions spuriously 
+failed.
+
+3.10.1 Proposed Solution
+
+We could solve a small part of the problem by providing read-only 
+transactions. These would allow one write transaction to begin, 
+but it could not commit until all r/o transactions are done. This 
+would require a new RO_TRANSACTION_LOCK, which would be upgraded 
+on commit.
+
+3.11 Default Hash Function Is Suboptimal
+
+The Knuth-inspired multiplicative hash used by tdb is fairly slow 
+(especially if we expand it to 64 bits), and works best when the 
+hash bucket size is a prime number (which also means a slow 
+modulus). In addition, it is highly predictable which could 
+potentially lead to a Denial of Service attack in some TDB uses.
+
+3.11.1 Proposed Solution
+
+The Jenkins lookup3 hash[footnote:
+http://burtleburtle.net/bob/c/lookup3.c
+] is a fast and superbly-mixing hash. It's used by the Linux 
+kernel and almost everything else. This has the particular 
+properties that it takes an initial seed, and produces two 32 bit 
+hash numbers, which we can combine into a 64-bit hash.
+
+The seed should be created at tdb-creation time from some random 
+source, and placed in the header. This is far from foolproof, but 
+adds a little bit of protection against hash bombing.
+
+3.12 <Reliable-Traversal-Adds>Reliable Traversal Adds Complexity
+
+We lock a record during traversal iteration, and try to grab that 
+lock in the delete code. If that grab on delete fails, we simply 
+mark it deleted and continue onwards; traversal checks for this 
+condition and does the delete when it moves off the record.
+
+If traversal terminates, the dead record may be left 
+indefinitely.
+
+3.12.1 Proposed Solution
+
+Remove reliability guarantees; see [traverse-Proposed-Solution].
+
+3.13 Fcntl Locking Adds Overhead
+
+Placing a fcntl lock means a system call, as does removing one. 
+This is actually one reason why transactions can be faster 
+(everything is locked once at transaction start). In the 
+uncontended case, this overhead can theoretically be eliminated.
+
+3.13.1 Proposed Solution
+
+None.
+
+We tried this before with spinlock support, in the early days of 
+TDB, and it didn't make much difference except in manufactured 
+benchmarks.
+
+We could use spinlocks (with futex kernel support under Linux), 
+but it means that we lose automatic cleanup when a process dies 
+with a lock. There is a method of auto-cleanup under Linux, but 
+it's not supported by other operating systems. We could 
+reintroduce a clear-if-first-style lock and sweep for dead 
+futexes on open, but that wouldn't help the normal case of one 
+concurrent opener dying. Increasingly elaborate repair schemes 
+could be considered, but they require an ABI change (everyone 
+must use them) anyway, so there's no need to do this at the same 
+time as everything else.
+
+3.14 Some Transactions Don't Require Durability
+
+Volker points out that gencache uses a CLEAR_IF_FIRST tdb for 
+normal (fast) usage, and occasionally empties the results into a 
+transactional TDB. This kind of usage prioritizes performance 
+over durability: as long as we are consistent, data can be lost.
+
+This would be more neatly implemented inside tdb: a “soft” 
+transaction commit (ie. syncless) which meant that data may be 
+reverted on a crash.
+
+3.14.1 Proposed Solution
+
+None.
+
+Unfortunately any transaction scheme which overwrites old data 
+requires a sync before that overwrite to avoid the possibility of 
+corruption.
+
+It seems possible to use a scheme similar to that described in 
+[sub:TDB-Does-Not], where transactions are committed without overwriting existing 
+data, and an array of top-level pointers were available in the 
+header. If the transaction is “soft” then we would not need a 
+sync at all: existing processes would pick up the new hash table 
+and free list and work with that.
+
+At some later point, a sync would allow recovery of the old data 
+into the free lists (perhaps when the array of top-level pointers 
+filled). On crash, tdb_open() would examine the array of top 
+levels, and apply the transactions until it encountered an 
+invalid checksum.
+

+ 710 - 0
ccan/tdb2/free.c

@@ -0,0 +1,710 @@
+ /* 
+   Trivial Database 2: free list/block handling
+   Copyright (C) Rusty Russell 2010
+   
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "private.h"
+#include <ccan/likely/likely.h>
+#include <time.h>
+#include <assert.h>
+#include <limits.h>
+
+/* We have to be able to fit a free record here. */
+#define MIN_DATA_LEN	\
+	(sizeof(struct tdb_free_record) - sizeof(struct tdb_used_record))
+
+/* The file is covered by a series of free lists, one set per "zone".
+ *
+ * Each zone has a run of per-size buckets followed by one last bucket for
+ * anything "too big".
+ *
+ * The free_list_head may be moved, but *only* while the allrecord lock is
+ * held.
+ *
+ * Returns the file offset of the head pointer for free list number 'list':
+ * the lists are laid out as consecutive tdb_off_t slots starting at
+ * header.v.free_off. */
+static tdb_off_t free_list_off(struct tdb_context *tdb, unsigned int list)
+{
+	tdb_off_t base = tdb->header.v.free_off;
+
+	return base + list * sizeof(tdb_off_t);
+}
+
+/* Cheap pseudo-random number mixed from the pid, the wall-clock time and the
+ * address of the context.  As a library we must not play with the caller's
+ * srandom() state, and srandom_r probably lacks portability.  We don't need
+ * very random here. */
+static unsigned int quick_random(struct tdb_context *tdb)
+{
+	unsigned int mix = (unsigned long)tdb;
+
+	mix += time(NULL);
+	mix += getpid();
+	return mix;
+}
+
+/* Choose a pseudo-random zone to start in, so that load is spread out. */
+uint64_t random_free_zone(struct tdb_context *tdb)
+{
+	unsigned int r = quick_random(tdb);
+
+	/* num_zones might be out of date, but can only increase */
+	return r % tdb->header.v.num_zones;
+}
+
+/* Find last (most significant) set bit: returns its 1-based position, so
+ * fls64(1) == 1 and fls64(1ULL << 63) == 64.  Returns 0 when val is 0. */
+static unsigned fls64(uint64_t val)
+{
+	unsigned int shift, pos = 64;
+
+#if HAVE_BUILTIN_CLZL
+	/* This is significantly faster when the value fits in a long! */
+	if (val <= ULONG_MAX) {
+		if (!val)
+			return 0;
+		return sizeof(long) * CHAR_BIT - __builtin_clzl(val);
+	}
+#endif
+	if (!val)
+		return 0;
+
+	/* Binary search from the top: whenever the upper 'shift' bits are
+	 * all clear, slide the value up and lower the answer by as much. */
+	for (shift = 32; shift; shift >>= 1) {
+		if (!(val >> (64 - shift))) {
+			val <<= shift;
+			pos -= shift;
+		}
+	}
+	return pos;
+}
+
+/* Map a record's data length (header not counted) to the free bucket in
+ * which a record of that size is kept.  The result is clamped to
+ * header.v.free_buckets, the final catch-all bucket. */
+unsigned int size_to_bucket(struct tdb_context *tdb, tdb_len_t data_len)
+{
+	unsigned int bucket;
+	tdb_len_t excess;
+
+	/* Nothing smaller than this can exist. */
+	assert(data_len >= MIN_DATA_LEN);
+
+	/* Sizes are measured from the minimum upwards. */
+	excess = data_len - MIN_DATA_LEN;
+
+	if (excess > 64) {
+		/* Beyond that, one bucket per power of 2. */
+		bucket = fls64(excess) + 2;
+	} else {
+		/* 0 in bucket 0, 8 in bucket 1... 64 in bucket 8. */
+		bucket = excess / 8;
+	}
+
+	if (unlikely(bucket > tdb->header.v.free_buckets))
+		bucket = tdb->header.v.free_buckets;
+	return bucket;
+}
+
+/* Which zone holds the block at file offset 'off'?  Zones are
+ * (1 << header.v.zone_bits) bytes each, so this is just a shift.  The
+ * in-memory header must be up to date. */
+tdb_off_t zone_of(struct tdb_context *tdb, tdb_off_t off)
+{
+	tdb_off_t zone;
+
+	assert(tdb->header_uptodate);
+
+	zone = off >> tdb->header.v.zone_bits;
+	return zone;
+}
+
+/* Look, without locking, for a free list worth searching in the current
+ * zone (tdb->last_zone), starting at 'bucket'.
+ *
+ * Returns 'bucket' plus whatever tdb_find_nonzero_off() reports for the
+ * (free_buckets - bucket) list heads starting there.
+ * NOTE(review): presumably that is the index of the first non-zero head, or
+ * the count itself if all are zero (making the result free_buckets, the
+ * catch-all bucket) -- confirm against tdb_find_nonzero_off(). */
+static tdb_off_t find_free_head(struct tdb_context *tdb, tdb_off_t bucket)
+{
+	tdb_off_t start_list, skipped;
+
+	/* Speculatively search for a non-zero bucket. */
+	start_list = tdb->last_zone * (tdb->header.v.free_buckets+1) + bucket;
+	skipped = tdb_find_nonzero_off(tdb, free_list_off(tdb, start_list),
+				       tdb->header.v.free_buckets - bucket);
+	return bucket + skipped;
+}
+
+/* Unlink free record 'r' from the doubly-linked free list number 'list'.
+ * 'r' is an in-memory copy of the record; only its prev/next fields are
+ * used.  Returns 0 on success, -1 if a write fails. */
+static int remove_from_list(struct tdb_context *tdb,
+			    tdb_off_t list, struct tdb_free_record *r)
+{
+	tdb_off_t link;
+
+	/* Whatever pointed at r (the list head if r is at the front,
+	 * otherwise the previous record's 'next') now points at r->next. */
+	if (r->prev != 0)
+		link = r->prev + offsetof(struct tdb_free_record, next);
+	else
+		link = free_list_off(tdb, list);
+
+	if (tdb_write_off(tdb, link, r->next))
+		return -1;
+
+	/* Last in list?  Then there is no back pointer to fix up. */
+	if (r->next == 0)
+		return 0;
+
+	/* r->next->prev = r->prev */
+	link = r->next + offsetof(struct tdb_free_record, prev);
+	if (tdb_write_off(tdb, link, r->prev))
+		return -1;
+
+	return 0;
+}
+
+/* Push the free record 'new', which lives at file offset 'off', onto the
+ * front of free list number 'list', filling in new->prev and new->next and
+ * then writing the record out.  Returns 0 on success, -1 on I/O failure. */
+static int enqueue_in_free(struct tdb_context *tdb,
+			   tdb_off_t list,
+			   tdb_off_t off,
+			   struct tdb_free_record *new)
+{
+	tdb_off_t head = free_list_off(tdb, list);
+
+	/* We become the first entry: nothing before us, old head after. */
+	new->prev = 0;
+	new->next = tdb_read_off(tdb, head);
+	if (new->next == TDB_OFF_ERR)
+		return -1;
+
+	/* Old first entry (if any) must point back at us. */
+	if (new->next != 0) {
+		tdb_off_t back = new->next
+			+ offsetof(struct tdb_free_record, prev);
+
+		if (tdb_write_off(tdb, back, off) != 0)
+			return -1;
+	}
+
+	/* head = new */
+	if (tdb_write_off(tdb, head, off) != 0)
+		return -1;
+
+	return tdb_write_convert(tdb, off, new, sizeof(*new));
+}
+
+/* Turn the len_with_header bytes at file offset 'off' into a free record
+ * and put it on the appropriate free list for its zone and size.
+ *
+ * The list isn't locked by the caller: we take and drop the list lock here.
+ * As a side effect tdb->last_zone is set to the zone containing 'off'.
+ * Returns 0 on success, -1 on failure. */
+int add_free_record(struct tdb_context *tdb,
+		    tdb_off_t off, tdb_len_t len_with_header)
+{
+	struct tdb_free_record rec;
+	tdb_off_t which;
+	int result;
+
+	assert(len_with_header >= sizeof(rec));
+
+	rec.magic = TDB_FREE_MAGIC;
+	/* Lengths are recorded relative to a used record's header. */
+	rec.data_len = len_with_header - sizeof(struct tdb_used_record);
+
+	tdb->last_zone = zone_of(tdb, off);
+	which = tdb->last_zone * (tdb->header.v.free_buckets+1)
+		+ size_to_bucket(tdb, rec.data_len);
+
+	if (tdb_lock_free_list(tdb, which, TDB_LOCK_WAIT) != 0)
+		return -1;
+
+	result = enqueue_in_free(tdb, which, off, &rec);
+	tdb_unlock_free_list(tdb, which);
+	return result;
+}
+
+/* Decide how much of a free block (total_len data bytes at 'off') an
+ * allocation of 'needed' bytes will occupy.  If the surplus is big enough
+ * to hold a free record of its own, it is split off and handed back to the
+ * free lists; otherwise the allocation keeps the whole block.
+ *
+ * *actual receives the data length the allocation ends up with.
+ * Returns 0 on success, -1 if returning the surplus fails. */
+static int to_used_record(struct tdb_context *tdb,
+			  tdb_off_t off,
+			  tdb_len_t needed,
+			  tdb_len_t total_len,
+			  tdb_len_t *actual)
+{
+	tdb_len_t spare = total_len - needed;
+
+	/* Too small to be a free record: keep everything. */
+	if (spare < sizeof(struct tdb_free_record)) {
+		*actual = total_len;
+		return 0;
+	}
+
+	*actual = total_len - spare;
+
+	/* The surplus starts right after our header and data. */
+	if (add_free_record(tdb,
+			    off + sizeof(struct tdb_used_record) + *actual,
+			    spare))
+		return -1;
+
+	return 0;
+}
+
+/* Try to merge the free record at 'off' (data_len bytes of data, sitting on
+ * free list 'list', which the caller holds locked) with any free records
+ * that immediately follow it in the file.
+ *
+ * Returns:
+ *   0  nothing was merged; 'list' is still locked.
+ *   1  neighbours were absorbed and the enlarged record re-added to the
+ *      free lists; 'list' has been unlocked.
+ *  -1  error; 'list' has been unlocked.
+ *
+ * Note: we unlock the current list if we coalesce or fail. */
+static int coalesce(struct tdb_context *tdb, tdb_off_t off,
+		    tdb_off_t list, tdb_len_t data_len)
+{
+	struct tdb_free_record pad, *r;
+	tdb_off_t end = off + sizeof(struct tdb_used_record) + data_len;
+
+	/* Keep absorbing while a whole free record header fits after 'end'. */
+	while (!tdb->methods->oob(tdb, end + sizeof(*r), 1)) {
+		tdb_off_t nlist;
+
+		r = tdb_get(tdb, end, &pad, sizeof(pad));
+		if (!r)
+			goto err;
+
+		if (r->magic != TDB_FREE_MAGIC)
+			break;
+
+		/* The neighbour lives on the list for *its* zone and size. */
+		nlist = zone_of(tdb, end) * (tdb->header.v.free_buckets+1)
+			+ size_to_bucket(tdb, r->data_len);
+
+		/* We may be violating lock order here, so best effort. */
+		if (tdb_lock_free_list(tdb, nlist, TDB_LOCK_NOWAIT) == -1)
+			break;
+
+		/* Now we have lock, re-check. */
+		r = tdb_get(tdb, end, &pad, sizeof(pad));
+		if (!r) {
+			tdb_unlock_free_list(tdb, nlist);
+			goto err;
+		}
+
+		if (unlikely(r->magic != TDB_FREE_MAGIC)) {
+			tdb_unlock_free_list(tdb, nlist);
+			break;
+		}
+
+		/* Unlink the neighbour from the list it is actually on
+		 * (nlist, not our own list): if it is at the front, the
+		 * head pointer that gets rewritten must be nlist's. */
+		if (remove_from_list(tdb, nlist, r) == -1) {
+			tdb_unlock_free_list(tdb, nlist);
+			goto err;
+		}
+
+		end += sizeof(struct tdb_used_record) + r->data_len;
+		tdb_unlock_free_list(tdb, nlist);
+	}
+
+	/* Didn't find any adjacent free? */
+	if (end == off + sizeof(struct tdb_used_record) + data_len)
+		return 0;
+
+	/* OK, expand record */
+	r = tdb_get(tdb, off, &pad, sizeof(pad));
+	if (!r)
+		goto err;
+
+	/* Its size class has changed, so take it off its current list... */
+	if (remove_from_list(tdb, list, r) == -1)
+		goto err;
+
+	/* We have to drop this to avoid deadlocks. */
+	tdb_unlock_free_list(tdb, list);
+
+	/* ...and re-add it at its new, larger size. */
+	if (add_free_record(tdb, off, end - off) == -1)
+		return -1;
+	return 1;
+
+err:
+	/* To unify error paths, we *always* unlock list. */
+	tdb_unlock_free_list(tdb, list);
+	return -1;
+}
+
+/* We need size bytes to put our key and data in.
+ *
+ * Locks free list 'bucket' of the current zone (tdb->last_zone), walks it
+ * looking for a record of at least 'size' data bytes, and takes the best
+ * candidate off the list.  Any usable surplus is split off again by
+ * to_used_record(), which also sets *actual to the length we really got.
+ *
+ * Returns the offset of the allocated block, 0 if this list has nothing
+ * large enough, or TDB_OFF_ERR on error.  The list is always unlocked on
+ * return. */
+static tdb_off_t lock_and_alloc(struct tdb_context *tdb,
+				tdb_off_t bucket, size_t size,
+				tdb_len_t *actual)
+{
+	tdb_off_t list;
+	tdb_off_t off, next, best_off;
+	struct tdb_free_record pad, best = { 0 }, *r;
+	double multiplier;
+
+again:
+	list = tdb->last_zone * (tdb->header.v.free_buckets+1) + bucket;
+
+	/* Lock this list. */
+	if (tdb_lock_free_list(tdb, list, TDB_LOCK_WAIT) == -1) {
+		return TDB_OFF_ERR;
+	}
+
+	/* The list head holds the offset of the first free record (or 0). */
+	off = tdb_read_off(tdb, free_list_off(tdb, list));
+	if (unlikely(off == TDB_OFF_ERR))
+		goto unlock_err;
+
+	best.data_len = -1ULL;
+	best_off = 0;
+	multiplier = 1.0;
+
+	/* Walk the list to see if any are large enough, getting less fussy
+	 * as we go. */
+	while (off) {
+		r = tdb_get(tdb, off, &pad, sizeof(*r));
+		if (!r)
+			goto unlock_err;
+		if (r->magic != TDB_FREE_MAGIC) {
+			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+				 "lock_and_alloc: %llu non-free 0x%llx\n",
+				 (long long)off, (long long)r->magic);
+			goto unlock_err;
+		}
+
+		if (r->data_len >= size && r->data_len < best.data_len) {
+			best_off = off;
+			best = *r;
+		}
+
+		/* We're happy with this size: take it. */
+		if (best.data_len < size * multiplier && best_off)
+			goto use_best;
+
+		multiplier *= 1.01;
+
+		/* Note the successor before coalescing: r may point into
+		 * storage that coalesce()'s own reads could disturb. */
+		next = r->next;
+
+		/* Since we're going slow anyway, try coalescing here. */
+		switch (coalesce(tdb, off, list, r->data_len)) {
+		case -1:
+			/* This has already unlocked on error. */
+			return TDB_OFF_ERR;
+		case 1:
+			/* This has unlocked list, restart. */
+			goto again;
+		default:
+			/* Nothing merged; list still locked, carry on. */
+			break;
+		}
+
+		/* Follow the record's own 'next' link. */
+		off = next;
+	}
+
+	/* End of list.  Anything that fits beats nothing at all. */
+	if (best_off)
+		goto use_best;
+
+	tdb_unlock_free_list(tdb, list);
+	return 0;
+
+use_best:
+	if (remove_from_list(tdb, list, &best) != 0)
+		goto unlock_err;
+	tdb_unlock_free_list(tdb, list);
+
+	if (to_used_record(tdb, best_off, size, best.data_len, actual))
+		return TDB_OFF_ERR;
+	return best_off;
+
+unlock_err:
+	tdb_unlock_free_list(tdb, list);
+	return TDB_OFF_ERR;
+}
+
+/* We want a really big chunk.  Look through every zone's oversize bucket;
+ * if none of them has anything suitable, expand the database and try again.
+ *
+ * Returns the offset of the allocated block (with *actual set to its
+ * length), or TDB_OFF_ERR on error or if expansion fails. */
+static tdb_off_t huge_alloc(struct tdb_context *tdb, size_t size,
+			    tdb_len_t *actual)
+{
+	tdb_off_t i, off;
+
+	do {
+		for (i = 0; i < tdb->header.v.num_zones; i++) {
+			/* lock_and_alloc() searches the zone named by
+			 * tdb->last_zone, so point it at each zone in
+			 * turn; otherwise we'd search one zone repeatedly. */
+			tdb->last_zone = i;
+
+			/* Try getting one from list. */
+			off = lock_and_alloc(tdb, tdb->header.v.free_buckets,
+					     size, actual);
+			if (off == TDB_OFF_ERR)
+				return TDB_OFF_ERR;
+			if (off != 0)
+				return off;
+			/* FIXME: Coalesce! */
+		}
+	} while (tdb_expand(tdb, 0, size, false) == 0);
+
+	return TDB_OFF_ERR;
+}
+
+/* Find a free block able to hold 'size' bytes of data.
+ *
+ * Starts in the current zone (tdb->last_zone) at the bucket matching
+ * 'size', works up through the larger buckets, then hops to other zones at
+ * pseudo-random.  Requests as large as a whole zone go to huge_alloc().
+ *
+ * Returns the block's offset (with *actual set to the length obtained),
+ * 0 if nothing suitable was found, or TDB_OFF_ERR on error. */
+static tdb_off_t get_free(struct tdb_context *tdb, size_t size,
+			  tdb_len_t *actual)
+{
+	tdb_off_t off, bucket;
+	unsigned int num_empty, step = 0;
+
+	bucket = size_to_bucket(tdb, size);
+
+	/* If we're after something bigger than a single zone, handle
+	 * specially. */
+	if (unlikely(sizeof(struct tdb_used_record) + size
+		     >= (1ULL << tdb->header.v.zone_bits))) {
+		return huge_alloc(tdb, size, actual);
+	}
+
+	/* Number of zones we search is proportional to the log of them. */
+	for (num_empty = 0; num_empty < fls64(tdb->header.v.num_zones);
+	     num_empty++) {
+		tdb_off_t b;
+
+		/* Start at exact size bucket, and search up...  'b' is a
+		 * bucket index, so its limit is the number of buckets
+		 * (free_buckets is the catch-all), not the zone count. */
+		for (b = bucket; b <= tdb->header.v.free_buckets; b++) {
+			/* NOTE(review): find_free_head() presumably skips
+			 * ahead to the first non-empty bucket -- confirm. */
+			b = find_free_head(tdb, b);
+
+			/* Non-empty list?  Try getting block. */
+			if (b <= tdb->header.v.free_buckets) {
+				/* Try getting one from list. */
+				off = lock_and_alloc(tdb, b, size, actual);
+				if (off == TDB_OFF_ERR)
+					return TDB_OFF_ERR;
+				if (off != 0)
+					return off;
+				/* Didn't work.  Try next bucket. */
+			}
+		}
+
+		/* Try another zone, at pseudo random.  Avoid duplicates by
+		   using an odd step. */
+		if (step == 0)
+			step = ((quick_random(tdb)) % 65536) * 2 + 1;
+		tdb->last_zone = (tdb->last_zone + step)
+			% tdb->header.v.num_zones;
+	}
+	return 0;
+}
+
+/* Fill in a used-record header.
+ *
+ * magic_and_meta is assembled from: the padding (actuallen minus key and
+ * data lengths) in the low bits, the top 11 bits of the hash from bit 32,
+ * 'keybits' from bit 43, and TDB_MAGIC from bit 48.  key_and_data_len holds
+ * the key length in its low keybits*2 bits with the data length above.
+ *
+ * The result is read back through the rec_*() accessors; if any value did
+ * not survive the encoding we log it, set tdb->ecode to TDB_ERR_IO and
+ * return -1.  Returns 0 on success. */
+int set_header(struct tdb_context *tdb,
+	       struct tdb_used_record *rec,
+	       uint64_t keylen, uint64_t datalen,
+	       uint64_t actuallen, uint64_t hash)
+{
+	uint64_t padding = actuallen - (keylen + datalen);
+	uint64_t keybits = (fls64(keylen) + 1) / 2;
+	bool intact;
+
+	rec->key_and_data_len = (keylen | (datalen << (keybits*2)));
+
+	/* Use top bits of hash, so it's independent of hash table size. */
+	rec->magic_and_meta
+		= padding
+		| ((hash >> 53) << 32)
+		| (keybits << 43)
+		| (TDB_MAGIC << 48);
+
+	/* Encoding can fail on big values. */
+	intact = rec_key_length(rec) == keylen
+		&& rec_data_length(rec) == datalen
+		&& rec_extra_padding(rec) == padding;
+	if (intact)
+		return 0;
+
+	tdb->ecode = TDB_ERR_IO;
+	tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+		 "Could not encode k=%llu,d=%llu,a=%llu\n",
+		 (long long)keylen, (long long)datalen,
+		 (long long)actuallen);
+	return -1;
+}
+
+/* Work out how many data bytes to ask for when storing a keylen-byte key
+ * with datalen bytes of data: at least MIN_DATA_LEN, plus half the data
+ * length again if 'growing', rounded up to a multiple of sizeof(uint64_t). */
+static tdb_len_t adjust_size(size_t keylen, size_t datalen, bool growing)
+{
+	const unsigned long long mask = sizeof(uint64_t) - 1ULL;
+	tdb_len_t want = keylen + datalen;
+
+	/* Never smaller than the smallest record we can later free. */
+	if (want < MIN_DATA_LEN)
+		want = MIN_DATA_LEN;
+
+	/* Overallocate if this is coming from an enlarging store. */
+	if (growing)
+		want += datalen / 2;
+
+	/* Round to next uint64_t boundary. */
+	return (want + mask) & ~mask;
+}
+
+/* Allocate space for a record with the given key and data lengths and write
+ * its used-record header.
+ *
+ * Returns the record's offset, 0 if no free space was found (if this fails,
+ * try tdb_expand), or TDB_OFF_ERR on error. */
+tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
+		uint64_t hash, bool growing)
+{
+	struct tdb_used_record hdr;
+	tdb_len_t wanted, got;
+	tdb_off_t where;
+
+	/* We don't want header to change during this! */
+	assert(tdb->header_uptodate);
+
+	wanted = adjust_size(keylen, datalen, growing);
+
+	where = get_free(tdb, wanted, &got);
+	if (unlikely(where == TDB_OFF_ERR || where == 0))
+		return where;
+
+	/* Some supergiant values can't be encoded: hand the block back. */
+	if (set_header(tdb, &hdr, keylen, datalen, got, hash) != 0) {
+		add_free_record(tdb, where, sizeof(hdr) + got);
+		return TDB_OFF_ERR;
+	}
+
+	if (tdb_write_convert(tdb, where, &hdr, sizeof(hdr)) != 0)
+		return TDB_OFF_ERR;
+
+	return where;
+}
+
+/* Is it worth adding another free bucket when we expand?  Returns false
+ * once the sized buckets already reach an eighth of a zone. */
+static bool larger_buckets_might_help(struct tdb_context *tdb)
+{
+	/* If our buckets are already covering 1/8 of a zone, don't
+	 * bother (note: might become an 1/16 of a zone if we double
+	 * zone size). */
+	tdb_len_t eighth = (1ULL << tdb->header.v.zone_bits) / 8;
+	bool covered;
+
+	covered = eighth >= MIN_DATA_LEN
+		&& size_to_bucket(tdb, eighth) < tdb->header.v.free_buckets;
+
+	/* FIXME: Put stats in tdb_context or examine db itself! */
+	/* It's fairly cheap to do as we expand database. */
+	return !covered;
+}
+
+/* Are the zones working well enough that we should simply add another one
+ * rather than enlarging them?  Placeholder: always answers yes. */
+static bool zones_happy(struct tdb_context *tdb)
+{
+	/* FIXME: look at distribution of zones. */
+	(void)tdb;
+
+	return true;
+}
+
+/* Expand the database so that a record with a klen-byte key and dlen bytes
+ * of data (over-allocated if 'growing') can be allocated.
+ *
+ * Runs entirely under the allrecord write lock.  Either the file is simply
+ * grown to match the existing zone layout, or a new free-bucket array is
+ * built in the newly added space (adding a zone or enlarging the zones),
+ * every existing free record is re-filed into it, and the old array is
+ * freed.
+ *
+ * Returns 0 on success (including when update_header() reports that the
+ * header changed under us), -1 on failure.
+ *
+ * NOTE(review): nothing visible here writes the modified tdb->header back
+ * to the file -- confirm where that is meant to happen. */
+int tdb_expand(struct tdb_context *tdb, tdb_len_t klen, tdb_len_t dlen,
+	       bool growing)
+{
+	uint64_t new_num_buckets, new_num_zones, new_zone_bits;
+	uint64_t old_num_total, i;
+	tdb_len_t add, freebucket_size, needed;
+	tdb_off_t off, old_free_off;
+	const tdb_off_t *oldf;
+	struct tdb_used_record fhdr;
+	
+	/* We need room for the record header too. */
+	needed = sizeof(struct tdb_used_record)
+		+ adjust_size(klen, dlen, growing);
+
+	/* FIXME: this is overkill.  An expand lock? */
+	if (tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false) == -1)
+		return -1;
+
+	/* Someone may have expanded for us. */
+	if (update_header(tdb))
+		goto success;
+
+	/* Make sure we have the latest size. */
+	tdb->methods->oob(tdb, tdb->map_size + 1, true);
+
+	/* Did we enlarge zones without enlarging file? */
+	if (tdb->map_size < tdb->header.v.num_zones<<tdb->header.v.zone_bits) {
+		add = (tdb->header.v.num_zones<<tdb->header.v.zone_bits)
+			- tdb->map_size;
+		/* Updates tdb->map_size. */
+		if (tdb->methods->expand_file(tdb, tdb->map_size, add) == -1)
+			goto fail;
+		if (add_free_record(tdb, tdb->map_size - add, add) == -1)
+			goto fail;
+		if (add >= needed) {
+			/* Allocate from this zone. */
+			tdb->last_zone = zone_of(tdb, tdb->map_size - add);
+			goto success;
+		}
+	}
+
+	/* Slow path.  Should we increase the number of buckets? */
+	new_num_buckets = tdb->header.v.free_buckets;
+	if (larger_buckets_might_help(tdb))
+		new_num_buckets++;
+
+	/* Now we'll need room for the new free buckets, too.  Assume
+	 * worst case (zones expand). */
+	needed += sizeof(fhdr)
+		+ ((tdb->header.v.num_zones+1)
+		   * (new_num_buckets+1) * sizeof(tdb_off_t));
+
+	/* If we need less than one zone, and they're working well, just add
+	 * another one.  (1ULL: a zone may exceed the range of a 32-bit
+	 * long.) */
+	if (needed < (1ULL<<tdb->header.v.zone_bits) && zones_happy(tdb)) {
+		new_num_zones = tdb->header.v.num_zones+1;
+		new_zone_bits = tdb->header.v.zone_bits;
+		add = 1ULL << tdb->header.v.zone_bits;
+	} else {
+		/* Increase the zone size. */
+		new_num_zones = tdb->header.v.num_zones;
+		new_zone_bits = tdb->header.v.zone_bits+1;
+		while ((new_num_zones << new_zone_bits) - tdb->map_size
+		       < needed) {
+			new_zone_bits++;
+		}
+
+		/* We expand by enough zones to meet the need. */
+		add = (needed + (1ULL << new_zone_bits)-1)
+			& ~((1ULL << new_zone_bits)-1);
+	}
+
+	/* Updates tdb->map_size. */
+	if (tdb->methods->expand_file(tdb, tdb->map_size, add) == -1)
+		goto fail;
+
+	/* Use first part as new free bucket array. */
+	off = tdb->map_size - add;
+	freebucket_size = new_num_zones
+		* (new_num_buckets + 1) * sizeof(tdb_off_t);
+
+	/* Write header. */
+	if (set_header(tdb, &fhdr, 0, freebucket_size, freebucket_size, 0))
+		goto fail;
+	if (tdb_write_convert(tdb, off, &fhdr, sizeof(fhdr)) == -1)
+		goto fail;
+
+	/* Adjust off to point to start of buckets, add to be remainder. */
+	add -= freebucket_size + sizeof(fhdr);
+	off += sizeof(fhdr);
+
+	/* Access the old zones. */
+	old_num_total = tdb->header.v.num_zones*(tdb->header.v.free_buckets+1);
+	old_free_off = tdb->header.v.free_off;
+	oldf = tdb_access_read(tdb, old_free_off,
+			       old_num_total * sizeof(tdb_off_t));
+	if (!oldf)
+		goto fail;
+
+	/* Switch to using our new zone.  Clear the whole bucket array.
+	 * NOTE(review): assumes zero_out() takes a length in bytes --
+	 * confirm against its definition. */
+	if (zero_out(tdb, off, freebucket_size) == -1)
+		goto fail_release;
+	tdb->header.v.free_off = off;
+	tdb->header.v.num_zones = new_num_zones;
+	/* The zone size may have grown: zone_of() below must see it. */
+	tdb->header.v.zone_bits = new_zone_bits;
+	tdb->header.v.free_buckets = new_num_buckets;
+
+	/* FIXME: If zone size hasn't changed, can simply copy pointers. */
+	/* FIXME: Coalesce? */
+	for (i = 0; i < old_num_total; i++) {
+		tdb_off_t next;
+		struct tdb_free_record rec;
+		tdb_off_t list;
+
+		/* Re-file every record of this old list under the new
+		 * zone/bucket layout. */
+		for (off = oldf[i]; off; off = next) {
+			if (tdb_read_convert(tdb, off, &rec, sizeof(rec)))
+				goto fail_release;
+
+			list = zone_of(tdb, off)
+				* (tdb->header.v.free_buckets+1)
+				+ size_to_bucket(tdb, rec.data_len);
+			/* enqueue_in_free() overwrites rec.next. */
+			next = rec.next;
+		
+			if (enqueue_in_free(tdb, list, off, &rec) == -1)
+				goto fail_release;
+		}
+	}
+
+
+	/* Free up the old free buckets.  add_free_record() wants the length
+	 * including the header, so count the array's own header too. */
+	old_free_off -= sizeof(fhdr);
+	if (tdb_read_convert(tdb, old_free_off, &fhdr, sizeof(fhdr)) == -1)
+		goto fail_release;
+	if (add_free_record(tdb, old_free_off,
+			    sizeof(fhdr)
+			    + rec_data_length(&fhdr)
+			    + rec_extra_padding(&fhdr)))
+		goto fail_release;
+
+	/* Add the rest as a new free record. */
+	if (add_free_record(tdb, tdb->map_size - add, add) == -1)
+		goto fail_release;
+
+	/* Start allocating from where the new space is. */
+	tdb->last_zone = zone_of(tdb, tdb->map_size - add);
+	tdb_access_release(tdb, oldf);
+success:
+	tdb_allrecord_unlock(tdb, F_WRLCK);
+	return 0;
+
+fail_release:
+	tdb_access_release(tdb, oldf);
+fail:
+	tdb_allrecord_unlock(tdb, F_WRLCK);
+	return -1;
+}

+ 662 - 0
ccan/tdb2/io.c

@@ -0,0 +1,662 @@
+ /* 
+   Unix SMB/CIFS implementation.
+
+   trivial database library
+
+   Copyright (C) Andrew Tridgell              1999-2005
+   Copyright (C) Paul `Rusty' Russell		   2000
+   Copyright (C) Jeremy Allison			   2000-2003
+   Copyright (C) Rusty Russell			   2010
+
+     ** NOTE! The following LGPL license applies to the tdb
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "private.h"
+#include <ccan/likely/likely.h>
+
+/* Drop the current file mapping, if there is one.  Databases flagged
+ * TDB_INTERNAL are left untouched. */
+void tdb_munmap(struct tdb_context *tdb)
+{
+	if (!(tdb->flags & TDB_INTERNAL) && tdb->map_ptr != NULL) {
+		munmap(tdb->map_ptr, tdb->map_size);
+		tdb->map_ptr = NULL;
+	}
+}
+
+/* Map the first map_size bytes of the file, shared, writable unless
+ * tdb->read_only is set.  Nothing is done for TDB_INTERNAL or TDB_NOMMAP
+ * databases.  If the mapping fails, map_ptr is set to NULL and a warning
+ * is logged. */
+void tdb_mmap(struct tdb_context *tdb)
+{
+	int prot = PROT_READ;
+	void *map;
+
+	if (tdb->flags & (TDB_INTERNAL | TDB_NOMMAP))
+		return;
+
+	if (!tdb->read_only)
+		prot |= PROT_WRITE;
+
+	map = mmap(NULL, tdb->map_size, prot, MAP_SHARED, tdb->fd, 0);
+
+	/* mmap reports failure with MAP_FAILED, never with NULL. */
+	if (map != MAP_FAILED) {
+		tdb->map_ptr = map;
+		return;
+	}
+
+	tdb->map_ptr = NULL;
+	tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv,
+		 "tdb_mmap failed for size %lld (%s)\n",
+		 (long long)tdb->map_size, strerror(errno));
+}
+
+/* Check for an out of bounds access.  "len" is the minimum length the
+ * database must have for the access to be valid.
+ *
+ * If len fits inside the current map_size, return 0 at once.  Otherwise
+ * the file may have been expanded by someone else: fstat it and, if it is
+ * now long enough, unmap, adopt the new size and remap, then return 0.
+ *
+ * Returns -1 if the database really is too short (or fstat fails).  When
+ * "probe" is true the too-short case is not logged and ecode is not set.
+ */
+static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
+{
+	struct stat st;
+
+	if (len <= tdb->map_size)
+		return 0;
+
+	/* Internal databases have no file to re-check. */
+	if (tdb->flags & TDB_INTERNAL) {
+		if (!probe) {
+			/* Ensure ecode is set for log fn. */
+			tdb->ecode = TDB_ERR_IO;
+			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+				 "tdb_oob len %lld beyond internal"
+				 " malloc size %lld\n",
+				 (long long)len,
+				 (long long)tdb->map_size);
+		}
+		return -1;
+	}
+
+	if (fstat(tdb->fd, &st) == -1) {
+		tdb->ecode = TDB_ERR_IO;
+		return -1;
+	}
+
+	/* Compare at full offset width: narrowing len to size_t would
+	 * truncate it on hosts where size_t is 32 bits, letting an
+	 * out-of-range access slip through. */
+	if ((tdb_off_t)st.st_size < len) {
+		if (!probe) {
+			/* Ensure ecode is set for log fn. */
+			tdb->ecode = TDB_ERR_IO;
+			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+				 "tdb_oob len %lld beyond eof at %lld\n",
+				 (long long)len, (long long)st.st_size);
+		}
+		return -1;
+	}
+
+	/* Unmap, update size, remap */
+	tdb_munmap(tdb);
+	tdb->map_size = st.st_size;
+	tdb_mmap(tdb);
+	return 0;
+}
+
+/* Return a pointer straight into the mapping for [off, off+len), or NULL
+ * if direct access is not possible: no mapping, a transaction is active,
+ * or the range is out of bounds.  The bounds check is a probe, so failure
+ * here is silent. */
+static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len)
+{
+	if (unlikely(!tdb->map_ptr))
+		return NULL;
+
+	/* FIXME: We can do a subset of this! */
+	if (tdb->transaction)
+		return NULL;
+
+	if (unlikely(tdb_oob(tdb, off + len, true) == -1))
+		return NULL;
+
+	/* tdb_oob may have unmapped and remapped the file; if that remap
+	 * failed map_ptr is now NULL and must not be offset. */
+	if (unlikely(!tdb->map_ptr))
+		return NULL;
+
+	return (char *)tdb->map_ptr + off;
+}
+
+/* Fetch len bytes at off.  The result is either a pointer into the
+ * mapping or, failing that, "pad" filled via pread and passed through
+ * tdb_convert.  Direct access is not attempted when TDB_CONVERT is set.
+ * Returns NULL on error.
+ * Note: pad has to be a real object, so we can't get here if len
+ * overflows size_t. */
+/* FIXME: Transaction */
+void *tdb_get(struct tdb_context *tdb, tdb_off_t off, void *pad, size_t len)
+{
+	ssize_t nread;
+
+	if (likely(!(tdb->flags & TDB_CONVERT))) {
+		void *direct = tdb_direct(tdb, off, len);
+
+		if (direct != NULL)
+			return direct;
+	}
+
+	if (unlikely(tdb_oob(tdb, off + len, false) == -1))
+		return NULL;
+
+	nread = pread(tdb->fd, pad, len, off);
+	if (nread == (ssize_t)len)
+		return tdb_convert(tdb, pad, len);
+
+	/* Ensure ecode is set for log fn. */
+	tdb->ecode = TDB_ERR_IO;
+	tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+		 "tdb_read failed at %llu "
+		 "len=%lld ret=%lld (%s) map_size=%lld\n",
+		 (long long)off, (long long)len,
+		 (long long)nread, strerror(errno),
+		 (long long)tdb->map_size);
+	return NULL;
+}
+
+/* Endian conversion: we only ever deal with 8 byte quantities.
+ * When TDB_CONVERT is set, every complete 8-byte word in buf is byte
+ * swapped in place; any trailing size % 8 bytes are left alone.
+ * Always returns buf. */
+void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size)
+{
+	uint64_t *word, *end;
+
+	if (likely(!(tdb->flags & TDB_CONVERT)))
+		return buf;
+
+	word = (uint64_t *)buf;
+	end = word + size / 8;
+	while (word < end) {
+		*word = bswap_64(*word);
+		word++;
+	}
+	return buf;
+}
+
+/* Return the index of the first non-zero offset in the array of "num"
+ * offsets stored at "off", or num if every entry is zero.  num is also
+ * returned if the array cannot be read. */
+/* FIXME: Return the off? */
+uint64_t tdb_find_nonzero_off(struct tdb_context *tdb, tdb_off_t off,
+			      uint64_t num)
+{
+	uint64_t i, *val;
+	bool alloc = false;
+
+	val = tdb_direct(tdb, off, num * sizeof(tdb_off_t));
+	/* Direct access is the expected case; the hint belongs on the
+	 * failure test, not on the pointer itself. */
+	if (unlikely(!val)) {
+		val = tdb_alloc_read(tdb, off, num * sizeof(tdb_off_t));
+		if (!val)
+			return num;
+		alloc = true;
+	}
+
+	for (i = 0; i < num; i++) {
+		if (val[i])
+			break;
+	}
+	/* Only a copy made by tdb_alloc_read is ours to free. */
+	if (unlikely(alloc))
+		free(val);
+	return i;
+}
+
+/* Return the index of the first zero offset in the array of "num"
+ * offsets stored at "off", or num if no entry is zero.  num is also
+ * returned if the array cannot be read. */
+uint64_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off,
+			   uint64_t num)
+{
+	uint64_t i, *val;
+	bool alloc = false;
+
+	val = tdb_direct(tdb, off, num * sizeof(tdb_off_t));
+	/* Direct access is the expected case; the hint belongs on the
+	 * failure test, not on the pointer itself. */
+	if (unlikely(!val)) {
+		val = tdb_alloc_read(tdb, off, num * sizeof(tdb_off_t));
+		if (!val)
+			return num;
+		alloc = true;
+	}
+
+	for (i = 0; i < num; i++) {
+		if (!val[i])
+			break;
+	}
+	/* Only a copy made by tdb_alloc_read is ours to free. */
+	if (unlikely(alloc))
+		free(val);
+	return i;
+}
+
+/* Write "len" bytes starting at file offset "off" by repeating the
+ * "size"-byte pattern in buf.  Goes through pwrite, not the mapping.
+ * Returns 0 on success; on a failed write sets TDB_ERR_IO, logs and
+ * returns -1. */
+static int fill(struct tdb_context *tdb,
+		const void *buf, size_t size,
+		tdb_off_t off, tdb_len_t len)
+{
+	size_t chunk;
+
+	for (; len != 0; len -= chunk, off += chunk) {
+		chunk = len > size ? size : len;
+
+		if (tdb_pwrite_all(tdb->fd, buf, chunk, off))
+			continue;
+
+		tdb->ecode = TDB_ERR_IO;
+		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+			 "fill write failed: giving up!\n");
+		return -1;
+	}
+	return 0;
+}
+
+/* Zero "len" bytes of the database starting at offset "off".
+ * Uses memset on the mapping when direct access is available, otherwise
+ * writes zeroes to the file in 8192-byte chunks.
+ * Returns 0 on success, -1 if the fallback write fails. */
+int zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len)
+{
+	void *p = tdb_direct(tdb, off, len);
+	if (p) {
+		memset(p, 0, len);
+		return 0;
+	} else {
+		char buf[8192] = { 0 };
+		/* fill() takes the offset first, then the length. */
+		return fill(tdb, buf, sizeof(buf), off, len);
+	}
+}
+
+/* Read a single offset value stored at "off".
+ * Returns the value, or TDB_OFF_ERR if tdb_get fails. */
+tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off)
+{
+	tdb_off_t pad, *ret;
+
+	/* Ask for a whole tdb_off_t: sizeof(ret) would be the size of a
+	 * pointer, which is too short where pointers are 32 bits. */
+	ret = tdb_get(tdb, off, &pad, sizeof(pad));
+	if (!ret) {
+		return TDB_OFF_ERR;
+	}
+	return *ret;
+}
+
+/* Even on files, we can get partial writes due to signals.
+ * Keep calling pwrite until all "len" bytes of buf are written at "off".
+ * Returns true on success.  Returns false if pwrite fails (errno as set
+ * by pwrite) or writes nothing (errno set to ENOSPC). */
+bool tdb_pwrite_all(int fd, const void *buf, size_t len, tdb_off_t off)
+{
+	/* Byte pointer: arithmetic on void * is not standard C. */
+	const char *p = buf;
+
+	while (len) {
+		/* Must be signed, or the error return (-1) is never seen. */
+		ssize_t ret;
+		ret = pwrite(fd, p, len, off);
+		if (ret < 0)
+			return false;
+		if (ret == 0) {
+			errno = ENOSPC;
+			return false;
+		}
+		p += ret;
+		off += ret;
+		len -= ret;
+	}
+	return true;
+}
+
+/* Write a lump of data at a specified offset.
+ * A zero length write succeeds immediately.  Fails with TDB_ERR_RDONLY
+ * on a read-only database, and with -1 if off+len is out of bounds.
+ * Data goes through the mapping when there is one, otherwise through
+ * pwrite.  Returns 0 on success, -1 on failure. */
+static int tdb_write(struct tdb_context *tdb, tdb_off_t off, 
+		     const void *buf, tdb_len_t len)
+{
+	if (len == 0) {
+		return 0;
+	}
+
+	if (tdb->read_only) {
+		tdb->ecode = TDB_ERR_RDONLY;
+		return -1;
+	}
+
+	if (tdb->methods->oob(tdb, off + len, 0) != 0)
+		return -1;
+
+	if (tdb->map_ptr) {
+		memcpy(off + (char *)tdb->map_ptr, buf, len);
+	} else {
+		if (!tdb_pwrite_all(tdb->fd, buf, len, off)) {
+			tdb->ecode = TDB_ERR_IO;
+			/* Cast so the arguments match %llu whatever the
+			 * underlying type of tdb_off_t/tdb_len_t is. */
+			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+				 "tdb_write failed at %llu len=%llu (%s)\n",
+				 (unsigned long long)off,
+				 (unsigned long long)len, strerror(errno));
+			return -1;
+		}
+	}
+	return 0;
+}
+
+/* Read a lump of data at a specified offset into buf.
+ * The range is bounds-checked first; the copy then comes from the
+ * mapping when there is one, otherwise from pread.
+ * Returns 0 on success, -1 on failure. */
+static int tdb_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
+		    tdb_len_t len)
+{
+	ssize_t nread;
+
+	if (tdb->methods->oob(tdb, off + len, 0) != 0)
+		return -1;
+
+	if (tdb->map_ptr) {
+		memcpy(buf, (char *)tdb->map_ptr + off, len);
+		return 0;
+	}
+
+	nread = pread(tdb->fd, buf, len, off);
+	if (nread == (ssize_t)len)
+		return 0;
+
+	/* Ensure ecode is set for log fn. */
+	tdb->ecode = TDB_ERR_IO;
+	tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+		 "tdb_read failed at %lld "
+		 "len=%lld ret=%lld (%s) map_size=%lld\n",
+		 (long long)off, (long long)len,
+		 (long long)nread, strerror(errno),
+		 (long long)tdb->map_size);
+	return -1;
+}
+
+/* Convert rec with tdb_convert, then write it at off through the
+ * methods table.  The conversion is done in place, so the caller's
+ * buffer is modified whenever tdb_convert swaps bytes. */
+int tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
+		      void *rec, size_t len)
+{
+	void *converted = tdb_convert(tdb, rec, len);
+
+	return tdb->methods->write(tdb, off, converted, len);
+}
+
+/* Read len bytes at off into rec through the methods table, then run
+ * tdb_convert over rec.  Returns whatever the read returned. */
+int tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
+		      void *rec, size_t len)
+{
+	int ret = tdb->methods->read(tdb, off, rec, len);
+	/* Note: the conversion runs even when the read failed. */
+	tdb_convert(tdb, rec, len);
+	return ret;
+}
+
+/* Store the single offset value "val" at "off", via tdb_write_convert.
+ * Returns tdb_write_convert's result. */
+int tdb_write_off(struct tdb_context *tdb, tdb_off_t off, tdb_off_t val)
+{
+	/* val is a by-value copy, so the in-place conversion is harmless. */
+	return tdb_write_convert(tdb, off, &val, sizeof(val));
+}
+
+/* Read a lump of data into a freshly malloc'd buffer.
+ * Returns the buffer (the caller frees it), or NULL if the allocation
+ * or the read fails.  An allocation failure sets TDB_ERR_OOM and is
+ * logged. */
+void *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len)
+{
+	/* some systems don't like zero length malloc */
+	void *buf = malloc(len != 0 ? len : 1);
+
+	if (unlikely(buf == NULL)) {
+		tdb->ecode = TDB_ERR_OOM;
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_alloc_read malloc failed len=%lld\n",
+			 (long long)len);
+		return NULL;
+	}
+
+	if (unlikely(tdb->methods->read(tdb, offset, buf, len))) {
+		free(buf);
+		return NULL;
+	}
+	return buf;
+}
+
+/* Hash the key of the used record whose header is at "off".
+ * The key follows the header immediately; it is hashed in place when
+ * direct access works, otherwise from a temporary copy.
+ * Returns 0 if the header or the key cannot be read. */
+uint64_t hash_record(struct tdb_context *tdb, tdb_off_t off)
+{
+	struct tdb_used_record scratch, *hdr;
+	tdb_off_t key_off;
+	uint64_t klen, hash = 0;
+	void *key;
+
+	hdr = tdb_get(tdb, off, &scratch, sizeof(*hdr));
+	if (hdr == NULL)
+		/* FIXME */
+		return 0;
+
+	klen = rec_key_length(hdr);
+	key_off = off + sizeof(*hdr);
+
+	key = tdb_direct(tdb, key_off, klen);
+	if (likely(key != NULL))
+		return tdb_hash(tdb, key, klen);
+
+	key = tdb_alloc_read(tdb, key_off, klen);
+	if (likely(key != NULL)) {
+		hash = tdb_hash(tdb, key, klen);
+		free(key);
+	}
+	return hash;
+}
+
+/* Give a piece of tdb data to a parser.
+ * The "len" bytes at "offset" are handed to parser() together with key
+ * and private_data, in place when direct access works and from a
+ * temporary copy otherwise.
+ * Returns the parser's result, or -1 if the data cannot be read. */
+int tdb_parse_data(struct tdb_context *tdb, TDB_DATA key,
+		   tdb_off_t offset, tdb_len_t len,
+		   int (*parser)(TDB_DATA key, TDB_DATA data,
+				 void *private_data),
+		   void *private_data)
+{
+	TDB_DATA data;
+	void *copy = NULL;
+	int result;
+
+	data.dsize = len;
+	data.dptr = tdb_direct(tdb, offset, len);
+	if (unlikely(!data.dptr)) {
+		copy = tdb_alloc_read(tdb, offset, len);
+		if (!copy)
+			return -1;
+		data.dptr = copy;
+	}
+
+	result = parser(key, data, private_data);
+
+	/* copy is NULL unless we had to allocate. */
+	free(copy);
+	return result;
+}
+
+/* Expand a file.  We prefer to use ftruncate, as that is what posix
+ * says to use for mmap expansion.
+ * "size" is the current length and "addition" the number of bytes to
+ * add: the file is extended to size+addition and the new region is
+ * filled with 0x43 bytes.
+ * Returns 0 on success, -1 on failure (TDB_ERR_RDONLY on a read-only
+ * database). */
+static int tdb_expand_file(struct tdb_context *tdb,
+			   tdb_len_t size, tdb_len_t addition)
+{
+	char buf[8192];
+
+	if (tdb->read_only) {
+		tdb->ecode = TDB_ERR_RDONLY;
+		return -1;
+	}
+
+	/* If this fails, we try to fill anyway. */
+	if (ftruncate(tdb->fd, size+addition))
+		;
+
+	/* now fill the file with something. This ensures that the
+	   file isn't sparse, which would be very bad if we ran out of
+	   disk. This must be done with write, not via mmap */
+	memset(buf, 0x43, sizeof(buf));
+	/* fill() takes (offset, length): start at the old end of file
+	 * and write "addition" bytes. */
+	return fill(tdb, buf, sizeof(buf), size, addition);
+}
+
+/* Get read access to len bytes at off: a pointer into the mapping when
+ * direct access works, otherwise a malloc'd copy from tdb_alloc_read.
+ * Returns NULL on failure.  Pair with tdb_access_release. */
+const void *tdb_access_read(struct tdb_context *tdb,
+			    tdb_off_t off, tdb_len_t len)
+{
+	const void *direct = tdb_direct(tdb, off, len);
+
+	return direct ? direct : tdb_alloc_read(tdb, off, len);
+}
+
+/* Release a pointer handed out for read access.  A pointer that lies
+ * inside the current mapping is left alone; anything else is freed. */
+void tdb_access_release(struct tdb_context *tdb, const void *p)
+{
+	const char *ptr = (const char *)p;
+	const char *map_start = (const char *)tdb->map_ptr;
+
+	if (map_start && ptr >= map_start
+	    && ptr < map_start + tdb->map_size)
+		return;
+
+	free((void *)p);
+}
+
+#if 0
+/* write a lump of data at a specified offset */
+static int tdb_write(struct tdb_context *tdb, tdb_off_t off, 
+		     const void *buf, tdb_len_t len)
+{
+	if (len == 0) {
+		return 0;
+	}
+
+	if (tdb->read_only || tdb->traverse_read) {
+		tdb->ecode = TDB_ERR_RDONLY;
+		return -1;
+	}
+
+	if (tdb->methods->tdb_oob(tdb, off + len, 0) != 0)
+		return -1;
+
+	if (tdb->map_ptr) {
+		memcpy(off + (char *)tdb->map_ptr, buf, len);
+	} else {
+		ssize_t written = pwrite(tdb->fd, buf, len, off);
+		if ((written != (ssize_t)len) && (written != -1)) {
+			/* try once more */
+			tdb->ecode = TDB_ERR_IO;
+			TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_write: wrote only "
+				 "%d of %d bytes at %d, trying once more\n",
+				 (int)written, len, off));
+			written = pwrite(tdb->fd, (const char *)buf+written,
+					 len-written,
+					 off+written);
+		}
+		if (written == -1) {
+			/* Ensure ecode is set for log fn. */
+			tdb->ecode = TDB_ERR_IO;
+			TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_write failed at %d "
+				 "len=%d (%s)\n", off, len, strerror(errno)));
+			return -1;
+		} else if (written != (ssize_t)len) {
+			tdb->ecode = TDB_ERR_IO;
+			TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_write: failed to "
+				 "write %d bytes at %d in two attempts\n",
+				 len, off));
+			return -1;
+		}
+	}
+	return 0;
+}
+
+
+
+/*
+  do an unlocked scan of the hash table heads to find the next non-zero head. The value
+  will then be confirmed with the lock held
+*/		
+static void tdb_next_hash_chain(struct tdb_context *tdb, uint32_t *chain)
+{
+	uint32_t h = *chain;
+	if (tdb->map_ptr) {
+		for (;h < tdb->header.hash_size;h++) {
+			if (0 != *(uint32_t *)(TDB_HASH_TOP(h) + (unsigned char *)tdb->map_ptr)) {
+				break;
+			}
+		}
+	} else {
+		uint32_t off=0;
+		for (;h < tdb->header.hash_size;h++) {
+			if (tdb_ofs_read(tdb, TDB_HASH_TOP(h), &off) != 0 || off != 0) {
+				break;
+			}
+		}
+	}
+	(*chain) = h;
+}
+
+
+/* expand the database by expanding the underlying file and doing the
+   mmap again if necessary */
+int tdb_expand(struct tdb_context *tdb)
+{
+	struct tdb_record rec;
+	tdb_off_t offset, new_size;	
+
+	/* We have to lock every hash bucket and every free list. */
+	do {
+		
+
+	if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
+		TDB_LOG((tdb, TDB_DEBUG_ERROR, "lock failed in tdb_expand\n"));
+		return -1;
+	}
+
+	/* must know about any previous expansions by another process */
+	tdb->methods->tdb_oob(tdb, tdb->map_size + 1, 1);
+
+	/* always make room for at least 100 more records, and at
+           least 25% more space. Round the database up to a multiple
+           of the page size */
+	new_size = MAX(tdb->map_size + size*100, tdb->map_size * 1.25);
+	size = TDB_ALIGN(new_size, tdb->page_size) - tdb->map_size;
+
+	if (!(tdb->flags & TDB_INTERNAL))
+		tdb_munmap(tdb);
+
+	/*
+	 * We must ensure the file is unmapped before doing this
+	 * to ensure consistency with systems like OpenBSD where
+	 * writes and mmaps are not consistent.
+	 */
+
+	/* expand the file itself */
+	if (!(tdb->flags & TDB_INTERNAL)) {
+		if (tdb->methods->tdb_expand_file(tdb, tdb->map_size, size) != 0)
+			goto fail;
+	}
+
+	tdb->map_size += size;
+
+	if (tdb->flags & TDB_INTERNAL) {
+		char *new_map_ptr = (char *)realloc(tdb->map_ptr,
+						    tdb->map_size);
+		if (!new_map_ptr) {
+			tdb->map_size -= size;
+			goto fail;
+		}
+		tdb->map_ptr = new_map_ptr;
+	} else {
+		/*
+		 * We must ensure the file is remapped before adding the space
+		 * to ensure consistency with systems like OpenBSD where
+		 * writes and mmaps are not consistent.
+		 */
+
+		/* We're ok if the mmap fails as we'll fallback to read/write */
+		tdb_mmap(tdb);
+	}
+
+	/* form a new freelist record */
+	memset(&rec,'\0',sizeof(rec));
+	rec.rec_len = size - sizeof(rec);
+
+	/* link it into the free list */
+	offset = tdb->map_size - size;
+	if (tdb_free(tdb, offset, &rec) == -1)
+		goto fail;
+
+	tdb_unlock(tdb, -1, F_WRLCK);
+	return 0;
+ fail:
+	tdb_unlock(tdb, -1, F_WRLCK);
+	return -1;
+}
+
+/* read/write a tdb_off_t */
+int tdb_ofs_read(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d)
+{
+	return tdb->methods->tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
+}
+
+int tdb_ofs_write(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d)
+{
+	tdb_off_t off = *d;
+	return tdb->methods->tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
+}
+
+
+/* read/write a record */
+int tdb_rec_read(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec)
+{
+	if (tdb->methods->tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
+		return -1;
+	if (TDB_BAD_MAGIC(rec)) {
+		/* Ensure ecode is set for log fn. */
+		tdb->ecode = TDB_ERR_CORRUPT;
+		TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset));
+		return -1;
+	}
+	return tdb->methods->tdb_oob(tdb, rec->next+sizeof(*rec), 0);
+}
+
+int tdb_rec_write(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec)
+{
+	struct tdb_record r = *rec;
+	return tdb->methods->tdb_write(tdb, offset, CONVERT(r), sizeof(r));
+}
+#endif
+
+/* Default I/O method table, installed by tdb_io_init().
+ * The initializer is positional, so the entries must stay in the member
+ * order of struct tdb_methods (declared outside this file).  Code in this
+ * file reaches the first three through ->read, ->write and ->oob; the
+ * fourth is presumably the file-expansion hook - TODO confirm against
+ * the struct declaration. */
+static const struct tdb_methods io_methods = {
+	tdb_read,
+	tdb_write,
+	tdb_oob,
+	tdb_expand_file,
+};
+
+/*
+  initialise the default methods table: point tdb->methods at the
+  static io_methods table defined in this file
+*/
+void tdb_io_init(struct tdb_context *tdb)
+{
+	tdb->methods = &io_methods;
+}

+ 848 - 0
ccan/tdb2/lock.c

@@ -0,0 +1,848 @@
+ /* 
+   Unix SMB/CIFS implementation.
+
+   trivial database library
+
+   Copyright (C) Andrew Tridgell              1999-2005
+   Copyright (C) Paul `Rusty' Russell		   2000
+   Copyright (C) Jeremy Allison			   2000-2003
+
+     ** NOTE! The following LGPL license applies to the tdb
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "private.h"
+
+/* Apply an fcntl byte-range lock of type "rw" to [off, off+len) on the
+ * database fd, measured from the start of the file.  Blocks (F_SETLKW)
+ * when waitflag is true, otherwise uses F_SETLK.
+ * Returns fcntl's result. */
+static int fcntl_lock(struct tdb_context *tdb,
+		      int rw, off_t off, off_t len, bool waitflag)
+{
+	struct flock fl;
+	int cmd = waitflag ? F_SETLKW : F_SETLK;
+
+	fl.l_pid = 0;
+	fl.l_whence = SEEK_SET;
+	fl.l_type = rw;
+	fl.l_start = off;
+	fl.l_len = len;
+
+	return fcntl(tdb->fd, cmd, &fl);
+}
+
+/* Release the fcntl byte-range lock on [off, off+len) of the database fd.
+ * Returns fcntl's result.
+ * "rw" is only used by the disabled consistency check below, which
+ * scans /proc/locks to verify that a matching lock is actually held. */
+static int fcntl_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len)
+{
+	struct flock fl;
+#if 0 /* Check they matched up locks and unlocks correctly. */
+	char line[80];
+	FILE *locks;
+	bool found = false;
+
+	locks = fopen("/proc/locks", "r");
+
+	while (fgets(line, 80, locks)) {
+		char *p;
+		int type, start, l;
+
+		/* eg. 1: FLOCK  ADVISORY  WRITE 2440 08:01:2180826 0 EOF */
+		p = strchr(line, ':') + 1;
+		if (strncmp(p, " POSIX  ADVISORY  ", strlen(" POSIX  ADVISORY  ")))
+			continue;
+		p += strlen(" FLOCK  ADVISORY  ");
+		if (strncmp(p, "READ  ", strlen("READ  ")) == 0)
+			type = F_RDLCK;
+		else if (strncmp(p, "WRITE ", strlen("WRITE ")) == 0)
+			type = F_WRLCK;
+		else
+			abort();
+		p += 6;
+		if (atoi(p) != getpid())
+			continue;
+		p = strchr(strchr(p, ' ') + 1, ' ') + 1;
+		start = atoi(p);
+		p = strchr(p, ' ') + 1;
+		if (strncmp(p, "EOF", 3) == 0)
+			l = 0;
+		else
+			l = atoi(p) - start + 1;
+
+		if (off == start) {
+			if (len != l) {
+				fprintf(stderr, "Len %u should be %u: %s",
+					(int)len, l, line);
+				abort();
+			}
+			if (type != rw) {
+				fprintf(stderr, "Type %s wrong: %s",
+					rw == F_RDLCK ? "READ" : "WRITE", line);
+				abort();
+			}
+			found = true;
+			break;
+		}
+	}
+
+	if (!found) {
+		fprintf(stderr, "Unlock on %u@%u not found!\n",
+			(int)off, (int)len);
+		abort();
+	}
+
+	fclose(locks);
+#endif
+
+	/* The live part: an F_UNLCK request over the same range. */
+	fl.l_type = F_UNLCK;
+	fl.l_whence = SEEK_SET;
+	fl.l_start = off;
+	fl.l_len = len;
+	fl.l_pid = 0;
+
+	return fcntl(tdb->fd, F_SETLKW, &fl);
+}
+
+/* A byte range locking function - returns 0 on success, -1 on failure.
+ * Locks "len" bytes starting at "offset" with lock type rw_type,
+ * retrying while fcntl is interrupted by a signal.
+ *
+ * Note that a len of zero means lock to end of file.
+ *
+ * Does nothing and succeeds under TDB_NOLOCK.  Write locks on a
+ * read-only database fail with TDB_ERR_RDONLY.  Lock failures set
+ * TDB_ERR_LOCK and are logged unless TDB_LOCK_PROBE is set or errno is
+ * EAGAIN.
+ */
+static int tdb_brlock(struct tdb_context *tdb,
+		      int rw_type, tdb_off_t offset, tdb_off_t len,
+		      enum tdb_lock_flags flags)
+{
+	int rc;
+
+	if (tdb->flags & TDB_NOLOCK)
+		return 0;
+
+	if (tdb->read_only && rw_type == F_WRLCK) {
+		tdb->ecode = TDB_ERR_RDONLY;
+		return -1;
+	}
+
+	/* A 32 bit system cannot open a 64-bit file, but it could have
+	 * expanded since then: check here. */
+	if ((size_t)(offset + len) != offset + len) {
+		tdb->ecode = TDB_ERR_IO;
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_brlock: lock on giant offset %llu\n",
+			 (long long)(offset + len));
+		return -1;
+	}
+
+	for (;;) {
+		rc = fcntl_lock(tdb, rw_type, offset, len,
+				flags & TDB_LOCK_WAIT);
+		if (rc != -1 || errno != EINTR)
+			break;
+	}
+
+	if (rc != -1)
+		return 0;
+
+	tdb->ecode = TDB_ERR_LOCK;
+	/* Generic lock error. errno set by fcntl.
+	 * EAGAIN is an expected return from non-blocking
+	 * locks. */
+	if (!(flags & TDB_LOCK_PROBE) && errno != EAGAIN) {
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_brlock failed (fd=%d) at"
+			 " offset %llu rw_type=%d flags=%d len=%llu\n",
+			 tdb->fd, (long long)offset, rw_type,
+			 flags, (long long)len);
+	}
+	return -1;
+}
+
+/* Release the byte range lock on "len" bytes at "offset", retrying while
+ * fcntl is interrupted by a signal.  Does nothing and succeeds under
+ * TDB_NOLOCK.  A failure is logged at trace level; the fcntl result is
+ * returned either way. */
+static int tdb_brunlock(struct tdb_context *tdb,
+			int rw_type, tdb_off_t offset, size_t len)
+{
+	int rc;
+
+	if (tdb->flags & TDB_NOLOCK)
+		return 0;
+
+	for (;;) {
+		rc = fcntl_unlock(tdb, rw_type, offset, len);
+		if (rc != -1 || errno != EINTR)
+			break;
+	}
+
+	if (rc == -1) {
+		tdb->log(tdb, TDB_DEBUG_TRACE, tdb->log_priv,
+			 "tdb_brunlock failed (fd=%d) at offset %llu"
+			 " rw_type=%d len=%llu\n",
+			 tdb->fd, (long long)offset, rw_type, (long long)len);
+	}
+	return rc;
+}
+
+#if 0
+/*
+  upgrade a read lock to a write lock. This needs to be handled in a
+  special way as some OSes (such as solaris) have too conservative
+  deadlock detection and claim a deadlock when progress can be
+  made. For those OSes we may loop for a while.  
+*/
+int tdb_allrecord_upgrade(struct tdb_context *tdb)
+{
+	int count = 1000;
+
+	if (tdb->allrecord_lock.count != 1) {
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_allrecord_upgrade failed: count %u too high\n",
+			 tdb->allrecord_lock.count);
+		return -1;
+	}
+
+	if (tdb->allrecord_lock.off != 1) {
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_allrecord_upgrade failed: already upgraded?\n");
+		return -1;
+	}
+
+	while (count--) {
+		struct timeval tv;
+		if (tdb_brlock(tdb, F_WRLCK,
+			       TDB_HASH_LOCK_START
+			       + (1ULL << tdb->header.v.hash_bits), 0,
+			       TDB_LOCK_WAIT|TDB_LOCK_PROBE) == 0) {
+			tdb->allrecord_lock.ltype = F_WRLCK;
+			tdb->allrecord_lock.off = 0;
+			return 0;
+		}
+		if (errno != EDEADLK) {
+			break;
+		}
+		/* sleep for as short a time as we can - more portable than usleep() */
+		tv.tv_sec = 0;
+		tv.tv_usec = 1;
+		select(0, NULL, NULL, NULL, &tv);
+	}
+	tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv,
+		 "tdb_allrecord_upgrade failed\n");
+	return -1;
+}
+#endif
+
+/* Look up the in-memory lock record for "offset".
+ * Returns the first matching entry of tdb->lockrecs, or NULL. */
+static struct tdb_lock_type *find_nestlock(struct tdb_context *tdb,
+					   tdb_off_t offset)
+{
+	unsigned int idx = 0;
+
+	while (idx < tdb->num_lockrecs) {
+		struct tdb_lock_type *lck = &tdb->lockrecs[idx++];
+
+		if (lck->off == offset)
+			return lck;
+	}
+	return NULL;
+}
+
+/* Lock an offset in the database.
+ * Locks nest: if the offset is already recorded in tdb->lockrecs only
+ * its count is bumped; otherwise a one-byte fcntl lock is taken and a
+ * new record with count 1 is appended.
+ * Returns 0 on success, -1 on failure (bad offset, out of memory, or
+ * the lock could not be taken). */
+static int tdb_nest_lock(struct tdb_context *tdb, tdb_off_t offset, int ltype,
+			 enum tdb_lock_flags flags)
+{
+	struct tdb_lock_type *lck, *grown;
+	tdb_off_t limit;
+
+	/* One lock byte per hash bucket, then one per free bucket list. */
+	limit = TDB_HASH_LOCK_START + (1ULL << tdb->header.v.hash_bits)
+		+ (tdb->header.v.num_zones * (tdb->header.v.free_buckets+1));
+	if (offset >= limit) {
+		tdb->ecode = TDB_ERR_LOCK;
+		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+			 "tdb_lock: invalid offset %llu for ltype=%d\n",
+			 (long long)offset, ltype);
+		return -1;
+	}
+
+	if (tdb->flags & TDB_NOLOCK)
+		return 0;
+
+	lck = find_nestlock(tdb, offset);
+	if (lck != NULL) {
+		/*
+		 * Already held: posix locks don't stack, so only the
+		 * in-memory count changes.
+		 */
+		lck->count++;
+		return 0;
+	}
+
+	/* Make room for one more record before touching the kernel. */
+	grown = (struct tdb_lock_type *)realloc(
+		tdb->lockrecs,
+		sizeof(*tdb->lockrecs) * (tdb->num_lockrecs+1));
+	if (grown == NULL) {
+		tdb->ecode = TDB_ERR_OOM;
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_lock: unable to allocate %llu lock structure",
+			 (long long)(tdb->num_lockrecs + 1));
+		errno = ENOMEM;
+		return -1;
+	}
+	tdb->lockrecs = grown;
+
+	/* Since fcntl locks don't nest, we do a lock for the first one,
+	   and simply bump the count for future ones */
+	if (tdb_brlock(tdb, ltype, offset, 1, flags))
+		return -1;
+
+	lck = &tdb->lockrecs[tdb->num_lockrecs];
+	lck->off = offset;
+	lck->count = 1;
+	lck->ltype = ltype;
+	tdb->num_lockrecs++;
+
+	return 0;
+}
+
+/* Take the locks needed for transaction recovery and run it.
+ * Not implemented yet: the intended body is compiled out (FIXME) and the
+ * live branch simply calls abort(). */
+static int tdb_lock_and_recover(struct tdb_context *tdb)
+{
+#if 0 /* FIXME */
+
+	int ret;
+
+	/* We need to match locking order in transaction commit. */
+	if (tdb_brlock(tdb, F_WRLCK, FREELIST_TOP, 0, TDB_LOCK_WAIT)) {
+		return -1;
+	}
+
+	if (tdb_brlock(tdb, F_WRLCK, OPEN_LOCK, 1, TDB_LOCK_WAIT)) {
+		tdb_brunlock(tdb, F_WRLCK, FREELIST_TOP, 0);
+		return -1;
+	}
+
+	ret = tdb_transaction_recover(tdb);
+
+	tdb_brunlock(tdb, F_WRLCK, OPEN_LOCK, 1);
+	tdb_brunlock(tdb, F_WRLCK, FREELIST_TOP, 0);
+
+	return ret;
+#else
+	/* Recovery is unimplemented, so reaching here is fatal. */
+	abort();
+	return -1;
+#endif
+}
+
+/* Report whether the database needs transaction recovery.
+ * Stub: currently always returns false. */
+static bool tdb_needs_recovery(struct tdb_context *tdb)
+{
+	/* FIXME */
+	return false;
+}
+
+/* Drop one level of the nested lock on "off".
+ * While the count stays above zero only the in-memory record changes;
+ * the last release unlocks the byte in the kernel and removes the
+ * record from tdb->lockrecs.
+ * Returns 0 on success, -1 if no such lock is held, or the result of
+ * the kernel unlock. */
+static int tdb_nest_unlock(struct tdb_context *tdb, tdb_off_t off, int ltype)
+{
+	struct tdb_lock_type *held;
+	int rc;
+
+	if (tdb->flags & TDB_NOLOCK)
+		return 0;
+
+	held = find_nestlock(tdb, off);
+	if (!held || held->count == 0) {
+		tdb->ecode = TDB_ERR_LOCK;
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_unlock: no lock for %llu\n", (long long)off);
+		return -1;
+	}
+
+	if (held->count > 1) {
+		held->count--;
+		return 0;
+	}
+
+	/*
+	 * Last reference: release the kernel lock.  The record's count is
+	 * not decremented because the record itself is about to be
+	 * overwritten.
+	 */
+	rc = tdb_brunlock(tdb, ltype, off, 1);
+
+	/* Shrink the array by moving its last element into the freed slot. */
+	tdb->num_lockrecs--;
+	*held = tdb->lockrecs[tdb->num_lockrecs];
+
+	/* If we're not holding any locks, header can change. */
+	if (tdb->num_lockrecs == 0)
+		tdb->header_uptodate = false;
+
+	return rc;
+}
+
+#if 0
+/*
+  get the transaction lock
+ */
+int tdb_transaction_lock(struct tdb_context *tdb, int ltype,
+			 enum tdb_lock_flags lockflags)
+{
+	return tdb_nest_lock(tdb, TRANSACTION_LOCK, ltype, lockflags);
+}
+
+/*
+  release the transaction lock
+ */
+int tdb_transaction_unlock(struct tdb_context *tdb, int ltype)
+{
+	return tdb_nest_unlock(tdb, TRANSACTION_LOCK, ltype, false);
+}
+#endif
+
+/* We only need to lock individual bytes, but Linux merges consecutive
+ * locks so we lock in contiguous ranges.
+ * Ranges of up to 4 bytes are locked in one call with the caller's
+ * flags.  Larger ranges are first tried without waiting; if that fails
+ * the range is split in two and each half is locked recursively.
+ * Returns 0 on success, -1 on failure (the first half is released again
+ * if the second half cannot be locked). */
+static int tdb_lock_gradual(struct tdb_context *tdb,
+			    int ltype, enum tdb_lock_flags flags,
+			    tdb_off_t off, tdb_off_t len)
+{
+	tdb_off_t first_half = len / 2;
+
+	/* Small enough: just do it with the caller's flags. */
+	if (len <= 4)
+		return tdb_brlock(tdb, ltype, off, len, flags);
+
+	/* Optimistic attempt on the whole range, without waiting. */
+	if (tdb_brlock(tdb, ltype, off, len, flags & ~TDB_LOCK_WAIT) == 0)
+		return 0;
+
+	/* Fall back to locking the first half, then the second. */
+	if (tdb_lock_gradual(tdb, ltype, flags, off, first_half) == -1)
+		return -1;
+
+	if (tdb_lock_gradual(tdb, ltype, flags,
+			     off + first_half, len - first_half) == -1) {
+		tdb_brunlock(tdb, ltype, off, first_half);
+		return -1;
+	}
+	return 0;
+}
+
+/* lock/unlock entire database.  It can only be upgradable if you have some
+ * other way of guaranteeing exclusivity (ie. transaction write lock).
+ * Note that we don't lock the free chains: no one can get those locks
+ * without a hash chain lock first.
+ *
+ * Re-locking with the same ltype just bumps the count.  It is an error
+ * to ask for a different ltype while an allrecord lock is held, to
+ * combine this with individual chain locks, or to ask for an upgradable
+ * lock with anything but F_RDLCK.
+ * Returns 0 on success, -1 on failure. */
+int tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
+		       enum tdb_lock_flags flags, bool upgradable)
+{
+	tdb_off_t hash_size;
+
+	/* FIXME: There are no locks on read-only dbs */
+	if (tdb->read_only) {
+		tdb->ecode = TDB_ERR_LOCK;
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_allrecord_lock: read-only\n");
+		return -1;
+	}
+
+	if (tdb->allrecord_lock.count && tdb->allrecord_lock.ltype == ltype) {
+		tdb->allrecord_lock.count++;
+		return 0;
+	}
+
+	if (tdb->allrecord_lock.count) {
+		/* a global lock of a different type exists */
+		tdb->ecode = TDB_ERR_LOCK;
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_allrecord_lock: already have %s lock\n",
+			 tdb->allrecord_lock.ltype == F_RDLCK
+			 ? "read" : "write");
+		return -1;
+	}
+
+	if (tdb_has_locks(tdb)) {
+		/* can't combine global and chain locks */
+		tdb->ecode = TDB_ERR_LOCK;
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_allrecord_lock: already have chain lock\n");
+		return -1;
+	}
+
+	if (upgradable && ltype != F_RDLCK) {
+		/* tdb error: you can't upgrade a write lock! */
+		tdb->ecode = TDB_ERR_LOCK;
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_allrecord_lock: can't upgrade a write lock\n");
+		return -1;
+	}
+
+	/* Lock all the hash buckets. */
+again:
+	/* Recomputed on every retry: the header may have been updated. */
+	hash_size = (1ULL << tdb->header.v.hash_bits);
+	/* Argument order is (tdb, ltype, flags, off, len). */
+	if (tdb_lock_gradual(tdb, ltype, flags, TDB_HASH_LOCK_START,
+			     hash_size)) {
+		if (!(flags & TDB_LOCK_PROBE)) {
+			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+				 "tdb_lockall hashes failed (%s)\n",
+				 strerror(errno));
+		}
+		return -1;
+	}
+
+	/* Now we re-check header, holding lock. */
+	if (unlikely(update_header(tdb))) {
+		tdb_brunlock(tdb, ltype, TDB_HASH_LOCK_START, hash_size);
+		goto again;
+	}
+
+	/* Now check for needing recovery. */
+	if (unlikely(tdb_needs_recovery(tdb))) {
+		tdb_brunlock(tdb, ltype, TDB_HASH_LOCK_START, hash_size);
+		if (tdb_lock_and_recover(tdb) == -1) {
+			return -1;
+		}
+		goto again;
+	}
+
+
+	tdb->allrecord_lock.count = 1;
+	/* If it's upgradable, it's actually exclusive so we can treat
+	 * it as a write lock. */
+	tdb->allrecord_lock.ltype = upgradable ? F_WRLCK : ltype;
+	tdb->allrecord_lock.off = upgradable;
+	return 0;
+}
+
+/* Take the open lock: a nested write lock on TDB_OPEN_LOCK, waiting for
+ * it if necessary.  Returns tdb_nest_lock's result. */
+int tdb_lock_open(struct tdb_context *tdb)
+{
+	return tdb_nest_lock(tdb, TDB_OPEN_LOCK, F_WRLCK, TDB_LOCK_WAIT);
+}
+
+/* Release the open lock taken by tdb_lock_open.  The result of the
+ * unlock is not reported to the caller. */
+void tdb_unlock_open(struct tdb_context *tdb)
+{
+	tdb_nest_unlock(tdb, TDB_OPEN_LOCK, F_WRLCK);
+}
+
+/* Unlock entire db.
+ * ltype must match the type the allrecord lock is held with, except
+ * that an upgradable lock (recorded as a write lock) may also be
+ * released as F_RDLCK.  Nested holds just decrement the count; the last
+ * release unlocks the hash bucket range.
+ * Returns 0 on success, -1 on error, or the result of the range
+ * unlock. */
+int tdb_allrecord_unlock(struct tdb_context *tdb, int ltype)
+{
+	bool type_ok;
+
+	/* FIXME: There are no locks on read-only dbs */
+	if (tdb->read_only) {
+		tdb->ecode = TDB_ERR_LOCK;
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_allrecord_unlock: read-only\n");
+		return -1;
+	}
+
+	if (!tdb->allrecord_lock.count) {
+		tdb->ecode = TDB_ERR_LOCK;
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_allrecord_unlock: not locked!\n");
+		return -1;
+	}
+
+	/* Upgradable locks are marked as write locks. */
+	type_ok = tdb->allrecord_lock.ltype == ltype
+		|| (tdb->allrecord_lock.off && ltype == F_RDLCK);
+	if (!type_ok) {
+		tdb->ecode = TDB_ERR_LOCK;
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_allrecord_unlock: have %s lock\n",
+			 tdb->allrecord_lock.ltype == F_RDLCK
+			 ? "read" : "write");
+		return -1;
+	}
+
+	if (tdb->allrecord_lock.count > 1) {
+		tdb->allrecord_lock.count--;
+		return 0;
+	}
+
+	/* Last hold: forget the lock, then release the bucket range. */
+	tdb->allrecord_lock.count = 0;
+	tdb->allrecord_lock.ltype = 0;
+
+	return tdb_brunlock(tdb, ltype, TDB_HASH_LOCK_START,
+			    1ULL << tdb->header.v.hash_bits);
+}
+
+/* True if this context holds an allrecord lock or any nested lock. */
+bool tdb_has_locks(struct tdb_context *tdb)
+{
+	if (tdb->allrecord_lock.count)
+		return true;
+	return tdb->num_lockrecs != 0;
+}
+
+#if 0
+/* lock entire database with write lock.
+ * Blocking (TDB_LOCK_WAIT), not upgradable; returns the result of
+ * tdb_allrecord_lock().  Currently compiled out by the enclosing #if 0. */
+int tdb_lockall(struct tdb_context *tdb)
+{
+	tdb_trace(tdb, "tdb_lockall");
+	return tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false);
+}
+
+/* lock entire database with write lock - nonblocking variant.
+ * Uses TDB_LOCK_NOWAIT; the result is traced and then returned. */
+int tdb_lockall_nonblock(struct tdb_context *tdb)
+{
+	int ret = tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_NOWAIT, false);
+	tdb_trace_ret(tdb, "tdb_lockall_nonblock", ret);
+	return ret;
+}
+
+/* unlock entire database with write lock.
+ * Returns the result of tdb_allrecord_unlock(tdb, F_WRLCK). */
+int tdb_unlockall(struct tdb_context *tdb)
+{
+	tdb_trace(tdb, "tdb_unlockall");
+	return tdb_allrecord_unlock(tdb, F_WRLCK);
+}
+
+/* lock entire database with read lock.
+ * Blocking (TDB_LOCK_WAIT), not upgradable; returns the result of
+ * tdb_allrecord_lock(). */
+int tdb_lockall_read(struct tdb_context *tdb)
+{
+	tdb_trace(tdb, "tdb_lockall_read");
+	return tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
+}
+
+/* lock entire database with read lock - nonblocking variant.
+ * Uses TDB_LOCK_NOWAIT; the result is traced and then returned. */
+int tdb_lockall_read_nonblock(struct tdb_context *tdb)
+{
+	int ret = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_NOWAIT, false);
+	tdb_trace_ret(tdb, "tdb_lockall_read_nonblock", ret);
+	return ret;
+}
+
+/* unlock entire database with read lock.
+ * Returns the result of tdb_allrecord_unlock(tdb, F_RDLCK). */
+int tdb_unlockall_read(struct tdb_context *tdb)
+{
+	tdb_trace(tdb, "tdb_unlockall_read");
+	return tdb_allrecord_unlock(tdb, F_RDLCK);
+}
+#endif
+
+/*
+ * Lock hash chain number 'list' with the given lock type.
+ *
+ * With no allrecord lock held, this nests a lock at
+ * TDB_HASH_LOCK_START + list.  With one held, the request succeeds
+ * immediately if it is a read lock or matches the held type; any other
+ * request fails with TDB_ERR_LOCK.
+ */
+int tdb_lock_list(struct tdb_context *tdb, tdb_off_t list,
+		  int ltype, enum tdb_lock_flags waitflag)
+{
+	if (tdb->allrecord_lock.count == 0) {
+		/* FIXME: Should we do header_uptodate and return retry here? */
+		return tdb_nest_lock(tdb, TDB_HASH_LOCK_START + list,
+				     ltype, waitflag);
+	}
+
+	/* an allrecord lock allows us to avoid per chain locks */
+	if (ltype == F_RDLCK || ltype == tdb->allrecord_lock.ltype)
+		return 0;
+
+	/* Holding the allrecord lock, but not one strong enough. */
+	tdb->ecode = TDB_ERR_LOCK;
+	tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+		 "tdb_lock_list: have %s allrecordlock\n",
+		 tdb->allrecord_lock.ltype == F_RDLCK
+		 ? "read" : "write");
+	return -1;
+}
+
+/*
+ * Undo tdb_lock_list() for hash chain 'list'.
+ *
+ * Under an allrecord lock nothing is actually released; the only check
+ * is that a write unlock is not requested while a read allrecord lock
+ * is held (TDB_ERR_LOCK, -1).  Otherwise the nested lock at
+ * TDB_HASH_LOCK_START + list is dropped.
+ */
+int tdb_unlock_list(struct tdb_context *tdb, tdb_off_t list, int ltype)
+{
+	if (!tdb->allrecord_lock.count)
+		return tdb_nest_unlock(tdb, TDB_HASH_LOCK_START + list, ltype);
+
+	/* an allrecord lock allows us to avoid per chain locks */
+	if (ltype == F_WRLCK && tdb->allrecord_lock.ltype == F_RDLCK) {
+		tdb->ecode = TDB_ERR_LOCK;
+		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+			 "tdb_unlock_list RO allrecord!\n");
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * Write-lock free list number 'flist'.
+ *
+ * Free list locks come after hash locks: the lock offset is
+ * TDB_HASH_LOCK_START + 2^hash_bits + flist.  Some lock must already
+ * be held.  Under a write allrecord lock this is a no-op; under a read
+ * allrecord lock it fails with TDB_ERR_LOCK.
+ */
+int tdb_lock_free_list(struct tdb_context *tdb, tdb_off_t flist,
+		       enum tdb_lock_flags waitflag)
+{
+	tdb_off_t lock_off;
+
+	/* You're supposed to have a hash lock first! */
+	if (!tdb_has_locks(tdb)) {
+		tdb->ecode = TDB_ERR_LOCK;
+		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+			 "tdb_lock_free_list without lock!\n");
+		return -1;
+	}
+
+	if (!tdb->allrecord_lock.count) {
+		lock_off = TDB_HASH_LOCK_START
+			+ (1ULL << tdb->header.v.hash_bits)
+			+ flist;
+		return tdb_nest_lock(tdb, lock_off, F_WRLCK, waitflag);
+	}
+
+	/* an allrecord lock allows us to avoid per chain locks */
+	if (tdb->allrecord_lock.ltype != F_WRLCK) {
+		tdb->ecode = TDB_ERR_LOCK;
+		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+			 "tdb_lock_free_list with RO allrecordlock!\n");
+		return -1;
+	}
+	return 0;
+}
+
+/* Release the lock taken by tdb_lock_free_list() for 'flist'.  Does
+ * nothing while an allrecord lock is held, since no per-list lock was
+ * taken in that case. */
+void tdb_unlock_free_list(struct tdb_context *tdb, tdb_off_t flist)
+{
+	if (!tdb->allrecord_lock.count) {
+		tdb_off_t lock_off = TDB_HASH_LOCK_START
+			+ (1ULL << tdb->header.v.hash_bits)
+			+ flist;
+
+		tdb_nest_unlock(tdb, lock_off, F_WRLCK);
+	}
+}
+
+#if 0
+/* Lock the hash chain that 'key' hashes to, retrying if the header
+ * changed underneath us.  'func' is only used as the trace label.
+ * Returns the result of the last tdb_lock_list() call.
+ * (Compiled out by the enclosing #if 0.) */
+static int chainlock_loop(struct tdb_context *tdb, const TDB_DATA *key,
+			  int ltype, enum tdb_lock_flags waitflag,
+			  const char *func)
+{
+	int ret;
+	uint64_t h = tdb_hash(tdb, key->dptr, key->dsize);
+
+again:
+	ret = tdb_lock_list(tdb,
+			    h & ((1ULL << tdb->header.v.hash_bits) - 1),
+			    ltype, waitflag);
+	/* update_header() returning true means hash_bits may have changed,
+	 * so the chain index has to be recomputed: unlock and retry. */
+	if (likely(ret == 0) && unlikely(update_header(tdb))) {
+		tdb_unlock_list(tdb, h & ((1ULL << tdb->header.v.hash_bits)-1),
+				ltype);
+		goto again;
+	}
+
+	tdb_trace_1rec(tdb, func, *key);
+	return ret;
+}
+
+/* lock/unlock one hash chain. This is meant to be used to reduce
+   contention - it cannot guarantee how many records will be locked.
+   Blocking write lock via chainlock_loop(). */
+int tdb_chainlock(struct tdb_context *tdb, TDB_DATA key)
+{
+	return chainlock_loop(tdb, &key, F_WRLCK, TDB_LOCK_WAIT,
+			      "tdb_chainlock");
+}
+
+/* lock/unlock one hash chain, non-blocking. This is meant to be used
+   to reduce contention - it cannot guarantee how many records will be
+   locked.  Write lock requested with TDB_LOCK_NOWAIT. */
+int tdb_chainlock_nonblock(struct tdb_context *tdb, TDB_DATA key)
+{
+	return chainlock_loop(tdb, &key, F_WRLCK, TDB_LOCK_NOWAIT,
+			      "tdb_chainlock_nonblock");
+}
+
+/* Release the write lock on the hash chain that 'key' hashes to, using
+ * the currently cached hash_bits to pick the chain. */
+int tdb_chainunlock(struct tdb_context *tdb, TDB_DATA key)
+{
+	uint64_t h = tdb_hash(tdb, key.dptr, key.dsize);
+	tdb_trace_1rec(tdb, "tdb_chainunlock", key);
+	return tdb_unlock_list(tdb, h & ((1ULL << tdb->header.v.hash_bits)-1),
+			       F_WRLCK);
+}
+
+/* Blocking read lock on the hash chain that 'key' hashes to. */
+int tdb_chainlock_read(struct tdb_context *tdb, TDB_DATA key)
+{
+	return chainlock_loop(tdb, &key, F_RDLCK, TDB_LOCK_WAIT,
+			      "tdb_chainlock_read");
+}
+
+/* Release the read lock on the hash chain that 'key' hashes to, using
+ * the currently cached hash_bits to pick the chain. */
+int tdb_chainunlock_read(struct tdb_context *tdb, TDB_DATA key)
+{
+	uint64_t h = tdb_hash(tdb, key.dptr, key.dsize);
+	tdb_trace_1rec(tdb, "tdb_chainunlock_read", key);
+	return tdb_unlock_list(tdb, h & ((1ULL << tdb->header.v.hash_bits)-1),
+			       F_RDLCK);
+}
+
+/* record lock stops delete underneath.
+ * Takes a blocking one-byte read lock at 'off'.  Returns 0 without
+ * locking when an allrecord lock is held or when off is 0. */
+int tdb_lock_record(struct tdb_context *tdb, tdb_off_t off)
+{
+	if (tdb->allrecord_lock.count) {
+		return 0;
+	}
+	return off ? tdb_brlock(tdb, F_RDLCK, off, 1, TDB_LOCK_WAIT) : 0;
+}
+
+/*
+  Write locks override our own fcntl readlocks, so check it here.
+  Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
+  an error to fail to get the lock here.
+
+  Returns -1 if any entry on tdb->travlocks already refers to 'off', or
+  if only a read allrecord lock is held; 0 under a write allrecord
+  lock; otherwise the result of a non-blocking, probing tdb_brlock().
+
+  NOTE(review): struct tdb_context as declared in private.h has no
+  travlocks member, which is presumably why this is still under #if 0.
+*/
+int tdb_write_lock_record(struct tdb_context *tdb, tdb_off_t off)
+{
+	struct tdb_traverse_lock *i;
+	for (i = &tdb->travlocks; i; i = i->next)
+		if (i->off == off)
+			return -1;
+	if (tdb->allrecord_lock.count) {
+		if (tdb->allrecord_lock.ltype == F_WRLCK) {
+			return 0;
+		}
+		return -1;
+	}
+	return tdb_brlock(tdb, F_WRLCK, off, 1, TDB_LOCK_NOWAIT|TDB_LOCK_PROBE);
+}
+
+/* Drop the one-byte write lock at 'off'.  A no-op returning 0 while an
+ * allrecord lock is held. */
+int tdb_write_unlock_record(struct tdb_context *tdb, tdb_off_t off)
+{
+	if (tdb->allrecord_lock.count) {
+		return 0;
+	}
+	return tdb_brunlock(tdb, F_WRLCK, off, 1);
+}
+
+/* fcntl locks don't stack: avoid unlocking someone else's.
+ * The read lock at 'off' is only released when exactly one entry on
+ * tdb->travlocks refers to that offset; otherwise 0 is returned and
+ * nothing is unlocked.  Also a no-op under an allrecord lock or when
+ * off is 0. */
+int tdb_unlock_record(struct tdb_context *tdb, tdb_off_t off)
+{
+	struct tdb_traverse_lock *i;
+	uint32_t count = 0;
+
+	if (tdb->allrecord_lock.count) {
+		return 0;
+	}
+
+	if (off == 0)
+		return 0;
+	/* Count how many traversals currently sit on this record. */
+	for (i = &tdb->travlocks; i; i = i->next)
+		if (i->off == off)
+			count++;
+	return (count == 1 ? tdb_brunlock(tdb, F_RDLCK, off, 1) : 0);
+}
+
+/* The transaction code uses this to remove all locks.
+ * Drops the allrecord lock (if any) and every entry in tdb->lockrecs,
+ * then resets the lock bookkeeping and marks the header stale.
+ *
+ * NOTE(review): this refers to header.v.free_zones, which is not a
+ * member of struct tdb_header_volatile as declared in private.h (that
+ * has num_zones), and unlocks byte ranges based on hash_off/free_off
+ * rather than TDB_HASH_LOCK_START as tdb_allrecord_unlock() does.
+ * Both need reconciling before this is enabled. */
+void tdb_release_transaction_locks(struct tdb_context *tdb)
+{
+	unsigned int i;
+
+	if (tdb->allrecord_lock.count != 0) {
+		tdb_off_t hash_size, free_size;
+
+		hash_size = (1ULL << tdb->header.v.hash_bits)
+			* sizeof(tdb_off_t);
+		free_size = tdb->header.v.free_zones 
+			* (tdb->header.v.free_buckets + 1) * sizeof(tdb_off_t);
+
+		tdb_brunlock(tdb, tdb->allrecord_lock.ltype,
+			     tdb->header.v.hash_off, hash_size);
+		tdb_brunlock(tdb, tdb->allrecord_lock.ltype,
+			     tdb->header.v.free_off, free_size);
+		tdb->allrecord_lock.count = 0;
+		tdb->allrecord_lock.ltype = 0;
+	}
+
+	/* Release each individually recorded lock, one byte apiece. */
+	for (i = 0; i<tdb->num_lockrecs; i++) {
+		struct tdb_lock_type *lck = &tdb->lockrecs[i];
+
+		tdb_brunlock(tdb, lck->ltype, lck->off, 1);
+	}
+	tdb->num_lockrecs = 0;
+	SAFE_FREE(tdb->lockrecs);
+	tdb->header_uptodate = false;
+}
+#endif

+ 456 - 0
ccan/tdb2/private.h

@@ -0,0 +1,456 @@
+#ifndef TDB_PRIVATE_H
+#define TDB_PRIVATE_H
+ /* 
+   Trivial Database 2: private types and prototypes
+   Copyright (C) Rusty Russell 2010
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#define _XOPEN_SOURCE 500
+#define _FILE_OFFSET_BITS 64
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+#include <utime.h>
+#include <unistd.h>
+#include "config.h"
+#include <ccan/tdb2/tdb2.h>
+#include <ccan/likely/likely.h>
+#ifdef HAVE_BYTESWAP_H
+#include <byteswap.h>
+#endif
+
+/* Test hook: expands to nothing unless the including file defines
+ * TEST_IT before pulling in this header. */
+#ifndef TEST_IT
+#define TEST_IT(cond)
+#endif
+
+/* #define TDB_TRACE 1 */
+
+/* Stringify the argument as written. */
+#ifndef __STRING
+#define __STRING(x)    #x
+#endif
+
+/* Stringify the argument after macro expansion (needed for __LINE__). */
+#ifndef __STRINGSTRING
+#define __STRINGSTRING(x) __STRING(x)
+#endif
+
+/* String literal of the form "file:line" for the point of use. */
+#ifndef __location__
+#define __location__ __FILE__ ":" __STRINGSTRING(__LINE__)
+#endif
+
+/* Lengths and file offsets are always 64 bit. */
+typedef uint64_t tdb_len_t;
+typedef uint64_t tdb_off_t;
+
+/* Fallback for when <stddef.h> has not been included.  The result is
+ * a size_t, as with the standard macro; casting the address to
+ * unsigned int would truncate a pointer on LP64 targets. */
+#ifndef offsetof
+#define offsetof(t,f) ((size_t)&((t *)0)->f)
+#endif
+
+/* String stored at the start of the header's magic_food field;
+ * tdb_open compares it with strcmp(). */
+#define TDB_MAGIC_FOOD "TDB file\n"
+/* Format version stored in the header; a byte-swapped match makes
+ * tdb_open turn on TDB_CONVERT. */
+#define TDB_VERSION ((uint64_t)(0x26011967 + 7))
+#define TDB_MAGIC ((uint64_t)0x1999)
+#define TDB_FREE_MAGIC (~(uint64_t)TDB_MAGIC)
+/* Constant that is hashed with the database's seed to produce the
+ * header's hash_test value. */
+#define TDB_HASH_MAGIC (0xA1ABE11A01092008ULL)
+#define TDB_RECOVERY_MAGIC (0xf53bc0e7U)
+#define TDB_RECOVERY_INVALID_MAGIC (0x0)
+#define TDB_EXTRA_HASHBITS (11) /* We steal 11 bits to stash hash info. */
+#define TDB_EXTRA_HASHBITS_NUM (3)
+
+/* Error return for functions that yield a tdb_off_t. */
+#define TDB_OFF_ERR ((tdb_off_t)-1)
+
+/* Prevent others from opening the file. */
+#define TDB_OPEN_LOCK 0
+/* Doing a transaction. */
+#define TDB_TRANSACTION_LOCK 1
+/* Hash chain locks. */
+#define TDB_HASH_LOCK_START 2
+
+/* We start with 256 hash buckets, 10 free buckets.  A 1k-sized zone. */
+#define INITIAL_HASH_BITS 8
+#define INITIAL_FREE_BUCKETS 10
+#define INITIAL_ZONE_BITS 10
+
+#if !HAVE_BSWAP_64
+/* Reverse the byte order of a 64-bit value (fallback for platforms
+ * without a native bswap_64).  Byte i of the input becomes byte 7-i of
+ * the result, so each byte moves by 56, 40, 24 or 8 bits. */
+static inline uint64_t bswap_64(uint64_t x)
+{
+	return (((x & 0x00000000000000FFULL) << 56)
+		| ((x & 0x000000000000FF00ULL) << 40)
+		| ((x & 0x0000000000FF0000ULL) << 24)
+		| ((x & 0x00000000FF000000ULL) << 8)
+		| ((x & 0x000000FF00000000ULL) >> 8)
+		| ((x & 0x0000FF0000000000ULL) >> 24)
+		| ((x & 0x00FF000000000000ULL) >> 40)
+		| ((x & 0xFF00000000000000ULL) >> 56));
+}
+#endif
+
+/* Header that precedes every in-use record.  The packed fields are
+ * decoded by the rec_*() accessors that follow this definition. */
+struct tdb_used_record {
+	/* For on-disk compatibility, we avoid bitfields:
+	   magic: 16,        (highest)
+	   key_len_bits: 5,
+           hash:11,
+	   extra_padding: 32 (lowest)
+	*/
+        uint64_t magic_and_meta;
+	/* The bottom key_len_bits*2 are key length, rest is data length. */
+        uint64_t key_and_data_len;
+};
+
+/* Number of low bits of key_and_data_len that hold the key length:
+ * twice the 5-bit field stored at bits 43..47 of magic_and_meta. */
+static inline unsigned rec_key_bits(const struct tdb_used_record *r)
+{
+	const uint64_t half_bits = (r->magic_and_meta >> 43) & 0x1F;
+
+	return half_bits * 2;
+}
+
+/* Key length: the bottom rec_key_bits() bits of key_and_data_len. */
+static inline uint64_t rec_key_length(const struct tdb_used_record *r)
+{
+	const uint64_t key_mask = (1ULL << rec_key_bits(r)) - 1;
+
+	return r->key_and_data_len & key_mask;
+}
+
+/* Data length: whatever remains of key_and_data_len above the key
+ * length bits. */
+static inline uint64_t rec_data_length(const struct tdb_used_record *r)
+{
+	const unsigned key_bits = rec_key_bits(r);
+
+	return r->key_and_data_len >> key_bits;
+}
+
+/* Extra padding: the low 32 bits of magic_and_meta. */
+static inline uint64_t rec_extra_padding(const struct tdb_used_record *r)
+{
+	const uint64_t low32 = 0xFFFFFFFFULL;
+
+	return r->magic_and_meta & low32;
+}
+
+/* The 11 stashed hash bits (bits 32..42 of magic_and_meta), moved up
+ * to the top 11 bits of the returned 64-bit value. */
+static inline uint64_t rec_hash(const struct tdb_used_record *r)
+{
+	const uint64_t stashed = (r->magic_and_meta >> 32) & 0x7FFULL;
+
+	return stashed << (64 - 11);
+}
+
+/* Magic number: the top 16 bits of magic_and_meta. */
+static inline uint16_t rec_magic(const struct tdb_used_record *r)
+{
+	const uint64_t top = r->magic_and_meta >> 48;
+
+	return (uint16_t)top;
+}
+
+/* Layout of a free record: the first two fields are the header, and
+ * next/prev occupy the start of the (otherwise unused) data area. */
+struct tdb_free_record {
+        uint64_t magic;
+        uint64_t data_len; /* Not counting these two fields. */
+	/* This is why the minimum record size is 16 bytes.  */
+	uint64_t next, prev;
+};
+
+/* These parts can change while we have db open.
+ * update_header() re-reads this whole struct from the file and uses
+ * memcmp() against the cached copy to detect a change. */
+struct tdb_header_volatile {
+	uint64_t generation; /* Makes sure it changes on every update. */
+	uint64_t hash_bits; /* Entries in hash table. */
+	uint64_t hash_off; /* Offset of hash table. */
+	uint64_t num_zones; /* How many zones in the file. */
+	uint64_t zone_bits; /* Size of zones. */
+	uint64_t free_buckets; /* How many buckets in each zone. */
+	uint64_t free_off; /* Arrays of free entries. */
+};
+
+/* this is stored at the front of every database.
+ * tdb_open validates magic_food, version and hash_test, in that order,
+ * before accepting an existing file. */
+struct tdb_header {
+	char magic_food[32]; /* for /etc/magic */
+	uint64_t version; /* version of the code */
+	uint64_t hash_test; /* result of hashing HASH_MAGIC. */
+	uint64_t hash_seed; /* "random" seed written at creation time. */
+
+	/* The part that may change while the database is open. */
+	struct tdb_header_volatile v;
+
+	tdb_off_t reserved[19];
+};
+
+/* How a lock request should behave.  The values are bit flags:
+ * TDB_LOCK_PROBE is OR'd with a wait mode by callers. */
+enum tdb_lock_flags {
+	/* WAIT == F_SETLKW, NOWAIT == F_SETLK */
+	TDB_LOCK_NOWAIT = 0,
+	TDB_LOCK_WAIT = 1,
+	/* If set, don't log an error on failure. */
+	TDB_LOCK_PROBE = 2,
+};
+
+/* Bookkeeping for one held lock.
+ * For tdb_context.allrecord_lock the 'off' field is reused as the
+ * "upgradable" flag rather than as an offset.
+ * NOTE(review): off is 32 bits, while lock offsets handed to
+ * tdb_nest_lock() are tdb_off_t (64 bits) - confirm nothing stored
+ * here can be truncated. */
+struct tdb_lock_type {
+	uint32_t off;	/* lock offset (or upgradable flag, see above) */
+	uint32_t count;	/* nesting depth */
+	uint32_t ltype;	/* F_RDLCK or F_WRLCK */
+};
+
+/* Per-open-database state.  Allocated zeroed (calloc) by tdb_open. */
+struct tdb_context {
+	/* Filename of the database. */
+	const char *name;
+
+	/* Mmap (if any), or malloc (for TDB_INTERNAL). */
+	void *map_ptr;
+
+	 /* Open file descriptor (undefined for TDB_INTERNAL). */
+	int fd;
+
+	/* How much space has been mapped (<= current file size) */
+	tdb_len_t map_size;
+
+	/* Opened read-only? */
+	bool read_only;
+
+	/* Error code for last tdb error. */
+	enum TDB_ERROR ecode; 
+
+	/* A cached copy of the header */
+	struct tdb_header header; 
+	/* (for debugging). */
+	bool header_uptodate; 
+
+	/* the flags passed to tdb_open, for tdb_reopen. */
+	uint32_t flags;
+
+	/* Logging function */
+	tdb_logfn_t log;
+	/* Opaque pointer handed back to the logging function. */
+	void *log_priv;
+
+	/* Hash function. */
+	tdb_hashfn_t khash;
+	/* Opaque pointer handed to the hash function as its last argument. */
+	void *hash_priv;
+
+	/* What zone of the tdb to use, for spreading load. */
+	uint64_t last_zone; 
+
+	/* IO methods: changes for transactions. */
+	const struct tdb_methods *methods;
+
+	/* Lock information */
+	struct tdb_lock_type allrecord_lock;
+	uint64_t num_lockrecs;
+	struct tdb_lock_type *lockrecs;
+
+	/* Set if we are in a transaction. */
+	struct tdb_transaction *transaction;
+	
+	/* Single list of all TDBs, to avoid multiple opens. */
+	struct tdb_context *next;
+	/* Identity of the underlying file, compared by tdb_already_open. */
+	dev_t device;	
+	ino_t inode;
+};
+
+/* Table of I/O operations reached through tdb_context.methods.
+ * NOTE(review): argument meanings are not spelled out here; presumably
+ * (offset, buffer, length) for read/write - confirm against io.c. */
+struct tdb_methods {
+	int (*read)(struct tdb_context *, tdb_off_t, void *, tdb_len_t);
+	int (*write)(struct tdb_context *, tdb_off_t, const void *, tdb_len_t);
+	int (*oob)(struct tdb_context *, tdb_off_t, bool);
+	int (*expand_file)(struct tdb_context *, tdb_len_t, tdb_len_t);
+};
+
+/*
+  internal prototypes
+*/
+/* tdb.c: */
+/* Returns true if header changed.  Also returns true when the header
+ * could not be read, so that callers retry. */
+bool update_header(struct tdb_context *tdb);
+
+/* Hash random memory, using the database's hash function and seed. */
+uint64_t tdb_hash(struct tdb_context *tdb, const void *ptr, size_t len);
+
+
+/* free.c: */
+/* Result is stored in tdb->last_zone by tdb_open. */
+uint64_t random_free_zone(struct tdb_context *tdb);
+
+/* If this fails, try tdb_expand. */
+tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
+		uint64_t hash, bool growing);
+
+/* Put this record in a free list. */
+int add_free_record(struct tdb_context *tdb,
+		    tdb_off_t off, tdb_len_t len_with_header);
+
+/* Set up header for a used record (fills in *rec). */
+int set_header(struct tdb_context *tdb,
+	       struct tdb_used_record *rec,
+	       uint64_t keylen, uint64_t datalen,
+	       uint64_t actuallen, uint64_t hash);
+
+/* Used by tdb_check to verify. */
+unsigned int size_to_bucket(struct tdb_context *tdb, tdb_len_t data_len);
+tdb_off_t zone_of(struct tdb_context *tdb, tdb_off_t off);
+
+/* io.c: */
+/* Initialize tdb->methods. */
+void tdb_io_init(struct tdb_context *tdb);
+
+/* Convert endian of the buffer if required. */
+void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size);
+
+/* Unmap and try to map the tdb. */
+void tdb_munmap(struct tdb_context *tdb);
+void tdb_mmap(struct tdb_context *tdb);
+
+/* Hand data to a function, direct if possible */
+int tdb_parse_data(struct tdb_context *tdb, TDB_DATA key,
+		   tdb_off_t offset, tdb_len_t len,
+		   int (*parser)(TDB_DATA key, TDB_DATA data,
+				 void *private_data),
+		   void *private_data);
+
+/* Either make a copy into pad and return that, or return ptr into mmap.
+ * Converts endian (ie. will use pad in that case).
+ * Callers treat a NULL return as failure. */
+void *tdb_get(struct tdb_context *tdb, tdb_off_t off, void *pad, size_t len);
+
+/* Either alloc a copy, or give direct access.  Release frees or noop. */
+const void *tdb_access_read(struct tdb_context *tdb,
+			    tdb_off_t off, tdb_len_t len);
+void tdb_access_release(struct tdb_context *tdb, const void *p);
+
+/* Convenience routine to get an offset. */
+tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off);
+
+/* Write an offset at an offset. */
+int tdb_write_off(struct tdb_context *tdb, tdb_off_t off, tdb_off_t val);
+
+/* Clear an ondisk area. */
+int zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len);
+
+/* Return a non-zero offset in this array, or num. */
+tdb_off_t tdb_find_nonzero_off(struct tdb_context *tdb, tdb_off_t off,
+			       uint64_t num);
+
+/* Return a zero offset in this array, or num. */
+tdb_off_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off,
+			    uint64_t num);
+
+/* Even on files, we can get partial writes due to signals.
+ * A false return is treated as an I/O error by tdb_new_database. */
+bool tdb_pwrite_all(int fd, const void *buf, size_t len, tdb_off_t off);
+
+/* Allocate and make a copy of some offset. */
+void *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len);
+
+/* Munges record and writes it */
+int tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
+		      void *rec, size_t len);
+
+/* Reads record and converts it */
+int tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
+		     void *rec, size_t len);
+
+/* Hash on disk. */
+uint64_t hash_record(struct tdb_context *tdb, tdb_off_t off);
+
+/* lock.c: */
+/* Lock/unlock a particular hash list. */
+int tdb_lock_list(struct tdb_context *tdb, tdb_off_t list,
+		  int ltype, enum tdb_lock_flags waitflag);
+int tdb_unlock_list(struct tdb_context *tdb, tdb_off_t list, int ltype);
+
+/* Lock/unlock a particular free list (always a write lock). */
+int tdb_lock_free_list(struct tdb_context *tdb, tdb_off_t flist,
+		       enum tdb_lock_flags waitflag);
+void tdb_unlock_free_list(struct tdb_context *tdb, tdb_off_t flist);
+
+/* Do we have any locks? */
+bool tdb_has_locks(struct tdb_context *tdb);
+
+/* Lock entire database. */
+int tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
+		       enum tdb_lock_flags flags, bool upgradable);
+int tdb_allrecord_unlock(struct tdb_context *tdb, int ltype);
+
+/* Serialize db open. */
+int tdb_lock_open(struct tdb_context *tdb);
+void tdb_unlock_open(struct tdb_context *tdb);
+/* Expand the file.
+ * NOTE(review): listed under lock.c; confirm where this is defined. */
+int tdb_expand(struct tdb_context *tdb, tdb_len_t klen, tdb_len_t dlen,
+	       bool growing);
+
+#if 0
+/* Everything down to the matching #endif is compiled out.
+ * NOTE(review): several entries here conflict with the live
+ * declarations earlier in this header (tdb_munmap, tdb_expand and
+ * tdb_alloc_read have different signatures), some are listed twice
+ * (tdb_ofs_read, tdb_ofs_write), and struct tdb_record is not declared
+ * in this header.  Reconcile before enabling any of them. */
+/* Low-level locking primitives. */
+int tdb_nest_lock(struct tdb_context *tdb, tdb_off_t offset, int ltype,
+		  enum tdb_lock_flags flags);
+int tdb_nest_unlock(struct tdb_context *tdb, tdb_off_t offset, int ltype);
+
+int tdb_munmap(struct tdb_context *tdb);
+void tdb_mmap(struct tdb_context *tdb);
+int tdb_lock(struct tdb_context *tdb, int list, int ltype);
+int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype);
+bool tdb_have_locks(struct tdb_context *tdb);
+int tdb_unlock(struct tdb_context *tdb, int list, int ltype);
+int tdb_brlock(struct tdb_context *tdb,
+	       int rw_type, tdb_off_t offset, size_t len,
+	       enum tdb_lock_flags flags);
+int tdb_brunlock(struct tdb_context *tdb,
+		 int rw_type, tdb_off_t offset, size_t len);
+bool tdb_have_extra_locks(struct tdb_context *tdb);
+void tdb_release_extra_locks(struct tdb_context *tdb);
+int tdb_transaction_lock(struct tdb_context *tdb, int ltype);
+int tdb_transaction_unlock(struct tdb_context *tdb, int ltype);
+int tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
+		       enum tdb_lock_flags flags, bool upgradable);
+int tdb_allrecord_unlock(struct tdb_context *tdb, int ltype);
+int tdb_allrecord_upgrade(struct tdb_context *tdb);
+int tdb_write_lock_record(struct tdb_context *tdb, tdb_off_t off);
+int tdb_write_unlock_record(struct tdb_context *tdb, tdb_off_t off);
+int tdb_ofs_read(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d);
+int tdb_ofs_write(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d);
+int tdb_free(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec);
+tdb_off_t tdb_allocate(struct tdb_context *tdb, tdb_len_t length, struct tdb_record *rec);
+int tdb_ofs_read(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d);
+int tdb_ofs_write(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d);
+int tdb_lock_record(struct tdb_context *tdb, tdb_off_t off);
+int tdb_unlock_record(struct tdb_context *tdb, tdb_off_t off);
+bool tdb_needs_recovery(struct tdb_context *tdb);
+int tdb_rec_read(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec);
+int tdb_rec_write(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec);
+int tdb_do_delete(struct tdb_context *tdb, tdb_off_t rec_ptr, struct tdb_record *rec);
+unsigned char *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len);
+int tdb_parse_data(struct tdb_context *tdb, TDB_DATA key,
+		   tdb_off_t offset, tdb_len_t len,
+		   int (*parser)(TDB_DATA key, TDB_DATA data,
+				 void *private_data),
+		   void *private_data);
+tdb_off_t tdb_find_lock_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, int locktype,
+			   struct tdb_record *rec);
+void tdb_io_init(struct tdb_context *tdb);
+int tdb_expand(struct tdb_context *tdb, tdb_off_t size);
+int tdb_rec_free_read(struct tdb_context *tdb, tdb_off_t off,
+		      struct tdb_record *rec);
+#endif
+
+/* Tracing hooks.  With TDB_TRACE defined these are real functions;
+ * otherwise every hook is a macro that expands to nothing, so its
+ * arguments are never evaluated. */
+#ifdef TDB_TRACE
+void tdb_trace(struct tdb_context *tdb, const char *op);
+void tdb_trace_seqnum(struct tdb_context *tdb, uint32_t seqnum, const char *op);
+void tdb_trace_open(struct tdb_context *tdb, const char *op,
+		    unsigned hash_size, unsigned tdb_flags, unsigned open_flags);
+void tdb_trace_ret(struct tdb_context *tdb, const char *op, int ret);
+void tdb_trace_retrec(struct tdb_context *tdb, const char *op, TDB_DATA ret);
+void tdb_trace_1rec(struct tdb_context *tdb, const char *op,
+		    TDB_DATA rec);
+void tdb_trace_1rec_ret(struct tdb_context *tdb, const char *op,
+			TDB_DATA rec, int ret);
+void tdb_trace_1rec_retrec(struct tdb_context *tdb, const char *op,
+			   TDB_DATA rec, TDB_DATA ret);
+void tdb_trace_2rec_flag_ret(struct tdb_context *tdb, const char *op,
+			     TDB_DATA rec1, TDB_DATA rec2, unsigned flag,
+			     int ret);
+void tdb_trace_2rec_retrec(struct tdb_context *tdb, const char *op,
+			   TDB_DATA rec1, TDB_DATA rec2, TDB_DATA ret);
+#else
+/* No-op stubs used when tracing is disabled. */
+#define tdb_trace(tdb, op)
+#define tdb_trace_seqnum(tdb, seqnum, op)
+#define tdb_trace_open(tdb, op, hash_size, tdb_flags, open_flags)
+#define tdb_trace_ret(tdb, op, ret)
+#define tdb_trace_retrec(tdb, op, ret)
+#define tdb_trace_1rec(tdb, op, rec)
+#define tdb_trace_1rec_ret(tdb, op, rec, ret)
+#define tdb_trace_1rec_retrec(tdb, op, rec, ret)
+#define tdb_trace_2rec_flag_ret(tdb, op, rec1, rec2, flag, ret)
+#define tdb_trace_2rec_retrec(tdb, op, rec1, rec2, ret)
+#endif /* !TDB_TRACE */
+
+#endif

+ 875 - 0
ccan/tdb2/tdb.c

@@ -0,0 +1,875 @@
+#include "private.h"
+#include <ccan/tdb2/tdb2.h>
+#include <ccan/hash/hash.h>
+#include <ccan/likely/likely.h>
+#include <assert.h>
+
+/* The null return: a tdb_data with a NULL pointer and zero size. */
+struct tdb_data tdb_null = { .dptr = NULL, .dsize = 0 };
+
+/* all contexts, to ensure no double-opens (fcntl locks don't nest!)
+ * Singly linked through tdb_context.next; tdb_open pushes each new
+ * context on the front. */
+static struct tdb_context *tdbs = NULL;
+
+/* Default logger installed by tdb_open: discards every message. */
+PRINTF_ATTRIBUTE(4, 5) static void
+null_log_fn(struct tdb_context *tdb,
+	    enum tdb_debug_level level, void *priv,
+	    const char *fmt, ...)
+{
+}
+
+/*
+ * Refresh the cached copy of the header's volatile area.
+ *
+ * The cached copy is usually right, but it has to be re-checked once a
+ * lock has been taken.  Returns true if the cached copy was replaced,
+ * and also if the header could not be read (so that the caller
+ * retries); returns false if nothing changed.
+ */
+bool update_header(struct tdb_context *tdb)
+{
+	struct tdb_header_volatile scratch;
+	struct tdb_header_volatile *fresh;
+
+	if (tdb->header_uptodate)
+		tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv,
+			 "warning: header uptodate already\n");
+
+	/* We could get a partial update if we're not holding any locks. */
+	assert(tdb_has_locks(tdb));
+
+	fresh = tdb_get(tdb, offsetof(struct tdb_header, v),
+			&scratch, sizeof(*fresh));
+	/* On failure, imply we updated header so they retry. */
+	if (fresh == NULL)
+		return true;
+
+	tdb->header_uptodate = true;
+	if (!likely(memcmp(&tdb->header.v, fresh, sizeof(*fresh)) == 0)) {
+		tdb->header.v = *fresh;
+		return true;
+	}
+	return false;
+}
+
+/* Default hash function installed by tdb_open: forwards to
+ * hash64_any().  'arg' is unused. */
+static uint64_t jenkins_hash(const void *key, size_t length, uint64_t seed,
+			     void *arg)
+{
+	return hash64_any(key, length, seed);
+}
+
+/* Hash 'len' bytes at 'ptr' with this database's hash function, using
+ * the seed from the cached header and the user's hash_priv pointer. */
+uint64_t tdb_hash(struct tdb_context *tdb, const void *ptr, size_t len)
+{
+	return tdb->khash(ptr, len, tdb->header.hash_seed, tdb->hash_priv);
+}
+
+/* Is a database with this device/inode pair already on the list of
+ * open contexts? */
+static bool tdb_already_open(dev_t device, ino_t ino)
+{
+	struct tdb_context *cur = tdbs;
+
+	while (cur) {
+		if (cur->device == device && cur->inode == ino)
+			return true;
+		cur = cur->next;
+	}
+	return false;
+}
+
+/*
+ * Produce a 64-bit value to seed the hash function of a new database.
+ *
+ * Sources are tried in order: /dev/urandom, then /dev/egd-pool, then a
+ * mix of pid and time of day.  Whatever bytes an earlier source only
+ * partly delivered are kept and combined with the fallback instead of
+ * being thrown away.  Never fails; the chosen source is logged at
+ * TDB_DEBUG_TRACE level.
+ */
+static uint64_t random_number(struct tdb_context *tdb)
+{
+	int fd;
+	uint64_t ret = 0;
+	struct timeval now;
+
+	fd = open("/dev/urandom", O_RDONLY);
+	if (fd >= 0) {
+		if (read(fd, &ret, sizeof(ret)) == sizeof(ret)) {
+			tdb->log(tdb, TDB_DEBUG_TRACE, tdb->log_priv,
+				 "tdb_open: random from /dev/urandom\n");
+			close(fd);
+			return ret;
+		}
+		close(fd);
+	}
+	/* FIXME: Untested!  Based on Wikipedia protocol description! */
+	fd = open("/dev/egd-pool", O_RDWR);
+	if (fd >= 0) {
+		/* Command is 1, next byte is size we want to read. */
+		char cmd[2] = { 1, sizeof(uint64_t) };
+		if (write(fd, cmd, sizeof(cmd)) == sizeof(cmd)) {
+			char reply[1 + sizeof(uint64_t)];
+			int r = read(fd, reply, sizeof(reply));
+			if (r > 1) {
+				/* The format uses %u, so hand it an
+				 * unsigned value. */
+				tdb->log(tdb, TDB_DEBUG_TRACE, tdb->log_priv,
+					 "tdb_open: %u random bytes from"
+					 " /dev/egd-pool\n",
+					 (unsigned)(r - 1));
+				/* Copy at least some bytes. */
+				memcpy(&ret, reply+1, r - 1);
+				if (reply[0] == sizeof(uint64_t)
+				    && r == sizeof(reply)) {
+					close(fd);
+					return ret;
+				}
+			}
+		}
+		close(fd);
+	}
+
+	/* Fallback: pid and time.  XOR rather than assign, so that any
+	 * partial bytes gathered above still contribute; with ret == 0
+	 * this yields exactly the plain pid/time value. */
+	gettimeofday(&now, NULL);
+	ret ^= getpid() * 100132289ULL + now.tv_sec * 1000000ULL + now.tv_usec;
+	tdb->log(tdb, TDB_DEBUG_TRACE, tdb->log_priv,
+		 "tdb_open: random from getpid and time\n");
+	return ret;
+}
+
+/* Image of a freshly created database, built in memory by
+ * tdb_new_database: the header, then the hash table and the free
+ * array, each preceded by a used-record header. */
+struct new_database {
+	struct tdb_header hdr;
+	struct tdb_used_record hrec;
+	tdb_off_t hash[1ULL << INITIAL_HASH_BITS];
+	struct tdb_used_record frec;
+	tdb_off_t free[INITIAL_FREE_BUCKETS + 1]; /* One overflow bucket */
+};
+
+/*
+ * initialise a new database
+ *
+ * Builds the initial image (header, empty hash table, empty free
+ * array) in memory.  For TDB_INTERNAL databases the image is copied
+ * into a malloc'ed buffer that becomes tdb->map_ptr; otherwise the
+ * file is truncated and the image written at offset 0.  tdb->header is
+ * set up in both cases.
+ *
+ * Returns 0 on success, -1 on failure with tdb->ecode set
+ * (TDB_ERR_OOM or TDB_ERR_IO).
+ */
+static int tdb_new_database(struct tdb_context *tdb)
+{
+	/* We make it up in memory, then write it out if not internal */
+	struct new_database newdb;
+
+	/* Start from all zeroes: the reserved header words, the magic
+	 * food buffer and any padding would otherwise carry stack
+	 * contents into the file or the in-memory map.  This also
+	 * leaves the hash and free arrays empty. */
+	memset(&newdb, 0, sizeof(newdb));
+
+	/* Fill in the header */
+	newdb.hdr.version = TDB_VERSION;
+	newdb.hdr.hash_seed = random_number(tdb);
+	newdb.hdr.hash_test = TDB_HASH_MAGIC;
+	newdb.hdr.hash_test = tdb->khash(&newdb.hdr.hash_test,
+					 sizeof(newdb.hdr.hash_test),
+					 newdb.hdr.hash_seed,
+					 tdb->hash_priv);
+
+	newdb.hdr.v.generation = 0;
+
+	/* Free array has 1 zone, 10 buckets.  All buckets empty. */
+	newdb.hdr.v.num_zones = 1;
+	newdb.hdr.v.zone_bits = INITIAL_ZONE_BITS;
+	newdb.hdr.v.free_buckets = INITIAL_FREE_BUCKETS;
+	newdb.hdr.v.free_off = offsetof(struct new_database, free);
+	set_header(tdb, &newdb.frec, 0,
+		   sizeof(newdb.free), sizeof(newdb.free), 0);
+
+	/* Initial hashes are empty. */
+	newdb.hdr.v.hash_bits = INITIAL_HASH_BITS;
+	newdb.hdr.v.hash_off = offsetof(struct new_database, hash);
+	set_header(tdb, &newdb.hrec, 0,
+		   sizeof(newdb.hash), sizeof(newdb.hash), 0);
+
+	if (tdb->flags & TDB_INTERNAL) {
+		tdb->map_size = sizeof(newdb);
+		tdb->map_ptr = malloc(tdb->map_size);
+		if (!tdb->map_ptr) {
+			tdb->ecode = TDB_ERR_OOM;
+			return -1;
+		}
+		memcpy(tdb->map_ptr, &newdb, tdb->map_size);
+		tdb->header = newdb.hdr;
+		/* Convert the `ondisk' version if asked. */
+		tdb_convert(tdb, tdb->map_ptr, sizeof(newdb));
+		return 0;
+	}
+	if (lseek(tdb->fd, 0, SEEK_SET) == -1) {
+		tdb->ecode = TDB_ERR_IO;
+		return -1;
+	}
+
+	if (ftruncate(tdb->fd, 0) == -1) {
+		tdb->ecode = TDB_ERR_IO;
+		return -1;
+	}
+
+	/* This creates an endian-converted header, as if read from disk */
+	tdb->header = newdb.hdr;
+	tdb_convert(tdb, &tdb->header, sizeof(tdb->header));
+
+	/* Don't endian-convert the magic food!  (The buffer is already
+	 * zero-filled, so only the string needs copying in.) */
+	strcpy(newdb.hdr.magic_food, TDB_MAGIC_FOOD);
+
+	if (!tdb_pwrite_all(tdb->fd, &newdb, sizeof(newdb), 0)) {
+		tdb->ecode = TDB_ERR_IO;
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * Open (and, with O_CREAT, create if necessary) a database.
+ *
+ * name:       path of the database file (only used for log messages
+ *             when TDB_INTERNAL is set).
+ * tdb_flags:  TDB_* flags; TDB_INTERNAL gives a memory-only database.
+ * open_flags: open(2) flags; O_WRONLY is rejected, O_RDONLY makes the
+ *             context read-only and disables locking.
+ * mode:       permission bits handed to open(2).
+ * attr:       must be NULL; attributes are not supported yet.
+ *
+ * Returns the new context, or NULL with errno set on failure.
+ */
+struct tdb_context *tdb_open(const char *name, int tdb_flags,
+			     int open_flags, mode_t mode,
+			     union tdb_attribute *attr)
+{
+	struct tdb_context *tdb;
+	struct stat st;
+	int save_errno;
+	uint64_t hash_test;
+	unsigned v;
+
+	if (!(tdb = (struct tdb_context *)calloc(1, sizeof *tdb))) {
+		/* Can't log this */
+		errno = ENOMEM;
+		goto fail;
+	}
+	tdb->fd = -1;
+	tdb->name = NULL;
+	tdb->map_ptr = NULL;
+	tdb->flags = tdb_flags;
+	tdb->log = null_log_fn;
+	tdb->log_priv = NULL;
+	tdb->khash = jenkins_hash;
+	tdb->hash_priv = NULL;
+
+	/* Set up tdb->methods straight away: TDB_INTERNAL databases jump
+	 * to "internal" below and would otherwise be returned with a
+	 * NULL method table. */
+	tdb_io_init(tdb);
+
+	/* FIXME */
+	if (attr) {
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_open: attributes not yet supported\n");
+		errno = EINVAL;
+		goto fail;
+	}
+
+	if ((open_flags & O_ACCMODE) == O_WRONLY) {
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_open: can't open tdb %s write-only\n", name);
+		errno = EINVAL;
+		goto fail;
+	}
+
+	if ((open_flags & O_ACCMODE) == O_RDONLY) {
+		tdb->read_only = 1;
+		/* read only databases don't do locking */
+		tdb->flags |= TDB_NOLOCK;
+	}
+
+	/* internal databases don't mmap or lock */
+	if (tdb->flags & TDB_INTERNAL) {
+		tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
+		if (tdb_new_database(tdb) != 0) {
+			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+				 "tdb_open: tdb_new_database failed!");
+			goto fail;
+		}
+		TEST_IT(tdb->flags & TDB_CONVERT);
+		goto internal;
+	}
+
+	if ((tdb->fd = open(name, open_flags, mode)) == -1) {
+		tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv,
+			 "tdb_open: could not open file %s: %s\n",
+			 name, strerror(errno));
+		goto fail;	/* errno set by open(2) */
+	}
+
+	/* on exec, don't inherit the fd */
+	v = fcntl(tdb->fd, F_GETFD, 0);
+        fcntl(tdb->fd, F_SETFD, v | FD_CLOEXEC);
+
+	/* ensure there is only one process initialising at once */
+	if (tdb_lock_open(tdb) == -1) {
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_open: failed to get open lock on %s: %s\n",
+			 name, strerror(errno));
+		goto fail;	/* errno set by tdb_brlock */
+	}
+
+	/* A short read or a bad magic string means "not a tdb yet":
+	 * create one if O_CREAT was given, otherwise fail. */
+	errno = 0;
+	if (read(tdb->fd, &tdb->header, sizeof(tdb->header)) != sizeof(tdb->header)
+	    || strcmp(tdb->header.magic_food, TDB_MAGIC_FOOD) != 0) {
+		if (!(open_flags & O_CREAT) || tdb_new_database(tdb) == -1) {
+			if (errno == 0) {
+				errno = EIO; /* ie bad format or something */
+			}
+			goto fail;
+		}
+	} else if (tdb->header.version != TDB_VERSION) {
+		/* A byte-swapped version means the file was written with
+		 * the other endianness. */
+		if (tdb->header.version == bswap_64(TDB_VERSION))
+			tdb->flags |= TDB_CONVERT;
+		else {
+			/* wrong version */
+			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+				 "tdb_open: %s is unknown version 0x%llx\n",
+				 name, (long long)tdb->header.version);
+			errno = EIO;
+			goto fail;
+		}
+	}
+
+	/* Check the file was created with the same hash function. */
+	tdb_convert(tdb, &tdb->header, sizeof(tdb->header));
+	hash_test = TDB_HASH_MAGIC;
+	hash_test = tdb->khash(&hash_test, sizeof(hash_test),
+			       tdb->header.hash_seed, tdb->hash_priv);
+	if (tdb->header.hash_test != hash_test) {
+		/* wrong hash variant */
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_open: %s uses a different hash function\n",
+			 name);
+		errno = EIO;
+		goto fail;
+	}
+
+	if (fstat(tdb->fd, &st) == -1)
+		goto fail;
+
+	/* Is it already in the open list?  If so, fail. */
+	if (tdb_already_open(st.st_dev, st.st_ino)) {
+		/* FIXME */
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_open: %s (%d,%d) is already open in this process\n",
+			 name, (int)st.st_dev, (int)st.st_ino);
+		errno = EBUSY;
+		goto fail;
+	}
+
+	tdb->name = strdup(name);
+	if (!tdb->name) {
+		errno = ENOMEM;
+		goto fail;
+	}
+
+	tdb->map_size = st.st_size;
+	tdb->device = st.st_dev;
+	tdb->inode = st.st_ino;
+	tdb_mmap(tdb);
+
+ internal:
+	/* Internal (memory-only) databases skip all the code above to
+	 * do with disk files, and resume here by releasing their
+	 * open lock and hooking into the active list. */
+	tdb_unlock_open(tdb);
+	tdb->last_zone = random_free_zone(tdb);
+	tdb->next = tdbs;
+	tdbs = tdb;
+	return tdb;
+
+ fail:
+	/* The cleanup calls below may change errno; keep the original. */
+	save_errno = errno;
+
+	if (!tdb)
+		return NULL;
+
+#ifdef TDB_TRACE
+	close(tdb->tracefd);
+#endif
+	if (tdb->map_ptr) {
+		if (tdb->flags & TDB_INTERNAL) {
+			free(tdb->map_ptr);
+		} else
+			tdb_munmap(tdb);
+	}
+	free((char *)tdb->name);
+	if (tdb->fd != -1)
+		if (close(tdb->fd) != 0)
+			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+				 "tdb_open: failed to close tdb->fd"
+				 " on error!\n");
+	free(tdb);
+	errno = save_errno;
+	return NULL;
+}
+
+/* Callback handed to tdb_parse_data(): yields 1 when the first data.dsize
+ * bytes at key.dptr and data.dptr are identical, 0 otherwise.
+ * private_data is not used. */
+static int tdb_key_compare(TDB_DATA key, TDB_DATA data, void *private_data)
+{
+	if (memcmp(data.dptr, key.dptr, data.dsize) != 0)
+		return 0;
+	return 1;
+}
+
+/* Release the hash-bucket locks for buckets start..end inclusive.
+ *
+ * Locks are acquired walking forwards through the table (wrapping at the
+ * end), so they are released the same way.  The original stepped with
+ * "+ (size - 1)", i.e. backwards, and tested for termination only after
+ * stepping, so a single-bucket range (start == end) walked the whole
+ * table.  The end bucket is compared modulo the table size so that the
+ * walk always terminates, even if a caller passes an unwrapped index. */
+static void unlock_lists(struct tdb_context *tdb,
+			 uint64_t start, uint64_t end, int ltype)
+{
+	const uint64_t mask = (1ULL << tdb->header.v.hash_bits) - 1;
+
+	end &= mask;
+	for (;;) {
+		tdb_unlock_list(tdb, start, ltype);
+		/* Test before stepping: the end bucket itself is released. */
+		if ((start & mask) == end)
+			return;
+		start = (start + 1) & mask;
+	}
+}
+
+/* FIXME: Return header copy? */
+/* Find the record for key (whose hash is hash) by linear probing from its
+ * home bucket, locking each bucket visited with ltype.
+ *
+ * Returns TDB_OFF_ERR on error (every lock taken here has been released),
+ * 0 if the key is not present, otherwise the offset of the matching record
+ * with *room set to its data length plus padding.
+ *
+ * On a non-error return the hash entries from *start to *end inclusive are
+ * still locked; *end is the bucket where the entry (or the empty bucket
+ * that ended the search) was found.  The caller releases them with
+ * unlock_lists(). */
+static tdb_off_t find_bucket_and_lock(struct tdb_context *tdb,
+				      const struct tdb_data *key,
+				      uint64_t hash,
+				      uint64_t *start,
+				      uint64_t *end,
+				      uint64_t *room,
+				      int ltype)
+{
+	uint64_t hextra;
+	tdb_off_t off;
+
+	/* hash_bits might be out of date... */
+again:
+	*start = *end = hash & ((1ULL << tdb->header.v.hash_bits) - 1);
+	hextra = hash >> tdb->header.v.hash_bits;
+
+	/* FIXME: can we avoid locks for some fast paths? */
+	if (tdb_lock_list(tdb, *end, ltype, TDB_LOCK_WAIT) == -1)
+		return TDB_OFF_ERR;
+
+	/* We only need to check this for first lock. */
+	if (unlikely(update_header(tdb))) {
+		/* The header changed under us: the bucket number computed
+		 * above may be stale, so start over. */
+		tdb_unlock_list(tdb, *end, ltype);
+		goto again;
+	}
+
+	while ((off = tdb_read_off(tdb, tdb->header.v.hash_off
+				   + *end * sizeof(tdb_off_t)))
+	       != TDB_OFF_ERR) {
+		struct tdb_used_record pad, *r;
+		uint64_t keylen, next;
+
+		/* Didn't find it? */
+		if (!off)
+			return 0;
+
+#if 0 /* FIXME: Check other bits. */
+		unsigned int bits, bitmask, hoffextra;
+		/* Bottom three bits show how many extra hash bits. */
+		bits = (off & ((1 << TDB_EXTRA_HASHBITS_NUM) - 1)) + 1;
+		bitmask = (1 << bits)-1;
+		hoffextra = ((off >> TDB_EXTRA_HASHBITS_NUM) & bitmask);
+		if ((hextra & bitmask) != hoffextra) 
+			goto lock_next;
+#endif
+
+		r = tdb_get(tdb, off, &pad, sizeof(*r));
+		if (!r)
+			goto unlock_err;
+
+		if (rec_magic(r) != TDB_MAGIC) {
+			tdb->ecode = TDB_ERR_CORRUPT;
+			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+				 "find_bucket_and_lock: bad magic 0x%llx"
+				 " at offset %llu!\n",
+				 (long long)rec_magic(r), (long long)off);
+			goto unlock_err;
+		}
+
+		/* FIXME: check extra bits in header! */
+		keylen = rec_key_length(r);
+		if (keylen != key->dsize)
+			goto lock_next;
+
+		switch (tdb_parse_data(tdb, *key, off + sizeof(*r), key->dsize,
+				       tdb_key_compare, NULL)) {
+		case 1:
+			/* Match! */
+			*room = rec_data_length(r) + rec_extra_padding(r);
+			/* Bucket values are stored and read above as plain
+			 * record offsets (no extra hash bits are encoded
+			 * yet), so hand back the offset unshifted. */
+			return off;
+		case 0:
+			break;
+		default:
+			goto unlock_err;
+		}
+
+	lock_next:
+		/* Lock next bucket. */
+		/* FIXME: We can deadlock if this wraps! */
+		next = (*end + 1) & ((1ULL << tdb->header.v.hash_bits) - 1);
+		if (next == *start) {
+			tdb->ecode = TDB_ERR_CORRUPT;
+			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+				 "find_bucket_and_lock: full hash table!\n");
+			goto unlock_err;
+		}
+		if (tdb_lock_list(tdb, next, ltype, TDB_LOCK_WAIT) == -1)
+			goto unlock_err;
+		*end = next;
+	}
+
+unlock_err:
+	TEST_IT(*end < *start);
+	unlock_lists(tdb, *start, *end, ltype);
+	return TDB_OFF_ERR;
+}
+
+/* Rewrite the used-record header at off for a record that keeps its
+ * existing space but now holds datalen bytes of data.
+ *
+ * room is the space available after the key (old data length plus old
+ * padding).  set_header() takes the record's total length (key + data +
+ * padding) as its fifth argument, so that is keylen + room; the original
+ * passed room - datalen, which is not a total length.
+ *
+ * Returns -1 if the header cannot be encoded, otherwise the result of
+ * tdb_write_convert(). */
+static int update_rec_hdr(struct tdb_context *tdb,
+			  tdb_off_t off,
+			  tdb_len_t keylen,
+			  tdb_len_t datalen,
+			  tdb_len_t room,
+			  uint64_t h)
+{
+	struct tdb_used_record rec;
+
+	if (set_header(tdb, &rec, keylen, datalen, keylen + room, h))
+		return -1;
+
+	return tdb_write_convert(tdb, off, &rec, sizeof(rec));
+}
+
+/* Double the hash table: allocate a table twice the size, re-insert every
+ * occupied bucket according to its record's hash, free the old table and
+ * publish the new one in the header.  The all-record write lock is held
+ * throughout.
+ *
+ * If we fail, others will try after us.
+ *
+ * NOTE(review): the new table is not zeroed here and colliding entries
+ * simply overwrite each other; also hash_off is used both as the start of
+ * the bucket array and as the offset of a record header below.  Confirm
+ * against alloc() and tdb_new_database(). */
+static void enlarge_hash(struct tdb_context *tdb)
+{
+	tdb_off_t newoff, i;
+	uint64_t h, num = 1ULL << tdb->header.v.hash_bits;
+	/* Buckets are tdb_off_t wide, so the new table needs this many
+	 * bytes (the original asked for only num * 2 bytes). */
+	tdb_len_t hlen = num * 2 * sizeof(tdb_off_t);
+	struct tdb_used_record pad, *r;
+
+	/* FIXME: We should do this without holding locks throughout. */
+	if (tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false) == -1)
+		return;
+
+	if (unlikely(update_header(tdb))) {
+		/* Someone else enlarged for us?  Nothing to do. */
+		if ((1ULL << tdb->header.v.hash_bits) != num)
+			goto unlock;
+	}
+
+	newoff = alloc(tdb, 0, hlen, 0, false);
+	if (unlikely(newoff == TDB_OFF_ERR))
+		goto unlock;
+	if (unlikely(newoff == 0)) {
+		/* No free space: grow the file once and retry. */
+		if (tdb_expand(tdb, 0, hlen, false) == -1)
+			goto unlock;
+		newoff = alloc(tdb, 0, hlen, 0, false);
+		if (newoff == TDB_OFF_ERR || newoff == 0)
+			goto unlock;
+	}
+
+	/* FIXME: If the space before is empty, we know this is in its ideal
+	 * location.  We can steal a bit from the pointer to avoid rehash. */
+	i = tdb_find_nonzero_off(tdb, tdb->header.v.hash_off, num);
+	while (i < num) {
+		tdb_off_t off;
+		off = tdb_read_off(tdb, tdb->header.v.hash_off
+				   + i*sizeof(tdb_off_t));
+		if (unlikely(off == TDB_OFF_ERR))
+			goto unlock;
+		if (unlikely(!off)) {
+			tdb->ecode = TDB_ERR_CORRUPT;
+			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+				 "enlarge_hash: zero hash bucket!\n");
+			goto unlock;
+		}
+		h = hash_record(tdb, off);
+		/* FIXME: Encode extra hash bits! */
+		if (tdb_write_off(tdb, newoff
+				  + (h & ((num * 2) - 1)) * sizeof(uint64_t),
+				  off) == -1)
+			goto unlock;
+
+		/* Step past the bucket just copied before scanning for the
+		 * next occupied one; scanning from i itself would find the
+		 * same bucket again and never make progress. */
+		i++;
+		if (i < num)
+			i += tdb_find_nonzero_off(tdb, tdb->header.v.hash_off
+						  + i*sizeof(tdb_off_t),
+						  num - i);
+	}
+
+	/* Free up old hash. */
+	r = tdb_get(tdb, tdb->header.v.hash_off, &pad, sizeof(*r));
+	if (!r)
+		goto unlock;
+	add_free_record(tdb, tdb->header.v.hash_off,
+			rec_data_length(r) + rec_extra_padding(r));
+
+	/* Now we write the modified header. */
+	tdb->header.v.generation++;
+	tdb->header.v.hash_bits++;
+	tdb->header.v.hash_off = newoff;
+	tdb_write_convert(tdb, offsetof(struct tdb_header, v),
+			  &tdb->header.v, sizeof(tdb->header.v));
+unlock:
+	tdb_allrecord_unlock(tdb, F_WRLCK);
+}
+
+/* Store dbuf under key.
+ *
+ * flag == TDB_INSERT fails with TDB_ERR_EXISTS if the key is already
+ * present; flag == TDB_MODIFY fails with TDB_ERR_NOEXIST if it is not; any
+ * other value replaces an existing record or creates a new one.
+ *
+ * An existing record is rewritten in place when it has enough room,
+ * otherwise a new record is allocated (expanding the database and retrying
+ * if nothing is free) and the old one is handed to the free list.
+ *
+ * Returns 0 on success, -1 on failure. */
+int tdb_store(struct tdb_context *tdb,
+	      struct tdb_data key, struct tdb_data dbuf, int flag)
+{
+	tdb_off_t new_off, off, start, end, room;
+	uint64_t h;
+	bool growing = false;
+
+	h = tdb_hash(tdb, key.dptr, key.dsize);
+	off = find_bucket_and_lock(tdb, &key, h, &start, &end, &room, F_WRLCK);
+	if (off == TDB_OFF_ERR)
+		return -1;
+
+	/* Now we have lock on this hash bucket. */
+	if (flag == TDB_INSERT) {
+		if (off) {
+			tdb->ecode = TDB_ERR_EXISTS;
+			goto fail;
+		}
+	} else {
+		if (off) {
+			if (room >= key.dsize + dbuf.dsize) {
+				new_off = off;
+				if (update_rec_hdr(tdb, off,
+						   key.dsize, dbuf.dsize,
+						   room, h))
+					goto fail;
+				goto write;
+			}
+			/* FIXME: See if right record is free? */
+			/* Hint to allocator that we've realloced. */
+			growing = true;
+		} else {
+			if (flag == TDB_MODIFY) {
+				/* if the record doesn't exist and we
+				   are in TDB_MODIFY mode then we should fail
+				   the store */
+				tdb->ecode = TDB_ERR_NOEXIST;
+				goto fail;
+			}
+		}
+	}
+
+	/* Allocate a new record. */
+	new_off = alloc(tdb, key.dsize, dbuf.dsize, h, growing);
+	if (new_off == 0) {
+		unlock_lists(tdb, start, end, F_WRLCK);
+		/* Expand, then try again... */
+		if (tdb_expand(tdb, key.dsize, dbuf.dsize, growing) == -1)
+			return -1;
+		return tdb_store(tdb, key, dbuf, flag);
+	}
+	/* alloc() reports hard failures with TDB_OFF_ERR (enlarge_hash()
+	 * checks for it too); never treat that as a record offset. */
+	if (new_off == TDB_OFF_ERR)
+		goto fail;
+
+	/* We didn't like the existing one: remove it. */
+	if (off) {
+		add_free_record(tdb, off, sizeof(struct tdb_used_record)
+				+ key.dsize + room);
+	}
+
+write:
+	/* Point the bucket we stopped at to the (possibly new) record. */
+	off = tdb->header.v.hash_off + end * sizeof(tdb_off_t);
+	/* FIXME: Encode extra hash bits! */
+	if (tdb_write_off(tdb, off, new_off) == -1)
+		goto fail;
+
+	/* Key then data follow the record header. */
+	off = new_off + sizeof(struct tdb_used_record);
+	if (tdb->methods->write(tdb, off, key.dptr, key.dsize) == -1)
+		goto fail;
+	off += key.dsize;
+	if (tdb->methods->write(tdb, off, dbuf.dptr, dbuf.dsize) == -1)
+		goto fail;
+
+	/* FIXME: tdb_increment_seqnum(tdb); */
+	unlock_lists(tdb, start, end, F_WRLCK);
+
+	/* By simple trial and error, this roughly approximates a 60%
+	 * full measure. */
+	if (unlikely(end - start > 4 * tdb->header.v.hash_bits - 32))
+		enlarge_hash(tdb);
+
+	return 0;
+
+fail:
+	unlock_lists(tdb, start, end, F_WRLCK);
+	return -1;
+}
+
+/* Look up key and return a copy of its data, read with tdb_alloc_read()
+ * from just past the record header and key.
+ *
+ * Returns tdb_null if the lookup fails, if the record header cannot be
+ * read, or if the key is absent (in which case ecode is set to
+ * TDB_SUCCESS). */
+struct tdb_data tdb_fetch(struct tdb_context *tdb, struct tdb_data key)
+{
+	tdb_off_t rec_off, first, last, space;
+	struct tdb_used_record scratch, *hdr;
+	struct tdb_data result;
+	uint64_t hashval;
+
+	hashval = tdb_hash(tdb, key.dptr, key.dsize);
+	rec_off = find_bucket_and_lock(tdb, &key, hashval,
+				       &first, &last, &space, F_RDLCK);
+	if (rec_off == TDB_OFF_ERR)
+		return tdb_null;
+
+	if (rec_off == 0) {
+		/* Absent key: drop the locks, then report "no error". */
+		unlock_lists(tdb, first, last, F_RDLCK);
+		tdb->ecode = TDB_SUCCESS;
+		return tdb_null;
+	}
+
+	hdr = tdb_get(tdb, rec_off, &scratch, sizeof(*hdr));
+	if (hdr) {
+		result.dsize = rec_data_length(hdr);
+		result.dptr = tdb_alloc_read(tdb,
+					     rec_off + sizeof(*hdr) + key.dsize,
+					     result.dsize);
+	} else {
+		result = tdb_null;
+	}
+
+	unlock_lists(tdb, first, last, F_RDLCK);
+	return result;
+}
+
+/* Put off into the first empty bucket at or after the one h hashes to,
+ * wrapping round to the start of the table if the tail is full.
+ *
+ * Returns -1 if the table has no empty bucket at all, otherwise the result
+ * of tdb_write_off(). */
+static int hash_add(struct tdb_context *tdb, uint64_t h, tdb_off_t off)
+{
+	tdb_off_t i, hoff, len, num;
+
+	i = (h & ((1ULL << tdb->header.v.hash_bits) - 1));
+	hoff = tdb->header.v.hash_off + i * sizeof(tdb_off_t);
+	len = (1ULL << tdb->header.v.hash_bits) - i;
+
+	/* Look for next space. */
+	num = tdb_find_zero_off(tdb, hoff, len);
+	if (unlikely(num == len)) {
+		/* Nothing free up to the end: search again from bucket 0. */
+		hoff = tdb->header.v.hash_off;
+		len = (1ULL << tdb->header.v.hash_bits);
+		num = tdb_find_zero_off(tdb, hoff, len);
+		/* The scan result, not the home bucket index, tells us
+		 * whether the whole table is full. */
+		if (num == len)
+			return -1;
+	}
+	/* FIXME: Encode extra hash bits! */
+	return tdb_write_off(tdb, hoff + num * sizeof(tdb_off_t), off);
+}
+
+/* Clear hash bucket chain and re-place the entries that follow it (up to
+ * the next empty bucket), so that linear probing still finds them.
+ *
+ * The caller already holds the lock on chain; this function write-locks
+ * the following buckets itself and releases them before returning.
+ *
+ * Returns 0 on success and -1 on error.  Returns 1 when the run wraps past
+ * the end of the table and the buckets at the start could not be locked
+ * without blocking: *extra_locks then holds how many buckets from 0 the
+ * caller must lock (in order) before trying again. */
+static int unlink_used_record(struct tdb_context *tdb, tdb_off_t chain,
+			      uint64_t *extra_locks)
+{
+	tdb_off_t num, len, i, hoff;
+	uint64_t mask;
+
+	/* FIXME: Maybe lock more in search?  Maybe don't lock if scan
+	 * finds none? */
+again:
+	mask = (1ULL << tdb->header.v.hash_bits) - 1;
+	len = (1ULL << tdb->header.v.hash_bits) - (chain + 1);
+	hoff = tdb->header.v.hash_off + (chain + 1) * sizeof(tdb_off_t);
+	num = tdb_find_zero_off(tdb, hoff, len);
+
+	/* We want to lock the zero entry, too.  In the wrap case,
+	 * this locks one extra.  That's harmless. */
+	num++;
+
+	for (i = chain + 1; i < chain + 1 + num; i++) {
+		if (tdb_lock_list(tdb, i, F_WRLCK, TDB_LOCK_WAIT) == -1) {
+			if (i != chain + 1)
+				unlock_lists(tdb, chain + 1, i-1, F_WRLCK);
+			return -1;
+		}
+	}
+
+	/* The wrap case: we need those locks out of order! */
+	if (unlikely(num == len + 1)) {
+		*extra_locks = tdb_find_zero_off(tdb, tdb->header.v.hash_off,
+						 1ULL << tdb->header.v.hash_bits);
+		(*extra_locks)++;
+		for (i = 0; i < *extra_locks; i++) {
+			if (tdb_lock_list(tdb, i, F_WRLCK, TDB_LOCK_NOWAIT)) {
+				/* Failed.  Caller must lock in order. */
+				if (i)
+					unlock_lists(tdb, 0, i-1, F_WRLCK);
+				unlock_lists(tdb, chain + 1, chain + num,
+					     F_WRLCK);
+				return 1;
+			}
+		}
+		num += *extra_locks;
+	}
+
+	/* Now we have the locks, be certain that offset is still 0!
+	 * Wrap the bucket number first, then scale it to a byte offset
+	 * (the original masked the byte offset instead). */
+	hoff = tdb->header.v.hash_off
+		+ ((chain + num) & mask) * sizeof(tdb_off_t);
+
+	if (unlikely(tdb_read_off(tdb, hoff) != 0)) {
+		unlock_lists(tdb, chain + 1, chain + num, F_WRLCK);
+		goto again;
+	}
+
+	/* OK, all locked.  Unlink first one. */
+	hoff = tdb->header.v.hash_off + chain * sizeof(tdb_off_t);
+	if (tdb_write_off(tdb, hoff, 0) == -1)
+		goto unlock_err;
+
+	/* Rehash the rest. */
+	for (i = 1; i < num; i++) {
+		tdb_off_t off, bucket;
+		uint64_t h;
+
+		bucket = (chain + i) & mask;
+		hoff = tdb->header.v.hash_off + bucket * sizeof(tdb_off_t);
+		off = tdb_read_off(tdb, hoff);
+		if (unlikely(off == TDB_OFF_ERR))
+			goto unlock_err;
+
+		/* Maybe use a bit to indicate it is in ideal place? */
+		h = hash_record(tdb, off);
+		/* Is it happy where it is? */
+		if ((h & mask) == bucket)
+			continue;
+
+		/* Remove it. */
+		if (tdb_write_off(tdb, hoff, 0) == -1)
+			goto unlock_err;
+
+		/* Rehash it. */
+		if (hash_add(tdb, h, off) == -1)
+			goto unlock_err;
+	}
+	unlock_lists(tdb, chain + 1, chain + num, F_WRLCK);
+	return 0;
+
+unlock_err:
+	unlock_lists(tdb, chain + 1, chain + num, F_WRLCK);
+	return -1;
+}
+
+/* Remove key from the database.
+ *
+ * Returns 0 on success, -1 on failure; ecode is TDB_ERR_NOEXIST when the
+ * key is not present.
+ *
+ * NOTE(review): the record's space (off, room) is never handed to
+ * add_free_record() here, so deleted records appear to leak -- confirm. */
+int tdb_delete(struct tdb_context *tdb, struct tdb_data key)
+{
+	tdb_off_t off, start, end, room, extra_locks = 0;
+	uint64_t h;
+	int ret;
+
+	h = tdb_hash(tdb, key.dptr, key.dsize);
+	off = find_bucket_and_lock(tdb, &key, h, &start, &end, &room, F_WRLCK);
+	if (off == TDB_OFF_ERR)
+		return -1;
+
+	if (off == 0) {
+		unlock_lists(tdb, start, end, F_WRLCK);
+		tdb->ecode = TDB_ERR_NOEXIST;
+		return -1;
+	}
+
+	ret = unlink_used_record(tdb, end, &extra_locks);
+	if (unlikely(ret == 1)) {
+		unsigned int i;
+
+		unlock_lists(tdb, start, end, F_WRLCK);
+
+		/* We need extra locks at the start. */
+		for (i = 0; i < extra_locks; i++) {
+			if (tdb_lock_list(tdb, i, F_WRLCK, TDB_LOCK_WAIT)) {
+				if (i)
+					unlock_lists(tdb, 0, i-1, F_WRLCK);
+				return -1;
+			}
+		}
+		/* Try again now we're holding more locks. */
+		ret = tdb_delete(tdb, key);
+		/* Buckets 0..i-1 were locked above (i == extra_locks, which
+		 * is at least 1 whenever unlink_used_record() returns 1);
+		 * unlock_lists() takes an inclusive range. */
+		unlock_lists(tdb, 0, i - 1, F_WRLCK);
+		return ret;
+	}
+	unlock_lists(tdb, start, end, F_WRLCK);
+	return ret;
+}
+
+/* Tear down an open database: release the mapping (or the malloc'd image
+ * of an internal database), the name, the file descriptor and the lock
+ * records, drop the context from the global list and free it.
+ *
+ * Returns the result of close() on the database fd, or 0 if there was no
+ * fd to close. */
+int tdb_close(struct tdb_context *tdb)
+{
+	struct tdb_context **link = &tdbs;
+	int ret = 0;
+
+	/* FIXME: once transactions exist, cancel any open one here
+	 * (tdb_transaction_cancel) before tearing down. */
+	tdb_trace(tdb, "tdb_close");
+
+	if (tdb->map_ptr) {
+		if (!(tdb->flags & TDB_INTERNAL))
+			tdb_munmap(tdb);
+		else
+			free(tdb->map_ptr);
+	}
+	free((char *)tdb->name);
+	if (tdb->fd != -1) {
+		ret = close(tdb->fd);
+		tdb->fd = -1;
+	}
+	free(tdb->lockrecs);
+
+	/* Unhook this context from the list of open databases. */
+	while (*link) {
+		if (*link == tdb) {
+			*link = tdb->next;
+			break;
+		}
+		link = &(*link)->next;
+	}
+
+#ifdef TDB_TRACE
+	close(tdb->tracefd);
+#endif
+	free(tdb);
+
+	return ret;
+}

+ 143 - 0
ccan/tdb2/tdb2.h

@@ -0,0 +1,143 @@
+#ifndef CCAN_TDB2_H
+#define CCAN_TDB2_H
+
+/* 
+   Unix SMB/CIFS implementation.
+
+   trivial database library
+
+   Copyright (C) Andrew Tridgell 1999-2004
+   
+     ** NOTE! The following LGPL license applies to the tdb
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+   
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#ifndef _SAMBA_BUILD_
+/* For mode_t */
+#include <sys/types.h>
+/* For O_* flags.
+ * NOTE(review): POSIX declares the O_* open flags in <fcntl.h>;
+ * <sys/stat.h> supplies the S_* mode bits.  Confirm that callers get
+ * <fcntl.h> from elsewhere. */
+#include <sys/stat.h>
+/* For sig_atomic_t. */
+#include <signal.h>
+/* For uint64_t */
+#include <stdint.h>
+#endif
+
+/* flags to tdb_store() */
+#define TDB_REPLACE 1		/* Unused: any flag other than TDB_INSERT or TDB_MODIFY replaces or creates */
+#define TDB_INSERT 2 		/* Don't overwrite an existing entry */
+#define TDB_MODIFY 3		/* Don't create a new entry: fail if the key doesn't already exist */
+
+/* flags for tdb_open() */
+#define TDB_DEFAULT 0 /* just a readability place holder */
+#define TDB_CLEAR_IF_FIRST 1 /* NOTE(review): not described in this header; confirm semantics */
+#define TDB_INTERNAL 2 /* don't store on disk (the image is then released with free(), not munmap) */
+#define TDB_NOLOCK   4 /* don't do any locking */
+#define TDB_NOMMAP   8 /* don't use mmap */
+#define TDB_CONVERT 16 /* convert endian (internal use): set by tdb_open() when the header version is byte-swapped */
+#define TDB_BIGENDIAN 32 /* header is big-endian (internal use) */
+#define TDB_NOSYNC   64 /* don't use synchronous transactions */
+#define TDB_SEQNUM   128 /* maintain a sequence number */
+#define TDB_VOLATILE   256 /* Activate the per-hashchain freelist, default 5 */
+#define TDB_ALLOW_NESTING 512 /* Allow transactions to nest */
+#define TDB_DISALLOW_NESTING 1024 /* Disallow transactions to nest */
+
+/* error codes, recorded in the context's ecode field.  For example
+ * TDB_ERR_EXISTS is set by tdb_store() with TDB_INSERT on an existing
+ * key, TDB_ERR_NOEXIST by tdb_store() with TDB_MODIFY and by tdb_delete()
+ * on a missing key, and TDB_ERR_CORRUPT when a record has bad magic. */
+enum TDB_ERROR {TDB_SUCCESS=0, TDB_ERR_CORRUPT, TDB_ERR_IO, TDB_ERR_LOCK, 
+		TDB_ERR_OOM, TDB_ERR_EXISTS, TDB_ERR_NOLOCK, TDB_ERR_LOCK_TIMEOUT,
+		TDB_ERR_NOEXIST, TDB_ERR_EINVAL, TDB_ERR_RDONLY,
+		TDB_ERR_NESTING};
+
+/* debugging uses one of the following levels (passed as the second
+ * argument of the tdb_logfn_t callback) */
+enum tdb_debug_level {TDB_DEBUG_FATAL = 0, TDB_DEBUG_ERROR, 
+		      TDB_DEBUG_WARNING, TDB_DEBUG_TRACE};
+
+typedef struct tdb_data { /* a key or a value: a byte buffer and its length */
+	unsigned char *dptr; /* start of the bytes */
+	size_t dsize; /* number of bytes at dptr */
+} TDB_DATA;
+
+#ifndef PRINTF_ATTRIBUTE
+#if (__GNUC__ >= 3)
+/** Use gcc attribute to check printf fns.  a1 is the 1-based index of
+ * the parameter containing the format, and a2 the 1-based index of the
+ * first argument consumed by that format.  Note that some gcc 2.x
+ * versions don't handle this properly, hence the version test above;
+ * other compilers get an empty definition. **/
+#define PRINTF_ATTRIBUTE(a1, a2) __attribute__ ((format (__printf__, a1, a2)))
+#else
+#define PRINTF_ATTRIBUTE(a1, a2)
+#endif
+#endif
+
+struct tdb_context;
+
+/* FIXME: Make typesafe
+ * tdb_logfn_t: printf-style log callback.  It receives the context, a
+ * severity, the caller-supplied private pointer, then the format string
+ * and its arguments.
+ * tdb_hashfn_t: hashes len bytes at key using seed; priv is the
+ * caller-supplied private pointer. */
+typedef void (*tdb_logfn_t)(struct tdb_context *, enum tdb_debug_level, void *priv, const char *, ...) PRINTF_ATTRIBUTE(4, 5);
+typedef uint64_t (*tdb_hashfn_t)(const void *key, size_t len, uint64_t seed,
+				 void *priv);
+
+enum tdb_attribute_type { /* discriminator stored in tdb_attribute_base.attr */
+	TDB_ATTRIBUTE_LOG = 0,
+	TDB_ATTRIBUTE_HASH = 1
+};
+
+struct tdb_attribute_base { /* common first member of every attribute */
+	enum tdb_attribute_type attr; /* says which union member this is */
+	union tdb_attribute *next; /* next attribute in the chain; presumably NULL ends it -- TODO confirm */
+};
+
+struct tdb_attribute_log { /* supplies the log callback */
+	struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */
+	tdb_logfn_t log_fn;
+	void *log_private; /* handed to log_fn as its priv argument */
+};
+
+struct tdb_attribute_hash { /* supplies the hash function */
+	struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */
+	tdb_hashfn_t hash_fn;
+	void *hash_private; /* handed to hash_fn as its priv argument */
+};
+
+union tdb_attribute { /* one attribute; base.attr selects the valid member */
+	struct tdb_attribute_base base;
+	struct tdb_attribute_log log;
+	struct tdb_attribute_hash hash;
+};
+		
+struct tdb_context *tdb_open(const char *name, int tdb_flags,
+			     int open_flags, mode_t mode,
+			     union tdb_attribute *attributes); /* NULL with errno set on failure */
+
+struct tdb_data tdb_fetch(struct tdb_context *tdb, struct tdb_data key); /* tdb_null on error or if key is absent */
+int tdb_delete(struct tdb_context *tdb, struct tdb_data key); /* 0 on success, -1 on failure (e.g. key absent) */
+int tdb_store(struct tdb_context *tdb, struct tdb_data key, struct tdb_data dbuf, int flag); /* flag: TDB_REPLACE, TDB_INSERT or TDB_MODIFY; 0 on success, -1 on failure */
+int tdb_close(struct tdb_context *tdb); /* frees tdb; returns the close() result for the database fd */
+int tdb_check(struct tdb_context *tdb,
+	      int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
+	      void *private_data);
+
+extern struct tdb_data tdb_null; /* value returned by tdb_fetch() when there is nothing to return */
+
+#ifdef  __cplusplus
+}
+#endif
+
+#endif /* tdb2.h */

+ 40 - 0
ccan/tdb2/test/run-encode.c

@@ -0,0 +1,40 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tap/tap.h>
+
+/* Checks that set_header() can encode a range of key/data/padding lengths
+ * and that the rec_*() accessors read the same values back. */
+int main(int argc, char *argv[])
+{
+	struct tdb_context tdb = { .log = null_log_fn, .log_priv = NULL };
+	struct tdb_used_record rec;
+	unsigned int i;
+
+	plan_tests(64 + 32 + 48*6);
+
+	/* Every power-of-two data length must encode with an empty key. */
+	i = 0;
+	while (i < 64) {
+		ok1(set_header(&tdb, &rec, 0, 1ULL << i, 1ULL << i, 0) == 0);
+		i++;
+	}
+
+	/* And any key and data with < 64 bits between them.
+	 * NOTE(review): 1ULL >> (63 - i) is 0 for every i in this loop, so
+	 * the data length is always 0 here -- confirm what was intended. */
+	i = 0;
+	while (i < 32) {
+		tdb_len_t dlen = 1ULL >> (63 - i);
+		tdb_len_t klen = 1ULL << i;
+
+		ok1(set_header(&tdb, &rec, klen, dlen, klen + dlen, 0) == 0);
+		i++;
+	}
+
+	/* Round-trip: each field is capped at the width exercised below. */
+	i = 0;
+	while (i < 48) {
+		uint64_t h = 1ULL << (63 - (i < 11 ? i : 10));
+		uint64_t klen = 1ULL << (i < 16 ? i : 15);
+		uint64_t dlen = 1ULL << i;
+		uint64_t xlen = 1ULL << (i < 32 ? i : 31);
+
+		ok1(set_header(&tdb, &rec, klen, dlen, klen + dlen + xlen, h)
+		    == 0);
+		ok1(rec_key_length(&rec) == klen);
+		ok1(rec_data_length(&rec) == dlen);
+		ok1(rec_extra_padding(&rec) == xlen);
+		ok1(rec_hash(&rec) == h);
+		ok1(rec_magic(&rec) == TDB_MAGIC);
+		i++;
+	}
+	return exit_status();
+}

+ 36 - 0
ccan/tdb2/test/run-fls.c

@@ -0,0 +1,36 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tap/tap.h>
+
+/* Reference "find last set": the 1-based position of the highest set bit
+ * of num, or 0 when num is 0. */
+static unsigned int dumb_fls(uint64_t num)
+{
+	unsigned int pos = 64;
+
+	/* Walk down from the top bit until one is set (or none are left). */
+	while (pos > 0 && !(num & (1ULL << (pos - 1))))
+		pos--;
+	return pos;
+}
+
+/* Compares fls64() with the reference dumb_fls() for 0 and for every
+ * value that has one or two bits set. */
+int main(int argc, char *argv[])
+{
+	unsigned int hi, lo;
+
+	plan_tests(64 * 64 + 2);
+
+	ok1(fls64(0) == 0);
+	ok1(dumb_fls(0) == 0);
+
+	hi = 0;
+	while (hi < 64) {
+		lo = 0;
+		while (lo < 64) {
+			uint64_t val = (1ULL << hi) | (1ULL << lo);
+
+			ok(fls64(val) == dumb_fls(val),
+			   "%llu -> %u should be %u", (long long)val,
+			   fls64(val), dumb_fls(val));
+			lo++;
+		}
+		hi++;
+	}
+	return exit_status();
+}