16 years ago · 39f01834db
--- a/ccan/tdb2/_info
+++ b/ccan/tdb2/_info
@@ -0,0 +1,81 @@
 
															+#include <string.h>
														
 
															+#include <stdio.h>
														
 
															+
														
 
															+/**
														
 
															+ * tdb2 - [[WORK IN PROGRESS!]] The trivial (64bit transactional) database
														
 
															+ *
														
 
															+ * The tdb2 module provides an efficient keyword data mapping (usually
														
 
															+ * within a file).  It supports transactions, so the contents of the
														
 
															+ * database are reliable even across crashes.
														
 
															+ *
														
 
															+ * Example:
														
 
															+ *	#include <ccan/tdb2/tdb2.h>
														
 
															+ *	#include <ccan/str/str.h>
														
 
															+ *	#include <err.h>
														
 
															+ *	#include <stdio.h>
														
 
															+ *	
														
 
															+ *	static void usage(void)
														
 
															+ *	{
														
 
															+ *		errx(1, "Usage: %s fetch <dbfile> <key>\n"
														
 
															+ *		     "OR %s store <dbfile> <key> <data>");
														
 
															+ *	}
														
 
															+ *	
														
 
															+ *	int main(int argc, char *argv[])
														
 
															+ *	{
														
 
															+ *		struct tdb_context *tdb;
														
 
															+ *		TDB_DATA key, value;
														
 
															+ *	
														
 
															+ *		if (argc < 4)
														
 
															+ *			usage();
														
 
															+ *	
														
 
															+ *		tdb = tdb_open(argv[2], 1024, TDB_DEFAULT, O_CREAT|O_RDWR,
														
 
															+ *				0600);
														
 
															+ *		if (!tdb)
														
 
															+ *			err(1, "Opening %s", argv[2]);
														
 
															+ *	
														
 
															+ *		key.dptr = (void *)argv[3];
														
 
															+ *		key.dsize = strlen(argv[3]);
														
 
															+ *	
														
 
															+ *		if (streq(argv[1], "fetch")) {
														
 
															+ *			if (argc != 4)
														
 
															+ *				usage();
														
 
															+ *			value = tdb_fetch(tdb, key);
														
 
															+ *			if (!value.dptr)
														
 
															+ *				errx(1, "fetch %s: %s",
														
 
															+ *				     argv[3], tdb_errorstr(tdb));
														
 
															+ *			printf("%.*s\n", value.dsize, (char *)value.dptr);
														
 
															+ *			free(value.dptr);
														
 
															+ *		} else if (streq(argv[1], "store")) {
														
 
															+ *			if (argc != 5)
														
 
															+ *				usage();
														
 
															+ *			value.dptr = (void *)argv[4];
														
 
															+ *			value.dsize = strlen(argv[4]);
														
 
															+ *			if (tdb_store(tdb, key, value, 0) != 0)
														
 
															+ *				errx(1, "store %s: %s",
														
 
															+ *				     argv[3], tdb_errorstr(tdb));
														
 
															+ *		} else
														
 
															+ *			usage();
														
 
															+ *	
														
 
															+ *		return 0;
														
 
															+ *	}
														
 
															+ *
														
 
															+ * Maintainer: Rusty Russell <rusty@rustcorp.com.au>
														
 
															+ *
														
 
															+ * Author: Rusty Russell
														
 
															+ *
														
 
															+ * Licence: LGPLv3 (or later)
														
 
															+ */
														
 
															+int main(int argc, char *argv[])
														
 
															+{
														
 
															+	if (argc != 2)
														
 
															+		return 1;
														
 
															+
														
 
															+	if (strcmp(argv[1], "depends") == 0) {
														
 
															+		printf("ccan/hash\n");
														
 
															+		printf("ccan/likely\n");
														
 
															+		printf("ccan/asearch\n");
														
 
															+		return 0;
														
 
															+	}
														
 
															+
														
 
															+	return 1;
														
 
															+}
														
--- a/ccan/tdb2/check.c
+++ b/ccan/tdb2/check.c
@@ -0,0 +1,411 @@
 
															+ /* 
														
 
															+   Trivial Database 2: consistency checking (tdb_check)
														
 
															+   Copyright (C) Rusty Russell 2010
														
 
															+   
														
 
															+   This library is free software; you can redistribute it and/or
														
 
															+   modify it under the terms of the GNU Lesser General Public
														
 
															+   License as published by the Free Software Foundation; either
														
 
															+   version 3 of the License, or (at your option) any later version.
														
 
															+
														
 
															+   This library is distributed in the hope that it will be useful,
														
 
															+   but WITHOUT ANY WARRANTY; without even the implied warranty of
														
 
															+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
														
 
															+   Lesser General Public License for more details.
														
 
															+
														
 
															+   You should have received a copy of the GNU Lesser General Public
														
 
															+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
														
 
															+*/
														
 
															+#include "private.h"
														
 
															+#include <ccan/likely/likely.h>
														
 
															+#include <ccan/asearch/asearch.h>
														
 
															+
														
 
															+/* We keep an ordered array of offsets. */
														
 
															+static bool append(tdb_off_t **arr, size_t *num, tdb_off_t off)
														
 
															+{
														
 
															+	tdb_off_t *new = realloc(*arr, (*num + 1) * sizeof(tdb_off_t));
														
 
															+	if (!new)
														
 
															+		return false;
														
 
															+	new[(*num)++] = off;
														
 
															+	*arr = new;
														
 
															+	return true;
														
 
															+}
														
 
															+
														
 
															+static bool check_header(struct tdb_context *tdb)
														
 
															+{
														
 
															+	uint64_t hash_test;
														
 
															+
														
 
															+	hash_test = TDB_HASH_MAGIC;
														
 
															+	hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
														
 
															+	if (tdb->header.hash_test != hash_test) {
														
 
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+			 "check: hash test %llu should be %llu\n",
														
 
															+			 tdb->header.hash_test, hash_test);
														
 
															+		return false;
														
 
															+	}
														
 
															+	if (strcmp(tdb->header.magic_food, TDB_MAGIC_FOOD) != 0) {
														
 
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+			 "check: bad magic '%.*s'\n",
														
 
															+			 sizeof(tdb->header.magic_food),
														
 
															+			 tdb->header.magic_food);
														
 
															+		return false;
														
 
															+	}
														
 
															+	if (tdb->header.v.hash_bits < INITIAL_HASH_BITS) {
														
 
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+			 "check: bad hash bits %llu\n",
														
 
															+			 (long long)tdb->header.v.hash_bits);
														
 
															+		return false;
														
 
															+	}
														
 
															+	if (tdb->header.v.zone_bits < INITIAL_ZONE_BITS) {
														
 
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+			 "check: bad zone_bits %llu\n",
														
 
															+			 (long long)tdb->header.v.zone_bits);
														
 
															+		return false;
														
 
															+	}
														
 
															+	if (tdb->header.v.free_buckets < INITIAL_FREE_BUCKETS) {
														
 
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+			 "check: bad free_buckets %llu\n",
														
 
															+			 (long long)tdb->header.v.free_buckets);
														
 
															+		return false;
														
 
															+	}
														
 
															+	if ((1ULL << tdb->header.v.zone_bits) * tdb->header.v.num_zones
														
 
															+	    < tdb->map_size) {
														
 
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+			 "check: %llu zones size %llu don't cover %llu\n",
														
 
															+			 (long long)(1ULL << tdb->header.v.zone_bits),
														
 
															+			 (long long)tdb->header.v.num_zones,
														
 
															+			 (long long)tdb->map_size);
														
 
															+		return false;
														
 
															+	}
														
 
															+
														
 
															+	/* We check hash_off and free_off later. */
														
 
															+
														
 
															+	/* Don't check reserved: they *can* be used later. */
														
 
															+	return true;
														
 
															+}
														
 
															+
														
 
															+static int off_cmp(const tdb_off_t *a, const tdb_off_t *b)
														
 
															+{
														
 
															+	/* Can overflow an int. */
														
 
															+	return a > b ? 1
														
 
															+		: a < b ? -1
														
 
															+		: 0;
														
 
															+}
														
 
															+
														
 
															+static bool check_hash_list(struct tdb_context *tdb,
														
 
															+			    tdb_off_t used[],
														
 
															+			    size_t num_used)
														
 
															+{
														
 
															+	struct tdb_used_record rec;
														
 
															+	tdb_len_t hashlen, i, num_nonzero;
														
 
															+	tdb_off_t h;
														
 
															+	size_t num_found;
														
 
															+
														
 
															+	hashlen = sizeof(tdb_off_t) << tdb->header.v.hash_bits;
														
 
															+
														
 
															+	if (tdb_read_convert(tdb, tdb->header.v.hash_off - sizeof(rec),
														
 
															+			     &rec, sizeof(rec)) == -1)
														
 
															+		return false;
														
 
															+
														
 
															+	if (rec_data_length(&rec) != hashlen) {
														
 
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+			 "tdb_check: Bad hash table length %llu vs %llu\n",
														
 
															+			 (long long)rec_data_length(&rec),
														
 
															+			 (long long)hashlen);
														
 
															+		return false;
														
 
															+	}
														
 
															+	if (rec_key_length(&rec) != 0) {
														
 
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+			 "tdb_check: Bad hash table key length %llu\n",
														
 
															+			 (long long)rec_key_length(&rec));
														
 
															+		return false;
														
 
															+	}
														
 
															+	if (rec_hash(&rec) != 0) {
														
 
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+			 "tdb_check: Bad hash table hash value %llu\n",
														
 
															+			 (long long)rec_hash(&rec));
														
 
															+		return false;
														
 
															+	}
														
 
															+
														
 
															+	num_found = 0;
														
 
															+	num_nonzero = 0;
														
 
															+	for (i = 0, h = tdb->header.v.hash_off;
														
 
															+	     i < (1ULL << tdb->header.v.hash_bits);
														
 
															+	     i++, h += sizeof(tdb_off_t)) {
														
 
															+		tdb_off_t off, *p, pos;
														
 
															+		struct tdb_used_record rec;
														
 
															+		uint64_t hash;
														
 
															+
														
 
															+		off = tdb_read_off(tdb, h);
														
 
															+		if (off == TDB_OFF_ERR)
														
 
															+			return false;
														
 
															+		if (!off) {
														
 
															+			num_nonzero = 0;
														
 
															+			continue;
														
 
															+		}
														
 
															+		/* FIXME: Check hash bits */
														
 
															+		p = asearch(&off, used, num_used, off_cmp);
														
 
															+		if (!p) {
														
 
															+			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+				 "tdb_check: Invalid offset %llu in hash\n",
														
 
															+				 (long long)off);
														
 
															+			return false;
														
 
															+		}
														
 
															+		/* Mark it invalid. */
														
 
															+		*p ^= 1;
														
 
															+		num_found++;
														
 
															+
														
 
															+		if (tdb_read_convert(tdb, off, &rec, sizeof(rec)) == -1)
														
 
															+			return false;
														
 
															+
														
 
															+		/* Check it is hashed correctly. */
														
 
															+		hash = hash_record(tdb, off);
														
 
															+
														
 
															+		/* Top bits must match header. */
														
 
															+		if (hash >> (64 - 11) != rec_hash(&rec)) {
														
 
															+			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+				 "tdb_check: Bad hash magic at offset %llu"
														
 
															+				 " (0x%llx vs 0x%llx)\n",
														
 
															+				 (long long)off,
														
 
															+				 (long long)hash, (long long)rec_hash(&rec));
														
 
															+			return false;
														
 
															+		}
														
 
															+
														
 
															+		/* It must be in the right place in hash array. */
														
 
															+		pos = hash & ((1ULL << tdb->header.v.hash_bits)-1);
														
 
															+		if (pos < i - num_nonzero || pos > i) {
														
 
															+			/* Could be wrap from end of array?  FIXME: check? */
														
 
															+			if (i != num_nonzero) {
														
 
															+				tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+					 "tdb_check: Bad hash position %llu at"
														
 
															+					 " offset %llu hash 0x%llx\n",
														
 
															+					 (long long)i,
														
 
															+					 (long long)off,
														
 
															+					 (long long)hash);
														
 
															+				return false;
														
 
															+			}
														
 
															+		}
														
 
															+		num_nonzero++;
														
 
															+	}
														
 
															+
														
 
															+	if (num_found != num_used) {
														
 
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+			 "tdb_check: Not all entries are in hash\n");
														
 
															+		return false;
														
 
															+	}
														
 
															+	return true;
														
 
															+}
														
 
															+
														
 
															+static bool check_free(struct tdb_context *tdb,
														
 
															+		       tdb_off_t off,
														
 
															+		       const struct tdb_free_record *frec,
														
 
															+		       tdb_off_t prev,
														
 
															+		       tdb_off_t zone, unsigned int bucket)
														
 
															+{
														
 
															+	if (frec->magic != TDB_FREE_MAGIC) {
														
 
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+			 "tdb_check: offset %llu bad magic 0x%llx\n",
														
 
															+			 (long long)off, (long long)frec->magic);
														
 
															+		return false;
														
 
															+	}
														
 
															+	if (tdb->methods->oob(tdb, off
														
 
															+			      + frec->data_len-sizeof(struct tdb_used_record),
														
 
															+			      true))
														
 
															+		return false;
														
 
															+	if (zone_of(tdb, off) != zone) {
														
 
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+			 "tdb_check: offset %llu in wrong zone %llu vs %llu\n",
														
 
															+			 (long long)off,
														
 
															+			 (long long)zone, (long long)zone_of(tdb, off));
														
 
															+		return false;
														
 
															+	}
														
 
															+	if (size_to_bucket(tdb, frec->data_len) != bucket) {
														
 
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+			 "tdb_check: offset %llu in wrong bucket %u vs %u\n",
														
 
															+			 (long long)off,
														
 
															+			 bucket, size_to_bucket(tdb, frec->data_len));
														
 
															+		return false;
														
 
															+	}
														
 
															+	if (prev != frec->prev) {
														
 
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+			 "tdb_check: offset %llu bad prev %llu vs %llu\n",
														
 
															+			 (long long)off,
														
 
															+			 (long long)prev, (long long)frec->prev);
														
 
															+		return false;
														
 
															+	}
														
 
															+	return true;
														
 
															+}
														
 
															+		       
														
 
															+static bool check_free_list(struct tdb_context *tdb,
														
 
															+			    tdb_off_t free[],
														
 
															+			    size_t num_free)
														
 
															+{
														
 
															+	struct tdb_used_record rec;
														
 
															+	tdb_len_t freelen, i, j;
														
 
															+	tdb_off_t h;
														
 
															+	size_t num_found;
														
 
															+
														
 
															+	freelen = sizeof(tdb_off_t) * tdb->header.v.num_zones
														
 
															+		* (tdb->header.v.free_buckets + 1);
														
 
															+
														
 
															+	if (tdb_read_convert(tdb, tdb->header.v.free_off - sizeof(rec),
														
 
															+			     &rec, sizeof(rec)) == -1)
														
 
															+		return false;
														
 
															+
														
 
															+	if (rec_data_length(&rec) != freelen) {
														
 
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+			 "tdb_check: Bad free table length %llu vs %llu\n",
														
 
															+			 (long long)rec_data_length(&rec),
														
 
															+			 (long long)freelen);
														
 
															+		return false;
														
 
															+	}
														
 
															+	if (rec_key_length(&rec) != 0) {
														
 
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+			 "tdb_check: Bad free table key length %llu\n",
														
 
															+			 (long long)rec_key_length(&rec));
														
 
															+		return false;
														
 
															+	}
														
 
															+	if (rec_hash(&rec) != 0) {
														
 
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+			 "tdb_check: Bad free table hash value %llu\n",
														
 
															+			 (long long)rec_hash(&rec));
														
 
															+		return false;
														
 
															+	}
														
 
															+
														
 
															+	num_found = 0;
														
 
															+	h = tdb->header.v.free_off;
														
 
															+	for (i = 0; i < tdb->header.v.num_zones; i++) {
														
 
															+		for (j = 0; j <= tdb->header.v.free_buckets;
														
 
															+		     j++, h += sizeof(tdb_off_t)) {
														
 
															+			tdb_off_t off, prev = 0, *p;
														
 
															+			struct tdb_free_record f;
														
 
															+
														
 
															+			for (off = tdb_read_off(tdb, h); off; off = f.next) {
														
 
															+				if (off == TDB_OFF_ERR)
														
 
															+					return false;
														
 
															+				if (tdb_read_convert(tdb, off, &f, sizeof(f)))
														
 
															+					return false;
														
 
															+				if (!check_free(tdb, off, &f, prev, i, j))
														
 
															+					return false;
														
 
															+
														
 
															+				/* FIXME: Check hash bits */
														
 
															+				p = asearch(&off, free, num_free, off_cmp);
														
 
															+				if (!p) {
														
 
															+					tdb->log(tdb, TDB_DEBUG_ERROR,
														
 
															+						 tdb->log_priv,
														
 
															+						 "tdb_check: Invalid offset"
														
 
															+						 " %llu in free table\n",
														
 
															+						 (long long)off);
														
 
															+					return false;
														
 
															+				}
														
 
															+				/* Mark it invalid. */
														
 
															+				*p ^= 1;
														
 
															+				num_found++;
														
 
															+				prev = off;
														
 
															+			}
														
 
															+		}
														
 
															+	}
														
 
															+	if (num_found != num_free) {
														
 
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+			 "tdb_check: Not all entries are in free table\n");
														
 
															+		return false;
														
 
															+	}
														
 
															+	return true;
														
 
															+}
														
 
															+
														
 
															+/* FIXME: call check() function. */
														
 
															+int tdb_check(struct tdb_context *tdb,
														
 
															+	      int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
														
 
															+	      void *private_data)
														
 
															+{
														
 
															+	tdb_off_t *free = NULL, *used = NULL, off;
														
 
															+	tdb_len_t len;
														
 
															+	size_t num_free = 0, num_used = 0;
														
 
															+	bool hash_found = false, free_found = false;
														
 
															+
														
 
															+	if (tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false) != 0)
														
 
															+		return -1;
														
 
															+
														
 
															+	update_header(tdb);
														
 
															+
														
 
															+	if (!check_header(tdb))
														
 
															+		goto fail;
														
 
															+
														
 
															+	/* First we do a linear scan, checking all records. */
														
 
															+	for (off = sizeof(struct tdb_header);
														
 
															+	     off < tdb->map_size;
														
 
															+	     off += len) {
														
 
															+		union {
														
 
															+			struct tdb_used_record u;
														
 
															+			struct tdb_free_record f;
														
 
															+		} pad, *p;
														
 
															+		p = tdb_get(tdb, off, &pad, sizeof(pad));
														
 
															+		if (!p)
														
 
															+			goto fail;
														
 
															+		if (p->f.magic == TDB_FREE_MAGIC) {
														
 
															+			/* This record is free! */
														
 
															+			if (!append(&free, &num_free, off))
														
 
															+				goto fail;
														
 
															+			len = sizeof(p->u) + p->f.data_len;
														
 
															+			if (tdb->methods->oob(tdb, off + len, false))
														
 
															+				goto fail;
														
 
															+		} else {
														
 
															+			uint64_t klen, dlen, extra;
														
 
															+
														
 
															+			/* This record is used! */
														
 
															+			if (rec_magic(&p->u) != TDB_MAGIC) {
														
 
															+				tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+					 "tdb_check: Bad magic 0x%llx"
														
 
															+					 " at offset %llu\n",
														
 
															+					 (long long)rec_magic(&p->u),
														
 
															+					 (long long)off);
														
 
															+				goto fail;
														
 
															+			}
														
 
															+			
														
 
															+			if (!append(&used, &num_used, off))
														
 
															+				goto fail;
														
 
															+
														
 
															+			klen = rec_key_length(&p->u);
														
 
															+			dlen = rec_data_length(&p->u);
														
 
															+			extra = rec_extra_padding(&p->u);
														
 
															+
														
 
															+			len = sizeof(p->u) + klen + dlen + extra;
														
 
															+			if (tdb->methods->oob(tdb, off + len, false))
														
 
															+				goto fail;
														
 
															+
														
 
															+			if (off + sizeof(p->u) == tdb->header.v.hash_off) {
														
 
															+				hash_found = true;
														
 
															+			} else if (off + sizeof(p->u)
														
 
															+				   == tdb->header.v.free_off) {
														
 
															+				free_found = true;
														
 
															+			}
														
 
															+		}
														
 
															+	}
														
 
															+
														
 
															+	if (!hash_found) {
														
 
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+			 "tdb_check: hash table not found at %llu\n",
														
 
															+			 (long long)tdb->header.v.hash_off);
														
 
															+		goto fail;
														
 
															+	}
														
 
															+
														
 
															+	if (!free_found) {
														
 
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+			 "tdb_check: free table not found at %llu\n",
														
 
															+			 (long long)tdb->header.v.free_off);
														
 
															+		goto fail;
														
 
															+	}
														
 
															+
														
 
															+	/* FIXME: Check key uniqueness? */
														
 
															+	if (!check_hash_list(tdb, used, num_used))
														
 
															+		goto fail;
														
 
															+
														
 
															+	if (!check_free_list(tdb, free, num_free))
														
 
															+		goto fail;
														
 
															+
														
 
															+	tdb_allrecord_unlock(tdb, F_RDLCK);
														
 
															+	return true;
														
 
															+
														
 
															+fail:
														
 
															+	tdb_allrecord_unlock(tdb, F_RDLCK);
														
 
															+	return false;
														
 
															+}
														
--- a/ccan/tdb2/doc/design-1.3.txt
+++ b/ccan/tdb2/doc/design-1.3.txt
@@ -0,0 +1,1050 @@
 
															+TDB2: Redesigning The Trivial DataBase
														
 
															+
														
 
															+Rusty Russell, IBM Corporation
														
 
															+
														
 
															+27-April-2010
														
 
															+
														
 
															+Abstract
														
 
															+
														
 
															+The Trivial DataBase on-disk format is 32 bits; with usage cases 
														
 
															+heading towards the 4G limit, that must change. This required 
														
 
															+breakage provides an opportunity to revisit TDB's other design 
														
 
															+decisions and reassess them.
														
 
															+
														
 
															+1 Introduction
														
 
															+
														
 
															+The Trivial DataBase was originally written by Andrew Tridgell as 
														
 
															+a simple key/data pair storage system with the same API as dbm, 
														
 
															+but allowing multiple readers and writers while being small 
														
 
															+enough (< 1000 lines of C) to include in SAMBA. The simple design 
														
 
															+created in 1999 has proven surprisingly robust and performant, 
														
 
															+used in Samba versions 3 and 4 as well as numerous other 
														
 
															+projects. Its useful life was greatly increased by the 
														
 
															+(backwards-compatible!) addition of transaction support in 2005.
														
 
															+
														
 
															+The wider variety and greater demands of TDB-using code have led 
														
 
															+to some organic growth of the API, as well as some compromises on 
														
 
															+the implementation. None of these, by themselves, are seen as 
														
 
															+show-stoppers, but the cumulative effect is a loss of elegance 
														
 
															+over the initial, simple TDB implementation. Here is a table of 
														
 
															+the approximate number of lines of implementation code and number 
														
 
															+of API functions at the end of each year:
														
 
															+
														
 
															+
														
 
															++-----------+----------------+--------------------------------+
														
 
															+| Year End  | API Functions  | Lines of C Code Implementation |
														
 
															++-----------+----------------+--------------------------------+
														
 
															++-----------+----------------+--------------------------------+
														
 
															+|   1999    |      13        |              1195              |
														
 
															++-----------+----------------+--------------------------------+
														
 
															+|   2000    |      24        |              1725              |
														
 
															++-----------+----------------+--------------------------------+
														
 
															+|   2001    |      32        |              2228              |
														
 
															++-----------+----------------+--------------------------------+
														
 
															+|   2002    |      35        |              2481              |
														
 
															++-----------+----------------+--------------------------------+
														
 
															+|   2003    |      35        |              2552              |
														
 
															++-----------+----------------+--------------------------------+
														
 
															+|   2004    |      40        |              2584              |
														
 
															++-----------+----------------+--------------------------------+
														
 
															+|   2005    |      38        |              2647              |
														
 
															++-----------+----------------+--------------------------------+
														
 
															+|   2006    |      52        |              3754              |
														
 
															++-----------+----------------+--------------------------------+
														
 
															+|   2007    |      66        |              4398              |
														
 
															++-----------+----------------+--------------------------------+
														
 
															+|   2008    |      71        |              4768              |
														
 
															++-----------+----------------+--------------------------------+
														
 
															+|   2009    |      73        |              5715              |
														
 
															++-----------+----------------+--------------------------------+
														
 
															+
														
 
															+
														
 
															+This review is an attempt to catalog and address all the known 
														
 
															+issues with TDB and create solutions which address the problems 
														
 
															+without significantly increasing complexity; all involved are far 
														
 
															+too aware of the dangers of second system syndrome in rewriting a 
														
 
															+successful project like this.
														
 
															+
														
 
															+2 API Issues
														
 
															+
														
 
															+2.1 tdb_open_ex Is Not Expandable
														
 
															+
														
 
															+The tdb_open() call was expanded to tdb_open_ex(), which added an 
														
 
															+optional hashing function and an optional logging function 
														
 
															+argument. Additional arguments to open would require the 
														
 
															+introduction of a tdb_open_ex2 call etc.
														
 
															+
														
 
															+2.1.1 Proposed Solution
														
 
															+
														
 
															+tdb_open() will take a linked-list of attributes:
														
 
															+
														
 
															+enum tdb_attribute {
														
 
															+
														
 
															+    TDB_ATTRIBUTE_LOG = 0,
														
 
															+
														
 
															+    TDB_ATTRIBUTE_HASH = 1
														
 
															+
														
 
															+};
														
 
															+
														
 
															+struct tdb_attribute_base {
														
 
															+
														
 
															+    enum tdb_attribute attr;
														
 
															+
														
 
															+    union tdb_attribute *next;
														
 
															+
														
 
															+};
														
 
															+
														
 
															+struct tdb_attribute_log {
														
 
															+
														
 
															+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG 
														
 
															+*/
														
 
															+
														
 
															+    tdb_log_func log_fn;
														
 
															+
														
 
															+    void *log_private;
														
 
															+
														
 
															+};
														
 
															+
														
 
															+struct tdb_attribute_hash {
														
 
															+
														
 
															+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH 
														
 
															+*/
														
 
															+
														
 
															+    tdb_hash_func hash_fn;
														
 
															+
														
 
															+    void *hash_private;
														
 
															+
														
 
															+};
														
 
															+
														
 
															+union tdb_attribute {
														
 
															+
														
 
															+    struct tdb_attribute_base base;
														
 
															+
														
 
															+    struct tdb_attribute_log log;
														
 
															+
														
 
															+    struct tdb_attribute_hash hash;
														
 
															+
														
 
															+};
														
 
															+
														
 
															+This allows future attributes to be added, even if this expands 
														
 
															+the size of the union.
														
 
															+
														
 
															+2.2 tdb_traverse Makes Impossible Guarantees
														
 
															+
														
 
															+tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, 
														
 
															+and it was thought that it was important to guarantee that all 
														
 
															+records which exist at the start and end of the traversal would 
														
 
															+be included, and no record would be included twice.
														
 
															+
														
 
															+This adds complexity (see [Reliable-Traversal-Adds]) and does not 
														
 
															+work anyway for records which are altered (in particular, those 
														
 
															+which are expanded may be effectively deleted and re-added behind 
														
 
															+the traversal).
														
 
															+
														
 
															+2.2.1 <traverse-Proposed-Solution>Proposed Solution
														
 
															+
														
 
															+Abandon the guarantee. You will see every record if no changes 
														
 
															+occur during your traversal, otherwise you will see some subset. 
														
 
															+You can prevent changes by using a transaction or the locking 
														
 
															+API.
														
 
															+
														
 
															+2.3 Nesting of Transactions Is Fraught
														
 
															+
														
 
															+TDB has alternated between allowing nested transactions and not 
														
 
															+allowing them. Various paths in the Samba codebase assume that 
														
 
															+transactions will nest, and in a sense they can: the operation is 
														
 
															+only committed to disk when the outer transaction is committed. 
														
 
															+There are two problems, however:
														
 
															+
														
 
															+1. Canceling the inner transaction will cause the outer 
														
 
															+  transaction commit to fail, and will not undo any operations 
														
 
															+  since the inner transaction began. This problem is soluble with 
														
 
															+  some additional internal code.
														
 
															+
														
 
															+2. An inner transaction commit can be cancelled by the outer 
														
 
															+  transaction. This is desirable in the way which Samba's 
														
 
															+  database initialization code uses transactions, but could be a 
														
 
															+  surprise to any users expecting a successful transaction commit 
														
 
															+  to expose changes to others.
														
 
															+
														
 
															+The current solution is to specify the behavior at tdb_open(), 
														
 
															+with the default currently that nested transactions are allowed. 
														
 
															+This flag can also be changed at runtime.
														
 
															+
														
 
															+2.3.1 Proposed Solution
														
 
															+
														
 
															+Given the usage patterns, it seems that the “least-surprise” 
														
 
															+behavior of disallowing nested transactions should become the 
														
 
															+default. Additionally, it seems the outer transaction is the only 
														
 
															+code which knows whether inner transactions should be allowed, so 
														
 
															+a flag to indicate this could be added to tdb_transaction_start. 
														
 
															+However, this behavior can be simulated with a wrapper which uses 
														
 
															+tdb_add_flags() and tdb_remove_flags(), so the API should not be 
														
 
															+expanded for this relatively-obscure case.
														
 
															+
														
 
															+2.4 Incorrect Hash Function is Not Detected
														
 
															+
														
 
															+tdb_open_ex() allows the calling code to specify a different hash 
														
 
															+function to use, but does not check that all other processes 
														
 
															+accessing this tdb are using the same hash function. The result 
														
 
															+is that records are missing from tdb_fetch().
														
 
															+
														
 
															+2.4.1 Proposed Solution
														
 
															+
														
 
															+The header should contain an example hash result (eg. the hash of 
														
 
															+0xdeadbeef), and tdb_open_ex() should check that the given hash 
														
 
															+function produces the same answer, or fail the tdb_open call.
														
 
															+
														
 
															+2.5 tdb_set_max_dead/TDB_VOLATILE Expose Implementation
														
 
															+
														
 
															+In response to scalability issues with the free list 
														
 
															+([TDB-Freelist-Is]), two API workarounds have been incorporated in TDB: 
														
 
															+tdb_set_max_dead() and the TDB_VOLATILE flag to tdb_open. The 
														
 
															+latter actually calls the former with an argument of “5”.
														
 
															+
														
 
															+This code allows deleted records to accumulate without putting 
														
 
															+them in the free list. On delete we iterate through each chain 
														
 
															+and free them in a batch if there are more than max_dead entries. 
														
 
															+These are never otherwise recycled except as a side-effect of a 
														
 
															+tdb_repack.
														
 
															+
														
 
															+2.5.1 Proposed Solution
														
 
															+
														
 
															+With the scalability problems of the freelist solved, this API 
														
 
															+can be removed. The TDB_VOLATILE flag may still be useful as a 
														
 
															+hint that store and delete of records will be at least as common 
														
 
															+as fetch in order to allow some internal tuning, but initially 
														
 
															+will become a no-op.
														
 
															+
														
 
															+2.6 <TDB-Files-Cannot>TDB Files Cannot Be Opened Multiple Times 
														
 
															+  In The Same Process
														
 
															+
														
 
															+No process can open the same TDB twice; we check and disallow it. 
														
 
															+This is an unfortunate side-effect of fcntl locks, which operate 
														
 
															+on a per-file rather than per-file-descriptor basis, and do not 
														
 
															+nest. Thus, closing any file descriptor on a file clears all the 
														
 
															+locks obtained by this process, even if they were placed using a 
														
 
															+different file descriptor!
														
 
															+
														
 
															+Note that even if this were solved, deadlock could occur if 
														
 
															+operations were nested: this is a more manageable programming 
														
 
															+error in most cases.
														
 
															+
														
 
															+2.6.1 Proposed Solution
														
 
															+
														
 
															+We could lobby POSIX to fix the perverse rules, or at least lobby 
														
 
															+Linux to violate them so that the most common implementation does 
														
 
															+not have this restriction. This would be a generally good idea 
														
 
															+for other fcntl lock users.
														
 
															+
														
 
															+Samba uses a wrapper which hands out the same tdb_context to 
														
 
															+multiple callers if this happens, and does simple reference 
														
 
															+counting. We should do this inside the tdb library, which already 
														
 
															+emulates lock nesting internally; it would need to recognize when 
														
 
															+deadlock occurs within a single process. This would create a new 
														
 
															+failure mode for tdb operations (while we currently handle 
														
 
															+locking failures, they are impossible in normal use and a process 
														
 
															+encountering them can do little but give up).
														
 
															+
														
 
															+I do not see benefit in an additional tdb_open flag to indicate 
														
 
															+whether re-opening is allowed, although there may be some 
														
 
															+benefit to adding a call to detect when a tdb_context is shared, 
														
 
															+to allow others to create such an API.
														
 
															+
														
 
															+2.7 TDB API Is Not POSIX Thread-safe
														
 
															+
														
 
															+The TDB API uses an error code which can be queried after an 
														
 
															+operation to determine what went wrong. This programming model 
														
 
															+does not work with threads, unless specific additional guarantees 
														
 
															+are given by the implementation. In addition, even 
														
 
															+otherwise-independent threads cannot open the same TDB (as in 
														
 
															+[TDB-Files-Cannot]).
														
 
															+
														
 
															+2.7.1 Proposed Solution
														
 
															+
														
 
															+Rearchitecting the API to include a tdb_errcode pointer would be a 
														
 
															+great deal of churn; we would do better to guarantee that the 
														
 
															+tdb_errcode is per-thread so the current programming model can be 
														
 
															+maintained.
														
 
															+
														
 
															+This requires dynamic per-thread allocations, which is awkward 
														
 
															+with POSIX threads (pthread_key_create space is limited and we 
														
 
															+cannot simply allocate a key for every TDB).
														
 
															+
														
 
															+Internal locking is required to make sure that fcntl locks do not 
														
 
															+overlap between threads, and also that the global list of tdbs is 
														
 
															+maintained.
														
 
															+
														
 
															+The aim is that building tdb with -DTDB_PTHREAD will result in a 
														
 
															+pthread-safe version of the library, and otherwise no overhead 
														
 
															+will exist.
														
 
															+
														
 
															+2.8 *_nonblock Functions And *_mark Functions Expose 
														
 
															+  Implementation
														
 
															+
														
 
															+CTDB[footnote:
														
 
															+Clustered TDB, see http://ctdb.samba.org
														
 
															+] wishes to operate on TDB in a non-blocking manner. This is 
														
 
															+currently done as follows:
														
 
															+
														
 
															+1. Call the _nonblock variant of an API function (eg. 
														
 
															+  tdb_lockall_nonblock). If this fails:
														
 
															+
														
 
															+2. Fork a child process, and wait for it to call the normal 
														
 
															+  variant (eg. tdb_lockall).
														
 
															+
														
 
															+3. If the child succeeds, call the _mark variant to indicate we 
														
 
															+  already have the locks (eg. tdb_lockall_mark).
														
 
															+
														
 
															+4. Upon completion, tell the child to release the locks (eg. 
														
 
															+  tdb_unlockall).
														
 
															+
														
 
															+5. Indicate to tdb that it should consider the locks removed (eg. 
														
 
															+  tdb_unlockall_mark).
														
 
															+
														
 
															+There are several issues with this approach. Firstly, adding two 
														
 
															+new variants of each function clutters the API for an obscure 
														
 
															+use, and so not all functions have three variants. Secondly, it 
														
 
															+assumes that all paths of the functions ask for the same locks, 
														
 
															+otherwise the parent process will have to get a lock which the 
														
 
															+child doesn't have under some circumstances. I don't believe this 
														
 
															+is currently the case, but it constrains the implementation. 
														
 
															+
														
 
															+2.8.1 <Proposed-Solution-locking-hook>Proposed Solution
														
 
															+
														
 
															+Implement a hook for locking methods, so that the caller can 
														
 
															+control the calls to create and remove fcntl locks. In this 
														
 
															+scenario, ctdbd would operate as follows:
														
 
															+
														
 
															+1. Call the normal API function, eg tdb_lockall().
														
 
															+
														
 
															+2. When the lock callback comes in, check if the child has the 
														
 
															+  lock. Initially, this is always false. If the child has it, return 0. 
														
 
															+  Otherwise, try to obtain it in non-blocking mode. If that 
														
 
															+  fails, return EWOULDBLOCK.
														
 
															+
														
 
															+3. Release locks in the unlock callback as normal.
														
 
															+
														
 
															+4. If tdb_lockall() fails, see if we recorded a lock failure; if 
														
 
															+  so, call the child to repeat the operation.
														
 
															+
														
 
															+5. The child records what locks it obtains, and returns that 
														
 
															+  information to the parent.
														
 
															+
														
 
															+6. When the child has succeeded, goto 1.
														
 
															+
														
 
															+This is flexible enough to handle any potential locking scenario, 
														
 
															+even when lock requirements change. It can be optimized so that 
														
 
															+the parent does not release locks, just tells the child which 
														
 
															+locks it doesn't need to obtain.
														
 
															+
														
 
															+It also keeps the complexity out of the API, and in ctdbd where 
														
 
															+it is needed.
														
 
															+
														
 
															+2.9 tdb_chainlock Functions Expose Implementation
														
 
															+
														
 
															+tdb_chainlock locks some number of records, including the record 
														
 
															+indicated by the given key. This gave atomicity guarantees; 
														
 
															+no-one can start a transaction, alter, read or delete that key 
														
 
															+while the lock is held.
														
 
															+
														
 
															+It also makes the same guarantee for any other key in the chain, 
														
 
															+which is an internal implementation detail and potentially a 
														
 
															+cause for deadlock.
														
 
															+
														
 
															+2.9.1 Proposed Solution
														
 
															+
														
 
															+None. It would be nice to have an explicit single entry lock 
														
 
															+which affected no other keys. Unfortunately, this won't work for 
														
 
															+an entry which doesn't exist. Thus while chainlock may be 
														
 
															+implemented more efficiently for the existing case, it will still 
														
 
															+have overlap issues with the non-existing case. So it is best to 
														
 
															+keep the current (lack of) guarantee about which records will be 
														
 
															+affected to avoid constraining our implementation.
														
 
															+
														
 
															+2.10 Signal Handling is Not Race-Free
														
 
															+
														
 
															+The tdb_setalarm_sigptr() call allows the caller's signal handler 
														
 
															+to indicate that the tdb locking code should return with a 
														
 
															+failure, rather than trying again when a signal is received (and 
														
 
															+errno == EAGAIN). This is usually used to implement timeouts.
														
 
															+
														
 
															+Unfortunately, this does not work in the case where the signal is 
														
 
															+received before the tdb code enters the fcntl() call to place the 
														
 
															+lock: the code will sleep within the fcntl() code, unaware that 
														
 
															+the signal wants it to exit. In the case of long timeouts, this 
														
 
															+does not happen in practice.
														
 
															+
														
 
															+2.10.1 Proposed Solution
														
 
															+
														
 
															+The locking hooks proposed in [Proposed-Solution-locking-hook] 
														
 
															+would allow the user to decide on whether to fail the lock 
														
 
															+acquisition on a signal. This allows the caller to choose their 
														
 
															+own compromise: they could narrow the race by checking 
														
 
															+immediately before the fcntl call.[footnote:
														
 
															+It may be possible to make this race-free in some implementations 
														
 
															+by having the signal handler alter the struct flock to make it 
														
 
															+invalid. This will cause the fcntl() lock call to fail with 
														
 
															+EINVAL if the signal occurs before the kernel is entered, 
														
 
															+otherwise EAGAIN.
														
 
															+]
														
 
															+
														
 
															+2.11 The API Uses Gratuitous Typedefs, Capitals
														
 
															+
														
 
															+typedefs are useful for providing source compatibility when types 
														
 
															+can differ across implementations, or arguably in the case of 
														
 
															+function pointer definitions which are hard for humans to parse. 
														
 
															+Otherwise it is simply obfuscation and pollutes the namespace.
														
 
															+
														
 
															+Capitalization is usually reserved for compile-time constants and 
														
 
															+macros.
														
 
															+
														
 
															+  TDB_CONTEXT There is no reason to use this over 'struct 
														
 
															+  tdb_context'; the definition isn't visible to the API user 
														
 
															+  anyway.
														
 
															+
														
 
															+  TDB_DATA There is no reason to use this over struct TDB_DATA; 
														
 
															+  the struct needs to be understood by the API user.
														
 
															+
														
 
															+  struct TDB_DATA This would normally be called 'struct 
														
 
															+  tdb_data'.
														
 
															+
														
 
															+  enum TDB_ERROR Similarly, this would normally be enum 
														
 
															+  tdb_error.
														
 
															+
														
 
															+2.11.1 Proposed Solution
														
 
															+
														
 
															+None. Introducing lower case variants would please pedants like 
														
 
															+myself, but if it were done the existing ones should be kept. 
														
 
															+There is little point forcing a purely cosmetic change upon tdb 
														
 
															+users.
														
 
															+
														
 
															+2.12 <tdb_log_func-Doesnt-Take>tdb_log_func Doesn't Take The 
														
 
															+  Private Pointer
														
 
															+
														
 
															+For API compatibility reasons, the logging function needs to call 
														
 
															+tdb_get_logging_private() to retrieve the pointer registered by 
														
 
															+the tdb_open_ex for logging.
														
 
															+
														
 
															+2.12.1 Proposed Solution
														
 
															+
														
 
															+It should simply take an extra argument, since we are prepared to 
														
 
															+break the API/ABI.
														
 
															+
														
 
															+2.13 Various Callback Functions Are Not Typesafe
														
 
															+
														
 
															+The callback functions in tdb_set_logging_function (after 
														
 
															+[tdb_log_func-Doesnt-Take] is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read 
														
 
															+and tdb_check all take void * and must internally convert it to 
														
 
															+the argument type they were expecting.
														
 
															+
														
 
															+If this type changes, the compiler will not produce warnings on 
														
 
															+the callers, since it only sees void *.
														
 
															+
														
 
															+2.13.1 Proposed Solution
														
 
															+
														
 
															+With careful use of macros, we can create callback functions 
														
 
															+which give a warning when used on gcc and the types of the 
														
 
															+callback and its private argument differ. Unsupported compilers 
														
 
															+will not give a warning, which is no worse than now. In addition, 
														
 
															+the callbacks become clearer, as they need not use void * for 
														
 
															+their parameter.
														
 
															+
														
 
															+See CCAN's typesafe_cb module at 
														
 
															+http://ccan.ozlabs.org/info/typesafe_cb.html
														
 
															+
														
 
															+2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, 
														
 
															+  tdb_reopen_all Problematic
														
 
															+
														
 
															+The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB 
														
 
															+file should be cleared if the caller discovers it is the only 
														
 
															+process with the TDB open. However, if any caller does not 
														
 
															+specify TDB_CLEAR_IF_FIRST it will not be detected, so will have 
														
 
															+the TDB erased underneath them (usually resulting in a crash).
														
 
															+
														
 
															+There is a similar issue on fork(); if the parent exits (or 
														
 
															+otherwise closes the tdb) before the child calls tdb_reopen_all() 
														
 
															+to establish the lock used to indicate the TDB is opened by 
														
 
															+someone, a TDB_CLEAR_IF_FIRST opener at that moment will believe 
														
 
															+it alone has opened the TDB and will erase it.
														
 
															+
														
 
															+2.14.1 Proposed Solution
														
 
															+
														
 
															+Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but 
														
 
															+see [TDB_CLEAR_IF_FIRST-Imposes-Performance].
														
 
															+
														
 
															+3 Performance And Scalability Issues
														
 
															+
														
 
															+3.1 <TDB_CLEAR_IF_FIRST-Imposes-Performance>TDB_CLEAR_IF_FIRST 
														
 
															+  Imposes Performance Penalty
														
 
															+
														
 
															+When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is 
														
 
															+placed at offset 4 (aka the ACTIVE_LOCK). While these locks 
														
 
															+never conflict in normal tdb usage, they do add substantial 
														
 
															+overhead for most fcntl lock implementations when the kernel 
														
 
															+scans to detect if a lock conflict exists. This is often a single 
														
 
															+linked list, making the time to acquire and release a fcntl lock 
														
 
															+O(N) where N is the number of processes with the TDB open, not 
														
 
															+the number actually doing work.
														
 
															+
														
 
															+In a Samba server it is common to have huge numbers of clients 
														
 
															+sitting idle, and thus they have weaned themselves off the 
														
 
															+TDB_CLEAR_IF_FIRST flag.[footnote:
														
 
															+There is a flag to tdb_reopen_all() which is used for this 
														
 
															+optimization: if the parent process will outlive the child, the 
														
 
															+child does not need the ACTIVE_LOCK. This is a workaround for 
														
 
															+this very performance issue.
														
 
															+]
														
 
															+
														
 
															+3.1.1 Proposed Solution
														
 
															+
														
 
															+Remove the flag. It was a neat idea, but even trivial servers 
														
 
															+tend to know when they are initializing for the first time and 
														
 
															+can simply unlink the old tdb at that point.
														
 
															+
														
 
															+3.2 TDB Files Have a 4G Limit
														
 
															+
														
 
															+This seems to be becoming an issue (so much for “trivial”!), 
														
 
															+particularly for ldb.
														
 
															+
														
 
															+3.2.1 Proposed Solution
														
 
															+
														
 
															+A new, incompatible TDB format which uses 64 bit offsets 
														
 
															+internally rather than 32 bit as now. For simplicity of endian 
														
 
															+conversion (which TDB does on the fly if required), all values 
														
 
															+will be 64 bit on disk. In practice, some upper bits may be used 
														
 
															+for other purposes, but at least 56 bits will be available for 
														
 
															+file offsets.
														
 
															+
														
 
															+tdb_open() will automatically detect old-version files, and even 
														
 
															+create them if TDB_VERSION6 is specified to tdb_open.
														
 
															+
														
 
															+32 bit processes will still be able to access TDBs larger than 4G 
														
 
															+(assuming that their off_t allows them to seek to 64 bits); they 
														
 
															+will gracefully fall back as they fail to mmap. This can happen 
														
 
															+already with large TDBs.
														
 
															+
														
 
															+Old versions of tdb will fail to open the new TDB files (since 28 
														
 
															+August 2009, commit 398d0c29290: prior to that any unrecognized 
														
 
															+file format would be erased and initialized as a fresh tdb!)
														
 
															+
														
 
															+3.3 TDB Records Have a 4G Limit
														
 
															+
														
 
															+This has not been a reported problem, and the API uses size_t 
														
 
															+which can be 64 bit on 64 bit platforms. However, other limits 
														
 
															+may have made such an issue moot.
														
 
															+
														
 
															+3.3.1 Proposed Solution
														
 
															+
														
 
															+Record sizes will be 64 bit, with an error returned on 32 bit 
														
 
															+platforms which try to access such records (the current 
														
 
															+implementation would return TDB_ERR_OOM in a similar case). It 
														
 
															+seems unlikely that 32 bit keys will be a limitation, so the 
														
 
															+implementation may not support this (see [sub:Records-Incur-A]).
														
 
															+
														
 
															+3.4 Hash Size Is Determined At TDB Creation Time
														
 
															+
														
 
															+TDB contains a number of hash chains in the header; the number is 
														
 
															+specified at creation time, and defaults to 131. This is such a 
														
 
															+bottleneck on large databases (as each hash chain gets quite 
														
 
															+long), that LDB uses 10,000 for this hash. In general it is 
														
 
															+impossible to know what the 'right' answer is at database 
														
 
															+creation time.
														
 
															+
														
 
															+3.4.1 Proposed Solution
														
 
															+
														
 
															+After comprehensive performance testing on various scalable hash 
														
 
															+variants[footnote:
														
 
															+http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 
														
 
															+This was annoying because I was previously convinced that an 
														
 
															+expanding tree of hashes would be very close to optimal.
														
 
															+], it became clear that it is hard to beat a straight linear hash 
														
 
															+table which doubles in size when it reaches saturation. There are 
														
 
															+three details which become important:
														
 
															+
														
 
															+1. On encountering a full bucket, we use the next bucket.
														
 
															+
														
 
															+2. Extra hash bits are stored with the offset, to reduce 
														
 
															+  comparisons.
														
 
															+
														
 
															+3. A marker entry is used on deleting an entry.
														
 
															+
														
 
															+The doubling of the table must be done under a transaction; we 
														
 
															+will not reduce it on deletion, so it will be an unusual case. It 
														
 
															+will be placed at the head (other entries will be moved 
														
 
															+out the way so we can expand). We could have a pointer in the 
														
 
															+header to the current hashtable location, but that pointer would 
														
 
															+have to be read frequently to check for hashtable moves.
														
 
															+
														
 
															+The locking for this is slightly more complex than the chained 
														
 
															+case; we currently have one lock per bucket, and that means we 
														
 
															+would need to expand the lock if we overflow to the next bucket. 
														
 
															+The frequency of such collisions will affect our locking 
														
 
															+heuristics: we can always lock more buckets than we need.
														
 
															+
														
 
															+One possible optimization is to only re-check the hash size on an 
														
 
															+insert or a lookup miss.
														
 
															+
														
 
															+3.5 <TDB-Freelist-Is>TDB Freelist Is Highly Contended
														
 
															+
														
 
															+TDB uses a single linked list for the free list. Allocation 
														
 
															+occurs as follows, using heuristics which have evolved over time:
														
 
															+
														
 
															+1. Get the free list lock for this whole operation.
														
 
															+
														
 
															+2. Multiply length by 1.25, so we always over-allocate by 25%.
														
 
															+
														
 
															+3. Set the slack multiplier to 1.
														
 
															+
														
 
															+4. Examine the current freelist entry: if it is > length but < 
														
 
															+  the current best case, remember it as the best case.
														
 
															+
														
 
															+5. Multiply the slack multiplier by 1.05.
														
 
															+
														
 
															+6. If our best fit so far is less than length * slack multiplier, 
														
 
															+  return it. The slack will be turned into a new free record if 
														
 
															+  it's large enough.
														
 
															+
														
 
															+7. Otherwise, go onto the next freelist entry.
														
 
															+
														
 
															+Deleting a record occurs as follows:
														
 
															+
														
 
															+1. Lock the hash chain for this whole operation.
														
 
															+
														
 
															+2. Walk the chain to find the record, keeping the prev pointer 
														
 
															+  offset.
														
 
															+
														
 
															+3. If max_dead is non-zero:
														
 
															+
														
 
															+  (a) Walk the hash chain again and count the dead records.
														
 
															+
														
 
															+  (b) If it's more than max_dead, bulk free all the dead ones 
														
 
															+    (similar to steps 4 and below, but the lock is only obtained 
														
 
															+    once).
														
 
															+
														
 
															+  (c) Simply mark this record as dead and return. 
														
 
															+
														
 
															+4. Get the free list lock for the remainder of this operation.
														
 
															+
														
 
															+5. <right-merging>Examine the following block to see if it is 
														
 
															+  free; if so, enlarge the current block and remove that block 
														
 
															+  from the free list. This was disabled, as removal from the free 
														
 
															+  list was O(entries-in-free-list).
														
 
															+
														
 
															+6. Examine the preceding block to see if it is free: for this 
														
 
															+  reason, each block has a 32-bit tailer which indicates its 
														
 
															+  length. If it is free, expand it to cover our new block and 
														
 
															+  return.
														
 
															+
														
 
															+7. Otherwise, prepend ourselves to the free list.
														
 
															+
														
 
															+Disabling right-merging (step [right-merging]) causes 
														
 
															+fragmentation; the other heuristics proved insufficient to 
														
 
															+address this, so the final answer to this was that when we expand 
														
 
															+the TDB file inside a transaction commit, we repack the entire 
														
 
															+tdb.
														
 
															+
														
 
															+The single list lock limits our allocation rate; due to the other 
														
 
															+issues this is not currently seen as a bottleneck.
														
 
															+
														
 
															+3.5.1 Proposed Solution
														
 
															+
														
 
															+The first step is to remove all the current heuristics, as they 
														
 
															+obviously interact, then examine them once the lock contention is 
														
 
															+addressed.
														
 
															+
														
 
															+The free list must be split to reduce contention. Assuming 
														
 
															+perfect free merging, we can at most have 1 free list entry for 
														
 
															+each entry. This implies that the number of free lists is related 
														
 
															+to the size of the hash table, but as it is rare to walk a large 
														
 
															+number of free list entries we can use far fewer, say 1/32 of the 
														
 
															+number of hash buckets.
														
 
															+
														
 
															+There are various benefits in using per-size free lists (see [sub:TDB-Becomes-Fragmented]
														
 
															+) but it's not clear this would reduce contention in the common 
														
 
															+case where all processes are allocating/freeing the same size. 
														
 
															+Thus we almost certainly need to divide in other ways: the most 
														
 
															+obvious is to divide the file into zones, and use a free list 
														
 
															+(or set of free lists) for each. This approximates address 
														
 
															+ordering.
														
 
															+
														
 
															+Note that this means we need to split the free lists when we 
														
 
															+expand the file; this is probably acceptable when we double the 
														
 
															+hash table size, since that is such an expensive operation 
														
 
															+already. In the case of increasing the file size, there is an 
														
 
															+optimization we can use: if we use M in the formula above as the 
														
 
															+file size rounded up to the next power of 2, we only need 
														
 
															+reshuffle free lists when the file size crosses a power of 2 
														
 
															+boundary, and reshuffling the free lists is trivial: we simply 
														
 
															+merge every consecutive pair of free lists.
														
 
															+
														
 
															+The basic algorithm is as follows. Freeing is simple:
														
 
															+
														
 
															+1. Identify the correct zone.
														
 
															+
														
 
															+2. Lock the corresponding list.
														
 
															+
														
 
															+3. Re-check the zone (we didn't have a lock, sizes could have 
														
 
															+  changed): relock if necessary.
														
 
															+
														
 
															+4. Place the freed entry in the list for that zone.
														
 
															+
														
 
															+Allocation is a little more complicated, as we perform delayed 
														
 
															+coalescing at this point:
														
 
															+
														
 
															+1. Pick a zone: either the zone we last freed into, or based on a “
														
 
															+  random” number.
														
 
															+
														
 
															+2. Lock the corresponding list.
														
 
															+
														
 
															+3. Re-check the zone: relock if necessary.
														
 
															+
														
 
															+4. If the top entry is large enough, remove it from the list and 
														
 
															+  return it.
														
 
															+
														
 
															+5. Otherwise, coalesce entries in the list.
														
 
															+
														
 
															+  (a) 
														
 
															+
														
 
															+  (b) 
														
 
															+
														
 
															+  (c) 
														
 
															+
														
 
															+  (d) 
														
 
															+
														
 
															+6. If there was no entry large enough, unlock the list and try 
														
 
															+  the next zone.
														
 
															+
														
 
															+7. 
														
 
															+
														
 
															+8. 
														
 
															+
														
 
															+9. If no zone satisfies, expand the file.
														
 
															+
														
 
															+This optimizes rapid insert/delete of free list entries by not 
														
 
															+coalescing them all the time. First-fit address ordering 
														
 
															+seems to be fairly good for keeping fragmentation low 
														
 
															+(see [sub:TDB-Becomes-Fragmented]). Note that address ordering 
														
 
															+does not need a tailer to coalesce, though if we needed one we 
														
 
															+could have one cheaply: see [sub:Records-Incur-A]. 
														
 
															+
														
 
															+
														
 
															+
														
 
															+I anticipate that the number of entries in each free zone would 
														
 
															+be small, but it might be worth using one free entry to hold 
														
 
															+pointers to the others for cache efficiency.
														
 
															+
														
 
															+3.6 <sub:TDB-Becomes-Fragmented>TDB Becomes Fragmented
														
 
															+
														
 
															+Much of this is a result of allocation strategy[footnote:
														
 
															+The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1998 
														
 
															+ftp://ftp.cs.utexas.edu/pub/garbage/malloc/ismm98.ps
														
 
															+] and deliberate hobbling of coalescing; internal fragmentation 
														
 
															+(aka overallocation) is deliberately set at 25%, and external 
														
 
															+fragmentation is only cured by the decision to repack the entire 
														
 
															+db when a transaction commit needs to enlarge the file.
														
 
															+
														
 
															+3.6.1 Proposed Solution
														
 
															+
														
 
															+The 25% overhead on allocation works in practice for ldb because 
														
 
															+indexes tend to expand by one record at a time. This internal 
														
 
															+fragmentation can be resolved by having an “expanded” bit in the 
														
 
															+header to note entries that have previously expanded, and 
														
 
															+allocating more space for them.
														
 
															+
														
 
															+There is a spectrum of possible solutions for external 
														
 
															+fragmentation: one is to use a fragmentation-avoiding allocation 
														
 
															+strategy such as best-fit address-order allocator. The other end 
														
 
															+of the spectrum would be to use a bump allocator (very fast and 
														
 
															+simple) and simply repack the file when we reach the end.
														
 
															+
														
 
															+There are three problems with efficient fragmentation-avoiding 
														
 
															+allocators: they are non-trivial, they tend to use a single free 
														
 
															+list for each size, and there's no evidence that tdb allocation 
														
 
															+patterns will match those recorded for general allocators (though 
														
 
															+it seems likely).
														
 
															+
														
 
															+Thus we don't spend too much effort on external fragmentation; we 
														
 
															+will be no worse than the current code if we need to repack on 
														
 
															+occasion. More effort is spent on reducing freelist contention, 
														
 
															+and reducing overhead.
														
 
															+
														
 
															+3.7 <sub:Records-Incur-A>Records Incur A 28-Byte Overhead
														
 
															+
														
 
															+Each TDB record has a header as follows:
														
 
															+
														
 
															+struct tdb_record {
														
 
															+
														
 
															+        tdb_off_t next; /* offset of the next record in the list 
														
 
															+*/
														
 
															+
														
 
															+        tdb_len_t rec_len; /* total byte length of record */
														
 
															+
														
 
															+        tdb_len_t key_len; /* byte length of key */
														
 
															+
														
 
															+        tdb_len_t data_len; /* byte length of data */
														
 
															+
														
 
															+        uint32_t full_hash; /* the full 32 bit hash of the key */
														
 
															+
														
 
															+        uint32_t magic;   /* try to catch errors */
														
 
															+
														
 
															+        /* the following union is implied:
														
 
															+
														
 
															+                union {
														
 
															+
														
 
															+                        char record[rec_len];
														
 
															+
														
 
															+                        struct {
														
 
															+
														
 
															+                                char key[key_len];
														
 
															+
														
 
															+                                char data[data_len];
														
 
															+
														
 
															+                        }
														
 
															+
														
 
															+                        uint32_t totalsize; (tailer)
														
 
															+
														
 
															+                }
														
 
															+
														
 
															+        */
														
 
															+
														
 
															+};
														
 
															+
														
 
															+Naively, this would double to a 56-byte overhead on a 64 bit 
														
 
															+implementation.
														
 
															+
														
 
															+3.7.1 Proposed Solution
														
 
															+
														
 
															+We can use various techniques to reduce this for an allocated 
														
 
															+block:
														
 
															+
														
 
															+1. The 'next' pointer is not required, as we are using a flat 
														
 
															+  hash table.
														
 
															+
														
 
															+2. 'rec_len' can instead be expressed as an addition to key_len 
														
 
															+  and data_len (it accounts for wasted or overallocated length in 
														
 
															+  the record). Since the record length is always a multiple of 8, 
														
 
															+  we can conveniently fit it in 32 bits (representing up to 35 
														
 
															+  bits).
														
 
															+
														
 
															+3. 'key_len' and 'data_len' can be reduced. I'm unwilling to 
														
 
															+  restrict 'data_len' to 32 bits, but instead we can combine the 
														
 
															+  two into one 64-bit field and use a 5 bit value which 
														
 
															+  indicates at what bit to divide the two. Keys are unlikely to 
														
 
															+  scale as fast as data, so I'm assuming a maximum key size of 32 
														
 
															+  bits.
														
 
															+
														
 
															+4. 'full_hash' is used to avoid a memcmp on the “miss” case, but 
														
 
															+  this is diminishing returns after a handful of bits (at 10 
														
 
															+  bits, it reduces 99.9% of false memcmp). As an aside, as the 
														
 
															+  lower bits are already incorporated in the hash table 
														
 
															+  resolution, the upper bits should be used here.
														
 
															+
														
 
															+5. 'magic' does not need to be enlarged: it currently reflects 
														
 
															+  one of 5 values (used, free, dead, recovery, and 
														
 
															+  unused_recovery). It is useful for quick sanity checking 
														
 
															+  however, and should not be eliminated.
														
 
															+
														
 
															+6. 'tailer' is only used to coalesce free blocks (so a block to 
														
 
															+  the right can find the header to check if this block is free). 
														
 
															+  This can be replaced by a single 'free' bit in the header of 
														
 
															+  the following block (and the tailer only exists in free 
														
 
															+  blocks).[footnote:
														
 
															+This technique from Thomas Standish. Data Structure Techniques. 
														
 
															+Addison-Wesley, Reading, Massachusetts, 1980.
														
 
															+] The current proposed coalescing algorithm doesn't need this, 
														
 
															+  however.
														
 
															+
														
 
															+This produces a 16 byte used header like this:
														
 
															+
														
 
															+struct tdb_used_record {
														
 
															+
														
 
															+        uint32_t magic : 16,
														
 
															+
														
 
															+                 prev_is_free: 1,
														
 
															+
														
 
															+                 key_data_divide: 5,
														
 
															+
														
 
															+                 top_hash: 10;
														
 
															+
														
 
															+        uint32_t extra_octets;
														
 
															+
														
 
															+        uint64_t key_and_data_len;
														
 
															+
														
 
															+};
														
 
															+
														
 
															+And a free record like this:
														
 
															+
														
 
															+struct tdb_free_record {
														
 
															+
														
 
															+        uint32_t free_magic;
														
 
															+
														
 
															+        uint64_t total_length;
														
 
															+
														
 
															+        ...
														
 
															+
														
 
															+        uint64_t tailer;
														
 
															+
														
 
															+};
														
 
															+
														
 
															+
														
 
															+
														
 
															+3.8 Transaction Commit Requires 4 fdatasync
														
 
															+
														
 
															+The current transaction algorithm is:
														
 
															+
														
 
															+1. write_recovery_data();
														
 
															+
														
 
															+2. sync();
														
 
															+
														
 
															+3. write_recovery_header();
														
 
															+
														
 
															+4. sync();
														
 
															+
														
 
															+5. overwrite_with_new_data();
														
 
															+
														
 
															+6. sync();
														
 
															+
														
 
															+7. remove_recovery_header();
														
 
															+
														
 
															+8. sync(); 
														
 
															+
														
 
															+On current ext3, each sync flushes all data to disk, so the next 
														
 
															+3 syncs are relatively inexpensive. But this could become a 
														
 
															+performance bottleneck on other filesystems such as ext4.
														
 
															+
														
 
															+3.8.1 Proposed Solution
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+Neil Brown points out that this is overzealous, and only one sync 
														
 
															+is needed:
														
 
															+
														
 
															+1. Bundle the recovery data, a transaction counter and a strong 
														
 
															+  checksum of the new data.
														
 
															+
														
 
															+2. Strong checksum that whole bundle.
														
 
															+
														
 
															+3. Store the bundle in the database.
														
 
															+
														
 
															+4. Overwrite the oldest of the two recovery pointers in the 
														
 
															+  header (identified using the transaction counter) with the 
														
 
															+  offset of this bundle.
														
 
															+
														
 
															+5. sync.
														
 
															+
														
 
															+6. Write the new data to the file.
														
 
															+
														
 
															+Checking for recovery means identifying the latest bundle with a 
														
 
															+valid checksum and using the new data checksum to ensure that it 
														
 
															+has been applied. This is more expensive than the current check, 
														
 
															+but need only be done at open. For running databases, a separate 
														
 
															+header field can be used to indicate a transaction in progress; 
														
 
															+we need only check for recovery if this is set.
														
 
															+
														
 
															+3.9 TDB Does Not Have Snapshot Support
														
 
															+
														
 
															+3.9.1 Proposed Solution
														
 
															+
														
 
															+None. At some point you say “use a real database”.
														
 
															+
														
 
															+But as a thought experiment, if we implemented transactions to 
														
 
															+only overwrite free entries (this is tricky: there must not be a 
														
 
															+header in each entry which indicates whether it is free, but use 
														
 
															+presence in metadata elsewhere instead), and a pointer to the hash 
														
 
															+table, we could create an entirely new commit without destroying 
														
 
															+existing data. Then it would be easy to implement snapshots in a 
														
 
															+similar way.
														
 
															+
														
 
															+This would not allow arbitrary changes to the database, such as 
														
 
															+tdb_repack does, and would require more space (since we have to 
														
 
															+preserve the current and future entries at once). If we used hash 
														
 
															+trees rather than one big hash table, we might only have to 
														
 
															+rewrite some sections of the hash, too.
														
 
															+
														
 
															+We could then implement snapshots using a similar method, using 
														
 
															+multiple different hash tables/free tables.
														
 
															+
														
 
															+3.10 Transactions Cannot Operate in Parallel
														
 
															+
														
 
															+This would be useless for ldb, as it hits the index records with 
														
 
															+just about every update. It would add significant complexity in 
														
 
															+resolving clashes, and cause all transaction callers to write 
														
 
															+their code to loop in the case where the transactions spuriously 
														
 
															+failed.
														
 
															+
														
 
															+3.10.1 Proposed Solution
														
 
															+
														
 
															+We could solve a small part of the problem by providing read-only 
														
 
															+transactions. These would allow one write transaction to begin, 
														
 
															+but it could not commit until all r/o transactions are done. This 
														
 
															+would require a new RO_TRANSACTION_LOCK, which would be upgraded 
														
 
															+on commit.
														
 
															+
														
 
															+3.11 Default Hash Function Is Suboptimal
														
 
															+
														
 
															+The Knuth-inspired multiplicative hash used by tdb is fairly slow 
														
 
															+(especially if we expand it to 64 bits), and works best when the 
														
 
															+hash bucket size is a prime number (which also means a slow 
														
 
															+modulus). In addition, it is highly predictable which could 
														
 
															+potentially lead to a Denial of Service attack in some TDB uses.
														
 
															+
														
 
															+3.11.1 Proposed Solution
														
 
															+
														
 
															+The Jenkins lookup3 hash[footnote:
														
 
															+http://burtleburtle.net/bob/c/lookup3.c
														
 
															+] is a fast and superbly-mixing hash. It's used by the Linux 
														
 
															+kernel and almost everything else. This has the particular 
														
 
															+properties that it takes an initial seed, and produces two 32 bit 
														
 
															+hash numbers, which we can combine into a 64-bit hash.
														
 
															+
														
 
															+The seed should be created at tdb-creation time from some random 
														
 
															+source, and placed in the header. This is far from foolproof, but 
														
 
															+adds a little bit of protection against hash bombing.
														
 
															+
														
 
															+3.12 <Reliable-Traversal-Adds>Reliable Traversal Adds Complexity
														
 
															+
														
 
															+We lock a record during traversal iteration, and try to grab that 
														
 
															+lock in the delete code. If that grab on delete fails, we simply 
														
 
															+mark it deleted and continue onwards; traversal checks for this 
														
 
															+condition and does the delete when it moves off the record.
														
 
															+
														
 
															+If traversal terminates, the dead record may be left 
														
 
															+indefinitely.
														
 
															+
														
 
															+3.12.1 Proposed Solution
														
 
															+
														
 
															+Remove reliability guarantees; see [traverse-Proposed-Solution].
														
 
															+
														
 
															+3.13 Fcntl Locking Adds Overhead
														
 
															+
														
 
															+Placing a fcntl lock means a system call, as does removing one. 
														
 
															+This is actually one reason why transactions can be faster 
														
 
															+(everything is locked once at transaction start). In the 
														
 
															+uncontended case, this overhead can theoretically be eliminated.
														
 
															+
														
 
															+3.13.1 Proposed Solution
														
 
															+
														
 
															+None.
														
 
															+
														
 
															+We tried this before with spinlock support, in the early days of 
														
 
															+TDB, and it didn't make much difference except in manufactured 
														
 
															+benchmarks.
														
 
															+
														
 
															+We could use spinlocks (with futex kernel support under Linux), 
														
 
															+but it means that we lose automatic cleanup when a process dies 
														
 
															+with a lock. There is a method of auto-cleanup under Linux, but 
														
 
															+it's not supported by other operating systems. We could 
														
 
															+reintroduce a clear-if-first-style lock and sweep for dead 
														
 
															+futexes on open, but that wouldn't help the normal case of one 
														
 
															+concurrent opener dying. Increasingly elaborate repair schemes 
														
 
															+could be considered, but they require an ABI change (everyone 
														
 
															+must use them) anyway, so there's no need to do this at the same 
														
 
															+time as everything else.
														
 
															+
														
--- a/ccan/tdb2/doc/design.lyx
+++ b/ccan/tdb2/doc/design.lyx
@@ -0,0 +1,2282 @@
 
															+#LyX 1.6.5 created this file. For more info see http://www.lyx.org/
														
 
															+\lyxformat 345
														
 
															+\begin_document
														
 
															+\begin_header
														
 
															+\textclass article
														
 
															+\use_default_options true
														
 
															+\language english
														
 
															+\inputencoding auto
														
 
															+\font_roman default
														
 
															+\font_sans default
														
 
															+\font_typewriter default
														
 
															+\font_default_family default
														
 
															+\font_sc false
														
 
															+\font_osf false
														
 
															+\font_sf_scale 100
														
 
															+\font_tt_scale 100
														
 
															+
														
 
															+\graphics default
														
 
															+\paperfontsize default
														
 
															+\use_hyperref false
														
 
															+\papersize default
														
 
															+\use_geometry false
														
 
															+\use_amsmath 1
														
 
															+\use_esint 1
														
 
															+\cite_engine basic
														
 
															+\use_bibtopic false
														
 
															+\paperorientation portrait
														
 
															+\secnumdepth 3
														
 
															+\tocdepth 3
														
 
															+\paragraph_separation indent
														
 
															+\defskip medskip
														
 
															+\quotes_language english
														
 
															+\papercolumns 1
														
 
															+\papersides 1
														
 
															+\paperpagestyle default
														
 
															+\tracking_changes true
														
 
															+\output_changes true
														
 
															+\author "" 
														
 
															+\author "" 
														
 
															+\end_header
														
 
															+
														
 
															+\begin_body
														
 
															+
														
 
															+\begin_layout Title
														
 
															+TDB2: Redesigning The Trivial DataBase
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Author
														
 
															+Rusty Russell, IBM Corporation
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Date
														
 
															+26-July-2010
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Abstract
														
 
															+The Trivial DataBase on-disk format is 32 bits; with usage cases heading
														
 
															+ towards the 4G limit, that must change.
														
 
															+ This required breakage provides an opportunity to revisit TDB's other design
														
 
															+ decisions and reassess them.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Section
														
 
															+Introduction
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The Trivial DataBase was originally written by Andrew Tridgell as a simple
														
 
															+ key/data pair storage system with the same API as dbm, but allowing multiple
														
 
															+ readers and writers while being small enough (< 1000 lines of C) to include
														
 
															+ in SAMBA.
														
 
															+ The simple design created in 1999 has proven surprisingly robust and performant
														
 
															+, used in Samba versions 3 and 4 as well as numerous other projects.
														
 
															+ Its useful life was greatly increased by the (backwards-compatible!) addition
														
 
															+ of transaction support in 2005.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The wider variety and greater demands of TDB-using code have led to some
														
 
															+ organic growth of the API, as well as some compromises on the implementation.
														
 
															+ None of these, by themselves, are seen as show-stoppers, but the cumulative
														
 
															+ effect is a loss of elegance over the initial, simple TDB implementation.
														
 
															+ Here is a table of the approximate number of lines of implementation code
														
 
															+ and number of API functions at the end of each year:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+\begin_inset Tabular
														
 
															+<lyxtabular version="3" rows="12" columns="3">
														
 
															+<features>
														
 
															+<column alignment="center" valignment="top" width="0">
														
 
															+<column alignment="center" valignment="top" width="0">
														
 
															+<column alignment="center" valignment="top" width="0">
														
 
															+<row>
														
 
															+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+Year End
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+API Functions
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+Lines of C Code Implementation
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+</row>
														
 
															+<row>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+1999
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+13
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+1195
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+</row>
														
 
															+<row>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+2000
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+24
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+1725
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+</row>
														
 
															+<row>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+2001
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+32
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+2228
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+</row>
														
 
															+<row>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+2002
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+35
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+2481
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+</row>
														
 
															+<row>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+2003
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+35
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+2552
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+</row>
														
 
															+<row>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+2004
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+40
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+2584
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+</row>
														
 
															+<row>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+2005
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+38
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+2647
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+</row>
														
 
															+<row>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+2006
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+52
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+3754
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+</row>
														
 
															+<row>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+2007
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+66
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+4398
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+</row>
														
 
															+<row>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+2008
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+71
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+4768
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+</row>
														
 
															+<row>
														
 
															+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+2009
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+73
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+5715
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+</row>
														
 
															+</lyxtabular>
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+This review is an attempt to catalog and address all the known issues with
														
 
															+ TDB and create solutions which address the problems without significantly
														
 
															+ increasing complexity; all involved are far too aware of the dangers of
														
 
															+ second system syndrome in rewriting a successful project like this.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Section
														
 
															+API Issues
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+tdb_open_ex Is Not Expandable
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The tdb_open() call was expanded to tdb_open_ex(), which added an optional
														
 
															+ hashing function and an optional logging function argument.
														
 
															+ Additional arguments to open would require the introduction of a tdb_open_ex2
														
 
															+ call etc.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+tdb_open() will take a linked-list of attributes:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+enum tdb_attribute {
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+    TDB_ATTRIBUTE_LOG = 0,
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+    TDB_ATTRIBUTE_HASH = 1
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+};
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+struct tdb_attribute_base {
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+    enum tdb_attribute attr;
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+    union tdb_attribute *next;
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+};
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+struct tdb_attribute_log {
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+    tdb_log_func log_fn;
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+    void *log_private;
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+};
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+struct tdb_attribute_hash {
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+    tdb_hash_func hash_fn;
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+    void *hash_private;
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+};
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+union tdb_attribute {
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+    struct tdb_attribute_base base;
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+    struct tdb_attribute_log log;
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+    struct tdb_attribute_hash hash;
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+};
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+This allows future attributes to be added, even if this expands the size
														
 
															+ of the union.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+tdb_traverse Makes Impossible Guarantees
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, and it
														
 
															+ was thought that it was important to guarantee that all records which exist
														
 
															+ at the start and end of the traversal would be included, and no record
														
 
															+ would be included twice.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+This adds complexity (see
														
 
															+\begin_inset CommandInset ref
														
 
															+LatexCommand ref
														
 
															+reference "Reliable-Traversal-Adds"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+) and does not work anyway for records which are altered (in particular,
														
 
															+ those which are expanded may be effectively deleted and re-added behind
														
 
															+ the traversal).
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+\begin_inset CommandInset label
														
 
															+LatexCommand label
														
 
															+name "traverse-Proposed-Solution"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Abandon the guarantee.
														
 
															+ You will see every record if no changes occur during your traversal, otherwise
														
 
															+ you will see some subset.
														
 
															+ You can prevent changes by using a transaction or the locking API.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+Nesting of Transactions Is Fraught
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+TDB has alternated between allowing nested transactions and not allowing
														
 
															+ them.
														
 
															+ Various paths in the Samba codebase assume that transactions will nest,
														
 
															+ and in a sense they can: the operation is only committed to disk when the
														
 
															+ outer transaction is committed.
														
 
															+ There are two problems, however:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Canceling the inner transaction will cause the outer transaction commit
														
 
															+ to fail, and will not undo any operations since the inner transaction began.
														
 
															+ This problem is soluble with some additional internal code.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+An inner transaction commit can be cancelled by the outer transaction.
														
 
															+ This is desirable in the way which Samba's database initialization code
														
 
															+ uses transactions, but could be a surprise to any users expecting a successful
														
 
															+ transaction commit to expose changes to others.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The current solution is to specify the behavior at tdb_open(), with the
														
 
															+ default currently that nested transactions are allowed.
														
 
															+ This flag can also be changed at runtime.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Given the usage patterns, it seems that the 
														
 
															+\begin_inset Quotes eld
														
 
															+\end_inset
														
 
															+
														
 
															+least-surprise
														
 
															+\begin_inset Quotes erd
														
 
															+\end_inset
														
 
															+
														
 
															+ behavior of disallowing nested transactions should become the default.
														
 
															+ Additionally, it seems the outer transaction is the only code which knows
														
 
															+ whether inner transactions should be allowed, so a flag to indicate this
														
 
															+ could be added to tdb_transaction_start.
														
 
															+ However, this behavior can be simulated with a wrapper which uses tdb_add_flags
														
 
															+() and tdb_remove_flags(), so the API should not be expanded for this relatively
														
 
															+-obscure case.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+Incorrect Hash Function is Not Detected
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+tdb_open_ex() allows the calling code to specify a different hash function
														
 
															+ to use, but does not check that all other processes accessing this tdb
														
 
															+ are using the same hash function.
														
 
															+ The result is that records are missing from tdb_fetch().
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The header should contain an example hash result (eg.
														
 
															+ the hash of 0xdeadbeef), and tdb_open_ex() should check that the given
														
 
															+ hash function produces the same answer, or fail the tdb_open call.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+tdb_set_max_dead/TDB_VOLATILE Expose Implementation
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+In response to scalability issues with the free list (
														
 
															+\begin_inset CommandInset ref
														
 
															+LatexCommand ref
														
 
															+reference "TDB-Freelist-Is"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+) two API workarounds have been incorporated in TDB: tdb_set_max_dead()
														
 
															+ and the TDB_VOLATILE flag to tdb_open.
														
 
															+ The latter actually calls the former with an argument of 
														
 
															+\begin_inset Quotes eld
														
 
															+\end_inset
														
 
															+
														
 
															+5
														
 
															+\begin_inset Quotes erd
														
 
															+\end_inset
														
 
															+
														
 
															+.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+This code allows deleted records to accumulate without putting them in the
														
 
															+ free list.
														
 
															+ On delete we iterate through each chain and free them in a batch if there
														
 
															+ are more than max_dead entries.
														
 
															+ These are never otherwise recycled except as a side-effect of a tdb_repack.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+With the scalability problems of the freelist solved, this API can be removed.
														
 
															+ The TDB_VOLATILE flag may still be useful as a hint that store and delete
														
 
															+ of records will be at least as common as fetch in order to allow some internal
														
 
															+ tuning, but initially will become a no-op.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+\begin_inset CommandInset label
														
 
															+LatexCommand label
														
 
															+name "TDB-Files-Cannot"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+TDB Files Cannot Be Opened Multiple Times In The Same Process
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+No process can open the same TDB twice; we check and disallow it.
														
 
															+ This is an unfortunate side-effect of fcntl locks, which operate on a per-file
														
 
															+ rather than per-file-descriptor basis, and do not nest.
														
 
															+ Thus, closing any file descriptor on a file clears all the locks obtained
														
 
															+ by this process, even if they were placed using a different file descriptor!
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Note that even if this were solved, deadlock could occur if operations were
														
 
															+ nested: this is a more manageable programming error in most cases.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+We could lobby POSIX to fix the perverse rules, or at least lobby Linux
														
 
															+ to violate them so that the most common implementation does not have this
														
 
															+ restriction.
														
 
															+ This would be a generally good idea for other fcntl lock users.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Samba uses a wrapper which hands out the same tdb_context to multiple callers
														
 
															+ if this happens, and does simple reference counting.
														
 
															+ We should do this inside the tdb library, which already emulates lock nesting
														
 
															+ internally; it would need to recognize when deadlock occurs within a single
														
 
															+ process.
														
 
															+ This would create a new failure mode for tdb operations (while we currently
														
 
															+ handle locking failures, they are impossible in normal use and a process
														
 
															+ encountering them can do little but give up).
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+I do not see benefit in an additional tdb_open flag to indicate whether
														
 
															+ re-opening is allowed, although there may be some benefit to adding a
														
 
															+ call to detect when a tdb_context is shared, to allow others to create such
														
 
															+ an API.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+TDB API Is Not POSIX Thread-safe
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The TDB API uses an error code which can be queried after an operation to
														
 
															+ determine what went wrong.
														
 
															+ This programming model does not work with threads, unless specific additional
														
 
															+ guarantees are given by the implementation.
														
 
															+ In addition, even otherwise-independent threads cannot open the same TDB
														
 
															+ (as in 
														
 
															+\begin_inset CommandInset ref
														
 
															+LatexCommand ref
														
 
															+reference "TDB-Files-Cannot"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+).
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Rearchitecting the API to include a tdb_errcode pointer would be a great
														
 
															+ deal of churn; it is better to guarantee that the tdb_errcode is per-thread
														
 
															+ so the current programming model can be maintained.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+This requires dynamic per-thread allocations, which is awkward with POSIX
														
 
															+ threads (pthread_key_create space is limited and we cannot simply allocate
														
 
															+ a key for every TDB).
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Internal locking is required to make sure that fcntl locks do not overlap
														
 
															+ between threads, and also that the global list of tdbs is maintained.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe
														
 
															+ version of the library, and otherwise no overhead will exist.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+*_nonblock Functions And *_mark Functions Expose Implementation
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+CTDB
														
 
															+\begin_inset Foot
														
 
															+status collapsed
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+Clustered TDB, see http://ctdb.samba.org
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+ wishes to operate on TDB in a non-blocking manner.
														
 
															+ This is currently done as follows:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Call the _nonblock variant of an API function (eg.
														
 
															+ tdb_lockall_nonblock).
														
 
															+ If this fails:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Fork a child process, and wait for it to call the normal variant (eg.
														
 
															+ tdb_lockall).
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+If the child succeeds, call the _mark variant to indicate we already have
														
 
															+ the locks (eg.
														
 
															+ tdb_lockall_mark).
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Upon completion, tell the child to release the locks (eg.
														
 
															+ tdb_unlockall).
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Indicate to tdb that it should consider the locks removed (eg.
														
 
															+ tdb_unlockall_mark).
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+There are several issues with this approach.
														
 
															+ Firstly, adding two new variants of each function clutters the API for
														
 
															+ an obscure use, and so not all functions have three variants.
														
 
															+ Secondly, it assumes that all paths of the functions ask for the same locks,
														
 
															+ otherwise the parent process will have to get a lock which the child doesn't
														
 
															+ have under some circumstances.
														
 
															+ I don't believe this is currently the case, but it constrains the implementatio
														
 
															+n.
														
 
															+ 
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+\begin_inset CommandInset label
														
 
															+LatexCommand label
														
 
															+name "Proposed-Solution-locking-hook"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Implement a hook for locking methods, so that the caller can control the
														
 
															+ calls to create and remove fcntl locks.
														
 
															+ In this scenario, ctdbd would operate as follows:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Call the normal API function, eg. tdb_lockall().
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+When the lock callback comes in, check if the child has the lock.
														
 
															+ Initially, this is always false.
														
 
															+ If the child has the lock, return 0.
														
 
															+ Otherwise, try to obtain it in non-blocking mode.
														
 
															+ If that fails, return EWOULDBLOCK.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Release locks in the unlock callback as normal.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+If tdb_lockall() fails, see if we recorded a lock failure; if so, call the
														
 
															+ child to repeat the operation.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+The child records what locks it obtains, and returns that information to
														
 
															+ the parent.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+When the child has succeeded, goto 1.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+This is flexible enough to handle any potential locking scenario, even when
														
 
															+ lock requirements change.
														
 
															+ It can be optimized so that the parent does not release locks, just tells
														
 
															+ the child which locks it doesn't need to obtain.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+It also keeps the complexity out of the API, and in ctdbd where it is needed.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+tdb_chainlock Functions Expose Implementation
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+tdb_chainlock locks some number of records, including the record indicated
														
 
															+ by the given key.
														
 
															+ This gave atomicity guarantees; no-one can start a transaction, alter,
														
 
															+ read or delete that key while the lock is held.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+It also makes the same guarantee for any other key in the chain, which is
														
 
															+ an internal implementation detail and potentially a cause for deadlock.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+None.
														
 
															+ It would be nice to have an explicit single entry lock which affected no
														
 
															+ other keys.
														
 
															+ Unfortunately, this won't work for an entry which doesn't exist.
														
 
															+ Thus while chainlock may be implemented more efficiently for the existing
														
 
															+ case, it will still have overlap issues with the non-existing case.
														
 
															+ So it is best to keep the current (lack of) guarantee about which records
														
 
															+ will be affected to avoid constraining our implementation.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+Signal Handling is Not Race-Free
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The tdb_setalarm_sigptr() call allows the caller's signal handler to indicate
														
 
															+ that the tdb locking code should return with a failure, rather than trying
														
 
															+ again when a signal is received (and errno == EINTR).
														
 
															+ This is usually used to implement timeouts.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Unfortunately, this does not work in the case where the signal is received
														
 
															+ before the tdb code enters the fcntl() call to place the lock: the code
														
 
															+ will sleep within the fcntl() code, unaware that the signal wants it to
														
 
															+ exit.
														
 
															+ In the case of long timeouts, this does not happen in practice.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The locking hooks proposed in
														
 
															+\begin_inset CommandInset ref
														
 
															+LatexCommand ref
														
 
															+reference "Proposed-Solution-locking-hook"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+ would allow the user to decide on whether to fail the lock acquisition
														
 
															+ on a signal.
														
 
															+ This allows the caller to choose their own compromise: they could narrow
														
 
															+ the race by checking immediately before the fcntl call.
														
 
															+\begin_inset Foot
														
 
															+status collapsed
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+It may be possible to make this race-free in some implementations by having
														
 
															+ the signal handler alter the struct flock to make it invalid.
														
 
															+ This will cause the fcntl() lock call to fail with EINVAL if the signal
														
 
															+ occurs before the kernel is entered, otherwise EINTR.
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+The API Uses Gratuitous Typedefs, Capitals
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+typedefs are useful for providing source compatibility when types can differ
														
 
															+ across implementations, or arguably in the case of function pointer definitions
														
 
															+ which are hard for humans to parse.
														
 
															+ Otherwise it is simply obfuscation and pollutes the namespace.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Capitalization is usually reserved for compile-time constants and macros.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Description
														
 
															+TDB_CONTEXT There is no reason to use this over 'struct tdb_context'; the
														
 
															+ definition isn't visible to the API user anyway.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Description
														
 
															+TDB_DATA There is no reason to use this over struct TDB_DATA; the struct
														
 
															+ needs to be understood by the API user.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Description
														
 
															+struct
														
 
															+\begin_inset space ~
														
 
															+\end_inset
														
 
															+
														
 
															+TDB_DATA This would normally be called 'struct tdb_data'.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Description
														
 
															+enum
														
 
															+\begin_inset space ~
														
 
															+\end_inset
														
 
															+
														
 
															+TDB_ERROR Similarly, this would normally be enum tdb_error.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+None.
														
 
															+ Introducing lower case variants would please pedants like myself, but if
														
 
															+ it were done the existing ones should be kept.
														
 
															+ There is little point forcing a purely cosmetic change upon tdb users.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+\begin_inset CommandInset label
														
 
															+LatexCommand label
														
 
															+name "tdb_log_func-Doesnt-Take"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+tdb_log_func Doesn't Take The Private Pointer
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+For API compatibility reasons, the logging function needs to call tdb_get_loggin
														
 
															+g_private() to retrieve the pointer registered by the tdb_open_ex for logging.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+It should simply take an extra argument, since we are prepared to break
														
 
															+ the API/ABI.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+Various Callback Functions Are Not Typesafe
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The callback functions in tdb_set_logging_function (after 
														
 
															+\begin_inset CommandInset ref
														
 
															+LatexCommand ref
														
 
															+reference "tdb_log_func-Doesnt-Take"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+ is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read and tdb_check
														
 
															+ all take void * and must internally convert it to the argument type they
														
 
															+ were expecting.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+If this type changes, the compiler will not produce warnings on the callers,
														
 
															+ since it only sees void *.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+With careful use of macros, we can create callback functions which give
														
 
															+ a warning when used on gcc and the types of the callback and its private
														
 
															+ argument differ.
														
 
															+ Unsupported compilers will not give a warning, which is no worse than now.
														
 
															+ In addition, the callbacks become clearer, as they need not use void *
														
 
															+ for their parameter.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB file should
														
 
															+ be cleared if the caller discovers it is the only process with the TDB
														
 
															+ open.
														
 
															+ However, if any caller does not specify TDB_CLEAR_IF_FIRST it will not
														
 
															+ be detected, so will have the TDB erased underneath them (usually resulting
														
 
															+ in a crash).
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+There is a similar issue on fork(); if the parent exits (or otherwise closes
														
 
															+ the tdb) before the child calls tdb_reopen_all() to establish the lock
														
 
															+ used to indicate the TDB is opened by someone, a TDB_CLEAR_IF_FIRST opener
														
 
															+ at that moment will believe it alone has opened the TDB and will erase
														
 
															+ it.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Remove TDB_CLEAR_IF_FIRST.
														
 
															+ Other workarounds are possible, but see 
														
 
															+\begin_inset CommandInset ref
														
 
															+LatexCommand ref
														
 
															+reference "TDB_CLEAR_IF_FIRST-Imposes-Performance"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Section
														
 
															+Performance And Scalability Issues
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+\begin_inset CommandInset label
														
 
															+LatexCommand label
														
 
															+name "TDB_CLEAR_IF_FIRST-Imposes-Performance"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+TDB_CLEAR_IF_FIRST Imposes Performance Penalty
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is placed at offset
														
 
															+ 4 (aka.
														
 
															+ the ACTIVE_LOCK).
														
 
															+ While these locks never conflict in normal tdb usage, they do add substantial
														
 
															+ overhead for most fcntl lock implementations when the kernel scans to detect
														
 
															+ if a lock conflict exists.
														
 
															+ This is often a singly linked list, making the time to acquire and release
														
 
															+ a fcntl lock O(N) where N is the number of processes with the TDB open,
														
 
															+ not the number actually doing work.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+In a Samba server it is common to have huge numbers of clients sitting idle,
														
 
															+ and thus they have weaned themselves off the TDB_CLEAR_IF_FIRST flag.
														
 
															+\begin_inset Foot
														
 
															+status collapsed
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+There is a flag to tdb_reopen_all() which is used for this optimization:
														
 
															+ if the parent process will outlive the child, the child does not need the
														
 
															+ ACTIVE_LOCK.
														
 
															+ This is a workaround for this very performance issue.
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Remove the flag.
														
 
															+ It was a neat idea, but even trivial servers tend to know when they are
														
 
															+ initializing for the first time and can simply unlink the old tdb at that
														
 
															+ point.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+TDB Files Have a 4G Limit
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+This seems to be becoming an issue (so much for 
														
 
															+\begin_inset Quotes eld
														
 
															+\end_inset
														
 
															+
														
 
															+trivial
														
 
															+\begin_inset Quotes erd
														
 
															+\end_inset
														
 
															+
														
 
															+!), particularly for ldb.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+A new, incompatible TDB format which uses 64 bit offsets internally rather
														
 
															+ than 32 bit as now.
														
 
															+ For simplicity of endian conversion (which TDB does on the fly if required),
														
 
															+ all values will be 64 bit on disk.
														
 
															+ In practice, some upper bits may be used for other purposes, but at least
														
 
															+ 56 bits will be available for file offsets.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+tdb_open() will automatically detect old-version files, and even create them
														
 
															+ if TDB_VERSION6 is specified to tdb_open.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+32 bit processes will still be able to access TDBs larger than 4G (assuming
														
 
															+ that their off_t allows them to seek to 64 bits); they will gracefully
														
 
															+ fall back as they fail to mmap.
														
 
															+ This can happen already with large TDBs.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Old versions of tdb will fail to open the new TDB files (since 28 August
														
 
															+ 2009, commit 398d0c29290: prior to that any unrecognized file format would
														
 
															+ be erased and initialized as a fresh tdb!)
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+TDB Records Have a 4G Limit
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+This has not been a reported problem, and the API uses size_t which can
														
 
															+ be 64 bit on 64 bit platforms.
														
 
															+ However, other limits may have made such an issue moot.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Record sizes will be 64 bit, with an error returned on 32 bit platforms
														
 
															+ which try to access such records (the current implementation would return
														
 
															+ TDB_ERR_OOM in a similar case).
														
 
															+ It seems unlikely that 32 bit keys will be a limitation, so the implementation
														
 
															+ may not support this (see 
														
 
															+\begin_inset CommandInset ref
														
 
															+LatexCommand ref
														
 
															+reference "sub:Records-Incur-A"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+).
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+Hash Size Is Determined At TDB Creation Time
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+TDB contains a number of hash chains in the header; the number is specified
														
 
															+ at creation time, and defaults to 131.
														
 
															+ This is such a bottleneck on large databases (as each hash chain gets quite
														
 
															+ long), that LDB uses 10,000 for this hash.
														
 
															+ In general it is impossible to know what the 'right' answer is at database
														
 
															+ creation time.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+After comprehensive performance testing on various scalable hash variants
														
 
															+\begin_inset Foot
														
 
															+status collapsed
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoying
														
 
															+ because I was previously convinced that an expanding tree of hashes would
														
 
															+ be very close to optimal.
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+, it became clear that it is hard to beat a straight linear hash table which
														
 
															+ doubles in size when it reaches saturation.
														
 
															+ There are three details which become important:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+On encountering a full bucket, we use the next bucket.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Extra hash bits are stored with the offset, to reduce comparisons.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+A marker entry is used on deleting an entry.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The doubling of the table must be done under a transaction; we will not
														
 
															+ reduce it on deletion, so it will be an unusual case.
														
 
															+ It will either be placed at the head (other entries will be moved out of the
														
 
															+ way so we can expand).
														
 
															+ Alternatively, we could have a pointer in the header to the current hashtable location,
														
 
															+ but that pointer would have to be read frequently to check for hashtable
														
 
															+ moves.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The locking for this is slightly more complex than the chained case; we
														
 
															+ currently have one lock per bucket, and that means we would need to expand
														
 
															+ the lock if we overflow to the next bucket.
														
 
															+ The frequency of such collisions will affect our locking heuristics: we
														
 
															+ can always lock more buckets than we need.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+One possible optimization is to only re-check the hash size on an insert
														
 
															+ or a lookup miss.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+\begin_inset CommandInset label
														
 
															+LatexCommand label
														
 
															+name "TDB-Freelist-Is"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+TDB Freelist Is Highly Contended
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+TDB uses a singly linked list for the free list.
														
 
															+ Allocation occurs as follows, using heuristics which have evolved over
														
 
															+ time:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Get the free list lock for this whole operation.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Multiply length by 1.25, so we always over-allocate by 25%.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Set the slack multiplier to 1.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Examine the current freelist entry: if it is > length but < the current
														
 
															+ best case, remember it as the best case.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Multiply the slack multiplier by 1.05.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+If our best fit so far is less than length * slack multiplier, return it.
														
 
															+ The slack will be turned into a new free record if it's large enough.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Otherwise, go onto the next freelist entry.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Deleting a record occurs as follows:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Lock the hash chain for this whole operation.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Walk the chain to find the record, keeping the prev pointer offset.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+If max_dead is non-zero:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_deeper
														
 
															+\begin_layout Enumerate
														
 
															+Walk the hash chain again and count the dead records.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+If it's more than max_dead, bulk free all the dead ones (similar to steps
														
 
															+ 4 and below, but the lock is only obtained once).
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Simply mark this record as dead and return.
														
 
															+ 
														
 
															+\end_layout
														
 
															+
														
 
															+\end_deeper
														
 
															+\begin_layout Enumerate
														
 
															+Get the free list lock for the remainder of this operation.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+\begin_inset CommandInset label
														
 
															+LatexCommand label
														
 
															+name "right-merging"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+Examine the following block to see if it is free; if so, enlarge the current
														
 
															+ block and remove that block from the free list.
														
 
															+ This was disabled, as removal from the free list was O(entries-in-free-list).
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Examine the preceding block to see if it is free: for this reason, each
														
 
															+ block has a 32-bit tailer which indicates its length.
														
 
															+ If it is free, expand it to cover our new block and return.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Otherwise, prepend ourselves to the free list.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Disabling right-merging (step 
														
 
															+\begin_inset CommandInset ref
														
 
															+LatexCommand ref
														
 
															+reference "right-merging"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+) causes fragmentation; the other heuristics proved insufficient to address
														
 
															+ this, so the final answer to this was that when we expand the TDB file
														
 
															+ inside a transaction commit, we repack the entire tdb.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The single list lock limits our allocation rate; due to the other issues
														
 
															+ this is not currently seen as a bottleneck.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The first step is to remove all the current heuristics, as they obviously
														
 
															+ interact, then examine them once the lock contention is addressed.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The free list must be split to reduce contention.
														
 
															+ Assuming perfect free merging, we can at most have 1 free list entry for
														
 
															+ each entry.
														
 
															+ This implies that the number of free lists is related to the size of the
														
 
															+ hash table, but as it is rare to walk a large number of free list entries
														
 
															+ we can use far fewer, say 1/32 of the number of hash buckets.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+There are various benefits in using per-size free lists (see 
														
 
															+\begin_inset CommandInset ref
														
 
															+LatexCommand ref
														
 
															+reference "sub:TDB-Becomes-Fragmented"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+) but it's not clear this would reduce contention in the common case where
														
 
															+ all processes are allocating/freeing the same size.
														
 
															+ Thus we almost certainly need to divide in other ways: the most obvious
														
 
															+ is to divide the file into zones, and use a free list (or set of free
														
 
															+ lists) for each.
														
 
															+ This approximates address ordering.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Note that this means we need to split the free lists when we expand the
														
 
															+ file; this is probably acceptable when we double the hash table size, since
														
 
															+ that is such an expensive operation already.
														
 
															+ In the case of increasing the file size, there is an optimization we can
														
 
															+ use: if we use M in the formula above as the file size rounded up to the
														
 
															+ next power of 2, we only need reshuffle free lists when the file size crosses
														
 
															+ a power of 2 boundary, 
														
 
															+\emph on
														
 
															+and 
														
 
															+\emph default
														
 
															+reshuffling the free lists is trivial: we simply merge every consecutive
														
 
															+ pair of free lists.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The basic algorithm is as follows.
														
 
															+ Freeing is simple:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Identify the correct zone.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Lock the corresponding list.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Re-check the zone (we didn't have a lock, sizes could have changed): relock
														
 
															+ if necessary.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Place the freed entry in the list for that zone.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Allocation is a little more complicated, as we perform delayed coalescing
														
 
															+ at this point:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Pick a zone: either the zone we last freed into, or based on a 
														
 
															+\begin_inset Quotes eld
														
 
															+\end_inset
														
 
															+
														
 
															+random
														
 
															+\begin_inset Quotes erd
														
 
															+\end_inset
														
 
															+
														
 
															+ number.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Lock the corresponding list.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Re-check the zone: relock if necessary.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+If the top entry is large enough, remove it from the list and return it.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Otherwise, coalesce entries in the list. If there was no entry large enough,
														
 
															+ unlock the list and try the next zone.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+If no zone satisfies, expand the file.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+This optimizes rapid insert/delete of free list entries by not coalescing
														
 
															+ them all the time.
														
 
															+ First-fit address ordering seems to be fairly good for keeping
														
 
															+ fragmentation low (see 
														
 
															+\begin_inset CommandInset ref
														
 
															+LatexCommand ref
														
 
															+reference "sub:TDB-Becomes-Fragmented"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+).
														
 
															+ Note that address ordering does not need a tailer to coalesce, though if
														
 
															+ we needed one we could have one cheaply: see 
														
 
															+\begin_inset CommandInset ref
														
 
															+LatexCommand ref
														
 
															+reference "sub:Records-Incur-A"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+.
														
 
															+ 
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+I anticipate that the number of entries in each free zone would be small,
														
 
															+ but it might be worth using one free entry to hold pointers to the others
														
 
															+ for cache efficiency.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+\begin_inset CommandInset label
														
 
															+LatexCommand label
														
 
															+name "sub:TDB-Becomes-Fragmented"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+TDB Becomes Fragmented
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Much of this is a result of allocation strategy
														
 
															+\begin_inset Foot
														
 
															+status collapsed
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995 ftp://ftp.cs.ute
														
 
															+xas.edu/pub/garbage/malloc/ismm98.ps
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+ and deliberate hobbling of coalescing; internal fragmentation (aka overallocati
														
 
															+on) is deliberately set at 25%, and external fragmentation is only cured
														
 
															+ by the decision to repack the entire db when a transaction commit needs
														
 
															+ to enlarge the file.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The 25% overhead on allocation works in practice for ldb because indexes
														
 
															+ tend to expand by one record at a time.
														
 
															+ This internal fragmentation can be resolved by having an 
														
 
															+\begin_inset Quotes eld
														
 
															+\end_inset
														
 
															+
														
 
															+expanded
														
 
															+\begin_inset Quotes erd
														
 
															+\end_inset
														
 
															+
														
 
															+ bit in the header to note entries that have previously expanded, and allocating
														
 
															+ more space for them.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+There is a spectrum of possible solutions for external fragmentation:
														
 
															+ one is to use a fragmentation-avoiding allocation strategy such as a best-fit
														
 
															+ address-order allocator.
														
 
															+ The other end of the spectrum would be to use a bump allocator (very fast
														
 
															+ and simple) and simply repack the file when we reach the end.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+There are three problems with efficient fragmentation-avoiding allocators:
														
 
															+ they are non-trivial, they tend to use a single free list for each size,
														
 
															+ and there's no evidence that tdb allocation patterns will match those recorded
														
 
															+ for general allocators (though it seems likely).
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Thus we don't spend too much effort on external fragmentation; we will be
														
 
															+ no worse than the current code if we need to repack on occasion.
														
 
															+ More effort is spent on reducing freelist contention, and reducing overhead.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+\begin_inset CommandInset label
														
 
															+LatexCommand label
														
 
															+name "sub:Records-Incur-A"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+Records Incur A 28-Byte Overhead
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Each TDB record has a header as follows:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+struct tdb_record {
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+        tdb_off_t next; /* offset of the next record in the list */
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+        tdb_len_t rec_len; /* total byte length of record */
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+        tdb_len_t key_len; /* byte length of key */
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+        tdb_len_t data_len; /* byte length of data */
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+        uint32_t full_hash; /* the full 32 bit hash of the key */
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+        uint32_t magic;   /* try to catch errors */
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+        /* the following union is implied:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+                union {
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+                        char record[rec_len];
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+                        struct {
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+                                char key[key_len];
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+                                char data[data_len];
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+                        }
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+                        uint32_t totalsize; (tailer)
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+                }
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+        */
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+};
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Naively, this would double to a 56-byte overhead on a 64 bit implementation.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+We can use various techniques to reduce this for an allocated block:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+The 'next' pointer is not required, as we are using a flat hash table.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+'rec_len' can instead be expressed as an addition to key_len and data_len
														
 
															+ (it accounts for wasted or overallocated length in the record).
														
 
															+ Since the record length is always a multiple of 8, we can conveniently
														
 
															+ fit it in 32 bits (representing up to 35 bits).
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+'key_len' and 'data_len' can be reduced.
														
 
															+ I'm unwilling to restrict 'data_len' to 32 bits, but instead we can combine
														
 
															+ the two into one 64-bit field and use a 5 bit value which indicates at
														
 
															+ what bit to divide the two.
														
 
															+ Keys are unlikely to scale as fast as data, so I'm assuming a maximum key
														
 
															+ size of 32 bits.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+'full_hash' is used to avoid a memcmp on the 
														
 
															+\begin_inset Quotes eld
														
 
															+\end_inset
														
 
															+
														
 
															+miss
														
 
															+\begin_inset Quotes erd
														
 
															+\end_inset
														
 
															+
														
 
															+ case, but this is diminishing returns after a handful of bits (at 10 bits,
														
 
															+ it reduces 99.9% of false memcmp).
														
 
															+ As an aside, as the lower bits are already incorporated in the hash table
														
 
															+ resolution, the upper bits should be used here.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+'magic' does not need to be enlarged: it currently reflects one of 5 values
														
 
															+ (used, free, dead, recovery, and unused_recovery).
														
 
															+ It is useful for quick sanity checking however, and should not be eliminated.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+'tailer' is only used to coalesce free blocks (so a block to the right can
														
 
															+ find the header to check if this block is free).
														
 
															+ This can be replaced by a single 'free' bit in the header of the following
														
 
															+ block (and the tailer only exists in free blocks).
														
 
															+\begin_inset Foot
														
 
															+status collapsed
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+This technique from Thomas Standish.
														
 
															+ Data Structure Techniques.
														
 
															+ Addison-Wesley, Reading, Massachusetts, 1980.
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+ The current proposed coalescing algorithm doesn't need this, however.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+This produces a 16 byte used header like this:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+struct tdb_used_record {
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+        uint32_t magic : 16,
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+                 prev_is_free: 1,
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+                 key_data_divide: 5,
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+                 top_hash: 10;
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+        uint32_t extra_octets;
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+        uint64_t key_and_data_len;
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+};
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+And a free record like this:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+struct tdb_free_record {
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+        uint32_t free_magic;
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+        uint64_t total_length;
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+        ...
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+        uint64_t tailer;
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+};
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+Transaction Commit Requires 4 fdatasync
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The current transaction algorithm is:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+write_recovery_data();
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+sync();
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+write_recovery_header();
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+sync();
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+overwrite_with_new_data();
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+sync();
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+remove_recovery_header();
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+sync(); 
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+On current ext3, each sync flushes all data to disk, so the next 3 syncs
														
 
															+ are relatively inexpensive.
														
 
															+ But this could become a performance bottleneck on other filesystems such
														
 
															+ as ext4.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Neil Brown points out that this is overzealous, and only one sync is needed:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Bundle the recovery data, a transaction counter and a strong checksum of
														
 
															+ the new data.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Strong checksum that whole bundle.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Store the bundle in the database.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Overwrite the oldest of the two recovery pointers in the header (identified
														
 
															+ using the transaction counter) with the offset of this bundle.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+sync.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Write the new data to the file.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Checking for recovery means identifying the latest bundle with a valid checksum
														
 
															+ and using the new data checksum to ensure that it has been applied.
														
 
															+ This is more expensive than the current check, but need only be done at
														
 
															+ open.
														
 
															+ For running databases, a separate header field can be used to indicate
														
 
															+ a transaction in progress; we need only check for recovery if this is set.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+\begin_inset CommandInset label
														
 
															+LatexCommand label
														
 
															+name "sub:TDB-Does-Not"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+TDB Does Not Have Snapshot Support
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+None.
														
 
															+ At some point you say 
														
 
															+\begin_inset Quotes eld
														
 
															+\end_inset
														
 
															+
														
 
															+use a real database
														
 
															+\begin_inset Quotes erd
														
 
															+\end_inset
														
 
															+
														
 
															+.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+But as a thought experiment, if we implemented transactions to only overwrite
														
 
															+ free entries (this is tricky: there must not be a header in each entry
														
 
															+ which indicates whether it is free, relying instead on presence in metadata elsewhere),
														
 
															+ and a pointer to the hash table, we could create an entirely new commit
														
 
															+ without destroying existing data.
														
 
															+ Then it would be easy to implement snapshots in a similar way.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+This would not allow arbitrary changes to the database, such as tdb_repack
														
 
															+ does, and would require more space (since we have to preserve the current
														
 
															+ and future entries at once).
														
 
															+ If we used hash trees rather than one big hash table, we might only have
														
 
															+ to rewrite some sections of the hash, too.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+We could then implement snapshots using a similar method, using multiple
														
 
															+ different hash tables/free tables.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+Transactions Cannot Operate in Parallel
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Parallel transactions would be useless for ldb, as it hits the index records with just about
														
 
															+ every update.
														
 
															+ It would add significant complexity in resolving clashes, and cause
														
 
															+ all transaction callers to write their code to loop in the case where the
														
 
															+ transactions spuriously failed.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+We could solve a small part of the problem by providing read-only transactions.
														
 
															+ These would allow one write transaction to begin, but it could not commit
														
 
															+ until all r/o transactions are done.
														
 
															+ This would require a new RO_TRANSACTION_LOCK, which would be upgraded on
														
 
															+ commit.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+Default Hash Function Is Suboptimal
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The Knuth-inspired multiplicative hash used by tdb is fairly slow (especially
														
 
															+ if we expand it to 64 bits), and works best when the hash bucket size is
														
 
															+ a prime number (which also means a slow modulus).
														
 
															+ In addition, it is highly predictable which could potentially lead to a
														
 
															+ Denial of Service attack in some TDB uses.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The Jenkins lookup3 hash
														
 
															+\begin_inset Foot
														
 
															+status open
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+http://burtleburtle.net/bob/c/lookup3.c
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+ is a fast and superbly-mixing hash.
														
 
															+ It's used by the Linux kernel and almost everything else.
														
 
															+ This has the particular properties that it takes an initial seed, and produces
														
 
															+ two 32-bit hash numbers, which we can combine into a 64-bit hash.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The seed should be created at tdb-creation time from some random source,
														
 
															+ and placed in the header.
														
 
															+ This is far from foolproof, but adds a little bit of protection against
														
 
															+ hash bombing.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+\begin_inset CommandInset label
														
 
															+LatexCommand label
														
 
															+name "Reliable-Traversal-Adds"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+Reliable Traversal Adds Complexity
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+We lock a record during traversal iteration, and try to grab that lock in
														
 
															+ the delete code.
														
 
															+ If that grab on delete fails, we simply mark it deleted and continue onwards;
														
 
															+ traversal checks for this condition and does the delete when it moves off
														
 
															+ the record.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+If traversal terminates, the dead record may be left indefinitely.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Remove reliability guarantees; see 
														
 
															+\begin_inset CommandInset ref
														
 
															+LatexCommand ref
														
 
															+reference "traverse-Proposed-Solution"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+Fcntl Locking Adds Overhead
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Placing a fcntl lock means a system call, as does removing one.
														
 
															+ This is actually one reason why transactions can be faster (everything
														
 
															+ is locked once at transaction start).
														
 
															+ In the uncontended case, this overhead can theoretically be eliminated.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+None.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+We tried this before with spinlock support, in the early days of TDB, and
														
 
															+ it didn't make much difference except in manufactured benchmarks.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+We could use spinlocks (with futex kernel support under Linux), but it means
														
 
															+ that we lose automatic cleanup when a process dies with a lock.
														
 
															+ There is a method of auto-cleanup under Linux, but it's not supported by
														
 
															+ other operating systems.
														
 
															+ We could reintroduce a clear-if-first-style lock and sweep for dead futexes
														
 
															+ on open, but that wouldn't help the normal case of one concurrent opener
														
 
															+ dying.
														
 
															+ Increasingly elaborate repair schemes could be considered, but they require
														
 
															+ an ABI change (everyone must use them) anyway, so there's no need to do
														
 
															+ this at the same time as everything else.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+Some Transactions Don't Require Durability
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Volker points out that gencache uses a CLEAR_IF_FIRST tdb for normal (fast)
														
 
															+ usage, and occasionally empties the results into a transactional TDB.
														
 
															+ This kind of usage prioritizes performance over durability: as long as
														
 
															+ we are consistent, data can be lost.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+This would be more neatly implemented inside tdb: a 
														
 
															+\begin_inset Quotes eld
														
 
															+\end_inset
														
 
															+
														
 
															+soft
														
 
															+\begin_inset Quotes erd
														
 
															+\end_inset
														
 
															+
														
 
															+ transaction commit (i.e.
														
 
															+ syncless) which means that data may be reverted on a crash.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+None.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Unfortunately any transaction scheme which overwrites old data requires
														
 
															+ a sync before that overwrite to avoid the possibility of corruption.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+It seems possible to use a scheme similar to that described in 
														
 
															+\begin_inset CommandInset ref
														
 
															+LatexCommand ref
														
 
															+reference "sub:TDB-Does-Not"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+, where transactions are committed without overwriting existing data, and
														
 
															+ an array of top-level pointers were available in the header.
														
 
															+ If the transaction is 
														
 
															+\begin_inset Quotes eld
														
 
															+\end_inset
														
 
															+
														
 
															+soft
														
 
															+\begin_inset Quotes erd
														
 
															+\end_inset
														
 
															+
														
 
															+ then we would not need a sync at all: existing processes would pick up
														
 
															+ the new hash table and free list and work with that.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+At some later point, a sync would allow recovery of the old data into the
														
 
															+ free lists (perhaps when the array of top-level pointers filled).
														
 
															+ On crash, tdb_open() would examine the array of top levels, and apply the
														
 
															+ transactions until it encountered an invalid checksum.
														
 
															+\end_layout
														
 
															+
														
 
															+\end_body
														
 
															+\end_document
														
--- a/ccan/tdb2/doc/design.lyx,v
+++ b/ccan/tdb2/doc/design.lyx,v
@@ -0,0 +1,3106 @@
 
															+head	1.6;
														
 
															+access;
														
 
															+symbols;
														
 
															+locks; strict;
														
 
															+comment	@# @;
														
 
															+
														
 
															+
														
 
															+1.6
														
 
															+date	2010.08.02.00.21.43;	author rusty;	state Exp;
														
 
															+branches;
														
 
															+next	1.5;
														
 
															+
														
 
															+1.5
														
 
															+date	2010.08.02.00.21.16;	author rusty;	state Exp;
														
 
															+branches;
														
 
															+next	1.4;
														
 
															+
														
 
															+1.4
														
 
															+date	2010.05.10.13.09.11;	author rusty;	state Exp;
														
 
															+branches;
														
 
															+next	1.3;
														
 
															+
														
 
															+1.3
														
 
															+date	2010.05.10.11.58.37;	author rusty;	state Exp;
														
 
															+branches;
														
 
															+next	1.2;
														
 
															+
														
 
															+1.2
														
 
															+date	2010.05.10.05.35.13;	author rusty;	state Exp;
														
 
															+branches;
														
 
															+next	1.1;
														
 
															+
														
 
															+1.1
														
 
															+date	2010.05.04.02.29.16;	author rusty;	state Exp;
														
 
															+branches;
														
 
															+next	;
														
 
															+
														
 
															+
														
 
															+desc
														
 
															+@First draft
														
 
															+@
														
 
															+
														
 
															+
														
 
															+1.6
														
 
															+log
														
 
															+@Commit changes
														
 
															+@
														
 
															+text
														
 
															+@#LyX 1.6.5 created this file. For more info see http://www.lyx.org/
														
 
															+\lyxformat 345
														
 
															+\begin_document
														
 
															+\begin_header
														
 
															+\textclass article
														
 
															+\use_default_options true
														
 
															+\language english
														
 
															+\inputencoding auto
														
 
															+\font_roman default
														
 
															+\font_sans default
														
 
															+\font_typewriter default
														
 
															+\font_default_family default
														
 
															+\font_sc false
														
 
															+\font_osf false
														
 
															+\font_sf_scale 100
														
 
															+\font_tt_scale 100
														
 
															+
														
 
															+\graphics default
														
 
															+\paperfontsize default
														
 
															+\use_hyperref false
														
 
															+\papersize default
														
 
															+\use_geometry false
														
 
															+\use_amsmath 1
														
 
															+\use_esint 1
														
 
															+\cite_engine basic
														
 
															+\use_bibtopic false
														
 
															+\paperorientation portrait
														
 
															+\secnumdepth 3
														
 
															+\tocdepth 3
														
 
															+\paragraph_separation indent
														
 
															+\defskip medskip
														
 
															+\quotes_language english
														
 
															+\papercolumns 1
														
 
															+\papersides 1
														
 
															+\paperpagestyle default
														
 
															+\tracking_changes true
														
 
															+\output_changes true
														
 
															+\author "" 
														
 
															+\author "" 
														
 
															+\end_header
														
 
															+
														
 
															+\begin_body
														
 
															+
														
 
															+\begin_layout Title
														
 
															+TDB2: Redesigning The Trivial DataBase
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Author
														
 
															+Rusty Russell, IBM Corporation
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Date
														
 
															+26-July-2010
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Abstract
														
 
															+The Trivial DataBase on-disk format is 32 bits; with usage cases heading
														
 
															+ towards the 4G limit, that must change.
														
 
															+ This required breakage provides an opportunity to revisit TDB's other design
														
 
															+ decisions and reassess them.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Section
														
 
															+Introduction
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The Trivial DataBase was originally written by Andrew Tridgell as a simple
														
 
															+ key/data pair storage system with the same API as dbm, but allowing multiple
														
 
															+ readers and writers while being small enough (< 1000 lines of C) to include
														
 
															+ in SAMBA.
														
 
															+ The simple design created in 1999 has proven surprisingly robust and performant
														
 
															+, used in Samba versions 3 and 4 as well as numerous other projects.
														
 
															+ Its useful life was greatly increased by the (backwards-compatible!) addition
														
 
															+ of transaction support in 2005.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The wider variety and greater demands of TDB-using code has led to some
														
 
															+ organic growth of the API, as well as some compromises on the implementation.
														
 
															+ None of these, by themselves, are seen as show-stoppers, but the cumulative
														
 
															+ effect is a loss of elegance over the initial, simple TDB implementation.
														
 
															+ Here is a table of the approximate number of lines of implementation code
														
 
															+ and number of API functions at the end of each year:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+\begin_inset Tabular
														
 
															+<lyxtabular version="3" rows="12" columns="3">
														
 
															+<features>
														
 
															+<column alignment="center" valignment="top" width="0">
														
 
															+<column alignment="center" valignment="top" width="0">
														
 
															+<column alignment="center" valignment="top" width="0">
														
 
															+<row>
														
 
															+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+Year End
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+API Functions
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+Lines of C Code Implementation
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+</row>
														
 
															+<row>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+1999
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+13
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+1195
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+</row>
														
 
															+<row>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+2000
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+24
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+1725
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+</row>
														
 
															+<row>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+2001
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+32
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+2228
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+</row>
														
 
															+<row>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+2002
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+35
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+2481
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+</row>
														
 
															+<row>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+2003
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+35
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+2552
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+</row>
														
 
															+<row>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+2004
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+40
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+2584
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+</row>
														
 
															+<row>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+2005
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+38
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+2647
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+</row>
														
 
															+<row>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+2006
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+52
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+3754
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+</row>
														
 
															+<row>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+2007
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+66
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+4398
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+</row>
														
 
															+<row>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+2008
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+71
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+4768
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+</row>
														
 
															+<row>
														
 
															+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+2009
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+73
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
														
 
															+\begin_inset Text
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+5715
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+</cell>
														
 
															+</row>
														
 
															+</lyxtabular>
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+This review is an attempt to catalog and address all the known issues with
														
 
															+ TDB and create solutions which address the problems without significantly
														
 
															+ increasing complexity; all involved are far too aware of the dangers of
														
 
															+ second system syndrome in rewriting a successful project like this.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Section
														
 
															+API Issues
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+tdb_open_ex Is Not Expandable
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The tdb_open() call was expanded to tdb_open_ex(), which added an optional
														
 
															+ hashing function and an optional logging function argument.
														
 
															+ Additional arguments to open would require the introduction of a tdb_open_ex2
														
 
															+ call etc.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+tdb_open() will take a linked-list of attributes:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+enum tdb_attribute {
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+    TDB_ATTRIBUTE_LOG = 0,
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+    TDB_ATTRIBUTE_HASH = 1
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+};
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+struct tdb_attribute_base {
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+    enum tdb_attribute attr;
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+    union tdb_attribute *next;
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+};
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+struct tdb_attribute_log {
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+    tdb_log_func log_fn;
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+    void *log_private;
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+};
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+struct tdb_attribute_hash {
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+    tdb_hash_func hash_fn;
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+    void *hash_private;
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+};
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+union tdb_attribute {
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+    struct tdb_attribute_base base;
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+    struct tdb_attribute_log log;
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+    struct tdb_attribute_hash hash;
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+};
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+This allows future attributes to be added, even if this expands the size
														
 
															+ of the union.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+tdb_traverse Makes Impossible Guarantees
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, and it
														
 
															+ was thought that it was important to guarantee that all records which exist
														
 
															+ at the start and end of the traversal would be included, and no record
														
 
															+ would be included twice.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+This adds complexity (see
														
 
															+\begin_inset CommandInset ref
														
 
															+LatexCommand ref
														
 
															+reference "Reliable-Traversal-Adds"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+) and does not work anyway for records which are altered (in particular,
														
 
															+ those which are expanded may be effectively deleted and re-added behind
														
 
															+ the traversal).
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+\begin_inset CommandInset label
														
 
															+LatexCommand label
														
 
															+name "traverse-Proposed-Solution"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Abandon the guarantee.
														
 
															+ You will see every record if no changes occur during your traversal, otherwise
														
 
															+ you will see some subset.
														
 
															+ You can prevent changes by using a transaction or the locking API.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+Nesting of Transactions Is Fraught
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+TDB has alternated between allowing nested transactions and not allowing
														
 
															+ them.
														
 
															+ Various paths in the Samba codebase assume that transactions will nest,
														
 
															+ and in a sense they can: the operation is only committed to disk when the
														
 
															+ outer transaction is committed.
														
 
															+ There are two problems, however:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Canceling the inner transaction will cause the outer transaction commit
														
 
															+ to fail, and will not undo any operations since the inner transaction began.
														
 
															+ This problem is soluble with some additional internal code.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+An inner transaction commit can be canceled by the outer transaction.
														
 
															+ This is desirable in the way which Samba's database initialization code
														
 
															+ uses transactions, but could be a surprise to any users expecting a successful
														
 
															+ transaction commit to expose changes to others.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The current solution is to specify the behavior at tdb_open(), with the
														
 
															+ default currently that nested transactions are allowed.
														
 
															+ This flag can also be changed at runtime.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Given the usage patterns, it seems that the 
														
 
															+\begin_inset Quotes eld
														
 
															+\end_inset
														
 
															+
														
 
															+least-surprise
														
 
															+\begin_inset Quotes erd
														
 
															+\end_inset
														
 
															+
														
 
															+ behavior of disallowing nested transactions should become the default.
														
 
															+ Additionally, it seems the outer transaction is the only code which knows
														
 
															+ whether inner transactions should be allowed, so a flag to indicate this
														
 
															+ could be added to tdb_transaction_start.
														
 
															+ However, this behavior can be simulated with a wrapper which uses tdb_add_flags
														
 
															+() and tdb_remove_flags(), so the API should not be expanded for this relatively
														
 
															+-obscure case.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+Incorrect Hash Function is Not Detected
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+tdb_open_ex() allows the calling code to specify a different hash function
														
 
															+ to use, but does not check that all other processes accessing this tdb
														
 
															+ are using the same hash function.
														
 
															+ The result is that records are missing from tdb_fetch().
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The header should contain an example hash result (eg.
														
 
															+ the hash of 0xdeadbeef), and tdb_open_ex() should check that the given
														
 
															+ hash function produces the same answer, or fail the tdb_open call.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+tdb_set_max_dead/TDB_VOLATILE Expose Implementation
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+In response to scalability issues with the free list (
														
 
															+\begin_inset CommandInset ref
														
 
															+LatexCommand ref
														
 
															+reference "TDB-Freelist-Is"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+) two API workarounds have been incorporated in TDB: tdb_set_max_dead()
														
 
															+ and the TDB_VOLATILE flag to tdb_open.
														
 
															+ The latter actually calls the former with an argument of 
														
 
															+\begin_inset Quotes eld
														
 
															+\end_inset
														
 
															+
														
 
															+5
														
 
															+\begin_inset Quotes erd
														
 
															+\end_inset
														
 
															+
														
 
															+.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+This code allows deleted records to accumulate without putting them in the
														
 
															+ free list.
														
 
															+ On delete we iterate through each chain and free them in a batch if there
														
 
															+ are more than max_dead entries.
														
 
															+ These are never otherwise recycled except as a side-effect of a tdb_repack.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+With the scalability problems of the freelist solved, this API can be removed.
														
 
															+ The TDB_VOLATILE flag may still be useful as a hint that store and delete
														
 
															+ of records will be at least as common as fetch in order to allow some internal
														
 
															+ tuning, but initially will become a no-op.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+\begin_inset CommandInset label
														
 
															+LatexCommand label
														
 
															+name "TDB-Files-Cannot"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+TDB Files Cannot Be Opened Multiple Times In The Same Process
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+No process can open the same TDB twice; we check and disallow it.
														
 
															+ This is an unfortunate side-effect of fcntl locks, which operate on a per-file
														
 
															+ rather than per-file-descriptor basis, and do not nest.
														
 
															+ Thus, closing any file descriptor on a file clears all the locks obtained
														
 
															+ by this process, even if they were placed using a different file descriptor!
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Note that even if this were solved, deadlock could occur if operations were
														
 
															+ nested: this is a more manageable programming error in most cases.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+We could lobby POSIX to fix the perverse rules, or at least lobby Linux
														
 
															+ to violate them so that the most common implementation does not have this
														
 
															+ restriction.
														
 
															+ This would be a generally good idea for other fcntl lock users.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Samba uses a wrapper which hands out the same tdb_context to multiple callers
														
 
															+ if this happens, and does simple reference counting.
														
 
															+ We should do this inside the tdb library, which already emulates lock nesting
														
 
															+ internally; it would need to recognize when deadlock occurs within a single
														
 
															+ process.
														
 
															+ This would create a new failure mode for tdb operations (while we currently
														
 
															+ handle locking failures, they are impossible in normal use and a process
														
 
															+ encountering them can do little but give up).
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+I do not see benefit in an additional tdb_open flag to indicate whether
														
 
															+ re-opening is allowed, although there may be some benefit to adding a
														
 
															+ call to detect when a tdb_context is shared, to allow others to create such
														
 
															+ an API.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+TDB API Is Not POSIX Thread-safe
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The TDB API uses an error code which can be queried after an operation to
														
 
															+ determine what went wrong.
														
 
															+ This programming model does not work with threads, unless specific additional
														
 
															+ guarantees are given by the implementation.
														
 
															+ In addition, even otherwise-independent threads cannot open the same TDB
														
 
															+ (as in 
														
 
															+\begin_inset CommandInset ref
														
 
															+LatexCommand ref
														
 
															+reference "TDB-Files-Cannot"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+).
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Rearchitecting the API to include a tdb_errcode pointer would be a great
														
 
															+ deal of churn; it is better to guarantee that the tdb_errcode is per-thread
														
 
															+ so the current programming model can be maintained.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+This requires dynamic per-thread allocations, which is awkward with POSIX
														
 
															+ threads (pthread_key_create space is limited and we cannot simply allocate
														
 
															+ a key for every TDB).
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Internal locking is required to make sure that fcntl locks do not overlap
														
 
															+ between threads, and also that the global list of tdbs is maintained.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe
														
 
															+ version of the library, and otherwise no overhead will exist.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+*_nonblock Functions And *_mark Functions Expose Implementation
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+CTDB
														
 
															+\begin_inset Foot
														
 
															+status collapsed
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+Clustered TDB, see http://ctdb.samba.org
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+ wishes to operate on TDB in a non-blocking manner.
														
 
															+ This is currently done as follows:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Call the _nonblock variant of an API function (eg.
														
 
															+ tdb_lockall_nonblock).
														
 
															+ If this fails:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Fork a child process, and wait for it to call the normal variant (eg.
														
 
															+ tdb_lockall).
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+If the child succeeds, call the _mark variant to indicate we already have
														
 
															+ the locks (eg.
														
 
															+ tdb_lockall_mark).
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Upon completion, tell the child to release the locks (eg.
														
 
															+ tdb_unlockall).
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Indicate to tdb that it should consider the locks removed (eg.
														
 
															+ tdb_unlockall_mark).
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+There are several issues with this approach.
														
 
															+ Firstly, adding two new variants of each function clutters the API for
														
 
															+ an obscure use, and so not all functions have three variants.
														
 
															+ Secondly, it assumes that all paths of the functions ask for the same locks,
														
 
															+ otherwise the parent process will have to get a lock which the child doesn't
														
 
															+ have under some circumstances.
														
 
															+ I don't believe this is currently the case, but it constrains the implementatio
														
 
															+n.
														
 
															+ 
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+\begin_inset CommandInset label
														
 
															+LatexCommand label
														
 
															+name "Proposed-Solution-locking-hook"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Implement a hook for locking methods, so that the caller can control the
														
 
															+ calls to create and remove fcntl locks.
														
 
															+ In this scenario, ctdbd would operate as follows:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Call the normal API function, eg tdb_lockall().
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+When the lock callback comes in, check if the child has the lock.
														
 
															+ Initially, this is always false.
														
 
															+ If the child has the lock, return 0.
														
 
															+ Otherwise, try to obtain it in non-blocking mode.
														
 
															+ If that fails, return EWOULDBLOCK.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Release locks in the unlock callback as normal.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+If tdb_lockall() fails, see if we recorded a lock failure; if so, call the
														
 
															+ child to repeat the operation.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+The child records what locks it obtains, and returns that information to
														
 
															+ the parent.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+When the child has succeeded, goto 1.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+This is flexible enough to handle any potential locking scenario, even when
														
 
															+ lock requirements change.
														
 
															+ It can be optimized so that the parent does not release locks, just tells
														
 
															+ the child which locks it doesn't need to obtain.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+It also keeps the complexity out of the API, and in ctdbd where it is needed.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+tdb_chainlock Functions Expose Implementation
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+tdb_chainlock locks some number of records, including the record indicated
														
 
															+ by the given key.
														
 
															+ This gives atomicity guarantees; no-one can start a transaction, alter,
														
 
															+ read or delete that key while the lock is held.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+It also makes the same guarantee for any other key in the chain, which is
														
 
															+ an internal implementation detail and potentially a cause for deadlock.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+None.
														
 
															+ It would be nice to have an explicit single entry lock which affected no
														
 
															+ other keys.
														
 
															+ Unfortunately, this won't work for an entry which doesn't exist.
														
 
															+ Thus while chainlock may be implemented more efficiently for the existing
														
 
															+ case, it will still have overlap issues with the non-existing case.
														
 
															+ So it is best to keep the current (lack of) guarantee about which records
														
 
															+ will be affected to avoid constraining our implementation.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+Signal Handling is Not Race-Free
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The tdb_setalarm_sigptr() call allows the caller's signal handler to indicate
														
 
															+ that the tdb locking code should return with a failure, rather than trying
														
 
															+ again when a signal is received (and errno == EAGAIN).
														
 
															+ This is usually used to implement timeouts.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Unfortunately, this does not work in the case where the signal is received
														
 
															+ before the tdb code enters the fcntl() call to place the lock: the code
														
 
															+ will sleep within the fcntl() code, unaware that the signal wants it to
														
 
															+ exit.
														
 
															+ In the case of long timeouts, this does not happen in practice.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The locking hooks proposed in
														
 
															+\begin_inset CommandInset ref
														
 
															+LatexCommand ref
														
 
															+reference "Proposed-Solution-locking-hook"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+ would allow the user to decide on whether to fail the lock acquisition
														
 
															+ on a signal.
														
 
															+ This allows the caller to choose their own compromise: they could narrow
														
 
															+ the race by checking immediately before the fcntl call.
														
 
															+\begin_inset Foot
														
 
															+status collapsed
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+It may be possible to make this race-free in some implementations by having
														
 
															+ the signal handler alter the struct flock to make it invalid.
														
 
															+ This will cause the fcntl() lock call to fail with EINVAL if the signal
														
 
															+ occurs before the kernel is entered, otherwise EAGAIN.
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+The API Uses Gratuitous Typedefs, Capitals
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+typedefs are useful for providing source compatibility when types can differ
														
 
															+ across implementations, or arguably in the case of function pointer definitions
														
 
															+ which are hard for humans to parse.
														
 
															+ Otherwise it is simply obfuscation and pollutes the namespace.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Capitalization is usually reserved for compile-time constants and macros.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Description
														
 
															+TDB_CONTEXT There is no reason to use this over 'struct tdb_context'; the
														
 
															+ definition isn't visible to the API user anyway.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Description
														
 
															+TDB_DATA There is no reason to use this over struct TDB_DATA; the struct
														
 
															+ needs to be understood by the API user.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Description
														
 
															+struct
														
 
															+\begin_inset space ~
														
 
															+\end_inset
														
 
															+
														
 
															+TDB_DATA This would normally be called 'struct tdb_data'.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Description
														
 
															+enum
														
 
															+\begin_inset space ~
														
 
															+\end_inset
														
 
															+
														
 
															+TDB_ERROR Similarly, this would normally be enum tdb_error.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+None.
														
 
															+ Introducing lower case variants would please pedants like myself, but if
														
 
															+ it were done the existing ones should be kept.
														
 
															+ There is little point forcing a purely cosmetic change upon tdb users.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+\begin_inset CommandInset label
														
 
															+LatexCommand label
														
 
															+name "tdb_log_func-Doesnt-Take"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+tdb_log_func Doesn't Take The Private Pointer
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+For API compatibility reasons, the logging function needs to call tdb_get_loggin
														
 
															+g_private() to retrieve the pointer registered by the tdb_open_ex for logging.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+It should simply take an extra argument, since we are prepared to break
														
 
															+ the API/ABI.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+Various Callback Functions Are Not Typesafe
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The callback functions in tdb_set_logging_function (after 
														
 
															+\begin_inset CommandInset ref
														
 
															+LatexCommand ref
														
 
															+reference "tdb_log_func-Doesnt-Take"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+ is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read and tdb_check
														
 
															+ all take void * and must internally convert it to the argument type they
														
 
															+ were expecting.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+If this type changes, the compiler will not produce warnings on the callers,
														
 
															+ since it only sees void *.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+With careful use of macros, we can create callback functions which give
														
 
															+ a warning when used on gcc and the types of the callback and its private
														
 
															+ argument differ.
														
 
															+ Unsupported compilers will not give a warning, which is no worse than now.
														
 
															+ In addition, the callbacks become clearer, as they need not use void *
														
 
															+ for their parameter.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB file should
														
 
															+ be cleared if the caller discovers it is the only process with the TDB
														
 
															+ open.
														
 
															+ However, if any caller does not specify TDB_CLEAR_IF_FIRST it will not
														
 
															+ be detected, so will have the TDB erased underneath them (usually resulting
														
 
															+ in a crash).
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+There is a similar issue on fork(); if the parent exits (or otherwise closes
														
 
															+ the tdb) before the child calls tdb_reopen_all() to establish the lock
														
 
															+ used to indicate the TDB is opened by someone, a TDB_CLEAR_IF_FIRST opener
														
 
															+ at that moment will believe it alone has opened the TDB and will erase
														
 
															+ it.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Remove TDB_CLEAR_IF_FIRST.
														
 
															+ Other workarounds are possible, but see 
														
 
															+\begin_inset CommandInset ref
														
 
															+LatexCommand ref
														
 
															+reference "TDB_CLEAR_IF_FIRST-Imposes-Performance"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Section
														
 
															+Performance And Scalability Issues
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+\begin_inset CommandInset label
														
 
															+LatexCommand label
														
 
															+name "TDB_CLEAR_IF_FIRST-Imposes-Performance"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+TDB_CLEAR_IF_FIRST Imposes Performance Penalty
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is placed at offset
														
 
															+ 4 (aka.
														
 
															+ the ACTIVE_LOCK).
														
 
															+ While these locks never conflict in normal tdb usage, they do add substantial
														
 
															+ overhead for most fcntl lock implementations when the kernel scans to detect
														
 
															+ if a lock conflict exists.
														
 
															+ This is often a singly linked list, making the time to acquire and release
														
 
															+ a fcntl lock O(N) where N is the number of processes with the TDB open,
														
 
															+ not the number actually doing work.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+In a Samba server it is common to have huge numbers of clients sitting idle,
														
 
															+ and thus they have weaned themselves off the TDB_CLEAR_IF_FIRST flag.
														
 
															+\begin_inset Foot
														
 
															+status collapsed
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+There is a flag to tdb_reopen_all() which is used for this optimization:
														
 
															+ if the parent process will outlive the child, the child does not need the
														
 
															+ ACTIVE_LOCK.
														
 
															+ This is a workaround for this very performance issue.
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Remove the flag.
														
 
															+ It was a neat idea, but even trivial servers tend to know when they are
														
 
															+ initializing for the first time and can simply unlink the old tdb at that
														
 
															+ point.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+TDB Files Have a 4G Limit
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+This seems to be becoming an issue (so much for 
														
 
															+\begin_inset Quotes eld
														
 
															+\end_inset
														
 
															+
														
 
															+trivial
														
 
															+\begin_inset Quotes erd
														
 
															+\end_inset
														
 
															+
														
 
															+!), particularly for ldb.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+A new, incompatible TDB format which uses 64 bit offsets internally rather
														
 
															+ than 32 bit as now.
														
 
															+ For simplicity of endian conversion (which TDB does on the fly if required),
														
 
															+ all values will be 64 bit on disk.
														
 
															+ In practice, some upper bits may be used for other purposes, but at least
														
 
															+ 56 bits will be available for file offsets.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+tdb_open() will automatically detect old-version files, and even create them
														
 
															+ if TDB_VERSION6 is specified to tdb_open.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+32 bit processes will still be able to access TDBs larger than 4G (assuming
														
 
															+ that their off_t allows them to seek to 64 bits); they will gracefully
														
 
															+ fall back as they fail to mmap.
														
 
															+ This can happen already with large TDBs.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Old versions of tdb will fail to open the new TDB files (since 28 August
														
 
															+ 2009, commit 398d0c29290: prior to that any unrecognized file format would
														
 
															+ be erased and initialized as a fresh tdb!)
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+TDB Records Have a 4G Limit
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+This has not been a reported problem, and the API uses size_t which can
														
 
															+ be 64 bit on 64 bit platforms.
														
 
															+ However, other limits may have made such an issue moot.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Record sizes will be 64 bit, with an error returned on 32 bit platforms
														
 
															+ which try to access such records (the current implementation would return
														
 
															+ TDB_ERR_OOM in a similar case).
														
 
															+ It seems unlikely that 32 bit keys will be a limitation, so the implementation
														
 
															+ may not support this (see 
														
 
															+\begin_inset CommandInset ref
														
 
															+LatexCommand ref
														
 
															+reference "sub:Records-Incur-A"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+).
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+Hash Size Is Determined At TDB Creation Time
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+TDB contains a number of hash chains in the header; the number is specified
														
 
															+ at creation time, and defaults to 131.
														
 
															+ This is such a bottleneck on large databases (as each hash chain gets quite
														
 
															+ long), that LDB uses 10,000 for this hash.
														
 
															+ In general it is impossible to know what the 'right' answer is at database
														
 
															+ creation time.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+After comprehensive performance testing on various scalable hash variants
														
 
															+\begin_inset Foot
														
 
															+status collapsed
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94. This was annoying
														
 
															+ because I was previously convinced that an expanding tree of hashes would
														
 
															+ be very close to optimal.
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+, it became clear that it is hard to beat a straight linear hash table which
														
 
															+ doubles in size when it reaches saturation.
														
 
															+ There are three details which become important:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+On encountering a full bucket, we use the next bucket.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Extra hash bits are stored with the offset, to reduce comparisons.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+A marker entry is used on deleting an entry.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The doubling of the table must be done under a transaction; we will not
														
 
															+ reduce it on deletion, so it will be an unusual case.
														
 
															+ It will either be placed at the head (other entries will be moved out the
														
 
															+ way so we can expand).
														
 
															+ We could have a pointer in the header to the current hashtable location,
														
 
															+ but that pointer would have to be read frequently to check for hashtable
														
 
															+ moves.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The locking for this is slightly more complex than the chained case; we
														
 
															+ currently have one lock per bucket, and that means we would need to expand
														
 
															+ the lock if we overflow to the next bucket.
														
 
															+ The frequency of such collisions will effect our locking heuristics: we
														
 
															+ can always lock more buckets than we need.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+One possible optimization is to only re-check the hash size on an insert
														
 
															+ or a lookup miss.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+\begin_inset CommandInset label
														
 
															+LatexCommand label
														
 
															+name "TDB-Freelist-Is"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+TDB Freelist Is Highly Contended
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+TDB uses a single linked list for the free list.
														
 
															+ Allocation occurs as follows, using heuristics which have evolved over
														
 
															+ time:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Get the free list lock for this whole operation.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Multiply length by 1.25, so we always over-allocate by 25%.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Set the slack multiplier to 1.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Examine the current freelist entry: if it is > length but < the current
														
 
															+ best case, remember it as the best case.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Multiply the slack multiplier by 1.05.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+If our best fit so far is less than length * slack multiplier, return it.
														
 
															+ The slack will be turned into a new free record if it's large enough.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Otherwise, go onto the next freelist entry.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Deleting a record occurs as follows:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Lock the hash chain for this whole operation.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Walk the chain to find the record, keeping the prev pointer offset.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+If max_dead is non-zero:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_deeper
														
 
															+\begin_layout Enumerate
														
 
															+Walk the hash chain again and count the dead records.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+If it's more than max_dead, bulk free all the dead ones (similar to steps
														
 
															+ 4 and below, but the lock is only obtained once).
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Simply mark this record as dead and return.
														
 
															+ 
														
 
															+\end_layout
														
 
															+
														
 
															+\end_deeper
														
 
															+\begin_layout Enumerate
														
 
															+Get the free list lock for the remainder of this operation.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+\begin_inset CommandInset label
														
 
															+LatexCommand label
														
 
															+name "right-merging"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+Examine the following block to see if it is free; if so, enlarge the current
														
 
															+ block and remove that block from the free list.
														
 
															+ This was disabled, as removal from the free list was O(entries-in-free-list).
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Examine the preceding block to see if it is free: for this reason, each
														
 
															+ block has a 32-bit tailer which indicates its length.
														
 
															+ If it is free, expand it to cover our new block and return.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Otherwise, prepend ourselves to the free list.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Disabling right-merging (step 
														
 
															+\begin_inset CommandInset ref
														
 
															+LatexCommand ref
														
 
															+reference "right-merging"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+) causes fragmentation; the other heuristics proved insufficient to address
														
 
															+ this, so the final answer to this was that when we expand the TDB file
														
 
															+ inside a transaction commit, we repack the entire tdb.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The single list lock limits our allocation rate; due to the other issues
														
 
															+ this is not currently seen as a bottleneck.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The first step is to remove all the current heuristics, as they obviously
														
 
															+ interact, then examine them once the lock contention is addressed.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The free list must be split to reduce contention.
														
 
															+ Assuming perfect free merging, we can at most have 1 free list entry for
														
 
															+ each entry.
														
 
															+ This implies that the number of free lists is related to the size of the
														
 
															+ hash table, but as it is rare to walk a large number of free list entries
														
 
															+ we can use far fewer, say 1/32 of the number of hash buckets.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+There are various benefits in using per-size free lists (see 
														
 
															+\begin_inset CommandInset ref
														
 
															+LatexCommand ref
														
 
															+reference "sub:TDB-Becomes-Fragmented"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+) but it's not clear this would reduce contention in the common case where
														
 
															+ all processes are allocating/freeing the same size.
														
 
															+ Thus we almost certainly need to divide in other ways: the most obvious
														
 
															+ is to divide the file into zones, and use a free list (or set of free
														
 
															+ lists) for each.
														
 
															+ This approximates address ordering.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Note that this means we need to split the free lists when we expand the
														
 
															+ file; this is probably acceptable when we double the hash table size, since
														
 
															+ that is such an expensive operation already.
														
 
															+ In the case of increasing the file size, there is an optimization we can
														
 
															+ use: if we use M in the formula above as the file size rounded up to the
														
 
															+ next power of 2, we only need reshuffle free lists when the file size crosses
														
 
															+ a power of 2 boundary, 
														
 
															+\emph on
														
 
															+and 
														
 
															+\emph default
														
 
															+reshuffling the free lists is trivial: we simply merge every consecutive
														
 
															+ pair of free lists.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The basic algorithm is as follows.
														
 
															+ Freeing is simple:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Identify the correct zone.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Lock the corresponding list.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Re-check the zone (we didn't have a lock, sizes could have changed): relock
														
 
															+ if necessary.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Place the freed entry in the list for that zone.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Allocation is a little more complicated, as we perform delayed coalescing
														
 
															+ at this point:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Pick a zone: either the zone we last freed into, or based on a 
														
 
															+\begin_inset Quotes eld
														
 
															+\end_inset
														
 
															+
														
 
															+random
														
 
															+\begin_inset Quotes erd
														
 
															+\end_inset
														
 
															+
														
 
															+ number.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Lock the corresponding list.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Re-check the zone: relock if necessary.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+If the top entry is large enough, remove it from the list and return it.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Otherwise, coalesce entries in the list. If there was no entry large enough,
														
 
															+ unlock the list and try the next zone.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+If no zone satisfies, expand the file.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+This optimizes rapid insert/delete of free list entries by not coalescing
														
 
															+ them all the time.
														
 
															+ First-fit address ordering seems to be fairly good for keeping
														
 
															+ fragmentation low (see 
														
 
															+\begin_inset CommandInset ref
														
 
															+LatexCommand ref
														
 
															+reference "sub:TDB-Becomes-Fragmented"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+).
														
 
															+ Note that address ordering does not need a tailer to coalesce, though if
														
 
															+ we needed one we could have one cheaply: see 
														
 
															+\begin_inset CommandInset ref
														
 
															+LatexCommand ref
														
 
															+reference "sub:Records-Incur-A"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+.
														
 
															+ 
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+I anticipate that the number of entries in each free zone would be small,
														
 
															+ but it might be worth using one free entry to hold pointers to the others
														
 
															+ for cache efficiency.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+\begin_inset CommandInset label
														
 
															+LatexCommand label
														
 
															+name "sub:TDB-Becomes-Fragmented"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+TDB Becomes Fragmented
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Much of this is a result of allocation strategy
														
 
															+\begin_inset Foot
														
 
															+status collapsed
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1998 ftp://ftp.cs.ute
														
 
															+xas.edu/pub/garbage/malloc/ismm98.ps
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+ and deliberate hobbling of coalescing; internal fragmentation (aka overallocati
														
 
															+on) is deliberately set at 25%, and external fragmentation is only cured
														
 
															+ by the decision to repack the entire db when a transaction commit needs
														
 
															+ to enlarge the file.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The 25% overhead on allocation works in practice for ldb because indexes
														
 
															+ tend to expand by one record at a time.
														
 
															+ This internal fragmentation can be resolved by having an 
														
 
															+\begin_inset Quotes eld
														
 
															+\end_inset
														
 
															+
														
 
															+expanded
														
 
															+\begin_inset Quotes erd
														
 
															+\end_inset
														
 
															+
														
 
															+ bit in the header to note entries that have previously expanded, and allocating
														
 
															+ more space for them.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+There is a spectrum of possible solutions for external fragmentation:
														
 
															+ one is to use a fragmentation-avoiding allocation strategy such as a best-fit
														
 
															+ address-order allocator.
														
 
															+ The other end of the spectrum would be to use a bump allocator (very fast
														
 
															+ and simple) and simply repack the file when we reach the end.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+There are three problems with efficient fragmentation-avoiding allocators:
														
 
															+ they are non-trivial, they tend to use a single free list for each size,
														
 
															+ and there's no evidence that tdb allocation patterns will match those recorded
														
 
															+ for general allocators (though it seems likely).
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Thus we don't spend too much effort on external fragmentation; we will be
														
 
															+ no worse than the current code if we need to repack on occasion.
														
 
															+ More effort is spent on reducing freelist contention, and reducing overhead.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+\begin_inset CommandInset label
														
 
															+LatexCommand label
														
 
															+name "sub:Records-Incur-A"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+Records Incur A 28-Byte Overhead
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Each TDB record has a header as follows:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+struct tdb_record {
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+        tdb_off_t next; /* offset of the next record in the list */
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+        tdb_len_t rec_len; /* total byte length of record */
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+        tdb_len_t key_len; /* byte length of key */
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+        tdb_len_t data_len; /* byte length of data */
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+        uint32_t full_hash; /* the full 32 bit hash of the key */
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+        uint32_t magic;   /* try to catch errors */
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+        /* the following union is implied:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+                union {
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+                        char record[rec_len];
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+                        struct {
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+                                char key[key_len];
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+                                char data[data_len];
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+                        }
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+                        uint32_t totalsize; (tailer)
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+                }
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+        */
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+};
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Naively, this would double to a 56-byte overhead on a 64 bit implementation.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+We can use various techniques to reduce this for an allocated block:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+The 'next' pointer is not required, as we are using a flat hash table.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+'rec_len' can instead be expressed as an addition to key_len and data_len
														
 
															+ (it accounts for wasted or overallocated length in the record).
														
 
															+ Since the record length is always a multiple of 8, we can conveniently
														
 
															+ fit it in 32 bits (representing up to 35 bits).
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+'key_len' and 'data_len' can be reduced.
														
 
															+ I'm unwilling to restrict 'data_len' to 32 bits, but instead we can combine
														
 
															+ the two into one 64-bit field and use a 5 bit value which indicates at
														
 
															+ what bit to divide the two.
														
 
															+ Keys are unlikely to scale as fast as data, so I'm assuming a maximum key
														
 
															+ size of 32 bits.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+'full_hash' is used to avoid a memcmp on the 
														
 
															+\begin_inset Quotes eld
														
 
															+\end_inset
														
 
															+
														
 
															+miss
														
 
															+\begin_inset Quotes erd
														
 
															+\end_inset
														
 
															+
														
 
															+ case, but this is diminishing returns after a handful of bits (at 10 bits,
														
 
															+ it reduces 99.9% of false memcmp).
														
 
															+ As an aside, as the lower bits are already incorporated in the hash table
														
 
															+ resolution, the upper bits should be used here.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+'magic' does not need to be enlarged: it currently reflects one of 5 values
														
 
															+ (used, free, dead, recovery, and unused_recovery).
														
 
															+ It is useful for quick sanity checking however, and should not be eliminated.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+'tailer' is only used to coalesce free blocks (so a block to the right can
														
 
															+ find the header to check if this block is free).
														
 
															+ This can be replaced by a single 'free' bit in the header of the following
														
 
															+ block (and the tailer only exists in free blocks).
														
 
															+\begin_inset Foot
														
 
															+status collapsed
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+This technique is from Thomas Standish.
														
 
															+ Data Structure Techniques.
														
 
															+ Addison-Wesley, Reading, Massachusetts, 1980.
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+ The current proposed coalescing algorithm doesn't need this, however.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+This produces a 16 byte used header like this:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+struct tdb_used_record {
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+        uint32_t magic : 16,
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+                 prev_is_free: 1,
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+                 key_data_divide: 5,
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+                 top_hash: 10;
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+        uint32_t extra_octets;
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+        uint64_t key_and_data_len;
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+};
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+And a free record like this:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+struct tdb_free_record {
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+        uint32_t free_magic;
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+        uint64_t total_length;
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+        ...
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+        uint64_t tailer;
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+};
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout LyX-Code
														
 
															+
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+Transaction Commit Requires 4 fdatasync
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The current transaction algorithm is:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+write_recovery_data();
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+sync();
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+write_recovery_header();
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+sync();
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+overwrite_with_new_data();
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+sync();
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+remove_recovery_header();
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+sync(); 
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+On current ext3, each sync flushes all data to disk, so the next 3 syncs
														
 
															+ are relatively inexpensive.
														
 
															+ But this could become a performance bottleneck on other filesystems such
														
 
															+ as ext4.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Neil Brown points out that this is overzealous, and only one sync is needed:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Bundle the recovery data, a transaction counter and a strong checksum of
														
 
															+ the new data.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Strong checksum that whole bundle.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Store the bundle in the database.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Overwrite the oldest of the two recovery pointers in the header (identified
														
 
															+ using the transaction counter) with the offset of this bundle.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+sync.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Write the new data to the file.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Checking for recovery means identifying the latest bundle with a valid checksum
														
 
															+ and using the new data checksum to ensure that it has been applied.
														
 
															+ This is more expensive than the current check, but need only be done at
														
 
															+ open.
														
 
															+ For running databases, a separate header field can be used to indicate
														
 
															+ a transaction in progress; we need only check for recovery if this is set.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+\begin_inset CommandInset label
														
 
															+LatexCommand label
														
 
															+name "sub:TDB-Does-Not"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+TDB Does Not Have Snapshot Support
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+None.
														
 
															+ At some point you say 
														
 
															+\begin_inset Quotes eld
														
 
															+\end_inset
														
 
															+
														
 
															+use a real database
														
 
															+\begin_inset Quotes erd
														
 
															+\end_inset
														
 
															+
														
 
															+.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+But as a thought experiment, if we implemented transactions to only overwrite
														
 
															+ free entries (this is tricky: there must not be a header in each entry
														
 
															+ which indicates whether it is free, but instead reliance on presence in metadata elsewhere),
														
 
															+ and a pointer to the hash table, we could create an entirely new commit
														
 
															+ without destroying existing data.
														
 
															+ Then it would be easy to implement snapshots in a similar way.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+This would not allow arbitrary changes to the database, such as tdb_repack
														
 
															+ does, and would require more space (since we have to preserve the current
														
 
															+ and future entries at once).
														
 
															+ If we used hash trees rather than one big hash table, we might only have
														
 
															+ to rewrite some sections of the hash, too.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+We could then implement snapshots using a similar method, using multiple
														
 
															+ different hash tables/free tables.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+Transactions Cannot Operate in Parallel
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+This would be useless for ldb, as it hits the index records with just about
														
 
															+ every update.
														
 
															+ It would add significant complexity in resolving clashes, and cause
														
 
															+ all transaction callers to write their code to loop in the case where the
														
 
															+ transactions spuriously failed.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+We could solve a small part of the problem by providing read-only transactions.
														
 
															+ These would allow one write transaction to begin, but it could not commit
														
 
															+ until all r/o transactions are done.
														
 
															+ This would require a new RO_TRANSACTION_LOCK, which would be upgraded on
														
 
															+ commit.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+Default Hash Function Is Suboptimal
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The Knuth-inspired multiplicative hash used by tdb is fairly slow (especially
														
 
															+ if we expand it to 64 bits), and works best when the hash bucket size is
														
 
															+ a prime number (which also means a slow modulus).
														
 
															+ In addition, it is highly predictable which could potentially lead to a
														
 
															+ Denial of Service attack in some TDB uses.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The Jenkins lookup3 hash
														
 
															+\begin_inset Foot
														
 
															+status open
														
 
															+
														
 
															+\begin_layout Plain Layout
														
 
															+http://burtleburtle.net/bob/c/lookup3.c
														
 
															+\end_layout
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+ is a fast and superbly-mixing hash.
														
 
															+ It's used by the Linux kernel and almost everything else.
														
 
															+ This has the particular properties that it takes an initial seed, and produces
														
 
															+ two 32-bit hash numbers, which we can combine into a 64-bit hash.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+The seed should be created at tdb-creation time from some random source,
														
 
															+ and placed in the header.
														
 
															+ This is far from foolproof, but adds a little bit of protection against
														
 
															+ hash bombing.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+\begin_inset CommandInset label
														
 
															+LatexCommand label
														
 
															+name "Reliable-Traversal-Adds"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+Reliable Traversal Adds Complexity
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+We lock a record during traversal iteration, and try to grab that lock in
														
 
															+ the delete code.
														
 
															+ If that grab on delete fails, we simply mark it deleted and continue onwards;
														
 
															+ traversal checks for this condition and does the delete when it moves off
														
 
															+ the record.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+If traversal terminates, the dead record may be left indefinitely.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Remove reliability guarantees; see 
														
 
															+\begin_inset CommandInset ref
														
 
															+LatexCommand ref
														
 
															+reference "traverse-Proposed-Solution"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+Fcntl Locking Adds Overhead
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Placing a fcntl lock means a system call, as does removing one.
														
 
															+ This is actually one reason why transactions can be faster (everything
														
 
															+ is locked once at transaction start).
														
 
															+ In the uncontended case, this overhead can theoretically be eliminated.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+None.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+We tried this before with spinlock support, in the early days of TDB, and
														
 
															+ it didn't make much difference except in manufactured benchmarks.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+We could use spinlocks (with futex kernel support under Linux), but it means
														
 
															+ that we lose automatic cleanup when a process dies with a lock.
														
 
															+ There is a method of auto-cleanup under Linux, but it's not supported by
														
 
															+ other operating systems.
														
 
															+ We could reintroduce a clear-if-first-style lock and sweep for dead futexes
														
 
															+ on open, but that wouldn't help the normal case of one concurrent opener
														
 
															+ dying.
														
 
															+ Increasingly elaborate repair schemes could be considered, but they require
														
 
															+ an ABI change (everyone must use them) anyway, so there's no need to do
														
 
															+ this at the same time as everything else.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsection
														
 
															+Some Transactions Don't Require Durability
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Volker points out that gencache uses a CLEAR_IF_FIRST tdb for normal (fast)
														
 
															+ usage, and occasionally empties the results into a transactional TDB.
														
 
															+ This kind of usage prioritizes performance over durability: as long as
														
 
															+ we are consistent, data can be lost.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+This would be more neatly implemented inside tdb: a 
														
 
															+\begin_inset Quotes eld
														
 
															+\end_inset
														
 
															+
														
 
															+soft
														
 
															+\begin_inset Quotes erd
														
 
															+\end_inset
														
 
															+
														
 
															+ transaction commit (i.e.
														
 
															+ syncless) which meant that data may be reverted on a crash.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Subsubsection
														
 
															+Proposed Solution
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+None.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+Unfortunately any transaction scheme which overwrites old data requires
														
 
															+ a sync before that overwrite to avoid the possibility of corruption.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+It seems possible to use a scheme similar to that described in 
														
 
															+\begin_inset CommandInset ref
														
 
															+LatexCommand ref
														
 
															+reference "sub:TDB-Does-Not"
														
 
															+
														
 
															+\end_inset
														
 
															+
														
 
															+, where transactions are committed without overwriting existing data, and
														
 
															+ an array of top-level pointers is available in the header.
														
 
															+ If the transaction is 
														
 
															+\begin_inset Quotes eld
														
 
															+\end_inset
														
 
															+
														
 
															+soft
														
 
															+\begin_inset Quotes erd
														
 
															+\end_inset
														
 
															+
														
 
															+ then we would not need a sync at all: existing processes would pick up
														
 
															+ the new hash table and free list and work with that.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+At some later point, a sync would allow recovery of the old data into the
														
 
															+ free lists (perhaps when the array of top-level pointers filled).
														
 
															+ On crash, tdb_open() would examine the array of top levels, and apply the
														
 
															+ transactions until it encountered an invalid checksum.
														
 
															+\end_layout
														
 
															+
														
 
															+\end_body
														
 
															+\end_document
														
 
															+@
														
 
															+
														
 
															+
														
 
															+1.5
														
 
															+log
														
 
															+@Soft transaction commit
														
 
															+@
														
 
															+text
														
 
															+@d38 1
														
 
															+a38 1
														
 
															+\author "Rusty Russell,,," 
														
 
															+a52 4
														
 
															+
														
 
															+\change_deleted 0 1280141199
														
 
															+10-May-2010
														
 
															+\change_inserted 0 1280141202
														
 
															+a53 2
														
 
															+\change_unchanged
														
 
															+
														
 
															+a2028 2
														
 
															+
														
 
															+\change_inserted 0 1280140902
														
 
															+a2034 2
														
 
															+
														
 
															+\change_unchanged
														
 
															+a2212 2
														
 
															+\change_inserted 0 1280140661
														
 
															+
														
 
															+a2215 2
														
 
															+
														
 
															+\change_inserted 0 1280140703
														
 
															+a2219 2
														
 
															+
														
 
															+\change_inserted 0 1280708312
														
 
															+a2226 2
														
 
															+
														
 
															+\change_inserted 0 1280708400
														
 
															+a2239 2
														
 
															+
														
 
															+\change_inserted 0 1280140836
														
 
															+a2243 2
														
 
															+
														
 
															+\change_inserted 0 1280708255
														
 
															+a2247 2
														
 
															+
														
 
															+\change_inserted 0 1280708374
														
 
															+a2252 2
														
 
															+
														
 
															+\change_inserted 0 1280141181
														
 
															+a2274 2
														
 
															+
														
 
															+\change_inserted 0 1280141345
														
 
															+@
														
 
															+
														
 
															+
														
 
															+1.4
														
 
															+log
														
 
															+@Merge changes
														
 
															+@
														
 
															+text
														
 
															+@d38 1
														
 
															+a38 1
														
 
															+\author "" 
														
 
															+d53 2
														
 
															+d56 4
														
 
															+d2035 10
														
 
															+d2223 84
														
 
															+@
														
 
															+
														
 
															+
														
 
															+1.3
														
 
															+log
														
 
															+@Transaction and freelist rethink.
														
 
															+@
														
 
															+text
														
 
															+@d38 1
														
 
															+a38 1
														
 
															+\author "Rusty Russell,,," 
														
 
															+d53 1
														
 
															+a53 1
														
 
															+27-April-2010
														
 
															+d662 1
														
 
															+a662 5
														
 
															+ behavior of disallowing 
														
 
															+\change_inserted 0 1272940179
														
 
															+nested 
														
 
															+\change_unchanged
														
 
															+transactions should become the default.
														
 
															+a1210 2
														
 
															+\change_inserted 0 1272944650
														
 
															+
														
 
															+a1214 2
														
 
															+
														
 
															+\change_inserted 0 1272944763
														
 
															+a1218 2
														
 
															+\change_unchanged
														
 
															+
														
 
															+a1223 2
														
 
															+\change_unchanged
														
 
															+
														
 
															+a1301 2
														
 
															+
														
 
															+\change_inserted 0 1273478114
														
 
															+a1310 2
														
 
															+\change_unchanged
														
 
															+
														
 
															+d1515 1
														
 
															+a1515 11
														
 
															+The free list 
														
 
															+\change_deleted 0 1273469807
														
 
															+should
														
 
															+\change_inserted 0 1273469810
														
 
															+must
														
 
															+\change_unchanged
														
 
															+ be split 
														
 
															+\change_deleted 0 1273469815
														
 
															+into multiple lists 
														
 
															+\change_unchanged
														
 
															+to reduce contention.
														
 
															+a1520 2
														
 
															+\change_inserted 0 1273470006
														
 
															+
														
 
															+a1523 2
														
 
															+
														
 
															+\change_inserted 0 1273492055
														
 
															+a1539 2
														
 
															+
														
 
															+\change_inserted 0 1273483888
														
 
															+a1551 2
														
 
															+\change_unchanged
														
 
															+
														
 
															+a1554 8
														
 
															+
														
 
															+\change_deleted 0 1272942055
														
 
															+There are various ways to organize these lists, but because we want to be
														
 
															+ able to quickly identify which free list an entry is in, and reduce the
														
 
															+ number of locks required for merging, we will use zoning (eg.
														
 
															+ each free list covers some fixed fraction of the file).
														
 
															+ 
														
 
															+\change_inserted 0 1273484187
														
 
															+d1556 1
														
 
															+a1556 7
														
 
															+ 
														
 
															+\change_deleted 0 1273484194
														
 
															+The algorithm for f
														
 
															+\change_inserted 0 1273484194
														
 
															+F
														
 
															+\change_unchanged
														
 
															+reeing is simple:
														
 
															+d1560 1
														
 
															+a1560 7
														
 
															+Identify the correct 
														
 
															+\change_deleted 0 1273482856
														
 
															+free list
														
 
															+\change_inserted 0 1273482857
														
 
															+zone
														
 
															+\change_unchanged
														
 
															+.
														
 
															+d1564 1
														
 
															+a1564 7
														
 
															+Lock the 
														
 
															+\change_inserted 0 1273482895
														
 
															+corresponding 
														
 
															+\change_unchanged
														
 
															+list
														
 
															+\change_inserted 0 1273482863
														
 
															+.
														
 
															+a1567 2
														
 
															+
														
 
															+\change_inserted 0 1273482909
														
 
															+d1573 1
														
 
															+a1573 13
														
 
															+
														
 
															+\change_deleted 0 1273482885
														
 
															+, and p
														
 
															+\change_inserted 0 1273482888
														
 
															+P
														
 
															+\change_unchanged
														
 
															+lace the freed entry 
														
 
															+\change_deleted 0 1273492415
														
 
															+at the head
														
 
															+\change_inserted 0 1273492415
														
 
															+in the list for that zone
														
 
															+\change_unchanged
														
 
															+.
														
 
															+d1577 2
														
 
															+a1578 7
														
 
															+Allocation is a little more complicated, as we 
														
 
															+\change_deleted 0 1273483240
														
 
															+merge entries as we walk the list:
														
 
															+\change_inserted 0 1273484250
														
 
															+perform delayed coalescing at this point:
														
 
															+\change_unchanged
														
 
															+
														
 
															+d1582 1
														
 
															+a1582 19
														
 
															+Pick a 
														
 
															+\change_deleted 0 1273482955
														
 
															+free list;
														
 
															+\change_inserted 0 1273482957
														
 
															+zone
														
 
															+\change_unchanged
														
 
															+ either the 
														
 
															+\change_deleted 0 1273482962
														
 
															+list
														
 
															+\change_inserted 0 1273482962
														
 
															+zone
														
 
															+\change_unchanged
														
 
															+ we last freed 
														
 
															+\change_deleted 0 1273482966
														
 
															+o
														
 
															+\change_inserted 0 1273482966
														
 
															+i
														
 
															+\change_unchanged
														
 
															+nto, or based on a 
														
 
															+d1594 1
														
 
															+a1594 9
														
 
															+Lock th
														
 
															+\change_inserted 0 1273482980
														
 
															+e corresponding
														
 
															+\change_deleted 0 1273482973
														
 
															+at
														
 
															+\change_unchanged
														
 
															+ list.
														
 
															+\change_inserted 0 1273482982
														
 
															+
														
 
															+a1597 2
														
 
															+
														
 
															+\change_inserted 0 1273483084
														
 
															+a1598 53
														
 
															+\change_unchanged
														
 
															+
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+If the top entry is 
														
 
															+\change_deleted 0 1273492155
														
 
															+well-sized, 
														
 
															+\change_inserted 0 1273492159
														
 
															+large enough, 
														
 
															+\change_unchanged
														
 
															+remove it from the list and return it.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+Otherwise, 
														
 
															+\change_inserted 0 1273492206
														
 
															+coalesce entries in the list.
														
 
															+\change_deleted 0 1273492200
														
 
															+examine the entry to the right of it in the file.
														
 
															+ If it is free:
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_deeper
														
 
															+\begin_layout Enumerate
														
 
															+
														
 
															+\change_deleted 0 1273492200
														
 
															+If that entry is in a different list, lock that list too.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+
														
 
															+\change_deleted 0 1273492200
														
 
															+If we had to place a new lock, re-check that the entry is free.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+
														
 
															+\change_deleted 0 1273492200
														
 
															+Remove that entry from its free list and expand this entry to cover it.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Enumerate
														
 
															+
														
 
															+\change_deleted 0 1273485554
														
 
															+Goto step 3.
														
 
															+\end_layout
														
 
															+
														
 
															+\end_deeper
														
 
															+\begin_layout Enumerate
														
 
															+
														
 
															+\change_inserted 0 1273485311
														
 
															+If there was no entry large enough, unlock the list and try the next zone.
														
 
															+d1602 1
														
 
															+a1602 5
														
 
															+
														
 
															+\change_deleted 0 1273483646
														
 
															+Repeat step 3 with each entry in the list.
														
 
															+\change_unchanged
														
 
															+
														
 
															+d1606 2
														
 
															+a1607 5
														
 
															+
														
 
															+\change_deleted 0 1273483668
														
 
															+Unlock the list and repeat step 2 with the next list.
														
 
															+\change_unchanged
														
 
															+
														
 
															+d1611 1
														
 
															+a1611 7
														
 
															+If no 
														
 
															+\change_deleted 0 1273483671
														
 
															+list
														
 
															+\change_inserted 0 1273483671
														
 
															+zone
														
 
															+\change_unchanged
														
 
															+ satisfies, expand the file.
														
 
															+d1615 2
														
 
															+a1616 9
														
 
															+This optimizes rapid insert/delete of free list entries
														
 
															+\change_inserted 0 1273485794
														
 
															+ by not coalescing them all the time.
														
 
															+\change_deleted 0 1273483685
														
 
															+, and allows us to get rid of the tailer altogether
														
 
															+\change_unchanged
														
 
															+.
														
 
															+
														
 
															+\change_inserted 0 1273492299
														
 
															+a1638 39
														
 
															+
														
 
															+\change_deleted 0 1273476840
														
 
															+The question of 
														
 
															+\begin_inset Quotes eld
														
 
															+\end_inset
														
 
															+
														
 
															+well-sized
														
 
															+\begin_inset Quotes erd
														
 
															+\end_inset
														
 
															+
														
 
															+ free entries is more difficult: the 25% overhead works in practice for
														
 
															+ ldb because indexes tend to expand by one record at a time.
														
 
															+ This can be resolved by having an 
														
 
															+\begin_inset Quotes eld
														
 
															+\end_inset
														
 
															+
														
 
															+expanded
														
 
															+\begin_inset Quotes erd
														
 
															+\end_inset
														
 
															+
														
 
															+ bit in the header to note entries that have previously expanded, and allocating
														
 
															+ more space for them.
														
 
															+ Whether the 
														
 
															+\begin_inset Quotes eld
														
 
															+\end_inset
														
 
															+
														
 
															+increasing slack
														
 
															+\begin_inset Quotes erd
														
 
															+\end_inset
														
 
															+
														
 
															+ algorithm should be implemented or first-fit used is still unknown: we
														
 
															+ will determine this once these other ideas are implemented.
														
 
															+\change_inserted 0 1273483750
														
 
															+
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+
														
 
															+\change_inserted 0 1273492450
														
 
															+a1644 2
														
 
															+
														
 
															+\change_inserted 0 1273470441
														
 
															+a1654 2
														
 
															+
														
 
															+\change_inserted 0 1273476556
														
 
															+a1659 2
														
 
															+
														
 
															+\change_inserted 0 1273470423
														
 
															+a1661 2
														
 
															+\change_unchanged
														
 
															+
														
 
															+a1672 2
														
 
															+
														
 
															+\change_inserted 0 1273476847
														
 
															+a1676 2
														
 
															+
														
 
															+\change_inserted 0 1273476886
														
 
															+a1691 2
														
 
															+
														
 
															+\change_inserted 0 1273477233
														
 
															+a1699 2
														
 
															+
														
 
															+\change_inserted 0 1273477534
														
 
															+a1706 2
														
 
															+
														
 
															+\change_inserted 0 1273482700
														
 
															+a1712 2
														
 
															+
														
 
															+\change_inserted 0 1273478079
														
 
															+a1722 2
														
 
															+
														
 
															+\change_inserted 0 1273477839
														
 
															+a1726 2
														
 
															+
														
 
															+\change_inserted 0 1273477925
														
 
															+a1730 2
														
 
															+
														
 
															+\change_inserted 0 1273477925
														
 
															+a1734 2
														
 
															+
														
 
															+\change_inserted 0 1273477925
														
 
															+a1738 2
														
 
															+
														
 
															+\change_inserted 0 1273477925
														
 
															+a1742 2
														
 
															+
														
 
															+\change_inserted 0 1273477925
														
 
															+a1746 2
														
 
															+
														
 
															+\change_inserted 0 1273477925
														
 
															+a1750 2
														
 
															+
														
 
															+\change_inserted 0 1273477925
														
 
															+a1754 2
														
 
															+
														
 
															+\change_inserted 0 1273477925
														
 
															+a1758 2
														
 
															+
														
 
															+\change_inserted 0 1273477925
														
 
															+a1762 2
														
 
															+
														
 
															+\change_inserted 0 1273477925
														
 
															+a1766 2
														
 
															+
														
 
															+\change_inserted 0 1273477925
														
 
															+a1770 2
														
 
															+
														
 
															+\change_inserted 0 1273477925
														
 
															+a1774 2
														
 
															+
														
 
															+\change_inserted 0 1273477925
														
 
															+a1778 2
														
 
															+
														
 
															+\change_inserted 0 1273477925
														
 
															+a1782 2
														
 
															+
														
 
															+\change_inserted 0 1273477925
														
 
															+a1786 2
														
 
															+
														
 
															+\change_inserted 0 1273477925
														
 
															+a1790 2
														
 
															+
														
 
															+\change_inserted 0 1273477925
														
 
															+a1794 2
														
 
															+
														
 
															+\change_inserted 0 1273477925
														
 
															+a1798 2
														
 
															+
														
 
															+\change_inserted 0 1273492522
														
 
															+a1802 2
														
 
															+
														
 
															+\change_inserted 0 1273492530
														
 
															+a1806 2
														
 
															+
														
 
															+\change_inserted 0 1273492546
														
 
															+a1810 2
														
 
															+
														
 
															+\change_inserted 0 1273478239
														
 
															+a1814 2
														
 
															+
														
 
															+\change_inserted 0 1273479960
														
 
															+a1821 2
														
 
															+
														
 
															+\change_inserted 0 1273480265
														
 
															+a1830 2
														
 
															+
														
 
															+\change_inserted 0 1273480354
														
 
															+a1845 2
														
 
															+
														
 
															+\change_inserted 0 1273478968
														
 
															+a1851 2
														
 
															+
														
 
															+\change_inserted 0 1273492604
														
 
															+a1859 2
														
 
															+
														
 
															+\change_inserted 0 1273479572
														
 
															+a1862 2
														
 
															+\change_unchanged
														
 
															+
														
 
															+a1870 2
														
 
															+
														
 
															+\change_inserted 0 1273480282
														
 
															+a1874 2
														
 
															+
														
 
															+\change_inserted 0 1273478931
														
 
															+a1878 2
														
 
															+
														
 
															+\change_inserted 0 1273481549
														
 
															+a1882 2
														
 
															+
														
 
															+\change_inserted 0 1273481557
														
 
															+a1886 2
														
 
															+
														
 
															+\change_inserted 0 1273480307
														
 
															+a1890 2
														
 
															+
														
 
															+\change_inserted 0 1273480335
														
 
															+a1894 2
														
 
															+
														
 
															+\change_inserted 0 1273479897
														
 
															+a1898 2
														
 
															+
														
 
															+\change_inserted 0 1273479653
														
 
															+a1902 2
														
 
															+
														
 
															+\change_inserted 0 1273480371
														
 
															+a1906 2
														
 
															+
														
 
															+\change_inserted 0 1273480464
														
 
															+a1910 2
														
 
															+
														
 
															+\change_inserted 0 1273480399
														
 
															+a1914 2
														
 
															+
														
 
															+\change_inserted 0 1273480425
														
 
															+a1918 2
														
 
															+
														
 
															+\change_inserted 0 1273480453
														
 
															+a1922 2
														
 
															+
														
 
															+\change_inserted 0 1273480455
														
 
															+a1926 2
														
 
															+
														
 
															+\change_inserted 0 1273480450
														
 
															+a1930 2
														
 
															+
														
 
															+\change_inserted 0 1273480452
														
 
															+a1935 2
														
 
															+\change_inserted 0 1273478830
														
 
															+
														
 
															+a1942 5
														
 
															+
														
 
															+\change_deleted 0 1273481604
														
 
															+In theory, we could get away with 2: one after we write the new data, and
														
 
															+ one to somehow atomically change over to it.
														
 
															+\change_inserted 0 1273481632
														
 
															+a1946 2
														
 
															+
														
 
															+\change_inserted 0 1273481724
														
 
															+a1950 2
														
 
															+
														
 
															+\change_inserted 0 1273481713
														
 
															+a1954 2
														
 
															+
														
 
															+\change_inserted 0 1273481717
														
 
															+a1958 2
														
 
															+
														
 
															+\change_inserted 0 1273481730
														
 
															+a1962 2
														
 
															+
														
 
															+\change_inserted 0 1273481736
														
 
															+a1966 2
														
 
															+
														
 
															+\change_inserted 0 1273481744
														
 
															+a1970 2
														
 
															+
														
 
															+\change_inserted 0 1273481748
														
 
															+a1974 2
														
 
															+
														
 
															+\change_inserted 0 1273482185
														
 
															+a1978 2
														
 
															+
														
 
															+\change_inserted 0 1273482259
														
 
															+a1989 50
														
 
															+
														
 
															+\change_deleted 0 1273481848
														
 
															+None.
														
 
															+ Trying to rewrite the transaction code is a separate experiment, which
														
 
															+ I encourage someone else to do.
														
 
															+ At some point you say 
														
 
															+\begin_inset Quotes eld
														
 
															+\end_inset
														
 
															+
														
 
															+use a real database
														
 
															+\begin_inset Quotes erd
														
 
															+\end_inset
														
 
															+
														
 
															+.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+
														
 
															+\change_deleted 0 1273481848
														
 
															+But as a thought experiment:
														
 
															+\change_unchanged
														
 
															+
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+
														
 
															+\change_deleted 0 1273481788
														
 
															+Say there was a pointer in the header which said where the hash table and
														
 
															+ free list tables were, and that no blocks were labeled with whether they
														
 
															+ were free or not (it had to be derived from what list they were in).
														
 
															+ We could create new hash table and free list in some free space, and populate
														
 
															+ it as we want the post-committed state to look.
														
 
															+ Then we sync, then we switch the offset in the header, then we sync again.
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+
														
 
															+\change_deleted 0 1273481788
														
 
															+This would not allow arbitrary changes to the database, such as tdb_repack
														
 
															+ does, and would require more space (since we have to preserve the current
														
 
															+ and future entries at once).
														
 
															+ If we used hash trees rather than one big hash table, we might only have
														
 
															+ to rewrite some sections of the hash, too.
														
 
															+\change_inserted 0 1273481854
														
 
															+
														
 
															+\end_layout
														
 
															+
														
 
															+\begin_layout Standard
														
 
															+
														
 
															+\change_inserted 0 1273482102
														
 
															+a1993 2
														
 
															+
														
 
															+\change_inserted 0 1273482061
														
 
															+a1998 2
														
 
															+
														
 
															+\change_inserted 0 1273482063
														
 
															+a2002 2
														
 
															+
														
 
															+\change_inserted 0 1273482072
														
 
															+a2006 2
														
 
															+
														
 
															+\change_inserted 0 1273482139
														
 
															+a2011 2
														
 
															+
														
 
															+\change_inserted 0 1273482364
														
 
															+a2015 2
														
 
															+
														
 
															+\change_inserted 0 1273482163
														
 
															+a2019 2
														
 
															+
														
 
															+\change_inserted 0 1273482493
														
 
															+a2037 2
														
 
															+
														
 
															+\change_inserted 0 1273482536
														
 
															+a2046 2
														
 
															+\change_unchanged
														
 
															+
														
 
															+a2049 2
														
 
															+
														
 
															+\change_inserted 0 1273482641
														
 
															+a2058 2
														
 
															+
														
 
															+\change_inserted 0 1273481827
														
 
															+d2067 2
														
 
															+a2068 11
														
 
															+We could 
														
 
															+\change_inserted 0 1273481829
														
 
															+then 
														
 
															+\change_unchanged
														
 
															+implement snapshots using a similar method
														
 
															+\change_deleted 0 1273481838
														
 
															+ to the above, only
														
 
															+\change_inserted 0 1273481840
														
 
															+,
														
 
															+\change_unchanged
														
 
															+ using multiple different hash tables/free tables.
														
 
															+@
														
 
															+
														
 
															+
														
 
															+1.2
														
 
															+log
														
 
															+@After first feedback (Ronnie & Volker)
														
 
															+@
														
 
															+text
														
 
															+@d1314 13
														
 
															+d1531 11
														
 
															+a1541 1
														
 
															+The free list should be split into multiple lists to reduce contention.
														
 
															+d1547 39
														
 
															+d1596 7
														
 
															+d1604 1
														
 
															+a1604 1
														
 
															+The algorithm for freeing is simple:
														
 
															+d1608 7
														
 
															+a1614 1
														
 
															+Identify the correct free list.
														
 
															+d1618 30
														
 
															+a1647 1
														
 
															+Lock the list, and place the freed entry at the head.
														
 
															+d1651 7
														
 
															+a1657 2
														
 
															+Allocation is a little more complicated, as we merge entries as we walk
														
 
															+ the list:
														
 
															+d1661 19
														
 
															+a1679 1
														
 
															+Pick a free list; either the list we last freed onto, or based on a 
														
 
															+d1691 17
														
 
															+a1707 1
														
 
															+Lock that list.
														
 
															+d1711 7
														
 
															+a1717 1
														
 
															+If the top entry is well-sized, remove it from the list and return it.
														
 
															+d1721 5
														
 
															+a1725 1
														
 
															+Otherwise, examine the entry to the right of it in the file.
														
 
															+d1731 2
														
 
															+d1737 2
														
 
															+d1743 2
														
 
															+d1749 2
														
 
															+d1756 8
														
 
															+d1765 2
														
 
															+d1770 2
														
 
															+d1773 2
														
 
															+d1778 7
														
 
															+a1784 1
														
 
															+If no list satisfies, expand the file.
														
 
															+d1788 28
														
 
															+a1815 2
														
 
															+This optimizes rapid insert/delete of free list entries, and allows us to
														
 
															+ get rid of the tailer altogether.
														
 
															+d1819 2
														
 
															+d1851 1
														
 
															+a1851 1
														
 
															+\change_inserted 0 1272941474
														
 
															+d1857 303
														
 
															+a2159 18
														
 
															+\change_inserted 0 1272942759
														
 
															+There are various ways to organize these lists, but because we want to be
														
 
															+ able to quickly identify which free list an entry is in, and reduce the
														
 
															+ number of locks required for merging, we will use zoning (eg.
														
 
															+ each of the N free lists in a tdb file of size M covers a fixed fraction
														
 
															+ M/N).
														
 
															+ Note that this means we need to reshuffle the free lists when we expand
														
 
															+ the file; this is probably acceptable when we double the hash table size,
														
 
															+ since that is such an expensive operation already.
														
 
															+ In the case of increasing the file size, there is an optimization we can
														
 
															+ use: if we use M in the formula above as the file size rounded up to the
														
 
															+ next power of 2, we only need reshuffle free lists when the file size crosses
														
 
															+ a power of 2 boundary, 
														
 
															+\emph on
														
 
															+and 
														
 
															+\emph default
														
 
															+reshuffling the free lists is trivial: we simply merge every consecutive
														
 
															+ pair of free lists.
														
 
															+d2164 107
														
 
															+d2276 2
														
 
															+d2280 59
														
 
															+d2346 2
														
 
															+d2363 2
														
 
															+d2366 2
														
 
															+d2371 2
														
 
															+d2382 2
														
 
															+d2389 57
														
 
															+d2458 13
														
 
															+d2474 32
														
 
															+a2505 2
														
 
															+We could implement snapshots using a similar method to the above, only using
														
 
															+ multiple different hash tables/free tables.
														
 
															+@
														
 
															+
														
 
															+
														
 
															+1.1
														
 
															+log
														
 
															+@Initial revision
														
 
															+@
														
 
															+text
														
 
															+@d1 1
														
 
															+a1 1
														
 
															+#LyX 1.6.4 created this file. For more info see http://www.lyx.org/
														
 
															+d36 3
														
 
															+a38 3
														
 
															+\tracking_changes false
														
 
															+\output_changes false
														
 
															+\author "" 
														
 
															+d662 5
														
 
															+a666 1
														
 
															+ behavior of disallowing transactions should become the default.
														
 
															+d1215 21
														
 
															+d1527 2
														
 
															+d1533 3
														
 
															+a1535 1
														
 
															+ The algorithm for freeing is simple:
														
 
															+d1642 26
														
 
															+@
														
--- a/ccan/tdb2/doc/design.pdf
+++ b/ccan/tdb2/doc/design.pdf
--- a/ccan/tdb2/doc/design.txt
+++ b/ccan/tdb2/doc/design.txt
@@ -0,0 +1,1058 @@
 
															+TDB2: Redesigning The Trivial DataBase
														
 
															+
														
 
															+Rusty Russell, IBM Corporation
														
 
															+
														
 
															+26-July-2010
														
 
															+
														
 
															+Abstract
														
 
															+
														
 
															+The Trivial DataBase on-disk format is 32 bits; with usage cases 
														
 
															+heading towards the 4G limit, that must change. This required 
														
 
															+breakage provides an opportunity to revisit TDB's other design 
														
 
															+decisions and reassess them.
														
 
															+
														
 
															+1 Introduction
														
 
															+
														
 
															+The Trivial DataBase was originally written by Andrew Tridgell as 
														
 
															+a simple key/data pair storage system with the same API as dbm, 
														
 
															+but allowing multiple readers and writers while being small 
														
 
															+enough (< 1000 lines of C) to include in SAMBA. The simple design 
														
 
															+created in 1999 has proven surprisingly robust and performant, 
														
 
															+used in Samba versions 3 and 4 as well as numerous other 
														
 
															+projects. Its useful life was greatly increased by the 
														
 
															+(backwards-compatible!) addition of transaction support in 2005.
														
 
															+
														
 
															+The wider variety and greater demands of TDB-using code have led 
														
 
															+to some organic growth of the API, as well as some compromises on 
														
 
															+the implementation. None of these, by themselves, are seen as 
														
 
															+show-stoppers, but the cumulative effect is a loss of elegance 
														
 
															+over the initial, simple TDB implementation. Here is a table of 
														
 
															+the approximate number of lines of implementation code and number 
														
 
															+of API functions at the end of each year:
														
 
															+
														
 
															+
														
 
															++-----------+----------------+--------------------------------+
														
 
															+| Year End  | API Functions  | Lines of C Code Implementation |
														
 
															++-----------+----------------+--------------------------------+
														
 
															++-----------+----------------+--------------------------------+
														
 
															+|   1999    |      13        |              1195              |
														
 
															++-----------+----------------+--------------------------------+
														
 
															+|   2000    |      24        |              1725              |
														
 
															++-----------+----------------+--------------------------------+
														
 
															+|   2001    |      32        |              2228              |
														
 
															++-----------+----------------+--------------------------------+
														
 
															+|   2002    |      35        |              2481              |
														
 
															++-----------+----------------+--------------------------------+
														
 
															+|   2003    |      35        |              2552              |
														
 
															++-----------+----------------+--------------------------------+
														
 
															+|   2004    |      40        |              2584              |
														
 
															++-----------+----------------+--------------------------------+
														
 
															+|   2005    |      38        |              2647              |
														
 
															++-----------+----------------+--------------------------------+
														
 
															+|   2006    |      52        |              3754              |
														
 
															++-----------+----------------+--------------------------------+
														
 
															+|   2007    |      66        |              4398              |
														
 
															++-----------+----------------+--------------------------------+
														
 
															+|   2008    |      71        |              4768              |
														
 
															++-----------+----------------+--------------------------------+
														
 
															+|   2009    |      73        |              5715              |
														
 
															++-----------+----------------+--------------------------------+
														
 
															+
														
 
															+
														
 
															+This review is an attempt to catalog and address all the known 
														
 
															+issues with TDB and create solutions which address the problems 
														
 
															+without significantly increasing complexity; all involved are far 
														
 
															+too aware of the dangers of second system syndrome in rewriting a 
														
 
															+successful project like this.
														
 
															+
														
 
															+2 API Issues
														
 
															+
														
 
															+2.1 tdb_open_ex Is Not Expandable
														
 
															+
														
 
															+The tdb_open() call was expanded to tdb_open_ex(), which added an 
														
 
															+optional hashing function and an optional logging function 
														
 
															+argument. Additional arguments to open would require the 
														
 
															+introduction of a tdb_open_ex2 call etc.
														
 
															+
														
 
															+2.1.1 Proposed Solution
														
 
															+
														
 
															+tdb_open() will take a linked-list of attributes:
														
 
															+
														
 
															+enum tdb_attribute {
														
 
															+
														
 
															+    TDB_ATTRIBUTE_LOG = 0,
														
 
															+
														
 
															+    TDB_ATTRIBUTE_HASH = 1
														
 
															+
														
 
															+};
														
 
															+
														
 
															+struct tdb_attribute_base {
														
 
															+
														
 
															+    enum tdb_attribute attr;
														
 
															+
														
 
															+    union tdb_attribute *next;
														
 
															+
														
 
															+};
														
 
															+
														
 
															+struct tdb_attribute_log {
														
 
															+
														
 
															+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG 
														
 
															+*/
														
 
															+
														
 
															+    tdb_log_func log_fn;
														
 
															+
														
 
															+    void *log_private;
														
 
															+
														
 
															+};
														
 
															+
														
 
															+struct tdb_attribute_hash {
														
 
															+
														
 
															+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH 
														
 
															+*/
														
 
															+
														
 
															+    tdb_hash_func hash_fn;
														
 
															+
														
 
															+    void *hash_private;
														
 
															+
														
 
															+};
														
 
															+
														
 
															+union tdb_attribute {
														
 
															+
														
 
															+    struct tdb_attribute_base base;
														
 
															+
														
 
															+    struct tdb_attribute_log log;
														
 
															+
														
 
															+    struct tdb_attribute_hash hash;
														
 
															+
														
 
															+};
														
 
															+
														
 
															+This allows future attributes to be added, even if this expands 
														
 
															+the size of the union.
														
 
															+
														
 
															+2.2 tdb_traverse Makes Impossible Guarantees
														
 
															+
														
 
															+tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, 
														
 
															+and it was thought that it was important to guarantee that all 
														
 
															+records which exist at the start and end of the traversal would 
														
 
															+be included, and no record would be included twice.
														
 
															+
														
 
															+This adds complexity (see [Reliable-Traversal-Adds]) and does not 
														
 
															+work anyway for records which are altered (in particular, those 
														
 
															+which are expanded may be effectively deleted and re-added behind 
														
 
															+the traversal).
														
 
															+
														
 
															+2.2.1 <traverse-Proposed-Solution>Proposed Solution
														
 
															+
														
 
															+Abandon the guarantee. You will see every record if no changes 
														
 
															+occur during your traversal, otherwise you will see some subset. 
														
 
															+You can prevent changes by using a transaction or the locking 
														
 
															+API.
														
 
															+
														
 
															+2.3 Nesting of Transactions Is Fraught
														
 
															+
														
 
															+TDB has alternated between allowing nested transactions and not 
														
 
															+allowing them. Various paths in the Samba codebase assume that 
														
 
															+transactions will nest, and in a sense they can: the operation is 
														
 
															+only committed to disk when the outer transaction is committed. 
														
 
															+There are two problems, however:
														
 
															+
														
 
															+1. Canceling the inner transaction will cause the outer 
														
 
															+  transaction commit to fail, and will not undo any operations 
														
 
															+  since the inner transaction began. This problem is soluble with 
														
 
															+  some additional internal code.
														
 
															+
														
 
															+2. An inner transaction commit can be cancelled by the outer 
														
 
															+  transaction. This is desirable in the way which Samba's 
														
 
															+  database initialization code uses transactions, but could be a 
														
 
															+  surprise to any users expecting a successful transaction commit 
														
 
															+  to expose changes to others.
														
 
															+
														
 
															+The current solution is to specify the behavior at tdb_open(), 
														
 
															+with the default currently that nested transactions are allowed. 
														
 
															+This flag can also be changed at runtime.
														
 
															+
														
 
															+2.3.1 Proposed Solution
														
 
															+
														
 
															+Given the usage patterns, it seems that the “least-surprise” 
														
 
															+behavior of disallowing nested transactions should become the 
														
 
															+default. Additionally, it seems the outer transaction is the only 
														
 
															+code which knows whether inner transactions should be allowed, so 
														
 
															+a flag to indicate this could be added to tdb_transaction_start. 
														
 
															+However, this behavior can be simulated with a wrapper which uses 
														
 
															+tdb_add_flags() and tdb_remove_flags(), so the API should not be 
														
 
															+expanded for this relatively-obscure case.
														
 
															+
														
 
															+2.4 Incorrect Hash Function is Not Detected
														
 
															+
														
 
															+tdb_open_ex() allows the calling code to specify a different hash 
														
 
															+function to use, but does not check that all other processes 
														
 
															+accessing this tdb are using the same hash function. The result 
														
 
															+is that records are missing from tdb_fetch().
														
 
															+
														
 
															+2.4.1 Proposed Solution
														
 
															+
														
 
															+The header should contain an example hash result (eg. the hash of 
														
 
															+0xdeadbeef), and tdb_open_ex() should check that the given hash 
														
 
															+function produces the same answer, or fail the tdb_open call.
														
 
															+
														
 
															+2.5 tdb_set_max_dead/TDB_VOLATILE Expose Implementation
														
 
															+
														
 
															+In response to scalability issues with the free list ([TDB-Freelist-Is]
														
 
															+) two API workarounds have been incorporated in TDB: 
														
 
															+tdb_set_max_dead() and the TDB_VOLATILE flag to tdb_open. The 
														
 
															+latter actually calls the former with an argument of “5”.
														
 
															+
														
 
															+This code allows deleted records to accumulate without putting 
														
 
															+them in the free list. On delete we iterate through each chain 
														
 
															+and free them in a batch if there are more than max_dead entries. 
														
 
															+These are never otherwise recycled except as a side-effect of a 
														
 
															+tdb_repack.
														
 
															+
														
 
															+2.5.1 Proposed Solution
														
 
															+
														
 
															+With the scalability problems of the freelist solved, this API 
														
 
															+can be removed. The TDB_VOLATILE flag may still be useful as a 
														
 
															+hint that store and delete of records will be at least as common 
														
 
															+as fetch in order to allow some internal tuning, but initially 
														
 
															+will become a no-op.
														
 
															+
														
 
															+2.6 <TDB-Files-Cannot>TDB Files Cannot Be Opened Multiple Times 
														
 
															+  In The Same Process
														
 
															+
														
 
															+No process can open the same TDB twice; we check and disallow it. 
														
 
															+This is an unfortunate side-effect of fcntl locks, which operate 
														
 
															+on a per-file rather than per-file-descriptor basis, and do not 
														
 
															+nest. Thus, closing any file descriptor on a file clears all the 
														
 
															+locks obtained by this process, even if they were placed using a 
														
 
															+different file descriptor!
														
 
															+
														
 
															+Note that even if this were solved, deadlock could occur if 
														
 
															+operations were nested: this is a more manageable programming 
														
 
															+error in most cases.
														
 
															+
														
 
															+2.6.1 Proposed Solution
														
 
															+
														
 
															+We could lobby POSIX to fix the perverse rules, or at least lobby 
														
 
															+Linux to violate them so that the most common implementation does 
														
 
															+not have this restriction. This would be a generally good idea 
														
 
															+for other fcntl lock users.
														
 
															+
														
 
															+Samba uses a wrapper which hands out the same tdb_context to 
														
 
															+multiple callers if this happens, and does simple reference 
														
 
															+counting. We should do this inside the tdb library, which already 
														
 
															+emulates lock nesting internally; it would need to recognize when 
														
 
															+deadlock occurs within a single process. This would create a new 
														
 
															+failure mode for tdb operations (while we currently handle 
														
 
															+locking failures, they are impossible in normal use and a process 
														
 
															+encountering them can do little but give up).
														
 
															+
														
 
															+I do not see benefit in an additional tdb_open flag to indicate 
														
 
															+whether re-opening is allowed, although there may be some 
														
 
															+benefit to adding a call to detect when a tdb_context is shared, 
														
 
															+to allow others to create such an API.
														
 
															+
														
 
															+2.7 TDB API Is Not POSIX Thread-safe
														
 
															+
														
 
															+The TDB API uses an error code which can be queried after an 
														
 
															+operation to determine what went wrong. This programming model 
														
 
															+does not work with threads, unless specific additional guarantees 
														
 
															+are given by the implementation. In addition, even 
														
 
															+otherwise-independent threads cannot open the same TDB (as in [TDB-Files-Cannot]
														
 
															+).
														
 
															+
														
 
															+2.7.1 Proposed Solution
														
 
															+
														
 
															+Rearchitecting the API to include a tdb_errcode pointer would be a 
														
 
															+great deal of churn; we are better to guarantee that the 
														
 
															+tdb_errcode is per-thread so the current programming model can be 
														
 
															+maintained.
														
 
															+
														
 
															+This requires dynamic per-thread allocations, which is awkward 
														
 
															+with POSIX threads (pthread_key_create space is limited and we 
														
 
															+cannot simply allocate a key for every TDB).
														
 
															+
														
 
															+Internal locking is required to make sure that fcntl locks do not 
														
 
															+overlap between threads, and also that the global list of tdbs is 
														
 
															+maintained.
														
 
															+
														
 
															+The aim is that building tdb with -DTDB_PTHREAD will result in a 
														
 
															+pthread-safe version of the library, and otherwise no overhead 
														
 
															+will exist.
														
 
															+
														
 
															+2.8 *_nonblock Functions And *_mark Functions Expose 
														
 
															+  Implementation
														
 
															+
														
 
															+CTDB[footnote:
														
 
															+Clustered TDB, see http://ctdb.samba.org
														
 
															+] wishes to operate on TDB in a non-blocking manner. This is 
														
 
															+currently done as follows:
														
 
															+
														
 
															+1. Call the _nonblock variant of an API function (eg. 
														
 
															+  tdb_lockall_nonblock). If this fails:
														
 
															+
														
 
															+2. Fork a child process, and wait for it to call the normal 
														
 
															+  variant (eg. tdb_lockall).
														
 
															+
														
 
															+3. If the child succeeds, call the _mark variant to indicate we 
														
 
															+  already have the locks (eg. tdb_lockall_mark).
														
 
															+
														
 
															+4. Upon completion, tell the child to release the locks (eg. 
														
 
															+  tdb_unlockall).
														
 
															+
														
 
															+5. Indicate to tdb that it should consider the locks removed (eg. 
														
 
															+  tdb_unlockall_mark).
														
 
															+
														
 
															+There are several issues with this approach. Firstly, adding two 
														
 
															+new variants of each function clutters the API for an obscure 
														
 
															+use, and so not all functions have three variants. Secondly, it 
														
 
															+assumes that all paths of the functions ask for the same locks, 
														
 
															+otherwise the parent process will have to get a lock which the 
														
 
															+child doesn't have under some circumstances. I don't believe this 
														
 
															+is currently the case, but it constrains the implementation. 
														
 
															+
														
 
															+2.8.1 <Proposed-Solution-locking-hook>Proposed Solution
														
 
															+
														
 
															+Implement a hook for locking methods, so that the caller can 
														
 
															+control the calls to create and remove fcntl locks. In this 
														
 
															+scenario, ctdbd would operate as follows:
														
 
															+
														
 
															+1. Call the normal API function, eg tdb_lockall().
														
 
															+
														
 
															+2. When the lock callback comes in, check if the child has the 
														
 
															+  lock. Initially, this is always false. If it does, return 0. 
														
 
															+  Otherwise, try to obtain it in non-blocking mode. If that 
														
 
															+  fails, return EWOULDBLOCK.
														
 
															+
														
 
															+3. Release locks in the unlock callback as normal.
														
 
															+
														
 
															+4. If tdb_lockall() fails, see if we recorded a lock failure; if 
														
 
															+  so, call the child to repeat the operation.
														
 
															+
														
 
															+5. The child records what locks it obtains, and returns that 
														
 
															+  information to the parent.
														
 
															+
														
 
															+6. When the child has succeeded, goto 1.
														
 
															+
														
 
															+This is flexible enough to handle any potential locking scenario, 
														
 
															+even when lock requirements change. It can be optimized so that 
														
 
															+the parent does not release locks, just tells the child which 
														
 
															+locks it doesn't need to obtain.
														
 
															+
														
 
															+It also keeps the complexity out of the API, and in ctdbd where 
														
 
															+it is needed.
														
 
															+
														
 
															+2.9 tdb_chainlock Functions Expose Implementation
														
 
															+
														
 
															+tdb_chainlock locks some number of records, including the record 
														
 
															+indicated by the given key. This gave atomicity guarantees; 
														
 
															+no-one can start a transaction, alter, read or delete that key 
														
 
															+while the lock is held.
														
 
															+
														
 
															+It also makes the same guarantee for any other key in the chain, 
														
 
															+which is an internal implementation detail and potentially a 
														
 
															+cause for deadlock.
														
 
															+
														
 
															+2.9.1 Proposed Solution
														
 
															+
														
 
															+None. It would be nice to have an explicit single entry lock 
														
 
															+which affected no other keys. Unfortunately, this won't work for 
														
 
															+an entry which doesn't exist. Thus while chainlock may be 
														
 
															+implemented more efficiently for the existing case, it will still 
														
 
															+have overlap issues with the non-existing case. So it is best to 
														
 
															+keep the current (lack of) guarantee about which records will be 
														
 
															+affected to avoid constraining our implementation.
														
 
															+
														
 
															+2.10 Signal Handling is Not Race-Free
														
 
															+
														
 
															+The tdb_setalarm_sigptr() call allows the caller's signal handler 
														
 
															+to indicate that the tdb locking code should return with a 
														
 
															+failure, rather than trying again when a signal is received (and 
														
 
															+errno == EINTR). This is usually used to implement timeouts.
														
 
															+
														
 
															+Unfortunately, this does not work in the case where the signal is 
														
 
															+received before the tdb code enters the fcntl() call to place the 
														
 
															+lock: the code will sleep within the fcntl() code, unaware that 
														
 
															+the signal wants it to exit. In the case of long timeouts, this 
														
 
															+does not happen in practice.
														
 
															+
														
 
															+2.10.1 Proposed Solution
														
 
															+
														
 
															+The locking hooks proposed in [Proposed-Solution-locking-hook] 
														
 
															+would allow the user to decide on whether to fail the lock 
														
 
															+acquisition on a signal. This allows the caller to choose their 
														
 
															+own compromise: they could narrow the race by checking 
														
 
															+immediately before the fcntl call.[footnote:
														
 
															+It may be possible to make this race-free in some implementations 
														
 
															+by having the signal handler alter the struct flock to make it 
														
 
															+invalid. This will cause the fcntl() lock call to fail with 
														
 
															+EINVAL if the signal occurs before the kernel is entered, 
														
 
															+otherwise EINTR.
														
 
															+]
														
 
															+
														
 
															+2.11 The API Uses Gratuitous Typedefs, Capitals
														
 
															+
														
 
															+typedefs are useful for providing source compatibility when types 
														
 
															+can differ across implementations, or arguably in the case of 
														
 
															+function pointer definitions which are hard for humans to parse. 
														
 
															+Otherwise it is simply obfuscation and pollutes the namespace.
														
 
															+
														
 
															+Capitalization is usually reserved for compile-time constants and 
														
 
															+macros.
														
 
															+
														
 
															+  TDB_CONTEXT There is no reason to use this over 'struct 
														
 
															+  tdb_context'; the definition isn't visible to the API user 
														
 
															+  anyway.
														
 
															+
														
 
															+  TDB_DATA There is no reason to use this over struct TDB_DATA; 
														
 
															+  the struct needs to be understood by the API user.
														
 
															+
														
 
															+  struct TDB_DATA This would normally be called 'struct 
														
 
															+  tdb_data'.
														
 
															+
														
 
															+  enum TDB_ERROR Similarly, this would normally be enum 
														
 
															+  tdb_error.
														
 
															+
														
 
															+2.11.1 Proposed Solution
														
 
															+
														
 
															+None. Introducing lower case variants would please pedants like 
														
 
															+myself, but if it were done the existing ones should be kept. 
														
 
															+There is little point forcing a purely cosmetic change upon tdb 
														
 
															+users.
														
 
															+
														
 
															+2.12 <tdb_log_func-Doesnt-Take>tdb_log_func Doesn't Take The 
														
 
															+  Private Pointer
														
 
															+
														
 
															+For API compatibility reasons, the logging function needs to call 
														
 
															+tdb_get_logging_private() to retrieve the pointer registered by 
														
 
															+the tdb_open_ex for logging.
														
 
															+
														
 
															+2.12.1 Proposed Solution
														
 
															+
														
 
															+It should simply take an extra argument, since we are prepared to 
														
 
															+break the API/ABI.
														
 
															+
														
 
															+2.13 Various Callback Functions Are Not Typesafe
														
 
															+
														
 
															+The callback functions in tdb_set_logging_function (after [tdb_log_func-Doesnt-Take]
														
 
															+ is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read 
														
 
															+and tdb_check all take void * and must internally convert it to 
														
 
															+the argument type they were expecting.
														
 
															+
														
 
															+If this type changes, the compiler will not produce warnings on 
														
 
															+the callers, since it only sees void *.
														
 
															+
														
 
															+2.13.1 Proposed Solution
														
 
															+
														
 
															+With careful use of macros, we can create callback functions 
														
 
															+which give a warning when used on gcc and the types of the 
														
 
															+callback and its private argument differ. Unsupported compilers 
														
 
															+will not give a warning, which is no worse than now. In addition, 
														
 
															+the callbacks become clearer, as they need not use void * for 
														
 
															+their parameter.
														
 
															+
														
 
															+See CCAN's typesafe_cb module at 
														
 
															+http://ccan.ozlabs.org/info/typesafe_cb.html
														
 
															+
														
 
															+2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, 
														
 
															+  tdb_reopen_all Problematic
														
 
															+
														
 
															+The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB 
														
 
															+file should be cleared if the caller discovers it is the only 
														
 
															+process with the TDB open. However, if any caller does not 
														
 
															+specify TDB_CLEAR_IF_FIRST it will not be detected, so will have 
														
 
															+the TDB erased underneath them (usually resulting in a crash).
														
 
															+
														
 
															+There is a similar issue on fork(); if the parent exits (or 
														
 
															+otherwise closes the tdb) before the child calls tdb_reopen_all() 
														
 
															+to establish the lock used to indicate the TDB is opened by 
														
 
															+someone, a TDB_CLEAR_IF_FIRST opener at that moment will believe 
														
 
															+it alone has opened the TDB and will erase it.
														
 
															+
														
 
															+2.14.1 Proposed Solution
														
 
															+
														
 
															+Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but 
														
 
															+see [TDB_CLEAR_IF_FIRST-Imposes-Performance].
														
 
															+
														
 
															+3 Performance And Scalability Issues
														
 
															+
														
 
															+3.1 <TDB_CLEAR_IF_FIRST-Imposes-Performance>TDB_CLEAR_IF_FIRST 
														
 
															+  Imposes Performance Penalty
														
 
															+
														
 
															+When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is 
														
 
															+placed at offset 4 (aka. the ACTIVE_LOCK). While these locks 
														
 
															+never conflict in normal tdb usage, they do add substantial 
														
 
															+overhead for most fcntl lock implementations when the kernel 
														
 
															+scans to detect if a lock conflict exists. This is often a single 
														
 
															+linked list, making the time to acquire and release a fcntl lock 
														
 
															+O(N) where N is the number of processes with the TDB open, not 
														
 
															+the number actually doing work.
														
 
															+
														
 
															+In a Samba server it is common to have huge numbers of clients 
														
 
															+sitting idle, and thus they have weaned themselves off the 
														
 
															+TDB_CLEAR_IF_FIRST flag.[footnote:
														
 
															+There is a flag to tdb_reopen_all() which is used for this 
														
 
															+optimization: if the parent process will outlive the child, the 
														
 
															+child does not need the ACTIVE_LOCK. This is a workaround for 
														
 
															+this very performance issue.
														
 
															+]
														
 
															+
														
 
															+3.1.1 Proposed Solution
														
 
															+
														
 
															+Remove the flag. It was a neat idea, but even trivial servers 
														
 
															+tend to know when they are initializing for the first time and 
														
 
															+can simply unlink the old tdb at that point.
														
 
															+
														
 
															+3.2 TDB Files Have a 4G Limit
														
 
															+
														
 
															+This seems to be becoming an issue (so much for “trivial”!), 
														
 
															+particularly for ldb.
														
 
															+
														
 
															+3.2.1 Proposed Solution
														
 
															+
														
 
															+A new, incompatible TDB format which uses 64 bit offsets 
														
 
															+internally rather than 32 bit as now. For simplicity of endian 
														
 
															+conversion (which TDB does on the fly if required), all values 
														
 
															+will be 64 bit on disk. In practice, some upper bits may be used 
														
 
															+for other purposes, but at least 56 bits will be available for 
														
 
															+file offsets.
														
 
															+
														
 
															+tdb_open() will automatically detect the old version, and even 
														
 
															+create old-version files if TDB_VERSION6 is specified to tdb_open.
														
 
															+
														
 
															+32 bit processes will still be able to access TDBs larger than 4G 
														
 
															+(assuming that their off_t allows them to seek to 64 bits), they 
														
 
															+will gracefully fall back as they fail to mmap. This can happen 
														
 
															+already with large TDBs.
														
 
															+
														
 
															+Old versions of tdb will fail to open the new TDB files (since 28 
														
 
															+August 2009, commit 398d0c29290: prior to that any unrecognized 
														
 
															+file format would be erased and initialized as a fresh tdb!)
														
 
															+
														
 
															+3.3 TDB Records Have a 4G Limit
														
 
															+
														
 
															+This has not been a reported problem, and the API uses size_t 
														
 
															+which can be 64 bit on 64 bit platforms. However, other limits 
														
 
															+may have made such an issue moot.
														
 
															+
														
 
															+3.3.1 Proposed Solution
														
 
															+
														
 
															+Record sizes will be 64 bit, with an error returned on 32 bit 
														
 
															+platforms which try to access such records (the current 
														
 
															+implementation would return TDB_ERR_OOM in a similar case). It 
														
 
															+seems unlikely that 32 bit keys will be a limitation, so the 
														
 
															+implementation may not support this (see [sub:Records-Incur-A]).
														
 
															+
														
 
															+3.4 Hash Size Is Determined At TDB Creation Time
														
 
															+
														
 
															+TDB contains a number of hash chains in the header; the number is 
														
 
															+specified at creation time, and defaults to 131. This is such a 
														
 
															+bottleneck on large databases (as each hash chain gets quite 
														
 
															+long), that LDB uses 10,000 for this hash. In general it is 
														
 
															+impossible to know what the 'right' answer is at database 
														
 
															+creation time.
														
 
															+
														
 
															+3.4.1 Proposed Solution
														
 
															+
														
 
															+After comprehensive performance testing on various scalable hash 
														
 
															+variants[footnote:
														
 
															+http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 
														
 
															+This was annoying because I was previously convinced that an 
														
 
															+expanding tree of hashes would be very close to optimal.
														
 
															+], it became clear that it is hard to beat a straight linear hash 
														
 
															+table which doubles in size when it reaches saturation. There are 
														
 
															+three details which become important:
														
 
															+
														
 
															+1. On encountering a full bucket, we use the next bucket.
														
 
															+
														
 
															+2. Extra hash bits are stored with the offset, to reduce 
														
 
															+  comparisons.
														
 
															+
														
 
															+3. A marker entry is used on deleting an entry.
														
 
															+
														
 
															+The doubling of the table must be done under a transaction; we 
														
 
															+will not reduce it on deletion, so it will be an unusual case. It 
														
 
															+will either be placed at the head (other entries will be moved 
														
 
															+out of the way so we can expand), or we could have a pointer in the 
														
 
															+header to the current hashtable location, but that pointer would 
														
 
															+have to be read frequently to check for hashtable moves.
														
 
															+
														
 
															+The locking for this is slightly more complex than the chained 
														
 
															+case; we currently have one lock per bucket, and that means we 
														
 
															+would need to expand the lock if we overflow to the next bucket. 
														
 
															+The frequency of such collisions will affect our locking 
														
 
															+heuristics: we can always lock more buckets than we need.
														
 
															+
														
 
															+One possible optimization is to only re-check the hash size on an 
														
 
															+insert or a lookup miss.
														
 
															+
														
 
															+3.5 <TDB-Freelist-Is>TDB Freelist Is Highly Contended
														
 
															+
														
 
															+TDB uses a single linked list for the free list. Allocation 
														
 
															+occurs as follows, using heuristics which have evolved over time:
														
 
															+
														
 
															+1. Get the free list lock for this whole operation.
														
 
															+
														
 
															+2. Multiply length by 1.25, so we always over-allocate by 25%.
														
 
															+
														
 
															+3. Set the slack multiplier to 1.
														
 
															+
														
 
															+4. Examine the current freelist entry: if it is > length but < 
														
 
															+  the current best case, remember it as the best case.
														
 
															+
														
 
															+5. Multiply the slack multiplier by 1.05.
														
 
															+
														
 
															+6. If our best fit so far is less than length * slack multiplier, 
														
 
															+  return it. The slack will be turned into a new free record if 
														
 
															+  it's large enough.
														
 
															+
														
 
															+7. Otherwise, go onto the next freelist entry.
														
 
															+
														
 
															+Deleting a record occurs as follows:
														
 
															+
														
 
															+1. Lock the hash chain for this whole operation.
														
 
															+
														
 
															+2. Walk the chain to find the record, keeping the prev pointer 
														
 
															+  offset.
														
 
															+
														
 
															+3. If max_dead is non-zero:
														
 
															+
														
 
															+  (a) Walk the hash chain again and count the dead records.
														
 
															+
														
 
															+  (b) If it's more than max_dead, bulk free all the dead ones 
														
 
															+    (similar to steps 4 and below, but the lock is only obtained 
														
 
															+    once).
														
 
															+
														
 
															+  (c) Simply mark this record as dead and return. 
														
 
															+
														
 
															+4. Get the free list lock for the remainder of this operation.
														
 
															+
														
 
															+5. <right-merging>Examine the following block to see if it is 
														
 
															+  free; if so, enlarge the current block and remove that block 
														
 
															+  from the free list. This was disabled, as removal from the free 
														
 
															+  list was O(entries-in-free-list).
														
 
															+
														
 
															+6. Examine the preceding block to see if it is free: for this 
														
 
															+  reason, each block has a 32-bit tailer which indicates its 
														
 
															+  length. If it is free, expand it to cover our new block and 
														
 
															+  return.
														
 
															+
														
 
															+7. Otherwise, prepend ourselves to the free list.
														
 
															+
														
 
															+Disabling right-merging (step [right-merging]) causes 
														
 
															+fragmentation; the other heuristics proved insufficient to 
														
 
															+address this, so the final answer to this was that when we expand 
														
 
															+the TDB file inside a transaction commit, we repack the entire 
														
 
															+tdb.
														
 
															+
														
 
															+The single list lock limits our allocation rate; due to the other 
														
 
															+issues this is not currently seen as a bottleneck.
														
 
															+
														
 
															+3.5.1 Proposed Solution
														
 
															+
														
 
															+The first step is to remove all the current heuristics, as they 
														
 
															+obviously interact, then examine them once the lock contention is 
														
 
															+addressed.
														
 
															+
														
 
															+The free list must be split to reduce contention. Assuming 
														
 
															+perfect free merging, we can at most have 1 free list entry for 
														
 
															+each entry. This implies that the number of free lists is related 
														
 
															+to the size of the hash table, but as it is rare to walk a large 
														
 
															+number of free list entries we can use far fewer, say 1/32 of the 
														
 
															+number of hash buckets.
														
 
															+
														
 
															+There are various benefits in using per-size free lists (see [sub:TDB-Becomes-Fragmented]
														
 
															+) but it's not clear this would reduce contention in the common 
														
 
															+case where all processes are allocating/freeing the same size. 
														
 
															+Thus we almost certainly need to divide in other ways: the most 
														
 
															+obvious is to divide the file into zones, and use a free list 
														
 
															+(or set of free lists) for each. This approximates address 
														
 
															+ordering.
														
 
															+
														
 
															+Note that this means we need to split the free lists when we 
														
 
															+expand the file; this is probably acceptable when we double the 
														
 
															+hash table size, since that is such an expensive operation 
														
 
															+already. In the case of increasing the file size, there is an 
														
 
															+optimization we can use: if we use M in the formula above as the 
														
 
															+file size rounded up to the next power of 2, we only need 
														
 
															+reshuffle free lists when the file size crosses a power of 2 
														
 
															+boundary, and reshuffling the free lists is trivial: we simply 
														
 
															+merge every consecutive pair of free lists.
														
 
															+
														
 
															+The basic algorithm is as follows. Freeing is simple:
														
 
															+
														
 
															+1. Identify the correct zone.
														
 
															+
														
 
															+2. Lock the corresponding list.
														
 
															+
														
 
															+3. Re-check the zone (we didn't have a lock, sizes could have 
														
 
															+  changed): relock if necessary.
														
 
															+
														
 
															+4. Place the freed entry in the list for that zone.
														
 
															+
														
 
															+Allocation is a little more complicated, as we perform delayed 
														
 
															+coalescing at this point:
														
 
															+
														
 
															+1. Pick a zone: either the zone we last freed into, or based on a “
														
 
															+  random” number.
														
 
															+
														
 
															+2. Lock the corresponding list.
														
 
															+
														
 
															+3. Re-check the zone: relock if necessary.
														
 
															+
														
 
															+4. If the top entry is large enough, remove it from the list and 
														
 
															+  return it.
														
 
															+
														
 
															+5. Otherwise, coalesce entries in the list. If there was no entry 
														
 
															+  large enough, unlock the list and try the next zone.
														
 
															+
														
 
															+6. If no zone satisfies, expand the file.
														
 
															+
														
 
															+This optimizes rapid insert/delete of free list entries by not 
														
 
															+coalescing them all the time. First-fit address ordering 
														
 
															+seems to be fairly good for keeping fragmentation low 
														
 
															+(see [sub:TDB-Becomes-Fragmented]). Note that address ordering 
														
 
															+does not need a tailer to coalesce, though if we needed one we 
														
 
															+could have one cheaply: see [sub:Records-Incur-A]. 
														
 
															+
														
 
															+I anticipate that the number of entries in each free zone would 
														
 
															+be small, but it might be worth using one free entry to hold 
														
 
															+pointers to the others for cache efficiency.
														
 
															+
														
 
															+3.6 <sub:TDB-Becomes-Fragmented>TDB Becomes Fragmented
														
 
															+
														
 
															+Much of this is a result of allocation strategy[footnote:
														
 
															+The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995 
														
 
															+ftp://ftp.cs.utexas.edu/pub/garbage/malloc/ismm98.ps
														
 
															+] and deliberate hobbling of coalescing; internal fragmentation 
														
 
															+(aka overallocation) is deliberately set at 25%, and external 
														
 
															+fragmentation is only cured by the decision to repack the entire 
														
 
															+db when a transaction commit needs to enlarge the file.
														
 
															+
														
 
															+3.6.1 Proposed Solution
														
 
															+
														
 
															+The 25% overhead on allocation works in practice for ldb because 
														
 
															+indexes tend to expand by one record at a time. This internal 
														
 
															+fragmentation can be resolved by having an “expanded” bit in the 
														
 
															+header to note entries that have previously expanded, and 
														
 
															+allocating more space for them.
														
 
															+
														
 
															+There is a spectrum of possible solutions for external 
														
 
															+fragmentation: one is to use a fragmentation-avoiding allocation 
														
 
															+strategy such as a best-fit address-order allocator. The other end 
														
 
															+of the spectrum would be to use a bump allocator (very fast and 
														
 
															+simple) and simply repack the file when we reach the end.
														
 
															+
														
 
															+There are three problems with efficient fragmentation-avoiding 
														
 
															+allocators: they are non-trivial, they tend to use a single free 
														
 
															+list for each size, and there's no evidence that tdb allocation 
														
 
															+patterns will match those recorded for general allocators (though 
														
 
															+it seems likely).
														
 
															+
														
 
															+Thus we don't spend too much effort on external fragmentation; we 
														
 
															+will be no worse than the current code if we need to repack on 
														
 
															+occasion. More effort is spent on reducing freelist contention, 
														
 
															+and reducing overhead.
														
 
															+
														
 
															+3.7 <sub:Records-Incur-A>Records Incur A 28-Byte Overhead
														
 
															+
														
 
															+Each TDB record has a header as follows:
														
 
															+
														
 
															+struct tdb_record {
														
 
															+
														
 
															+        tdb_off_t next; /* offset of the next record in the list 
														
 
															+*/
														
 
															+
														
 
															+        tdb_len_t rec_len; /* total byte length of record */
														
 
															+
														
 
															+        tdb_len_t key_len; /* byte length of key */
														
 
															+
														
 
															+        tdb_len_t data_len; /* byte length of data */
														
 
															+
														
 
															+        uint32_t full_hash; /* the full 32 bit hash of the key */
														
 
															+
														
 
															+        uint32_t magic;   /* try to catch errors */
														
 
															+
														
 
															+        /* the following union is implied:
														
 
															+
														
 
															+                union {
														
 
															+
														
 
															+                        char record[rec_len];
														
 
															+
														
 
															+                        struct {
														
 
															+
														
 
															+                                char key[key_len];
														
 
															+
														
 
															+                                char data[data_len];
														
 
															+
														
 
															+                        }
														
 
															+
														
 
															+                        uint32_t totalsize; (tailer)
														
 
															+
														
 
															+                }
														
 
															+
														
 
															+        */
														
 
															+
														
 
															+};
														
 
															+
														
 
															+Naively, this would double to a 56-byte overhead on a 64 bit 
														
 
															+implementation.
														
 
															+
														
 
															+3.7.1 Proposed Solution
														
 
															+
														
 
															+We can use various techniques to reduce this for an allocated 
														
 
															+block:
														
 
															+
														
 
															+1. The 'next' pointer is not required, as we are using a flat 
														
 
															+  hash table.
														
 
															+
														
 
															+2. 'rec_len' can instead be expressed as an addition to key_len 
														
 
															+  and data_len (it accounts for wasted or overallocated length in 
														
 
															+  the record). Since the record length is always a multiple of 8, 
														
 
															+  we can conveniently fit it in 32 bits (representing up to 35 
														
 
															+  bits).
														
 
															+
														
 
															+3. 'key_len' and 'data_len' can be reduced. I'm unwilling to 
														
 
															+  restrict 'data_len' to 32 bits, but instead we can combine the 
														
 
															+  two into one 64-bit field and use a 5 bit value which 
														
 
															+  indicates at what bit to divide the two. Keys are unlikely to 
														
 
															+  scale as fast as data, so I'm assuming a maximum key size of 32 
														
 
															+  bits.
														
 
															+
														
 
															+4. 'full_hash' is used to avoid a memcmp on the “miss” case, but 
														
 
															+  this is diminishing returns after a handful of bits (at 10 
														
 
															+  bits, it reduces 99.9% of false memcmp). As an aside, as the 
														
 
															+  lower bits are already incorporated in the hash table 
														
 
															+  resolution, the upper bits should be used here.
														
 
															+
														
 
															+5. 'magic' does not need to be enlarged: it currently reflects 
														
 
															+  one of 5 values (used, free, dead, recovery, and 
														
 
															+  unused_recovery). It is useful for quick sanity checking 
														
 
															+  however, and should not be eliminated.
														
 
															+
														
 
															+6. 'tailer' is only used to coalesce free blocks (so a block to 
														
 
															+  the right can find the header to check if this block is free). 
														
 
															+  This can be replaced by a single 'free' bit in the header of 
														
 
															+  the following block (and the tailer only exists in free 
														
 
															+  blocks).[footnote:
														
 
															+This technique from Thomas Standish. Data Structure Techniques. 
														
 
															+Addison-Wesley, Reading, Massachusetts, 1980.
														
 
															+] The current proposed coalescing algorithm doesn't need this, 
														
 
															+  however.
														
 
															+
														
 
															+This produces a 16 byte used header like this:
														
 
															+
														
 
															+struct tdb_used_record {
														
 
															+
														
 
															+        uint32_t magic : 16,
														
 
															+
														
 
															+                 prev_is_free: 1,
														
 
															+
														
 
															+                 key_data_divide: 5,
														
 
															+
														
 
															+                 top_hash: 10;
														
 
															+
														
 
															+        uint32_t extra_octets;
														
 
															+
														
 
															+        uint64_t key_and_data_len;
														
 
															+
														
 
															+};
														
 
															+
														
 
															+And a free record like this:
														
 
															+
														
 
															+struct tdb_free_record {
														
 
															+
														
 
															+        uint32_t free_magic;
														
 
															+
														
 
															+        uint64_t total_length;
														
 
															+
														
 
															+        ...
														
 
															+
														
 
															+        uint64_t tailer;
														
 
															+
														
 
															+};
														
 
															+
														
 
															+
														
 
															+
														
 
															+3.8 Transaction Commit Requires 4 fdatasync
														
 
															+
														
 
															+The current transaction algorithm is:
														
 
															+
														
 
															+1. write_recovery_data();
														
 
															+
														
 
															+2. sync();
														
 
															+
														
 
															+3. write_recovery_header();
														
 
															+
														
 
															+4. sync();
														
 
															+
														
 
															+5. overwrite_with_new_data();
														
 
															+
														
 
															+6. sync();
														
 
															+
														
 
															+7. remove_recovery_header();
														
 
															+
														
 
															+8. sync(); 
														
 
															+
														
 
															+On current ext3, each sync flushes all data to disk, so the next 
														
 
															+3 syncs are relatively inexpensive. But this could become a 
														
 
															+performance bottleneck on other filesystems such as ext4.
														
 
															+
														
 
															+3.8.1 Proposed Solution
														
 
															+
														
 
															+Neil Brown points out that this is overzealous, and only one sync 
														
 
															+is needed:
														
 
															+
														
 
															+1. Bundle the recovery data, a transaction counter and a strong 
														
 
															+  checksum of the new data.
														
 
															+
														
 
															+2. Strong checksum that whole bundle.
														
 
															+
														
 
															+3. Store the bundle in the database.
														
 
															+
														
 
															+4. Overwrite the oldest of the two recovery pointers in the 
														
 
															+  header (identified using the transaction counter) with the 
														
 
															+  offset of this bundle.
														
 
															+
														
 
															+5. sync.
														
 
															+
														
 
															+6. Write the new data to the file.
														
 
															+
														
 
															+Checking for recovery means identifying the latest bundle with a 
														
 
															+valid checksum and using the new data checksum to ensure that it 
														
 
															+has been applied. This is more expensive than the current check, 
														
 
															+but need only be done at open. For running databases, a separate 
														
 
															+header field can be used to indicate a transaction in progress; 
														
 
															+we need only check for recovery if this is set.
														
 
															+
														
 
															+3.9 <sub:TDB-Does-Not>TDB Does Not Have Snapshot Support
														
 
															+
														
 
															+3.9.1 Proposed Solution
														
 
															+
														
 
															+None. At some point you say “use a real database”.
														
 
															+
														
 
															+But as a thought experiment, if we implemented transactions to 
														
 
															+only overwrite free entries (this is tricky: there must not be a 
														
 
															+header in each entry which indicates whether it is free, but use 
														
 
															+of presence in metadata elsewhere), and a pointer to the hash 
														
 
															+table, we could create an entirely new commit without destroying 
														
 
															+existing data. Then it would be easy to implement snapshots in a 
														
 
															+similar way.
														
 
															+
														
 
															+This would not allow arbitrary changes to the database, such as 
														
 
															+tdb_repack does, and would require more space (since we have to 
														
 
															+preserve the current and future entries at once). If we used hash 
														
 
															+trees rather than one big hash table, we might only have to 
														
 
															+rewrite some sections of the hash, too.
														
 
															+
														
 
															+We could then implement snapshots using a similar method, using 
														
 
															+multiple different hash tables/free tables.
														
 
															+
														
 
															+3.10 Transactions Cannot Operate in Parallel
														
 
															+
														
 
															+This would be useless for ldb, as it hits the index records with 
														
 
															+just about every update. It would add significant complexity in 
														
 
															+resolving clashes, and cause all the transaction callers to write 
														
 
															+their code to loop in the case where the transactions spuriously 
														
 
															+failed.
														
 
															+
														
 
															+3.10.1 Proposed Solution
														
 
															+
														
 
															+We could solve a small part of the problem by providing read-only 
														
 
															+transactions. These would allow one write transaction to begin, 
														
 
															+but it could not commit until all r/o transactions are done. This 
														
 
															+would require a new RO_TRANSACTION_LOCK, which would be upgraded 
														
 
															+on commit.
														
 
															+
														
 
															+3.11 Default Hash Function Is Suboptimal
														
 
															+
														
 
															+The Knuth-inspired multiplicative hash used by tdb is fairly slow 
														
 
															+(especially if we expand it to 64 bits), and works best when the 
														
 
															+hash bucket size is a prime number (which also means a slow 
														
 
															+modulus). In addition, it is highly predictable which could 
														
 
															+potentially lead to a Denial of Service attack in some TDB uses.
														
 
															+
														
 
															+3.11.1 Proposed Solution
														
 
															+
														
 
															+The Jenkins lookup3 hash[footnote:
														
 
															+http://burtleburtle.net/bob/c/lookup3.c
														
 
															+] is a fast and superbly-mixing hash. It's used by the Linux 
														
 
															+kernel and almost everything else. This has the particular 
														
 
															+properties that it takes an initial seed, and produces two 32 bit 
														
 
															+hash numbers, which we can combine into a 64-bit hash.
														
 
															+
														
 
															+The seed should be created at tdb-creation time from some random 
														
 
															+source, and placed in the header. This is far from foolproof, but 
														
 
															+adds a little bit of protection against hash bombing.
														
 
															+
														
 
															+3.12 <Reliable-Traversal-Adds>Reliable Traversal Adds Complexity
														
 
															+
														
 
															+We lock a record during traversal iteration, and try to grab that 
														
 
															+lock in the delete code. If that grab on delete fails, we simply 
														
 
															+mark it deleted and continue onwards; traversal checks for this 
														
 
															+condition and does the delete when it moves off the record.
														
 
															+
														
 
															+If traversal terminates, the dead record may be left 
														
 
															+indefinitely.
														
 
															+
														
 
															+3.12.1 Proposed Solution
														
 
															+
														
 
															+Remove reliability guarantees; see [traverse-Proposed-Solution].
														
 
															+
														
 
															+3.13 Fcntl Locking Adds Overhead
														
 
															+
														
 
															+Placing a fcntl lock means a system call, as does removing one. 
														
 
															+This is actually one reason why transactions can be faster 
														
 
															+(everything is locked once at transaction start). In the 
														
 
															+uncontended case, this overhead can theoretically be eliminated.
														
 
															+
														
 
															+3.13.1 Proposed Solution
														
 
															+
														
 
															+None.
														
 
															+
														
 
															+We tried this before with spinlock support, in the early days of 
														
 
															+TDB, and it didn't make much difference except in manufactured 
														
 
															+benchmarks.
														
 
															+
														
 
															+We could use spinlocks (with futex kernel support under Linux), 
														
 
															+but it means that we lose automatic cleanup when a process dies 
														
 
															+with a lock. There is a method of auto-cleanup under Linux, but 
														
 
															+it's not supported by other operating systems. We could 
														
 
															+reintroduce a clear-if-first-style lock and sweep for dead 
														
 
															+futexes on open, but that wouldn't help the normal case of one 
														
 
															+concurrent opener dying. Increasingly elaborate repair schemes 
														
 
															+could be considered, but they require an ABI change (everyone 
														
 
															+must use them) anyway, so there's no need to do this at the same 
														
 
															+time as everything else.
														
 
															+
														
 
															+3.14 Some Transactions Don't Require Durability
														
 
															+
														
 
															+Volker points out that gencache uses a CLEAR_IF_FIRST tdb for 
														
 
															+normal (fast) usage, and occasionally empties the results into a 
														
 
															+transactional TDB. This kind of usage prioritizes performance 
														
 
															+over durability: as long as we are consistent, data can be lost.
														
 
															+
														
 
															+This would be more neatly implemented inside tdb: a “soft” 
														
 
															+transaction commit (i.e. syncless) which means that data may be 
														
 
															+reverted on a crash.
														
 
															+
														
 
															+3.14.1 Proposed Solution
														
 
															+
														
 
															+None.
														
 
															+
														
 
															+Unfortunately any transaction scheme which overwrites old data 
														
 
															+requires a sync before that overwrite to avoid the possibility of 
														
 
															+corruption.
														
 
															+
														
 
															+It seems possible to use a scheme similar to that described in [sub:TDB-Does-Not]
														
 
															+, where transactions are committed without overwriting existing 
														
 
															+data, and an array of top-level pointers is available in the 
														
 
															+header. If the transaction is “soft” then we would not need a 
														
 
															+sync at all: existing processes would pick up the new hash table 
														
 
															+and free list and work with that.
														
 
															+
														
 
															+At some later point, a sync would allow recovery of the old data 
														
 
															+into the free lists (perhaps when the array of top-level pointers 
														
 
															+filled). On crash, tdb_open() would examine the array of top 
														
 
															+levels, and apply the transactions until it encountered an 
														
 
															+invalid checksum.
														
 
															+
														
--- a/ccan/tdb2/free.c
+++ b/ccan/tdb2/free.c
@@ -0,0 +1,710 @@
 
															+ /* 
														
 
															+   Trivial Database 2: free list/block handling
														
 
															+   Copyright (C) Rusty Russell 2010
														
 
															+   
														
 
															+   This library is free software; you can redistribute it and/or
														
 
															+   modify it under the terms of the GNU Lesser General Public
														
 
															+   License as published by the Free Software Foundation; either
														
 
															+   version 3 of the License, or (at your option) any later version.
														
 
															+
														
 
															+   This library is distributed in the hope that it will be useful,
														
 
															+   but WITHOUT ANY WARRANTY; without even the implied warranty of
														
 
															+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
														
 
															+   Lesser General Public License for more details.
														
 
															+
														
 
															+   You should have received a copy of the GNU Lesser General Public
														
 
															+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
														
 
															+*/
														
 
															+#include "private.h"
														
 
															+#include <ccan/likely/likely.h>
														
 
															+#include <time.h>
														
 
															+#include <assert.h>
														
 
															+#include <limits.h>
														
 
															+
														
 
															+/* We have to be able to fit a free record here. */
														
 
															+#define MIN_DATA_LEN	\
														
 
															+	(sizeof(struct tdb_free_record) - sizeof(struct tdb_used_record))
														
 
															+
														
 
															+/* We have a series of free lists, each one covering a "zone" of the file.
														
 
															+ *
														
 
															+ * For each zone we have a series of per-size buckets, and a final bucket for
														
 
															+ * "too big".
														
 
															+ *
														
 
															+ * It's possible to move the free_list_head, but *only* under the allrecord
														
 
															+ * lock. */
														
 
															+static tdb_off_t free_list_off(struct tdb_context *tdb, unsigned int list)
														
 
															+{
														
 
															+	return tdb->header.v.free_off + list * sizeof(tdb_off_t);
														
 
															+}
														
 
															+
														
 
															/* Cheap source of "random enough" numbers.
															 *
															 * We're a library, so touching srandom() would be unfriendly to the
															 * application, and srandom_r probably lacks portability.  Very little
															 * randomness is needed here, so just mix the context's address, our
															 * pid and the current time. */
															static unsigned int quick_random(struct tdb_context *tdb)
															{
																unsigned long mix = (unsigned long)tdb;

																mix += getpid();
																mix += time(NULL);
																return mix;
															}
														
 
															+
														
 
															+/* Start by using a random zone to spread the load. */
														
 
															+uint64_t random_free_zone(struct tdb_context *tdb)
														
 
															+{
														
 
															+	/* num_zones might be out of date, but can only increase */
														
 
															+	return quick_random(tdb) % tdb->header.v.num_zones;
														
 
															+}
														
 
															+
														
 
															/* Find last set: the 1-based position of the most significant set bit
															 * of @val (so 1 -> 1, 0x80 -> 8), or 0 if @val is zero. */
															static unsigned fls64(uint64_t val)
															{
																unsigned int bits = 64;
																unsigned int shift;

																if (!val)
																	return 0;

															#if HAVE_BUILTIN_CLZL
																/* This is significantly faster, when the value fits in a long! */
																if (val <= ULONG_MAX)
																	return sizeof(long) * CHAR_BIT - __builtin_clzl(val);
															#endif

																/* Binary search for the top bit: whenever the upper `shift` bits
																 * are all clear, slide the value up and discount those bits. */
																for (shift = 32; shift != 0; shift >>= 1) {
																	if ((val >> (64 - shift)) == 0) {
																		val <<= shift;
																		bits -= shift;
																	}
																}
																return bits;
															}
														
 
															+
														
 
															+/* In which bucket would we find a particular record size? (ignoring header) */
														
 
															+unsigned int size_to_bucket(struct tdb_context *tdb, tdb_len_t data_len)
														
 
															+{
														
 
															+	unsigned int bucket;
														
 
															+
														
 
															+	/* We can't have records smaller than this. */
														
 
															+	assert(data_len >= MIN_DATA_LEN);
														
 
															+
														
 
															+	/* Ignoring the header... */
														
 
															+	if (data_len - MIN_DATA_LEN <= 64) {
														
 
															+		/* 0 in bucket 0, 8 in bucket 1... 64 in bucket 6. */
														
 
															+		bucket = (data_len - MIN_DATA_LEN) / 8;
														
 
															+	} else {
														
 
															+		/* After that we go power of 2. */
														
 
															+		bucket = fls64(data_len - MIN_DATA_LEN) + 2;
														
 
															+	}
														
 
															+
														
 
															+	if (unlikely(bucket > tdb->header.v.free_buckets))
														
 
															+		bucket = tdb->header.v.free_buckets;
														
 
															+	return bucket;
														
 
															+}
														
 
															+
														
 
															+/* What zone does a block belong in? */ 
														
 
															+tdb_off_t zone_of(struct tdb_context *tdb, tdb_off_t off)
														
 
															+{
														
 
															+	assert(tdb->header_uptodate);
														
 
															+
														
 
															+	return off >> tdb->header.v.zone_bits;
														
 
															+}
														
 
															+
														
 
															+/* Returns fl->max_bucket + 1, or list number to search. */
														
 
															+static tdb_off_t find_free_head(struct tdb_context *tdb, tdb_off_t bucket)
														
 
															+{
														
 
															+	tdb_off_t first, off;
														
 
															+
														
 
															+	/* Speculatively search for a non-zero bucket. */
														
 
															+	first = tdb->last_zone * (tdb->header.v.free_buckets+1) + bucket;
														
 
															+	off = tdb_find_nonzero_off(tdb, free_list_off(tdb, first),
														
 
															+				   tdb->header.v.free_buckets - bucket);
														
 
															+	return bucket + off;
														
 
															+}
														
 
															+
														
 
															+static int remove_from_list(struct tdb_context *tdb,
														
 
															+			    tdb_off_t list, struct tdb_free_record *r)
														
 
															+{
														
 
															+	tdb_off_t off;
														
 
															+
														
 
															+	/* Front of list? */
														
 
															+	if (r->prev == 0) {
														
 
															+		off = free_list_off(tdb, list);
														
 
															+	} else {
														
 
															+		off = r->prev + offsetof(struct tdb_free_record, next);
														
 
															+	}
														
 
															+	/* r->prev->next = r->next */
														
 
															+	if (tdb_write_off(tdb, off, r->next)) {
														
 
															+		return -1;
														
 
															+	}
														
 
															+
														
 
															+	if (r->next != 0) {
														
 
															+		off = r->next + offsetof(struct tdb_free_record, prev);
														
 
															+		/* r->next->prev = r->prev */
														
 
															+		if (tdb_write_off(tdb, off, r->prev)) {
														
 
															+			return -1;
														
 
															+		}
														
 
															+	}
														
 
															+	return 0;
														
 
															+}
														
 
															+
														
 
															+/* Enqueue in this free list. */
														
 
															+static int enqueue_in_free(struct tdb_context *tdb,
														
 
															+			   tdb_off_t list,
														
 
															+			   tdb_off_t off,
														
 
															+			   struct tdb_free_record *new)
														
 
															+{
														
 
															+	new->prev = 0;
														
 
															+	/* new->next = head. */
														
 
															+	new->next = tdb_read_off(tdb, free_list_off(tdb, list));
														
 
															+	if (new->next == TDB_OFF_ERR)
														
 
															+		return -1;
														
 
															+
														
 
															+	if (new->next) {
														
 
															+		/* next->prev = new. */
														
 
															+		if (tdb_write_off(tdb, new->next
														
 
															+				  + offsetof(struct tdb_free_record, prev),
														
 
															+				  off) != 0)
														
 
															+			return -1;
														
 
															+	}
														
 
															+	/* head = new */
														
 
															+	if (tdb_write_off(tdb, free_list_off(tdb, list), off) != 0)
														
 
															+		return -1;
														
 
															+	
														
 
															+	return tdb_write_convert(tdb, off, new, sizeof(*new));
														
 
															+}
														
 
															+
														
 
															+/* List isn't locked. */
														
 
															+int add_free_record(struct tdb_context *tdb,
														
 
															+		    tdb_off_t off, tdb_len_t len_with_header)
														
 
															+{
														
 
															+	struct tdb_free_record new;
														
 
															+	tdb_off_t list;
														
 
															+	int ret;
														
 
															+
														
 
															+	assert(len_with_header >= sizeof(new));
														
 
															+
														
 
															+	new.magic = TDB_FREE_MAGIC;
														
 
															+	new.data_len = len_with_header - sizeof(struct tdb_used_record);
														
 
															+
														
 
															+	tdb->last_zone = zone_of(tdb, off);
														
 
															+	list = tdb->last_zone * (tdb->header.v.free_buckets+1)
														
 
															+		+ size_to_bucket(tdb, new.data_len);
														
 
															+		
														
 
															+	if (tdb_lock_free_list(tdb, list, TDB_LOCK_WAIT) != 0)
														
 
															+		return -1;
														
 
															+
														
 
															+	ret = enqueue_in_free(tdb, list, off, &new);
														
 
															+	tdb_unlock_free_list(tdb, list);
														
 
															+	return ret;
														
 
															+}
														
 
															+
														
 
															+/* If we have enough left over to be useful, split that off. */
														
 
															+static int to_used_record(struct tdb_context *tdb,
														
 
															+			  tdb_off_t off,
														
 
															+			  tdb_len_t needed,
														
 
															+			  tdb_len_t total_len,
														
 
															+			  tdb_len_t *actual)
														
 
															+{
														
 
															+	struct tdb_used_record used;
														
 
															+	tdb_len_t leftover;
														
 
															+
														
 
															+	leftover = total_len - needed;
														
 
															+	if (leftover < sizeof(struct tdb_free_record))
														
 
															+		leftover = 0;
														
 
															+
														
 
															+	*actual = total_len - leftover;
														
 
															+
														
 
															+	if (leftover) {
														
 
															+		if (add_free_record(tdb, off + sizeof(used) + *actual,
														
 
															+				    total_len - needed))
														
 
															+			return -1;
														
 
															+	}
														
 
															+	return 0;
														
 
															+}
														
 
															+
														
 
															+/* Note: we unlock the current list if we coalesce or fail. */
														
 
															+static int coalesce(struct tdb_context *tdb, tdb_off_t off,
														
 
															+		    tdb_off_t list, tdb_len_t data_len)
														
 
															+{
														
 
															+	struct tdb_free_record pad, *r;
														
 
															+	tdb_off_t end = off + sizeof(struct tdb_used_record) + data_len;
														
 
															+
														
 
															+	while (!tdb->methods->oob(tdb, end + sizeof(*r), 1)) {
														
 
															+		tdb_off_t nlist;
														
 
															+
														
 
															+		r = tdb_get(tdb, end, &pad, sizeof(pad));
														
 
															+		if (!r)
														
 
															+			goto err;
														
 
															+
														
 
															+		if (r->magic != TDB_FREE_MAGIC)
														
 
															+			break;
														
 
															+
														
 
															+		nlist = zone_of(tdb, end) * (tdb->header.v.free_buckets+1)
														
 
															+			+ size_to_bucket(tdb, r->data_len);
														
 
															+
														
 
															+		/* We may be violating lock order here, so best effort. */
														
 
															+		if (tdb_lock_free_list(tdb, nlist, TDB_LOCK_NOWAIT) == -1)
														
 
															+			break;
														
 
															+
														
 
															+		/* Now we have lock, re-check. */
														
 
															+		r = tdb_get(tdb, end, &pad, sizeof(pad));
														
 
															+		if (!r) {
														
 
															+			tdb_unlock_free_list(tdb, nlist);
														
 
															+			goto err;
														
 
															+		}
														
 
															+
														
 
															+		if (unlikely(r->magic != TDB_FREE_MAGIC)) {
														
 
															+			tdb_unlock_free_list(tdb, nlist);
														
 
															+			break;
														
 
															+		}
														
 
															+
														
 
															+		if (remove_from_list(tdb, list, r) == -1) {
														
 
															+			tdb_unlock_free_list(tdb, nlist);
														
 
															+			goto err;
														
 
															+		}
														
 
															+
														
 
															+		end += sizeof(struct tdb_used_record) + r->data_len;
														
 
															+		tdb_unlock_free_list(tdb, nlist);
														
 
															+	}
														
 
															+
														
 
															+	/* Didn't find any adjacent free? */
														
 
															+	if (end == off + sizeof(struct tdb_used_record) + data_len)
														
 
															+		return 0;
														
 
															+
														
 
															+	/* OK, expand record */
														
 
															+	r = tdb_get(tdb, off, &pad, sizeof(pad));
														
 
															+	if (!r)
														
 
															+		goto err;
														
 
															+
														
 
															+	if (remove_from_list(tdb, list, r) == -1)
														
 
															+		goto err;
														
 
															+
														
 
															+	/* We have to drop this to avoid deadlocks. */
														
 
															+	tdb_unlock_free_list(tdb, list);
														
 
															+
														
 
															+	if (add_free_record(tdb, off, end - off) == -1)
														
 
															+		return -1;
														
 
															+	return 1;
														
 
															+
														
 
															+err:
														
 
															+	/* To unify error paths, we *always* unlock list. */
														
 
															+	tdb_unlock_free_list(tdb, list);
														
 
															+	return -1;
														
 
															+}
														
 
															+
														
 
															+/* We need size bytes to put our key and data in. */
														
 
															+static tdb_off_t lock_and_alloc(struct tdb_context *tdb,
														
 
															+				tdb_off_t bucket, size_t size,
														
 
															+				tdb_len_t *actual)
														
 
															+{
														
 
															+	tdb_off_t list;
														
 
															+	tdb_off_t off, prev, best_off;
														
 
															+	struct tdb_free_record pad, best = { 0 }, *r;
														
 
															+	double multiplier;
														
 
															+
														
 
															+again:
														
 
															+	list = tdb->last_zone * (tdb->header.v.free_buckets+1) + bucket;
														
 
															+
														
 
															+	/* Lock this list. */
														
 
															+	if (tdb_lock_free_list(tdb, list, TDB_LOCK_WAIT) == -1) {
														
 
															+		return TDB_OFF_ERR;
														
 
															+	}
														
 
															+
														
 
															+	prev = free_list_off(tdb, list);
														
 
															+	off = tdb_read_off(tdb, prev);
														
 
															+
														
 
															+	if (unlikely(off == TDB_OFF_ERR))
														
 
															+		goto unlock_err;
														
 
															+
														
 
															+	best.data_len = -1ULL;
														
 
															+	best_off = 0;
														
 
															+	multiplier = 1.0;
														
 
															+
														
 
															+	/* Walk the list to see if any are large enough, getting less fussy
														
 
															+	 * as we go. */
														
 
															+	while (off) {
														
 
															+		prev = off;
														
 
															+		off = tdb_read_off(tdb, prev);
														
 
															+		if (unlikely(off == TDB_OFF_ERR))
														
 
															+			goto unlock_err;
														
 
															+
														
 
															+		r = tdb_get(tdb, off, &pad, sizeof(*r));
														
 
															+		if (!r)
														
 
															+			goto unlock_err;
														
 
															+		if (r->magic != TDB_FREE_MAGIC) {
														
 
															+			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+				 "lock_and_alloc: %llu non-free 0x%llx\n",
														
 
															+				 (long long)off, (long long)r->magic);
														
 
															+			goto unlock_err;
														
 
															+		}
														
 
															+
														
 
															+		if (r->data_len >= size && r->data_len < best.data_len) {
														
 
															+			best_off = off;
														
 
															+			best = *r;
														
 
															+		}
														
 
															+
														
 
															+		if (best.data_len < size * multiplier && best_off) {
														
 
															+			/* We're happy with this size: take it. */
														
 
															+			if (remove_from_list(tdb, list, &best) != 0)
														
 
															+				goto unlock_err;
														
 
															+			tdb_unlock_free_list(tdb, list);
														
 
															+
														
 
															+			if (to_used_record(tdb, best_off, size, best.data_len,
														
 
															+					   actual)) {
														
 
															+				return -1;
														
 
															+			}
														
 
															+			return best_off;
														
 
															+		}
														
 
															+		multiplier *= 1.01;
														
 
															+
														
 
															+		/* Since we're going slow anyway, try coalescing here. */
														
 
															+		switch (coalesce(tdb, off, list, r->data_len)) {
														
 
															+		case -1:
														
 
															+			/* This has already unlocked on error. */
														
 
															+			return -1;
														
 
															+		case 1:
														
 
															+			/* This has unlocked list, restart. */
														
 
															+			goto again;
														
 
															+		}
														
 
															+	}
														
 
															+
														
 
															+	tdb_unlock_free_list(tdb, list);
														
 
															+	return 0;
														
 
															+
														
 
															+unlock_err:
														
 
															+	tdb_unlock_free_list(tdb, list);
														
 
															+	return TDB_OFF_ERR;
														
 
															+}
														
 
															+
														
 
															+/* We want a really big chunk: ask for the oversize bucket once per zone,
															+ * and expand the database each time a complete pass comes back empty.
															+ *
															+ * Returns the offset of the record obtained, or TDB_OFF_ERR if either the
															+ * allocation attempt or the expansion fails. */
															+/* NOTE(review): the bucket handed to lock_and_alloc() does not depend on
															+ * the zone counter; confirm how lock_and_alloc() selects the zone. */
															+static tdb_off_t huge_alloc(struct tdb_context *tdb, size_t size,
															+			    tdb_len_t *actual)
															+{
															+	for (;;) {
															+		tdb_off_t zone;
															+
															+		for (zone = 0; zone < tdb->header.v.num_zones; zone++) {
															+			/* Try getting one from list. */
															+			tdb_off_t found = lock_and_alloc(tdb,
															+						tdb->header.v.free_buckets,
															+						size, actual);
															+
															+			/* Both hard errors and successes end the search. */
															+			if (found == TDB_OFF_ERR || found != 0)
															+				return found;
															+			/* FIXME: Coalesce! */
															+		}
															+
															+		/* Nothing big enough anywhere: grow and retry. */
															+		if (tdb_expand(tdb, 0, size, false) != 0)
															+			break;
															+	}
															+
															+	return TDB_OFF_ERR;
															+}
														
 
															+
														
 
															+/* Find a free record able to hold @size bytes.
															+ *
															+ * Requests at least as large as a zone are handed to huge_alloc().
															+ * Otherwise buckets are tried from the exact-size bucket upwards; after
															+ * each unsuccessful pass tdb->last_zone is advanced by an odd
															+ * pseudo-random step and the search is repeated.
															+ *
															+ * Returns the record offset, 0 if nothing suitable was found, or
															+ * TDB_OFF_ERR on error. */
															+static tdb_off_t get_free(struct tdb_context *tdb, size_t size,
															+			  tdb_len_t *actual)
															+{
															+	tdb_off_t off, bucket;
															+	unsigned int num_empty, step = 0;
															+
															+	bucket = size_to_bucket(tdb, size);
															+
															+	/* If we're after something bigger than a single zone, handle
															+	 * specially. */
															+	if (unlikely(sizeof(struct tdb_used_record) + size
															+		     >= (1ULL << tdb->header.v.zone_bits))) {
															+		return huge_alloc(tdb, size, actual);
															+	}
															+
															+	/* Number of zones we search is proportional to the log of them. */
															+	for (num_empty = 0; num_empty < fls64(tdb->header.v.num_zones);
															+	     num_empty++) {
															+		tdb_off_t b;
															+
															+		/* Start at exact size bucket, and search up...  The bound
															+		 * is the number of free buckets: b is a bucket index, so
															+		 * comparing it against the zone count was wrong. */
															+		for (b = bucket; b <= tdb->header.v.free_buckets; b++) {
															+			b = find_free_head(tdb, b);
															+
															+			/* Non-empty list?  Try getting block. */
															+			if (b <= tdb->header.v.free_buckets) {
															+				/* Try getting one from list. */
															+				off = lock_and_alloc(tdb, b, size, actual);
															+				if (off == TDB_OFF_ERR)
															+					return TDB_OFF_ERR;
															+				if (off != 0)
															+					return off;
															+				/* Didn't work.  Try next bucket. */
															+			}
															+		}
															+
															+		/* Try another zone, at pseudo random.  Avoid duplicates by
															+		   using an odd step. */
															+		if (step == 0)
															+			step = ((quick_random(tdb)) % 65536) * 2 + 1;
															+		tdb->last_zone = (tdb->last_zone + step)
															+			% tdb->header.v.num_zones;
															+	}
															+	return 0;
															+}
														
 
															+
														
 
															+/* Encode key length, data length, padding and the top bits of the hash
															+ * into @rec.
															+ *
															+ * Returns 0 on success.  If the values cannot be read back unchanged
															+ * from the encoded record, sets tdb->ecode to TDB_ERR_IO, logs, and
															+ * returns -1. */
															+int set_header(struct tdb_context *tdb,
															+	       struct tdb_used_record *rec,
															+	       uint64_t keylen, uint64_t datalen,
															+	       uint64_t actuallen, uint64_t hash)
															+{
															+	const uint64_t padding = actuallen - (keylen + datalen);
															+	const uint64_t keybits = (fls64(keylen) + 1) / 2;
															+	uint64_t meta;
															+
															+	/* Use top bits of hash, so it's independent of hash table size. */
															+	meta = padding;
															+	meta |= (hash >> 53) << 32;
															+	meta |= keybits << 43;
															+	meta |= TDB_MAGIC << 48;
															+	rec->magic_and_meta = meta;
															+
															+	rec->key_and_data_len = keylen | (datalen << (keybits * 2));
															+
															+	/* Encoding can fail on big values: verify by decoding again. */
															+	if (rec_key_length(rec) == keylen
															+	    && rec_data_length(rec) == datalen
															+	    && rec_extra_padding(rec) == padding)
															+		return 0;
															+
															+	tdb->ecode = TDB_ERR_IO;
															+	tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
															+		 "Could not encode k=%llu,d=%llu,a=%llu\n",
															+		 (long long)keylen, (long long)datalen,
															+		 (long long)actuallen);
															+	return -1;
															+}
														
 
															+
														
 
															+/* Work out how many bytes to allocate for a key of @keylen and data of
															+ * @datalen: at least MIN_DATA_LEN, plus half the data length again when
															+ * @growing, rounded up to a multiple of sizeof(uint64_t). */
															+static tdb_len_t adjust_size(size_t keylen, size_t datalen, bool growing)
															+{
															+	tdb_len_t want = keylen + datalen;
															+
															+	want = (want < MIN_DATA_LEN) ? MIN_DATA_LEN : want;
															+
															+	/* Overallocate if this is coming from an enlarging store. */
															+	if (growing) {
															+		want += datalen / 2;
															+	}
															+
															+	/* Round to next uint64_t boundary. */
															+	want += sizeof(uint64_t) - 1ULL;
															+	return want & ~(sizeof(uint64_t) - 1ULL);
															+}
														
 
															+
														
 
															+/* Allocate a record for @keylen + @datalen bytes and write its header.
															+ *
															+ * Returns the record offset, 0 if no free record was found (if this
															+ * fails, try tdb_expand), or TDB_OFF_ERR on error.  On either error
															+ * after the record was obtained, the space is handed back to the free
															+ * lists so it is not leaked. */
															+tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
															+		uint64_t hash, bool growing)
															+{
															+	tdb_off_t off;
															+	tdb_len_t size, actual;
															+	struct tdb_used_record rec;
															+
															+	/* We don't want header to change during this! */
															+	assert(tdb->header_uptodate);
															+
															+	size = adjust_size(keylen, datalen, growing);
															+
															+	off = get_free(tdb, size, &actual);
															+	if (unlikely(off == TDB_OFF_ERR || off == 0))
															+		return off;
															+
															+	/* Some supergiant values can't be encoded. */
															+	if (set_header(tdb, &rec, keylen, datalen, actual, hash) != 0) {
															+		add_free_record(tdb, off, sizeof(rec) + actual);
															+		return TDB_OFF_ERR;
															+	}
															+
															+	if (tdb_write_convert(tdb, off, &rec, sizeof(rec)) != 0) {
															+		/* Best effort, same as the encoding failure above:
															+		 * give the space back rather than losing it. */
															+		add_free_record(tdb, off, sizeof(rec) + actual);
															+		return TDB_OFF_ERR;
															+	}
															+
															+	return off;
															+}
														
 
															+
														
 
															+/* Decide whether tdb_expand() should add another free bucket.
															+ *
															+ * If our buckets are already covering 1/8 of a zone, don't bother
															+ * (note: might become an 1/16 of a zone if we double zone size). */
															+static bool larger_buckets_might_help(struct tdb_context *tdb)
															+{
															+	const tdb_len_t eighth = (1ULL << tdb->header.v.zone_bits) / 8;
															+
															+	/* FIXME: Put stats in tdb_context or examine db itself! */
															+	/* It's fairly cheap to do as we expand database. */
															+	return eighth < MIN_DATA_LEN
															+		|| size_to_bucket(tdb, eighth) >= tdb->header.v.free_buckets;
															+}
														
 
															+
														
 
															+/* Placeholder policy hook used by tdb_expand(): currently always
															+ * reports that the zones are fine. */
															+static bool zones_happy(struct tdb_context *tdb)
															+{
															+	/* FIXME: look at distribution of zones. */
															+	(void)tdb;
															+	return true;
															+}
														
 
															+
														
 
															+/* Expand the database so that a record of @klen + @dlen bytes (sized via
															+ * adjust_size(), plus its header) can be allocated.
															+ *
															+ * Takes the all-record write lock for the duration.  If the header turns
															+ * out to have been updated by someone else, or the zones already extend
															+ * past the end of the file and filling that gap is enough, that is all
															+ * that happens.  Otherwise the file is extended, a new free bucket array
															+ * is placed at the start of the new space, every old free record is
															+ * re-queued into it, and the old array plus the remaining new space are
															+ * added as free records.
															+ *
															+ * Returns 0 on success, -1 on failure. */
															+int tdb_expand(struct tdb_context *tdb, tdb_len_t klen, tdb_len_t dlen,
															+	       bool growing)
															+{
															+	uint64_t new_num_buckets, new_num_zones, new_zone_bits;
															+	uint64_t old_num_total, i;
															+	tdb_len_t add, freebucket_size, needed;
															+	tdb_off_t off, old_free_off;
															+	const tdb_off_t *oldf;
															+	struct tdb_used_record fhdr;
															+
															+	/* We need room for the record header too. */
															+	needed = sizeof(struct tdb_used_record)
															+		+ adjust_size(klen, dlen, growing);
															+
															+	/* FIXME: this is overkill.  An expand lock? */
															+	if (tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false) == -1)
															+		return -1;
															+
															+	/* Someone may have expanded for us. */
															+	if (update_header(tdb))
															+		goto success;
															+
															+	/* Make sure we have the latest size. */
															+	tdb->methods->oob(tdb, tdb->map_size + 1, true);
															+
															+	/* Did we enlarge zones without enlarging file? */
															+	if (tdb->map_size < tdb->header.v.num_zones<<tdb->header.v.zone_bits) {
															+		add = (tdb->header.v.num_zones<<tdb->header.v.zone_bits)
															+			- tdb->map_size;
															+		/* Updates tdb->map_size. */
															+		if (tdb->methods->expand_file(tdb, tdb->map_size, add) == -1)
															+			goto fail;
															+		if (add_free_record(tdb, tdb->map_size - add, add) == -1)
															+			goto fail;
															+		if (add >= needed) {
															+			/* Allocate from this zone. */
															+			tdb->last_zone = zone_of(tdb, tdb->map_size - add);
															+			goto success;
															+		}
															+	}
															+
															+	/* Slow path.  Should we increase the number of buckets? */
															+	new_num_buckets = tdb->header.v.free_buckets;
															+	if (larger_buckets_might_help(tdb))
															+		new_num_buckets++;
															+
															+	/* Now we'll need room for the new free buckets, too.  Assume
															+	 * worst case (zones expand). */
															+	needed += sizeof(fhdr)
															+		+ ((tdb->header.v.num_zones+1)
															+		   * (new_num_buckets+1) * sizeof(tdb_off_t));
															+
															+	/* If we need less than one zone, and they're working well, just add
															+	 * another one.  (1ULL, not 1UL: zone_bits may exceed the width of
															+	 * unsigned long.) */
															+	if (needed < (1ULL<<tdb->header.v.zone_bits) && zones_happy(tdb)) {
															+		new_num_zones = tdb->header.v.num_zones+1;
															+		new_zone_bits = tdb->header.v.zone_bits;
															+		add = 1ULL << tdb->header.v.zone_bits;
															+	} else {
															+		/* Increase the zone size. */
															+		new_num_zones = tdb->header.v.num_zones;
															+		new_zone_bits = tdb->header.v.zone_bits+1;
															+		while ((new_num_zones << new_zone_bits) - tdb->map_size
															+		       < needed) {
															+			new_zone_bits++;
															+		}
															+
															+		/* We expand by enough zones to meet the need. */
															+		add = (needed + (1ULL << new_zone_bits)-1)
															+			& ~((1ULL << new_zone_bits)-1);
															+	}
															+
															+	/* Updates tdb->map_size. */
															+	if (tdb->methods->expand_file(tdb, tdb->map_size, add) == -1)
															+		goto fail;
															+
															+	/* Use first part as new free bucket array. */
															+	off = tdb->map_size - add;
															+	freebucket_size = new_num_zones
															+		* (new_num_buckets + 1) * sizeof(tdb_off_t);
															+
															+	/* Write header. */
															+	if (set_header(tdb, &fhdr, 0, freebucket_size, freebucket_size, 0))
															+		goto fail;
															+	if (tdb_write_convert(tdb, off, &fhdr, sizeof(fhdr)) == -1)
															+		goto fail;
															+
															+	/* Adjust off to point to start of buckets, add to be remainder. */
															+	add -= freebucket_size + sizeof(fhdr);
															+	off += sizeof(fhdr);
															+
															+	/* Access the old zones. */
															+	old_num_total = tdb->header.v.num_zones*(tdb->header.v.free_buckets+1);
															+	old_free_off = tdb->header.v.free_off;
															+	oldf = tdb_access_read(tdb, old_free_off,
															+			       old_num_total * sizeof(tdb_off_t));
															+	if (!oldf)
															+		goto fail;
															+
															+	/* Switch to using our new zone.  zero_out() takes a byte count,
															+	 * so clear the whole bucket array, not one byte per entry. */
															+	if (zero_out(tdb, off, freebucket_size) == -1)
															+		goto fail_release;
															+	tdb->header.v.free_off = off;
															+	tdb->header.v.num_zones = new_num_zones;
															+	/* Record the (possibly larger) zone size too, so zone_of()
															+	 * below sorts records by the new layout. */
															+	tdb->header.v.zone_bits = new_zone_bits;
															+	tdb->header.v.free_buckets = new_num_buckets;
															+
															+	/* FIXME: If zone size hasn't changed, can simply copy pointers. */
															+	/* FIXME: Coalesce? */
															+	for (i = 0; i < old_num_total; i++) {
															+		tdb_off_t next;
															+		struct tdb_free_record rec;
															+		tdb_off_t list;
															+
															+		for (off = oldf[i]; off; off = next) {
															+			if (tdb_read_convert(tdb, off, &rec, sizeof(rec)))
															+				goto fail_release;
															+
															+			list = zone_of(tdb, off)
															+				* (tdb->header.v.free_buckets+1)
															+				+ size_to_bucket(tdb, rec.data_len);
															+			/* Save the link before re-queueing this record. */
															+			next = rec.next;
															+
															+			if (enqueue_in_free(tdb, list, off, &rec) == -1)
															+				goto fail_release;
															+		}
															+	}
															+
															+	/* Free up the old free buckets.  As with the other
															+	 * add_free_record() calls, the length covers the record header
															+	 * as well as its contents. */
															+	old_free_off -= sizeof(fhdr);
															+	if (tdb_read_convert(tdb, old_free_off, &fhdr, sizeof(fhdr)) == -1)
															+		goto fail_release;
															+	if (add_free_record(tdb, old_free_off,
															+			    sizeof(fhdr)
															+			    + rec_data_length(&fhdr)
															+			    + rec_extra_padding(&fhdr)))
															+		goto fail_release;
															+
															+	/* Add the rest as a new free record. */
															+	if (add_free_record(tdb, tdb->map_size - add, add) == -1)
															+		goto fail_release;
															+
															+	/* Start allocating from where the new space is. */
															+	tdb->last_zone = zone_of(tdb, tdb->map_size - add);
															+	tdb_access_release(tdb, oldf);
															+success:
															+	tdb_allrecord_unlock(tdb, F_WRLCK);
															+	return 0;
															+
															+fail_release:
															+	tdb_access_release(tdb, oldf);
															+fail:
															+	tdb_allrecord_unlock(tdb, F_WRLCK);
															+	return -1;
															+}
														
--- a/ccan/tdb2/io.c
+++ b/ccan/tdb2/io.c
@@ -0,0 +1,662 @@
 
															+ /* 
														
 
															+   Unix SMB/CIFS implementation.
														
 
															+
														
 
															+   trivial database library
														
 
															+
														
 
															+   Copyright (C) Andrew Tridgell              1999-2005
														
 
															+   Copyright (C) Paul `Rusty' Russell		   2000
														
 
															+   Copyright (C) Jeremy Allison			   2000-2003
														
 
															+   Copyright (C) Rusty Russell			   2010
														
 
															+
														
 
															+     ** NOTE! The following LGPL license applies to the tdb
														
 
															+     ** library. This does NOT imply that all of Samba is released
														
 
															+     ** under the LGPL
														
 
															+
														
 
															+   This library is free software; you can redistribute it and/or
														
 
															+   modify it under the terms of the GNU Lesser General Public
														
 
															+   License as published by the Free Software Foundation; either
														
 
															+   version 3 of the License, or (at your option) any later version.
														
 
															+
														
 
															+   This library is distributed in the hope that it will be useful,
														
 
															+   but WITHOUT ANY WARRANTY; without even the implied warranty of
														
 
															+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
														
 
															+   Lesser General Public License for more details.
														
 
															+
														
 
															+   You should have received a copy of the GNU Lesser General Public
														
 
															+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
														
 
															+*/
														
 
															+#include "private.h"
														
 
															+#include <ccan/likely/likely.h>
														
 
															+
														
 
															+/* Drop the file mapping, if there is one, and clear tdb->map_ptr.
															+ * Does nothing for TDB_INTERNAL databases. */
															+void tdb_munmap(struct tdb_context *tdb)
															+{
															+	if ((tdb->flags & TDB_INTERNAL) || !tdb->map_ptr)
															+		return;
															+
															+	munmap(tdb->map_ptr, tdb->map_size);
															+	tdb->map_ptr = NULL;
															+}
														
 
															+
														
 
															+/* Map tdb->map_size bytes of the file (shared; writable unless the
															+ * database is read-only) and store the result in tdb->map_ptr.
															+ * Does nothing for TDB_INTERNAL or TDB_NOMMAP databases.  On failure
															+ * tdb->map_ptr is left NULL and a warning is logged. */
															+void tdb_mmap(struct tdb_context *tdb)
															+{
															+	int prot = PROT_READ;
															+	void *map;
															+
															+	if (tdb->flags & (TDB_INTERNAL | TDB_NOMMAP))
															+		return;
															+
															+	if (!tdb->read_only)
															+		prot |= PROT_WRITE;
															+
															+	map = mmap(NULL, tdb->map_size, prot, MAP_SHARED, tdb->fd, 0);
															+
															+	/*
															+	 * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
															+	 */
															+	if (map != MAP_FAILED) {
															+		tdb->map_ptr = map;
															+		return;
															+	}
															+
															+	tdb->map_ptr = NULL;
															+	tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv,
															+		 "tdb_mmap failed for size %lld (%s)\n", 
															+		 (long long)tdb->map_size, strerror(errno));
															+}
														
 
															+
														
 
															+/* Check for an out of bounds access.  @len is the minimum length needed
															+ * for the db.  If it exceeds the current map size, see whether the file
															+ * has grown (someone else may have expanded it) and, if so, remap at the
															+ * new size.
															+ *
															+ * Returns 0 if @len is within bounds, -1 otherwise.  When @probe is
															+ * false, an out-of-bounds result also sets tdb->ecode and logs. */
															+static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
															+{
															+	struct stat st;
															+
															+	/* Common case: already covered by what we know about. */
															+	if (len <= tdb->map_size)
															+		return 0;
															+
															+	/* Internal databases have no file to re-check. */
															+	if (tdb->flags & TDB_INTERNAL) {
															+		if (probe)
															+			return -1;
															+
															+		/* Ensure ecode is set for log fn. */
															+		tdb->ecode = TDB_ERR_IO;
															+		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
															+			 "tdb_oob len %lld beyond internal"
															+			 " malloc size %lld\n",
															+			 (long long)len,
															+			 (long long)tdb->map_size);
															+		return -1;
															+	}
															+
															+	if (fstat(tdb->fd, &st) == -1) {
															+		tdb->ecode = TDB_ERR_IO;
															+		return -1;
															+	}
															+
															+	if (st.st_size < (size_t)len) {
															+		if (probe)
															+			return -1;
															+
															+		/* Ensure ecode is set for log fn. */
															+		tdb->ecode = TDB_ERR_IO;
															+		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
															+			 "tdb_oob len %lld beyond eof at %lld\n",
															+			 (long long)len, (long long)st.st_size);
															+		return -1;
															+	}
															+
															+	/* Unmap, update size, remap */
															+	tdb_munmap(tdb);
															+	tdb->map_size = st.st_size;
															+	tdb_mmap(tdb);
															+	return 0;
															+}
														
 
															+
														
 
															+/* Return a pointer straight into the mapping for @len bytes at @off, or
															+ * NULL if direct access is not possible (no mapping, a transaction is
															+ * active, or the range is out of bounds). */
															+static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len)
															+{
															+	if (unlikely(!tdb->map_ptr))
															+		return NULL;
															+
															+	/* FIXME: We can do a subset of this! */
															+	if (tdb->transaction)
															+		return NULL;
															+
															+	if (unlikely(tdb_oob(tdb, off + len, true) == -1))
															+		return NULL;
															+
															+	/* tdb_oob() may have remapped the file, and tdb_mmap() leaves
															+	 * map_ptr NULL when that fails: never hand back NULL + off. */
															+	if (unlikely(!tdb->map_ptr))
															+		return NULL;
															+	return (char *)tdb->map_ptr + off;
															+}
														
 
															+
														
 
															+/* Either make a copy into pad and return that, or return ptr into mmap.
															+ *
															+ * The direct pointer is only tried when TDB_CONVERT is not set; otherwise
															+ * (or if that fails) @len bytes at @off are read into @pad with pread()
															+ * and passed through tdb_convert().  Returns NULL on an out-of-bounds
															+ * range or a failed/short read. */
															+/* Note: pad has to be a real object, so we can't get here if len
															+ * overflows size_t */
															+/* FIXME: Transaction */
															+void *tdb_get(struct tdb_context *tdb, tdb_off_t off, void *pad, size_t len)
															+{
															+	ssize_t nread;
															+
															+	if (likely(!(tdb->flags & TDB_CONVERT))) {
															+		void *direct = tdb_direct(tdb, off, len);
															+
															+		if (direct)
															+			return direct;
															+	}
															+
															+	if (unlikely(tdb_oob(tdb, off + len, false) == -1))
															+		return NULL;
															+
															+	nread = pread(tdb->fd, pad, len, off);
															+	if (nread == (ssize_t)len)
															+		return tdb_convert(tdb, pad, len);
															+
															+	/* Ensure ecode is set for log fn. */
															+	tdb->ecode = TDB_ERR_IO;
															+	tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
															+		 "tdb_read failed at %llu "
															+		 "len=%lld ret=%lld (%s) map_size=%lld\n",
															+		 (long long)off, (long long)len,
															+		 (long long)nread, strerror(errno),
															+		 (long long)tdb->map_size);
															+	return NULL;
															+}
														
 
															+
														
 
															+/* Endian conversion: we only ever deal with 8 byte quantities.
															+ * When TDB_CONVERT is set, byte-swaps each complete 64-bit word of
															+ * @buf in place (size / 8 of them).  Always returns @buf. */
															+void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size)
															+{
															+	if (unlikely((tdb->flags & TDB_CONVERT))) {
															+		uint64_t *word = (uint64_t *)buf;
															+		uint64_t remaining;
															+
															+		for (remaining = size / 8; remaining != 0; remaining--) {
															+			*word = bswap_64(*word);
															+			word++;
															+		}
															+	}
															+	return buf;
															+}
														
 
															+
														
 
															+/* Return first non-zero offset in num offset array, or num.
															+ * The array is scanned in place when a direct pointer is available,
															+ * otherwise from a temporary copy which is freed before returning.
															+ * If that copy cannot be read, num is returned. */
															+/* FIXME: Return the off? */
															+uint64_t tdb_find_nonzero_off(struct tdb_context *tdb, tdb_off_t off,
															+			      uint64_t num)
															+{
															+	uint64_t *val, idx = 0;
															+	bool heap_copy = false;
															+
															+	val = tdb_direct(tdb, off, num * sizeof(tdb_off_t));
															+	if (unlikely(!val)) {
															+		val = tdb_alloc_read(tdb, off, num * sizeof(tdb_off_t));
															+		if (!val)
															+			return num;
															+		heap_copy = true;
															+	}
															+
															+	while (idx < num && !val[idx])
															+		idx++;
															+
															+	if (unlikely(heap_copy))
															+		free(val);
															+	return idx;
															+}
														
 
															+
														
 
															+/* Return first zero offset in num offset array, or num.
															+ * The array is scanned in place when a direct pointer is available,
															+ * otherwise from a temporary copy which is freed before returning.
															+ * If that copy cannot be read, num is returned. */
															+uint64_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off,
															+			   uint64_t num)
															+{
															+	uint64_t *val, idx = 0;
															+	bool heap_copy = false;
															+
															+	val = tdb_direct(tdb, off, num * sizeof(tdb_off_t));
															+	if (unlikely(!val)) {
															+		val = tdb_alloc_read(tdb, off, num * sizeof(tdb_off_t));
															+		if (!val)
															+			return num;
															+		heap_copy = true;
															+	}
															+
															+	while (idx < num && val[idx])
															+		idx++;
															+
															+	if (unlikely(heap_copy))
															+		free(val);
															+	return idx;
															+}
														
 
															+
														
 
															+static int fill(struct tdb_context *tdb,
														
 
															+		const void *buf, size_t size,
														
 
															+		tdb_off_t off, tdb_len_t len)
														
 
															+{
														
 
															+	while (len) {
														
 
															+		size_t n = len > size ? size : len;
														
 
															+
														
 
															+		if (!tdb_pwrite_all(tdb->fd, buf, n, off)) {
														
 
															+			tdb->ecode = TDB_ERR_IO;
														
 
															+			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
														
 
															+				 "fill write failed: giving up!\n");
														
 
															+			return -1;
														
 
															+		}
														
 
															+		len -= n;
														
 
															+		off += n;
														
 
															+	}
														
 
															+	return 0;
														
 
															+}
														
 
															+
														
 
															+int zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len)
														
 
															+{
														
 
															+	void *p = tdb_direct(tdb, off, len);
														
 
															+	if (p) {
														
 
															+		memset(p, 0, len);
														
 
															+		return 0;
														
 
															+	} else {
														
 
															+		char buf[8192] = { 0 };
														
 
															+		return fill(tdb, buf, sizeof(buf), len, off);
														
 
															+	}
														
 
															+}
														
 
															+
														
 
															+tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off)
														
 
															+{
														
 
															+	tdb_off_t pad, *ret;
														
 
															+
														
 
															+	ret = tdb_get(tdb, off, &pad, sizeof(ret));
														
 
															+	if (!ret) {
														
 
															+		return TDB_OFF_ERR;
														
 
															+	}
														
 
															+	return *ret;
														
 
															+}
														
 
															+
														
 
															+/* Even on files, we can get partial writes due to signals. */
														
 
															+bool tdb_pwrite_all(int fd, const void *buf, size_t len, tdb_off_t off)
														
 
															+{
														
 
															+	while (len) {
														
 
															+		size_t ret;
														
 
															+		ret = pwrite(fd, buf, len, off);
														
 
															+		if (ret < 0)
														
 
															+			return false;
														
 
															+		if (ret == 0) {
														
 
															+			errno = ENOSPC;
														
 
															+			return false;
														
 
															+		}
														
 
															+		buf += ret;
														
 
															+		off += ret;
														
 
															+		len -= ret;
														
 
															+	}
														
 
															+	return true;
														
 
															+}
														
 
															+
														
 
															+/* write a lump of data at a specified offset */
														
 
															+static int tdb_write(struct tdb_context *tdb, tdb_off_t off, 
														
 
															+		     const void *buf, tdb_len_t len)
														
 
															+{
														
 
															+	if (len == 0) {
														
 
															+		return 0;
														
 
															+	}
														
 
															+
														
 
															+	if (tdb->read_only) {
														
 
															+		tdb->ecode = TDB_ERR_RDONLY;
														
 
															+		return -1;
														
 
															+	}
														
 
															+
														
 
															+	if (tdb->methods->oob(tdb, off + len, 0) != 0)
														
 
															+		return -1;
														
 
															+
														
 
															+	if (tdb->map_ptr) {
														
 
															+		memcpy(off + (char *)tdb->map_ptr, buf, len);
														
 
															+	} else {
														
 
															+		if (!tdb_pwrite_all(tdb->fd, buf, len, off)) {
														
 
															+			tdb->ecode = TDB_ERR_IO;
														
 
															+			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
														
 
															+				 "tdb_write failed at %llu len=%llu (%s)\n",
														
 
															+				 off, len, strerror(errno));
														
 
															+			return -1;
														
 
															+		}
														
 
															+	}
														
 
															+	return 0;
														
 
															+}
														
 
															+
														
 
															+/* read a lump of data at a specified offset */
														
 
															+static int tdb_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
														
 
															+		    tdb_len_t len)
														
 
															+{
														
 
															+	if (tdb->methods->oob(tdb, off + len, 0) != 0) {
														
 
															+		return -1;
														
 
															+	}
														
 
															+
														
 
															+	if (tdb->map_ptr) {
														
 
															+		memcpy(buf, off + (char *)tdb->map_ptr, len);
														
 
															+	} else {
														
 
															+		ssize_t ret = pread(tdb->fd, buf, len, off);
														
 
															+		if (ret != (ssize_t)len) {
														
 
															+			/* Ensure ecode is set for log fn. */
														
 
															+			tdb->ecode = TDB_ERR_IO;
														
 
															+			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
														
 
															+				 "tdb_read failed at %lld "
														
 
															+				 "len=%lld ret=%lld (%s) map_size=%lld\n",
														
 
															+				 (long long)off, (long long)len,
														
 
															+				 (long long)ret, strerror(errno),
														
 
															+				 (long long)tdb->map_size);
														
 
															+			return -1;
														
 
															+		}
														
 
															+	}
														
 
															+	return 0;
														
 
															+}
														
 
															+
														
 
															+int tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
														
 
															+		      void *rec, size_t len)
														
 
															+{
														
 
															+	return tdb->methods->write(tdb, off, tdb_convert(tdb, rec, len), len);
														
 
															+}
														
 
															+
														
 
															+int tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
														
 
															+		      void *rec, size_t len)
														
 
															+{
														
 
															+	int ret = tdb->methods->read(tdb, off, rec, len);
														
 
															+	tdb_convert(tdb, rec, len);
														
 
															+	return ret;
														
 
															+}
														
 
															+
														
 
															+int tdb_write_off(struct tdb_context *tdb, tdb_off_t off, tdb_off_t val)
														
 
															+{
														
 
															+	return tdb_write_convert(tdb, off, &val, sizeof(val));
														
 
															+}
														
 
															+
														
 
															+/* read a lump of data, allocating the space for it */
														
 
															+void *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len)
														
 
															+{
														
 
															+	void *buf;
														
 
															+
														
 
															+	/* some systems don't like zero length malloc */
														
 
															+	buf = malloc(len ? len : 1);
														
 
															+	if (unlikely(!buf)) {
														
 
															+		tdb->ecode = TDB_ERR_OOM;
														
 
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+			 "tdb_alloc_read malloc failed len=%lld\n",
														
 
															+			 (long long)len);
														
 
															+	} else if (unlikely(tdb->methods->read(tdb, offset, buf, len))) {
														
 
															+		free(buf);
														
 
															+		buf = NULL;
														
 
															+	}
														
 
															+	return buf;
														
 
															+}
														
 
															+
														
 
															+uint64_t hash_record(struct tdb_context *tdb, tdb_off_t off)
														
 
															+{
														
 
															+	struct tdb_used_record pad, *r;
														
 
															+	void *key;
														
 
															+	uint64_t klen, hash;
														
 
															+
														
 
															+	r = tdb_get(tdb, off, &pad, sizeof(*r));
														
 
															+	if (!r)
														
 
															+		/* FIXME */
														
 
															+		return 0;
														
 
															+
														
 
															+	klen = rec_key_length(r);
														
 
															+	key = tdb_direct(tdb, off + sizeof(*r), klen);
														
 
															+	if (likely(key))
														
 
															+		return tdb_hash(tdb, key, klen);
														
 
															+
														
 
															+	key = tdb_alloc_read(tdb, off + sizeof(*r), klen);
														
 
															+	if (unlikely(!key))
														
 
															+		return 0;
														
 
															+	hash = tdb_hash(tdb, key, klen);
														
 
															+	free(key);
														
 
															+	return hash;
														
 
															+}
														
 
															+
														
 
															+/* Give a piece of tdb data to a parser */
														
 
															+int tdb_parse_data(struct tdb_context *tdb, TDB_DATA key,
														
 
															+		   tdb_off_t offset, tdb_len_t len,
														
 
															+		   int (*parser)(TDB_DATA key, TDB_DATA data,
														
 
															+				 void *private_data),
														
 
															+		   void *private_data)
														
 
															+{
														
 
															+	TDB_DATA data;
														
 
															+	int result;
														
 
															+	bool allocated = false;
														
 
															+
														
 
															+	data.dsize = len;
														
 
															+	data.dptr = tdb_direct(tdb, offset, len);
														
 
															+	if (unlikely(!data.dptr)) {
														
 
															+		if (!(data.dptr = tdb_alloc_read(tdb, offset, len))) {
														
 
															+			return -1;
														
 
															+		}
														
 
															+		allocated = true;
														
 
															+	}
														
 
															+	result = parser(key, data, private_data);
														
 
															+	if (unlikely(allocated))
														
 
															+		free(data.dptr);
														
 
															+	return result;
														
 
															+}
														
 
															+
														
 
															+/* expand a file.  we prefer to use ftruncate, as that is what posix
														
 
															+  says to use for mmap expansion */
														
 
															+static int tdb_expand_file(struct tdb_context *tdb,
														
 
															+			   tdb_len_t size, tdb_len_t addition)
														
 
															+{
														
 
															+	char buf[8192];
														
 
															+
														
 
															+	if (tdb->read_only) {
														
 
															+		tdb->ecode = TDB_ERR_RDONLY;
														
 
															+		return -1;
														
 
															+	}
														
 
															+
														
 
															+	/* If this fails, we try to fill anyway. */
														
 
															+	if (ftruncate(tdb->fd, size+addition))
														
 
															+		;
														
 
															+
														
 
															+	/* now fill the file with something. This ensures that the
														
 
															+	   file isn't sparse, which would be very bad if we ran out of
														
 
															+	   disk. This must be done with write, not via mmap */
														
 
															+	memset(buf, 0x43, sizeof(buf));
														
 
															+	return fill(tdb, buf, sizeof(buf), addition, size);
														
 
															+}
														
 
															+
														
 
															+const void *tdb_access_read(struct tdb_context *tdb,
														
 
															+			    tdb_off_t off, tdb_len_t len)
														
 
															+{
														
 
															+	const void *ret = tdb_direct(tdb, off, len);
														
 
															+
														
 
															+	if (!ret)
														
 
															+		ret = tdb_alloc_read(tdb, off, len);
														
 
															+	return ret;
														
 
															+}
														
 
															+
														
 
															+void tdb_access_release(struct tdb_context *tdb, const void *p)
														
 
															+{
														
 
															+	if (!tdb->map_ptr
														
 
															+	    || (char *)p < (char *)tdb->map_ptr
														
 
															+	    || (char *)p >= (char *)tdb->map_ptr + tdb->map_size)
														
 
															+		free((void *)p);
														
 
															+}
														
 
															+
														
 
															+#if 0
														
 
															+/* write a lump of data at a specified offset */
														
 
															+static int tdb_write(struct tdb_context *tdb, tdb_off_t off, 
														
 
															+		     const void *buf, tdb_len_t len)
														
 
															+{
														
 
															+	if (len == 0) {
														
 
															+		return 0;
														
 
															+	}
														
 
															+
														
 
															+	if (tdb->read_only || tdb->traverse_read) {
														
 
															+		tdb->ecode = TDB_ERR_RDONLY;
														
 
															+		return -1;
														
 
															+	}
														
 
															+
														
 
															+	if (tdb->methods->tdb_oob(tdb, off + len, 0) != 0)
														
 
															+		return -1;
														
 
															+
														
 
															+	if (tdb->map_ptr) {
														
 
															+		memcpy(off + (char *)tdb->map_ptr, buf, len);
														
 
															+	} else {
														
 
															+		ssize_t written = pwrite(tdb->fd, buf, len, off);
														
 
															+		if ((written != (ssize_t)len) && (written != -1)) {
														
 
															+			/* try once more */
														
 
															+			tdb->ecode = TDB_ERR_IO;
														
 
															+			TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_write: wrote only "
														
 
															+				 "%d of %d bytes at %d, trying once more\n",
														
 
															+				 (int)written, len, off));
														
 
															+			written = pwrite(tdb->fd, (const char *)buf+written,
														
 
															+					 len-written,
														
 
															+					 off+written);
														
 
															+		}
														
 
															+		if (written == -1) {
														
 
															+			/* Ensure ecode is set for log fn. */
														
 
															+			tdb->ecode = TDB_ERR_IO;
														
 
															+			TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_write failed at %d "
														
 
															+				 "len=%d (%s)\n", off, len, strerror(errno)));
														
 
															+			return -1;
														
 
															+		} else if (written != (ssize_t)len) {
														
 
															+			tdb->ecode = TDB_ERR_IO;
														
 
															+			TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_write: failed to "
														
 
															+				 "write %d bytes at %d in two attempts\n",
														
 
															+				 len, off));
														
 
															+			return -1;
														
 
															+		}
														
 
															+	}
														
 
															+	return 0;
														
 
															+}
														
 
															+
														
 
															+
														
 
															+
														
 
															+/*
														
 
															+  do an unlocked scan of the hash table heads to find the next non-zero head. The value
														
 
															+  will then be confirmed with the lock held
														
 
															+*/		
														
 
															+static void tdb_next_hash_chain(struct tdb_context *tdb, uint32_t *chain)
														
 
															+{
														
 
															+	uint32_t h = *chain;
														
 
															+	if (tdb->map_ptr) {
														
 
															+		for (;h < tdb->header.hash_size;h++) {
														
 
															+			if (0 != *(uint32_t *)(TDB_HASH_TOP(h) + (unsigned char *)tdb->map_ptr)) {
														
 
															+				break;
														
 
															+			}
														
 
															+		}
														
 
															+	} else {
														
 
															+		uint32_t off=0;
														
 
															+		for (;h < tdb->header.hash_size;h++) {
														
 
															+			if (tdb_ofs_read(tdb, TDB_HASH_TOP(h), &off) != 0 || off != 0) {
														
 
															+				break;
														
 
															+			}
														
 
															+		}
														
 
															+	}
														
 
															+	(*chain) = h;
														
 
															+}
														
 
															+
														
 
															+
														
 
															+/* expand the database by expanding the underlying file and doing the
														
 
															+   mmap again if necessary */
														
 
															+int tdb_expand(struct tdb_context *tdb)
														
 
															+{
														
 
															+	struct tdb_record rec;
														
 
															+	tdb_off_t offset, new_size;	
														
 
															+
														
 
															+	/* We have to lock every hash bucket and every free list. */
														
 
															+	do {
														
 
															+		
														
 
															+
														
 
															+	if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
														
 
															+		TDB_LOG((tdb, TDB_DEBUG_ERROR, "lock failed in tdb_expand\n"));
														
 
															+		return -1;
														
 
															+	}
														
 
															+
														
 
															+	/* must know about any previous expansions by another process */
														
 
															+	tdb->methods->tdb_oob(tdb, tdb->map_size + 1, 1);
														
 
															+
														
 
															+	/* always make room for at least 100 more records, and at
														
 
															+           least 25% more space. Round the database up to a multiple
														
 
															+           of the page size */
														
 
															+	new_size = MAX(tdb->map_size + size*100, tdb->map_size * 1.25);
														
 
															+	size = TDB_ALIGN(new_size, tdb->page_size) - tdb->map_size;
														
 
															+
														
 
															+	if (!(tdb->flags & TDB_INTERNAL))
														
 
															+		tdb_munmap(tdb);
														
 
															+
														
 
															+	/*
														
 
															+	 * We must ensure the file is unmapped before doing this
														
 
															+	 * to ensure consistency with systems like OpenBSD where
														
 
															+	 * writes and mmaps are not consistent.
														
 
															+	 */
														
 
															+
														
 
															+	/* expand the file itself */
														
 
															+	if (!(tdb->flags & TDB_INTERNAL)) {
														
 
															+		if (tdb->methods->tdb_expand_file(tdb, tdb->map_size, size) != 0)
														
 
															+			goto fail;
														
 
															+	}
														
 
															+
														
 
															+	tdb->map_size += size;
														
 
															+
														
 
															+	if (tdb->flags & TDB_INTERNAL) {
														
 
															+		char *new_map_ptr = (char *)realloc(tdb->map_ptr,
														
 
															+						    tdb->map_size);
														
 
															+		if (!new_map_ptr) {
														
 
															+			tdb->map_size -= size;
														
 
															+			goto fail;
														
 
															+		}
														
 
															+		tdb->map_ptr = new_map_ptr;
														
 
															+	} else {
														
 
															+		/*
														
 
															+		 * We must ensure the file is remapped before adding the space
														
 
															+		 * to ensure consistency with systems like OpenBSD where
														
 
															+		 * writes and mmaps are not consistent.
														
 
															+		 */
														
 
															+
														
 
															+		/* We're ok if the mmap fails as we'll fallback to read/write */
														
 
															+		tdb_mmap(tdb);
														
 
															+	}
														
 
															+
														
 
															+	/* form a new freelist record */
														
 
															+	memset(&rec,'\0',sizeof(rec));
														
 
															+	rec.rec_len = size - sizeof(rec);
														
 
															+
														
 
															+	/* link it into the free list */
														
 
															+	offset = tdb->map_size - size;
														
 
															+	if (tdb_free(tdb, offset, &rec) == -1)
														
 
															+		goto fail;
														
 
															+
														
 
															+	tdb_unlock(tdb, -1, F_WRLCK);
														
 
															+	return 0;
														
 
															+ fail:
														
 
															+	tdb_unlock(tdb, -1, F_WRLCK);
														
 
															+	return -1;
														
 
															+}
														
 
															+
														
 
															+/* read/write a tdb_off_t */
														
 
															+int tdb_ofs_read(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d)
														
 
															+{
														
 
															+	return tdb->methods->tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
														
 
															+}
														
 
															+
														
 
															+int tdb_ofs_write(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d)
														
 
															+{
														
 
															+	tdb_off_t off = *d;
														
 
															+	return tdb->methods->tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
														
 
															+}
														
 
															+
														
 
															+
														
 
															+/* read/write a record */
														
 
															+int tdb_rec_read(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec)
														
 
															+{
														
 
															+	if (tdb->methods->tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
														
 
															+		return -1;
														
 
															+	if (TDB_BAD_MAGIC(rec)) {
														
 
															+		/* Ensure ecode is set for log fn. */
														
 
															+		tdb->ecode = TDB_ERR_CORRUPT;
														
 
															+		TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset));
														
 
															+		return -1;
														
 
															+	}
														
 
															+	return tdb->methods->tdb_oob(tdb, rec->next+sizeof(*rec), 0);
														
 
															+}
														
 
															+
														
 
															+int tdb_rec_write(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec)
														
 
															+{
														
 
															+	struct tdb_record r = *rec;
														
 
															+	return tdb->methods->tdb_write(tdb, offset, CONVERT(r), sizeof(r));
														
 
															+}
														
 
															+#endif
														
 
															+
														
 
															+static const struct tdb_methods io_methods = {
														
 
															+	tdb_read,
														
 
															+	tdb_write,
														
 
															+	tdb_oob,
														
 
															+	tdb_expand_file,
														
 
															+};
														
 
															+
														
 
															+/*
														
 
															+  initialise the default methods table
														
 
															+*/
														
 
															+void tdb_io_init(struct tdb_context *tdb)
														
 
															+{
														
 
															+	tdb->methods = &io_methods;
														
 
															+}
														
--- a/ccan/tdb2/lock.c
+++ b/ccan/tdb2/lock.c
@@ -0,0 +1,848 @@
 
															+ /* 
														
 
															+   Unix SMB/CIFS implementation.
														
 
															+
														
 
															+   trivial database library
														
 
															+
														
 
															+   Copyright (C) Andrew Tridgell              1999-2005
														
 
															+   Copyright (C) Paul `Rusty' Russell		   2000
														
 
															+   Copyright (C) Jeremy Allison			   2000-2003
														
 
															+
														
 
															+     ** NOTE! The following LGPL license applies to the tdb
														
 
															+     ** library. This does NOT imply that all of Samba is released
														
 
															+     ** under the LGPL
														
 
															+
														
 
															+   This library is free software; you can redistribute it and/or
														
 
															+   modify it under the terms of the GNU Lesser General Public
														
 
															+   License as published by the Free Software Foundation; either
														
 
															+   version 3 of the License, or (at your option) any later version.
														
 
															+
														
 
															+   This library is distributed in the hope that it will be useful,
														
 
															+   but WITHOUT ANY WARRANTY; without even the implied warranty of
														
 
															+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
														
 
															+   Lesser General Public License for more details.
														
 
															+
														
 
															+   You should have received a copy of the GNU Lesser General Public
														
 
															+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
														
 
															+*/
														
 
															+
														
 
															+#include "private.h"
														
 
															+
														
 
															+/* Ask the kernel for a byte-range lock on tdb->fd.
															+ *
															+ * rw is stored as the flock l_type; off and len describe the range, measured
															+ * from the start of the file.  With waitflag set the request uses F_SETLKW,
															+ * otherwise F_SETLK.  Returns fcntl()'s result unchanged.
															+ */
															+static int fcntl_lock(struct tdb_context *tdb,
															+		      int rw, off_t off, off_t len, bool waitflag)
															+{
															+	struct flock request;
															+	int cmd = waitflag ? F_SETLKW : F_SETLK;
															+
															+	request.l_whence = SEEK_SET;
															+	request.l_start = off;
															+	request.l_len = len;
															+	request.l_type = rw;
															+	request.l_pid = 0;
															+
															+	return fcntl(tdb->fd, cmd, &request);
															+}
														
 
															+
														
 
															+/* Release the byte range [off, off+len) on tdb->fd with an F_UNLCK request.
															+ *
															+ * rw is only consulted by the disabled /proc/locks cross-check below.
															+ * Returns fcntl()'s result unchanged.
															+ */
															+static int fcntl_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len)
															+{
															+	struct flock request;
															+#if 0 /* Check they matched up locks and unlocks correctly. */
															+	char line[80];
															+	FILE *locks;
															+	bool found = false;
															+
															+	locks = fopen("/proc/locks", "r");
															+
															+	while (fgets(line, 80, locks)) {
															+		char *p;
															+		int type, start, l;
															+
															+		/* eg. 1: FLOCK  ADVISORY  WRITE 2440 08:01:2180826 0 EOF */
															+		p = strchr(line, ':') + 1;
															+		if (strncmp(p, " POSIX  ADVISORY  ", strlen(" POSIX  ADVISORY  ")))
															+			continue;
															+		p += strlen(" FLOCK  ADVISORY  ");
															+		if (strncmp(p, "READ  ", strlen("READ  ")) == 0)
															+			type = F_RDLCK;
															+		else if (strncmp(p, "WRITE ", strlen("WRITE ")) == 0)
															+			type = F_WRLCK;
															+		else
															+			abort();
															+		p += 6;
															+		if (atoi(p) != getpid())
															+			continue;
															+		p = strchr(strchr(p, ' ') + 1, ' ') + 1;
															+		start = atoi(p);
															+		p = strchr(p, ' ') + 1;
															+		if (strncmp(p, "EOF", 3) == 0)
															+			l = 0;
															+		else
															+			l = atoi(p) - start + 1;
															+
															+		if (off == start) {
															+			if (len != l) {
															+				fprintf(stderr, "Len %u should be %u: %s",
															+					(int)len, l, line);
															+				abort();
															+			}
															+			if (type != rw) {
															+				fprintf(stderr, "Type %s wrong: %s",
															+					rw == F_RDLCK ? "READ" : "WRITE", line);
															+				abort();
															+			}
															+			found = true;
															+			break;
															+		}
															+	}
															+
															+	if (!found) {
															+		fprintf(stderr, "Unlock on %u@%u not found!\n",
															+			(int)off, (int)len);
															+		abort();
															+	}
															+
															+	fclose(locks);
															+#endif
															+
															+	request.l_whence = SEEK_SET;
															+	request.l_start = off;
															+	request.l_len = len;
															+	request.l_type = F_UNLCK;
															+	request.l_pid = 0;
															+
															+	return fcntl(tdb->fd, F_SETLKW, &request);
															+}
														
 
															+
														
 
															+/* A byte range locking function - returns 0 on success, -1 on failure.
															+ *
															+ * Locks `len` bytes starting at `offset` with fcntl lock type `rw_type`.
															+ * Note that a len of zero means lock to end of file.
															+ *
															+ * flags: TDB_LOCK_WAIT makes the request blocking; TDB_LOCK_PROBE suppresses
															+ * the failure log message.
															+ *
															+ * Does nothing and returns 0 when the TDB_NOLOCK flag is set.  On failure
															+ * tdb->ecode is set to TDB_ERR_RDONLY (write lock on a read-only tdb),
															+ * TDB_ERR_IO (range does not fit in size_t) or TDB_ERR_LOCK (fcntl failed).
															+ */
															+static int tdb_brlock(struct tdb_context *tdb,
															+		      int rw_type, tdb_off_t offset, tdb_off_t len,
															+		      enum tdb_lock_flags flags)
															+{
															+	int ret;
															+
															+	if (tdb->flags & TDB_NOLOCK) {
															+		return 0;
															+	}
															+
															+	if (rw_type == F_WRLCK && tdb->read_only) {
															+		tdb->ecode = TDB_ERR_RDONLY;
															+		return -1;
															+	}
															+
															+	/* A 32 bit system cannot open a 64-bit file, but it could have
															+	 * expanded since then: check here. */
															+	if ((size_t)(offset + len) != offset + len) {
															+		tdb->ecode = TDB_ERR_IO;
															+		/* %llu takes an unsigned long long, so cast to exactly that. */
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
															+			 "tdb_brlock: lock on giant offset %llu\n",
															+			 (unsigned long long)(offset + len));
															+		return -1;
															+	}
															+
															+	/* Retry if a signal interrupted the request. */
															+	do {
															+		ret = fcntl_lock(tdb, rw_type, offset, len,
															+				 flags & TDB_LOCK_WAIT);
															+	} while (ret == -1 && errno == EINTR);
															+
															+	if (ret == -1) {
															+		tdb->ecode = TDB_ERR_LOCK;
															+		/* Generic lock error. errno set by fcntl.
															+		 * EAGAIN is an expected return from non-blocking
															+		 * locks. */
															+		if (!(flags & TDB_LOCK_PROBE) && errno != EAGAIN) {
															+			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
															+				 "tdb_brlock failed (fd=%d) at"
															+				 " offset %llu rw_type=%d flags=%d len=%llu\n",
															+				 tdb->fd, (unsigned long long)offset, rw_type,
															+				 flags, (unsigned long long)len);
															+		}
															+		return -1;
															+	}
															+	return 0;
															+}
														
 
															+
														
 
															+/* Release a byte range previously taken with tdb_brlock().
															+ *
															+ * Returns 0 immediately when TDB_NOLOCK is set; otherwise returns the result
															+ * of the underlying unlock (retried while it fails with EINTR), logging a
															+ * trace message if it fails.
															+ */
															+static int tdb_brunlock(struct tdb_context *tdb,
															+			int rw_type, tdb_off_t offset, size_t len)
															+{
															+	int ret;
															+
															+	if (tdb->flags & TDB_NOLOCK) {
															+		return 0;
															+	}
															+
															+	do {
															+		ret = fcntl_unlock(tdb, rw_type, offset, len);
															+	} while (ret == -1 && errno == EINTR);
															+
															+	if (ret == -1) {
															+		/* %llu takes an unsigned long long, so cast to exactly that. */
															+		tdb->log(tdb, TDB_DEBUG_TRACE, tdb->log_priv,
															+			 "tdb_brunlock failed (fd=%d) at offset %llu"
															+			 " rw_type=%d len=%llu\n",
															+			 tdb->fd, (unsigned long long)offset, rw_type,
															+			 (unsigned long long)len);
															+	}
															+	return ret;
															+}
														
 
															+
														
 
															+#if 0
														
 
															+/*
															+  Upgrade the all-record read lock to a write lock.  Some OSes (such as
															+  solaris) have overly conservative deadlock detection and report
															+  EDEADLK even though progress is possible, so on EDEADLK we retry (up
															+  to 1000 attempts) with a minimal sleep in between.
															+
															+  Returns 0 on success, -1 on failure.
															+*/
															+int tdb_allrecord_upgrade(struct tdb_context *tdb)
															+{
															+	int attempts;
															+
															+	if (tdb->allrecord_lock.count != 1) {
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
															+			 "tdb_allrecord_upgrade failed: count %u too high\n",
															+			 tdb->allrecord_lock.count);
															+		return -1;
															+	}
															+
															+	if (tdb->allrecord_lock.off != 1) {
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
															+			 "tdb_allrecord_upgrade failed: already upgraded?\n");
															+		return -1;
															+	}
															+
															+	for (attempts = 1000; attempts > 0; attempts--) {
															+		struct timeval tv;
															+		int rc;
															+
															+		rc = tdb_brlock(tdb, F_WRLCK,
															+				TDB_HASH_LOCK_START
															+				+ (1ULL << tdb->header.v.hash_bits), 0,
															+				TDB_LOCK_WAIT|TDB_LOCK_PROBE);
															+		if (rc == 0) {
															+			tdb->allrecord_lock.ltype = F_WRLCK;
															+			tdb->allrecord_lock.off = 0;
															+			return 0;
															+		}
															+
															+		/* Only a (possibly spurious) deadlock report is retried. */
															+		if (errno != EDEADLK) {
															+			break;
															+		}
															+
															+		/* sleep for as short a time as we can - more portable than usleep() */
															+		tv.tv_sec = 0;
															+		tv.tv_usec = 1;
															+		select(0, NULL, NULL, NULL, &tv);
															+	}
															+
															+	tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv,
															+		 "tdb_allrecord_upgrade failed\n");
															+	return -1;
															+}
														
 
															+#endif
														
 
															+
														
 
															+/* Linear search of tdb->lockrecs for the entry whose off equals `offset`.
															+ * Returns a pointer into the array, or NULL if there is no such entry. */
															+static struct tdb_lock_type *find_nestlock(struct tdb_context *tdb,
															+					   tdb_off_t offset)
															+{
															+	unsigned int idx = 0;
															+
															+	while (idx < tdb->num_lockrecs) {
															+		struct tdb_lock_type *candidate = &tdb->lockrecs[idx];
															+
															+		if (candidate->off == offset) {
															+			return candidate;
															+		}
															+		idx++;
															+	}
															+	return NULL;
															+}
														
 
															+
														
 
															+/* Lock a single byte at `offset` in the database, with nesting.
															+ *
															+ * fcntl locks do not stack, so only the first request for an offset reaches
															+ * the kernel; later requests just bump an in-memory count in tdb->lockrecs.
															+ *
															+ * Returns 0 on success (or when TDB_NOLOCK is set), -1 on failure with
															+ * tdb->ecode set: TDB_ERR_LOCK for an out-of-range offset, TDB_ERR_OOM if
															+ * the lock array cannot grow (errno is set to ENOMEM), or whatever
															+ * tdb_brlock() set.
															+ */
															+static int tdb_nest_lock(struct tdb_context *tdb, tdb_off_t offset, int ltype,
															+			 enum tdb_lock_flags flags)
															+{
															+	struct tdb_lock_type *new_lck;
															+
															+	if (offset >= TDB_HASH_LOCK_START + (1ULL << tdb->header.v.hash_bits)
															+	    + (tdb->header.v.num_zones * (tdb->header.v.free_buckets+1))) {
															+		tdb->ecode = TDB_ERR_LOCK;
															+		/* %llu takes an unsigned long long, so cast to exactly that. */
															+		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
															+			 "tdb_lock: invalid offset %llu for ltype=%d\n",
															+			 (unsigned long long)offset, ltype);
															+		return -1;
															+	}
															+	if (tdb->flags & TDB_NOLOCK)
															+		return 0;
															+
															+	new_lck = find_nestlock(tdb, offset);
															+	if (new_lck) {
															+		/*
															+		 * Just increment the in-memory struct, posix locks
															+		 * don't stack.
															+		 */
															+		new_lck->count++;
															+		return 0;
															+	}
															+
															+	/* Grow the array first; tdb->lockrecs is only replaced once
															+	 * realloc has succeeded, so the old array survives failure. */
															+	new_lck = realloc(tdb->lockrecs,
															+			  sizeof(*tdb->lockrecs) * (tdb->num_lockrecs+1));
															+	if (new_lck == NULL) {
															+		tdb->ecode = TDB_ERR_OOM;
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
															+			 "tdb_lock: unable to allocate %llu lock structure\n",
															+			 (unsigned long long)(tdb->num_lockrecs + 1));
															+		errno = ENOMEM;
															+		return -1;
															+	}
															+	tdb->lockrecs = new_lck;
															+
															+	/* Since fcntl locks don't nest, we do a lock for the first one,
															+	   and simply bump the count for future ones */
															+	if (tdb_brlock(tdb, ltype, offset, 1, flags)) {
															+		return -1;
															+	}
															+
															+	/* The new slot only becomes visible once the kernel lock is held. */
															+	tdb->lockrecs[tdb->num_lockrecs].off = offset;
															+	tdb->lockrecs[tdb->num_lockrecs].count = 1;
															+	tdb->lockrecs[tdb->num_lockrecs].ltype = ltype;
															+	tdb->num_lockrecs++;
															+
															+	return 0;
															+}
														
 
															+
														
 
															+/* Placeholder: the intended lock-then-recover sequence is compiled out
															+ * (see the FIXME below), so any call currently aborts the process. */
															+static int tdb_lock_and_recover(struct tdb_context *tdb)
															+{
															+#if 0 /* FIXME */
															+
															+	int ret;
															+
															+	/* We need to match locking order in transaction commit. */
															+	if (tdb_brlock(tdb, F_WRLCK, FREELIST_TOP, 0, TDB_LOCK_WAIT)) {
															+		return -1;
															+	}
															+
															+	if (tdb_brlock(tdb, F_WRLCK, OPEN_LOCK, 1, TDB_LOCK_WAIT)) {
															+		tdb_brunlock(tdb, F_WRLCK, FREELIST_TOP, 0);
															+		return -1;
															+	}
															+
															+	ret = tdb_transaction_recover(tdb);
															+
															+	tdb_brunlock(tdb, F_WRLCK, OPEN_LOCK, 1);
															+	tdb_brunlock(tdb, F_WRLCK, FREELIST_TOP, 0);
															+
															+	return ret;
															+#else
															+	/* Recovery is not implemented yet. */
															+	abort();
															+	return -1;
															+#endif
															+}
														
 
															+
														
 
															+/* Stub: always reports that no recovery is needed. */
															+static bool tdb_needs_recovery(struct tdb_context *tdb)
															+{
															+	/* FIXME: no recovery detection yet. */
															+	return false;
															+}
														
 
															+
														
 
															+/* Drop one nesting level of the lock at `off`.
															+ *
															+ * While the in-memory count is above 1 it is simply decremented.  When the
															+ * last reference goes, the byte is unlocked in the kernel and its slot is
															+ * removed from tdb->lockrecs.  Returns 0 on success (or when TDB_NOLOCK is
															+ * set), -1 with ecode TDB_ERR_LOCK if no such lock is held, otherwise the
															+ * result of tdb_brunlock().
															+ */
															+static int tdb_nest_unlock(struct tdb_context *tdb, tdb_off_t off, int ltype)
															+{
															+	struct tdb_lock_type *held;
															+	int rc;
															+
															+	if (tdb->flags & TDB_NOLOCK)
															+		return 0;
															+
															+	held = find_nestlock(tdb, off);
															+	if (held == NULL || held->count == 0) {
															+		tdb->ecode = TDB_ERR_LOCK;
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
															+			 "tdb_unlock: no lock for %llu\n", (long long)off);
															+		return -1;
															+	}
															+
															+	if (held->count > 1) {
															+		held->count--;
															+		return 0;
															+	}
															+
															+	/*
															+	 * Last reference: release the kernel lock.  The count is not
															+	 * decremented because the slot is overwritten just below.
															+	 */
															+	rc = tdb_brunlock(tdb, ltype, off, 1);
															+
															+	/* Compact the array by moving the final entry into the freed slot. */
															+	tdb->num_lockrecs--;
															+	*held = tdb->lockrecs[tdb->num_lockrecs];
															+
															+	/* If we're not holding any locks, header can change. */
															+	if (tdb->num_lockrecs == 0) {
															+		tdb->header_uptodate = false;
															+	}
															+
															+	return rc;
															+}
														
 
															+
														
 
															+#if 0
														
 
															+/*
															+  Take the transaction lock: a nested lock on TRANSACTION_LOCK with the
															+  given lock type and flags.
															+ */
															+int tdb_transaction_lock(struct tdb_context *tdb, int ltype,
															+			 enum tdb_lock_flags lockflags)
															+{
															+	int rc = tdb_nest_lock(tdb, TRANSACTION_LOCK, ltype, lockflags);
															+
															+	return rc;
															+}
														
 
															+
														
 
															+/*
															+  Release the transaction lock taken on TRANSACTION_LOCK.
															+  Returns the result of tdb_nest_unlock().
															+ */
															+int tdb_transaction_unlock(struct tdb_context *tdb, int ltype)
															+{
															+	/* tdb_nest_unlock() takes (tdb, off, ltype): no fourth argument. */
															+	return tdb_nest_unlock(tdb, TRANSACTION_LOCK, ltype);
															+}
														
 
															+#endif
														
 
															+
														
 
															+/* Lock the range [off, off+len) piecewise.
															+ *
															+ * We only need to lock individual bytes, but Linux merges consecutive locks
															+ * so we lock in contiguous ranges.  Ranges of at most 4 bytes are locked
															+ * directly with the caller's flags.  Larger ranges are first tried
															+ * non-blocking as a whole; if that fails, each half is locked recursively.
															+ *
															+ * Returns 0 on success, -1 on failure.
															+ */
															+static int tdb_lock_gradual(struct tdb_context *tdb,
															+			    int ltype, enum tdb_lock_flags flags,
															+			    tdb_off_t off, tdb_off_t len)
															+{
															+	enum tdb_lock_flags nowait_flags = (flags & ~TDB_LOCK_WAIT);
															+	tdb_off_t lower = len / 2;
															+	tdb_off_t upper = len - len / 2;
															+
															+	/* Small range: just do the lock as the caller asked. */
															+	if (len <= 4) {
															+		return tdb_brlock(tdb, ltype, off, len, flags);
															+	}
															+
															+	/* Optimistic attempt on the whole range without blocking. */
															+	if (tdb_brlock(tdb, ltype, off, len, nowait_flags) == 0) {
															+		return 0;
															+	}
															+
															+	/* Fall back to the lower half, then the upper half. */
															+	if (tdb_lock_gradual(tdb, ltype, flags, off, lower) == -1) {
															+		return -1;
															+	}
															+
															+	if (tdb_lock_gradual(tdb, ltype, flags, off + lower, upper) == -1) {
															+		/* Undo the lower half so nothing stays locked. */
															+		tdb_brunlock(tdb, ltype, off, lower);
															+		return -1;
															+	}
															+	return 0;
															+}
														
 
															+
														
 
															+/* Lock the entire database (all hash buckets).
															+ *
															+ * It can only be upgradable if you have some other way of guaranteeing
															+ * exclusivity (ie. transaction write lock).  Note that we don't lock the
															+ * free chains: no one can get those locks without a hash chain lock first.
															+ *
															+ * ltype is the fcntl lock type; flags are passed through to the range
															+ * locker (TDB_LOCK_PROBE suppresses the failure log).  Repeated calls with
															+ * the same ltype nest by bumping a count.
															+ *
															+ * Returns 0 on success, -1 on failure.  ecode is set to TDB_ERR_LOCK for
															+ * the misuse cases rejected below.
															+ */
															+int tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
															+		       enum tdb_lock_flags flags, bool upgradable)
															+{
															+	tdb_off_t hash_size;
															+
															+	/* FIXME: There are no locks on read-only dbs */
															+	if (tdb->read_only) {
															+		tdb->ecode = TDB_ERR_LOCK;
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
															+			 "tdb_allrecord_lock: read-only\n");
															+		return -1;
															+	}
															+
															+	/* Already held with the same type: just nest. */
															+	if (tdb->allrecord_lock.count && tdb->allrecord_lock.ltype == ltype) {
															+		tdb->allrecord_lock.count++;
															+		return 0;
															+	}
															+
															+	if (tdb->allrecord_lock.count) {
															+		/* a global lock of a different type exists */
															+		tdb->ecode = TDB_ERR_LOCK;
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
															+			 "tdb_allrecord_lock: already have %s lock\n",
															+			 tdb->allrecord_lock.ltype == F_RDLCK
															+			 ? "read" : "write");
															+		return -1;
															+	}
															+
															+	if (tdb_has_locks(tdb)) {
															+		/* can't combine global and chain locks */
															+		tdb->ecode = TDB_ERR_LOCK;
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
															+			 "tdb_allrecord_lock: already have chain lock\n");
															+		return -1;
															+	}
															+
															+	if (upgradable && ltype != F_RDLCK) {
															+		/* tdb error: you can't upgrade a write lock! */
															+		tdb->ecode = TDB_ERR_LOCK;
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
															+			 "tdb_allrecord_lock: can't upgrade a write lock\n");
															+		return -1;
															+	}
															+
															+	/* Lock all the hash buckets. */
															+again:
															+	hash_size = (1ULL << tdb->header.v.hash_bits);
															+	/* Argument order is (tdb, ltype, flags, off, len). */
															+	if (tdb_lock_gradual(tdb, ltype, flags, TDB_HASH_LOCK_START,
															+			     hash_size)) {
															+		if (!(flags & TDB_LOCK_PROBE)) {
															+			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
															+				 "tdb_lockall hashes failed (%s)\n",
															+				 strerror(errno));
															+		}
															+		return -1;
															+	}
															+
															+	/* Now we re-check header, holding lock.  hash_size still holds
															+	 * the size we actually locked, so unlock exactly that range
															+	 * before retrying. */
															+	if (unlikely(update_header(tdb))) {
															+		tdb_brunlock(tdb, ltype, TDB_HASH_LOCK_START, hash_size);
															+		goto again;
															+	}
															+
															+	/* Now check for needing recovery. */
															+	if (unlikely(tdb_needs_recovery(tdb))) {
															+		tdb_brunlock(tdb, ltype, TDB_HASH_LOCK_START, hash_size);
															+		if (tdb_lock_and_recover(tdb) == -1) {
															+			return -1;
															+		}
															+		goto again;
															+	}
															+
															+	tdb->allrecord_lock.count = 1;
															+	/* If it's upgradable, it's actually exclusive so we can treat
															+	 * it as a write lock. */
															+	tdb->allrecord_lock.ltype = upgradable ? F_WRLCK : ltype;
															+	tdb->allrecord_lock.off = upgradable;
															+	return 0;
															+}
														
 
															+
														
 
															+/* Take a blocking write lock on TDB_OPEN_LOCK via the nested-lock table. */
															+int tdb_lock_open(struct tdb_context *tdb)
															+{
															+	int rc = tdb_nest_lock(tdb, TDB_OPEN_LOCK, F_WRLCK, TDB_LOCK_WAIT);
															+
															+	return rc;
															+}
														
 
															+
														
 
															+/* Release the lock taken by tdb_lock_open(); the unlock result is not
															+ * reported to the caller. */
															+void tdb_unlock_open(struct tdb_context *tdb)
															+{
															+	tdb_nest_unlock(tdb, TDB_OPEN_LOCK, F_WRLCK);
															+}
														
 
															+
														
 
															+/* Unlock the entire db: drop one nesting level of the all-record lock.
															+ *
															+ * ltype must match the type the lock was taken with; an upgradable lock is
															+ * recorded as a write lock but may be released with F_RDLCK.  The kernel
															+ * lock over the hash buckets is only released when the count reaches zero.
															+ *
															+ * Returns 0 on success, -1 (ecode TDB_ERR_LOCK) on misuse, or the result of
															+ * tdb_brunlock() for the final release.
															+ */
															+int tdb_allrecord_unlock(struct tdb_context *tdb, int ltype)
															+{
															+	tdb_off_t hash_size;
															+
															+	/* FIXME: There are no locks on read-only dbs */
															+	if (tdb->read_only) {
															+		tdb->ecode = TDB_ERR_LOCK;
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
															+			 "tdb_allrecord_unlock: read-only\n");
															+		return -1;
															+	}
															+
															+	if (tdb->allrecord_lock.count == 0) {
															+		tdb->ecode = TDB_ERR_LOCK;
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
															+			 "tdb_allrecord_unlock: not locked!\n");
															+		return -1;
															+	}
															+
															+	/* Upgradable locks are marked as write locks. */
															+	if (tdb->allrecord_lock.ltype != ltype
															+	    && (!tdb->allrecord_lock.off || ltype != F_RDLCK)) {
															+		tdb->ecode = TDB_ERR_LOCK;
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
															+			 "tdb_allrecord_unlock: have %s lock\n",
															+			 tdb->allrecord_lock.ltype == F_RDLCK
															+			 ? "read" : "write");
															+		return -1;
															+	}
															+
															+	if (tdb->allrecord_lock.count > 1) {
															+		tdb->allrecord_lock.count--;
															+		return 0;
															+	}
															+
															+	tdb->allrecord_lock.count = 0;
															+	tdb->allrecord_lock.ltype = 0;
															+
															+	/* Same rule tdb_nest_unlock() applies: if we're not holding
															+	 * any locks, header can change. */
															+	if (tdb->num_lockrecs == 0) {
															+		tdb->header_uptodate = false;
															+	}
															+
															+	hash_size = (1ULL << tdb->header.v.hash_bits);
															+
															+	return tdb_brunlock(tdb, ltype, TDB_HASH_LOCK_START, hash_size);
															+}
														
 
															+
														
 
															+bool tdb_has_locks(struct tdb_context *tdb)
														
 
															+{
														
 
															+	return tdb->allrecord_lock.count || tdb->num_lockrecs;
														
 
															+}
														
 
															+
														
 
															+#if 0
														
 
															+/* lock entire database with write lock */
														
 
															+int tdb_lockall(struct tdb_context *tdb)
														
 
															+{
														
 
															+	tdb_trace(tdb, "tdb_lockall");
														
 
															+	return tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false);
														
 
															+}
														
 
															+
														
 
															+/* lock entire database with write lock - nonblocking variant */
														
 
															+int tdb_lockall_nonblock(struct tdb_context *tdb)
														
 
															+{
														
 
															+	int ret = tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_NOWAIT, false);
														
 
															+	tdb_trace_ret(tdb, "tdb_lockall_nonblock", ret);
														
 
															+	return ret;
														
 
															+}
														
 
															+
														
 
															+/* unlock entire database with write lock */
														
 
															+int tdb_unlockall(struct tdb_context *tdb)
														
 
															+{
														
 
															+	tdb_trace(tdb, "tdb_unlockall");
														
 
															+	return tdb_allrecord_unlock(tdb, F_WRLCK);
														
 
															+}
														
 
															+
														
 
															+/* lock entire database with read lock */
														
 
															+int tdb_lockall_read(struct tdb_context *tdb)
														
 
															+{
														
 
															+	tdb_trace(tdb, "tdb_lockall_read");
														
 
															+	return tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
														
 
															+}
														
 
															+
														
 
															+/* lock entire database with read lock - nonblock variant */
														
 
															+int tdb_lockall_read_nonblock(struct tdb_context *tdb)
														
 
															+{
														
 
															+	int ret = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_NOWAIT, false);
														
 
															+	tdb_trace_ret(tdb, "tdb_lockall_read_nonblock", ret);
														
 
															+	return ret;
														
 
															+}
														
 
															+
														
 
															+/* unlock entire database with read lock */
														
 
															+int tdb_unlockall_read(struct tdb_context *tdb)
														
 
															+{
														
 
															+	tdb_trace(tdb, "tdb_unlockall_read");
														
 
															+	return tdb_allrecord_unlock(tdb, F_RDLCK);
														
 
															+}
														
 
															+#endif
														
 
															+
														
 
															+int tdb_lock_list(struct tdb_context *tdb, tdb_off_t list,
														
 
															+		  int ltype, enum tdb_lock_flags waitflag)
														
 
															+{
														
 
															+	/* a allrecord lock allows us to avoid per chain locks */
														
 
															+	if (tdb->allrecord_lock.count &&
														
 
															+	    (ltype == tdb->allrecord_lock.ltype || ltype == F_RDLCK)) {
														
 
															+		return 0;
														
 
															+	}
														
 
															+
														
 
															+	if (tdb->allrecord_lock.count) {
														
 
															+		tdb->ecode = TDB_ERR_LOCK;
														
 
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+			 "tdb_lock_list: have %s allrecordlock\n",
														
 
															+			 tdb->allrecord_lock.ltype == F_RDLCK
														
 
															+			 ? "read" : "write");
														
 
															+		return -1;
														
 
															+	}
														
 
															+
														
 
															+	/* FIXME: Should we do header_uptodate and return retry here? */
														
 
															+	return tdb_nest_lock(tdb, TDB_HASH_LOCK_START + list, ltype, waitflag);
														
 
															+}
														
 
															+
														
 
															+int tdb_unlock_list(struct tdb_context *tdb, tdb_off_t list, int ltype)
														
 
															+{
														
 
															+	/* a allrecord lock allows us to avoid per chain locks */
														
 
															+	if (tdb->allrecord_lock.count) {
														
 
															+		if (tdb->allrecord_lock.ltype == F_RDLCK
														
 
															+		    && ltype == F_WRLCK) {
														
 
															+			tdb->ecode = TDB_ERR_LOCK;
														
 
															+			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
														
 
															+				 "tdb_unlock_list RO allrecord!\n");
														
 
															+			return -1;
														
 
															+		}
														
 
															+		return 0;
														
 
															+	} else {
														
 
															+		return tdb_nest_unlock(tdb, TDB_HASH_LOCK_START + list, ltype);
														
 
															+	}
														
 
															+}
														
 
															+
														
 
															+/* Free list locks come after hash locks */
														
 
															+int tdb_lock_free_list(struct tdb_context *tdb, tdb_off_t flist,
														
 
															+		       enum tdb_lock_flags waitflag)
														
 
															+{
														
 
															+	/* You're supposed to have a hash lock first! */
														
 
															+	if (!tdb_has_locks(tdb)) {
														
 
															+		tdb->ecode = TDB_ERR_LOCK;
														
 
															+		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
														
 
															+			 "tdb_lock_free_list without lock!\n");
														
 
															+		return -1;
														
 
															+	}
														
 
															+
														
 
															+	/* a allrecord lock allows us to avoid per chain locks */
														
 
															+	if (tdb->allrecord_lock.count) {
														
 
															+		if (tdb->allrecord_lock.ltype == F_WRLCK)
														
 
															+			return 0;
														
 
															+		tdb->ecode = TDB_ERR_LOCK;
														
 
															+		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
														
 
															+			 "tdb_lock_free_list with RO allrecordlock!\n");
														
 
															+		return -1;
														
 
															+	}
														
 
															+
														
 
															+	return tdb_nest_lock(tdb, TDB_HASH_LOCK_START
														
 
															+			     + (1ULL << tdb->header.v.hash_bits)
														
 
															+			     + flist, F_WRLCK, waitflag);
														
 
															+}
														
 
															+
														
 
															+void tdb_unlock_free_list(struct tdb_context *tdb, tdb_off_t flist)
														
 
															+{
														
 
															+	if (tdb->allrecord_lock.count)
														
 
															+		return;
														
 
															+
														
 
															+	tdb_nest_unlock(tdb, TDB_HASH_LOCK_START
														
 
															+			+ (1ULL << tdb->header.v.hash_bits)
														
 
															+			+ flist, F_WRLCK);
														
 
															+}
														
 
															+
														
 
															+#if 0
														
 
															+static int chainlock_loop(struct tdb_context *tdb, const TDB_DATA *key,
														
 
															+			  int ltype, enum tdb_lock_flags waitflag,
														
 
															+			  const char *func)
														
 
															+{
														
 
															+	int ret;
														
 
															+	uint64_t h = tdb_hash(tdb, key->dptr, key->dsize);
														
 
															+
														
 
															+again:
														
 
															+	ret = tdb_lock_list(tdb,
														
 
															+			    h & ((1ULL << tdb->header.v.hash_bits) - 1),
														
 
															+			    ltype, waitflag);
														
 
															+	if (likely(ret == 0) && unlikely(update_header(tdb))) {
														
 
															+		tdb_unlock_list(tdb, h & ((1ULL << tdb->header.v.hash_bits)-1),
														
 
															+				ltype);
														
 
															+		goto again;
														
 
															+	}
														
 
															+
														
 
															+	tdb_trace_1rec(tdb, func, *key);
														
 
															+	return ret;
														
 
															+}
														
 
															+
														
 
															+/* lock/unlock one hash chain. This is meant to be used to reduce
														
 
															+   contention - it cannot guarantee how many records will be locked */
														
 
															+int tdb_chainlock(struct tdb_context *tdb, TDB_DATA key)
														
 
															+{
														
 
															+	return chainlock_loop(tdb, &key, F_WRLCK, TDB_LOCK_WAIT,
														
 
															+			      "tdb_chainlock");
														
 
															+}
														
 
															+
														
 
															+/* lock/unlock one hash chain, non-blocking. This is meant to be used
														
 
															+   to reduce contention - it cannot guarantee how many records will be
														
 
															+   locked */
														
 
															+int tdb_chainlock_nonblock(struct tdb_context *tdb, TDB_DATA key)
														
 
															+{
														
 
															+	return chainlock_loop(tdb, &key, F_WRLCK, TDB_LOCK_NOWAIT,
														
 
															+			      "tdb_chainlock_nonblock");
														
 
															+}
														
 
															+
														
 
															+int tdb_chainunlock(struct tdb_context *tdb, TDB_DATA key)
														
 
															+{
														
 
															+	uint64_t h = tdb_hash(tdb, key.dptr, key.dsize);
														
 
															+	tdb_trace_1rec(tdb, "tdb_chainunlock", key);
														
 
															+	return tdb_unlock_list(tdb, h & ((1ULL << tdb->header.v.hash_bits)-1),
														
 
															+			       F_WRLCK);
														
 
															+}
														
 
															+
														
 
															+int tdb_chainlock_read(struct tdb_context *tdb, TDB_DATA key)
														
 
															+{
														
 
															+	return chainlock_loop(tdb, &key, F_RDLCK, TDB_LOCK_WAIT,
														
 
															+			      "tdb_chainlock_read");
														
 
															+}
														
 
															+
														
 
															+int tdb_chainunlock_read(struct tdb_context *tdb, TDB_DATA key)
														
 
															+{
														
 
															+	uint64_t h = tdb_hash(tdb, key.dptr, key.dsize);
														
 
															+	tdb_trace_1rec(tdb, "tdb_chainunlock_read", key);
														
 
															+	return tdb_unlock_list(tdb, h & ((1ULL << tdb->header.v.hash_bits)-1),
														
 
															+			       F_RDLCK);
														
 
															+}
														
 
															+
														
 
															+/* record lock stops delete underneath */
														
 
															+int tdb_lock_record(struct tdb_context *tdb, tdb_off_t off)
														
 
															+{
														
 
															+	if (tdb->allrecord_lock.count) {
														
 
															+		return 0;
														
 
															+	}
														
 
															+	return off ? tdb_brlock(tdb, F_RDLCK, off, 1, TDB_LOCK_WAIT) : 0;
														
 
															+}
														
 
															+
														
 
															+/*
														
 
															+  Write locks override our own fcntl readlocks, so check it here.
														
 
															+  Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
														
 
															+  an error to fail to get the lock here.
														
 
															+*/
														
 
															+int tdb_write_lock_record(struct tdb_context *tdb, tdb_off_t off)
														
 
															+{
														
 
															+	struct tdb_traverse_lock *i;
														
 
															+	for (i = &tdb->travlocks; i; i = i->next)
														
 
															+		if (i->off == off)
														
 
															+			return -1;
														
 
															+	if (tdb->allrecord_lock.count) {
														
 
															+		if (tdb->allrecord_lock.ltype == F_WRLCK) {
														
 
															+			return 0;
														
 
															+		}
														
 
															+		return -1;
														
 
															+	}
														
 
															+	return tdb_brlock(tdb, F_WRLCK, off, 1, TDB_LOCK_NOWAIT|TDB_LOCK_PROBE);
														
 
															+}
														
 
															+
														
 
															+int tdb_write_unlock_record(struct tdb_context *tdb, tdb_off_t off)
														
 
															+{
														
 
															+	if (tdb->allrecord_lock.count) {
														
 
															+		return 0;
														
 
															+	}
														
 
															+	return tdb_brunlock(tdb, F_WRLCK, off, 1);
														
 
															+}
														
 
															+
														
 
															+/* fcntl locks don't stack: avoid unlocking someone else's */
														
 
															+int tdb_unlock_record(struct tdb_context *tdb, tdb_off_t off)
														
 
															+{
														
 
															+	struct tdb_traverse_lock *i;
														
 
															+	uint32_t count = 0;
														
 
															+
														
 
															+	if (tdb->allrecord_lock.count) {
														
 
															+		return 0;
														
 
															+	}
														
 
															+
														
 
															+	if (off == 0)
														
 
															+		return 0;
														
 
															+	for (i = &tdb->travlocks; i; i = i->next)
														
 
															+		if (i->off == off)
														
 
															+			count++;
														
 
															+	return (count == 1 ? tdb_brunlock(tdb, F_RDLCK, off, 1) : 0);
														
 
															+}
														
 
															+
														
 
															+/* The transaction code uses this to remove all locks. */
														
 
															+void tdb_release_transaction_locks(struct tdb_context *tdb)
														
 
															+{
														
 
															+	unsigned int i;
														
 
															+
														
 
															+	if (tdb->allrecord_lock.count != 0) {
														
 
															+		tdb_off_t hash_size, free_size;
														
 
															+
														
 
															+		hash_size = (1ULL << tdb->header.v.hash_bits)
														
 
															+			* sizeof(tdb_off_t);
														
 
															+		free_size = tdb->header.v.free_zones 
														
 
															+			* (tdb->header.v.free_buckets + 1) * sizeof(tdb_off_t);
														
 
															+
														
 
															+		tdb_brunlock(tdb, tdb->allrecord_lock.ltype,
														
 
															+			     tdb->header.v.hash_off, hash_size);
														
 
															+		tdb_brunlock(tdb, tdb->allrecord_lock.ltype,
														
 
															+			     tdb->header.v.free_off, free_size);
														
 
															+		tdb->allrecord_lock.count = 0;
														
 
															+		tdb->allrecord_lock.ltype = 0;
														
 
															+	}
														
 
															+
														
 
															+	for (i = 0; i<tdb->num_lockrecs; i++) {
														
 
															+		struct tdb_lock_type *lck = &tdb->lockrecs[i];
														
 
															+
														
 
															+		tdb_brunlock(tdb, lck->ltype, lck->off, 1);
														
 
															+	}
														
 
															+	tdb->num_lockrecs = 0;
														
 
															+	SAFE_FREE(tdb->lockrecs);
														
 
															+	tdb->header_uptodate = false;
														
 
															+}
														
 
															+#endif
														
--- a/ccan/tdb2/private.h
+++ b/ccan/tdb2/private.h
@@ -0,0 +1,456 @@
 
															+#ifndef TDB_PRIVATE_H
														
 
															+#define TDB_PRIVATE_H
														
 
															+ /* 
														
 
															+   Trivial Database 2: private types and prototypes
														
 
															+   Copyright (C) Rusty Russell 2010
														
 
															+
														
 
															+   This library is free software; you can redistribute it and/or
														
 
															+   modify it under the terms of the GNU Lesser General Public
														
 
															+   License as published by the Free Software Foundation; either
														
 
															+   version 3 of the License, or (at your option) any later version.
														
 
															+
														
 
															+   This library is distributed in the hope that it will be useful,
														
 
															+   but WITHOUT ANY WARRANTY; without even the implied warranty of
														
 
															+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
														
 
															+   Lesser General Public License for more details.
														
 
															+
														
 
															+   You should have received a copy of the GNU Lesser General Public
														
 
															+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
														
 
															+*/
														
 
															+
														
 
															+#define _XOPEN_SOURCE 500
														
 
															+#define _FILE_OFFSET_BITS 64
														
 
															+#include <stdint.h>
														
 
															+#include <stdbool.h>
														
 
															+#include <stdlib.h>
														
 
															+#include <sys/time.h>
														
 
															+#include <sys/mman.h>
														
 
															+#include <unistd.h>
														
 
															+#include <fcntl.h>
														
 
															+#include <string.h>
														
 
															+#include <errno.h>
														
 
															+#include <stdio.h>
														
 
															+#include <utime.h>
														
 
															+#include <unistd.h>
														
 
															+#include "config.h"
														
 
															+#include <ccan/tdb2/tdb2.h>
														
 
															+#include <ccan/likely/likely.h>
														
 
															+#ifdef HAVE_BYTESWAP_H
														
 
															+#include <byteswap.h>
														
 
															+#endif
														
 
															+
														
 
															+#ifndef TEST_IT
														
 
															+#define TEST_IT(cond)
														
 
															+#endif
														
 
															+
														
 
															+/* #define TDB_TRACE 1 */
														
 
															+
														
 
															+#ifndef __STRING
														
 
															+#define __STRING(x)    #x
														
 
															+#endif
														
 
															+
														
 
															+#ifndef __STRINGSTRING
														
 
															+#define __STRINGSTRING(x) __STRING(x)
														
 
															+#endif
														
 
															+
														
 
															+#ifndef __location__
														
 
															+#define __location__ __FILE__ ":" __STRINGSTRING(__LINE__)
														
 
															+#endif
														
 
															+
														
 
															+typedef uint64_t tdb_len_t;
														
 
															+typedef uint64_t tdb_off_t;
														
 
															+
														
 
															+#ifndef offsetof
														
 
															+#define offsetof(t,f) ((unsigned int)&((t *)0)->f)
														
 
															+#endif
														
 
															+
														
 
															+#define TDB_MAGIC_FOOD "TDB file\n"
														
 
															+#define TDB_VERSION ((uint64_t)(0x26011967 + 7))
														
 
															+#define TDB_MAGIC ((uint64_t)0x1999)
														
 
															+#define TDB_FREE_MAGIC (~(uint64_t)TDB_MAGIC)
														
 
															+#define TDB_HASH_MAGIC (0xA1ABE11A01092008ULL)
														
 
															+#define TDB_RECOVERY_MAGIC (0xf53bc0e7U)
														
 
															+#define TDB_RECOVERY_INVALID_MAGIC (0x0)
														
 
															+#define TDB_EXTRA_HASHBITS (11) /* We steal 11 bits to stash hash info. */
														
 
															+#define TDB_EXTRA_HASHBITS_NUM (3)
														
 
															+
														
 
															+#define TDB_OFF_ERR ((tdb_off_t)-1)
														
 
															+
														
 
															+/* Prevent others from opening the file. */
														
 
															+#define TDB_OPEN_LOCK 0
														
 
															+/* Doing a transaction. */
														
 
															+#define TDB_TRANSACTION_LOCK 1
														
 
															+/* Hash chain locks. */
														
 
															+#define TDB_HASH_LOCK_START 2
														
 
															+
														
 
															+/* We start with 256 hash buckets, 10 free buckets.  A 1k-sized zone. */
														
 
															+#define INITIAL_HASH_BITS 8
														
 
															+#define INITIAL_FREE_BUCKETS 10
														
 
															+#define INITIAL_ZONE_BITS 10
														
 
															+
														
 
															+#if !HAVE_BSWAP_64
														
 
															+static inline uint64_t bswap_64(uint64_t x)
														
 
															+{
														
 
															+	return (((x&0x000000FFULL)<<56)
														
 
															+		| ((x&0x0000FF00ULL)<<48)
														
 
															+		| ((x&0x00FF0000ULL)<<40)
														
 
															+		| ((x&0xFF000000ULL)<<32)
														
 
															+		| ((x>>8)&0xFF000000ULL)
														
 
															+		| ((x>>16)&0x00FF0000ULL)
														
 
															+		| ((x>>24)&0x0000FF00ULL)
														
 
															+		| ((x>>32)&0x000000FFULL));
														
 
															+}
														
 
															+#endif
														
 
															+
														
 
															+struct tdb_used_record {
														
 
															+	/* For on-disk compatibility, we avoid bitfields:
														
 
															+	   magic: 16,        (highest)
														
 
															+	   key_len_bits: 5,
														
 
															+           hash:11,
														
 
															+	   extra_padding: 32 (lowest)
														
 
															+	*/
														
 
															+        uint64_t magic_and_meta;
														
 
															+	/* The bottom key_len_bits*2 are key length, rest is data length. */
														
 
															+        uint64_t key_and_data_len;
														
 
															+};
														
 
															+
														
 
															+static inline unsigned rec_key_bits(const struct tdb_used_record *r)
														
 
															+{
														
 
															+	return ((r->magic_and_meta >> 43) & ((1 << 5)-1)) * 2;
														
 
															+}
														
 
															+
														
 
															+static inline uint64_t rec_key_length(const struct tdb_used_record *r)
														
 
															+{
														
 
															+	return r->key_and_data_len & ((1ULL << rec_key_bits(r)) - 1);
														
 
															+}
														
 
															+
														
 
															+static inline uint64_t rec_data_length(const struct tdb_used_record *r)
														
 
															+{
														
 
															+	return r->key_and_data_len >> rec_key_bits(r);
														
 
															+}
														
 
															+
														
 
															+static inline uint64_t rec_extra_padding(const struct tdb_used_record *r)
														
 
															+{
														
 
															+	return r->magic_and_meta & 0xFFFFFFFF;
														
 
															+}
														
 
															+
														
 
															+static inline uint64_t rec_hash(const struct tdb_used_record *r)
														
 
															+{
														
 
															+	return ((r->magic_and_meta >> 32) & ((1ULL << 11) - 1)) << (64 - 11);
														
 
															+}
														
 
															+
														
 
															+static inline uint16_t rec_magic(const struct tdb_used_record *r)
														
 
															+{
														
 
															+	return (r->magic_and_meta >> 48);
														
 
															+}
														
 
															+
														
 
															+struct tdb_free_record {
														
 
															+        uint64_t magic;
														
 
															+        uint64_t data_len; /* Not counting these two fields. */
														
 
															+	/* This is why the minimum record size is 16 bytes.  */
														
 
															+	uint64_t next, prev;
														
 
															+};
														
 
															+
														
 
															+/* These parts can change while we have db open. */
														
 
															+struct tdb_header_volatile {
														
 
															+	uint64_t generation; /* Makes sure it changes on every update. */
														
 
															+	uint64_t hash_bits; /* Entries in hash table. */
														
 
															+	uint64_t hash_off; /* Offset of hash table. */
														
 
															+	uint64_t num_zones; /* How many zones in the file. */
														
 
															+	uint64_t zone_bits; /* Size of zones. */
														
 
															+	uint64_t free_buckets; /* How many buckets in each zone. */
														
 
															+	uint64_t free_off; /* Arrays of free entries. */
														
 
															+};
														
 
															+
														
 
															+/* this is stored at the front of every database */
														
 
															+struct tdb_header {
														
 
															+	char magic_food[32]; /* for /etc/magic */
														
 
															+	uint64_t version; /* version of the code */
														
 
															+	uint64_t hash_test; /* result of hashing HASH_MAGIC. */
														
 
															+	uint64_t hash_seed; /* "random" seed written at creation time. */
														
 
															+
														
 
															+	struct tdb_header_volatile v;
														
 
															+
														
 
															+	tdb_off_t reserved[19];
														
 
															+};
														
 
															+
														
 
															+enum tdb_lock_flags {
														
 
															+	/* WAIT == F_SETLKW, NOWAIT == F_SETLK */
														
 
															+	TDB_LOCK_NOWAIT = 0,
														
 
															+	TDB_LOCK_WAIT = 1,
														
 
															+	/* If set, don't log an error on failure. */
														
 
															+	TDB_LOCK_PROBE = 2,
														
 
															+};
														
 
															+
														
 
															+struct tdb_lock_type {
														
 
															+	uint32_t off;
														
 
															+	uint32_t count;
														
 
															+	uint32_t ltype;
														
 
															+};
														
 
															+
														
 
															+struct tdb_context {
														
 
															+	/* Filename of the database. */
														
 
															+	const char *name;
														
 
															+
														
 
															+	/* Mmap (if any), or malloc (for TDB_INTERNAL). */
														
 
															+	void *map_ptr;
														
 
															+
														
 
															+	 /* Open file descriptor (undefined for TDB_INTERNAL). */
														
 
															+	int fd;
														
 
															+
														
 
															+	/* How much space has been mapped (<= current file size) */
														
 
															+	tdb_len_t map_size;
														
 
															+
														
 
															+	/* Opened read-only? */
														
 
															+	bool read_only;
														
 
															+
														
 
															+	/* Error code for last tdb error. */
														
 
															+	enum TDB_ERROR ecode; 
														
 
															+
														
 
															+	/* A cached copy of the header */
														
 
															+	struct tdb_header header; 
														
 
															+	/* (for debugging). */
														
 
															+	bool header_uptodate; 
														
 
															+
														
 
															+	/* the flags passed to tdb_open, for tdb_reopen. */
														
 
															+	uint32_t flags;
														
 
															+
														
 
															+	/* Logging function */
														
 
															+	tdb_logfn_t log;
														
 
															+	void *log_priv;
														
 
															+
														
 
															+	/* Hash function. */
														
 
															+	tdb_hashfn_t khash;
														
 
															+	void *hash_priv;
														
 
															+
														
 
															+	/* What zone of the tdb to use, for spreading load. */
														
 
															+	uint64_t last_zone; 
														
 
															+
														
 
															+	/* IO methods: changes for transactions. */
														
 
															+	const struct tdb_methods *methods;
														
 
															+
														
 
															+	/* Lock information */
														
 
															+	struct tdb_lock_type allrecord_lock;
														
 
															+	uint64_t num_lockrecs;
														
 
															+	struct tdb_lock_type *lockrecs;
														
 
															+
														
 
															+	/* Set if we are in a transaction. */
														
 
															+	struct tdb_transaction *transaction;
														
 
															+	
														
 
															+	/* Single list of all TDBs, to avoid multiple opens. */
														
 
															+	struct tdb_context *next;
														
 
															+	dev_t device;	
														
 
															+	ino_t inode;
														
 
															+};
														
 
															+
														
 
															+struct tdb_methods {
														
 
															+	int (*read)(struct tdb_context *, tdb_off_t, void *, tdb_len_t);
														
 
															+	int (*write)(struct tdb_context *, tdb_off_t, const void *, tdb_len_t);
														
 
															+	int (*oob)(struct tdb_context *, tdb_off_t, bool);
														
 
															+	int (*expand_file)(struct tdb_context *, tdb_len_t, tdb_len_t);
														
 
															+};
														
 
															+
														
 
															+/*
														
 
															+  internal prototypes
														
 
															+*/
														
 
															+/* tdb.c: */
														
 
															+/* Returns true if header changed. */
														
 
															+bool update_header(struct tdb_context *tdb);
														
 
															+
														
 
															+/* Hash random memory. */
														
 
															+uint64_t tdb_hash(struct tdb_context *tdb, const void *ptr, size_t len);
														
 
															+
														
 
															+
														
 
															+/* free.c: */
														
 
															+uint64_t random_free_zone(struct tdb_context *tdb);
														
 
															+
														
 
															+/* If this fails, try tdb_expand. */
														
 
															+tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
														
 
															+		uint64_t hash, bool growing);
														
 
															+
														
 
															+/* Put this record in a free list. */
														
 
															+int add_free_record(struct tdb_context *tdb,
														
 
															+		    tdb_off_t off, tdb_len_t len_with_header);
														
 
															+
														
 
															+/* Set up header for a used record. */
														
 
															+int set_header(struct tdb_context *tdb,
														
 
															+	       struct tdb_used_record *rec,
														
 
															+	       uint64_t keylen, uint64_t datalen,
														
 
															+	       uint64_t actuallen, uint64_t hash);
														
 
															+
														
 
															+/* Used by tdb_check to verify. */
														
 
															+unsigned int size_to_bucket(struct tdb_context *tdb, tdb_len_t data_len);
														
 
															+tdb_off_t zone_of(struct tdb_context *tdb, tdb_off_t off);
														
 
															+
														
 
															+/* io.c: */
														
 
															+/* Initialize tdb->methods. */
														
 
															+void tdb_io_init(struct tdb_context *tdb);
														
 
															+
														
 
															+/* Convert endian of the buffer if required. */
														
 
															+void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size);
														
 
															+
														
 
															+/* Unmap and try to map the tdb. */
														
 
															+void tdb_munmap(struct tdb_context *tdb);
														
 
															+void tdb_mmap(struct tdb_context *tdb);
														
 
															+
														
 
															+/* Hand data to a function, direct if possible */
														
 
															+int tdb_parse_data(struct tdb_context *tdb, TDB_DATA key,
														
 
															+		   tdb_off_t offset, tdb_len_t len,
														
 
															+		   int (*parser)(TDB_DATA key, TDB_DATA data,
														
 
															+				 void *private_data),
														
 
															+		   void *private_data);
														
 
															+
														
 
															+/* Either make a copy into pad and return that, or return ptr into mmap.
														
 
															+ * Converts endian (ie. will use pad in that case). */
														
 
															+void *tdb_get(struct tdb_context *tdb, tdb_off_t off, void *pad, size_t len);
														
 
															+
														
 
															+/* Either alloc a copy, or give direct access.  Release frees or noop. */
														
 
															+const void *tdb_access_read(struct tdb_context *tdb,
														
 
															+			    tdb_off_t off, tdb_len_t len);
														
 
															+void tdb_access_release(struct tdb_context *tdb, const void *p);
														
 
															+
														
 
															+/* Convenience routine to get an offset. */
														
 
															+tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off);
														
 
															+
														
 
															+/* Write an offset at an offset. */
														
 
															+int tdb_write_off(struct tdb_context *tdb, tdb_off_t off, tdb_off_t val);
														
 
															+
														
 
															+/* Clear an ondisk area. */
														
 
															+int zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len);
														
 
															+
														
 
															+/* Return a non-zero offset in this array, or num. */
														
 
															+tdb_off_t tdb_find_nonzero_off(struct tdb_context *tdb, tdb_off_t off,
														
 
															+			       uint64_t num);
														
 
															+
														
 
															+/* Return a zero offset in this array, or num. */
														
 
															+tdb_off_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off,
														
 
															+			    uint64_t num);
														
 
															+
														
 
															+/* Even on files, we can get partial writes due to signals. */
														
 
															+bool tdb_pwrite_all(int fd, const void *buf, size_t len, tdb_off_t off);
														
 
															+
														
 
															+/* Allocate and make a copy of some offset. */
														
 
															+void *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len);
														
 
															+
														
 
															+/* Munges record and writes it */
														
 
															+int tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
														
 
															+		      void *rec, size_t len);
														
 
															+
														
 
															+/* Reads record and converts it */
														
 
															+int tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
														
 
															+		     void *rec, size_t len);
														
 
															+
														
 
															+/* Hash on disk. */
														
 
															+uint64_t hash_record(struct tdb_context *tdb, tdb_off_t off);
														
 
															+
														
 
															+/* lock.c: */
														
 
															+/* Lock/unlock a particular hash list. */
														
 
															+int tdb_lock_list(struct tdb_context *tdb, tdb_off_t list,
														
 
															+		  int ltype, enum tdb_lock_flags waitflag);
														
 
															+int tdb_unlock_list(struct tdb_context *tdb, tdb_off_t list, int ltype);
														
 
															+
														
 
															+/* Lock/unlock a particular free list. */
														
 
															+int tdb_lock_free_list(struct tdb_context *tdb, tdb_off_t flist,
														
 
															+		       enum tdb_lock_flags waitflag);
														
 
															+void tdb_unlock_free_list(struct tdb_context *tdb, tdb_off_t flist);
														
 
															+
														
 
															+/* Do we have any locks? */
														
 
															+bool tdb_has_locks(struct tdb_context *tdb);
														
 
															+
														
 
															+/* Lock entire database. */
														
 
															+int tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
														
 
															+		       enum tdb_lock_flags flags, bool upgradable);
														
 
															+int tdb_allrecord_unlock(struct tdb_context *tdb, int ltype);
														
 
															+
														
 
															+/* Serialize db open. */
														
 
															+int tdb_lock_open(struct tdb_context *tdb);
														
 
															+void tdb_unlock_open(struct tdb_context *tdb);
														
 
															+/* Expand the file. */
														
 
															+int tdb_expand(struct tdb_context *tdb, tdb_len_t klen, tdb_len_t dlen,
														
 
															+	       bool growing);
														
 
															+
														
 
															+#if 0
														
 
															+/* Low-level locking primitives. */
														
 
															+int tdb_nest_lock(struct tdb_context *tdb, tdb_off_t offset, int ltype,
														
 
															+		  enum tdb_lock_flags flags);
														
 
															+int tdb_nest_unlock(struct tdb_context *tdb, tdb_off_t offset, int ltype);
														
 
															+
														
 
															+int tdb_munmap(struct tdb_context *tdb);
														
 
															+void tdb_mmap(struct tdb_context *tdb);
														
 
															+int tdb_lock(struct tdb_context *tdb, int list, int ltype);
														
 
															+int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype);
														
 
															+bool tdb_have_locks(struct tdb_context *tdb);
														
 
															+int tdb_unlock(struct tdb_context *tdb, int list, int ltype);
														
 
															+int tdb_brlock(struct tdb_context *tdb,
														
 
															+	       int rw_type, tdb_off_t offset, size_t len,
														
 
															+	       enum tdb_lock_flags flags);
														
 
															+int tdb_brunlock(struct tdb_context *tdb,
														
 
															+		 int rw_type, tdb_off_t offset, size_t len);
														
 
															+bool tdb_have_extra_locks(struct tdb_context *tdb);
														
 
															+void tdb_release_extra_locks(struct tdb_context *tdb);
														
 
															+int tdb_transaction_lock(struct tdb_context *tdb, int ltype);
														
 
															+int tdb_transaction_unlock(struct tdb_context *tdb, int ltype);
														
 
															+int tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
														
 
															+		       enum tdb_lock_flags flags, bool upgradable);
														
 
															+int tdb_allrecord_unlock(struct tdb_context *tdb, int ltype);
														
 
															+int tdb_allrecord_upgrade(struct tdb_context *tdb);
														
 
															+int tdb_write_lock_record(struct tdb_context *tdb, tdb_off_t off);
														
 
															+int tdb_write_unlock_record(struct tdb_context *tdb, tdb_off_t off);
														
 
															+int tdb_ofs_read(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d);
														
 
															+int tdb_ofs_write(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d);
														
 
															+int tdb_free(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec);
														
 
															+tdb_off_t tdb_allocate(struct tdb_context *tdb, tdb_len_t length, struct tdb_record *rec);
														
 
															+int tdb_ofs_read(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d);
														
 
															+int tdb_ofs_write(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d);
														
 
															+int tdb_lock_record(struct tdb_context *tdb, tdb_off_t off);
														
 
															+int tdb_unlock_record(struct tdb_context *tdb, tdb_off_t off);
														
 
															+bool tdb_needs_recovery(struct tdb_context *tdb);
														
 
															+int tdb_rec_read(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec);
														
 
															+int tdb_rec_write(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec);
														
 
															+int tdb_do_delete(struct tdb_context *tdb, tdb_off_t rec_ptr, struct tdb_record *rec);
														
 
															+unsigned char *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len);
														
 
															+int tdb_parse_data(struct tdb_context *tdb, TDB_DATA key,
														
 
															+		   tdb_off_t offset, tdb_len_t len,
														
 
															+		   int (*parser)(TDB_DATA key, TDB_DATA data,
														
 
															+				 void *private_data),
														
 
															+		   void *private_data);
														
 
															+tdb_off_t tdb_find_lock_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, int locktype,
														
 
															+			   struct tdb_record *rec);
														
 
															+void tdb_io_init(struct tdb_context *tdb);
														
 
															+int tdb_expand(struct tdb_context *tdb, tdb_off_t size);
														
 
															+int tdb_rec_free_read(struct tdb_context *tdb, tdb_off_t off,
														
 
															+		      struct tdb_record *rec);
														
 
															+#endif
														
 
															+
														
 
															+#ifdef TDB_TRACE
														
 
															+void tdb_trace(struct tdb_context *tdb, const char *op);
														
 
															+void tdb_trace_seqnum(struct tdb_context *tdb, uint32_t seqnum, const char *op);
														
 
															+void tdb_trace_open(struct tdb_context *tdb, const char *op,
														
 
															+		    unsigned hash_size, unsigned tdb_flags, unsigned open_flags);
														
 
															+void tdb_trace_ret(struct tdb_context *tdb, const char *op, int ret);
														
 
															+void tdb_trace_retrec(struct tdb_context *tdb, const char *op, TDB_DATA ret);
														
 
															+void tdb_trace_1rec(struct tdb_context *tdb, const char *op,
														
 
															+		    TDB_DATA rec);
														
 
															+void tdb_trace_1rec_ret(struct tdb_context *tdb, const char *op,
														
 
															+			TDB_DATA rec, int ret);
														
 
															+void tdb_trace_1rec_retrec(struct tdb_context *tdb, const char *op,
														
 
															+			   TDB_DATA rec, TDB_DATA ret);
														
 
															+void tdb_trace_2rec_flag_ret(struct tdb_context *tdb, const char *op,
														
 
															+			     TDB_DATA rec1, TDB_DATA rec2, unsigned flag,
														
 
															+			     int ret);
														
 
															+void tdb_trace_2rec_retrec(struct tdb_context *tdb, const char *op,
														
 
															+			   TDB_DATA rec1, TDB_DATA rec2, TDB_DATA ret);
														
 
															+#else
														
 
															+#define tdb_trace(tdb, op)
														
 
															+#define tdb_trace_seqnum(tdb, seqnum, op)
														
 
															+#define tdb_trace_open(tdb, op, hash_size, tdb_flags, open_flags)
														
 
															+#define tdb_trace_ret(tdb, op, ret)
														
 
															+#define tdb_trace_retrec(tdb, op, ret)
														
 
															+#define tdb_trace_1rec(tdb, op, rec)
														
 
															+#define tdb_trace_1rec_ret(tdb, op, rec, ret)
														
 
															+#define tdb_trace_1rec_retrec(tdb, op, rec, ret)
														
 
															+#define tdb_trace_2rec_flag_ret(tdb, op, rec1, rec2, flag, ret)
														
 
															+#define tdb_trace_2rec_retrec(tdb, op, rec1, rec2, ret)
														
 
															+#endif /* !TDB_TRACE */
														
 
															+
														
 
															+#endif
														
--- a/ccan/tdb2/tdb.c
+++ b/ccan/tdb2/tdb.c
@@ -0,0 +1,875 @@
 
															+#include "private.h"
														
 
															+#include <ccan/tdb2/tdb2.h>
														
 
															+#include <ccan/hash/hash.h>
														
 
															+#include <ccan/likely/likely.h>
														
 
															+#include <assert.h>
														
 
															+
														
 
															+/* The null return. */
														
 
															+struct tdb_data tdb_null = { .dptr = NULL, .dsize = 0 };
														
 
															+
														
 
															+/* all contexts, to ensure no double-opens (fcntl locks don't nest!) */
														
 
															+static struct tdb_context *tdbs = NULL;
														
 
															+
														
 
															+PRINTF_ATTRIBUTE(4, 5) static void
														
 
															+null_log_fn(struct tdb_context *tdb,
														
 
															+	    enum tdb_debug_level level, void *priv,
														
 
															+	    const char *fmt, ...)
														
 
															+{
														
 
															+}
														
 
															+
														
 
															+/* We do a lot of work assuming our copy of the header volatile area
														
 
															+ * is uptodate, and usually it is.  However, once we grab a lock, we have to
														
 
															+ * re-check it. */
														
 
															+bool update_header(struct tdb_context *tdb)
														
 
															+{
														
 
															+	struct tdb_header_volatile pad, *v;
														
 
															+
														
 
															+	if (tdb->header_uptodate) {
														
 
															+		tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv,
														
 
															+			 "warning: header uptodate already\n");
														
 
															+	}
														
 
															+
														
 
															+	/* We could get a partial update if we're not holding any locks. */
														
 
															+	assert(tdb_has_locks(tdb));
														
 
															+
														
 
															+	v = tdb_get(tdb, offsetof(struct tdb_header, v), &pad, sizeof(*v));
														
 
															+	if (!v) {
														
 
															+		/* On failure, imply we updated header so they retry. */
														
 
															+		return true;
														
 
															+	}
														
 
															+	tdb->header_uptodate = true;
														
 
															+	if (likely(memcmp(&tdb->header.v, v, sizeof(*v)) == 0)) {
														
 
															+		return false;
														
 
															+	}
														
 
															+	tdb->header.v = *v;
														
 
															+	return true;
														
 
															+}
														
 
															+
														
 
															/* Default key hash installed by tdb_open(): hashes `length` bytes at `key`
 * with hash64_any() using `seed`.  The private argument `arg` is not used. */
static uint64_t jenkins_hash(const void *key, size_t length, uint64_t seed,
			     void *arg)
{
	uint64_t digest = hash64_any(key, length, seed);

	return digest;
}
														
 
															+
														
 
															+uint64_t tdb_hash(struct tdb_context *tdb, const void *ptr, size_t len)
														
 
															+{
														
 
															+	return tdb->khash(ptr, len, tdb->header.hash_seed, tdb->hash_priv);
														
 
															+}
														
 
															+
														
 
															+static bool tdb_already_open(dev_t device, ino_t ino)
														
 
															+{
														
 
															+	struct tdb_context *i;
														
 
															+	
														
 
															+	for (i = tdbs; i; i = i->next) {
														
 
															+		if (i->device == device && i->inode == ino) {
														
 
															+			return true;
														
 
															+		}
														
 
															+	}
														
 
															+
														
 
															+	return false;
														
 
															+}
														
 
															+
														
 
															+static uint64_t random_number(struct tdb_context *tdb)
														
 
															+{
														
 
															+	int fd;
														
 
															+	uint64_t ret = 0;
														
 
															+	struct timeval now;
														
 
															+
														
 
															+	fd = open("/dev/urandom", O_RDONLY);
														
 
															+	if (fd >= 0) {
														
 
															+		if (read(fd, &ret, sizeof(ret)) == sizeof(ret)) {
														
 
															+			tdb->log(tdb, TDB_DEBUG_TRACE, tdb->log_priv,
														
 
															+				 "tdb_open: random from /dev/urandom\n");
														
 
															+			close(fd);
														
 
															+			return ret;
														
 
															+		}
														
 
															+		close(fd);
														
 
															+	}
														
 
															+	/* FIXME: Untested!  Based on Wikipedia protocol description! */
														
 
															+	fd = open("/dev/egd-pool", O_RDWR);
														
 
															+	if (fd >= 0) {
														
 
															+		/* Command is 1, next byte is size we want to read. */
														
 
															+		char cmd[2] = { 1, sizeof(uint64_t) };
														
 
															+		if (write(fd, cmd, sizeof(cmd)) == sizeof(cmd)) {
														
 
															+			char reply[1 + sizeof(uint64_t)];
														
 
															+			int r = read(fd, reply, sizeof(reply));
														
 
															+			if (r > 1) {
														
 
															+				tdb->log(tdb, TDB_DEBUG_TRACE, tdb->log_priv,
														
 
															+					 "tdb_open: %u random bytes from"
														
 
															+					 " /dev/egd-pool\n", r-1);
														
 
															+				/* Copy at least some bytes. */
														
 
															+				memcpy(&ret, reply+1, r - 1);
														
 
															+				if (reply[0] == sizeof(uint64_t)
														
 
															+				    && r == sizeof(reply)) {
														
 
															+					close(fd);
														
 
															+					return ret;
														
 
															+				}
														
 
															+			}
														
 
															+		}
														
 
															+		close(fd);
														
 
															+	}
														
 
															+
														
 
															+	/* Fallback: pid and time. */
														
 
															+	gettimeofday(&now, NULL);
														
 
															+	ret = getpid() * 100132289ULL + now.tv_sec * 1000000ULL + now.tv_usec;
														
 
															+	tdb->log(tdb, TDB_DEBUG_TRACE, tdb->log_priv,
														
 
															+		 "tdb_open: random from getpid and time\n");
														
 
															+	return ret;
														
 
															+}
														
 
															+
														
 
															+struct new_database {
														
 
															+	struct tdb_header hdr;
														
 
															+	struct tdb_used_record hrec;
														
 
															+	tdb_off_t hash[1ULL << INITIAL_HASH_BITS];
														
 
															+	struct tdb_used_record frec;
														
 
															+	tdb_off_t free[INITIAL_FREE_BUCKETS + 1]; /* One overflow bucket */
														
 
															+};
														
 
															+
														
 
															+/* initialise a new database */
														
 
															+static int tdb_new_database(struct tdb_context *tdb)
														
 
															+{
														
 
															+	/* We make it up in memory, then write it out if not internal */
														
 
															+	struct new_database newdb;
														
 
															+
														
 
															+	/* Fill in the header */
														
 
															+	newdb.hdr.version = TDB_VERSION;
														
 
															+	newdb.hdr.hash_seed = random_number(tdb);
														
 
															+	newdb.hdr.hash_test = TDB_HASH_MAGIC;
														
 
															+	newdb.hdr.hash_test = tdb->khash(&newdb.hdr.hash_test,
														
 
															+					 sizeof(newdb.hdr.hash_test),
														
 
															+					 newdb.hdr.hash_seed,
														
 
															+					 tdb->hash_priv);
														
 
															+
														
 
															+	newdb.hdr.v.generation = 0;
														
 
															+
														
 
															+	/* Free array has 1 zone, 10 buckets.  All buckets empty. */
														
 
															+	newdb.hdr.v.num_zones = 1;
														
 
															+	newdb.hdr.v.zone_bits = INITIAL_ZONE_BITS;
														
 
															+	newdb.hdr.v.free_buckets = INITIAL_FREE_BUCKETS;
														
 
															+	newdb.hdr.v.free_off = offsetof(struct new_database, free);
														
 
															+	set_header(tdb, &newdb.frec, 0,
														
 
															+		   sizeof(newdb.free), sizeof(newdb.free), 0);
														
 
															+	memset(newdb.free, 0, sizeof(newdb.free));
														
 
															+
														
 
															+	/* Initial hashes are empty. */
														
 
															+	newdb.hdr.v.hash_bits = INITIAL_HASH_BITS;
														
 
															+	newdb.hdr.v.hash_off = offsetof(struct new_database, hash);
														
 
															+	set_header(tdb, &newdb.hrec, 0,
														
 
															+		   sizeof(newdb.hash), sizeof(newdb.hash), 0);
														
 
															+	memset(newdb.hash, 0, sizeof(newdb.hash));
														
 
															+
														
 
															+	if (tdb->flags & TDB_INTERNAL) {
														
 
															+		tdb->map_size = sizeof(newdb);
														
 
															+		tdb->map_ptr = malloc(tdb->map_size);
														
 
															+		if (!tdb->map_ptr) {
														
 
															+			tdb->ecode = TDB_ERR_OOM;
														
 
															+			return -1;
														
 
															+		}
														
 
															+		memcpy(tdb->map_ptr, &newdb, tdb->map_size);
														
 
															+		tdb->header = newdb.hdr;
														
 
															+		/* Convert the `ondisk' version if asked. */
														
 
															+		tdb_convert(tdb, tdb->map_ptr, sizeof(newdb));
														
 
															+		return 0;
														
 
															+	}
														
 
															+	if (lseek(tdb->fd, 0, SEEK_SET) == -1)
														
 
															+		return -1;
														
 
															+
														
 
															+	if (ftruncate(tdb->fd, 0) == -1)
														
 
															+		return -1;
														
 
															+
														
 
															+	/* This creates an endian-converted header, as if read from disk */
														
 
															+	tdb->header = newdb.hdr;
														
 
															+	tdb_convert(tdb, &tdb->header, sizeof(tdb->header));
														
 
															+
														
 
															+	/* Don't endian-convert the magic food! */
														
 
															+	memset(newdb.hdr.magic_food, 0, sizeof(newdb.hdr.magic_food));
														
 
															+	strcpy(newdb.hdr.magic_food, TDB_MAGIC_FOOD);
														
 
															+
														
 
															+	if (!tdb_pwrite_all(tdb->fd, &newdb, sizeof(newdb), 0)) {
														
 
															+		tdb->ecode = TDB_ERR_IO;
														
 
															+		return -1;
														
 
															+	}
														
 
															+	return 0;
														
 
															+}
														
 
															+
														
 
															+struct tdb_context *tdb_open(const char *name, int tdb_flags,
														
 
															+			     int open_flags, mode_t mode,
														
 
															+			     union tdb_attribute *attr)
														
 
															+{
														
 
															+	struct tdb_context *tdb;
														
 
															+	struct stat st;
														
 
															+	int save_errno;
														
 
															+	uint64_t hash_test;
														
 
															+	unsigned v;
														
 
															+
														
 
															+	if (!(tdb = (struct tdb_context *)calloc(1, sizeof *tdb))) {
														
 
															+		/* Can't log this */
														
 
															+		errno = ENOMEM;
														
 
															+		goto fail;
														
 
															+	}
														
 
															+	tdb->fd = -1;
														
 
															+	tdb->name = NULL;
														
 
															+	tdb->map_ptr = NULL;
														
 
															+	tdb->flags = tdb_flags;
														
 
															+	tdb->log = null_log_fn;
														
 
															+	tdb->log_priv = NULL;
														
 
															+	tdb->khash = jenkins_hash;
														
 
															+	tdb->hash_priv = NULL;
														
 
															+
														
 
															+	/* FIXME */
														
 
															+	if (attr) {
														
 
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+			 "tdb_open: attributes not yet supported\n");
														
 
															+		errno = EINVAL;
														
 
															+		goto fail;
														
 
															+	}
														
 
															+
														
 
															+	if ((open_flags & O_ACCMODE) == O_WRONLY) {
														
 
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+			 "tdb_open: can't open tdb %s write-only\n", name);
														
 
															+		errno = EINVAL;
														
 
															+		goto fail;
														
 
															+	}
														
 
															+
														
 
															+	if ((open_flags & O_ACCMODE) == O_RDONLY) {
														
 
															+		tdb->read_only = 1;
														
 
															+		/* read only databases don't do locking */
														
 
															+		tdb->flags |= TDB_NOLOCK;
														
 
															+	}
														
 
															+
														
 
															+	/* internal databases don't mmap or lock */
														
 
															+	if (tdb->flags & TDB_INTERNAL) {
														
 
															+		tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
														
 
															+		if (tdb_new_database(tdb) != 0) {
														
 
															+			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+				 "tdb_open: tdb_new_database failed!");
														
 
															+			goto fail;
														
 
															+		}
														
 
															+		TEST_IT(tdb->flags & TDB_CONVERT);
														
 
															+		goto internal;
														
 
															+	}
														
 
															+
														
 
															+	if ((tdb->fd = open(name, open_flags, mode)) == -1) {
														
 
															+		tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv,
														
 
															+			 "tdb_open: could not open file %s: %s\n",
														
 
															+			 name, strerror(errno));
														
 
															+		goto fail;	/* errno set by open(2) */
														
 
															+	}
														
 
															+
														
 
															+	/* on exec, don't inherit the fd */
														
 
															+	v = fcntl(tdb->fd, F_GETFD, 0);
														
 
															+        fcntl(tdb->fd, F_SETFD, v | FD_CLOEXEC);
														
 
															+
														
 
															+	/* ensure there is only one process initialising at once */
														
 
															+	if (tdb_lock_open(tdb) == -1) {
														
 
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+			 "tdb_open: failed to get open lock on %s: %s\n",
														
 
															+			 name, strerror(errno));
														
 
															+		goto fail;	/* errno set by tdb_brlock */
														
 
															+	}
														
 
															+
														
 
															+	errno = 0;
														
 
															+	if (read(tdb->fd, &tdb->header, sizeof(tdb->header)) != sizeof(tdb->header)
														
 
															+	    || strcmp(tdb->header.magic_food, TDB_MAGIC_FOOD) != 0) {
														
 
															+		if (!(open_flags & O_CREAT) || tdb_new_database(tdb) == -1) {
														
 
															+			if (errno == 0) {
														
 
															+				errno = EIO; /* ie bad format or something */
														
 
															+			}
														
 
															+			goto fail;
														
 
															+		}
														
 
															+	} else if (tdb->header.version != TDB_VERSION) {
														
 
															+		if (tdb->header.version == bswap_64(TDB_VERSION))
														
 
															+			tdb->flags |= TDB_CONVERT;
														
 
															+		else {
														
 
															+			/* wrong version */
														
 
															+			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+				 "tdb_open: %s is unknown version 0x%llx\n",
														
 
															+				 name, (long long)tdb->header.version);
														
 
															+			errno = EIO;
														
 
															+			goto fail;
														
 
															+		}
														
 
															+	}
														
 
															+
														
 
															+	tdb_convert(tdb, &tdb->header, sizeof(tdb->header));
														
 
															+	hash_test = TDB_HASH_MAGIC;
														
 
															+	hash_test = tdb->khash(&hash_test, sizeof(hash_test),
														
 
															+			       tdb->header.hash_seed, tdb->hash_priv);
														
 
															+	if (tdb->header.hash_test != hash_test) {
														
 
															+		/* wrong hash variant */
														
 
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+			 "tdb_open: %s uses a different hash function\n",
														
 
															+			 name);
														
 
															+		errno = EIO;
														
 
															+		goto fail;
														
 
															+	}
														
 
															+
														
 
															+	if (fstat(tdb->fd, &st) == -1)
														
 
															+		goto fail;
														
 
															+
														
 
															+	/* Is it already in the open list?  If so, fail. */
														
 
															+	if (tdb_already_open(st.st_dev, st.st_ino)) {
														
 
															+		/* FIXME */
														
 
															+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+			 "tdb_open: %s (%d,%d) is already open in this process\n",
														
 
															+			 name, (int)st.st_dev, (int)st.st_ino);
														
 
															+		errno = EBUSY;
														
 
															+		goto fail;
														
 
															+	}
														
 
															+
														
 
															+	tdb->name = strdup(name);
														
 
															+	if (!tdb->name) {
														
 
															+		errno = ENOMEM;
														
 
															+		goto fail;
														
 
															+	}
														
 
															+
														
 
															+	tdb->map_size = st.st_size;
														
 
															+	tdb->device = st.st_dev;
														
 
															+	tdb->inode = st.st_ino;
														
 
															+	tdb_io_init(tdb);
														
 
															+	tdb_mmap(tdb);
														
 
															+
														
 
															+ internal:
														
 
															+	/* Internal (memory-only) databases skip all the code above to
														
 
															+	 * do with disk files, and resume here by releasing their
														
 
															+	 * open lock and hooking into the active list. */
														
 
															+	tdb_unlock_open(tdb);
														
 
															+	tdb->last_zone = random_free_zone(tdb);
														
 
															+	tdb->next = tdbs;
														
 
															+	tdbs = tdb;
														
 
															+	return tdb;
														
 
															+
														
 
															+ fail:
														
 
															+	save_errno = errno;
														
 
															+
														
 
															+	if (!tdb)
														
 
															+		return NULL;
														
 
															+
														
 
															+#ifdef TDB_TRACE
														
 
															+	close(tdb->tracefd);
														
 
															+#endif
														
 
															+	if (tdb->map_ptr) {
														
 
															+		if (tdb->flags & TDB_INTERNAL) {
														
 
															+			free(tdb->map_ptr);
														
 
															+		} else
														
 
															+			tdb_munmap(tdb);
														
 
															+	}
														
 
															+	free((char *)tdb->name);
														
 
															+	if (tdb->fd != -1)
														
 
															+		if (close(tdb->fd) != 0)
														
 
															+			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
														
 
															+				 "tdb_open: failed to close tdb->fd"
														
 
															+				 " on error!\n");
														
 
															+	free(tdb);
														
 
															+	errno = save_errno;
														
 
															+	return NULL;
														
 
															+}
														
 
															+
														
 
															+static int tdb_key_compare(TDB_DATA key, TDB_DATA data, void *private_data)
														
 
															+{
														
 
															+	return memcmp(data.dptr, key.dptr, data.dsize) == 0;
														
 
															+}
														
 
															+
														
 
															+static void unlock_lists(struct tdb_context *tdb,
														
 
															+			 uint64_t start, uint64_t end, int ltype)
														
 
															+{
														
 
															+	do {
														
 
															+		tdb_unlock_list(tdb, start, ltype);
														
 
															+		start = (start + ((1ULL << tdb->header.v.hash_bits) - 1))
														
 
															+			& ((1ULL << tdb->header.v.hash_bits) - 1);
														
 
															+	} while (start != end);
														
 
															+}
														
 
															+
														
 
/* FIXME: Return header copy? */
/* Find the record for key, locking hash buckets as we probe.
 *
 * Probing starts at the bucket selected by the low hash_bits of hash and
 * moves to the next bucket (wrapping) until the key or an empty bucket is
 * found.  Each bucket visited is locked with ltype before it is read.
 *
 * Returns TDB_OFF_ERR on error, 0 if an empty bucket was reached (key not
 * present), otherwise the value derived from the matching bucket entry.
 * On the 0 and match returns the locks on buckets *start to *end (where
 * the search stopped) are still held and the caller must release them.
 * On the error paths that go through unlock_err they are released here.
 *
 * *room is written only on a match: data length plus extra padding of the
 * record found. */
static tdb_off_t find_bucket_and_lock(struct tdb_context *tdb,
				      const struct tdb_data *key,
				      uint64_t hash,
				      uint64_t *start,
				      uint64_t *end,
				      uint64_t *room,
				      int ltype)
{
	/* NOTE(review): hextra is only consumed inside the #if 0 block below. */
	uint64_t hextra;
	tdb_off_t off;

	/* hash_bits might be out of date... */
again:
	*start = *end = hash & ((1ULL << tdb->header.v.hash_bits) - 1);
	hextra = hash >> tdb->header.v.hash_bits;

	/* FIXME: can we avoid locks for some fast paths? */
	/* Nothing is held yet, so a failure here returns without unlocking. */
	if (tdb_lock_list(tdb, *end, ltype, TDB_LOCK_WAIT) == -1)
		return TDB_OFF_ERR;

	/* We only need to check this for first lock. */
	/* A non-zero update_header() result means the bucket was computed
	 * from stale header values: drop the lock and recompute. */
	if (unlikely(update_header(tdb))) {
		tdb_unlock_list(tdb, *end, ltype);
		goto again;
	}

	/* Read the entry of the bucket currently locked at *end; a read
	 * error ends the loop and falls through to unlock_err. */
	while ((off = tdb_read_off(tdb, tdb->header.v.hash_off
				   + *end * sizeof(tdb_off_t)))
	       != TDB_OFF_ERR) {
		struct tdb_used_record pad, *r;
		uint64_t keylen, next;

		/* Didn't find it? */
		/* Empty bucket: return with *start..*end still locked. */
		if (!off)
			return 0;

#if 0 /* FIXME: Check other bits. */
		unsigned int bits, bitmask, hoffextra;
		/* Bottom three bits show how many extra hash bits. */
		bits = (off & ((1 << TDB_EXTRA_HASHBITS_NUM) - 1)) + 1;
		bitmask = (1 << bits)-1;
		hoffextra = ((off >> TDB_EXTRA_HASHBITS_NUM) & bitmask);
		if ((hextra & bitmask) != hoffextra)
			goto lock_next;
#endif

		/* Fetch the record header stored at the bucket's offset. */
		r = tdb_get(tdb, off, &pad, sizeof(*r));
		if (!r)
			goto unlock_err;

		if (rec_magic(r) != TDB_MAGIC) {
			tdb->ecode = TDB_ERR_CORRUPT;
			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
				 "find_bucket_and_lock: bad magic 0x%llx"
				 " at offset %llu!\n",
				 (long long)rec_magic(r), (long long)off);
			goto unlock_err;
		}

		/* FIXME: check extra bits in header! */
		/* Cheap rejection: a different key length cannot match. */
		keylen = rec_key_length(r);
		if (keylen != key->dsize)
			goto lock_next;

		/* Compare the stored key bytes (which follow the header)
		 * with the caller's key via tdb_key_compare. */
		switch (tdb_parse_data(tdb, *key, off + sizeof(*r), key->dsize,
				       tdb_key_compare, NULL)) {
		case 1:
			/* Match! */
			*room = rec_data_length(r) + rec_extra_padding(r);
			/* NOTE(review): the record was read at the unshifted
			 * off above, yet the shifted value is returned.
			 * Confirm this is intended while bucket entries are
			 * written without encoded extra hash bits. */
			return off >> TDB_EXTRA_HASHBITS_NUM;
		case 0:
			break;
		default:
			goto unlock_err;
		}

	lock_next:
		/* Lock next bucket. */
		/* FIXME: We can deadlock if this wraps! */
		next = (*end + 1) & ((1ULL << tdb->header.v.hash_bits) - 1);
		/* Back at the first bucket: every bucket was occupied. */
		if (next == *start) {
			tdb->ecode = TDB_ERR_CORRUPT;
			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
				 "find_bucket_and_lock: full hash table!\n");
			goto unlock_err;
		}
		if (tdb_lock_list(tdb, next, ltype, TDB_LOCK_WAIT) == -1)
			goto unlock_err;
		*end = next;
	}

unlock_err:
	TEST_IT(*end < *start);
	unlock_lists(tdb, *start, *end, ltype);
	return TDB_OFF_ERR;
}
														
 
															+
														
 
															+static int update_rec_hdr(struct tdb_context *tdb,
														
 
															+			  tdb_off_t off,
														
 
															+			  tdb_len_t keylen,
														
 
															+			  tdb_len_t datalen,
														
 
															+			  tdb_len_t room,
														
 
															+			  uint64_t h)
														
 
															+{
														
 
															+	struct tdb_used_record rec;
														
 
															+
														
 
															+	if (set_header(tdb, &rec, keylen, datalen, room - datalen, h))
														
 
															+		return -1;
														
 
															+
														
 
															+	return tdb_write_convert(tdb, off, &rec, sizeof(rec));
														
 
															+}
														
 
															+
														
 
															+/* If we fail, others will try after us. */
														
 
															+static void enlarge_hash(struct tdb_context *tdb)
														
 
															+{
														
 
															+	tdb_off_t newoff, i;
														
 
															+	uint64_t h, num = 1ULL << tdb->header.v.hash_bits;
														
 
															+	struct tdb_used_record pad, *r;
														
 
															+
														
 
															+	/* FIXME: We should do this without holding locks throughout. */
														
 
															+	if (tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false) == -1)
														
 
															+		return;
														
 
															+
														
 
															+	if (unlikely(update_header(tdb))) {
														
 
															+		/* Someone else enlarged for us?  Nothing to do. */
														
 
															+		if ((1ULL << tdb->header.v.hash_bits) != num)
														
 
															+			goto unlock;
														
 
															+	}
														
 
															+
														
 
															+	newoff = alloc(tdb, 0, num * 2, 0, false);
														
 
															+	if (unlikely(newoff == TDB_OFF_ERR))
														
 
															+		goto unlock;
														
 
															+	if (unlikely(newoff == 0)) {
														
 
															+		if (tdb_expand(tdb, 0, num * 2, false) == -1)
														
 
															+			goto unlock;
														
 
															+		newoff = alloc(tdb, 0, num * 2, 0, false);
														
 
															+		if (newoff == TDB_OFF_ERR || newoff == 0)
														
 
															+			goto unlock;
														
 
															+	}
														
 
															+
														
 
															+	/* FIXME: If the space before is empty, we know this is in its ideal
														
 
															+	 * location.  We can steal a bit from the pointer to avoid rehash. */
														
 
															+	for (i = tdb_find_nonzero_off(tdb, tdb->header.v.hash_off, num);
														
 
															+	     i < num;
														
 
															+	     i += tdb_find_nonzero_off(tdb, tdb->header.v.hash_off
														
 
															+				       + i*sizeof(tdb_off_t), num - i)) {
														
 
															+		tdb_off_t off;
														
 
															+		off = tdb_read_off(tdb, tdb->header.v.hash_off
														
 
															+				   + i*sizeof(tdb_off_t));
														
 
															+		if (unlikely(off == TDB_OFF_ERR))
														
 
															+			goto unlock;
														
 
															+		if (unlikely(!off)) {
														
 
															+			tdb->ecode = TDB_ERR_CORRUPT;
														
 
															+			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
														
 
															+				 "find_bucket_and_lock: zero hash bucket!\n");
														
 
															+			goto unlock;
														
 
															+		}
														
 
															+		h = hash_record(tdb, off);
														
 
															+		/* FIXME: Encode extra hash bits! */
														
 
															+		if (tdb_write_off(tdb, newoff
														
 
															+				  + (h & ((num * 2) - 1)) * sizeof(uint64_t),
														
 
															+				  off) == -1)
														
 
															+			goto unlock;
														
 
															+	}
														
 
															+
														
 
															+	/* Free up old hash. */
														
 
															+	r = tdb_get(tdb, tdb->header.v.hash_off, &pad, sizeof(*r));
														
 
															+	if (!r)
														
 
															+		goto unlock;
														
 
															+	add_free_record(tdb, tdb->header.v.hash_off,
														
 
															+			rec_data_length(r) + rec_extra_padding(r));
														
 
															+
														
 
															+	/* Now we write the modified header. */
														
 
															+	tdb->header.v.generation++;
														
 
															+	tdb->header.v.hash_bits++;
														
 
															+	tdb->header.v.hash_off = newoff;
														
 
															+	tdb_write_convert(tdb, offsetof(struct tdb_header, v),
														
 
															+			  &tdb->header.v, sizeof(tdb->header.v));
														
 
															+unlock:
														
 
															+	tdb_allrecord_unlock(tdb, F_WRLCK);
														
 
															+}
														
 
															+
														
 
															+int tdb_store(struct tdb_context *tdb,
														
 
															+	      struct tdb_data key, struct tdb_data dbuf, int flag)
														
 
															+{
														
 
															+	tdb_off_t new_off, off, start, end, room;
														
 
															+	uint64_t h;
														
 
															+	bool growing = false;
														
 
															+
														
 
															+	h = tdb_hash(tdb, key.dptr, key.dsize);
														
 
															+	off = find_bucket_and_lock(tdb, &key, h, &start, &end, &room, F_WRLCK);
														
 
															+	if (off == TDB_OFF_ERR)
														
 
															+		return -1;
														
 
															+
														
 
															+	/* Now we have lock on this hash bucket. */
														
 
															+	if (flag == TDB_INSERT) {
														
 
															+		if (off) {
														
 
															+			tdb->ecode = TDB_ERR_EXISTS;
														
 
															+			goto fail;
														
 
															+		}
														
 
															+	} else {
														
 
															+		if (off) {
														
 
															+			if (room >= key.dsize + dbuf.dsize) {
														
 
															+				new_off = off;
														
 
															+				if (update_rec_hdr(tdb, off,
														
 
															+						   key.dsize, dbuf.dsize,
														
 
															+						   room, h))
														
 
															+					goto fail;
														
 
															+				goto write;
														
 
															+			}
														
 
															+			/* FIXME: See if right record is free? */
														
 
															+			/* Hint to allocator that we've realloced. */
														
 
															+			growing = true;
														
 
															+		} else {
														
 
															+			if (flag == TDB_MODIFY) {
														
 
															+				/* if the record doesn't exist and we
														
 
															+				   are in TDB_MODIFY mode then we should fail
														
 
															+				   the store */
														
 
															+				tdb->ecode = TDB_ERR_NOEXIST;
														
 
															+				goto fail;
														
 
															+			}
														
 
															+		}
														
 
															+	}
														
 
															+
														
 
															+	/* Allocate a new record. */
														
 
															+	new_off = alloc(tdb, key.dsize, dbuf.dsize, h, growing);
														
 
															+	if (new_off == 0) {
														
 
															+		unlock_lists(tdb, start, end, F_WRLCK);
														
 
															+		/* Expand, then try again... */
														
 
															+		if (tdb_expand(tdb, key.dsize, dbuf.dsize, growing) == -1)
														
 
															+			return -1;
														
 
															+		return tdb_store(tdb, key, dbuf, flag);
														
 
															+	}
														
 
															+
														
 
															+	/* We didn't like the existing one: remove it. */
														
 
															+	if (off) {
														
 
															+		add_free_record(tdb, off, sizeof(struct tdb_used_record)
														
 
															+				+ key.dsize + room);
														
 
															+	}
														
 
															+
														
 
															+write:
														
 
															+	off = tdb->header.v.hash_off + end * sizeof(tdb_off_t);
														
 
															+	/* FIXME: Encode extra hash bits! */
														
 
															+	if (tdb_write_off(tdb, off, new_off) == -1)
														
 
															+		goto fail;
														
 
															+
														
 
															+	off = new_off + sizeof(struct tdb_used_record);
														
 
															+	if (tdb->methods->write(tdb, off, key.dptr, key.dsize) == -1)
														
 
															+		goto fail;
														
 
															+	off += key.dsize;
														
 
															+	if (tdb->methods->write(tdb, off, dbuf.dptr, dbuf.dsize) == -1)
														
 
															+		goto fail;
														
 
															+
														
 
															+	/* FIXME: tdb_increment_seqnum(tdb); */
														
 
															+	unlock_lists(tdb, start, end, F_WRLCK);
														
 
															+
														
 
															+	/* By simple trial and error, this roughly approximates a 60%
														
 
															+	 * full measure. */
														
 
															+	if (unlikely(end - start > 4 * tdb->header.v.hash_bits - 32))
														
 
															+		enlarge_hash(tdb);
														
 
															+
														
 
															+	return 0;
														
 
															+
														
 
															+fail:
														
 
															+	unlock_lists(tdb, start, end, F_WRLCK);
														
 
															+	return -1;
														
 
															+}
														
 
															+
														
 
															+struct tdb_data tdb_fetch(struct tdb_context *tdb, struct tdb_data key)
														
 
															+{
														
 
															+	tdb_off_t off, start, end, room;
														
 
															+	uint64_t h;
														
 
															+	struct tdb_used_record pad, *r;
														
 
															+	struct tdb_data ret;
														
 
															+
														
 
															+	h = tdb_hash(tdb, key.dptr, key.dsize);
														
 
															+	off = find_bucket_and_lock(tdb, &key, h, &start, &end, &room, F_RDLCK);
														
 
															+	if (off == TDB_OFF_ERR)
														
 
															+		return tdb_null;
														
 
															+
														
 
															+	if (!off) {
														
 
															+		unlock_lists(tdb, start, end, F_RDLCK);
														
 
															+		tdb->ecode = TDB_SUCCESS;
														
 
															+		return tdb_null;
														
 
															+	}
														
 
															+
														
 
															+	r = tdb_get(tdb, off, &pad, sizeof(*r));
														
 
															+	if (!r) {
														
 
															+		unlock_lists(tdb, start, end, F_RDLCK);
														
 
															+		return tdb_null;
														
 
															+	}
														
 
															+
														
 
															+	ret.dsize = rec_data_length(r);
														
 
															+	ret.dptr = tdb_alloc_read(tdb, off + sizeof(*r) + key.dsize,
														
 
															+				  ret.dsize);
														
 
															+	unlock_lists(tdb, start, end, F_RDLCK);
														
 
															+	return ret;
														
 
															+}
														
 
															+
														
 
															+static int hash_add(struct tdb_context *tdb, uint64_t h, tdb_off_t off)
														
 
															+{
														
 
															+	tdb_off_t i, hoff, len, num;
														
 
															+
														
 
															+	i = (h & ((1ULL << tdb->header.v.hash_bits) - 1));
														
 
															+	hoff = tdb->header.v.hash_off + i * sizeof(tdb_off_t);
														
 
															+	len = (1ULL << tdb->header.v.hash_bits) - i;
														
 
															+
														
 
															+	/* Look for next space. */
														
 
															+	num = tdb_find_zero_off(tdb, hoff, len);
														
 
															+	if (unlikely(num == len)) {
														
 
															+		hoff = tdb->header.v.hash_off;
														
 
															+		len = (1ULL << tdb->header.v.hash_bits);
														
 
															+		num = tdb_find_zero_off(tdb, hoff, len);
														
 
															+		if (i == len)
														
 
															+			return -1;
														
 
															+	}
														
 
															+	/* FIXME: Encode extra hash bits! */
														
 
															+	return tdb_write_off(tdb, hoff + num * sizeof(tdb_off_t), off);
														
 
															+}
														
 
															+
														
 
															+static int unlink_used_record(struct tdb_context *tdb, tdb_off_t chain,
														
 
															+			      uint64_t *extra_locks)
														
 
															+{
														
 
															+	tdb_off_t num, len, i, hoff;
														
 
															+
														
 
															+	/* FIXME: Maybe lock more in search?  Maybe don't lock if scan
														
 
															+	 * finds none? */
														
 
															+again:
														
 
															+	len = (1ULL << tdb->header.v.hash_bits) - (chain + 1);
														
 
															+	hoff = tdb->header.v.hash_off + (chain + 1) * sizeof(tdb_off_t);
														
 
															+	num = tdb_find_zero_off(tdb, hoff, len);
														
 
															+
														
 
															+	/* We want to lock the zero entry, too.  In the wrap case,
														
 
															+	 * this locks one extra.  That's harmless. */
														
 
															+	num++;
														
 
															+
														
 
															+	for (i = chain + 1; i < chain + 1 + num; i++) {
														
 
															+		if (tdb_lock_list(tdb, i, F_WRLCK, TDB_LOCK_WAIT) == -1) {
														
 
															+			if (i != chain + 1)
														
 
															+				unlock_lists(tdb, chain + 1, i-1, F_WRLCK);
														
 
															+			return -1;
														
 
															+		}
														
 
															+	}
														
 
															+
														
 
															+	/* The wrap case: we need those locks out of order! */
														
 
															+	if (unlikely(num == len + 1)) {
														
 
															+		*extra_locks = tdb_find_zero_off(tdb, tdb->header.v.hash_off,
														
 
															+						 1ULL << tdb->header.v.hash_bits);
														
 
															+		(*extra_locks)++;
														
 
															+		for (i = 0; i < *extra_locks; i++) {
														
 
															+			if (tdb_lock_list(tdb, i, F_WRLCK, TDB_LOCK_NOWAIT)) {
														
 
															+				/* Failed.  Caller must lock in order. */
														
 
															+				if (i)
														
 
															+					unlock_lists(tdb, 0, i-1, F_WRLCK);
														
 
															+				unlock_lists(tdb, chain + 1, chain + num,
														
 
															+					     F_WRLCK);
														
 
															+				return 1;
														
 
															+			}
														
 
															+		}
														
 
															+		num += *extra_locks;
														
 
															+	}
														
 
															+
														
 
															+	/* Now we have the locks, be certain that offset is still 0! */
														
 
															+	hoff = tdb->header.v.hash_off
														
 
															+		+ (((chain + num) * sizeof(tdb_off_t))
														
 
															+		   & ((1ULL << tdb->header.v.hash_bits) - 1));
														
 
															+
														
 
															+	if (unlikely(tdb_read_off(tdb, hoff) != 0)) {
														
 
															+		unlock_lists(tdb, chain + 1, chain + num, F_WRLCK);
														
 
															+		goto again;
														
 
															+	}
														
 
															+
														
 
															+	/* OK, all locked.  Unlink first one. */
														
 
															+	hoff = tdb->header.v.hash_off + chain * sizeof(tdb_off_t);
														
 
															+	if (tdb_write_off(tdb, hoff, 0) == -1)
														
 
															+		goto unlock_err;
														
 
															+
														
 
															+	/* Rehash the rest. */
														
 
															+	for (i = 1; i < num; i++) {
														
 
															+		tdb_off_t off;
														
 
															+		uint64_t h;
														
 
															+
														
 
															+		hoff = tdb->header.v.hash_off
														
 
															+			+ (((chain + i) * sizeof(tdb_off_t))
														
 
															+			   & ((1ULL << tdb->header.v.hash_bits) - 1));
														
 
															+		off = tdb_read_off(tdb, hoff);
														
 
															+		if (unlikely(off == TDB_OFF_ERR))
														
 
															+			goto unlock_err;
														
 
															+
														
 
															+		/* Maybe use a bit to indicate it is in ideal place? */
														
 
															+		h = hash_record(tdb, off);
														
 
															+		/* Is it happy where it is? */
														
 
															+		if ((h & ((1ULL << tdb->header.v.hash_bits)-1)) == (chain + i))
														
 
															+			continue;
														
 
															+
														
 
															+		/* Remove it. */
														
 
															+		if (tdb_write_off(tdb, hoff, 0) == -1)
														
 
															+			goto unlock_err;
														
 
															+
														
 
															+		/* Rehash it. */
														
 
															+		if (hash_add(tdb, h, off) == -1)
														
 
															+			goto unlock_err;
														
 
															+	}
														
 
															+	unlock_lists(tdb, chain + 1, chain + num, F_WRLCK);
														
 
															+	return 0;
														
 
															+
														
 
															+unlock_err:
														
 
															+	unlock_lists(tdb, chain + 1, chain + num, F_WRLCK);
														
 
															+	return -1;
														
 
															+}
														
 
															+
														
 
															+int tdb_delete(struct tdb_context *tdb, struct tdb_data key)
														
 
															+{
														
 
															+	tdb_off_t off, start, end, room, extra_locks = 0;
														
 
															+	uint64_t h;
														
 
															+	int ret;
														
 
															+
														
 
															+	h = tdb_hash(tdb, key.dptr, key.dsize);
														
 
															+	off = find_bucket_and_lock(tdb, &key, h, &start, &end, &room, F_WRLCK);
														
 
															+	if (off == TDB_OFF_ERR)
														
 
															+		return -1;
														
 
															+
														
 
															+	if (off == 0) {
														
 
															+		unlock_lists(tdb, start, end, F_WRLCK);
														
 
															+		tdb->ecode = TDB_ERR_NOEXIST;
														
 
															+		return -1;
														
 
															+	}
														
 
															+
														
 
															+	ret = unlink_used_record(tdb, end, &extra_locks);
														
 
															+	if (unlikely(ret == 1)) {
														
 
															+		unsigned int i;
														
 
															+
														
 
															+		unlock_lists(tdb, start, end, F_WRLCK);
														
 
															+
														
 
															+		/* We need extra locks at the start. */
														
 
															+		for (i = 0; i < extra_locks; i++) {
														
 
															+			if (tdb_lock_list(tdb, i, F_WRLCK, TDB_LOCK_WAIT)) {
														
 
															+				if (i)
														
 
															+					unlock_lists(tdb, 0, i-1, F_WRLCK);
														
 
															+				return -1;
														
 
															+			}
														
 
															+		}
														
 
															+		/* Try again now we're holding more locks. */
														
 
															+		ret = tdb_delete(tdb, key);
														
 
															+		unlock_lists(tdb, 0, i, F_WRLCK);
														
 
															+		return ret;
														
 
															+	}
														
 
															+	unlock_lists(tdb, start, end, F_WRLCK);
														
 
															+	return ret;
														
 
															+}
														
 
															+
														
 
															+int tdb_close(struct tdb_context *tdb)
														
 
															+{
														
 
															+	struct tdb_context **i;
														
 
															+	int ret = 0;
														
 
															+
														
 
															+	/* FIXME:
														
 
															+	if (tdb->transaction) {
														
 
															+		tdb_transaction_cancel(tdb);
														
 
															+	}
														
 
															+	*/
														
 
															+	tdb_trace(tdb, "tdb_close");
														
 
															+
														
 
															+	if (tdb->map_ptr) {
														
 
															+		if (tdb->flags & TDB_INTERNAL)
														
 
															+			free(tdb->map_ptr);
														
 
															+		else
														
 
															+			tdb_munmap(tdb);
														
 
															+	}
														
 
															+	free((char *)tdb->name);
														
 
															+	if (tdb->fd != -1) {
														
 
															+		ret = close(tdb->fd);
														
 
															+		tdb->fd = -1;
														
 
															+	}
														
 
															+	free(tdb->lockrecs);
														
 
															+
														
 
															+	/* Remove from contexts list */
														
 
															+	for (i = &tdbs; *i; i = &(*i)->next) {
														
 
															+		if (*i == tdb) {
														
 
															+			*i = tdb->next;
														
 
															+			break;
														
 
															+		}
														
 
															+	}
														
 
															+
														
 
															+#ifdef TDB_TRACE
														
 
															+	close(tdb->tracefd);
														
 
															+#endif
														
 
															+	free(tdb);
														
 
															+
														
 
															+	return ret;
														
 
															+}
														
--- a/ccan/tdb2/tdb2.h
+++ b/ccan/tdb2/tdb2.h
@@ -0,0 +1,143 @@
 
															+#ifndef CCAN_TDB2_H
														
 
															+#define CCAN_TDB2_H
														
 
															+
														
 
															+/* 
														
 
															+   Unix SMB/CIFS implementation.
														
 
															+
														
 
															+   trivial database library
														
 
															+
														
 
															+   Copyright (C) Andrew Tridgell 1999-2004
														
 
															+   
														
 
															+     ** NOTE! The following LGPL license applies to the tdb
														
 
															+     ** library. This does NOT imply that all of Samba is released
														
 
															+     ** under the LGPL
														
 
															+   
														
 
															+   This library is free software; you can redistribute it and/or
														
 
															+   modify it under the terms of the GNU Lesser General Public
														
 
															+   License as published by the Free Software Foundation; either
														
 
															+   version 3 of the License, or (at your option) any later version.
														
 
															+
														
 
															+   This library is distributed in the hope that it will be useful,
														
 
															+   but WITHOUT ANY WARRANTY; without even the implied warranty of
														
 
															+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
														
 
															+   Lesser General Public License for more details.
														
 
															+
														
 
															+   You should have received a copy of the GNU Lesser General Public
														
 
															+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
														
 
															+*/
														
 
															+
														
 
															+#ifdef  __cplusplus
														
 
															+extern "C" {
														
 
															+#endif
														
 
															+
														
 
															+#ifndef _SAMBA_BUILD_
														
 
															+/* For mode_t */
														
 
															+#include <sys/types.h>
														
 
															+/* For the S_* mode bits.  (The O_* open flags come from <fcntl.h>, which callers must include themselves.) */
														
 
															+#include <sys/stat.h>
														
 
															+/* For sig_atomic_t. */
														
 
															+#include <signal.h>
														
 
															+/* For uint64_t */
														
 
															+#include <stdint.h>
														
 
															+#endif
														
 
															+
														
 
															+/* flags to tdb_store(): values for its final "flag" argument */
														
 
															+#define TDB_REPLACE 1		/* Unused */
														
 
															+#define TDB_INSERT 2 		/* Don't overwrite an existing entry */
														
 
															+#define TDB_MODIFY 3		/* Don't create a new entry; only modify an existing one */
														
 
															+
														
 
															+/* flags for tdb_open(): single-bit values for its tdb_flags argument */
														
 
															+#define TDB_DEFAULT 0 /* no flags set; just a readability placeholder */
														
 
															+#define TDB_CLEAR_IF_FIRST 1 /* NOTE(review): undocumented here; presumably clears the db when we are its first opener - confirm */
														
 
															+#define TDB_INTERNAL 2 /* don't store on disk (tdb_close() then free()s map_ptr instead of unmapping it) */
														
 
															+#define TDB_NOLOCK   4 /* don't do any locking */
														
 
															+#define TDB_NOMMAP   8 /* don't use mmap */
														
 
															+#define TDB_CONVERT 16 /* convert endian (internal use) */
														
 
															+#define TDB_BIGENDIAN 32 /* header is big-endian (internal use) */
														
 
															+#define TDB_NOSYNC   64 /* don't use synchronous transactions */
														
 
															+#define TDB_SEQNUM   128 /* maintain a sequence number */
														
 
															+#define TDB_VOLATILE   256 /* activate the per-hashchain freelist, default 5 (NOTE(review): what "5" counts is not stated here) */
														
 
															+#define TDB_ALLOW_NESTING 512 /* allow transactions to nest */
														
 
															+#define TDB_DISALLOW_NESTING 1024 /* don't allow transactions to nest */
														
 
															+
														
 
															+/* error codes: TDB_SUCCESS is 0, the rest count up from 1 in the order listed */
														
 
															+enum TDB_ERROR {TDB_SUCCESS=0, TDB_ERR_CORRUPT, TDB_ERR_IO, TDB_ERR_LOCK, 
														
 
															+		TDB_ERR_OOM, TDB_ERR_EXISTS, TDB_ERR_NOLOCK, TDB_ERR_LOCK_TIMEOUT,
														
 
															+		TDB_ERR_NOEXIST, TDB_ERR_EINVAL, TDB_ERR_RDONLY,
														
 
															+		TDB_ERR_NESTING};
														
 
															+
														
 
															+/* debugging uses one of the following levels (0 = fatal ... 3 = trace); this is the level argument of tdb_logfn_t */
														
 
															+enum tdb_debug_level {TDB_DEBUG_FATAL = 0, TDB_DEBUG_ERROR, 
														
 
															+		      TDB_DEBUG_WARNING, TDB_DEBUG_TRACE};
														
 
															+
														
 
															+typedef struct tdb_data { /* pointer/length pair, passed by value as key and data to tdb_fetch/tdb_store/tdb_delete */
														
 
															+	unsigned char *dptr; /* start of the bytes */
														
 
															+	size_t dsize; /* presumably the number of bytes at dptr - confirm */
														
 
															+} TDB_DATA;
														
 
															+
														
 
															+#ifndef PRINTF_ATTRIBUTE /* keep any definition supplied before this header */
														
 
															+#if (__GNUC__ >= 3)
														
 
															+/** Use gcc attribute to check printf fns.  a1 is the 1-based index of
														
 
															+ * the parameter containing the format, and a2 the index of the first
														
 
															+ * variadic argument. Note that some gcc 2.x versions don't handle this
														
 
															+ * properly, so it is only enabled for gcc 3 and later. **/
														
 
															+#define PRINTF_ATTRIBUTE(a1, a2) __attribute__ ((format (__printf__, a1, a2)))
														
 
															+#else /* not gcc >= 3: expands to nothing */
														
 
															+#define PRINTF_ATTRIBUTE(a1, a2)
														
 
															+#endif /* __GNUC__ >= 3 */
														
 
															+#endif /* PRINTF_ATTRIBUTE */
														
 
															+
														
 
															+struct tdb_context; /* only forward-declared in this header */
														
 
															+
														
 
															+/* FIXME: Make typesafe.  Log callback: printf-style, the format is parameter 4 and its varargs start at parameter 5. */
														
 
															+typedef void (*tdb_logfn_t)(struct tdb_context *, enum tdb_debug_level, void *priv, const char *, ...) PRINTF_ATTRIBUTE(4, 5);
														
 
															+typedef uint64_t (*tdb_hashfn_t)(const void *key, size_t len, uint64_t seed,
														
 
															+				 void *priv); /* hash callback returning 64 bits; presumably hashes len bytes at key, mixed with seed - confirm */
														
 
															+
														
 
															+enum tdb_attribute_type { /* tag stored in tdb_attribute_base.attr */
														
 
															+	TDB_ATTRIBUTE_LOG = 0, /* marks a struct tdb_attribute_log */
														
 
															+	TDB_ATTRIBUTE_HASH = 1 /* marks a struct tdb_attribute_hash */
														
 
															+};
														
 
															+
														
 
															+struct tdb_attribute_base { /* first member of every attribute struct */
														
 
															+	enum tdb_attribute_type attr; /* which kind of attribute this is */
														
 
															+	union tdb_attribute *next; /* following attribute; presumably NULL ends the chain - confirm */
														
 
															+};
														
 
															+
														
 
															+struct tdb_attribute_log { /* attribute carrying a logging callback */
														
 
															+	struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */
														
 
															+	tdb_logfn_t log_fn; /* printf-style callback, see tdb_logfn_t */
														
 
															+	void *log_private; /* presumably handed to log_fn as its priv argument - confirm */
														
 
															+};
														
 
															+
														
 
															+struct tdb_attribute_hash { /* attribute carrying a hash callback */
														
 
															+	struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */
														
 
															+	tdb_hashfn_t hash_fn; /* see tdb_hashfn_t */
														
 
															+	void *hash_private; /* presumably handed to hash_fn as its priv argument - confirm */
														
 
															+};
														
 
															+
														
 
															+union tdb_attribute { /* any one attribute; base.attr says which member is in use */
														
 
															+	struct tdb_attribute_base base; /* common header: every member starts with one */
														
 
															+	struct tdb_attribute_log log; /* when base.attr == TDB_ATTRIBUTE_LOG */
														
 
															+	struct tdb_attribute_hash hash; /* when base.attr == TDB_ATTRIBUTE_HASH */
														
 
															+};
														
 
															+		
														
 
															+struct tdb_context *tdb_open(const char *name, int tdb_flags, /* tdb_flags: the TDB_* open flags defined in this header */
														
 
															+			     int open_flags, mode_t mode, /* presumably passed on to open(2) - confirm */
														
 
															+			     union tdb_attribute *attributes); /* attributes are linked through base.next */
														
 
															+/* NOTE(review): the _info example calls tdb_open() with a hash size as its 2nd argument; that does not match the prototype above. */
														
 
															+struct tdb_data tdb_fetch(struct tdb_context *tdb, struct tdb_data key);
														
 
															+int tdb_delete(struct tdb_context *tdb, struct tdb_data key);
														
 
															+int tdb_store(struct tdb_context *tdb, struct tdb_data key, struct tdb_data dbuf, int flag); /* flag: TDB_REPLACE, TDB_INSERT or TDB_MODIFY */
														
 
															+int tdb_close(struct tdb_context *tdb); /* frees tdb; returns the result of closing its fd, or 0 if it had none */
														
 
															+int tdb_check(struct tdb_context *tdb,
														
 
															+	      int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
														
 
															+	      void *private_data); /* presumably handed back to check() on each call - confirm */
														
 
															+
														
 
															+extern struct tdb_data tdb_null; /* declaration only; the object itself is defined elsewhere */
														
 
															+
														
 
															+#ifdef  __cplusplus
														
 
															+}
														
 
															+#endif
														
 
															+
														
 
															+#endif /* tdb2.h */
														
--- a/ccan/tdb2/test/run-encode.c
+++ b/ccan/tdb2/test/run-encode.c
@@ -0,0 +1,40 @@
 
															+#include <ccan/tdb2/tdb.c>
														
 
															+#include <ccan/tdb2/free.c>
														
 
															+#include <ccan/tdb2/lock.c>
														
 
															+#include <ccan/tdb2/io.c>
														
 
															+#include <ccan/tap/tap.h>
														
 
															+
														
 
															+int main(int argc, char *argv[]) /* TAP test: set_header() must accept these lengths, and the rec_*() accessors must read them back. */
														
 
															+{
														
 
															+	unsigned int i;
														
 
															+	struct tdb_used_record rec; /* rewritten by every set_header() call */
														
 
															+	struct tdb_context tdb = { .log = null_log_fn, .log_priv = NULL }; /* only the log hooks are set; all other fields are zero */
														
 
															+
														
 
															+	plan_tests(64 + 32 + 48*6); /* one count per loop below; the last loop does 6 checks per pass */
														
 
															+
														
 
															+	/* We should be able to encode any data value. */
														
 
															+	for (i = 0; i < 64; i++)
														
 
															+		ok1(set_header(&tdb, &rec, 0, 1ULL << i, 1ULL << i, 0) == 0);
														
 
															+
														
 
															+	/* And any key and data with < 64 bits between them. */
														
 
															+	for (i = 0; i < 32; i++) {
														
 
															+		tdb_len_t dlen = 1ULL >> (63 - i), klen = 1ULL << i; /* NOTE(review): the shift is 32..63, so dlen is always 0; a left shift may have been intended - confirm against set_header()'s limits */
														
 
															+		ok1(set_header(&tdb, &rec, klen, dlen, klen + dlen, 0) == 0);
														
 
															+	}
														
 
															+
														
 
															+	/* We should neatly encode all values. */
														
 
															+	for (i = 0; i < 48; i++) {
														
 
															+		uint64_t h = 1ULL << (i < 11 ? 63 - i : 63 - 10); /* one bit, somewhere in bits 53..63 */
														
 
															+		uint64_t klen = 1ULL << (i < 16 ? i : 15); /* capped at 2^15 */
														
 
															+		uint64_t dlen = 1ULL << i; /* up to 2^47 */
														
 
															+		uint64_t xlen = 1ULL << (i < 32 ? i : 31); /* padding, capped at 2^31 */
														
 
															+		ok1(set_header(&tdb, &rec, klen, dlen, klen + dlen + xlen, h)
														
 
															+		    == 0);
														
 
															+		ok1(rec_key_length(&rec) == klen);
														
 
															+		ok1(rec_data_length(&rec) == dlen);
														
 
															+		ok1(rec_extra_padding(&rec) == xlen);
														
 
															+		ok1(rec_hash(&rec) == h);
														
 
															+		ok1(rec_magic(&rec) == TDB_MAGIC);
														
 
															+	}
														
 
															+	return exit_status();
														
 
															+}
														
--- a/ccan/tdb2/test/run-fls.c
+++ b/ccan/tdb2/test/run-fls.c
@@ -0,0 +1,36 @@
 
															+#include <ccan/tdb2/tdb.c>
														
 
															+#include <ccan/tdb2/free.c>
														
 
															+#include <ccan/tdb2/lock.c>
														
 
															+#include <ccan/tdb2/io.c>
														
 
															+#include <ccan/tap/tap.h>
														
 
															+
														
 
															+static unsigned int dumb_fls(uint64_t num) /* Reference implementation: 1-based position of the highest set bit of num, 0 when num is 0. */
														
 
															+{
														
 
															+	int i; /* signed so the loop can run down to -1 */
														
 
															+
														
 
															+	for (i = 63; i >= 0; i--) { /* scan from the top bit downwards */
														
 
															+		if (num & (1ULL << i))
														
 
															+			break;
														
 
															+	}
														
 
															+	return i + 1; /* i is -1 when no bit was set, giving 0 */
														
 
															+}
														
 
															+
														
 
															+int main(int argc, char *argv[]) /* TAP test: fls64() must agree with dumb_fls() for 0 and for every value with one or two bits set. */
														
 
															+{
														
 
															+	unsigned int i, j; /* the two bit positions being combined */
														
 
															+
														
 
															+	plan_tests(64 * 64 + 2); /* 64*64 (i, j) pairs plus the two zero checks */
														
 
															+
														
 
															+	ok1(fls64(0) == 0);
														
 
															+	ok1(dumb_fls(0) == 0);
														
 
															+
														
 
															+	for (i = 0; i < 64; i++) {
														
 
															+		for (j = 0; j < 64; j++) {
														
 
															+			uint64_t val = (1ULL << i) | (1ULL << j); /* a single bit when i == j */
														
 
															+			ok(fls64(val) == dumb_fls(val),
														
 
															+			   "%llu -> %u should be %u", (unsigned long long)val, /* %llu requires an unsigned long long argument */
														
 
															+			   fls64(val), dumb_fls(val));
														
 
															+		}
														
 
															+	}
														
 
															+	return exit_status();
														
 
															+}