Browse Source

Merge branch 'tdb2'

Rusty Russell 15 years ago
parent
commit
51a56b5262

+ 2 - 2
ccan/tdb/tools/Makefile

@@ -2,7 +2,7 @@ LDLIBS:=../../tdb.o ../../tally.o
 CFLAGS:=-I../../.. -Wall -O3 #-g -pg
 CFLAGS:=-I../../.. -Wall -O3 #-g -pg
 LDFLAGS:=-L../../..
 LDFLAGS:=-L../../..
 
 
-default: replay_trace tdbtorture tdbdump tdbtool starvation mktdb
+default: replay_trace tdbtorture tdbdump tdbtool starvation mktdb speed
 
 
 benchmark: replay_trace
 benchmark: replay_trace
 	@trap "rm -f /tmp/trace.$$$$" 0; for f in benchmarks/*.rz; do if runzip -k $$f -o /tmp/trace.$$$$ && echo -n "$$f": && ./replay_trace --quiet -n 5 replay.tdb /tmp/trace.$$$$ && rm /tmp/trace.$$$$; then rm -f /tmp/trace.$$$$; else exit 1; fi; done
 	@trap "rm -f /tmp/trace.$$$$" 0; for f in benchmarks/*.rz; do if runzip -k $$f -o /tmp/trace.$$$$ && echo -n "$$f": && ./replay_trace --quiet -n 5 replay.tdb /tmp/trace.$$$$ && rm /tmp/trace.$$$$; then rm -f /tmp/trace.$$$$; else exit 1; fi; done
@@ -30,4 +30,4 @@ check: replay_trace
 	@sed 's/\(^[0-9]* traverse\) .*/\1fn/' < $^ > $@
 	@sed 's/\(^[0-9]* traverse\) .*/\1fn/' < $^ > $@
 
 
 clean:
 clean:
-	rm -f replay_trace tdbtorture tdbdump tdbtool *.o
+	rm -f replay_trace tdbtorture tdbdump tdbtool speed *.o

+ 247 - 0
ccan/tdb/tools/speed.c

@@ -0,0 +1,247 @@
+/* Simple speed test for TDB */
+#include <err.h>
+#include <time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <ccan/tdb/tdb.h>
+
+/* Nanoseconds per operation */
+static size_t normalize(const struct timeval *start,
+			const struct timeval *stop,
+			unsigned int num)
+{
+	struct timeval diff;
+
+	timersub(stop, start, &diff);
+
+	/* Floating point is more accurate here. */
+	return (double)(diff.tv_sec * 1000000 + diff.tv_usec)
+		/ num * 1000;
+}
+
+static size_t file_size(void)
+{
+	struct stat st;
+
+	if (stat("/tmp/speed.tdb", &st) != 0)
+		return -1;
+	return st.st_size;
+}
+
+static int count_record(struct tdb_context *tdb,
+			TDB_DATA key, TDB_DATA data, void *p)
+{
+	int *total = p;
+	*total += *(int *)data.dptr;
+	return 0;
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i, j, num = 1000, stage = 0, stopat = -1;
+	int flags = TDB_DEFAULT;
+	TDB_DATA key, data;
+	struct tdb_context *tdb;
+	struct timeval start, stop;
+	bool transaction = false;
+
+	if (argv[1] && strcmp(argv[1], "--internal") == 0) {
+		flags = TDB_INTERNAL;
+		argc--;
+		argv++;
+	}
+
+	if (argv[1] && strcmp(argv[1], "--transaction") == 0) {
+		transaction = true;
+		argc--;
+		argv++;
+	}
+
+	tdb = tdb_open("/tmp/speed.tdb", 100003, flags, O_RDWR|O_CREAT|O_TRUNC,
+		       0600);
+	if (!tdb)
+		err(1, "Opening /tmp/speed.tdb");
+
+	key.dptr = (void *)&i;
+	key.dsize = sizeof(i);
+	data = key;
+
+	if (argv[1]) {
+		num = atoi(argv[1]);
+		argv++;
+		argc--;
+	}
+
+	if (argv[1]) {
+		stopat = atoi(argv[1]);
+		argv++;
+		argc--;
+	}
+
+	if (transaction && tdb_transaction_start(tdb))
+		errx(1, "starting transaction: %s", tdb_errorstr(tdb));
+
+	/* Add 1000 records. */
+	printf("Adding %u records: ", num); fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (i = 0; i < num; i++)
+		if (tdb_store(tdb, key, data, TDB_INSERT) != 0)
+			errx(1, "Inserting key %u in tdb: %s",
+			     i, tdb_errorstr(tdb));
+	gettimeofday(&stop, NULL);
+	if (transaction && tdb_transaction_commit(tdb))
+		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (++stage == stopat)
+		exit(0);
+
+	if (transaction && tdb_transaction_start(tdb))
+		errx(1, "starting transaction: %s", tdb_errorstr(tdb));
+
+	/* Finding 1000 records. */
+	printf("Finding %u records: ", num); fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (i = 0; i < num; i++) {
+		int *dptr;
+		dptr = (int *)tdb_fetch(tdb, key).dptr;
+		if (!dptr || *dptr != i)
+			errx(1, "Fetching key %u in tdb gave %u",
+			     i, dptr ? *dptr : -1);
+	}
+	gettimeofday(&stop, NULL);
+	if (transaction && tdb_transaction_commit(tdb))
+		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (++stage == stopat)
+		exit(0);
+
+	if (transaction && tdb_transaction_start(tdb))
+		errx(1, "starting transaction: %s", tdb_errorstr(tdb));
+
+	/* Missing 1000 records. */
+	printf("Missing %u records: ", num); fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (i = num; i < num*2; i++) {
+		int *dptr;
+		dptr = (int *)tdb_fetch(tdb, key).dptr;
+		if (dptr)
+			errx(1, "Fetching key %u in tdb gave %u", i, *dptr);
+	}
+	gettimeofday(&stop, NULL);
+	if (transaction && tdb_transaction_commit(tdb))
+		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (++stage == stopat)
+		exit(0);
+
+	if (transaction && tdb_transaction_start(tdb))
+		errx(1, "starting transaction: %s", tdb_errorstr(tdb));
+
+	/* Traverse 1000 records. */
+	printf("Traversing %u records: ", num); fflush(stdout);
+	i = 0;
+	gettimeofday(&start, NULL);
+	if (tdb_traverse(tdb, count_record, &i) != num)
+		errx(1, "Traverse returned wrong number of records");
+	if (i != (num - 1) * (num / 2))
+		errx(1, "Traverse tallied to %u", i);
+	gettimeofday(&stop, NULL);
+	if (transaction && tdb_transaction_commit(tdb))
+		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (++stage == stopat)
+		exit(0);
+
+	if (transaction && tdb_transaction_start(tdb))
+		errx(1, "starting transaction: %s", tdb_errorstr(tdb));
+
+	/* Delete 1000 records (not in order). */
+	printf("Deleting %u records: ", num); fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (j = 0; j < num; j++) {
+		i = (j + 100003) % num;
+		if (tdb_delete(tdb, key) != 0)
+			errx(1, "Deleting key %u in tdb: %s",
+			     i, tdb_errorstr(tdb));
+	}
+	gettimeofday(&stop, NULL);
+	if (transaction && tdb_transaction_commit(tdb))
+		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (++stage == stopat)
+		exit(0);
+
+	if (transaction && tdb_transaction_start(tdb))
+		errx(1, "starting transaction: %s", tdb_errorstr(tdb));
+
+	/* Re-add 1000 records (not in order). */
+	printf("Re-adding %u records: ", num); fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (j = 0; j < num; j++) {
+		i = (j + 100003) % num;
+		if (tdb_store(tdb, key, data, TDB_INSERT) != 0)
+			errx(1, "Inserting key %u in tdb: %s",
+			     i, tdb_errorstr(tdb));
+	}
+	gettimeofday(&stop, NULL);
+	if (transaction && tdb_transaction_commit(tdb))
+		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (++stage == stopat)
+		exit(0);
+
+	if (transaction && tdb_transaction_start(tdb))
+		errx(1, "starting transaction: %s", tdb_errorstr(tdb));
+
+	/* Append 1000 records. */
+	printf("Appending %u records: ", num); fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (i = 0; i < num; i++)
+		if (tdb_append(tdb, key, data) != 0)
+			errx(1, "Appending key %u in tdb: %s",
+			     i, tdb_errorstr(tdb));
+	gettimeofday(&stop, NULL);
+	if (transaction && tdb_transaction_commit(tdb))
+		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (++stage == stopat)
+		exit(0);
+
+	if (transaction && tdb_transaction_start(tdb))
+		errx(1, "starting transaction: %s", tdb_errorstr(tdb));
+
+	/* Churn 1000 records: not in order! */
+	printf("Churning %u records: ", num); fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (j = 0; j < num; j++) {
+		i = (j + 1000019) % num;
+		if (tdb_delete(tdb, key) != 0)
+			errx(1, "Deleting key %u in tdb: %s",
+			     i, tdb_errorstr(tdb));
+		i += num;
+		if (tdb_store(tdb, key, data, TDB_INSERT) != 0)
+			errx(1, "Inserting key %u in tdb: %s",
+			     i, tdb_errorstr(tdb));
+	}
+	gettimeofday(&stop, NULL);
+	if (transaction && tdb_transaction_commit(tdb))
+		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+
+	return 0;
+}

+ 310 - 162
ccan/tdb2/check.c

@@ -43,25 +43,25 @@ static bool check_header(struct tdb_context *tdb, tdb_off_t *recovery)
 	hash_test = TDB_HASH_MAGIC;
 	hash_test = TDB_HASH_MAGIC;
 	hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
 	hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
 	if (hdr.hash_test != hash_test) {
 	if (hdr.hash_test != hash_test) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "check: hash test %llu should be %llu\n",
-			 (long long)hdr.hash_test,
-			 (long long)hash_test);
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			   "check: hash test %llu should be %llu",
+			   (long long)hdr.hash_test,
+			   (long long)hash_test);
 		return false;
 		return false;
 	}
 	}
 
 
 	if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
 	if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "check: bad magic '%.*s'\n",
-			 (unsigned)sizeof(hdr.magic_food), hdr.magic_food);
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			   "check: bad magic '%.*s'",
+			   (unsigned)sizeof(hdr.magic_food), hdr.magic_food);
 		return false;
 		return false;
 	}
 	}
 
 
 	*recovery = hdr.recovery;
 	*recovery = hdr.recovery;
 	if (*recovery) {
 	if (*recovery) {
 		if (*recovery < sizeof(hdr) || *recovery > tdb->map_size) {
 		if (*recovery < sizeof(hdr) || *recovery > tdb->map_size) {
-			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-				 "tdb_check: invalid recovery offset %zu\n",
+			tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+				 "tdb_check: invalid recovery offset %zu",
 				 (size_t)*recovery);
 				 (size_t)*recovery);
 			return false;
 			return false;
 		}
 		}
@@ -77,7 +77,65 @@ static bool check_hash_tree(struct tdb_context *tdb,
 			    unsigned hprefix_bits,
 			    unsigned hprefix_bits,
 			    tdb_off_t used[],
 			    tdb_off_t used[],
 			    size_t num_used,
 			    size_t num_used,
-			    size_t *num_found);
+			    size_t *num_found,
+			    int (*check)(TDB_DATA, TDB_DATA, void *),
+			    void *private_data);
+
+static bool check_hash_chain(struct tdb_context *tdb,
+			     tdb_off_t off,
+			     uint64_t hash,
+			     tdb_off_t used[],
+			     size_t num_used,
+			     size_t *num_found,
+			     int (*check)(TDB_DATA, TDB_DATA, void *),
+			     void *private_data)
+{
+	struct tdb_used_record rec;
+
+	if (tdb_read_convert(tdb, off, &rec, sizeof(rec)) == -1)
+		return false;
+
+	if (rec_magic(&rec) != TDB_CHAIN_MAGIC) {
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			   "tdb_check: Bad hash chain magic %llu",
+			   (long long)rec_magic(&rec));
+		return false;
+	}
+
+	if (rec_data_length(&rec) != sizeof(struct tdb_chain)) {
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			   "tdb_check: Bad hash chain length %llu vs %zu",
+			   (long long)rec_data_length(&rec),
+			   sizeof(struct tdb_chain));
+		return false;
+	}
+	if (rec_key_length(&rec) != 0) {
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			 "tdb_check: Bad hash chain key length %llu",
+			 (long long)rec_key_length(&rec));
+		return false;
+	}
+	if (rec_hash(&rec) != 0) {
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			 "tdb_check: Bad hash chain hash value %llu",
+			 (long long)rec_hash(&rec));
+		return false;
+	}
+
+	off += sizeof(rec);
+	if (!check_hash_tree(tdb, off, 0, hash, 64,
+			     used, num_used, num_found, check, private_data))
+		return false;
+
+	off = tdb_read_off(tdb, off + offsetof(struct tdb_chain, next));
+	if (off == TDB_OFF_ERR)
+		return false;
+	if (off == 0)
+		return true;
+	(*num_found)++;
+	return check_hash_chain(tdb, off, hash, used, num_used, num_found,
+				check, private_data);
+}
 
 
 static bool check_hash_record(struct tdb_context *tdb,
 static bool check_hash_record(struct tdb_context *tdb,
 			      tdb_off_t off,
 			      tdb_off_t off,
@@ -85,30 +143,43 @@ static bool check_hash_record(struct tdb_context *tdb,
 			      unsigned hprefix_bits,
 			      unsigned hprefix_bits,
 			      tdb_off_t used[],
 			      tdb_off_t used[],
 			      size_t num_used,
 			      size_t num_used,
-			      size_t *num_found)
+			      size_t *num_found,
+			      int (*check)(TDB_DATA, TDB_DATA, void *),
+			      void *private_data)
 {
 {
 	struct tdb_used_record rec;
 	struct tdb_used_record rec;
 
 
+	if (hprefix_bits >= 64)
+		return check_hash_chain(tdb, off, hprefix, used, num_used,
+					num_found, check, private_data);
+
 	if (tdb_read_convert(tdb, off, &rec, sizeof(rec)) == -1)
 	if (tdb_read_convert(tdb, off, &rec, sizeof(rec)) == -1)
 		return false;
 		return false;
 
 
+	if (rec_magic(&rec) != TDB_HTABLE_MAGIC) {
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			   "tdb_check: Bad hash table magic %llu",
+			   (long long)rec_magic(&rec));
+		return false;
+	}
 	if (rec_data_length(&rec)
 	if (rec_data_length(&rec)
 	    != sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) {
 	    != sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_check: Bad hash table length %llu vs %llu\n",
-			 (long long)rec_data_length(&rec),
-			 (long long)sizeof(tdb_off_t)<<TDB_SUBLEVEL_HASH_BITS);
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			   "tdb_check: Bad hash table length %llu vs %llu",
+			   (long long)rec_data_length(&rec),
+			   (long long)sizeof(tdb_off_t)
+			   << TDB_SUBLEVEL_HASH_BITS);
 		return false;
 		return false;
 	}
 	}
 	if (rec_key_length(&rec) != 0) {
 	if (rec_key_length(&rec) != 0) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_check: Bad hash table key length %llu\n",
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			 "tdb_check: Bad hash table key length %llu",
 			 (long long)rec_key_length(&rec));
 			 (long long)rec_key_length(&rec));
 		return false;
 		return false;
 	}
 	}
 	if (rec_hash(&rec) != 0) {
 	if (rec_hash(&rec) != 0) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_check: Bad hash table hash value %llu\n",
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			 "tdb_check: Bad hash table hash value %llu",
 			 (long long)rec_hash(&rec));
 			 (long long)rec_hash(&rec));
 		return false;
 		return false;
 	}
 	}
@@ -117,7 +188,7 @@ static bool check_hash_record(struct tdb_context *tdb,
 	return check_hash_tree(tdb, off,
 	return check_hash_tree(tdb, off,
 			       TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
 			       TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
 			       hprefix, hprefix_bits,
 			       hprefix, hprefix_bits,
-			       used, num_used, num_found);
+			       used, num_used, num_found, check, private_data);
 }
 }
 
 
 static int off_cmp(const tdb_off_t *a, const tdb_off_t *b)
 static int off_cmp(const tdb_off_t *a, const tdb_off_t *b)
@@ -141,7 +212,9 @@ static bool check_hash_tree(struct tdb_context *tdb,
 			    unsigned hprefix_bits,
 			    unsigned hprefix_bits,
 			    tdb_off_t used[],
 			    tdb_off_t used[],
 			    size_t num_used,
 			    size_t num_used,
-			    size_t *num_found)
+			    size_t *num_found,
+			    int (*check)(TDB_DATA, TDB_DATA, void *),
+			    void *private_data)
 {
 {
 	unsigned int g, b;
 	unsigned int g, b;
 	const tdb_off_t *hash;
 	const tdb_off_t *hash;
@@ -166,16 +239,42 @@ static bool check_hash_tree(struct tdb_context *tdb,
 			off = group[b] & TDB_OFF_MASK;
 			off = group[b] & TDB_OFF_MASK;
 			p = asearch(&off, used, num_used, off_cmp);
 			p = asearch(&off, used, num_used, off_cmp);
 			if (!p) {
 			if (!p) {
-				tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-					 "tdb_check: Invalid offset %llu "
-					 "in hash\n",
-					 (long long)off);
+				tdb_logerr(tdb, TDB_ERR_CORRUPT,
+					   TDB_DEBUG_ERROR,
+					   "tdb_check: Invalid offset %llu "
+					   "in hash", (long long)off);
 				goto fail;
 				goto fail;
 			}
 			}
 			/* Mark it invalid. */
 			/* Mark it invalid. */
 			*p ^= 1;
 			*p ^= 1;
 			(*num_found)++;
 			(*num_found)++;
 
 
+			if (hprefix_bits == 64) {
+				/* Chained entries are unordered. */
+				if (is_subhash(group[b])) {
+					tdb_logerr(tdb, TDB_ERR_CORRUPT,
+						   TDB_DEBUG_ERROR,
+						   "tdb_check: Invalid chain"
+						   " entry subhash");
+					goto fail;
+				}
+				h = hash_record(tdb, off);
+				if (h != hprefix) {
+					tdb_logerr(tdb, TDB_ERR_CORRUPT,
+						   TDB_DEBUG_ERROR,
+						   "check: bad hash chain"
+						   " placement"
+						   " 0x%llx vs 0x%llx",
+						   (long long)h,
+						   (long long)hprefix);
+					goto fail;
+				}
+				if (tdb_read_convert(tdb, off, &rec,
+						     sizeof(rec)))
+					goto fail;
+				goto check;
+			}
+
 			if (is_subhash(group[b])) {
 			if (is_subhash(group[b])) {
 				uint64_t subprefix;
 				uint64_t subprefix;
 				subprefix = (hprefix 
 				subprefix = (hprefix 
@@ -188,7 +287,8 @@ static bool check_hash_tree(struct tdb_context *tdb,
 					       hprefix_bits
 					       hprefix_bits
 						       + group_bits
 						       + group_bits
 						       + TDB_HASH_GROUP_BITS,
 						       + TDB_HASH_GROUP_BITS,
-					       used, num_used, num_found))
+					       used, num_used, num_found,
+					       check, private_data))
 					goto fail;
 					goto fail;
 				continue;
 				continue;
 			}
 			}
@@ -199,18 +299,20 @@ static bool check_hash_tree(struct tdb_context *tdb,
 			used_bits = 0;
 			used_bits = 0;
 			if (get_bits(h, hprefix_bits, &used_bits) != hprefix
 			if (get_bits(h, hprefix_bits, &used_bits) != hprefix
 			    && hprefix_bits) {
 			    && hprefix_bits) {
-				tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-					 "check: bad hash placement"
-					 " 0x%llx vs 0x%llx\n",
+				tdb_logerr(tdb, TDB_ERR_CORRUPT,
+					   TDB_DEBUG_ERROR,
+					   "check: bad hash placement"
+					   " 0x%llx vs 0x%llx",
 					 (long long)h, (long long)hprefix);
 					 (long long)h, (long long)hprefix);
 				goto fail;
 				goto fail;
 			}
 			}
 
 
 			/* Does it belong in this group? */
 			/* Does it belong in this group? */
 			if (get_bits(h, group_bits, &used_bits) != g) {
 			if (get_bits(h, group_bits, &used_bits) != g) {
-				tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-					 "check: bad group %llu vs %u\n",
-					 (long long)h, g);
+				tdb_logerr(tdb, TDB_ERR_CORRUPT,
+					   TDB_DEBUG_ERROR,
+					   "check: bad group %llu vs %u",
+					   (long long)h, g);
 				goto fail;
 				goto fail;
 			}
 			}
 
 
@@ -219,11 +321,12 @@ static bool check_hash_tree(struct tdb_context *tdb,
 			if (get_bits(h, TDB_HASH_GROUP_BITS, &used_bits)
 			if (get_bits(h, TDB_HASH_GROUP_BITS, &used_bits)
 			    != bucket) {
 			    != bucket) {
 				used_bits -= TDB_HASH_GROUP_BITS;
 				used_bits -= TDB_HASH_GROUP_BITS;
-				tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-					 "check: bad bucket %u vs %u\n",
+				tdb_logerr(tdb, TDB_ERR_CORRUPT,
+					   TDB_DEBUG_ERROR,
+					 "check: bad bucket %u vs %u",
 					 (unsigned)get_bits(h,
 					 (unsigned)get_bits(h,
-							    TDB_HASH_GROUP_BITS,
-							    &used_bits),
+							TDB_HASH_GROUP_BITS,
+							&used_bits),
 					 bucket);
 					 bucket);
 				goto fail;
 				goto fail;
 			}
 			}
@@ -234,28 +337,46 @@ static bool check_hash_tree(struct tdb_context *tdb,
 			     i != b;
 			     i != b;
 			     i = (i + 1) % (1 << TDB_HASH_GROUP_BITS)) {
 			     i = (i + 1) % (1 << TDB_HASH_GROUP_BITS)) {
 				if (group[i] == 0) {
 				if (group[i] == 0) {
-					tdb->log(tdb, TDB_DEBUG_ERROR,
-						 tdb->log_priv,
-						 "check: bad group placement"
-						 " %u vs %u\n",
-						 b, bucket);
+					tdb_logerr(tdb, TDB_ERR_CORRUPT,
+						   TDB_DEBUG_ERROR,
+						   "check: bad group placement"
+						   " %u vs %u",
+						   b, bucket);
 					goto fail;
 					goto fail;
 				}
 				}
 			}
 			}
 
 
-			if (tdb_read_convert(tdb, off, &rec, sizeof(rec)) == -1)
+			if (tdb_read_convert(tdb, off, &rec, sizeof(rec)))
 				goto fail;
 				goto fail;
 
 
 			/* Bottom bits must match header. */
 			/* Bottom bits must match header. */
 			if ((h & ((1 << 11)-1)) != rec_hash(&rec)) {
 			if ((h & ((1 << 11)-1)) != rec_hash(&rec)) {
-				tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-					 "tdb_check: Bad hash magic at"
-					 " offset %llu (0x%llx vs 0x%llx)\n",
-					 (long long)off,
-					 (long long)h,
-					 (long long)rec_hash(&rec));
+				tdb_logerr(tdb, TDB_ERR_CORRUPT,
+					   TDB_DEBUG_ERROR,
+					   "tdb_check: Bad hash magic at"
+					   " offset %llu (0x%llx vs 0x%llx)",
+					   (long long)off,
+					   (long long)h,
+					   (long long)rec_hash(&rec));
 				goto fail;
 				goto fail;
 			}
 			}
+
+		check:
+			if (check) {
+				TDB_DATA key, data;
+				key.dsize = rec_key_length(&rec);
+				data.dsize = rec_data_length(&rec);
+				key.dptr = (void *)tdb_access_read(tdb,
+						   off + sizeof(rec),
+						   key.dsize + data.dsize,
+						   false);
+				if (!key.dptr)
+					goto fail;
+				data.dptr = key.dptr + key.dsize;
+				if (check(key, data, private_data) != 0)
+					goto fail;
+				tdb_access_release(tdb, key.dptr);
+			}
 		}
 		}
 	}
 	}
 	tdb_access_release(tdb, hash);
 	tdb_access_release(tdb, hash);
@@ -268,19 +389,22 @@ fail:
 
 
 static bool check_hash(struct tdb_context *tdb,
 static bool check_hash(struct tdb_context *tdb,
 		       tdb_off_t used[],
 		       tdb_off_t used[],
-		       size_t num_used, size_t num_flists)
+		       size_t num_used, size_t num_ftables,
+		       int (*check)(TDB_DATA, TDB_DATA, void *),
+		       void *private_data)
 {
 {
-	/* Free lists also show up as used. */
-	size_t num_found = num_flists;
+	/* Free tables also show up as used. */
+	size_t num_found = num_ftables;
 
 
 	if (!check_hash_tree(tdb, offsetof(struct tdb_header, hashtable),
 	if (!check_hash_tree(tdb, offsetof(struct tdb_header, hashtable),
 			     TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
 			     TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
-			     0, 0, used, num_used, &num_found))
+			     0, 0, used, num_used, &num_found,
+			     check, private_data))
 		return false;
 		return false;
 
 
 	if (num_found != num_used) {
 	if (num_found != num_used) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_check: Not all entries are in hash\n");
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			   "tdb_check: Not all entries are in hash");
 		return false;
 		return false;
 	}
 	}
 	return true;
 	return true;
@@ -289,62 +413,63 @@ static bool check_hash(struct tdb_context *tdb,
 static bool check_free(struct tdb_context *tdb,
 static bool check_free(struct tdb_context *tdb,
 		       tdb_off_t off,
 		       tdb_off_t off,
 		       const struct tdb_free_record *frec,
 		       const struct tdb_free_record *frec,
-		       tdb_off_t prev, tdb_off_t flist_off, unsigned int bucket)
+		       tdb_off_t prev, unsigned int ftable,
+		       unsigned int bucket)
 {
 {
 	if (frec_magic(frec) != TDB_FREE_MAGIC) {
 	if (frec_magic(frec) != TDB_FREE_MAGIC) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_check: offset %llu bad magic 0x%llx\n",
-			 (long long)off, (long long)frec->magic_and_meta);
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			   "tdb_check: offset %llu bad magic 0x%llx",
+			   (long long)off, (long long)frec->magic_and_prev);
 		return false;
 		return false;
 	}
 	}
-	if (frec_flist(frec) != flist_off) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_check: offset %llu bad freelist 0x%llx\n",
-			 (long long)off, (long long)frec_flist(frec));
+	if (frec_ftable(frec) != ftable) {
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			   "tdb_check: offset %llu bad freetable %u",
+			   (long long)off, frec_ftable(frec));
 		return false;
 		return false;
 	}
 	}
 
 
 	if (tdb->methods->oob(tdb, off
 	if (tdb->methods->oob(tdb, off
-			      + frec->data_len+sizeof(struct tdb_used_record),
+			      + frec_len(frec) + sizeof(struct tdb_used_record),
 			      false))
 			      false))
 		return false;
 		return false;
-	if (size_to_bucket(frec->data_len) != bucket) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_check: offset %llu in wrong bucket %u vs %u\n",
-			 (long long)off,
-			 bucket, size_to_bucket(frec->data_len));
+	if (size_to_bucket(frec_len(frec)) != bucket) {
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			   "tdb_check: offset %llu in wrong bucket %u vs %u",
+			   (long long)off,
+			   bucket, size_to_bucket(frec_len(frec)));
 		return false;
 		return false;
 	}
 	}
-	if (prev != frec->prev) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_check: offset %llu bad prev %llu vs %llu\n",
-			 (long long)off,
-			 (long long)prev, (long long)frec->prev);
+	if (prev != frec_prev(frec)) {
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			   "tdb_check: offset %llu bad prev %llu vs %llu",
+			   (long long)off,
+			   (long long)prev, (long long)frec_len(frec));
 		return false;
 		return false;
 	}
 	}
 	return true;
 	return true;
 }
 }
 		       
 		       
-static bool check_free_list(struct tdb_context *tdb,
-			    tdb_off_t flist_off,
-			    tdb_off_t free[],
-			    size_t num_free,
-			    size_t *num_found)
+static bool check_free_table(struct tdb_context *tdb,
+			     tdb_off_t ftable_off,
+			     unsigned ftable_num,
+			     tdb_off_t free[],
+			     size_t num_free,
+			     size_t *num_found)
 {
 {
-	struct tdb_freelist flist;
+	struct tdb_freetable ft;
 	tdb_off_t h;
 	tdb_off_t h;
 	unsigned int i;
 	unsigned int i;
 
 
-	if (tdb_read_convert(tdb, flist_off, &flist, sizeof(flist)) == -1)
+	if (tdb_read_convert(tdb, ftable_off, &ft, sizeof(ft)) == -1)
 		return false;
 		return false;
 
 
-	if (rec_magic(&flist.hdr) != TDB_MAGIC
-	    || rec_key_length(&flist.hdr) != 0
-	    || rec_data_length(&flist.hdr) != sizeof(flist) - sizeof(flist.hdr)
-	    || rec_hash(&flist.hdr) != 1) {
-		tdb->log(tdb, TDB_DEBUG_ERROR,
-			 tdb->log_priv,
-			 "tdb_check: Invalid header on free list\n");
+	if (rec_magic(&ft.hdr) != TDB_FTABLE_MAGIC
+	    || rec_key_length(&ft.hdr) != 0
+	    || rec_data_length(&ft.hdr) != sizeof(ft) - sizeof(ft.hdr)
+	    || rec_hash(&ft.hdr) != 0) {
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			   "tdb_check: Invalid header on free table");
 		return false;
 		return false;
 	}
 	}
 
 
@@ -352,23 +477,23 @@ static bool check_free_list(struct tdb_context *tdb,
 		tdb_off_t off, prev = 0, *p;
 		tdb_off_t off, prev = 0, *p;
 		struct tdb_free_record f;
 		struct tdb_free_record f;
 
 
-		h = bucket_off(flist_off, i);
+		h = bucket_off(ftable_off, i);
 		for (off = tdb_read_off(tdb, h); off; off = f.next) {
 		for (off = tdb_read_off(tdb, h); off; off = f.next) {
 			if (off == TDB_OFF_ERR)
 			if (off == TDB_OFF_ERR)
 				return false;
 				return false;
 			if (tdb_read_convert(tdb, off, &f, sizeof(f)))
 			if (tdb_read_convert(tdb, off, &f, sizeof(f)))
 				return false;
 				return false;
-			if (!check_free(tdb, off, &f, prev, flist_off, i))
+			if (!check_free(tdb, off, &f, prev, ftable_num, i))
 				return false;
 				return false;
 
 
 			/* FIXME: Check hash bits */
 			/* FIXME: Check hash bits */
 			p = asearch(&off, free, num_free, off_cmp);
 			p = asearch(&off, free, num_free, off_cmp);
 			if (!p) {
 			if (!p) {
-				tdb->log(tdb, TDB_DEBUG_ERROR,
-					 tdb->log_priv,
-					 "tdb_check: Invalid offset"
-					 " %llu in free table\n",
-					 (long long)off);
+				tdb_logerr(tdb, TDB_ERR_CORRUPT,
+					   TDB_DEBUG_ERROR,
+					   "tdb_check: Invalid offset"
+					   " %llu in free table",
+					   (long long)off);
 				return false;
 				return false;
 			}
 			}
 			/* Mark it invalid. */
 			/* Mark it invalid. */
@@ -381,7 +506,7 @@ static bool check_free_list(struct tdb_context *tdb,
 }
 }
 
 
 /* Slow, but should be very rare. */
 /* Slow, but should be very rare. */
-static size_t dead_space(struct tdb_context *tdb, tdb_off_t off)
+size_t dead_space(struct tdb_context *tdb, tdb_off_t off)
 {
 {
 	size_t len;
 	size_t len;
 
 
@@ -409,113 +534,135 @@ static bool check_linear(struct tdb_context *tdb,
 			struct tdb_used_record u;
 			struct tdb_used_record u;
 			struct tdb_free_record f;
 			struct tdb_free_record f;
 			struct tdb_recovery_record r;
 			struct tdb_recovery_record r;
-		} pad, *p;
-		p = tdb_get(tdb, off, &pad, sizeof(pad));
-		if (!p)
+		} rec;
+		/* r is larger: only get that if we need to. */
+		if (tdb_read_convert(tdb, off, &rec, sizeof(rec.f)) == -1)
 			return false;
 			return false;
 
 
 		/* If we crash after ftruncate, we can get zeroes or fill. */
 		/* If we crash after ftruncate, we can get zeroes or fill. */
-		if (p->r.magic == TDB_RECOVERY_INVALID_MAGIC
-		    || p->r.magic ==  0x4343434343434343ULL) {
+		if (rec.r.magic == TDB_RECOVERY_INVALID_MAGIC
+		    || rec.r.magic ==  0x4343434343434343ULL) {
+			if (tdb_read_convert(tdb, off, &rec, sizeof(rec.r)))
+				return false;
+
 			if (recovery == off) {
 			if (recovery == off) {
 				found_recovery = true;
 				found_recovery = true;
-				len = sizeof(p->r) + p->r.max_len;
+				len = sizeof(rec.r) + rec.r.max_len;
 			} else {
 			} else {
 				len = dead_space(tdb, off);
 				len = dead_space(tdb, off);
-				if (len < sizeof(p->r)) {
-					tdb->log(tdb, TDB_DEBUG_ERROR,
-						 tdb->log_priv,
-						 "tdb_check: invalid dead space"
-						 " at %zu\n", (size_t)off);
+				if (len < sizeof(rec.r)) {
+					tdb_logerr(tdb, TDB_ERR_CORRUPT,
+						   TDB_DEBUG_ERROR,
+						   "tdb_check: invalid dead"
+						   " space at %zu",
+						   (size_t)off);
 					return false;
 					return false;
 				}
 				}
 
 
-				tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv,
-					 "Dead space at %zu-%zu (of %zu)\n",
-					 (size_t)off, (size_t)(off + len),
-					 (size_t)tdb->map_size);
+				tdb_logerr(tdb, TDB_SUCCESS, TDB_DEBUG_WARNING,
+					   "Dead space at %zu-%zu (of %zu)",
+					   (size_t)off, (size_t)(off + len),
+					   (size_t)tdb->map_size);
 			}
 			}
-		} else if (p->r.magic == TDB_RECOVERY_MAGIC) {
+		} else if (rec.r.magic == TDB_RECOVERY_MAGIC) {
+			if (tdb_read_convert(tdb, off, &rec, sizeof(rec.r)))
+				return false;
 			if (recovery != off) {
 			if (recovery != off) {
-				tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-					 "tdb_check: unexpected recovery"
-					 " record at offset %zu\n",
-					 (size_t)off);
+				tdb_logerr(tdb, TDB_ERR_CORRUPT,
+					   TDB_DEBUG_ERROR,
+					   "tdb_check: unexpected recovery"
+					   " record at offset %zu",
+					   (size_t)off);
+				return false;
+			}
+			if (rec.r.len > rec.r.max_len) {
+				tdb_logerr(tdb, TDB_ERR_CORRUPT,
+					   TDB_DEBUG_ERROR,
+					   "tdb_check: invalid recovery length"
+					   " %zu", (size_t)rec.r.len);
+				return false;
+			}
+			if (rec.r.eof > tdb->map_size) {
+				tdb_logerr(tdb, TDB_ERR_CORRUPT,
+					   TDB_DEBUG_ERROR,
+					   "tdb_check: invalid old EOF"
+					   " %zu", (size_t)rec.r.eof);
 				return false;
 				return false;
 			}
 			}
 			found_recovery = true;
 			found_recovery = true;
-			len = sizeof(p->r) + p->r.max_len;
-		} else if (frec_magic(&p->f) == TDB_FREE_MAGIC
-			   || frec_magic(&p->f) == TDB_COALESCING_MAGIC) {
-			len = sizeof(p->u) + p->f.data_len;
+			len = sizeof(rec.r) + rec.r.max_len;
+		} else if (frec_magic(&rec.f) == TDB_FREE_MAGIC) {
+			len = sizeof(rec.u) + frec_len(&rec.f);
 			if (off + len > tdb->map_size) {
 			if (off + len > tdb->map_size) {
-				tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-					 "tdb_check: free overlength %llu"
-					 " at offset %llu\n",
-					 (long long)len, (long long)off);
+				tdb_logerr(tdb, TDB_ERR_CORRUPT,
+					   TDB_DEBUG_ERROR,
+					   "tdb_check: free overlength %llu"
+					   " at offset %llu",
+					   (long long)len, (long long)off);
 				return false;
 				return false;
 			}
 			}
-			/* This record is free! */
-			if (frec_magic(&p->f) == TDB_FREE_MAGIC
+			/* This record should be in free lists. */
+			if (frec_ftable(&rec.f) != TDB_FTABLE_NONE
 			    && !append(free, num_free, off))
 			    && !append(free, num_free, off))
 				return false;
 				return false;
-		} else {
+		} else if (rec_magic(&rec.u) == TDB_USED_MAGIC
+			   || rec_magic(&rec.u) == TDB_CHAIN_MAGIC
+			   || rec_magic(&rec.u) == TDB_HTABLE_MAGIC
+			   || rec_magic(&rec.u) == TDB_FTABLE_MAGIC) {
 			uint64_t klen, dlen, extra;
 			uint64_t klen, dlen, extra;
 
 
 			/* This record is used! */
 			/* This record is used! */
-			if (rec_magic(&p->u) != TDB_MAGIC) {
-				tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-					 "tdb_check: Bad magic 0x%llx"
-					 " at offset %llu\n",
-					 (long long)rec_magic(&p->u),
-					 (long long)off);
-				return false;
-			}
-
 			if (!append(used, num_used, off))
 			if (!append(used, num_used, off))
 				return false;
 				return false;
 
 
-			klen = rec_key_length(&p->u);
-			dlen = rec_data_length(&p->u);
-			extra = rec_extra_padding(&p->u);
+			klen = rec_key_length(&rec.u);
+			dlen = rec_data_length(&rec.u);
+			extra = rec_extra_padding(&rec.u);
 
 
-			len = sizeof(p->u) + klen + dlen + extra;
+			len = sizeof(rec.u) + klen + dlen + extra;
 			if (off + len > tdb->map_size) {
 			if (off + len > tdb->map_size) {
-				tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-					 "tdb_check: used overlength %llu"
-					 " at offset %llu\n",
-					 (long long)len, (long long)off);
+				tdb_logerr(tdb, TDB_ERR_CORRUPT,
+					   TDB_DEBUG_ERROR,
+					   "tdb_check: used overlength %llu"
+					   " at offset %llu",
+					   (long long)len, (long long)off);
 				return false;
 				return false;
 			}
 			}
 
 
-			if (len < sizeof(p->f)) {
-				tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-					 "tdb_check: too short record %llu at"
-					 " %llu\n",
-					 (long long)len, (long long)off);
+			if (len < sizeof(rec.f)) {
+				tdb_logerr(tdb, TDB_ERR_CORRUPT,
+					   TDB_DEBUG_ERROR,
+					   "tdb_check: too short record %llu"
+					   " at %llu",
+					   (long long)len, (long long)off);
 				return false;
 				return false;
 			}
 			}
+		} else {
+			tdb_logerr(tdb, TDB_ERR_CORRUPT,
+				   TDB_DEBUG_ERROR,
+				   "tdb_check: Bad magic 0x%llx at offset %zu",
+				   (long long)rec_magic(&rec.u), (size_t)off);
+			return false;
 		}
 		}
 	}
 	}
 
 
 	/* We must have found recovery area if there was one. */
 	/* We must have found recovery area if there was one. */
 	if (recovery != 0 && !found_recovery) {
 	if (recovery != 0 && !found_recovery) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_check: expected a recovery area at %zu\n",
-			 (size_t)recovery);
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			   "tdb_check: expected a recovery area at %zu",
+			   (size_t)recovery);
 		return false;
 		return false;
 	}
 	}
 
 
 	return true;
 	return true;
 }
 }
 
 
-/* FIXME: call check() function. */
 int tdb_check(struct tdb_context *tdb,
 int tdb_check(struct tdb_context *tdb,
 	      int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
 	      int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
 	      void *private_data)
 	      void *private_data)
 {
 {
-	tdb_off_t *free = NULL, *used = NULL, flist, recovery;
-	size_t num_free = 0, num_used = 0, num_found = 0, num_flists = 0;
+	tdb_off_t *free = NULL, *used = NULL, ft, recovery;
+	size_t num_free = 0, num_used = 0, num_found = 0, num_ftables = 0;
 
 
 	if (tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false) != 0)
 	if (tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false) != 0)
 		return -1;
 		return -1;
@@ -532,22 +679,23 @@ int tdb_check(struct tdb_context *tdb,
 	if (!check_linear(tdb, &used, &num_used, &free, &num_free, recovery))
 	if (!check_linear(tdb, &used, &num_used, &free, &num_free, recovery))
 		goto fail;
 		goto fail;
 
 
-	for (flist = first_flist(tdb); flist; flist = next_flist(tdb, flist)) {
-		if (flist == TDB_OFF_ERR)
+	for (ft = first_ftable(tdb); ft; ft = next_ftable(tdb, ft)) {
+		if (ft == TDB_OFF_ERR)
 			goto fail;
 			goto fail;
-		if (!check_free_list(tdb, flist, free, num_free, &num_found))
+		if (!check_free_table(tdb, ft, num_ftables, free, num_free,
+				      &num_found))
 			goto fail;
 			goto fail;
-		num_flists++;
+		num_ftables++;
 	}
 	}
 
 
 	/* FIXME: Check key uniqueness? */
 	/* FIXME: Check key uniqueness? */
-	if (!check_hash(tdb, used, num_used, num_flists))
+	if (!check_hash(tdb, used, num_used, num_ftables, check, private_data))
 		goto fail;
 		goto fail;
 
 
 	if (num_found != num_free) {
 	if (num_found != num_free) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_check: Not all entries are in free table\n");
-		return false;
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			   "tdb_check: Not all entries are in free table");
+		return -1;
 	}
 	}
 
 
 	tdb_allrecord_unlock(tdb, F_RDLCK);
 	tdb_allrecord_unlock(tdb, F_RDLCK);

File diff suppressed because it is too large
+ 242 - 273
ccan/tdb2/doc/design.lyx


File diff suppressed because it is too large
+ 244 - 270
ccan/tdb2/doc/design.lyx,v


BIN
ccan/tdb2/doc/design.pdf


+ 149 - 68
ccan/tdb2/doc/design.txt

@@ -2,7 +2,7 @@ TDB2: A Redesigning The Trivial DataBase
 
 
 Rusty Russell, IBM Corporation
 Rusty Russell, IBM Corporation
 
 
-14-September-2010
+1-December-2010
 
 
 Abstract
 Abstract
 
 
@@ -129,6 +129,10 @@ union tdb_attribute {
 This allows future attributes to be added, even if this expands 
 This allows future attributes to be added, even if this expands 
 the size of the union.
 the size of the union.
 
 
+2.1.2 Status
+
+Complete.
+
 2.2 tdb_traverse Makes Impossible Guarantees
 2.2 tdb_traverse Makes Impossible Guarantees
 
 
 tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, 
 tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, 
@@ -148,6 +152,11 @@ occur during your traversal, otherwise you will see some subset.
 You can prevent changes by using a transaction or the locking 
 You can prevent changes by using a transaction or the locking 
 API.
 API.
 
 
+2.2.2 Status
+
+Complete. Delete-during-traverse will still delete every record, 
+too (assuming no other changes).
+
 2.3 Nesting of Transactions Is Fraught
 2.3 Nesting of Transactions Is Fraught
 
 
 TDB has alternated between allowing nested transactions and not 
 TDB has alternated between allowing nested transactions and not 
@@ -182,6 +191,10 @@ However, this behavior can be simulated with a wrapper which uses
 tdb_add_flags() and tdb_remove_flags(), so the API should not be 
 tdb_add_flags() and tdb_remove_flags(), so the API should not be 
 expanded for this relatively-obscure case.
 expanded for this relatively-obscure case.
 
 
+2.3.2 Status
+
+Incomplete; nesting flag is still defined as per tdb1.
+
 2.4 Incorrect Hash Function is Not Detected
 2.4 Incorrect Hash Function is Not Detected
 
 
 tdb_open_ex() allows the calling code to specify a different hash 
 tdb_open_ex() allows the calling code to specify a different hash 
@@ -195,6 +208,10 @@ The header should contain an example hash result (eg. the hash of
 0xdeadbeef), and tdb_open_ex() should check that the given hash 
 0xdeadbeef), and tdb_open_ex() should check that the given hash 
 function produces the same answer, or fail the tdb_open call.
 function produces the same answer, or fail the tdb_open call.
 
 
+2.4.2 Status
+
+Complete.
+
 2.5 tdb_set_max_dead/TDB_VOLATILE Expose Implementation
 2.5 tdb_set_max_dead/TDB_VOLATILE Expose Implementation
 
 
 In response to scalability issues with the free list ([TDB-Freelist-Is]
 In response to scalability issues with the free list ([TDB-Freelist-Is]
@@ -216,6 +233,11 @@ hint that store and delete of records will be at least as common
 as fetch in order to allow some internal tuning, but initially 
 as fetch in order to allow some internal tuning, but initially 
 will become a no-op.
 will become a no-op.
 
 
+2.5.2 Status
+
+Incomplete. TDB_VOLATILE still defined, but implementation should 
+fail on unknown flags to be future-proof.
+
 2.6 <TDB-Files-Cannot>TDB Files Cannot Be Opened Multiple Times 
 2.6 <TDB-Files-Cannot>TDB Files Cannot Be Opened Multiple Times 
   In The Same Process
   In The Same Process
 
 
@@ -251,6 +273,10 @@ whether re-opening is allowed, as though there may be some
 benefit to adding a call to detect when a tdb_context is shared, 
 benefit to adding a call to detect when a tdb_context is shared, 
 to allow other to create such an API.
 to allow other to create such an API.
 
 
+2.6.2 Status
+
+Incomplete.
+
 2.7 TDB API Is Not POSIX Thread-safe
 2.7 TDB API Is Not POSIX Thread-safe
 
 
 The TDB API uses an error code which can be queried after an 
 The TDB API uses an error code which can be queried after an 
@@ -281,6 +307,10 @@ will exist. Alternatively, a hooking mechanism similar to that
 proposed for [Proposed-Solution-locking-hook] could be used to 
 proposed for [Proposed-Solution-locking-hook] could be used to 
 enable pthread locking at runtime.
 enable pthread locking at runtime.
 
 
+2.7.2 Status
+
+Incomplete.
+
 2.8 *_nonblock Functions And *_mark Functions Expose 
 2.8 *_nonblock Functions And *_mark Functions Expose 
   Implementation
   Implementation
 
 
@@ -343,6 +373,10 @@ locks it doesn't need to obtain.
 It also keeps the complexity out of the API, and in ctdbd where 
 It also keeps the complexity out of the API, and in ctdbd where 
 it is needed.
 it is needed.
 
 
+2.8.2 Status
+
+Incomplete.
+
 2.9 tdb_chainlock Functions Expose Implementation
 2.9 tdb_chainlock Functions Expose Implementation
 
 
 tdb_chainlock locks some number of records, including the record 
 tdb_chainlock locks some number of records, including the record 
@@ -391,6 +425,10 @@ EINVAL if the signal occurs before the kernel is entered,
 otherwise EAGAIN.
 otherwise EAGAIN.
 ]
 ]
 
 
+2.10.2 Status
+
+Incomplete.
+
 2.11 The API Uses Gratuitous Typedefs, Capitals
 2.11 The API Uses Gratuitous Typedefs, Capitals
 
 
 typedefs are useful for providing source compatibility when types 
 typedefs are useful for providing source compatibility when types 
@@ -433,6 +471,10 @@ the tdb_open_ex for logging.
 It should simply take an extra argument, since we are prepared to 
 It should simply take an extra argument, since we are prepared to 
 break the API/ABI.
 break the API/ABI.
 
 
+2.12.2 Status
+
+Complete.
+
 2.13 Various Callback Functions Are Not Typesafe
 2.13 Various Callback Functions Are Not Typesafe
 
 
 The callback functions in tdb_set_logging_function (after [tdb_log_func-Doesnt-Take]
 The callback functions in tdb_set_logging_function (after [tdb_log_func-Doesnt-Take]
@@ -455,6 +497,10 @@ their parameter.
 See CCAN's typesafe_cb module at 
 See CCAN's typesafe_cb module at 
 http://ccan.ozlabs.org/info/typesafe_cb.html
 http://ccan.ozlabs.org/info/typesafe_cb.html
 
 
+2.13.2 Status
+
+Incomplete.
+
 2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, 
 2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, 
   tdb_reopen_all Problematic
   tdb_reopen_all Problematic
 
 
@@ -475,6 +521,11 @@ it alone has opened the TDB and will erase it.
 Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but 
 Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but 
 see [TDB_CLEAR_IF_FIRST-Imposes-Performance].
 see [TDB_CLEAR_IF_FIRST-Imposes-Performance].
 
 
+2.14.2 Status
+
+Incomplete, TDB_CLEAR_IF_FIRST still defined, but not 
+implemented.
+
 2.15 Extending The Header Is Difficult
 2.15 Extending The Header Is Difficult
 
 
 We have reserved (zeroed) words in the TDB header, which can be 
 We have reserved (zeroed) words in the TDB header, which can be 
@@ -505,6 +556,10 @@ This should allow backwards-compatible features to be added, and
 detection if older code (which doesn't understand the feature) 
 detection if older code (which doesn't understand the feature) 
 writes to the database.
 writes to the database.
 
 
+2.15.2 Status
+
+Incomplete.
+
 2.16 Record Headers Are Not Expandible
 2.16 Record Headers Are Not Expandible
 
 
 If we later want to add (say) checksums on keys and data, it 
 If we later want to add (say) checksums on keys and data, it 
@@ -519,6 +574,10 @@ understand a new format: the new code would write (say) a 1 at
 the tail, and thus if there is no tail or the first byte is 0, we 
 the tail, and thus if there is no tail or the first byte is 0, we 
 would know the extension is not present on that record.
 would know the extension is not present on that record.
 
 
+2.16.2 Status
+
+Incomplete.
+
 2.17 TDB Does Not Use Talloc
 2.17 TDB Does Not Use Talloc
 
 
 Many users of TDB (particularly Samba) use the talloc allocator, 
 Many users of TDB (particularly Samba) use the talloc allocator, 
@@ -541,6 +600,10 @@ returned from tdb_open to close it. All TDB_DATA fields would be
 children of the tdb_context, and the caller would still have to 
 children of the tdb_context, and the caller would still have to 
 manage them (using talloc_free() or talloc_steal()).
 manage them (using talloc_free() or talloc_steal()).
 
 
+2.17.2 Status
+
+Deferred.
+
 3 Performance And Scalability Issues
 3 Performance And Scalability Issues
 
 
 3.1 <TDB_CLEAR_IF_FIRST-Imposes-Performance>TDB_CLEAR_IF_FIRST 
 3.1 <TDB_CLEAR_IF_FIRST-Imposes-Performance>TDB_CLEAR_IF_FIRST 
@@ -570,6 +633,10 @@ Remove the flag. It was a neat idea, but even trivial servers
 tend to know when they are initializing for the first time and 
 tend to know when they are initializing for the first time and 
 can simply unlink the old tdb at that point.
 can simply unlink the old tdb at that point.
 
 
+3.1.2 Status
+
+Incomplete; TDB_CLEAR_IF_FIRST still defined, but does nothing.
+
 3.2 TDB Files Have a 4G Limit
 3.2 TDB Files Have a 4G Limit
 
 
 This seems to be becoming an issue (so much for “trivial”!), 
 This seems to be becoming an issue (so much for “trivial”!), 
@@ -596,6 +663,10 @@ Old versions of tdb will fail to open the new TDB files (since 28
 August 2009, commit 398d0c29290: prior to that any unrecognized 
 August 2009, commit 398d0c29290: prior to that any unrecognized 
 file format would be erased and initialized as a fresh tdb!)
 file format would be erased and initialized as a fresh tdb!)
 
 
+3.2.2 Status
+
+Complete.
+
 3.3 TDB Records Have a 4G Limit
 3.3 TDB Records Have a 4G Limit
 
 
 This has not been a reported problem, and the API uses size_t 
 This has not been a reported problem, and the API uses size_t 
@@ -610,6 +681,10 @@ implementation would return TDB_ERR_OOM in a similar case). It
 seems unlikely that 32 bit keys will be a limitation, so the 
 seems unlikely that 32 bit keys will be a limitation, so the 
 implementation may not support this (see [sub:Records-Incur-A]).
 implementation may not support this (see [sub:Records-Incur-A]).
 
 
+3.3.2 Status
+
+Complete.
+
 3.4 Hash Size Is Determined At TDB Creation Time
 3.4 Hash Size Is Determined At TDB Creation Time
 
 
 TDB contains a number of hash chains in the header; the number is 
 TDB contains a number of hash chains in the header; the number is 
@@ -628,20 +703,9 @@ This was annoying because I was previously convinced that an
 expanding tree of hashes would be very close to optimal.
 expanding tree of hashes would be very close to optimal.
 ], it became clear that it is hard to beat a straight linear hash 
 ], it became clear that it is hard to beat a straight linear hash 
 table which doubles in size when it reaches saturation. 
 table which doubles in size when it reaches saturation. 
-
-1. 
-
-2. 
-
-3. 
-
-
-
-
-
- Unfortunately, altering the hash table introduces serious 
-locking complications: the entire hash table needs to be locked 
-to enlarge the hash table, and others might be holding locks. 
+Unfortunately, altering the hash table introduces serious locking 
+complications: the entire hash table needs to be locked to 
+enlarge the hash table, and others might be holding locks. 
 Particularly insidious are insertions done under tdb_chainlock.
 Particularly insidious are insertions done under tdb_chainlock.
 
 
 Thus an expanding layered hash will be used: an array of hash 
 Thus an expanding layered hash will be used: an array of hash 
@@ -662,6 +726,10 @@ means we can choose not to re-hash all entries when we expand a
 hash group; simply use the next bits we need and mark them 
 hash group; simply use the next bits we need and mark them 
 invalid.
 invalid.
 
 
+3.4.2 Status
+
+Complete.
+
 3.5 <TDB-Freelist-Is>TDB Freelist Is Highly Contended
 3.5 <TDB-Freelist-Is>TDB Freelist Is Highly Contended
 
 
 TDB uses a single linked list for the free list. Allocation 
 TDB uses a single linked list for the free list. Allocation 
@@ -749,45 +817,45 @@ There are various benefits in using per-size free lists (see [sub:TDB-Becomes-Fr
 case where all processes are allocating/freeing the same size. 
 case where all processes are allocating/freeing the same size. 
 Thus we almost certainly need to divide in other ways: the most 
 Thus we almost certainly need to divide in other ways: the most 
 obvious is to divide the file into zones, and using a free list 
 obvious is to divide the file into zones, and using a free list 
-(or set of free lists) for each. This approximates address 
+(or table of free lists) for each. This approximates address 
 ordering.
 ordering.
 
 
-Note that this means we need to split the free lists when we 
-expand the file; this is probably acceptable when we double the 
-hash table size, since that is such an expensive operation 
-already. In the case of increasing the file size, there is an 
-optimization we can use: if we use M in the formula above as the 
-file size rounded up to the next power of 2, we only need 
-reshuffle free lists when the file size crosses a power of 2 
-boundary, and reshuffling the free lists is trivial: we simply 
-merge every consecutive pair of free lists.
+Unfortunately it is difficult to know what heuristics should be 
+used to determine zone sizes, and our transaction code relies on 
+being able to create a “recovery area” by simply appending to the 
+file (difficult if it would need to create a new zone header). 
+Thus we use a linked-list of free tables; currently we only ever 
+create one, but if there is more than one we choose one at random 
+to use. In future we may use heuristics to add new free tables on 
+contention. We only expand the file when all free tables are 
+exhausted.
 
 
 The basic algorithm is as follows. Freeing is simple:
 The basic algorithm is as follows. Freeing is simple:
 
 
-1. Identify the correct zone.
+1. Identify the correct free list.
 
 
 2. Lock the corresponding list.
 2. Lock the corresponding list.
 
 
-3. Re-check the zone (we didn't have a lock, sizes could have 
+3. Re-check the list (we didn't have a lock, sizes could have 
   changed): relock if necessary.
   changed): relock if necessary.
 
 
-4. Place the freed entry in the list for that zone.
+4. Place the freed entry in the list.
 
 
 Allocation is a little more complicated, as we perform delayed 
 Allocation is a little more complicated, as we perform delayed 
 coalescing at this point:
 coalescing at this point:
 
 
-1. Pick a zone either the zone we last freed into, or based on a “
-  random” number.
+1. Pick a free table; usually the previous one.
 
 
 2. Lock the corresponding list.
 2. Lock the corresponding list.
 
 
-3. Re-check the zone: relock if necessary.
-
-4. If the top entry is -large enough, remove it from the list and 
+3. If the top entry is -large enough, remove it from the list and 
   return it.
   return it.
 
 
-5. Otherwise, coalesce entries in the list.If there was no entry 
-  large enough, unlock the list and try the next zone.
+4. Otherwise, coalesce entries in the list.If there was no entry 
+  large enough, unlock the list and try the next largest list
+
+5. If no list has an entry which meets our needs, try the next 
+  free table.
 
 
 6. If no zone satisfies, expand the file.
 6. If no zone satisfies, expand the file.
 
 
@@ -798,24 +866,9 @@ ordering seems to be fairly good for keeping fragmentation low
 does not need a tailer to coalesce, though if we needed one we 
 does not need a tailer to coalesce, though if we needed one we 
 could have one cheaply: see [sub:Records-Incur-A]. 
 could have one cheaply: see [sub:Records-Incur-A]. 
 
 
-I anticipate that the number of entries in each free zone would 
-be small, but it might be worth using one free entry to hold 
-pointers to the others for cache efficiency.
-
-<freelist-in-zone>If we want to avoid locking complexity 
-(enlarging the free lists when we enlarge the file) we could 
-place the array of free lists at the beginning of each zone. This 
-means existing array lists never move, but means that a record 
-cannot be larger than a zone. That in turn implies that zones 
-should be variable sized (say, power of 2), which makes the 
-question “what zone is this record in?” much harder (and “pick a 
-random zone”, but that's less common). It could be done with as 
-few as 4 bits from the record header.[footnote:
-Using 2^{16+N*3}means 0 gives a minimal 65536-byte zone, 15 gives 
-the maximal 2^{61} byte zone. Zones range in factor of 8 steps. 
-Given the zone size for the zone the current record is in, we can 
-determine the start of the zone.
-]
+Each free entry has the free table number in the header: less 
+than 255. It also contains a doubly-linked list for easy 
+deletion.
 
 
 3.6 <sub:TDB-Becomes-Fragmented>TDB Becomes Fragmented
 3.6 <sub:TDB-Becomes-Fragmented>TDB Becomes Fragmented
 
 
@@ -944,13 +997,13 @@ This produces a 16 byte used header like this:
 
 
 struct tdb_used_record {
 struct tdb_used_record {
 
 
-        uint32_t magic : 16,
+        uint32_t used_magic : 16,
+
 
 
-                 prev_is_free: 1,
 
 
                  key_data_divide: 5,
                  key_data_divide: 5,
 
 
-                 top_hash: 10;
+                 top_hash: 11;
 
 
         uint32_t extra_octets;
         uint32_t extra_octets;
 
 
@@ -962,21 +1015,27 @@ And a free record like this:
 
 
 struct tdb_free_record {
 struct tdb_free_record {
 
 
-        uint32_t free_magic;
+        uint64_t free_magic: 8,
+
+                   prev : 56;
 
 
-        uint64_t total_length;
 
 
-        uint64_t prev, next;
 
 
-        ...
+        uint64_t free_table: 8,
 
 
-        uint64_t tailer;
+                 total_length : 56
+
+        uint64_t next;;
 
 
 };
 };
 
 
-We might want to take some bits from the used record's top_hash 
-(and the free record which has 32 bits of padding to spare 
-anyway) if we use variable sized zones. See [freelist-in-zone].
+Note that by limiting valid offsets to 56 bits, we can pack 
+everything we need into 3 64-byte words, meaning our minimum 
+record size is 8 bytes.
+
+3.7.2 Status
+
+Complete.
 
 
 3.8 Transaction Commit Requires 4 fdatasync
 3.8 Transaction Commit Requires 4 fdatasync
 
 
@@ -1029,12 +1088,14 @@ but need only be done at open. For running databases, a separate
 header field can be used to indicate a transaction in progress; 
 header field can be used to indicate a transaction in progress; 
 we need only check for recovery if this is set.
 we need only check for recovery if this is set.
 
 
-3.9 <sub:TDB-Does-Not>TDB Does Not Have Snapshot Support
+3.8.2 Status
 
 
-3.9.1 Proposed Solution
+Deferred.
 
 
-None. At some point you say “use a real database”  (but see [replay-attribute]
-).
+3.9 <sub:TDB-Does-Not>TDB Does Not Have Snapshot Support
+
+3.9.1 Proposed SolutionNone. At some point you say “use a real 
+  database” (but see [replay-attribute]).
 
 
 But as a thought experiment, if we implemented transactions to 
 But as a thought experiment, if we implemented transactions to 
 only overwrite free entries (this is tricky: there must not be a 
 only overwrite free entries (this is tricky: there must not be a 
@@ -1053,6 +1114,10 @@ rewrite some sections of the hash, too.
 We could then implement snapshots using a similar method, using 
 We could then implement snapshots using a similar method, using 
 multiple different hash tables/free tables.
 multiple different hash tables/free tables.
 
 
+3.9.2 Status
+
+Deferred.
+
 3.10 Transactions Cannot Operate in Parallel
 3.10 Transactions Cannot Operate in Parallel
 
 
 This would be useless for ldb, as it hits the index records with 
 This would be useless for ldb, as it hits the index records with 
@@ -1069,6 +1134,10 @@ allow one write transaction to begin, but it could not commit
 until all r/o transactions are done. This would require a new 
 until all r/o transactions are done. This would require a new 
 RO_TRANSACTION_LOCK, which would be upgraded on commit.
 RO_TRANSACTION_LOCK, which would be upgraded on commit.
 
 
+3.10.2 Status
+
+Deferred.
+
 3.11 Default Hash Function Is Suboptimal
 3.11 Default Hash Function Is Suboptimal
 
 
 The Knuth-inspired multiplicative hash used by tdb is fairly slow 
 The Knuth-inspired multiplicative hash used by tdb is fairly slow 
@@ -1090,6 +1159,10 @@ The seed should be created at tdb-creation time from some random
 source, and placed in the header. This is far from foolproof, but 
 source, and placed in the header. This is far from foolproof, but 
 adds a little bit of protection against hash bombing.
 adds a little bit of protection against hash bombing.
 
 
+3.11.2 Status
+
+Complete.
+
 3.12 <Reliable-Traversal-Adds>Reliable Traversal Adds Complexity
 3.12 <Reliable-Traversal-Adds>Reliable Traversal Adds Complexity
 
 
 We lock a record during traversal iteration, and try to grab that 
 We lock a record during traversal iteration, and try to grab that 
@@ -1104,6 +1177,10 @@ indefinitely.
 
 
 Remove reliability guarantees; see [traverse-Proposed-Solution].
 Remove reliability guarantees; see [traverse-Proposed-Solution].
 
 
+3.12.2 Status
+
+Complete.
+
 3.13 Fcntl Locking Adds Overhead
 3.13 Fcntl Locking Adds Overhead
 
 
 Placing a fcntl lock means a system call, as does removing one. 
 Placing a fcntl lock means a system call, as does removing one. 
@@ -1176,3 +1253,7 @@ tdb_open (see [attributes]) to provide replay/trace hooks, which
 could become the basis for this and future parallel transactions 
 could become the basis for this and future parallel transactions 
 and snapshot support.
 and snapshot support.
 
 
+3.15.2 Status
+
+Deferred.
+

+ 187 - 123
ccan/tdb2/free.c

@@ -49,23 +49,24 @@ unsigned int size_to_bucket(tdb_len_t data_len)
 	return bucket;
 	return bucket;
 }
 }
 
 
-tdb_off_t first_flist(struct tdb_context *tdb)
+tdb_off_t first_ftable(struct tdb_context *tdb)
 {
 {
-	return tdb_read_off(tdb, offsetof(struct tdb_header, free_list));
+	return tdb_read_off(tdb, offsetof(struct tdb_header, free_table));
 }
 }
 
 
-tdb_off_t next_flist(struct tdb_context *tdb, tdb_off_t flist)
+tdb_off_t next_ftable(struct tdb_context *tdb, tdb_off_t ftable)
 {
 {
-	return tdb_read_off(tdb, flist + offsetof(struct tdb_freelist, next));
+	return tdb_read_off(tdb, ftable + offsetof(struct tdb_freetable,next));
 }
 }
 
 
-int tdb_flist_init(struct tdb_context *tdb)
+int tdb_ftable_init(struct tdb_context *tdb)
 {
 {
 	/* Use reservoir sampling algorithm to select a free list at random. */
 	/* Use reservoir sampling algorithm to select a free list at random. */
-	unsigned int rnd, max = 0;
+	unsigned int rnd, max = 0, count = 0;
 	tdb_off_t off;
 	tdb_off_t off;
 
 
-	tdb->flist_off = off = first_flist(tdb);
+	tdb->ftable_off = off = first_ftable(tdb);
+	tdb->ftable = 0;
 
 
 	while (off) {
 	while (off) {
 		if (off == TDB_OFF_ERR)
 		if (off == TDB_OFF_ERR)
@@ -73,50 +74,52 @@ int tdb_flist_init(struct tdb_context *tdb)
 
 
 		rnd = random();
 		rnd = random();
 		if (rnd >= max) {
 		if (rnd >= max) {
-			tdb->flist_off = off;
+			tdb->ftable_off = off;
+			tdb->ftable = count;
 			max = rnd;
 			max = rnd;
 		}
 		}
 
 
-		off = next_flist(tdb, off);
+		off = next_ftable(tdb, off);
+		count++;
 	}
 	}
 	return 0;
 	return 0;
 }
 }
 
 
 /* Offset of a given bucket. */
 /* Offset of a given bucket. */
-tdb_off_t bucket_off(tdb_off_t flist_off, unsigned bucket)
+tdb_off_t bucket_off(tdb_off_t ftable_off, unsigned bucket)
 {
 {
-	return flist_off + offsetof(struct tdb_freelist, buckets)
+	return ftable_off + offsetof(struct tdb_freetable, buckets)
 		+ bucket * sizeof(tdb_off_t);
 		+ bucket * sizeof(tdb_off_t);
 }
 }
 
 
 /* Returns free_buckets + 1, or list number to search. */
 /* Returns free_buckets + 1, or list number to search. */
 static tdb_off_t find_free_head(struct tdb_context *tdb,
 static tdb_off_t find_free_head(struct tdb_context *tdb,
-				tdb_off_t flist_off,
+				tdb_off_t ftable_off,
 				tdb_off_t bucket)
 				tdb_off_t bucket)
 {
 {
 	/* Speculatively search for a non-zero bucket. */
 	/* Speculatively search for a non-zero bucket. */
-	return tdb_find_nonzero_off(tdb, bucket_off(flist_off, 0),
+	return tdb_find_nonzero_off(tdb, bucket_off(ftable_off, 0),
 				    bucket, TDB_FREE_BUCKETS);
 				    bucket, TDB_FREE_BUCKETS);
 }
 }
 
 
 /* Remove from free bucket. */
 /* Remove from free bucket. */
 static int remove_from_list(struct tdb_context *tdb,
 static int remove_from_list(struct tdb_context *tdb,
 			    tdb_off_t b_off, tdb_off_t r_off,
 			    tdb_off_t b_off, tdb_off_t r_off,
-			    struct tdb_free_record *r)
+			    const struct tdb_free_record *r)
 {
 {
 	tdb_off_t off;
 	tdb_off_t off;
 
 
 	/* Front of list? */
 	/* Front of list? */
-	if (r->prev == 0) {
+	if (frec_prev(r) == 0) {
 		off = b_off;
 		off = b_off;
 	} else {
 	} else {
-		off = r->prev + offsetof(struct tdb_free_record, next);
+		off = frec_prev(r) + offsetof(struct tdb_free_record, next);
 	}
 	}
 
 
 #ifdef DEBUG
 #ifdef DEBUG
 	if (tdb_read_off(tdb, off) != r_off) {
 	if (tdb_read_off(tdb, off) != r_off) {
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-			 "remove_from_list: %llu bad prev in list %llu\n",
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_FATAL,
+			 "remove_from_list: %llu bad prev in list %llu",
 			 (long long)r_off, (long long)b_off);
 			 (long long)r_off, (long long)b_off);
 		return -1;
 		return -1;
 	}
 	}
@@ -128,19 +131,19 @@ static int remove_from_list(struct tdb_context *tdb,
 	}
 	}
 
 
 	if (r->next != 0) {
 	if (r->next != 0) {
-		off = r->next + offsetof(struct tdb_free_record, prev);
+		off = r->next + offsetof(struct tdb_free_record,magic_and_prev);
 		/* r->next->prev = r->prev */
 		/* r->next->prev = r->prev */
 
 
 #ifdef DEBUG
 #ifdef DEBUG
-		if (tdb_read_off(tdb, off) != r_off) {
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-				 "remove_from_list: %llu bad list %llu\n",
-				 (long long)r_off, (long long)b_off);
+		if (tdb_read_off(tdb, off) & TDB_OFF_MASK != r_off) {
+			tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_FATAL,
+				   "remove_from_list: %llu bad list %llu",
+				   (long long)r_off, (long long)b_off);
 			return -1;
 			return -1;
 		}
 		}
 #endif
 #endif
 
 
-		if (tdb_write_off(tdb, off, r->prev)) {
+		if (tdb_write_off(tdb, off, r->magic_and_prev)) {
 			return -1;
 			return -1;
 		}
 		}
 	}
 	}
@@ -151,58 +154,66 @@ static int remove_from_list(struct tdb_context *tdb,
 static int enqueue_in_free(struct tdb_context *tdb,
 static int enqueue_in_free(struct tdb_context *tdb,
 			   tdb_off_t b_off,
 			   tdb_off_t b_off,
 			   tdb_off_t off,
 			   tdb_off_t off,
-			   struct tdb_free_record *new)
+			   tdb_len_t len)
 {
 {
-	new->prev = 0;
+	struct tdb_free_record new;
+	uint64_t magic = (TDB_FREE_MAGIC << (64 - TDB_OFF_UPPER_STEAL));
+
+	/* We only need to set ftable_and_len; rest is set in enqueue_in_free */
+	new.ftable_and_len = ((uint64_t)tdb->ftable << (64 - TDB_OFF_UPPER_STEAL))
+		| len;
+	/* prev = 0. */
+	new.magic_and_prev = magic;
+
 	/* new->next = head. */
 	/* new->next = head. */
-	new->next = tdb_read_off(tdb, b_off);
-	if (new->next == TDB_OFF_ERR)
+	new.next = tdb_read_off(tdb, b_off);
+	if (new.next == TDB_OFF_ERR)
 		return -1;
 		return -1;
 
 
-	if (new->next) {
+	if (new.next) {
 #ifdef DEBUG
 #ifdef DEBUG
 		if (tdb_read_off(tdb,
 		if (tdb_read_off(tdb,
-				 new->next
-				 + offsetof(struct tdb_free_record, prev))
-		    != 0) {
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-				 "enqueue_in_free: %llu bad head prev %llu\n",
-				 (long long)new->next, (long long)b_off);
+				 new.next + offsetof(struct tdb_free_record,
+						     magic_and_prev))
+		    != magic) {
+			tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_FATAL,
+				   "enqueue_in_free: %llu bad head"
+				   " prev %llu",
+				   (long long)new.next, (long long)b_off);
 			return -1;
 			return -1;
 		}
 		}
 #endif
 #endif
 		/* next->prev = new. */
 		/* next->prev = new. */
-		if (tdb_write_off(tdb, new->next
-				  + offsetof(struct tdb_free_record, prev),
-				  off) != 0)
+		if (tdb_write_off(tdb, new.next
+				  + offsetof(struct tdb_free_record,
+					     magic_and_prev),
+				  off | magic) != 0)
 			return -1;
 			return -1;
 	}
 	}
 	/* head = new */
 	/* head = new */
 	if (tdb_write_off(tdb, b_off, off) != 0)
 	if (tdb_write_off(tdb, b_off, off) != 0)
 		return -1;
 		return -1;
 
 
-	return tdb_write_convert(tdb, off, new, sizeof(*new));
+	return tdb_write_convert(tdb, off, &new, sizeof(new));
 }
 }
 
 
 /* List need not be locked. */
 /* List need not be locked. */
 int add_free_record(struct tdb_context *tdb,
 int add_free_record(struct tdb_context *tdb,
 		    tdb_off_t off, tdb_len_t len_with_header)
 		    tdb_off_t off, tdb_len_t len_with_header)
 {
 {
-	struct tdb_free_record new;
 	tdb_off_t b_off;
 	tdb_off_t b_off;
+	tdb_len_t len;
 	int ret;
 	int ret;
 
 
-	assert(len_with_header >= sizeof(new));
+	assert(len_with_header >= sizeof(struct tdb_free_record));
 
 
-	new.magic_and_meta = TDB_FREE_MAGIC << (64 - TDB_OFF_UPPER_STEAL)
-		| tdb->flist_off;
-	new.data_len = len_with_header - sizeof(struct tdb_used_record);
+	len = len_with_header - sizeof(struct tdb_used_record);
 
 
-	b_off = bucket_off(tdb->flist_off, size_to_bucket(new.data_len));
+	b_off = bucket_off(tdb->ftable_off, size_to_bucket(len));
 	if (tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) != 0)
 	if (tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) != 0)
 		return -1;
 		return -1;
 
 
-	ret = enqueue_in_free(tdb, b_off, off, &new);
+	ret = enqueue_in_free(tdb, b_off, off, len);
 	tdb_unlock_free_bucket(tdb, b_off);
 	tdb_unlock_free_bucket(tdb, b_off);
 	return ret;
 	return ret;
 }
 }
@@ -234,91 +245,113 @@ static size_t record_leftover(size_t keylen, size_t datalen,
 	return leftover;
 	return leftover;
 }
 }
 
 
+static tdb_off_t ftable_offset(struct tdb_context *tdb, unsigned int ftable)
+{
+	tdb_off_t off;
+	unsigned int i;
+
+	if (likely(tdb->ftable == ftable))
+		return tdb->ftable_off;
+
+	off = first_ftable(tdb);
+	for (i = 0; i < ftable; i++)
+		off = next_ftable(tdb, off);
+	return off;
+}
+
 /* Note: we unlock the current bucket if we coalesce or fail. */
 /* Note: we unlock the current bucket if we coalesce or fail. */
 static int coalesce(struct tdb_context *tdb,
 static int coalesce(struct tdb_context *tdb,
 		    tdb_off_t off, tdb_off_t b_off, tdb_len_t data_len)
 		    tdb_off_t off, tdb_off_t b_off, tdb_len_t data_len)
 {
 {
-	struct tdb_free_record pad, *r;
 	tdb_off_t end;
 	tdb_off_t end;
+	struct tdb_free_record rec;
 
 
+	add_stat(tdb, alloc_coalesce_tried, 1);
 	end = off + sizeof(struct tdb_used_record) + data_len;
 	end = off + sizeof(struct tdb_used_record) + data_len;
 
 
 	while (end < tdb->map_size) {
 	while (end < tdb->map_size) {
+		const struct tdb_free_record *r;
 		tdb_off_t nb_off;
 		tdb_off_t nb_off;
+		unsigned ftable, bucket;
 
 
-		/* FIXME: do tdb_get here and below really win? */
-		r = tdb_get(tdb, end, &pad, sizeof(pad));
+		r = tdb_access_read(tdb, end, sizeof(*r), true);
 		if (!r)
 		if (!r)
 			goto err;
 			goto err;
 
 
-		if (frec_magic(r) != TDB_FREE_MAGIC)
+		if (frec_magic(r) != TDB_FREE_MAGIC
+		    || frec_ftable(r) == TDB_FTABLE_NONE) {
+			tdb_access_release(tdb, r);
 			break;
 			break;
+		}
 
 
-		nb_off = bucket_off(frec_flist(r), size_to_bucket(r->data_len));
+		ftable = frec_ftable(r);
+		bucket = size_to_bucket(frec_len(r));
+		nb_off = bucket_off(ftable_offset(tdb, ftable), bucket);
+		tdb_access_release(tdb, r);
 
 
 		/* We may be violating lock order here, so best effort. */
 		/* We may be violating lock order here, so best effort. */
-		if (tdb_lock_free_bucket(tdb, nb_off, TDB_LOCK_NOWAIT) == -1)
+		if (tdb_lock_free_bucket(tdb, nb_off, TDB_LOCK_NOWAIT) == -1) {
+			add_stat(tdb, alloc_coalesce_lockfail, 1);
 			break;
 			break;
+		}
 
 
 		/* Now we have lock, re-check. */
 		/* Now we have lock, re-check. */
-		r = tdb_get(tdb, end, &pad, sizeof(pad));
-		if (!r) {
+		if (tdb_read_convert(tdb, end, &rec, sizeof(rec))) {
 			tdb_unlock_free_bucket(tdb, nb_off);
 			tdb_unlock_free_bucket(tdb, nb_off);
 			goto err;
 			goto err;
 		}
 		}
 
 
-		if (unlikely(frec_magic(r) != TDB_FREE_MAGIC)) {
+		if (unlikely(frec_magic(&rec) != TDB_FREE_MAGIC)) {
+			add_stat(tdb, alloc_coalesce_race, 1);
 			tdb_unlock_free_bucket(tdb, nb_off);
 			tdb_unlock_free_bucket(tdb, nb_off);
 			break;
 			break;
 		}
 		}
 
 
-		if (unlikely(bucket_off(frec_flist(r),
-					size_to_bucket(r->data_len))
-			     != nb_off)) {
+		if (unlikely(frec_ftable(&rec) != ftable)
+		    || unlikely(size_to_bucket(frec_len(&rec)) != bucket)) {
+			add_stat(tdb, alloc_coalesce_race, 1);
 			tdb_unlock_free_bucket(tdb, nb_off);
 			tdb_unlock_free_bucket(tdb, nb_off);
 			break;
 			break;
 		}
 		}
 
 
-		if (remove_from_list(tdb, nb_off, end, r) == -1) {
+		if (remove_from_list(tdb, nb_off, end, &rec) == -1) {
 			tdb_unlock_free_bucket(tdb, nb_off);
 			tdb_unlock_free_bucket(tdb, nb_off);
 			goto err;
 			goto err;
 		}
 		}
 
 
-		end += sizeof(struct tdb_used_record) + r->data_len;
+		end += sizeof(struct tdb_used_record) + frec_len(&rec);
 		tdb_unlock_free_bucket(tdb, nb_off);
 		tdb_unlock_free_bucket(tdb, nb_off);
+		add_stat(tdb, alloc_coalesce_num_merged, 1);
 	}
 	}
 
 
 	/* Didn't find any adjacent free? */
 	/* Didn't find any adjacent free? */
 	if (end == off + sizeof(struct tdb_used_record) + data_len)
 	if (end == off + sizeof(struct tdb_used_record) + data_len)
 		return 0;
 		return 0;
 
 
-	/* OK, expand record */
-	r = tdb_get(tdb, off, &pad, sizeof(pad));
-	if (!r)
+	/* OK, expand initial record */
+	if (tdb_read_convert(tdb, off, &rec, sizeof(rec)))
 		goto err;
 		goto err;
 
 
-	if (r->data_len != data_len) {
-		tdb->ecode = TDB_ERR_CORRUPT;
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-			 "coalesce: expected data len %llu not %llu\n",
-			 (long long)data_len, (long long)r->data_len);
+	if (frec_len(&rec) != data_len) {
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_FATAL,
+			   "coalesce: expected data len %zu not %zu",
+			   (size_t)data_len, (size_t)frec_len(&rec));
 		goto err;
 		goto err;
 	}
 	}
 
 
-	if (remove_from_list(tdb, b_off, off, r) == -1)
-		goto err;
-
-	r = tdb_access_write(tdb, off, sizeof(*r), true);
-	if (!r)
+	if (remove_from_list(tdb, b_off, off, &rec) == -1)
 		goto err;
 		goto err;
 
 
 	/* We have to drop this to avoid deadlocks, so make sure record
 	/* We have to drop this to avoid deadlocks, so make sure record
 	 * doesn't get coalesced by someone else! */
 	 * doesn't get coalesced by someone else! */
-	r->magic_and_meta = TDB_COALESCING_MAGIC << (64 - TDB_OFF_UPPER_STEAL);
-	r->data_len = end - off - sizeof(struct tdb_used_record);
-	if (tdb_access_commit(tdb, r) != 0)
+	rec.ftable_and_len = (TDB_FTABLE_NONE << (64 - TDB_OFF_UPPER_STEAL))
+		| (end - off - sizeof(struct tdb_used_record));
+	if (tdb_write_off(tdb, off + offsetof(struct tdb_free_record,
+					      ftable_and_len),
+			  rec.ftable_and_len) != 0)
 		goto err;
 		goto err;
 
 
+	add_stat(tdb, alloc_coalesce_succeeded, 1);
 	tdb_unlock_free_bucket(tdb, b_off);
 	tdb_unlock_free_bucket(tdb, b_off);
 
 
 	if (add_free_record(tdb, off, end - off) == -1)
 	if (add_free_record(tdb, off, end - off) == -1)
@@ -333,19 +366,21 @@ err:
 
 
 /* We need size bytes to put our key and data in. */
 /* We need size bytes to put our key and data in. */
 static tdb_off_t lock_and_alloc(struct tdb_context *tdb,
 static tdb_off_t lock_and_alloc(struct tdb_context *tdb,
-				tdb_off_t flist_off,
+				tdb_off_t ftable_off,
 				tdb_off_t bucket,
 				tdb_off_t bucket,
 				size_t keylen, size_t datalen,
 				size_t keylen, size_t datalen,
 				bool want_extra,
 				bool want_extra,
+				unsigned magic,
 				unsigned hashlow)
 				unsigned hashlow)
 {
 {
 	tdb_off_t off, b_off,best_off;
 	tdb_off_t off, b_off,best_off;
-	struct tdb_free_record pad, best = { 0 }, *r;
+	struct tdb_free_record best = { 0 };
 	double multiplier;
 	double multiplier;
 	size_t size = adjust_size(keylen, datalen);
 	size_t size = adjust_size(keylen, datalen);
 
 
+	add_stat(tdb, allocs, 1);
 again:
 again:
-	b_off = bucket_off(flist_off, bucket);
+	b_off = bucket_off(ftable_off, bucket);
 
 
 	/* FIXME: Try non-blocking wait first, to measure contention. */
 	/* FIXME: Try non-blocking wait first, to measure contention. */
 	/* Lock this bucket. */
 	/* Lock this bucket. */
@@ -353,7 +388,7 @@ again:
 		return TDB_OFF_ERR;
 		return TDB_OFF_ERR;
 	}
 	}
 
 
-	best.data_len = -1ULL;
+	best.ftable_and_len = -1ULL;
 	best_off = 0;
 	best_off = 0;
 
 
 	/* Get slack if we're after extra. */
 	/* Get slack if we're after extra. */
@@ -369,30 +404,40 @@ again:
 		goto unlock_err;
 		goto unlock_err;
 
 
 	while (off) {
 	while (off) {
-		/* FIXME: Does tdb_get win anything here? */
-		r = tdb_get(tdb, off, &pad, sizeof(*r));
+		const struct tdb_free_record *r;
+		tdb_len_t len;
+		tdb_off_t next;
+
+		r = tdb_access_read(tdb, off, sizeof(*r), true);
 		if (!r)
 		if (!r)
 			goto unlock_err;
 			goto unlock_err;
 
 
 		if (frec_magic(r) != TDB_FREE_MAGIC) {
 		if (frec_magic(r) != TDB_FREE_MAGIC) {
-			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-				 "lock_and_alloc: %llu non-free 0x%llx\n",
-				 (long long)off, (long long)r->magic_and_meta);
+			tdb_access_release(tdb, r);
+			tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_FATAL,
+				 "lock_and_alloc: %llu non-free 0x%llx",
+				 (long long)off, (long long)r->magic_and_prev);
 			goto unlock_err;
 			goto unlock_err;
 		}
 		}
 
 
-		if (r->data_len >= size && r->data_len < best.data_len) {
+		if (frec_len(r) >= size && frec_len(r) < frec_len(&best)) {
 			best_off = off;
 			best_off = off;
 			best = *r;
 			best = *r;
 		}
 		}
 
 
-		if (best.data_len < size * multiplier && best_off)
+		if (frec_len(&best) < size * multiplier && best_off) {
+			tdb_access_release(tdb, r);
 			break;
 			break;
+		}
 
 
 		multiplier *= 1.01;
 		multiplier *= 1.01;
 
 
+		next = r->next;
+		len = frec_len(r);
+		tdb_access_release(tdb, r);
+
 		/* Since we're going slow anyway, try coalescing here. */
 		/* Since we're going slow anyway, try coalescing here. */
-		switch (coalesce(tdb, off, b_off, r->data_len)) {
+		switch (coalesce(tdb, off, b_off, len)) {
 		case -1:
 		case -1:
 			/* This has already unlocked on error. */
 			/* This has already unlocked on error. */
 			return -1;
 			return -1;
@@ -400,7 +445,7 @@ again:
 			/* This has unlocked list, restart. */
 			/* This has unlocked list, restart. */
 			goto again;
 			goto again;
 		}
 		}
-		off = r->next;
+		off = next;
 	}
 	}
 
 
 	/* If we found anything at all, use it. */
 	/* If we found anything at all, use it. */
@@ -413,28 +458,30 @@ again:
 			goto unlock_err;
 			goto unlock_err;
 
 
 		leftover = record_leftover(keylen, datalen, want_extra,
 		leftover = record_leftover(keylen, datalen, want_extra,
-					   best.data_len);
+					   frec_len(&best));
 
 
-		assert(keylen + datalen + leftover <= best.data_len);
+		assert(keylen + datalen + leftover <= frec_len(&best));
 		/* We need to mark non-free before we drop lock, otherwise
 		/* We need to mark non-free before we drop lock, otherwise
 		 * coalesce() could try to merge it! */
 		 * coalesce() could try to merge it! */
-		if (set_header(tdb, &rec, keylen, datalen,
-			       best.data_len - leftover,
-			       hashlow) != 0)
+		if (set_header(tdb, &rec, magic, keylen, datalen,
+			       frec_len(&best) - leftover, hashlow) != 0)
 			goto unlock_err;
 			goto unlock_err;
 
 
 		if (tdb_write_convert(tdb, best_off, &rec, sizeof(rec)) != 0)
 		if (tdb_write_convert(tdb, best_off, &rec, sizeof(rec)) != 0)
 			goto unlock_err;
 			goto unlock_err;
 
 
-		tdb_unlock_free_bucket(tdb, b_off);
-
+		/* Bucket of leftover will be <= current bucket, so nested
+		 * locking is allowed. */
 		if (leftover) {
 		if (leftover) {
+			add_stat(tdb, alloc_leftover, 1);
 			if (add_free_record(tdb,
 			if (add_free_record(tdb,
 					    best_off + sizeof(rec)
 					    best_off + sizeof(rec)
-					    + best.data_len - leftover,
+					    + frec_len(&best) - leftover,
 					    leftover))
 					    leftover))
-				return TDB_OFF_ERR;
+				best_off = TDB_OFF_ERR;
 		}
 		}
+		tdb_unlock_free_bucket(tdb, b_off);
+
 		return best_off;
 		return best_off;
 	}
 	}
 
 
@@ -449,10 +496,10 @@ unlock_err:
 /* Get a free block from current free list, or 0 if none. */
 /* Get a free block from current free list, or 0 if none. */
 static tdb_off_t get_free(struct tdb_context *tdb,
 static tdb_off_t get_free(struct tdb_context *tdb,
 			  size_t keylen, size_t datalen, bool want_extra,
 			  size_t keylen, size_t datalen, bool want_extra,
-			  unsigned hashlow)
+			  unsigned magic, unsigned hashlow)
 {
 {
-	tdb_off_t off, flist;
-	unsigned start_b, b;
+	tdb_off_t off, ftable_off;
+	unsigned start_b, b, ftable;
 	bool wrapped = false;
 	bool wrapped = false;
 
 
 	/* If they are growing, add 50% to get to higher bucket. */
 	/* If they are growing, add 50% to get to higher bucket. */
@@ -462,31 +509,40 @@ static tdb_off_t get_free(struct tdb_context *tdb,
 	else
 	else
 		start_b = size_to_bucket(adjust_size(keylen, datalen));
 		start_b = size_to_bucket(adjust_size(keylen, datalen));
 
 
-	flist = tdb->flist_off;
-	while (!wrapped || flist != tdb->flist_off) {
+	ftable_off = tdb->ftable_off;
+	ftable = tdb->ftable;
+	while (!wrapped || ftable_off != tdb->ftable_off) {
 		/* Start at exact size bucket, and search up... */
 		/* Start at exact size bucket, and search up... */
-		for (b = find_free_head(tdb, flist, start_b);
+		for (b = find_free_head(tdb, ftable_off, start_b);
 		     b < TDB_FREE_BUCKETS;
 		     b < TDB_FREE_BUCKETS;
-		     b = find_free_head(tdb, flist, b + 1)) {
+		     b = find_free_head(tdb, ftable_off, b + 1)) {
 			/* Try getting one from list. */
 			/* Try getting one from list. */
-			off = lock_and_alloc(tdb, flist,
+			off = lock_and_alloc(tdb, ftable_off,
 					     b, keylen, datalen, want_extra,
 					     b, keylen, datalen, want_extra,
-					     hashlow);
+					     magic, hashlow);
 			if (off == TDB_OFF_ERR)
 			if (off == TDB_OFF_ERR)
 				return TDB_OFF_ERR;
 				return TDB_OFF_ERR;
 			if (off != 0) {
 			if (off != 0) {
+				if (b == start_b)
+					add_stat(tdb, alloc_bucket_exact, 1);
+				if (b == TDB_FREE_BUCKETS - 1)
+					add_stat(tdb, alloc_bucket_max, 1);
 				/* Worked?  Stay using this list. */
 				/* Worked?  Stay using this list. */
-				tdb->flist_off = flist;
+				tdb->ftable_off = ftable_off;
+				tdb->ftable = ftable;
 				return off;
 				return off;
 			}
 			}
 			/* Didn't work.  Try next bucket. */
 			/* Didn't work.  Try next bucket. */
 		}
 		}
 
 
-		/* Hmm, try next list. */
-		flist = next_flist(tdb, flist);
-		if (flist == 0) {
+		/* Hmm, try next table. */
+		ftable_off = next_ftable(tdb, ftable_off);
+		ftable++;
+
+		if (ftable_off == 0) {
 			wrapped = true;
 			wrapped = true;
-			flist = first_flist(tdb);
+			ftable_off = first_ftable(tdb);
+			ftable = 0;
 		}
 		}
 	}
 	}
 
 
@@ -495,7 +551,7 @@ static tdb_off_t get_free(struct tdb_context *tdb,
 
 
 int set_header(struct tdb_context *tdb,
 int set_header(struct tdb_context *tdb,
 	       struct tdb_used_record *rec,
 	       struct tdb_used_record *rec,
-	       uint64_t keylen, uint64_t datalen,
+	       unsigned magic, uint64_t keylen, uint64_t datalen,
 	       uint64_t actuallen, unsigned hashlow)
 	       uint64_t actuallen, unsigned hashlow)
 {
 {
 	uint64_t keybits = (fls64(keylen) + 1) / 2;
 	uint64_t keybits = (fls64(keylen) + 1) / 2;
@@ -504,16 +560,15 @@ int set_header(struct tdb_context *tdb,
 	rec->magic_and_meta = (hashlow & ((1 << 11)-1))
 	rec->magic_and_meta = (hashlow & ((1 << 11)-1))
 		| ((actuallen - (keylen + datalen)) << 11)
 		| ((actuallen - (keylen + datalen)) << 11)
 		| (keybits << 43)
 		| (keybits << 43)
-		| (TDB_MAGIC << 48);
+		| ((uint64_t)magic << 48);
 	rec->key_and_data_len = (keylen | (datalen << (keybits*2)));
 	rec->key_and_data_len = (keylen | (datalen << (keybits*2)));
 
 
 	/* Encoding can fail on big values. */
 	/* Encoding can fail on big values. */
 	if (rec_key_length(rec) != keylen
 	if (rec_key_length(rec) != keylen
 	    || rec_data_length(rec) != datalen
 	    || rec_data_length(rec) != datalen
 	    || rec_extra_padding(rec) != actuallen - (keylen + datalen)) {
 	    || rec_extra_padding(rec) != actuallen - (keylen + datalen)) {
-		tdb->ecode = TDB_ERR_IO;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "Could not encode k=%llu,d=%llu,a=%llu\n",
+		tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
+			 "Could not encode k=%llu,d=%llu,a=%llu",
 			 (long long)keylen, (long long)datalen,
 			 (long long)keylen, (long long)datalen,
 			 (long long)actuallen);
 			 (long long)actuallen);
 		return -1;
 		return -1;
@@ -533,11 +588,19 @@ static int tdb_expand(struct tdb_context *tdb, tdb_len_t size)
 	/* Need to hold a hash lock to expand DB: transactions rely on it. */
 	/* Need to hold a hash lock to expand DB: transactions rely on it. */
 	if (!(tdb->flags & TDB_NOLOCK)
 	if (!(tdb->flags & TDB_NOLOCK)
 	    && !tdb->allrecord_lock.count && !tdb_has_hash_locks(tdb)) {
 	    && !tdb->allrecord_lock.count && !tdb_has_hash_locks(tdb)) {
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-			 "tdb_expand: must hold lock during expand\n");
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
+			   "tdb_expand: must hold lock during expand");
 		return -1;
 		return -1;
 	}
 	}
 
 
+	/* always make room for at least 100 more records, and at
+           least 25% more space. */
+	if (size * TDB_EXTENSION_FACTOR > tdb->map_size / 4)
+		wanted = size * TDB_EXTENSION_FACTOR;
+	else
+		wanted = tdb->map_size / 4;
+	wanted = adjust_size(0, wanted);
+
 	/* Only one person can expand file at a time. */
 	/* Only one person can expand file at a time. */
 	if (tdb_lock_expand(tdb, F_WRLCK) != 0)
 	if (tdb_lock_expand(tdb, F_WRLCK) != 0)
 		return -1;
 		return -1;
@@ -550,7 +613,7 @@ static int tdb_expand(struct tdb_context *tdb, tdb_len_t size)
 		return 0;
 		return 0;
 	}
 	}
 
 
-	if (tdb->methods->expand_file(tdb, wanted*TDB_EXTENSION_FACTOR) == -1) {
+	if (tdb->methods->expand_file(tdb, wanted) == -1) {
 		tdb_unlock_expand(tdb, F_WRLCK);
 		tdb_unlock_expand(tdb, F_WRLCK);
 		return -1;
 		return -1;
 	}
 	}
@@ -558,12 +621,13 @@ static int tdb_expand(struct tdb_context *tdb, tdb_len_t size)
 	/* We need to drop this lock before adding free record. */
 	/* We need to drop this lock before adding free record. */
 	tdb_unlock_expand(tdb, F_WRLCK);
 	tdb_unlock_expand(tdb, F_WRLCK);
 
 
-	return add_free_record(tdb, old_size, wanted * TDB_EXTENSION_FACTOR);
+	add_stat(tdb, expands, 1);
+	return add_free_record(tdb, old_size, wanted);
 }
 }
 
 
 /* This won't fail: it will expand the database if it has to. */
 /* This won't fail: it will expand the database if it has to. */
 tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
 tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
-		uint64_t hash, bool growing)
+		uint64_t hash, unsigned magic, bool growing)
 {
 {
 	tdb_off_t off;
 	tdb_off_t off;
 
 
@@ -571,7 +635,7 @@ tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
 	assert(!tdb->direct_access);
 	assert(!tdb->direct_access);
 
 
 	for (;;) {
 	for (;;) {
-		off = get_free(tdb, keylen, datalen, growing, hash);
+		off = get_free(tdb, keylen, datalen, growing, magic, hash);
 		if (likely(off != 0))
 		if (likely(off != 0))
 			break;
 			break;
 
 

+ 187 - 46
ccan/tdb2/hash.c

@@ -42,17 +42,19 @@ uint64_t tdb_hash(struct tdb_context *tdb, const void *ptr, size_t len)
 
 
 uint64_t hash_record(struct tdb_context *tdb, tdb_off_t off)
 uint64_t hash_record(struct tdb_context *tdb, tdb_off_t off)
 {
 {
-	struct tdb_used_record pad, *r;
+	const struct tdb_used_record *r;
 	const void *key;
 	const void *key;
 	uint64_t klen, hash;
 	uint64_t klen, hash;
 
 
-	r = tdb_get(tdb, off, &pad, sizeof(pad));
+	r = tdb_access_read(tdb, off, sizeof(*r), true);
 	if (!r)
 	if (!r)
 		/* FIXME */
 		/* FIXME */
 		return 0;
 		return 0;
 
 
 	klen = rec_key_length(r);
 	klen = rec_key_length(r);
-	key = tdb_access_read(tdb, off + sizeof(pad), klen, false);
+	tdb_access_release(tdb, r);
+
+	key = tdb_access_read(tdb, off + sizeof(*r), klen, false);
 	if (!key)
 	if (!key)
 		return 0;
 		return 0;
 
 
@@ -76,6 +78,30 @@ static uint32_t use_bits(struct hash_info *h, unsigned num)
 	return bits(h->h, 64 - h->hash_used, num);
 	return bits(h->h, 64 - h->hash_used, num);
 }
 }
 
 
+static bool key_matches(struct tdb_context *tdb,
+			const struct tdb_used_record *rec,
+			tdb_off_t off,
+			const struct tdb_data *key)
+{
+	bool ret = false;
+	const char *rkey;
+
+	if (rec_key_length(rec) != key->dsize) {
+		add_stat(tdb, compare_wrong_keylen, 1);
+		return ret;
+	}
+
+	rkey = tdb_access_read(tdb, off + sizeof(*rec), key->dsize, false);
+	if (!rkey)
+		return ret;
+	if (memcmp(rkey, key->dptr, key->dsize) == 0)
+		ret = true;
+	else
+		add_stat(tdb, compare_wrong_keycmp, 1);
+	tdb_access_release(tdb, rkey);
+	return ret;
+}
+
 /* Does entry match? */
 /* Does entry match? */
 static bool match(struct tdb_context *tdb,
 static bool match(struct tdb_context *tdb,
 		  struct hash_info *h,
 		  struct hash_info *h,
@@ -83,38 +109,33 @@ static bool match(struct tdb_context *tdb,
 		  tdb_off_t val,
 		  tdb_off_t val,
 		  struct tdb_used_record *rec)
 		  struct tdb_used_record *rec)
 {
 {
-	bool ret;
-	const unsigned char *rkey;
 	tdb_off_t off;
 	tdb_off_t off;
 
 
-	/* FIXME: Handle hash value truncated. */
-	if (bits(val, TDB_OFF_HASH_TRUNCATED_BIT, 1))
-		abort();
-
+	add_stat(tdb, compares, 1);
 	/* Desired bucket must match. */
 	/* Desired bucket must match. */
-	if (h->home_bucket != (val & TDB_OFF_HASH_GROUP_MASK))
+	if (h->home_bucket != (val & TDB_OFF_HASH_GROUP_MASK)) {
+		add_stat(tdb, compare_wrong_bucket, 1);
 		return false;
 		return false;
+	}
 
 
 	/* Top bits of offset == next bits of hash. */
 	/* Top bits of offset == next bits of hash. */
 	if (bits(val, TDB_OFF_HASH_EXTRA_BIT, TDB_OFF_UPPER_STEAL_EXTRA)
 	if (bits(val, TDB_OFF_HASH_EXTRA_BIT, TDB_OFF_UPPER_STEAL_EXTRA)
 	    != bits(h->h, 64 - h->hash_used - TDB_OFF_UPPER_STEAL_EXTRA,
 	    != bits(h->h, 64 - h->hash_used - TDB_OFF_UPPER_STEAL_EXTRA,
-		    TDB_OFF_UPPER_STEAL_EXTRA))
+		    TDB_OFF_UPPER_STEAL_EXTRA)) {
+		add_stat(tdb, compare_wrong_offsetbits, 1);
 		return false;
 		return false;
+	}
 
 
 	off = val & TDB_OFF_MASK;
 	off = val & TDB_OFF_MASK;
 	if (tdb_read_convert(tdb, off, rec, sizeof(*rec)) == -1)
 	if (tdb_read_convert(tdb, off, rec, sizeof(*rec)) == -1)
 		return false;
 		return false;
 
 
-	/* FIXME: check extra bits in header? */
-	if (rec_key_length(rec) != key->dsize)
+	if ((h->h & ((1 << 11)-1)) != rec_hash(rec)) {
+		add_stat(tdb, compare_wrong_rechash, 1);
 		return false;
 		return false;
+	}
 
 
-	rkey = tdb_access_read(tdb, off + sizeof(*rec), key->dsize, false);
-	if (!rkey)
-		return false;
-	ret = (memcmp(rkey, key->dptr, key->dsize) == 0);
-	tdb_access_release(tdb, rkey);
-	return ret;
+	return key_matches(tdb, rec, off, key);
 }
 }
 
 
 static tdb_off_t hbucket_off(tdb_off_t group_start, unsigned bucket)
 static tdb_off_t hbucket_off(tdb_off_t group_start, unsigned bucket)
@@ -123,10 +144,9 @@ static tdb_off_t hbucket_off(tdb_off_t group_start, unsigned bucket)
 		+ (bucket % (1 << TDB_HASH_GROUP_BITS)) * sizeof(tdb_off_t);
 		+ (bucket % (1 << TDB_HASH_GROUP_BITS)) * sizeof(tdb_off_t);
 }
 }
 
 
-/* Truncated hashes can't be all 1: that's how we spot a sub-hash */
 bool is_subhash(tdb_off_t val)
 bool is_subhash(tdb_off_t val)
 {
 {
-	return val >> (64-TDB_OFF_UPPER_STEAL) == (1<<TDB_OFF_UPPER_STEAL) - 1;
+	return (val >> TDB_OFF_UPPER_STEAL_SUBHASH_BIT) & 1;
 }
 }
 
 
 /* FIXME: Guess the depth, don't over-lock! */
 /* FIXME: Guess the depth, don't over-lock! */
@@ -136,6 +156,65 @@ static tdb_off_t hlock_range(tdb_off_t group, tdb_off_t *size)
 	return group << (64 - (TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS));
 	return group << (64 - (TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS));
 }
 }
 
 
+static tdb_off_t COLD find_in_chain(struct tdb_context *tdb,
+				    struct tdb_data key,
+				    tdb_off_t chain,
+				    struct hash_info *h,
+				    struct tdb_used_record *rec,
+				    struct traverse_info *tinfo)
+{
+	tdb_off_t off, next;
+
+	/* In case nothing is free, we set these to zero. */
+	h->home_bucket = h->found_bucket = 0;
+
+	for (off = chain; off; off = next) {
+		unsigned int i;
+
+		h->group_start = off;
+		if (tdb_read_convert(tdb, off, h->group, sizeof(h->group)))
+			return TDB_OFF_ERR;
+
+		for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) {
+			tdb_off_t recoff;
+			if (!h->group[i]) {
+				/* Remember this empty bucket. */
+				h->home_bucket = h->found_bucket = i;
+				continue;
+			}
+
+			/* We can insert extra bits via add_to_hash
+			 * empty bucket logic. */
+			recoff = h->group[i] & TDB_OFF_MASK;
+			if (tdb_read_convert(tdb, recoff, rec, sizeof(*rec)))
+				return TDB_OFF_ERR;
+
+			if (key_matches(tdb, rec, recoff, &key)) {
+				h->home_bucket = h->found_bucket = i;
+
+				if (tinfo) {
+					tinfo->levels[tinfo->num_levels]
+						.hashtable = off;
+					tinfo->levels[tinfo->num_levels]
+						.total_buckets
+						= 1 << TDB_HASH_GROUP_BITS;
+					tinfo->levels[tinfo->num_levels].entry
+						= i;
+					tinfo->num_levels++;
+				}
+				return recoff;
+			}
+		}
+		next = tdb_read_off(tdb, off
+				    + offsetof(struct tdb_chain, next));
+		if (next == TDB_OFF_ERR)
+			return TDB_OFF_ERR;
+		if (next)
+			next += sizeof(struct tdb_used_record);
+	}
+	return 0;
+}
+
 /* This is the core routine which searches the hashtable for an entry.
 /* This is the core routine which searches the hashtable for an entry.
  * On error, no locks are held and TDB_OFF_ERR is returned.
  * On error, no locks are held and TDB_OFF_ERR is returned.
  * Otherwise, hinfo is filled in (and the optional tinfo).
  * Otherwise, hinfo is filled in (and the optional tinfo).
@@ -171,7 +250,7 @@ tdb_off_t find_and_lock(struct tdb_context *tdb,
 		tinfo->levels[0].total_buckets = 1 << TDB_HASH_GROUP_BITS;
 		tinfo->levels[0].total_buckets = 1 << TDB_HASH_GROUP_BITS;
 	}
 	}
 
 
-	while (likely(h->hash_used < 64)) {
+	while (h->hash_used <= 64) {
 		/* Read in the hash group. */
 		/* Read in the hash group. */
 		h->group_start = hashtable
 		h->group_start = hashtable
 			+ group * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS);
 			+ group * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS);
@@ -228,8 +307,7 @@ tdb_off_t find_and_lock(struct tdb_context *tdb,
 		return 0;
 		return 0;
 	}
 	}
 
 
-	/* FIXME: We hit the bottom.  Chain! */
-	abort();
+	return find_in_chain(tdb, key, hashtable, h, rec, tinfo);
 
 
 fail:
 fail:
 	tdb_unlock_hashes(tdb, h->hlock_start, h->hlock_range, ltype);
 	tdb_unlock_hashes(tdb, h->hlock_start, h->hlock_range, ltype);
@@ -239,8 +317,8 @@ fail:
 /* I wrote a simple test, expanding a hash to 2GB, for the following
 /* I wrote a simple test, expanding a hash to 2GB, for the following
  * cases:
  * cases:
  * 1) Expanding all the buckets at once,
  * 1) Expanding all the buckets at once,
- * 2) Expanding the most-populated bucket,
- * 3) Expanding the bucket we wanted to place the new entry ito.
+ * 2) Expanding the bucket we wanted to place the new entry into.
+ * 3) Expanding the most-populated bucket,
  *
  *
  * I measured the worst/average/best density during this process.
  * I measured the worst/average/best density during this process.
  * 1) 3%/16%/30%
  * 1) 3%/16%/30%
@@ -315,6 +393,41 @@ int replace_in_hash(struct tdb_context *tdb,
 			     encode_offset(new_off, h));
 			     encode_offset(new_off, h));
 }
 }
 
 
+/* We slot in anywhere that's empty in the chain. */
+static int COLD add_to_chain(struct tdb_context *tdb,
+			     tdb_off_t subhash,
+			     tdb_off_t new_off)
+{
+	size_t entry = tdb_find_zero_off(tdb, subhash, 1<<TDB_HASH_GROUP_BITS);
+
+	if (entry == 1 << TDB_HASH_GROUP_BITS) {
+		tdb_off_t next;
+
+		next = tdb_read_off(tdb, subhash
+				    + offsetof(struct tdb_chain, next));
+		if (next == TDB_OFF_ERR)
+			return -1;
+
+		if (!next) {
+			next = alloc(tdb, 0, sizeof(struct tdb_chain), 0,
+				     TDB_CHAIN_MAGIC, false);
+			if (next == TDB_OFF_ERR)
+				return -1;
+			if (zero_out(tdb, next+sizeof(struct tdb_used_record),
+				     sizeof(struct tdb_chain)))
+				return -1;
+			if (tdb_write_off(tdb, subhash
+					  + offsetof(struct tdb_chain, next),
+					  next) != 0)
+				return -1;
+		}
+		return add_to_chain(tdb, next, new_off);
+	}
+
+	return tdb_write_off(tdb, subhash + entry * sizeof(tdb_off_t),
+			     new_off);
+}
+
 /* Add into a newly created subhash. */
 /* Add into a newly created subhash. */
 static int add_to_subhash(struct tdb_context *tdb, tdb_off_t subhash,
 static int add_to_subhash(struct tdb_context *tdb, tdb_off_t subhash,
 			  unsigned hash_used, tdb_off_t val)
 			  unsigned hash_used, tdb_off_t val)
@@ -325,14 +438,12 @@ static int add_to_subhash(struct tdb_context *tdb, tdb_off_t subhash,
 
 
 	h.hash_used = hash_used;
 	h.hash_used = hash_used;
 
 
-	/* FIXME chain if hash_used == 64 */
 	if (hash_used + TDB_SUBLEVEL_HASH_BITS > 64)
 	if (hash_used + TDB_SUBLEVEL_HASH_BITS > 64)
-		abort();
+		return add_to_chain(tdb, subhash, off);
 
 
-	/* FIXME: Do truncated hash bits if we can! */
 	h.h = hash_record(tdb, off);
 	h.h = hash_record(tdb, off);
 	gnum = use_bits(&h, TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS);
 	gnum = use_bits(&h, TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS);
-	h.group_start = subhash	+ sizeof(struct tdb_used_record)
+	h.group_start = subhash
 		+ gnum * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS);
 		+ gnum * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS);
 	h.home_bucket = use_bits(&h, TDB_HASH_GROUP_BITS);
 	h.home_bucket = use_bits(&h, TDB_HASH_GROUP_BITS);
 
 
@@ -346,20 +457,29 @@ static int add_to_subhash(struct tdb_context *tdb, tdb_off_t subhash,
 
 
 static int expand_group(struct tdb_context *tdb, struct hash_info *h)
 static int expand_group(struct tdb_context *tdb, struct hash_info *h)
 {
 {
-	unsigned bucket, num_vals, i;
+	unsigned bucket, num_vals, i, magic;
+	size_t subsize;
 	tdb_off_t subhash;
 	tdb_off_t subhash;
 	tdb_off_t vals[1 << TDB_HASH_GROUP_BITS];
 	tdb_off_t vals[1 << TDB_HASH_GROUP_BITS];
 
 
 	/* Attach new empty subhash under fullest bucket. */
 	/* Attach new empty subhash under fullest bucket. */
 	bucket = fullest_bucket(tdb, h->group, h->home_bucket);
 	bucket = fullest_bucket(tdb, h->group, h->home_bucket);
 
 
-	subhash = alloc(tdb, 0, sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS,
-			0, false);
+	if (h->hash_used == 64) {
+		add_stat(tdb, alloc_chain, 1);
+		subsize = sizeof(struct tdb_chain);
+		magic = TDB_CHAIN_MAGIC;
+	} else {
+		add_stat(tdb, alloc_subhash, 1);
+		subsize = (sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS);
+		magic = TDB_HTABLE_MAGIC;
+	}
+
+	subhash = alloc(tdb, 0, subsize, 0, magic, false);
 	if (subhash == TDB_OFF_ERR)
 	if (subhash == TDB_OFF_ERR)
 		return -1;
 		return -1;
 
 
-	if (zero_out(tdb, subhash + sizeof(struct tdb_used_record),
-		     sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) == -1)
+	if (zero_out(tdb, subhash + sizeof(struct tdb_used_record), subsize))
 		return -1;
 		return -1;
 
 
 	/* Remove any which are destined for bucket or are in wrong place. */
 	/* Remove any which are destined for bucket or are in wrong place. */
@@ -377,7 +497,10 @@ static int expand_group(struct tdb_context *tdb, struct hash_info *h)
 	/* assert(num_vals); */
 	/* assert(num_vals); */
 
 
 	/* Overwrite expanded bucket with subhash pointer. */
 	/* Overwrite expanded bucket with subhash pointer. */
-	h->group[bucket] = subhash | ~((1ULL << (64 - TDB_OFF_UPPER_STEAL))-1);
+	h->group[bucket] = subhash | (1ULL << TDB_OFF_UPPER_STEAL_SUBHASH_BIT);
+
+	/* Point to actual contents of record. */
+	subhash += sizeof(struct tdb_used_record);
 
 
 	/* Put values back. */
 	/* Put values back. */
 	for (i = 0; i < num_vals; i++) {
 	for (i = 0; i < num_vals; i++) {
@@ -433,10 +556,6 @@ int delete_from_hash(struct tdb_context *tdb, struct hash_info *h)
 
 
 int add_to_hash(struct tdb_context *tdb, struct hash_info *h, tdb_off_t new_off)
 int add_to_hash(struct tdb_context *tdb, struct hash_info *h, tdb_off_t new_off)
 {
 {
-	/* FIXME: chain! */
-	if (h->hash_used >= 64)
-		abort();
-
 	/* We hit an empty bucket during search?  That's where it goes. */
 	/* We hit an empty bucket during search?  That's where it goes. */
 	if (!h->group[h->found_bucket]) {
 	if (!h->group[h->found_bucket]) {
 		h->group[h->found_bucket] = encode_offset(new_off, h);
 		h->group[h->found_bucket] = encode_offset(new_off, h);
@@ -445,6 +564,9 @@ int add_to_hash(struct tdb_context *tdb, struct hash_info *h, tdb_off_t new_off)
 					 h->group, sizeof(h->group));
 					 h->group, sizeof(h->group));
 	}
 	}
 
 
+	if (h->hash_used > 64)
+		return add_to_chain(tdb, h->group_start, new_off);
+
 	/* We're full.  Expand. */
 	/* We're full.  Expand. */
 	if (expand_group(tdb, h) == -1)
 	if (expand_group(tdb, h) == -1)
 		return -1;
 		return -1;
@@ -523,7 +645,11 @@ again:
 		tlevel++;
 		tlevel++;
 		tlevel->hashtable = off + sizeof(struct tdb_used_record);
 		tlevel->hashtable = off + sizeof(struct tdb_used_record);
 		tlevel->entry = 0;
 		tlevel->entry = 0;
-		tlevel->total_buckets = (1 << TDB_SUBLEVEL_HASH_BITS);
+		/* Next level is a chain? */
+		if (unlikely(tinfo->num_levels == TDB_MAX_LEVELS + 1))
+			tlevel->total_buckets = (1 << TDB_HASH_GROUP_BITS);
+		else
+			tlevel->total_buckets = (1 << TDB_SUBLEVEL_HASH_BITS);
 		goto again;
 		goto again;
 	}
 	}
 
 
@@ -531,6 +657,20 @@ again:
 	if (tinfo->num_levels == 1)
 	if (tinfo->num_levels == 1)
 		return 0;
 		return 0;
 
 
+	/* Handle chained entries. */
+	if (unlikely(tinfo->num_levels == TDB_MAX_LEVELS + 1)) {
+		tlevel->hashtable = tdb_read_off(tdb, tlevel->hashtable
+						 + offsetof(struct tdb_chain,
+							    next));
+		if (tlevel->hashtable == TDB_OFF_ERR)
+			return TDB_OFF_ERR;
+		if (tlevel->hashtable) {
+			tlevel->hashtable += sizeof(struct tdb_used_record);
+			tlevel->entry = 0;
+			goto again;
+		}
+	}
+
 	/* Go back up and keep searching. */
 	/* Go back up and keep searching. */
 	tinfo->num_levels--;
 	tinfo->num_levels--;
 	tlevel--;
 	tlevel--;
@@ -563,11 +703,12 @@ int next_in_hash(struct tdb_context *tdb, int ltype,
 						  ltype);
 						  ltype);
 				return -1;
 				return -1;
 			}
 			}
-			if (rec_magic(&rec) != TDB_MAGIC) {
-				tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-					 "next_in_hash:"
-					 " corrupt record at %llu\n",
-					 (long long)off);
+			if (rec_magic(&rec) != TDB_USED_MAGIC) {
+				tdb_logerr(tdb, TDB_ERR_CORRUPT,
+					   TDB_DEBUG_FATAL,
+					   "next_in_hash:"
+					   " corrupt record at %llu",
+					   (long long)off);
 				return -1;
 				return -1;
 			}
 			}
 
 

+ 114 - 92
ccan/tdb2/io.c

@@ -56,9 +56,9 @@ void tdb_mmap(struct tdb_context *tdb)
 	 */
 	 */
 	if (tdb->map_ptr == MAP_FAILED) {
 	if (tdb->map_ptr == MAP_FAILED) {
 		tdb->map_ptr = NULL;
 		tdb->map_ptr = NULL;
-		tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv,
-			 "tdb_mmap failed for size %lld (%s)\n", 
-			 (long long)tdb->map_size, strerror(errno));
+		tdb_logerr(tdb, TDB_SUCCESS, TDB_DEBUG_WARNING,
+			   "tdb_mmap failed for size %lld (%s)",
+			   (long long)tdb->map_size, strerror(errno));
 	}
 	}
 }
 }
 
 
@@ -70,7 +70,6 @@ void tdb_mmap(struct tdb_context *tdb)
 static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
 static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
 {
 {
 	struct stat st;
 	struct stat st;
-	int ret;
 
 
 	/* We can't hold pointers during this: we could unmap! */
 	/* We can't hold pointers during this: we could unmap! */
 	assert(!tdb->direct_access
 	assert(!tdb->direct_access
@@ -81,11 +80,9 @@ static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
 		return 0;
 		return 0;
 	if (tdb->flags & TDB_INTERNAL) {
 	if (tdb->flags & TDB_INTERNAL) {
 		if (!probe) {
 		if (!probe) {
-			/* Ensure ecode is set for log fn. */
-			tdb->ecode = TDB_ERR_IO;
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+			tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
 				 "tdb_oob len %lld beyond internal"
 				 "tdb_oob len %lld beyond internal"
-				 " malloc size %lld\n",
+				 " malloc size %lld",
 				 (long long)len,
 				 (long long)len,
 				 (long long)tdb->map_size);
 				 (long long)tdb->map_size);
 		}
 		}
@@ -95,22 +92,20 @@ static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
 	if (tdb_lock_expand(tdb, F_RDLCK) != 0)
 	if (tdb_lock_expand(tdb, F_RDLCK) != 0)
 		return -1;
 		return -1;
 
 
-	ret = fstat(tdb->fd, &st);
-
-	tdb_unlock_expand(tdb, F_RDLCK);
-
-	if (ret == -1) {
-		tdb->ecode = TDB_ERR_IO;
+	if (fstat(tdb->fd, &st) != 0) {
+		tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
+			   "Failed to fstat file: %s", strerror(errno));
+		tdb_unlock_expand(tdb, F_RDLCK);
 		return -1;
 		return -1;
 	}
 	}
 
 
+	tdb_unlock_expand(tdb, F_RDLCK);
+
 	if (st.st_size < (size_t)len) {
 	if (st.st_size < (size_t)len) {
 		if (!probe) {
 		if (!probe) {
-			/* Ensure ecode is set for log fn. */
-			tdb->ecode = TDB_ERR_IO;
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-				 "tdb_oob len %lld beyond eof at %lld\n",
-				 (long long)len, (long long)st.st_size);
+			tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
+				   "tdb_oob len %zu beyond eof at %zu",
+				   (size_t)len, st.st_size);
 		}
 		}
 		return -1;
 		return -1;
 	}
 	}
@@ -123,19 +118,6 @@ static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
 	return 0;
 	return 0;
 }
 }
 
 
-/* Either make a copy into pad and return that, or return ptr into mmap. */
-/* Note: pad has to be a real object, so we can't get here if len
- * overflows size_t */
-void *tdb_get(struct tdb_context *tdb, tdb_off_t off, void *pad, size_t len)
-{
-	if (likely(!(tdb->flags & TDB_CONVERT))) {
-		void *ret = tdb->methods->direct(tdb, off, len);
-		if (ret)
-			return ret;
-	}
-	return tdb_read_convert(tdb, off, pad, len) == -1 ? NULL : pad;
-}
-
 /* Endian conversion: we only ever deal with 8 byte quantities */
 /* Endian conversion: we only ever deal with 8 byte quantities */
 void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size)
 void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size)
 {
 {
@@ -191,7 +173,9 @@ uint64_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off,
 int zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len)
 int zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len)
 {
 {
 	char buf[8192] = { 0 };
 	char buf[8192] = { 0 };
-	void *p = tdb->methods->direct(tdb, off, len);
+	void *p = tdb->methods->direct(tdb, off, len, true);
+
+	assert(!tdb->read_only);
 	if (p) {
 	if (p) {
 		memset(p, 0, len);
 		memset(p, 0, len);
 		return 0;
 		return 0;
@@ -208,13 +192,18 @@ int zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len)
 
 
 tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off)
 tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off)
 {
 {
-	tdb_off_t pad, *ret;
+	tdb_off_t ret;
 
 
-	ret = tdb_get(tdb, off, &pad, sizeof(pad));
-	if (!ret) {
-		return TDB_OFF_ERR;
+	if (likely(!(tdb->flags & TDB_CONVERT))) {
+		tdb_off_t *p = tdb->methods->direct(tdb, off, sizeof(*p),
+						    false);
+		if (p)
+			return *p;
 	}
 	}
-	return *ret;
+
+	if (tdb_read_convert(tdb, off, &ret, sizeof(ret)) == -1)
+		return TDB_OFF_ERR;
+	return ret;
 }
 }
 
 
 /* Even on files, we can get partial writes due to signals. */
 /* Even on files, we can get partial writes due to signals. */
@@ -278,15 +267,17 @@ bool tdb_read_all(int fd, void *buf, size_t len)
 static int tdb_write(struct tdb_context *tdb, tdb_off_t off, 
 static int tdb_write(struct tdb_context *tdb, tdb_off_t off, 
 		     const void *buf, tdb_len_t len)
 		     const void *buf, tdb_len_t len)
 {
 {
-	if (len == 0) {
-		return 0;
-	}
-
 	if (tdb->read_only) {
 	if (tdb->read_only) {
-		tdb->ecode = TDB_ERR_RDONLY;
+		tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_DEBUG_WARNING,
+			   "Write to read-only database");
 		return -1;
 		return -1;
 	}
 	}
 
 
+	/* FIXME: Bogus optimization? */
+	if (len == 0) {
+		return 0;
+	}
+
 	if (tdb->methods->oob(tdb, off + len, 0) != 0)
 	if (tdb->methods->oob(tdb, off + len, 0) != 0)
 		return -1;
 		return -1;
 
 
@@ -294,11 +285,9 @@ static int tdb_write(struct tdb_context *tdb, tdb_off_t off,
 		memcpy(off + (char *)tdb->map_ptr, buf, len);
 		memcpy(off + (char *)tdb->map_ptr, buf, len);
 	} else {
 	} else {
 		if (!tdb_pwrite_all(tdb->fd, buf, len, off)) {
 		if (!tdb_pwrite_all(tdb->fd, buf, len, off)) {
-			tdb->ecode = TDB_ERR_IO;
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-				 "tdb_write failed at %llu len=%llu (%s)\n",
-				 (long long)off, (long long)len,
-				 strerror(errno));
+			tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
+				   "tdb_write failed at %zu len=%zu (%s)",
+				   (size_t)off, (size_t)len, strerror(errno));
 			return -1;
 			return -1;
 		}
 		}
 	}
 	}
@@ -317,14 +306,12 @@ static int tdb_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
 		memcpy(buf, off + (char *)tdb->map_ptr, len);
 		memcpy(buf, off + (char *)tdb->map_ptr, len);
 	} else {
 	} else {
 		if (!tdb_pread_all(tdb->fd, buf, len, off)) {
 		if (!tdb_pread_all(tdb->fd, buf, len, off)) {
-			/* Ensure ecode is set for log fn. */
-			tdb->ecode = TDB_ERR_IO;
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-				 "tdb_read failed at %lld "
-				 "len=%lld (%s) map_size=%lld\n",
-				 (long long)off, (long long)len,
+			tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
+				   "tdb_read failed at %zu "
+				   "len=%zu (%s) map_size=%zu",
+				 (size_t)off, (size_t)len,
 				 strerror(errno),
 				 strerror(errno),
-				 (long long)tdb->map_size);
+				 (size_t)tdb->map_size);
 			return -1;
 			return -1;
 		}
 		}
 	}
 	}
@@ -338,10 +325,9 @@ int tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
 	if (unlikely((tdb->flags & TDB_CONVERT))) {
 	if (unlikely((tdb->flags & TDB_CONVERT))) {
 		void *conv = malloc(len);
 		void *conv = malloc(len);
 		if (!conv) {
 		if (!conv) {
-			tdb->ecode = TDB_ERR_OOM;
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-				 "tdb_write: no memory converting %zu bytes\n",
-				 len);
+			tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
+				   "tdb_write: no memory converting"
+				   " %zu bytes", len);
 			return -1;
 			return -1;
 		}
 		}
 		memcpy(conv, rec, len);
 		memcpy(conv, rec, len);
@@ -364,6 +350,20 @@ int tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
 
 
 int tdb_write_off(struct tdb_context *tdb, tdb_off_t off, tdb_off_t val)
 int tdb_write_off(struct tdb_context *tdb, tdb_off_t off, tdb_off_t val)
 {
 {
+	if (tdb->read_only) {
+		tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_DEBUG_WARNING,
+			   "Write to read-only database");
+		return -1;
+	}
+
+	if (likely(!(tdb->flags & TDB_CONVERT))) {
+		tdb_off_t *p = tdb->methods->direct(tdb, off, sizeof(*p),
+						    true);
+		if (p) {
+			*p = val;
+			return 0;
+		}
+	}
 	return tdb_write_convert(tdb, off, &val, sizeof(val));
 	return tdb_write_convert(tdb, off, &val, sizeof(val));
 }
 }
 
 
@@ -374,12 +374,12 @@ static void *_tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset,
 
 
 	/* some systems don't like zero length malloc */
 	/* some systems don't like zero length malloc */
 	buf = malloc(prefix + len ? prefix + len : 1);
 	buf = malloc(prefix + len ? prefix + len : 1);
-	if (unlikely(!buf)) {
-		tdb->ecode = TDB_ERR_OOM;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_alloc_read malloc failed len=%lld\n",
-			 (long long)prefix + len);
-	} else if (unlikely(tdb->methods->read(tdb, offset, buf+prefix, len))) {
+	if (!buf) {
+		tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_ERROR,
+			   "tdb_alloc_read malloc failed len=%zu",
+			   (size_t)(prefix + len));
+	} else if (unlikely(tdb->methods->read(tdb, offset, buf+prefix,
+					       len) == -1)) {
 		free(buf);
 		free(buf);
 		buf = NULL;
 		buf = NULL;
 	}
 	}
@@ -400,9 +400,8 @@ static int fill(struct tdb_context *tdb,
 		size_t n = len > size ? size : len;
 		size_t n = len > size ? size : len;
 
 
 		if (!tdb_pwrite_all(tdb->fd, buf, n, off)) {
 		if (!tdb_pwrite_all(tdb->fd, buf, n, off)) {
-			tdb->ecode = TDB_ERR_IO;
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-				 "fill write failed: giving up!\n");
+			tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
+				 "fill write failed: giving up!");
 			return -1;
 			return -1;
 		}
 		}
 		len -= n;
 		len -= n;
@@ -418,14 +417,16 @@ static int tdb_expand_file(struct tdb_context *tdb, tdb_len_t addition)
 	char buf[8192];
 	char buf[8192];
 
 
 	if (tdb->read_only) {
 	if (tdb->read_only) {
-		tdb->ecode = TDB_ERR_RDONLY;
+		tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_DEBUG_WARNING,
+			   "Expand on read-only database");
 		return -1;
 		return -1;
 	}
 	}
 
 
 	if (tdb->flags & TDB_INTERNAL) {
 	if (tdb->flags & TDB_INTERNAL) {
 		char *new = realloc(tdb->map_ptr, tdb->map_size + addition);
 		char *new = realloc(tdb->map_ptr, tdb->map_size + addition);
 		if (!new) {
 		if (!new) {
-			tdb->ecode = TDB_ERR_OOM;
+			tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
+				   "No memory to expand database");
 			return -1;
 			return -1;
 		}
 		}
 		tdb->map_ptr = new;
 		tdb->map_ptr = new;
@@ -443,7 +444,7 @@ static int tdb_expand_file(struct tdb_context *tdb, tdb_len_t addition)
 		   file isn't sparse, which would be very bad if we ran out of
 		   file isn't sparse, which would be very bad if we ran out of
 		   disk. This must be done with write, not via mmap */
 		   disk. This must be done with write, not via mmap */
 		memset(buf, 0x43, sizeof(buf));
 		memset(buf, 0x43, sizeof(buf));
-		if (fill(tdb, buf, sizeof(buf), tdb->map_size, addition) == -1)
+		if (0 || fill(tdb, buf, sizeof(buf), tdb->map_size, addition) == -1)
 			return -1;
 			return -1;
 		tdb->map_size += addition;
 		tdb->map_size += addition;
 		tdb_mmap(tdb);
 		tdb_mmap(tdb);
@@ -451,25 +452,20 @@ static int tdb_expand_file(struct tdb_context *tdb, tdb_len_t addition)
 	return 0;
 	return 0;
 }
 }
 
 
-/* This is only neded for tdb_access_commit, but used everywhere to simplify. */
-struct tdb_access_hdr {
-	tdb_off_t off;
-	tdb_len_t len;
-	bool convert;
-};
-
 const void *tdb_access_read(struct tdb_context *tdb,
 const void *tdb_access_read(struct tdb_context *tdb,
 			    tdb_off_t off, tdb_len_t len, bool convert)
 			    tdb_off_t off, tdb_len_t len, bool convert)
 {
 {
 	const void *ret = NULL;	
 	const void *ret = NULL;	
 
 
 	if (likely(!(tdb->flags & TDB_CONVERT)))
 	if (likely(!(tdb->flags & TDB_CONVERT)))
-		ret = tdb->methods->direct(tdb, off, len);
+		ret = tdb->methods->direct(tdb, off, len, false);
 
 
 	if (!ret) {
 	if (!ret) {
 		struct tdb_access_hdr *hdr;
 		struct tdb_access_hdr *hdr;
 		hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
 		hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
 		if (hdr) {
 		if (hdr) {
+			hdr->next = tdb->access;
+			tdb->access = hdr;
 			ret = hdr + 1;
 			ret = hdr + 1;
 			if (convert)
 			if (convert)
 				tdb_convert(tdb, (void *)ret, len);
 				tdb_convert(tdb, (void *)ret, len);
@@ -485,13 +481,21 @@ void *tdb_access_write(struct tdb_context *tdb,
 {
 {
 	void *ret = NULL;
 	void *ret = NULL;
 
 
+	if (tdb->read_only) {
+		tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_DEBUG_WARNING,
+			   "Write to read-only database");
+		return NULL;
+	}
+
 	if (likely(!(tdb->flags & TDB_CONVERT)))
 	if (likely(!(tdb->flags & TDB_CONVERT)))
-		ret = tdb->methods->direct(tdb, off, len);
+		ret = tdb->methods->direct(tdb, off, len, true);
 
 
 	if (!ret) {
 	if (!ret) {
 		struct tdb_access_hdr *hdr;
 		struct tdb_access_hdr *hdr;
 		hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
 		hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
 		if (hdr) {
 		if (hdr) {
+			hdr->next = tdb->access;
+			tdb->access = hdr;
 			hdr->off = off;
 			hdr->off = off;
 			hdr->len = len;
 			hdr->len = len;
 			hdr->convert = convert;
 			hdr->convert = convert;
@@ -505,30 +509,41 @@ void *tdb_access_write(struct tdb_context *tdb,
 	return ret;
 	return ret;
 }
 }
 
 
+static struct tdb_access_hdr **find_hdr(struct tdb_context *tdb, const void *p)
+{
+	struct tdb_access_hdr **hp;
+
+	for (hp = &tdb->access; *hp; hp = &(*hp)->next) {
+		if (*hp + 1 == p)
+			return hp;
+	}
+	return NULL;
+}
+
 void tdb_access_release(struct tdb_context *tdb, const void *p)
 void tdb_access_release(struct tdb_context *tdb, const void *p)
 {
 {
-	if (!tdb->map_ptr
-	    || (char *)p < (char *)tdb->map_ptr
-	    || (char *)p >= (char *)tdb->map_ptr + tdb->map_size)
-		free((struct tdb_access_hdr *)p - 1);
-	else
+	struct tdb_access_hdr *hdr, **hp = find_hdr(tdb, p);
+
+	if (hp) {
+		hdr = *hp;
+		*hp = hdr->next;
+		free(hdr);
+	} else
 		tdb->direct_access--;
 		tdb->direct_access--;
 }
 }
 
 
 int tdb_access_commit(struct tdb_context *tdb, void *p)
 int tdb_access_commit(struct tdb_context *tdb, void *p)
 {
 {
+	struct tdb_access_hdr *hdr, **hp = find_hdr(tdb, p);
 	int ret = 0;
 	int ret = 0;
 
 
-	if (!tdb->map_ptr
-	    || (char *)p < (char *)tdb->map_ptr
-	    || (char *)p >= (char *)tdb->map_ptr + tdb->map_size) {
-		struct tdb_access_hdr *hdr;
-
-		hdr = (struct tdb_access_hdr *)p - 1;
+	if (hp) {
+		hdr = *hp;
 		if (hdr->convert)
 		if (hdr->convert)
 			ret = tdb_write_convert(tdb, hdr->off, p, hdr->len);
 			ret = tdb_write_convert(tdb, hdr->off, p, hdr->len);
 		else
 		else
 			ret = tdb_write(tdb, hdr->off, p, hdr->len);
 			ret = tdb_write(tdb, hdr->off, p, hdr->len);
+		*hp = hdr->next;
 		free(hdr);
 		free(hdr);
 	} else
 	} else
 		tdb->direct_access--;
 		tdb->direct_access--;
@@ -536,7 +551,8 @@ int tdb_access_commit(struct tdb_context *tdb, void *p)
 	return ret;
 	return ret;
 }
 }
 
 
-static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len)
+static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len,
+			bool write)
 {
 {
 	if (unlikely(!tdb->map_ptr))
 	if (unlikely(!tdb->map_ptr))
 		return NULL;
 		return NULL;
@@ -546,6 +562,12 @@ static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len)
 	return (char *)tdb->map_ptr + off;
 	return (char *)tdb->map_ptr + off;
 }
 }
 
 
+void add_stat_(struct tdb_context *tdb, uint64_t *stat, size_t val)
+{
+	if ((uintptr_t)stat < (uintptr_t)tdb->stats + tdb->stats->size)
+		*stat += val;
+}
+
 static const struct tdb_methods io_methods = {
 static const struct tdb_methods io_methods = {
 	tdb_read,
 	tdb_read,
 	tdb_write,
 	tdb_write,

+ 77 - 97
ccan/tdb2/lock.c

@@ -40,10 +40,13 @@ static int fcntl_lock(struct tdb_context *tdb,
 	fl.l_len = len;
 	fl.l_len = len;
 	fl.l_pid = 0;
 	fl.l_pid = 0;
 
 
+	add_stat(tdb, lock_lowlevel, 1);
 	if (waitflag)
 	if (waitflag)
 		return fcntl(tdb->fd, F_SETLKW, &fl);
 		return fcntl(tdb->fd, F_SETLKW, &fl);
-	else
+	else {
+		add_stat(tdb, lock_nonblock, 1);
 		return fcntl(tdb->fd, F_SETLK, &fl);
 		return fcntl(tdb->fd, F_SETLK, &fl);
+	}
 }
 }
 
 
 static int fcntl_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len)
 static int fcntl_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len)
@@ -99,7 +102,7 @@ static int fcntl_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len)
 	}
 	}
 
 
 	if (!found) {
 	if (!found) {
-		fprintf(stderr, "Unlock on %u@%u not found!\n",
+		fprintf(stderr, "Unlock on %u@%u not found!",
 			(int)off, (int)len);
 			(int)off, (int)len);
 		abort();
 		abort();
 	}
 	}
@@ -132,16 +135,16 @@ static int tdb_brlock(struct tdb_context *tdb,
 	}
 	}
 
 
 	if (rw_type == F_WRLCK && tdb->read_only) {
 	if (rw_type == F_WRLCK && tdb->read_only) {
-		tdb->ecode = TDB_ERR_RDONLY;
+		tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_DEBUG_WARNING,
+			   "Write lock attempted on read-only database");
 		return -1;
 		return -1;
 	}
 	}
 
 
 	/* A 32 bit system cannot open a 64-bit file, but it could have
 	/* A 32 bit system cannot open a 64-bit file, but it could have
 	 * expanded since then: check here. */
 	 * expanded since then: check here. */
 	if ((size_t)(offset + len) != offset + len) {
 	if ((size_t)(offset + len) != offset + len) {
-		tdb->ecode = TDB_ERR_IO;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_brlock: lock on giant offset %llu\n",
+		tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
+			 "tdb_brlock: lock on giant offset %llu",
 			 (long long)(offset + len));
 			 (long long)(offset + len));
 		return -1;
 		return -1;
 	}
 	}
@@ -157,11 +160,12 @@ static int tdb_brlock(struct tdb_context *tdb,
 		 * EAGAIN is an expected return from non-blocking
 		 * EAGAIN is an expected return from non-blocking
 		 * locks. */
 		 * locks. */
 		if (!(flags & TDB_LOCK_PROBE) && errno != EAGAIN) {
 		if (!(flags & TDB_LOCK_PROBE) && errno != EAGAIN) {
-			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-				 "tdb_brlock failed (fd=%d) at"
-				 " offset %llu rw_type=%d flags=%d len=%llu\n",
-				 tdb->fd, (long long)offset, rw_type,
-				 flags, (long long)len);
+			tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
+				   "tdb_brlock failed (fd=%d) at"
+				   " offset %zu rw_type=%d flags=%d len=%zu:"
+				   " %s",
+				   tdb->fd, (size_t)offset, rw_type,
+				   flags, (size_t)len, strerror(errno));
 		}
 		}
 		return -1;
 		return -1;
 	}
 	}
@@ -182,10 +186,10 @@ static int tdb_brunlock(struct tdb_context *tdb,
 	} while (ret == -1 && errno == EINTR);
 	} while (ret == -1 && errno == EINTR);
 
 
 	if (ret == -1) {
 	if (ret == -1) {
-		tdb->log(tdb, TDB_DEBUG_TRACE, tdb->log_priv,
-			 "tdb_brunlock failed (fd=%d) at offset %llu"
-			 " rw_type=%d len=%llu\n",
-			 tdb->fd, (long long)offset, rw_type, (long long)len);
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_TRACE,
+			   "tdb_brunlock failed (fd=%d) at offset %zu"
+			   " rw_type=%d len=%zu",
+			   tdb->fd, (size_t)offset, rw_type, (size_t)len);
 	}
 	}
 	return ret;
 	return ret;
 }
 }
@@ -201,15 +205,15 @@ int tdb_allrecord_upgrade(struct tdb_context *tdb)
 	int count = 1000;
 	int count = 1000;
 
 
 	if (tdb->allrecord_lock.count != 1) {
 	if (tdb->allrecord_lock.count != 1) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_allrecord_upgrade failed: count %u too high\n",
-			 tdb->allrecord_lock.count);
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
+			   "tdb_allrecord_upgrade failed: count %u too high",
+			   tdb->allrecord_lock.count);
 		return -1;
 		return -1;
 	}
 	}
 
 
 	if (tdb->allrecord_lock.off != 1) {
 	if (tdb->allrecord_lock.off != 1) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_allrecord_upgrade failed: already upgraded?\n");
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
+			   "tdb_allrecord_upgrade failed: already upgraded?");
 		return -1;
 		return -1;
 	}
 	}
 
 
@@ -230,8 +234,8 @@ int tdb_allrecord_upgrade(struct tdb_context *tdb)
 		tv.tv_usec = 1;
 		tv.tv_usec = 1;
 		select(0, NULL, NULL, NULL, &tv);
 		select(0, NULL, NULL, NULL, &tv);
 	}
 	}
-	tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv,
-		 "tdb_allrecord_upgrade failed\n");
+	tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_WARNING,
+		   "tdb_allrecord_upgrade failed");
 	return -1;
 	return -1;
 }
 }
 
 
@@ -276,23 +280,23 @@ static int tdb_nest_lock(struct tdb_context *tdb, tdb_off_t offset, int ltype,
 	struct tdb_lock_type *new_lck;
 	struct tdb_lock_type *new_lck;
 
 
 	if (offset > TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE + tdb->map_size / 8) {
 	if (offset > TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE + tdb->map_size / 8) {
-		tdb->ecode = TDB_ERR_LOCK;
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-			 "tdb_nest_lock: invalid offset %llu ltype=%d\n",
-			 (long long)offset, ltype);
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_FATAL,
+			   "tdb_nest_lock: invalid offset %zu ltype=%d",
+			   (size_t)offset, ltype);
 		return -1;
 		return -1;
 	}
 	}
 
 
 	if (tdb->flags & TDB_NOLOCK)
 	if (tdb->flags & TDB_NOLOCK)
 		return 0;
 		return 0;
 
 
+	add_stat(tdb, locks, 1);
+
 	new_lck = find_nestlock(tdb, offset);
 	new_lck = find_nestlock(tdb, offset);
 	if (new_lck) {
 	if (new_lck) {
 		if (new_lck->ltype == F_RDLCK && ltype == F_WRLCK) {
 		if (new_lck->ltype == F_RDLCK && ltype == F_WRLCK) {
-			tdb->ecode = TDB_ERR_LOCK;
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-				 "tdb_nest_lock: offset %llu has read lock\n",
-				 (long long)offset);
+			tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_FATAL,
+				   "tdb_nest_lock: offset %zu has read lock",
+				   (size_t)offset);
 			return -1;
 			return -1;
 		}
 		}
 		/* Just increment the struct, posix locks don't stack. */
 		/* Just increment the struct, posix locks don't stack. */
@@ -303,9 +307,8 @@ static int tdb_nest_lock(struct tdb_context *tdb, tdb_off_t offset, int ltype,
 	if (tdb->num_lockrecs
 	if (tdb->num_lockrecs
 	    && offset >= TDB_HASH_LOCK_START
 	    && offset >= TDB_HASH_LOCK_START
 	    && offset < TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE) {
 	    && offset < TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE) {
-		tdb->ecode = TDB_ERR_LOCK;
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-			 "tdb_nest_lock: already have a hash lock?\n");
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_FATAL,
+			   "tdb_nest_lock: already have a hash lock?");
 		return -1;
 		return -1;
 	}
 	}
 
 
@@ -313,10 +316,9 @@ static int tdb_nest_lock(struct tdb_context *tdb, tdb_off_t offset, int ltype,
 		tdb->lockrecs,
 		tdb->lockrecs,
 		sizeof(*tdb->lockrecs) * (tdb->num_lockrecs+1));
 		sizeof(*tdb->lockrecs) * (tdb->num_lockrecs+1));
 	if (new_lck == NULL) {
 	if (new_lck == NULL) {
-		tdb->ecode = TDB_ERR_OOM;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_nest_lock: unable to allocate %llu lock struct",
-			 (long long)(tdb->num_lockrecs + 1));
+		tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_ERROR,
+			 "tdb_nest_lock: unable to allocate %zu lock struct",
+			 tdb->num_lockrecs + 1);
 		errno = ENOMEM;
 		errno = ENOMEM;
 		return -1;
 		return -1;
 	}
 	}
@@ -361,9 +363,8 @@ static int tdb_nest_unlock(struct tdb_context *tdb, tdb_off_t off, int ltype)
 
 
 	lck = find_nestlock(tdb, off);
 	lck = find_nestlock(tdb, off);
 	if ((lck == NULL) || (lck->count == 0)) {
 	if ((lck == NULL) || (lck->count == 0)) {
-		tdb->ecode = TDB_ERR_LOCK;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_nest_unlock: no lock for %llu\n", (long long)off);
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
+			   "tdb_nest_unlock: no lock for %zu", (size_t)off);
 		return -1;
 		return -1;
 	}
 	}
 
 
@@ -448,9 +449,8 @@ int tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
 {
 {
 	/* FIXME: There are no locks on read-only dbs */
 	/* FIXME: There are no locks on read-only dbs */
 	if (tdb->read_only) {
 	if (tdb->read_only) {
-		tdb->ecode = TDB_ERR_LOCK;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_allrecord_lock: read-only\n");
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
+			   "tdb_allrecord_lock: read-only");
 		return -1;
 		return -1;
 	}
 	}
 
 
@@ -462,49 +462,45 @@ int tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
 
 
 	if (tdb->allrecord_lock.count) {
 	if (tdb->allrecord_lock.count) {
 		/* a global lock of a different type exists */
 		/* a global lock of a different type exists */
-		tdb->ecode = TDB_ERR_LOCK;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_allrecord_lock: already have %s lock\n",
-			 tdb->allrecord_lock.ltype == F_RDLCK
-			 ? "read" : "write");
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
+			   "tdb_allrecord_lock: already have %s lock",
+			   tdb->allrecord_lock.ltype == F_RDLCK
+			   ? "read" : "write");
 		return -1;
 		return -1;
 	}
 	}
 
 
 	if (tdb_has_hash_locks(tdb)) {
 	if (tdb_has_hash_locks(tdb)) {
 		/* can't combine global and chain locks */
 		/* can't combine global and chain locks */
-		tdb->ecode = TDB_ERR_LOCK;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_allrecord_lock: already have chain lock\n");
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
+			 "tdb_allrecord_lock: already have chain lock");
 		return -1;
 		return -1;
 	}
 	}
 
 
 	if (upgradable && ltype != F_RDLCK) {
 	if (upgradable && ltype != F_RDLCK) {
 		/* tdb error: you can't upgrade a write lock! */
 		/* tdb error: you can't upgrade a write lock! */
-		tdb->ecode = TDB_ERR_LOCK;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_allrecord_lock: can't upgrade a write lock\n");
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
+			   "tdb_allrecord_lock: can't upgrade a write lock");
 		return -1;
 		return -1;
 	}
 	}
 
 
+	add_stat(tdb, locks, 1);
 again:
 again:
 	/* Lock hashes, gradually. */
 	/* Lock hashes, gradually. */
 	if (tdb_lock_gradual(tdb, ltype, flags, TDB_HASH_LOCK_START,
 	if (tdb_lock_gradual(tdb, ltype, flags, TDB_HASH_LOCK_START,
 			     TDB_HASH_LOCK_RANGE)) {
 			     TDB_HASH_LOCK_RANGE)) {
 		if (!(flags & TDB_LOCK_PROBE)) {
 		if (!(flags & TDB_LOCK_PROBE)) {
-			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-				 "tdb_allrecord_lock hashes failed (%s)\n",
-				 strerror(errno));
+			tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_ERROR,
+				   "tdb_allrecord_lock hashes failed");
 		}
 		}
 		return -1;
 		return -1;
 	}
 	}
 
 
-	/* Lock free lists: there to end of file. */
+	/* Lock free tables: there to end of file. */
 	if (tdb_brlock(tdb, ltype, TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE,
 	if (tdb_brlock(tdb, ltype, TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE,
 		       0, flags)) {
 		       0, flags)) {
 		if (!(flags & TDB_LOCK_PROBE)) {
 		if (!(flags & TDB_LOCK_PROBE)) {
-			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-				 "tdb_allrecord_lock freelist failed (%s)\n",
-				 strerror(errno));
+			tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_ERROR,
+				 "tdb_allrecord_lock freetables failed");
 		}
 		}
 		tdb_brunlock(tdb, ltype, TDB_HASH_LOCK_START, 
 		tdb_brunlock(tdb, ltype, TDB_HASH_LOCK_START, 
 			     TDB_HASH_LOCK_RANGE);
 			     TDB_HASH_LOCK_RANGE);
@@ -559,29 +555,19 @@ void tdb_unlock_expand(struct tdb_context *tdb, int ltype)
 /* unlock entire db */
 /* unlock entire db */
 int tdb_allrecord_unlock(struct tdb_context *tdb, int ltype)
 int tdb_allrecord_unlock(struct tdb_context *tdb, int ltype)
 {
 {
-	/* FIXME: There are no locks on read-only dbs */
-	if (tdb->read_only) {
-		tdb->ecode = TDB_ERR_LOCK;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_allrecord_unlock: read-only\n");
-		return -1;
-	}
-
 	if (tdb->allrecord_lock.count == 0) {
 	if (tdb->allrecord_lock.count == 0) {
-		tdb->ecode = TDB_ERR_LOCK;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_allrecord_unlock: not locked!\n");
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
+			   "tdb_allrecord_unlock: not locked!");
 		return -1;
 		return -1;
 	}
 	}
 
 
 	/* Upgradable locks are marked as write locks. */
 	/* Upgradable locks are marked as write locks. */
 	if (tdb->allrecord_lock.ltype != ltype
 	if (tdb->allrecord_lock.ltype != ltype
 	    && (!tdb->allrecord_lock.off || ltype != F_RDLCK)) {
 	    && (!tdb->allrecord_lock.off || ltype != F_RDLCK)) {
-		tdb->ecode = TDB_ERR_LOCK;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_allrecord_unlock: have %s lock\n",
-			 tdb->allrecord_lock.ltype == F_RDLCK
-			 ? "read" : "write");
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
+			 "tdb_allrecord_unlock: have %s lock",
+			   tdb->allrecord_lock.ltype == F_RDLCK
+			   ? "read" : "write");
 		return -1;
 		return -1;
 	}
 	}
 
 
@@ -642,25 +628,22 @@ int tdb_lock_hashes(struct tdb_context *tdb,
 	}
 	}
 
 
 	if (tdb->allrecord_lock.count) {
 	if (tdb->allrecord_lock.count) {
-		tdb->ecode = TDB_ERR_LOCK;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_lock_hashes: have %s allrecordlock\n",
-			 tdb->allrecord_lock.ltype == F_RDLCK
-			 ? "read" : "write");
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
+			   "tdb_lock_hashes: already have %s allrecordlock",
+			   tdb->allrecord_lock.ltype == F_RDLCK
+			   ? "read" : "write");
 		return -1;
 		return -1;
 	}
 	}
 
 
 	if (tdb_has_free_lock(tdb)) {
 	if (tdb_has_free_lock(tdb)) {
-		tdb->ecode = TDB_ERR_LOCK;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_lock_hashes: have free lock already\n");
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
+			   "tdb_lock_hashes: already have free lock");
 		return -1;
 		return -1;
 	}
 	}
 
 
 	if (tdb_has_expansion_lock(tdb)) {
 	if (tdb_has_expansion_lock(tdb)) {
-		tdb->ecode = TDB_ERR_LOCK;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_lock_hashes: have expansion lock already\n");
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
+			   "tdb_lock_hashes: already have expansion lock");
 		return -1;
 		return -1;
 	}
 	}
 
 
@@ -678,9 +661,8 @@ int tdb_unlock_hashes(struct tdb_context *tdb,
 	if (tdb->allrecord_lock.count) {
 	if (tdb->allrecord_lock.count) {
 		if (tdb->allrecord_lock.ltype == F_RDLCK
 		if (tdb->allrecord_lock.ltype == F_RDLCK
 		    && ltype == F_WRLCK) {
 		    && ltype == F_WRLCK) {
-			tdb->ecode = TDB_ERR_LOCK;
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-				 "tdb_unlock_hashes RO allrecord!\n");
+			tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_FATAL,
+				   "tdb_unlock_hashes RO allrecord!");
 			return -1;
 			return -1;
 		}
 		}
 		return 0;
 		return 0;
@@ -709,17 +691,15 @@ int tdb_lock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off,
 	if (tdb->allrecord_lock.count) {
 	if (tdb->allrecord_lock.count) {
 		if (tdb->allrecord_lock.ltype == F_WRLCK)
 		if (tdb->allrecord_lock.ltype == F_WRLCK)
 			return 0;
 			return 0;
-		tdb->ecode = TDB_ERR_LOCK;
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-			 "tdb_lock_free_bucket with RO allrecordlock!\n");
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_FATAL,
+			 "tdb_lock_free_bucket with RO allrecordlock!");
 		return -1;
 		return -1;
 	}
 	}
 
 
 #if 0 /* FIXME */
 #if 0 /* FIXME */
 	if (tdb_has_expansion_lock(tdb)) {
 	if (tdb_has_expansion_lock(tdb)) {
-		tdb->ecode = TDB_ERR_LOCK;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_lock_free_bucket: have expansion lock already\n");
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
+			   "tdb_lock_free_bucket: already have expansion lock");
 		return -1;
 		return -1;
 	}
 	}
 #endif
 #endif

+ 87 - 36
ccan/tdb2/private.h

@@ -36,6 +36,7 @@
 #include "config.h"
 #include "config.h"
 #include <ccan/tdb2/tdb2.h>
 #include <ccan/tdb2/tdb2.h>
 #include <ccan/likely/likely.h>
 #include <ccan/likely/likely.h>
+#include <ccan/compiler/compiler.h>
 #ifdef HAVE_BYTESWAP_H
 #ifdef HAVE_BYTESWAP_H
 #include <byteswap.h>
 #include <byteswap.h>
 #endif
 #endif
@@ -63,9 +64,11 @@ typedef uint64_t tdb_off_t;
 
 
 #define TDB_MAGIC_FOOD "TDB file\n"
 #define TDB_MAGIC_FOOD "TDB file\n"
 #define TDB_VERSION ((uint64_t)(0x26011967 + 7))
 #define TDB_VERSION ((uint64_t)(0x26011967 + 7))
-#define TDB_MAGIC ((uint64_t)0x1999)
+#define TDB_USED_MAGIC ((uint64_t)0x1999)
+#define TDB_HTABLE_MAGIC ((uint64_t)0x1888)
+#define TDB_CHAIN_MAGIC ((uint64_t)0x1777)
+#define TDB_FTABLE_MAGIC ((uint64_t)0x1666)
 #define TDB_FREE_MAGIC ((uint64_t)0xFE)
 #define TDB_FREE_MAGIC ((uint64_t)0xFE)
-#define TDB_COALESCING_MAGIC ((uint64_t)0xFD)
 #define TDB_HASH_MAGIC (0xA1ABE11A01092008ULL)
 #define TDB_HASH_MAGIC (0xA1ABE11A01092008ULL)
 #define TDB_RECOVERY_MAGIC (0xf53bc0e7ad124589ULL)
 #define TDB_RECOVERY_MAGIC (0xf53bc0e7ad124589ULL)
 #define TDB_RECOVERY_INVALID_MAGIC (0x0ULL)
 #define TDB_RECOVERY_INVALID_MAGIC (0x0ULL)
@@ -91,20 +94,22 @@ typedef uint64_t tdb_off_t;
 #define TDB_SUBLEVEL_HASH_BITS 6
 #define TDB_SUBLEVEL_HASH_BITS 6
 /* And 8 entries in each group, ie 8 groups per sublevel. */
 /* And 8 entries in each group, ie 8 groups per sublevel. */
 #define TDB_HASH_GROUP_BITS 3
 #define TDB_HASH_GROUP_BITS 3
+/* This is currently 10: beyond this we chain. */
+#define TDB_MAX_LEVELS (1+(64-TDB_TOPLEVEL_HASH_BITS) / TDB_SUBLEVEL_HASH_BITS)
 
 
-/* Extend file by least 32 times larger than needed. */
-#define TDB_EXTENSION_FACTOR 32
+/* Extend file by least 100 times larger than needed. */
+#define TDB_EXTENSION_FACTOR 100
 
 
 /* We steal bits from the offsets to store hash info. */
 /* We steal bits from the offsets to store hash info. */
 #define TDB_OFF_HASH_GROUP_MASK ((1ULL << TDB_HASH_GROUP_BITS) - 1)
 #define TDB_OFF_HASH_GROUP_MASK ((1ULL << TDB_HASH_GROUP_BITS) - 1)
 /* We steal this many upper bits, giving a maximum offset of 64 exabytes. */
 /* We steal this many upper bits, giving a maximum offset of 64 exabytes. */
 #define TDB_OFF_UPPER_STEAL 8
 #define TDB_OFF_UPPER_STEAL 8
 #define   TDB_OFF_UPPER_STEAL_EXTRA 7
 #define   TDB_OFF_UPPER_STEAL_EXTRA 7
-#define   TDB_OFF_UPPER_STEAL_TRUNCBIT 1
-/* If this is set, hash is truncated (only 1 bit is valid). */
-#define TDB_OFF_HASH_TRUNCATED_BIT 56
-/* The bit number where we store next level of hash. */
+/* The bit number where we store extra hash bits. */
 #define TDB_OFF_HASH_EXTRA_BIT 57
 #define TDB_OFF_HASH_EXTRA_BIT 57
+#define TDB_OFF_UPPER_STEAL_SUBHASH_BIT 56
+
+/* The bit number where we store the extra hash bits. */
 /* Convenience mask to get actual offset. */
 /* Convenience mask to get actual offset. */
 #define TDB_OFF_MASK \
 #define TDB_OFF_MASK \
 	(((1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1) - TDB_OFF_HASH_GROUP_MASK)
 	(((1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1) - TDB_OFF_HASH_GROUP_MASK)
@@ -116,6 +121,9 @@ typedef uint64_t tdb_off_t;
 #define TDB_MIN_DATA_LEN	\
 #define TDB_MIN_DATA_LEN	\
 	(sizeof(struct tdb_free_record) - sizeof(struct tdb_used_record))
 	(sizeof(struct tdb_free_record) - sizeof(struct tdb_used_record))
 
 
+/* Indicates this entry is not on an flist (can happen during coalescing) */
+#define TDB_FTABLE_NONE ((1ULL << TDB_OFF_UPPER_STEAL) - 1)
+
 #if !HAVE_BSWAP_64
 #if !HAVE_BSWAP_64
 static inline uint64_t bswap_64(uint64_t x)
 static inline uint64_t bswap_64(uint64_t x)
 {
 {
@@ -173,20 +181,30 @@ static inline uint16_t rec_magic(const struct tdb_used_record *r)
 }
 }
 
 
 struct tdb_free_record {
 struct tdb_free_record {
-        uint64_t magic_and_meta; /* TDB_OFF_UPPER_STEAL bits of magic */
-        uint64_t data_len; /* Not counting these two fields. */
-	/* This is why the minimum record size is 16 bytes.  */
-	uint64_t next, prev;
+        uint64_t magic_and_prev; /* TDB_OFF_UPPER_STEAL bits magic, then prev */
+        uint64_t ftable_and_len; /* Len not counting these two fields. */
+	/* This is why the minimum record size is 8 bytes.  */
+	uint64_t next;
 };
 };
 
 
+static inline uint64_t frec_prev(const struct tdb_free_record *f)
+{
+	return f->magic_and_prev & ((1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1);
+}
+
 static inline uint64_t frec_magic(const struct tdb_free_record *f)
 static inline uint64_t frec_magic(const struct tdb_free_record *f)
 {
 {
-	return f->magic_and_meta >> (64 - TDB_OFF_UPPER_STEAL);
+	return f->magic_and_prev >> (64 - TDB_OFF_UPPER_STEAL);
 }
 }
 
 
-static inline uint64_t frec_flist(const struct tdb_free_record *f)
+static inline uint64_t frec_len(const struct tdb_free_record *f)
 {
 {
-	return f->magic_and_meta & ((1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1);
+	return f->ftable_and_len & ((1ULL << (64 - TDB_OFF_UPPER_STEAL))-1);
+}
+
+static inline unsigned frec_ftable(const struct tdb_free_record *f)
+{
+	return f->ftable_and_len >> (64 - TDB_OFF_UPPER_STEAL);
 }
 }
 
 
 struct tdb_recovery_record {
 struct tdb_recovery_record {
@@ -199,6 +217,12 @@ struct tdb_recovery_record {
 	uint64_t eof;
 	uint64_t eof;
 };
 };
 
 
+/* If we bottom out of the subhashes, we chain. */
+struct tdb_chain {
+	tdb_off_t rec[1 << TDB_HASH_GROUP_BITS];
+	tdb_off_t next;
+};
+
 /* this is stored at the front of every database */
 /* this is stored at the front of every database */
 struct tdb_header {
 struct tdb_header {
 	char magic_food[64]; /* for /etc/magic */
 	char magic_food[64]; /* for /etc/magic */
@@ -206,7 +230,7 @@ struct tdb_header {
 	uint64_t version; /* version of the code */
 	uint64_t version; /* version of the code */
 	uint64_t hash_test; /* result of hashing HASH_MAGIC. */
 	uint64_t hash_test; /* result of hashing HASH_MAGIC. */
 	uint64_t hash_seed; /* "random" seed written at creation time. */
 	uint64_t hash_seed; /* "random" seed written at creation time. */
-	tdb_off_t free_list; /* (First) free list. */
+	tdb_off_t free_table; /* (First) free table. */
 	tdb_off_t recovery; /* Transaction recovery area. */
 	tdb_off_t recovery; /* Transaction recovery area. */
 
 
 	tdb_off_t reserved[26];
 	tdb_off_t reserved[26];
@@ -215,7 +239,7 @@ struct tdb_header {
 	tdb_off_t hashtable[1ULL << TDB_TOPLEVEL_HASH_BITS];
 	tdb_off_t hashtable[1ULL << TDB_TOPLEVEL_HASH_BITS];
 };
 };
 
 
-struct tdb_freelist {
+struct tdb_freetable {
 	struct tdb_used_record hdr;
 	struct tdb_used_record hdr;
 	tdb_off_t next;
 	tdb_off_t next;
 	tdb_off_t buckets[TDB_FREE_BUCKETS];
 	tdb_off_t buckets[TDB_FREE_BUCKETS];
@@ -246,7 +270,7 @@ struct traverse_info {
 		/* We ignore groups here, and treat it as a big array. */
 		/* We ignore groups here, and treat it as a big array. */
 		unsigned entry;
 		unsigned entry;
 		unsigned int total_buckets;
 		unsigned int total_buckets;
-	} levels[64 / TDB_SUBLEVEL_HASH_BITS];
+	} levels[TDB_MAX_LEVELS + 1];
 	unsigned int num_levels;
 	unsigned int num_levels;
 	unsigned int toplevel_group;
 	unsigned int toplevel_group;
 	/* This makes delete-everything-inside-traverse work as expected. */
 	/* This makes delete-everything-inside-traverse work as expected. */
@@ -269,6 +293,15 @@ struct tdb_lock_type {
 	uint32_t ltype;
 	uint32_t ltype;
 };
 };
 
 
+/* This is only needed for tdb_access_commit, but used everywhere to
+ * simplify. */
+struct tdb_access_hdr {
+	struct tdb_access_hdr *next;
+	tdb_off_t off;
+	tdb_len_t len;
+	bool convert;
+};
+
 struct tdb_context {
 struct tdb_context {
 	/* Filename of the database. */
 	/* Filename of the database. */
 	const char *name;
 	const char *name;
@@ -298,8 +331,8 @@ struct tdb_context {
 	uint32_t flags;
 	uint32_t flags;
 
 
 	/* Logging function */
 	/* Logging function */
-	tdb_logfn_t log;
-	void *log_priv;
+	tdb_logfn_t logfn;
+	void *log_private;
 
 
 	/* Hash function. */
 	/* Hash function. */
 	tdb_hashfn_t khash;
 	tdb_hashfn_t khash;
@@ -309,17 +342,23 @@ struct tdb_context {
 	/* Set if we are in a transaction. */
 	/* Set if we are in a transaction. */
 	struct tdb_transaction *transaction;
 	struct tdb_transaction *transaction;
 	
 	
-	/* What freelist are we using? */
-	uint64_t flist_off;
+	/* What free table are we using? */
+	tdb_off_t ftable_off;
+	unsigned int ftable;
 
 
 	/* IO methods: changes for transactions. */
 	/* IO methods: changes for transactions. */
 	const struct tdb_methods *methods;
 	const struct tdb_methods *methods;
 
 
 	/* Lock information */
 	/* Lock information */
 	struct tdb_lock_type allrecord_lock;
 	struct tdb_lock_type allrecord_lock;
-	uint64_t num_lockrecs;
+	size_t num_lockrecs;
 	struct tdb_lock_type *lockrecs;
 	struct tdb_lock_type *lockrecs;
 
 
+	struct tdb_attribute_stats *stats;
+
+	/* Direct access information */
+	struct tdb_access_hdr *access;
+
 	/* Single list of all TDBs, to avoid multiple opens. */
 	/* Single list of all TDBs, to avoid multiple opens. */
 	struct tdb_context *next;
 	struct tdb_context *next;
 	dev_t device;	
 	dev_t device;	
@@ -331,7 +370,7 @@ struct tdb_methods {
 	int (*write)(struct tdb_context *, tdb_off_t, const void *, tdb_len_t);
 	int (*write)(struct tdb_context *, tdb_off_t, const void *, tdb_len_t);
 	int (*oob)(struct tdb_context *, tdb_off_t, bool);
 	int (*oob)(struct tdb_context *, tdb_off_t, bool);
 	int (*expand_file)(struct tdb_context *, tdb_len_t);
 	int (*expand_file)(struct tdb_context *, tdb_len_t);
-	void *(*direct)(struct tdb_context *, tdb_off_t, size_t);
+	void *(*direct)(struct tdb_context *, tdb_off_t, size_t, bool);
 };
 };
 
 
 /*
 /*
@@ -367,29 +406,32 @@ int delete_from_hash(struct tdb_context *tdb, struct hash_info *h);
 bool is_subhash(tdb_off_t val);
 bool is_subhash(tdb_off_t val);
 
 
 /* free.c: */
 /* free.c: */
-int tdb_flist_init(struct tdb_context *tdb);
+int tdb_ftable_init(struct tdb_context *tdb);
 
 
 /* check.c needs these to iterate through free lists. */
 /* check.c needs these to iterate through free lists. */
-tdb_off_t first_flist(struct tdb_context *tdb);
-tdb_off_t next_flist(struct tdb_context *tdb, tdb_off_t flist);
+tdb_off_t first_ftable(struct tdb_context *tdb);
+tdb_off_t next_ftable(struct tdb_context *tdb, tdb_off_t ftable);
 
 
-/* If this fails, try tdb_expand. */
+/* This returns space or TDB_OFF_ERR. */
 tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
 tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
-		uint64_t hash, bool growing);
+		uint64_t hash, unsigned magic, bool growing);
 
 
 /* Put this record in a free list. */
 /* Put this record in a free list. */
 int add_free_record(struct tdb_context *tdb,
 int add_free_record(struct tdb_context *tdb,
 		    tdb_off_t off, tdb_len_t len_with_header);
 		    tdb_off_t off, tdb_len_t len_with_header);
 
 
-/* Set up header for a used record. */
+/* Set up header for a used/ftable/htable/chain record. */
 int set_header(struct tdb_context *tdb,
 int set_header(struct tdb_context *tdb,
 	       struct tdb_used_record *rec,
 	       struct tdb_used_record *rec,
-	       uint64_t keylen, uint64_t datalen,
+	       unsigned magic, uint64_t keylen, uint64_t datalen,
 	       uint64_t actuallen, unsigned hashlow);
 	       uint64_t actuallen, unsigned hashlow);
 
 
 /* Used by tdb_check to verify. */
 /* Used by tdb_check to verify. */
 unsigned int size_to_bucket(tdb_len_t data_len);
 unsigned int size_to_bucket(tdb_len_t data_len);
-tdb_off_t bucket_off(tdb_off_t flist_off, unsigned bucket);
+tdb_off_t bucket_off(tdb_off_t ftable_off, unsigned bucket);
+
+/* Used by tdb_summary */
+size_t dead_space(struct tdb_context *tdb, tdb_off_t off);
 
 
 /* io.c: */
 /* io.c: */
 /* Initialize tdb->methods. */
 /* Initialize tdb->methods. */
@@ -402,10 +444,6 @@ void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size);
 void tdb_munmap(struct tdb_context *tdb);
 void tdb_munmap(struct tdb_context *tdb);
 void tdb_mmap(struct tdb_context *tdb);
 void tdb_mmap(struct tdb_context *tdb);
 
 
-/* Either make a copy into pad and return that, or return ptr into mmap.
- * Converts endian (ie. will use pad in that case). */
-void *tdb_get(struct tdb_context *tdb, tdb_off_t off, void *pad, size_t len);
-
 /* Either alloc a copy, or give direct access.  Release frees or noop. */
 /* Either alloc a copy, or give direct access.  Release frees or noop. */
 const void *tdb_access_read(struct tdb_context *tdb,
 const void *tdb_access_read(struct tdb_context *tdb,
 			    tdb_off_t off, tdb_len_t len, bool convert);
 			    tdb_off_t off, tdb_len_t len, bool convert);
@@ -452,6 +490,13 @@ int tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
 int tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
 int tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
 		     void *rec, size_t len);
 		     void *rec, size_t len);
 
 
+/* Adds a stat, if it's in range. */
+void add_stat_(struct tdb_context *tdb, uint64_t *stat, size_t val);
+#define add_stat(tdb, statname, val)					\
+	do {								\
+		if (unlikely((tdb)->stats))				\
+			add_stat_((tdb), &(tdb)->stats->statname, (val)); \
+	} while (0)
 
 
 /* lock.c: */
 /* lock.c: */
 void tdb_lock_init(struct tdb_context *tdb);
 void tdb_lock_init(struct tdb_context *tdb);
@@ -507,6 +552,12 @@ int next_in_hash(struct tdb_context *tdb, int ltype,
 int tdb_transaction_recover(struct tdb_context *tdb);
 int tdb_transaction_recover(struct tdb_context *tdb);
 bool tdb_needs_recovery(struct tdb_context *tdb);
 bool tdb_needs_recovery(struct tdb_context *tdb);
 
 
+/* tdb.c: */
+void COLD tdb_logerr(struct tdb_context *tdb,
+		     enum TDB_ERROR ecode,
+		     enum tdb_debug_level level,
+		     const char *fmt, ...);
+
 #ifdef TDB_TRACE
 #ifdef TDB_TRACE
 void tdb_trace(struct tdb_context *tdb, const char *op);
 void tdb_trace(struct tdb_context *tdb, const char *op);
 void tdb_trace_seqnum(struct tdb_context *tdb, uint32_t seqnum, const char *op);
 void tdb_trace_seqnum(struct tdb_context *tdb, uint32_t seqnum, const char *op);

+ 63 - 36
ccan/tdb2/summary.c

@@ -37,33 +37,43 @@ static int count_hash(struct tdb_context *tdb,
 
 
 static bool summarize(struct tdb_context *tdb,
 static bool summarize(struct tdb_context *tdb,
 		      struct tally *hashes,
 		      struct tally *hashes,
-		      struct tally *flists,
+		      struct tally *ftables,
 		      struct tally *free,
 		      struct tally *free,
 		      struct tally *keys,
 		      struct tally *keys,
 		      struct tally *data,
 		      struct tally *data,
 		      struct tally *extra,
 		      struct tally *extra,
 		      struct tally *uncoal,
 		      struct tally *uncoal,
-		      struct tally *buckets)
+		      struct tally *buckets,
+		      struct tally *chains)
 {
 {
 	tdb_off_t off;
 	tdb_off_t off;
 	tdb_len_t len;
 	tdb_len_t len;
 	tdb_len_t unc = 0;
 	tdb_len_t unc = 0;
 
 
 	for (off = sizeof(struct tdb_header); off < tdb->map_size; off += len) {
 	for (off = sizeof(struct tdb_header); off < tdb->map_size; off += len) {
-		union {
+		const union {
 			struct tdb_used_record u;
 			struct tdb_used_record u;
 			struct tdb_free_record f;
 			struct tdb_free_record f;
-		} pad, *p;
-		p = tdb_get(tdb, off, &pad, sizeof(pad));
+			struct tdb_recovery_record r;
+		} *p;
+		/* We might not be able to get the whole thing. */
+		p = tdb_access_read(tdb, off, sizeof(p->f), true);
 		if (!p)
 		if (!p)
 			return false;
 			return false;
-		if (rec_magic(&p->u) != TDB_MAGIC) {
-			len = p->f.data_len;
+		if (p->r.magic == TDB_RECOVERY_INVALID_MAGIC
+		    || p->r.magic == TDB_RECOVERY_MAGIC) {
+			if (unc) {
+				tally_add(uncoal, unc);
+				unc = 0;
+			}
+			len = sizeof(p->r) + p->r.max_len;
+		} else if (frec_magic(&p->f) == TDB_FREE_MAGIC) {
+			len = frec_len(&p->f);
 			tally_add(free, len);
 			tally_add(free, len);
 			tally_add(buckets, size_to_bucket(len));
 			tally_add(buckets, size_to_bucket(len));
 			len += sizeof(p->u);
 			len += sizeof(p->u);
 			unc++;
 			unc++;
-		} else {
+		} else if (rec_magic(&p->u) == TDB_USED_MAGIC) {
 			if (unc) {
 			if (unc) {
 				tally_add(uncoal, unc);
 				tally_add(uncoal, unc);
 				unc = 0;
 				unc = 0;
@@ -73,25 +83,35 @@ static bool summarize(struct tdb_context *tdb,
 				+ rec_data_length(&p->u)
 				+ rec_data_length(&p->u)
 				+ rec_extra_padding(&p->u);
 				+ rec_extra_padding(&p->u);
 
 
-			/* FIXME: Use different magic for hashes, flists. */
-			if (!rec_key_length(&p->u) && rec_hash(&p->u) < 2) {
-				if (rec_hash(&p->u) == 0) {
-					int count = count_hash(tdb,
-							off + sizeof(p->u),
-							TDB_SUBLEVEL_HASH_BITS);
-					if (count == -1)
-						return false;
-					tally_add(hashes, count);
-				} else {
-					tally_add(flists,
-						  rec_data_length(&p->u));
-				}
-			} else {
-				tally_add(keys, rec_key_length(&p->u));
-				tally_add(data, rec_data_length(&p->u));
-			}
+			tally_add(keys, rec_key_length(&p->u));
+			tally_add(data, rec_data_length(&p->u));
+			tally_add(extra, rec_extra_padding(&p->u));
+		} else if (rec_magic(&p->u) == TDB_HTABLE_MAGIC) {
+			int count = count_hash(tdb,
+					       off + sizeof(p->u),
+					       TDB_SUBLEVEL_HASH_BITS);
+			if (count == -1)
+				return false;
+			tally_add(hashes, count);
+			tally_add(extra, rec_extra_padding(&p->u));
+			len = sizeof(p->u)
+				+ rec_data_length(&p->u)
+				+ rec_extra_padding(&p->u);
+		} else if (rec_magic(&p->u) == TDB_FTABLE_MAGIC) {
+			len = sizeof(p->u)
+				+ rec_data_length(&p->u)
+				+ rec_extra_padding(&p->u);
+			tally_add(ftables, rec_data_length(&p->u));
+			tally_add(extra, rec_extra_padding(&p->u));
+		} else if (rec_magic(&p->u) == TDB_CHAIN_MAGIC) {
+			len = sizeof(p->u)
+				+ rec_data_length(&p->u)
+				+ rec_extra_padding(&p->u);
+			tally_add(chains, 1);
 			tally_add(extra, rec_extra_padding(&p->u));
 			tally_add(extra, rec_extra_padding(&p->u));
-		}
+		} else
+			len = dead_space(tdb, off);
+		tdb_access_release(tdb, p);
 	}
 	}
 	if (unc)
 	if (unc)
 		tally_add(uncoal, unc);
 		tally_add(uncoal, unc);
@@ -110,6 +130,7 @@ static bool summarize(struct tdb_context *tdb,
 	"Smallest/average/largest uncoalesced runs: %zu/%zu/%zu\n%s" \
 	"Smallest/average/largest uncoalesced runs: %zu/%zu/%zu\n%s" \
 	"Number of free lists: %zu\n%s" \
 	"Number of free lists: %zu\n%s" \
 	"Toplevel hash used: %u of %u\n" \
 	"Toplevel hash used: %u of %u\n" \
+	"Number of chains: %zu\n" \
 	"Number of subhashes: %zu\n" \
 	"Number of subhashes: %zu\n" \
 	"Smallest/average/largest subhash entries: %zu/%zu/%zu\n%s" \
 	"Smallest/average/largest subhash entries: %zu/%zu/%zu\n%s" \
 	"Percentage keys/data/padding/free/rechdrs/freehdrs/hashes: %.0f/%.0f/%.0f/%.0f/%.0f/%.0f/%.0f\n"
 	"Percentage keys/data/padding/free/rechdrs/freehdrs/hashes: %.0f/%.0f/%.0f/%.0f/%.0f/%.0f/%.0f\n"
@@ -127,8 +148,8 @@ static bool summarize(struct tdb_context *tdb,
 char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags)
 char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags)
 {
 {
 	tdb_len_t len;
 	tdb_len_t len;
-	struct tally *flists, *hashes, *freet, *keys, *data, *extra, *uncoal,
-		*buckets;
+	struct tally *ftables, *hashes, *freet, *keys, *data, *extra, *uncoal,
+		*buckets, *chains;
 	char *hashesg, *freeg, *keysg, *datag, *extrag, *uncoalg, *bucketsg;
 	char *hashesg, *freeg, *keysg, *datag, *extrag, *uncoalg, *bucketsg;
 	char *ret = NULL;
 	char *ret = NULL;
 
 
@@ -143,7 +164,7 @@ char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags)
 	}
 	}
 
 
 	/* Start stats off empty. */
 	/* Start stats off empty. */
-	flists = tally_new(HISTO_HEIGHT);
+	ftables = tally_new(HISTO_HEIGHT);
 	hashes = tally_new(HISTO_HEIGHT);
 	hashes = tally_new(HISTO_HEIGHT);
 	freet = tally_new(HISTO_HEIGHT);
 	freet = tally_new(HISTO_HEIGHT);
 	keys = tally_new(HISTO_HEIGHT);
 	keys = tally_new(HISTO_HEIGHT);
@@ -151,14 +172,16 @@ char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags)
 	extra = tally_new(HISTO_HEIGHT);
 	extra = tally_new(HISTO_HEIGHT);
 	uncoal = tally_new(HISTO_HEIGHT);
 	uncoal = tally_new(HISTO_HEIGHT);
 	buckets = tally_new(HISTO_HEIGHT);
 	buckets = tally_new(HISTO_HEIGHT);
-	if (!flists || !hashes || !freet || !keys || !data || !extra
-	    || !uncoal || !buckets) {
-		tdb->ecode = TDB_ERR_OOM;
+	chains = tally_new(HISTO_HEIGHT);
+	if (!ftables || !hashes || !freet || !keys || !data || !extra
+	    || !uncoal || !buckets || !chains) {
+		tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_ERROR,
+			   "tdb_summary: failed to allocate tally structures");
 		goto unlock;
 		goto unlock;
 	}
 	}
 
 
-	if (!summarize(tdb, hashes, flists, freet, keys, data, extra, uncoal,
-		       buckets))
+	if (!summarize(tdb, hashes, ftables, freet, keys, data, extra, uncoal,
+		       buckets, chains))
 		goto unlock;
 		goto unlock;
 
 
 	if (flags & TDB_SUMMARY_HISTOGRAMS) {
 	if (flags & TDB_SUMMARY_HISTOGRAMS) {
@@ -206,6 +229,7 @@ char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags)
 		      count_hash(tdb, offsetof(struct tdb_header, hashtable),
 		      count_hash(tdb, offsetof(struct tdb_header, hashtable),
 				 TDB_TOPLEVEL_HASH_BITS),
 				 TDB_TOPLEVEL_HASH_BITS),
 		      1 << TDB_TOPLEVEL_HASH_BITS,
 		      1 << TDB_TOPLEVEL_HASH_BITS,
+		      tally_num(chains),
 		      tally_num(hashes),
 		      tally_num(hashes),
 		      tally_min(hashes), tally_mean(hashes), tally_max(hashes),
 		      tally_min(hashes), tally_mean(hashes), tally_max(hashes),
 		      hashesg ? hashesg : "",
 		      hashesg ? hashesg : "",
@@ -215,11 +239,12 @@ char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags)
 		      tally_total(freet, NULL) * 100.0 / tdb->map_size,
 		      tally_total(freet, NULL) * 100.0 / tdb->map_size,
 		      (tally_num(keys) + tally_num(freet) + tally_num(hashes))
 		      (tally_num(keys) + tally_num(freet) + tally_num(hashes))
 		      * sizeof(struct tdb_used_record) * 100.0 / tdb->map_size,
 		      * sizeof(struct tdb_used_record) * 100.0 / tdb->map_size,
-		      tally_num(flists) * sizeof(struct tdb_freelist)
+		      tally_num(ftables) * sizeof(struct tdb_freetable)
 		      * 100.0 / tdb->map_size,
 		      * 100.0 / tdb->map_size,
 		      (tally_num(hashes)
 		      (tally_num(hashes)
 		       * (sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS)
 		       * (sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS)
-		       + (sizeof(tdb_off_t) << TDB_TOPLEVEL_HASH_BITS))
+		       + (sizeof(tdb_off_t) << TDB_TOPLEVEL_HASH_BITS)
+		       + sizeof(struct tdb_chain) * tally_num(chains))
 		      * 100.0 / tdb->map_size);
 		      * 100.0 / tdb->map_size);
 
 
 unlock:
 unlock:
@@ -237,6 +262,8 @@ unlock:
 	free(data);
 	free(data);
 	free(extra);
 	free(extra);
 	free(uncoal);
 	free(uncoal);
+	free(ftables);
+	free(chains);
 
 
 	tdb_allrecord_unlock(tdb, F_RDLCK);
 	tdb_allrecord_unlock(tdb, F_RDLCK);
 	tdb_unlock_expand(tdb, F_RDLCK);
 	tdb_unlock_expand(tdb, F_RDLCK);

+ 148 - 91
ccan/tdb2/tdb.c

@@ -1,8 +1,7 @@
 #include "private.h"
 #include "private.h"
 #include <ccan/tdb2/tdb2.h>
 #include <ccan/tdb2/tdb2.h>
-#include <ccan/build_assert/build_assert.h>
-#include <ccan/likely/likely.h>
 #include <assert.h>
 #include <assert.h>
+#include <stdarg.h>
 
 
 /* The null return. */
 /* The null return. */
 struct tdb_data tdb_null = { .dptr = NULL, .dsize = 0 };
 struct tdb_data tdb_null = { .dptr = NULL, .dsize = 0 };
@@ -10,13 +9,6 @@ struct tdb_data tdb_null = { .dptr = NULL, .dsize = 0 };
 /* all contexts, to ensure no double-opens (fcntl locks don't nest!) */
 /* all contexts, to ensure no double-opens (fcntl locks don't nest!) */
 static struct tdb_context *tdbs = NULL;
 static struct tdb_context *tdbs = NULL;
 
 
-PRINTF_FMT(4, 5) static void
-null_log_fn(struct tdb_context *tdb,
-	    enum tdb_debug_level level, void *priv,
-	    const char *fmt, ...)
-{
-}
-
 static bool tdb_already_open(dev_t device, ino_t ino)
 static bool tdb_already_open(dev_t device, ino_t ino)
 {
 {
 	struct tdb_context *i;
 	struct tdb_context *i;
@@ -39,8 +31,8 @@ static uint64_t random_number(struct tdb_context *tdb)
 	fd = open("/dev/urandom", O_RDONLY);
 	fd = open("/dev/urandom", O_RDONLY);
 	if (fd >= 0) {
 	if (fd >= 0) {
 		if (tdb_read_all(fd, &ret, sizeof(ret))) {
 		if (tdb_read_all(fd, &ret, sizeof(ret))) {
-			tdb->log(tdb, TDB_DEBUG_TRACE, tdb->log_priv,
-				 "tdb_open: random from /dev/urandom\n");
+			tdb_logerr(tdb, TDB_SUCCESS, TDB_DEBUG_TRACE,
+				 "tdb_open: random from /dev/urandom");
 			close(fd);
 			close(fd);
 			return ret;
 			return ret;
 		}
 		}
@@ -55,9 +47,9 @@ static uint64_t random_number(struct tdb_context *tdb)
 			char reply[1 + sizeof(uint64_t)];
 			char reply[1 + sizeof(uint64_t)];
 			int r = read(fd, reply, sizeof(reply));
 			int r = read(fd, reply, sizeof(reply));
 			if (r > 1) {
 			if (r > 1) {
-				tdb->log(tdb, TDB_DEBUG_TRACE, tdb->log_priv,
-					 "tdb_open: %u random bytes from"
-					 " /dev/egd-pool\n", r-1);
+				tdb_logerr(tdb, TDB_SUCCESS, TDB_DEBUG_TRACE,
+					   "tdb_open: %u random bytes from"
+					   " /dev/egd-pool", r-1);
 				/* Copy at least some bytes. */
 				/* Copy at least some bytes. */
 				memcpy(&ret, reply+1, r - 1);
 				memcpy(&ret, reply+1, r - 1);
 				if (reply[0] == sizeof(uint64_t)
 				if (reply[0] == sizeof(uint64_t)
@@ -73,14 +65,14 @@ static uint64_t random_number(struct tdb_context *tdb)
 	/* Fallback: pid and time. */
 	/* Fallback: pid and time. */
 	gettimeofday(&now, NULL);
 	gettimeofday(&now, NULL);
 	ret = getpid() * 100132289ULL + now.tv_sec * 1000000ULL + now.tv_usec;
 	ret = getpid() * 100132289ULL + now.tv_sec * 1000000ULL + now.tv_usec;
-	tdb->log(tdb, TDB_DEBUG_TRACE, tdb->log_priv,
-		 "tdb_open: random from getpid and time\n");
+	tdb_logerr(tdb, TDB_SUCCESS, TDB_DEBUG_TRACE,
+		   "tdb_open: random from getpid and time");
 	return ret;
 	return ret;
 }
 }
 
 
 struct new_database {
 struct new_database {
 	struct tdb_header hdr;
 	struct tdb_header hdr;
-	struct tdb_freelist flist;
+	struct tdb_freetable ftable;
 };
 };
 
 
 /* initialise a new database */
 /* initialise a new database */
@@ -109,11 +101,11 @@ static int tdb_new_database(struct tdb_context *tdb,
 	memset(newdb.hdr.hashtable, 0, sizeof(newdb.hdr.hashtable));
 	memset(newdb.hdr.hashtable, 0, sizeof(newdb.hdr.hashtable));
 
 
 	/* Free is empty. */
 	/* Free is empty. */
-	newdb.hdr.free_list = offsetof(struct new_database, flist);
-	memset(&newdb.flist, 0, sizeof(newdb.flist));
-	set_header(NULL, &newdb.flist.hdr, 0,
-		   sizeof(newdb.flist) - sizeof(newdb.flist.hdr),
-		   sizeof(newdb.flist) - sizeof(newdb.flist.hdr), 1);
+	newdb.hdr.free_table = offsetof(struct new_database, ftable);
+	memset(&newdb.ftable, 0, sizeof(newdb.ftable));
+	set_header(NULL, &newdb.ftable.hdr, TDB_FTABLE_MAGIC, 0,
+		   sizeof(newdb.ftable) - sizeof(newdb.ftable.hdr),
+		   sizeof(newdb.ftable) - sizeof(newdb.ftable.hdr), 0);
 
 
 	/* Magic food */
 	/* Magic food */
 	memset(newdb.hdr.magic_food, 0, sizeof(newdb.hdr.magic_food));
 	memset(newdb.hdr.magic_food, 0, sizeof(newdb.hdr.magic_food));
@@ -130,7 +122,8 @@ static int tdb_new_database(struct tdb_context *tdb,
 		tdb->map_size = sizeof(newdb);
 		tdb->map_size = sizeof(newdb);
 		tdb->map_ptr = malloc(tdb->map_size);
 		tdb->map_ptr = malloc(tdb->map_size);
 		if (!tdb->map_ptr) {
 		if (!tdb->map_ptr) {
-			tdb->ecode = TDB_ERR_OOM;
+			tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
+				   "tdb_new_database: failed to allocate");
 			return -1;
 			return -1;
 		}
 		}
 		memcpy(tdb->map_ptr, &newdb, tdb->map_size);
 		memcpy(tdb->map_ptr, &newdb, tdb->map_size);
@@ -143,7 +136,9 @@ static int tdb_new_database(struct tdb_context *tdb,
 		return -1;
 		return -1;
 
 
 	if (!tdb_pwrite_all(tdb->fd, &newdb, sizeof(newdb), 0)) {
 	if (!tdb_pwrite_all(tdb->fd, &newdb, sizeof(newdb), 0)) {
-		tdb->ecode = TDB_ERR_IO;
+		tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
+			   "tdb_new_database: failed to write: %s",
+			   strerror(errno));
 		return -1;
 		return -1;
 	}
 	}
 	return 0;
 	return 0;
@@ -155,7 +150,7 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
 {
 {
 	struct tdb_context *tdb;
 	struct tdb_context *tdb;
 	struct stat st;
 	struct stat st;
-	int save_errno;
+	int saved_errno = 0;
 	uint64_t hash_test;
 	uint64_t hash_test;
 	unsigned v;
 	unsigned v;
 	struct tdb_header hdr;
 	struct tdb_header hdr;
@@ -165,7 +160,7 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
 	if (!tdb) {
 	if (!tdb) {
 		/* Can't log this */
 		/* Can't log this */
 		errno = ENOMEM;
 		errno = ENOMEM;
-		goto fail;
+		return NULL;
 	}
 	}
 	tdb->name = NULL;
 	tdb->name = NULL;
 	tdb->map_ptr = NULL;
 	tdb->map_ptr = NULL;
@@ -174,9 +169,10 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
 	tdb->map_size = sizeof(struct tdb_header);
 	tdb->map_size = sizeof(struct tdb_header);
 	tdb->ecode = TDB_SUCCESS;
 	tdb->ecode = TDB_SUCCESS;
 	tdb->flags = tdb_flags;
 	tdb->flags = tdb_flags;
-	tdb->log = null_log_fn;
-	tdb->log_priv = NULL;
+	tdb->logfn = NULL;
 	tdb->transaction = NULL;
 	tdb->transaction = NULL;
+	tdb->stats = NULL;
+	tdb->access = NULL;
 	tdb_hash_init(tdb);
 	tdb_hash_init(tdb);
 	tdb_io_init(tdb);
 	tdb_io_init(tdb);
 	tdb_lock_init(tdb);
 	tdb_lock_init(tdb);
@@ -184,8 +180,8 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
 	while (attr) {
 	while (attr) {
 		switch (attr->base.attr) {
 		switch (attr->base.attr) {
 		case TDB_ATTRIBUTE_LOG:
 		case TDB_ATTRIBUTE_LOG:
-			tdb->log = attr->log.log_fn;
-			tdb->log_priv = attr->log.log_private;
+			tdb->logfn = attr->log.log_fn;
+			tdb->log_private = attr->log.log_private;
 			break;
 			break;
 		case TDB_ATTRIBUTE_HASH:
 		case TDB_ATTRIBUTE_HASH:
 			tdb->khash = attr->hash.hash_fn;
 			tdb->khash = attr->hash.hash_fn;
@@ -194,20 +190,24 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
 		case TDB_ATTRIBUTE_SEED:
 		case TDB_ATTRIBUTE_SEED:
 			seed = &attr->seed;
 			seed = &attr->seed;
 			break;
 			break;
+		case TDB_ATTRIBUTE_STATS:
+			tdb->stats = &attr->stats;
+			/* They have stats we don't know about?  Tell them. */
+			if (tdb->stats->size > sizeof(attr->stats))
+				tdb->stats->size = sizeof(attr->stats);
+			break;
 		default:
 		default:
-			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-				 "tdb_open: unknown attribute type %u\n",
-				 attr->base.attr);
-			errno = EINVAL;
+			tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_ERROR,
+				   "tdb_open: unknown attribute type %u",
+				   attr->base.attr);
 			goto fail;
 			goto fail;
 		}
 		}
 		attr = attr->base.next;
 		attr = attr->base.next;
 	}
 	}
 
 
 	if ((open_flags & O_ACCMODE) == O_WRONLY) {
 	if ((open_flags & O_ACCMODE) == O_WRONLY) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_open: can't open tdb %s write-only\n", name);
-		errno = EINVAL;
+		tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_ERROR,
+			   "tdb_open: can't open tdb %s write-only", name);
 		goto fail;
 		goto fail;
 	}
 	}
 
 
@@ -225,21 +225,21 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
 	if (tdb->flags & TDB_INTERNAL) {
 	if (tdb->flags & TDB_INTERNAL) {
 		tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
 		tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
 		if (tdb_new_database(tdb, seed, &hdr) != 0) {
 		if (tdb_new_database(tdb, seed, &hdr) != 0) {
-			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-				 "tdb_open: tdb_new_database failed!");
 			goto fail;
 			goto fail;
 		}
 		}
 		tdb_convert(tdb, &hdr.hash_seed, sizeof(hdr.hash_seed));
 		tdb_convert(tdb, &hdr.hash_seed, sizeof(hdr.hash_seed));
 		tdb->hash_seed = hdr.hash_seed;
 		tdb->hash_seed = hdr.hash_seed;
-		tdb_flist_init(tdb);
+		tdb_ftable_init(tdb);
 		return tdb;
 		return tdb;
 	}
 	}
 
 
 	if ((tdb->fd = open(name, open_flags, mode)) == -1) {
 	if ((tdb->fd = open(name, open_flags, mode)) == -1) {
-		tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv,
-			 "tdb_open: could not open file %s: %s\n",
-			 name, strerror(errno));
-		goto fail;	/* errno set by open(2) */
+		/* errno set by open(2) */
+		saved_errno = errno;
+		tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
+			   "tdb_open: could not open file %s: %s",
+			   name, strerror(errno));
+		goto fail;
 	}
 	}
 
 
 	/* on exec, don't inherit the fd */
 	/* on exec, don't inherit the fd */
@@ -248,19 +248,19 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
 
 
 	/* ensure there is only one process initialising at once */
 	/* ensure there is only one process initialising at once */
 	if (tdb_lock_open(tdb, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK) == -1) {
 	if (tdb_lock_open(tdb, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK) == -1) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_open: failed to get open lock on %s: %s\n",
-			 name, strerror(errno));
-		goto fail;	/* errno set by tdb_brlock */
+		/* errno set by tdb_brlock */
+		saved_errno = errno;
+		goto fail;
 	}
 	}
 
 
 	if (!tdb_pread_all(tdb->fd, &hdr, sizeof(hdr), 0)
 	if (!tdb_pread_all(tdb->fd, &hdr, sizeof(hdr), 0)
 	    || strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
 	    || strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
-		if (!(open_flags & O_CREAT)
-		    || tdb_new_database(tdb, seed, &hdr) == -1) {
-			if (errno == 0) {
-				errno = EIO; /* ie bad format or something */
-			}
+		if (!(open_flags & O_CREAT)) {
+			tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
+				   "tdb_open: %s is not a tdb file", name);
+			goto fail;
+		}
+		if (tdb_new_database(tdb, seed, &hdr) == -1) {
 			goto fail;
 			goto fail;
 		}
 		}
 	} else if (hdr.version != TDB_VERSION) {
 	} else if (hdr.version != TDB_VERSION) {
@@ -268,10 +268,9 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
 			tdb->flags |= TDB_CONVERT;
 			tdb->flags |= TDB_CONVERT;
 		else {
 		else {
 			/* wrong version */
 			/* wrong version */
-			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-				 "tdb_open: %s is unknown version 0x%llx\n",
-				 name, (long long)hdr.version);
-			errno = EIO;
+			tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
+				   "tdb_open: %s is unknown version 0x%llx",
+				   name, (long long)hdr.version);
 			goto fail;
 			goto fail;
 		}
 		}
 	}
 	}
@@ -282,29 +281,34 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
 	hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
 	hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
 	if (hdr.hash_test != hash_test) {
 	if (hdr.hash_test != hash_test) {
 		/* wrong hash variant */
 		/* wrong hash variant */
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_open: %s uses a different hash function\n",
-			 name);
-		errno = EIO;
+		tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
+			   "tdb_open: %s uses a different hash function",
+			   name);
 		goto fail;
 		goto fail;
 	}
 	}
 
 
-	if (fstat(tdb->fd, &st) == -1)
+	if (fstat(tdb->fd, &st) == -1) {
+		saved_errno = errno;
+		tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
+			   "tdb_open: could not stat open %s: %s",
+			   name, strerror(errno));
 		goto fail;
 		goto fail;
+	}
 
 
 	/* Is it already in the open list?  If so, fail. */
 	/* Is it already in the open list?  If so, fail. */
 	if (tdb_already_open(st.st_dev, st.st_ino)) {
 	if (tdb_already_open(st.st_dev, st.st_ino)) {
 		/* FIXME */
 		/* FIXME */
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_open: %s (%d,%d) is already open in this process\n",
-			 name, (int)st.st_dev, (int)st.st_ino);
-		errno = EBUSY;
+		tdb_logerr(tdb, TDB_ERR_NESTING, TDB_DEBUG_ERROR,
+			   "tdb_open: %s (%d,%d) is already open in this"
+			   " process",
+			   name, (int)st.st_dev, (int)st.st_ino);
 		goto fail;
 		goto fail;
 	}
 	}
 
 
 	tdb->name = strdup(name);
 	tdb->name = strdup(name);
 	if (!tdb->name) {
 	if (!tdb->name) {
-		errno = ENOMEM;
+		tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_ERROR,
+			   "tdb_open: failed to allocate name");
 		goto fail;
 		goto fail;
 	}
 	}
 
 
@@ -317,11 +321,10 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
 
 
 	/* Now it's fully formed, recover if necessary. */
 	/* Now it's fully formed, recover if necessary. */
 	if (tdb_needs_recovery(tdb) && tdb_lock_and_recover(tdb) == -1) {
 	if (tdb_needs_recovery(tdb) && tdb_lock_and_recover(tdb) == -1) {
-		errno = EIO;
 		goto fail;
 		goto fail;
 	}
 	}
 
 
-	if (tdb_flist_init(tdb) == -1)
+	if (tdb_ftable_init(tdb) == -1)
 		goto fail;
 		goto fail;
 
 
 	tdb->next = tdbs;
 	tdb->next = tdbs;
@@ -329,10 +332,30 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
 	return tdb;
 	return tdb;
 
 
  fail:
  fail:
-	save_errno = errno;
-
-	if (!tdb)
-		return NULL;
+	/* Map ecode to some logical errno. */
+	if (!saved_errno) {
+		switch (tdb->ecode) {
+		case TDB_ERR_CORRUPT:
+		case TDB_ERR_IO:
+			saved_errno = EIO;
+			break;
+		case TDB_ERR_LOCK:
+			saved_errno = EWOULDBLOCK;
+			break;
+		case TDB_ERR_OOM:
+			saved_errno = ENOMEM;
+			break;
+		case TDB_ERR_EINVAL:
+			saved_errno = EINVAL;
+			break;
+		case TDB_ERR_NESTING:
+			saved_errno = EBUSY;
+			break;
+		default:
+			saved_errno = EINVAL;
+			break;
+		}
+	}
 
 
 #ifdef TDB_TRACE
 #ifdef TDB_TRACE
 	close(tdb->tracefd);
 	close(tdb->tracefd);
@@ -346,15 +369,14 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
 	free((char *)tdb->name);
 	free((char *)tdb->name);
 	if (tdb->fd != -1)
 	if (tdb->fd != -1)
 		if (close(tdb->fd) != 0)
 		if (close(tdb->fd) != 0)
-			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-				 "tdb_open: failed to close tdb->fd"
-				 " on error!\n");
+			tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
+				   "tdb_open: failed to close tdb->fd"
+				   " on error!");
 	free(tdb);
 	free(tdb);
-	errno = save_errno;
+	errno = saved_errno;
 	return NULL;
 	return NULL;
 }
 }
 
 
-/* FIXME: modify, don't rewrite! */
 static int update_rec_hdr(struct tdb_context *tdb,
 static int update_rec_hdr(struct tdb_context *tdb,
 			  tdb_off_t off,
 			  tdb_off_t off,
 			  tdb_len_t keylen,
 			  tdb_len_t keylen,
@@ -364,7 +386,8 @@ static int update_rec_hdr(struct tdb_context *tdb,
 {
 {
 	uint64_t dataroom = rec_data_length(rec) + rec_extra_padding(rec);
 	uint64_t dataroom = rec_data_length(rec) + rec_extra_padding(rec);
 
 
-	if (set_header(tdb, rec, keylen, datalen, keylen + dataroom, h))
+	if (set_header(tdb, rec, TDB_USED_MAGIC, keylen, datalen,
+		       keylen + dataroom, h))
 		return -1;
 		return -1;
 
 
 	return tdb_write_convert(tdb, off, rec, sizeof(*rec));
 	return tdb_write_convert(tdb, off, rec, sizeof(*rec));
@@ -380,12 +403,14 @@ static int replace_data(struct tdb_context *tdb,
 	tdb_off_t new_off;
 	tdb_off_t new_off;
 
 
 	/* Allocate a new record. */
 	/* Allocate a new record. */
-	new_off = alloc(tdb, key.dsize, dbuf.dsize, h->h, growing);
+	new_off = alloc(tdb, key.dsize, dbuf.dsize, h->h, TDB_USED_MAGIC,
+			growing);
 	if (unlikely(new_off == TDB_OFF_ERR))
 	if (unlikely(new_off == TDB_OFF_ERR))
 		return -1;
 		return -1;
 
 
 	/* We didn't like the existing one: remove it. */
 	/* We didn't like the existing one: remove it. */
 	if (old_off) {
 	if (old_off) {
+		add_stat(tdb, frees, 1);
 		add_free_record(tdb, old_off,
 		add_free_record(tdb, old_off,
 				sizeof(struct tdb_used_record)
 				sizeof(struct tdb_used_record)
 				+ key.dsize + old_room);
 				+ key.dsize + old_room);
@@ -445,7 +470,6 @@ int tdb_store(struct tdb_context *tdb,
 						  h.hlock_range, F_WRLCK);
 						  h.hlock_range, F_WRLCK);
 				return 0;
 				return 0;
 			}
 			}
-			/* FIXME: See if right record is free? */
 		} else {
 		} else {
 			if (flag == TDB_MODIFY) {
 			if (flag == TDB_MODIFY) {
 				/* if the record doesn't exist and we
 				/* if the record doesn't exist and we
@@ -502,15 +526,13 @@ int tdb_append(struct tdb_context *tdb,
 					  F_WRLCK);
 					  F_WRLCK);
 			return 0;
 			return 0;
 		}
 		}
-		/* FIXME: Check right record free? */
 
 
 		/* Slow path. */
 		/* Slow path. */
 		newdata = malloc(key.dsize + old_dlen + dbuf.dsize);
 		newdata = malloc(key.dsize + old_dlen + dbuf.dsize);
 		if (!newdata) {
 		if (!newdata) {
-			tdb->ecode = TDB_ERR_OOM;
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-				 "tdb_append: cannot allocate %llu bytes!\n",
-				 (long long)key.dsize + old_dlen + dbuf.dsize);
+			tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
+				   "tdb_append: failed to allocate %zu bytes",
+				   (size_t)(key.dsize+old_dlen+dbuf.dsize));
 			goto fail;
 			goto fail;
 		}
 		}
 		if (tdb->methods->read(tdb, off + sizeof(rec) + key.dsize,
 		if (tdb->methods->read(tdb, off + sizeof(rec) + key.dsize,
@@ -582,6 +604,7 @@ int tdb_delete(struct tdb_context *tdb, struct tdb_data key)
 		goto unlock_err;
 		goto unlock_err;
 
 
 	/* Free the deleted entry. */
 	/* Free the deleted entry. */
+	add_stat(tdb, frees, 1);
 	if (add_free_record(tdb, off,
 	if (add_free_record(tdb, off,
 			    sizeof(struct tdb_used_record)
 			    sizeof(struct tdb_used_record)
 			    + rec_key_length(&rec)
 			    + rec_key_length(&rec)
@@ -602,12 +625,11 @@ int tdb_close(struct tdb_context *tdb)
 	struct tdb_context **i;
 	struct tdb_context **i;
 	int ret = 0;
 	int ret = 0;
 
 
-	/* FIXME:
+	tdb_trace(tdb, "tdb_close");
+
 	if (tdb->transaction) {
 	if (tdb->transaction) {
 		tdb_transaction_cancel(tdb);
 		tdb_transaction_cancel(tdb);
 	}
 	}
-	*/
-	tdb_trace(tdb, "tdb_close");
 
 
 	if (tdb->map_ptr) {
 	if (tdb->map_ptr) {
 		if (tdb->flags & TDB_INTERNAL)
 		if (tdb->flags & TDB_INTERNAL)
@@ -638,12 +660,12 @@ int tdb_close(struct tdb_context *tdb)
 	return ret;
 	return ret;
 }
 }
 
 
-enum TDB_ERROR tdb_error(struct tdb_context *tdb)
+enum TDB_ERROR tdb_error(const struct tdb_context *tdb)
 {
 {
 	return tdb->ecode;
 	return tdb->ecode;
 }
 }
 
 
-const char *tdb_errorstr(struct tdb_context *tdb)
+const char *tdb_errorstr(const struct tdb_context *tdb)
 {
 {
 	/* Gcc warns if you miss a case in the switch, so use that. */
 	/* Gcc warns if you miss a case in the switch, so use that. */
 	switch (tdb->ecode) {
 	switch (tdb->ecode) {
@@ -660,3 +682,38 @@ const char *tdb_errorstr(struct tdb_context *tdb)
 	}
 	}
 	return "Invalid error code";
 	return "Invalid error code";
 }
 }
+
+void COLD tdb_logerr(struct tdb_context *tdb,
+		     enum TDB_ERROR ecode,
+		     enum tdb_debug_level level,
+		     const char *fmt, ...)
+{
+	char *message;
+	va_list ap;
+	size_t len;
+	/* tdb_open paths care about errno, so save it. */
+	int saved_errno = errno;
+
+	tdb->ecode = ecode;
+
+	if (!tdb->logfn)
+		return;
+
+	/* FIXME: Doesn't assume asprintf. */
+	va_start(ap, fmt);
+	len = vsnprintf(NULL, 0, fmt, ap);
+	va_end(ap);
+
+	message = malloc(len + 1);
+	if (!message) {
+		tdb->logfn(tdb, level, tdb->log_private,
+			   "out of memory formatting message");
+		return;
+	}
+	va_start(ap, fmt);
+	len = vsprintf(message, fmt, ap);
+	va_end(ap);
+	tdb->logfn(tdb, level, tdb->log_private, message);
+	free(message);
+	errno = saved_errno;
+}

+ 34 - 5
ccan/tdb2/tdb2.h

@@ -67,7 +67,7 @@ enum TDB_ERROR {TDB_SUCCESS=0, TDB_ERR_CORRUPT, TDB_ERR_IO, TDB_ERR_LOCK,
 /* flags for tdb_summary. Logical or to combine. */
 /* flags for tdb_summary. Logical or to combine. */
 enum tdb_summary_flags { TDB_SUMMARY_HISTOGRAMS = 1 };
 enum tdb_summary_flags { TDB_SUMMARY_HISTOGRAMS = 1 };
 
 
-/* debugging uses one of the following levels */
+/* logging uses one of the following levels */
 enum tdb_debug_level {TDB_DEBUG_FATAL = 0, TDB_DEBUG_ERROR, 
 enum tdb_debug_level {TDB_DEBUG_FATAL = 0, TDB_DEBUG_ERROR, 
 		      TDB_DEBUG_WARNING, TDB_DEBUG_TRACE};
 		      TDB_DEBUG_WARNING, TDB_DEBUG_TRACE};
 
 
@@ -80,14 +80,15 @@ struct tdb_context;
 
 
 /* FIXME: Make typesafe */
 /* FIXME: Make typesafe */
 typedef int (*tdb_traverse_func)(struct tdb_context *, TDB_DATA, TDB_DATA, void *);
 typedef int (*tdb_traverse_func)(struct tdb_context *, TDB_DATA, TDB_DATA, void *);
-typedef void (*tdb_logfn_t)(struct tdb_context *, enum tdb_debug_level, void *priv, const char *, ...) PRINTF_FMT(4, 5);
+typedef void (*tdb_logfn_t)(struct tdb_context *, enum tdb_debug_level, void *, const char *);
 typedef uint64_t (*tdb_hashfn_t)(const void *key, size_t len, uint64_t seed,
 typedef uint64_t (*tdb_hashfn_t)(const void *key, size_t len, uint64_t seed,
 				 void *priv);
 				 void *priv);
 
 
 enum tdb_attribute_type {
 enum tdb_attribute_type {
 	TDB_ATTRIBUTE_LOG = 0,
 	TDB_ATTRIBUTE_LOG = 0,
 	TDB_ATTRIBUTE_HASH = 1,
 	TDB_ATTRIBUTE_HASH = 1,
-	TDB_ATTRIBUTE_SEED = 2
+	TDB_ATTRIBUTE_SEED = 2,
+	TDB_ATTRIBUTE_STATS = 3
 };
 };
 
 
 struct tdb_attribute_base {
 struct tdb_attribute_base {
@@ -112,11 +113,39 @@ struct tdb_attribute_seed {
 	uint64_t seed;
 	uint64_t seed;
 };
 };
 
 
+struct tdb_attribute_stats {
+	struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_STATS */
+	size_t size; /* = sizeof(struct tdb_attribute_stats) */
+	uint64_t allocs;
+	uint64_t   alloc_subhash;
+	uint64_t   alloc_chain;
+	uint64_t   alloc_bucket_exact;
+	uint64_t   alloc_bucket_max;
+	uint64_t   alloc_leftover;
+	uint64_t   alloc_coalesce_tried;
+	uint64_t     alloc_coalesce_lockfail;
+	uint64_t     alloc_coalesce_race;
+	uint64_t     alloc_coalesce_succeeded;
+	uint64_t        alloc_coalesce_num_merged;
+	uint64_t compares;
+	uint64_t   compare_wrong_bucket;
+	uint64_t   compare_wrong_offsetbits;
+	uint64_t   compare_wrong_keylen;
+	uint64_t   compare_wrong_rechash;
+	uint64_t   compare_wrong_keycmp;
+	uint64_t expands;
+	uint64_t frees;
+	uint64_t locks;
+	uint64_t    lock_lowlevel;
+	uint64_t    lock_nonblock;
+};
+
 union tdb_attribute {
 union tdb_attribute {
 	struct tdb_attribute_base base;
 	struct tdb_attribute_base base;
 	struct tdb_attribute_log log;
 	struct tdb_attribute_log log;
 	struct tdb_attribute_hash hash;
 	struct tdb_attribute_hash hash;
 	struct tdb_attribute_seed seed;
 	struct tdb_attribute_seed seed;
+	struct tdb_attribute_stats stats;
 };
 };
 		
 		
 struct tdb_context *tdb_open(const char *name, int tdb_flags,
 struct tdb_context *tdb_open(const char *name, int tdb_flags,
@@ -139,8 +168,8 @@ int tdb_check(struct tdb_context *tdb,
 	      int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
 	      int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
 	      void *private_data);
 	      void *private_data);
 
 
-enum TDB_ERROR tdb_error(struct tdb_context *tdb);
-const char *tdb_errorstr(struct tdb_context *tdb);
+enum TDB_ERROR tdb_error(const struct tdb_context *tdb);
+const char *tdb_errorstr(const struct tdb_context *tdb);
 
 
 int tdb_transaction_start(struct tdb_context *tdb);
 int tdb_transaction_start(struct tdb_context *tdb);
 void tdb_transaction_cancel(struct tdb_context *tdb);
 void tdb_transaction_cancel(struct tdb_context *tdb);

+ 41 - 38
ccan/tdb2/test/layout.c

@@ -23,20 +23,20 @@ static void add(struct tdb_layout *layout, union tdb_layout_elem elem)
 	layout->elem[layout->num_elems++] = elem;
 	layout->elem[layout->num_elems++] = elem;
 }
 }
 
 
-void tdb_layout_add_freelist(struct tdb_layout *layout)
+void tdb_layout_add_freetable(struct tdb_layout *layout)
 {
 {
 	union tdb_layout_elem elem;
 	union tdb_layout_elem elem;
-	elem.base.type = FREELIST;
+	elem.base.type = FREETABLE;
 	add(layout, elem);
 	add(layout, elem);
 }
 }
 
 
 void tdb_layout_add_free(struct tdb_layout *layout, tdb_len_t len,
 void tdb_layout_add_free(struct tdb_layout *layout, tdb_len_t len,
-			 unsigned flist)
+			 unsigned ftable)
 {
 {
 	union tdb_layout_elem elem;
 	union tdb_layout_elem elem;
 	elem.base.type = FREE;
 	elem.base.type = FREE;
 	elem.free.len = len;
 	elem.free.len = len;
-	elem.free.flist_num = flist;
+	elem.free.ftable_num = ftable;
 	add(layout, elem);
 	add(layout, elem);
 }
 }
 
 
@@ -82,9 +82,9 @@ static tdb_len_t hashtable_len(struct tle_hashtable *htable)
 		+ htable->extra;
 		+ htable->extra;
 }
 }
 
 
-static tdb_len_t freelist_len(struct tle_freelist *flist)
+static tdb_len_t freetable_len(struct tle_freetable *ftable)
 {
 {
-	return sizeof(struct tdb_freelist);
+	return sizeof(struct tdb_freetable);
 }
 }
 
 
 static void set_free_record(void *mem, tdb_len_t len)
 static void set_free_record(void *mem, tdb_len_t len)
@@ -97,7 +97,7 @@ static void set_data_record(void *mem, struct tdb_context *tdb,
 {
 {
 	struct tdb_used_record *u = mem;
 	struct tdb_used_record *u = mem;
 
 
-	set_header(tdb, u, used->key.dsize, used->data.dsize,
+	set_header(tdb, u, TDB_USED_MAGIC, used->key.dsize, used->data.dsize,
 		   used->key.dsize + used->data.dsize + used->extra,
 		   used->key.dsize + used->data.dsize + used->extra,
 		   tdb_hash(tdb, used->key.dptr, used->key.dsize));
 		   tdb_hash(tdb, used->key.dptr, used->key.dsize));
 	memcpy(u + 1, used->key.dptr, used->key.dsize);
 	memcpy(u + 1, used->key.dptr, used->key.dsize);
@@ -111,34 +111,36 @@ static void set_hashtable(void *mem, struct tdb_context *tdb,
 	struct tdb_used_record *u = mem;
 	struct tdb_used_record *u = mem;
 	tdb_len_t len = sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS;
 	tdb_len_t len = sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS;
 
 
-	set_header(tdb, u, 0, len, len + htable->extra, 0);
+	set_header(tdb, u, TDB_HTABLE_MAGIC, 0, len, len + htable->extra, 0);
 	memset(u + 1, 0, len);
 	memset(u + 1, 0, len);
 }
 }
 
 
-static void set_freelist(void *mem, struct tdb_context *tdb,
-			 struct tle_freelist *freelist, struct tdb_header *hdr,
-			 tdb_off_t last_flist)
+static void set_freetable(void *mem, struct tdb_context *tdb,
+			 struct tle_freetable *freetable, struct tdb_header *hdr,
+			 tdb_off_t last_ftable)
 {
 {
-	struct tdb_freelist *flist = mem;
-	memset(flist, 0, sizeof(*flist));
-	set_header(tdb, &flist->hdr, 0,
-		   sizeof(*flist) - sizeof(flist->hdr),
-		   sizeof(*flist) - sizeof(flist->hdr), 1);
-
-	if (last_flist) {
-		flist = (struct tdb_freelist *)((char *)hdr + last_flist);
-		flist->next = freelist->base.off;
+	struct tdb_freetable *ftable = mem;
+	memset(ftable, 0, sizeof(*ftable));
+	set_header(tdb, &ftable->hdr, TDB_FTABLE_MAGIC, 0,
+			sizeof(*ftable) - sizeof(ftable->hdr),
+			sizeof(*ftable) - sizeof(ftable->hdr), 0);
+
+	if (last_ftable) {
+		ftable = (struct tdb_freetable *)((char *)hdr + last_ftable);
+		ftable->next = freetable->base.off;
 	} else {
 	} else {
-		hdr->free_list = freelist->base.off;
+		hdr->free_table = freetable->base.off;
 	}
 	}
 }
 }
 
 
 static void add_to_freetable(struct tdb_context *tdb,
 static void add_to_freetable(struct tdb_context *tdb,
 			     tdb_off_t eoff,
 			     tdb_off_t eoff,
 			     tdb_off_t elen,
 			     tdb_off_t elen,
-			     struct tle_freelist *freelist)
+			     unsigned ftable,
+			     struct tle_freetable *freetable)
 {
 {
-	tdb->flist_off = freelist->base.off;
+	tdb->ftable_off = freetable->base.off;
+	tdb->ftable = ftable;
 	add_free_record(tdb, eoff, sizeof(struct tdb_used_record) + elen);
 	add_free_record(tdb, eoff, sizeof(struct tdb_used_record) + elen);
 }
 }
 
 
@@ -202,15 +204,15 @@ static void add_to_hashtable(struct tdb_context *tdb,
 	abort();
 	abort();
 }
 }
 
 
-static struct tle_freelist *find_flist(struct tdb_layout *layout, unsigned num)
+static struct tle_freetable *find_ftable(struct tdb_layout *layout, unsigned num)
 {
 {
 	unsigned i;
 	unsigned i;
 
 
 	for (i = 0; i < layout->num_elems; i++) {
 	for (i = 0; i < layout->num_elems; i++) {
-		if (layout->elem[i].base.type != FREELIST)
+		if (layout->elem[i].base.type != FREETABLE)
 			continue;
 			continue;
 		if (num == 0)
 		if (num == 0)
-			return &layout->elem[i].flist;
+			return &layout->elem[i].ftable;
 		num--;
 		num--;
 	}
 	}
 	abort();
 	abort();
@@ -220,7 +222,7 @@ static struct tle_freelist *find_flist(struct tdb_layout *layout, unsigned num)
 struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
 struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
 {
 {
 	unsigned int i;
 	unsigned int i;
-	tdb_off_t off, len, last_flist;
+	tdb_off_t off, len, last_ftable;
 	char *mem;
 	char *mem;
 	struct tdb_context *tdb;
 	struct tdb_context *tdb;
 
 
@@ -231,8 +233,8 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
 		union tdb_layout_elem *e = &layout->elem[i];
 		union tdb_layout_elem *e = &layout->elem[i];
 		e->base.off = off;
 		e->base.off = off;
 		switch (e->base.type) {
 		switch (e->base.type) {
-		case FREELIST:
-			len = freelist_len(&e->flist);
+		case FREETABLE:
+			len = freetable_len(&e->ftable);
 			break;
 			break;
 		case FREE:
 		case FREE:
 			len = free_record_len(e->free.len);
 			len = free_record_len(e->free.len);
@@ -259,14 +261,14 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
 	tdb->map_ptr = mem;
 	tdb->map_ptr = mem;
 	tdb->map_size = off;
 	tdb->map_size = off;
 
 
-	last_flist = 0;
+	last_ftable = 0;
 	for (i = 0; i < layout->num_elems; i++) {
 	for (i = 0; i < layout->num_elems; i++) {
 		union tdb_layout_elem *e = &layout->elem[i];
 		union tdb_layout_elem *e = &layout->elem[i];
 		switch (e->base.type) {
 		switch (e->base.type) {
-		case FREELIST:
-			set_freelist(mem + e->base.off, tdb, &e->flist,
-				     (struct tdb_header *)mem, last_flist);
-			last_flist = e->base.off;
+		case FREETABLE:
+			set_freetable(mem + e->base.off, tdb, &e->ftable,
+				     (struct tdb_header *)mem, last_ftable);
+			last_ftable = e->base.off;
 			break;
 			break;
 		case FREE:
 		case FREE:
 			set_free_record(mem + e->base.off, e->free.len);
 			set_free_record(mem + e->base.off, e->free.len);
@@ -279,8 +281,8 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
 			break;
 			break;
 		}
 		}
 	}
 	}
-	/* Must have a free list! */
-	assert(last_flist);
+	/* Must have a free table! */
+	assert(last_ftable);
 
 
 	/* Now fill the free and hash tables. */
 	/* Now fill the free and hash tables. */
 	for (i = 0; i < layout->num_elems; i++) {
 	for (i = 0; i < layout->num_elems; i++) {
@@ -288,7 +290,8 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
 		switch (e->base.type) {
 		switch (e->base.type) {
 		case FREE:
 		case FREE:
 			add_to_freetable(tdb, e->base.off, e->free.len,
 			add_to_freetable(tdb, e->base.off, e->free.len,
-					 find_flist(layout, e->free.flist_num));
+					 e->free.ftable_num,
+					 find_ftable(layout, e->free.ftable_num));
 			break;
 			break;
 		case DATA:
 		case DATA:
 			add_to_hashtable(tdb, e->base.off, e->used.key);
 			add_to_hashtable(tdb, e->base.off, e->used.key);
@@ -298,7 +301,7 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
 		}
 		}
 	}
 	}
 
 
-	tdb->flist_off = find_flist(layout, 0)->base.off;
+	tdb->ftable_off = find_ftable(layout, 0)->base.off;
 
 
 	/* Get physical if they asked for it. */
 	/* Get physical if they asked for it. */
 	if (layout->filename) {
 	if (layout->filename) {

+ 6 - 6
ccan/tdb2/test/layout.h

@@ -3,9 +3,9 @@
 #include <ccan/tdb2/private.h>
 #include <ccan/tdb2/private.h>
 
 
 struct tdb_layout *new_tdb_layout(const char *filename);
 struct tdb_layout *new_tdb_layout(const char *filename);
-void tdb_layout_add_freelist(struct tdb_layout *layout);
+void tdb_layout_add_freetable(struct tdb_layout *layout);
 void tdb_layout_add_free(struct tdb_layout *layout, tdb_len_t len,
 void tdb_layout_add_free(struct tdb_layout *layout, tdb_len_t len,
-			 unsigned flist);
+			 unsigned ftable);
 void tdb_layout_add_used(struct tdb_layout *layout,
 void tdb_layout_add_used(struct tdb_layout *layout,
 			 TDB_DATA key, TDB_DATA data,
 			 TDB_DATA key, TDB_DATA data,
 			 tdb_len_t extra);
 			 tdb_len_t extra);
@@ -18,7 +18,7 @@ void tdb_layout_add_hashtable(struct tdb_layout *layout,
 struct tdb_context *tdb_layout_get(struct tdb_layout *layout);
 struct tdb_context *tdb_layout_get(struct tdb_layout *layout);
 
 
 enum layout_type {
 enum layout_type {
-	FREELIST, FREE, DATA, HASHTABLE,
+	FREETABLE, FREE, DATA, HASHTABLE,
 };
 };
 
 
 /* Shared by all union members. */
 /* Shared by all union members. */
@@ -27,14 +27,14 @@ struct tle_base {
 	tdb_off_t off;
 	tdb_off_t off;
 };
 };
 
 
-struct tle_freelist {
+struct tle_freetable {
 	struct tle_base base;
 	struct tle_base base;
 };
 };
 
 
 struct tle_free {
 struct tle_free {
 	struct tle_base base;
 	struct tle_base base;
 	tdb_len_t len;
 	tdb_len_t len;
-	unsigned flist_num;
+	unsigned ftable_num;
 };
 };
 
 
 struct tle_used {
 struct tle_used {
@@ -53,7 +53,7 @@ struct tle_hashtable {
 
 
 union tdb_layout_elem {
 union tdb_layout_elem {
 	struct tle_base base;
 	struct tle_base base;
-	struct tle_freelist flist;
+	struct tle_freetable ftable;
 	struct tle_free free;
 	struct tle_free free;
 	struct tle_used used;
 	struct tle_used used;
 	struct tle_hashtable hashtable;
 	struct tle_hashtable hashtable;

+ 2 - 15
ccan/tdb2/test/logging.c

@@ -1,7 +1,5 @@
-#define _GNU_SOURCE
 #include <stdio.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdlib.h>
-#include <stdarg.h>
 #include <ccan/tap/tap.h>
 #include <ccan/tap/tap.h>
 #include "logging.h"
 #include "logging.h"
 
 
@@ -16,24 +14,13 @@ union tdb_attribute tap_log_attr = {
 
 
 void tap_log_fn(struct tdb_context *tdb,
 void tap_log_fn(struct tdb_context *tdb,
 		enum tdb_debug_level level, void *priv,
 		enum tdb_debug_level level, void *priv,
-		const char *fmt, ...)
+		const char *message)
 {
 {
-	va_list ap;
-	char *p;
-
 	if (suppress_logging)
 	if (suppress_logging)
 		return;
 		return;
 
 
-	va_start(ap, fmt);
-	if (vasprintf(&p, fmt, ap) == -1)
-		abort();
-	/* Strip trailing \n: diag adds it. */
-	if (p[strlen(p)-1] == '\n')
-		p[strlen(p)-1] = '\0';
-	diag("tdb log level %u: %s%s", level, log_prefix, p);
-	free(p);
+	diag("tdb log level %u: %s%s", level, log_prefix, message);
 	if (level != TDB_DEBUG_TRACE)
 	if (level != TDB_DEBUG_TRACE)
 		tap_log_messages++;
 		tap_log_messages++;
-	va_end(ap);
 }
 }
 
 

+ 1 - 1
ccan/tdb2/test/logging.h

@@ -11,7 +11,7 @@ extern union tdb_attribute tap_log_attr;
 
 
 void tap_log_fn(struct tdb_context *tdb,
 void tap_log_fn(struct tdb_context *tdb,
 		enum tdb_debug_level level, void *priv,
 		enum tdb_debug_level level, void *priv,
-		const char *fmt, ...);
+		const char *message);
 
 
 static inline bool data_equal(struct tdb_data a, struct tdb_data b)
 static inline bool data_equal(struct tdb_data a, struct tdb_data b)
 {
 {

+ 8 - 6
ccan/tdb2/test/run-001-encode.c

@@ -12,18 +12,20 @@ int main(int argc, char *argv[])
 {
 {
 	unsigned int i;
 	unsigned int i;
 	struct tdb_used_record rec;
 	struct tdb_used_record rec;
-	struct tdb_context tdb = { .log = tap_log_fn, .log_priv = NULL };
+	struct tdb_context tdb = { .logfn = tap_log_fn };
 
 
 	plan_tests(64 + 32 + 48*6 + 1);
 	plan_tests(64 + 32 + 48*6 + 1);
 
 
 	/* We should be able to encode any data value. */
 	/* We should be able to encode any data value. */
 	for (i = 0; i < 64; i++)
 	for (i = 0; i < 64; i++)
-		ok1(set_header(&tdb, &rec, 0, 1ULL << i, 1ULL << i, 0) == 0);
+		ok1(set_header(&tdb, &rec, TDB_USED_MAGIC, 0, 1ULL << i,
+			       1ULL << i, 0) == 0);
 
 
 	/* And any key and data with < 64 bits between them. */
 	/* And any key and data with < 64 bits between them. */
 	for (i = 0; i < 32; i++) {
 	for (i = 0; i < 32; i++) {
 		tdb_len_t dlen = 1ULL >> (63 - i), klen = 1ULL << i;
 		tdb_len_t dlen = 1ULL >> (63 - i), klen = 1ULL << i;
-		ok1(set_header(&tdb, &rec, klen, dlen, klen + dlen, 0) == 0);
+		ok1(set_header(&tdb, &rec, TDB_USED_MAGIC, klen, dlen,
+			       klen + dlen, 0)  == 0);
 	}
 	}
 
 
 	/* We should neatly encode all values. */
 	/* We should neatly encode all values. */
@@ -32,13 +34,13 @@ int main(int argc, char *argv[])
 		uint64_t klen = 1ULL << (i < 16 ? i : 15);
 		uint64_t klen = 1ULL << (i < 16 ? i : 15);
 		uint64_t dlen = 1ULL << i;
 		uint64_t dlen = 1ULL << i;
 		uint64_t xlen = 1ULL << (i < 32 ? i : 31);
 		uint64_t xlen = 1ULL << (i < 32 ? i : 31);
-		ok1(set_header(&tdb, &rec, klen, dlen, klen + dlen + xlen, h)
-		    == 0);
+		ok1(set_header(&tdb, &rec, TDB_USED_MAGIC, klen, dlen,
+			       klen+dlen+xlen, h) == 0);
 		ok1(rec_key_length(&rec) == klen);
 		ok1(rec_key_length(&rec) == klen);
 		ok1(rec_data_length(&rec) == dlen);
 		ok1(rec_data_length(&rec) == dlen);
 		ok1(rec_extra_padding(&rec) == xlen);
 		ok1(rec_extra_padding(&rec) == xlen);
 		ok1((uint64_t)rec_hash(&rec) == h);
 		ok1((uint64_t)rec_hash(&rec) == h);
-		ok1(rec_magic(&rec) == TDB_MAGIC);
+		ok1(rec_magic(&rec) == TDB_USED_MAGIC);
 	}
 	}
 	ok1(tap_log_messages == 0);
 	ok1(tap_log_messages == 0);
 	return exit_status();
 	return exit_status();

+ 11 - 11
ccan/tdb2/test/run-03-coalesce.c

@@ -17,7 +17,7 @@ static tdb_len_t free_record_length(struct tdb_context *tdb, tdb_off_t off)
 		return TDB_OFF_ERR;
 		return TDB_OFF_ERR;
 	if (frec_magic(&f) != TDB_FREE_MAGIC)
 	if (frec_magic(&f) != TDB_FREE_MAGIC)
 		return TDB_OFF_ERR;
 		return TDB_OFF_ERR;
-	return f.data_len;
+	return frec_len(&f);
 }
 }
 
 
 int main(int argc, char *argv[])
 int main(int argc, char *argv[])
@@ -38,7 +38,7 @@ int main(int argc, char *argv[])
 
 
 	/* No coalescing can be done due to EOF */
 	/* No coalescing can be done due to EOF */
 	layout = new_tdb_layout(NULL);
 	layout = new_tdb_layout(NULL);
-	tdb_layout_add_freelist(layout);
+	tdb_layout_add_freetable(layout);
 	len = 1024;
 	len = 1024;
 	tdb_layout_add_free(layout, len, 0);
 	tdb_layout_add_free(layout, len, 0);
 	tdb = tdb_layout_get(layout);
 	tdb = tdb_layout_get(layout);
@@ -46,7 +46,7 @@ int main(int argc, char *argv[])
 	ok1(free_record_length(tdb, layout->elem[1].base.off) == len);
 	ok1(free_record_length(tdb, layout->elem[1].base.off) == len);
 
 
 	/* Figure out which bucket free entry is. */
 	/* Figure out which bucket free entry is. */
-	b_off = bucket_off(tdb->flist_off, size_to_bucket(len));
+	b_off = bucket_off(tdb->ftable_off, size_to_bucket(len));
 	/* Lock and fail to coalesce. */
 	/* Lock and fail to coalesce. */
 	ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
 	ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
 	ok1(coalesce(tdb, layout->elem[1].base.off, b_off, len) == 0);
 	ok1(coalesce(tdb, layout->elem[1].base.off, b_off, len) == 0);
@@ -57,7 +57,7 @@ int main(int argc, char *argv[])
 
 
 	/* No coalescing can be done due to used record */
 	/* No coalescing can be done due to used record */
 	layout = new_tdb_layout(NULL);
 	layout = new_tdb_layout(NULL);
-	tdb_layout_add_freelist(layout);
+	tdb_layout_add_freetable(layout);
 	tdb_layout_add_free(layout, 1024, 0);
 	tdb_layout_add_free(layout, 1024, 0);
 	tdb_layout_add_used(layout, key, data, 6);
 	tdb_layout_add_used(layout, key, data, 6);
 	tdb = tdb_layout_get(layout);
 	tdb = tdb_layout_get(layout);
@@ -65,7 +65,7 @@ int main(int argc, char *argv[])
 	ok1(tdb_check(tdb, NULL, NULL) == 0);
 	ok1(tdb_check(tdb, NULL, NULL) == 0);
 
 
 	/* Figure out which bucket free entry is. */
 	/* Figure out which bucket free entry is. */
-	b_off = bucket_off(tdb->flist_off, size_to_bucket(1024));
+	b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
 	/* Lock and fail to coalesce. */
 	/* Lock and fail to coalesce. */
 	ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
 	ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
 	ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024) == 0);
 	ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024) == 0);
@@ -76,7 +76,7 @@ int main(int argc, char *argv[])
 
 
 	/* Coalescing can be done due to two free records, then EOF */
 	/* Coalescing can be done due to two free records, then EOF */
 	layout = new_tdb_layout(NULL);
 	layout = new_tdb_layout(NULL);
-	tdb_layout_add_freelist(layout);
+	tdb_layout_add_freetable(layout);
 	tdb_layout_add_free(layout, 1024, 0);
 	tdb_layout_add_free(layout, 1024, 0);
 	tdb_layout_add_free(layout, 2048, 0);
 	tdb_layout_add_free(layout, 2048, 0);
 	tdb = tdb_layout_get(layout);
 	tdb = tdb_layout_get(layout);
@@ -85,7 +85,7 @@ int main(int argc, char *argv[])
 	ok1(tdb_check(tdb, NULL, NULL) == 0);
 	ok1(tdb_check(tdb, NULL, NULL) == 0);
 
 
 	/* Figure out which bucket (first) free entry is. */
 	/* Figure out which bucket (first) free entry is. */
-	b_off = bucket_off(tdb->flist_off, size_to_bucket(1024));
+	b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
 	/* Lock and coalesce. */
 	/* Lock and coalesce. */
 	ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
 	ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
 	ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024) == 1);
 	ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024) == 1);
@@ -97,7 +97,7 @@ int main(int argc, char *argv[])
 
 
 	/* Coalescing can be done due to two free records, then data */
 	/* Coalescing can be done due to two free records, then data */
 	layout = new_tdb_layout(NULL);
 	layout = new_tdb_layout(NULL);
-	tdb_layout_add_freelist(layout);
+	tdb_layout_add_freetable(layout);
 	tdb_layout_add_free(layout, 1024, 0);
 	tdb_layout_add_free(layout, 1024, 0);
 	tdb_layout_add_free(layout, 512, 0);
 	tdb_layout_add_free(layout, 512, 0);
 	tdb_layout_add_used(layout, key, data, 6);
 	tdb_layout_add_used(layout, key, data, 6);
@@ -107,7 +107,7 @@ int main(int argc, char *argv[])
 	ok1(tdb_check(tdb, NULL, NULL) == 0);
 	ok1(tdb_check(tdb, NULL, NULL) == 0);
 
 
 	/* Figure out which bucket free entry is. */
 	/* Figure out which bucket free entry is. */
-	b_off = bucket_off(tdb->flist_off, size_to_bucket(1024));
+	b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
 	/* Lock and coalesce. */
 	/* Lock and coalesce. */
 	ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
 	ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
 	ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024) == 1);
 	ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024) == 1);
@@ -119,7 +119,7 @@ int main(int argc, char *argv[])
 
 
 	/* Coalescing can be done due to three free records, then EOF */
 	/* Coalescing can be done due to three free records, then EOF */
 	layout = new_tdb_layout(NULL);
 	layout = new_tdb_layout(NULL);
-	tdb_layout_add_freelist(layout);
+	tdb_layout_add_freetable(layout);
 	tdb_layout_add_free(layout, 1024, 0);
 	tdb_layout_add_free(layout, 1024, 0);
 	tdb_layout_add_free(layout, 512, 0);
 	tdb_layout_add_free(layout, 512, 0);
 	tdb_layout_add_free(layout, 256, 0);
 	tdb_layout_add_free(layout, 256, 0);
@@ -130,7 +130,7 @@ int main(int argc, char *argv[])
 	ok1(tdb_check(tdb, NULL, NULL) == 0);
 	ok1(tdb_check(tdb, NULL, NULL) == 0);
 
 
 	/* Figure out which bucket free entry is. */
 	/* Figure out which bucket free entry is. */
-	b_off = bucket_off(tdb->flist_off, size_to_bucket(1024));
+	b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
 	/* Lock and coalesce. */
 	/* Lock and coalesce. */
 	ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
 	ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
 	ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024) == 1);
 	ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024) == 1);

+ 4 - 2
ccan/tdb2/test/run-04-basichash.c

@@ -65,7 +65,8 @@ int main(int argc, char *argv[])
 		/* FIXME: Check lock length */
 		/* FIXME: Check lock length */
 
 
 		/* Allocate a new record. */
 		/* Allocate a new record. */
-		new_off = alloc(tdb, key.dsize, dbuf.dsize, h.h, false);
+		new_off = alloc(tdb, key.dsize, dbuf.dsize, h.h,
+				TDB_USED_MAGIC, false);
 		ok1(new_off != TDB_OFF_ERR);
 		ok1(new_off != TDB_OFF_ERR);
 
 
 		/* We should be able to add it now. */
 		/* We should be able to add it now. */
@@ -225,7 +226,8 @@ int main(int argc, char *argv[])
 
 
 		/* We should be able to add it now. */
 		/* We should be able to add it now. */
 		/* Allocate a new record. */
 		/* Allocate a new record. */
-		new_off = alloc(tdb, key.dsize, dbuf.dsize, h.h, false);
+		new_off = alloc(tdb, key.dsize, dbuf.dsize, h.h,
+				TDB_USED_MAGIC, false);
 		ok1(new_off != TDB_OFF_ERR);
 		ok1(new_off != TDB_OFF_ERR);
 		ok1(add_to_hash(tdb, &h, new_off) == 0);
 		ok1(add_to_hash(tdb, &h, new_off) == 0);
 
 

+ 117 - 0
ccan/tdb2/test/run-25-hashoverload.c

@@ -0,0 +1,117 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/traverse.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+static uint64_t badhash(const void *key, size_t len, uint64_t seed, void *priv)
+{
+	return 0;
+}
+
+static int trav(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, void *p)
+{
+	if (p)
+		return tdb_delete(tdb, key);
+	return 0;
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i, j;
+	struct tdb_context *tdb;
+	struct tdb_data key = { (unsigned char *)&j, sizeof(j) };
+	struct tdb_data dbuf = { (unsigned char *)&j, sizeof(j) };
+	union tdb_attribute hattr = { .hash = { .base = { TDB_ATTRIBUTE_HASH },
+						.hash_fn = badhash } };
+	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+			TDB_NOMMAP|TDB_CONVERT,
+	};
+
+	hattr.base.next = &tap_log_attr;
+
+	plan_tests(5395);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		struct tdb_data d;
+
+		tdb = tdb_open("run-25-hashoverload.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
+		ok1(tdb);
+		if (!tdb)
+			continue;
+
+		/* Fill a group. */
+		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) {
+			ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
+		}
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+		/* Now store one last value: should form chain. */
+		ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+		/* Check we can find them all. */
+		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS) + 1; j++) {
+			d = tdb_fetch(tdb, key);
+			ok1(d.dsize == sizeof(j));
+			ok1(d.dptr != NULL);
+			ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0);
+		}
+
+		/* Now add a *lot* more. */
+		for (j = (1 << TDB_HASH_GROUP_BITS) + 1;
+		     j < (16 << TDB_HASH_GROUP_BITS);
+		     j++) {
+			ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
+			d = tdb_fetch(tdb, key);
+			ok1(d.dsize == sizeof(j));
+			ok1(d.dptr != NULL);
+			ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0);
+		}
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+		/* Traverse through them. */
+		ok1(tdb_traverse(tdb, trav, NULL) == j);
+
+		/* Empty the first chain-worth. */
+		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++)
+			ok1(tdb_delete(tdb, key) == 0);
+
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+		for (j = (1 << TDB_HASH_GROUP_BITS);
+		     j < (16 << TDB_HASH_GROUP_BITS);
+		     j++) {
+			d = tdb_fetch(tdb, key);
+			ok1(d.dsize == sizeof(j));
+			ok1(d.dptr != NULL);
+			ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0);
+		}
+
+		/* Traverse through them. */
+		ok1(tdb_traverse(tdb, trav, NULL)
+		    == (15 << TDB_HASH_GROUP_BITS));
+
+		/* Re-add */
+		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) {
+			ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
+		}
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+		/* Now try deleting as we go. */
+		ok1(tdb_traverse(tdb, trav, trav)
+		    == (16 << TDB_HASH_GROUP_BITS));
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+		ok1(tdb_traverse(tdb, trav, NULL) == 0);
+		tdb_close(tdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}

+ 8 - 8
ccan/tdb2/test/run-30-exhaust-before-expand.c

@@ -9,13 +9,13 @@
 #include <err.h>
 #include <err.h>
 #include "logging.h"
 #include "logging.h"
 
 
-static bool empty_freelist(struct tdb_context *tdb)
+static bool empty_freetable(struct tdb_context *tdb)
 {
 {
-	struct tdb_freelist free;
+	struct tdb_freetable free;
 	unsigned int i;
 	unsigned int i;
 
 
-	/* Now, free list should be completely exhausted in zone 0 */
-	if (tdb_read_convert(tdb, tdb->flist_off, &free, sizeof(free)) != 0)
+	/* Now, free table should be completely exhausted in zone 0 */
+	if (tdb_read_convert(tdb, tdb->ftable_off, &free, sizeof(free)) != 0)
 		abort();
 		abort();
 
 
 	for (i = 0; i < sizeof(free.buckets)/sizeof(free.buckets[0]); i++) {
 	for (i = 0; i < sizeof(free.buckets)/sizeof(free.buckets[0]); i++) {
@@ -50,26 +50,26 @@ int main(int argc, char *argv[])
 		if (!tdb)
 		if (!tdb)
 			continue;
 			continue;
 
 
-		ok1(empty_freelist(tdb));
+		ok1(empty_freetable(tdb));
 		/* Need some hash lock for expand. */
 		/* Need some hash lock for expand. */
 		ok1(tdb_lock_hashes(tdb, 0, 1, F_WRLCK, TDB_LOCK_WAIT) == 0);
 		ok1(tdb_lock_hashes(tdb, 0, 1, F_WRLCK, TDB_LOCK_WAIT) == 0);
 		/* Create some free space. */
 		/* Create some free space. */
 		ok1(tdb_expand(tdb, 1) == 0);
 		ok1(tdb_expand(tdb, 1) == 0);
 		ok1(tdb_unlock_hashes(tdb, 0, 1, F_WRLCK) == 0);
 		ok1(tdb_unlock_hashes(tdb, 0, 1, F_WRLCK) == 0);
 		ok1(tdb_check(tdb, NULL, NULL) == 0);
 		ok1(tdb_check(tdb, NULL, NULL) == 0);
-		ok1(!empty_freelist(tdb));
+		ok1(!empty_freetable(tdb));
 
 
 		size = tdb->map_size;
 		size = tdb->map_size;
 		/* Insert minimal-length records until we expand. */
 		/* Insert minimal-length records until we expand. */
 		for (j = 0; tdb->map_size == size; j++) {
 		for (j = 0; tdb->map_size == size; j++) {
-			was_empty = empty_freelist(tdb);
+			was_empty = empty_freetable(tdb);
 			if (tdb_store(tdb, k, k, TDB_INSERT) != 0)
 			if (tdb_store(tdb, k, k, TDB_INSERT) != 0)
 				err(1, "Failed to store record %i", j);
 				err(1, "Failed to store record %i", j);
 		}
 		}
 
 
 		/* Would have been empty before expansion, but no longer. */
 		/* Would have been empty before expansion, but no longer. */
 		ok1(was_empty);
 		ok1(was_empty);
-		ok1(!empty_freelist(tdb));
+		ok1(!empty_freetable(tdb));
 		tdb_close(tdb);
 		tdb_close(tdb);
 	}
 	}
 
 

+ 17 - 13
ccan/tdb2/test/run-50-multiple-freelists.c

@@ -22,11 +22,11 @@ int main(int argc, char *argv[])
 	data.dsize = 5;
 	data.dsize = 5;
 	key.dsize = 5;
 	key.dsize = 5;
 
 
-	/* Create a TDB with three free lists. */
+	/* Create a TDB with three free tables. */
 	layout = new_tdb_layout(NULL);
 	layout = new_tdb_layout(NULL);
-	tdb_layout_add_freelist(layout);
-	tdb_layout_add_freelist(layout);
-	tdb_layout_add_freelist(layout);
+	tdb_layout_add_freetable(layout);
+	tdb_layout_add_freetable(layout);
+	tdb_layout_add_freetable(layout);
 	tdb_layout_add_free(layout, 80, 0);
 	tdb_layout_add_free(layout, 80, 0);
 	/* Used record prevent coalescing. */
 	/* Used record prevent coalescing. */
 	tdb_layout_add_used(layout, key, data, 6);
 	tdb_layout_add_used(layout, key, data, 6);
@@ -40,24 +40,28 @@ int main(int argc, char *argv[])
 	tdb = tdb_layout_get(layout);
 	tdb = tdb_layout_get(layout);
 	ok1(tdb_check(tdb, NULL, NULL) == 0);
 	ok1(tdb_check(tdb, NULL, NULL) == 0);
 
 
-	off = get_free(tdb, 0, 80 - sizeof(struct tdb_used_record), 0, 0);
+	off = get_free(tdb, 0, 80 - sizeof(struct tdb_used_record), 0,
+		       TDB_USED_MAGIC, 0);
 	ok1(off == layout->elem[3].base.off);
 	ok1(off == layout->elem[3].base.off);
-	ok1(tdb->flist_off == layout->elem[0].base.off);
+	ok1(tdb->ftable_off == layout->elem[0].base.off);
 
 
-	off = get_free(tdb, 0, 160 - sizeof(struct tdb_used_record), 0, 0);
+	off = get_free(tdb, 0, 160 - sizeof(struct tdb_used_record), 0,
+		       TDB_USED_MAGIC, 0);
 	ok1(off == layout->elem[5].base.off);
 	ok1(off == layout->elem[5].base.off);
-	ok1(tdb->flist_off == layout->elem[1].base.off);
+	ok1(tdb->ftable_off == layout->elem[1].base.off);
 
 
-	off = get_free(tdb, 0, 320 - sizeof(struct tdb_used_record), 0, 0);
+	off = get_free(tdb, 0, 320 - sizeof(struct tdb_used_record), 0,
+		       TDB_USED_MAGIC, 0);
 	ok1(off == layout->elem[7].base.off);
 	ok1(off == layout->elem[7].base.off);
-	ok1(tdb->flist_off == layout->elem[2].base.off);
+	ok1(tdb->ftable_off == layout->elem[2].base.off);
 
 
-	off = get_free(tdb, 0, 40 - sizeof(struct tdb_used_record), 0, 0);
+	off = get_free(tdb, 0, 40 - sizeof(struct tdb_used_record), 0,
+		       TDB_USED_MAGIC, 0);
 	ok1(off == layout->elem[9].base.off);
 	ok1(off == layout->elem[9].base.off);
-	ok1(tdb->flist_off == layout->elem[0].base.off);
+	ok1(tdb->ftable_off == layout->elem[0].base.off);
 
 
 	/* Now we fail. */
 	/* Now we fail. */
-	off = get_free(tdb, 0, 0, 1, 0);
+	off = get_free(tdb, 0, 0, 1, TDB_USED_MAGIC, 0);
 	ok1(off == 0);
 	ok1(off == 0);
 
 
 	tdb_close(tdb);
 	tdb_close(tdb);

+ 1 - 1
ccan/tdb2/test/run-seed.c

@@ -13,7 +13,7 @@ static int log_count = 0;
 /* Normally we get a log when setting random seed. */
 /* Normally we get a log when setting random seed. */
 static void my_log_fn(struct tdb_context *tdb,
 static void my_log_fn(struct tdb_context *tdb,
 		      enum tdb_debug_level level, void *priv,
 		      enum tdb_debug_level level, void *priv,
-		      const char *fmt, ...)
+		      const char *message)
 {
 {
 	log_count++;
 	log_count++;
 }
 }

+ 6 - 2
ccan/tdb2/test/run-traverse.c

@@ -56,7 +56,6 @@ static int trav(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, void *p)
 		td->high = val;
 		td->high = val;
 
 
 	if (td->delete) {
 	if (td->delete) {
-
 		if (tdb_delete(tdb, key) != 0) {
 		if (tdb_delete(tdb, key) != 0) {
 			td->delete_error = tdb_error(tdb);
 			td->delete_error = tdb_error(tdb);
 			return -1;
 			return -1;
@@ -120,7 +119,7 @@ int main(int argc, char *argv[])
 
 
 	hattr.base.next = &tap_log_attr;
 	hattr.base.next = &tap_log_attr;
 
 
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * 50 + 1);
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 53 + 1);
 	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
 	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
 		tdb = tdb_open("run-traverse.tdb", flags[i],
 		tdb = tdb_open("run-traverse.tdb", flags[i],
 			       O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
 			       O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
@@ -182,6 +181,7 @@ int main(int argc, char *argv[])
 		ok1(td.low <= NUM_RECORDS / 2);
 		ok1(td.low <= NUM_RECORDS / 2);
 		ok1(td.high > NUM_RECORDS / 2);
 		ok1(td.high > NUM_RECORDS / 2);
 		ok1(tdb_check(tdb, NULL, NULL) == 0);
 		ok1(tdb_check(tdb, NULL, NULL) == 0);
+		ok1(tap_log_messages == 0);
 
 
 		/* Growing traverse.  Expect failure on r/o traverse. */
 		/* Growing traverse.  Expect failure on r/o traverse. */
 		tgd.calls = 0;
 		tgd.calls = 0;
@@ -193,6 +193,8 @@ int main(int argc, char *argv[])
 		ok1(tgd.error == TDB_ERR_RDONLY);
 		ok1(tgd.error == TDB_ERR_RDONLY);
 		ok1(tgd.calls == 1);
 		ok1(tgd.calls == 1);
 		ok1(!tgd.mismatch);
 		ok1(!tgd.mismatch);
+		ok1(tap_log_messages == 1);
+		tap_log_messages = 0;
 		ok1(tdb_check(tdb, NULL, NULL) == 0);
 		ok1(tdb_check(tdb, NULL, NULL) == 0);
 
 
 		/* Deleting traverse.  Expect failure on r/o traverse. */
 		/* Deleting traverse.  Expect failure on r/o traverse. */
@@ -209,6 +211,8 @@ int main(int argc, char *argv[])
 		ok1(!td.mismatch);
 		ok1(!td.mismatch);
 		ok1(td.calls == 1);
 		ok1(td.calls == 1);
 		ok1(td.low == td.high);
 		ok1(td.low == td.high);
+		ok1(tap_log_messages == 1);
+		tap_log_messages = 0;
 		ok1(tdb_check(tdb, NULL, NULL) == 0);
 		ok1(tdb_check(tdb, NULL, NULL) == 0);
 
 
 		/* Deleting traverse (delete everything). */
 		/* Deleting traverse (delete everything). */

+ 4 - 3
ccan/tdb2/tools/Makefile

@@ -1,12 +1,13 @@
 OBJS:=../../tdb2.o ../../hash.o ../../tally.o
 OBJS:=../../tdb2.o ../../hash.o ../../tally.o
-CFLAGS:=-I../../.. -Wall -g #-g -O3 #-g -pg
+CFLAGS:=-I../../.. -Wall -g -O3 #-g -pg
 LDFLAGS:=-L../../..
 LDFLAGS:=-L../../..
 
 
-default: tdbtorture tdbtool mktdb
+default: tdbtorture tdbtool mktdb speed
 
 
 tdbtorture: tdbtorture.c $(OBJS)
 tdbtorture: tdbtorture.c $(OBJS)
 tdbtool: tdbtool.c $(OBJS)
 tdbtool: tdbtool.c $(OBJS)
 mktdb: mktdb.c $(OBJS)
 mktdb: mktdb.c $(OBJS)
+speed: speed.c $(OBJS)
 
 
 clean:
 clean:
-	rm -f tdbtorture tdbtool mktdb
+	rm -f tdbtorture tdbtool mktdb speed

+ 331 - 0
ccan/tdb2/tools/speed.c

@@ -0,0 +1,331 @@
+/* Simple speed test for TDB */
+#include <err.h>
+#include <time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <ccan/tdb2/tdb2.h>
+
+/* Nanoseconds per operation */
+static size_t normalize(const struct timeval *start,
+			const struct timeval *stop,
+			unsigned int num)
+{
+	struct timeval diff;
+
+	timersub(stop, start, &diff);
+
+	/* Floating point is more accurate here. */
+	return (double)(diff.tv_sec * 1000000 + diff.tv_usec)
+		/ num * 1000;
+}
+
+static size_t file_size(void)
+{
+	struct stat st;
+
+	if (stat("/tmp/speed.tdb", &st) != 0)
+		return -1;
+	return st.st_size;
+}
+
+static int count_record(struct tdb_context *tdb,
+			TDB_DATA key, TDB_DATA data, void *p)
+{
+	int *total = p;
+	*total += *(int *)data.dptr;
+	return 0;
+}
+
+static void dump_and_clear_stats(struct tdb_attribute_stats *stats)
+{
+	printf("allocs = %llu\n",
+	       (unsigned long long)stats->allocs);
+	printf("  alloc_subhash = %llu\n",
+	       (unsigned long long)stats->alloc_subhash);
+	printf("  alloc_chain = %llu\n",
+	       (unsigned long long)stats->alloc_chain);
+	printf("  alloc_bucket_exact = %llu\n",
+	       (unsigned long long)stats->alloc_bucket_exact);
+	printf("  alloc_bucket_max = %llu\n",
+	       (unsigned long long)stats->alloc_bucket_max);
+	printf("  alloc_leftover = %llu\n",
+	       (unsigned long long)stats->alloc_leftover);
+	printf("  alloc_coalesce_tried = %llu\n",
+	       (unsigned long long)stats->alloc_coalesce_tried);
+	printf("    alloc_coalesce_lockfail = %llu\n",
+	       (unsigned long long)stats->alloc_coalesce_lockfail);
+	printf("    alloc_coalesce_race = %llu\n",
+	       (unsigned long long)stats->alloc_coalesce_race);
+	printf("    alloc_coalesce_succeeded = %llu\n",
+	       (unsigned long long)stats->alloc_coalesce_succeeded);
+	printf("       alloc_coalesce_num_merged = %llu\n",
+	       (unsigned long long)stats->alloc_coalesce_num_merged);
+	printf("compares = %llu\n",
+	       (unsigned long long)stats->compares);
+	printf("  compare_wrong_bucket = %llu\n",
+	       (unsigned long long)stats->compare_wrong_bucket);
+	printf("  compare_wrong_offsetbits = %llu\n",
+	       (unsigned long long)stats->compare_wrong_offsetbits);
+	printf("  compare_wrong_keylen = %llu\n",
+	       (unsigned long long)stats->compare_wrong_keylen);
+	printf("  compare_wrong_rechash = %llu\n",
+	       (unsigned long long)stats->compare_wrong_rechash);
+	printf("  compare_wrong_keycmp = %llu\n",
+	       (unsigned long long)stats->compare_wrong_keycmp);
+	printf("expands = %llu\n",
+	       (unsigned long long)stats->expands);
+	printf("frees = %llu\n",
+	       (unsigned long long)stats->frees);
+	printf("locks = %llu\n",
+	       (unsigned long long)stats->locks);
+	printf("   lock_lowlevel = %llu\n",
+	       (unsigned long long)stats->lock_lowlevel);
+	printf("   lock_nonblock = %llu\n",
+	       (unsigned long long)stats->lock_nonblock);
+
+	/* Now clear. */
+	memset(&stats->allocs, 0, (char *)(stats+1) - (char *)&stats->allocs);
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i, j, num = 1000, stage = 0, stopat = -1;
+	int flags = TDB_DEFAULT;
+	bool transaction = false;
+	TDB_DATA key, data;
+	struct tdb_context *tdb;
+	struct timeval start, stop;
+	union tdb_attribute seed, stats;
+
+	/* Try to keep benchmarks even. */
+	seed.base.attr = TDB_ATTRIBUTE_SEED;
+	seed.base.next = NULL;
+	seed.seed.seed = 0;
+
+	memset(&stats, 0, sizeof(stats));
+	stats.base.attr = TDB_ATTRIBUTE_STATS;
+	stats.base.next = NULL;
+	stats.stats.size = sizeof(stats);
+
+	if (argv[1] && strcmp(argv[1], "--internal") == 0) {
+		flags = TDB_INTERNAL;
+		argc--;
+		argv++;
+	}
+	if (argv[1] && strcmp(argv[1], "--transaction") == 0) {
+		transaction = true;
+		argc--;
+		argv++;
+	}
+	if (argv[1] && strcmp(argv[1], "--stats") == 0) {
+		seed.base.next = &stats;
+		argc--;
+		argv++;
+	}
+
+	tdb = tdb_open("/tmp/speed.tdb", flags, O_RDWR|O_CREAT|O_TRUNC,
+		       0600, &seed);
+	if (!tdb)
+		err(1, "Opening /tmp/speed.tdb");
+
+	key.dptr = (void *)&i;
+	key.dsize = sizeof(i);
+	data = key;
+
+	if (argv[1]) {
+		num = atoi(argv[1]);
+		argv++;
+		argc--;
+	}
+
+	if (argv[1]) {
+		stopat = atoi(argv[1]);
+		argv++;
+		argc--;
+	}
+
+	if (transaction && tdb_transaction_start(tdb))
+		errx(1, "starting transaction: %s", tdb_errorstr(tdb));
+
+	/* Add 1000 records. */
+	printf("Adding %u records: ", num); fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (i = 0; i < num; i++)
+		if (tdb_store(tdb, key, data, TDB_INSERT) != 0)
+			errx(1, "Inserting key %u in tdb: %s",
+			     i, tdb_errorstr(tdb));
+	gettimeofday(&stop, NULL);
+	if (transaction && tdb_transaction_commit(tdb))
+		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+
+	if (seed.base.next)
+		dump_and_clear_stats(&stats.stats);
+	if (++stage == stopat)
+		exit(0);
+
+	if (transaction && tdb_transaction_start(tdb))
+		errx(1, "starting transaction: %s", tdb_errorstr(tdb));
+
+	/* Finding 1000 records. */
+	printf("Finding %u records: ", num); fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (i = 0; i < num; i++) {
+		int *dptr;
+		dptr = (int *)tdb_fetch(tdb, key).dptr;
+		if (!dptr || *dptr != i)
+			errx(1, "Fetching key %u in tdb gave %u",
+			     i, dptr ? *dptr : -1);
+	}
+	gettimeofday(&stop, NULL);
+	if (transaction && tdb_transaction_commit(tdb))
+		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (seed.base.next)
+		dump_and_clear_stats(&stats.stats);
+	if (++stage == stopat)
+		exit(0);
+
+	if (transaction && tdb_transaction_start(tdb))
+		errx(1, "starting transaction: %s", tdb_errorstr(tdb));
+
+	/* Missing 1000 records. */
+	printf("Missing %u records: ", num); fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (i = num; i < num*2; i++) {
+		int *dptr;
+		dptr = (int *)tdb_fetch(tdb, key).dptr;
+		if (dptr)
+			errx(1, "Fetching key %u in tdb gave %u", i, *dptr);
+	}
+	gettimeofday(&stop, NULL);
+	if (transaction && tdb_transaction_commit(tdb))
+		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (seed.base.next)
+		dump_and_clear_stats(&stats.stats);
+	if (++stage == stopat)
+		exit(0);
+
+	if (transaction && tdb_transaction_start(tdb))
+		errx(1, "starting transaction: %s", tdb_errorstr(tdb));
+
+	/* Traverse 1000 records. */
+	printf("Traversing %u records: ", num); fflush(stdout);
+	i = 0;
+	gettimeofday(&start, NULL);
+	if (tdb_traverse(tdb, count_record, &i) != num)
+		errx(1, "Traverse returned wrong number of records");
+	if (i != (num - 1) * (num / 2))
+		errx(1, "Traverse tallied to %u", i);
+	gettimeofday(&stop, NULL);
+	if (transaction && tdb_transaction_commit(tdb))
+		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (seed.base.next)
+		dump_and_clear_stats(&stats.stats);
+	if (++stage == stopat)
+		exit(0);
+
+	if (transaction && tdb_transaction_start(tdb))
+		errx(1, "starting transaction: %s", tdb_errorstr(tdb));
+
+	/* Delete 1000 records (not in order). */
+	printf("Deleting %u records: ", num); fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (j = 0; j < num; j++) {
+		i = (j + 100003) % num;
+		if (tdb_delete(tdb, key) != 0)
+			errx(1, "Deleting key %u in tdb: %s",
+			     i, tdb_errorstr(tdb));
+	}
+	gettimeofday(&stop, NULL);
+	if (transaction && tdb_transaction_commit(tdb))
+		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (seed.base.next)
+		dump_and_clear_stats(&stats.stats);
+	if (++stage == stopat)
+		exit(0);
+
+	if (transaction && tdb_transaction_start(tdb))
+		errx(1, "starting transaction: %s", tdb_errorstr(tdb));
+
+	/* Re-add 1000 records (not in order). */
+	printf("Re-adding %u records: ", num); fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (j = 0; j < num; j++) {
+		i = (j + 100003) % num;
+		if (tdb_store(tdb, key, data, TDB_INSERT) != 0)
+			errx(1, "Inserting key %u in tdb: %s",
+			     i, tdb_errorstr(tdb));
+	}
+	gettimeofday(&stop, NULL);
+	if (transaction && tdb_transaction_commit(tdb))
+		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (seed.base.next)
+		dump_and_clear_stats(&stats.stats);
+	if (++stage == stopat)
+		exit(0);
+
+	if (transaction && tdb_transaction_start(tdb))
+		errx(1, "starting transaction: %s", tdb_errorstr(tdb));
+
+	/* Append 1000 records. */
+	printf("Appending %u records: ", num); fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (i = 0; i < num; i++)
+		if (tdb_append(tdb, key, data) != 0)
+			errx(1, "Appending key %u in tdb: %s",
+			     i, tdb_errorstr(tdb));
+	gettimeofday(&stop, NULL);
+	if (transaction && tdb_transaction_commit(tdb))
+		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (++stage == stopat)
+		exit(0);
+
+	if (transaction && tdb_transaction_start(tdb))
+		errx(1, "starting transaction: %s", tdb_errorstr(tdb));
+
+	/* Churn 1000 records: not in order! */
+	printf("Churning %u records: ", num); fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (j = 0; j < num; j++) {
+		i = (j + 1000019) % num;
+		if (tdb_delete(tdb, key) != 0)
+			errx(1, "Deleting key %u in tdb: %s",
+			     i, tdb_errorstr(tdb));
+		i += num;
+		if (tdb_store(tdb, key, data, TDB_INSERT) != 0)
+			errx(1, "Inserting key %u in tdb: %s",
+			     i, tdb_errorstr(tdb));
+	}
+	gettimeofday(&stop, NULL);
+	if (transaction && tdb_transaction_commit(tdb))
+		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+
+	if (seed.base.next)
+		dump_and_clear_stats(&stats.stats);
+	if (++stage == stopat)
+		exit(0);
+
+	return 0;
+}

+ 147 - 120
ccan/tdb2/transaction.c

@@ -169,10 +169,9 @@ static int transaction_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
 	return 0;
 	return 0;
 
 
 fail:
 fail:
-	tdb->ecode = TDB_ERR_IO;
-	tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-		 "transaction_read: failed at off=%llu len=%llu\n",
-		 (long long)off, (long long)len);
+	tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
+		   "transaction_read: failed at off=%zu len=%zu",
+		   (size_t)off, (size_t)len);
 	tdb->transaction->transaction_error = 1;
 	tdb->transaction->transaction_error = 1;
 	return -1;
 	return -1;
 }
 }
@@ -188,12 +187,10 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
 
 
 	/* Only a commit is allowed on a prepared transaction */
 	/* Only a commit is allowed on a prepared transaction */
 	if (tdb->transaction->prepared) {
 	if (tdb->transaction->prepared) {
-		tdb->ecode = TDB_ERR_EINVAL;
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+		tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_FATAL,
 			 "transaction_write: transaction already prepared,"
 			 "transaction_write: transaction already prepared,"
-			 " write not allowed\n");
-		tdb->transaction->transaction_error = 1;
-		return -1;
+			 " write not allowed");
+		goto fail;
 	}
 	}
 
 
 	/* break it up into block sized chunks */
 	/* break it up into block sized chunks */
@@ -228,7 +225,8 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
 				(blk+1)*sizeof(uint8_t *));
 				(blk+1)*sizeof(uint8_t *));
 		}
 		}
 		if (new_blocks == NULL) {
 		if (new_blocks == NULL) {
-			tdb->ecode = TDB_ERR_OOM;
+			tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
+				   "transaction_write: failed to allocate");
 			goto fail;
 			goto fail;
 		}
 		}
 		memset(&new_blocks[tdb->transaction->num_blocks], 0,
 		memset(&new_blocks[tdb->transaction->num_blocks], 0,
@@ -242,9 +240,9 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
 	if (tdb->transaction->blocks[blk] == NULL) {
 	if (tdb->transaction->blocks[blk] == NULL) {
 		tdb->transaction->blocks[blk] = (uint8_t *)calloc(getpagesize(), 1);
 		tdb->transaction->blocks[blk] = (uint8_t *)calloc(getpagesize(), 1);
 		if (tdb->transaction->blocks[blk] == NULL) {
 		if (tdb->transaction->blocks[blk] == NULL) {
-			tdb->ecode = TDB_ERR_OOM;
-			tdb->transaction->transaction_error = 1;
-			return -1;
+			tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
+				   "transaction_write: failed to allocate");
+			goto fail;
 		}
 		}
 		if (tdb->transaction->old_map_size > blk * getpagesize()) {
 		if (tdb->transaction->old_map_size > blk * getpagesize()) {
 			tdb_len_t len2 = getpagesize();
 			tdb_len_t len2 = getpagesize();
@@ -254,6 +252,10 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
 			if (tdb->transaction->io_methods->read(tdb, blk * getpagesize(),
 			if (tdb->transaction->io_methods->read(tdb, blk * getpagesize(),
 							       tdb->transaction->blocks[blk],
 							       tdb->transaction->blocks[blk],
 							       len2) != 0) {
 							       len2) != 0) {
+				tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
+					   "transaction_write: failed to"
+					   " read old block: %s",
+					   strerror(errno));
 				SAFE_FREE(tdb->transaction->blocks[blk]);
 				SAFE_FREE(tdb->transaction->blocks[blk]);
 				goto fail;
 				goto fail;
 			}
 			}
@@ -278,10 +280,6 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
 	return 0;
 	return 0;
 
 
 fail:
 fail:
-	tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-		 "transaction_write: failed at off=%llu len=%llu\n",
-		 (long long)((blk*getpagesize()) + off),
-		 (long long)len);
 	tdb->transaction->transaction_error = 1;
 	tdb->transaction->transaction_error = 1;
 	return -1;
 	return -1;
 }
 }
@@ -341,6 +339,12 @@ static int transaction_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
 		return 0;
 		return 0;
 	}
 	}
 	tdb->ecode = TDB_ERR_IO;
 	tdb->ecode = TDB_ERR_IO;
+	if (!probe) {
+		tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
+			   "tdb_oob len %lld beyond transaction size %lld",
+			   (long long)len,
+			   (long long)tdb->map_size);
+	}
 	return -1;
 	return -1;
 }
 }
 
 
@@ -359,10 +363,39 @@ static int transaction_expand_file(struct tdb_context *tdb, tdb_off_t addition)
 }
 }
 
 
 static void *transaction_direct(struct tdb_context *tdb, tdb_off_t off,
 static void *transaction_direct(struct tdb_context *tdb, tdb_off_t off,
-				size_t len)
+				size_t len, bool write)
 {
 {
-	/* FIXME */
-	return NULL;
+	size_t blk = off / getpagesize(), end_blk;
+
+	/* This is wrong for zero-length blocks, but will fail gracefully */
+	end_blk = (off + len - 1) / getpagesize();
+
+	/* Can only do direct if in single block and we've already copied. */
+	if (write) {
+		if (blk != end_blk)
+			return NULL;
+		if (blk >= tdb->transaction->num_blocks)
+			return NULL;
+		if (tdb->transaction->blocks[blk] == NULL)
+			return NULL;
+		return tdb->transaction->blocks[blk] + off % getpagesize();
+	}
+
+	/* Single which we have copied? */
+	if (blk == end_blk
+	    && blk < tdb->transaction->num_blocks
+	    && tdb->transaction->blocks[blk])
+		return tdb->transaction->blocks[blk] + off % getpagesize();
+
+	/* Otherwise must be all not copied. */
+	while (blk < end_blk) {
+		if (blk >= tdb->transaction->num_blocks)
+			break;
+		if (tdb->transaction->blocks[blk])
+			return NULL;
+		blk++;
+	}
+	return tdb->transaction->io_methods->direct(tdb, off, len, write);
 }
 }
 
 
 static const struct tdb_methods transaction_methods = {
 static const struct tdb_methods transaction_methods = {
@@ -383,9 +416,9 @@ static int transaction_sync(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t
 	}
 	}
 
 
 	if (fsync(tdb->fd) != 0) {
 	if (fsync(tdb->fd) != 0) {
-		tdb->ecode = TDB_ERR_IO;
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-			 "tdb_transaction: fsync failed\n");
+		tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
+			   "tdb_transaction: fsync failed: %s",
+			   strerror(errno));
 		return -1;
 		return -1;
 	}
 	}
 #ifdef MS_SYNC
 #ifdef MS_SYNC
@@ -393,10 +426,9 @@ static int transaction_sync(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t
 		tdb_off_t moffset = offset & ~(getpagesize()-1);
 		tdb_off_t moffset = offset & ~(getpagesize()-1);
 		if (msync(moffset + (char *)tdb->map_ptr,
 		if (msync(moffset + (char *)tdb->map_ptr,
 			  length + (offset - moffset), MS_SYNC) != 0) {
 			  length + (offset - moffset), MS_SYNC) != 0) {
-			tdb->ecode = TDB_ERR_IO;
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-				 "tdb_transaction: msync failed - %s\n",
-				 strerror(errno));
+			tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
+				   "tdb_transaction: msync failed: %s",
+				   strerror(errno));
 			return -1;
 			return -1;
 		}
 		}
 	}
 	}
@@ -410,9 +442,8 @@ static void _tdb_transaction_cancel(struct tdb_context *tdb)
 	int i;
 	int i;
 
 
 	if (tdb->transaction == NULL) {
 	if (tdb->transaction == NULL) {
-		tdb->ecode = TDB_ERR_EINVAL;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_transaction_cancel: no transaction\n");
+		tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_ERROR,
+			   "tdb_transaction_cancel: no transaction");
 		return;
 		return;
 	}
 	}
 
 
@@ -441,9 +472,9 @@ static void _tdb_transaction_cancel(struct tdb_context *tdb)
 				   &invalid, sizeof(invalid)) == -1 ||
 				   &invalid, sizeof(invalid)) == -1 ||
 		    transaction_sync(tdb, tdb->transaction->magic_offset,
 		    transaction_sync(tdb, tdb->transaction->magic_offset,
 				     sizeof(invalid)) == -1) {
 				     sizeof(invalid)) == -1) {
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-				 "tdb_transaction_cancel: failed to remove"
-				 " recovery magic\n");
+			tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
+				   "tdb_transaction_cancel: failed to remove"
+				   " recovery magic");
 		}
 		}
 	}
 	}
 
 
@@ -469,16 +500,17 @@ int tdb_transaction_start(struct tdb_context *tdb)
 {
 {
 	/* some sanity checks */
 	/* some sanity checks */
 	if (tdb->read_only || (tdb->flags & TDB_INTERNAL)) {
 	if (tdb->read_only || (tdb->flags & TDB_INTERNAL)) {
-		tdb->ecode = TDB_ERR_EINVAL;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_transaction_start: cannot start a transaction"
-			 " on a read-only or internal db\n");
+		tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_ERROR,
+			   "tdb_transaction_start: cannot start a transaction"
+			   " on a read-only or internal db");
 		return -1;
 		return -1;
 	}
 	}
 
 
 	/* cope with nested tdb_transaction_start() calls */
 	/* cope with nested tdb_transaction_start() calls */
 	if (tdb->transaction != NULL) {
 	if (tdb->transaction != NULL) {
-		tdb->ecode = TDB_ERR_NESTING;
+		tdb_logerr(tdb, TDB_ERR_NESTING, TDB_DEBUG_ERROR,
+			   "tdb_transaction_start:"
+			   " already inside transaction");
 		return -1;
 		return -1;
 	}
 	}
 
 
@@ -486,17 +518,17 @@ int tdb_transaction_start(struct tdb_context *tdb)
 		/* the caller must not have any locks when starting a
 		/* the caller must not have any locks when starting a
 		   transaction as otherwise we'll be screwed by lack
 		   transaction as otherwise we'll be screwed by lack
 		   of nested locks in posix */
 		   of nested locks in posix */
-		tdb->ecode = TDB_ERR_LOCK;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_transaction_start: cannot start a transaction"
-			 " with locks held\n");
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
+			   "tdb_transaction_start: cannot start a transaction"
+			   " with locks held");
 		return -1;
 		return -1;
 	}
 	}
 
 
 	tdb->transaction = (struct tdb_transaction *)
 	tdb->transaction = (struct tdb_transaction *)
 		calloc(sizeof(struct tdb_transaction), 1);
 		calloc(sizeof(struct tdb_transaction), 1);
 	if (tdb->transaction == NULL) {
 	if (tdb->transaction == NULL) {
-		tdb->ecode = TDB_ERR_OOM;
+		tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_ERROR,
+			   "tdb_transaction_start: cannot allocate");
 		return -1;
 		return -1;
 	}
 	}
 
 
@@ -585,17 +617,17 @@ static int tdb_recovery_allocate(struct tdb_context *tdb,
 
 
 	recovery_head = tdb_read_off(tdb, offsetof(struct tdb_header,recovery));
 	recovery_head = tdb_read_off(tdb, offsetof(struct tdb_header,recovery));
 	if (recovery_head == TDB_OFF_ERR) {
 	if (recovery_head == TDB_OFF_ERR) {
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+		tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
 			 "tdb_recovery_allocate:"
 			 "tdb_recovery_allocate:"
-			 " failed to read recovery head\n");
+			 " failed to read recovery head");
 		return -1;
 		return -1;
 	}
 	}
 
 
 	if (recovery_head != 0) {
 	if (recovery_head != 0) {
 		if (methods->read(tdb, recovery_head, &rec, sizeof(rec))) {
 		if (methods->read(tdb, recovery_head, &rec, sizeof(rec))) {
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+			tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
 				 "tdb_recovery_allocate:"
 				 "tdb_recovery_allocate:"
-				 " failed to read recovery record\n");
+				 " failed to read recovery record");
 			return -1;
 			return -1;
 		}
 		}
 		tdb_convert(tdb, &rec, sizeof(rec));
 		tdb_convert(tdb, &rec, sizeof(rec));
@@ -621,11 +653,12 @@ static int tdb_recovery_allocate(struct tdb_context *tdb,
 	   us an area that is being currently used (as of the start of
 	   us an area that is being currently used (as of the start of
 	   the transaction) */
 	   the transaction) */
 	if (recovery_head != 0) {
 	if (recovery_head != 0) {
+		add_stat(tdb, frees, 1);
 		if (add_free_record(tdb, recovery_head,
 		if (add_free_record(tdb, recovery_head,
 				    sizeof(rec) + rec.max_len) != 0) {
 				    sizeof(rec) + rec.max_len) != 0) {
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-				 "tdb_recovery_allocate:"
-				 " failed to free previous recovery area\n");
+			tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
+				   "tdb_recovery_allocate:"
+				   " failed to free previous recovery area");
 			return -1;
 			return -1;
 		}
 		}
 	}
 	}
@@ -649,9 +682,9 @@ static int tdb_recovery_allocate(struct tdb_context *tdb,
 		sizeof(rec) + *recovery_max_size;
 		sizeof(rec) + *recovery_max_size;
 	tdb->map_size = tdb->transaction->old_map_size;
 	tdb->map_size = tdb->transaction->old_map_size;
 	if (methods->expand_file(tdb, addition) == -1) {
 	if (methods->expand_file(tdb, addition) == -1) {
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+		tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
 			 "tdb_recovery_allocate:"
 			 "tdb_recovery_allocate:"
-			 " failed to create recovery area\n");
+			 " failed to create recovery area");
 		return -1;
 		return -1;
 	}
 	}
 
 
@@ -665,9 +698,9 @@ static int tdb_recovery_allocate(struct tdb_context *tdb,
 	tdb_convert(tdb, &recovery_head, sizeof(recovery_head));
 	tdb_convert(tdb, &recovery_head, sizeof(recovery_head));
 	if (methods->write(tdb, offsetof(struct tdb_header, recovery),
 	if (methods->write(tdb, offsetof(struct tdb_header, recovery),
 			   &recovery_head, sizeof(tdb_off_t)) == -1) {
 			   &recovery_head, sizeof(tdb_off_t)) == -1) {
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+		tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
 			 "tdb_recovery_allocate:"
 			 "tdb_recovery_allocate:"
-			 " failed to write recovery head\n");
+			 " failed to write recovery head");
 		return -1;
 		return -1;
 	}
 	}
 	transaction_write_existing(tdb, offsetof(struct tdb_header, recovery),
 	transaction_write_existing(tdb, offsetof(struct tdb_header, recovery),
@@ -713,7 +746,8 @@ static int transaction_setup_recovery(struct tdb_context *tdb,
 
 
 	data = (unsigned char *)malloc(recovery_size + sizeof(*rec));
 	data = (unsigned char *)malloc(recovery_size + sizeof(*rec));
 	if (data == NULL) {
 	if (data == NULL) {
-		tdb->ecode = TDB_ERR_OOM;
+		tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
+			   "transaction_setup_recovery: cannot allocate");
 		return -1;
 		return -1;
 	}
 	}
 
 
@@ -743,10 +777,9 @@ static int transaction_setup_recovery(struct tdb_context *tdb,
 			continue;
 			continue;
 		}
 		}
 		if (offset + length > tdb->map_size) {
 		if (offset + length > tdb->map_size) {
-			tdb->ecode = TDB_ERR_CORRUPT;
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-				 "tdb_transaction_setup_recovery:"
-				 " transaction data over new region boundary\n");
+			tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_FATAL,
+				   "tdb_transaction_setup_recovery:"
+				   " transaction data over new region boundary");
 			free(data);
 			free(data);
 			return -1;
 			return -1;
 		}
 		}
@@ -774,9 +807,9 @@ static int transaction_setup_recovery(struct tdb_context *tdb,
 	/* write the recovery data to the recovery area */
 	/* write the recovery data to the recovery area */
 	if (methods->write(tdb, recovery_offset, data,
 	if (methods->write(tdb, recovery_offset, data,
 			   sizeof(*rec) + recovery_size) == -1) {
 			   sizeof(*rec) + recovery_size) == -1) {
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+		tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
 			 "tdb_transaction_setup_recovery:"
 			 "tdb_transaction_setup_recovery:"
-			 " failed to write recovery data\n");
+			 " failed to write recovery data");
 		free(data);
 		free(data);
 		return -1;
 		return -1;
 	}
 	}
@@ -801,9 +834,9 @@ static int transaction_setup_recovery(struct tdb_context *tdb,
 						   magic);
 						   magic);
 
 
 	if (methods->write(tdb, *magic_offset, &magic, sizeof(magic)) == -1) {
 	if (methods->write(tdb, *magic_offset, &magic, sizeof(magic)) == -1) {
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+		tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
 			 "tdb_transaction_setup_recovery:"
 			 "tdb_transaction_setup_recovery:"
-			 " failed to write recovery magic\n");
+			 " failed to write recovery magic");
 		return -1;
 		return -1;
 	}
 	}
 	transaction_write_existing(tdb, *magic_offset, &magic, sizeof(magic));
 	transaction_write_existing(tdb, *magic_offset, &magic, sizeof(magic));
@@ -821,27 +854,24 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
 	const struct tdb_methods *methods;
 	const struct tdb_methods *methods;
 
 
 	if (tdb->transaction == NULL) {
 	if (tdb->transaction == NULL) {
-		tdb->ecode = TDB_ERR_EINVAL;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_transaction_prepare_commit: no transaction\n");
+		tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_ERROR,
+			   "tdb_transaction_prepare_commit: no transaction");
 		return -1;
 		return -1;
 	}
 	}
 
 
 	if (tdb->transaction->prepared) {
 	if (tdb->transaction->prepared) {
-		tdb->ecode = TDB_ERR_EINVAL;
 		_tdb_transaction_cancel(tdb);
 		_tdb_transaction_cancel(tdb);
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_transaction_prepare_commit:"
-			 " transaction already prepared\n");
+		tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_ERROR,
+			   "tdb_transaction_prepare_commit:"
+			   " transaction already prepared");
 		return -1;
 		return -1;
 	}
 	}
 
 
 	if (tdb->transaction->transaction_error) {
 	if (tdb->transaction->transaction_error) {
-		tdb->ecode = TDB_ERR_IO;
 		_tdb_transaction_cancel(tdb);
 		_tdb_transaction_cancel(tdb);
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_transaction_prepare_commit:"
-			 " transaction error pending\n");
+		tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_ERROR,
+			   "tdb_transaction_prepare_commit:"
+			   " transaction error pending");
 		return -1;
 		return -1;
 	}
 	}
 
 
@@ -860,9 +890,9 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
 
 
 	/* upgrade the main transaction lock region to a write lock */
 	/* upgrade the main transaction lock region to a write lock */
 	if (tdb_allrecord_upgrade(tdb) == -1) {
 	if (tdb_allrecord_upgrade(tdb) == -1) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+		tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_ERROR,
 			 "tdb_transaction_prepare_commit:"
 			 "tdb_transaction_prepare_commit:"
-			 " failed to upgrade hash locks\n");
+			 " failed to upgrade hash locks");
 		_tdb_transaction_cancel(tdb);
 		_tdb_transaction_cancel(tdb);
 		return -1;
 		return -1;
 	}
 	}
@@ -870,9 +900,9 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
 	/* get the open lock - this prevents new users attaching to the database
 	/* get the open lock - this prevents new users attaching to the database
 	   during the commit */
 	   during the commit */
 	if (tdb_lock_open(tdb, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK) == -1) {
 	if (tdb_lock_open(tdb, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK) == -1) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+		tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_ERROR,
 			 "tdb_transaction_prepare_commit:"
 			 "tdb_transaction_prepare_commit:"
-			 " failed to get open lock\n");
+			 " failed to get open lock");
 		_tdb_transaction_cancel(tdb);
 		_tdb_transaction_cancel(tdb);
 		return -1;
 		return -1;
 	}
 	}
@@ -881,9 +911,9 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
 	if (!(tdb->flags & TDB_NOSYNC)) {
 	if (!(tdb->flags & TDB_NOSYNC)) {
 		/* write the recovery data to the end of the file */
 		/* write the recovery data to the end of the file */
 		if (transaction_setup_recovery(tdb, &tdb->transaction->magic_offset) == -1) {
 		if (transaction_setup_recovery(tdb, &tdb->transaction->magic_offset) == -1) {
-			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
 				 "tdb_transaction_prepare_commit:"
 				 "tdb_transaction_prepare_commit:"
-				 " failed to setup recovery data\n");
+				 " failed to setup recovery data");
 			_tdb_transaction_cancel(tdb);
 			_tdb_transaction_cancel(tdb);
 			return -1;
 			return -1;
 		}
 		}
@@ -897,9 +927,9 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
 		/* Restore original map size for tdb_expand_file */
 		/* Restore original map size for tdb_expand_file */
 		tdb->map_size = tdb->transaction->old_map_size;
 		tdb->map_size = tdb->transaction->old_map_size;
 		if (methods->expand_file(tdb, add) == -1) {
 		if (methods->expand_file(tdb, add) == -1) {
-			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_ERROR,
 				 "tdb_transaction_prepare_commit:"
 				 "tdb_transaction_prepare_commit:"
-				 " expansion failed\n");
+				 " expansion failed");
 			_tdb_transaction_cancel(tdb);
 			_tdb_transaction_cancel(tdb);
 			return -1;
 			return -1;
 		}
 		}
@@ -927,19 +957,18 @@ int tdb_transaction_commit(struct tdb_context *tdb)
 	int i;
 	int i;
 
 
 	if (tdb->transaction == NULL) {
 	if (tdb->transaction == NULL) {
-		tdb->ecode = TDB_ERR_EINVAL;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_transaction_commit: no transaction\n");
+		tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_ERROR,
+			 "tdb_transaction_commit: no transaction");
 		return -1;
 		return -1;
 	}
 	}
 
 
 	tdb_trace(tdb, "tdb_transaction_commit");
 	tdb_trace(tdb, "tdb_transaction_commit");
 
 
 	if (tdb->transaction->transaction_error) {
 	if (tdb->transaction->transaction_error) {
-		tdb->ecode = TDB_ERR_IO;
 		tdb_transaction_cancel(tdb);
 		tdb_transaction_cancel(tdb);
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_transaction_commit: transaction error pending\n");
+		tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
+			   "tdb_transaction_commit:"
+			   " transaction error pending");
 		return -1;
 		return -1;
 	}
 	}
 
 
@@ -980,9 +1009,9 @@ int tdb_transaction_commit(struct tdb_context *tdb)
 
 
 		if (methods->write(tdb, offset, tdb->transaction->blocks[i],
 		if (methods->write(tdb, offset, tdb->transaction->blocks[i],
 				   length) == -1) {
 				   length) == -1) {
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-				 "tdb_transaction_commit:"
-				 " write failed during commit\n");
+			tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
+				   "tdb_transaction_commit:"
+				   " write failed during commit");
 
 
 			/* we've overwritten part of the data and
 			/* we've overwritten part of the data and
 			   possibly expanded the file, so we need to
 			   possibly expanded the file, so we need to
@@ -1042,9 +1071,9 @@ int tdb_transaction_recover(struct tdb_context *tdb)
 	/* find the recovery area */
 	/* find the recovery area */
 	recovery_head = tdb_read_off(tdb, offsetof(struct tdb_header,recovery));
 	recovery_head = tdb_read_off(tdb, offsetof(struct tdb_header,recovery));
 	if (recovery_head == TDB_OFF_ERR) {
 	if (recovery_head == TDB_OFF_ERR) {
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+		tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
 			 "tdb_transaction_recover:"
 			 "tdb_transaction_recover:"
-			 " failed to read recovery head\n");
+			 " failed to read recovery head");
 		return -1;
 		return -1;
 	}
 	}
 
 
@@ -1055,9 +1084,9 @@ int tdb_transaction_recover(struct tdb_context *tdb)
 
 
 	/* read the recovery record */
 	/* read the recovery record */
 	if (tdb_read_convert(tdb, recovery_head, &rec, sizeof(rec)) == -1) {
 	if (tdb_read_convert(tdb, recovery_head, &rec, sizeof(rec)) == -1) {
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-			 "tdb_transaction_recover:"
-			 " failed to read recovery record\n");
+		tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
+			   "tdb_transaction_recover:"
+			   " failed to read recovery record");
 		return -1;
 		return -1;
 	}
 	}
 
 
@@ -1067,10 +1096,9 @@ int tdb_transaction_recover(struct tdb_context *tdb)
 	}
 	}
 
 
 	if (tdb->read_only) {
 	if (tdb->read_only) {
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-			 "tdb_transaction_recover:"
-			 " attempt to recover read only database\n");
-		tdb->ecode = TDB_ERR_CORRUPT;
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_FATAL,
+			   "tdb_transaction_recover:"
+			   " attempt to recover read only database");
 		return -1;
 		return -1;
 	}
 	}
 
 
@@ -1078,19 +1106,18 @@ int tdb_transaction_recover(struct tdb_context *tdb)
 
 
 	data = (unsigned char *)malloc(rec.len);
 	data = (unsigned char *)malloc(rec.len);
 	if (data == NULL) {
 	if (data == NULL) {
-		tdb->ecode = TDB_ERR_OOM;
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-			 "tdb_transaction_recover:"
-			 " failed to allocate recovery data\n");
+		tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
+			   "tdb_transaction_recover:"
+			   " failed to allocate recovery data");
 		return -1;
 		return -1;
 	}
 	}
 
 
 	/* read the full recovery data */
 	/* read the full recovery data */
 	if (tdb->methods->read(tdb, recovery_head + sizeof(rec), data,
 	if (tdb->methods->read(tdb, recovery_head + sizeof(rec), data,
 			       rec.len) == -1) {
 			       rec.len) == -1) {
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-			 "tdb_transaction_recover:"
-			 " failed to read recovery data\n");
+		tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
+			   "tdb_transaction_recover:"
+			   " failed to read recovery data");
 		return -1;
 		return -1;
 	}
 	}
 
 
@@ -1106,9 +1133,9 @@ int tdb_transaction_recover(struct tdb_context *tdb)
 
 
 		if (tdb->methods->write(tdb, ofs, p, len) == -1) {
 		if (tdb->methods->write(tdb, ofs, p, len) == -1) {
 			free(data);
 			free(data);
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+			tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
 				 "tdb_transaction_recover:"
 				 "tdb_transaction_recover:"
-				 " failed to recover %zu bytes at offset %zu\n",
+				 " failed to recover %zu bytes at offset %zu",
 				 (size_t)len, (size_t)ofs);
 				 (size_t)len, (size_t)ofs);
 			return -1;
 			return -1;
 		}
 		}
@@ -1118,8 +1145,8 @@ int tdb_transaction_recover(struct tdb_context *tdb)
 	free(data);
 	free(data);
 
 
 	if (transaction_sync(tdb, 0, tdb->map_size) == -1) {
 	if (transaction_sync(tdb, 0, tdb->map_size) == -1) {
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-			 "tdb_transaction_recover: failed to sync recovery\n");
+		tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
+			   "tdb_transaction_recover: failed to sync recovery");
 		return -1;
 		return -1;
 	}
 	}
 
 
@@ -1127,9 +1154,9 @@ int tdb_transaction_recover(struct tdb_context *tdb)
 	if (recovery_eof <= recovery_head) {
 	if (recovery_eof <= recovery_head) {
 		if (tdb_write_off(tdb, offsetof(struct tdb_header,recovery), 0)
 		if (tdb_write_off(tdb, offsetof(struct tdb_header,recovery), 0)
 		    == -1) {
 		    == -1) {
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+			tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
 				 "tdb_transaction_recover:"
 				 "tdb_transaction_recover:"
-				 " failed to remove recovery head\n");
+				 " failed to remove recovery head");
 			return -1;
 			return -1;
 		}
 		}
 	}
 	}
@@ -1139,21 +1166,21 @@ int tdb_transaction_recover(struct tdb_context *tdb)
 			  recovery_head
 			  recovery_head
 			  + offsetof(struct tdb_recovery_record, magic),
 			  + offsetof(struct tdb_recovery_record, magic),
 			  TDB_RECOVERY_INVALID_MAGIC) == -1) {
 			  TDB_RECOVERY_INVALID_MAGIC) == -1) {
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+		tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
 			 "tdb_transaction_recover:"
 			 "tdb_transaction_recover:"
-			 " failed to remove recovery magic\n");
+			 " failed to remove recovery magic");
 		return -1;
 		return -1;
 	}
 	}
 
 
 	if (transaction_sync(tdb, 0, recovery_eof) == -1) {
 	if (transaction_sync(tdb, 0, recovery_eof) == -1) {
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-			 "tdb_transaction_recover: failed to sync2 recovery\n");
+		tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
+			 "tdb_transaction_recover: failed to sync2 recovery");
 		return -1;
 		return -1;
 	}
 	}
 
 
-	tdb->log(tdb, TDB_DEBUG_TRACE, tdb->log_priv,
-		 "tdb_transaction_recover: recovered %zu byte database\n",
-		 (size_t)recovery_eof);
+	tdb_logerr(tdb, TDB_SUCCESS, TDB_DEBUG_TRACE,
+		   "tdb_transaction_recover: recovered %zu byte database",
+		   (size_t)recovery_eof);
 
 
 	/* all done */
 	/* all done */
 	return 0;
 	return 0;

Some files were not shown because too many files changed in this diff