Browse Source

Merge branch 'tdb2'

Rusty Russell 15 years ago
parent
commit
51a56b5262

+ 2 - 2
ccan/tdb/tools/Makefile

@@ -2,7 +2,7 @@ LDLIBS:=../../tdb.o ../../tally.o
 CFLAGS:=-I../../.. -Wall -O3 #-g -pg
 LDFLAGS:=-L../../..
 
-default: replay_trace tdbtorture tdbdump tdbtool starvation mktdb
+default: replay_trace tdbtorture tdbdump tdbtool starvation mktdb speed
 
 benchmark: replay_trace
 	@trap "rm -f /tmp/trace.$$$$" 0; for f in benchmarks/*.rz; do if runzip -k $$f -o /tmp/trace.$$$$ && echo -n "$$f": && ./replay_trace --quiet -n 5 replay.tdb /tmp/trace.$$$$ && rm /tmp/trace.$$$$; then rm -f /tmp/trace.$$$$; else exit 1; fi; done
@@ -30,4 +30,4 @@ check: replay_trace
 	@sed 's/\(^[0-9]* traverse\) .*/\1fn/' < $^ > $@
 
 clean:
-	rm -f replay_trace tdbtorture tdbdump tdbtool *.o
+	rm -f replay_trace tdbtorture tdbdump tdbtool speed *.o

+ 247 - 0
ccan/tdb/tools/speed.c

@@ -0,0 +1,247 @@
+/* Simple speed test for TDB */
+#include <err.h>
+#include <time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <ccan/tdb/tdb.h>
+
+/* Nanoseconds per operation */
+static size_t normalize(const struct timeval *start,
+			const struct timeval *stop,
+			unsigned int num)
+{
+	struct timeval diff;
+
+	timersub(stop, start, &diff);
+
+	/* Floating point is more accurate here. */
+	return (double)(diff.tv_sec * 1000000 + diff.tv_usec)
+		/ num * 1000;
+}
+
+static size_t file_size(void)
+{
+	struct stat st;
+
+	if (stat("/tmp/speed.tdb", &st) != 0)
+		return -1;
+	return st.st_size;
+}
+
+static int count_record(struct tdb_context *tdb,
+			TDB_DATA key, TDB_DATA data, void *p)
+{
+	int *total = p;
+	*total += *(int *)data.dptr;
+	return 0;
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i, j, num = 1000, stage = 0, stopat = -1;
+	int flags = TDB_DEFAULT;
+	TDB_DATA key, data;
+	struct tdb_context *tdb;
+	struct timeval start, stop;
+	bool transaction = false;
+
+	if (argv[1] && strcmp(argv[1], "--internal") == 0) {
+		flags = TDB_INTERNAL;
+		argc--;
+		argv++;
+	}
+
+	if (argv[1] && strcmp(argv[1], "--transaction") == 0) {
+		transaction = true;
+		argc--;
+		argv++;
+	}
+
+	tdb = tdb_open("/tmp/speed.tdb", 100003, flags, O_RDWR|O_CREAT|O_TRUNC,
+		       0600);
+	if (!tdb)
+		err(1, "Opening /tmp/speed.tdb");
+
+	key.dptr = (void *)&i;
+	key.dsize = sizeof(i);
+	data = key;
+
+	if (argv[1]) {
+		num = atoi(argv[1]);
+		argv++;
+		argc--;
+	}
+
+	if (argv[1]) {
+		stopat = atoi(argv[1]);
+		argv++;
+		argc--;
+	}
+
+	if (transaction && tdb_transaction_start(tdb))
+		errx(1, "starting transaction: %s", tdb_errorstr(tdb));
+
+	/* Add 1000 records. */
+	printf("Adding %u records: ", num); fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (i = 0; i < num; i++)
+		if (tdb_store(tdb, key, data, TDB_INSERT) != 0)
+			errx(1, "Inserting key %u in tdb: %s",
+			     i, tdb_errorstr(tdb));
+	gettimeofday(&stop, NULL);
+	if (transaction && tdb_transaction_commit(tdb))
+		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (++stage == stopat)
+		exit(0);
+
+	if (transaction && tdb_transaction_start(tdb))
+		errx(1, "starting transaction: %s", tdb_errorstr(tdb));
+
+	/* Finding 1000 records. */
+	printf("Finding %u records: ", num); fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (i = 0; i < num; i++) {
+		int *dptr;
+		dptr = (int *)tdb_fetch(tdb, key).dptr;
+		if (!dptr || *dptr != i)
+			errx(1, "Fetching key %u in tdb gave %u",
+			     i, dptr ? *dptr : -1);
+	}
+	gettimeofday(&stop, NULL);
+	if (transaction && tdb_transaction_commit(tdb))
+		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (++stage == stopat)
+		exit(0);
+
+	if (transaction && tdb_transaction_start(tdb))
+		errx(1, "starting transaction: %s", tdb_errorstr(tdb));
+
+	/* Missing 1000 records. */
+	printf("Missing %u records: ", num); fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (i = num; i < num*2; i++) {
+		int *dptr;
+		dptr = (int *)tdb_fetch(tdb, key).dptr;
+		if (dptr)
+			errx(1, "Fetching key %u in tdb gave %u", i, *dptr);
+	}
+	gettimeofday(&stop, NULL);
+	if (transaction && tdb_transaction_commit(tdb))
+		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (++stage == stopat)
+		exit(0);
+
+	if (transaction && tdb_transaction_start(tdb))
+		errx(1, "starting transaction: %s", tdb_errorstr(tdb));
+
+	/* Traverse 1000 records. */
+	printf("Traversing %u records: ", num); fflush(stdout);
+	i = 0;
+	gettimeofday(&start, NULL);
+	if (tdb_traverse(tdb, count_record, &i) != num)
+		errx(1, "Traverse returned wrong number of records");
+	if (i != (num - 1) * (num / 2))
+		errx(1, "Traverse tallied to %u", i);
+	gettimeofday(&stop, NULL);
+	if (transaction && tdb_transaction_commit(tdb))
+		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (++stage == stopat)
+		exit(0);
+
+	if (transaction && tdb_transaction_start(tdb))
+		errx(1, "starting transaction: %s", tdb_errorstr(tdb));
+
+	/* Delete 1000 records (not in order). */
+	printf("Deleting %u records: ", num); fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (j = 0; j < num; j++) {
+		i = (j + 100003) % num;
+		if (tdb_delete(tdb, key) != 0)
+			errx(1, "Deleting key %u in tdb: %s",
+			     i, tdb_errorstr(tdb));
+	}
+	gettimeofday(&stop, NULL);
+	if (transaction && tdb_transaction_commit(tdb))
+		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (++stage == stopat)
+		exit(0);
+
+	if (transaction && tdb_transaction_start(tdb))
+		errx(1, "starting transaction: %s", tdb_errorstr(tdb));
+
+	/* Re-add 1000 records (not in order). */
+	printf("Re-adding %u records: ", num); fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (j = 0; j < num; j++) {
+		i = (j + 100003) % num;
+		if (tdb_store(tdb, key, data, TDB_INSERT) != 0)
+			errx(1, "Inserting key %u in tdb: %s",
+			     i, tdb_errorstr(tdb));
+	}
+	gettimeofday(&stop, NULL);
+	if (transaction && tdb_transaction_commit(tdb))
+		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (++stage == stopat)
+		exit(0);
+
+	if (transaction && tdb_transaction_start(tdb))
+		errx(1, "starting transaction: %s", tdb_errorstr(tdb));
+
+	/* Append 1000 records. */
+	printf("Appending %u records: ", num); fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (i = 0; i < num; i++)
+		if (tdb_append(tdb, key, data) != 0)
+			errx(1, "Appending key %u in tdb: %s",
+			     i, tdb_errorstr(tdb));
+	gettimeofday(&stop, NULL);
+	if (transaction && tdb_transaction_commit(tdb))
+		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (++stage == stopat)
+		exit(0);
+
+	if (transaction && tdb_transaction_start(tdb))
+		errx(1, "starting transaction: %s", tdb_errorstr(tdb));
+
+	/* Churn 1000 records: not in order! */
+	printf("Churning %u records: ", num); fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (j = 0; j < num; j++) {
+		i = (j + 1000019) % num;
+		if (tdb_delete(tdb, key) != 0)
+			errx(1, "Deleting key %u in tdb: %s",
+			     i, tdb_errorstr(tdb));
+		i += num;
+		if (tdb_store(tdb, key, data, TDB_INSERT) != 0)
+			errx(1, "Inserting key %u in tdb: %s",
+			     i, tdb_errorstr(tdb));
+	}
+	gettimeofday(&stop, NULL);
+	if (transaction && tdb_transaction_commit(tdb))
+		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+
+	return 0;
+}

+ 310 - 162
ccan/tdb2/check.c

@@ -43,25 +43,25 @@ static bool check_header(struct tdb_context *tdb, tdb_off_t *recovery)
 	hash_test = TDB_HASH_MAGIC;
 	hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
 	if (hdr.hash_test != hash_test) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "check: hash test %llu should be %llu\n",
-			 (long long)hdr.hash_test,
-			 (long long)hash_test);
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			   "check: hash test %llu should be %llu",
+			   (long long)hdr.hash_test,
+			   (long long)hash_test);
 		return false;
 	}
 
 	if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "check: bad magic '%.*s'\n",
-			 (unsigned)sizeof(hdr.magic_food), hdr.magic_food);
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			   "check: bad magic '%.*s'",
+			   (unsigned)sizeof(hdr.magic_food), hdr.magic_food);
 		return false;
 	}
 
 	*recovery = hdr.recovery;
 	if (*recovery) {
 		if (*recovery < sizeof(hdr) || *recovery > tdb->map_size) {
-			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-				 "tdb_check: invalid recovery offset %zu\n",
+			tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+				 "tdb_check: invalid recovery offset %zu",
 				 (size_t)*recovery);
 			return false;
 		}
@@ -77,7 +77,65 @@ static bool check_hash_tree(struct tdb_context *tdb,
 			    unsigned hprefix_bits,
 			    tdb_off_t used[],
 			    size_t num_used,
-			    size_t *num_found);
+			    size_t *num_found,
+			    int (*check)(TDB_DATA, TDB_DATA, void *),
+			    void *private_data);
+
+static bool check_hash_chain(struct tdb_context *tdb,
+			     tdb_off_t off,
+			     uint64_t hash,
+			     tdb_off_t used[],
+			     size_t num_used,
+			     size_t *num_found,
+			     int (*check)(TDB_DATA, TDB_DATA, void *),
+			     void *private_data)
+{
+	struct tdb_used_record rec;
+
+	if (tdb_read_convert(tdb, off, &rec, sizeof(rec)) == -1)
+		return false;
+
+	if (rec_magic(&rec) != TDB_CHAIN_MAGIC) {
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			   "tdb_check: Bad hash chain magic %llu",
+			   (long long)rec_magic(&rec));
+		return false;
+	}
+
+	if (rec_data_length(&rec) != sizeof(struct tdb_chain)) {
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			   "tdb_check: Bad hash chain length %llu vs %zu",
+			   (long long)rec_data_length(&rec),
+			   sizeof(struct tdb_chain));
+		return false;
+	}
+	if (rec_key_length(&rec) != 0) {
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			 "tdb_check: Bad hash chain key length %llu",
+			 (long long)rec_key_length(&rec));
+		return false;
+	}
+	if (rec_hash(&rec) != 0) {
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			 "tdb_check: Bad hash chain hash value %llu",
+			 (long long)rec_hash(&rec));
+		return false;
+	}
+
+	off += sizeof(rec);
+	if (!check_hash_tree(tdb, off, 0, hash, 64,
+			     used, num_used, num_found, check, private_data))
+		return false;
+
+	off = tdb_read_off(tdb, off + offsetof(struct tdb_chain, next));
+	if (off == TDB_OFF_ERR)
+		return false;
+	if (off == 0)
+		return true;
+	(*num_found)++;
+	return check_hash_chain(tdb, off, hash, used, num_used, num_found,
+				check, private_data);
+}
 
 static bool check_hash_record(struct tdb_context *tdb,
 			      tdb_off_t off,
@@ -85,30 +143,43 @@ static bool check_hash_record(struct tdb_context *tdb,
 			      unsigned hprefix_bits,
 			      tdb_off_t used[],
 			      size_t num_used,
-			      size_t *num_found)
+			      size_t *num_found,
+			      int (*check)(TDB_DATA, TDB_DATA, void *),
+			      void *private_data)
 {
 	struct tdb_used_record rec;
 
+	if (hprefix_bits >= 64)
+		return check_hash_chain(tdb, off, hprefix, used, num_used,
+					num_found, check, private_data);
+
 	if (tdb_read_convert(tdb, off, &rec, sizeof(rec)) == -1)
 		return false;
 
+	if (rec_magic(&rec) != TDB_HTABLE_MAGIC) {
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			   "tdb_check: Bad hash table magic %llu",
+			   (long long)rec_magic(&rec));
+		return false;
+	}
 	if (rec_data_length(&rec)
 	    != sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_check: Bad hash table length %llu vs %llu\n",
-			 (long long)rec_data_length(&rec),
-			 (long long)sizeof(tdb_off_t)<<TDB_SUBLEVEL_HASH_BITS);
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			   "tdb_check: Bad hash table length %llu vs %llu",
+			   (long long)rec_data_length(&rec),
+			   (long long)sizeof(tdb_off_t)
+			   << TDB_SUBLEVEL_HASH_BITS);
 		return false;
 	}
 	if (rec_key_length(&rec) != 0) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_check: Bad hash table key length %llu\n",
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			 "tdb_check: Bad hash table key length %llu",
 			 (long long)rec_key_length(&rec));
 		return false;
 	}
 	if (rec_hash(&rec) != 0) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_check: Bad hash table hash value %llu\n",
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			 "tdb_check: Bad hash table hash value %llu",
 			 (long long)rec_hash(&rec));
 		return false;
 	}
@@ -117,7 +188,7 @@ static bool check_hash_record(struct tdb_context *tdb,
 	return check_hash_tree(tdb, off,
 			       TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
 			       hprefix, hprefix_bits,
-			       used, num_used, num_found);
+			       used, num_used, num_found, check, private_data);
 }
 
 static int off_cmp(const tdb_off_t *a, const tdb_off_t *b)
@@ -141,7 +212,9 @@ static bool check_hash_tree(struct tdb_context *tdb,
 			    unsigned hprefix_bits,
 			    tdb_off_t used[],
 			    size_t num_used,
-			    size_t *num_found)
+			    size_t *num_found,
+			    int (*check)(TDB_DATA, TDB_DATA, void *),
+			    void *private_data)
 {
 	unsigned int g, b;
 	const tdb_off_t *hash;
@@ -166,16 +239,42 @@ static bool check_hash_tree(struct tdb_context *tdb,
 			off = group[b] & TDB_OFF_MASK;
 			p = asearch(&off, used, num_used, off_cmp);
 			if (!p) {
-				tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-					 "tdb_check: Invalid offset %llu "
-					 "in hash\n",
-					 (long long)off);
+				tdb_logerr(tdb, TDB_ERR_CORRUPT,
+					   TDB_DEBUG_ERROR,
+					   "tdb_check: Invalid offset %llu "
+					   "in hash", (long long)off);
 				goto fail;
 			}
 			/* Mark it invalid. */
 			*p ^= 1;
 			(*num_found)++;
 
+			if (hprefix_bits == 64) {
+				/* Chained entries are unordered. */
+				if (is_subhash(group[b])) {
+					tdb_logerr(tdb, TDB_ERR_CORRUPT,
+						   TDB_DEBUG_ERROR,
+						   "tdb_check: Invalid chain"
+						   " entry subhash");
+					goto fail;
+				}
+				h = hash_record(tdb, off);
+				if (h != hprefix) {
+					tdb_logerr(tdb, TDB_ERR_CORRUPT,
+						   TDB_DEBUG_ERROR,
+						   "check: bad hash chain"
+						   " placement"
+						   " 0x%llx vs 0x%llx",
+						   (long long)h,
+						   (long long)hprefix);
+					goto fail;
+				}
+				if (tdb_read_convert(tdb, off, &rec,
+						     sizeof(rec)))
+					goto fail;
+				goto check;
+			}
+
 			if (is_subhash(group[b])) {
 				uint64_t subprefix;
 				subprefix = (hprefix 
@@ -188,7 +287,8 @@ static bool check_hash_tree(struct tdb_context *tdb,
 					       hprefix_bits
 						       + group_bits
 						       + TDB_HASH_GROUP_BITS,
-					       used, num_used, num_found))
+					       used, num_used, num_found,
+					       check, private_data))
 					goto fail;
 				continue;
 			}
@@ -199,18 +299,20 @@ static bool check_hash_tree(struct tdb_context *tdb,
 			used_bits = 0;
 			if (get_bits(h, hprefix_bits, &used_bits) != hprefix
 			    && hprefix_bits) {
-				tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-					 "check: bad hash placement"
-					 " 0x%llx vs 0x%llx\n",
+				tdb_logerr(tdb, TDB_ERR_CORRUPT,
+					   TDB_DEBUG_ERROR,
+					   "check: bad hash placement"
+					   " 0x%llx vs 0x%llx",
 					 (long long)h, (long long)hprefix);
 				goto fail;
 			}
 
 			/* Does it belong in this group? */
 			if (get_bits(h, group_bits, &used_bits) != g) {
-				tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-					 "check: bad group %llu vs %u\n",
-					 (long long)h, g);
+				tdb_logerr(tdb, TDB_ERR_CORRUPT,
+					   TDB_DEBUG_ERROR,
+					   "check: bad group %llu vs %u",
+					   (long long)h, g);
 				goto fail;
 			}
 
@@ -219,11 +321,12 @@ static bool check_hash_tree(struct tdb_context *tdb,
 			if (get_bits(h, TDB_HASH_GROUP_BITS, &used_bits)
 			    != bucket) {
 				used_bits -= TDB_HASH_GROUP_BITS;
-				tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-					 "check: bad bucket %u vs %u\n",
+				tdb_logerr(tdb, TDB_ERR_CORRUPT,
+					   TDB_DEBUG_ERROR,
+					 "check: bad bucket %u vs %u",
 					 (unsigned)get_bits(h,
-							    TDB_HASH_GROUP_BITS,
-							    &used_bits),
+							TDB_HASH_GROUP_BITS,
+							&used_bits),
 					 bucket);
 				goto fail;
 			}
@@ -234,28 +337,46 @@ static bool check_hash_tree(struct tdb_context *tdb,
 			     i != b;
 			     i = (i + 1) % (1 << TDB_HASH_GROUP_BITS)) {
 				if (group[i] == 0) {
-					tdb->log(tdb, TDB_DEBUG_ERROR,
-						 tdb->log_priv,
-						 "check: bad group placement"
-						 " %u vs %u\n",
-						 b, bucket);
+					tdb_logerr(tdb, TDB_ERR_CORRUPT,
+						   TDB_DEBUG_ERROR,
+						   "check: bad group placement"
+						   " %u vs %u",
+						   b, bucket);
 					goto fail;
 				}
 			}
 
-			if (tdb_read_convert(tdb, off, &rec, sizeof(rec)) == -1)
+			if (tdb_read_convert(tdb, off, &rec, sizeof(rec)))
 				goto fail;
 
 			/* Bottom bits must match header. */
 			if ((h & ((1 << 11)-1)) != rec_hash(&rec)) {
-				tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-					 "tdb_check: Bad hash magic at"
-					 " offset %llu (0x%llx vs 0x%llx)\n",
-					 (long long)off,
-					 (long long)h,
-					 (long long)rec_hash(&rec));
+				tdb_logerr(tdb, TDB_ERR_CORRUPT,
+					   TDB_DEBUG_ERROR,
+					   "tdb_check: Bad hash magic at"
+					   " offset %llu (0x%llx vs 0x%llx)",
+					   (long long)off,
+					   (long long)h,
+					   (long long)rec_hash(&rec));
 				goto fail;
 			}
+
+		check:
+			if (check) {
+				TDB_DATA key, data;
+				key.dsize = rec_key_length(&rec);
+				data.dsize = rec_data_length(&rec);
+				key.dptr = (void *)tdb_access_read(tdb,
+						   off + sizeof(rec),
+						   key.dsize + data.dsize,
+						   false);
+				if (!key.dptr)
+					goto fail;
+				data.dptr = key.dptr + key.dsize;
+				if (check(key, data, private_data) != 0)
+					goto fail;
+				tdb_access_release(tdb, key.dptr);
+			}
 		}
 	}
 	tdb_access_release(tdb, hash);
@@ -268,19 +389,22 @@ fail:
 
 static bool check_hash(struct tdb_context *tdb,
 		       tdb_off_t used[],
-		       size_t num_used, size_t num_flists)
+		       size_t num_used, size_t num_ftables,
+		       int (*check)(TDB_DATA, TDB_DATA, void *),
+		       void *private_data)
 {
-	/* Free lists also show up as used. */
-	size_t num_found = num_flists;
+	/* Free tables also show up as used. */
+	size_t num_found = num_ftables;
 
 	if (!check_hash_tree(tdb, offsetof(struct tdb_header, hashtable),
 			     TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
-			     0, 0, used, num_used, &num_found))
+			     0, 0, used, num_used, &num_found,
+			     check, private_data))
 		return false;
 
 	if (num_found != num_used) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_check: Not all entries are in hash\n");
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			   "tdb_check: Not all entries are in hash");
 		return false;
 	}
 	return true;
@@ -289,62 +413,63 @@ static bool check_hash(struct tdb_context *tdb,
 static bool check_free(struct tdb_context *tdb,
 		       tdb_off_t off,
 		       const struct tdb_free_record *frec,
-		       tdb_off_t prev, tdb_off_t flist_off, unsigned int bucket)
+		       tdb_off_t prev, unsigned int ftable,
+		       unsigned int bucket)
 {
 	if (frec_magic(frec) != TDB_FREE_MAGIC) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_check: offset %llu bad magic 0x%llx\n",
-			 (long long)off, (long long)frec->magic_and_meta);
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			   "tdb_check: offset %llu bad magic 0x%llx",
+			   (long long)off, (long long)frec->magic_and_prev);
 		return false;
 	}
-	if (frec_flist(frec) != flist_off) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_check: offset %llu bad freelist 0x%llx\n",
-			 (long long)off, (long long)frec_flist(frec));
+	if (frec_ftable(frec) != ftable) {
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			   "tdb_check: offset %llu bad freetable %u",
+			   (long long)off, frec_ftable(frec));
 		return false;
 	}
 
 	if (tdb->methods->oob(tdb, off
-			      + frec->data_len+sizeof(struct tdb_used_record),
+			      + frec_len(frec) + sizeof(struct tdb_used_record),
 			      false))
 		return false;
-	if (size_to_bucket(frec->data_len) != bucket) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_check: offset %llu in wrong bucket %u vs %u\n",
-			 (long long)off,
-			 bucket, size_to_bucket(frec->data_len));
+	if (size_to_bucket(frec_len(frec)) != bucket) {
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			   "tdb_check: offset %llu in wrong bucket %u vs %u",
+			   (long long)off,
+			   bucket, size_to_bucket(frec_len(frec)));
 		return false;
 	}
-	if (prev != frec->prev) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_check: offset %llu bad prev %llu vs %llu\n",
-			 (long long)off,
-			 (long long)prev, (long long)frec->prev);
+	if (prev != frec_prev(frec)) {
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			   "tdb_check: offset %llu bad prev %llu vs %llu",
+			   (long long)off,
+			   (long long)prev, (long long)frec_len(frec));
 		return false;
 	}
 	return true;
 }
 		       
-static bool check_free_list(struct tdb_context *tdb,
-			    tdb_off_t flist_off,
-			    tdb_off_t free[],
-			    size_t num_free,
-			    size_t *num_found)
+static bool check_free_table(struct tdb_context *tdb,
+			     tdb_off_t ftable_off,
+			     unsigned ftable_num,
+			     tdb_off_t free[],
+			     size_t num_free,
+			     size_t *num_found)
 {
-	struct tdb_freelist flist;
+	struct tdb_freetable ft;
 	tdb_off_t h;
 	unsigned int i;
 
-	if (tdb_read_convert(tdb, flist_off, &flist, sizeof(flist)) == -1)
+	if (tdb_read_convert(tdb, ftable_off, &ft, sizeof(ft)) == -1)
 		return false;
 
-	if (rec_magic(&flist.hdr) != TDB_MAGIC
-	    || rec_key_length(&flist.hdr) != 0
-	    || rec_data_length(&flist.hdr) != sizeof(flist) - sizeof(flist.hdr)
-	    || rec_hash(&flist.hdr) != 1) {
-		tdb->log(tdb, TDB_DEBUG_ERROR,
-			 tdb->log_priv,
-			 "tdb_check: Invalid header on free list\n");
+	if (rec_magic(&ft.hdr) != TDB_FTABLE_MAGIC
+	    || rec_key_length(&ft.hdr) != 0
+	    || rec_data_length(&ft.hdr) != sizeof(ft) - sizeof(ft.hdr)
+	    || rec_hash(&ft.hdr) != 0) {
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			   "tdb_check: Invalid header on free table");
 		return false;
 	}
 
@@ -352,23 +477,23 @@ static bool check_free_list(struct tdb_context *tdb,
 		tdb_off_t off, prev = 0, *p;
 		struct tdb_free_record f;
 
-		h = bucket_off(flist_off, i);
+		h = bucket_off(ftable_off, i);
 		for (off = tdb_read_off(tdb, h); off; off = f.next) {
 			if (off == TDB_OFF_ERR)
 				return false;
 			if (tdb_read_convert(tdb, off, &f, sizeof(f)))
 				return false;
-			if (!check_free(tdb, off, &f, prev, flist_off, i))
+			if (!check_free(tdb, off, &f, prev, ftable_num, i))
 				return false;
 
 			/* FIXME: Check hash bits */
 			p = asearch(&off, free, num_free, off_cmp);
 			if (!p) {
-				tdb->log(tdb, TDB_DEBUG_ERROR,
-					 tdb->log_priv,
-					 "tdb_check: Invalid offset"
-					 " %llu in free table\n",
-					 (long long)off);
+				tdb_logerr(tdb, TDB_ERR_CORRUPT,
+					   TDB_DEBUG_ERROR,
+					   "tdb_check: Invalid offset"
+					   " %llu in free table",
+					   (long long)off);
 				return false;
 			}
 			/* Mark it invalid. */
@@ -381,7 +506,7 @@ static bool check_free_list(struct tdb_context *tdb,
 }
 
 /* Slow, but should be very rare. */
-static size_t dead_space(struct tdb_context *tdb, tdb_off_t off)
+size_t dead_space(struct tdb_context *tdb, tdb_off_t off)
 {
 	size_t len;
 
@@ -409,113 +534,135 @@ static bool check_linear(struct tdb_context *tdb,
 			struct tdb_used_record u;
 			struct tdb_free_record f;
 			struct tdb_recovery_record r;
-		} pad, *p;
-		p = tdb_get(tdb, off, &pad, sizeof(pad));
-		if (!p)
+		} rec;
+		/* r is larger: only get that if we need to. */
+		if (tdb_read_convert(tdb, off, &rec, sizeof(rec.f)) == -1)
 			return false;
 
 		/* If we crash after ftruncate, we can get zeroes or fill. */
-		if (p->r.magic == TDB_RECOVERY_INVALID_MAGIC
-		    || p->r.magic ==  0x4343434343434343ULL) {
+		if (rec.r.magic == TDB_RECOVERY_INVALID_MAGIC
+		    || rec.r.magic ==  0x4343434343434343ULL) {
+			if (tdb_read_convert(tdb, off, &rec, sizeof(rec.r)))
+				return false;
+
 			if (recovery == off) {
 				found_recovery = true;
-				len = sizeof(p->r) + p->r.max_len;
+				len = sizeof(rec.r) + rec.r.max_len;
 			} else {
 				len = dead_space(tdb, off);
-				if (len < sizeof(p->r)) {
-					tdb->log(tdb, TDB_DEBUG_ERROR,
-						 tdb->log_priv,
-						 "tdb_check: invalid dead space"
-						 " at %zu\n", (size_t)off);
+				if (len < sizeof(rec.r)) {
+					tdb_logerr(tdb, TDB_ERR_CORRUPT,
+						   TDB_DEBUG_ERROR,
+						   "tdb_check: invalid dead"
+						   " space at %zu",
+						   (size_t)off);
 					return false;
 				}
 
-				tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv,
-					 "Dead space at %zu-%zu (of %zu)\n",
-					 (size_t)off, (size_t)(off + len),
-					 (size_t)tdb->map_size);
+				tdb_logerr(tdb, TDB_SUCCESS, TDB_DEBUG_WARNING,
+					   "Dead space at %zu-%zu (of %zu)",
+					   (size_t)off, (size_t)(off + len),
+					   (size_t)tdb->map_size);
 			}
-		} else if (p->r.magic == TDB_RECOVERY_MAGIC) {
+		} else if (rec.r.magic == TDB_RECOVERY_MAGIC) {
+			if (tdb_read_convert(tdb, off, &rec, sizeof(rec.r)))
+				return false;
 			if (recovery != off) {
-				tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-					 "tdb_check: unexpected recovery"
-					 " record at offset %zu\n",
-					 (size_t)off);
+				tdb_logerr(tdb, TDB_ERR_CORRUPT,
+					   TDB_DEBUG_ERROR,
+					   "tdb_check: unexpected recovery"
+					   " record at offset %zu",
+					   (size_t)off);
+				return false;
+			}
+			if (rec.r.len > rec.r.max_len) {
+				tdb_logerr(tdb, TDB_ERR_CORRUPT,
+					   TDB_DEBUG_ERROR,
+					   "tdb_check: invalid recovery length"
+					   " %zu", (size_t)rec.r.len);
+				return false;
+			}
+			if (rec.r.eof > tdb->map_size) {
+				tdb_logerr(tdb, TDB_ERR_CORRUPT,
+					   TDB_DEBUG_ERROR,
+					   "tdb_check: invalid old EOF"
+					   " %zu", (size_t)rec.r.eof);
 				return false;
 			}
 			found_recovery = true;
-			len = sizeof(p->r) + p->r.max_len;
-		} else if (frec_magic(&p->f) == TDB_FREE_MAGIC
-			   || frec_magic(&p->f) == TDB_COALESCING_MAGIC) {
-			len = sizeof(p->u) + p->f.data_len;
+			len = sizeof(rec.r) + rec.r.max_len;
+		} else if (frec_magic(&rec.f) == TDB_FREE_MAGIC) {
+			len = sizeof(rec.u) + frec_len(&rec.f);
 			if (off + len > tdb->map_size) {
-				tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-					 "tdb_check: free overlength %llu"
-					 " at offset %llu\n",
-					 (long long)len, (long long)off);
+				tdb_logerr(tdb, TDB_ERR_CORRUPT,
+					   TDB_DEBUG_ERROR,
+					   "tdb_check: free overlength %llu"
+					   " at offset %llu",
+					   (long long)len, (long long)off);
 				return false;
 			}
-			/* This record is free! */
-			if (frec_magic(&p->f) == TDB_FREE_MAGIC
+			/* This record should be in free lists. */
+			if (frec_ftable(&rec.f) != TDB_FTABLE_NONE
 			    && !append(free, num_free, off))
 				return false;
-		} else {
+		} else if (rec_magic(&rec.u) == TDB_USED_MAGIC
+			   || rec_magic(&rec.u) == TDB_CHAIN_MAGIC
+			   || rec_magic(&rec.u) == TDB_HTABLE_MAGIC
+			   || rec_magic(&rec.u) == TDB_FTABLE_MAGIC) {
 			uint64_t klen, dlen, extra;
 
 			/* This record is used! */
-			if (rec_magic(&p->u) != TDB_MAGIC) {
-				tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-					 "tdb_check: Bad magic 0x%llx"
-					 " at offset %llu\n",
-					 (long long)rec_magic(&p->u),
-					 (long long)off);
-				return false;
-			}
-
 			if (!append(used, num_used, off))
 				return false;
 
-			klen = rec_key_length(&p->u);
-			dlen = rec_data_length(&p->u);
-			extra = rec_extra_padding(&p->u);
+			klen = rec_key_length(&rec.u);
+			dlen = rec_data_length(&rec.u);
+			extra = rec_extra_padding(&rec.u);
 
-			len = sizeof(p->u) + klen + dlen + extra;
+			len = sizeof(rec.u) + klen + dlen + extra;
 			if (off + len > tdb->map_size) {
-				tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-					 "tdb_check: used overlength %llu"
-					 " at offset %llu\n",
-					 (long long)len, (long long)off);
+				tdb_logerr(tdb, TDB_ERR_CORRUPT,
+					   TDB_DEBUG_ERROR,
+					   "tdb_check: used overlength %llu"
+					   " at offset %llu",
+					   (long long)len, (long long)off);
 				return false;
 			}
 
-			if (len < sizeof(p->f)) {
-				tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-					 "tdb_check: too short record %llu at"
-					 " %llu\n",
-					 (long long)len, (long long)off);
+			if (len < sizeof(rec.f)) {
+				tdb_logerr(tdb, TDB_ERR_CORRUPT,
+					   TDB_DEBUG_ERROR,
+					   "tdb_check: too short record %llu"
+					   " at %llu",
+					   (long long)len, (long long)off);
 				return false;
 			}
+		} else {
+			tdb_logerr(tdb, TDB_ERR_CORRUPT,
+				   TDB_DEBUG_ERROR,
+				   "tdb_check: Bad magic 0x%llx at offset %zu",
+				   (long long)rec_magic(&rec.u), (size_t)off);
+			return false;
 		}
 	}
 
 	/* We must have found recovery area if there was one. */
 	if (recovery != 0 && !found_recovery) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_check: expected a recovery area at %zu\n",
-			 (size_t)recovery);
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			   "tdb_check: expected a recovery area at %zu",
+			   (size_t)recovery);
 		return false;
 	}
 
 	return true;
 }
 
-/* FIXME: call check() function. */
 int tdb_check(struct tdb_context *tdb,
 	      int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
 	      void *private_data)
 {
-	tdb_off_t *free = NULL, *used = NULL, flist, recovery;
-	size_t num_free = 0, num_used = 0, num_found = 0, num_flists = 0;
+	tdb_off_t *free = NULL, *used = NULL, ft, recovery;
+	size_t num_free = 0, num_used = 0, num_found = 0, num_ftables = 0;
 
 	if (tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false) != 0)
 		return -1;
@@ -532,22 +679,23 @@ int tdb_check(struct tdb_context *tdb,
 	if (!check_linear(tdb, &used, &num_used, &free, &num_free, recovery))
 		goto fail;
 
-	for (flist = first_flist(tdb); flist; flist = next_flist(tdb, flist)) {
-		if (flist == TDB_OFF_ERR)
+	for (ft = first_ftable(tdb); ft; ft = next_ftable(tdb, ft)) {
+		if (ft == TDB_OFF_ERR)
 			goto fail;
-		if (!check_free_list(tdb, flist, free, num_free, &num_found))
+		if (!check_free_table(tdb, ft, num_ftables, free, num_free,
+				      &num_found))
 			goto fail;
-		num_flists++;
+		num_ftables++;
 	}
 
 	/* FIXME: Check key uniqueness? */
-	if (!check_hash(tdb, used, num_used, num_flists))
+	if (!check_hash(tdb, used, num_used, num_ftables, check, private_data))
 		goto fail;
 
 	if (num_found != num_free) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_check: Not all entries are in free table\n");
-		return false;
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
+			   "tdb_check: Not all entries are in free table");
+		return -1;
 	}
 
 	tdb_allrecord_unlock(tdb, F_RDLCK);

File diff suppressed because it is too large
+ 242 - 273
ccan/tdb2/doc/design.lyx


File diff suppressed because it is too large
+ 244 - 270
ccan/tdb2/doc/design.lyx,v


BIN
ccan/tdb2/doc/design.pdf


+ 149 - 68
ccan/tdb2/doc/design.txt

@@ -2,7 +2,7 @@ TDB2: A Redesigning The Trivial DataBase
 
 Rusty Russell, IBM Corporation
 
-14-September-2010
+1-December-2010
 
 Abstract
 
@@ -129,6 +129,10 @@ union tdb_attribute {
 This allows future attributes to be added, even if this expands 
 the size of the union.
 
+2.1.2 Status
+
+Complete.
+
 2.2 tdb_traverse Makes Impossible Guarantees
 
 tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, 
@@ -148,6 +152,11 @@ occur during your traversal, otherwise you will see some subset.
 You can prevent changes by using a transaction or the locking 
 API.
 
+2.2.2 Status
+
+Complete. Delete-during-traverse will still delete every record, 
+too (assuming no other changes).
+
 2.3 Nesting of Transactions Is Fraught
 
 TDB has alternated between allowing nested transactions and not 
@@ -182,6 +191,10 @@ However, this behavior can be simulated with a wrapper which uses
 tdb_add_flags() and tdb_remove_flags(), so the API should not be 
 expanded for this relatively-obscure case.
 
+2.3.2 Status
+
+Incomplete; nesting flag is still defined as per tdb1.
+
 2.4 Incorrect Hash Function is Not Detected
 
 tdb_open_ex() allows the calling code to specify a different hash 
@@ -195,6 +208,10 @@ The header should contain an example hash result (eg. the hash of
 0xdeadbeef), and tdb_open_ex() should check that the given hash 
 function produces the same answer, or fail the tdb_open call.
 
+2.4.2 Status
+
+Complete.
+
 2.5 tdb_set_max_dead/TDB_VOLATILE Expose Implementation
 
 In response to scalability issues with the free list ([TDB-Freelist-Is]
@@ -216,6 +233,11 @@ hint that store and delete of records will be at least as common
 as fetch in order to allow some internal tuning, but initially 
 will become a no-op.
 
+2.5.2 Status
+
+Incomplete. TDB_VOLATILE still defined, but implementation should 
+fail on unknown flags to be future-proof.
+
 2.6 <TDB-Files-Cannot>TDB Files Cannot Be Opened Multiple Times 
   In The Same Process
 
@@ -251,6 +273,10 @@ whether re-opening is allowed, as though there may be some
 benefit to adding a call to detect when a tdb_context is shared, 
 to allow other to create such an API.
 
+2.6.2 Status
+
+Incomplete.
+
 2.7 TDB API Is Not POSIX Thread-safe
 
 The TDB API uses an error code which can be queried after an 
@@ -281,6 +307,10 @@ will exist. Alternatively, a hooking mechanism similar to that
 proposed for [Proposed-Solution-locking-hook] could be used to 
 enable pthread locking at runtime.
 
+2.7.2 Status
+
+Incomplete.
+
 2.8 *_nonblock Functions And *_mark Functions Expose 
   Implementation
 
@@ -343,6 +373,10 @@ locks it doesn't need to obtain.
 It also keeps the complexity out of the API, and in ctdbd where 
 it is needed.
 
+2.8.2 Status
+
+Incomplete.
+
 2.9 tdb_chainlock Functions Expose Implementation
 
 tdb_chainlock locks some number of records, including the record 
@@ -391,6 +425,10 @@ EINVAL if the signal occurs before the kernel is entered,
 otherwise EAGAIN.
 ]
 
+2.10.2 Status
+
+Incomplete.
+
 2.11 The API Uses Gratuitous Typedefs, Capitals
 
 typedefs are useful for providing source compatibility when types 
@@ -433,6 +471,10 @@ the tdb_open_ex for logging.
 It should simply take an extra argument, since we are prepared to 
 break the API/ABI.
 
+2.12.2 Status
+
+Complete.
+
 2.13 Various Callback Functions Are Not Typesafe
 
 The callback functions in tdb_set_logging_function (after [tdb_log_func-Doesnt-Take]
@@ -455,6 +497,10 @@ their parameter.
 See CCAN's typesafe_cb module at 
 http://ccan.ozlabs.org/info/typesafe_cb.html
 
+2.13.2 Status
+
+Incomplete.
+
 2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, 
   tdb_reopen_all Problematic
 
@@ -475,6 +521,11 @@ it alone has opened the TDB and will erase it.
 Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but 
 see [TDB_CLEAR_IF_FIRST-Imposes-Performance].
 
+2.14.2 Status
+
+Incomplete, TDB_CLEAR_IF_FIRST still defined, but not 
+implemented.
+
 2.15 Extending The Header Is Difficult
 
 We have reserved (zeroed) words in the TDB header, which can be 
@@ -505,6 +556,10 @@ This should allow backwards-compatible features to be added, and
 detection if older code (which doesn't understand the feature) 
 writes to the database.
 
+2.15.2 Status
+
+Incomplete.
+
 2.16 Record Headers Are Not Expandible
 
 If we later want to add (say) checksums on keys and data, it 
@@ -519,6 +574,10 @@ understand a new format: the new code would write (say) a 1 at
 the tail, and thus if there is no tail or the first byte is 0, we 
 would know the extension is not present on that record.
 
+2.16.2 Status
+
+Incomplete.
+
 2.17 TDB Does Not Use Talloc
 
 Many users of TDB (particularly Samba) use the talloc allocator, 
@@ -541,6 +600,10 @@ returned from tdb_open to close it. All TDB_DATA fields would be
 children of the tdb_context, and the caller would still have to 
 manage them (using talloc_free() or talloc_steal()).
 
+2.17.2 Status
+
+Deferred.
+
 3 Performance And Scalability Issues
 
 3.1 <TDB_CLEAR_IF_FIRST-Imposes-Performance>TDB_CLEAR_IF_FIRST 
@@ -570,6 +633,10 @@ Remove the flag. It was a neat idea, but even trivial servers
 tend to know when they are initializing for the first time and 
 can simply unlink the old tdb at that point.
 
+3.1.2 Status
+
+Incomplete; TDB_CLEAR_IF_FIRST still defined, but does nothing.
+
 3.2 TDB Files Have a 4G Limit
 
 This seems to be becoming an issue (so much for “trivial”!), 
@@ -596,6 +663,10 @@ Old versions of tdb will fail to open the new TDB files (since 28
 August 2009, commit 398d0c29290: prior to that any unrecognized 
 file format would be erased and initialized as a fresh tdb!)
 
+3.2.2 Status
+
+Complete.
+
 3.3 TDB Records Have a 4G Limit
 
 This has not been a reported problem, and the API uses size_t 
@@ -610,6 +681,10 @@ implementation would return TDB_ERR_OOM in a similar case). It
 seems unlikely that 32 bit keys will be a limitation, so the 
 implementation may not support this (see [sub:Records-Incur-A]).
 
+3.3.2 Status
+
+Complete.
+
 3.4 Hash Size Is Determined At TDB Creation Time
 
 TDB contains a number of hash chains in the header; the number is 
@@ -628,20 +703,9 @@ This was annoying because I was previously convinced that an
 expanding tree of hashes would be very close to optimal.
 ], it became clear that it is hard to beat a straight linear hash 
 table which doubles in size when it reaches saturation. 
-
-1. 
-
-2. 
-
-3. 
-
-
-
-
-
- Unfortunately, altering the hash table introduces serious 
-locking complications: the entire hash table needs to be locked 
-to enlarge the hash table, and others might be holding locks. 
+Unfortunately, altering the hash table introduces serious locking 
+complications: the entire hash table needs to be locked to 
+enlarge the hash table, and others might be holding locks. 
 Particularly insidious are insertions done under tdb_chainlock.
 
 Thus an expanding layered hash will be used: an array of hash 
@@ -662,6 +726,10 @@ means we can choose not to re-hash all entries when we expand a
 hash group; simply use the next bits we need and mark them 
 invalid.
 
+3.4.2 Status
+
+Complete.
+
 3.5 <TDB-Freelist-Is>TDB Freelist Is Highly Contended
 
 TDB uses a single linked list for the free list. Allocation 
@@ -749,45 +817,45 @@ There are various benefits in using per-size free lists (see [sub:TDB-Becomes-Fr
 case where all processes are allocating/freeing the same size. 
 Thus we almost certainly need to divide in other ways: the most 
 obvious is to divide the file into zones, and using a free list 
-(or set of free lists) for each. This approximates address 
+(or table of free lists) for each. This approximates address 
 ordering.
 
-Note that this means we need to split the free lists when we 
-expand the file; this is probably acceptable when we double the 
-hash table size, since that is such an expensive operation 
-already. In the case of increasing the file size, there is an 
-optimization we can use: if we use M in the formula above as the 
-file size rounded up to the next power of 2, we only need 
-reshuffle free lists when the file size crosses a power of 2 
-boundary, and reshuffling the free lists is trivial: we simply 
-merge every consecutive pair of free lists.
+Unfortunately it is difficult to know what heuristics should be 
+used to determine zone sizes, and our transaction code relies on 
+being able to create a “recovery area” by simply appending to the 
+file (difficult if it would need to create a new zone header). 
+Thus we use a linked-list of free tables; currently we only ever 
+create one, but if there is more than one we choose one at random 
+to use. In future we may use heuristics to add new free tables on 
+contention. We only expand the file when all free tables are 
+exhausted.
 
 The basic algorithm is as follows. Freeing is simple:
 
-1. Identify the correct zone.
+1. Identify the correct free list.
 
 2. Lock the corresponding list.
 
-3. Re-check the zone (we didn't have a lock, sizes could have 
+3. Re-check the list (we didn't have a lock, sizes could have 
   changed): relock if necessary.
 
-4. Place the freed entry in the list for that zone.
+4. Place the freed entry in the list.
 
 Allocation is a little more complicated, as we perform delayed 
 coalescing at this point:
 
-1. Pick a zone either the zone we last freed into, or based on a “
-  random” number.
+1. Pick a free table; usually the previous one.
 
 2. Lock the corresponding list.
 
-3. Re-check the zone: relock if necessary.
-
-4. If the top entry is -large enough, remove it from the list and 
+3. If the top entry is -large enough, remove it from the list and 
   return it.
 
-5. Otherwise, coalesce entries in the list.If there was no entry 
-  large enough, unlock the list and try the next zone.
+4. Otherwise, coalesce entries in the list. If there was no entry 
+  large enough, unlock the list and try the next largest list.
+
+5. If no list has an entry which meets our needs, try the next 
+  free table.
 
 6. If no zone satisfies, expand the file.
 
@@ -798,24 +866,9 @@ ordering seems to be fairly good for keeping fragmentation low
 does not need a tailer to coalesce, though if we needed one we 
 could have one cheaply: see [sub:Records-Incur-A]. 
 
-I anticipate that the number of entries in each free zone would 
-be small, but it might be worth using one free entry to hold 
-pointers to the others for cache efficiency.
-
-<freelist-in-zone>If we want to avoid locking complexity 
-(enlarging the free lists when we enlarge the file) we could 
-place the array of free lists at the beginning of each zone. This 
-means existing array lists never move, but means that a record 
-cannot be larger than a zone. That in turn implies that zones 
-should be variable sized (say, power of 2), which makes the 
-question “what zone is this record in?” much harder (and “pick a 
-random zone”, but that's less common). It could be done with as 
-few as 4 bits from the record header.[footnote:
-Using 2^{16+N*3}means 0 gives a minimal 65536-byte zone, 15 gives 
-the maximal 2^{61} byte zone. Zones range in factor of 8 steps. 
-Given the zone size for the zone the current record is in, we can 
-determine the start of the zone.
-]
+Each free entry has the free table number in the header: less 
+than 255. It also contains a doubly-linked list for easy 
+deletion.
 
 3.6 <sub:TDB-Becomes-Fragmented>TDB Becomes Fragmented
 
@@ -944,13 +997,13 @@ This produces a 16 byte used header like this:
 
 struct tdb_used_record {
 
-        uint32_t magic : 16,
+        uint32_t used_magic : 16,
+
 
-                 prev_is_free: 1,
 
                  key_data_divide: 5,
 
-                 top_hash: 10;
+                 top_hash: 11;
 
         uint32_t extra_octets;
 
@@ -962,21 +1015,27 @@ And a free record like this:
 
 struct tdb_free_record {
 
-        uint32_t free_magic;
+        uint64_t free_magic: 8,
+
+                   prev : 56;
 
-        uint64_t total_length;
 
-        uint64_t prev, next;
 
-        ...
+        uint64_t free_table: 8,
 
-        uint64_t tailer;
+                 total_length : 56;
+
+        uint64_t next;
 
 };
 
-We might want to take some bits from the used record's top_hash 
-(and the free record which has 32 bits of padding to spare 
-anyway) if we use variable sized zones. See [freelist-in-zone].
+Note that by limiting valid offsets to 56 bits, we can pack 
+everything we need into 3 64-bit words, meaning our minimum 
+record size is 8 bytes.
+
+3.7.2 Status
+
+Complete.
 
 3.8 Transaction Commit Requires 4 fdatasync
 
@@ -1029,12 +1088,14 @@ but need only be done at open. For running databases, a separate
 header field can be used to indicate a transaction in progress; 
 we need only check for recovery if this is set.
 
-3.9 <sub:TDB-Does-Not>TDB Does Not Have Snapshot Support
+3.8.2 Status
 
-3.9.1 Proposed Solution
+Deferred.
 
-None. At some point you say “use a real database”  (but see [replay-attribute]
-).
+3.9 <sub:TDB-Does-Not>TDB Does Not Have Snapshot Support
+
+3.9.1 Proposed Solution
+
+None. At some point you say “use a real database” (but see 
+[replay-attribute]).
 
 But as a thought experiment, if we implemented transactions to 
 only overwrite free entries (this is tricky: there must not be a 
@@ -1053,6 +1114,10 @@ rewrite some sections of the hash, too.
 We could then implement snapshots using a similar method, using 
 multiple different hash tables/free tables.
 
+3.9.2 Status
+
+Deferred.
+
 3.10 Transactions Cannot Operate in Parallel
 
 This would be useless for ldb, as it hits the index records with 
@@ -1069,6 +1134,10 @@ allow one write transaction to begin, but it could not commit
 until all r/o transactions are done. This would require a new 
 RO_TRANSACTION_LOCK, which would be upgraded on commit.
 
+3.10.2 Status
+
+Deferred.
+
 3.11 Default Hash Function Is Suboptimal
 
 The Knuth-inspired multiplicative hash used by tdb is fairly slow 
@@ -1090,6 +1159,10 @@ The seed should be created at tdb-creation time from some random
 source, and placed in the header. This is far from foolproof, but 
 adds a little bit of protection against hash bombing.
 
+3.11.2 Status
+
+Complete.
+
 3.12 <Reliable-Traversal-Adds>Reliable Traversal Adds Complexity
 
 We lock a record during traversal iteration, and try to grab that 
@@ -1104,6 +1177,10 @@ indefinitely.
 
 Remove reliability guarantees; see [traverse-Proposed-Solution].
 
+3.12.2 Status
+
+Complete.
+
 3.13 Fcntl Locking Adds Overhead
 
 Placing a fcntl lock means a system call, as does removing one. 
@@ -1176,3 +1253,7 @@ tdb_open (see [attributes]) to provide replay/trace hooks, which
 could become the basis for this and future parallel transactions 
 and snapshot support.
 
+3.15.2 Status
+
+Deferred.
+

+ 187 - 123
ccan/tdb2/free.c

@@ -49,23 +49,24 @@ unsigned int size_to_bucket(tdb_len_t data_len)
 	return bucket;
 }
 
-tdb_off_t first_flist(struct tdb_context *tdb)
+tdb_off_t first_ftable(struct tdb_context *tdb)
 {
-	return tdb_read_off(tdb, offsetof(struct tdb_header, free_list));
+	return tdb_read_off(tdb, offsetof(struct tdb_header, free_table));
 }
 
-tdb_off_t next_flist(struct tdb_context *tdb, tdb_off_t flist)
+tdb_off_t next_ftable(struct tdb_context *tdb, tdb_off_t ftable)
 {
-	return tdb_read_off(tdb, flist + offsetof(struct tdb_freelist, next));
+	return tdb_read_off(tdb, ftable + offsetof(struct tdb_freetable,next));
 }
 
-int tdb_flist_init(struct tdb_context *tdb)
+int tdb_ftable_init(struct tdb_context *tdb)
 {
 	/* Use reservoir sampling algorithm to select a free list at random. */
-	unsigned int rnd, max = 0;
+	unsigned int rnd, max = 0, count = 0;
 	tdb_off_t off;
 
-	tdb->flist_off = off = first_flist(tdb);
+	tdb->ftable_off = off = first_ftable(tdb);
+	tdb->ftable = 0;
 
 	while (off) {
 		if (off == TDB_OFF_ERR)
@@ -73,50 +74,52 @@ int tdb_flist_init(struct tdb_context *tdb)
 
 		rnd = random();
 		if (rnd >= max) {
-			tdb->flist_off = off;
+			tdb->ftable_off = off;
+			tdb->ftable = count;
 			max = rnd;
 		}
 
-		off = next_flist(tdb, off);
+		off = next_ftable(tdb, off);
+		count++;
 	}
 	return 0;
 }
 
 /* Offset of a given bucket. */
-tdb_off_t bucket_off(tdb_off_t flist_off, unsigned bucket)
+tdb_off_t bucket_off(tdb_off_t ftable_off, unsigned bucket)
 {
-	return flist_off + offsetof(struct tdb_freelist, buckets)
+	return ftable_off + offsetof(struct tdb_freetable, buckets)
 		+ bucket * sizeof(tdb_off_t);
 }
 
 /* Returns free_buckets + 1, or list number to search. */
 static tdb_off_t find_free_head(struct tdb_context *tdb,
-				tdb_off_t flist_off,
+				tdb_off_t ftable_off,
 				tdb_off_t bucket)
 {
 	/* Speculatively search for a non-zero bucket. */
-	return tdb_find_nonzero_off(tdb, bucket_off(flist_off, 0),
+	return tdb_find_nonzero_off(tdb, bucket_off(ftable_off, 0),
 				    bucket, TDB_FREE_BUCKETS);
 }
 
 /* Remove from free bucket. */
 static int remove_from_list(struct tdb_context *tdb,
 			    tdb_off_t b_off, tdb_off_t r_off,
-			    struct tdb_free_record *r)
+			    const struct tdb_free_record *r)
 {
 	tdb_off_t off;
 
 	/* Front of list? */
-	if (r->prev == 0) {
+	if (frec_prev(r) == 0) {
 		off = b_off;
 	} else {
-		off = r->prev + offsetof(struct tdb_free_record, next);
+		off = frec_prev(r) + offsetof(struct tdb_free_record, next);
 	}
 
 #ifdef DEBUG
 	if (tdb_read_off(tdb, off) != r_off) {
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-			 "remove_from_list: %llu bad prev in list %llu\n",
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_FATAL,
+			 "remove_from_list: %llu bad prev in list %llu",
 			 (long long)r_off, (long long)b_off);
 		return -1;
 	}
@@ -128,19 +131,19 @@ static int remove_from_list(struct tdb_context *tdb,
 	}
 
 	if (r->next != 0) {
-		off = r->next + offsetof(struct tdb_free_record, prev);
+		off = r->next + offsetof(struct tdb_free_record,magic_and_prev);
 		/* r->next->prev = r->prev */
 
 #ifdef DEBUG
-		if (tdb_read_off(tdb, off) != r_off) {
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-				 "remove_from_list: %llu bad list %llu\n",
-				 (long long)r_off, (long long)b_off);
+		if (tdb_read_off(tdb, off) & TDB_OFF_MASK != r_off) {
+			tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_FATAL,
+				   "remove_from_list: %llu bad list %llu",
+				   (long long)r_off, (long long)b_off);
 			return -1;
 		}
 #endif
 
-		if (tdb_write_off(tdb, off, r->prev)) {
+		if (tdb_write_off(tdb, off, r->magic_and_prev)) {
 			return -1;
 		}
 	}
@@ -151,58 +154,66 @@ static int remove_from_list(struct tdb_context *tdb,
 static int enqueue_in_free(struct tdb_context *tdb,
 			   tdb_off_t b_off,
 			   tdb_off_t off,
-			   struct tdb_free_record *new)
+			   tdb_len_t len)
 {
-	new->prev = 0;
+	struct tdb_free_record new;
+	uint64_t magic = (TDB_FREE_MAGIC << (64 - TDB_OFF_UPPER_STEAL));
+
+	/* We only need to set ftable_and_len; rest is set in enqueue_in_free */
+	new.ftable_and_len = ((uint64_t)tdb->ftable << (64 - TDB_OFF_UPPER_STEAL))
+		| len;
+	/* prev = 0. */
+	new.magic_and_prev = magic;
+
 	/* new->next = head. */
-	new->next = tdb_read_off(tdb, b_off);
-	if (new->next == TDB_OFF_ERR)
+	new.next = tdb_read_off(tdb, b_off);
+	if (new.next == TDB_OFF_ERR)
 		return -1;
 
-	if (new->next) {
+	if (new.next) {
 #ifdef DEBUG
 		if (tdb_read_off(tdb,
-				 new->next
-				 + offsetof(struct tdb_free_record, prev))
-		    != 0) {
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-				 "enqueue_in_free: %llu bad head prev %llu\n",
-				 (long long)new->next, (long long)b_off);
+				 new.next + offsetof(struct tdb_free_record,
+						     magic_and_prev))
+		    != magic) {
+			tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_FATAL,
+				   "enqueue_in_free: %llu bad head"
+				   " prev %llu",
+				   (long long)new.next, (long long)b_off);
 			return -1;
 		}
 #endif
 		/* next->prev = new. */
-		if (tdb_write_off(tdb, new->next
-				  + offsetof(struct tdb_free_record, prev),
-				  off) != 0)
+		if (tdb_write_off(tdb, new.next
+				  + offsetof(struct tdb_free_record,
+					     magic_and_prev),
+				  off | magic) != 0)
 			return -1;
 	}
 	/* head = new */
 	if (tdb_write_off(tdb, b_off, off) != 0)
 		return -1;
 
-	return tdb_write_convert(tdb, off, new, sizeof(*new));
+	return tdb_write_convert(tdb, off, &new, sizeof(new));
 }
 
 /* List need not be locked. */
 int add_free_record(struct tdb_context *tdb,
 		    tdb_off_t off, tdb_len_t len_with_header)
 {
-	struct tdb_free_record new;
 	tdb_off_t b_off;
+	tdb_len_t len;
 	int ret;
 
-	assert(len_with_header >= sizeof(new));
+	assert(len_with_header >= sizeof(struct tdb_free_record));
 
-	new.magic_and_meta = TDB_FREE_MAGIC << (64 - TDB_OFF_UPPER_STEAL)
-		| tdb->flist_off;
-	new.data_len = len_with_header - sizeof(struct tdb_used_record);
+	len = len_with_header - sizeof(struct tdb_used_record);
 
-	b_off = bucket_off(tdb->flist_off, size_to_bucket(new.data_len));
+	b_off = bucket_off(tdb->ftable_off, size_to_bucket(len));
 	if (tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) != 0)
 		return -1;
 
-	ret = enqueue_in_free(tdb, b_off, off, &new);
+	ret = enqueue_in_free(tdb, b_off, off, len);
 	tdb_unlock_free_bucket(tdb, b_off);
 	return ret;
 }
@@ -234,91 +245,113 @@ static size_t record_leftover(size_t keylen, size_t datalen,
 	return leftover;
 }
 
+static tdb_off_t ftable_offset(struct tdb_context *tdb, unsigned int ftable)
+{
+	tdb_off_t off;
+	unsigned int i;
+
+	if (likely(tdb->ftable == ftable))
+		return tdb->ftable_off;
+
+	off = first_ftable(tdb);
+	for (i = 0; i < ftable; i++)
+		off = next_ftable(tdb, off);
+	return off;
+}
+
 /* Note: we unlock the current bucket if we coalesce or fail. */
 static int coalesce(struct tdb_context *tdb,
 		    tdb_off_t off, tdb_off_t b_off, tdb_len_t data_len)
 {
-	struct tdb_free_record pad, *r;
 	tdb_off_t end;
+	struct tdb_free_record rec;
 
+	add_stat(tdb, alloc_coalesce_tried, 1);
 	end = off + sizeof(struct tdb_used_record) + data_len;
 
 	while (end < tdb->map_size) {
+		const struct tdb_free_record *r;
 		tdb_off_t nb_off;
+		unsigned ftable, bucket;
 
-		/* FIXME: do tdb_get here and below really win? */
-		r = tdb_get(tdb, end, &pad, sizeof(pad));
+		r = tdb_access_read(tdb, end, sizeof(*r), true);
 		if (!r)
 			goto err;
 
-		if (frec_magic(r) != TDB_FREE_MAGIC)
+		if (frec_magic(r) != TDB_FREE_MAGIC
+		    || frec_ftable(r) == TDB_FTABLE_NONE) {
+			tdb_access_release(tdb, r);
 			break;
+		}
 
-		nb_off = bucket_off(frec_flist(r), size_to_bucket(r->data_len));
+		ftable = frec_ftable(r);
+		bucket = size_to_bucket(frec_len(r));
+		nb_off = bucket_off(ftable_offset(tdb, ftable), bucket);
+		tdb_access_release(tdb, r);
 
 		/* We may be violating lock order here, so best effort. */
-		if (tdb_lock_free_bucket(tdb, nb_off, TDB_LOCK_NOWAIT) == -1)
+		if (tdb_lock_free_bucket(tdb, nb_off, TDB_LOCK_NOWAIT) == -1) {
+			add_stat(tdb, alloc_coalesce_lockfail, 1);
 			break;
+		}
 
 		/* Now we have lock, re-check. */
-		r = tdb_get(tdb, end, &pad, sizeof(pad));
-		if (!r) {
+		if (tdb_read_convert(tdb, end, &rec, sizeof(rec))) {
 			tdb_unlock_free_bucket(tdb, nb_off);
 			goto err;
 		}
 
-		if (unlikely(frec_magic(r) != TDB_FREE_MAGIC)) {
+		if (unlikely(frec_magic(&rec) != TDB_FREE_MAGIC)) {
+			add_stat(tdb, alloc_coalesce_race, 1);
 			tdb_unlock_free_bucket(tdb, nb_off);
 			break;
 		}
 
-		if (unlikely(bucket_off(frec_flist(r),
-					size_to_bucket(r->data_len))
-			     != nb_off)) {
+		if (unlikely(frec_ftable(&rec) != ftable)
+		    || unlikely(size_to_bucket(frec_len(&rec)) != bucket)) {
+			add_stat(tdb, alloc_coalesce_race, 1);
 			tdb_unlock_free_bucket(tdb, nb_off);
 			break;
 		}
 
-		if (remove_from_list(tdb, nb_off, end, r) == -1) {
+		if (remove_from_list(tdb, nb_off, end, &rec) == -1) {
 			tdb_unlock_free_bucket(tdb, nb_off);
 			goto err;
 		}
 
-		end += sizeof(struct tdb_used_record) + r->data_len;
+		end += sizeof(struct tdb_used_record) + frec_len(&rec);
 		tdb_unlock_free_bucket(tdb, nb_off);
+		add_stat(tdb, alloc_coalesce_num_merged, 1);
 	}
 
 	/* Didn't find any adjacent free? */
 	if (end == off + sizeof(struct tdb_used_record) + data_len)
 		return 0;
 
-	/* OK, expand record */
-	r = tdb_get(tdb, off, &pad, sizeof(pad));
-	if (!r)
+	/* OK, expand initial record */
+	if (tdb_read_convert(tdb, off, &rec, sizeof(rec)))
 		goto err;
 
-	if (r->data_len != data_len) {
-		tdb->ecode = TDB_ERR_CORRUPT;
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-			 "coalesce: expected data len %llu not %llu\n",
-			 (long long)data_len, (long long)r->data_len);
+	if (frec_len(&rec) != data_len) {
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_FATAL,
+			   "coalesce: expected data len %zu not %zu",
+			   (size_t)data_len, (size_t)frec_len(&rec));
 		goto err;
 	}
 
-	if (remove_from_list(tdb, b_off, off, r) == -1)
-		goto err;
-
-	r = tdb_access_write(tdb, off, sizeof(*r), true);
-	if (!r)
+	if (remove_from_list(tdb, b_off, off, &rec) == -1)
 		goto err;
 
 	/* We have to drop this to avoid deadlocks, so make sure record
 	 * doesn't get coalesced by someone else! */
-	r->magic_and_meta = TDB_COALESCING_MAGIC << (64 - TDB_OFF_UPPER_STEAL);
-	r->data_len = end - off - sizeof(struct tdb_used_record);
-	if (tdb_access_commit(tdb, r) != 0)
+	rec.ftable_and_len = (TDB_FTABLE_NONE << (64 - TDB_OFF_UPPER_STEAL))
+		| (end - off - sizeof(struct tdb_used_record));
+	if (tdb_write_off(tdb, off + offsetof(struct tdb_free_record,
+					      ftable_and_len),
+			  rec.ftable_and_len) != 0)
 		goto err;
 
+	add_stat(tdb, alloc_coalesce_succeeded, 1);
 	tdb_unlock_free_bucket(tdb, b_off);
 
 	if (add_free_record(tdb, off, end - off) == -1)
@@ -333,19 +366,21 @@ err:
 
 /* We need size bytes to put our key and data in. */
 static tdb_off_t lock_and_alloc(struct tdb_context *tdb,
-				tdb_off_t flist_off,
+				tdb_off_t ftable_off,
 				tdb_off_t bucket,
 				size_t keylen, size_t datalen,
 				bool want_extra,
+				unsigned magic,
 				unsigned hashlow)
 {
 	tdb_off_t off, b_off,best_off;
-	struct tdb_free_record pad, best = { 0 }, *r;
+	struct tdb_free_record best = { 0 };
 	double multiplier;
 	size_t size = adjust_size(keylen, datalen);
 
+	add_stat(tdb, allocs, 1);
 again:
-	b_off = bucket_off(flist_off, bucket);
+	b_off = bucket_off(ftable_off, bucket);
 
 	/* FIXME: Try non-blocking wait first, to measure contention. */
 	/* Lock this bucket. */
@@ -353,7 +388,7 @@ again:
 		return TDB_OFF_ERR;
 	}
 
-	best.data_len = -1ULL;
+	best.ftable_and_len = -1ULL;
 	best_off = 0;
 
 	/* Get slack if we're after extra. */
@@ -369,30 +404,40 @@ again:
 		goto unlock_err;
 
 	while (off) {
-		/* FIXME: Does tdb_get win anything here? */
-		r = tdb_get(tdb, off, &pad, sizeof(*r));
+		const struct tdb_free_record *r;
+		tdb_len_t len;
+		tdb_off_t next;
+
+		r = tdb_access_read(tdb, off, sizeof(*r), true);
 		if (!r)
 			goto unlock_err;
 
 		if (frec_magic(r) != TDB_FREE_MAGIC) {
-			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-				 "lock_and_alloc: %llu non-free 0x%llx\n",
-				 (long long)off, (long long)r->magic_and_meta);
+			tdb_access_release(tdb, r);
+			tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_FATAL,
+				 "lock_and_alloc: %llu non-free 0x%llx",
+				 (long long)off, (long long)r->magic_and_prev);
 			goto unlock_err;
 		}
 
-		if (r->data_len >= size && r->data_len < best.data_len) {
+		if (frec_len(r) >= size && frec_len(r) < frec_len(&best)) {
 			best_off = off;
 			best = *r;
 		}
 
-		if (best.data_len < size * multiplier && best_off)
+		if (frec_len(&best) < size * multiplier && best_off) {
+			tdb_access_release(tdb, r);
 			break;
+		}
 
 		multiplier *= 1.01;
 
+		next = r->next;
+		len = frec_len(r);
+		tdb_access_release(tdb, r);
+
 		/* Since we're going slow anyway, try coalescing here. */
-		switch (coalesce(tdb, off, b_off, r->data_len)) {
+		switch (coalesce(tdb, off, b_off, len)) {
 		case -1:
 			/* This has already unlocked on error. */
 			return -1;
@@ -400,7 +445,7 @@ again:
 			/* This has unlocked list, restart. */
 			goto again;
 		}
-		off = r->next;
+		off = next;
 	}
 
 	/* If we found anything at all, use it. */
@@ -413,28 +458,30 @@ again:
 			goto unlock_err;
 
 		leftover = record_leftover(keylen, datalen, want_extra,
-					   best.data_len);
+					   frec_len(&best));
 
-		assert(keylen + datalen + leftover <= best.data_len);
+		assert(keylen + datalen + leftover <= frec_len(&best));
 		/* We need to mark non-free before we drop lock, otherwise
 		 * coalesce() could try to merge it! */
-		if (set_header(tdb, &rec, keylen, datalen,
-			       best.data_len - leftover,
-			       hashlow) != 0)
+		if (set_header(tdb, &rec, magic, keylen, datalen,
+			       frec_len(&best) - leftover, hashlow) != 0)
 			goto unlock_err;
 
 		if (tdb_write_convert(tdb, best_off, &rec, sizeof(rec)) != 0)
 			goto unlock_err;
 
-		tdb_unlock_free_bucket(tdb, b_off);
-
+		/* Bucket of leftover will be <= current bucket, so nested
+		 * locking is allowed. */
 		if (leftover) {
+			add_stat(tdb, alloc_leftover, 1);
 			if (add_free_record(tdb,
 					    best_off + sizeof(rec)
-					    + best.data_len - leftover,
+					    + frec_len(&best) - leftover,
 					    leftover))
-				return TDB_OFF_ERR;
+				best_off = TDB_OFF_ERR;
 		}
+		tdb_unlock_free_bucket(tdb, b_off);
+
 		return best_off;
 	}
 
@@ -449,10 +496,10 @@ unlock_err:
 /* Get a free block from current free list, or 0 if none. */
 static tdb_off_t get_free(struct tdb_context *tdb,
 			  size_t keylen, size_t datalen, bool want_extra,
-			  unsigned hashlow)
+			  unsigned magic, unsigned hashlow)
 {
-	tdb_off_t off, flist;
-	unsigned start_b, b;
+	tdb_off_t off, ftable_off;
+	unsigned start_b, b, ftable;
 	bool wrapped = false;
 
 	/* If they are growing, add 50% to get to higher bucket. */
@@ -462,31 +509,40 @@ static tdb_off_t get_free(struct tdb_context *tdb,
 	else
 		start_b = size_to_bucket(adjust_size(keylen, datalen));
 
-	flist = tdb->flist_off;
-	while (!wrapped || flist != tdb->flist_off) {
+	ftable_off = tdb->ftable_off;
+	ftable = tdb->ftable;
+	while (!wrapped || ftable_off != tdb->ftable_off) {
 		/* Start at exact size bucket, and search up... */
-		for (b = find_free_head(tdb, flist, start_b);
+		for (b = find_free_head(tdb, ftable_off, start_b);
 		     b < TDB_FREE_BUCKETS;
-		     b = find_free_head(tdb, flist, b + 1)) {
+		     b = find_free_head(tdb, ftable_off, b + 1)) {
 			/* Try getting one from list. */
-			off = lock_and_alloc(tdb, flist,
+			off = lock_and_alloc(tdb, ftable_off,
 					     b, keylen, datalen, want_extra,
-					     hashlow);
+					     magic, hashlow);
 			if (off == TDB_OFF_ERR)
 				return TDB_OFF_ERR;
 			if (off != 0) {
+				if (b == start_b)
+					add_stat(tdb, alloc_bucket_exact, 1);
+				if (b == TDB_FREE_BUCKETS - 1)
+					add_stat(tdb, alloc_bucket_max, 1);
 				/* Worked?  Stay using this list. */
-				tdb->flist_off = flist;
+				tdb->ftable_off = ftable_off;
+				tdb->ftable = ftable;
 				return off;
 			}
 			/* Didn't work.  Try next bucket. */
 		}
 
-		/* Hmm, try next list. */
-		flist = next_flist(tdb, flist);
-		if (flist == 0) {
+		/* Hmm, try next table. */
+		ftable_off = next_ftable(tdb, ftable_off);
+		ftable++;
+
+		if (ftable_off == 0) {
 			wrapped = true;
-			flist = first_flist(tdb);
+			ftable_off = first_ftable(tdb);
+			ftable = 0;
 		}
 	}
 
@@ -495,7 +551,7 @@ static tdb_off_t get_free(struct tdb_context *tdb,
 
 int set_header(struct tdb_context *tdb,
 	       struct tdb_used_record *rec,
-	       uint64_t keylen, uint64_t datalen,
+	       unsigned magic, uint64_t keylen, uint64_t datalen,
 	       uint64_t actuallen, unsigned hashlow)
 {
 	uint64_t keybits = (fls64(keylen) + 1) / 2;
@@ -504,16 +560,15 @@ int set_header(struct tdb_context *tdb,
 	rec->magic_and_meta = (hashlow & ((1 << 11)-1))
 		| ((actuallen - (keylen + datalen)) << 11)
 		| (keybits << 43)
-		| (TDB_MAGIC << 48);
+		| ((uint64_t)magic << 48);
 	rec->key_and_data_len = (keylen | (datalen << (keybits*2)));
 
 	/* Encoding can fail on big values. */
 	if (rec_key_length(rec) != keylen
 	    || rec_data_length(rec) != datalen
 	    || rec_extra_padding(rec) != actuallen - (keylen + datalen)) {
-		tdb->ecode = TDB_ERR_IO;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "Could not encode k=%llu,d=%llu,a=%llu\n",
+		tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
+			 "Could not encode k=%llu,d=%llu,a=%llu",
 			 (long long)keylen, (long long)datalen,
 			 (long long)actuallen);
 		return -1;
@@ -533,11 +588,19 @@ static int tdb_expand(struct tdb_context *tdb, tdb_len_t size)
 	/* Need to hold a hash lock to expand DB: transactions rely on it. */
 	if (!(tdb->flags & TDB_NOLOCK)
 	    && !tdb->allrecord_lock.count && !tdb_has_hash_locks(tdb)) {
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-			 "tdb_expand: must hold lock during expand\n");
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
+			   "tdb_expand: must hold lock during expand");
 		return -1;
 	}
 
+	/* always make room for at least 100 more records, and at
+           least 25% more space. */
+	if (size * TDB_EXTENSION_FACTOR > tdb->map_size / 4)
+		wanted = size * TDB_EXTENSION_FACTOR;
+	else
+		wanted = tdb->map_size / 4;
+	wanted = adjust_size(0, wanted);
+
 	/* Only one person can expand file at a time. */
 	if (tdb_lock_expand(tdb, F_WRLCK) != 0)
 		return -1;
@@ -550,7 +613,7 @@ static int tdb_expand(struct tdb_context *tdb, tdb_len_t size)
 		return 0;
 	}
 
-	if (tdb->methods->expand_file(tdb, wanted*TDB_EXTENSION_FACTOR) == -1) {
+	if (tdb->methods->expand_file(tdb, wanted) == -1) {
 		tdb_unlock_expand(tdb, F_WRLCK);
 		return -1;
 	}
@@ -558,12 +621,13 @@ static int tdb_expand(struct tdb_context *tdb, tdb_len_t size)
 	/* We need to drop this lock before adding free record. */
 	tdb_unlock_expand(tdb, F_WRLCK);
 
-	return add_free_record(tdb, old_size, wanted * TDB_EXTENSION_FACTOR);
+	add_stat(tdb, expands, 1);
+	return add_free_record(tdb, old_size, wanted);
 }
 
 /* This won't fail: it will expand the database if it has to. */
 tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
-		uint64_t hash, bool growing)
+		uint64_t hash, unsigned magic, bool growing)
 {
 	tdb_off_t off;
 
@@ -571,7 +635,7 @@ tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
 	assert(!tdb->direct_access);
 
 	for (;;) {
-		off = get_free(tdb, keylen, datalen, growing, hash);
+		off = get_free(tdb, keylen, datalen, growing, magic, hash);
 		if (likely(off != 0))
 			break;
 

+ 187 - 46
ccan/tdb2/hash.c

@@ -42,17 +42,19 @@ uint64_t tdb_hash(struct tdb_context *tdb, const void *ptr, size_t len)
 
 uint64_t hash_record(struct tdb_context *tdb, tdb_off_t off)
 {
-	struct tdb_used_record pad, *r;
+	const struct tdb_used_record *r;
 	const void *key;
 	uint64_t klen, hash;
 
-	r = tdb_get(tdb, off, &pad, sizeof(pad));
+	r = tdb_access_read(tdb, off, sizeof(*r), true);
 	if (!r)
 		/* FIXME */
 		return 0;
 
 	klen = rec_key_length(r);
-	key = tdb_access_read(tdb, off + sizeof(pad), klen, false);
+	tdb_access_release(tdb, r);
+
+	key = tdb_access_read(tdb, off + sizeof(*r), klen, false);
 	if (!key)
 		return 0;
 
@@ -76,6 +78,30 @@ static uint32_t use_bits(struct hash_info *h, unsigned num)
 	return bits(h->h, 64 - h->hash_used, num);
 }
 
+static bool key_matches(struct tdb_context *tdb,
+			const struct tdb_used_record *rec,
+			tdb_off_t off,
+			const struct tdb_data *key)
+{
+	bool ret = false;
+	const char *rkey;
+
+	if (rec_key_length(rec) != key->dsize) {
+		add_stat(tdb, compare_wrong_keylen, 1);
+		return ret;
+	}
+
+	rkey = tdb_access_read(tdb, off + sizeof(*rec), key->dsize, false);
+	if (!rkey)
+		return ret;
+	if (memcmp(rkey, key->dptr, key->dsize) == 0)
+		ret = true;
+	else
+		add_stat(tdb, compare_wrong_keycmp, 1);
+	tdb_access_release(tdb, rkey);
+	return ret;
+}
+
 /* Does entry match? */
 static bool match(struct tdb_context *tdb,
 		  struct hash_info *h,
@@ -83,38 +109,33 @@ static bool match(struct tdb_context *tdb,
 		  tdb_off_t val,
 		  struct tdb_used_record *rec)
 {
-	bool ret;
-	const unsigned char *rkey;
 	tdb_off_t off;
 
-	/* FIXME: Handle hash value truncated. */
-	if (bits(val, TDB_OFF_HASH_TRUNCATED_BIT, 1))
-		abort();
-
+	add_stat(tdb, compares, 1);
 	/* Desired bucket must match. */
-	if (h->home_bucket != (val & TDB_OFF_HASH_GROUP_MASK))
+	if (h->home_bucket != (val & TDB_OFF_HASH_GROUP_MASK)) {
+		add_stat(tdb, compare_wrong_bucket, 1);
 		return false;
+	}
 
 	/* Top bits of offset == next bits of hash. */
 	if (bits(val, TDB_OFF_HASH_EXTRA_BIT, TDB_OFF_UPPER_STEAL_EXTRA)
 	    != bits(h->h, 64 - h->hash_used - TDB_OFF_UPPER_STEAL_EXTRA,
-		    TDB_OFF_UPPER_STEAL_EXTRA))
+		    TDB_OFF_UPPER_STEAL_EXTRA)) {
+		add_stat(tdb, compare_wrong_offsetbits, 1);
 		return false;
+	}
 
 	off = val & TDB_OFF_MASK;
 	if (tdb_read_convert(tdb, off, rec, sizeof(*rec)) == -1)
 		return false;
 
-	/* FIXME: check extra bits in header? */
-	if (rec_key_length(rec) != key->dsize)
+	if ((h->h & ((1 << 11)-1)) != rec_hash(rec)) {
+		add_stat(tdb, compare_wrong_rechash, 1);
 		return false;
+	}
 
-	rkey = tdb_access_read(tdb, off + sizeof(*rec), key->dsize, false);
-	if (!rkey)
-		return false;
-	ret = (memcmp(rkey, key->dptr, key->dsize) == 0);
-	tdb_access_release(tdb, rkey);
-	return ret;
+	return key_matches(tdb, rec, off, key);
 }
 
 static tdb_off_t hbucket_off(tdb_off_t group_start, unsigned bucket)
@@ -123,10 +144,9 @@ static tdb_off_t hbucket_off(tdb_off_t group_start, unsigned bucket)
 		+ (bucket % (1 << TDB_HASH_GROUP_BITS)) * sizeof(tdb_off_t);
 }
 
-/* Truncated hashes can't be all 1: that's how we spot a sub-hash */
 bool is_subhash(tdb_off_t val)
 {
-	return val >> (64-TDB_OFF_UPPER_STEAL) == (1<<TDB_OFF_UPPER_STEAL) - 1;
+	return (val >> TDB_OFF_UPPER_STEAL_SUBHASH_BIT) & 1;
 }
 
 /* FIXME: Guess the depth, don't over-lock! */
@@ -136,6 +156,65 @@ static tdb_off_t hlock_range(tdb_off_t group, tdb_off_t *size)
 	return group << (64 - (TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS));
 }
 
+static tdb_off_t COLD find_in_chain(struct tdb_context *tdb,
+				    struct tdb_data key,
+				    tdb_off_t chain,
+				    struct hash_info *h,
+				    struct tdb_used_record *rec,
+				    struct traverse_info *tinfo)
+{
+	tdb_off_t off, next;
+
+	/* In case nothing is free, we set these to zero. */
+	h->home_bucket = h->found_bucket = 0;
+
+	for (off = chain; off; off = next) {
+		unsigned int i;
+
+		h->group_start = off;
+		if (tdb_read_convert(tdb, off, h->group, sizeof(h->group)))
+			return TDB_OFF_ERR;
+
+		for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) {
+			tdb_off_t recoff;
+			if (!h->group[i]) {
+				/* Remember this empty bucket. */
+				h->home_bucket = h->found_bucket = i;
+				continue;
+			}
+
+			/* We can insert extra bits via add_to_hash
+			 * empty bucket logic. */
+			recoff = h->group[i] & TDB_OFF_MASK;
+			if (tdb_read_convert(tdb, recoff, rec, sizeof(*rec)))
+				return TDB_OFF_ERR;
+
+			if (key_matches(tdb, rec, recoff, &key)) {
+				h->home_bucket = h->found_bucket = i;
+
+				if (tinfo) {
+					tinfo->levels[tinfo->num_levels]
+						.hashtable = off;
+					tinfo->levels[tinfo->num_levels]
+						.total_buckets
+						= 1 << TDB_HASH_GROUP_BITS;
+					tinfo->levels[tinfo->num_levels].entry
+						= i;
+					tinfo->num_levels++;
+				}
+				return recoff;
+			}
+		}
+		next = tdb_read_off(tdb, off
+				    + offsetof(struct tdb_chain, next));
+		if (next == TDB_OFF_ERR)
+			return TDB_OFF_ERR;
+		if (next)
+			next += sizeof(struct tdb_used_record);
+	}
+	return 0;
+}
+
 /* This is the core routine which searches the hashtable for an entry.
  * On error, no locks are held and TDB_OFF_ERR is returned.
  * Otherwise, hinfo is filled in (and the optional tinfo).
@@ -171,7 +250,7 @@ tdb_off_t find_and_lock(struct tdb_context *tdb,
 		tinfo->levels[0].total_buckets = 1 << TDB_HASH_GROUP_BITS;
 	}
 
-	while (likely(h->hash_used < 64)) {
+	while (h->hash_used <= 64) {
 		/* Read in the hash group. */
 		h->group_start = hashtable
 			+ group * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS);
@@ -228,8 +307,7 @@ tdb_off_t find_and_lock(struct tdb_context *tdb,
 		return 0;
 	}
 
-	/* FIXME: We hit the bottom.  Chain! */
-	abort();
+	return find_in_chain(tdb, key, hashtable, h, rec, tinfo);
 
 fail:
 	tdb_unlock_hashes(tdb, h->hlock_start, h->hlock_range, ltype);
@@ -239,8 +317,8 @@ fail:
 /* I wrote a simple test, expanding a hash to 2GB, for the following
  * cases:
  * 1) Expanding all the buckets at once,
- * 2) Expanding the most-populated bucket,
- * 3) Expanding the bucket we wanted to place the new entry ito.
+ * 2) Expanding the bucket we wanted to place the new entry into.
+ * 3) Expanding the most-populated bucket,
  *
  * I measured the worst/average/best density during this process.
  * 1) 3%/16%/30%
@@ -315,6 +393,41 @@ int replace_in_hash(struct tdb_context *tdb,
 			     encode_offset(new_off, h));
 }
 
+/* We slot in anywhere that's empty in the chain. */
+static int COLD add_to_chain(struct tdb_context *tdb,
+			     tdb_off_t subhash,
+			     tdb_off_t new_off)
+{
+	size_t entry = tdb_find_zero_off(tdb, subhash, 1<<TDB_HASH_GROUP_BITS);
+
+	if (entry == 1 << TDB_HASH_GROUP_BITS) {
+		tdb_off_t next;
+
+		next = tdb_read_off(tdb, subhash
+				    + offsetof(struct tdb_chain, next));
+		if (next == TDB_OFF_ERR)
+			return -1;
+
+		if (!next) {
+			next = alloc(tdb, 0, sizeof(struct tdb_chain), 0,
+				     TDB_CHAIN_MAGIC, false);
+			if (next == TDB_OFF_ERR)
+				return -1;
+			if (zero_out(tdb, next+sizeof(struct tdb_used_record),
+				     sizeof(struct tdb_chain)))
+				return -1;
+			if (tdb_write_off(tdb, subhash
+					  + offsetof(struct tdb_chain, next),
+					  next) != 0)
+				return -1;
+		}
+		return add_to_chain(tdb, next, new_off);
+	}
+
+	return tdb_write_off(tdb, subhash + entry * sizeof(tdb_off_t),
+			     new_off);
+}
+
 /* Add into a newly created subhash. */
 static int add_to_subhash(struct tdb_context *tdb, tdb_off_t subhash,
 			  unsigned hash_used, tdb_off_t val)
@@ -325,14 +438,12 @@ static int add_to_subhash(struct tdb_context *tdb, tdb_off_t subhash,
 
 	h.hash_used = hash_used;
 
-	/* FIXME chain if hash_used == 64 */
 	if (hash_used + TDB_SUBLEVEL_HASH_BITS > 64)
-		abort();
+		return add_to_chain(tdb, subhash, off);
 
-	/* FIXME: Do truncated hash bits if we can! */
 	h.h = hash_record(tdb, off);
 	gnum = use_bits(&h, TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS);
-	h.group_start = subhash	+ sizeof(struct tdb_used_record)
+	h.group_start = subhash
 		+ gnum * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS);
 	h.home_bucket = use_bits(&h, TDB_HASH_GROUP_BITS);
 
@@ -346,20 +457,29 @@ static int add_to_subhash(struct tdb_context *tdb, tdb_off_t subhash,
 
 static int expand_group(struct tdb_context *tdb, struct hash_info *h)
 {
-	unsigned bucket, num_vals, i;
+	unsigned bucket, num_vals, i, magic;
+	size_t subsize;
 	tdb_off_t subhash;
 	tdb_off_t vals[1 << TDB_HASH_GROUP_BITS];
 
 	/* Attach new empty subhash under fullest bucket. */
 	bucket = fullest_bucket(tdb, h->group, h->home_bucket);
 
-	subhash = alloc(tdb, 0, sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS,
-			0, false);
+	if (h->hash_used == 64) {
+		add_stat(tdb, alloc_chain, 1);
+		subsize = sizeof(struct tdb_chain);
+		magic = TDB_CHAIN_MAGIC;
+	} else {
+		add_stat(tdb, alloc_subhash, 1);
+		subsize = (sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS);
+		magic = TDB_HTABLE_MAGIC;
+	}
+
+	subhash = alloc(tdb, 0, subsize, 0, magic, false);
 	if (subhash == TDB_OFF_ERR)
 		return -1;
 
-	if (zero_out(tdb, subhash + sizeof(struct tdb_used_record),
-		     sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) == -1)
+	if (zero_out(tdb, subhash + sizeof(struct tdb_used_record), subsize))
 		return -1;
 
 	/* Remove any which are destined for bucket or are in wrong place. */
@@ -377,7 +497,10 @@ static int expand_group(struct tdb_context *tdb, struct hash_info *h)
 	/* assert(num_vals); */
 
 	/* Overwrite expanded bucket with subhash pointer. */
-	h->group[bucket] = subhash | ~((1ULL << (64 - TDB_OFF_UPPER_STEAL))-1);
+	h->group[bucket] = subhash | (1ULL << TDB_OFF_UPPER_STEAL_SUBHASH_BIT);
+
+	/* Point to actual contents of record. */
+	subhash += sizeof(struct tdb_used_record);
 
 	/* Put values back. */
 	for (i = 0; i < num_vals; i++) {
@@ -433,10 +556,6 @@ int delete_from_hash(struct tdb_context *tdb, struct hash_info *h)
 
 int add_to_hash(struct tdb_context *tdb, struct hash_info *h, tdb_off_t new_off)
 {
-	/* FIXME: chain! */
-	if (h->hash_used >= 64)
-		abort();
-
 	/* We hit an empty bucket during search?  That's where it goes. */
 	if (!h->group[h->found_bucket]) {
 		h->group[h->found_bucket] = encode_offset(new_off, h);
@@ -445,6 +564,9 @@ int add_to_hash(struct tdb_context *tdb, struct hash_info *h, tdb_off_t new_off)
 					 h->group, sizeof(h->group));
 	}
 
+	if (h->hash_used > 64)
+		return add_to_chain(tdb, h->group_start, new_off);
+
 	/* We're full.  Expand. */
 	if (expand_group(tdb, h) == -1)
 		return -1;
@@ -523,7 +645,11 @@ again:
 		tlevel++;
 		tlevel->hashtable = off + sizeof(struct tdb_used_record);
 		tlevel->entry = 0;
-		tlevel->total_buckets = (1 << TDB_SUBLEVEL_HASH_BITS);
+		/* Next level is a chain? */
+		if (unlikely(tinfo->num_levels == TDB_MAX_LEVELS + 1))
+			tlevel->total_buckets = (1 << TDB_HASH_GROUP_BITS);
+		else
+			tlevel->total_buckets = (1 << TDB_SUBLEVEL_HASH_BITS);
 		goto again;
 	}
 
@@ -531,6 +657,20 @@ again:
 	if (tinfo->num_levels == 1)
 		return 0;
 
+	/* Handle chained entries. */
+	if (unlikely(tinfo->num_levels == TDB_MAX_LEVELS + 1)) {
+		tlevel->hashtable = tdb_read_off(tdb, tlevel->hashtable
+						 + offsetof(struct tdb_chain,
+							    next));
+		if (tlevel->hashtable == TDB_OFF_ERR)
+			return TDB_OFF_ERR;
+		if (tlevel->hashtable) {
+			tlevel->hashtable += sizeof(struct tdb_used_record);
+			tlevel->entry = 0;
+			goto again;
+		}
+	}
+
 	/* Go back up and keep searching. */
 	tinfo->num_levels--;
 	tlevel--;
@@ -563,11 +703,12 @@ int next_in_hash(struct tdb_context *tdb, int ltype,
 						  ltype);
 				return -1;
 			}
-			if (rec_magic(&rec) != TDB_MAGIC) {
-				tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-					 "next_in_hash:"
-					 " corrupt record at %llu\n",
-					 (long long)off);
+			if (rec_magic(&rec) != TDB_USED_MAGIC) {
+				tdb_logerr(tdb, TDB_ERR_CORRUPT,
+					   TDB_DEBUG_FATAL,
+					   "next_in_hash:"
+					   " corrupt record at %llu",
+					   (long long)off);
 				return -1;
 			}
 

+ 114 - 92
ccan/tdb2/io.c

@@ -56,9 +56,9 @@ void tdb_mmap(struct tdb_context *tdb)
 	 */
 	if (tdb->map_ptr == MAP_FAILED) {
 		tdb->map_ptr = NULL;
-		tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv,
-			 "tdb_mmap failed for size %lld (%s)\n", 
-			 (long long)tdb->map_size, strerror(errno));
+		tdb_logerr(tdb, TDB_SUCCESS, TDB_DEBUG_WARNING,
+			   "tdb_mmap failed for size %lld (%s)",
+			   (long long)tdb->map_size, strerror(errno));
 	}
 }
 
@@ -70,7 +70,6 @@ void tdb_mmap(struct tdb_context *tdb)
 static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
 {
 	struct stat st;
-	int ret;
 
 	/* We can't hold pointers during this: we could unmap! */
 	assert(!tdb->direct_access
@@ -81,11 +80,9 @@ static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
 		return 0;
 	if (tdb->flags & TDB_INTERNAL) {
 		if (!probe) {
-			/* Ensure ecode is set for log fn. */
-			tdb->ecode = TDB_ERR_IO;
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+			tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
 				 "tdb_oob len %lld beyond internal"
-				 " malloc size %lld\n",
+				 " malloc size %lld",
 				 (long long)len,
 				 (long long)tdb->map_size);
 		}
@@ -95,22 +92,20 @@ static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
 	if (tdb_lock_expand(tdb, F_RDLCK) != 0)
 		return -1;
 
-	ret = fstat(tdb->fd, &st);
-
-	tdb_unlock_expand(tdb, F_RDLCK);
-
-	if (ret == -1) {
-		tdb->ecode = TDB_ERR_IO;
+	if (fstat(tdb->fd, &st) != 0) {
+		tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
+			   "Failed to fstat file: %s", strerror(errno));
+		tdb_unlock_expand(tdb, F_RDLCK);
 		return -1;
 	}
 
+	tdb_unlock_expand(tdb, F_RDLCK);
+
 	if (st.st_size < (size_t)len) {
 		if (!probe) {
-			/* Ensure ecode is set for log fn. */
-			tdb->ecode = TDB_ERR_IO;
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-				 "tdb_oob len %lld beyond eof at %lld\n",
-				 (long long)len, (long long)st.st_size);
+			tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
+				   "tdb_oob len %zu beyond eof at %zu",
+				   (size_t)len, st.st_size);
 		}
 		return -1;
 	}
@@ -123,19 +118,6 @@ static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
 	return 0;
 }
 
-/* Either make a copy into pad and return that, or return ptr into mmap. */
-/* Note: pad has to be a real object, so we can't get here if len
- * overflows size_t */
-void *tdb_get(struct tdb_context *tdb, tdb_off_t off, void *pad, size_t len)
-{
-	if (likely(!(tdb->flags & TDB_CONVERT))) {
-		void *ret = tdb->methods->direct(tdb, off, len);
-		if (ret)
-			return ret;
-	}
-	return tdb_read_convert(tdb, off, pad, len) == -1 ? NULL : pad;
-}
-
 /* Endian conversion: we only ever deal with 8 byte quantities */
 void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size)
 {
@@ -191,7 +173,9 @@ uint64_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off,
 int zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len)
 {
 	char buf[8192] = { 0 };
-	void *p = tdb->methods->direct(tdb, off, len);
+	void *p = tdb->methods->direct(tdb, off, len, true);
+
+	assert(!tdb->read_only);
 	if (p) {
 		memset(p, 0, len);
 		return 0;
@@ -208,13 +192,18 @@ int zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len)
 
 tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off)
 {
-	tdb_off_t pad, *ret;
+	tdb_off_t ret;
 
-	ret = tdb_get(tdb, off, &pad, sizeof(pad));
-	if (!ret) {
-		return TDB_OFF_ERR;
+	if (likely(!(tdb->flags & TDB_CONVERT))) {
+		tdb_off_t *p = tdb->methods->direct(tdb, off, sizeof(*p),
+						    false);
+		if (p)
+			return *p;
 	}
-	return *ret;
+
+	if (tdb_read_convert(tdb, off, &ret, sizeof(ret)) == -1)
+		return TDB_OFF_ERR;
+	return ret;
 }
 
 /* Even on files, we can get partial writes due to signals. */
@@ -278,15 +267,17 @@ bool tdb_read_all(int fd, void *buf, size_t len)
 static int tdb_write(struct tdb_context *tdb, tdb_off_t off, 
 		     const void *buf, tdb_len_t len)
 {
-	if (len == 0) {
-		return 0;
-	}
-
 	if (tdb->read_only) {
-		tdb->ecode = TDB_ERR_RDONLY;
+		tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_DEBUG_WARNING,
+			   "Write to read-only database");
 		return -1;
 	}
 
+	/* FIXME: Bogus optimization? */
+	if (len == 0) {
+		return 0;
+	}
+
 	if (tdb->methods->oob(tdb, off + len, 0) != 0)
 		return -1;
 
@@ -294,11 +285,9 @@ static int tdb_write(struct tdb_context *tdb, tdb_off_t off,
 		memcpy(off + (char *)tdb->map_ptr, buf, len);
 	} else {
 		if (!tdb_pwrite_all(tdb->fd, buf, len, off)) {
-			tdb->ecode = TDB_ERR_IO;
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-				 "tdb_write failed at %llu len=%llu (%s)\n",
-				 (long long)off, (long long)len,
-				 strerror(errno));
+			tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
+				   "tdb_write failed at %zu len=%zu (%s)",
+				   (size_t)off, (size_t)len, strerror(errno));
 			return -1;
 		}
 	}
@@ -317,14 +306,12 @@ static int tdb_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
 		memcpy(buf, off + (char *)tdb->map_ptr, len);
 	} else {
 		if (!tdb_pread_all(tdb->fd, buf, len, off)) {
-			/* Ensure ecode is set for log fn. */
-			tdb->ecode = TDB_ERR_IO;
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-				 "tdb_read failed at %lld "
-				 "len=%lld (%s) map_size=%lld\n",
-				 (long long)off, (long long)len,
+			tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
+				   "tdb_read failed at %zu "
+				   "len=%zu (%s) map_size=%zu",
+				 (size_t)off, (size_t)len,
 				 strerror(errno),
-				 (long long)tdb->map_size);
+				 (size_t)tdb->map_size);
 			return -1;
 		}
 	}
@@ -338,10 +325,9 @@ int tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
 	if (unlikely((tdb->flags & TDB_CONVERT))) {
 		void *conv = malloc(len);
 		if (!conv) {
-			tdb->ecode = TDB_ERR_OOM;
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-				 "tdb_write: no memory converting %zu bytes\n",
-				 len);
+			tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
+				   "tdb_write: no memory converting"
+				   " %zu bytes", len);
 			return -1;
 		}
 		memcpy(conv, rec, len);
@@ -364,6 +350,20 @@ int tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
 
 int tdb_write_off(struct tdb_context *tdb, tdb_off_t off, tdb_off_t val)
 {
+	if (tdb->read_only) {
+		tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_DEBUG_WARNING,
+			   "Write to read-only database");
+		return -1;
+	}
+
+	if (likely(!(tdb->flags & TDB_CONVERT))) {
+		tdb_off_t *p = tdb->methods->direct(tdb, off, sizeof(*p),
+						    true);
+		if (p) {
+			*p = val;
+			return 0;
+		}
+	}
 	return tdb_write_convert(tdb, off, &val, sizeof(val));
 }
 
@@ -374,12 +374,12 @@ static void *_tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset,
 
 	/* some systems don't like zero length malloc */
 	buf = malloc(prefix + len ? prefix + len : 1);
-	if (unlikely(!buf)) {
-		tdb->ecode = TDB_ERR_OOM;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_alloc_read malloc failed len=%lld\n",
-			 (long long)prefix + len);
-	} else if (unlikely(tdb->methods->read(tdb, offset, buf+prefix, len))) {
+	if (!buf) {
+		tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_ERROR,
+			   "tdb_alloc_read malloc failed len=%zu",
+			   (size_t)(prefix + len));
+	} else if (unlikely(tdb->methods->read(tdb, offset, buf+prefix,
+					       len) == -1)) {
 		free(buf);
 		buf = NULL;
 	}
@@ -400,9 +400,8 @@ static int fill(struct tdb_context *tdb,
 		size_t n = len > size ? size : len;
 
 		if (!tdb_pwrite_all(tdb->fd, buf, n, off)) {
-			tdb->ecode = TDB_ERR_IO;
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-				 "fill write failed: giving up!\n");
+			tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
+				 "fill write failed: giving up!");
 			return -1;
 		}
 		len -= n;
@@ -418,14 +417,16 @@ static int tdb_expand_file(struct tdb_context *tdb, tdb_len_t addition)
 	char buf[8192];
 
 	if (tdb->read_only) {
-		tdb->ecode = TDB_ERR_RDONLY;
+		tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_DEBUG_WARNING,
+			   "Expand on read-only database");
 		return -1;
 	}
 
 	if (tdb->flags & TDB_INTERNAL) {
 		char *new = realloc(tdb->map_ptr, tdb->map_size + addition);
 		if (!new) {
-			tdb->ecode = TDB_ERR_OOM;
+			tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
+				   "No memory to expand database");
 			return -1;
 		}
 		tdb->map_ptr = new;
@@ -443,7 +444,7 @@ static int tdb_expand_file(struct tdb_context *tdb, tdb_len_t addition)
 		   file isn't sparse, which would be very bad if we ran out of
 		   disk. This must be done with write, not via mmap */
 		memset(buf, 0x43, sizeof(buf));
-		if (fill(tdb, buf, sizeof(buf), tdb->map_size, addition) == -1)
+		if (0 || fill(tdb, buf, sizeof(buf), tdb->map_size, addition) == -1)
 			return -1;
 		tdb->map_size += addition;
 		tdb_mmap(tdb);
@@ -451,25 +452,20 @@ static int tdb_expand_file(struct tdb_context *tdb, tdb_len_t addition)
 	return 0;
 }
 
-/* This is only neded for tdb_access_commit, but used everywhere to simplify. */
-struct tdb_access_hdr {
-	tdb_off_t off;
-	tdb_len_t len;
-	bool convert;
-};
-
 const void *tdb_access_read(struct tdb_context *tdb,
 			    tdb_off_t off, tdb_len_t len, bool convert)
 {
 	const void *ret = NULL;	
 
 	if (likely(!(tdb->flags & TDB_CONVERT)))
-		ret = tdb->methods->direct(tdb, off, len);
+		ret = tdb->methods->direct(tdb, off, len, false);
 
 	if (!ret) {
 		struct tdb_access_hdr *hdr;
 		hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
 		if (hdr) {
+			hdr->next = tdb->access;
+			tdb->access = hdr;
 			ret = hdr + 1;
 			if (convert)
 				tdb_convert(tdb, (void *)ret, len);
@@ -485,13 +481,21 @@ void *tdb_access_write(struct tdb_context *tdb,
 {
 	void *ret = NULL;
 
+	if (tdb->read_only) {
+		tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_DEBUG_WARNING,
+			   "Write to read-only database");
+		return NULL;
+	}
+
 	if (likely(!(tdb->flags & TDB_CONVERT)))
-		ret = tdb->methods->direct(tdb, off, len);
+		ret = tdb->methods->direct(tdb, off, len, true);
 
 	if (!ret) {
 		struct tdb_access_hdr *hdr;
 		hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
 		if (hdr) {
+			hdr->next = tdb->access;
+			tdb->access = hdr;
 			hdr->off = off;
 			hdr->len = len;
 			hdr->convert = convert;
@@ -505,30 +509,41 @@ void *tdb_access_write(struct tdb_context *tdb,
 	return ret;
 }
 
+static struct tdb_access_hdr **find_hdr(struct tdb_context *tdb, const void *p)
+{
+	struct tdb_access_hdr **hp;
+
+	for (hp = &tdb->access; *hp; hp = &(*hp)->next) {
+		if (*hp + 1 == p)
+			return hp;
+	}
+	return NULL;
+}
+
 void tdb_access_release(struct tdb_context *tdb, const void *p)
 {
-	if (!tdb->map_ptr
-	    || (char *)p < (char *)tdb->map_ptr
-	    || (char *)p >= (char *)tdb->map_ptr + tdb->map_size)
-		free((struct tdb_access_hdr *)p - 1);
-	else
+	struct tdb_access_hdr *hdr, **hp = find_hdr(tdb, p);
+
+	if (hp) {
+		hdr = *hp;
+		*hp = hdr->next;
+		free(hdr);
+	} else
 		tdb->direct_access--;
 }
 
 int tdb_access_commit(struct tdb_context *tdb, void *p)
 {
+	struct tdb_access_hdr *hdr, **hp = find_hdr(tdb, p);
 	int ret = 0;
 
-	if (!tdb->map_ptr
-	    || (char *)p < (char *)tdb->map_ptr
-	    || (char *)p >= (char *)tdb->map_ptr + tdb->map_size) {
-		struct tdb_access_hdr *hdr;
-
-		hdr = (struct tdb_access_hdr *)p - 1;
+	if (hp) {
+		hdr = *hp;
 		if (hdr->convert)
 			ret = tdb_write_convert(tdb, hdr->off, p, hdr->len);
 		else
 			ret = tdb_write(tdb, hdr->off, p, hdr->len);
+		*hp = hdr->next;
 		free(hdr);
 	} else
 		tdb->direct_access--;
@@ -536,7 +551,8 @@ int tdb_access_commit(struct tdb_context *tdb, void *p)
 	return ret;
 }
 
-static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len)
+static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len,
+			bool write)
 {
 	if (unlikely(!tdb->map_ptr))
 		return NULL;
@@ -546,6 +562,12 @@ static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len)
 	return (char *)tdb->map_ptr + off;
 }
 
+void add_stat_(struct tdb_context *tdb, uint64_t *stat, size_t val)
+{
+	if ((uintptr_t)stat < (uintptr_t)tdb->stats + tdb->stats->size)
+		*stat += val;
+}
+
 static const struct tdb_methods io_methods = {
 	tdb_read,
 	tdb_write,

+ 77 - 97
ccan/tdb2/lock.c

@@ -40,10 +40,13 @@ static int fcntl_lock(struct tdb_context *tdb,
 	fl.l_len = len;
 	fl.l_pid = 0;
 
+	add_stat(tdb, lock_lowlevel, 1);
 	if (waitflag)
 		return fcntl(tdb->fd, F_SETLKW, &fl);
-	else
+	else {
+		add_stat(tdb, lock_nonblock, 1);
 		return fcntl(tdb->fd, F_SETLK, &fl);
+	}
 }
 
 static int fcntl_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len)
@@ -99,7 +102,7 @@ static int fcntl_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len)
 	}
 
 	if (!found) {
-		fprintf(stderr, "Unlock on %u@%u not found!\n",
+		fprintf(stderr, "Unlock on %u@%u not found!",
 			(int)off, (int)len);
 		abort();
 	}
@@ -132,16 +135,16 @@ static int tdb_brlock(struct tdb_context *tdb,
 	}
 
 	if (rw_type == F_WRLCK && tdb->read_only) {
-		tdb->ecode = TDB_ERR_RDONLY;
+		tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_DEBUG_WARNING,
+			   "Write lock attempted on read-only database");
 		return -1;
 	}
 
 	/* A 32 bit system cannot open a 64-bit file, but it could have
 	 * expanded since then: check here. */
 	if ((size_t)(offset + len) != offset + len) {
-		tdb->ecode = TDB_ERR_IO;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_brlock: lock on giant offset %llu\n",
+		tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
+			 "tdb_brlock: lock on giant offset %llu",
 			 (long long)(offset + len));
 		return -1;
 	}
@@ -157,11 +160,12 @@ static int tdb_brlock(struct tdb_context *tdb,
 		 * EAGAIN is an expected return from non-blocking
 		 * locks. */
 		if (!(flags & TDB_LOCK_PROBE) && errno != EAGAIN) {
-			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-				 "tdb_brlock failed (fd=%d) at"
-				 " offset %llu rw_type=%d flags=%d len=%llu\n",
-				 tdb->fd, (long long)offset, rw_type,
-				 flags, (long long)len);
+			tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
+				   "tdb_brlock failed (fd=%d) at"
+				   " offset %zu rw_type=%d flags=%d len=%zu:"
+				   " %s",
+				   tdb->fd, (size_t)offset, rw_type,
+				   flags, (size_t)len, strerror(errno));
 		}
 		return -1;
 	}
@@ -182,10 +186,10 @@ static int tdb_brunlock(struct tdb_context *tdb,
 	} while (ret == -1 && errno == EINTR);
 
 	if (ret == -1) {
-		tdb->log(tdb, TDB_DEBUG_TRACE, tdb->log_priv,
-			 "tdb_brunlock failed (fd=%d) at offset %llu"
-			 " rw_type=%d len=%llu\n",
-			 tdb->fd, (long long)offset, rw_type, (long long)len);
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_TRACE,
+			   "tdb_brunlock failed (fd=%d) at offset %zu"
+			   " rw_type=%d len=%zu",
+			   tdb->fd, (size_t)offset, rw_type, (size_t)len);
 	}
 	return ret;
 }
@@ -201,15 +205,15 @@ int tdb_allrecord_upgrade(struct tdb_context *tdb)
 	int count = 1000;
 
 	if (tdb->allrecord_lock.count != 1) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_allrecord_upgrade failed: count %u too high\n",
-			 tdb->allrecord_lock.count);
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
+			   "tdb_allrecord_upgrade failed: count %u too high",
+			   tdb->allrecord_lock.count);
 		return -1;
 	}
 
 	if (tdb->allrecord_lock.off != 1) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_allrecord_upgrade failed: already upgraded?\n");
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
+			   "tdb_allrecord_upgrade failed: already upgraded?");
 		return -1;
 	}
 
@@ -230,8 +234,8 @@ int tdb_allrecord_upgrade(struct tdb_context *tdb)
 		tv.tv_usec = 1;
 		select(0, NULL, NULL, NULL, &tv);
 	}
-	tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv,
-		 "tdb_allrecord_upgrade failed\n");
+	tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_WARNING,
+		   "tdb_allrecord_upgrade failed");
 	return -1;
 }
 
@@ -276,23 +280,23 @@ static int tdb_nest_lock(struct tdb_context *tdb, tdb_off_t offset, int ltype,
 	struct tdb_lock_type *new_lck;
 
 	if (offset > TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE + tdb->map_size / 8) {
-		tdb->ecode = TDB_ERR_LOCK;
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-			 "tdb_nest_lock: invalid offset %llu ltype=%d\n",
-			 (long long)offset, ltype);
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_FATAL,
+			   "tdb_nest_lock: invalid offset %zu ltype=%d",
+			   (size_t)offset, ltype);
 		return -1;
 	}
 
 	if (tdb->flags & TDB_NOLOCK)
 		return 0;
 
+	add_stat(tdb, locks, 1);
+
 	new_lck = find_nestlock(tdb, offset);
 	if (new_lck) {
 		if (new_lck->ltype == F_RDLCK && ltype == F_WRLCK) {
-			tdb->ecode = TDB_ERR_LOCK;
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-				 "tdb_nest_lock: offset %llu has read lock\n",
-				 (long long)offset);
+			tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_FATAL,
+				   "tdb_nest_lock: offset %zu has read lock",
+				   (size_t)offset);
 			return -1;
 		}
 		/* Just increment the struct, posix locks don't stack. */
@@ -303,9 +307,8 @@ static int tdb_nest_lock(struct tdb_context *tdb, tdb_off_t offset, int ltype,
 	if (tdb->num_lockrecs
 	    && offset >= TDB_HASH_LOCK_START
 	    && offset < TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE) {
-		tdb->ecode = TDB_ERR_LOCK;
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-			 "tdb_nest_lock: already have a hash lock?\n");
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_FATAL,
+			   "tdb_nest_lock: already have a hash lock?");
 		return -1;
 	}
 
@@ -313,10 +316,9 @@ static int tdb_nest_lock(struct tdb_context *tdb, tdb_off_t offset, int ltype,
 		tdb->lockrecs,
 		sizeof(*tdb->lockrecs) * (tdb->num_lockrecs+1));
 	if (new_lck == NULL) {
-		tdb->ecode = TDB_ERR_OOM;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_nest_lock: unable to allocate %llu lock struct",
-			 (long long)(tdb->num_lockrecs + 1));
+		tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_ERROR,
+			 "tdb_nest_lock: unable to allocate %zu lock struct",
+			 tdb->num_lockrecs + 1);
 		errno = ENOMEM;
 		return -1;
 	}
@@ -361,9 +363,8 @@ static int tdb_nest_unlock(struct tdb_context *tdb, tdb_off_t off, int ltype)
 
 	lck = find_nestlock(tdb, off);
 	if ((lck == NULL) || (lck->count == 0)) {
-		tdb->ecode = TDB_ERR_LOCK;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_nest_unlock: no lock for %llu\n", (long long)off);
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
+			   "tdb_nest_unlock: no lock for %zu", (size_t)off);
 		return -1;
 	}
 
@@ -448,9 +449,8 @@ int tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
 {
 	/* FIXME: There are no locks on read-only dbs */
 	if (tdb->read_only) {
-		tdb->ecode = TDB_ERR_LOCK;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_allrecord_lock: read-only\n");
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
+			   "tdb_allrecord_lock: read-only");
 		return -1;
 	}
 
@@ -462,49 +462,45 @@ int tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
 
 	if (tdb->allrecord_lock.count) {
 		/* a global lock of a different type exists */
-		tdb->ecode = TDB_ERR_LOCK;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_allrecord_lock: already have %s lock\n",
-			 tdb->allrecord_lock.ltype == F_RDLCK
-			 ? "read" : "write");
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
+			   "tdb_allrecord_lock: already have %s lock",
+			   tdb->allrecord_lock.ltype == F_RDLCK
+			   ? "read" : "write");
 		return -1;
 	}
 
 	if (tdb_has_hash_locks(tdb)) {
 		/* can't combine global and chain locks */
-		tdb->ecode = TDB_ERR_LOCK;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_allrecord_lock: already have chain lock\n");
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
+			 "tdb_allrecord_lock: already have chain lock");
 		return -1;
 	}
 
 	if (upgradable && ltype != F_RDLCK) {
 		/* tdb error: you can't upgrade a write lock! */
-		tdb->ecode = TDB_ERR_LOCK;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_allrecord_lock: can't upgrade a write lock\n");
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
+			   "tdb_allrecord_lock: can't upgrade a write lock");
 		return -1;
 	}
 
+	add_stat(tdb, locks, 1);
 again:
 	/* Lock hashes, gradually. */
 	if (tdb_lock_gradual(tdb, ltype, flags, TDB_HASH_LOCK_START,
 			     TDB_HASH_LOCK_RANGE)) {
 		if (!(flags & TDB_LOCK_PROBE)) {
-			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-				 "tdb_allrecord_lock hashes failed (%s)\n",
-				 strerror(errno));
+			tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_ERROR,
+				   "tdb_allrecord_lock hashes failed");
 		}
 		return -1;
 	}
 
-	/* Lock free lists: there to end of file. */
+	/* Lock free tables: there to end of file. */
 	if (tdb_brlock(tdb, ltype, TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE,
 		       0, flags)) {
 		if (!(flags & TDB_LOCK_PROBE)) {
-			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-				 "tdb_allrecord_lock freelist failed (%s)\n",
-				 strerror(errno));
+			tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_ERROR,
+				 "tdb_allrecord_lock freetables failed");
 		}
 		tdb_brunlock(tdb, ltype, TDB_HASH_LOCK_START, 
 			     TDB_HASH_LOCK_RANGE);
@@ -559,29 +555,19 @@ void tdb_unlock_expand(struct tdb_context *tdb, int ltype)
 /* unlock entire db */
 int tdb_allrecord_unlock(struct tdb_context *tdb, int ltype)
 {
-	/* FIXME: There are no locks on read-only dbs */
-	if (tdb->read_only) {
-		tdb->ecode = TDB_ERR_LOCK;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_allrecord_unlock: read-only\n");
-		return -1;
-	}
-
 	if (tdb->allrecord_lock.count == 0) {
-		tdb->ecode = TDB_ERR_LOCK;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_allrecord_unlock: not locked!\n");
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
+			   "tdb_allrecord_unlock: not locked!");
 		return -1;
 	}
 
 	/* Upgradable locks are marked as write locks. */
 	if (tdb->allrecord_lock.ltype != ltype
 	    && (!tdb->allrecord_lock.off || ltype != F_RDLCK)) {
-		tdb->ecode = TDB_ERR_LOCK;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_allrecord_unlock: have %s lock\n",
-			 tdb->allrecord_lock.ltype == F_RDLCK
-			 ? "read" : "write");
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
+			 "tdb_allrecord_unlock: have %s lock",
+			   tdb->allrecord_lock.ltype == F_RDLCK
+			   ? "read" : "write");
 		return -1;
 	}
 
@@ -642,25 +628,22 @@ int tdb_lock_hashes(struct tdb_context *tdb,
 	}
 
 	if (tdb->allrecord_lock.count) {
-		tdb->ecode = TDB_ERR_LOCK;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_lock_hashes: have %s allrecordlock\n",
-			 tdb->allrecord_lock.ltype == F_RDLCK
-			 ? "read" : "write");
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
+			   "tdb_lock_hashes: already have %s allrecordlock",
+			   tdb->allrecord_lock.ltype == F_RDLCK
+			   ? "read" : "write");
 		return -1;
 	}
 
 	if (tdb_has_free_lock(tdb)) {
-		tdb->ecode = TDB_ERR_LOCK;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_lock_hashes: have free lock already\n");
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
+			   "tdb_lock_hashes: already have free lock");
 		return -1;
 	}
 
 	if (tdb_has_expansion_lock(tdb)) {
-		tdb->ecode = TDB_ERR_LOCK;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_lock_hashes: have expansion lock already\n");
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
+			   "tdb_lock_hashes: already have expansion lock");
 		return -1;
 	}
 
@@ -678,9 +661,8 @@ int tdb_unlock_hashes(struct tdb_context *tdb,
 	if (tdb->allrecord_lock.count) {
 		if (tdb->allrecord_lock.ltype == F_RDLCK
 		    && ltype == F_WRLCK) {
-			tdb->ecode = TDB_ERR_LOCK;
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-				 "tdb_unlock_hashes RO allrecord!\n");
+			tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_FATAL,
+				   "tdb_unlock_hashes RO allrecord!");
 			return -1;
 		}
 		return 0;
@@ -709,17 +691,15 @@ int tdb_lock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off,
 	if (tdb->allrecord_lock.count) {
 		if (tdb->allrecord_lock.ltype == F_WRLCK)
 			return 0;
-		tdb->ecode = TDB_ERR_LOCK;
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-			 "tdb_lock_free_bucket with RO allrecordlock!\n");
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_FATAL,
+			 "tdb_lock_free_bucket with RO allrecordlock!");
 		return -1;
 	}
 
 #if 0 /* FIXME */
 	if (tdb_has_expansion_lock(tdb)) {
-		tdb->ecode = TDB_ERR_LOCK;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_lock_free_bucket: have expansion lock already\n");
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
+			   "tdb_lock_free_bucket: already have expansion lock");
 		return -1;
 	}
 #endif

+ 87 - 36
ccan/tdb2/private.h

@@ -36,6 +36,7 @@
 #include "config.h"
 #include <ccan/tdb2/tdb2.h>
 #include <ccan/likely/likely.h>
+#include <ccan/compiler/compiler.h>
 #ifdef HAVE_BYTESWAP_H
 #include <byteswap.h>
 #endif
@@ -63,9 +64,11 @@ typedef uint64_t tdb_off_t;
 
 #define TDB_MAGIC_FOOD "TDB file\n"
 #define TDB_VERSION ((uint64_t)(0x26011967 + 7))
-#define TDB_MAGIC ((uint64_t)0x1999)
+#define TDB_USED_MAGIC ((uint64_t)0x1999)
+#define TDB_HTABLE_MAGIC ((uint64_t)0x1888)
+#define TDB_CHAIN_MAGIC ((uint64_t)0x1777)
+#define TDB_FTABLE_MAGIC ((uint64_t)0x1666)
 #define TDB_FREE_MAGIC ((uint64_t)0xFE)
-#define TDB_COALESCING_MAGIC ((uint64_t)0xFD)
 #define TDB_HASH_MAGIC (0xA1ABE11A01092008ULL)
 #define TDB_RECOVERY_MAGIC (0xf53bc0e7ad124589ULL)
 #define TDB_RECOVERY_INVALID_MAGIC (0x0ULL)
@@ -91,20 +94,22 @@ typedef uint64_t tdb_off_t;
 #define TDB_SUBLEVEL_HASH_BITS 6
 /* And 8 entries in each group, ie 8 groups per sublevel. */
 #define TDB_HASH_GROUP_BITS 3
+/* This is currently 10: beyond this we chain. */
+#define TDB_MAX_LEVELS (1+(64-TDB_TOPLEVEL_HASH_BITS) / TDB_SUBLEVEL_HASH_BITS)
 
-/* Extend file by least 32 times larger than needed. */
-#define TDB_EXTENSION_FACTOR 32
+/* Extend file by least 100 times larger than needed. */
+#define TDB_EXTENSION_FACTOR 100
 
 /* We steal bits from the offsets to store hash info. */
 #define TDB_OFF_HASH_GROUP_MASK ((1ULL << TDB_HASH_GROUP_BITS) - 1)
 /* We steal this many upper bits, giving a maximum offset of 64 exabytes. */
 #define TDB_OFF_UPPER_STEAL 8
 #define   TDB_OFF_UPPER_STEAL_EXTRA 7
-#define   TDB_OFF_UPPER_STEAL_TRUNCBIT 1
-/* If this is set, hash is truncated (only 1 bit is valid). */
-#define TDB_OFF_HASH_TRUNCATED_BIT 56
-/* The bit number where we store next level of hash. */
+/* The bit number where we store extra hash bits. */
 #define TDB_OFF_HASH_EXTRA_BIT 57
+#define TDB_OFF_UPPER_STEAL_SUBHASH_BIT 56
+
+/* The bit number where we store the extra hash bits. */
 /* Convenience mask to get actual offset. */
 #define TDB_OFF_MASK \
 	(((1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1) - TDB_OFF_HASH_GROUP_MASK)
@@ -116,6 +121,9 @@ typedef uint64_t tdb_off_t;
 #define TDB_MIN_DATA_LEN	\
 	(sizeof(struct tdb_free_record) - sizeof(struct tdb_used_record))
 
+/* Indicates this entry is not on an flist (can happen during coalescing) */
+#define TDB_FTABLE_NONE ((1ULL << TDB_OFF_UPPER_STEAL) - 1)
+
 #if !HAVE_BSWAP_64
 static inline uint64_t bswap_64(uint64_t x)
 {
@@ -173,20 +181,30 @@ static inline uint16_t rec_magic(const struct tdb_used_record *r)
 }
 
 struct tdb_free_record {
-        uint64_t magic_and_meta; /* TDB_OFF_UPPER_STEAL bits of magic */
-        uint64_t data_len; /* Not counting these two fields. */
-	/* This is why the minimum record size is 16 bytes.  */
-	uint64_t next, prev;
+        uint64_t magic_and_prev; /* TDB_OFF_UPPER_STEAL bits magic, then prev */
+        uint64_t ftable_and_len; /* Len not counting these two fields. */
+	/* This is why the minimum record size is 8 bytes.  */
+	uint64_t next;
 };
 
+static inline uint64_t frec_prev(const struct tdb_free_record *f)
+{
+	return f->magic_and_prev & ((1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1);
+}
+
 static inline uint64_t frec_magic(const struct tdb_free_record *f)
 {
-	return f->magic_and_meta >> (64 - TDB_OFF_UPPER_STEAL);
+	return f->magic_and_prev >> (64 - TDB_OFF_UPPER_STEAL);
 }
 
-static inline uint64_t frec_flist(const struct tdb_free_record *f)
+static inline uint64_t frec_len(const struct tdb_free_record *f)
 {
-	return f->magic_and_meta & ((1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1);
+	return f->ftable_and_len & ((1ULL << (64 - TDB_OFF_UPPER_STEAL))-1);
+}
+
+static inline unsigned frec_ftable(const struct tdb_free_record *f)
+{
+	return f->ftable_and_len >> (64 - TDB_OFF_UPPER_STEAL);
 }
 
 struct tdb_recovery_record {
@@ -199,6 +217,12 @@ struct tdb_recovery_record {
 	uint64_t eof;
 };
 
+/* If we bottom out of the subhashes, we chain. */
+struct tdb_chain {
+	tdb_off_t rec[1 << TDB_HASH_GROUP_BITS];
+	tdb_off_t next;
+};
+
 /* this is stored at the front of every database */
 struct tdb_header {
 	char magic_food[64]; /* for /etc/magic */
@@ -206,7 +230,7 @@ struct tdb_header {
 	uint64_t version; /* version of the code */
 	uint64_t hash_test; /* result of hashing HASH_MAGIC. */
 	uint64_t hash_seed; /* "random" seed written at creation time. */
-	tdb_off_t free_list; /* (First) free list. */
+	tdb_off_t free_table; /* (First) free table. */
 	tdb_off_t recovery; /* Transaction recovery area. */
 
 	tdb_off_t reserved[26];
@@ -215,7 +239,7 @@ struct tdb_header {
 	tdb_off_t hashtable[1ULL << TDB_TOPLEVEL_HASH_BITS];
 };
 
-struct tdb_freelist {
+struct tdb_freetable {
 	struct tdb_used_record hdr;
 	tdb_off_t next;
 	tdb_off_t buckets[TDB_FREE_BUCKETS];
@@ -246,7 +270,7 @@ struct traverse_info {
 		/* We ignore groups here, and treat it as a big array. */
 		unsigned entry;
 		unsigned int total_buckets;
-	} levels[64 / TDB_SUBLEVEL_HASH_BITS];
+	} levels[TDB_MAX_LEVELS + 1];
 	unsigned int num_levels;
 	unsigned int toplevel_group;
 	/* This makes delete-everything-inside-traverse work as expected. */
@@ -269,6 +293,15 @@ struct tdb_lock_type {
 	uint32_t ltype;
 };
 
+/* This is only needed for tdb_access_commit, but used everywhere to
+ * simplify. */
+struct tdb_access_hdr {
+	struct tdb_access_hdr *next;
+	tdb_off_t off;
+	tdb_len_t len;
+	bool convert;
+};
+
 struct tdb_context {
 	/* Filename of the database. */
 	const char *name;
@@ -298,8 +331,8 @@ struct tdb_context {
 	uint32_t flags;
 
 	/* Logging function */
-	tdb_logfn_t log;
-	void *log_priv;
+	tdb_logfn_t logfn;
+	void *log_private;
 
 	/* Hash function. */
 	tdb_hashfn_t khash;
@@ -309,17 +342,23 @@ struct tdb_context {
 	/* Set if we are in a transaction. */
 	struct tdb_transaction *transaction;
 	
-	/* What freelist are we using? */
-	uint64_t flist_off;
+	/* What free table are we using? */
+	tdb_off_t ftable_off;
+	unsigned int ftable;
 
 	/* IO methods: changes for transactions. */
 	const struct tdb_methods *methods;
 
 	/* Lock information */
 	struct tdb_lock_type allrecord_lock;
-	uint64_t num_lockrecs;
+	size_t num_lockrecs;
 	struct tdb_lock_type *lockrecs;
 
+	struct tdb_attribute_stats *stats;
+
+	/* Direct access information */
+	struct tdb_access_hdr *access;
+
 	/* Single list of all TDBs, to avoid multiple opens. */
 	struct tdb_context *next;
 	dev_t device;	
@@ -331,7 +370,7 @@ struct tdb_methods {
 	int (*write)(struct tdb_context *, tdb_off_t, const void *, tdb_len_t);
 	int (*oob)(struct tdb_context *, tdb_off_t, bool);
 	int (*expand_file)(struct tdb_context *, tdb_len_t);
-	void *(*direct)(struct tdb_context *, tdb_off_t, size_t);
+	void *(*direct)(struct tdb_context *, tdb_off_t, size_t, bool);
 };
 
 /*
@@ -367,29 +406,32 @@ int delete_from_hash(struct tdb_context *tdb, struct hash_info *h);
 bool is_subhash(tdb_off_t val);
 
 /* free.c: */
-int tdb_flist_init(struct tdb_context *tdb);
+int tdb_ftable_init(struct tdb_context *tdb);
 
 /* check.c needs these to iterate through free lists. */
-tdb_off_t first_flist(struct tdb_context *tdb);
-tdb_off_t next_flist(struct tdb_context *tdb, tdb_off_t flist);
+tdb_off_t first_ftable(struct tdb_context *tdb);
+tdb_off_t next_ftable(struct tdb_context *tdb, tdb_off_t ftable);
 
-/* If this fails, try tdb_expand. */
+/* This returns space or TDB_OFF_ERR. */
 tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
-		uint64_t hash, bool growing);
+		uint64_t hash, unsigned magic, bool growing);
 
 /* Put this record in a free list. */
 int add_free_record(struct tdb_context *tdb,
 		    tdb_off_t off, tdb_len_t len_with_header);
 
-/* Set up header for a used record. */
+/* Set up header for a used/ftable/htable/chain record. */
 int set_header(struct tdb_context *tdb,
 	       struct tdb_used_record *rec,
-	       uint64_t keylen, uint64_t datalen,
+	       unsigned magic, uint64_t keylen, uint64_t datalen,
 	       uint64_t actuallen, unsigned hashlow);
 
 /* Used by tdb_check to verify. */
 unsigned int size_to_bucket(tdb_len_t data_len);
-tdb_off_t bucket_off(tdb_off_t flist_off, unsigned bucket);
+tdb_off_t bucket_off(tdb_off_t ftable_off, unsigned bucket);
+
+/* Used by tdb_summary */
+size_t dead_space(struct tdb_context *tdb, tdb_off_t off);
 
 /* io.c: */
 /* Initialize tdb->methods. */
@@ -402,10 +444,6 @@ void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size);
 void tdb_munmap(struct tdb_context *tdb);
 void tdb_mmap(struct tdb_context *tdb);
 
-/* Either make a copy into pad and return that, or return ptr into mmap.
- * Converts endian (ie. will use pad in that case). */
-void *tdb_get(struct tdb_context *tdb, tdb_off_t off, void *pad, size_t len);
-
 /* Either alloc a copy, or give direct access.  Release frees or noop. */
 const void *tdb_access_read(struct tdb_context *tdb,
 			    tdb_off_t off, tdb_len_t len, bool convert);
@@ -452,6 +490,13 @@ int tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
 int tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
 		     void *rec, size_t len);
 
+/* Adds a stat, if it's in range. */
+void add_stat_(struct tdb_context *tdb, uint64_t *stat, size_t val);
+#define add_stat(tdb, statname, val)					\
+	do {								\
+		if (unlikely((tdb)->stats))				\
+			add_stat_((tdb), &(tdb)->stats->statname, (val)); \
+	} while (0)
 
 /* lock.c: */
 void tdb_lock_init(struct tdb_context *tdb);
@@ -507,6 +552,12 @@ int next_in_hash(struct tdb_context *tdb, int ltype,
 int tdb_transaction_recover(struct tdb_context *tdb);
 bool tdb_needs_recovery(struct tdb_context *tdb);
 
+/* tdb.c: */
+void COLD tdb_logerr(struct tdb_context *tdb,
+		     enum TDB_ERROR ecode,
+		     enum tdb_debug_level level,
+		     const char *fmt, ...);
+
 #ifdef TDB_TRACE
 void tdb_trace(struct tdb_context *tdb, const char *op);
 void tdb_trace_seqnum(struct tdb_context *tdb, uint32_t seqnum, const char *op);

+ 63 - 36
ccan/tdb2/summary.c

@@ -37,33 +37,43 @@ static int count_hash(struct tdb_context *tdb,
 
 static bool summarize(struct tdb_context *tdb,
 		      struct tally *hashes,
-		      struct tally *flists,
+		      struct tally *ftables,
 		      struct tally *free,
 		      struct tally *keys,
 		      struct tally *data,
 		      struct tally *extra,
 		      struct tally *uncoal,
-		      struct tally *buckets)
+		      struct tally *buckets,
+		      struct tally *chains)
 {
 	tdb_off_t off;
 	tdb_len_t len;
 	tdb_len_t unc = 0;
 
 	for (off = sizeof(struct tdb_header); off < tdb->map_size; off += len) {
-		union {
+		const union {
 			struct tdb_used_record u;
 			struct tdb_free_record f;
-		} pad, *p;
-		p = tdb_get(tdb, off, &pad, sizeof(pad));
+			struct tdb_recovery_record r;
+		} *p;
+		/* We might not be able to get the whole thing. */
+		p = tdb_access_read(tdb, off, sizeof(p->f), true);
 		if (!p)
 			return false;
-		if (rec_magic(&p->u) != TDB_MAGIC) {
-			len = p->f.data_len;
+		if (p->r.magic == TDB_RECOVERY_INVALID_MAGIC
+		    || p->r.magic == TDB_RECOVERY_MAGIC) {
+			if (unc) {
+				tally_add(uncoal, unc);
+				unc = 0;
+			}
+			len = sizeof(p->r) + p->r.max_len;
+		} else if (frec_magic(&p->f) == TDB_FREE_MAGIC) {
+			len = frec_len(&p->f);
 			tally_add(free, len);
 			tally_add(buckets, size_to_bucket(len));
 			len += sizeof(p->u);
 			unc++;
-		} else {
+		} else if (rec_magic(&p->u) == TDB_USED_MAGIC) {
 			if (unc) {
 				tally_add(uncoal, unc);
 				unc = 0;
@@ -73,25 +83,35 @@ static bool summarize(struct tdb_context *tdb,
 				+ rec_data_length(&p->u)
 				+ rec_extra_padding(&p->u);
 
-			/* FIXME: Use different magic for hashes, flists. */
-			if (!rec_key_length(&p->u) && rec_hash(&p->u) < 2) {
-				if (rec_hash(&p->u) == 0) {
-					int count = count_hash(tdb,
-							off + sizeof(p->u),
-							TDB_SUBLEVEL_HASH_BITS);
-					if (count == -1)
-						return false;
-					tally_add(hashes, count);
-				} else {
-					tally_add(flists,
-						  rec_data_length(&p->u));
-				}
-			} else {
-				tally_add(keys, rec_key_length(&p->u));
-				tally_add(data, rec_data_length(&p->u));
-			}
+			tally_add(keys, rec_key_length(&p->u));
+			tally_add(data, rec_data_length(&p->u));
+			tally_add(extra, rec_extra_padding(&p->u));
+		} else if (rec_magic(&p->u) == TDB_HTABLE_MAGIC) {
+			int count = count_hash(tdb,
+					       off + sizeof(p->u),
+					       TDB_SUBLEVEL_HASH_BITS);
+			if (count == -1)
+				return false;
+			tally_add(hashes, count);
+			tally_add(extra, rec_extra_padding(&p->u));
+			len = sizeof(p->u)
+				+ rec_data_length(&p->u)
+				+ rec_extra_padding(&p->u);
+		} else if (rec_magic(&p->u) == TDB_FTABLE_MAGIC) {
+			len = sizeof(p->u)
+				+ rec_data_length(&p->u)
+				+ rec_extra_padding(&p->u);
+			tally_add(ftables, rec_data_length(&p->u));
+			tally_add(extra, rec_extra_padding(&p->u));
+		} else if (rec_magic(&p->u) == TDB_CHAIN_MAGIC) {
+			len = sizeof(p->u)
+				+ rec_data_length(&p->u)
+				+ rec_extra_padding(&p->u);
+			tally_add(chains, 1);
 			tally_add(extra, rec_extra_padding(&p->u));
-		}
+		} else
+			len = dead_space(tdb, off);
+		tdb_access_release(tdb, p);
 	}
 	if (unc)
 		tally_add(uncoal, unc);
@@ -110,6 +130,7 @@ static bool summarize(struct tdb_context *tdb,
 	"Smallest/average/largest uncoalesced runs: %zu/%zu/%zu\n%s" \
 	"Number of free lists: %zu\n%s" \
 	"Toplevel hash used: %u of %u\n" \
+	"Number of chains: %zu\n" \
 	"Number of subhashes: %zu\n" \
 	"Smallest/average/largest subhash entries: %zu/%zu/%zu\n%s" \
 	"Percentage keys/data/padding/free/rechdrs/freehdrs/hashes: %.0f/%.0f/%.0f/%.0f/%.0f/%.0f/%.0f\n"
@@ -127,8 +148,8 @@ static bool summarize(struct tdb_context *tdb,
 char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags)
 {
 	tdb_len_t len;
-	struct tally *flists, *hashes, *freet, *keys, *data, *extra, *uncoal,
-		*buckets;
+	struct tally *ftables, *hashes, *freet, *keys, *data, *extra, *uncoal,
+		*buckets, *chains;
 	char *hashesg, *freeg, *keysg, *datag, *extrag, *uncoalg, *bucketsg;
 	char *ret = NULL;
 
@@ -143,7 +164,7 @@ char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags)
 	}
 
 	/* Start stats off empty. */
-	flists = tally_new(HISTO_HEIGHT);
+	ftables = tally_new(HISTO_HEIGHT);
 	hashes = tally_new(HISTO_HEIGHT);
 	freet = tally_new(HISTO_HEIGHT);
 	keys = tally_new(HISTO_HEIGHT);
@@ -151,14 +172,16 @@ char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags)
 	extra = tally_new(HISTO_HEIGHT);
 	uncoal = tally_new(HISTO_HEIGHT);
 	buckets = tally_new(HISTO_HEIGHT);
-	if (!flists || !hashes || !freet || !keys || !data || !extra
-	    || !uncoal || !buckets) {
-		tdb->ecode = TDB_ERR_OOM;
+	chains = tally_new(HISTO_HEIGHT);
+	if (!ftables || !hashes || !freet || !keys || !data || !extra
+	    || !uncoal || !buckets || !chains) {
+		tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_ERROR,
+			   "tdb_summary: failed to allocate tally structures");
 		goto unlock;
 	}
 
-	if (!summarize(tdb, hashes, flists, freet, keys, data, extra, uncoal,
-		       buckets))
+	if (!summarize(tdb, hashes, ftables, freet, keys, data, extra, uncoal,
+		       buckets, chains))
 		goto unlock;
 
 	if (flags & TDB_SUMMARY_HISTOGRAMS) {
@@ -206,6 +229,7 @@ char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags)
 		      count_hash(tdb, offsetof(struct tdb_header, hashtable),
 				 TDB_TOPLEVEL_HASH_BITS),
 		      1 << TDB_TOPLEVEL_HASH_BITS,
+		      tally_num(chains),
 		      tally_num(hashes),
 		      tally_min(hashes), tally_mean(hashes), tally_max(hashes),
 		      hashesg ? hashesg : "",
@@ -215,11 +239,12 @@ char *tdb_summary(struct tdb_context *tdb, enum tdb_summary_flags flags)
 		      tally_total(freet, NULL) * 100.0 / tdb->map_size,
 		      (tally_num(keys) + tally_num(freet) + tally_num(hashes))
 		      * sizeof(struct tdb_used_record) * 100.0 / tdb->map_size,
-		      tally_num(flists) * sizeof(struct tdb_freelist)
+		      tally_num(ftables) * sizeof(struct tdb_freetable)
 		      * 100.0 / tdb->map_size,
 		      (tally_num(hashes)
 		       * (sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS)
-		       + (sizeof(tdb_off_t) << TDB_TOPLEVEL_HASH_BITS))
+		       + (sizeof(tdb_off_t) << TDB_TOPLEVEL_HASH_BITS)
+		       + sizeof(struct tdb_chain) * tally_num(chains))
 		      * 100.0 / tdb->map_size);
 
 unlock:
@@ -237,6 +262,8 @@ unlock:
 	free(data);
 	free(extra);
 	free(uncoal);
+	free(ftables);
+	free(chains);
 
 	tdb_allrecord_unlock(tdb, F_RDLCK);
 	tdb_unlock_expand(tdb, F_RDLCK);

+ 148 - 91
ccan/tdb2/tdb.c

@@ -1,8 +1,7 @@
 #include "private.h"
 #include <ccan/tdb2/tdb2.h>
-#include <ccan/build_assert/build_assert.h>
-#include <ccan/likely/likely.h>
 #include <assert.h>
+#include <stdarg.h>
 
 /* The null return. */
 struct tdb_data tdb_null = { .dptr = NULL, .dsize = 0 };
@@ -10,13 +9,6 @@ struct tdb_data tdb_null = { .dptr = NULL, .dsize = 0 };
 /* all contexts, to ensure no double-opens (fcntl locks don't nest!) */
 static struct tdb_context *tdbs = NULL;
 
-PRINTF_FMT(4, 5) static void
-null_log_fn(struct tdb_context *tdb,
-	    enum tdb_debug_level level, void *priv,
-	    const char *fmt, ...)
-{
-}
-
 static bool tdb_already_open(dev_t device, ino_t ino)
 {
 	struct tdb_context *i;
@@ -39,8 +31,8 @@ static uint64_t random_number(struct tdb_context *tdb)
 	fd = open("/dev/urandom", O_RDONLY);
 	if (fd >= 0) {
 		if (tdb_read_all(fd, &ret, sizeof(ret))) {
-			tdb->log(tdb, TDB_DEBUG_TRACE, tdb->log_priv,
-				 "tdb_open: random from /dev/urandom\n");
+			tdb_logerr(tdb, TDB_SUCCESS, TDB_DEBUG_TRACE,
+				 "tdb_open: random from /dev/urandom");
 			close(fd);
 			return ret;
 		}
@@ -55,9 +47,9 @@ static uint64_t random_number(struct tdb_context *tdb)
 			char reply[1 + sizeof(uint64_t)];
 			int r = read(fd, reply, sizeof(reply));
 			if (r > 1) {
-				tdb->log(tdb, TDB_DEBUG_TRACE, tdb->log_priv,
-					 "tdb_open: %u random bytes from"
-					 " /dev/egd-pool\n", r-1);
+				tdb_logerr(tdb, TDB_SUCCESS, TDB_DEBUG_TRACE,
+					   "tdb_open: %u random bytes from"
+					   " /dev/egd-pool", r-1);
 				/* Copy at least some bytes. */
 				memcpy(&ret, reply+1, r - 1);
 				if (reply[0] == sizeof(uint64_t)
@@ -73,14 +65,14 @@ static uint64_t random_number(struct tdb_context *tdb)
 	/* Fallback: pid and time. */
 	gettimeofday(&now, NULL);
 	ret = getpid() * 100132289ULL + now.tv_sec * 1000000ULL + now.tv_usec;
-	tdb->log(tdb, TDB_DEBUG_TRACE, tdb->log_priv,
-		 "tdb_open: random from getpid and time\n");
+	tdb_logerr(tdb, TDB_SUCCESS, TDB_DEBUG_TRACE,
+		   "tdb_open: random from getpid and time");
 	return ret;
 }
 
 struct new_database {
 	struct tdb_header hdr;
-	struct tdb_freelist flist;
+	struct tdb_freetable ftable;
 };
 
 /* initialise a new database */
@@ -109,11 +101,11 @@ static int tdb_new_database(struct tdb_context *tdb,
 	memset(newdb.hdr.hashtable, 0, sizeof(newdb.hdr.hashtable));
 
 	/* Free is empty. */
-	newdb.hdr.free_list = offsetof(struct new_database, flist);
-	memset(&newdb.flist, 0, sizeof(newdb.flist));
-	set_header(NULL, &newdb.flist.hdr, 0,
-		   sizeof(newdb.flist) - sizeof(newdb.flist.hdr),
-		   sizeof(newdb.flist) - sizeof(newdb.flist.hdr), 1);
+	newdb.hdr.free_table = offsetof(struct new_database, ftable);
+	memset(&newdb.ftable, 0, sizeof(newdb.ftable));
+	set_header(NULL, &newdb.ftable.hdr, TDB_FTABLE_MAGIC, 0,
+		   sizeof(newdb.ftable) - sizeof(newdb.ftable.hdr),
+		   sizeof(newdb.ftable) - sizeof(newdb.ftable.hdr), 0);
 
 	/* Magic food */
 	memset(newdb.hdr.magic_food, 0, sizeof(newdb.hdr.magic_food));
@@ -130,7 +122,8 @@ static int tdb_new_database(struct tdb_context *tdb,
 		tdb->map_size = sizeof(newdb);
 		tdb->map_ptr = malloc(tdb->map_size);
 		if (!tdb->map_ptr) {
-			tdb->ecode = TDB_ERR_OOM;
+			tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
+				   "tdb_new_database: failed to allocate");
 			return -1;
 		}
 		memcpy(tdb->map_ptr, &newdb, tdb->map_size);
@@ -143,7 +136,9 @@ static int tdb_new_database(struct tdb_context *tdb,
 		return -1;
 
 	if (!tdb_pwrite_all(tdb->fd, &newdb, sizeof(newdb), 0)) {
-		tdb->ecode = TDB_ERR_IO;
+		tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
+			   "tdb_new_database: failed to write: %s",
+			   strerror(errno));
 		return -1;
 	}
 	return 0;
@@ -155,7 +150,7 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
 {
 	struct tdb_context *tdb;
 	struct stat st;
-	int save_errno;
+	int saved_errno = 0;
 	uint64_t hash_test;
 	unsigned v;
 	struct tdb_header hdr;
@@ -165,7 +160,7 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
 	if (!tdb) {
 		/* Can't log this */
 		errno = ENOMEM;
-		goto fail;
+		return NULL;
 	}
 	tdb->name = NULL;
 	tdb->map_ptr = NULL;
@@ -174,9 +169,10 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
 	tdb->map_size = sizeof(struct tdb_header);
 	tdb->ecode = TDB_SUCCESS;
 	tdb->flags = tdb_flags;
-	tdb->log = null_log_fn;
-	tdb->log_priv = NULL;
+	tdb->logfn = NULL;
 	tdb->transaction = NULL;
+	tdb->stats = NULL;
+	tdb->access = NULL;
 	tdb_hash_init(tdb);
 	tdb_io_init(tdb);
 	tdb_lock_init(tdb);
@@ -184,8 +180,8 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
 	while (attr) {
 		switch (attr->base.attr) {
 		case TDB_ATTRIBUTE_LOG:
-			tdb->log = attr->log.log_fn;
-			tdb->log_priv = attr->log.log_private;
+			tdb->logfn = attr->log.log_fn;
+			tdb->log_private = attr->log.log_private;
 			break;
 		case TDB_ATTRIBUTE_HASH:
 			tdb->khash = attr->hash.hash_fn;
@@ -194,20 +190,24 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
 		case TDB_ATTRIBUTE_SEED:
 			seed = &attr->seed;
 			break;
+		case TDB_ATTRIBUTE_STATS:
+			tdb->stats = &attr->stats;
+			/* They have stats we don't know about?  Tell them. */
+			if (tdb->stats->size > sizeof(attr->stats))
+				tdb->stats->size = sizeof(attr->stats);
+			break;
 		default:
-			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-				 "tdb_open: unknown attribute type %u\n",
-				 attr->base.attr);
-			errno = EINVAL;
+			tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_ERROR,
+				   "tdb_open: unknown attribute type %u",
+				   attr->base.attr);
 			goto fail;
 		}
 		attr = attr->base.next;
 	}
 
 	if ((open_flags & O_ACCMODE) == O_WRONLY) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_open: can't open tdb %s write-only\n", name);
-		errno = EINVAL;
+		tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_ERROR,
+			   "tdb_open: can't open tdb %s write-only", name);
 		goto fail;
 	}
 
@@ -225,21 +225,21 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
 	if (tdb->flags & TDB_INTERNAL) {
 		tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
 		if (tdb_new_database(tdb, seed, &hdr) != 0) {
-			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-				 "tdb_open: tdb_new_database failed!");
 			goto fail;
 		}
 		tdb_convert(tdb, &hdr.hash_seed, sizeof(hdr.hash_seed));
 		tdb->hash_seed = hdr.hash_seed;
-		tdb_flist_init(tdb);
+		tdb_ftable_init(tdb);
 		return tdb;
 	}
 
 	if ((tdb->fd = open(name, open_flags, mode)) == -1) {
-		tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv,
-			 "tdb_open: could not open file %s: %s\n",
-			 name, strerror(errno));
-		goto fail;	/* errno set by open(2) */
+		/* errno set by open(2) */
+		saved_errno = errno;
+		tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
+			   "tdb_open: could not open file %s: %s",
+			   name, strerror(errno));
+		goto fail;
 	}
 
 	/* on exec, don't inherit the fd */
@@ -248,19 +248,19 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
 
 	/* ensure there is only one process initialising at once */
 	if (tdb_lock_open(tdb, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK) == -1) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_open: failed to get open lock on %s: %s\n",
-			 name, strerror(errno));
-		goto fail;	/* errno set by tdb_brlock */
+		/* errno set by tdb_brlock */
+		saved_errno = errno;
+		goto fail;
 	}
 
 	if (!tdb_pread_all(tdb->fd, &hdr, sizeof(hdr), 0)
 	    || strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
-		if (!(open_flags & O_CREAT)
-		    || tdb_new_database(tdb, seed, &hdr) == -1) {
-			if (errno == 0) {
-				errno = EIO; /* ie bad format or something */
-			}
+		if (!(open_flags & O_CREAT)) {
+			tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
+				   "tdb_open: %s is not a tdb file", name);
+			goto fail;
+		}
+		if (tdb_new_database(tdb, seed, &hdr) == -1) {
 			goto fail;
 		}
 	} else if (hdr.version != TDB_VERSION) {
@@ -268,10 +268,9 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
 			tdb->flags |= TDB_CONVERT;
 		else {
 			/* wrong version */
-			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-				 "tdb_open: %s is unknown version 0x%llx\n",
-				 name, (long long)hdr.version);
-			errno = EIO;
+			tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
+				   "tdb_open: %s is unknown version 0x%llx",
+				   name, (long long)hdr.version);
 			goto fail;
 		}
 	}
@@ -282,29 +281,34 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
 	hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
 	if (hdr.hash_test != hash_test) {
 		/* wrong hash variant */
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_open: %s uses a different hash function\n",
-			 name);
-		errno = EIO;
+		tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
+			   "tdb_open: %s uses a different hash function",
+			   name);
 		goto fail;
 	}
 
-	if (fstat(tdb->fd, &st) == -1)
+	if (fstat(tdb->fd, &st) == -1) {
+		saved_errno = errno;
+		tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
+			   "tdb_open: could not stat open %s: %s",
+			   name, strerror(errno));
 		goto fail;
+	}
 
 	/* Is it already in the open list?  If so, fail. */
 	if (tdb_already_open(st.st_dev, st.st_ino)) {
 		/* FIXME */
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_open: %s (%d,%d) is already open in this process\n",
-			 name, (int)st.st_dev, (int)st.st_ino);
-		errno = EBUSY;
+		tdb_logerr(tdb, TDB_ERR_NESTING, TDB_DEBUG_ERROR,
+			   "tdb_open: %s (%d,%d) is already open in this"
+			   " process",
+			   name, (int)st.st_dev, (int)st.st_ino);
 		goto fail;
 	}
 
 	tdb->name = strdup(name);
 	if (!tdb->name) {
-		errno = ENOMEM;
+		tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_ERROR,
+			   "tdb_open: failed to allocate name");
 		goto fail;
 	}
 
@@ -317,11 +321,10 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
 
 	/* Now it's fully formed, recover if necessary. */
 	if (tdb_needs_recovery(tdb) && tdb_lock_and_recover(tdb) == -1) {
-		errno = EIO;
 		goto fail;
 	}
 
-	if (tdb_flist_init(tdb) == -1)
+	if (tdb_ftable_init(tdb) == -1)
 		goto fail;
 
 	tdb->next = tdbs;
@@ -329,10 +332,30 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
 	return tdb;
 
  fail:
-	save_errno = errno;
-
-	if (!tdb)
-		return NULL;
+	/* Map ecode to some logical errno. */
+	if (!saved_errno) {
+		switch (tdb->ecode) {
+		case TDB_ERR_CORRUPT:
+		case TDB_ERR_IO:
+			saved_errno = EIO;
+			break;
+		case TDB_ERR_LOCK:
+			saved_errno = EWOULDBLOCK;
+			break;
+		case TDB_ERR_OOM:
+			saved_errno = ENOMEM;
+			break;
+		case TDB_ERR_EINVAL:
+			saved_errno = EINVAL;
+			break;
+		case TDB_ERR_NESTING:
+			saved_errno = EBUSY;
+			break;
+		default:
+			saved_errno = EINVAL;
+			break;
+		}
+	}
 
 #ifdef TDB_TRACE
 	close(tdb->tracefd);
@@ -346,15 +369,14 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
 	free((char *)tdb->name);
 	if (tdb->fd != -1)
 		if (close(tdb->fd) != 0)
-			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-				 "tdb_open: failed to close tdb->fd"
-				 " on error!\n");
+			tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
+				   "tdb_open: failed to close tdb->fd"
+				   " on error!");
 	free(tdb);
-	errno = save_errno;
+	errno = saved_errno;
 	return NULL;
 }
 
-/* FIXME: modify, don't rewrite! */
 static int update_rec_hdr(struct tdb_context *tdb,
 			  tdb_off_t off,
 			  tdb_len_t keylen,
@@ -364,7 +386,8 @@ static int update_rec_hdr(struct tdb_context *tdb,
 {
 	uint64_t dataroom = rec_data_length(rec) + rec_extra_padding(rec);
 
-	if (set_header(tdb, rec, keylen, datalen, keylen + dataroom, h))
+	if (set_header(tdb, rec, TDB_USED_MAGIC, keylen, datalen,
+		       keylen + dataroom, h))
 		return -1;
 
 	return tdb_write_convert(tdb, off, rec, sizeof(*rec));
@@ -380,12 +403,14 @@ static int replace_data(struct tdb_context *tdb,
 	tdb_off_t new_off;
 
 	/* Allocate a new record. */
-	new_off = alloc(tdb, key.dsize, dbuf.dsize, h->h, growing);
+	new_off = alloc(tdb, key.dsize, dbuf.dsize, h->h, TDB_USED_MAGIC,
+			growing);
 	if (unlikely(new_off == TDB_OFF_ERR))
 		return -1;
 
 	/* We didn't like the existing one: remove it. */
 	if (old_off) {
+		add_stat(tdb, frees, 1);
 		add_free_record(tdb, old_off,
 				sizeof(struct tdb_used_record)
 				+ key.dsize + old_room);
@@ -445,7 +470,6 @@ int tdb_store(struct tdb_context *tdb,
 						  h.hlock_range, F_WRLCK);
 				return 0;
 			}
-			/* FIXME: See if right record is free? */
 		} else {
 			if (flag == TDB_MODIFY) {
 				/* if the record doesn't exist and we
@@ -502,15 +526,13 @@ int tdb_append(struct tdb_context *tdb,
 					  F_WRLCK);
 			return 0;
 		}
-		/* FIXME: Check right record free? */
 
 		/* Slow path. */
 		newdata = malloc(key.dsize + old_dlen + dbuf.dsize);
 		if (!newdata) {
-			tdb->ecode = TDB_ERR_OOM;
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-				 "tdb_append: cannot allocate %llu bytes!\n",
-				 (long long)key.dsize + old_dlen + dbuf.dsize);
+			tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
+				   "tdb_append: failed to allocate %zu bytes",
+				   (size_t)(key.dsize+old_dlen+dbuf.dsize));
 			goto fail;
 		}
 		if (tdb->methods->read(tdb, off + sizeof(rec) + key.dsize,
@@ -582,6 +604,7 @@ int tdb_delete(struct tdb_context *tdb, struct tdb_data key)
 		goto unlock_err;
 
 	/* Free the deleted entry. */
+	add_stat(tdb, frees, 1);
 	if (add_free_record(tdb, off,
 			    sizeof(struct tdb_used_record)
 			    + rec_key_length(&rec)
@@ -602,12 +625,11 @@ int tdb_close(struct tdb_context *tdb)
 	struct tdb_context **i;
 	int ret = 0;
 
-	/* FIXME:
+	tdb_trace(tdb, "tdb_close");
+
 	if (tdb->transaction) {
 		tdb_transaction_cancel(tdb);
 	}
-	*/
-	tdb_trace(tdb, "tdb_close");
 
 	if (tdb->map_ptr) {
 		if (tdb->flags & TDB_INTERNAL)
@@ -638,12 +660,12 @@ int tdb_close(struct tdb_context *tdb)
 	return ret;
 }
 
-enum TDB_ERROR tdb_error(struct tdb_context *tdb)
+enum TDB_ERROR tdb_error(const struct tdb_context *tdb)
 {
 	return tdb->ecode;
 }
 
-const char *tdb_errorstr(struct tdb_context *tdb)
+const char *tdb_errorstr(const struct tdb_context *tdb)
 {
 	/* Gcc warns if you miss a case in the switch, so use that. */
 	switch (tdb->ecode) {
@@ -660,3 +682,38 @@ const char *tdb_errorstr(struct tdb_context *tdb)
 	}
 	return "Invalid error code";
 }
+
+/* Record an error on the context and emit a formatted log message.
+ * Always sets tdb->ecode; the message is only delivered if a log
+ * function was attached.  errno is saved on entry and restored on the
+ * normal exit path, because tdb_open maps errno to a return value
+ * after logging. */
+void COLD tdb_logerr(struct tdb_context *tdb,
+		     enum TDB_ERROR ecode,
+		     enum tdb_debug_level level,
+		     const char *fmt, ...)
+{
+	char *message;
+	va_list ap;
+	size_t len;
+	/* tdb_open paths care about errno, so save it. */
+	int saved_errno = errno;
+
+	tdb->ecode = ecode;
+
+	if (!tdb->logfn)
+		return;
+
+	/* FIXME: Doesn't assume asprintf. */
+	/* Two-pass formatting: vsnprintf with a NULL buffer returns the
+	 * required length (excluding the NUL), per C99. */
+	va_start(ap, fmt);
+	len = vsnprintf(NULL, 0, fmt, ap);
+	va_end(ap);
+
+	message = malloc(len + 1);
+	if (!message) {
+		/* NOTE(review): this return skips the errno restore
+		 * below, and malloc may have clobbered errno -- confirm
+		 * whether tdb_open callers can observe that. */
+		tdb->logfn(tdb, level, tdb->log_private,
+			   "out of memory formatting message");
+		return;
+	}
+	/* Second pass: format into the sized buffer. */
+	va_start(ap, fmt);
+	len = vsprintf(message, fmt, ap);
+	va_end(ap);
+	tdb->logfn(tdb, level, tdb->log_private, message);
+	free(message);
+	errno = saved_errno;
+}

+ 34 - 5
ccan/tdb2/tdb2.h

@@ -67,7 +67,7 @@ enum TDB_ERROR {TDB_SUCCESS=0, TDB_ERR_CORRUPT, TDB_ERR_IO, TDB_ERR_LOCK,
 /* flags for tdb_summary. Logical or to combine. */
 enum tdb_summary_flags { TDB_SUMMARY_HISTOGRAMS = 1 };
 
-/* debugging uses one of the following levels */
+/* logging uses one of the following levels */
 enum tdb_debug_level {TDB_DEBUG_FATAL = 0, TDB_DEBUG_ERROR, 
 		      TDB_DEBUG_WARNING, TDB_DEBUG_TRACE};
 
@@ -80,14 +80,15 @@ struct tdb_context;
 
 /* FIXME: Make typesafe */
 typedef int (*tdb_traverse_func)(struct tdb_context *, TDB_DATA, TDB_DATA, void *);
-typedef void (*tdb_logfn_t)(struct tdb_context *, enum tdb_debug_level, void *priv, const char *, ...) PRINTF_FMT(4, 5);
+typedef void (*tdb_logfn_t)(struct tdb_context *, enum tdb_debug_level, void *, const char *);
 typedef uint64_t (*tdb_hashfn_t)(const void *key, size_t len, uint64_t seed,
 				 void *priv);
 
 enum tdb_attribute_type {
 	TDB_ATTRIBUTE_LOG = 0,
 	TDB_ATTRIBUTE_HASH = 1,
-	TDB_ATTRIBUTE_SEED = 2
+	TDB_ATTRIBUTE_SEED = 2,
+	TDB_ATTRIBUTE_STATS = 3
 };
 
 struct tdb_attribute_base {
@@ -112,11 +113,39 @@ struct tdb_attribute_seed {
 	uint64_t seed;
 };
 
+struct tdb_attribute_stats {
+	struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_STATS */
+	size_t size; /* = sizeof(struct tdb_attribute_stats) */
+	uint64_t allocs;
+	uint64_t   alloc_subhash;
+	uint64_t   alloc_chain;
+	uint64_t   alloc_bucket_exact;
+	uint64_t   alloc_bucket_max;
+	uint64_t   alloc_leftover;
+	uint64_t   alloc_coalesce_tried;
+	uint64_t     alloc_coalesce_lockfail;
+	uint64_t     alloc_coalesce_race;
+	uint64_t     alloc_coalesce_succeeded;
+	uint64_t        alloc_coalesce_num_merged;
+	uint64_t compares;
+	uint64_t   compare_wrong_bucket;
+	uint64_t   compare_wrong_offsetbits;
+	uint64_t   compare_wrong_keylen;
+	uint64_t   compare_wrong_rechash;
+	uint64_t   compare_wrong_keycmp;
+	uint64_t expands;
+	uint64_t frees;
+	uint64_t locks;
+	uint64_t    lock_lowlevel;
+	uint64_t    lock_nonblock;
+};
+
 union tdb_attribute {
 	struct tdb_attribute_base base;
 	struct tdb_attribute_log log;
 	struct tdb_attribute_hash hash;
 	struct tdb_attribute_seed seed;
+	struct tdb_attribute_stats stats;
 };
 		
 struct tdb_context *tdb_open(const char *name, int tdb_flags,
@@ -139,8 +168,8 @@ int tdb_check(struct tdb_context *tdb,
 	      int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
 	      void *private_data);
 
-enum TDB_ERROR tdb_error(struct tdb_context *tdb);
-const char *tdb_errorstr(struct tdb_context *tdb);
+enum TDB_ERROR tdb_error(const struct tdb_context *tdb);
+const char *tdb_errorstr(const struct tdb_context *tdb);
 
 int tdb_transaction_start(struct tdb_context *tdb);
 void tdb_transaction_cancel(struct tdb_context *tdb);

+ 41 - 38
ccan/tdb2/test/layout.c

@@ -23,20 +23,20 @@ static void add(struct tdb_layout *layout, union tdb_layout_elem elem)
 	layout->elem[layout->num_elems++] = elem;
 }
 
-void tdb_layout_add_freelist(struct tdb_layout *layout)
+void tdb_layout_add_freetable(struct tdb_layout *layout)
 {
 	union tdb_layout_elem elem;
-	elem.base.type = FREELIST;
+	elem.base.type = FREETABLE;
 	add(layout, elem);
 }
 
 void tdb_layout_add_free(struct tdb_layout *layout, tdb_len_t len,
-			 unsigned flist)
+			 unsigned ftable)
 {
 	union tdb_layout_elem elem;
 	elem.base.type = FREE;
 	elem.free.len = len;
-	elem.free.flist_num = flist;
+	elem.free.ftable_num = ftable;
 	add(layout, elem);
 }
 
@@ -82,9 +82,9 @@ static tdb_len_t hashtable_len(struct tle_hashtable *htable)
 		+ htable->extra;
 }
 
-static tdb_len_t freelist_len(struct tle_freelist *flist)
+static tdb_len_t freetable_len(struct tle_freetable *ftable)
 {
-	return sizeof(struct tdb_freelist);
+	return sizeof(struct tdb_freetable);
 }
 
 static void set_free_record(void *mem, tdb_len_t len)
@@ -97,7 +97,7 @@ static void set_data_record(void *mem, struct tdb_context *tdb,
 {
 	struct tdb_used_record *u = mem;
 
-	set_header(tdb, u, used->key.dsize, used->data.dsize,
+	set_header(tdb, u, TDB_USED_MAGIC, used->key.dsize, used->data.dsize,
 		   used->key.dsize + used->data.dsize + used->extra,
 		   tdb_hash(tdb, used->key.dptr, used->key.dsize));
 	memcpy(u + 1, used->key.dptr, used->key.dsize);
@@ -111,34 +111,36 @@ static void set_hashtable(void *mem, struct tdb_context *tdb,
 	struct tdb_used_record *u = mem;
 	tdb_len_t len = sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS;
 
-	set_header(tdb, u, 0, len, len + htable->extra, 0);
+	set_header(tdb, u, TDB_HTABLE_MAGIC, 0, len, len + htable->extra, 0);
 	memset(u + 1, 0, len);
 }
 
-static void set_freelist(void *mem, struct tdb_context *tdb,
-			 struct tle_freelist *freelist, struct tdb_header *hdr,
-			 tdb_off_t last_flist)
+static void set_freetable(void *mem, struct tdb_context *tdb,
+			 struct tle_freetable *freetable, struct tdb_header *hdr,
+			 tdb_off_t last_ftable)
 {
-	struct tdb_freelist *flist = mem;
-	memset(flist, 0, sizeof(*flist));
-	set_header(tdb, &flist->hdr, 0,
-		   sizeof(*flist) - sizeof(flist->hdr),
-		   sizeof(*flist) - sizeof(flist->hdr), 1);
-
-	if (last_flist) {
-		flist = (struct tdb_freelist *)((char *)hdr + last_flist);
-		flist->next = freelist->base.off;
+	struct tdb_freetable *ftable = mem;
+	memset(ftable, 0, sizeof(*ftable));
+	set_header(tdb, &ftable->hdr, TDB_FTABLE_MAGIC, 0,
+			sizeof(*ftable) - sizeof(ftable->hdr),
+			sizeof(*ftable) - sizeof(ftable->hdr), 0);
+
+	if (last_ftable) {
+		ftable = (struct tdb_freetable *)((char *)hdr + last_ftable);
+		ftable->next = freetable->base.off;
 	} else {
-		hdr->free_list = freelist->base.off;
+		hdr->free_table = freetable->base.off;
 	}
 }
 
 static void add_to_freetable(struct tdb_context *tdb,
 			     tdb_off_t eoff,
 			     tdb_off_t elen,
-			     struct tle_freelist *freelist)
+			     unsigned ftable,
+			     struct tle_freetable *freetable)
 {
-	tdb->flist_off = freelist->base.off;
+	tdb->ftable_off = freetable->base.off;
+	tdb->ftable = ftable;
 	add_free_record(tdb, eoff, sizeof(struct tdb_used_record) + elen);
 }
 
@@ -202,15 +204,15 @@ static void add_to_hashtable(struct tdb_context *tdb,
 	abort();
 }
 
-static struct tle_freelist *find_flist(struct tdb_layout *layout, unsigned num)
+static struct tle_freetable *find_ftable(struct tdb_layout *layout, unsigned num)
 {
 	unsigned i;
 
 	for (i = 0; i < layout->num_elems; i++) {
-		if (layout->elem[i].base.type != FREELIST)
+		if (layout->elem[i].base.type != FREETABLE)
 			continue;
 		if (num == 0)
-			return &layout->elem[i].flist;
+			return &layout->elem[i].ftable;
 		num--;
 	}
 	abort();
@@ -220,7 +222,7 @@ static struct tle_freelist *find_flist(struct tdb_layout *layout, unsigned num)
 struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
 {
 	unsigned int i;
-	tdb_off_t off, len, last_flist;
+	tdb_off_t off, len, last_ftable;
 	char *mem;
 	struct tdb_context *tdb;
 
@@ -231,8 +233,8 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
 		union tdb_layout_elem *e = &layout->elem[i];
 		e->base.off = off;
 		switch (e->base.type) {
-		case FREELIST:
-			len = freelist_len(&e->flist);
+		case FREETABLE:
+			len = freetable_len(&e->ftable);
 			break;
 		case FREE:
 			len = free_record_len(e->free.len);
@@ -259,14 +261,14 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
 	tdb->map_ptr = mem;
 	tdb->map_size = off;
 
-	last_flist = 0;
+	last_ftable = 0;
 	for (i = 0; i < layout->num_elems; i++) {
 		union tdb_layout_elem *e = &layout->elem[i];
 		switch (e->base.type) {
-		case FREELIST:
-			set_freelist(mem + e->base.off, tdb, &e->flist,
-				     (struct tdb_header *)mem, last_flist);
-			last_flist = e->base.off;
+		case FREETABLE:
+			set_freetable(mem + e->base.off, tdb, &e->ftable,
+				     (struct tdb_header *)mem, last_ftable);
+			last_ftable = e->base.off;
 			break;
 		case FREE:
 			set_free_record(mem + e->base.off, e->free.len);
@@ -279,8 +281,8 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
 			break;
 		}
 	}
-	/* Must have a free list! */
-	assert(last_flist);
+	/* Must have a free table! */
+	assert(last_ftable);
 
 	/* Now fill the free and hash tables. */
 	for (i = 0; i < layout->num_elems; i++) {
@@ -288,7 +290,8 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
 		switch (e->base.type) {
 		case FREE:
 			add_to_freetable(tdb, e->base.off, e->free.len,
-					 find_flist(layout, e->free.flist_num));
+					 e->free.ftable_num,
+					 find_ftable(layout, e->free.ftable_num));
 			break;
 		case DATA:
 			add_to_hashtable(tdb, e->base.off, e->used.key);
@@ -298,7 +301,7 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
 		}
 	}
 
-	tdb->flist_off = find_flist(layout, 0)->base.off;
+	tdb->ftable_off = find_ftable(layout, 0)->base.off;
 
 	/* Get physical if they asked for it. */
 	if (layout->filename) {

+ 6 - 6
ccan/tdb2/test/layout.h

@@ -3,9 +3,9 @@
 #include <ccan/tdb2/private.h>
 
 struct tdb_layout *new_tdb_layout(const char *filename);
-void tdb_layout_add_freelist(struct tdb_layout *layout);
+void tdb_layout_add_freetable(struct tdb_layout *layout);
 void tdb_layout_add_free(struct tdb_layout *layout, tdb_len_t len,
-			 unsigned flist);
+			 unsigned ftable);
 void tdb_layout_add_used(struct tdb_layout *layout,
 			 TDB_DATA key, TDB_DATA data,
 			 tdb_len_t extra);
@@ -18,7 +18,7 @@ void tdb_layout_add_hashtable(struct tdb_layout *layout,
 struct tdb_context *tdb_layout_get(struct tdb_layout *layout);
 
 enum layout_type {
-	FREELIST, FREE, DATA, HASHTABLE,
+	FREETABLE, FREE, DATA, HASHTABLE,
 };
 
 /* Shared by all union members. */
@@ -27,14 +27,14 @@ struct tle_base {
 	tdb_off_t off;
 };
 
-struct tle_freelist {
+struct tle_freetable {
 	struct tle_base base;
 };
 
 struct tle_free {
 	struct tle_base base;
 	tdb_len_t len;
-	unsigned flist_num;
+	unsigned ftable_num;
 };
 
 struct tle_used {
@@ -53,7 +53,7 @@ struct tle_hashtable {
 
 union tdb_layout_elem {
 	struct tle_base base;
-	struct tle_freelist flist;
+	struct tle_freetable ftable;
 	struct tle_free free;
 	struct tle_used used;
 	struct tle_hashtable hashtable;

+ 2 - 15
ccan/tdb2/test/logging.c

@@ -1,7 +1,5 @@
-#define _GNU_SOURCE
 #include <stdio.h>
 #include <stdlib.h>
-#include <stdarg.h>
 #include <ccan/tap/tap.h>
 #include "logging.h"
 
@@ -16,24 +14,13 @@ union tdb_attribute tap_log_attr = {
 
 void tap_log_fn(struct tdb_context *tdb,
 		enum tdb_debug_level level, void *priv,
-		const char *fmt, ...)
+		const char *message)
 {
-	va_list ap;
-	char *p;
-
 	if (suppress_logging)
 		return;
 
-	va_start(ap, fmt);
-	if (vasprintf(&p, fmt, ap) == -1)
-		abort();
-	/* Strip trailing \n: diag adds it. */
-	if (p[strlen(p)-1] == '\n')
-		p[strlen(p)-1] = '\0';
-	diag("tdb log level %u: %s%s", level, log_prefix, p);
-	free(p);
+	diag("tdb log level %u: %s%s", level, log_prefix, message);
 	if (level != TDB_DEBUG_TRACE)
 		tap_log_messages++;
-	va_end(ap);
 }
 

+ 1 - 1
ccan/tdb2/test/logging.h

@@ -11,7 +11,7 @@ extern union tdb_attribute tap_log_attr;
 
 void tap_log_fn(struct tdb_context *tdb,
 		enum tdb_debug_level level, void *priv,
-		const char *fmt, ...);
+		const char *message);
 
 static inline bool data_equal(struct tdb_data a, struct tdb_data b)
 {

+ 8 - 6
ccan/tdb2/test/run-001-encode.c

@@ -12,18 +12,20 @@ int main(int argc, char *argv[])
 {
 	unsigned int i;
 	struct tdb_used_record rec;
-	struct tdb_context tdb = { .log = tap_log_fn, .log_priv = NULL };
+	struct tdb_context tdb = { .logfn = tap_log_fn };
 
 	plan_tests(64 + 32 + 48*6 + 1);
 
 	/* We should be able to encode any data value. */
 	for (i = 0; i < 64; i++)
-		ok1(set_header(&tdb, &rec, 0, 1ULL << i, 1ULL << i, 0) == 0);
+		ok1(set_header(&tdb, &rec, TDB_USED_MAGIC, 0, 1ULL << i,
+			       1ULL << i, 0) == 0);
 
 	/* And any key and data with < 64 bits between them. */
 	for (i = 0; i < 32; i++) {
 		tdb_len_t dlen = 1ULL >> (63 - i), klen = 1ULL << i;
-		ok1(set_header(&tdb, &rec, klen, dlen, klen + dlen, 0) == 0);
+		ok1(set_header(&tdb, &rec, TDB_USED_MAGIC, klen, dlen,
+			       klen + dlen, 0)  == 0);
 	}
 
 	/* We should neatly encode all values. */
@@ -32,13 +34,13 @@ int main(int argc, char *argv[])
 		uint64_t klen = 1ULL << (i < 16 ? i : 15);
 		uint64_t dlen = 1ULL << i;
 		uint64_t xlen = 1ULL << (i < 32 ? i : 31);
-		ok1(set_header(&tdb, &rec, klen, dlen, klen + dlen + xlen, h)
-		    == 0);
+		ok1(set_header(&tdb, &rec, TDB_USED_MAGIC, klen, dlen,
+			       klen+dlen+xlen, h) == 0);
 		ok1(rec_key_length(&rec) == klen);
 		ok1(rec_data_length(&rec) == dlen);
 		ok1(rec_extra_padding(&rec) == xlen);
 		ok1((uint64_t)rec_hash(&rec) == h);
-		ok1(rec_magic(&rec) == TDB_MAGIC);
+		ok1(rec_magic(&rec) == TDB_USED_MAGIC);
 	}
 	ok1(tap_log_messages == 0);
 	return exit_status();

+ 11 - 11
ccan/tdb2/test/run-03-coalesce.c

@@ -17,7 +17,7 @@ static tdb_len_t free_record_length(struct tdb_context *tdb, tdb_off_t off)
 		return TDB_OFF_ERR;
 	if (frec_magic(&f) != TDB_FREE_MAGIC)
 		return TDB_OFF_ERR;
-	return f.data_len;
+	return frec_len(&f);
 }
 
 int main(int argc, char *argv[])
@@ -38,7 +38,7 @@ int main(int argc, char *argv[])
 
 	/* No coalescing can be done due to EOF */
 	layout = new_tdb_layout(NULL);
-	tdb_layout_add_freelist(layout);
+	tdb_layout_add_freetable(layout);
 	len = 1024;
 	tdb_layout_add_free(layout, len, 0);
 	tdb = tdb_layout_get(layout);
@@ -46,7 +46,7 @@ int main(int argc, char *argv[])
 	ok1(free_record_length(tdb, layout->elem[1].base.off) == len);
 
 	/* Figure out which bucket free entry is. */
-	b_off = bucket_off(tdb->flist_off, size_to_bucket(len));
+	b_off = bucket_off(tdb->ftable_off, size_to_bucket(len));
 	/* Lock and fail to coalesce. */
 	ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
 	ok1(coalesce(tdb, layout->elem[1].base.off, b_off, len) == 0);
@@ -57,7 +57,7 @@ int main(int argc, char *argv[])
 
 	/* No coalescing can be done due to used record */
 	layout = new_tdb_layout(NULL);
-	tdb_layout_add_freelist(layout);
+	tdb_layout_add_freetable(layout);
 	tdb_layout_add_free(layout, 1024, 0);
 	tdb_layout_add_used(layout, key, data, 6);
 	tdb = tdb_layout_get(layout);
@@ -65,7 +65,7 @@ int main(int argc, char *argv[])
 	ok1(tdb_check(tdb, NULL, NULL) == 0);
 
 	/* Figure out which bucket free entry is. */
-	b_off = bucket_off(tdb->flist_off, size_to_bucket(1024));
+	b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
 	/* Lock and fail to coalesce. */
 	ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
 	ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024) == 0);
@@ -76,7 +76,7 @@ int main(int argc, char *argv[])
 
 	/* Coalescing can be done due to two free records, then EOF */
 	layout = new_tdb_layout(NULL);
-	tdb_layout_add_freelist(layout);
+	tdb_layout_add_freetable(layout);
 	tdb_layout_add_free(layout, 1024, 0);
 	tdb_layout_add_free(layout, 2048, 0);
 	tdb = tdb_layout_get(layout);
@@ -85,7 +85,7 @@ int main(int argc, char *argv[])
 	ok1(tdb_check(tdb, NULL, NULL) == 0);
 
 	/* Figure out which bucket (first) free entry is. */
-	b_off = bucket_off(tdb->flist_off, size_to_bucket(1024));
+	b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
 	/* Lock and coalesce. */
 	ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
 	ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024) == 1);
@@ -97,7 +97,7 @@ int main(int argc, char *argv[])
 
 	/* Coalescing can be done due to two free records, then data */
 	layout = new_tdb_layout(NULL);
-	tdb_layout_add_freelist(layout);
+	tdb_layout_add_freetable(layout);
 	tdb_layout_add_free(layout, 1024, 0);
 	tdb_layout_add_free(layout, 512, 0);
 	tdb_layout_add_used(layout, key, data, 6);
@@ -107,7 +107,7 @@ int main(int argc, char *argv[])
 	ok1(tdb_check(tdb, NULL, NULL) == 0);
 
 	/* Figure out which bucket free entry is. */
-	b_off = bucket_off(tdb->flist_off, size_to_bucket(1024));
+	b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
 	/* Lock and coalesce. */
 	ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
 	ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024) == 1);
@@ -119,7 +119,7 @@ int main(int argc, char *argv[])
 
 	/* Coalescing can be done due to three free records, then EOF */
 	layout = new_tdb_layout(NULL);
-	tdb_layout_add_freelist(layout);
+	tdb_layout_add_freetable(layout);
 	tdb_layout_add_free(layout, 1024, 0);
 	tdb_layout_add_free(layout, 512, 0);
 	tdb_layout_add_free(layout, 256, 0);
@@ -130,7 +130,7 @@ int main(int argc, char *argv[])
 	ok1(tdb_check(tdb, NULL, NULL) == 0);
 
 	/* Figure out which bucket free entry is. */
-	b_off = bucket_off(tdb->flist_off, size_to_bucket(1024));
+	b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
 	/* Lock and coalesce. */
 	ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
 	ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024) == 1);

+ 4 - 2
ccan/tdb2/test/run-04-basichash.c

@@ -65,7 +65,8 @@ int main(int argc, char *argv[])
 		/* FIXME: Check lock length */
 
 		/* Allocate a new record. */
-		new_off = alloc(tdb, key.dsize, dbuf.dsize, h.h, false);
+		new_off = alloc(tdb, key.dsize, dbuf.dsize, h.h,
+				TDB_USED_MAGIC, false);
 		ok1(new_off != TDB_OFF_ERR);
 
 		/* We should be able to add it now. */
@@ -225,7 +226,8 @@ int main(int argc, char *argv[])
 
 		/* We should be able to add it now. */
 		/* Allocate a new record. */
-		new_off = alloc(tdb, key.dsize, dbuf.dsize, h.h, false);
+		new_off = alloc(tdb, key.dsize, dbuf.dsize, h.h,
+				TDB_USED_MAGIC, false);
 		ok1(new_off != TDB_OFF_ERR);
 		ok1(add_to_hash(tdb, &h, new_off) == 0);
 

+ 117 - 0
ccan/tdb2/test/run-25-hashoverload.c

@@ -0,0 +1,117 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/traverse.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+/* Deliberately worthless hash: every key lands in bucket 0, forcing
+ * all records to collide so the group-overflow/chain paths are hit. */
+static uint64_t badhash(const void *key, size_t len, uint64_t seed, void *priv)
+{
+	return 0;
+}
+
+/* Traverse callback: deletes the visited record when p is non-NULL,
+ * otherwise just visits it.  Returns 0 to continue the traversal. */
+static int trav(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, void *p)
+{
+	if (p)
+		return tdb_delete(tdb, key);
+	return 0;
+}
+
+/* Hash-overload test: with badhash mapping every key to one group,
+ * fill a hash group, overflow it into a chain, then fetch, traverse,
+ * delete and re-add records under several tdb flag combinations.
+ * Note key and dbuf both point at the loop counter j, so each store
+ * uses the current j as both key and value. */
+int main(int argc, char *argv[])
+{
+	unsigned int i, j;
+	struct tdb_context *tdb;
+	struct tdb_data key = { (unsigned char *)&j, sizeof(j) };
+	struct tdb_data dbuf = { (unsigned char *)&j, sizeof(j) };
+	union tdb_attribute hattr = { .hash = { .base = { TDB_ATTRIBUTE_HASH },
+						.hash_fn = badhash } };
+	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+			TDB_NOMMAP|TDB_CONVERT,
+	};
+
+	hattr.base.next = &tap_log_attr;
+
+	plan_tests(5395);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		struct tdb_data d;
+
+		tdb = tdb_open("run-25-hashoverload.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
+		ok1(tdb);
+		if (!tdb)
+			continue;
+
+		/* Fill a group. */
+		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) {
+			ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
+		}
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+		/* Now store one last value: should form chain. */
+		ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+		/* Check we can find them all. */
+		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS) + 1; j++) {
+			d = tdb_fetch(tdb, key);
+			ok1(d.dsize == sizeof(j));
+			ok1(d.dptr != NULL);
+			ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0);
+		}
+
+		/* Now add a *lot* more. */
+		for (j = (1 << TDB_HASH_GROUP_BITS) + 1;
+		     j < (16 << TDB_HASH_GROUP_BITS);
+		     j++) {
+			ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
+			d = tdb_fetch(tdb, key);
+			ok1(d.dsize == sizeof(j));
+			ok1(d.dptr != NULL);
+			ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0);
+		}
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+		/* Traverse through them. */
+		ok1(tdb_traverse(tdb, trav, NULL) == j);
+
+		/* Empty the first chain-worth. */
+		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++)
+			ok1(tdb_delete(tdb, key) == 0);
+
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+		for (j = (1 << TDB_HASH_GROUP_BITS);
+		     j < (16 << TDB_HASH_GROUP_BITS);
+		     j++) {
+			d = tdb_fetch(tdb, key);
+			ok1(d.dsize == sizeof(j));
+			ok1(d.dptr != NULL);
+			ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0);
+		}
+
+		/* Traverse through them. */
+		ok1(tdb_traverse(tdb, trav, NULL)
+		    == (15 << TDB_HASH_GROUP_BITS));
+
+		/* Re-add */
+		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) {
+			ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
+		}
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+		/* Now try deleting as we go. */
+		ok1(tdb_traverse(tdb, trav, trav)
+		    == (16 << TDB_HASH_GROUP_BITS));
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+		ok1(tdb_traverse(tdb, trav, NULL) == 0);
+		tdb_close(tdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}

+ 8 - 8
ccan/tdb2/test/run-30-exhaust-before-expand.c

@@ -9,13 +9,13 @@
 #include <err.h>
 #include "logging.h"
 
-static bool empty_freelist(struct tdb_context *tdb)
+static bool empty_freetable(struct tdb_context *tdb)
 {
-	struct tdb_freelist free;
+	struct tdb_freetable free;
 	unsigned int i;
 
-	/* Now, free list should be completely exhausted in zone 0 */
-	if (tdb_read_convert(tdb, tdb->flist_off, &free, sizeof(free)) != 0)
+	/* Now, free table should be completely exhausted in zone 0 */
+	if (tdb_read_convert(tdb, tdb->ftable_off, &free, sizeof(free)) != 0)
 		abort();
 
 	for (i = 0; i < sizeof(free.buckets)/sizeof(free.buckets[0]); i++) {
@@ -50,26 +50,26 @@ int main(int argc, char *argv[])
 		if (!tdb)
 			continue;
 
-		ok1(empty_freelist(tdb));
+		ok1(empty_freetable(tdb));
 		/* Need some hash lock for expand. */
 		ok1(tdb_lock_hashes(tdb, 0, 1, F_WRLCK, TDB_LOCK_WAIT) == 0);
 		/* Create some free space. */
 		ok1(tdb_expand(tdb, 1) == 0);
 		ok1(tdb_unlock_hashes(tdb, 0, 1, F_WRLCK) == 0);
 		ok1(tdb_check(tdb, NULL, NULL) == 0);
-		ok1(!empty_freelist(tdb));
+		ok1(!empty_freetable(tdb));
 
 		size = tdb->map_size;
 		/* Insert minimal-length records until we expand. */
 		for (j = 0; tdb->map_size == size; j++) {
-			was_empty = empty_freelist(tdb);
+			was_empty = empty_freetable(tdb);
 			if (tdb_store(tdb, k, k, TDB_INSERT) != 0)
 				err(1, "Failed to store record %i", j);
 		}
 
 		/* Would have been empty before expansion, but no longer. */
 		ok1(was_empty);
-		ok1(!empty_freelist(tdb));
+		ok1(!empty_freetable(tdb));
 		tdb_close(tdb);
 	}
 

+ 17 - 13
ccan/tdb2/test/run-50-multiple-freelists.c

@@ -22,11 +22,11 @@ int main(int argc, char *argv[])
 	data.dsize = 5;
 	key.dsize = 5;
 
-	/* Create a TDB with three free lists. */
+	/* Create a TDB with three free tables. */
 	layout = new_tdb_layout(NULL);
-	tdb_layout_add_freelist(layout);
-	tdb_layout_add_freelist(layout);
-	tdb_layout_add_freelist(layout);
+	tdb_layout_add_freetable(layout);
+	tdb_layout_add_freetable(layout);
+	tdb_layout_add_freetable(layout);
 	tdb_layout_add_free(layout, 80, 0);
 	/* Used record prevent coalescing. */
 	tdb_layout_add_used(layout, key, data, 6);
@@ -40,24 +40,28 @@ int main(int argc, char *argv[])
 	tdb = tdb_layout_get(layout);
 	ok1(tdb_check(tdb, NULL, NULL) == 0);
 
-	off = get_free(tdb, 0, 80 - sizeof(struct tdb_used_record), 0, 0);
+	off = get_free(tdb, 0, 80 - sizeof(struct tdb_used_record), 0,
+		       TDB_USED_MAGIC, 0);
 	ok1(off == layout->elem[3].base.off);
-	ok1(tdb->flist_off == layout->elem[0].base.off);
+	ok1(tdb->ftable_off == layout->elem[0].base.off);
 
-	off = get_free(tdb, 0, 160 - sizeof(struct tdb_used_record), 0, 0);
+	off = get_free(tdb, 0, 160 - sizeof(struct tdb_used_record), 0,
+		       TDB_USED_MAGIC, 0);
 	ok1(off == layout->elem[5].base.off);
-	ok1(tdb->flist_off == layout->elem[1].base.off);
+	ok1(tdb->ftable_off == layout->elem[1].base.off);
 
-	off = get_free(tdb, 0, 320 - sizeof(struct tdb_used_record), 0, 0);
+	off = get_free(tdb, 0, 320 - sizeof(struct tdb_used_record), 0,
+		       TDB_USED_MAGIC, 0);
 	ok1(off == layout->elem[7].base.off);
-	ok1(tdb->flist_off == layout->elem[2].base.off);
+	ok1(tdb->ftable_off == layout->elem[2].base.off);
 
-	off = get_free(tdb, 0, 40 - sizeof(struct tdb_used_record), 0, 0);
+	off = get_free(tdb, 0, 40 - sizeof(struct tdb_used_record), 0,
+		       TDB_USED_MAGIC, 0);
 	ok1(off == layout->elem[9].base.off);
-	ok1(tdb->flist_off == layout->elem[0].base.off);
+	ok1(tdb->ftable_off == layout->elem[0].base.off);
 
 	/* Now we fail. */
-	off = get_free(tdb, 0, 0, 1, 0);
+	off = get_free(tdb, 0, 0, 1, TDB_USED_MAGIC, 0);
 	ok1(off == 0);
 
 	tdb_close(tdb);

+ 1 - 1
ccan/tdb2/test/run-seed.c

@@ -13,7 +13,7 @@ static int log_count = 0;
 /* Normally we get a log when setting random seed. */
 static void my_log_fn(struct tdb_context *tdb,
 		      enum tdb_debug_level level, void *priv,
-		      const char *fmt, ...)
+		      const char *message)
 {
 	log_count++;
 }

+ 6 - 2
ccan/tdb2/test/run-traverse.c

@@ -56,7 +56,6 @@ static int trav(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, void *p)
 		td->high = val;
 
 	if (td->delete) {
-
 		if (tdb_delete(tdb, key) != 0) {
 			td->delete_error = tdb_error(tdb);
 			return -1;
@@ -120,7 +119,7 @@ int main(int argc, char *argv[])
 
 	hattr.base.next = &tap_log_attr;
 
-	plan_tests(sizeof(flags) / sizeof(flags[0]) * 50 + 1);
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 53 + 1);
 	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
 		tdb = tdb_open("run-traverse.tdb", flags[i],
 			       O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
@@ -182,6 +181,7 @@ int main(int argc, char *argv[])
 		ok1(td.low <= NUM_RECORDS / 2);
 		ok1(td.high > NUM_RECORDS / 2);
 		ok1(tdb_check(tdb, NULL, NULL) == 0);
+		ok1(tap_log_messages == 0);
 
 		/* Growing traverse.  Expect failure on r/o traverse. */
 		tgd.calls = 0;
@@ -193,6 +193,8 @@ int main(int argc, char *argv[])
 		ok1(tgd.error == TDB_ERR_RDONLY);
 		ok1(tgd.calls == 1);
 		ok1(!tgd.mismatch);
+		ok1(tap_log_messages == 1);
+		tap_log_messages = 0;
 		ok1(tdb_check(tdb, NULL, NULL) == 0);
 
 		/* Deleting traverse.  Expect failure on r/o traverse. */
@@ -209,6 +211,8 @@ int main(int argc, char *argv[])
 		ok1(!td.mismatch);
 		ok1(td.calls == 1);
 		ok1(td.low == td.high);
+		ok1(tap_log_messages == 1);
+		tap_log_messages = 0;
 		ok1(tdb_check(tdb, NULL, NULL) == 0);
 
 		/* Deleting traverse (delete everything). */

+ 4 - 3
ccan/tdb2/tools/Makefile

@@ -1,12 +1,13 @@
 OBJS:=../../tdb2.o ../../hash.o ../../tally.o
-CFLAGS:=-I../../.. -Wall -g #-g -O3 #-g -pg
+CFLAGS:=-I../../.. -Wall -g -O3 #-g -pg
 LDFLAGS:=-L../../..
 
-default: tdbtorture tdbtool mktdb
+default: tdbtorture tdbtool mktdb speed
 
 tdbtorture: tdbtorture.c $(OBJS)
 tdbtool: tdbtool.c $(OBJS)
 mktdb: mktdb.c $(OBJS)
+speed: speed.c $(OBJS)
 
 clean:
-	rm -f tdbtorture tdbtool mktdb
+	rm -f tdbtorture tdbtool mktdb speed

+ 331 - 0
ccan/tdb2/tools/speed.c

@@ -0,0 +1,331 @@
+/* Simple speed test for TDB */
+#include <err.h>
+#include <time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <ccan/tdb2/tdb2.h>
+
+/* Nanoseconds per operation */
+static size_t normalize(const struct timeval *start,
+			const struct timeval *stop,
+			unsigned int num)
+{
+	struct timeval diff;
+
+	timersub(stop, start, &diff);
+
+	/* Floating point is more accurate here. */
+	return (double)(diff.tv_sec * 1000000 + diff.tv_usec)
+		/ num * 1000;
+}
+
+static size_t file_size(void)
+{
+	struct stat st;
+
+	if (stat("/tmp/speed.tdb", &st) != 0)
+		return -1;
+	return st.st_size;
+}
+
+static int count_record(struct tdb_context *tdb,
+			TDB_DATA key, TDB_DATA data, void *p)
+{
+	int *total = p;
+	*total += *(int *)data.dptr;
+	return 0;
+}
+
+static void dump_and_clear_stats(struct tdb_attribute_stats *stats)
+{
+	printf("allocs = %llu\n",
+	       (unsigned long long)stats->allocs);
+	printf("  alloc_subhash = %llu\n",
+	       (unsigned long long)stats->alloc_subhash);
+	printf("  alloc_chain = %llu\n",
+	       (unsigned long long)stats->alloc_chain);
+	printf("  alloc_bucket_exact = %llu\n",
+	       (unsigned long long)stats->alloc_bucket_exact);
+	printf("  alloc_bucket_max = %llu\n",
+	       (unsigned long long)stats->alloc_bucket_max);
+	printf("  alloc_leftover = %llu\n",
+	       (unsigned long long)stats->alloc_leftover);
+	printf("  alloc_coalesce_tried = %llu\n",
+	       (unsigned long long)stats->alloc_coalesce_tried);
+	printf("    alloc_coalesce_lockfail = %llu\n",
+	       (unsigned long long)stats->alloc_coalesce_lockfail);
+	printf("    alloc_coalesce_race = %llu\n",
+	       (unsigned long long)stats->alloc_coalesce_race);
+	printf("    alloc_coalesce_succeeded = %llu\n",
+	       (unsigned long long)stats->alloc_coalesce_succeeded);
+	printf("       alloc_coalesce_num_merged = %llu\n",
+	       (unsigned long long)stats->alloc_coalesce_num_merged);
+	printf("compares = %llu\n",
+	       (unsigned long long)stats->compares);
+	printf("  compare_wrong_bucket = %llu\n",
+	       (unsigned long long)stats->compare_wrong_bucket);
+	printf("  compare_wrong_offsetbits = %llu\n",
+	       (unsigned long long)stats->compare_wrong_offsetbits);
+	printf("  compare_wrong_keylen = %llu\n",
+	       (unsigned long long)stats->compare_wrong_keylen);
+	printf("  compare_wrong_rechash = %llu\n",
+	       (unsigned long long)stats->compare_wrong_rechash);
+	printf("  compare_wrong_keycmp = %llu\n",
+	       (unsigned long long)stats->compare_wrong_keycmp);
+	printf("expands = %llu\n",
+	       (unsigned long long)stats->expands);
+	printf("frees = %llu\n",
+	       (unsigned long long)stats->frees);
+	printf("locks = %llu\n",
+	       (unsigned long long)stats->locks);
+	printf("   lock_lowlevel = %llu\n",
+	       (unsigned long long)stats->lock_lowlevel);
+	printf("   lock_nonblock = %llu\n",
+	       (unsigned long long)stats->lock_nonblock);
+
+	/* Now clear. */
+	memset(&stats->allocs, 0, (char *)(stats+1) - (char *)&stats->allocs);
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i, j, num = 1000, stage = 0, stopat = -1;
+	int flags = TDB_DEFAULT;
+	bool transaction = false;
+	TDB_DATA key, data;
+	struct tdb_context *tdb;
+	struct timeval start, stop;
+	union tdb_attribute seed, stats;
+
+	/* Try to keep benchmarks even. */
+	seed.base.attr = TDB_ATTRIBUTE_SEED;
+	seed.base.next = NULL;
+	seed.seed.seed = 0;
+
+	memset(&stats, 0, sizeof(stats));
+	stats.base.attr = TDB_ATTRIBUTE_STATS;
+	stats.base.next = NULL;
+	stats.stats.size = sizeof(stats);
+
+	if (argv[1] && strcmp(argv[1], "--internal") == 0) {
+		flags = TDB_INTERNAL;
+		argc--;
+		argv++;
+	}
+	if (argv[1] && strcmp(argv[1], "--transaction") == 0) {
+		transaction = true;
+		argc--;
+		argv++;
+	}
+	if (argv[1] && strcmp(argv[1], "--stats") == 0) {
+		seed.base.next = &stats;
+		argc--;
+		argv++;
+	}
+
+	tdb = tdb_open("/tmp/speed.tdb", flags, O_RDWR|O_CREAT|O_TRUNC,
+		       0600, &seed);
+	if (!tdb)
+		err(1, "Opening /tmp/speed.tdb");
+
+	key.dptr = (void *)&i;
+	key.dsize = sizeof(i);
+	data = key;
+
+	if (argv[1]) {
+		num = atoi(argv[1]);
+		argv++;
+		argc--;
+	}
+
+	if (argv[1]) {
+		stopat = atoi(argv[1]);
+		argv++;
+		argc--;
+	}
+
+	if (transaction && tdb_transaction_start(tdb))
+		errx(1, "starting transaction: %s", tdb_errorstr(tdb));
+
+	/* Add 1000 records. */
+	printf("Adding %u records: ", num); fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (i = 0; i < num; i++)
+		if (tdb_store(tdb, key, data, TDB_INSERT) != 0)
+			errx(1, "Inserting key %u in tdb: %s",
+			     i, tdb_errorstr(tdb));
+	gettimeofday(&stop, NULL);
+	if (transaction && tdb_transaction_commit(tdb))
+		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+
+	if (seed.base.next)
+		dump_and_clear_stats(&stats.stats);
+	if (++stage == stopat)
+		exit(0);
+
+	if (transaction && tdb_transaction_start(tdb))
+		errx(1, "starting transaction: %s", tdb_errorstr(tdb));
+
+	/* Finding 1000 records. */
+	printf("Finding %u records: ", num); fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (i = 0; i < num; i++) {
+		int *dptr;
+		dptr = (int *)tdb_fetch(tdb, key).dptr;
+		if (!dptr || *dptr != i)
+			errx(1, "Fetching key %u in tdb gave %u",
+			     i, dptr ? *dptr : -1);
+	}
+	gettimeofday(&stop, NULL);
+	if (transaction && tdb_transaction_commit(tdb))
+		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (seed.base.next)
+		dump_and_clear_stats(&stats.stats);
+	if (++stage == stopat)
+		exit(0);
+
+	if (transaction && tdb_transaction_start(tdb))
+		errx(1, "starting transaction: %s", tdb_errorstr(tdb));
+
+	/* Missing 1000 records. */
+	printf("Missing %u records: ", num); fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (i = num; i < num*2; i++) {
+		int *dptr;
+		dptr = (int *)tdb_fetch(tdb, key).dptr;
+		if (dptr)
+			errx(1, "Fetching key %u in tdb gave %u", i, *dptr);
+	}
+	gettimeofday(&stop, NULL);
+	if (transaction && tdb_transaction_commit(tdb))
+		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (seed.base.next)
+		dump_and_clear_stats(&stats.stats);
+	if (++stage == stopat)
+		exit(0);
+
+	if (transaction && tdb_transaction_start(tdb))
+		errx(1, "starting transaction: %s", tdb_errorstr(tdb));
+
+	/* Traverse 1000 records. */
+	printf("Traversing %u records: ", num); fflush(stdout);
+	i = 0;
+	gettimeofday(&start, NULL);
+	if (tdb_traverse(tdb, count_record, &i) != num)
+		errx(1, "Traverse returned wrong number of records");
+	if (i != (num - 1) * (num / 2))
+		errx(1, "Traverse tallied to %u", i);
+	gettimeofday(&stop, NULL);
+	if (transaction && tdb_transaction_commit(tdb))
+		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (seed.base.next)
+		dump_and_clear_stats(&stats.stats);
+	if (++stage == stopat)
+		exit(0);
+
+	if (transaction && tdb_transaction_start(tdb))
+		errx(1, "starting transaction: %s", tdb_errorstr(tdb));
+
+	/* Delete 1000 records (not in order). */
+	printf("Deleting %u records: ", num); fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (j = 0; j < num; j++) {
+		i = (j + 100003) % num;
+		if (tdb_delete(tdb, key) != 0)
+			errx(1, "Deleting key %u in tdb: %s",
+			     i, tdb_errorstr(tdb));
+	}
+	gettimeofday(&stop, NULL);
+	if (transaction && tdb_transaction_commit(tdb))
+		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (seed.base.next)
+		dump_and_clear_stats(&stats.stats);
+	if (++stage == stopat)
+		exit(0);
+
+	if (transaction && tdb_transaction_start(tdb))
+		errx(1, "starting transaction: %s", tdb_errorstr(tdb));
+
+	/* Re-add 1000 records (not in order). */
+	printf("Re-adding %u records: ", num); fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (j = 0; j < num; j++) {
+		i = (j + 100003) % num;
+		if (tdb_store(tdb, key, data, TDB_INSERT) != 0)
+			errx(1, "Inserting key %u in tdb: %s",
+			     i, tdb_errorstr(tdb));
+	}
+	gettimeofday(&stop, NULL);
+	if (transaction && tdb_transaction_commit(tdb))
+		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (seed.base.next)
+		dump_and_clear_stats(&stats.stats);
+	if (++stage == stopat)
+		exit(0);
+
+	if (transaction && tdb_transaction_start(tdb))
+		errx(1, "starting transaction: %s", tdb_errorstr(tdb));
+
+	/* Append 1000 records. */
+	printf("Appending %u records: ", num); fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (i = 0; i < num; i++)
+		if (tdb_append(tdb, key, data) != 0)
+			errx(1, "Appending key %u in tdb: %s",
+			     i, tdb_errorstr(tdb));
+	gettimeofday(&stop, NULL);
+	if (transaction && tdb_transaction_commit(tdb))
+		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (++stage == stopat)
+		exit(0);
+
+	if (transaction && tdb_transaction_start(tdb))
+		errx(1, "starting transaction: %s", tdb_errorstr(tdb));
+
+	/* Churn 1000 records: not in order! */
+	printf("Churning %u records: ", num); fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (j = 0; j < num; j++) {
+		i = (j + 1000019) % num;
+		if (tdb_delete(tdb, key) != 0)
+			errx(1, "Deleting key %u in tdb: %s",
+			     i, tdb_errorstr(tdb));
+		i += num;
+		if (tdb_store(tdb, key, data, TDB_INSERT) != 0)
+			errx(1, "Inserting key %u in tdb: %s",
+			     i, tdb_errorstr(tdb));
+	}
+	gettimeofday(&stop, NULL);
+	if (transaction && tdb_transaction_commit(tdb))
+		errx(1, "committing transaction: %s", tdb_errorstr(tdb));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+
+	if (seed.base.next)
+		dump_and_clear_stats(&stats.stats);
+	if (++stage == stopat)
+		exit(0);
+
+	return 0;
+}

+ 147 - 120
ccan/tdb2/transaction.c

@@ -169,10 +169,9 @@ static int transaction_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
 	return 0;
 
 fail:
-	tdb->ecode = TDB_ERR_IO;
-	tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-		 "transaction_read: failed at off=%llu len=%llu\n",
-		 (long long)off, (long long)len);
+	tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
+		   "transaction_read: failed at off=%zu len=%zu",
+		   (size_t)off, (size_t)len);
 	tdb->transaction->transaction_error = 1;
 	return -1;
 }
@@ -188,12 +187,10 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
 
 	/* Only a commit is allowed on a prepared transaction */
 	if (tdb->transaction->prepared) {
-		tdb->ecode = TDB_ERR_EINVAL;
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+		tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_FATAL,
 			 "transaction_write: transaction already prepared,"
-			 " write not allowed\n");
-		tdb->transaction->transaction_error = 1;
-		return -1;
+			 " write not allowed");
+		goto fail;
 	}
 
 	/* break it up into block sized chunks */
@@ -228,7 +225,8 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
 				(blk+1)*sizeof(uint8_t *));
 		}
 		if (new_blocks == NULL) {
-			tdb->ecode = TDB_ERR_OOM;
+			tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
+				   "transaction_write: failed to allocate");
 			goto fail;
 		}
 		memset(&new_blocks[tdb->transaction->num_blocks], 0,
@@ -242,9 +240,9 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
 	if (tdb->transaction->blocks[blk] == NULL) {
 		tdb->transaction->blocks[blk] = (uint8_t *)calloc(getpagesize(), 1);
 		if (tdb->transaction->blocks[blk] == NULL) {
-			tdb->ecode = TDB_ERR_OOM;
-			tdb->transaction->transaction_error = 1;
-			return -1;
+			tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
+				   "transaction_write: failed to allocate");
+			goto fail;
 		}
 		if (tdb->transaction->old_map_size > blk * getpagesize()) {
 			tdb_len_t len2 = getpagesize();
@@ -254,6 +252,10 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
 			if (tdb->transaction->io_methods->read(tdb, blk * getpagesize(),
 							       tdb->transaction->blocks[blk],
 							       len2) != 0) {
+				tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
+					   "transaction_write: failed to"
+					   " read old block: %s",
+					   strerror(errno));
 				SAFE_FREE(tdb->transaction->blocks[blk]);
 				goto fail;
 			}
@@ -278,10 +280,6 @@ static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
 	return 0;
 
 fail:
-	tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-		 "transaction_write: failed at off=%llu len=%llu\n",
-		 (long long)((blk*getpagesize()) + off),
-		 (long long)len);
 	tdb->transaction->transaction_error = 1;
 	return -1;
 }
@@ -341,6 +339,12 @@ static int transaction_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
 		return 0;
 	}
 	tdb->ecode = TDB_ERR_IO;
+	if (!probe) {
+		tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
+			   "tdb_oob len %lld beyond transaction size %lld",
+			   (long long)len,
+			   (long long)tdb->map_size);
+	}
 	return -1;
 }
 
@@ -359,10 +363,39 @@ static int transaction_expand_file(struct tdb_context *tdb, tdb_off_t addition)
 }
 
 static void *transaction_direct(struct tdb_context *tdb, tdb_off_t off,
-				size_t len)
+				size_t len, bool write)
 {
-	/* FIXME */
-	return NULL;
+	size_t blk = off / getpagesize(), end_blk;
+
+	/* This is wrong for zero-length blocks, but will fail gracefully */
+	end_blk = (off + len - 1) / getpagesize();
+
+	/* Can only do direct if in single block and we've already copied. */
+	if (write) {
+		if (blk != end_blk)
+			return NULL;
+		if (blk >= tdb->transaction->num_blocks)
+			return NULL;
+		if (tdb->transaction->blocks[blk] == NULL)
+			return NULL;
+		return tdb->transaction->blocks[blk] + off % getpagesize();
+	}
+
+	/* Single which we have copied? */
+	if (blk == end_blk
+	    && blk < tdb->transaction->num_blocks
+	    && tdb->transaction->blocks[blk])
+		return tdb->transaction->blocks[blk] + off % getpagesize();
+
+	/* Otherwise must be all not copied. */
+	while (blk < end_blk) {
+		if (blk >= tdb->transaction->num_blocks)
+			break;
+		if (tdb->transaction->blocks[blk])
+			return NULL;
+		blk++;
+	}
+	return tdb->transaction->io_methods->direct(tdb, off, len, write);
 }
 
 static const struct tdb_methods transaction_methods = {
@@ -383,9 +416,9 @@ static int transaction_sync(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t
 	}
 
 	if (fsync(tdb->fd) != 0) {
-		tdb->ecode = TDB_ERR_IO;
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-			 "tdb_transaction: fsync failed\n");
+		tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
+			   "tdb_transaction: fsync failed: %s",
+			   strerror(errno));
 		return -1;
 	}
 #ifdef MS_SYNC
@@ -393,10 +426,9 @@ static int transaction_sync(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t
 		tdb_off_t moffset = offset & ~(getpagesize()-1);
 		if (msync(moffset + (char *)tdb->map_ptr,
 			  length + (offset - moffset), MS_SYNC) != 0) {
-			tdb->ecode = TDB_ERR_IO;
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-				 "tdb_transaction: msync failed - %s\n",
-				 strerror(errno));
+			tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
+				   "tdb_transaction: msync failed: %s",
+				   strerror(errno));
 			return -1;
 		}
 	}
@@ -410,9 +442,8 @@ static void _tdb_transaction_cancel(struct tdb_context *tdb)
 	int i;
 
 	if (tdb->transaction == NULL) {
-		tdb->ecode = TDB_ERR_EINVAL;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_transaction_cancel: no transaction\n");
+		tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_ERROR,
+			   "tdb_transaction_cancel: no transaction");
 		return;
 	}
 
@@ -441,9 +472,9 @@ static void _tdb_transaction_cancel(struct tdb_context *tdb)
 				   &invalid, sizeof(invalid)) == -1 ||
 		    transaction_sync(tdb, tdb->transaction->magic_offset,
 				     sizeof(invalid)) == -1) {
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-				 "tdb_transaction_cancel: failed to remove"
-				 " recovery magic\n");
+			tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
+				   "tdb_transaction_cancel: failed to remove"
+				   " recovery magic");
 		}
 	}
 
@@ -469,16 +500,17 @@ int tdb_transaction_start(struct tdb_context *tdb)
 {
 	/* some sanity checks */
 	if (tdb->read_only || (tdb->flags & TDB_INTERNAL)) {
-		tdb->ecode = TDB_ERR_EINVAL;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_transaction_start: cannot start a transaction"
-			 " on a read-only or internal db\n");
+		tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_ERROR,
+			   "tdb_transaction_start: cannot start a transaction"
+			   " on a read-only or internal db");
 		return -1;
 	}
 
 	/* cope with nested tdb_transaction_start() calls */
 	if (tdb->transaction != NULL) {
-		tdb->ecode = TDB_ERR_NESTING;
+		tdb_logerr(tdb, TDB_ERR_NESTING, TDB_DEBUG_ERROR,
+			   "tdb_transaction_start:"
+			   " already inside transaction");
 		return -1;
 	}
 
@@ -486,17 +518,17 @@ int tdb_transaction_start(struct tdb_context *tdb)
 		/* the caller must not have any locks when starting a
 		   transaction as otherwise we'll be screwed by lack
 		   of nested locks in posix */
-		tdb->ecode = TDB_ERR_LOCK;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_transaction_start: cannot start a transaction"
-			 " with locks held\n");
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_DEBUG_ERROR,
+			   "tdb_transaction_start: cannot start a transaction"
+			   " with locks held");
 		return -1;
 	}
 
 	tdb->transaction = (struct tdb_transaction *)
 		calloc(sizeof(struct tdb_transaction), 1);
 	if (tdb->transaction == NULL) {
-		tdb->ecode = TDB_ERR_OOM;
+		tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_ERROR,
+			   "tdb_transaction_start: cannot allocate");
 		return -1;
 	}
 
@@ -585,17 +617,17 @@ static int tdb_recovery_allocate(struct tdb_context *tdb,
 
 	recovery_head = tdb_read_off(tdb, offsetof(struct tdb_header,recovery));
 	if (recovery_head == TDB_OFF_ERR) {
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+		tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
 			 "tdb_recovery_allocate:"
-			 " failed to read recovery head\n");
+			 " failed to read recovery head");
 		return -1;
 	}
 
 	if (recovery_head != 0) {
 		if (methods->read(tdb, recovery_head, &rec, sizeof(rec))) {
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+			tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
 				 "tdb_recovery_allocate:"
-				 " failed to read recovery record\n");
+				 " failed to read recovery record");
 			return -1;
 		}
 		tdb_convert(tdb, &rec, sizeof(rec));
@@ -621,11 +653,12 @@ static int tdb_recovery_allocate(struct tdb_context *tdb,
 	   us an area that is being currently used (as of the start of
 	   the transaction) */
 	if (recovery_head != 0) {
+		add_stat(tdb, frees, 1);
 		if (add_free_record(tdb, recovery_head,
 				    sizeof(rec) + rec.max_len) != 0) {
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-				 "tdb_recovery_allocate:"
-				 " failed to free previous recovery area\n");
+			tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
+				   "tdb_recovery_allocate:"
+				   " failed to free previous recovery area");
 			return -1;
 		}
 	}
@@ -649,9 +682,9 @@ static int tdb_recovery_allocate(struct tdb_context *tdb,
 		sizeof(rec) + *recovery_max_size;
 	tdb->map_size = tdb->transaction->old_map_size;
 	if (methods->expand_file(tdb, addition) == -1) {
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+		tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
 			 "tdb_recovery_allocate:"
-			 " failed to create recovery area\n");
+			 " failed to create recovery area");
 		return -1;
 	}
 
@@ -665,9 +698,9 @@ static int tdb_recovery_allocate(struct tdb_context *tdb,
 	tdb_convert(tdb, &recovery_head, sizeof(recovery_head));
 	if (methods->write(tdb, offsetof(struct tdb_header, recovery),
 			   &recovery_head, sizeof(tdb_off_t)) == -1) {
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+		tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
 			 "tdb_recovery_allocate:"
-			 " failed to write recovery head\n");
+			 " failed to write recovery head");
 		return -1;
 	}
 	transaction_write_existing(tdb, offsetof(struct tdb_header, recovery),
@@ -713,7 +746,8 @@ static int transaction_setup_recovery(struct tdb_context *tdb,
 
 	data = (unsigned char *)malloc(recovery_size + sizeof(*rec));
 	if (data == NULL) {
-		tdb->ecode = TDB_ERR_OOM;
+		tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
+			   "transaction_setup_recovery: cannot allocate");
 		return -1;
 	}
 
@@ -743,10 +777,9 @@ static int transaction_setup_recovery(struct tdb_context *tdb,
 			continue;
 		}
 		if (offset + length > tdb->map_size) {
-			tdb->ecode = TDB_ERR_CORRUPT;
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-				 "tdb_transaction_setup_recovery:"
-				 " transaction data over new region boundary\n");
+			tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_FATAL,
+				   "tdb_transaction_setup_recovery:"
+				   " transaction data over new region boundary");
 			free(data);
 			return -1;
 		}
@@ -774,9 +807,9 @@ static int transaction_setup_recovery(struct tdb_context *tdb,
 	/* write the recovery data to the recovery area */
 	if (methods->write(tdb, recovery_offset, data,
 			   sizeof(*rec) + recovery_size) == -1) {
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+		tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
 			 "tdb_transaction_setup_recovery:"
-			 " failed to write recovery data\n");
+			 " failed to write recovery data");
 		free(data);
 		return -1;
 	}
@@ -801,9 +834,9 @@ static int transaction_setup_recovery(struct tdb_context *tdb,
 						   magic);
 
 	if (methods->write(tdb, *magic_offset, &magic, sizeof(magic)) == -1) {
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+		tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
 			 "tdb_transaction_setup_recovery:"
-			 " failed to write recovery magic\n");
+			 " failed to write recovery magic");
 		return -1;
 	}
 	transaction_write_existing(tdb, *magic_offset, &magic, sizeof(magic));
@@ -821,27 +854,24 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
 	const struct tdb_methods *methods;
 
 	if (tdb->transaction == NULL) {
-		tdb->ecode = TDB_ERR_EINVAL;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_transaction_prepare_commit: no transaction\n");
+		tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_ERROR,
+			   "tdb_transaction_prepare_commit: no transaction");
 		return -1;
 	}
 
 	if (tdb->transaction->prepared) {
-		tdb->ecode = TDB_ERR_EINVAL;
 		_tdb_transaction_cancel(tdb);
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_transaction_prepare_commit:"
-			 " transaction already prepared\n");
+		tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_ERROR,
+			   "tdb_transaction_prepare_commit:"
+			   " transaction already prepared");
 		return -1;
 	}
 
 	if (tdb->transaction->transaction_error) {
-		tdb->ecode = TDB_ERR_IO;
 		_tdb_transaction_cancel(tdb);
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_transaction_prepare_commit:"
-			 " transaction error pending\n");
+		tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_ERROR,
+			   "tdb_transaction_prepare_commit:"
+			   " transaction error pending");
 		return -1;
 	}
 
@@ -860,9 +890,9 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
 
 	/* upgrade the main transaction lock region to a write lock */
 	if (tdb_allrecord_upgrade(tdb) == -1) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+		tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_ERROR,
 			 "tdb_transaction_prepare_commit:"
-			 " failed to upgrade hash locks\n");
+			 " failed to upgrade hash locks");
 		_tdb_transaction_cancel(tdb);
 		return -1;
 	}
@@ -870,9 +900,9 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
 	/* get the open lock - this prevents new users attaching to the database
 	   during the commit */
 	if (tdb_lock_open(tdb, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK) == -1) {
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+		tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_ERROR,
 			 "tdb_transaction_prepare_commit:"
-			 " failed to get open lock\n");
+			 " failed to get open lock");
 		_tdb_transaction_cancel(tdb);
 		return -1;
 	}
@@ -881,9 +911,9 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
 	if (!(tdb->flags & TDB_NOSYNC)) {
 		/* write the recovery data to the end of the file */
 		if (transaction_setup_recovery(tdb, &tdb->transaction->magic_offset) == -1) {
-			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
 				 "tdb_transaction_prepare_commit:"
-				 " failed to setup recovery data\n");
+				 " failed to setup recovery data");
 			_tdb_transaction_cancel(tdb);
 			return -1;
 		}
@@ -897,9 +927,9 @@ static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
 		/* Restore original map size for tdb_expand_file */
 		tdb->map_size = tdb->transaction->old_map_size;
 		if (methods->expand_file(tdb, add) == -1) {
-			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_ERROR,
 				 "tdb_transaction_prepare_commit:"
-				 " expansion failed\n");
+				 " expansion failed");
 			_tdb_transaction_cancel(tdb);
 			return -1;
 		}
@@ -927,19 +957,18 @@ int tdb_transaction_commit(struct tdb_context *tdb)
 	int i;
 
 	if (tdb->transaction == NULL) {
-		tdb->ecode = TDB_ERR_EINVAL;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_transaction_commit: no transaction\n");
+		tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_DEBUG_ERROR,
+			 "tdb_transaction_commit: no transaction");
 		return -1;
 	}
 
 	tdb_trace(tdb, "tdb_transaction_commit");
 
 	if (tdb->transaction->transaction_error) {
-		tdb->ecode = TDB_ERR_IO;
 		tdb_transaction_cancel(tdb);
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_transaction_commit: transaction error pending\n");
+		tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_ERROR,
+			   "tdb_transaction_commit:"
+			   " transaction error pending");
 		return -1;
 	}
 
@@ -980,9 +1009,9 @@ int tdb_transaction_commit(struct tdb_context *tdb)
 
 		if (methods->write(tdb, offset, tdb->transaction->blocks[i],
 				   length) == -1) {
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-				 "tdb_transaction_commit:"
-				 " write failed during commit\n");
+			tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
+				   "tdb_transaction_commit:"
+				   " write failed during commit");
 
 			/* we've overwritten part of the data and
 			   possibly expanded the file, so we need to
@@ -1042,9 +1071,9 @@ int tdb_transaction_recover(struct tdb_context *tdb)
 	/* find the recovery area */
 	recovery_head = tdb_read_off(tdb, offsetof(struct tdb_header,recovery));
 	if (recovery_head == TDB_OFF_ERR) {
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+		tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
 			 "tdb_transaction_recover:"
-			 " failed to read recovery head\n");
+			 " failed to read recovery head");
 		return -1;
 	}
 
@@ -1055,9 +1084,9 @@ int tdb_transaction_recover(struct tdb_context *tdb)
 
 	/* read the recovery record */
 	if (tdb_read_convert(tdb, recovery_head, &rec, sizeof(rec)) == -1) {
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-			 "tdb_transaction_recover:"
-			 " failed to read recovery record\n");
+		tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
+			   "tdb_transaction_recover:"
+			   " failed to read recovery record");
 		return -1;
 	}
 
@@ -1067,10 +1096,9 @@ int tdb_transaction_recover(struct tdb_context *tdb)
 	}
 
 	if (tdb->read_only) {
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-			 "tdb_transaction_recover:"
-			 " attempt to recover read only database\n");
-		tdb->ecode = TDB_ERR_CORRUPT;
+		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_FATAL,
+			   "tdb_transaction_recover:"
+			   " attempt to recover read only database");
 		return -1;
 	}
 
@@ -1078,19 +1106,18 @@ int tdb_transaction_recover(struct tdb_context *tdb)
 
 	data = (unsigned char *)malloc(rec.len);
 	if (data == NULL) {
-		tdb->ecode = TDB_ERR_OOM;
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-			 "tdb_transaction_recover:"
-			 " failed to allocate recovery data\n");
+		tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
+			   "tdb_transaction_recover:"
+			   " failed to allocate recovery data");
 		return -1;
 	}
 
 	/* read the full recovery data */
 	if (tdb->methods->read(tdb, recovery_head + sizeof(rec), data,
 			       rec.len) == -1) {
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-			 "tdb_transaction_recover:"
-			 " failed to read recovery data\n");
+		tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
+			   "tdb_transaction_recover:"
+			   " failed to read recovery data");
 		return -1;
 	}
 
@@ -1106,9 +1133,9 @@ int tdb_transaction_recover(struct tdb_context *tdb)
 
 		if (tdb->methods->write(tdb, ofs, p, len) == -1) {
 			free(data);
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+			tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
 				 "tdb_transaction_recover:"
-				 " failed to recover %zu bytes at offset %zu\n",
+				 " failed to recover %zu bytes at offset %zu",
 				 (size_t)len, (size_t)ofs);
 			return -1;
 		}
@@ -1118,8 +1145,8 @@ int tdb_transaction_recover(struct tdb_context *tdb)
 	free(data);
 
 	if (transaction_sync(tdb, 0, tdb->map_size) == -1) {
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-			 "tdb_transaction_recover: failed to sync recovery\n");
+		tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
+			   "tdb_transaction_recover: failed to sync recovery");
 		return -1;
 	}
 
@@ -1127,9 +1154,9 @@ int tdb_transaction_recover(struct tdb_context *tdb)
 	if (recovery_eof <= recovery_head) {
 		if (tdb_write_off(tdb, offsetof(struct tdb_header,recovery), 0)
 		    == -1) {
-			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+			tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
 				 "tdb_transaction_recover:"
-				 " failed to remove recovery head\n");
+				 " failed to remove recovery head");
 			return -1;
 		}
 	}
@@ -1139,21 +1166,21 @@ int tdb_transaction_recover(struct tdb_context *tdb)
 			  recovery_head
 			  + offsetof(struct tdb_recovery_record, magic),
 			  TDB_RECOVERY_INVALID_MAGIC) == -1) {
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+		tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
 			 "tdb_transaction_recover:"
-			 " failed to remove recovery magic\n");
+			 " failed to remove recovery magic");
 		return -1;
 	}
 
 	if (transaction_sync(tdb, 0, recovery_eof) == -1) {
-		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-			 "tdb_transaction_recover: failed to sync2 recovery\n");
+		tdb_logerr(tdb, tdb->ecode, TDB_DEBUG_FATAL,
+			 "tdb_transaction_recover: failed to sync2 recovery");
 		return -1;
 	}
 
-	tdb->log(tdb, TDB_DEBUG_TRACE, tdb->log_priv,
-		 "tdb_transaction_recover: recovered %zu byte database\n",
-		 (size_t)recovery_eof);
+	tdb_logerr(tdb, TDB_SUCCESS, TDB_DEBUG_TRACE,
+		   "tdb_transaction_recover: recovered %zu byte database",
+		   (size_t)recovery_eof);
 
 	/* all done */
 	return 0;

Some files were not shown because too many files changed in this diff