Browse Source

tdb2: remove tailer

We don't actually need it.
Rusty Russell 15 years ago
parent
commit
d1383862ad

+ 13 - 22
ccan/tdb2/check.c

@@ -368,7 +368,7 @@ static tdb_off_t check_zone(struct tdb_context *tdb, tdb_off_t zone_off,
 			    unsigned int *max_zone_bits)
 			    unsigned int *max_zone_bits)
 {
 {
 	struct free_zone_header zhdr;
 	struct free_zone_header zhdr;
-	tdb_off_t off, hdrlen;
+	tdb_off_t off, hdrlen, end;
 	tdb_len_t len;
 	tdb_len_t len;
 
 
 	if (tdb_read_convert(tdb, zone_off, &zhdr, sizeof(zhdr)) == -1)
 	if (tdb_read_convert(tdb, zone_off, &zhdr, sizeof(zhdr)) == -1)
@@ -391,15 +391,18 @@ static tdb_off_t check_zone(struct tdb_context *tdb, tdb_off_t zone_off,
 		return TDB_OFF_ERR;
 		return TDB_OFF_ERR;
 	}
 	}
 
 
-	/* Zone must be within file! */
-	if (tdb->methods->oob(tdb, zone_off + (1ULL << zhdr.zone_bits), false))
-		return TDB_OFF_ERR;
-
+	/* Zone header must be within file! */
 	hdrlen = sizeof(zhdr)
 	hdrlen = sizeof(zhdr)
 		+ (BUCKETS_FOR_ZONE(zhdr.zone_bits) + 1) * sizeof(tdb_off_t);
 		+ (BUCKETS_FOR_ZONE(zhdr.zone_bits) + 1) * sizeof(tdb_off_t);
-	for (off = zone_off + hdrlen;
-	     off < zone_off + (1ULL << zhdr.zone_bits);
-	     off += len) {
+
+	if (tdb->methods->oob(tdb, zone_off + hdrlen, true))
+		return TDB_OFF_ERR;
+
+	end = zone_off + (1ULL << zhdr.zone_bits);
+	if (end > tdb->map_size)
+		end = tdb->map_size;
+
+	for (off = zone_off + hdrlen; off < end; off += len) {
 		union {
 		union {
 			struct tdb_used_record u;
 			struct tdb_used_record u;
 			struct tdb_free_record f;
 			struct tdb_free_record f;
@@ -476,7 +479,7 @@ static tdb_off_t check_zone(struct tdb_context *tdb, tdb_off_t zone_off,
 			}
 			}
 		}
 		}
 	}
 	}
-	return 1ULL << zhdr.zone_bits;
+	return off - zone_off;
 }
 }
 
 
 /* FIXME: call check() function. */
 /* FIXME: call check() function. */
@@ -488,7 +491,6 @@ int tdb_check(struct tdb_context *tdb,
 	tdb_len_t len;
 	tdb_len_t len;
 	size_t num_free = 0, num_used = 0, num_found = 0;
 	size_t num_free = 0, num_used = 0, num_found = 0;
 	unsigned max_zone_bits = INITIAL_ZONE_BITS;
 	unsigned max_zone_bits = INITIAL_ZONE_BITS;
-	uint8_t tailer;
 
 
 	if (tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false) != 0)
 	if (tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false) != 0)
 		return -1;
 		return -1;
@@ -503,7 +505,7 @@ int tdb_check(struct tdb_context *tdb,
 
 
 	/* First we do a linear scan, checking all records. */
 	/* First we do a linear scan, checking all records. */
 	for (off = sizeof(struct tdb_header);
 	for (off = sizeof(struct tdb_header);
-	     off < tdb->map_size - 1;
+	     off < tdb->map_size;
 	     off += len) {
 	     off += len) {
 		len = check_zone(tdb, off, &used, &num_used, &free, &num_free,
 		len = check_zone(tdb, off, &used, &num_used, &free, &num_free,
 				 &max_zone_bits);
 				 &max_zone_bits);
@@ -511,17 +513,6 @@ int tdb_check(struct tdb_context *tdb,
 			goto fail;
 			goto fail;
 	}
 	}
 
 
-	/* Check tailer. */
-	if (tdb->methods->read(tdb, tdb->map_size - 1, &tailer, 1) == -1)
-		goto fail;
-	if (tailer != max_zone_bits) {
-		tdb->ecode = TDB_ERR_CORRUPT;
-		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
-			 "tdb_check: Bad tailer value %u vs %u\n", tailer,
-			 max_zone_bits);
-		goto fail;
-	}
-
 	/* FIXME: Check key uniqueness? */
 	/* FIXME: Check key uniqueness? */
 	if (!check_hash(tdb, used, num_used))
 	if (!check_hash(tdb, used, num_used))
 		goto fail;
 		goto fail;

+ 132 - 78
ccan/tdb2/free.c

@@ -27,6 +27,44 @@ static unsigned fls64(uint64_t val)
 	return ilog64(val);
 	return ilog64(val);
 }
 }
 
 
+/*
+ * Find-first-set for a 64-bit value.
+ *
+ * Returns the 1-based position of the least significant set bit of val
+ * (so ffs64(1) == 1 and ffs64(1ULL << 63) == 64), or 0 if val is 0.
+ * Both branches follow this convention: it is what __builtin_ffsll()
+ * returns, and callers subtract 1 to get a bit index.
+ */
+static unsigned ffs64(uint64_t val)
+{
+#if HAVE_BUILTIN_FFSLL
+	return __builtin_ffsll(val);
+#else
+	/* Start at 1, not 0: the result is a position, not an index. */
+	unsigned r = 1;
+
+	if (!val)
+		return 0;
+
+	/* Binary search: discard all-zero low halves, counting as we go. */
+	if (!(val & 0xffffffff)) {
+		val >>= 32;
+		r += 32;
+	}
+	if (!(val & 0xffff)) {
+		val >>= 16;
+		r += 16;
+	}
+	if (!(val & 0xff)) {
+		val >>= 8;
+		r += 8;
+	}
+	if (!(val & 0xf)) {
+		val >>= 4;
+		r += 4;
+	}
+	if (!(val & 0x3)) {
+		val >>= 2;
+		r += 2;
+	}
+	if (!(val & 0x1)) {
+		val >>= 1;
+		r += 1;
+	}
+	return r;
+#endif
+}
+
 /* In which bucket would we find a particular record size? (ignoring header) */
 /* In which bucket would we find a particular record size? (ignoring header) */
 unsigned int size_to_bucket(unsigned int zone_bits, tdb_len_t data_len)
 unsigned int size_to_bucket(unsigned int zone_bits, tdb_len_t data_len)
 {
 {
@@ -49,59 +87,51 @@ unsigned int size_to_bucket(unsigned int zone_bits, tdb_len_t data_len)
 	return bucket;
 	return bucket;
 }
 }
 
 
-/* Subtract 1-byte tailer and header.  Then round up to next power of 2. */
-static unsigned max_zone_bits(struct tdb_context *tdb)
+/* Binary search for the zone for this offset. */
+static tdb_off_t off_to_zone(struct tdb_context *tdb, tdb_off_t off,
+			     struct free_zone_header *zhdr)
 {
 {
-	return fls64(tdb->map_size-1-sizeof(struct tdb_header)-1) + 1;
-}
+	tdb_off_t start, end;
 
 
-/* Start by using a random zone to spread the load: returns the offset. */
-static uint64_t random_zone(struct tdb_context *tdb)
-{
-	struct free_zone_header zhdr;
-	tdb_off_t off = sizeof(struct tdb_header);
-	tdb_len_t half_bits;
-	uint64_t randbits = 0;
-	unsigned int i;
+	start = sizeof(struct tdb_header);
+	end = start + (1ULL << fls64(tdb->map_size - start));
 
 
-	for (i = 0; i < 64; i += fls64(RAND_MAX)) 
-		randbits ^= ((uint64_t)random()) << i;
-
-	/* FIXME: Does this work?  Test! */
-	half_bits = max_zone_bits(tdb) - 1;
-	do {
-		/* Pick left or right side (not outside file) */
-		if ((randbits & 1)
-		    && !tdb->methods->oob(tdb, off + (1ULL << half_bits)
-					  + sizeof(zhdr), true)) {
-			off += 1ULL << half_bits;
-		}
-		randbits >>= 1;
-
-		if (tdb_read_convert(tdb, off, &zhdr, sizeof(zhdr)) == -1) 
+	for (;;) {
+		if (tdb_read_convert(tdb, start, zhdr, sizeof(*zhdr)) == -1)
 			return TDB_OFF_ERR;
 			return TDB_OFF_ERR;
 
 
-		if (zhdr.zone_bits == half_bits)
-			return off;
+		/* Is it inside this zone? */
+		if (off < start + (1ULL << zhdr->zone_bits))
+			return start;
 
 
-		half_bits--;
-	} while (half_bits >= INITIAL_ZONE_BITS);
+		/* In practice, start + end won't overflow. */
+		if (off >= (start + end) / 2)
+			start = (start + end) / 2;
+		else
+			end = (start + end) / 2;
+	}
+}
 
 
-	tdb->ecode = TDB_ERR_CORRUPT;
-	tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-		 "random_zone: zone at %llu smaller than %u bits?",
-		 (long long)off, INITIAL_ZONE_BITS);
-	return TDB_OFF_ERR;
+static tdb_off_t last_zone(struct tdb_context *tdb,
+			   struct free_zone_header *zhdr)
+{
+	return off_to_zone(tdb, tdb->map_size - 1, zhdr);
 }
 }
 
 
 int tdb_zone_init(struct tdb_context *tdb)
 int tdb_zone_init(struct tdb_context *tdb)
 {
 {
-	tdb->zone_off = random_zone(tdb);
+	unsigned int i;
+	uint64_t randoff = 0;
+
+	/* We start in a random zone, to spread the load. */
+	for (i = 0; i < 64; i += fls64(RAND_MAX))
+		randoff ^= ((uint64_t)random()) << i;
+	randoff = sizeof(struct tdb_header)
+		+ (randoff % (tdb->map_size - sizeof(struct tdb_header)));
+
+	tdb->zone_off = off_to_zone(tdb, randoff, &tdb->zhdr);
 	if (tdb->zone_off == TDB_OFF_ERR)
 	if (tdb->zone_off == TDB_OFF_ERR)
 		return -1;
 		return -1;
-	if (tdb_read_convert(tdb, tdb->zone_off,
-			     &tdb->zhdr, sizeof(tdb->zhdr)) == -1) 
-		return -1;
 	return 0;
 	return 0;
 }
 }
 
 
@@ -225,7 +255,7 @@ int add_free_record(struct tdb_context *tdb,
 	int ret;
 	int ret;
 
 
 	assert(len_with_header >= sizeof(new));
 	assert(len_with_header >= sizeof(new));
-	assert(zone_bits < (1 << 6));
+	assert(zone_bits < 64);
 
 
 	new.magic_and_meta = TDB_FREE_MAGIC | zone_bits;
 	new.magic_and_meta = TDB_FREE_MAGIC | zone_bits;
 	new.data_len = len_with_header - sizeof(struct tdb_used_record);
 	new.data_len = len_with_header - sizeof(struct tdb_used_record);
@@ -279,9 +309,15 @@ static int coalesce(struct tdb_context *tdb,
 		    tdb_off_t off, tdb_off_t b_off, tdb_len_t data_len)
 		    tdb_off_t off, tdb_off_t b_off, tdb_len_t data_len)
 {
 {
 	struct tdb_free_record pad, *r;
 	struct tdb_free_record pad, *r;
-	tdb_off_t end = off + sizeof(struct tdb_used_record) + data_len;
+	tdb_off_t zone_end, end;
+
+	end = off + sizeof(struct tdb_used_record) + data_len;
+	zone_end = zone_off + (1ULL << zone_bits);
 
 
-	while (end < (zone_off + (1ULL << zone_bits))) {
+	if (tdb->methods->oob(tdb, zone_end, true))
+		zone_end = tdb->map_size;
+
+	while (end < zone_end) {
 		tdb_off_t nb_off;
 		tdb_off_t nb_off;
 
 
 		/* FIXME: do tdb_get here and below really win? */
 		/* FIXME: do tdb_get here and below really win? */
@@ -586,10 +622,9 @@ int set_header(struct tdb_context *tdb,
 	return 0;
 	return 0;
 }
 }
 
 
-static bool zones_happy(struct tdb_context *tdb)
+static bool zones_contended(struct tdb_context *tdb)
 {
 {
-	/* FIXME: look at distribution of zones. */
-	return true;
+	return false;
 }
 }
 
 
 /* Assume we want buckets up to the comfort factor. */
 /* Assume we want buckets up to the comfort factor. */
@@ -604,11 +639,9 @@ static int tdb_expand(struct tdb_context *tdb, tdb_len_t size)
 {
 {
 	uint64_t old_size;
 	uint64_t old_size;
 	tdb_off_t off;
 	tdb_off_t off;
-	uint8_t zone_bits;
-	unsigned int num_buckets;
-	tdb_len_t wanted;
+	unsigned int num_buckets, zone_bits;
+	tdb_len_t wanted, expand;
 	struct free_zone_header zhdr;
 	struct free_zone_header zhdr;
-	bool enlarge_zone;
 
 
 	/* We need room for the record header too. */
 	/* We need room for the record header too. */
 	wanted = sizeof(struct tdb_used_record) + size;
 	wanted = sizeof(struct tdb_used_record) + size;
@@ -623,43 +656,63 @@ static int tdb_expand(struct tdb_context *tdb, tdb_len_t size)
 	if (tdb->map_size != old_size)
 	if (tdb->map_size != old_size)
 		goto success;
 		goto success;
 
 
-	/* FIXME: Tailer is a bogus optimization, remove it. */
-	/* zone bits tailer char is protected by EXPAND lock. */
-	if (tdb->methods->read(tdb, old_size - 1, &zone_bits, 1) == -1)
+	/* Treat last zone as minimum reasonable zone size. */
+	off = last_zone(tdb, &zhdr);
+	if (off == TDB_OFF_ERR)
 		goto fail;
 		goto fail;
 
 
-	/* If zones aren't working well, add larger zone if possible. */
-	enlarge_zone = !zones_happy(tdb);
+	/* Zone isn't fully expanded? */
+	if (tdb->map_size < off + (1ULL << zhdr.zone_bits)) {
+		expand = off + (1ULL << zhdr.zone_bits) - tdb->map_size;
+		/* Expand more than we want. */
+		if (expand > (wanted << TDB_COMFORT_FACTOR_BITS))
+			expand = (wanted << TDB_COMFORT_FACTOR_BITS);
+		if (tdb->methods->expand_file(tdb, expand) == -1)
+			goto fail;
+		/* We need to drop this lock before adding free record. */
+		tdb_unlock_expand(tdb, F_WRLCK);
+
+		/* Allocate from here. */
+		tdb->zone_off = off;
+		tdb->zhdr = zhdr;
+
+		/* FIXME: If this isn't sufficient, we search again... */
+		return add_free_record(tdb, zhdr.zone_bits,
+				       tdb->map_size - expand, expand);
+	}
 
 
-	/* New zone can be between zone_bits or larger if we're on the right
-	 * boundary. */
-	for (;;) {
-		/* Does this fit the allocation comfortably? */
-		if ((1ULL << zone_bits) >= overhead(zone_bits) + wanted) {
-			/* Only let enlarge_zone enlarge us once. */
-			if (!enlarge_zone)
-				break;
-			enlarge_zone = false;
-		}
-		if ((old_size - 1 - sizeof(struct tdb_header))
-		    & (1 << zone_bits))
-			break;
-		zone_bits++;
+	/* We are never allowed to cross a power-of-two boundary, and our
+	 * minimum zone size is 1 << INITIAL_ZONE_BITS.
+	 *
+	 * If our filesize is 128k, we can add a 64k or a 128k zone.  If it's
+	 * 192k, we can only add a 64k zone.
+	 *
+	 * In other words, our max zone size is (1 << (ffs(filesize) - 1)) */
+	zone_bits = ffs64(old_size - sizeof(struct tdb_header)) - 1;
+	assert(zone_bits >= INITIAL_ZONE_BITS);
+
+	/* Big zones generally good, but more zones wanted if contended. */
+	if (zones_contended(tdb)) {
+		/* If it suffices, make zone same size as last one. */
+		if (zhdr.zone_bits < zone_bits
+		    && (1ULL << zhdr.zone_bits) >= overhead(zone_bits)+wanted)
+			zone_bits = zhdr.zone_bits;
 	}
 	}
 
 
 	zhdr.zone_bits = zone_bits;
 	zhdr.zone_bits = zone_bits;
 	num_buckets = BUCKETS_FOR_ZONE(zone_bits);
 	num_buckets = BUCKETS_FOR_ZONE(zone_bits);
 
 
-	/* FIXME: I don't think we need to expand to full zone, do we? */
-	if (tdb->methods->expand_file(tdb, 1ULL << zone_bits) == -1)
-		goto fail;
+	/* Expand the file by more than we need right now. */
+	expand = 1ULL << zone_bits;
+	if (expand > overhead(zone_bits) + (wanted << TDB_COMFORT_FACTOR_BITS))
+		expand = overhead(zone_bits)
+			+ (wanted << TDB_COMFORT_FACTOR_BITS);
 
 
-	/* Write new tailer. */
-	if (tdb->methods->write(tdb, tdb->map_size - 1, &zone_bits, 1) == -1)
+	if (tdb->methods->expand_file(tdb, expand) == -1)
 		goto fail;
 		goto fail;
 
 
-	/* Write new zone header (just before old tailer). */
-	off = old_size - 1;
+	/* Write new zone header (at old end). */
+	off = old_size;
 	if (tdb_write_convert(tdb, off, &zhdr, sizeof(zhdr)) == -1)
 	if (tdb_write_convert(tdb, off, &zhdr, sizeof(zhdr)) == -1)
 		goto fail;
 		goto fail;
 
 
@@ -670,11 +723,12 @@ static int tdb_expand(struct tdb_context *tdb, tdb_len_t size)
 	off += (num_buckets+1) * sizeof(tdb_off_t);
 	off += (num_buckets+1) * sizeof(tdb_off_t);
 
 
 	/* Now add the rest as our free record. */
 	/* Now add the rest as our free record. */
-	if (add_free_record(tdb, zone_bits, off, tdb->map_size-1-off) == -1)
+	if (add_free_record(tdb, zone_bits, off, expand - overhead(zone_bits))
+	    == -1)
 		goto fail;
 		goto fail;
 
 
 	/* Try allocating from this zone now. */
 	/* Try allocating from this zone now. */
-	tdb->zone_off = old_size - 1;
+	tdb->zone_off = old_size;
 	tdb->zhdr = zhdr;
 	tdb->zhdr = zhdr;
 
 
 success:
 success:

+ 15 - 18
ccan/tdb2/io.c

@@ -123,27 +123,13 @@ static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
 	return 0;
 	return 0;
 }
 }
 
 
-static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len)
-{
-	if (unlikely(!tdb->map_ptr))
-		return NULL;
-
-	/* FIXME: We can do a subset of this! */
-	if (tdb->transaction)
-		return NULL;
-
-	if (unlikely(tdb_oob(tdb, off + len, true) == -1))
-		return NULL;
-	return (char *)tdb->map_ptr + off;
-}
-
 /* Either make a copy into pad and return that, or return ptr into mmap. */
 /* Either make a copy into pad and return that, or return ptr into mmap. */
 /* Note: pad has to be a real object, so we can't get here if len
 /* Note: pad has to be a real object, so we can't get here if len
  * overflows size_t */
  * overflows size_t */
 void *tdb_get(struct tdb_context *tdb, tdb_off_t off, void *pad, size_t len)
 void *tdb_get(struct tdb_context *tdb, tdb_off_t off, void *pad, size_t len)
 {
 {
 	if (likely(!(tdb->flags & TDB_CONVERT))) {
 	if (likely(!(tdb->flags & TDB_CONVERT))) {
-		void *ret = tdb_direct(tdb, off, len);
+		void *ret = tdb->methods->direct(tdb, off, len);
 		if (ret)
 		if (ret)
 			return ret;
 			return ret;
 	}
 	}
@@ -205,7 +191,7 @@ uint64_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off,
 int zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len)
 int zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len)
 {
 {
 	char buf[8192] = { 0 };
 	char buf[8192] = { 0 };
-	void *p = tdb_direct(tdb, off, len);
+	void *p = tdb->methods->direct(tdb, off, len);
 	if (p) {
 	if (p) {
 		memset(p, 0, len);
 		memset(p, 0, len);
 		return 0;
 		return 0;
@@ -478,7 +464,7 @@ const void *tdb_access_read(struct tdb_context *tdb,
 	const void *ret = NULL;	
 	const void *ret = NULL;	
 
 
 	if (likely(!(tdb->flags & TDB_CONVERT)))
 	if (likely(!(tdb->flags & TDB_CONVERT)))
-		ret = tdb_direct(tdb, off, len);
+		ret = tdb->methods->direct(tdb, off, len);
 
 
 	if (!ret) {
 	if (!ret) {
 		struct tdb_access_hdr *hdr;
 		struct tdb_access_hdr *hdr;
@@ -500,7 +486,7 @@ void *tdb_access_write(struct tdb_context *tdb,
 	void *ret = NULL;
 	void *ret = NULL;
 
 
 	if (likely(!(tdb->flags & TDB_CONVERT)))
 	if (likely(!(tdb->flags & TDB_CONVERT)))
-		ret = tdb_direct(tdb, off, len);
+		ret = tdb->methods->direct(tdb, off, len);
 
 
 	if (!ret) {
 	if (!ret) {
 		struct tdb_access_hdr *hdr;
 		struct tdb_access_hdr *hdr;
@@ -658,11 +644,22 @@ int tdb_rec_write(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *
 }
 }
 #endif
 #endif
 
 
+/*
+ * Direct-access method: return a pointer into the mapping for the range
+ * starting at off, or NULL when there is no mapping or tdb_oob() rejects
+ * the end of the range (off + len).
+ */
+static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len)
+{
+	char *base = (char *)tdb->map_ptr;
+
+	/* Nothing mapped, so there is nothing to point into. */
+	if (unlikely(base == NULL))
+		return NULL;
+
+	/* Bounds check on the end offset, passing true for the probe flag. */
+	if (unlikely(tdb_oob(tdb, off + len, true) == -1))
+		return NULL;
+
+	return base + off;
+}
+
 static const struct tdb_methods io_methods = {
 static const struct tdb_methods io_methods = {
 	tdb_read,
 	tdb_read,
 	tdb_write,
 	tdb_write,
 	tdb_oob,
 	tdb_oob,
 	tdb_expand_file,
 	tdb_expand_file,
+	tdb_direct,
 };
 };
 
 
 /*
 /*

+ 1 - 0
ccan/tdb2/private.h

@@ -332,6 +332,7 @@ struct tdb_methods {
 	int (*write)(struct tdb_context *, tdb_off_t, const void *, tdb_len_t);
 	int (*write)(struct tdb_context *, tdb_off_t, const void *, tdb_len_t);
 	int (*oob)(struct tdb_context *, tdb_off_t, bool);
 	int (*oob)(struct tdb_context *, tdb_off_t, bool);
 	int (*expand_file)(struct tdb_context *, tdb_len_t);
 	int (*expand_file)(struct tdb_context *, tdb_len_t);
+	void *(*direct)(struct tdb_context *, tdb_off_t, size_t);
 };
 };
 
 
 /*
 /*

+ 7 - 4
ccan/tdb2/summary.c

@@ -66,7 +66,7 @@ static tdb_len_t summarize_zone(struct tdb_context *tdb, tdb_off_t zone_off,
 				unsigned int *num_buckets)
 				unsigned int *num_buckets)
 {
 {
 	struct free_zone_header zhdr;
 	struct free_zone_header zhdr;
-	tdb_off_t off;
+	tdb_off_t off, end;
 	tdb_len_t len;
 	tdb_len_t len;
 	unsigned int hdrlen;
 	unsigned int hdrlen;
 	tdb_len_t unc = 0;
 	tdb_len_t unc = 0;
@@ -79,9 +79,12 @@ static tdb_len_t summarize_zone(struct tdb_context *tdb, tdb_off_t zone_off,
 
 
 	hdrlen = sizeof(zhdr)
 	hdrlen = sizeof(zhdr)
 		+ (BUCKETS_FOR_ZONE(zhdr.zone_bits) + 1) * sizeof(tdb_off_t);
 		+ (BUCKETS_FOR_ZONE(zhdr.zone_bits) + 1) * sizeof(tdb_off_t);
-	for (off = zone_off + hdrlen;
-	     off < zone_off + (1ULL << zhdr.zone_bits);
-	     off += len) {
+
+	end = zone_off + (1ULL << zhdr.zone_bits);
+	if (end > tdb->map_size)
+		end = tdb->map_size;
+
+	for (off = zone_off + hdrlen; off < end; off += len) {
 		union {
 		union {
 			struct tdb_used_record u;
 			struct tdb_used_record u;
 			struct tdb_free_record f;
 			struct tdb_free_record f;

+ 5 - 34
ccan/tdb2/tdb.c

@@ -83,14 +83,6 @@ struct new_database {
 	/* Initial free zone. */
 	/* Initial free zone. */
 	struct free_zone_header zhdr;
 	struct free_zone_header zhdr;
 	tdb_off_t free[BUCKETS_FOR_ZONE(INITIAL_ZONE_BITS) + 1];
 	tdb_off_t free[BUCKETS_FOR_ZONE(INITIAL_ZONE_BITS) + 1];
-	struct tdb_free_record frec;
-	/* Rest up to 1 << INITIAL_ZONE_BITS is empty. */
-	char space[(1 << INITIAL_ZONE_BITS)
-		   - sizeof(struct free_zone_header)
-		   - sizeof(tdb_off_t) * (BUCKETS_FOR_ZONE(INITIAL_ZONE_BITS)+1)
-		   - sizeof(struct tdb_free_record)];
-	uint8_t tailer;
-	/* Don't count final padding! */
 };
 };
 
 
 /* initialise a new database */
 /* initialise a new database */
@@ -100,10 +92,7 @@ static int tdb_new_database(struct tdb_context *tdb,
 {
 {
 	/* We make it up in memory, then write it out if not internal */
 	/* We make it up in memory, then write it out if not internal */
 	struct new_database newdb;
 	struct new_database newdb;
-	unsigned int bucket, magic_len, dbsize;
-
-	/* Don't want any extra padding! */
-	dbsize = offsetof(struct new_database, tailer) + sizeof(newdb.tailer);
+	unsigned int magic_len;
 
 
 	/* Fill in the header */
 	/* Fill in the header */
 	newdb.hdr.version = TDB_VERSION;
 	newdb.hdr.version = TDB_VERSION;
@@ -120,27 +109,10 @@ static int tdb_new_database(struct tdb_context *tdb,
 	/* Initial hashes are empty. */
 	/* Initial hashes are empty. */
 	memset(newdb.hdr.hashtable, 0, sizeof(newdb.hdr.hashtable));
 	memset(newdb.hdr.hashtable, 0, sizeof(newdb.hdr.hashtable));
 
 
-	/* Free is mostly empty... */
+	/* Free is empty. */
 	newdb.zhdr.zone_bits = INITIAL_ZONE_BITS;
 	newdb.zhdr.zone_bits = INITIAL_ZONE_BITS;
 	memset(newdb.free, 0, sizeof(newdb.free));
 	memset(newdb.free, 0, sizeof(newdb.free));
 
 
-	/* Create the single free entry. */
-	newdb.frec.magic_and_meta = TDB_FREE_MAGIC | INITIAL_ZONE_BITS;
-	newdb.frec.data_len = (sizeof(newdb.frec)
-				 - sizeof(struct tdb_used_record)
-				 + sizeof(newdb.space));
-
-	/* Add it to the correct bucket. */
-	bucket = size_to_bucket(INITIAL_ZONE_BITS, newdb.frec.data_len);
-	newdb.free[bucket] = offsetof(struct new_database, frec);
-	newdb.frec.next = newdb.frec.prev = 0;
-
-	/* Clear free space to keep valgrind happy, and avoid leaking stack. */
-	memset(newdb.space, 0, sizeof(newdb.space));
-
-	/* Tailer contains maximum number of free_zone bits. */
-	newdb.tailer = INITIAL_ZONE_BITS;
-
 	/* Magic food */
 	/* Magic food */
 	memset(newdb.hdr.magic_food, 0, sizeof(newdb.hdr.magic_food));
 	memset(newdb.hdr.magic_food, 0, sizeof(newdb.hdr.magic_food));
 	strcpy(newdb.hdr.magic_food, TDB_MAGIC_FOOD);
 	strcpy(newdb.hdr.magic_food, TDB_MAGIC_FOOD);
@@ -148,13 +120,12 @@ static int tdb_new_database(struct tdb_context *tdb,
 	/* This creates an endian-converted database, as if read from disk */
 	/* This creates an endian-converted database, as if read from disk */
 	magic_len = sizeof(newdb.hdr.magic_food);
 	magic_len = sizeof(newdb.hdr.magic_food);
 	tdb_convert(tdb,
 	tdb_convert(tdb,
-		    (char *)&newdb.hdr + magic_len,
-		    offsetof(struct new_database, space) - magic_len);
+		    (char *)&newdb.hdr + magic_len, sizeof(newdb) - magic_len);
 
 
 	*hdr = newdb.hdr;
 	*hdr = newdb.hdr;
 
 
 	if (tdb->flags & TDB_INTERNAL) {
 	if (tdb->flags & TDB_INTERNAL) {
-		tdb->map_size = dbsize;
+		tdb->map_size = sizeof(newdb);
 		tdb->map_ptr = malloc(tdb->map_size);
 		tdb->map_ptr = malloc(tdb->map_size);
 		if (!tdb->map_ptr) {
 		if (!tdb->map_ptr) {
 			tdb->ecode = TDB_ERR_OOM;
 			tdb->ecode = TDB_ERR_OOM;
@@ -169,7 +140,7 @@ static int tdb_new_database(struct tdb_context *tdb,
 	if (ftruncate(tdb->fd, 0) == -1)
 	if (ftruncate(tdb->fd, 0) == -1)
 		return -1;
 		return -1;
 
 
-	if (!tdb_pwrite_all(tdb->fd, &newdb, dbsize, 0)) {
+	if (!tdb_pwrite_all(tdb->fd, &newdb, sizeof(newdb), 0)) {
 		tdb->ecode = TDB_ERR_IO;
 		tdb->ecode = TDB_ERR_IO;
 		return -1;
 		return -1;
 	}
 	}

+ 3 - 14
ccan/tdb2/test/layout.c

@@ -245,16 +245,7 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
 		zone_left -= len;
 		zone_left -= len;
 	}
 	}
 
 
-	/* Fill final zone with free record. */
-	if (zone_left != 0) {
-		tdb_layout_add_free(layout,
-				    zone_left
-				    - sizeof(struct tdb_used_record));
-		layout->elem[layout->num_elems-1].base.off = off;
-		off += zone_left;
-	}
-
-	mem = malloc(off+1);
+	mem = malloc(off);
 	/* Now populate our header, cribbing from a real TDB header. */
 	/* Now populate our header, cribbing from a real TDB header. */
 	tdb = tdb_open(NULL, TDB_INTERNAL, O_RDWR, 0, &tap_log_attr);
 	tdb = tdb_open(NULL, TDB_INTERNAL, O_RDWR, 0, &tap_log_attr);
 	memcpy(mem, tdb->map_ptr, sizeof(struct tdb_header));
 	memcpy(mem, tdb->map_ptr, sizeof(struct tdb_header));
@@ -262,7 +253,7 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
 	/* Mug the tdb we have to make it use this. */
 	/* Mug the tdb we have to make it use this. */
 	free(tdb->map_ptr);
 	free(tdb->map_ptr);
 	tdb->map_ptr = mem;
 	tdb->map_ptr = mem;
-	tdb->map_size = off+1;
+	tdb->map_size = off;
 
 
 	for (i = 0; i < layout->num_elems; i++) {
 	for (i = 0; i < layout->num_elems; i++) {
 		union tdb_layout_elem *e = &layout->elem[i];
 		union tdb_layout_elem *e = &layout->elem[i];
@@ -304,9 +295,6 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
 		}
 		}
 	}
 	}
 
 
-	/* Write tailer. */
-	((uint8_t *)tdb->map_ptr)[tdb->map_size-1] = last_zone->zone_bits;
-
 	/* Get physical if they asked for it. */
 	/* Get physical if they asked for it. */
 	if (layout->filename) {
 	if (layout->filename) {
 		int fd = open(layout->filename, O_WRONLY|O_TRUNC|O_CREAT,
 		int fd = open(layout->filename, O_WRONLY|O_TRUNC|O_CREAT,
@@ -321,5 +309,6 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
 		tdb = tdb_open(layout->filename, TDB_NOMMAP, O_RDWR, 0,
 		tdb = tdb_open(layout->filename, TDB_NOMMAP, O_RDWR, 0,
 			       &tap_log_attr);
 			       &tap_log_attr);
 	}
 	}
+
 	return tdb;
 	return tdb;
 }
 }

+ 62 - 0
ccan/tdb2/test/run-01-zones.c

@@ -0,0 +1,62 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+#include "layout.h"
+
+/*
+ * Work out, from the layout description alone, the start offset of the
+ * zone that contains off: the first ZONE element whose end
+ * (base.off + (1 << zone_bits)) lies beyond off.
+ * Aborts if no zone element in the layout extends past off.
+ */
+static tdb_off_t layout_zone_off(tdb_off_t off, struct tdb_layout *layout)
+{
+	unsigned int idx = 0;
+
+	while (idx < layout->num_elems) {
+		union tdb_layout_elem *e = &layout->elem[idx++];
+
+		/* Only zone elements are of interest; skip anything else. */
+		if (e->base.type == ZONE
+		    && e->base.off + (1ULL << e->zone.zone_bits) > off)
+			return e->base.off;
+	}
+	abort();
+}
+
+int main(int argc, char *argv[])
+{
+	struct tdb_context *tdb;
+	struct tdb_layout *layout;
+	struct free_zone_header zhdr;
+	tdb_off_t off, step;
+	unsigned int i;
+
+	/* Checks zone lookup (last_zone/off_to_zone) on a hand-built layout.
+	 * FIXME: Test TDB_CONVERT */
+
+	plan_tests(3 + 100); /* 3 fixed checks + 100 offset probes below */
+
+	/* Build five zones: two of INITIAL_ZONE_BITS, one of
+	 * INITIAL_ZONE_BITS+1 and two of INITIAL_ZONE_BITS+2. */
+	layout = new_tdb_layout(NULL);
+	tdb_layout_add_zone(layout, INITIAL_ZONE_BITS, false);
+	tdb_layout_add_zone(layout, INITIAL_ZONE_BITS, true);
+	tdb_layout_add_zone(layout, INITIAL_ZONE_BITS+1, true);
+	tdb_layout_add_zone(layout, INITIAL_ZONE_BITS+2, true);
+	tdb_layout_add_zone(layout, INITIAL_ZONE_BITS+2, true);
+	tdb = tdb_layout_get(layout);
+
+	ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+	/* last_zone() must return the offset of the final layout element
+	 * and fill zhdr with that zone's bits. */
+	ok1(last_zone(tdb, &zhdr)
+	    == layout->elem[layout->num_elems-1].base.off);
+	ok1(zhdr.zone_bits == INITIAL_ZONE_BITS+2);
+
+	off = sizeof(struct tdb_header); /* first offset after the header */
+	step = (tdb->map_size - 1 - off) / 100; /* 100 evenly spaced probes */
+	for (i = 0; i < 100; i++, off += step) {
+		ok1(off_to_zone(tdb, off, &zhdr) == layout_zone_off(off, layout));
+	}
+
+	return exit_status();
+}

+ 27 - 16
ccan/tdb2/test/run-02-expand.c

@@ -25,36 +25,47 @@ int main(int argc, char *argv[])
 		if (!tdb)
 		if (!tdb)
 			continue;
 			continue;
 
 
-		/* First expand. Should add a zone, doubling file size.. */
-		val = tdb->map_size - 1 - sizeof(struct tdb_header);
+		/* First expand. Should not fill zone. */
+		val = tdb->map_size - sizeof(struct tdb_header);
 		ok1(tdb_expand(tdb, 1) == 0);
 		ok1(tdb_expand(tdb, 1) == 0);
-		ok1(tdb->map_size == 2 * val + 1 + sizeof(struct tdb_header));
+		ok1(tdb->map_size < sizeof(struct tdb_header)
+		    + (1 << INITIAL_ZONE_BITS));
 		ok1(tdb_check(tdb, NULL, NULL) == 0);
 		ok1(tdb_check(tdb, NULL, NULL) == 0);
 
 
-		/* Second expand, add another zone of same size. */
-		ok1(tdb_expand(tdb, 1) == 0);
-		ok1(tdb->map_size == 3 * val + 1 + sizeof(struct tdb_header));
+		/* Fill zone. */
+		val = (1<<INITIAL_ZONE_BITS)
+			- sizeof(struct tdb_used_record)
+			- (tdb->map_size - sizeof(struct tdb_header));
+		ok1(tdb_expand(tdb, val) == 0);
+		ok1(tdb->map_size == sizeof(struct tdb_header)
+		    + (1 << INITIAL_ZONE_BITS));
 		ok1(tdb_check(tdb, NULL, NULL) == 0);
 		ok1(tdb_check(tdb, NULL, NULL) == 0);
 
 
-		/* Large expand, but can only add 4th zone of same size. */
-		ok1(tdb_expand(tdb, 4*val) == 0);
-		ok1(tdb->map_size == 4 * val + 1 + sizeof(struct tdb_header));
+		/* Second expand, adds another zone of same size. */
+		ok1(tdb_expand(tdb, 4 << INITIAL_ZONE_BITS) == 0);
+		ok1(tdb->map_size ==
+		    (2<<INITIAL_ZONE_BITS) + sizeof(struct tdb_header));
 		ok1(tdb_check(tdb, NULL, NULL) == 0);
 		ok1(tdb_check(tdb, NULL, NULL) == 0);
 
 
 		/* Large expand now will double file. */
 		/* Large expand now will double file. */
-		ok1(tdb_expand(tdb, 4*val) == 0);
-		ok1(tdb->map_size == 8 * val + 1 + sizeof(struct tdb_header));
+		ok1(tdb_expand(tdb, 4 << INITIAL_ZONE_BITS) == 0);
+		ok1(tdb->map_size ==
+		    (4<<INITIAL_ZONE_BITS) + sizeof(struct tdb_header));
 		ok1(tdb_check(tdb, NULL, NULL) == 0);
 		ok1(tdb_check(tdb, NULL, NULL) == 0);
 
 
 		/* And again? */
 		/* And again? */
-		ok1(tdb_expand(tdb, 4*val) == 0);
-		ok1(tdb->map_size == 16 * val + 1 + sizeof(struct tdb_header));
+		ok1(tdb_expand(tdb, 4 << INITIAL_ZONE_BITS) == 0);
+		ok1(tdb->map_size ==
+		    (8<<INITIAL_ZONE_BITS) + sizeof(struct tdb_header));
 		ok1(tdb_check(tdb, NULL, NULL) == 0);
 		ok1(tdb_check(tdb, NULL, NULL) == 0);
 
 
-		/* Below comfort level, will add a single 8*val zone. */
-		ok1(tdb_expand(tdb, ((8*val) >> TDB_COMFORT_FACTOR_BITS)
+		/* Below comfort level, won't fill zone. */
+		ok1(tdb_expand(tdb,
+			       ((3 << INITIAL_ZONE_BITS)
+				>> TDB_COMFORT_FACTOR_BITS)
 			       - sizeof(struct tdb_used_record)) == 0);
 			       - sizeof(struct tdb_used_record)) == 0);
-		ok1(tdb->map_size == 24 * val + 1 + sizeof(struct tdb_header));
+		ok1(tdb->map_size < (12<<INITIAL_ZONE_BITS)
+		    + sizeof(struct tdb_header));
 		tdb_close(tdb);
 		tdb_close(tdb);
 	}
 	}
 
 

+ 8 - 7
ccan/tdb2/test/run-03-coalesce.c

@@ -39,8 +39,9 @@ int main(int argc, char *argv[])
 	/* No coalescing can be done due to EOF */
 	/* No coalescing can be done due to EOF */
 	layout = new_tdb_layout(NULL);
 	layout = new_tdb_layout(NULL);
 	tdb_layout_add_zone(layout, zone_bits, false);
 	tdb_layout_add_zone(layout, zone_bits, false);
+	len = 1024;
+	tdb_layout_add_free(layout, len);
 	tdb = tdb_layout_get(layout);
 	tdb = tdb_layout_get(layout);
-	len = layout->elem[1].free.len;
 	zone_off = layout->elem[0].base.off;
 	zone_off = layout->elem[0].base.off;
 	ok1(tdb_check(tdb, NULL, NULL) == 0);
 	ok1(tdb_check(tdb, NULL, NULL) == 0);
 	ok1(free_record_length(tdb, layout->elem[1].base.off) == len);
 	ok1(free_record_length(tdb, layout->elem[1].base.off) == len);
@@ -81,11 +82,11 @@ int main(int argc, char *argv[])
 	layout = new_tdb_layout(NULL);
 	layout = new_tdb_layout(NULL);
 	tdb_layout_add_zone(layout, zone_bits, false);
 	tdb_layout_add_zone(layout, zone_bits, false);
 	tdb_layout_add_free(layout, 1024);
 	tdb_layout_add_free(layout, 1024);
+	tdb_layout_add_free(layout, 2048);
 	tdb = tdb_layout_get(layout);
 	tdb = tdb_layout_get(layout);
 	zone_off = layout->elem[0].base.off;
 	zone_off = layout->elem[0].base.off;
-	len = layout->elem[2].free.len;
 	ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024);
 	ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024);
-	ok1(free_record_length(tdb, layout->elem[2].base.off) == len);
+	ok1(free_record_length(tdb, layout->elem[2].base.off) == 2048);
 	ok1(tdb_check(tdb, NULL, NULL) == 0);
 	ok1(tdb_check(tdb, NULL, NULL) == 0);
 
 
 	/* Figure out which bucket (first) free entry is. */
 	/* Figure out which bucket (first) free entry is. */
@@ -96,7 +97,7 @@ int main(int argc, char *argv[])
 		     b_off, 1024) == 1);
 		     b_off, 1024) == 1);
 	ok1(!tdb_has_locks(tdb));
 	ok1(!tdb_has_locks(tdb));
 	ok1(free_record_length(tdb, layout->elem[1].base.off)
 	ok1(free_record_length(tdb, layout->elem[1].base.off)
-	    == 1024 + sizeof(struct tdb_used_record) + len);
+	    == 1024 + sizeof(struct tdb_used_record) + 2048);
 	ok1(tdb_check(tdb, NULL, NULL) == 0);
 	ok1(tdb_check(tdb, NULL, NULL) == 0);
 	tdb_close(tdb);
 	tdb_close(tdb);
 
 
@@ -129,12 +130,12 @@ int main(int argc, char *argv[])
 	tdb_layout_add_zone(layout, zone_bits, false);
 	tdb_layout_add_zone(layout, zone_bits, false);
 	tdb_layout_add_free(layout, 1024);
 	tdb_layout_add_free(layout, 1024);
 	tdb_layout_add_free(layout, 512);
 	tdb_layout_add_free(layout, 512);
+	tdb_layout_add_free(layout, 256);
 	tdb = tdb_layout_get(layout);
 	tdb = tdb_layout_get(layout);
 	zone_off = layout->elem[0].base.off;
 	zone_off = layout->elem[0].base.off;
-	len = layout->elem[3].free.len;
 	ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024);
 	ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024);
 	ok1(free_record_length(tdb, layout->elem[2].base.off) == 512);
 	ok1(free_record_length(tdb, layout->elem[2].base.off) == 512);
-	ok1(free_record_length(tdb, layout->elem[3].base.off) == len);
+	ok1(free_record_length(tdb, layout->elem[3].base.off) == 256);
 	ok1(tdb_check(tdb, NULL, NULL) == 0);
 	ok1(tdb_check(tdb, NULL, NULL) == 0);
 
 
 	/* Figure out which bucket free entry is. */
 	/* Figure out which bucket free entry is. */
@@ -146,7 +147,7 @@ int main(int argc, char *argv[])
 	ok1(!tdb_has_locks(tdb));
 	ok1(!tdb_has_locks(tdb));
 	ok1(free_record_length(tdb, layout->elem[1].base.off)
 	ok1(free_record_length(tdb, layout->elem[1].base.off)
 	    == 1024 + sizeof(struct tdb_used_record) + 512
 	    == 1024 + sizeof(struct tdb_used_record) + 512
-	    + sizeof(struct tdb_used_record) + len);
+	    + sizeof(struct tdb_used_record) + 256);
 	ok1(tdb_check(tdb, NULL, NULL) == 0);
 	ok1(tdb_check(tdb, NULL, NULL) == 0);
 	tdb_close(tdb);
 	tdb_close(tdb);
 
 

+ 2 - 2
ccan/tdb2/test/run-30-exhaust-before-expand.c

@@ -39,12 +39,12 @@ int main(int argc, char *argv[])
 		d.dptr = malloc(d.dsize);
 		d.dptr = malloc(d.dsize);
 		ok1(tdb_store(tdb, k, d, TDB_INSERT) == 0);
 		ok1(tdb_store(tdb, k, d, TDB_INSERT) == 0);
 		ok1(tdb->map_size == sizeof(struct tdb_header)
 		ok1(tdb->map_size == sizeof(struct tdb_header)
-		    + (1 << INITIAL_ZONE_BITS)+1);
+		    + (1 << INITIAL_ZONE_BITS));
 
 
 		/* Insert minimal-length records until we add a zone. */ 
 		/* Insert minimal-length records until we add a zone. */ 
 		for (j = 0;
 		for (j = 0;
 		     tdb->map_size == sizeof(struct tdb_header)
 		     tdb->map_size == sizeof(struct tdb_header)
-			     + (1 << INITIAL_ZONE_BITS)+1;
+			     + (1 << INITIAL_ZONE_BITS);
 		     j++) {
 		     j++) {
 			if (tdb_store(tdb, k, k, TDB_INSERT) != 0)
 			if (tdb_store(tdb, k, k, TDB_INSERT) != 0)
 				err(1, "Failed to store record %i", j);
 				err(1, "Failed to store record %i", j);