15 years ago · a942862195
--- a/ccan/tdb2/private.h
+++ b/ccan/tdb2/private.h
@@ -222,7 +222,7 @@ static inline unsigned frec_ftable(const struct tdb_free_record *f)
 
															 struct tdb_recovery_record {
														
 
															 	uint64_t magic;
														
 
															-	/* Length of record. */
														
 
															+	/* Length of record (add this header to get total length). */
														
 
															 	uint64_t max_len;
														
 
															 	/* Length used. */
														
 
															 	uint64_t len;
														
--- a/ccan/tdb2/transaction.c
+++ b/ccan/tdb2/transaction.c
@@ -88,7 +88,6 @@
 
															     fsync/msync calls are made.
														
 
															 */
														
 
															-
														
 
															 /*
														
 
															   hold the context of any current transaction
														
 
															 */
														
@@ -608,7 +607,7 @@ void tdb_transaction_cancel(struct tdb_context *tdb)
 
															 }
														
 
															 /*
														
 
															-  work out how much space the linearised recovery data will consume
														
 
															+  work out how much space the linearised recovery data will consume (worst case)
														
 
															 */
														
 
															 static tdb_len_t tdb_recovery_size(struct tdb_context *tdb)
														
 
															 {
														
@@ -634,129 +633,38 @@ static tdb_len_t tdb_recovery_size(struct tdb_context *tdb)
 
															 	return recovery_size;
														
 
															 }
														
 
															-/*
															-  allocate the recovery area, or use an existing recovery area if it is
															-  large enough
															-*/
															-static enum TDB_ERROR tdb_recovery_allocate(struct tdb_context *tdb,
															-					    tdb_len_t *recovery_size,
															-					    tdb_off_t *recovery_offset,
															-					    tdb_len_t *recovery_max_size)
															+/*
															+  Look up the recovery area recorded in the header.
															+
															+  *recovery_offset receives the value of the tdb_header recovery field.
															+  If that value is non-zero, sizeof(*rec) bytes are read from it into
															+  *rec with methods->tread() and passed through tdb_convert().
															+
															+  When the header field is 0, or the record's magic is neither
															+  TDB_RECOVERY_MAGIC nor TDB_RECOVERY_INVALID_MAGIC, *recovery_offset
															+  and rec->max_len are both set to 0 so the caller sees "no usable area".
															+
															+  Returns TDB_SUCCESS, or the error from tdb_read_off()/tread().  Unlike
															+  the function it replaces, nothing is logged here on failure.
															+*/
															+static enum TDB_ERROR tdb_recovery_area(struct tdb_context *tdb,
															+					const struct tdb_methods *methods,
															+					tdb_off_t *recovery_offset,
															+					struct tdb_recovery_record *rec)
															 {
															-	struct tdb_recovery_record rec;
															-	const struct tdb_methods *methods = tdb->transaction->io_methods;
															-	tdb_off_t recovery_head;
															-	size_t addition;
															 	enum TDB_ERROR ecode;
															-	recovery_head = tdb_read_off(tdb, offsetof(struct tdb_header,recovery));
															-	if (TDB_OFF_IS_ERR(recovery_head)) {
															-		return tdb_logerr(tdb, recovery_head, TDB_LOG_ERROR,
															-				  "tdb_recovery_allocate:"
															-				  " failed to read recovery head");
															+	*recovery_offset = tdb_read_off(tdb,
															+					offsetof(struct tdb_header, recovery));
															+	if (TDB_OFF_IS_ERR(*recovery_offset)) {
															+		return *recovery_offset;
															 	}
															-	if (recovery_head != 0) {
															-		ecode = methods->tread(tdb, recovery_head, &rec, sizeof(rec));
															-		if (ecode != TDB_SUCCESS) {
															-			return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
															-					  "tdb_recovery_allocate:"
															-					  " failed to read recovery record");
															-		}
															-		tdb_convert(tdb, &rec, sizeof(rec));
															-		/* ignore invalid recovery regions: can happen in crash */
															-		if (rec.magic != TDB_RECOVERY_MAGIC &&
															-		    rec.magic != TDB_RECOVERY_INVALID_MAGIC) {
															-			recovery_head = 0;
															-		}
															-	}
															-
															-	*recovery_size = tdb_recovery_size(tdb);
															-
															-	if (recovery_head != 0 && *recovery_size <= rec.max_len) {
															-		/* it fits in the existing area */
															-		*recovery_max_size = rec.max_len;
															-		*recovery_offset = recovery_head;
															+	/* Header records no recovery area: report zero capacity. */
															+	if (*recovery_offset == 0) {
															+		rec->max_len = 0;
															 		return TDB_SUCCESS;
															 	}
															-	/* we need to free up the old recovery area, then allocate a
															-	   new one at the end of the file. Note that we cannot use
															-	   normal allocation to allocate the new one as that might return
															-	   us an area that is being currently used (as of the start of
															-	   the transaction) */
															-	if (recovery_head != 0) {
															-		tdb->stats.frees++;
															-		ecode = add_free_record(tdb, recovery_head,
															-					sizeof(rec) + rec.max_len,
															-					TDB_LOCK_WAIT, true);
															-		if (ecode != TDB_SUCCESS) {
															-			return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
															-					  "tdb_recovery_allocate:"
															-					  " failed to free previous"
															-					  " recovery area");
															-		}
															-	}
															-
															-	/* the tdb_free() call might have increased the recovery size */
															-	*recovery_size = tdb_recovery_size(tdb);
															-
															-	/* round up to a multiple of page size. Overallocate, since each
															-	 * such allocation forces us to expand the file. */
															-	*recovery_max_size
															-		= (((sizeof(rec) + *recovery_size + *recovery_size / 2)
															-		    + PAGESIZE-1) & ~(PAGESIZE-1))
															-		- sizeof(rec);
															-	*recovery_offset = tdb->file->map_size;
															-	recovery_head = *recovery_offset;
															-
															-	/* Restore ->map_size before calling underlying expand_file.
															-	   Also so that we don't try to expand the file again in the
															-	   transaction commit, which would destroy the recovery
															-	   area */
															-	addition = (tdb->file->map_size - tdb->transaction->old_map_size) +
															-		sizeof(rec) + *recovery_max_size;
															-	tdb->file->map_size = tdb->transaction->old_map_size;
															-	ecode = methods->expand_file(tdb, addition);
															-	if (ecode != TDB_SUCCESS) {
															-		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
															-				  "tdb_recovery_allocate:"
															-				  " failed to create recovery area");
															-	}
															-
															-	/* we have to reset the old map size so that we don't try to
															-	   expand the file again in the transaction commit, which
															-	   would destroy the recovery area */
															-	tdb->transaction->old_map_size = tdb->file->map_size;
															+	/* Only the record header is read; it lands in the caller's buffer. */
															+	ecode = methods->tread(tdb, *recovery_offset, rec, sizeof(*rec));
															+	if (ecode != TDB_SUCCESS)
															+		return ecode;
															-	/* write the recovery header offset and sync - we can sync without a race here
															-	   as the magic ptr in the recovery record has not been set */
															-	tdb_convert(tdb, &recovery_head, sizeof(recovery_head));
															-	ecode = methods->twrite(tdb, offsetof(struct tdb_header, recovery),
															-				&recovery_head, sizeof(tdb_off_t));
															-	if (ecode != TDB_SUCCESS) {
															-		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
															-				  "tdb_recovery_allocate:"
															-				  " failed to write recovery head");
															+	tdb_convert(tdb, rec, sizeof(*rec));
															+	/* ignore invalid recovery regions: can happen in crash */
															+	if (rec->magic != TDB_RECOVERY_MAGIC &&
															+	    rec->magic != TDB_RECOVERY_INVALID_MAGIC) {
															+		*recovery_offset = 0;
															+		rec->max_len = 0;
															 	}
															-	transaction_write_existing(tdb, offsetof(struct tdb_header, recovery),
															-				   &recovery_head,
															-				   sizeof(tdb_off_t));
															 	return TDB_SUCCESS;
															 }
														
 
															-/* Set up header for the recovery record. */
														
 
															-static void set_recovery_header(struct tdb_recovery_record *rec,
														
 
															-				uint64_t magic,
														
 
															-				uint64_t datalen, uint64_t actuallen,
														
 
															-				uint64_t oldsize)
														
 
															-{
														
 
															-	rec->magic = magic;
														
 
															-	rec->max_len = actuallen;
														
 
															-	rec->len = datalen;
														
 
															-	rec->eof = oldsize;
														
 
															-}
														
 
															-
														
 
															 static unsigned int same(const unsigned char *new,
														
 
															 			 const unsigned char *old,
														
 
															 			 unsigned int length)
														
@@ -795,44 +703,27 @@ static unsigned int different(const unsigned char *new,
 
															 	return length - *samelen;
														
 
															 }
														
 
															-/*
														
 
															-  setup the recovery data that will be used on a crash during commit
														
 
															-*/
														
 
															-static enum TDB_ERROR transaction_setup_recovery(struct tdb_context *tdb,
														
 
															-						 tdb_off_t *magic_offset)
														
 
															+/* Allocates recovery blob, without tdb_recovery_record at head set up. */
														
 
															+static struct tdb_recovery_record *alloc_recovery(struct tdb_context *tdb,
														
 
															+						  tdb_len_t *len)
														
 
															 {
														
 
															-	/* Initialized for GCC's 4.4.5 overzealous uninitialized warnings. */
														
 
															-	tdb_len_t recovery_size = 0;
														
 
															-	tdb_off_t recovery_offset = 0, recovery_max_size = 0;
														
 
															-	unsigned char *data, *p;
														
 
															-	const struct tdb_methods *methods = tdb->transaction->io_methods;
														
 
															 	struct tdb_recovery_record *rec;
														
 
															-	tdb_off_t old_map_size = tdb->transaction->old_map_size;
														
 
															-	uint64_t magic;
														
 
															-	int i;
														
 
															+	size_t i;
														
 
															 	enum TDB_ERROR ecode;
														
 
															+	unsigned char *p;
														
 
															+	const struct tdb_methods *methods = tdb->transaction->io_methods;
														
 
															-	/*
														
 
															-	  check that the recovery area has enough space
														
 
															-	*/
														
 
															-	ecode = tdb_recovery_allocate(tdb, &recovery_size,
														
 
															-				      &recovery_offset, &recovery_max_size);
														
 
															-	if (ecode != TDB_SUCCESS) {
														
 
															-		return ecode;
														
 
															+	rec = malloc(sizeof(*rec) + tdb_recovery_size(tdb));
														
 
															+	if (!rec) {
														
 
															+		tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
														
 
															+			   "transaction_setup_recovery:"
														
 
															+			   " cannot allocate");
														
 
															+		return TDB_ERR_PTR(TDB_ERR_OOM);
														
 
															 	}
														
 
															-	data = (unsigned char *)malloc(recovery_size + sizeof(*rec));
														
 
															-	if (data == NULL) {
														
 
															-		return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
														
 
															-				  "transaction_setup_recovery:"
														
 
															-				  " cannot allocate");
														
 
															-	}
														
 
															-
														
 
															-	rec = (struct tdb_recovery_record *)data;
														
 
															-
														
 
															 	/* build the recovery data into a single blob to allow us to do a single
														
 
															 	   large write, which should be more efficient */
														
 
															-	p = data + sizeof(*rec);
														
 
															+	p = (unsigned char *)(rec + 1);
														
 
															 	for (i=0;i<tdb->transaction->num_blocks;i++) {
														
 
															 		tdb_off_t offset;
														
 
															 		tdb_len_t length;
														
@@ -849,25 +740,26 @@ static enum TDB_ERROR transaction_setup_recovery(struct tdb_context *tdb,
 
															 			length = tdb->transaction->last_block_size;
														
 
															 		}
														
 
															-		if (offset >= old_map_size) {
														
 
															+		if (offset >= tdb->transaction->old_map_size) {
														
 
															 			continue;
														
 
															 		}
														
 
															 		if (offset + length > tdb->file->map_size) {
														
 
															-			free(data);
														
 
															-			return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
														
 
															-					  "tdb_transaction_setup_recovery:"
														
 
															-					  " transaction data over new region"
														
 
															-					  " boundary");
														
 
															+			free(rec);
														
 
															+			tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
														
 
															+				   "tdb_transaction_setup_recovery:"
														
 
															+				   " transaction data over new region"
														
 
															+				   " boundary");
														
 
															+			return TDB_ERR_PTR(TDB_ERR_CORRUPT);
														
 
															 		}
														
 
															-		if (offset + length > old_map_size) {
														
 
															+		if (offset + length > tdb->transaction->old_map_size) {
														
 
															 			/* Short read at EOF. */
														
 
															-			length = old_map_size - offset;
														
 
															+			length = tdb->transaction->old_map_size - offset;
														
 
															 		}
														
 
															 		ecode = methods->tread(tdb, offset, buffer, length);
														
 
															 		if (ecode != TDB_SUCCESS) {
														
 
															-			free(data);
														
 
															-			return ecode;
														
 
															+			free(rec);
														
 
															+			return TDB_ERR_PTR(ecode);
														
 
															 		}
														
 
															 		/* Skip over anything the same at the start. */
														
@@ -894,49 +786,160 @@ static enum TDB_ERROR transaction_setup_recovery(struct tdb_context *tdb,
 
															 		}
														
 
															 	}
														
 
															-	/* Now we know size, set up rec header. */
														
 
															-	set_recovery_header(rec, TDB_RECOVERY_INVALID_MAGIC,
														
 
															-			    p - data - sizeof(*rec),
														
 
															-			    recovery_max_size, old_map_size);
														
 
															-	tdb_convert(tdb, rec, sizeof(*rec));
														
 
															+	*len = p - (unsigned char *)(rec + 1);
														
 
															+	return rec;
														
 
															+}
														
 
															+
														
 
															+/*
															+  Create a new recovery area starting at the current tdb->file->map_size.
															+
															+  rec->max_len is set to the usable size of the area (header excluded):
															+  header + rec_length + rec_length/2, rounded up with the PAGESIZE mask,
															+  minus the header.  (The mask arithmetic assumes PAGESIZE is a power of
															+  two - TODO confirm against its definition.)
															+
															+  The file is then extended with methods->expand_file(), and
															+  tdb->transaction->old_map_size is set to the resulting map size.  The
															+  new area's offset is stored in the tdb_header recovery field with
															+  methods->twrite() and again with transaction_write_existing().
															+
															+  Returns the offset of the new area (the unconverted value), or the
															+  value returned by tdb_logerr() on failure; the caller in this file
															+  checks the result with TDB_OFF_IS_ERR().
															+
															+  NOTE(review): the log messages below still carry the prefix
															+  "tdb_recovery_allocate:", not this function's name.
															+*/
															+static tdb_off_t create_recovery_area(struct tdb_context *tdb,
															+				      tdb_len_t rec_length,
															+				      struct tdb_recovery_record *rec)
															+{
															+	tdb_off_t off, recovery_off;
															+	tdb_len_t addition;
															+	enum TDB_ERROR ecode;
															+	const struct tdb_methods *methods = tdb->transaction->io_methods;
															+
															+	/* round up to a multiple of page size. Overallocate, since each
															+	 * such allocation forces us to expand the file. */
															+	rec->max_len
															+		= (((sizeof(*rec) + rec_length + rec_length / 2)
															+		    + PAGESIZE-1) & ~(PAGESIZE-1))
															+		- sizeof(*rec);
															+	/* The new area starts where the (in-transaction) file ends now. */
															+	off = tdb->file->map_size;
															+
															+	/* Restore ->map_size before calling underlying expand_file.
															+	   Also so that we don't try to expand the file again in the
															+	   transaction commit, which would destroy the recovery
															+	   area */
															+	addition = (tdb->file->map_size - tdb->transaction->old_map_size) +
															+		sizeof(*rec) + rec->max_len;
															+	tdb->file->map_size = tdb->transaction->old_map_size;
															+	ecode = methods->expand_file(tdb, addition);
															+	if (ecode != TDB_SUCCESS) {
															+		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
															+				  "tdb_recovery_allocate:"
															+				  " failed to create recovery area");
															+	}
															+
															+	/* we have to reset the old map size so that we don't try to
															+	   expand the file again in the transaction commit, which
															+	   would destroy the recovery area */
															+	tdb->transaction->old_map_size = tdb->file->map_size;
															+
															+	/* write the recovery header offset and sync - we can sync without a race here
															+	   as the magic ptr in the recovery record has not been set */
															+	/* Convert a copy: 'off' itself is returned unconverted. */
															+	recovery_off = off;
															+	tdb_convert(tdb, &recovery_off, sizeof(recovery_off));
															+	ecode = methods->twrite(tdb, offsetof(struct tdb_header, recovery),
															+				&recovery_off, sizeof(tdb_off_t));
															+	if (ecode != TDB_SUCCESS) {
															+		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
															+				  "tdb_recovery_allocate:"
															+				  " failed to write recovery head");
															+	}
															+	transaction_write_existing(tdb, offsetof(struct tdb_header, recovery),
															+				   &recovery_off,
															+				   sizeof(tdb_off_t));
															+	return off;
															+}
														
 
															+
														
 
															+/*
															+  setup the recovery data that will be used on a crash during commit
															+
															+  The recovery blob is built with alloc_recovery(), and the current
															+  recovery area is looked up with tdb_recovery_area().  If there is no
															+  area, or it is too small for the blob, any old area is handed to
															+  add_free_record(), the blob is rebuilt, and create_recovery_area()
															+  makes a new area.
															+
															+  The record (header plus data) is written with
															+  TDB_RECOVERY_INVALID_MAGIC and synced; only then is TDB_RECOVERY_MAGIC
															+  written over the magic field and synced.  The offset of that field is
															+  left in tdb->transaction->magic_offset.
															+
															+  Returns TDB_SUCCESS, or the error from the step that failed.
															+*/
															+static enum TDB_ERROR transaction_setup_recovery(struct tdb_context *tdb)
															+{
															+	tdb_len_t recovery_size = 0;
															+	tdb_off_t recovery_off = 0;
															+	tdb_off_t old_map_size = tdb->transaction->old_map_size;
															+	struct tdb_recovery_record *recovery;
															+	const struct tdb_methods *methods = tdb->transaction->io_methods;
															+	uint64_t magic;
															+	enum TDB_ERROR ecode;
															+
															+	recovery = alloc_recovery(tdb, &recovery_size);
															+	if (TDB_PTR_IS_ERR(recovery))
															+		return TDB_PTR_ERR(recovery);
															+
															+	ecode = tdb_recovery_area(tdb, methods, &recovery_off, recovery);
															+	if (ecode) {
															+		free(recovery);
															+		return ecode;
															+	}
															+
															+	/* A zero offset means there is no area at all: never "reuse" it,
															+	 * even for an empty blob, or the record would land at offset 0. */
															+	if (recovery_off == 0 || recovery->max_len < recovery_size) {
															+		/* Not large enough. Free up old recovery area. */
															+		if (recovery_off) {
															+			tdb->stats.frees++;
															+			ecode = add_free_record(tdb, recovery_off,
															+						sizeof(*recovery)
															+						+ recovery->max_len,
															+						TDB_LOCK_WAIT, true);
															+			free(recovery);
															+			if (ecode != TDB_SUCCESS) {
															+				return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
															+						  "tdb_recovery_allocate:"
															+						  " failed to free previous"
															+						  " recovery area");
															+			}
															+
															+			/* Refresh recovery after add_free_record above. */
															+			recovery = alloc_recovery(tdb, &recovery_size);
															+			if (TDB_PTR_IS_ERR(recovery))
															+				return TDB_PTR_ERR(recovery);
															+		}
															+
															+		recovery_off = create_recovery_area(tdb, recovery_size,
															+						    recovery);
															+		if (TDB_OFF_IS_ERR(recovery_off)) {
															+			free(recovery);
															+			return recovery_off;
															+		}
															+	}
															+
															+	/* Now we know size, convert rec header. */
															+	recovery->magic = TDB_RECOVERY_INVALID_MAGIC;
															+	recovery->len = recovery_size;
															+	recovery->eof = old_map_size;
															+	tdb_convert(tdb, recovery, sizeof(*recovery));
															 	/* write the recovery data to the recovery area */
															-	ecode = methods->twrite(tdb, recovery_offset, data, p - data);
															+	/* recovery_size counts only the data after the header (see
															+	 * alloc_recovery), so the header must be added to cover the blob. */
															+	ecode = methods->twrite(tdb, recovery_off, recovery,
															+				sizeof(*recovery) + recovery_size);
															 	if (ecode != TDB_SUCCESS) {
															-		free(data);
															+		free(recovery);
															 		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
															 				  "tdb_transaction_setup_recovery:"
															 				  " failed to write recovery data");
															 	}
															-	transaction_write_existing(tdb, recovery_offset, data, p - data);
															+	transaction_write_existing(tdb, recovery_off, recovery,
															+				   sizeof(*recovery) + recovery_size);
															+
															+	free(recovery);
															 	/* as we don't have ordered writes, we have to sync the recovery
															 	   data before we update the magic to indicate that the recovery
															 	   data is present */
															-	ecode = transaction_sync(tdb, recovery_offset, p - data);
															-	if (ecode != TDB_SUCCESS) {
															-		free(data);
															+	/* sizeof(*recovery) is a compile-time size: no use after free. */
															+	ecode = transaction_sync(tdb, recovery_off,
															+				 sizeof(*recovery) + recovery_size);
															+	if (ecode != TDB_SUCCESS)
															 		return ecode;
															-	}
															-
															-	free(data);
															 	magic = TDB_RECOVERY_MAGIC;
															 	tdb_convert(tdb, &magic, sizeof(magic));
															-	*magic_offset = recovery_offset + offsetof(struct tdb_recovery_record,
															-						   magic);
															+	tdb->transaction->magic_offset
															+		= recovery_off + offsetof(struct tdb_recovery_record, magic);
															-	ecode = methods->twrite(tdb, *magic_offset, &magic, sizeof(magic));
															+	ecode = methods->twrite(tdb, tdb->transaction->magic_offset,
															+				&magic, sizeof(magic));
															 	if (ecode != TDB_SUCCESS) {
															 		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
															 				  "tdb_transaction_setup_recovery:"
															 				  " failed to write recovery magic");
															 	}
															-	transaction_write_existing(tdb, *magic_offset, &magic, sizeof(magic));
															+	transaction_write_existing(tdb, tdb->transaction->magic_offset,
															+				   &magic, sizeof(magic));
															 	/* ensure the recovery magic marker is on disk */
															-	return transaction_sync(tdb, *magic_offset, sizeof(magic));
															+	return transaction_sync(tdb, tdb->transaction->magic_offset,
															+				sizeof(magic));
															 }
														
 
															 static enum TDB_ERROR _tdb_transaction_prepare_commit(struct tdb_context *tdb)
														
@@ -991,10 +994,9 @@ static enum TDB_ERROR _tdb_transaction_prepare_commit(struct tdb_context *tdb)
 
															 	/* Since we have whole db locked, we don't need the expansion lock. */
														
 
															 	if (!(tdb->flags & TDB_NOSYNC)) {
														
 
															-		/* write the recovery data to the end of the file */
														
 
															-		ecode = transaction_setup_recovery(tdb,
														
 
															-						   &tdb->transaction
														
 
															-						   ->magic_offset);
														
 
															+		/* Sets up tdb->transaction->recovery and
														
 
															+		 * tdb->transaction->magic_offset. */
														
 
															+		ecode = transaction_setup_recovery(tdb);
														
 
															 		if (ecode != TDB_SUCCESS) {
														
 
															 			return ecode;
														
 
															 		}